author     Anton Vorontsov <avorontsov@ru.mvista.com>  2009-09-22 19:49:27 -0400
committer  Anton Vorontsov <avorontsov@ru.mvista.com>  2009-09-22 19:49:27 -0400
commit     f056878332a91ed984a116bad4e7d49aefff9e6e
tree       572f4757c8e7811d45e0be0c2ae529c78fb63441 /kernel
parent     3961f7c3cf247eee5df7fabadc7a40f2deeb98f3
parent     7fa07729e439a6184bd824746d06a49cca553f15
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6
Conflicts:
	drivers/power/wm97xx_battery.c
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/Makefile | 10
-rw-r--r-- kernel/acct.c | 14
-rw-r--r-- kernel/cgroup.c | 158
-rw-r--r-- kernel/cpu.c | 15
-rw-r--r-- kernel/cred.c | 296
-rw-r--r-- kernel/delayacct.c | 1
-rw-r--r-- kernel/dma-coherent.c | 176
-rw-r--r-- kernel/exit.c | 14
-rw-r--r-- kernel/fork.c | 88
-rw-r--r-- kernel/freezer.c | 7
-rw-r--r-- kernel/futex.c | 76
-rw-r--r-- kernel/futex_compat.c | 6
-rw-r--r-- kernel/gcov/Kconfig | 2
-rw-r--r-- kernel/hrtimer.c | 169
-rw-r--r-- kernel/irq/chip.c | 74
-rw-r--r-- kernel/irq/handle.c | 5
-rw-r--r-- kernel/irq/internals.h | 16
-rw-r--r-- kernel/irq/manage.c | 184
-rw-r--r-- kernel/irq/migration.c | 2
-rw-r--r-- kernel/irq/numa_migrate.c | 4
-rw-r--r-- kernel/irq/pm.c | 8
-rw-r--r-- kernel/irq/resend.c | 3
-rw-r--r-- kernel/irq/spurious.c | 1
-rw-r--r-- kernel/kexec.c | 2
-rw-r--r-- kernel/kfifo.c | 2
-rw-r--r-- kernel/kmod.c | 10
-rw-r--r-- kernel/kprobes.c | 38
-rw-r--r-- kernel/kthread.c | 14
-rw-r--r-- kernel/lockdep.c | 792
-rw-r--r-- kernel/lockdep_internals.h | 2
-rw-r--r-- kernel/lockdep_proc.c | 131
-rw-r--r-- kernel/marker.c | 930
-rw-r--r-- kernel/module.c | 59
-rw-r--r-- kernel/panic.c | 3
-rw-r--r-- kernel/perf_counter.c | 4383
-rw-r--r-- kernel/perf_event.c | 5000
-rw-r--r-- kernel/pid.c | 15
-rw-r--r-- kernel/posix-cpu-timers.c | 7
-rw-r--r-- kernel/posix-timers.c | 42
-rw-r--r-- kernel/power/Kconfig | 14
-rw-r--r-- kernel/power/console.c | 63
-rw-r--r-- kernel/power/hibernate.c | 21
-rw-r--r-- kernel/power/main.c | 17
-rw-r--r-- kernel/power/power.h | 2
-rw-r--r-- kernel/power/process.c | 1
-rw-r--r-- kernel/power/snapshot.c | 414
-rw-r--r-- kernel/power/user.c | 1
-rw-r--r-- kernel/printk.c | 181
-rw-r--r-- kernel/profile.c | 50
-rw-r--r-- kernel/ptrace.c | 6
-rw-r--r-- kernel/rcuclassic.c | 807
-rw-r--r-- kernel/rcupdate.c | 92
-rw-r--r-- kernel/rcupreempt.c | 1539
-rw-r--r-- kernel/rcupreempt_trace.c | 334
-rw-r--r-- kernel/rcutorture.c | 241
-rw-r--r-- kernel/rcutree.c | 370
-rw-r--r-- kernel/rcutree.h | 253
-rw-r--r-- kernel/rcutree_plugin.h | 566
-rw-r--r-- kernel/rcutree_trace.c | 90
-rw-r--r-- kernel/resource.c | 2
-rw-r--r-- kernel/rtmutex.c | 4
-rw-r--r-- kernel/sched.c | 1749
-rw-r--r-- kernel/sched_clock.c | 122
-rw-r--r-- kernel/sched_cpupri.c | 45
-rw-r--r-- kernel/sched_debug.c | 5
-rw-r--r-- kernel/sched_fair.c | 559
-rw-r--r-- kernel/sched_features.h | 122
-rw-r--r-- kernel/sched_idletask.c | 11
-rw-r--r-- kernel/sched_rt.c | 96
-rw-r--r-- kernel/signal.c | 25
-rw-r--r-- kernel/smp.c | 42
-rw-r--r-- kernel/softirq.c | 70
-rw-r--r-- kernel/spinlock.c | 230
-rw-r--r-- kernel/sys.c | 10
-rw-r--r-- kernel/sys_ni.c | 2
-rw-r--r-- kernel/sysctl.c | 66
-rw-r--r-- kernel/taskstats.c | 10
-rw-r--r-- kernel/time.c | 9
-rw-r--r-- kernel/time/clockevents.c | 27
-rw-r--r-- kernel/time/clocksource.c | 529
-rw-r--r-- kernel/time/jiffies.c | 6
-rw-r--r-- kernel/time/ntp.c | 7
-rw-r--r-- kernel/time/tick-broadcast.c | 7
-rw-r--r-- kernel/time/timekeeping.c | 535
-rw-r--r-- kernel/time/timer_list.c | 2
-rw-r--r-- kernel/timer.c | 37
-rw-r--r-- kernel/trace/Kconfig | 49
-rw-r--r-- kernel/trace/Makefile | 2
-rw-r--r-- kernel/trace/blktrace.c | 25
-rw-r--r-- kernel/trace/ftrace.c | 305
-rw-r--r-- kernel/trace/kmemtrace.c | 149
-rw-r--r-- kernel/trace/power-traces.c | 20
-rw-r--r-- kernel/trace/ring_buffer.c | 1140
-rw-r--r-- kernel/trace/trace.c | 865
-rw-r--r-- kernel/trace/trace.h | 359
-rw-r--r-- kernel/trace/trace_boot.c | 20
-rw-r--r-- kernel/trace/trace_clock.c | 24
-rw-r--r-- kernel/trace/trace_entries.h | 366
-rw-r--r-- kernel/trace/trace_event_profile.c | 87
-rw-r--r-- kernel/trace/trace_event_types.h | 175
-rw-r--r-- kernel/trace/trace_events.c | 282
-rw-r--r-- kernel/trace/trace_events_filter.c | 298
-rw-r--r-- kernel/trace/trace_export.c | 290
-rw-r--r-- kernel/trace/trace_functions.c | 6
-rw-r--r-- kernel/trace/trace_functions_graph.c | 239
-rw-r--r-- kernel/trace/trace_hw_branches.c | 2
-rw-r--r-- kernel/trace/trace_irqsoff.c | 19
-rw-r--r-- kernel/trace/trace_mmiotrace.c | 16
-rw-r--r-- kernel/trace/trace_output.c | 45
-rw-r--r-- kernel/trace/trace_output.h | 2
-rw-r--r-- kernel/trace/trace_power.c | 214
-rw-r--r-- kernel/trace/trace_printk.c | 3
-rw-r--r-- kernel/trace/trace_sched_switch.c | 59
-rw-r--r-- kernel/trace/trace_sched_wakeup.c | 59
-rw-r--r-- kernel/trace/trace_selftest.c | 1
-rw-r--r-- kernel/trace/trace_stack.c | 54
-rw-r--r-- kernel/trace/trace_stat.c | 51
-rw-r--r-- kernel/trace/trace_stat.h | 2
-rw-r--r-- kernel/trace/trace_syscalls.c | 530
-rw-r--r-- kernel/trace/trace_workqueue.c | 32
-rw-r--r-- kernel/tracepoint.c | 52
-rw-r--r-- kernel/wait.c | 5
-rw-r--r-- kernel/workqueue.c | 9
123 files changed, 14721 insertions, 13285 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 2093a691f1c2..187c89b4783d 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -80,26 +80,22 @@ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
80obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ 80obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
81obj-$(CONFIG_SECCOMP) += seccomp.o 81obj-$(CONFIG_SECCOMP) += seccomp.o
82obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 82obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
83obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o
84obj-$(CONFIG_TREE_RCU) += rcutree.o 83obj-$(CONFIG_TREE_RCU) += rcutree.o
85obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o 84obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
86obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o 85obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
87obj-$(CONFIG_PREEMPT_RCU_TRACE) += rcupreempt_trace.o
88obj-$(CONFIG_RELAY) += relay.o 86obj-$(CONFIG_RELAY) += relay.o
89obj-$(CONFIG_SYSCTL) += utsname_sysctl.o 87obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
90obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 88obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
91obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o 89obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
92obj-$(CONFIG_MARKERS) += marker.o
93obj-$(CONFIG_TRACEPOINTS) += tracepoint.o 90obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
94obj-$(CONFIG_LATENCYTOP) += latencytop.o 91obj-$(CONFIG_LATENCYTOP) += latencytop.o
95obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
96obj-$(CONFIG_FUNCTION_TRACER) += trace/ 92obj-$(CONFIG_FUNCTION_TRACER) += trace/
97obj-$(CONFIG_TRACING) += trace/ 93obj-$(CONFIG_TRACING) += trace/
98obj-$(CONFIG_X86_DS) += trace/ 94obj-$(CONFIG_X86_DS) += trace/
99obj-$(CONFIG_RING_BUFFER) += trace/ 95obj-$(CONFIG_RING_BUFFER) += trace/
100obj-$(CONFIG_SMP) += sched_cpupri.o 96obj-$(CONFIG_SMP) += sched_cpupri.o
101obj-$(CONFIG_SLOW_WORK) += slow-work.o 97obj-$(CONFIG_SLOW_WORK) += slow-work.o
102obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o 98obj-$(CONFIG_PERF_EVENTS) += perf_event.o
103 99
104ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 100ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
105# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 101# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
@@ -119,7 +115,7 @@ $(obj)/config_data.gz: .config FORCE
119 $(call if_changed,gzip) 115 $(call if_changed,gzip)
120 116
121quiet_cmd_ikconfiggz = IKCFG $@ 117quiet_cmd_ikconfiggz = IKCFG $@
122 cmd_ikconfiggz = (echo "static const char kernel_config_data[] = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@ 118 cmd_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@
123targets += config_data.h 119targets += config_data.h
124$(obj)/config_data.h: $(obj)/config_data.gz FORCE 120$(obj)/config_data.h: $(obj)/config_data.gz FORCE
125 $(call if_changed,ikconfiggz) 121 $(call if_changed,ikconfiggz)
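One easy-to-miss change in the Makefile hunk above is the __used annotation added to the generated kernel_config_data[] blob: nothing in the kernel references that array, so without the attribute the compiler may drop it even though the extract-ikconfig tooling expects to find it in the built image. A minimal sketch of the attribute at work, with an invented symbol name and placeholder contents:

    #include <linux/compiler.h>

    /*
     * Nothing reads this array from C code; __used tells the compiler to
     * emit it anyway so external tools can locate it in the binary.
     */
    static const char example_blob[] __used = "generated data the image must keep";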
diff --git a/kernel/acct.c b/kernel/acct.c
index 7afa31564162..9a4715a2f6bf 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -215,6 +215,7 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
215static int acct_on(char *name) 215static int acct_on(char *name)
216{ 216{
217 struct file *file; 217 struct file *file;
218 struct vfsmount *mnt;
218 int error; 219 int error;
219 struct pid_namespace *ns; 220 struct pid_namespace *ns;
220 struct bsd_acct_struct *acct = NULL; 221 struct bsd_acct_struct *acct = NULL;
@@ -256,11 +257,12 @@ static int acct_on(char *name)
256 acct = NULL; 257 acct = NULL;
257 } 258 }
258 259
259 mnt_pin(file->f_path.mnt); 260 mnt = file->f_path.mnt;
261 mnt_pin(mnt);
260 acct_file_reopen(ns->bacct, file, ns); 262 acct_file_reopen(ns->bacct, file, ns);
261 spin_unlock(&acct_lock); 263 spin_unlock(&acct_lock);
262 264
263 mntput(file->f_path.mnt); /* it's pinned, now give up active reference */ 265 mntput(mnt); /* it's pinned, now give up active reference */
264 kfree(acct); 266 kfree(acct);
265 267
266 return 0; 268 return 0;
@@ -489,13 +491,17 @@ static void do_acct_process(struct bsd_acct_struct *acct,
489 u64 run_time; 491 u64 run_time;
490 struct timespec uptime; 492 struct timespec uptime;
491 struct tty_struct *tty; 493 struct tty_struct *tty;
494 const struct cred *orig_cred;
495
496 /* Perform file operations on behalf of whoever enabled accounting */
497 orig_cred = override_creds(file->f_cred);
492 498
493 /* 499 /*
494 * First check to see if there is enough free_space to continue 500 * First check to see if there is enough free_space to continue
495 * the process accounting system. 501 * the process accounting system.
496 */ 502 */
497 if (!check_free_space(acct, file)) 503 if (!check_free_space(acct, file))
498 return; 504 goto out;
499 505
500 /* 506 /*
501 * Fill the accounting struct with the needed info as recorded 507 * Fill the accounting struct with the needed info as recorded
@@ -576,6 +582,8 @@ static void do_acct_process(struct bsd_acct_struct *acct,
576 sizeof(acct_t), &file->f_pos); 582 sizeof(acct_t), &file->f_pos);
577 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; 583 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
578 set_fs(fs); 584 set_fs(fs);
585out:
586 revert_creds(orig_cred);
579} 587}
580 588
581/** 589/**
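The acct.c changes above wrap the accounting write in override_creds(file->f_cred) / revert_creds(orig_cred), so the record is written with the credentials of whoever enabled accounting rather than those of the task being accounted, and the early return becomes a goto so the override is always undone. A minimal sketch of that pattern, using the real override_creds()/revert_creds() API but a hypothetical write_record() helper:

    #include <linux/cred.h>
    #include <linux/fs.h>

    /* Hypothetical stand-in for the real accounting record write */
    static void write_record(struct file *file)
    {
            (void)file;
    }

    static void write_as_file_opener(struct file *file)
    {
            const struct cred *orig_cred;

            /* Temporarily act with the credentials of whoever opened @file */
            orig_cred = override_creds(file->f_cred);

            write_record(file);     /* hypothetical helper, not a kernel API */

            /* Always restore the caller's own credentials */
            revert_creds(orig_cred);
    }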
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 3737a682cdf5..213b7f92fcdd 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -47,6 +47,7 @@
47#include <linux/hash.h> 47#include <linux/hash.h>
48#include <linux/namei.h> 48#include <linux/namei.h>
49#include <linux/smp_lock.h> 49#include <linux/smp_lock.h>
50#include <linux/pid_namespace.h>
50 51
51#include <asm/atomic.h> 52#include <asm/atomic.h>
52 53
@@ -595,10 +596,11 @@ void cgroup_unlock(void)
595static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); 596static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
596static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); 597static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
597static int cgroup_populate_dir(struct cgroup *cgrp); 598static int cgroup_populate_dir(struct cgroup *cgrp);
598static struct inode_operations cgroup_dir_inode_operations; 599static const struct inode_operations cgroup_dir_inode_operations;
599static struct file_operations proc_cgroupstats_operations; 600static struct file_operations proc_cgroupstats_operations;
600 601
601static struct backing_dev_info cgroup_backing_dev_info = { 602static struct backing_dev_info cgroup_backing_dev_info = {
603 .name = "cgroup",
602 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 604 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
603}; 605};
604 606
@@ -734,16 +736,28 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
734 * reference to css->refcnt. In general, this refcnt is expected to goes down 736 * reference to css->refcnt. In general, this refcnt is expected to goes down
735 * to zero, soon. 737 * to zero, soon.
736 * 738 *
737 * CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex; 739 * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
738 */ 740 */
739DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); 741DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
740 742
741static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp) 743static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
742{ 744{
743 if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) 745 if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
744 wake_up_all(&cgroup_rmdir_waitq); 746 wake_up_all(&cgroup_rmdir_waitq);
745} 747}
746 748
749void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
750{
751 css_get(css);
752}
753
754void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
755{
756 cgroup_wakeup_rmdir_waiter(css->cgroup);
757 css_put(css);
758}
759
760
747static int rebind_subsystems(struct cgroupfs_root *root, 761static int rebind_subsystems(struct cgroupfs_root *root,
748 unsigned long final_bits) 762 unsigned long final_bits)
749{ 763{
@@ -947,7 +961,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
947 return ret; 961 return ret;
948} 962}
949 963
950static struct super_operations cgroup_ops = { 964static const struct super_operations cgroup_ops = {
951 .statfs = simple_statfs, 965 .statfs = simple_statfs,
952 .drop_inode = generic_delete_inode, 966 .drop_inode = generic_delete_inode,
953 .show_options = cgroup_show_options, 967 .show_options = cgroup_show_options,
@@ -960,6 +974,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
960 INIT_LIST_HEAD(&cgrp->children); 974 INIT_LIST_HEAD(&cgrp->children);
961 INIT_LIST_HEAD(&cgrp->css_sets); 975 INIT_LIST_HEAD(&cgrp->css_sets);
962 INIT_LIST_HEAD(&cgrp->release_list); 976 INIT_LIST_HEAD(&cgrp->release_list);
977 INIT_LIST_HEAD(&cgrp->pids_list);
963 init_rwsem(&cgrp->pids_mutex); 978 init_rwsem(&cgrp->pids_mutex);
964} 979}
965static void init_cgroup_root(struct cgroupfs_root *root) 980static void init_cgroup_root(struct cgroupfs_root *root)
@@ -1357,7 +1372,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1357 * wake up rmdir() waiter. the rmdir should fail since the cgroup 1372 * wake up rmdir() waiter. the rmdir should fail since the cgroup
1358 * is no longer empty. 1373 * is no longer empty.
1359 */ 1374 */
1360 cgroup_wakeup_rmdir_waiters(cgrp); 1375 cgroup_wakeup_rmdir_waiter(cgrp);
1361 return 0; 1376 return 0;
1362} 1377}
1363 1378
@@ -1696,7 +1711,7 @@ static struct file_operations cgroup_file_operations = {
1696 .release = cgroup_file_release, 1711 .release = cgroup_file_release,
1697}; 1712};
1698 1713
1699static struct inode_operations cgroup_dir_inode_operations = { 1714static const struct inode_operations cgroup_dir_inode_operations = {
1700 .lookup = simple_lookup, 1715 .lookup = simple_lookup,
1701 .mkdir = cgroup_mkdir, 1716 .mkdir = cgroup_mkdir,
1702 .rmdir = cgroup_rmdir, 1717 .rmdir = cgroup_rmdir,
@@ -2201,12 +2216,30 @@ err:
2201 return ret; 2216 return ret;
2202} 2217}
2203 2218
2219/*
2220 * Cache pids for all threads in the same pid namespace that are
2221 * opening the same "tasks" file.
2222 */
2223struct cgroup_pids {
2224 /* The node in cgrp->pids_list */
2225 struct list_head list;
2226 /* The cgroup those pids belong to */
2227 struct cgroup *cgrp;
2228 /* The namepsace those pids belong to */
2229 struct pid_namespace *ns;
2230 /* Array of process ids in the cgroup */
2231 pid_t *tasks_pids;
2232 /* How many files are using the this tasks_pids array */
2233 int use_count;
2234 /* Length of the current tasks_pids array */
2235 int length;
2236};
2237
2204static int cmppid(const void *a, const void *b) 2238static int cmppid(const void *a, const void *b)
2205{ 2239{
2206 return *(pid_t *)a - *(pid_t *)b; 2240 return *(pid_t *)a - *(pid_t *)b;
2207} 2241}
2208 2242
2209
2210/* 2243/*
2211 * seq_file methods for the "tasks" file. The seq_file position is the 2244 * seq_file methods for the "tasks" file. The seq_file position is the
2212 * next pid to display; the seq_file iterator is a pointer to the pid 2245 * next pid to display; the seq_file iterator is a pointer to the pid
@@ -2221,45 +2254,47 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
2221 * after a seek to the start). Use a binary-search to find the 2254 * after a seek to the start). Use a binary-search to find the
2222 * next pid to display, if any 2255 * next pid to display, if any
2223 */ 2256 */
2224 struct cgroup *cgrp = s->private; 2257 struct cgroup_pids *cp = s->private;
2258 struct cgroup *cgrp = cp->cgrp;
2225 int index = 0, pid = *pos; 2259 int index = 0, pid = *pos;
2226 int *iter; 2260 int *iter;
2227 2261
2228 down_read(&cgrp->pids_mutex); 2262 down_read(&cgrp->pids_mutex);
2229 if (pid) { 2263 if (pid) {
2230 int end = cgrp->pids_length; 2264 int end = cp->length;
2231 2265
2232 while (index < end) { 2266 while (index < end) {
2233 int mid = (index + end) / 2; 2267 int mid = (index + end) / 2;
2234 if (cgrp->tasks_pids[mid] == pid) { 2268 if (cp->tasks_pids[mid] == pid) {
2235 index = mid; 2269 index = mid;
2236 break; 2270 break;
2237 } else if (cgrp->tasks_pids[mid] <= pid) 2271 } else if (cp->tasks_pids[mid] <= pid)
2238 index = mid + 1; 2272 index = mid + 1;
2239 else 2273 else
2240 end = mid; 2274 end = mid;
2241 } 2275 }
2242 } 2276 }
2243 /* If we're off the end of the array, we're done */ 2277 /* If we're off the end of the array, we're done */
2244 if (index >= cgrp->pids_length) 2278 if (index >= cp->length)
2245 return NULL; 2279 return NULL;
2246 /* Update the abstract position to be the actual pid that we found */ 2280 /* Update the abstract position to be the actual pid that we found */
2247 iter = cgrp->tasks_pids + index; 2281 iter = cp->tasks_pids + index;
2248 *pos = *iter; 2282 *pos = *iter;
2249 return iter; 2283 return iter;
2250} 2284}
2251 2285
2252static void cgroup_tasks_stop(struct seq_file *s, void *v) 2286static void cgroup_tasks_stop(struct seq_file *s, void *v)
2253{ 2287{
2254 struct cgroup *cgrp = s->private; 2288 struct cgroup_pids *cp = s->private;
2289 struct cgroup *cgrp = cp->cgrp;
2255 up_read(&cgrp->pids_mutex); 2290 up_read(&cgrp->pids_mutex);
2256} 2291}
2257 2292
2258static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos) 2293static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
2259{ 2294{
2260 struct cgroup *cgrp = s->private; 2295 struct cgroup_pids *cp = s->private;
2261 int *p = v; 2296 int *p = v;
2262 int *end = cgrp->tasks_pids + cgrp->pids_length; 2297 int *end = cp->tasks_pids + cp->length;
2263 2298
2264 /* 2299 /*
2265 * Advance to the next pid in the array. If this goes off the 2300 * Advance to the next pid in the array. If this goes off the
@@ -2286,26 +2321,33 @@ static struct seq_operations cgroup_tasks_seq_operations = {
2286 .show = cgroup_tasks_show, 2321 .show = cgroup_tasks_show,
2287}; 2322};
2288 2323
2289static void release_cgroup_pid_array(struct cgroup *cgrp) 2324static void release_cgroup_pid_array(struct cgroup_pids *cp)
2290{ 2325{
2326 struct cgroup *cgrp = cp->cgrp;
2327
2291 down_write(&cgrp->pids_mutex); 2328 down_write(&cgrp->pids_mutex);
2292 BUG_ON(!cgrp->pids_use_count); 2329 BUG_ON(!cp->use_count);
2293 if (!--cgrp->pids_use_count) { 2330 if (!--cp->use_count) {
2294 kfree(cgrp->tasks_pids); 2331 list_del(&cp->list);
2295 cgrp->tasks_pids = NULL; 2332 put_pid_ns(cp->ns);
2296 cgrp->pids_length = 0; 2333 kfree(cp->tasks_pids);
2334 kfree(cp);
2297 } 2335 }
2298 up_write(&cgrp->pids_mutex); 2336 up_write(&cgrp->pids_mutex);
2299} 2337}
2300 2338
2301static int cgroup_tasks_release(struct inode *inode, struct file *file) 2339static int cgroup_tasks_release(struct inode *inode, struct file *file)
2302{ 2340{
2303 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2341 struct seq_file *seq;
2342 struct cgroup_pids *cp;
2304 2343
2305 if (!(file->f_mode & FMODE_READ)) 2344 if (!(file->f_mode & FMODE_READ))
2306 return 0; 2345 return 0;
2307 2346
2308 release_cgroup_pid_array(cgrp); 2347 seq = file->private_data;
2348 cp = seq->private;
2349
2350 release_cgroup_pid_array(cp);
2309 return seq_release(inode, file); 2351 return seq_release(inode, file);
2310} 2352}
2311 2353
@@ -2324,6 +2366,8 @@ static struct file_operations cgroup_tasks_operations = {
2324static int cgroup_tasks_open(struct inode *unused, struct file *file) 2366static int cgroup_tasks_open(struct inode *unused, struct file *file)
2325{ 2367{
2326 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2368 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2369 struct pid_namespace *ns = current->nsproxy->pid_ns;
2370 struct cgroup_pids *cp;
2327 pid_t *pidarray; 2371 pid_t *pidarray;
2328 int npids; 2372 int npids;
2329 int retval; 2373 int retval;
@@ -2350,20 +2394,37 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
2350 * array if necessary 2394 * array if necessary
2351 */ 2395 */
2352 down_write(&cgrp->pids_mutex); 2396 down_write(&cgrp->pids_mutex);
2353 kfree(cgrp->tasks_pids); 2397
2354 cgrp->tasks_pids = pidarray; 2398 list_for_each_entry(cp, &cgrp->pids_list, list) {
2355 cgrp->pids_length = npids; 2399 if (ns == cp->ns)
2356 cgrp->pids_use_count++; 2400 goto found;
2401 }
2402
2403 cp = kzalloc(sizeof(*cp), GFP_KERNEL);
2404 if (!cp) {
2405 up_write(&cgrp->pids_mutex);
2406 kfree(pidarray);
2407 return -ENOMEM;
2408 }
2409 cp->cgrp = cgrp;
2410 cp->ns = ns;
2411 get_pid_ns(ns);
2412 list_add(&cp->list, &cgrp->pids_list);
2413found:
2414 kfree(cp->tasks_pids);
2415 cp->tasks_pids = pidarray;
2416 cp->length = npids;
2417 cp->use_count++;
2357 up_write(&cgrp->pids_mutex); 2418 up_write(&cgrp->pids_mutex);
2358 2419
2359 file->f_op = &cgroup_tasks_operations; 2420 file->f_op = &cgroup_tasks_operations;
2360 2421
2361 retval = seq_open(file, &cgroup_tasks_seq_operations); 2422 retval = seq_open(file, &cgroup_tasks_seq_operations);
2362 if (retval) { 2423 if (retval) {
2363 release_cgroup_pid_array(cgrp); 2424 release_cgroup_pid_array(cp);
2364 return retval; 2425 return retval;
2365 } 2426 }
2366 ((struct seq_file *)file->private_data)->private = cgrp; 2427 ((struct seq_file *)file->private_data)->private = cp;
2367 return 0; 2428 return 0;
2368} 2429}
2369 2430
@@ -2696,33 +2757,42 @@ again:
2696 mutex_unlock(&cgroup_mutex); 2757 mutex_unlock(&cgroup_mutex);
2697 2758
2698 /* 2759 /*
2760 * In general, subsystem has no css->refcnt after pre_destroy(). But
2761 * in racy cases, subsystem may have to get css->refcnt after
2762 * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
2763 * make rmdir return -EBUSY too often. To avoid that, we use waitqueue
2764 * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
2765 * and subsystem's reference count handling. Please see css_get/put
2766 * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
2767 */
2768 set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2769
2770 /*
2699 * Call pre_destroy handlers of subsys. Notify subsystems 2771 * Call pre_destroy handlers of subsys. Notify subsystems
2700 * that rmdir() request comes. 2772 * that rmdir() request comes.
2701 */ 2773 */
2702 ret = cgroup_call_pre_destroy(cgrp); 2774 ret = cgroup_call_pre_destroy(cgrp);
2703 if (ret) 2775 if (ret) {
2776 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2704 return ret; 2777 return ret;
2778 }
2705 2779
2706 mutex_lock(&cgroup_mutex); 2780 mutex_lock(&cgroup_mutex);
2707 parent = cgrp->parent; 2781 parent = cgrp->parent;
2708 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { 2782 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
2783 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2709 mutex_unlock(&cgroup_mutex); 2784 mutex_unlock(&cgroup_mutex);
2710 return -EBUSY; 2785 return -EBUSY;
2711 } 2786 }
2712 /*
2713 * css_put/get is provided for subsys to grab refcnt to css. In typical
2714 * case, subsystem has no reference after pre_destroy(). But, under
2715 * hierarchy management, some *temporal* refcnt can be hold.
2716 * To avoid returning -EBUSY to a user, waitqueue is used. If subsys
2717 * is really busy, it should return -EBUSY at pre_destroy(). wake_up
2718 * is called when css_put() is called and refcnt goes down to 0.
2719 */
2720 set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2721 prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); 2787 prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
2722
2723 if (!cgroup_clear_css_refs(cgrp)) { 2788 if (!cgroup_clear_css_refs(cgrp)) {
2724 mutex_unlock(&cgroup_mutex); 2789 mutex_unlock(&cgroup_mutex);
2725 schedule(); 2790 /*
2791 * Because someone may call cgroup_wakeup_rmdir_waiter() before
2792 * prepare_to_wait(), we need to check this flag.
2793 */
2794 if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
2795 schedule();
2726 finish_wait(&cgroup_rmdir_waitq, &wait); 2796 finish_wait(&cgroup_rmdir_waitq, &wait);
2727 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); 2797 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
2728 if (signal_pending(current)) 2798 if (signal_pending(current))
@@ -3294,7 +3364,7 @@ void __css_put(struct cgroup_subsys_state *css)
3294 set_bit(CGRP_RELEASABLE, &cgrp->flags); 3364 set_bit(CGRP_RELEASABLE, &cgrp->flags);
3295 check_for_release(cgrp); 3365 check_for_release(cgrp);
3296 } 3366 }
3297 cgroup_wakeup_rmdir_waiters(cgrp); 3367 cgroup_wakeup_rmdir_waiter(cgrp);
3298 } 3368 }
3299 rcu_read_unlock(); 3369 rcu_read_unlock();
3300} 3370}
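Much of the cgroup.c churn above serves one idiom: the rmdir path sets CGRP_WAIT_ON_RMDIR, calls prepare_to_wait(), and re-checks the flag before schedule() so a wakeup that slips in between is not lost, while the waker clears the flag with test_and_clear_bit() before waking the queue. A stripped-down sketch of that flag-plus-waitqueue pattern; my_flags, MY_WAIT_BIT and the two functions are placeholders, not cgroup's real names:

    #include <linux/wait.h>
    #include <linux/sched.h>
    #include <linux/bitops.h>

    static DECLARE_WAIT_QUEUE_HEAD(my_waitq);
    static unsigned long my_flags;          /* placeholder flag word */
    #define MY_WAIT_BIT 0

    static void waker(void)
    {
            /* Clear-and-test so only a still-flagged waiter gets woken */
            if (test_and_clear_bit(MY_WAIT_BIT, &my_flags))
                    wake_up_all(&my_waitq);
    }

    static void sleeper(void)
    {
            DEFINE_WAIT(wait);

            set_bit(MY_WAIT_BIT, &my_flags);
            prepare_to_wait(&my_waitq, &wait, TASK_INTERRUPTIBLE);
            /*
             * The waker may have fired between set_bit() and
             * prepare_to_wait(); only sleep if the flag is still set.
             */
            if (test_bit(MY_WAIT_BIT, &my_flags))
                    schedule();
            finish_wait(&my_waitq, &wait);
            clear_bit(MY_WAIT_BIT, &my_flags);
    }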
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 8ce10043e4ac..6ba0f1ecb212 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -401,6 +401,7 @@ int disable_nonboot_cpus(void)
401 break; 401 break;
402 } 402 }
403 } 403 }
404
404 if (!error) { 405 if (!error) {
405 BUG_ON(num_online_cpus() > 1); 406 BUG_ON(num_online_cpus() > 1);
406 /* Make sure the CPUs won't be enabled by someone else */ 407 /* Make sure the CPUs won't be enabled by someone else */
@@ -413,6 +414,14 @@ int disable_nonboot_cpus(void)
413 return error; 414 return error;
414} 415}
415 416
417void __weak arch_enable_nonboot_cpus_begin(void)
418{
419}
420
421void __weak arch_enable_nonboot_cpus_end(void)
422{
423}
424
416void __ref enable_nonboot_cpus(void) 425void __ref enable_nonboot_cpus(void)
417{ 426{
418 int cpu, error; 427 int cpu, error;
@@ -424,6 +433,9 @@ void __ref enable_nonboot_cpus(void)
424 goto out; 433 goto out;
425 434
426 printk("Enabling non-boot CPUs ...\n"); 435 printk("Enabling non-boot CPUs ...\n");
436
437 arch_enable_nonboot_cpus_begin();
438
427 for_each_cpu(cpu, frozen_cpus) { 439 for_each_cpu(cpu, frozen_cpus) {
428 error = _cpu_up(cpu, 1); 440 error = _cpu_up(cpu, 1);
429 if (!error) { 441 if (!error) {
@@ -432,6 +444,9 @@ void __ref enable_nonboot_cpus(void)
432 } 444 }
433 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); 445 printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
434 } 446 }
447
448 arch_enable_nonboot_cpus_end();
449
435 cpumask_clear(frozen_cpus); 450 cpumask_clear(frozen_cpus);
436out: 451out:
437 cpu_maps_update_done(); 452 cpu_maps_update_done();
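The cpu.c hunk adds arch_enable_nonboot_cpus_begin()/_end() as __weak no-ops, letting an architecture substitute its own strong definitions at link time without any #ifdef in the generic resume path. A bare-bones illustration of the weak-hook idiom; the hook name here is invented:

    #include <linux/kernel.h>

    /*
     * Generic code: an empty __weak default. An architecture that needs to
     * do real work provides a strong definition of the same symbol in its
     * own object file, and the linker prefers that one.
     */
    void __weak arch_example_hook(void)
    {
    }

    void generic_path(void)
    {
            arch_example_hook();    /* the stub above, or an arch override */
            pr_info("generic path done\n");
    }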
diff --git a/kernel/cred.c b/kernel/cred.c
index 1bb4d7e5d616..d7f7a01082eb 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -18,6 +18,18 @@
18#include <linux/cn_proc.h> 18#include <linux/cn_proc.h>
19#include "cred-internals.h" 19#include "cred-internals.h"
20 20
21#if 0
22#define kdebug(FMT, ...) \
23 printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
24#else
25static inline __attribute__((format(printf, 1, 2)))
26void no_printk(const char *fmt, ...)
27{
28}
29#define kdebug(FMT, ...) \
30 no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__)
31#endif
32
21static struct kmem_cache *cred_jar; 33static struct kmem_cache *cred_jar;
22 34
23/* 35/*
@@ -36,6 +48,10 @@ static struct thread_group_cred init_tgcred = {
36 */ 48 */
37struct cred init_cred = { 49struct cred init_cred = {
38 .usage = ATOMIC_INIT(4), 50 .usage = ATOMIC_INIT(4),
51#ifdef CONFIG_DEBUG_CREDENTIALS
52 .subscribers = ATOMIC_INIT(2),
53 .magic = CRED_MAGIC,
54#endif
39 .securebits = SECUREBITS_DEFAULT, 55 .securebits = SECUREBITS_DEFAULT,
40 .cap_inheritable = CAP_INIT_INH_SET, 56 .cap_inheritable = CAP_INIT_INH_SET,
41 .cap_permitted = CAP_FULL_SET, 57 .cap_permitted = CAP_FULL_SET,
@@ -48,6 +64,31 @@ struct cred init_cred = {
48#endif 64#endif
49}; 65};
50 66
67static inline void set_cred_subscribers(struct cred *cred, int n)
68{
69#ifdef CONFIG_DEBUG_CREDENTIALS
70 atomic_set(&cred->subscribers, n);
71#endif
72}
73
74static inline int read_cred_subscribers(const struct cred *cred)
75{
76#ifdef CONFIG_DEBUG_CREDENTIALS
77 return atomic_read(&cred->subscribers);
78#else
79 return 0;
80#endif
81}
82
83static inline void alter_cred_subscribers(const struct cred *_cred, int n)
84{
85#ifdef CONFIG_DEBUG_CREDENTIALS
86 struct cred *cred = (struct cred *) _cred;
87
88 atomic_add(n, &cred->subscribers);
89#endif
90}
91
51/* 92/*
52 * Dispose of the shared task group credentials 93 * Dispose of the shared task group credentials
53 */ 94 */
@@ -85,15 +126,29 @@ static void put_cred_rcu(struct rcu_head *rcu)
85{ 126{
86 struct cred *cred = container_of(rcu, struct cred, rcu); 127 struct cred *cred = container_of(rcu, struct cred, rcu);
87 128
129 kdebug("put_cred_rcu(%p)", cred);
130
131#ifdef CONFIG_DEBUG_CREDENTIALS
132 if (cred->magic != CRED_MAGIC_DEAD ||
133 atomic_read(&cred->usage) != 0 ||
134 read_cred_subscribers(cred) != 0)
135 panic("CRED: put_cred_rcu() sees %p with"
136 " mag %x, put %p, usage %d, subscr %d\n",
137 cred, cred->magic, cred->put_addr,
138 atomic_read(&cred->usage),
139 read_cred_subscribers(cred));
140#else
88 if (atomic_read(&cred->usage) != 0) 141 if (atomic_read(&cred->usage) != 0)
89 panic("CRED: put_cred_rcu() sees %p with usage %d\n", 142 panic("CRED: put_cred_rcu() sees %p with usage %d\n",
90 cred, atomic_read(&cred->usage)); 143 cred, atomic_read(&cred->usage));
144#endif
91 145
92 security_cred_free(cred); 146 security_cred_free(cred);
93 key_put(cred->thread_keyring); 147 key_put(cred->thread_keyring);
94 key_put(cred->request_key_auth); 148 key_put(cred->request_key_auth);
95 release_tgcred(cred); 149 release_tgcred(cred);
96 put_group_info(cred->group_info); 150 if (cred->group_info)
151 put_group_info(cred->group_info);
97 free_uid(cred->user); 152 free_uid(cred->user);
98 kmem_cache_free(cred_jar, cred); 153 kmem_cache_free(cred_jar, cred);
99} 154}
@@ -106,12 +161,90 @@ static void put_cred_rcu(struct rcu_head *rcu)
106 */ 161 */
107void __put_cred(struct cred *cred) 162void __put_cred(struct cred *cred)
108{ 163{
164 kdebug("__put_cred(%p{%d,%d})", cred,
165 atomic_read(&cred->usage),
166 read_cred_subscribers(cred));
167
109 BUG_ON(atomic_read(&cred->usage) != 0); 168 BUG_ON(atomic_read(&cred->usage) != 0);
169#ifdef CONFIG_DEBUG_CREDENTIALS
170 BUG_ON(read_cred_subscribers(cred) != 0);
171 cred->magic = CRED_MAGIC_DEAD;
172 cred->put_addr = __builtin_return_address(0);
173#endif
174 BUG_ON(cred == current->cred);
175 BUG_ON(cred == current->real_cred);
110 176
111 call_rcu(&cred->rcu, put_cred_rcu); 177 call_rcu(&cred->rcu, put_cred_rcu);
112} 178}
113EXPORT_SYMBOL(__put_cred); 179EXPORT_SYMBOL(__put_cred);
114 180
181/*
182 * Clean up a task's credentials when it exits
183 */
184void exit_creds(struct task_struct *tsk)
185{
186 struct cred *cred;
187
188 kdebug("exit_creds(%u,%p,%p,{%d,%d})", tsk->pid, tsk->real_cred, tsk->cred,
189 atomic_read(&tsk->cred->usage),
190 read_cred_subscribers(tsk->cred));
191
192 cred = (struct cred *) tsk->real_cred;
193 tsk->real_cred = NULL;
194 validate_creds(cred);
195 alter_cred_subscribers(cred, -1);
196 put_cred(cred);
197
198 cred = (struct cred *) tsk->cred;
199 tsk->cred = NULL;
200 validate_creds(cred);
201 alter_cred_subscribers(cred, -1);
202 put_cred(cred);
203
204 cred = (struct cred *) tsk->replacement_session_keyring;
205 if (cred) {
206 tsk->replacement_session_keyring = NULL;
207 validate_creds(cred);
208 put_cred(cred);
209 }
210}
211
212/*
213 * Allocate blank credentials, such that the credentials can be filled in at a
214 * later date without risk of ENOMEM.
215 */
216struct cred *cred_alloc_blank(void)
217{
218 struct cred *new;
219
220 new = kmem_cache_zalloc(cred_jar, GFP_KERNEL);
221 if (!new)
222 return NULL;
223
224#ifdef CONFIG_KEYS
225 new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL);
226 if (!new->tgcred) {
227 kfree(new);
228 return NULL;
229 }
230 atomic_set(&new->tgcred->usage, 1);
231#endif
232
233 atomic_set(&new->usage, 1);
234
235 if (security_cred_alloc_blank(new, GFP_KERNEL) < 0)
236 goto error;
237
238#ifdef CONFIG_DEBUG_CREDENTIALS
239 new->magic = CRED_MAGIC;
240#endif
241 return new;
242
243error:
244 abort_creds(new);
245 return NULL;
246}
247
115/** 248/**
116 * prepare_creds - Prepare a new set of credentials for modification 249 * prepare_creds - Prepare a new set of credentials for modification
117 * 250 *
@@ -132,16 +265,19 @@ struct cred *prepare_creds(void)
132 const struct cred *old; 265 const struct cred *old;
133 struct cred *new; 266 struct cred *new;
134 267
135 BUG_ON(atomic_read(&task->real_cred->usage) < 1); 268 validate_process_creds();
136 269
137 new = kmem_cache_alloc(cred_jar, GFP_KERNEL); 270 new = kmem_cache_alloc(cred_jar, GFP_KERNEL);
138 if (!new) 271 if (!new)
139 return NULL; 272 return NULL;
140 273
274 kdebug("prepare_creds() alloc %p", new);
275
141 old = task->cred; 276 old = task->cred;
142 memcpy(new, old, sizeof(struct cred)); 277 memcpy(new, old, sizeof(struct cred));
143 278
144 atomic_set(&new->usage, 1); 279 atomic_set(&new->usage, 1);
280 set_cred_subscribers(new, 0);
145 get_group_info(new->group_info); 281 get_group_info(new->group_info);
146 get_uid(new->user); 282 get_uid(new->user);
147 283
@@ -157,6 +293,7 @@ struct cred *prepare_creds(void)
157 293
158 if (security_prepare_creds(new, old, GFP_KERNEL) < 0) 294 if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
159 goto error; 295 goto error;
296 validate_creds(new);
160 return new; 297 return new;
161 298
162error: 299error:
@@ -229,9 +366,12 @@ struct cred *prepare_usermodehelper_creds(void)
229 if (!new) 366 if (!new)
230 return NULL; 367 return NULL;
231 368
369 kdebug("prepare_usermodehelper_creds() alloc %p", new);
370
232 memcpy(new, &init_cred, sizeof(struct cred)); 371 memcpy(new, &init_cred, sizeof(struct cred));
233 372
234 atomic_set(&new->usage, 1); 373 atomic_set(&new->usage, 1);
374 set_cred_subscribers(new, 0);
235 get_group_info(new->group_info); 375 get_group_info(new->group_info);
236 get_uid(new->user); 376 get_uid(new->user);
237 377
@@ -250,6 +390,7 @@ struct cred *prepare_usermodehelper_creds(void)
250#endif 390#endif
251 if (security_prepare_creds(new, &init_cred, GFP_ATOMIC) < 0) 391 if (security_prepare_creds(new, &init_cred, GFP_ATOMIC) < 0)
252 goto error; 392 goto error;
393 validate_creds(new);
253 394
254 BUG_ON(atomic_read(&new->usage) != 1); 395 BUG_ON(atomic_read(&new->usage) != 1);
255 return new; 396 return new;
@@ -286,6 +427,10 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
286 ) { 427 ) {
287 p->real_cred = get_cred(p->cred); 428 p->real_cred = get_cred(p->cred);
288 get_cred(p->cred); 429 get_cred(p->cred);
430 alter_cred_subscribers(p->cred, 2);
431 kdebug("share_creds(%p{%d,%d})",
432 p->cred, atomic_read(&p->cred->usage),
433 read_cred_subscribers(p->cred));
289 atomic_inc(&p->cred->user->processes); 434 atomic_inc(&p->cred->user->processes);
290 return 0; 435 return 0;
291 } 436 }
@@ -331,6 +476,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
331 476
332 atomic_inc(&new->user->processes); 477 atomic_inc(&new->user->processes);
333 p->cred = p->real_cred = get_cred(new); 478 p->cred = p->real_cred = get_cred(new);
479 alter_cred_subscribers(new, 2);
480 validate_creds(new);
334 return 0; 481 return 0;
335 482
336error_put: 483error_put:
@@ -355,13 +502,20 @@ error_put:
355int commit_creds(struct cred *new) 502int commit_creds(struct cred *new)
356{ 503{
357 struct task_struct *task = current; 504 struct task_struct *task = current;
358 const struct cred *old; 505 const struct cred *old = task->real_cred;
359 506
360 BUG_ON(task->cred != task->real_cred); 507 kdebug("commit_creds(%p{%d,%d})", new,
361 BUG_ON(atomic_read(&task->real_cred->usage) < 2); 508 atomic_read(&new->usage),
509 read_cred_subscribers(new));
510
511 BUG_ON(task->cred != old);
512#ifdef CONFIG_DEBUG_CREDENTIALS
513 BUG_ON(read_cred_subscribers(old) < 2);
514 validate_creds(old);
515 validate_creds(new);
516#endif
362 BUG_ON(atomic_read(&new->usage) < 1); 517 BUG_ON(atomic_read(&new->usage) < 1);
363 518
364 old = task->real_cred;
365 security_commit_creds(new, old); 519 security_commit_creds(new, old);
366 520
367 get_cred(new); /* we will require a ref for the subj creds too */ 521 get_cred(new); /* we will require a ref for the subj creds too */
@@ -390,12 +544,14 @@ int commit_creds(struct cred *new)
390 * cheaply with the new uid cache, so if it matters 544 * cheaply with the new uid cache, so if it matters
391 * we should be checking for it. -DaveM 545 * we should be checking for it. -DaveM
392 */ 546 */
547 alter_cred_subscribers(new, 2);
393 if (new->user != old->user) 548 if (new->user != old->user)
394 atomic_inc(&new->user->processes); 549 atomic_inc(&new->user->processes);
395 rcu_assign_pointer(task->real_cred, new); 550 rcu_assign_pointer(task->real_cred, new);
396 rcu_assign_pointer(task->cred, new); 551 rcu_assign_pointer(task->cred, new);
397 if (new->user != old->user) 552 if (new->user != old->user)
398 atomic_dec(&old->user->processes); 553 atomic_dec(&old->user->processes);
554 alter_cred_subscribers(old, -2);
399 555
400 sched_switch_user(task); 556 sched_switch_user(task);
401 557
@@ -428,6 +584,13 @@ EXPORT_SYMBOL(commit_creds);
428 */ 584 */
429void abort_creds(struct cred *new) 585void abort_creds(struct cred *new)
430{ 586{
587 kdebug("abort_creds(%p{%d,%d})", new,
588 atomic_read(&new->usage),
589 read_cred_subscribers(new));
590
591#ifdef CONFIG_DEBUG_CREDENTIALS
592 BUG_ON(read_cred_subscribers(new) != 0);
593#endif
431 BUG_ON(atomic_read(&new->usage) < 1); 594 BUG_ON(atomic_read(&new->usage) < 1);
432 put_cred(new); 595 put_cred(new);
433} 596}
@@ -444,7 +607,20 @@ const struct cred *override_creds(const struct cred *new)
444{ 607{
445 const struct cred *old = current->cred; 608 const struct cred *old = current->cred;
446 609
447 rcu_assign_pointer(current->cred, get_cred(new)); 610 kdebug("override_creds(%p{%d,%d})", new,
611 atomic_read(&new->usage),
612 read_cred_subscribers(new));
613
614 validate_creds(old);
615 validate_creds(new);
616 get_cred(new);
617 alter_cred_subscribers(new, 1);
618 rcu_assign_pointer(current->cred, new);
619 alter_cred_subscribers(old, -1);
620
621 kdebug("override_creds() = %p{%d,%d}", old,
622 atomic_read(&old->usage),
623 read_cred_subscribers(old));
448 return old; 624 return old;
449} 625}
450EXPORT_SYMBOL(override_creds); 626EXPORT_SYMBOL(override_creds);
@@ -460,7 +636,15 @@ void revert_creds(const struct cred *old)
460{ 636{
461 const struct cred *override = current->cred; 637 const struct cred *override = current->cred;
462 638
639 kdebug("revert_creds(%p{%d,%d})", old,
640 atomic_read(&old->usage),
641 read_cred_subscribers(old));
642
643 validate_creds(old);
644 validate_creds(override);
645 alter_cred_subscribers(old, 1);
463 rcu_assign_pointer(current->cred, old); 646 rcu_assign_pointer(current->cred, old);
647 alter_cred_subscribers(override, -1);
464 put_cred(override); 648 put_cred(override);
465} 649}
466EXPORT_SYMBOL(revert_creds); 650EXPORT_SYMBOL(revert_creds);
@@ -502,11 +686,15 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
502 if (!new) 686 if (!new)
503 return NULL; 687 return NULL;
504 688
689 kdebug("prepare_kernel_cred() alloc %p", new);
690
505 if (daemon) 691 if (daemon)
506 old = get_task_cred(daemon); 692 old = get_task_cred(daemon);
507 else 693 else
508 old = get_cred(&init_cred); 694 old = get_cred(&init_cred);
509 695
696 validate_creds(old);
697
510 *new = *old; 698 *new = *old;
511 get_uid(new->user); 699 get_uid(new->user);
512 get_group_info(new->group_info); 700 get_group_info(new->group_info);
@@ -526,7 +714,9 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
526 goto error; 714 goto error;
527 715
528 atomic_set(&new->usage, 1); 716 atomic_set(&new->usage, 1);
717 set_cred_subscribers(new, 0);
529 put_cred(old); 718 put_cred(old);
719 validate_creds(new);
530 return new; 720 return new;
531 721
532error: 722error:
@@ -589,3 +779,95 @@ int set_create_files_as(struct cred *new, struct inode *inode)
589 return security_kernel_create_files_as(new, inode); 779 return security_kernel_create_files_as(new, inode);
590} 780}
591EXPORT_SYMBOL(set_create_files_as); 781EXPORT_SYMBOL(set_create_files_as);
782
783#ifdef CONFIG_DEBUG_CREDENTIALS
784
785/*
786 * dump invalid credentials
787 */
788static void dump_invalid_creds(const struct cred *cred, const char *label,
789 const struct task_struct *tsk)
790{
791 printk(KERN_ERR "CRED: %s credentials: %p %s%s%s\n",
792 label, cred,
793 cred == &init_cred ? "[init]" : "",
794 cred == tsk->real_cred ? "[real]" : "",
795 cred == tsk->cred ? "[eff]" : "");
796 printk(KERN_ERR "CRED: ->magic=%x, put_addr=%p\n",
797 cred->magic, cred->put_addr);
798 printk(KERN_ERR "CRED: ->usage=%d, subscr=%d\n",
799 atomic_read(&cred->usage),
800 read_cred_subscribers(cred));
801 printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n",
802 cred->uid, cred->euid, cred->suid, cred->fsuid);
803 printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n",
804 cred->gid, cred->egid, cred->sgid, cred->fsgid);
805#ifdef CONFIG_SECURITY
806 printk(KERN_ERR "CRED: ->security is %p\n", cred->security);
807 if ((unsigned long) cred->security >= PAGE_SIZE &&
808 (((unsigned long) cred->security & 0xffffff00) !=
809 (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8)))
810 printk(KERN_ERR "CRED: ->security {%x, %x}\n",
811 ((u32*)cred->security)[0],
812 ((u32*)cred->security)[1]);
813#endif
814}
815
816/*
817 * report use of invalid credentials
818 */
819void __invalid_creds(const struct cred *cred, const char *file, unsigned line)
820{
821 printk(KERN_ERR "CRED: Invalid credentials\n");
822 printk(KERN_ERR "CRED: At %s:%u\n", file, line);
823 dump_invalid_creds(cred, "Specified", current);
824 BUG();
825}
826EXPORT_SYMBOL(__invalid_creds);
827
828/*
829 * check the credentials on a process
830 */
831void __validate_process_creds(struct task_struct *tsk,
832 const char *file, unsigned line)
833{
834 if (tsk->cred == tsk->real_cred) {
835 if (unlikely(read_cred_subscribers(tsk->cred) < 2 ||
836 creds_are_invalid(tsk->cred)))
837 goto invalid_creds;
838 } else {
839 if (unlikely(read_cred_subscribers(tsk->real_cred) < 1 ||
840 read_cred_subscribers(tsk->cred) < 1 ||
841 creds_are_invalid(tsk->real_cred) ||
842 creds_are_invalid(tsk->cred)))
843 goto invalid_creds;
844 }
845 return;
846
847invalid_creds:
848 printk(KERN_ERR "CRED: Invalid process credentials\n");
849 printk(KERN_ERR "CRED: At %s:%u\n", file, line);
850
851 dump_invalid_creds(tsk->real_cred, "Real", tsk);
852 if (tsk->cred != tsk->real_cred)
853 dump_invalid_creds(tsk->cred, "Effective", tsk);
854 else
855 printk(KERN_ERR "CRED: Effective creds == Real creds\n");
856 BUG();
857}
858EXPORT_SYMBOL(__validate_process_creds);
859
860/*
861 * check creds for do_exit()
862 */
863void validate_creds_for_do_exit(struct task_struct *tsk)
864{
865 kdebug("validate_creds_for_do_exit(%p,%p{%d,%d})",
866 tsk->real_cred, tsk->cred,
867 atomic_read(&tsk->cred->usage),
868 read_cred_subscribers(tsk->cred));
869
870 __validate_process_creds(tsk, __FILE__, __LINE__);
871}
872
873#endif /* CONFIG_DEBUG_CREDENTIALS */
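Two things in the cred.c hunk are worth calling out. The CONFIG_DEBUG_CREDENTIALS plumbing keeps a subscribers count alongside the usage count so validate_creds() can catch over- and under-puts, and the kdebug() macro compiles away through a dummy no_printk() that still carries the printf format attribute, so format-string mistakes are caught even when the output is disabled. A condensed sketch of that second idiom, with invented names (MYDEBUG, my_dbg):

    #include <linux/kernel.h>
    #include <linux/sched.h>

    #ifdef MYDEBUG
    #define my_dbg(fmt, ...) \
            printk(KERN_DEBUG "[%s] " fmt "\n", current->comm, ##__VA_ARGS__)
    #else
    /* Generates no output, but keeps printf-style argument checking alive */
    static inline __attribute__((format(printf, 1, 2)))
    void my_no_printk(const char *fmt, ...)
    {
    }
    #define my_dbg(fmt, ...) \
            my_no_printk("[%s] " fmt "\n", current->comm, ##__VA_ARGS__)
    #endif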
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index abb6e17505e2..ead9b610aa71 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -15,6 +15,7 @@
15 15
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/taskstats.h>
18#include <linux/time.h> 19#include <linux/time.h>
19#include <linux/sysctl.h> 20#include <linux/sysctl.h>
20#include <linux/delayacct.h> 21#include <linux/delayacct.h>
diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c
deleted file mode 100644
index 962a3b574f21..000000000000
--- a/kernel/dma-coherent.c
+++ /dev/null
@@ -1,176 +0,0 @@
1/*
2 * Coherent per-device memory handling.
3 * Borrowed from i386
4 */
5#include <linux/kernel.h>
6#include <linux/dma-mapping.h>
7
8struct dma_coherent_mem {
9 void *virt_base;
10 u32 device_base;
11 int size;
12 int flags;
13 unsigned long *bitmap;
14};
15
16int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
17 dma_addr_t device_addr, size_t size, int flags)
18{
19 void __iomem *mem_base = NULL;
20 int pages = size >> PAGE_SHIFT;
21 int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
22
23 if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
24 goto out;
25 if (!size)
26 goto out;
27 if (dev->dma_mem)
28 goto out;
29
30 /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
31
32 mem_base = ioremap(bus_addr, size);
33 if (!mem_base)
34 goto out;
35
36 dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
37 if (!dev->dma_mem)
38 goto out;
39 dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
40 if (!dev->dma_mem->bitmap)
41 goto free1_out;
42
43 dev->dma_mem->virt_base = mem_base;
44 dev->dma_mem->device_base = device_addr;
45 dev->dma_mem->size = pages;
46 dev->dma_mem->flags = flags;
47
48 if (flags & DMA_MEMORY_MAP)
49 return DMA_MEMORY_MAP;
50
51 return DMA_MEMORY_IO;
52
53 free1_out:
54 kfree(dev->dma_mem);
55 out:
56 if (mem_base)
57 iounmap(mem_base);
58 return 0;
59}
60EXPORT_SYMBOL(dma_declare_coherent_memory);
61
62void dma_release_declared_memory(struct device *dev)
63{
64 struct dma_coherent_mem *mem = dev->dma_mem;
65
66 if (!mem)
67 return;
68 dev->dma_mem = NULL;
69 iounmap(mem->virt_base);
70 kfree(mem->bitmap);
71 kfree(mem);
72}
73EXPORT_SYMBOL(dma_release_declared_memory);
74
75void *dma_mark_declared_memory_occupied(struct device *dev,
76 dma_addr_t device_addr, size_t size)
77{
78 struct dma_coherent_mem *mem = dev->dma_mem;
79 int pos, err;
80
81 size += device_addr & ~PAGE_MASK;
82
83 if (!mem)
84 return ERR_PTR(-EINVAL);
85
86 pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
87 err = bitmap_allocate_region(mem->bitmap, pos, get_order(size));
88 if (err != 0)
89 return ERR_PTR(err);
90 return mem->virt_base + (pos << PAGE_SHIFT);
91}
92EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
93
94/**
95 * dma_alloc_from_coherent() - try to allocate memory from the per-device coherent area
96 *
97 * @dev: device from which we allocate memory
98 * @size: size of requested memory area
99 * @dma_handle: This will be filled with the correct dma handle
100 * @ret: This pointer will be filled with the virtual address
101 * to allocated area.
102 *
103 * This function should be only called from per-arch dma_alloc_coherent()
104 * to support allocation from per-device coherent memory pools.
105 *
106 * Returns 0 if dma_alloc_coherent should continue with allocating from
107 * generic memory areas, or !0 if dma_alloc_coherent should return @ret.
108 */
109int dma_alloc_from_coherent(struct device *dev, ssize_t size,
110 dma_addr_t *dma_handle, void **ret)
111{
112 struct dma_coherent_mem *mem;
113 int order = get_order(size);
114 int pageno;
115
116 if (!dev)
117 return 0;
118 mem = dev->dma_mem;
119 if (!mem)
120 return 0;
121
122 *ret = NULL;
123
124 if (unlikely(size > (mem->size << PAGE_SHIFT)))
125 goto err;
126
127 pageno = bitmap_find_free_region(mem->bitmap, mem->size, order);
128 if (unlikely(pageno < 0))
129 goto err;
130
131 /*
132 * Memory was found in the per-device area.
133 */
134 *dma_handle = mem->device_base + (pageno << PAGE_SHIFT);
135 *ret = mem->virt_base + (pageno << PAGE_SHIFT);
136 memset(*ret, 0, size);
137
138 return 1;
139
140err:
141 /*
142 * In the case where the allocation can not be satisfied from the
143 * per-device area, try to fall back to generic memory if the
144 * constraints allow it.
145 */
146 return mem->flags & DMA_MEMORY_EXCLUSIVE;
147}
148EXPORT_SYMBOL(dma_alloc_from_coherent);
149
150/**
151 * dma_release_from_coherent() - try to free the memory allocated from per-device coherent memory pool
152 * @dev: device from which the memory was allocated
153 * @order: the order of pages allocated
154 * @vaddr: virtual address of allocated pages
155 *
156 * This checks whether the memory was allocated from the per-device
157 * coherent memory pool and if so, releases that memory.
158 *
159 * Returns 1 if we correctly released the memory, or 0 if
160 * dma_release_coherent() should proceed with releasing memory from
161 * generic pools.
162 */
163int dma_release_from_coherent(struct device *dev, int order, void *vaddr)
164{
165 struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
166
167 if (mem && vaddr >= mem->virt_base && vaddr <
168 (mem->virt_base + (mem->size << PAGE_SHIFT))) {
169 int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
170
171 bitmap_release_region(mem->bitmap, page, order);
172 return 1;
173 }
174 return 0;
175}
176EXPORT_SYMBOL(dma_release_from_coherent);
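The deletion above is a relocation rather than lost functionality: in this timeframe the per-device coherent DMA pool code moves out of kernel/ and continues under drivers/base/. Its allocator rests on the generic bitmap-region helpers, which hand out naturally aligned, power-of-two-sized runs of bits. A small sketch of just that allocation step, with a made-up pool:

    #include <linux/bitmap.h>

    #define POOL_PAGES 64                   /* made-up pool size, in pages */
    static DECLARE_BITMAP(pool_bitmap, POOL_PAGES);

    /* Reserve 2^order contiguous pages' worth of bits; first page index or <0 */
    static int pool_alloc_pages(int order)
    {
            return bitmap_find_free_region(pool_bitmap, POOL_PAGES, order);
    }

    static void pool_free_pages(int pageno, int order)
    {
            bitmap_release_region(pool_bitmap, pageno, order);
    }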
diff --git a/kernel/exit.c b/kernel/exit.c
index 628d41f0dd54..e47ee8a06135 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -12,7 +12,6 @@
12#include <linux/completion.h> 12#include <linux/completion.h>
13#include <linux/personality.h> 13#include <linux/personality.h>
14#include <linux/tty.h> 14#include <linux/tty.h>
15#include <linux/mnt_namespace.h>
16#include <linux/iocontext.h> 15#include <linux/iocontext.h>
17#include <linux/key.h> 16#include <linux/key.h>
18#include <linux/security.h> 17#include <linux/security.h>
@@ -48,7 +47,7 @@
48#include <linux/tracehook.h> 47#include <linux/tracehook.h>
49#include <linux/fs_struct.h> 48#include <linux/fs_struct.h>
50#include <linux/init_task.h> 49#include <linux/init_task.h>
51#include <linux/perf_counter.h> 50#include <linux/perf_event.h>
52#include <trace/events/sched.h> 51#include <trace/events/sched.h>
53 52
54#include <asm/uaccess.h> 53#include <asm/uaccess.h>
@@ -155,8 +154,8 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
155{ 154{
156 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); 155 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
157 156
158#ifdef CONFIG_PERF_COUNTERS 157#ifdef CONFIG_PERF_EVENTS
159 WARN_ON_ONCE(tsk->perf_counter_ctxp); 158 WARN_ON_ONCE(tsk->perf_event_ctxp);
160#endif 159#endif
161 trace_sched_process_free(tsk); 160 trace_sched_process_free(tsk);
162 put_task_struct(tsk); 161 put_task_struct(tsk);
@@ -902,6 +901,8 @@ NORET_TYPE void do_exit(long code)
902 901
903 tracehook_report_exit(&code); 902 tracehook_report_exit(&code);
904 903
904 validate_creds_for_do_exit(tsk);
905
905 /* 906 /*
906 * We're taking recursive faults here in do_exit. Safest is to just 907 * We're taking recursive faults here in do_exit. Safest is to just
907 * leave this task alone and wait for reboot. 908 * leave this task alone and wait for reboot.
@@ -980,7 +981,7 @@ NORET_TYPE void do_exit(long code)
980 * Flush inherited counters to the parent - before the parent 981 * Flush inherited counters to the parent - before the parent
981 * gets woken up by child-exit notifications. 982 * gets woken up by child-exit notifications.
982 */ 983 */
983 perf_counter_exit_task(tsk); 984 perf_event_exit_task(tsk);
984 985
985 exit_notify(tsk, group_dead); 986 exit_notify(tsk, group_dead);
986#ifdef CONFIG_NUMA 987#ifdef CONFIG_NUMA
@@ -1010,7 +1011,10 @@ NORET_TYPE void do_exit(long code)
1010 if (tsk->splice_pipe) 1011 if (tsk->splice_pipe)
1011 __free_pipe_info(tsk->splice_pipe); 1012 __free_pipe_info(tsk->splice_pipe);
1012 1013
1014 validate_creds_for_do_exit(tsk);
1015
1013 preempt_disable(); 1016 preempt_disable();
1017 exit_rcu();
1014 /* causes final put_task_struct in finish_task_switch(). */ 1018 /* causes final put_task_struct in finish_task_switch(). */
1015 tsk->state = TASK_DEAD; 1019 tsk->state = TASK_DEAD;
1016 schedule(); 1020 schedule();
diff --git a/kernel/fork.c b/kernel/fork.c
index 467746b3f0aa..1020977b57ca 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -17,7 +17,6 @@
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/vmalloc.h> 18#include <linux/vmalloc.h>
19#include <linux/completion.h> 19#include <linux/completion.h>
20#include <linux/mnt_namespace.h>
21#include <linux/personality.h> 20#include <linux/personality.h>
22#include <linux/mempolicy.h> 21#include <linux/mempolicy.h>
23#include <linux/sem.h> 22#include <linux/sem.h>
@@ -50,6 +49,7 @@
50#include <linux/ftrace.h> 49#include <linux/ftrace.h>
51#include <linux/profile.h> 50#include <linux/profile.h>
52#include <linux/rmap.h> 51#include <linux/rmap.h>
52#include <linux/ksm.h>
53#include <linux/acct.h> 53#include <linux/acct.h>
54#include <linux/tsacct_kern.h> 54#include <linux/tsacct_kern.h>
55#include <linux/cn_proc.h> 55#include <linux/cn_proc.h>
@@ -62,7 +62,7 @@
62#include <linux/blkdev.h> 62#include <linux/blkdev.h>
63#include <linux/fs_struct.h> 63#include <linux/fs_struct.h>
64#include <linux/magic.h> 64#include <linux/magic.h>
65#include <linux/perf_counter.h> 65#include <linux/perf_event.h>
66 66
67#include <asm/pgtable.h> 67#include <asm/pgtable.h>
68#include <asm/pgalloc.h> 68#include <asm/pgalloc.h>
@@ -137,9 +137,17 @@ struct kmem_cache *vm_area_cachep;
137/* SLAB cache for mm_struct structures (tsk->mm) */ 137/* SLAB cache for mm_struct structures (tsk->mm) */
138static struct kmem_cache *mm_cachep; 138static struct kmem_cache *mm_cachep;
139 139
140static void account_kernel_stack(struct thread_info *ti, int account)
141{
142 struct zone *zone = page_zone(virt_to_page(ti));
143
144 mod_zone_page_state(zone, NR_KERNEL_STACK, account);
145}
146
140void free_task(struct task_struct *tsk) 147void free_task(struct task_struct *tsk)
141{ 148{
142 prop_local_destroy_single(&tsk->dirties); 149 prop_local_destroy_single(&tsk->dirties);
150 account_kernel_stack(tsk->stack, -1);
143 free_thread_info(tsk->stack); 151 free_thread_info(tsk->stack);
144 rt_mutex_debug_task_free(tsk); 152 rt_mutex_debug_task_free(tsk);
145 ftrace_graph_exit_task(tsk); 153 ftrace_graph_exit_task(tsk);
@@ -153,8 +161,7 @@ void __put_task_struct(struct task_struct *tsk)
153 WARN_ON(atomic_read(&tsk->usage)); 161 WARN_ON(atomic_read(&tsk->usage));
154 WARN_ON(tsk == current); 162 WARN_ON(tsk == current);
155 163
156 put_cred(tsk->real_cred); 164 exit_creds(tsk);
157 put_cred(tsk->cred);
158 delayacct_tsk_free(tsk); 165 delayacct_tsk_free(tsk);
159 166
160 if (!profile_handoff_task(tsk)) 167 if (!profile_handoff_task(tsk))
@@ -255,6 +262,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
255 tsk->btrace_seq = 0; 262 tsk->btrace_seq = 0;
256#endif 263#endif
257 tsk->splice_pipe = NULL; 264 tsk->splice_pipe = NULL;
265
266 account_kernel_stack(ti, 1);
267
258 return tsk; 268 return tsk;
259 269
260out: 270out:
@@ -290,6 +300,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
290 rb_link = &mm->mm_rb.rb_node; 300 rb_link = &mm->mm_rb.rb_node;
291 rb_parent = NULL; 301 rb_parent = NULL;
292 pprev = &mm->mmap; 302 pprev = &mm->mmap;
303 retval = ksm_fork(mm, oldmm);
304 if (retval)
305 goto out;
293 306
294 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { 307 for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
295 struct file *file; 308 struct file *file;
@@ -426,7 +439,8 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
426 atomic_set(&mm->mm_count, 1); 439 atomic_set(&mm->mm_count, 1);
427 init_rwsem(&mm->mmap_sem); 440 init_rwsem(&mm->mmap_sem);
428 INIT_LIST_HEAD(&mm->mmlist); 441 INIT_LIST_HEAD(&mm->mmlist);
429 mm->flags = (current->mm) ? current->mm->flags : default_dump_filter; 442 mm->flags = (current->mm) ?
443 (current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
430 mm->core_state = NULL; 444 mm->core_state = NULL;
431 mm->nr_ptes = 0; 445 mm->nr_ptes = 0;
432 set_mm_counter(mm, file_rss, 0); 446 set_mm_counter(mm, file_rss, 0);
@@ -487,6 +501,7 @@ void mmput(struct mm_struct *mm)
487 501
488 if (atomic_dec_and_test(&mm->mm_users)) { 502 if (atomic_dec_and_test(&mm->mm_users)) {
489 exit_aio(mm); 503 exit_aio(mm);
504 ksm_exit(mm);
490 exit_mmap(mm); 505 exit_mmap(mm);
491 set_mm_exe_file(mm, NULL); 506 set_mm_exe_file(mm, NULL);
492 if (!list_empty(&mm->mmlist)) { 507 if (!list_empty(&mm->mmlist)) {
@@ -568,18 +583,18 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
568 * the value intact in a core dump, and to save the unnecessary 583 * the value intact in a core dump, and to save the unnecessary
569 * trouble otherwise. Userland only wants this done for a sys_exit. 584 * trouble otherwise. Userland only wants this done for a sys_exit.
570 */ 585 */
571 if (tsk->clear_child_tid 586 if (tsk->clear_child_tid) {
572 && !(tsk->flags & PF_SIGNALED) 587 if (!(tsk->flags & PF_SIGNALED) &&
573 && atomic_read(&mm->mm_users) > 1) { 588 atomic_read(&mm->mm_users) > 1) {
574 u32 __user * tidptr = tsk->clear_child_tid; 589 /*
590 * We don't check the error code - if userspace has
591 * not set up a proper pointer then tough luck.
592 */
593 put_user(0, tsk->clear_child_tid);
594 sys_futex(tsk->clear_child_tid, FUTEX_WAKE,
595 1, NULL, NULL, 0);
596 }
575 tsk->clear_child_tid = NULL; 597 tsk->clear_child_tid = NULL;
576
577 /*
578 * We don't check the error code - if userspace has
579 * not set up a proper pointer then tough luck.
580 */
581 put_user(0, tidptr);
582 sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL, 0);
583 } 598 }
584} 599}
585 600
@@ -816,11 +831,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
816{ 831{
817 struct signal_struct *sig; 832 struct signal_struct *sig;
818 833
819 if (clone_flags & CLONE_THREAD) { 834 if (clone_flags & CLONE_THREAD)
820 atomic_inc(&current->signal->count);
821 atomic_inc(&current->signal->live);
822 return 0; 835 return 0;
823 }
824 836
825 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); 837 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
826 tsk->signal = sig; 838 tsk->signal = sig;
@@ -868,6 +880,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
868 880
869 tty_audit_fork(sig); 881 tty_audit_fork(sig);
870 882
883 sig->oom_adj = current->signal->oom_adj;
884
871 return 0; 885 return 0;
872} 886}
873 887
@@ -878,16 +892,6 @@ void __cleanup_signal(struct signal_struct *sig)
878 kmem_cache_free(signal_cachep, sig); 892 kmem_cache_free(signal_cachep, sig);
879} 893}
880 894
881static void cleanup_signal(struct task_struct *tsk)
882{
883 struct signal_struct *sig = tsk->signal;
884
885 atomic_dec(&sig->live);
886
887 if (atomic_dec_and_test(&sig->count))
888 __cleanup_signal(sig);
889}
890
891static void copy_flags(unsigned long clone_flags, struct task_struct *p) 895static void copy_flags(unsigned long clone_flags, struct task_struct *p)
892{ 896{
893 unsigned long new_flags = p->flags; 897 unsigned long new_flags = p->flags;
@@ -1022,10 +1026,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1022 copy_flags(clone_flags, p); 1026 copy_flags(clone_flags, p);
1023 INIT_LIST_HEAD(&p->children); 1027 INIT_LIST_HEAD(&p->children);
1024 INIT_LIST_HEAD(&p->sibling); 1028 INIT_LIST_HEAD(&p->sibling);
1025#ifdef CONFIG_PREEMPT_RCU 1029 rcu_copy_process(p);
1026 p->rcu_read_lock_nesting = 0;
1027 p->rcu_flipctr_idx = 0;
1028#endif /* #ifdef CONFIG_PREEMPT_RCU */
1029 p->vfork_done = NULL; 1030 p->vfork_done = NULL;
1030 spin_lock_init(&p->alloc_lock); 1031 spin_lock_init(&p->alloc_lock);
1031 1032
@@ -1096,7 +1097,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1096 /* Perform scheduler related setup. Assign this task to a CPU. */ 1097 /* Perform scheduler related setup. Assign this task to a CPU. */
1097 sched_fork(p, clone_flags); 1098 sched_fork(p, clone_flags);
1098 1099
1099 retval = perf_counter_init_task(p); 1100 retval = perf_event_init_task(p);
1100 if (retval) 1101 if (retval)
1101 goto bad_fork_cleanup_policy; 1102 goto bad_fork_cleanup_policy;
1102 1103
@@ -1240,6 +1241,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1240 } 1241 }
1241 1242
1242 if (clone_flags & CLONE_THREAD) { 1243 if (clone_flags & CLONE_THREAD) {
1244 atomic_inc(&current->signal->count);
1245 atomic_inc(&current->signal->live);
1243 p->group_leader = current->group_leader; 1246 p->group_leader = current->group_leader;
1244 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); 1247 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
1245 } 1248 }
@@ -1269,6 +1272,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1269 write_unlock_irq(&tasklist_lock); 1272 write_unlock_irq(&tasklist_lock);
1270 proc_fork_connector(p); 1273 proc_fork_connector(p);
1271 cgroup_post_fork(p); 1274 cgroup_post_fork(p);
1275 perf_event_fork(p);
1272 return p; 1276 return p;
1273 1277
1274bad_fork_free_pid: 1278bad_fork_free_pid:
@@ -1282,7 +1286,8 @@ bad_fork_cleanup_mm:
1282 if (p->mm) 1286 if (p->mm)
1283 mmput(p->mm); 1287 mmput(p->mm);
1284bad_fork_cleanup_signal: 1288bad_fork_cleanup_signal:
1285 cleanup_signal(p); 1289 if (!(clone_flags & CLONE_THREAD))
1290 __cleanup_signal(p->signal);
1286bad_fork_cleanup_sighand: 1291bad_fork_cleanup_sighand:
1287 __cleanup_sighand(p->sighand); 1292 __cleanup_sighand(p->sighand);
1288bad_fork_cleanup_fs: 1293bad_fork_cleanup_fs:
@@ -1294,7 +1299,7 @@ bad_fork_cleanup_semundo:
1294bad_fork_cleanup_audit: 1299bad_fork_cleanup_audit:
1295 audit_free(p); 1300 audit_free(p);
1296bad_fork_cleanup_policy: 1301bad_fork_cleanup_policy:
1297 perf_counter_free_task(p); 1302 perf_event_free_task(p);
1298#ifdef CONFIG_NUMA 1303#ifdef CONFIG_NUMA
1299 mpol_put(p->mempolicy); 1304 mpol_put(p->mempolicy);
1300bad_fork_cleanup_cgroup: 1305bad_fork_cleanup_cgroup:
@@ -1307,8 +1312,7 @@ bad_fork_cleanup_put_domain:
1307 module_put(task_thread_info(p)->exec_domain->module); 1312 module_put(task_thread_info(p)->exec_domain->module);
1308bad_fork_cleanup_count: 1313bad_fork_cleanup_count:
1309 atomic_dec(&p->cred->user->processes); 1314 atomic_dec(&p->cred->user->processes);
1310 put_cred(p->real_cred); 1315 exit_creds(p);
1311 put_cred(p->cred);
1312bad_fork_free: 1316bad_fork_free:
1313 free_task(p); 1317 free_task(p);
1314fork_out: 1318fork_out:
@@ -1408,12 +1412,6 @@ long do_fork(unsigned long clone_flags,
1408 if (clone_flags & CLONE_VFORK) { 1412 if (clone_flags & CLONE_VFORK) {
1409 p->vfork_done = &vfork; 1413 p->vfork_done = &vfork;
1410 init_completion(&vfork); 1414 init_completion(&vfork);
1411 } else if (!(clone_flags & CLONE_VM)) {
1412 /*
1413 * vfork will do an exec which will call
1414 * set_task_comm()
1415 */
1416 perf_counter_fork(p);
1417 } 1415 }
1418 1416
1419 audit_finish_fork(p); 1417 audit_finish_fork(p);
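
The account_kernel_stack() pairing above (+1 in dup_task_struct(), -1 in free_task()) keeps the per-zone NR_KERNEL_STACK counter equal to the number of live kernel stacks. A minimal sketch of reading that counter back, assuming the usual vmstat accessor global_page_state(); the helper name is hypothetical:

    #include <linux/kernel.h>
    #include <linux/sched.h>
    #include <linux/vmstat.h>

    /* hypothetical debug helper: NR_KERNEL_STACK counts stacks, not pages */
    static void dbg_report_kernel_stacks(void)
    {
        unsigned long stacks = global_page_state(NR_KERNEL_STACK);

        printk(KERN_DEBUG "kernel stacks: %lu (%lu kB)\n",
               stacks, stacks * (THREAD_SIZE / 1024));
    }
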
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 2f4936cf7083..bd1d42b17cb2 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -44,12 +44,19 @@ void refrigerator(void)
44 recalc_sigpending(); /* We sent fake signal, clean it up */ 44 recalc_sigpending(); /* We sent fake signal, clean it up */
45 spin_unlock_irq(&current->sighand->siglock); 45 spin_unlock_irq(&current->sighand->siglock);
46 46
 47	/* prevent accounting of this task to the load average */

48 current->flags |= PF_FREEZING;
49
47 for (;;) { 50 for (;;) {
48 set_current_state(TASK_UNINTERRUPTIBLE); 51 set_current_state(TASK_UNINTERRUPTIBLE);
49 if (!frozen(current)) 52 if (!frozen(current))
50 break; 53 break;
51 schedule(); 54 schedule();
52 } 55 }
56
57 /* Remove the accounting blocker */
58 current->flags &= ~PF_FREEZING;
59
53 pr_debug("%s left refrigerator\n", current->comm); 60 pr_debug("%s left refrigerator\n", current->comm);
54 __set_current_state(save); 61 __set_current_state(save);
55} 62}
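
PF_FREEZING above brackets the uninterruptible sleep so that tasks parked in the refrigerator stop contributing to the load average; presumably the companion sched.h change makes the load-contribution test skip them. A sketch of that idea, with the caveat that the exact macro in this tree may differ:

    /* sketch: a frozen TASK_UNINTERRUPTIBLE sleeper should not count as load */
    #define task_contributes_to_load(task)              \
        ((task->state & TASK_UNINTERRUPTIBLE) != 0 &&   \
         (task->flags & PF_FREEZING) == 0)
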
diff --git a/kernel/futex.c b/kernel/futex.c
index 794c862125fe..248dd119a86e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -115,6 +115,9 @@ struct futex_q {
115 /* rt_waiter storage for requeue_pi: */ 115 /* rt_waiter storage for requeue_pi: */
116 struct rt_mutex_waiter *rt_waiter; 116 struct rt_mutex_waiter *rt_waiter;
117 117
118 /* The expected requeue pi target futex key: */
119 union futex_key *requeue_pi_key;
120
118 /* Bitset for the optional bitmasked wakeup */ 121 /* Bitset for the optional bitmasked wakeup */
119 u32 bitset; 122 u32 bitset;
120}; 123};
@@ -247,6 +250,7 @@ again:
247 if (err < 0) 250 if (err < 0)
248 return err; 251 return err;
249 252
253 page = compound_head(page);
250 lock_page(page); 254 lock_page(page);
251 if (!page->mapping) { 255 if (!page->mapping) {
252 unlock_page(page); 256 unlock_page(page);
@@ -1009,15 +1013,19 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
1009 * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue 1013 * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
1010 * q: the futex_q 1014 * q: the futex_q
1011 * key: the key of the requeue target futex 1015 * key: the key of the requeue target futex
1016 * hb: the hash_bucket of the requeue target futex
1012 * 1017 *
1013 * During futex_requeue, with requeue_pi=1, it is possible to acquire the 1018 * During futex_requeue, with requeue_pi=1, it is possible to acquire the
1014 * target futex if it is uncontended or via a lock steal. Set the futex_q key 1019 * target futex if it is uncontended or via a lock steal. Set the futex_q key
1015 * to the requeue target futex so the waiter can detect the wakeup on the right 1020 * to the requeue target futex so the waiter can detect the wakeup on the right
1016 * futex, but remove it from the hb and NULL the rt_waiter so it can detect 1021 * futex, but remove it from the hb and NULL the rt_waiter so it can detect
1017 * atomic lock acquisition. Must be called with the q->lock_ptr held. 1022 * atomic lock acquisition. Set the q->lock_ptr to the requeue target hb->lock
1023 * to protect access to the pi_state to fixup the owner later. Must be called
1024 * with both q->lock_ptr and hb->lock held.
1018 */ 1025 */
1019static inline 1026static inline
1020void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key) 1027void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1028 struct futex_hash_bucket *hb)
1021{ 1029{
1022 drop_futex_key_refs(&q->key); 1030 drop_futex_key_refs(&q->key);
1023 get_futex_key_refs(key); 1031 get_futex_key_refs(key);
@@ -1029,6 +1037,11 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key)
1029 WARN_ON(!q->rt_waiter); 1037 WARN_ON(!q->rt_waiter);
1030 q->rt_waiter = NULL; 1038 q->rt_waiter = NULL;
1031 1039
1040 q->lock_ptr = &hb->lock;
1041#ifdef CONFIG_DEBUG_PI_LIST
1042 q->list.plist.lock = &hb->lock;
1043#endif
1044
1032 wake_up_state(q->task, TASK_NORMAL); 1045 wake_up_state(q->task, TASK_NORMAL);
1033} 1046}
1034 1047
@@ -1079,6 +1092,10 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1079 if (!top_waiter) 1092 if (!top_waiter)
1080 return 0; 1093 return 0;
1081 1094
1095 /* Ensure we requeue to the expected futex. */
1096 if (!match_futex(top_waiter->requeue_pi_key, key2))
1097 return -EINVAL;
1098
1082 /* 1099 /*
1083 * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in 1100 * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in
1084 * the contended case or if set_waiters is 1. The pi_state is returned 1101 * the contended case or if set_waiters is 1. The pi_state is returned
@@ -1087,7 +1104,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1087 ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, 1104 ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
1088 set_waiters); 1105 set_waiters);
1089 if (ret == 1) 1106 if (ret == 1)
1090 requeue_pi_wake_futex(top_waiter, key2); 1107 requeue_pi_wake_futex(top_waiter, key2, hb2);
1091 1108
1092 return ret; 1109 return ret;
1093} 1110}
@@ -1246,8 +1263,15 @@ retry_private:
1246 if (!match_futex(&this->key, &key1)) 1263 if (!match_futex(&this->key, &key1))
1247 continue; 1264 continue;
1248 1265
1249 WARN_ON(!requeue_pi && this->rt_waiter); 1266 /*
1250 WARN_ON(requeue_pi && !this->rt_waiter); 1267 * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always
1268 * be paired with each other and no other futex ops.
1269 */
1270 if ((requeue_pi && !this->rt_waiter) ||
1271 (!requeue_pi && this->rt_waiter)) {
1272 ret = -EINVAL;
1273 break;
1274 }
1251 1275
1252 /* 1276 /*
1253 * Wake nr_wake waiters. For requeue_pi, if we acquired the 1277 * Wake nr_wake waiters. For requeue_pi, if we acquired the
@@ -1259,6 +1283,12 @@ retry_private:
1259 continue; 1283 continue;
1260 } 1284 }
1261 1285
1286 /* Ensure we requeue to the expected futex for requeue_pi. */
1287 if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) {
1288 ret = -EINVAL;
1289 break;
1290 }
1291
1262 /* 1292 /*
1263 * Requeue nr_requeue waiters and possibly one more in the case 1293 * Requeue nr_requeue waiters and possibly one more in the case
1264 * of requeue_pi if we couldn't acquire the lock atomically. 1294 * of requeue_pi if we couldn't acquire the lock atomically.
@@ -1272,7 +1302,7 @@ retry_private:
1272 this->task, 1); 1302 this->task, 1);
1273 if (ret == 1) { 1303 if (ret == 1) {
1274 /* We got the lock. */ 1304 /* We got the lock. */
1275 requeue_pi_wake_futex(this, &key2); 1305 requeue_pi_wake_futex(this, &key2, hb2);
1276 continue; 1306 continue;
1277 } else if (ret) { 1307 } else if (ret) {
1278 /* -EDEADLK */ 1308 /* -EDEADLK */
@@ -1734,6 +1764,7 @@ static int futex_wait(u32 __user *uaddr, int fshared,
1734 q.pi_state = NULL; 1764 q.pi_state = NULL;
1735 q.bitset = bitset; 1765 q.bitset = bitset;
1736 q.rt_waiter = NULL; 1766 q.rt_waiter = NULL;
1767 q.requeue_pi_key = NULL;
1737 1768
1738 if (abs_time) { 1769 if (abs_time) {
1739 to = &timeout; 1770 to = &timeout;
@@ -1841,6 +1872,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1841 1872
1842 q.pi_state = NULL; 1873 q.pi_state = NULL;
1843 q.rt_waiter = NULL; 1874 q.rt_waiter = NULL;
1875 q.requeue_pi_key = NULL;
1844retry: 1876retry:
1845 q.key = FUTEX_KEY_INIT; 1877 q.key = FUTEX_KEY_INIT;
1846 ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE); 1878 ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE);
@@ -2101,11 +2133,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2101 * We call schedule in futex_wait_queue_me() when we enqueue and return there 2133 * We call schedule in futex_wait_queue_me() when we enqueue and return there
2102 * via the following: 2134 * via the following:
2103 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() 2135 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
2104 * 2) wakeup on uaddr2 after a requeue and subsequent unlock 2136 * 2) wakeup on uaddr2 after a requeue
2105 * 3) signal (before or after requeue) 2137 * 3) signal
2106 * 4) timeout (before or after requeue) 2138 * 4) timeout
2107 * 2139 *
2108 * If 3, we setup a restart_block with futex_wait_requeue_pi() as the function. 2140 * If 3, cleanup and return -ERESTARTNOINTR.
2109 * 2141 *
2110 * If 2, we may then block on trying to take the rt_mutex and return via: 2142 * If 2, we may then block on trying to take the rt_mutex and return via:
2111 * 5) successful lock 2143 * 5) successful lock
@@ -2113,7 +2145,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2113 * 7) timeout 2145 * 7) timeout
2114 * 8) other lock acquisition failure 2146 * 8) other lock acquisition failure
2115 * 2147 *
2116 * If 6, we setup a restart_block with futex_lock_pi() as the function. 2148 * If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
2117 * 2149 *
2118 * If 4 or 7, we cleanup and return with -ETIMEDOUT. 2150 * If 4 or 7, we cleanup and return with -ETIMEDOUT.
2119 * 2151 *
@@ -2152,15 +2184,16 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2152 debug_rt_mutex_init_waiter(&rt_waiter); 2184 debug_rt_mutex_init_waiter(&rt_waiter);
2153 rt_waiter.task = NULL; 2185 rt_waiter.task = NULL;
2154 2186
2155 q.pi_state = NULL;
2156 q.bitset = bitset;
2157 q.rt_waiter = &rt_waiter;
2158
2159 key2 = FUTEX_KEY_INIT; 2187 key2 = FUTEX_KEY_INIT;
2160 ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); 2188 ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
2161 if (unlikely(ret != 0)) 2189 if (unlikely(ret != 0))
2162 goto out; 2190 goto out;
2163 2191
2192 q.pi_state = NULL;
2193 q.bitset = bitset;
2194 q.rt_waiter = &rt_waiter;
2195 q.requeue_pi_key = &key2;
2196
2164 /* Prepare to wait on uaddr. */ 2197 /* Prepare to wait on uaddr. */
2165 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); 2198 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
2166 if (ret) 2199 if (ret)
@@ -2231,14 +2264,11 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2231 rt_mutex_unlock(pi_mutex); 2264 rt_mutex_unlock(pi_mutex);
2232 } else if (ret == -EINTR) { 2265 } else if (ret == -EINTR) {
2233 /* 2266 /*
2234 * We've already been requeued, but we have no way to 2267 * We've already been requeued, but cannot restart by calling
2235 * restart by calling futex_lock_pi() directly. We 2268 * futex_lock_pi() directly. We could restart this syscall, but
2236 * could restart the syscall, but that will look at 2269 * it would detect that the user space "val" changed and return
2237 * the user space value and return right away. So we 2270 * -EWOULDBLOCK. Save the overhead of the restart and return
2238 * drop back with EWOULDBLOCK to tell user space that 2271 * -EWOULDBLOCK directly.
2239 * "val" has been changed. That's the same what the
2240 * restart of the syscall would do in
2241 * futex_wait_setup().
2242 */ 2272 */
2243 ret = -EWOULDBLOCK; 2273 ret = -EWOULDBLOCK;
2244 } 2274 }
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index d607a5b9ee29..235716556bf1 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -180,7 +180,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
180 int cmd = op & FUTEX_CMD_MASK; 180 int cmd = op & FUTEX_CMD_MASK;
181 181
182 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || 182 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
183 cmd == FUTEX_WAIT_BITSET)) { 183 cmd == FUTEX_WAIT_BITSET ||
184 cmd == FUTEX_WAIT_REQUEUE_PI)) {
184 if (get_compat_timespec(&ts, utime)) 185 if (get_compat_timespec(&ts, utime))
185 return -EFAULT; 186 return -EFAULT;
186 if (!timespec_valid(&ts)) 187 if (!timespec_valid(&ts))
@@ -191,7 +192,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
191 t = ktime_add_safe(ktime_get(), t); 192 t = ktime_add_safe(ktime_get(), t);
192 tp = &t; 193 tp = &t;
193 } 194 }
194 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE) 195 if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
196 cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
195 val2 = (int) (unsigned long) utime; 197 val2 = (int) (unsigned long) utime;
196 198
197 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); 199 return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
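
Taken together, the futex.c and futex_compat.c hunks above wire up the requeue_pi_key check: a FUTEX_WAIT_REQUEUE_PI waiter records the uaddr2 it expects, and a FUTEX_CMP_REQUEUE_PI toward any other futex now fails with -EINVAL. A rough userspace sketch of the intended pairing, assuming a 2.6.31-era <linux/futex.h>; the futex words and values are illustrative, not a full condvar:

    #include <limits.h>
    #include <linux/futex.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static unsigned int cond_word, pi_mutex;    /* 32-bit futex words */

    /* waiter: sleep on &cond_word, expecting to be requeued onto &pi_mutex */
    static long wait_requeue_pi(unsigned int seen)
    {
        return syscall(SYS_futex, &cond_word, FUTEX_WAIT_REQUEUE_PI,
                       seen, NULL, &pi_mutex, 0);
    }

    /* waker: wake one waiter, requeue the rest from &cond_word to &pi_mutex */
    static long cmp_requeue_pi(unsigned int expected)
    {
        return syscall(SYS_futex, &cond_word, FUTEX_CMP_REQUEUE_PI,
                       1, (void *)(unsigned long)INT_MAX, &pi_mutex, expected);
    }
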
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 22e9dcfaa3d3..654efd09f6a9 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -34,7 +34,7 @@ config GCOV_KERNEL
34config GCOV_PROFILE_ALL 34config GCOV_PROFILE_ALL
35 bool "Profile entire Kernel" 35 bool "Profile entire Kernel"
36 depends on GCOV_KERNEL 36 depends on GCOV_KERNEL
37 depends on S390 || X86 37 depends on S390 || X86 || (PPC && EXPERIMENTAL)
38 default n 38 default n
39 ---help--- 39 ---help---
 40	  This option activates profiling for the entire kernel. 40	  This option activates profiling for the entire kernel.
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 9002958a96e7..c03f221fee44 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -48,37 +48,6 @@
48 48
49#include <asm/uaccess.h> 49#include <asm/uaccess.h>
50 50
51/**
52 * ktime_get - get the monotonic time in ktime_t format
53 *
54 * returns the time in ktime_t format
55 */
56ktime_t ktime_get(void)
57{
58 struct timespec now;
59
60 ktime_get_ts(&now);
61
62 return timespec_to_ktime(now);
63}
64EXPORT_SYMBOL_GPL(ktime_get);
65
66/**
67 * ktime_get_real - get the real (wall-) time in ktime_t format
68 *
69 * returns the time in ktime_t format
70 */
71ktime_t ktime_get_real(void)
72{
73 struct timespec now;
74
75 getnstimeofday(&now);
76
77 return timespec_to_ktime(now);
78}
79
80EXPORT_SYMBOL_GPL(ktime_get_real);
81
82/* 51/*
83 * The timer bases: 52 * The timer bases:
84 * 53 *
@@ -106,31 +75,6 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
106 } 75 }
107}; 76};
108 77
109/**
110 * ktime_get_ts - get the monotonic clock in timespec format
111 * @ts: pointer to timespec variable
112 *
113 * The function calculates the monotonic clock from the realtime
114 * clock and the wall_to_monotonic offset and stores the result
115 * in normalized timespec format in the variable pointed to by @ts.
116 */
117void ktime_get_ts(struct timespec *ts)
118{
119 struct timespec tomono;
120 unsigned long seq;
121
122 do {
123 seq = read_seqbegin(&xtime_lock);
124 getnstimeofday(ts);
125 tomono = wall_to_monotonic;
126
127 } while (read_seqretry(&xtime_lock, seq));
128
129 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
130 ts->tv_nsec + tomono.tv_nsec);
131}
132EXPORT_SYMBOL_GPL(ktime_get_ts);
133
134/* 78/*
135 * Get the coarse grained time at the softirq based on xtime and 79 * Get the coarse grained time at the softirq based on xtime and
136 * wall_to_monotonic. 80 * wall_to_monotonic.
@@ -191,6 +135,46 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
191 } 135 }
192} 136}
193 137
138
139/*
140 * Get the preferred target CPU for NOHZ
141 */
142static int hrtimer_get_target(int this_cpu, int pinned)
143{
144#ifdef CONFIG_NO_HZ
145 if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) {
146 int preferred_cpu = get_nohz_load_balancer();
147
148 if (preferred_cpu >= 0)
149 return preferred_cpu;
150 }
151#endif
152 return this_cpu;
153}
154
155/*
156 * With HIGHRES=y we do not migrate the timer when it is expiring
157 * before the next event on the target cpu because we cannot reprogram
158 * the target cpu hardware and we would cause it to fire late.
159 *
160 * Called with cpu_base->lock of target cpu held.
161 */
162static int
163hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
164{
165#ifdef CONFIG_HIGH_RES_TIMERS
166 ktime_t expires;
167
168 if (!new_base->cpu_base->hres_active)
169 return 0;
170
171 expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
172 return expires.tv64 <= new_base->cpu_base->expires_next.tv64;
173#else
174 return 0;
175#endif
176}
177
194/* 178/*
195 * Switch the timer base to the current CPU when possible. 179 * Switch the timer base to the current CPU when possible.
196 */ 180 */
@@ -200,16 +184,8 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
200{ 184{
201 struct hrtimer_clock_base *new_base; 185 struct hrtimer_clock_base *new_base;
202 struct hrtimer_cpu_base *new_cpu_base; 186 struct hrtimer_cpu_base *new_cpu_base;
203 int cpu, preferred_cpu = -1; 187 int this_cpu = smp_processor_id();
204 188 int cpu = hrtimer_get_target(this_cpu, pinned);
205 cpu = smp_processor_id();
206#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
207 if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) {
208 preferred_cpu = get_nohz_load_balancer();
209 if (preferred_cpu >= 0)
210 cpu = preferred_cpu;
211 }
212#endif
213 189
214again: 190again:
215 new_cpu_base = &per_cpu(hrtimer_bases, cpu); 191 new_cpu_base = &per_cpu(hrtimer_bases, cpu);
@@ -217,7 +193,7 @@ again:
217 193
218 if (base != new_base) { 194 if (base != new_base) {
219 /* 195 /*
220 * We are trying to schedule the timer on the local CPU. 196 * We are trying to move timer to new_base.
221 * However we can't change timer's base while it is running, 197 * However we can't change timer's base while it is running,
222 * so we keep it on the same CPU. No hassle vs. reprogramming 198 * so we keep it on the same CPU. No hassle vs. reprogramming
223 * the event source in the high resolution case. The softirq 199 * the event source in the high resolution case. The softirq
@@ -233,38 +209,12 @@ again:
233 spin_unlock(&base->cpu_base->lock); 209 spin_unlock(&base->cpu_base->lock);
234 spin_lock(&new_base->cpu_base->lock); 210 spin_lock(&new_base->cpu_base->lock);
235 211
236 /* Optimized away for NOHZ=n SMP=n */ 212 if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
237 if (cpu == preferred_cpu) { 213 cpu = this_cpu;
238 /* Calculate clock monotonic expiry time */ 214 spin_unlock(&new_base->cpu_base->lock);
239#ifdef CONFIG_HIGH_RES_TIMERS 215 spin_lock(&base->cpu_base->lock);
240 ktime_t expires = ktime_sub(hrtimer_get_expires(timer), 216 timer->base = base;
241 new_base->offset); 217 goto again;
242#else
243 ktime_t expires = hrtimer_get_expires(timer);
244#endif
245
246 /*
247 * Get the next event on target cpu from the
248 * clock events layer.
249 * This covers the highres=off nohz=on case as well.
250 */
251 ktime_t next = clockevents_get_next_event(cpu);
252
253 ktime_t delta = ktime_sub(expires, next);
254
255 /*
256 * We do not migrate the timer when it is expiring
257 * before the next event on the target cpu because
258 * we cannot reprogram the target cpu hardware and
259 * we would cause it to fire late.
260 */
261 if (delta.tv64 < 0) {
262 cpu = smp_processor_id();
263 spin_unlock(&new_base->cpu_base->lock);
264 spin_lock(&base->cpu_base->lock);
265 timer->base = base;
266 goto again;
267 }
268 } 218 }
269 timer->base = new_base; 219 timer->base = new_base;
270 } 220 }
@@ -479,6 +429,7 @@ void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id,
479 debug_object_init_on_stack(timer, &hrtimer_debug_descr); 429 debug_object_init_on_stack(timer, &hrtimer_debug_descr);
480 __hrtimer_init(timer, clock_id, mode); 430 __hrtimer_init(timer, clock_id, mode);
481} 431}
432EXPORT_SYMBOL_GPL(hrtimer_init_on_stack);
482 433
483void destroy_hrtimer_on_stack(struct hrtimer *timer) 434void destroy_hrtimer_on_stack(struct hrtimer *timer)
484{ 435{
@@ -1148,7 +1099,6 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1148 clock_id = CLOCK_MONOTONIC; 1099 clock_id = CLOCK_MONOTONIC;
1149 1100
1150 timer->base = &cpu_base->clock_base[clock_id]; 1101 timer->base = &cpu_base->clock_base[clock_id];
1151 INIT_LIST_HEAD(&timer->cb_entry);
1152 hrtimer_init_timer_hres(timer); 1102 hrtimer_init_timer_hres(timer);
1153 1103
1154#ifdef CONFIG_TIMER_STATS 1104#ifdef CONFIG_TIMER_STATS
@@ -1276,14 +1226,22 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1276 1226
1277 expires_next.tv64 = KTIME_MAX; 1227 expires_next.tv64 = KTIME_MAX;
1278 1228
1229 spin_lock(&cpu_base->lock);
1230 /*
1231 * We set expires_next to KTIME_MAX here with cpu_base->lock
1232 * held to prevent that a timer is enqueued in our queue via
1233 * the migration code. This does not affect enqueueing of
1234 * timers which run their callback and need to be requeued on
1235 * this CPU.
1236 */
1237 cpu_base->expires_next.tv64 = KTIME_MAX;
1238
1279 base = cpu_base->clock_base; 1239 base = cpu_base->clock_base;
1280 1240
1281 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1241 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1282 ktime_t basenow; 1242 ktime_t basenow;
1283 struct rb_node *node; 1243 struct rb_node *node;
1284 1244
1285 spin_lock(&cpu_base->lock);
1286
1287 basenow = ktime_add(now, base->offset); 1245 basenow = ktime_add(now, base->offset);
1288 1246
1289 while ((node = base->first)) { 1247 while ((node = base->first)) {
@@ -1316,11 +1274,15 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1316 1274
1317 __run_hrtimer(timer); 1275 __run_hrtimer(timer);
1318 } 1276 }
1319 spin_unlock(&cpu_base->lock);
1320 base++; 1277 base++;
1321 } 1278 }
1322 1279
1280 /*
1281 * Store the new expiry value so the migration code can verify
1282 * against it.
1283 */
1323 cpu_base->expires_next = expires_next; 1284 cpu_base->expires_next = expires_next;
1285 spin_unlock(&cpu_base->lock);
1324 1286
1325 /* Reprogramming necessary ? */ 1287 /* Reprogramming necessary ? */
1326 if (expires_next.tv64 != KTIME_MAX) { 1288 if (expires_next.tv64 != KTIME_MAX) {
@@ -1459,6 +1421,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
1459 sl->timer.function = hrtimer_wakeup; 1421 sl->timer.function = hrtimer_wakeup;
1460 sl->task = task; 1422 sl->task = task;
1461} 1423}
1424EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
1462 1425
1463static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) 1426static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
1464{ 1427{
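
The hunks above fold the open-coded NOHZ migration logic into hrtimer_get_target()/hrtimer_check_target(), and the `pinned` argument is what a *_PINNED start mode feeds in. A minimal sketch of a timer a driver wants kept on the local CPU; the callback body and period are placeholders:

    #include <linux/hrtimer.h>
    #include <linux/ktime.h>

    static struct hrtimer my_timer;     /* hypothetical per-driver timer */

    static enum hrtimer_restart my_timer_fn(struct hrtimer *t)
    {
        /* periodic work goes here ... */
        hrtimer_forward_now(t, ktime_set(0, 10 * NSEC_PER_MSEC));
        return HRTIMER_RESTART;
    }

    static void my_timer_start(void)
    {
        hrtimer_init(&my_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        my_timer.function = my_timer_fn;
        /* _PINNED tells switch_hrtimer_base() not to migrate to the NOHZ target */
        hrtimer_start(&my_timer, ktime_set(0, 10 * NSEC_PER_MSEC),
                      HRTIMER_MODE_REL_PINNED);
    }
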
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 13c68e71b726..c1660194d115 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -222,6 +222,34 @@ int set_irq_chip_data(unsigned int irq, void *data)
222} 222}
223EXPORT_SYMBOL(set_irq_chip_data); 223EXPORT_SYMBOL(set_irq_chip_data);
224 224
225/**
226 * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq
227 *
228 * @irq: Interrupt number
229 * @nest: 0 to clear / 1 to set the IRQ_NESTED_THREAD flag
230 *
231 * The IRQ_NESTED_THREAD flag indicates that on
232 * request_threaded_irq() no separate interrupt thread should be
233 * created for the irq as the handlers are called nested in the
234 * context of a demultiplexing interrupt handler thread.
235 */
236void set_irq_nested_thread(unsigned int irq, int nest)
237{
238 struct irq_desc *desc = irq_to_desc(irq);
239 unsigned long flags;
240
241 if (!desc)
242 return;
243
244 spin_lock_irqsave(&desc->lock, flags);
245 if (nest)
246 desc->status |= IRQ_NESTED_THREAD;
247 else
248 desc->status &= ~IRQ_NESTED_THREAD;
249 spin_unlock_irqrestore(&desc->lock, flags);
250}
251EXPORT_SYMBOL_GPL(set_irq_nested_thread);
252
225/* 253/*
226 * default enable function 254 * default enable function
227 */ 255 */
@@ -299,6 +327,45 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq)
299 } 327 }
300} 328}
301 329
330/*
331 * handle_nested_irq - Handle a nested irq from an irq thread
332 * @irq: the interrupt number
333 *
334 * Handle interrupts which are nested into a threaded interrupt
335 * handler. The handler function is called inside the calling
336 * threads context.
337 */
338void handle_nested_irq(unsigned int irq)
339{
340 struct irq_desc *desc = irq_to_desc(irq);
341 struct irqaction *action;
342 irqreturn_t action_ret;
343
344 might_sleep();
345
346 spin_lock_irq(&desc->lock);
347
348 kstat_incr_irqs_this_cpu(irq, desc);
349
350 action = desc->action;
351 if (unlikely(!action || (desc->status & IRQ_DISABLED)))
352 goto out_unlock;
353
354 desc->status |= IRQ_INPROGRESS;
355 spin_unlock_irq(&desc->lock);
356
357 action_ret = action->thread_fn(action->irq, action->dev_id);
358 if (!noirqdebug)
359 note_interrupt(irq, desc, action_ret);
360
361 spin_lock_irq(&desc->lock);
362 desc->status &= ~IRQ_INPROGRESS;
363
364out_unlock:
365 spin_unlock_irq(&desc->lock);
366}
367EXPORT_SYMBOL_GPL(handle_nested_irq);
368
302/** 369/**
303 * handle_simple_irq - Simple and software-decoded IRQs. 370 * handle_simple_irq - Simple and software-decoded IRQs.
304 * @irq: the interrupt number 371 * @irq: the interrupt number
@@ -382,7 +449,10 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
382 449
383 spin_lock(&desc->lock); 450 spin_lock(&desc->lock);
384 desc->status &= ~IRQ_INPROGRESS; 451 desc->status &= ~IRQ_INPROGRESS;
385 if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) 452
453 if (unlikely(desc->status & IRQ_ONESHOT))
454 desc->status |= IRQ_MASKED;
455 else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
386 desc->chip->unmask(irq); 456 desc->chip->unmask(irq);
387out_unlock: 457out_unlock:
388 spin_unlock(&desc->lock); 458 spin_unlock(&desc->lock);
@@ -572,6 +642,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
572 desc->chip = &dummy_irq_chip; 642 desc->chip = &dummy_irq_chip;
573 } 643 }
574 644
645 chip_bus_lock(irq, desc);
575 spin_lock_irqsave(&desc->lock, flags); 646 spin_lock_irqsave(&desc->lock, flags);
576 647
577 /* Uninstall? */ 648 /* Uninstall? */
@@ -591,6 +662,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
591 desc->chip->startup(irq); 662 desc->chip->startup(irq);
592 } 663 }
593 spin_unlock_irqrestore(&desc->lock, flags); 664 spin_unlock_irqrestore(&desc->lock, flags);
665 chip_bus_sync_unlock(irq, desc);
594} 666}
595EXPORT_SYMBOL_GPL(__set_irq_handler); 667EXPORT_SYMBOL_GPL(__set_irq_handler);
596 668
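
set_irq_nested_thread() and handle_nested_irq() above exist for demultiplexing chips that sit behind a slow bus, an I2C GPIO expander for instance: the parent interrupt runs as a thread and the children are handled nested inside it rather than getting threads of their own. A hedged sketch of such a driver; the irq numbers, the expander_* names and the status read are made up:

    #include <linux/bitops.h>
    #include <linux/interrupt.h>
    #include <linux/irq.h>

    #define EXP_IRQ_BASE    256     /* hypothetical child irq range */
    #define EXP_NR_IRQS     8

    static struct irq_chip expander_irq_chip;           /* hypothetical, uses bus_lock hooks */
    unsigned long expander_read_status(void *dev);      /* hypothetical slow I2C read */

    /* threaded handler for the parent (bus) interrupt */
    static irqreturn_t expander_parent_thread(int irq, void *dev)
    {
        unsigned long pending = expander_read_status(dev);
        int bit;

        for_each_bit(bit, &pending, EXP_NR_IRQS)
            handle_nested_irq(EXP_IRQ_BASE + bit);

        return IRQ_HANDLED;
    }

    static void expander_setup_child_irqs(void)
    {
        int i;

        for (i = 0; i < EXP_NR_IRQS; i++) {
            set_irq_chip(EXP_IRQ_BASE + i, &expander_irq_chip);
            /* children run nested in expander_parent_thread() */
            set_irq_nested_thread(EXP_IRQ_BASE + i, 1);
        }
    }
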
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 065205bdd920..a81cf80554db 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -161,7 +161,7 @@ int __init early_irq_init(void)
161 161
162 desc = irq_desc_legacy; 162 desc = irq_desc_legacy;
163 legacy_count = ARRAY_SIZE(irq_desc_legacy); 163 legacy_count = ARRAY_SIZE(irq_desc_legacy);
164 node = first_online_node; 164 node = first_online_node;
165 165
166 /* allocate irq_desc_ptrs array based on nr_irqs */ 166 /* allocate irq_desc_ptrs array based on nr_irqs */
167 irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT); 167 irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT);
@@ -172,6 +172,9 @@ int __init early_irq_init(void)
172 172
173 for (i = 0; i < legacy_count; i++) { 173 for (i = 0; i < legacy_count; i++) {
174 desc[i].irq = i; 174 desc[i].irq = i;
175#ifdef CONFIG_SMP
176 desc[i].node = node;
177#endif
175 desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids; 178 desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
176 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 179 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
177 alloc_desc_masks(&desc[i], node, true); 180 alloc_desc_masks(&desc[i], node, true);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 73468253143b..1b5d742c6a77 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -42,8 +42,20 @@ static inline void unregister_handler_proc(unsigned int irq,
42 42
43extern int irq_select_affinity_usr(unsigned int irq); 43extern int irq_select_affinity_usr(unsigned int irq);
44 44
45extern void 45extern void irq_set_thread_affinity(struct irq_desc *desc);
46irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask); 46
47/* Inline functions for support of irq chips on slow busses */
48static inline void chip_bus_lock(unsigned int irq, struct irq_desc *desc)
49{
50 if (unlikely(desc->chip->bus_lock))
51 desc->chip->bus_lock(irq);
52}
53
54static inline void chip_bus_sync_unlock(unsigned int irq, struct irq_desc *desc)
55{
56 if (unlikely(desc->chip->bus_sync_unlock))
57 desc->chip->bus_sync_unlock(irq);
58}
47 59
48/* 60/*
49 * Debugging printout: 61 * Debugging printout:
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 50da67672901..bde4c667d24d 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -80,14 +80,22 @@ int irq_can_set_affinity(unsigned int irq)
80 return 1; 80 return 1;
81} 81}
82 82
83void 83/**
84irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask) 84 * irq_set_thread_affinity - Notify irq threads to adjust affinity
85 * @desc:		irq descriptor which has affinity changed
86 *
87 * We just set IRQTF_AFFINITY and delegate the affinity setting
88 * to the interrupt thread itself. We cannot call
89 * set_cpus_allowed_ptr() here as we hold desc->lock and this
90 * code can be called from hard interrupt context.
91 */
92void irq_set_thread_affinity(struct irq_desc *desc)
85{ 93{
86 struct irqaction *action = desc->action; 94 struct irqaction *action = desc->action;
87 95
88 while (action) { 96 while (action) {
89 if (action->thread) 97 if (action->thread)
90 set_cpus_allowed_ptr(action->thread, cpumask); 98 set_bit(IRQTF_AFFINITY, &action->thread_flags);
91 action = action->next; 99 action = action->next;
92 } 100 }
93} 101}
@@ -112,7 +120,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
112 if (desc->status & IRQ_MOVE_PCNTXT) { 120 if (desc->status & IRQ_MOVE_PCNTXT) {
113 if (!desc->chip->set_affinity(irq, cpumask)) { 121 if (!desc->chip->set_affinity(irq, cpumask)) {
114 cpumask_copy(desc->affinity, cpumask); 122 cpumask_copy(desc->affinity, cpumask);
115 irq_set_thread_affinity(desc, cpumask); 123 irq_set_thread_affinity(desc);
116 } 124 }
117 } 125 }
118 else { 126 else {
@@ -122,7 +130,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
122#else 130#else
123 if (!desc->chip->set_affinity(irq, cpumask)) { 131 if (!desc->chip->set_affinity(irq, cpumask)) {
124 cpumask_copy(desc->affinity, cpumask); 132 cpumask_copy(desc->affinity, cpumask);
125 irq_set_thread_affinity(desc, cpumask); 133 irq_set_thread_affinity(desc);
126 } 134 }
127#endif 135#endif
128 desc->status |= IRQ_AFFINITY_SET; 136 desc->status |= IRQ_AFFINITY_SET;
@@ -176,7 +184,7 @@ int irq_select_affinity_usr(unsigned int irq)
176 spin_lock_irqsave(&desc->lock, flags); 184 spin_lock_irqsave(&desc->lock, flags);
177 ret = setup_affinity(irq, desc); 185 ret = setup_affinity(irq, desc);
178 if (!ret) 186 if (!ret)
179 irq_set_thread_affinity(desc, desc->affinity); 187 irq_set_thread_affinity(desc);
180 spin_unlock_irqrestore(&desc->lock, flags); 188 spin_unlock_irqrestore(&desc->lock, flags);
181 189
182 return ret; 190 return ret;
@@ -222,9 +230,11 @@ void disable_irq_nosync(unsigned int irq)
222 if (!desc) 230 if (!desc)
223 return; 231 return;
224 232
233 chip_bus_lock(irq, desc);
225 spin_lock_irqsave(&desc->lock, flags); 234 spin_lock_irqsave(&desc->lock, flags);
226 __disable_irq(desc, irq, false); 235 __disable_irq(desc, irq, false);
227 spin_unlock_irqrestore(&desc->lock, flags); 236 spin_unlock_irqrestore(&desc->lock, flags);
237 chip_bus_sync_unlock(irq, desc);
228} 238}
229EXPORT_SYMBOL(disable_irq_nosync); 239EXPORT_SYMBOL(disable_irq_nosync);
230 240
@@ -286,7 +296,8 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
286 * matches the last disable, processing of interrupts on this 296 * matches the last disable, processing of interrupts on this
287 * IRQ line is re-enabled. 297 * IRQ line is re-enabled.
288 * 298 *
289 * This function may be called from IRQ context. 299 * This function may be called from IRQ context only when
300 * desc->chip->bus_lock and desc->chip->bus_sync_unlock are NULL !
290 */ 301 */
291void enable_irq(unsigned int irq) 302void enable_irq(unsigned int irq)
292{ 303{
@@ -296,9 +307,11 @@ void enable_irq(unsigned int irq)
296 if (!desc) 307 if (!desc)
297 return; 308 return;
298 309
310 chip_bus_lock(irq, desc);
299 spin_lock_irqsave(&desc->lock, flags); 311 spin_lock_irqsave(&desc->lock, flags);
300 __enable_irq(desc, irq, false); 312 __enable_irq(desc, irq, false);
301 spin_unlock_irqrestore(&desc->lock, flags); 313 spin_unlock_irqrestore(&desc->lock, flags);
314 chip_bus_sync_unlock(irq, desc);
302} 315}
303EXPORT_SYMBOL(enable_irq); 316EXPORT_SYMBOL(enable_irq);
304 317
@@ -428,6 +441,26 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
428 return ret; 441 return ret;
429} 442}
430 443
444/*
445 * Default primary interrupt handler for threaded interrupts. Is
446 * assigned as primary handler when request_threaded_irq is called
447 * with handler == NULL. Useful for oneshot interrupts.
448 */
449static irqreturn_t irq_default_primary_handler(int irq, void *dev_id)
450{
451 return IRQ_WAKE_THREAD;
452}
453
454/*
455 * Primary handler for nested threaded interrupts. Should never be
456 * called.
457 */
458static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id)
459{
460 WARN(1, "Primary handler called for nested irq %d\n", irq);
461 return IRQ_NONE;
462}
463
431static int irq_wait_for_interrupt(struct irqaction *action) 464static int irq_wait_for_interrupt(struct irqaction *action)
432{ 465{
433 while (!kthread_should_stop()) { 466 while (!kthread_should_stop()) {
@@ -444,6 +477,56 @@ static int irq_wait_for_interrupt(struct irqaction *action)
444} 477}
445 478
446/* 479/*
480 * Oneshot interrupts keep the irq line masked until the threaded
481 * handler finished. unmask if the interrupt has not been disabled and
482 * is marked MASKED.
483 */
484static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc)
485{
486 chip_bus_lock(irq, desc);
487 spin_lock_irq(&desc->lock);
488 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) {
489 desc->status &= ~IRQ_MASKED;
490 desc->chip->unmask(irq);
491 }
492 spin_unlock_irq(&desc->lock);
493 chip_bus_sync_unlock(irq, desc);
494}
495
496#ifdef CONFIG_SMP
497/*
498 * Check whether we need to change the affinity of the interrupt thread.
499 */
500static void
501irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
502{
503 cpumask_var_t mask;
504
505 if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags))
506 return;
507
508 /*
509 * In case we are out of memory we set IRQTF_AFFINITY again and
510 * try again next time
511 */
512 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
513 set_bit(IRQTF_AFFINITY, &action->thread_flags);
514 return;
515 }
516
517 spin_lock_irq(&desc->lock);
518 cpumask_copy(mask, desc->affinity);
519 spin_unlock_irq(&desc->lock);
520
521 set_cpus_allowed_ptr(current, mask);
522 free_cpumask_var(mask);
523}
524#else
525static inline void
526irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
527#endif
528
529/*
447 * Interrupt handler thread 530 * Interrupt handler thread
448 */ 531 */
449static int irq_thread(void *data) 532static int irq_thread(void *data)
@@ -451,13 +534,15 @@ static int irq_thread(void *data)
451 struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, }; 534 struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, };
452 struct irqaction *action = data; 535 struct irqaction *action = data;
453 struct irq_desc *desc = irq_to_desc(action->irq); 536 struct irq_desc *desc = irq_to_desc(action->irq);
454 int wake; 537 int wake, oneshot = desc->status & IRQ_ONESHOT;
455 538
456 sched_setscheduler(current, SCHED_FIFO, &param); 539 sched_setscheduler(current, SCHED_FIFO, &param);
457 current->irqaction = action; 540 current->irqaction = action;
458 541
459 while (!irq_wait_for_interrupt(action)) { 542 while (!irq_wait_for_interrupt(action)) {
460 543
544 irq_thread_check_affinity(desc, action);
545
461 atomic_inc(&desc->threads_active); 546 atomic_inc(&desc->threads_active);
462 547
463 spin_lock_irq(&desc->lock); 548 spin_lock_irq(&desc->lock);
@@ -475,6 +560,9 @@ static int irq_thread(void *data)
475 spin_unlock_irq(&desc->lock); 560 spin_unlock_irq(&desc->lock);
476 561
477 action->thread_fn(action->irq, action->dev_id); 562 action->thread_fn(action->irq, action->dev_id);
563
564 if (oneshot)
565 irq_finalize_oneshot(action->irq, desc);
478 } 566 }
479 567
480 wake = atomic_dec_and_test(&desc->threads_active); 568 wake = atomic_dec_and_test(&desc->threads_active);
@@ -522,7 +610,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
522 struct irqaction *old, **old_ptr; 610 struct irqaction *old, **old_ptr;
523 const char *old_name = NULL; 611 const char *old_name = NULL;
524 unsigned long flags; 612 unsigned long flags;
525 int shared = 0; 613 int nested, shared = 0;
526 int ret; 614 int ret;
527 615
528 if (!desc) 616 if (!desc)
@@ -547,10 +635,32 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
547 rand_initialize_irq(irq); 635 rand_initialize_irq(irq);
548 } 636 }
549 637
638 /* Oneshot interrupts are not allowed with shared */
639 if ((new->flags & IRQF_ONESHOT) && (new->flags & IRQF_SHARED))
640 return -EINVAL;
641
642 /*
643 * Check whether the interrupt nests into another interrupt
644 * thread.
645 */
646 nested = desc->status & IRQ_NESTED_THREAD;
647 if (nested) {
648 if (!new->thread_fn)
649 return -EINVAL;
650 /*
651 * Replace the primary handler which was provided from
652 * the driver for non nested interrupt handling by the
653 * dummy function which warns when called.
654 */
655 new->handler = irq_nested_primary_handler;
656 }
657
550 /* 658 /*
551 * Threaded handler ? 659 * Create a handler thread when a thread function is supplied
660 * and the interrupt does not nest into another interrupt
661 * thread.
552 */ 662 */
553 if (new->thread_fn) { 663 if (new->thread_fn && !nested) {
554 struct task_struct *t; 664 struct task_struct *t;
555 665
556 t = kthread_create(irq_thread, new, "irq/%d-%s", irq, 666 t = kthread_create(irq_thread, new, "irq/%d-%s", irq,
@@ -564,7 +674,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
564 */ 674 */
565 get_task_struct(t); 675 get_task_struct(t);
566 new->thread = t; 676 new->thread = t;
567 wake_up_process(t);
568 } 677 }
569 678
570 /* 679 /*
@@ -620,9 +729,12 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
620 desc->status |= IRQ_PER_CPU; 729 desc->status |= IRQ_PER_CPU;
621#endif 730#endif
622 731
623 desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | 732 desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | IRQ_ONESHOT |
624 IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED); 733 IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED);
625 734
735 if (new->flags & IRQF_ONESHOT)
736 desc->status |= IRQ_ONESHOT;
737
626 if (!(desc->status & IRQ_NOAUTOEN)) { 738 if (!(desc->status & IRQ_NOAUTOEN)) {
627 desc->depth = 0; 739 desc->depth = 0;
628 desc->status &= ~IRQ_DISABLED; 740 desc->status &= ~IRQ_DISABLED;
@@ -647,6 +759,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
647 (int)(new->flags & IRQF_TRIGGER_MASK)); 759 (int)(new->flags & IRQF_TRIGGER_MASK));
648 } 760 }
649 761
762 new->irq = irq;
650 *old_ptr = new; 763 *old_ptr = new;
651 764
652 /* Reset broken irq detection when installing new handler */ 765 /* Reset broken irq detection when installing new handler */
@@ -664,7 +777,13 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
664 777
665 spin_unlock_irqrestore(&desc->lock, flags); 778 spin_unlock_irqrestore(&desc->lock, flags);
666 779
667 new->irq = irq; 780 /*
781 * Strictly no need to wake it up, but hung_task complains
782 * when no hard interrupt wakes the thread up.
783 */
784 if (new->thread)
785 wake_up_process(new->thread);
786
668 register_irq_proc(irq, desc); 787 register_irq_proc(irq, desc);
669 new->dir = NULL; 788 new->dir = NULL;
670 register_handler_proc(irq, new); 789 register_handler_proc(irq, new);
@@ -718,7 +837,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
718{ 837{
719 struct irq_desc *desc = irq_to_desc(irq); 838 struct irq_desc *desc = irq_to_desc(irq);
720 struct irqaction *action, **action_ptr; 839 struct irqaction *action, **action_ptr;
721 struct task_struct *irqthread;
722 unsigned long flags; 840 unsigned long flags;
723 841
724 WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); 842 WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);
@@ -766,9 +884,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
766 desc->chip->disable(irq); 884 desc->chip->disable(irq);
767 } 885 }
768 886
769 irqthread = action->thread;
770 action->thread = NULL;
771
772 spin_unlock_irqrestore(&desc->lock, flags); 887 spin_unlock_irqrestore(&desc->lock, flags);
773 888
774 unregister_handler_proc(irq, action); 889 unregister_handler_proc(irq, action);
@@ -776,12 +891,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
776 /* Make sure it's not being used on another CPU: */ 891 /* Make sure it's not being used on another CPU: */
777 synchronize_irq(irq); 892 synchronize_irq(irq);
778 893
779 if (irqthread) {
780 if (!test_bit(IRQTF_DIED, &action->thread_flags))
781 kthread_stop(irqthread);
782 put_task_struct(irqthread);
783 }
784
785#ifdef CONFIG_DEBUG_SHIRQ 894#ifdef CONFIG_DEBUG_SHIRQ
786 /* 895 /*
787 * It's a shared IRQ -- the driver ought to be prepared for an IRQ 896 * It's a shared IRQ -- the driver ought to be prepared for an IRQ
@@ -797,6 +906,13 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
797 local_irq_restore(flags); 906 local_irq_restore(flags);
798 } 907 }
799#endif 908#endif
909
910 if (action->thread) {
911 if (!test_bit(IRQTF_DIED, &action->thread_flags))
912 kthread_stop(action->thread);
913 put_task_struct(action->thread);
914 }
915
800 return action; 916 return action;
801} 917}
802 918
@@ -829,7 +945,14 @@ EXPORT_SYMBOL_GPL(remove_irq);
829 */ 945 */
830void free_irq(unsigned int irq, void *dev_id) 946void free_irq(unsigned int irq, void *dev_id)
831{ 947{
948 struct irq_desc *desc = irq_to_desc(irq);
949
950 if (!desc)
951 return;
952
953 chip_bus_lock(irq, desc);
832 kfree(__free_irq(irq, dev_id)); 954 kfree(__free_irq(irq, dev_id));
955 chip_bus_sync_unlock(irq, desc);
833} 956}
834EXPORT_SYMBOL(free_irq); 957EXPORT_SYMBOL(free_irq);
835 958
@@ -838,6 +961,8 @@ EXPORT_SYMBOL(free_irq);
838 * @irq: Interrupt line to allocate 961 * @irq: Interrupt line to allocate
839 * @handler: Function to be called when the IRQ occurs. 962 * @handler: Function to be called when the IRQ occurs.
840 * Primary handler for threaded interrupts 963 * Primary handler for threaded interrupts
964 * If NULL and thread_fn != NULL the default
965 * primary handler is installed
841 * @thread_fn: Function called from the irq handler thread 966 * @thread_fn: Function called from the irq handler thread
842 * If NULL, no irq thread is created 967 * If NULL, no irq thread is created
843 * @irqflags: Interrupt type flags 968 * @irqflags: Interrupt type flags
@@ -917,8 +1042,12 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
917 1042
918 if (desc->status & IRQ_NOREQUEST) 1043 if (desc->status & IRQ_NOREQUEST)
919 return -EINVAL; 1044 return -EINVAL;
920 if (!handler) 1045
921 return -EINVAL; 1046 if (!handler) {
1047 if (!thread_fn)
1048 return -EINVAL;
1049 handler = irq_default_primary_handler;
1050 }
922 1051
923 action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); 1052 action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
924 if (!action) 1053 if (!action)
@@ -930,7 +1059,10 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
930 action->name = devname; 1059 action->name = devname;
931 action->dev_id = dev_id; 1060 action->dev_id = dev_id;
932 1061
1062 chip_bus_lock(irq, desc);
933 retval = __setup_irq(irq, desc, action); 1063 retval = __setup_irq(irq, desc, action);
1064 chip_bus_sync_unlock(irq, desc);
1065
934 if (retval) 1066 if (retval)
935 kfree(action); 1067 kfree(action);
936 1068
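
With IRQF_ONESHOT and the NULL-handler path above, a purely threaded handler can be requested for a level-triggered line: the default primary handler simply returns IRQ_WAKE_THREAD, the line stays masked until the thread is done, and irq_finalize_oneshot() unmasks it. A driver-side sketch; the device name, trigger flag and handler body are illustrative:

    #include <linux/interrupt.h>

    static irqreturn_t mydev_irq_thread(int irq, void *dev_id)
    {
        /* sleepable work: talk to the device over I2C/SPI, ack the source */
        return IRQ_HANDLED;
    }

    static int mydev_setup_irq(int irq, void *dev)
    {
        /*
         * NULL primary handler + IRQF_ONESHOT: the line is kept masked
         * until mydev_irq_thread() returns, so a level irq cannot storm.
         */
        return request_threaded_irq(irq, NULL, mydev_irq_thread,
                                    IRQF_TRIGGER_LOW | IRQF_ONESHOT,
                                    "mydev", dev);
    }
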
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index cfe767ca1545..fcb6c96f2627 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -45,7 +45,7 @@ void move_masked_irq(int irq)
45 < nr_cpu_ids)) 45 < nr_cpu_ids))
46 if (!desc->chip->set_affinity(irq, desc->pending_mask)) { 46 if (!desc->chip->set_affinity(irq, desc->pending_mask)) {
47 cpumask_copy(desc->affinity, desc->pending_mask); 47 cpumask_copy(desc->affinity, desc->pending_mask);
48 irq_set_thread_affinity(desc, desc->pending_mask); 48 irq_set_thread_affinity(desc);
49 } 49 }
50 50
51 cpumask_clear(desc->pending_mask); 51 cpumask_clear(desc->pending_mask);
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 2f69bee57bf2..3fd30197da2e 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -107,8 +107,8 @@ out_unlock:
107 107
108struct irq_desc *move_irq_desc(struct irq_desc *desc, int node) 108struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)
109{ 109{
110 /* those all static, do move them */ 110	/* those are static, or the target node is -1; do not move them */
111 if (desc->irq < NR_IRQS_LEGACY) 111 if (desc->irq < NR_IRQS_LEGACY || node == -1)
112 return desc; 112 return desc;
113 113
114 if (desc->node != node) 114 if (desc->node != node)
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 638d8bedec14..a0bb09e79867 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -15,10 +15,10 @@
15/** 15/**
16 * suspend_device_irqs - disable all currently enabled interrupt lines 16 * suspend_device_irqs - disable all currently enabled interrupt lines
17 * 17 *
18 * During system-wide suspend or hibernation device interrupts need to be 18 * During system-wide suspend or hibernation device drivers need to be prevented
19 * disabled at the chip level and this function is provided for this purpose. 19 * from receiving interrupts and this function is provided for this purpose.
20 * It disables all interrupt lines that are enabled at the moment and sets the 20 * It marks all interrupt lines in use, except for the timer ones, as disabled
21 * IRQ_SUSPENDED flag for them. 21 * and sets the IRQ_SUSPENDED flag for each of them.
22 */ 22 */
23void suspend_device_irqs(void) 23void suspend_device_irqs(void)
24{ 24{
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 89c7117acf2b..090c3763f3a2 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -70,8 +70,7 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
70 if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { 70 if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
71 desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; 71 desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY;
72 72
73 if (!desc->chip || !desc->chip->retrigger || 73 if (!desc->chip->retrigger || !desc->chip->retrigger(irq)) {
74 !desc->chip->retrigger(irq)) {
75#ifdef CONFIG_HARDIRQS_SW_RESEND 74#ifdef CONFIG_HARDIRQS_SW_RESEND
76 /* Set it pending and activate the softirq: */ 75 /* Set it pending and activate the softirq: */
77 set_bit(irq, irqs_resend); 76 set_bit(irq, irqs_resend);
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 4d568294de3e..114e704760fe 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -297,7 +297,6 @@ static int __init irqfixup_setup(char *str)
297 297
298__setup("irqfixup", irqfixup_setup); 298__setup("irqfixup", irqfixup_setup);
299module_param(irqfixup, int, 0644); 299module_param(irqfixup, int, 0644);
300MODULE_PARM_DESC("irqfixup", "0: No fixup, 1: irqfixup mode, 2: irqpoll mode");
301 300
302static int __init irqpoll_setup(char *str) 301static int __init irqpoll_setup(char *str)
303{ 302{
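
The removed MODULE_PARM_DESC line passed a string where the macro expects the bare parameter identifier, so it never produced a usable description; dropping it removes the malformed use while module_param() keeps the knob settable. For reference, a correctly paired declaration would look like this hedged sketch:

    #include <linux/moduleparam.h>

    static int irqfixup;

    module_param(irqfixup, int, 0644);
    /* first argument is the parameter name as an identifier, not a string */
    MODULE_PARM_DESC(irqfixup, "0: No fixup, 1: irqfixup mode, 2: irqpoll mode");
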
diff --git a/kernel/kexec.c b/kernel/kexec.c
index ae1c35201cc8..f336e2107f98 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1228,7 +1228,7 @@ static int __init parse_crashkernel_mem(char *cmdline,
1228 } while (*cur++ == ','); 1228 } while (*cur++ == ',');
1229 1229
1230 if (*crash_size > 0) { 1230 if (*crash_size > 0) {
1231 while (*cur != ' ' && *cur != '@') 1231 while (*cur && *cur != ' ' && *cur != '@')
1232 cur++; 1232 cur++;
1233 if (*cur == '@') { 1233 if (*cur == '@') {
1234 cur++; 1234 cur++;
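
The extra `*cur` test above keeps parse_crashkernel_mem() from scanning past the terminating NUL when crashkernel= is the last option on the command line. For illustration, an extended-syntax option of the kind this parser handles (the ranges, sizes and offset are arbitrary):

    crashkernel=512M-2G:64M,2G-:128M@16M
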
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 26539e3228e5..3765ff3c1bbe 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -117,7 +117,7 @@ EXPORT_SYMBOL(kfifo_free);
117 * writer, you don't need extra locking to use these functions. 117 * writer, you don't need extra locking to use these functions.
118 */ 118 */
119unsigned int __kfifo_put(struct kfifo *fifo, 119unsigned int __kfifo_put(struct kfifo *fifo,
120 unsigned char *buffer, unsigned int len) 120 const unsigned char *buffer, unsigned int len)
121{ 121{
122 unsigned int l; 122 unsigned int l;
123 123
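
With the const qualifier above, producers can hand read-only data to __kfifo_put() without casting away const. A small sketch, assuming a fifo already created with the 2.6.31-era locked-kfifo API (kfifo_alloc() with a caller-supplied spinlock):

    #include <linux/kfifo.h>
    #include <linux/string.h>

    /* hypothetical producer feeding a constant message into an existing fifo */
    static unsigned int log_to_fifo(struct kfifo *fifo, const char *msg)
    {
        return __kfifo_put(fifo, (const unsigned char *)msg, strlen(msg));
    }
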
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 7e95bedb2bfc..9fcb53a11f87 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -24,7 +24,6 @@
24#include <linux/unistd.h> 24#include <linux/unistd.h>
25#include <linux/kmod.h> 25#include <linux/kmod.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/mnt_namespace.h>
28#include <linux/completion.h> 27#include <linux/completion.h>
29#include <linux/file.h> 28#include <linux/file.h>
30#include <linux/fdtable.h> 29#include <linux/fdtable.h>
@@ -38,6 +37,8 @@
38#include <linux/suspend.h> 37#include <linux/suspend.h>
39#include <asm/uaccess.h> 38#include <asm/uaccess.h>
40 39
40#include <trace/events/module.h>
41
41extern int max_threads; 42extern int max_threads;
42 43
43static struct workqueue_struct *khelper_wq; 44static struct workqueue_struct *khelper_wq;
@@ -79,6 +80,10 @@ int __request_module(bool wait, const char *fmt, ...)
79#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ 80#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
80 static int kmod_loop_msg; 81 static int kmod_loop_msg;
81 82
83 ret = security_kernel_module_request();
84 if (ret)
85 return ret;
86
82 va_start(args, fmt); 87 va_start(args, fmt);
83 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); 88 ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
84 va_end(args); 89 va_end(args);
@@ -109,6 +114,8 @@ int __request_module(bool wait, const char *fmt, ...)
109 return -ENOMEM; 114 return -ENOMEM;
110 } 115 }
111 116
117 trace_module_request(module_name, wait, _RET_IP_);
118
112 ret = call_usermodehelper(modprobe_path, argv, envp, 119 ret = call_usermodehelper(modprobe_path, argv, envp,
113 wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC); 120 wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
114 atomic_dec(&kmod_concurrent); 121 atomic_dec(&kmod_concurrent);
@@ -463,6 +470,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info,
463 int retval = 0; 470 int retval = 0;
464 471
465 BUG_ON(atomic_read(&sub_info->cred->usage) != 1); 472 BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
473 validate_creds(sub_info->cred);
466 474
467 helper_lock(); 475 helper_lock();
468 if (sub_info->path[0] == '\0') 476 if (sub_info->path[0] == '\0')
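
Taken together, the kmod.c hunks order the new pieces as: the LSM veto comes first, then the existing argument setup and concurrency throttle, then the tracepoint fires immediately before the usermode helper is invoked. A rough userspace model of that control flow, with every kernel call replaced by a stub (the *_stub names and request_module_model() are invented for illustration):

#include <stdio.h>
#include <stdarg.h>

static int security_kernel_module_request_stub(void) { return 0; }	/* 0 = allowed */
static void trace_module_request_stub(const char *name, int wait)
{
	printf("trace: module_request %s wait=%d\n", name, wait);
}
static int call_usermodehelper_stub(const char *name)
{
	printf("exec: /sbin/modprobe %s\n", name);
	return 0;
}

static int request_module_model(int wait, const char *fmt, ...)
{
	char module_name[64];
	va_list args;
	int ret;

	ret = security_kernel_module_request_stub();	/* new: LSM may veto early */
	if (ret)
		return ret;

	va_start(args, fmt);
	vsnprintf(module_name, sizeof(module_name), fmt, args);
	va_end(args);

	trace_module_request_stub(module_name, wait);	/* new: tracepoint before exec */
	return call_usermodehelper_stub(module_name);
}

int main(void)
{
	return request_module_model(1, "fs-%s", "ext4");
}
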
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index c0fa54b276d9..ef177d653b2c 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -103,7 +103,7 @@ static struct kprobe_blackpoint kprobe_blacklist[] = {
103#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t))) 103#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
104 104
105struct kprobe_insn_page { 105struct kprobe_insn_page {
106 struct hlist_node hlist; 106 struct list_head list;
107 kprobe_opcode_t *insns; /* Page of instruction slots */ 107 kprobe_opcode_t *insns; /* Page of instruction slots */
108 char slot_used[INSNS_PER_PAGE]; 108 char slot_used[INSNS_PER_PAGE];
109 int nused; 109 int nused;
@@ -117,7 +117,7 @@ enum kprobe_slot_state {
117}; 117};
118 118
119static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */ 119static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */
120static struct hlist_head kprobe_insn_pages; 120static LIST_HEAD(kprobe_insn_pages);
121static int kprobe_garbage_slots; 121static int kprobe_garbage_slots;
122static int collect_garbage_slots(void); 122static int collect_garbage_slots(void);
123 123
@@ -152,10 +152,9 @@ loop_end:
152static kprobe_opcode_t __kprobes *__get_insn_slot(void) 152static kprobe_opcode_t __kprobes *__get_insn_slot(void)
153{ 153{
154 struct kprobe_insn_page *kip; 154 struct kprobe_insn_page *kip;
155 struct hlist_node *pos;
156 155
157 retry: 156 retry:
158 hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) { 157 list_for_each_entry(kip, &kprobe_insn_pages, list) {
159 if (kip->nused < INSNS_PER_PAGE) { 158 if (kip->nused < INSNS_PER_PAGE) {
160 int i; 159 int i;
161 for (i = 0; i < INSNS_PER_PAGE; i++) { 160 for (i = 0; i < INSNS_PER_PAGE; i++) {
@@ -189,8 +188,8 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(void)
189 kfree(kip); 188 kfree(kip);
190 return NULL; 189 return NULL;
191 } 190 }
192 INIT_HLIST_NODE(&kip->hlist); 191 INIT_LIST_HEAD(&kip->list);
193 hlist_add_head(&kip->hlist, &kprobe_insn_pages); 192 list_add(&kip->list, &kprobe_insn_pages);
194 memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE); 193 memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE);
195 kip->slot_used[0] = SLOT_USED; 194 kip->slot_used[0] = SLOT_USED;
196 kip->nused = 1; 195 kip->nused = 1;
@@ -219,12 +218,8 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
219 * so as not to have to set it up again the 218 * so as not to have to set it up again the
220 * next time somebody inserts a probe. 219 * next time somebody inserts a probe.
221 */ 220 */
222 hlist_del(&kip->hlist); 221 if (!list_is_singular(&kprobe_insn_pages)) {
223 if (hlist_empty(&kprobe_insn_pages)) { 222 list_del(&kip->list);
224 INIT_HLIST_NODE(&kip->hlist);
225 hlist_add_head(&kip->hlist,
226 &kprobe_insn_pages);
227 } else {
228 module_free(NULL, kip->insns); 223 module_free(NULL, kip->insns);
229 kfree(kip); 224 kfree(kip);
230 } 225 }
@@ -235,18 +230,13 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
235 230
236static int __kprobes collect_garbage_slots(void) 231static int __kprobes collect_garbage_slots(void)
237{ 232{
238 struct kprobe_insn_page *kip; 233 struct kprobe_insn_page *kip, *next;
239 struct hlist_node *pos, *next;
240 int safety;
241 234
242 /* Ensure no-one is preempted on the garbages */ 235 if (check_safety())

243 mutex_unlock(&kprobe_insn_mutex); 236 if (check_safety())
244 safety = check_safety();
245 mutex_lock(&kprobe_insn_mutex);
246 if (safety != 0)
247 return -EAGAIN; 237 return -EAGAIN;
248 238
249 hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) { 239 list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) {
250 int i; 240 int i;
251 if (kip->ngarbage == 0) 241 if (kip->ngarbage == 0)
252 continue; 242 continue;
@@ -264,19 +254,17 @@ static int __kprobes collect_garbage_slots(void)
264void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) 254void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
265{ 255{
266 struct kprobe_insn_page *kip; 256 struct kprobe_insn_page *kip;
267 struct hlist_node *pos;
268 257
269 mutex_lock(&kprobe_insn_mutex); 258 mutex_lock(&kprobe_insn_mutex);
270 hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) { 259 list_for_each_entry(kip, &kprobe_insn_pages, list) {
271 if (kip->insns <= slot && 260 if (kip->insns <= slot &&
272 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { 261 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) {
273 int i = (slot - kip->insns) / MAX_INSN_SIZE; 262 int i = (slot - kip->insns) / MAX_INSN_SIZE;
274 if (dirty) { 263 if (dirty) {
275 kip->slot_used[i] = SLOT_DIRTY; 264 kip->slot_used[i] = SLOT_DIRTY;
276 kip->ngarbage++; 265 kip->ngarbage++;
277 } else { 266 } else
278 collect_one_slot(kip, i); 267 collect_one_slot(kip, i);
279 }
280 break; 268 break;
281 } 269 }
282 } 270 }
@@ -698,7 +686,7 @@ int __kprobes register_kprobe(struct kprobe *p)
698 p->addr = addr; 686 p->addr = addr;
699 687
700 preempt_disable(); 688 preempt_disable();
701 if (!__kernel_text_address((unsigned long) p->addr) || 689 if (!kernel_text_address((unsigned long) p->addr) ||
702 in_kprobes_functions((unsigned long) p->addr)) { 690 in_kprobes_functions((unsigned long) p->addr)) {
703 preempt_enable(); 691 preempt_enable();
704 return -EINVAL; 692 return -EINVAL;
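
The kprobes changes are largely a mechanical hlist_head to list_head conversion for the instruction-slot page pool; the practical win is that list_for_each_entry() needs no separate cursor, so the "struct hlist_node *pos" locals disappear and collect_one_slot() can use list_is_singular() to keep one page cached. A short kernel-context sketch of the resulting lookup pattern (struct insn_page and find_page_with_room() are illustrative stand-ins, not the real kprobes code):

#include <linux/list.h>

struct insn_page {			/* stand-in for struct kprobe_insn_page */
	struct list_head list;
	int nused;
};

static LIST_HEAD(insn_pages);

static struct insn_page *find_page_with_room(int max_slots)
{
	struct insn_page *page;

	/* No extra cursor variable needed, unlike hlist_for_each_entry(). */
	list_for_each_entry(page, &insn_pages, list)
		if (page->nused < max_slots)
			return page;
	return NULL;
}
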
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 9b1a7de26979..5fe709982caa 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -16,8 +16,6 @@
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <trace/events/sched.h> 17#include <trace/events/sched.h>
18 18
19#define KTHREAD_NICE_LEVEL (-5)
20
21static DEFINE_SPINLOCK(kthread_create_lock); 19static DEFINE_SPINLOCK(kthread_create_lock);
22static LIST_HEAD(kthread_create_list); 20static LIST_HEAD(kthread_create_list);
23struct task_struct *kthreadd_task; 21struct task_struct *kthreadd_task;
@@ -145,7 +143,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
145 * The kernel thread should not inherit these properties. 143 * The kernel thread should not inherit these properties.
146 */ 144 */
147 sched_setscheduler_nocheck(create.result, SCHED_NORMAL, &param); 145 sched_setscheduler_nocheck(create.result, SCHED_NORMAL, &param);
148 set_user_nice(create.result, KTHREAD_NICE_LEVEL);
149 set_cpus_allowed_ptr(create.result, cpu_all_mask); 146 set_cpus_allowed_ptr(create.result, cpu_all_mask);
150 } 147 }
151 return create.result; 148 return create.result;
@@ -180,10 +177,12 @@ EXPORT_SYMBOL(kthread_bind);
180 * @k: thread created by kthread_create(). 177 * @k: thread created by kthread_create().
181 * 178 *
182 * Sets kthread_should_stop() for @k to return true, wakes it, and 179 * Sets kthread_should_stop() for @k to return true, wakes it, and
183 * waits for it to exit. Your threadfn() must not call do_exit() 180 * waits for it to exit. This can also be called after kthread_create()
184 * itself if you use this function! This can also be called after 181 * instead of calling wake_up_process(): the thread will exit without
185 * kthread_create() instead of calling wake_up_process(): the thread 182 * calling threadfn().
186 * will exit without calling threadfn(). 183 *
184 * If threadfn() may call do_exit() itself, the caller must ensure
185 * task_struct can't go away.
187 * 186 *
188 * Returns the result of threadfn(), or %-EINTR if wake_up_process() 187 * Returns the result of threadfn(), or %-EINTR if wake_up_process()
189 * was never called. 188 * was never called.
@@ -219,7 +218,6 @@ int kthreadd(void *unused)
219 /* Setup a clean context for our children to inherit. */ 218 /* Setup a clean context for our children to inherit. */
220 set_task_comm(tsk, "kthreadd"); 219 set_task_comm(tsk, "kthreadd");
221 ignore_signals(tsk); 220 ignore_signals(tsk);
222 set_user_nice(tsk, KTHREAD_NICE_LEVEL);
223 set_cpus_allowed_ptr(tsk, cpu_all_mask); 221 set_cpus_allowed_ptr(tsk, cpu_all_mask);
224 set_mems_allowed(node_possible_map); 222 set_mems_allowed(node_possible_map);
225 223
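
The reworded kthread_stop() comment describes the usual lifecycle: the thread polls kthread_should_stop() and simply returns, and the stopper wakes it and reaps that return value. A hedged, illustrative kernel-style sketch of that convention (worker_fn and the start/stop helpers are invented; the kthread_* and msleep() calls are real API):

#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/err.h>

static int worker_fn(void *data)
{
	while (!kthread_should_stop())
		msleep(100);		/* periodic work would go here */
	return 0;			/* this is what kthread_stop() reports */
}

static struct task_struct *worker;

static int start_worker(void)
{
	worker = kthread_run(worker_fn, NULL, "sketch-worker");
	return IS_ERR(worker) ? PTR_ERR(worker) : 0;
}

static void stop_worker(void)
{
	kthread_stop(worker);		/* wakes the thread, waits for its return */
}
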
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 8bbeef996c76..f74d2d7aa605 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -42,6 +42,7 @@
42#include <linux/hash.h> 42#include <linux/hash.h>
43#include <linux/ftrace.h> 43#include <linux/ftrace.h>
44#include <linux/stringify.h> 44#include <linux/stringify.h>
45#include <linux/bitops.h>
45 46
46#include <asm/sections.h> 47#include <asm/sections.h>
47 48
@@ -366,11 +367,21 @@ static int save_trace(struct stack_trace *trace)
366 367
367 save_stack_trace(trace); 368 save_stack_trace(trace);
368 369
370 /*
371 * Some daft arches put -1 at the end to indicate it's a full trace.
372 *
373 * <rant> this is buggy anyway, since it takes a whole extra entry so a
374 * complete trace that maxes out the entries provided will be reported
375 * as incomplete, friggin useless </rant>
376 */
377 if (trace->entries[trace->nr_entries-1] == ULONG_MAX)
378 trace->nr_entries--;
379
369 trace->max_entries = trace->nr_entries; 380 trace->max_entries = trace->nr_entries;
370 381
371 nr_stack_trace_entries += trace->nr_entries; 382 nr_stack_trace_entries += trace->nr_entries;
372 383
373 if (nr_stack_trace_entries == MAX_STACK_TRACE_ENTRIES) { 384 if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) {
374 if (!debug_locks_off_graph_unlock()) 385 if (!debug_locks_off_graph_unlock())
375 return 0; 386 return 0;
376 387
@@ -388,20 +399,6 @@ unsigned int nr_hardirq_chains;
388unsigned int nr_softirq_chains; 399unsigned int nr_softirq_chains;
389unsigned int nr_process_chains; 400unsigned int nr_process_chains;
390unsigned int max_lockdep_depth; 401unsigned int max_lockdep_depth;
391unsigned int max_recursion_depth;
392
393static unsigned int lockdep_dependency_gen_id;
394
395static bool lockdep_dependency_visit(struct lock_class *source,
396 unsigned int depth)
397{
398 if (!depth)
399 lockdep_dependency_gen_id++;
400 if (source->dep_gen_id == lockdep_dependency_gen_id)
401 return true;
402 source->dep_gen_id = lockdep_dependency_gen_id;
403 return false;
404}
405 402
406#ifdef CONFIG_DEBUG_LOCKDEP 403#ifdef CONFIG_DEBUG_LOCKDEP
407/* 404/*
@@ -431,11 +428,8 @@ atomic_t redundant_softirqs_on;
431atomic_t redundant_softirqs_off; 428atomic_t redundant_softirqs_off;
432atomic_t nr_unused_locks; 429atomic_t nr_unused_locks;
433atomic_t nr_cyclic_checks; 430atomic_t nr_cyclic_checks;
434atomic_t nr_cyclic_check_recursions;
435atomic_t nr_find_usage_forwards_checks; 431atomic_t nr_find_usage_forwards_checks;
436atomic_t nr_find_usage_forwards_recursions;
437atomic_t nr_find_usage_backwards_checks; 432atomic_t nr_find_usage_backwards_checks;
438atomic_t nr_find_usage_backwards_recursions;
439#endif 433#endif
440 434
441/* 435/*
@@ -551,58 +545,6 @@ static void lockdep_print_held_locks(struct task_struct *curr)
551 } 545 }
552} 546}
553 547
554static void print_lock_class_header(struct lock_class *class, int depth)
555{
556 int bit;
557
558 printk("%*s->", depth, "");
559 print_lock_name(class);
560 printk(" ops: %lu", class->ops);
561 printk(" {\n");
562
563 for (bit = 0; bit < LOCK_USAGE_STATES; bit++) {
564 if (class->usage_mask & (1 << bit)) {
565 int len = depth;
566
567 len += printk("%*s %s", depth, "", usage_str[bit]);
568 len += printk(" at:\n");
569 print_stack_trace(class->usage_traces + bit, len);
570 }
571 }
572 printk("%*s }\n", depth, "");
573
574 printk("%*s ... key at: ",depth,"");
575 print_ip_sym((unsigned long)class->key);
576}
577
578/*
579 * printk all lock dependencies starting at <entry>:
580 */
581static void __used
582print_lock_dependencies(struct lock_class *class, int depth)
583{
584 struct lock_list *entry;
585
586 if (lockdep_dependency_visit(class, depth))
587 return;
588
589 if (DEBUG_LOCKS_WARN_ON(depth >= 20))
590 return;
591
592 print_lock_class_header(class, depth);
593
594 list_for_each_entry(entry, &class->locks_after, entry) {
595 if (DEBUG_LOCKS_WARN_ON(!entry->class))
596 return;
597
598 print_lock_dependencies(entry->class, depth + 1);
599
600 printk("%*s ... acquired at:\n",depth,"");
601 print_stack_trace(&entry->trace, 2);
602 printk("\n");
603 }
604}
605
606static void print_kernel_version(void) 548static void print_kernel_version(void)
607{ 549{
608 printk("%s %.*s\n", init_utsname()->release, 550 printk("%s %.*s\n", init_utsname()->release,
@@ -898,22 +840,203 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
898} 840}
899 841
900/* 842/*
843 * For good efficiency of the modulo operation, we use a power of 2
844 */
845#define MAX_CIRCULAR_QUEUE_SIZE 4096UL
846#define CQ_MASK (MAX_CIRCULAR_QUEUE_SIZE-1)
847
848/*
849 * The circular_queue and helpers are used to implement the
850 * breadth-first search (BFS) algorithm, by which we can build
851 * the shortest path from the next lock to be acquired to a
852 * previously held lock if there is a circular dependency between them.
853 */
854struct circular_queue {
855 unsigned long element[MAX_CIRCULAR_QUEUE_SIZE];
856 unsigned int front, rear;
857};
858
859static struct circular_queue lock_cq;
860
861unsigned int max_bfs_queue_depth;
862
863static unsigned int lockdep_dependency_gen_id;
864
865static inline void __cq_init(struct circular_queue *cq)
866{
867 cq->front = cq->rear = 0;
868 lockdep_dependency_gen_id++;
869}
870
871static inline int __cq_empty(struct circular_queue *cq)
872{
873 return (cq->front == cq->rear);
874}
875
876static inline int __cq_full(struct circular_queue *cq)
877{
878 return ((cq->rear + 1) & CQ_MASK) == cq->front;
879}
880
881static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem)
882{
883 if (__cq_full(cq))
884 return -1;
885
886 cq->element[cq->rear] = elem;
887 cq->rear = (cq->rear + 1) & CQ_MASK;
888 return 0;
889}
890
891static inline int __cq_dequeue(struct circular_queue *cq, unsigned long *elem)
892{
893 if (__cq_empty(cq))
894 return -1;
895
896 *elem = cq->element[cq->front];
897 cq->front = (cq->front + 1) & CQ_MASK;
898 return 0;
899}
900
901static inline unsigned int __cq_get_elem_count(struct circular_queue *cq)
902{
903 return (cq->rear - cq->front) & CQ_MASK;
904}
905
906static inline void mark_lock_accessed(struct lock_list *lock,
907 struct lock_list *parent)
908{
909 unsigned long nr;
910
911 nr = lock - list_entries;
912 WARN_ON(nr >= nr_list_entries);
913 lock->parent = parent;
914 lock->class->dep_gen_id = lockdep_dependency_gen_id;
915}
916
917static inline unsigned long lock_accessed(struct lock_list *lock)
918{
919 unsigned long nr;
920
921 nr = lock - list_entries;
922 WARN_ON(nr >= nr_list_entries);
923 return lock->class->dep_gen_id == lockdep_dependency_gen_id;
924}
925
926static inline struct lock_list *get_lock_parent(struct lock_list *child)
927{
928 return child->parent;
929}
930
931static inline int get_lock_depth(struct lock_list *child)
932{
933 int depth = 0;
934 struct lock_list *parent;
935
936 while ((parent = get_lock_parent(child))) {
937 child = parent;
938 depth++;
939 }
940 return depth;
941}
942
943static int __bfs(struct lock_list *source_entry,
944 void *data,
945 int (*match)(struct lock_list *entry, void *data),
946 struct lock_list **target_entry,
947 int forward)
948{
949 struct lock_list *entry;
950 struct list_head *head;
951 struct circular_queue *cq = &lock_cq;
952 int ret = 1;
953
954 if (match(source_entry, data)) {
955 *target_entry = source_entry;
956 ret = 0;
957 goto exit;
958 }
959
960 if (forward)
961 head = &source_entry->class->locks_after;
962 else
963 head = &source_entry->class->locks_before;
964
965 if (list_empty(head))
966 goto exit;
967
968 __cq_init(cq);
969 __cq_enqueue(cq, (unsigned long)source_entry);
970
971 while (!__cq_empty(cq)) {
972 struct lock_list *lock;
973
974 __cq_dequeue(cq, (unsigned long *)&lock);
975
976 if (!lock->class) {
977 ret = -2;
978 goto exit;
979 }
980
981 if (forward)
982 head = &lock->class->locks_after;
983 else
984 head = &lock->class->locks_before;
985
986 list_for_each_entry(entry, head, entry) {
987 if (!lock_accessed(entry)) {
988 unsigned int cq_depth;
989 mark_lock_accessed(entry, lock);
990 if (match(entry, data)) {
991 *target_entry = entry;
992 ret = 0;
993 goto exit;
994 }
995
996 if (__cq_enqueue(cq, (unsigned long)entry)) {
997 ret = -1;
998 goto exit;
999 }
1000 cq_depth = __cq_get_elem_count(cq);
1001 if (max_bfs_queue_depth < cq_depth)
1002 max_bfs_queue_depth = cq_depth;
1003 }
1004 }
1005 }
1006exit:
1007 return ret;
1008}
1009
1010static inline int __bfs_forwards(struct lock_list *src_entry,
1011 void *data,
1012 int (*match)(struct lock_list *entry, void *data),
1013 struct lock_list **target_entry)
1014{
1015 return __bfs(src_entry, data, match, target_entry, 1);
1016
1017}
1018
1019static inline int __bfs_backwards(struct lock_list *src_entry,
1020 void *data,
1021 int (*match)(struct lock_list *entry, void *data),
1022 struct lock_list **target_entry)
1023{
1024 return __bfs(src_entry, data, match, target_entry, 0);
1025
1026}
1027
1028/*
901 * Recursive, forwards-direction lock-dependency checking, used for 1029 * Recursive, forwards-direction lock-dependency checking, used for
902 * both noncyclic checking and for hardirq-unsafe/softirq-unsafe 1030 * both noncyclic checking and for hardirq-unsafe/softirq-unsafe
903 * checking. 1031 * checking.
904 *
905 * (to keep the stackframe of the recursive functions small we
906 * use these global variables, and we also mark various helper
907 * functions as noinline.)
908 */ 1032 */
909static struct held_lock *check_source, *check_target;
910 1033
911/* 1034/*
912 * Print a dependency chain entry (this is only done when a deadlock 1035 * Print a dependency chain entry (this is only done when a deadlock
913 * has been detected): 1036 * has been detected):
914 */ 1037 */
915static noinline int 1038static noinline int
916print_circular_bug_entry(struct lock_list *target, unsigned int depth) 1039print_circular_bug_entry(struct lock_list *target, int depth)
917{ 1040{
918 if (debug_locks_silent) 1041 if (debug_locks_silent)
919 return 0; 1042 return 0;
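
The heart of the new dependency walker is this fixed-size, power-of-two circular queue driving an iterative BFS, replacing the old depth-bounded recursion. The following standalone C model mirrors the __cq_*() helpers added above, with a smaller queue and plain unsigned long payloads, so the masking arithmetic can be run and inspected in userspace:

#include <stdio.h>

#define QUEUE_SIZE	8UL			/* must stay a power of two */
#define QUEUE_MASK	(QUEUE_SIZE - 1)

struct circular_queue {
	unsigned long element[QUEUE_SIZE];
	unsigned int front, rear;
};

static void cq_init(struct circular_queue *cq)  { cq->front = cq->rear = 0; }
static int  cq_empty(struct circular_queue *cq) { return cq->front == cq->rear; }
static int  cq_full(struct circular_queue *cq)
{
	return ((cq->rear + 1) & QUEUE_MASK) == cq->front;	/* one slot kept free */
}

static int cq_enqueue(struct circular_queue *cq, unsigned long elem)
{
	if (cq_full(cq))
		return -1;
	cq->element[cq->rear] = elem;
	cq->rear = (cq->rear + 1) & QUEUE_MASK;			/* wrap via masking */
	return 0;
}

static int cq_dequeue(struct circular_queue *cq, unsigned long *elem)
{
	if (cq_empty(cq))
		return -1;
	*elem = cq->element[cq->front];
	cq->front = (cq->front + 1) & QUEUE_MASK;
	return 0;
}

static unsigned int cq_count(struct circular_queue *cq)
{
	return (cq->rear - cq->front) & QUEUE_MASK;
}

int main(void)
{
	struct circular_queue cq;
	unsigned long v;

	cq_init(&cq);
	for (v = 0; v < 10; v++)		/* the last three fail: the queue holds 7 */
		printf("enqueue %lu -> %d\n", v, cq_enqueue(&cq, v));
	printf("count: %u\n", cq_count(&cq));
	while (!cq_dequeue(&cq, &v))		/* drains in FIFO order */
		printf("dequeue %lu\n", v);
	return 0;
}

Keeping the size a power of two lets both the wrap-around and the element count reduce to a single AND with the mask, and reserving one slot distinguishes "full" from "empty" without a separate counter.
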
@@ -930,11 +1053,13 @@ print_circular_bug_entry(struct lock_list *target, unsigned int depth)
930 * header first: 1053 * header first:
931 */ 1054 */
932static noinline int 1055static noinline int
933print_circular_bug_header(struct lock_list *entry, unsigned int depth) 1056print_circular_bug_header(struct lock_list *entry, unsigned int depth,
1057 struct held_lock *check_src,
1058 struct held_lock *check_tgt)
934{ 1059{
935 struct task_struct *curr = current; 1060 struct task_struct *curr = current;
936 1061
937 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 1062 if (debug_locks_silent)
938 return 0; 1063 return 0;
939 1064
940 printk("\n=======================================================\n"); 1065 printk("\n=======================================================\n");
@@ -943,9 +1068,9 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth)
943 printk( "-------------------------------------------------------\n"); 1068 printk( "-------------------------------------------------------\n");
944 printk("%s/%d is trying to acquire lock:\n", 1069 printk("%s/%d is trying to acquire lock:\n",
945 curr->comm, task_pid_nr(curr)); 1070 curr->comm, task_pid_nr(curr));
946 print_lock(check_source); 1071 print_lock(check_src);
947 printk("\nbut task is already holding lock:\n"); 1072 printk("\nbut task is already holding lock:\n");
948 print_lock(check_target); 1073 print_lock(check_tgt);
949 printk("\nwhich lock already depends on the new lock.\n\n"); 1074 printk("\nwhich lock already depends on the new lock.\n\n");
950 printk("\nthe existing dependency chain (in reverse order) is:\n"); 1075 printk("\nthe existing dependency chain (in reverse order) is:\n");
951 1076
@@ -954,19 +1079,36 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth)
954 return 0; 1079 return 0;
955} 1080}
956 1081
957static noinline int print_circular_bug_tail(void) 1082static inline int class_equal(struct lock_list *entry, void *data)
1083{
1084 return entry->class == data;
1085}
1086
1087static noinline int print_circular_bug(struct lock_list *this,
1088 struct lock_list *target,
1089 struct held_lock *check_src,
1090 struct held_lock *check_tgt)
958{ 1091{
959 struct task_struct *curr = current; 1092 struct task_struct *curr = current;
960 struct lock_list this; 1093 struct lock_list *parent;
1094 int depth;
961 1095
962 if (debug_locks_silent) 1096 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
963 return 0; 1097 return 0;
964 1098
965 this.class = hlock_class(check_source); 1099 if (!save_trace(&this->trace))
966 if (!save_trace(&this.trace))
967 return 0; 1100 return 0;
968 1101
969 print_circular_bug_entry(&this, 0); 1102 depth = get_lock_depth(target);
1103
1104 print_circular_bug_header(target, depth, check_src, check_tgt);
1105
1106 parent = get_lock_parent(target);
1107
1108 while (parent) {
1109 print_circular_bug_entry(parent, --depth);
1110 parent = get_lock_parent(parent);
1111 }
970 1112
971 printk("\nother info that might help us debug this:\n\n"); 1113 printk("\nother info that might help us debug this:\n\n");
972 lockdep_print_held_locks(curr); 1114 lockdep_print_held_locks(curr);
@@ -977,73 +1119,69 @@ static noinline int print_circular_bug_tail(void)
977 return 0; 1119 return 0;
978} 1120}
979 1121
980#define RECURSION_LIMIT 40 1122static noinline int print_bfs_bug(int ret)
981
982static int noinline print_infinite_recursion_bug(void)
983{ 1123{
984 if (!debug_locks_off_graph_unlock()) 1124 if (!debug_locks_off_graph_unlock())
985 return 0; 1125 return 0;
986 1126
987 WARN_ON(1); 1127 WARN(1, "lockdep bfs error:%d\n", ret);
988 1128
989 return 0; 1129 return 0;
990} 1130}
991 1131
992unsigned long __lockdep_count_forward_deps(struct lock_class *class, 1132static int noop_count(struct lock_list *entry, void *data)
993 unsigned int depth)
994{ 1133{
995 struct lock_list *entry; 1134 (*(unsigned long *)data)++;
996 unsigned long ret = 1; 1135 return 0;
1136}
997 1137
998 if (lockdep_dependency_visit(class, depth)) 1138unsigned long __lockdep_count_forward_deps(struct lock_list *this)
999 return 0; 1139{
1140 unsigned long count = 0;
1141 struct lock_list *uninitialized_var(target_entry);
1000 1142
1001 /* 1143 __bfs_forwards(this, (void *)&count, noop_count, &target_entry);
1002 * Recurse this class's dependency list:
1003 */
1004 list_for_each_entry(entry, &class->locks_after, entry)
1005 ret += __lockdep_count_forward_deps(entry->class, depth + 1);
1006 1144
1007 return ret; 1145 return count;
1008} 1146}
1009
1010unsigned long lockdep_count_forward_deps(struct lock_class *class) 1147unsigned long lockdep_count_forward_deps(struct lock_class *class)
1011{ 1148{
1012 unsigned long ret, flags; 1149 unsigned long ret, flags;
1150 struct lock_list this;
1151
1152 this.parent = NULL;
1153 this.class = class;
1013 1154
1014 local_irq_save(flags); 1155 local_irq_save(flags);
1015 __raw_spin_lock(&lockdep_lock); 1156 __raw_spin_lock(&lockdep_lock);
1016 ret = __lockdep_count_forward_deps(class, 0); 1157 ret = __lockdep_count_forward_deps(&this);
1017 __raw_spin_unlock(&lockdep_lock); 1158 __raw_spin_unlock(&lockdep_lock);
1018 local_irq_restore(flags); 1159 local_irq_restore(flags);
1019 1160
1020 return ret; 1161 return ret;
1021} 1162}
1022 1163
1023unsigned long __lockdep_count_backward_deps(struct lock_class *class, 1164unsigned long __lockdep_count_backward_deps(struct lock_list *this)
1024 unsigned int depth)
1025{ 1165{
1026 struct lock_list *entry; 1166 unsigned long count = 0;
1027 unsigned long ret = 1; 1167 struct lock_list *uninitialized_var(target_entry);
1028 1168
1029 if (lockdep_dependency_visit(class, depth)) 1169 __bfs_backwards(this, (void *)&count, noop_count, &target_entry);
1030 return 0;
1031 /*
1032 * Recurse this class's dependency list:
1033 */
1034 list_for_each_entry(entry, &class->locks_before, entry)
1035 ret += __lockdep_count_backward_deps(entry->class, depth + 1);
1036 1170
1037 return ret; 1171 return count;
1038} 1172}
1039 1173
1040unsigned long lockdep_count_backward_deps(struct lock_class *class) 1174unsigned long lockdep_count_backward_deps(struct lock_class *class)
1041{ 1175{
1042 unsigned long ret, flags; 1176 unsigned long ret, flags;
1177 struct lock_list this;
1178
1179 this.parent = NULL;
1180 this.class = class;
1043 1181
1044 local_irq_save(flags); 1182 local_irq_save(flags);
1045 __raw_spin_lock(&lockdep_lock); 1183 __raw_spin_lock(&lockdep_lock);
1046 ret = __lockdep_count_backward_deps(class, 0); 1184 ret = __lockdep_count_backward_deps(&this);
1047 __raw_spin_unlock(&lockdep_lock); 1185 __raw_spin_unlock(&lockdep_lock);
1048 local_irq_restore(flags); 1186 local_irq_restore(flags);
1049 1187
@@ -1055,29 +1193,16 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class)
1055 * lead to <target>. Print an error and return 0 if it does. 1193 * lead to <target>. Print an error and return 0 if it does.
1056 */ 1194 */
1057static noinline int 1195static noinline int
1058check_noncircular(struct lock_class *source, unsigned int depth) 1196check_noncircular(struct lock_list *root, struct lock_class *target,
1197 struct lock_list **target_entry)
1059{ 1198{
1060 struct lock_list *entry; 1199 int result;
1061 1200
1062 if (lockdep_dependency_visit(source, depth)) 1201 debug_atomic_inc(&nr_cyclic_checks);
1063 return 1;
1064 1202
1065 debug_atomic_inc(&nr_cyclic_check_recursions); 1203 result = __bfs_forwards(root, target, class_equal, target_entry);
1066 if (depth > max_recursion_depth) 1204
1067 max_recursion_depth = depth; 1205 return result;
1068 if (depth >= RECURSION_LIMIT)
1069 return print_infinite_recursion_bug();
1070 /*
1071 * Check this lock's dependency list:
1072 */
1073 list_for_each_entry(entry, &source->locks_after, entry) {
1074 if (entry->class == hlock_class(check_target))
1075 return print_circular_bug_header(entry, depth+1);
1076 debug_atomic_inc(&nr_cyclic_checks);
1077 if (!check_noncircular(entry->class, depth+1))
1078 return print_circular_bug_entry(entry, depth+1);
1079 }
1080 return 1;
1081} 1206}
1082 1207
1083#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) 1208#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
@@ -1086,103 +1211,121 @@ check_noncircular(struct lock_class *source, unsigned int depth)
1086 * proving that two subgraphs can be connected by a new dependency 1211 * proving that two subgraphs can be connected by a new dependency
1087 * without creating any illegal irq-safe -> irq-unsafe lock dependency. 1212 * without creating any illegal irq-safe -> irq-unsafe lock dependency.
1088 */ 1213 */
1089static enum lock_usage_bit find_usage_bit; 1214
1090static struct lock_class *forwards_match, *backwards_match; 1215static inline int usage_match(struct lock_list *entry, void *bit)
1216{
1217 return entry->class->usage_mask & (1 << (enum lock_usage_bit)bit);
1218}
1219
1220
1091 1221
1092/* 1222/*
1093 * Find a node in the forwards-direction dependency sub-graph starting 1223 * Find a node in the forwards-direction dependency sub-graph starting
1094 * at <source> that matches <find_usage_bit>. 1224 * at @root->class that matches @bit.
1095 * 1225 *
1096 * Return 2 if such a node exists in the subgraph, and put that node 1226 * Return 0 if such a node exists in the subgraph, and put that node
1097 * into <forwards_match>. 1227 * into *@target_entry.
1098 * 1228 *
1099 * Return 1 otherwise and keep <forwards_match> unchanged. 1229 * Return 1 otherwise and keep *@target_entry unchanged.
1100 * Return 0 on error. 1230 * Return <0 on error.
1101 */ 1231 */
1102static noinline int 1232static int
1103find_usage_forwards(struct lock_class *source, unsigned int depth) 1233find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit,
1234 struct lock_list **target_entry)
1104{ 1235{
1105 struct lock_list *entry; 1236 int result;
1106 int ret;
1107
1108 if (lockdep_dependency_visit(source, depth))
1109 return 1;
1110
1111 if (depth > max_recursion_depth)
1112 max_recursion_depth = depth;
1113 if (depth >= RECURSION_LIMIT)
1114 return print_infinite_recursion_bug();
1115 1237
1116 debug_atomic_inc(&nr_find_usage_forwards_checks); 1238 debug_atomic_inc(&nr_find_usage_forwards_checks);
1117 if (source->usage_mask & (1 << find_usage_bit)) {
1118 forwards_match = source;
1119 return 2;
1120 }
1121 1239
1122 /* 1240 result = __bfs_forwards(root, (void *)bit, usage_match, target_entry);
1123 * Check this lock's dependency list: 1241
1124 */ 1242 return result;
1125 list_for_each_entry(entry, &source->locks_after, entry) {
1126 debug_atomic_inc(&nr_find_usage_forwards_recursions);
1127 ret = find_usage_forwards(entry->class, depth+1);
1128 if (ret == 2 || ret == 0)
1129 return ret;
1130 }
1131 return 1;
1132} 1243}
1133 1244
1134/* 1245/*
1135 * Find a node in the backwards-direction dependency sub-graph starting 1246 * Find a node in the backwards-direction dependency sub-graph starting
1136 * at <source> that matches <find_usage_bit>. 1247 * at @root->class that matches @bit.
1137 * 1248 *
1138 * Return 2 if such a node exists in the subgraph, and put that node 1249 * Return 0 if such a node exists in the subgraph, and put that node
1139 * into <backwards_match>. 1250 * into *@target_entry.
1140 * 1251 *
1141 * Return 1 otherwise and keep <backwards_match> unchanged. 1252 * Return 1 otherwise and keep *@target_entry unchanged.
1142 * Return 0 on error. 1253 * Return <0 on error.
1143 */ 1254 */
1144static noinline int 1255static int
1145find_usage_backwards(struct lock_class *source, unsigned int depth) 1256find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit,
1257 struct lock_list **target_entry)
1146{ 1258{
1147 struct lock_list *entry; 1259 int result;
1148 int ret;
1149 1260
1150 if (lockdep_dependency_visit(source, depth)) 1261 debug_atomic_inc(&nr_find_usage_backwards_checks);
1151 return 1;
1152 1262
1153 if (!__raw_spin_is_locked(&lockdep_lock)) 1263 result = __bfs_backwards(root, (void *)bit, usage_match, target_entry);
1154 return DEBUG_LOCKS_WARN_ON(1);
1155 1264
1156 if (depth > max_recursion_depth) 1265 return result;
1157 max_recursion_depth = depth; 1266}
1158 if (depth >= RECURSION_LIMIT)
1159 return print_infinite_recursion_bug();
1160 1267
1161 debug_atomic_inc(&nr_find_usage_backwards_checks); 1268static void print_lock_class_header(struct lock_class *class, int depth)
1162 if (source->usage_mask & (1 << find_usage_bit)) { 1269{
1163 backwards_match = source; 1270 int bit;
1164 return 2;
1165 }
1166 1271
1167 if (!source && debug_locks_off_graph_unlock()) { 1272 printk("%*s->", depth, "");
1168 WARN_ON(1); 1273 print_lock_name(class);
1169 return 0; 1274 printk(" ops: %lu", class->ops);
1170 } 1275 printk(" {\n");
1171 1276
1172 /* 1277 for (bit = 0; bit < LOCK_USAGE_STATES; bit++) {
1173 * Check this lock's dependency list: 1278 if (class->usage_mask & (1 << bit)) {
1174 */ 1279 int len = depth;
1175 list_for_each_entry(entry, &source->locks_before, entry) { 1280
1176 debug_atomic_inc(&nr_find_usage_backwards_recursions); 1281 len += printk("%*s %s", depth, "", usage_str[bit]);
1177 ret = find_usage_backwards(entry->class, depth+1); 1282 len += printk(" at:\n");
1178 if (ret == 2 || ret == 0) 1283 print_stack_trace(class->usage_traces + bit, len);
1179 return ret; 1284 }
1180 } 1285 }
1181 return 1; 1286 printk("%*s }\n", depth, "");
1287
1288 printk("%*s ... key at: ",depth,"");
1289 print_ip_sym((unsigned long)class->key);
1290}
1291
1292/*
1293 * printk the shortest lock dependencies from @start to @end in reverse order:
1294 */
1295static void __used
1296print_shortest_lock_dependencies(struct lock_list *leaf,
1297 struct lock_list *root)
1298{
1299 struct lock_list *entry = leaf;
1300 int depth;
1301
1302 /* compute depth from the tree generated by BFS */
1303 depth = get_lock_depth(leaf);
1304
1305 do {
1306 print_lock_class_header(entry->class, depth);
1307 printk("%*s ... acquired at:\n", depth, "");
1308 print_stack_trace(&entry->trace, 2);
1309 printk("\n");
1310
1311 if (depth == 0 && (entry != root)) {
1312 printk("lockdep:%s bad BFS generated tree\n", __func__);
1313 break;
1314 }
1315
1316 entry = get_lock_parent(entry);
1317 depth--;
1318 } while (entry && (depth >= 0));
1319
1320 return;
1182} 1321}
1183 1322
1184static int 1323static int
1185print_bad_irq_dependency(struct task_struct *curr, 1324print_bad_irq_dependency(struct task_struct *curr,
1325 struct lock_list *prev_root,
1326 struct lock_list *next_root,
1327 struct lock_list *backwards_entry,
1328 struct lock_list *forwards_entry,
1186 struct held_lock *prev, 1329 struct held_lock *prev,
1187 struct held_lock *next, 1330 struct held_lock *next,
1188 enum lock_usage_bit bit1, 1331 enum lock_usage_bit bit1,
@@ -1215,26 +1358,32 @@ print_bad_irq_dependency(struct task_struct *curr,
1215 1358
1216 printk("\nbut this new dependency connects a %s-irq-safe lock:\n", 1359 printk("\nbut this new dependency connects a %s-irq-safe lock:\n",
1217 irqclass); 1360 irqclass);
1218 print_lock_name(backwards_match); 1361 print_lock_name(backwards_entry->class);
1219 printk("\n... which became %s-irq-safe at:\n", irqclass); 1362 printk("\n... which became %s-irq-safe at:\n", irqclass);
1220 1363
1221 print_stack_trace(backwards_match->usage_traces + bit1, 1); 1364 print_stack_trace(backwards_entry->class->usage_traces + bit1, 1);
1222 1365
1223 printk("\nto a %s-irq-unsafe lock:\n", irqclass); 1366 printk("\nto a %s-irq-unsafe lock:\n", irqclass);
1224 print_lock_name(forwards_match); 1367 print_lock_name(forwards_entry->class);
1225 printk("\n... which became %s-irq-unsafe at:\n", irqclass); 1368 printk("\n... which became %s-irq-unsafe at:\n", irqclass);
1226 printk("..."); 1369 printk("...");
1227 1370
1228 print_stack_trace(forwards_match->usage_traces + bit2, 1); 1371 print_stack_trace(forwards_entry->class->usage_traces + bit2, 1);
1229 1372
1230 printk("\nother info that might help us debug this:\n\n"); 1373 printk("\nother info that might help us debug this:\n\n");
1231 lockdep_print_held_locks(curr); 1374 lockdep_print_held_locks(curr);
1232 1375
1233 printk("\nthe %s-irq-safe lock's dependencies:\n", irqclass); 1376 printk("\nthe dependencies between %s-irq-safe lock", irqclass);
1234 print_lock_dependencies(backwards_match, 0); 1377 printk(" and the holding lock:\n");
1378 if (!save_trace(&prev_root->trace))
1379 return 0;
1380 print_shortest_lock_dependencies(backwards_entry, prev_root);
1235 1381
1236 printk("\nthe %s-irq-unsafe lock's dependencies:\n", irqclass); 1382 printk("\nthe dependencies between the lock to be acquired");
1237 print_lock_dependencies(forwards_match, 0); 1383 printk(" and %s-irq-unsafe lock:\n", irqclass);
1384 if (!save_trace(&next_root->trace))
1385 return 0;
1386 print_shortest_lock_dependencies(forwards_entry, next_root);
1238 1387
1239 printk("\nstack backtrace:\n"); 1388 printk("\nstack backtrace:\n");
1240 dump_stack(); 1389 dump_stack();
@@ -1248,19 +1397,30 @@ check_usage(struct task_struct *curr, struct held_lock *prev,
1248 enum lock_usage_bit bit_forwards, const char *irqclass) 1397 enum lock_usage_bit bit_forwards, const char *irqclass)
1249{ 1398{
1250 int ret; 1399 int ret;
1400 struct lock_list this, that;
1401 struct lock_list *uninitialized_var(target_entry);
1402 struct lock_list *uninitialized_var(target_entry1);
1251 1403
1252 find_usage_bit = bit_backwards; 1404 this.parent = NULL;
1253 /* fills in <backwards_match> */ 1405
1254 ret = find_usage_backwards(hlock_class(prev), 0); 1406 this.class = hlock_class(prev);
1255 if (!ret || ret == 1) 1407 ret = find_usage_backwards(&this, bit_backwards, &target_entry);
1408 if (ret < 0)
1409 return print_bfs_bug(ret);
1410 if (ret == 1)
1256 return ret; 1411 return ret;
1257 1412
1258 find_usage_bit = bit_forwards; 1413 that.parent = NULL;
1259 ret = find_usage_forwards(hlock_class(next), 0); 1414 that.class = hlock_class(next);
1260 if (!ret || ret == 1) 1415 ret = find_usage_forwards(&that, bit_forwards, &target_entry1);
1416 if (ret < 0)
1417 return print_bfs_bug(ret);
1418 if (ret == 1)
1261 return ret; 1419 return ret;
1262 /* ret == 2 */ 1420
1263 return print_bad_irq_dependency(curr, prev, next, 1421 return print_bad_irq_dependency(curr, &this, &that,
1422 target_entry, target_entry1,
1423 prev, next,
1264 bit_backwards, bit_forwards, irqclass); 1424 bit_backwards, bit_forwards, irqclass);
1265} 1425}
1266 1426
@@ -1472,6 +1632,8 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
1472{ 1632{
1473 struct lock_list *entry; 1633 struct lock_list *entry;
1474 int ret; 1634 int ret;
1635 struct lock_list this;
1636 struct lock_list *uninitialized_var(target_entry);
1475 1637
1476 /* 1638 /*
1477 * Prove that the new <prev> -> <next> dependency would not 1639 * Prove that the new <prev> -> <next> dependency would not
@@ -1482,10 +1644,13 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
1482 * We are using global variables to control the recursion, to 1644 * We are using global variables to control the recursion, to
1483 * keep the stackframe size of the recursive functions low: 1645 * keep the stackframe size of the recursive functions low:
1484 */ 1646 */
1485 check_source = next; 1647 this.class = hlock_class(next);
1486 check_target = prev; 1648 this.parent = NULL;
1487 if (!(check_noncircular(hlock_class(next), 0))) 1649 ret = check_noncircular(&this, hlock_class(prev), &target_entry);
1488 return print_circular_bug_tail(); 1650 if (unlikely(!ret))
1651 return print_circular_bug(&this, target_entry, next, prev);
1652 else if (unlikely(ret < 0))
1653 return print_bfs_bug(ret);
1489 1654
1490 if (!check_prev_add_irq(curr, prev, next)) 1655 if (!check_prev_add_irq(curr, prev, next))
1491 return 0; 1656 return 0;
@@ -1884,7 +2049,8 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
1884 * print irq inversion bug: 2049 * print irq inversion bug:
1885 */ 2050 */
1886static int 2051static int
1887print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other, 2052print_irq_inversion_bug(struct task_struct *curr,
2053 struct lock_list *root, struct lock_list *other,
1888 struct held_lock *this, int forwards, 2054 struct held_lock *this, int forwards,
1889 const char *irqclass) 2055 const char *irqclass)
1890{ 2056{
@@ -1902,17 +2068,16 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other,
1902 printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass); 2068 printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass);
1903 else 2069 else
1904 printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass); 2070 printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass);
1905 print_lock_name(other); 2071 print_lock_name(other->class);
1906 printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); 2072 printk("\n\nand interrupts could create inverse lock ordering between them.\n\n");
1907 2073
1908 printk("\nother info that might help us debug this:\n"); 2074 printk("\nother info that might help us debug this:\n");
1909 lockdep_print_held_locks(curr); 2075 lockdep_print_held_locks(curr);
1910 2076
1911 printk("\nthe first lock's dependencies:\n"); 2077 printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n");
1912 print_lock_dependencies(hlock_class(this), 0); 2078 if (!save_trace(&root->trace))
1913 2079 return 0;
1914 printk("\nthe second lock's dependencies:\n"); 2080 print_shortest_lock_dependencies(other, root);
1915 print_lock_dependencies(other, 0);
1916 2081
1917 printk("\nstack backtrace:\n"); 2082 printk("\nstack backtrace:\n");
1918 dump_stack(); 2083 dump_stack();
@@ -1929,14 +2094,19 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this,
1929 enum lock_usage_bit bit, const char *irqclass) 2094 enum lock_usage_bit bit, const char *irqclass)
1930{ 2095{
1931 int ret; 2096 int ret;
1932 2097 struct lock_list root;
1933 find_usage_bit = bit; 2098 struct lock_list *uninitialized_var(target_entry);
1934 /* fills in <forwards_match> */ 2099
1935 ret = find_usage_forwards(hlock_class(this), 0); 2100 root.parent = NULL;
1936 if (!ret || ret == 1) 2101 root.class = hlock_class(this);
2102 ret = find_usage_forwards(&root, bit, &target_entry);
2103 if (ret < 0)
2104 return print_bfs_bug(ret);
2105 if (ret == 1)
1937 return ret; 2106 return ret;
1938 2107
1939 return print_irq_inversion_bug(curr, forwards_match, this, 1, irqclass); 2108 return print_irq_inversion_bug(curr, &root, target_entry,
2109 this, 1, irqclass);
1940} 2110}
1941 2111
1942/* 2112/*
@@ -1948,14 +2118,19 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
1948 enum lock_usage_bit bit, const char *irqclass) 2118 enum lock_usage_bit bit, const char *irqclass)
1949{ 2119{
1950 int ret; 2120 int ret;
1951 2121 struct lock_list root;
1952 find_usage_bit = bit; 2122 struct lock_list *uninitialized_var(target_entry);
1953 /* fills in <backwards_match> */ 2123
1954 ret = find_usage_backwards(hlock_class(this), 0); 2124 root.parent = NULL;
1955 if (!ret || ret == 1) 2125 root.class = hlock_class(this);
2126 ret = find_usage_backwards(&root, bit, &target_entry);
2127 if (ret < 0)
2128 return print_bfs_bug(ret);
2129 if (ret == 1)
1956 return ret; 2130 return ret;
1957 2131
1958 return print_irq_inversion_bug(curr, backwards_match, this, 0, irqclass); 2132 return print_irq_inversion_bug(curr, &root, target_entry,
2133 this, 1, irqclass);
1959} 2134}
1960 2135
1961void print_irqtrace_events(struct task_struct *curr) 2136void print_irqtrace_events(struct task_struct *curr)
@@ -2530,13 +2705,15 @@ EXPORT_SYMBOL_GPL(lockdep_init_map);
2530 */ 2705 */
2531static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, 2706static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2532 int trylock, int read, int check, int hardirqs_off, 2707 int trylock, int read, int check, int hardirqs_off,
2533 struct lockdep_map *nest_lock, unsigned long ip) 2708 struct lockdep_map *nest_lock, unsigned long ip,
2709 int references)
2534{ 2710{
2535 struct task_struct *curr = current; 2711 struct task_struct *curr = current;
2536 struct lock_class *class = NULL; 2712 struct lock_class *class = NULL;
2537 struct held_lock *hlock; 2713 struct held_lock *hlock;
2538 unsigned int depth, id; 2714 unsigned int depth, id;
2539 int chain_head = 0; 2715 int chain_head = 0;
2716 int class_idx;
2540 u64 chain_key; 2717 u64 chain_key;
2541 2718
2542 if (!prove_locking) 2719 if (!prove_locking)
@@ -2584,10 +2761,24 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2584 if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH)) 2761 if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH))
2585 return 0; 2762 return 0;
2586 2763
2764 class_idx = class - lock_classes + 1;
2765
2766 if (depth) {
2767 hlock = curr->held_locks + depth - 1;
2768 if (hlock->class_idx == class_idx && nest_lock) {
2769 if (hlock->references)
2770 hlock->references++;
2771 else
2772 hlock->references = 2;
2773
2774 return 1;
2775 }
2776 }
2777
2587 hlock = curr->held_locks + depth; 2778 hlock = curr->held_locks + depth;
2588 if (DEBUG_LOCKS_WARN_ON(!class)) 2779 if (DEBUG_LOCKS_WARN_ON(!class))
2589 return 0; 2780 return 0;
2590 hlock->class_idx = class - lock_classes + 1; 2781 hlock->class_idx = class_idx;
2591 hlock->acquire_ip = ip; 2782 hlock->acquire_ip = ip;
2592 hlock->instance = lock; 2783 hlock->instance = lock;
2593 hlock->nest_lock = nest_lock; 2784 hlock->nest_lock = nest_lock;
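
The new depth check above collapses a re-acquisition of an already-held class under a nest_lock into a reference count on the existing held_lock instead of pushing another stack entry; the lock_release_non_nested() hunk later in this patch decrements that count before really popping the entry. A tiny userspace model of just that bookkeeping (struct held and the helpers are illustrative, not lockdep code):

#include <stdio.h>

struct held { int class_idx; unsigned int references; };

static void acquire(struct held *top, int class_idx, int have_nest_lock)
{
	if (top->class_idx == class_idx && have_nest_lock) {
		/* 0 means "exactly one plain acquisition"; first collapse jumps to 2. */
		top->references = top->references ? top->references + 1 : 2;
		return;				/* folded into the existing entry */
	}
	/* otherwise a fresh held_lock entry would be pushed here */
}

static int release(struct held *top)
{
	if (top->references && --top->references)
		return 1;			/* entry stays on the held-lock stack */
	return 0;				/* entry is really removed now */
}

int main(void)
{
	struct held h = { .class_idx = 7, .references = 0 };

	acquire(&h, 7, 1);			/* references: 2 */
	acquire(&h, 7, 1);			/* references: 3 */
	printf("references after two nested acquires: %u\n", h.references);
	printf("release keeps entry: %d\n", release(&h));	/* 1 */
	printf("release keeps entry: %d\n", release(&h));	/* 1 */
	printf("release keeps entry: %d\n", release(&h));	/* 0 -> pop it */
	return 0;
}
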
@@ -2595,6 +2786,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2595 hlock->read = read; 2786 hlock->read = read;
2596 hlock->check = check; 2787 hlock->check = check;
2597 hlock->hardirqs_off = !!hardirqs_off; 2788 hlock->hardirqs_off = !!hardirqs_off;
2789 hlock->references = references;
2598#ifdef CONFIG_LOCK_STAT 2790#ifdef CONFIG_LOCK_STAT
2599 hlock->waittime_stamp = 0; 2791 hlock->waittime_stamp = 0;
2600 hlock->holdtime_stamp = sched_clock(); 2792 hlock->holdtime_stamp = sched_clock();
@@ -2703,6 +2895,30 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
2703 return 1; 2895 return 1;
2704} 2896}
2705 2897
2898static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
2899{
2900 if (hlock->instance == lock)
2901 return 1;
2902
2903 if (hlock->references) {
2904 struct lock_class *class = lock->class_cache;
2905
2906 if (!class)
2907 class = look_up_lock_class(lock, 0);
2908
2909 if (DEBUG_LOCKS_WARN_ON(!class))
2910 return 0;
2911
2912 if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock))
2913 return 0;
2914
2915 if (hlock->class_idx == class - lock_classes + 1)
2916 return 1;
2917 }
2918
2919 return 0;
2920}
2921
2706static int 2922static int
2707__lock_set_class(struct lockdep_map *lock, const char *name, 2923__lock_set_class(struct lockdep_map *lock, const char *name,
2708 struct lock_class_key *key, unsigned int subclass, 2924 struct lock_class_key *key, unsigned int subclass,
@@ -2726,7 +2942,7 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
2726 */ 2942 */
2727 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) 2943 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
2728 break; 2944 break;
2729 if (hlock->instance == lock) 2945 if (match_held_lock(hlock, lock))
2730 goto found_it; 2946 goto found_it;
2731 prev_hlock = hlock; 2947 prev_hlock = hlock;
2732 } 2948 }
@@ -2745,7 +2961,8 @@ found_it:
2745 if (!__lock_acquire(hlock->instance, 2961 if (!__lock_acquire(hlock->instance,
2746 hlock_class(hlock)->subclass, hlock->trylock, 2962 hlock_class(hlock)->subclass, hlock->trylock,
2747 hlock->read, hlock->check, hlock->hardirqs_off, 2963 hlock->read, hlock->check, hlock->hardirqs_off,
2748 hlock->nest_lock, hlock->acquire_ip)) 2964 hlock->nest_lock, hlock->acquire_ip,
2965 hlock->references))
2749 return 0; 2966 return 0;
2750 } 2967 }
2751 2968
@@ -2784,20 +3001,34 @@ lock_release_non_nested(struct task_struct *curr,
2784 */ 3001 */
2785 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) 3002 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
2786 break; 3003 break;
2787 if (hlock->instance == lock) 3004 if (match_held_lock(hlock, lock))
2788 goto found_it; 3005 goto found_it;
2789 prev_hlock = hlock; 3006 prev_hlock = hlock;
2790 } 3007 }
2791 return print_unlock_inbalance_bug(curr, lock, ip); 3008 return print_unlock_inbalance_bug(curr, lock, ip);
2792 3009
2793found_it: 3010found_it:
2794 lock_release_holdtime(hlock); 3011 if (hlock->instance == lock)
3012 lock_release_holdtime(hlock);
3013
3014 if (hlock->references) {
3015 hlock->references--;
3016 if (hlock->references) {
3017 /*
3018 * We had, and after removing one, still have
3019 * references, the current lock stack is still
3020 * valid. We're done!
3021 */
3022 return 1;
3023 }
3024 }
2795 3025
2796 /* 3026 /*
2797 * We have the right lock to unlock, 'hlock' points to it. 3027 * We have the right lock to unlock, 'hlock' points to it.
2798 * Now we remove it from the stack, and add back the other 3028 * Now we remove it from the stack, and add back the other
2799 * entries (if any), recalculating the hash along the way: 3029 * entries (if any), recalculating the hash along the way:
2800 */ 3030 */
3031
2801 curr->lockdep_depth = i; 3032 curr->lockdep_depth = i;
2802 curr->curr_chain_key = hlock->prev_chain_key; 3033 curr->curr_chain_key = hlock->prev_chain_key;
2803 3034
@@ -2806,7 +3037,8 @@ found_it:
2806 if (!__lock_acquire(hlock->instance, 3037 if (!__lock_acquire(hlock->instance,
2807 hlock_class(hlock)->subclass, hlock->trylock, 3038 hlock_class(hlock)->subclass, hlock->trylock,
2808 hlock->read, hlock->check, hlock->hardirqs_off, 3039 hlock->read, hlock->check, hlock->hardirqs_off,
2809 hlock->nest_lock, hlock->acquire_ip)) 3040 hlock->nest_lock, hlock->acquire_ip,
3041 hlock->references))
2810 return 0; 3042 return 0;
2811 } 3043 }
2812 3044
@@ -2836,7 +3068,7 @@ static int lock_release_nested(struct task_struct *curr,
2836 /* 3068 /*
2837 * Is the unlock non-nested: 3069 * Is the unlock non-nested:
2838 */ 3070 */
2839 if (hlock->instance != lock) 3071 if (hlock->instance != lock || hlock->references)
2840 return lock_release_non_nested(curr, lock, ip); 3072 return lock_release_non_nested(curr, lock, ip);
2841 curr->lockdep_depth--; 3073 curr->lockdep_depth--;
2842 3074
@@ -2881,6 +3113,21 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
2881 check_chain_key(curr); 3113 check_chain_key(curr);
2882} 3114}
2883 3115
3116static int __lock_is_held(struct lockdep_map *lock)
3117{
3118 struct task_struct *curr = current;
3119 int i;
3120
3121 for (i = 0; i < curr->lockdep_depth; i++) {
3122 struct held_lock *hlock = curr->held_locks + i;
3123
3124 if (match_held_lock(hlock, lock))
3125 return 1;
3126 }
3127
3128 return 0;
3129}
3130
2884/* 3131/*
2885 * Check whether we follow the irq-flags state precisely: 3132 * Check whether we follow the irq-flags state precisely:
2886 */ 3133 */
@@ -2957,7 +3204,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2957 3204
2958 current->lockdep_recursion = 1; 3205 current->lockdep_recursion = 1;
2959 __lock_acquire(lock, subclass, trylock, read, check, 3206 __lock_acquire(lock, subclass, trylock, read, check,
2960 irqs_disabled_flags(flags), nest_lock, ip); 3207 irqs_disabled_flags(flags), nest_lock, ip, 0);
2961 current->lockdep_recursion = 0; 3208 current->lockdep_recursion = 0;
2962 raw_local_irq_restore(flags); 3209 raw_local_irq_restore(flags);
2963} 3210}
@@ -2982,6 +3229,26 @@ void lock_release(struct lockdep_map *lock, int nested,
2982} 3229}
2983EXPORT_SYMBOL_GPL(lock_release); 3230EXPORT_SYMBOL_GPL(lock_release);
2984 3231
3232int lock_is_held(struct lockdep_map *lock)
3233{
3234 unsigned long flags;
3235 int ret = 0;
3236
3237 if (unlikely(current->lockdep_recursion))
3238 return ret;
3239
3240 raw_local_irq_save(flags);
3241 check_flags(flags);
3242
3243 current->lockdep_recursion = 1;
3244 ret = __lock_is_held(lock);
3245 current->lockdep_recursion = 0;
3246 raw_local_irq_restore(flags);
3247
3248 return ret;
3249}
3250EXPORT_SYMBOL_GPL(lock_is_held);
3251
2985void lockdep_set_current_reclaim_state(gfp_t gfp_mask) 3252void lockdep_set_current_reclaim_state(gfp_t gfp_mask)
2986{ 3253{
2987 current->lockdep_reclaim_gfp = gfp_mask; 3254 current->lockdep_reclaim_gfp = gfp_mask;
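
The newly exported lock_is_held() gives code outside lockdep a way to assert that the current task holds a given lock. A hedged kernel-context sketch of that use, assuming the matching declaration in <linux/lockdep.h> from the same series (cache_lock and cache_update_locked() are invented; later kernels wrap this exact test in a helper macro):

#include <linux/kernel.h>
#include <linux/mutex.h>
#include <linux/lockdep.h>
#include <linux/debug_locks.h>

static DEFINE_MUTEX(cache_lock);

static void cache_update_locked(void)
{
#ifdef CONFIG_DEBUG_LOCK_ALLOC
	/* Complain if the caller forgot to take cache_lock; the debug_locks
	 * check avoids false positives once lockdep has turned itself off. */
	WARN_ON(debug_locks && !lock_is_held(&cache_lock.dep_map));
#endif
	/* ... modify state that cache_lock protects ... */
}
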
@@ -3041,7 +3308,7 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
3041 */ 3308 */
3042 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) 3309 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
3043 break; 3310 break;
3044 if (hlock->instance == lock) 3311 if (match_held_lock(hlock, lock))
3045 goto found_it; 3312 goto found_it;
3046 prev_hlock = hlock; 3313 prev_hlock = hlock;
3047 } 3314 }
@@ -3049,6 +3316,9 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
3049 return; 3316 return;
3050 3317
3051found_it: 3318found_it:
3319 if (hlock->instance != lock)
3320 return;
3321
3052 hlock->waittime_stamp = sched_clock(); 3322 hlock->waittime_stamp = sched_clock();
3053 3323
3054 contention_point = lock_point(hlock_class(hlock)->contention_point, ip); 3324 contention_point = lock_point(hlock_class(hlock)->contention_point, ip);
@@ -3088,7 +3358,7 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
3088 */ 3358 */
3089 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) 3359 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
3090 break; 3360 break;
3091 if (hlock->instance == lock) 3361 if (match_held_lock(hlock, lock))
3092 goto found_it; 3362 goto found_it;
3093 prev_hlock = hlock; 3363 prev_hlock = hlock;
3094 } 3364 }
@@ -3096,6 +3366,9 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
3096 return; 3366 return;
3097 3367
3098found_it: 3368found_it:
3369 if (hlock->instance != lock)
3370 return;
3371
3099 cpu = smp_processor_id(); 3372 cpu = smp_processor_id();
3100 if (hlock->waittime_stamp) { 3373 if (hlock->waittime_stamp) {
3101 now = sched_clock(); 3374 now = sched_clock();
@@ -3326,7 +3599,12 @@ void __init lockdep_info(void)
3326 sizeof(struct list_head) * CLASSHASH_SIZE + 3599 sizeof(struct list_head) * CLASSHASH_SIZE +
3327 sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES + 3600 sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES +
3328 sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS + 3601 sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS +
3329 sizeof(struct list_head) * CHAINHASH_SIZE) / 1024); 3602 sizeof(struct list_head) * CHAINHASH_SIZE
3603#ifdef CONFIG_PROVE_LOCKING
3604 + sizeof(struct circular_queue)
3605#endif
3606 ) / 1024
3607 );
3330 3608
3331 printk(" per task-struct memory footprint: %lu bytes\n", 3609 printk(" per task-struct memory footprint: %lu bytes\n",
3332 sizeof(struct held_lock) * MAX_LOCK_DEPTH); 3610 sizeof(struct held_lock) * MAX_LOCK_DEPTH);
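
The lockdep_info() change accounts for the BFS queue in the reported static footprint, and because the queue is a fixed array its cost is known at build time. A back-of-the-envelope check using the MAX_CIRCULAR_QUEUE_SIZE value added above (assuming 8-byte longs, i.e. a 64-bit build):

#include <stdio.h>

int main(void)
{
	unsigned long entries = 4096;	/* MAX_CIRCULAR_QUEUE_SIZE */
	unsigned long bytes = entries * sizeof(unsigned long)
			      + 2 * sizeof(unsigned int);	/* front, rear */

	printf("circular_queue: ~%lu KB\n", bytes / 1024);	/* ~32 KB on 64-bit */
	return 0;
}
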
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index 699a2ac3a0d7..a2ee95ad1313 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -91,6 +91,8 @@ extern unsigned int nr_process_chains;
91extern unsigned int max_lockdep_depth; 91extern unsigned int max_lockdep_depth;
92extern unsigned int max_recursion_depth; 92extern unsigned int max_recursion_depth;
93 93
94extern unsigned int max_bfs_queue_depth;
95
94#ifdef CONFIG_PROVE_LOCKING 96#ifdef CONFIG_PROVE_LOCKING
95extern unsigned long lockdep_count_forward_deps(struct lock_class *); 97extern unsigned long lockdep_count_forward_deps(struct lock_class *);
96extern unsigned long lockdep_count_backward_deps(struct lock_class *); 98extern unsigned long lockdep_count_backward_deps(struct lock_class *);
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index d7135aa2d2c4..d4b3dbc79fdb 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -25,38 +25,12 @@
25 25
26static void *l_next(struct seq_file *m, void *v, loff_t *pos) 26static void *l_next(struct seq_file *m, void *v, loff_t *pos)
27{ 27{
28 struct lock_class *class; 28 return seq_list_next(v, &all_lock_classes, pos);
29
30 (*pos)++;
31
32 if (v == SEQ_START_TOKEN)
33 class = m->private;
34 else {
35 class = v;
36
37 if (class->lock_entry.next != &all_lock_classes)
38 class = list_entry(class->lock_entry.next,
39 struct lock_class, lock_entry);
40 else
41 class = NULL;
42 }
43
44 return class;
45} 29}
46 30
47static void *l_start(struct seq_file *m, loff_t *pos) 31static void *l_start(struct seq_file *m, loff_t *pos)
48{ 32{
49 struct lock_class *class; 33 return seq_list_start_head(&all_lock_classes, *pos);
50 loff_t i = 0;
51
52 if (*pos == 0)
53 return SEQ_START_TOKEN;
54
55 list_for_each_entry(class, &all_lock_classes, lock_entry) {
56 if (++i == *pos)
57 return class;
58 }
59 return NULL;
60} 34}
61 35
62static void l_stop(struct seq_file *m, void *v) 36static void l_stop(struct seq_file *m, void *v)
@@ -82,11 +56,11 @@ static void print_name(struct seq_file *m, struct lock_class *class)
82 56
83static int l_show(struct seq_file *m, void *v) 57static int l_show(struct seq_file *m, void *v)
84{ 58{
85 struct lock_class *class = v; 59 struct lock_class *class = list_entry(v, struct lock_class, lock_entry);
86 struct lock_list *entry; 60 struct lock_list *entry;
87 char usage[LOCK_USAGE_CHARS]; 61 char usage[LOCK_USAGE_CHARS];
88 62
89 if (v == SEQ_START_TOKEN) { 63 if (v == &all_lock_classes) {
90 seq_printf(m, "all lock classes:\n"); 64 seq_printf(m, "all lock classes:\n");
91 return 0; 65 return 0;
92 } 66 }
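
The lockdep_proc.c conversion replaces the hand-rolled iterators with the generic seq_list_*() helpers: seq_list_start_head() returns the list head itself as the first record, which show() treats as the header row, exactly like l_show() comparing v against &all_lock_classes above. An illustrative sketch of that pattern for a made-up list (struct item and the item_* functions are not from the patch):

#include <linux/seq_file.h>
#include <linux/list.h>

struct item { struct list_head node; const char *name; };
static LIST_HEAD(item_list);

static void *item_start(struct seq_file *m, loff_t *pos)
{
	return seq_list_start_head(&item_list, *pos);
}

static void *item_next(struct seq_file *m, void *v, loff_t *pos)
{
	return seq_list_next(v, &item_list, pos);
}

static void item_stop(struct seq_file *m, void *v) { }

static int item_show(struct seq_file *m, void *v)
{
	struct item *it;

	if (v == &item_list) {			/* the head doubles as the header row */
		seq_puts(m, "all items:\n");
		return 0;
	}
	it = list_entry(v, struct item, node);
	seq_printf(m, "%s\n", it->name);
	return 0;
}

static const struct seq_operations item_seq_ops = {
	.start	= item_start,
	.next	= item_next,
	.stop	= item_stop,
	.show	= item_show,
};
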
@@ -128,17 +102,7 @@ static const struct seq_operations lockdep_ops = {
128 102
129static int lockdep_open(struct inode *inode, struct file *file) 103static int lockdep_open(struct inode *inode, struct file *file)
130{ 104{
131 int res = seq_open(file, &lockdep_ops); 105 return seq_open(file, &lockdep_ops);
132 if (!res) {
133 struct seq_file *m = file->private_data;
134
135 if (!list_empty(&all_lock_classes))
136 m->private = list_entry(all_lock_classes.next,
137 struct lock_class, lock_entry);
138 else
139 m->private = NULL;
140 }
141 return res;
142} 106}
143 107
144static const struct file_operations proc_lockdep_operations = { 108static const struct file_operations proc_lockdep_operations = {
@@ -149,37 +113,23 @@ static const struct file_operations proc_lockdep_operations = {
149}; 113};
150 114
151#ifdef CONFIG_PROVE_LOCKING 115#ifdef CONFIG_PROVE_LOCKING
152static void *lc_next(struct seq_file *m, void *v, loff_t *pos)
153{
154 struct lock_chain *chain;
155
156 (*pos)++;
157
158 if (v == SEQ_START_TOKEN)
159 chain = m->private;
160 else {
161 chain = v;
162
163 if (*pos < nr_lock_chains)
164 chain = lock_chains + *pos;
165 else
166 chain = NULL;
167 }
168
169 return chain;
170}
171
172static void *lc_start(struct seq_file *m, loff_t *pos) 116static void *lc_start(struct seq_file *m, loff_t *pos)
173{ 117{
174 if (*pos == 0) 118 if (*pos == 0)
175 return SEQ_START_TOKEN; 119 return SEQ_START_TOKEN;
176 120
177 if (*pos < nr_lock_chains) 121 if (*pos - 1 < nr_lock_chains)
178 return lock_chains + *pos; 122 return lock_chains + (*pos - 1);
179 123
180 return NULL; 124 return NULL;
181} 125}
182 126
127static void *lc_next(struct seq_file *m, void *v, loff_t *pos)
128{
129 (*pos)++;
130 return lc_start(m, pos);
131}
132
183static void lc_stop(struct seq_file *m, void *v) 133static void lc_stop(struct seq_file *m, void *v)
184{ 134{
185} 135}
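In the reworked chain iterator above, lc_next() is just "bump *pos and re-run lc_start()", and because position 0 is reserved for the SEQ_START_TOKEN header line, array slot N is reached at position N + 1, hence the (*pos - 1) arithmetic. The same off-by-one pattern for an arbitrary fixed array looks like this (my_table and my_table_len are illustrative names, not from this patch):

	#include <linux/seq_file.h>

	struct my_entry {
		int value;
	};

	static struct my_entry my_table[64];	/* assumed filled in elsewhere */
	static unsigned long my_table_len;

	static void *tbl_start(struct seq_file *m, loff_t *pos)
	{
		if (*pos == 0)
			return SEQ_START_TOKEN;		/* position 0 prints the header */

		if (*pos - 1 < my_table_len)		/* position N maps to slot N - 1 */
			return my_table + (*pos - 1);

		return NULL;				/* past the end: stop iterating */
	}

	static void *tbl_next(struct seq_file *m, void *v, loff_t *pos)
	{
		(*pos)++;
		return tbl_start(m, pos);		/* reuse the positioning logic */
	}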
@@ -220,16 +170,7 @@ static const struct seq_operations lockdep_chains_ops = {
220 170
221static int lockdep_chains_open(struct inode *inode, struct file *file) 171static int lockdep_chains_open(struct inode *inode, struct file *file)
222{ 172{
223 int res = seq_open(file, &lockdep_chains_ops); 173 return seq_open(file, &lockdep_chains_ops);
224 if (!res) {
225 struct seq_file *m = file->private_data;
226
227 if (nr_lock_chains)
228 m->private = lock_chains;
229 else
230 m->private = NULL;
231 }
232 return res;
233} 174}
234 175
235static const struct file_operations proc_lockdep_chains_operations = { 176static const struct file_operations proc_lockdep_chains_operations = {
@@ -258,16 +199,10 @@ static void lockdep_stats_debug_show(struct seq_file *m)
258 debug_atomic_read(&chain_lookup_hits)); 199 debug_atomic_read(&chain_lookup_hits));
259 seq_printf(m, " cyclic checks: %11u\n", 200 seq_printf(m, " cyclic checks: %11u\n",
260 debug_atomic_read(&nr_cyclic_checks)); 201 debug_atomic_read(&nr_cyclic_checks));
261 seq_printf(m, " cyclic-check recursions: %11u\n",
262 debug_atomic_read(&nr_cyclic_check_recursions));
263 seq_printf(m, " find-mask forwards checks: %11u\n", 202 seq_printf(m, " find-mask forwards checks: %11u\n",
264 debug_atomic_read(&nr_find_usage_forwards_checks)); 203 debug_atomic_read(&nr_find_usage_forwards_checks));
265 seq_printf(m, " find-mask forwards recursions: %11u\n",
266 debug_atomic_read(&nr_find_usage_forwards_recursions));
267 seq_printf(m, " find-mask backwards checks: %11u\n", 204 seq_printf(m, " find-mask backwards checks: %11u\n",
268 debug_atomic_read(&nr_find_usage_backwards_checks)); 205 debug_atomic_read(&nr_find_usage_backwards_checks));
269 seq_printf(m, " find-mask backwards recursions:%11u\n",
270 debug_atomic_read(&nr_find_usage_backwards_recursions));
271 206
272 seq_printf(m, " hardirq on events: %11u\n", hi1); 207 seq_printf(m, " hardirq on events: %11u\n", hi1);
273 seq_printf(m, " hardirq off events: %11u\n", hi2); 208 seq_printf(m, " hardirq off events: %11u\n", hi2);
@@ -409,8 +344,10 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
409 nr_unused); 344 nr_unused);
410 seq_printf(m, " max locking depth: %11u\n", 345 seq_printf(m, " max locking depth: %11u\n",
411 max_lockdep_depth); 346 max_lockdep_depth);
412 seq_printf(m, " max recursion depth: %11u\n", 347#ifdef CONFIG_PROVE_LOCKING
413 max_recursion_depth); 348 seq_printf(m, " max bfs queue depth: %11u\n",
349 max_bfs_queue_depth);
350#endif
414 lockdep_stats_debug_show(m); 351 lockdep_stats_debug_show(m);
415 seq_printf(m, " debug_locks: %11u\n", 352 seq_printf(m, " debug_locks: %11u\n",
416 debug_locks); 353 debug_locks);
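Dropping max_recursion_depth (and, earlier in this file, the *_recursions counters) while exporting max_bfs_queue_depth tracks the lockdep core change in this merge: the dependency-graph search is now an iterative breadth-first search over a bounded queue rather than a recursive walk, so the interesting high-water mark is queue occupancy, not stack depth. A generic sketch of recording such a high-water mark during a BFS (the names and the omitted visited/cycle handling are illustrative; this is not the lockdep implementation):

	#define MAX_QUEUE	4096

	struct node {
		unsigned int nr_children;
		struct node **children;
	};

	static unsigned int max_bfs_queue_depth;	/* exported high-water mark */

	static void bfs_walk(struct node *root)
	{
		static struct node *queue[MAX_QUEUE];	/* fixed-size work queue */
		unsigned int head = 0, tail = 0, i;

		queue[tail++] = root;
		while (head != tail) {
			struct node *n = queue[head++];

			for (i = 0; i < n->nr_children && tail < MAX_QUEUE; i++)
				queue[tail++] = n->children[i];

			if (tail - head > max_bfs_queue_depth)	/* record peak occupancy */
				max_bfs_queue_depth = tail - head;
		}
	}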
@@ -438,7 +375,6 @@ struct lock_stat_data {
438}; 375};
439 376
440struct lock_stat_seq { 377struct lock_stat_seq {
441 struct lock_stat_data *iter;
442 struct lock_stat_data *iter_end; 378 struct lock_stat_data *iter_end;
443 struct lock_stat_data stats[MAX_LOCKDEP_KEYS]; 379 struct lock_stat_data stats[MAX_LOCKDEP_KEYS];
444}; 380};
@@ -626,34 +562,22 @@ static void seq_header(struct seq_file *m)
626static void *ls_start(struct seq_file *m, loff_t *pos) 562static void *ls_start(struct seq_file *m, loff_t *pos)
627{ 563{
628 struct lock_stat_seq *data = m->private; 564 struct lock_stat_seq *data = m->private;
565 struct lock_stat_data *iter;
629 566
630 if (*pos == 0) 567 if (*pos == 0)
631 return SEQ_START_TOKEN; 568 return SEQ_START_TOKEN;
632 569
633 data->iter = data->stats + *pos; 570 iter = data->stats + (*pos - 1);
634 if (data->iter >= data->iter_end) 571 if (iter >= data->iter_end)
635 data->iter = NULL; 572 iter = NULL;
636 573
637 return data->iter; 574 return iter;
638} 575}
639 576
640static void *ls_next(struct seq_file *m, void *v, loff_t *pos) 577static void *ls_next(struct seq_file *m, void *v, loff_t *pos)
641{ 578{
642 struct lock_stat_seq *data = m->private;
643
644 (*pos)++; 579 (*pos)++;
645 580 return ls_start(m, pos);
646 if (v == SEQ_START_TOKEN)
647 data->iter = data->stats;
648 else {
649 data->iter = v;
650 data->iter++;
651 }
652
653 if (data->iter == data->iter_end)
654 data->iter = NULL;
655
656 return data->iter;
657} 581}
658 582
659static void ls_stop(struct seq_file *m, void *v) 583static void ls_stop(struct seq_file *m, void *v)
@@ -691,7 +615,6 @@ static int lock_stat_open(struct inode *inode, struct file *file)
691 struct lock_stat_data *iter = data->stats; 615 struct lock_stat_data *iter = data->stats;
692 struct seq_file *m = file->private_data; 616 struct seq_file *m = file->private_data;
693 617
694 data->iter = iter;
695 list_for_each_entry(class, &all_lock_classes, lock_entry) { 618 list_for_each_entry(class, &all_lock_classes, lock_entry) {
696 iter->class = class; 619 iter->class = class;
697 iter->stats = lock_stats(class); 620 iter->stats = lock_stats(class);
@@ -699,7 +622,7 @@ static int lock_stat_open(struct inode *inode, struct file *file)
699 } 622 }
700 data->iter_end = iter; 623 data->iter_end = iter;
701 624
702 sort(data->stats, data->iter_end - data->iter, 625 sort(data->stats, data->iter_end - data->stats,
703 sizeof(struct lock_stat_data), 626 sizeof(struct lock_stat_data),
704 lock_stat_cmp, NULL); 627 lock_stat_cmp, NULL);
705 628
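The sort() fix above goes with the removal of the iter cursor from struct lock_stat_seq: with data->iter gone, the element count has to be derived from the array base as iter_end - stats. For reference, the kernel's sort() takes the array base, the number of elements, the element size, a comparison callback and an optional swap callback; a minimal sketch:

	#include <linux/sort.h>

	/* illustrative helper: sort the first n entries of a stats array in place */
	static void sort_stats(struct lock_stat_data *stats, size_t n,
			       int (*cmp)(const void *, const void *))
	{
		sort(stats,		/* base of the array				*/
		     n,			/* element count, e.g. iter_end - stats		*/
		     sizeof(*stats),	/* size of one element				*/
		     cmp,		/* comparison callback				*/
		     NULL);		/* NULL swap: use the generic byte-wise swap	*/
	}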
@@ -734,7 +657,6 @@ static int lock_stat_release(struct inode *inode, struct file *file)
734 struct seq_file *seq = file->private_data; 657 struct seq_file *seq = file->private_data;
735 658
736 vfree(seq->private); 659 vfree(seq->private);
737 seq->private = NULL;
738 return seq_release(inode, file); 660 return seq_release(inode, file);
739} 661}
740 662
@@ -758,7 +680,8 @@ static int __init lockdep_proc_init(void)
758 &proc_lockdep_stats_operations); 680 &proc_lockdep_stats_operations);
759 681
760#ifdef CONFIG_LOCK_STAT 682#ifdef CONFIG_LOCK_STAT
761 proc_create("lock_stat", S_IRUSR, NULL, &proc_lock_stat_operations); 683 proc_create("lock_stat", S_IRUSR | S_IWUSR, NULL,
684 &proc_lock_stat_operations);
762#endif 685#endif
763 686
764 return 0; 687 return 0;
diff --git a/kernel/marker.c b/kernel/marker.c
deleted file mode 100644
index ea54f2647868..000000000000
--- a/kernel/marker.c
+++ /dev/null
@@ -1,930 +0,0 @@
1/*
2 * Copyright (C) 2007 Mathieu Desnoyers
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 */
18#include <linux/module.h>
19#include <linux/mutex.h>
20#include <linux/types.h>
21#include <linux/jhash.h>
22#include <linux/list.h>
23#include <linux/rcupdate.h>
24#include <linux/marker.h>
25#include <linux/err.h>
26#include <linux/slab.h>
27
28extern struct marker __start___markers[];
29extern struct marker __stop___markers[];
30
31/* Set to 1 to enable marker debug output */
32static const int marker_debug;
33
34/*
35 * markers_mutex nests inside module_mutex. Markers mutex protects the builtin
36 * and module markers and the hash table.
37 */
38static DEFINE_MUTEX(markers_mutex);
39
40/*
41 * Marker hash table, containing the active markers.
42 * Protected by module_mutex.
43 */
44#define MARKER_HASH_BITS 6
45#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS)
46static struct hlist_head marker_table[MARKER_TABLE_SIZE];
47
48/*
49 * Note about RCU :
50 * It is used to make sure every handler has finished using its private data
51 * between two consecutive operation (add or remove) on a given marker. It is
52 * also used to delay the free of multiple probes array until a quiescent state
53 * is reached.
54 * marker entries modifications are protected by the markers_mutex.
55 */
56struct marker_entry {
57 struct hlist_node hlist;
58 char *format;
59 /* Probe wrapper */
60 void (*call)(const struct marker *mdata, void *call_private, ...);
61 struct marker_probe_closure single;
62 struct marker_probe_closure *multi;
63 int refcount; /* Number of times armed. 0 if disarmed. */
64 struct rcu_head rcu;
65 void *oldptr;
66 int rcu_pending;
67 unsigned char ptype:1;
68 unsigned char format_allocated:1;
69 char name[0]; /* Contains name'\0'format'\0' */
70};
71
72/**
73 * __mark_empty_function - Empty probe callback
74 * @probe_private: probe private data
75 * @call_private: call site private data
76 * @fmt: format string
77 * @...: variable argument list
78 *
79 * Empty callback provided as a probe to the markers. By providing this to a
80 * disabled marker, we make sure the execution flow is always valid even
81 * though the function pointer change and the marker enabling are two distinct
82 * operations that modify the execution flow of preemptible code.
83 */
84notrace void __mark_empty_function(void *probe_private, void *call_private,
85 const char *fmt, va_list *args)
86{
87}
88EXPORT_SYMBOL_GPL(__mark_empty_function);
89
90/*
91 * marker_probe_cb Callback that prepares the variable argument list for probes.
92 * @mdata: pointer of type struct marker
93 * @call_private: caller site private data
94 * @...: Variable argument list.
95 *
96 * Since we do not use "typical" pointer based RCU in the 1 argument case, we
97 * need to put a full smp_rmb() in this branch. This is why we do not use
98 * rcu_dereference() for the pointer read.
99 */
100notrace void marker_probe_cb(const struct marker *mdata,
101 void *call_private, ...)
102{
103 va_list args;
104 char ptype;
105
106 /*
107	 * rcu_read_lock_sched does two things: it disables preemption to make
108	 * sure the teardown of the callbacks can be done correctly when they
109	 * are in modules, and it ensures RCU read coherency.
110 */
111 rcu_read_lock_sched_notrace();
112 ptype = mdata->ptype;
113 if (likely(!ptype)) {
114 marker_probe_func *func;
115		/* Must read the ptype before ptr. They are not data dependent,
116 * so we put an explicit smp_rmb() here. */
117 smp_rmb();
118 func = mdata->single.func;
119 /* Must read the ptr before private data. They are not data
120		 * dependent, so we put an explicit smp_rmb() here. */
121 smp_rmb();
122 va_start(args, call_private);
123 func(mdata->single.probe_private, call_private, mdata->format,
124 &args);
125 va_end(args);
126 } else {
127 struct marker_probe_closure *multi;
128 int i;
129 /*
130 * Read mdata->ptype before mdata->multi.
131 */
132 smp_rmb();
133 multi = mdata->multi;
134 /*
135 * multi points to an array, therefore accessing the array
136 * depends on reading multi. However, even in this case,
137		 * we must ensure that the pointer is read _before_ the array
138 * data. Same as rcu_dereference, but we need a full smp_rmb()
139 * in the fast path, so put the explicit barrier here.
140 */
141 smp_read_barrier_depends();
142 for (i = 0; multi[i].func; i++) {
143 va_start(args, call_private);
144 multi[i].func(multi[i].probe_private, call_private,
145 mdata->format, &args);
146 va_end(args);
147 }
148 }
149 rcu_read_unlock_sched_notrace();
150}
151EXPORT_SYMBOL_GPL(marker_probe_cb);
152
153/*
154 * marker_probe_cb_noarg Callback that does not prepare the variable argument list.
155 * @mdata: pointer of type struct marker
156 * @call_private: caller site private data
157 * @...: Variable argument list.
158 *
159 * Should be connected to markers "MARK_NOARGS".
160 */
161static notrace void marker_probe_cb_noarg(const struct marker *mdata,
162 void *call_private, ...)
163{
164 va_list args; /* not initialized */
165 char ptype;
166
167 rcu_read_lock_sched_notrace();
168 ptype = mdata->ptype;
169 if (likely(!ptype)) {
170 marker_probe_func *func;
171		/* Must read the ptype before ptr. They are not data dependent,
172 * so we put an explicit smp_rmb() here. */
173 smp_rmb();
174 func = mdata->single.func;
175 /* Must read the ptr before private data. They are not data
176		 * dependent, so we put an explicit smp_rmb() here. */
177 smp_rmb();
178 func(mdata->single.probe_private, call_private, mdata->format,
179 &args);
180 } else {
181 struct marker_probe_closure *multi;
182 int i;
183 /*
184 * Read mdata->ptype before mdata->multi.
185 */
186 smp_rmb();
187 multi = mdata->multi;
188 /*
189 * multi points to an array, therefore accessing the array
190 * depends on reading multi. However, even in this case,
191		 * we must ensure that the pointer is read _before_ the array
192 * data. Same as rcu_dereference, but we need a full smp_rmb()
193 * in the fast path, so put the explicit barrier here.
194 */
195 smp_read_barrier_depends();
196 for (i = 0; multi[i].func; i++)
197 multi[i].func(multi[i].probe_private, call_private,
198 mdata->format, &args);
199 }
200 rcu_read_unlock_sched_notrace();
201}
202
203static void free_old_closure(struct rcu_head *head)
204{
205 struct marker_entry *entry = container_of(head,
206 struct marker_entry, rcu);
207 kfree(entry->oldptr);
208 /* Make sure we free the data before setting the pending flag to 0 */
209 smp_wmb();
210 entry->rcu_pending = 0;
211}
212
213static void debug_print_probes(struct marker_entry *entry)
214{
215 int i;
216
217 if (!marker_debug)
218 return;
219
220 if (!entry->ptype) {
221 printk(KERN_DEBUG "Single probe : %p %p\n",
222 entry->single.func,
223 entry->single.probe_private);
224 } else {
225 for (i = 0; entry->multi[i].func; i++)
226 printk(KERN_DEBUG "Multi probe %d : %p %p\n", i,
227 entry->multi[i].func,
228 entry->multi[i].probe_private);
229 }
230}
231
232static struct marker_probe_closure *
233marker_entry_add_probe(struct marker_entry *entry,
234 marker_probe_func *probe, void *probe_private)
235{
236 int nr_probes = 0;
237 struct marker_probe_closure *old, *new;
238
239 WARN_ON(!probe);
240
241 debug_print_probes(entry);
242 old = entry->multi;
243 if (!entry->ptype) {
244 if (entry->single.func == probe &&
245 entry->single.probe_private == probe_private)
246 return ERR_PTR(-EBUSY);
247 if (entry->single.func == __mark_empty_function) {
248 /* 0 -> 1 probes */
249 entry->single.func = probe;
250 entry->single.probe_private = probe_private;
251 entry->refcount = 1;
252 entry->ptype = 0;
253 debug_print_probes(entry);
254 return NULL;
255 } else {
256 /* 1 -> 2 probes */
257 nr_probes = 1;
258 old = NULL;
259 }
260 } else {
261 /* (N -> N+1), (N != 0, 1) probes */
262 for (nr_probes = 0; old[nr_probes].func; nr_probes++)
263 if (old[nr_probes].func == probe
264 && old[nr_probes].probe_private
265 == probe_private)
266 return ERR_PTR(-EBUSY);
267 }
268 /* + 2 : one for new probe, one for NULL func */
269 new = kzalloc((nr_probes + 2) * sizeof(struct marker_probe_closure),
270 GFP_KERNEL);
271 if (new == NULL)
272 return ERR_PTR(-ENOMEM);
273 if (!old)
274 new[0] = entry->single;
275 else
276 memcpy(new, old,
277 nr_probes * sizeof(struct marker_probe_closure));
278 new[nr_probes].func = probe;
279 new[nr_probes].probe_private = probe_private;
280 entry->refcount = nr_probes + 1;
281 entry->multi = new;
282 entry->ptype = 1;
283 debug_print_probes(entry);
284 return old;
285}
286
287static struct marker_probe_closure *
288marker_entry_remove_probe(struct marker_entry *entry,
289 marker_probe_func *probe, void *probe_private)
290{
291 int nr_probes = 0, nr_del = 0, i;
292 struct marker_probe_closure *old, *new;
293
294 old = entry->multi;
295
296 debug_print_probes(entry);
297 if (!entry->ptype) {
298 /* 0 -> N is an error */
299 WARN_ON(entry->single.func == __mark_empty_function);
300 /* 1 -> 0 probes */
301 WARN_ON(probe && entry->single.func != probe);
302 WARN_ON(entry->single.probe_private != probe_private);
303 entry->single.func = __mark_empty_function;
304 entry->refcount = 0;
305 entry->ptype = 0;
306 debug_print_probes(entry);
307 return NULL;
308 } else {
309 /* (N -> M), (N > 1, M >= 0) probes */
310 for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
311 if ((!probe || old[nr_probes].func == probe)
312 && old[nr_probes].probe_private
313 == probe_private)
314 nr_del++;
315 }
316 }
317
318 if (nr_probes - nr_del == 0) {
319 /* N -> 0, (N > 1) */
320 entry->single.func = __mark_empty_function;
321 entry->refcount = 0;
322 entry->ptype = 0;
323 } else if (nr_probes - nr_del == 1) {
324 /* N -> 1, (N > 1) */
325 for (i = 0; old[i].func; i++)
326 if ((probe && old[i].func != probe) ||
327 old[i].probe_private != probe_private)
328 entry->single = old[i];
329 entry->refcount = 1;
330 entry->ptype = 0;
331 } else {
332 int j = 0;
333 /* N -> M, (N > 1, M > 1) */
334 /* + 1 for NULL */
335 new = kzalloc((nr_probes - nr_del + 1)
336 * sizeof(struct marker_probe_closure), GFP_KERNEL);
337 if (new == NULL)
338 return ERR_PTR(-ENOMEM);
339 for (i = 0; old[i].func; i++)
340 if ((probe && old[i].func != probe) ||
341 old[i].probe_private != probe_private)
342 new[j++] = old[i];
343 entry->refcount = nr_probes - nr_del;
344 entry->ptype = 1;
345 entry->multi = new;
346 }
347 debug_print_probes(entry);
348 return old;
349}
350
351/*
352 * Get marker if the marker is present in the marker hash table.
353 * Must be called with markers_mutex held.
354 * Returns NULL if not present.
355 */
356static struct marker_entry *get_marker(const char *name)
357{
358 struct hlist_head *head;
359 struct hlist_node *node;
360 struct marker_entry *e;
361 u32 hash = jhash(name, strlen(name), 0);
362
363 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
364 hlist_for_each_entry(e, node, head, hlist) {
365 if (!strcmp(name, e->name))
366 return e;
367 }
368 return NULL;
369}
370
371/*
372 * Add the marker to the marker hash table. Must be called with markers_mutex
373 * held.
374 */
375static struct marker_entry *add_marker(const char *name, const char *format)
376{
377 struct hlist_head *head;
378 struct hlist_node *node;
379 struct marker_entry *e;
380 size_t name_len = strlen(name) + 1;
381 size_t format_len = 0;
382 u32 hash = jhash(name, name_len-1, 0);
383
384 if (format)
385 format_len = strlen(format) + 1;
386 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
387 hlist_for_each_entry(e, node, head, hlist) {
388 if (!strcmp(name, e->name)) {
389 printk(KERN_NOTICE
390 "Marker %s busy\n", name);
391 return ERR_PTR(-EBUSY); /* Already there */
392 }
393 }
394 /*
395 * Using kmalloc here to allocate a variable length element. Could
396 * cause some memory fragmentation if overused.
397 */
398 e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
399 GFP_KERNEL);
400 if (!e)
401 return ERR_PTR(-ENOMEM);
402 memcpy(&e->name[0], name, name_len);
403 if (format) {
404 e->format = &e->name[name_len];
405 memcpy(e->format, format, format_len);
406 if (strcmp(e->format, MARK_NOARGS) == 0)
407 e->call = marker_probe_cb_noarg;
408 else
409 e->call = marker_probe_cb;
410 trace_mark(core_marker_format, "name %s format %s",
411 e->name, e->format);
412 } else {
413 e->format = NULL;
414 e->call = marker_probe_cb;
415 }
416 e->single.func = __mark_empty_function;
417 e->single.probe_private = NULL;
418 e->multi = NULL;
419 e->ptype = 0;
420 e->format_allocated = 0;
421 e->refcount = 0;
422 e->rcu_pending = 0;
423 hlist_add_head(&e->hlist, head);
424 return e;
425}
426
427/*
428 * Remove the marker from the marker hash table. Must be called with
429 * markers_mutex held.
430 */
431static int remove_marker(const char *name)
432{
433 struct hlist_head *head;
434 struct hlist_node *node;
435 struct marker_entry *e;
436 int found = 0;
437 size_t len = strlen(name) + 1;
438 u32 hash = jhash(name, len-1, 0);
439
440 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
441 hlist_for_each_entry(e, node, head, hlist) {
442 if (!strcmp(name, e->name)) {
443 found = 1;
444 break;
445 }
446 }
447 if (!found)
448 return -ENOENT;
449 if (e->single.func != __mark_empty_function)
450 return -EBUSY;
451 hlist_del(&e->hlist);
452 if (e->format_allocated)
453 kfree(e->format);
454 /* Make sure the call_rcu has been executed */
455 if (e->rcu_pending)
456 rcu_barrier_sched();
457 kfree(e);
458 return 0;
459}
460
461/*
462 * Set the mark_entry format to the format found in the element.
463 */
464static int marker_set_format(struct marker_entry *entry, const char *format)
465{
466 entry->format = kstrdup(format, GFP_KERNEL);
467 if (!entry->format)
468 return -ENOMEM;
469 entry->format_allocated = 1;
470
471 trace_mark(core_marker_format, "name %s format %s",
472 entry->name, entry->format);
473 return 0;
474}
475
476/*
477 * Sets the probe callback corresponding to one marker.
478 */
479static int set_marker(struct marker_entry *entry, struct marker *elem,
480 int active)
481{
482 int ret = 0;
483 WARN_ON(strcmp(entry->name, elem->name) != 0);
484
485 if (entry->format) {
486 if (strcmp(entry->format, elem->format) != 0) {
487 printk(KERN_NOTICE
488 "Format mismatch for probe %s "
489 "(%s), marker (%s)\n",
490 entry->name,
491 entry->format,
492 elem->format);
493 return -EPERM;
494 }
495 } else {
496 ret = marker_set_format(entry, elem->format);
497 if (ret)
498 return ret;
499 }
500
501 /*
502 * probe_cb setup (statically known) is done here. It is
503 * asynchronous with the rest of execution, therefore we only
504 * pass from a "safe" callback (with argument) to an "unsafe"
505 * callback (does not set arguments).
506 */
507 elem->call = entry->call;
508 /*
509 * Sanity check :
510 * We only update the single probe private data when the ptr is
511 * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1)
512 */
513 WARN_ON(elem->single.func != __mark_empty_function
514 && elem->single.probe_private != entry->single.probe_private
515 && !elem->ptype);
516 elem->single.probe_private = entry->single.probe_private;
517 /*
518 * Make sure the private data is valid when we update the
519 * single probe ptr.
520 */
521 smp_wmb();
522 elem->single.func = entry->single.func;
523 /*
524 * We also make sure that the new probe callbacks array is consistent
525 * before setting a pointer to it.
526 */
527 rcu_assign_pointer(elem->multi, entry->multi);
528 /*
529 * Update the function or multi probe array pointer before setting the
530 * ptype.
531 */
532 smp_wmb();
533 elem->ptype = entry->ptype;
534
535 if (elem->tp_name && (active ^ elem->state)) {
536 WARN_ON(!elem->tp_cb);
537 /*
538 * It is ok to directly call the probe registration because type
539 * checking has been done in the __trace_mark_tp() macro.
540 */
541
542 if (active) {
543 /*
544 * try_module_get should always succeed because we hold
545 * lock_module() to get the tp_cb address.
546 */
547 ret = try_module_get(__module_text_address(
548 (unsigned long)elem->tp_cb));
549 BUG_ON(!ret);
550 ret = tracepoint_probe_register_noupdate(
551 elem->tp_name,
552 elem->tp_cb);
553 } else {
554 ret = tracepoint_probe_unregister_noupdate(
555 elem->tp_name,
556 elem->tp_cb);
557 /*
558 * tracepoint_probe_update_all() must be called
559 * before the module containing tp_cb is unloaded.
560 */
561 module_put(__module_text_address(
562 (unsigned long)elem->tp_cb));
563 }
564 }
565 elem->state = active;
566
567 return ret;
568}
569
570/*
571 * Disable a marker and its probe callback.
572 * Note: only waiting an RCU period after setting elem->call to the empty
573 * function ensures that the original callback is not used anymore. This is
574 * ensured by rcu_read_lock_sched around the call site.
575 */
576static void disable_marker(struct marker *elem)
577{
578 int ret;
579
580 /* leave "call" as is. It is known statically. */
581 if (elem->tp_name && elem->state) {
582 WARN_ON(!elem->tp_cb);
583 /*
584 * It is ok to directly call the probe registration because type
585 * checking has been done in the __trace_mark_tp() macro.
586 */
587 ret = tracepoint_probe_unregister_noupdate(elem->tp_name,
588 elem->tp_cb);
589 WARN_ON(ret);
590 /*
591 * tracepoint_probe_update_all() must be called
592 * before the module containing tp_cb is unloaded.
593 */
594 module_put(__module_text_address((unsigned long)elem->tp_cb));
595 }
596 elem->state = 0;
597 elem->single.func = __mark_empty_function;
598 /* Update the function before setting the ptype */
599 smp_wmb();
600 elem->ptype = 0; /* single probe */
601 /*
602 * Leave the private data and id there, because removal is racy and
603 * should be done only after an RCU period. These are never used until
604 * the next initialization anyway.
605 */
606}
607
608/**
609 * marker_update_probe_range - Update a probe range
610 * @begin: beginning of the range
611 * @end: end of the range
612 *
613 * Updates the probe callback corresponding to a range of markers.
614 */
615void marker_update_probe_range(struct marker *begin,
616 struct marker *end)
617{
618 struct marker *iter;
619 struct marker_entry *mark_entry;
620
621 mutex_lock(&markers_mutex);
622 for (iter = begin; iter < end; iter++) {
623 mark_entry = get_marker(iter->name);
624 if (mark_entry) {
625 set_marker(mark_entry, iter, !!mark_entry->refcount);
626 /*
627 * ignore error, continue
628 */
629 } else {
630 disable_marker(iter);
631 }
632 }
633 mutex_unlock(&markers_mutex);
634}
635
636/*
637 * Update probes, removing the faulty probes.
638 *
639 * Internal callback only changed before the first probe is connected to it.
640 * Single probe private data can only be changed on 0 -> 1 and 2 -> 1
641 * transitions. All other transitions will leave the old private data valid.
642 * This makes the non-atomicity of the callback/private data updates valid.
643 *
644 * "special case" updates :
645 * 0 -> 1 callback
646 * 1 -> 0 callback
647 * 1 -> 2 callbacks
648 * 2 -> 1 callbacks
649 * Other updates all behave the same, just like the 2 -> 3 or 3 -> 2 updates.
650 * Side effect: marker_set_format may delete the marker entry (creating a
651 * replacement).
652 */
653static void marker_update_probes(void)
654{
655 /* Core kernel markers */
656 marker_update_probe_range(__start___markers, __stop___markers);
657 /* Markers in modules. */
658 module_update_markers();
659 tracepoint_probe_update_all();
660}
661
662/**
663 * marker_probe_register - Connect a probe to a marker
664 * @name: marker name
665 * @format: format string
666 * @probe: probe handler
667 * @probe_private: probe private data
668 *
669 * private data must be a valid allocated memory address, or NULL.
670 * Returns 0 if ok, error value on error.
671 * The probe address must at least be aligned on the architecture pointer size.
672 */
673int marker_probe_register(const char *name, const char *format,
674 marker_probe_func *probe, void *probe_private)
675{
676 struct marker_entry *entry;
677 int ret = 0;
678 struct marker_probe_closure *old;
679
680 mutex_lock(&markers_mutex);
681 entry = get_marker(name);
682 if (!entry) {
683 entry = add_marker(name, format);
684 if (IS_ERR(entry))
685 ret = PTR_ERR(entry);
686 } else if (format) {
687 if (!entry->format)
688 ret = marker_set_format(entry, format);
689 else if (strcmp(entry->format, format))
690 ret = -EPERM;
691 }
692 if (ret)
693 goto end;
694
695 /*
696 * If we detect that a call_rcu is pending for this marker,
697 * make sure it's executed now.
698 */
699 if (entry->rcu_pending)
700 rcu_barrier_sched();
701 old = marker_entry_add_probe(entry, probe, probe_private);
702 if (IS_ERR(old)) {
703 ret = PTR_ERR(old);
704 goto end;
705 }
706 mutex_unlock(&markers_mutex);
707 marker_update_probes();
708 mutex_lock(&markers_mutex);
709 entry = get_marker(name);
710 if (!entry)
711 goto end;
712 if (entry->rcu_pending)
713 rcu_barrier_sched();
714 entry->oldptr = old;
715 entry->rcu_pending = 1;
716 /* write rcu_pending before calling the RCU callback */
717 smp_wmb();
718 call_rcu_sched(&entry->rcu, free_old_closure);
719end:
720 mutex_unlock(&markers_mutex);
721 return ret;
722}
723EXPORT_SYMBOL_GPL(marker_probe_register);
724
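For context on the API being deleted here: a marker user dropped trace_mark() into the code path to be instrumented and attached a handler with marker_probe_register(), with the format string checked at registration time. A rough usage sketch against the interface defined in this file (the probe body, event name and format are illustrative):

	#include <linux/marker.h>
	#include <linux/module.h>

	/* signature must match marker_probe_func */
	static void my_probe(void *probe_private, void *call_private,
			     const char *fmt, va_list *args)
	{
		/* args holds the values described by fmt ("value %d") */
	}

	static int __init my_init(void)
	{
		/* instrumentation site elsewhere: trace_mark(my_event, "value %d", v); */
		return marker_probe_register("my_event", "value %d", my_probe, NULL);
	}

	static void __exit my_exit(void)
	{
		marker_probe_unregister("my_event", my_probe, NULL);
	}

	module_init(my_init);
	module_exit(my_exit);
	MODULE_LICENSE("GPL");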
725/**
726 * marker_probe_unregister - Disconnect a probe from a marker
727 * @name: marker name
728 * @probe: probe function pointer
729 * @probe_private: probe private data
730 *
731 * Returns 0 on success, or -ENOENT if the marker is not found.
732 * We do not need to call a synchronize_sched to make sure the probes have
733 * finished running before doing a module unload, because the module unload
734 * itself uses stop_machine(), which ensures that every preempt-disabled section
735 * has finished.
736 */
737int marker_probe_unregister(const char *name,
738 marker_probe_func *probe, void *probe_private)
739{
740 struct marker_entry *entry;
741 struct marker_probe_closure *old;
742 int ret = -ENOENT;
743
744 mutex_lock(&markers_mutex);
745 entry = get_marker(name);
746 if (!entry)
747 goto end;
748 if (entry->rcu_pending)
749 rcu_barrier_sched();
750 old = marker_entry_remove_probe(entry, probe, probe_private);
751 mutex_unlock(&markers_mutex);
752 marker_update_probes();
753 mutex_lock(&markers_mutex);
754 entry = get_marker(name);
755 if (!entry)
756 goto end;
757 if (entry->rcu_pending)
758 rcu_barrier_sched();
759 entry->oldptr = old;
760 entry->rcu_pending = 1;
761 /* write rcu_pending before calling the RCU callback */
762 smp_wmb();
763 call_rcu_sched(&entry->rcu, free_old_closure);
764 remove_marker(name); /* Ignore busy error message */
765 ret = 0;
766end:
767 mutex_unlock(&markers_mutex);
768 return ret;
769}
770EXPORT_SYMBOL_GPL(marker_probe_unregister);
771
772static struct marker_entry *
773get_marker_from_private_data(marker_probe_func *probe, void *probe_private)
774{
775 struct marker_entry *entry;
776 unsigned int i;
777 struct hlist_head *head;
778 struct hlist_node *node;
779
780 for (i = 0; i < MARKER_TABLE_SIZE; i++) {
781 head = &marker_table[i];
782 hlist_for_each_entry(entry, node, head, hlist) {
783 if (!entry->ptype) {
784 if (entry->single.func == probe
785 && entry->single.probe_private
786 == probe_private)
787 return entry;
788 } else {
789 struct marker_probe_closure *closure;
790 closure = entry->multi;
791 for (i = 0; closure[i].func; i++) {
792 if (closure[i].func == probe &&
793 closure[i].probe_private
794 == probe_private)
795 return entry;
796 }
797 }
798 }
799 }
800 return NULL;
801}
802
803/**
804 * marker_probe_unregister_private_data - Disconnect a probe from a marker
805 * @probe: probe function
806 * @probe_private: probe private data
807 *
808 * Unregister a probe by providing the registered private data.
809 * Only removes the first marker found in hash table.
810 * Return 0 on success or error value.
811 * We do not need to call a synchronize_sched to make sure the probes have
812 * finished running before doing a module unload, because the module unload
813 * itself uses stop_machine(), which ensures that every preempt-disabled section
814 * has finished.
815 */
816int marker_probe_unregister_private_data(marker_probe_func *probe,
817 void *probe_private)
818{
819 struct marker_entry *entry;
820 int ret = 0;
821 struct marker_probe_closure *old;
822
823 mutex_lock(&markers_mutex);
824 entry = get_marker_from_private_data(probe, probe_private);
825 if (!entry) {
826 ret = -ENOENT;
827 goto end;
828 }
829 if (entry->rcu_pending)
830 rcu_barrier_sched();
831 old = marker_entry_remove_probe(entry, NULL, probe_private);
832 mutex_unlock(&markers_mutex);
833 marker_update_probes();
834 mutex_lock(&markers_mutex);
835 entry = get_marker_from_private_data(probe, probe_private);
836 if (!entry)
837 goto end;
838 if (entry->rcu_pending)
839 rcu_barrier_sched();
840 entry->oldptr = old;
841 entry->rcu_pending = 1;
842 /* write rcu_pending before calling the RCU callback */
843 smp_wmb();
844 call_rcu_sched(&entry->rcu, free_old_closure);
845 remove_marker(entry->name); /* Ignore busy error message */
846end:
847 mutex_unlock(&markers_mutex);
848 return ret;
849}
850EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data);
851
852/**
853 * marker_get_private_data - Get a marker's probe private data
854 * @name: marker name
855 * @probe: probe to match
856 * @num: get the nth matching probe's private data
857 *
858 * Returns the nth private data pointer (starting from 0) matching, or an
859 * ERR_PTR.
861 * The private data pointer should _only_ be dereferenced if the caller is the
862 * owner of the data, or its content could vanish. This is mostly used to
863 * confirm that a caller is the owner of a registered probe.
864 */
865void *marker_get_private_data(const char *name, marker_probe_func *probe,
866 int num)
867{
868 struct hlist_head *head;
869 struct hlist_node *node;
870 struct marker_entry *e;
871 size_t name_len = strlen(name) + 1;
872 u32 hash = jhash(name, name_len-1, 0);
873 int i;
874
875 head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
876 hlist_for_each_entry(e, node, head, hlist) {
877 if (!strcmp(name, e->name)) {
878 if (!e->ptype) {
879 if (num == 0 && e->single.func == probe)
880 return e->single.probe_private;
881 } else {
882 struct marker_probe_closure *closure;
883 int match = 0;
884 closure = e->multi;
885 for (i = 0; closure[i].func; i++) {
886 if (closure[i].func != probe)
887 continue;
888 if (match++ == num)
889 return closure[i].probe_private;
890 }
891 }
892 break;
893 }
894 }
895 return ERR_PTR(-ENOENT);
896}
897EXPORT_SYMBOL_GPL(marker_get_private_data);
898
899#ifdef CONFIG_MODULES
900
901int marker_module_notify(struct notifier_block *self,
902 unsigned long val, void *data)
903{
904 struct module *mod = data;
905
906 switch (val) {
907 case MODULE_STATE_COMING:
908 marker_update_probe_range(mod->markers,
909 mod->markers + mod->num_markers);
910 break;
911 case MODULE_STATE_GOING:
912 marker_update_probe_range(mod->markers,
913 mod->markers + mod->num_markers);
914 break;
915 }
916 return 0;
917}
918
919struct notifier_block marker_module_nb = {
920 .notifier_call = marker_module_notify,
921 .priority = 0,
922};
923
924static int init_markers(void)
925{
926 return register_module_notifier(&marker_module_nb);
927}
928__initcall(init_markers);
929
930#endif /* CONFIG_MODULES */
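The markers API is removed in favor of tracepoints, which the module.c changes below start using directly (CREATE_TRACE_POINTS plus trace/events/module.h, and the new trace_module_load/put/free call sites). A rough sketch of the equivalent tracepoint-based instrumentation, assuming the DECLARE_TRACE/DEFINE_TRACE interface of this kernel generation (names are illustrative, and a real user would normally put the declaration in a header):

	#include <linux/tracepoint.h>
	#include <linux/module.h>

	DECLARE_TRACE(my_event,
		TP_PROTO(int value),
		TP_ARGS(value));
	DEFINE_TRACE(my_event);

	/* typed probe: no va_list and no runtime format matching as with markers */
	static void my_event_probe(int value)
	{
	}

	static int __init my_init(void)
	{
		int ret = register_trace_my_event(my_event_probe);

		if (!ret)
			trace_my_event(42);	/* instrumentation site */
		return ret;
	}

	static void __exit my_exit(void)
	{
		unregister_trace_my_event(my_event_probe);
		tracepoint_synchronize_unregister();	/* wait for in-flight probes */
	}

	module_init(my_init);
	module_exit(my_exit);
	MODULE_LICENSE("GPL");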
diff --git a/kernel/module.c b/kernel/module.c
index 38928fcaff2b..e6bc4b28aa62 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -47,6 +47,7 @@
47#include <linux/rculist.h> 47#include <linux/rculist.h>
48#include <asm/uaccess.h> 48#include <asm/uaccess.h>
49#include <asm/cacheflush.h> 49#include <asm/cacheflush.h>
50#include <asm/mmu_context.h>
50#include <linux/license.h> 51#include <linux/license.h>
51#include <asm/sections.h> 52#include <asm/sections.h>
52#include <linux/tracepoint.h> 53#include <linux/tracepoint.h>
@@ -55,6 +56,11 @@
55#include <linux/percpu.h> 56#include <linux/percpu.h>
56#include <linux/kmemleak.h> 57#include <linux/kmemleak.h>
57 58
59#define CREATE_TRACE_POINTS
60#include <trace/events/module.h>
61
62EXPORT_TRACEPOINT_SYMBOL(module_get);
63
58#if 0 64#if 0
59#define DEBUGP printk 65#define DEBUGP printk
60#else 66#else
@@ -364,7 +370,7 @@ EXPORT_SYMBOL_GPL(find_module);
364 370
365#ifdef CONFIG_SMP 371#ifdef CONFIG_SMP
366 372
367#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA 373#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA
368 374
369static void *percpu_modalloc(unsigned long size, unsigned long align, 375static void *percpu_modalloc(unsigned long size, unsigned long align,
370 const char *name) 376 const char *name)
@@ -389,7 +395,7 @@ static void percpu_modfree(void *freeme)
389 free_percpu(freeme); 395 free_percpu(freeme);
390} 396}
391 397
392#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ 398#else /* ... CONFIG_HAVE_LEGACY_PER_CPU_AREA */
393 399
394/* Number of blocks used and allocated. */ 400/* Number of blocks used and allocated. */
395static unsigned int pcpu_num_used, pcpu_num_allocated; 401static unsigned int pcpu_num_used, pcpu_num_allocated;
@@ -535,7 +541,7 @@ static int percpu_modinit(void)
535} 541}
536__initcall(percpu_modinit); 542__initcall(percpu_modinit);
537 543
538#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ 544#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */
539 545
540static unsigned int find_pcpusec(Elf_Ehdr *hdr, 546static unsigned int find_pcpusec(Elf_Ehdr *hdr,
541 Elf_Shdr *sechdrs, 547 Elf_Shdr *sechdrs,
@@ -909,16 +915,18 @@ void __symbol_put(const char *symbol)
909} 915}
910EXPORT_SYMBOL(__symbol_put); 916EXPORT_SYMBOL(__symbol_put);
911 917
918/* Note this assumes addr is a function, which it currently always is. */
912void symbol_put_addr(void *addr) 919void symbol_put_addr(void *addr)
913{ 920{
914 struct module *modaddr; 921 struct module *modaddr;
922 unsigned long a = (unsigned long)dereference_function_descriptor(addr);
915 923
916 if (core_kernel_text((unsigned long)addr)) 924 if (core_kernel_text(a))
917 return; 925 return;
918 926
919 /* module_text_address is safe here: we're supposed to have reference 927 /* module_text_address is safe here: we're supposed to have reference
920 * to module from symbol_get, so it can't go away. */ 928 * to module from symbol_get, so it can't go away. */
921 modaddr = __module_text_address((unsigned long)addr); 929 modaddr = __module_text_address(a);
922 BUG_ON(!modaddr); 930 BUG_ON(!modaddr);
923 module_put(modaddr); 931 module_put(modaddr);
924} 932}
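symbol_put_addr() now strips the function descriptor first because, on ABIs such as ia64 and 64-bit PowerPC (ELFv1), a C function pointer does not point at the code itself but at a small descriptor holding the entry address plus a global/TOC pointer; comparing the raw pointer against kernel text would therefore always fail there. Roughly, and only as an illustration (the exact layout is architecture-specific):

	/* illustrative shape of a ppc64 ELFv1-style function descriptor */
	struct func_desc {
		unsigned long entry;	/* address of the actual code */
		unsigned long toc;	/* TOC/GOT pointer loaded at call time */
		unsigned long env;	/* environment pointer, unused by C */
	};

	/* conceptually what dereference_function_descriptor() does on such ABIs; */
	/* on most architectures it returns the pointer unchanged.               */
	static unsigned long text_address_of(void *funcptr)
	{
		return ((struct func_desc *)funcptr)->entry;
	}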
@@ -940,6 +948,8 @@ void module_put(struct module *module)
940 if (module) { 948 if (module) {
941 unsigned int cpu = get_cpu(); 949 unsigned int cpu = get_cpu();
942 local_dec(__module_ref_addr(module, cpu)); 950 local_dec(__module_ref_addr(module, cpu));
951 trace_module_put(module, _RET_IP_,
952 local_read(__module_ref_addr(module, cpu)));
943 /* Maybe they're waiting for us to drop reference? */ 953 /* Maybe they're waiting for us to drop reference? */
944 if (unlikely(!module_is_live(module))) 954 if (unlikely(!module_is_live(module)))
945 wake_up_process(module->waiter); 955 wake_up_process(module->waiter);
@@ -1068,7 +1078,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
1068{ 1078{
1069 const unsigned long *crc; 1079 const unsigned long *crc;
1070 1080
1071 if (!find_symbol("module_layout", NULL, &crc, true, false)) 1081 if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL,
1082 &crc, true, false))
1072 BUG(); 1083 BUG();
1073 return check_version(sechdrs, versindex, "module_layout", mod, crc); 1084 return check_version(sechdrs, versindex, "module_layout", mod, crc);
1074} 1085}
@@ -1271,6 +1282,10 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
1271 struct module_notes_attrs *notes_attrs; 1282 struct module_notes_attrs *notes_attrs;
1272 struct bin_attribute *nattr; 1283 struct bin_attribute *nattr;
1273 1284
1285 /* failed to create section attributes, so can't create notes */
1286 if (!mod->sect_attrs)
1287 return;
1288
1274 /* Count notes sections and allocate structures. */ 1289 /* Count notes sections and allocate structures. */
1275 notes = 0; 1290 notes = 0;
1276 for (i = 0; i < nsect; i++) 1291 for (i = 0; i < nsect; i++)
@@ -1490,6 +1505,8 @@ static int __unlink_module(void *_mod)
1490/* Free a module, remove from lists, etc (must hold module_mutex). */ 1505/* Free a module, remove from lists, etc (must hold module_mutex). */
1491static void free_module(struct module *mod) 1506static void free_module(struct module *mod)
1492{ 1507{
1508 trace_module_free(mod);
1509
1493 /* Delete from various lists */ 1510 /* Delete from various lists */
1494 stop_machine(__unlink_module, mod, NULL); 1511 stop_machine(__unlink_module, mod, NULL);
1495 remove_notes_attrs(mod); 1512 remove_notes_attrs(mod);
@@ -1519,6 +1536,10 @@ static void free_module(struct module *mod)
1519 1536
1520 /* Finally, free the core (containing the module structure) */ 1537 /* Finally, free the core (containing the module structure) */
1521 module_free(mod, mod->module_core); 1538 module_free(mod, mod->module_core);
1539
1540#ifdef CONFIG_MPU
1541 update_protections(current->mm);
1542#endif
1522} 1543}
1523 1544
1524void *__symbol_get(const char *symbol) 1545void *__symbol_get(const char *symbol)
@@ -2221,10 +2242,6 @@ static noinline struct module *load_module(void __user *umod,
2221 sizeof(*mod->ctors), &mod->num_ctors); 2242 sizeof(*mod->ctors), &mod->num_ctors);
2222#endif 2243#endif
2223 2244
2224#ifdef CONFIG_MARKERS
2225 mod->markers = section_objs(hdr, sechdrs, secstrings, "__markers",
2226 sizeof(*mod->markers), &mod->num_markers);
2227#endif
2228#ifdef CONFIG_TRACEPOINTS 2245#ifdef CONFIG_TRACEPOINTS
2229 mod->tracepoints = section_objs(hdr, sechdrs, secstrings, 2246 mod->tracepoints = section_objs(hdr, sechdrs, secstrings,
2230 "__tracepoints", 2247 "__tracepoints",
@@ -2357,6 +2374,8 @@ static noinline struct module *load_module(void __user *umod,
2357 /* Get rid of temporary copy */ 2374 /* Get rid of temporary copy */
2358 vfree(hdr); 2375 vfree(hdr);
2359 2376
2377 trace_module_load(mod);
2378
2360 /* Done! */ 2379 /* Done! */
2361 return mod; 2380 return mod;
2362 2381
@@ -2451,9 +2470,9 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2451 return ret; 2470 return ret;
2452 } 2471 }
2453 if (ret > 0) { 2472 if (ret > 0) {
2454 printk(KERN_WARNING "%s: '%s'->init suspiciously returned %d, " 2473 printk(KERN_WARNING
2455 "it should follow 0/-E convention\n" 2474"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n"
2456 KERN_WARNING "%s: loading module anyway...\n", 2475"%s: loading module anyway...\n",
2457 __func__, mod->name, ret, 2476 __func__, mod->name, ret,
2458 __func__); 2477 __func__);
2459 dump_stack(); 2478 dump_stack();
@@ -2940,20 +2959,6 @@ void module_layout(struct module *mod,
2940EXPORT_SYMBOL(module_layout); 2959EXPORT_SYMBOL(module_layout);
2941#endif 2960#endif
2942 2961
2943#ifdef CONFIG_MARKERS
2944void module_update_markers(void)
2945{
2946 struct module *mod;
2947
2948 mutex_lock(&module_mutex);
2949 list_for_each_entry(mod, &modules, list)
2950 if (!mod->taints)
2951 marker_update_probe_range(mod->markers,
2952 mod->markers + mod->num_markers);
2953 mutex_unlock(&module_mutex);
2954}
2955#endif
2956
2957#ifdef CONFIG_TRACEPOINTS 2962#ifdef CONFIG_TRACEPOINTS
2958void module_update_tracepoints(void) 2963void module_update_tracepoints(void)
2959{ 2964{
diff --git a/kernel/panic.c b/kernel/panic.c
index 984b3ecbd72c..bcdef26e3332 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -177,7 +177,7 @@ static const struct tnt tnts[] = {
177 * 'W' - Taint on warning. 177 * 'W' - Taint on warning.
178 * 'C' - modules from drivers/staging are loaded. 178 * 'C' - modules from drivers/staging are loaded.
179 * 179 *
180 * The string is overwritten by the next call to print_taint(). 180 * The string is overwritten by the next call to print_tainted().
181 */ 181 */
182const char *print_tainted(void) 182const char *print_tainted(void)
183{ 183{
@@ -301,6 +301,7 @@ int oops_may_print(void)
301 */ 301 */
302void oops_enter(void) 302void oops_enter(void)
303{ 303{
304 tracing_off();
304 /* can't trust the integrity of the kernel anymore: */ 305 /* can't trust the integrity of the kernel anymore: */
305 debug_locks_off(); 306 debug_locks_off();
306 do_oops_enter_exit(); 307 do_oops_enter_exit();
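Calling tracing_off() first thing in oops_enter() freezes the ftrace ring buffer, so the events leading up to the crash are preserved instead of being overwritten while the oops text is produced. The same on/off pair can be used by hand to bracket a suspected problem window (a minimal sketch; my_suspect_path() is a placeholder):

	#include <linux/kernel.h>	/* tracing_on()/tracing_off() */

	static void my_suspect_path(void)
	{
		/* ... code under investigation ... */
	}

	static void run_and_freeze_trace(void)
	{
		tracing_on();		/* make sure the ring buffer is recording */
		my_suspect_path();
		tracing_off();		/* freeze the buffer; inspect it via the trace file */
	}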
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
deleted file mode 100644
index 1a933a221ea4..000000000000
--- a/kernel/perf_counter.c
+++ /dev/null
@@ -1,4383 +0,0 @@
1/*
2 * Performance counter core code
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 *
9 * For licensing details see kernel-base/COPYING
10 */
11
12#include <linux/fs.h>
13#include <linux/mm.h>
14#include <linux/cpu.h>
15#include <linux/smp.h>
16#include <linux/file.h>
17#include <linux/poll.h>
18#include <linux/sysfs.h>
19#include <linux/dcache.h>
20#include <linux/percpu.h>
21#include <linux/ptrace.h>
22#include <linux/vmstat.h>
23#include <linux/hardirq.h>
24#include <linux/rculist.h>
25#include <linux/uaccess.h>
26#include <linux/syscalls.h>
27#include <linux/anon_inodes.h>
28#include <linux/kernel_stat.h>
29#include <linux/perf_counter.h>
30
31#include <asm/irq_regs.h>
32
33/*
34 * Each CPU has a list of per CPU counters:
35 */
36DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
37
38int perf_max_counters __read_mostly = 1;
39static int perf_reserved_percpu __read_mostly;
40static int perf_overcommit __read_mostly = 1;
41
42static atomic_t nr_counters __read_mostly;
43static atomic_t nr_mmap_counters __read_mostly;
44static atomic_t nr_comm_counters __read_mostly;
45
46/*
47 * perf counter paranoia level:
48 * 0 - not paranoid
49 * 1 - disallow cpu counters to unpriv
50 * 2 - disallow kernel profiling to unpriv
51 */
52int sysctl_perf_counter_paranoid __read_mostly;
53
54static inline bool perf_paranoid_cpu(void)
55{
56 return sysctl_perf_counter_paranoid > 0;
57}
58
59static inline bool perf_paranoid_kernel(void)
60{
61 return sysctl_perf_counter_paranoid > 1;
62}
63
64int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */
65
66/*
67 * max perf counter sample rate
68 */
69int sysctl_perf_counter_sample_rate __read_mostly = 100000;
70
71static atomic64_t perf_counter_id;
72
73/*
74 * Lock for (sysadmin-configurable) counter reservations:
75 */
76static DEFINE_SPINLOCK(perf_resource_lock);
77
78/*
79 * Architecture provided APIs - weak aliases:
80 */
81extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
82{
83 return NULL;
84}
85
86void __weak hw_perf_disable(void) { barrier(); }
87void __weak hw_perf_enable(void) { barrier(); }
88
89void __weak hw_perf_counter_setup(int cpu) { barrier(); }
90
91int __weak
92hw_perf_group_sched_in(struct perf_counter *group_leader,
93 struct perf_cpu_context *cpuctx,
94 struct perf_counter_context *ctx, int cpu)
95{
96 return 0;
97}
98
99void __weak perf_counter_print_debug(void) { }
100
101static DEFINE_PER_CPU(int, disable_count);
102
103void __perf_disable(void)
104{
105 __get_cpu_var(disable_count)++;
106}
107
108bool __perf_enable(void)
109{
110 return !--__get_cpu_var(disable_count);
111}
112
113void perf_disable(void)
114{
115 __perf_disable();
116 hw_perf_disable();
117}
118
119void perf_enable(void)
120{
121 if (__perf_enable())
122 hw_perf_enable();
123}
124
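perf_disable()/perf_enable() nest by means of a per-CPU depth counter: every perf_disable() bumps the count and disables the hardware (which is harmless to repeat), while only the outermost perf_enable() actually re-enables the PMU. The same pattern reduced to a standalone sketch (names and the hardware hooks are illustrative, not the perf code itself; callers are assumed to run with preemption disabled):

	#include <linux/percpu.h>

	static DEFINE_PER_CPU(int, my_disable_depth);

	static void hw_disable(void) { /* touch the hardware */ }
	static void hw_enable(void)  { /* touch the hardware */ }

	static void my_disable(void)
	{
		__get_cpu_var(my_disable_depth)++;
		hw_disable();			/* idempotent, safe while nested */
	}

	static void my_enable(void)
	{
		if (!--__get_cpu_var(my_disable_depth))
			hw_enable();		/* only the outermost enable acts */
	}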
125static void get_ctx(struct perf_counter_context *ctx)
126{
127 WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
128}
129
130static void free_ctx(struct rcu_head *head)
131{
132 struct perf_counter_context *ctx;
133
134 ctx = container_of(head, struct perf_counter_context, rcu_head);
135 kfree(ctx);
136}
137
138static void put_ctx(struct perf_counter_context *ctx)
139{
140 if (atomic_dec_and_test(&ctx->refcount)) {
141 if (ctx->parent_ctx)
142 put_ctx(ctx->parent_ctx);
143 if (ctx->task)
144 put_task_struct(ctx->task);
145 call_rcu(&ctx->rcu_head, free_ctx);
146 }
147}
148
149/*
150 * Get the perf_counter_context for a task and lock it.
151 * This has to cope with the fact that until it is locked,
152 * the context could get moved to another task.
153 */
154static struct perf_counter_context *
155perf_lock_task_context(struct task_struct *task, unsigned long *flags)
156{
157 struct perf_counter_context *ctx;
158
159 rcu_read_lock();
160 retry:
161 ctx = rcu_dereference(task->perf_counter_ctxp);
162 if (ctx) {
163 /*
164 * If this context is a clone of another, it might
165 * get swapped for another underneath us by
166 * perf_counter_task_sched_out, though the
167 * rcu_read_lock() protects us from any context
168 * getting freed. Lock the context and check if it
169 * got swapped before we could get the lock, and retry
170 * if so. If we locked the right context, then it
171 * can't get swapped on us any more.
172 */
173 spin_lock_irqsave(&ctx->lock, *flags);
174 if (ctx != rcu_dereference(task->perf_counter_ctxp)) {
175 spin_unlock_irqrestore(&ctx->lock, *flags);
176 goto retry;
177 }
178
179 if (!atomic_inc_not_zero(&ctx->refcount)) {
180 spin_unlock_irqrestore(&ctx->lock, *flags);
181 ctx = NULL;
182 }
183 }
184 rcu_read_unlock();
185 return ctx;
186}
187
188/*
189 * Get the context for a task and increment its pin_count so it
190 * can't get swapped to another task. This also increments its
191 * reference count so that the context can't get freed.
192 */
193static struct perf_counter_context *perf_pin_task_context(struct task_struct *task)
194{
195 struct perf_counter_context *ctx;
196 unsigned long flags;
197
198 ctx = perf_lock_task_context(task, &flags);
199 if (ctx) {
200 ++ctx->pin_count;
201 spin_unlock_irqrestore(&ctx->lock, flags);
202 }
203 return ctx;
204}
205
206static void perf_unpin_context(struct perf_counter_context *ctx)
207{
208 unsigned long flags;
209
210 spin_lock_irqsave(&ctx->lock, flags);
211 --ctx->pin_count;
212 spin_unlock_irqrestore(&ctx->lock, flags);
213 put_ctx(ctx);
214}
215
216/*
217 * Add a counter to the lists for its context.
218 * Must be called with ctx->mutex and ctx->lock held.
219 */
220static void
221list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
222{
223 struct perf_counter *group_leader = counter->group_leader;
224
225 /*
226 * Depending on whether it is a standalone or sibling counter,
227 * add it straight to the context's counter list, or to the group
228 * leader's sibling list:
229 */
230 if (group_leader == counter)
231 list_add_tail(&counter->list_entry, &ctx->counter_list);
232 else {
233 list_add_tail(&counter->list_entry, &group_leader->sibling_list);
234 group_leader->nr_siblings++;
235 }
236
237 list_add_rcu(&counter->event_entry, &ctx->event_list);
238 ctx->nr_counters++;
239}
240
241/*
242 * Remove a counter from the lists for its context.
243 * Must be called with ctx->mutex and ctx->lock held.
244 */
245static void
246list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
247{
248 struct perf_counter *sibling, *tmp;
249
250 if (list_empty(&counter->list_entry))
251 return;
252 ctx->nr_counters--;
253
254 list_del_init(&counter->list_entry);
255 list_del_rcu(&counter->event_entry);
256
257 if (counter->group_leader != counter)
258 counter->group_leader->nr_siblings--;
259
260 /*
261 * If this was a group counter with sibling counters then
262 * upgrade the siblings to singleton counters by adding them
263 * to the context list directly:
264 */
265 list_for_each_entry_safe(sibling, tmp,
266 &counter->sibling_list, list_entry) {
267
268 list_move_tail(&sibling->list_entry, &ctx->counter_list);
269 sibling->group_leader = sibling;
270 }
271}
272
273static void
274counter_sched_out(struct perf_counter *counter,
275 struct perf_cpu_context *cpuctx,
276 struct perf_counter_context *ctx)
277{
278 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
279 return;
280
281 counter->state = PERF_COUNTER_STATE_INACTIVE;
282 counter->tstamp_stopped = ctx->time;
283 counter->pmu->disable(counter);
284 counter->oncpu = -1;
285
286 if (!is_software_counter(counter))
287 cpuctx->active_oncpu--;
288 ctx->nr_active--;
289 if (counter->attr.exclusive || !cpuctx->active_oncpu)
290 cpuctx->exclusive = 0;
291}
292
293static void
294group_sched_out(struct perf_counter *group_counter,
295 struct perf_cpu_context *cpuctx,
296 struct perf_counter_context *ctx)
297{
298 struct perf_counter *counter;
299
300 if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
301 return;
302
303 counter_sched_out(group_counter, cpuctx, ctx);
304
305 /*
306 * Schedule out siblings (if any):
307 */
308 list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
309 counter_sched_out(counter, cpuctx, ctx);
310
311 if (group_counter->attr.exclusive)
312 cpuctx->exclusive = 0;
313}
314
315/*
316 * Cross CPU call to remove a performance counter
317 *
318 * We disable the counter on the hardware level first. After that we
319 * remove it from the context list.
320 */
321static void __perf_counter_remove_from_context(void *info)
322{
323 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
324 struct perf_counter *counter = info;
325 struct perf_counter_context *ctx = counter->ctx;
326
327 /*
328 * If this is a task context, we need to check whether it is
329 * the current task context of this cpu. If not it has been
330 * scheduled out before the smp call arrived.
331 */
332 if (ctx->task && cpuctx->task_ctx != ctx)
333 return;
334
335 spin_lock(&ctx->lock);
336 /*
337 * Protect the list operation against NMI by disabling the
338 * counters on a global level.
339 */
340 perf_disable();
341
342 counter_sched_out(counter, cpuctx, ctx);
343
344 list_del_counter(counter, ctx);
345
346 if (!ctx->task) {
347 /*
348 * Allow more per task counters with respect to the
349 * reservation:
350 */
351 cpuctx->max_pertask =
352 min(perf_max_counters - ctx->nr_counters,
353 perf_max_counters - perf_reserved_percpu);
354 }
355
356 perf_enable();
357 spin_unlock(&ctx->lock);
358}
359
360
361/*
362 * Remove the counter from a task's (or a CPU's) list of counters.
363 *
364 * Must be called with ctx->mutex held.
365 *
366 * CPU counters are removed with a smp call. For task counters we only
367 * call when the task is on a CPU.
368 *
369 * If counter->ctx is a cloned context, callers must make sure that
370 * every task struct that counter->ctx->task could possibly point to
371 * remains valid. This is OK when called from perf_release since
372 * that only calls us on the top-level context, which can't be a clone.
373 * When called from perf_counter_exit_task, it's OK because the
374 * context has been detached from its task.
375 */
376static void perf_counter_remove_from_context(struct perf_counter *counter)
377{
378 struct perf_counter_context *ctx = counter->ctx;
379 struct task_struct *task = ctx->task;
380
381 if (!task) {
382 /*
383 * Per cpu counters are removed via an smp call and
384		 * the removal is always successful.
385 */
386 smp_call_function_single(counter->cpu,
387 __perf_counter_remove_from_context,
388 counter, 1);
389 return;
390 }
391
392retry:
393 task_oncpu_function_call(task, __perf_counter_remove_from_context,
394 counter);
395
396 spin_lock_irq(&ctx->lock);
397 /*
398 * If the context is active we need to retry the smp call.
399 */
400 if (ctx->nr_active && !list_empty(&counter->list_entry)) {
401 spin_unlock_irq(&ctx->lock);
402 goto retry;
403 }
404
405 /*
406 * The lock prevents that this context is scheduled in so we
407 * can remove the counter safely, if the call above did not
408 * succeed.
409 */
410 if (!list_empty(&counter->list_entry)) {
411 list_del_counter(counter, ctx);
412 }
413 spin_unlock_irq(&ctx->lock);
414}
415
416static inline u64 perf_clock(void)
417{
418 return cpu_clock(smp_processor_id());
419}
420
421/*
422 * Update the record of the current time in a context.
423 */
424static void update_context_time(struct perf_counter_context *ctx)
425{
426 u64 now = perf_clock();
427
428 ctx->time += now - ctx->timestamp;
429 ctx->timestamp = now;
430}
431
432/*
433 * Update the total_time_enabled and total_time_running fields for a counter.
434 */
435static void update_counter_times(struct perf_counter *counter)
436{
437 struct perf_counter_context *ctx = counter->ctx;
438 u64 run_end;
439
440 if (counter->state < PERF_COUNTER_STATE_INACTIVE)
441 return;
442
443 counter->total_time_enabled = ctx->time - counter->tstamp_enabled;
444
445 if (counter->state == PERF_COUNTER_STATE_INACTIVE)
446 run_end = counter->tstamp_stopped;
447 else
448 run_end = ctx->time;
449
450 counter->total_time_running = run_end - counter->tstamp_running;
451}
452
453/*
454 * Update total_time_enabled and total_time_running for all counters in a group.
455 */
456static void update_group_times(struct perf_counter *leader)
457{
458 struct perf_counter *counter;
459
460 update_counter_times(leader);
461 list_for_each_entry(counter, &leader->sibling_list, list_entry)
462 update_counter_times(counter);
463}
464
465/*
466 * Cross CPU call to disable a performance counter
467 */
468static void __perf_counter_disable(void *info)
469{
470 struct perf_counter *counter = info;
471 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
472 struct perf_counter_context *ctx = counter->ctx;
473
474 /*
475 * If this is a per-task counter, need to check whether this
476 * counter's task is the current task on this cpu.
477 */
478 if (ctx->task && cpuctx->task_ctx != ctx)
479 return;
480
481 spin_lock(&ctx->lock);
482
483 /*
484 * If the counter is on, turn it off.
485 * If it is in error state, leave it in error state.
486 */
487 if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
488 update_context_time(ctx);
489 update_counter_times(counter);
490 if (counter == counter->group_leader)
491 group_sched_out(counter, cpuctx, ctx);
492 else
493 counter_sched_out(counter, cpuctx, ctx);
494 counter->state = PERF_COUNTER_STATE_OFF;
495 }
496
497 spin_unlock(&ctx->lock);
498}
499
500/*
501 * Disable a counter.
502 *
503 * If counter->ctx is a cloned context, callers must make sure that
504 * every task struct that counter->ctx->task could possibly point to
505	 * remains valid. This condition is satisfied when called through
506 * perf_counter_for_each_child or perf_counter_for_each because they
507 * hold the top-level counter's child_mutex, so any descendant that
508 * goes to exit will block in sync_child_counter.
509 * When called from perf_pending_counter it's OK because counter->ctx
510 * is the current context on this CPU and preemption is disabled,
511 * hence we can't get into perf_counter_task_sched_out for this context.
512 */
513static void perf_counter_disable(struct perf_counter *counter)
514{
515 struct perf_counter_context *ctx = counter->ctx;
516 struct task_struct *task = ctx->task;
517
518 if (!task) {
519 /*
520 * Disable the counter on the cpu that it's on
521 */
522 smp_call_function_single(counter->cpu, __perf_counter_disable,
523 counter, 1);
524 return;
525 }
526
527 retry:
528 task_oncpu_function_call(task, __perf_counter_disable, counter);
529
530 spin_lock_irq(&ctx->lock);
531 /*
532 * If the counter is still active, we need to retry the cross-call.
533 */
534 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
535 spin_unlock_irq(&ctx->lock);
536 goto retry;
537 }
538
539 /*
540 * Since we have the lock this context can't be scheduled
541 * in, so we can change the state safely.
542 */
543 if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
544 update_counter_times(counter);
545 counter->state = PERF_COUNTER_STATE_OFF;
546 }
547
548 spin_unlock_irq(&ctx->lock);
549}
550
551static int
552counter_sched_in(struct perf_counter *counter,
553 struct perf_cpu_context *cpuctx,
554 struct perf_counter_context *ctx,
555 int cpu)
556{
557 if (counter->state <= PERF_COUNTER_STATE_OFF)
558 return 0;
559
560 counter->state = PERF_COUNTER_STATE_ACTIVE;
561 counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
562 /*
563 * The new state must be visible before we turn it on in the hardware:
564 */
565 smp_wmb();
566
567 if (counter->pmu->enable(counter)) {
568 counter->state = PERF_COUNTER_STATE_INACTIVE;
569 counter->oncpu = -1;
570 return -EAGAIN;
571 }
572
573 counter->tstamp_running += ctx->time - counter->tstamp_stopped;
574
575 if (!is_software_counter(counter))
576 cpuctx->active_oncpu++;
577 ctx->nr_active++;
578
579 if (counter->attr.exclusive)
580 cpuctx->exclusive = 1;
581
582 return 0;
583}
584
585static int
586group_sched_in(struct perf_counter *group_counter,
587 struct perf_cpu_context *cpuctx,
588 struct perf_counter_context *ctx,
589 int cpu)
590{
591 struct perf_counter *counter, *partial_group;
592 int ret;
593
594 if (group_counter->state == PERF_COUNTER_STATE_OFF)
595 return 0;
596
597 ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
598 if (ret)
599 return ret < 0 ? ret : 0;
600
601 if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
602 return -EAGAIN;
603
604 /*
605 * Schedule in siblings as one group (if any):
606 */
607 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
608 if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
609 partial_group = counter;
610 goto group_error;
611 }
612 }
613
614 return 0;
615
616group_error:
617 /*
618 * Groups can be scheduled in as one unit only, so undo any
619 * partial group before returning:
620 */
621 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
622 if (counter == partial_group)
623 break;
624 counter_sched_out(counter, cpuctx, ctx);
625 }
626 counter_sched_out(group_counter, cpuctx, ctx);
627
628 return -EAGAIN;
629}
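/*
 * Group scheduling is all-or-nothing: hw_perf_group_sched_in() may take
 * the whole group in one arch-specific operation (a non-negative,
 * non-zero return), otherwise the leader and each sibling are put on
 * one by one, and any failure unwinds the members that already made it
 * on via the group_error path above.
 */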
630
631/*
632 * Return 1 for a group consisting entirely of software counters,
633 * 0 if the group contains any hardware counters.
634 */
635static int is_software_only_group(struct perf_counter *leader)
636{
637 struct perf_counter *counter;
638
639 if (!is_software_counter(leader))
640 return 0;
641
642 list_for_each_entry(counter, &leader->sibling_list, list_entry)
643 if (!is_software_counter(counter))
644 return 0;
645
646 return 1;
647}
648
649/*
650 * Work out whether we can put this counter group on the CPU now.
651 */
652static int group_can_go_on(struct perf_counter *counter,
653 struct perf_cpu_context *cpuctx,
654 int can_add_hw)
655{
656 /*
657 * Groups consisting entirely of software counters can always go on.
658 */
659 if (is_software_only_group(counter))
660 return 1;
661 /*
662 * If an exclusive group is already on, no other hardware
663 * counters can go on.
664 */
665 if (cpuctx->exclusive)
666 return 0;
667 /*
668 * If this group is exclusive and there are already
669 * counters on the CPU, it can't go on.
670 */
671 if (counter->attr.exclusive && cpuctx->active_oncpu)
672 return 0;
673 /*
674 * Otherwise, try to add it if all previous groups were able
675 * to go on.
676 */
677 return can_add_hw;
678}
679
680static void add_counter_to_ctx(struct perf_counter *counter,
681 struct perf_counter_context *ctx)
682{
683 list_add_counter(counter, ctx);
684 counter->tstamp_enabled = ctx->time;
685 counter->tstamp_running = ctx->time;
686 counter->tstamp_stopped = ctx->time;
687}
688
689/*
690 * Cross CPU call to install and enable a performance counter
691 *
692 * Must be called with ctx->mutex held
693 */
694static void __perf_install_in_context(void *info)
695{
696 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
697 struct perf_counter *counter = info;
698 struct perf_counter_context *ctx = counter->ctx;
699 struct perf_counter *leader = counter->group_leader;
700 int cpu = smp_processor_id();
701 int err;
702
703 /*
704 * If this is a task context, we need to check whether it is
705 * the current task context of this cpu. If not it has been
706 * scheduled out before the smp call arrived.
707 * Or possibly this is the right context but it isn't
708 * on this cpu because it had no counters.
709 */
710 if (ctx->task && cpuctx->task_ctx != ctx) {
711 if (cpuctx->task_ctx || ctx->task != current)
712 return;
713 cpuctx->task_ctx = ctx;
714 }
715
716 spin_lock(&ctx->lock);
717 ctx->is_active = 1;
718 update_context_time(ctx);
719
720 /*
721 * Protect the list operation against NMI by disabling the
722 * counters on a global level. NOP for non NMI based counters.
723 */
724 perf_disable();
725
726 add_counter_to_ctx(counter, ctx);
727
728 /*
729 * Don't put the counter on if it is disabled or if
730 * it is in a group and the group isn't on.
731 */
732 if (counter->state != PERF_COUNTER_STATE_INACTIVE ||
733 (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE))
734 goto unlock;
735
736 /*
737 * An exclusive counter can't go on if there are already active
738 * hardware counters, and no hardware counter can go on if there
739 * is already an exclusive counter on.
740 */
741 if (!group_can_go_on(counter, cpuctx, 1))
742 err = -EEXIST;
743 else
744 err = counter_sched_in(counter, cpuctx, ctx, cpu);
745
746 if (err) {
747 /*
748 * This counter couldn't go on. If it is in a group
749 * then we have to pull the whole group off.
750 * If the counter group is pinned then put it in error state.
751 */
752 if (leader != counter)
753 group_sched_out(leader, cpuctx, ctx);
754 if (leader->attr.pinned) {
755 update_group_times(leader);
756 leader->state = PERF_COUNTER_STATE_ERROR;
757 }
758 }
759
760 if (!err && !ctx->task && cpuctx->max_pertask)
761 cpuctx->max_pertask--;
762
763 unlock:
764 perf_enable();
765
766 spin_unlock(&ctx->lock);
767}
768
769/*
770 * Attach a performance counter to a context
771 *
772 * First we add the counter to the list with the hardware enable bit
773 * in counter->hw_config cleared.
774 *
775 * If the counter is attached to a task which is on a CPU we use a smp
776 * call to enable it in the task context. The task might have been
777 * scheduled away, but we check this in the smp call again.
778 *
779 * Must be called with ctx->mutex held.
780 */
781static void
782perf_install_in_context(struct perf_counter_context *ctx,
783 struct perf_counter *counter,
784 int cpu)
785{
786 struct task_struct *task = ctx->task;
787
788 if (!task) {
789 /*
790 * Per cpu counters are installed via an smp call and
791	 * the install is always successful.
792 */
793 smp_call_function_single(cpu, __perf_install_in_context,
794 counter, 1);
795 return;
796 }
797
798retry:
799 task_oncpu_function_call(task, __perf_install_in_context,
800 counter);
801
802 spin_lock_irq(&ctx->lock);
803 /*
804	 * If the context is active and the counter has not been added, retry the smp call.
805 */
806 if (ctx->is_active && list_empty(&counter->list_entry)) {
807 spin_unlock_irq(&ctx->lock);
808 goto retry;
809 }
810
811 /*
812	 * The lock prevents this context from being scheduled in, so we
813	 * can add the counter safely if the call above did not
814	 * succeed.
815 */
816 if (list_empty(&counter->list_entry))
817 add_counter_to_ctx(counter, ctx);
818 spin_unlock_irq(&ctx->lock);
819}
820
821/*
822 * Cross CPU call to enable a performance counter
823 */
824static void __perf_counter_enable(void *info)
825{
826 struct perf_counter *counter = info;
827 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
828 struct perf_counter_context *ctx = counter->ctx;
829 struct perf_counter *leader = counter->group_leader;
830 int err;
831
832 /*
833 * If this is a per-task counter, need to check whether this
834 * counter's task is the current task on this cpu.
835 */
836 if (ctx->task && cpuctx->task_ctx != ctx) {
837 if (cpuctx->task_ctx || ctx->task != current)
838 return;
839 cpuctx->task_ctx = ctx;
840 }
841
842 spin_lock(&ctx->lock);
843 ctx->is_active = 1;
844 update_context_time(ctx);
845
846 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
847 goto unlock;
848 counter->state = PERF_COUNTER_STATE_INACTIVE;
849 counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
850
851 /*
852 * If the counter is in a group and isn't the group leader,
853 * then don't put it on unless the group is on.
854 */
855 if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
856 goto unlock;
857
858 if (!group_can_go_on(counter, cpuctx, 1)) {
859 err = -EEXIST;
860 } else {
861 perf_disable();
862 if (counter == leader)
863 err = group_sched_in(counter, cpuctx, ctx,
864 smp_processor_id());
865 else
866 err = counter_sched_in(counter, cpuctx, ctx,
867 smp_processor_id());
868 perf_enable();
869 }
870
871 if (err) {
872 /*
873 * If this counter can't go on and it's part of a
874 * group, then the whole group has to come off.
875 */
876 if (leader != counter)
877 group_sched_out(leader, cpuctx, ctx);
878 if (leader->attr.pinned) {
879 update_group_times(leader);
880 leader->state = PERF_COUNTER_STATE_ERROR;
881 }
882 }
883
884 unlock:
885 spin_unlock(&ctx->lock);
886}
887
888/*
889 * Enable a counter.
890 *
891 * If counter->ctx is a cloned context, callers must make sure that
892 * every task struct that counter->ctx->task could possibly point to
893 * remains valid. This condition is satisfied when called through
894 * perf_counter_for_each_child or perf_counter_for_each as described
895 * for perf_counter_disable.
896 */
897static void perf_counter_enable(struct perf_counter *counter)
898{
899 struct perf_counter_context *ctx = counter->ctx;
900 struct task_struct *task = ctx->task;
901
902 if (!task) {
903 /*
904 * Enable the counter on the cpu that it's on
905 */
906 smp_call_function_single(counter->cpu, __perf_counter_enable,
907 counter, 1);
908 return;
909 }
910
911 spin_lock_irq(&ctx->lock);
912 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
913 goto out;
914
915 /*
916 * If the counter is in error state, clear that first.
917 * That way, if we see the counter in error state below, we
918 * know that it has gone back into error state, as distinct
919 * from the task having been scheduled away before the
920 * cross-call arrived.
921 */
922 if (counter->state == PERF_COUNTER_STATE_ERROR)
923 counter->state = PERF_COUNTER_STATE_OFF;
924
925 retry:
926 spin_unlock_irq(&ctx->lock);
927 task_oncpu_function_call(task, __perf_counter_enable, counter);
928
929 spin_lock_irq(&ctx->lock);
930
931 /*
932 * If the context is active and the counter is still off,
933 * we need to retry the cross-call.
934 */
935 if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF)
936 goto retry;
937
938 /*
939 * Since we have the lock this context can't be scheduled
940 * in, so we can change the state safely.
941 */
942 if (counter->state == PERF_COUNTER_STATE_OFF) {
943 counter->state = PERF_COUNTER_STATE_INACTIVE;
944 counter->tstamp_enabled =
945 ctx->time - counter->total_time_enabled;
946 }
947 out:
948 spin_unlock_irq(&ctx->lock);
949}
950
951static int perf_counter_refresh(struct perf_counter *counter, int refresh)
952{
953 /*
954 * not supported on inherited counters
955 */
956 if (counter->attr.inherit)
957 return -EINVAL;
958
959 atomic_add(refresh, &counter->event_limit);
960 perf_counter_enable(counter);
961
962 return 0;
963}
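/*
 * perf_counter_refresh() bumps the counter's event_limit by 'refresh'
 * and re-enables it; it is driven from userspace through the
 * PERF_COUNTER_IOC_REFRESH ioctl (see perf_ioctl() below) and is not
 * supported on inherited counters.
 */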
964
965void __perf_counter_sched_out(struct perf_counter_context *ctx,
966 struct perf_cpu_context *cpuctx)
967{
968 struct perf_counter *counter;
969
970 spin_lock(&ctx->lock);
971 ctx->is_active = 0;
972 if (likely(!ctx->nr_counters))
973 goto out;
974 update_context_time(ctx);
975
976 perf_disable();
977 if (ctx->nr_active) {
978 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
979 if (counter != counter->group_leader)
980 counter_sched_out(counter, cpuctx, ctx);
981 else
982 group_sched_out(counter, cpuctx, ctx);
983 }
984 }
985 perf_enable();
986 out:
987 spin_unlock(&ctx->lock);
988}
989
990/*
991 * Test whether two contexts are equivalent, i.e. whether they
992 * have both been cloned from the same version of the same context
993 * and they both have the same number of enabled counters.
994 * If the number of enabled counters is the same, then the set
995 * of enabled counters should be the same, because these are both
996 * inherited contexts, therefore we can't access individual counters
997 * in them directly with an fd; we can only enable/disable all
998 * counters via prctl, or enable/disable all counters in a family
999 * via ioctl, which will have the same effect on both contexts.
1000 */
1001static int context_equiv(struct perf_counter_context *ctx1,
1002 struct perf_counter_context *ctx2)
1003{
1004 return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
1005 && ctx1->parent_gen == ctx2->parent_gen
1006 && !ctx1->pin_count && !ctx2->pin_count;
1007}
1008
1009/*
1010 * Called from scheduler to remove the counters of the current task,
1011 * with interrupts disabled.
1012 *
1013 * We stop each counter and update the counter value in counter->count.
1014 *
1015 * This does not protect us against NMI, but disable()
1016 * sets the disabled bit in the control field of counter _before_
1017	 * accessing the counter control register. If an NMI hits, then it will
1018 * not restart the counter.
1019 */
1020void perf_counter_task_sched_out(struct task_struct *task,
1021 struct task_struct *next, int cpu)
1022{
1023 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1024 struct perf_counter_context *ctx = task->perf_counter_ctxp;
1025 struct perf_counter_context *next_ctx;
1026 struct perf_counter_context *parent;
1027 struct pt_regs *regs;
1028 int do_switch = 1;
1029
1030 regs = task_pt_regs(task);
1031 perf_swcounter_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
1032
1033 if (likely(!ctx || !cpuctx->task_ctx))
1034 return;
1035
1036 update_context_time(ctx);
1037
1038 rcu_read_lock();
1039 parent = rcu_dereference(ctx->parent_ctx);
1040 next_ctx = next->perf_counter_ctxp;
1041 if (parent && next_ctx &&
1042 rcu_dereference(next_ctx->parent_ctx) == parent) {
1043 /*
1044 * Looks like the two contexts are clones, so we might be
1045 * able to optimize the context switch. We lock both
1046 * contexts and check that they are clones under the
1047 * lock (including re-checking that neither has been
1048 * uncloned in the meantime). It doesn't matter which
1049 * order we take the locks because no other cpu could
1050 * be trying to lock both of these tasks.
1051 */
1052 spin_lock(&ctx->lock);
1053 spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
1054 if (context_equiv(ctx, next_ctx)) {
1055 /*
1056 * XXX do we need a memory barrier of sorts
1057	 * wrt the rcu_dereference() of perf_counter_ctxp
1058 */
1059 task->perf_counter_ctxp = next_ctx;
1060 next->perf_counter_ctxp = ctx;
1061 ctx->task = next;
1062 next_ctx->task = task;
1063 do_switch = 0;
1064 }
1065 spin_unlock(&next_ctx->lock);
1066 spin_unlock(&ctx->lock);
1067 }
1068 rcu_read_unlock();
1069
1070 if (do_switch) {
1071 __perf_counter_sched_out(ctx, cpuctx);
1072 cpuctx->task_ctx = NULL;
1073 }
1074}
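/*
 * The swap above relies on context_equiv(): two cloned contexts carry
 * equivalent sets of inherited counters, so instead of scheduling all
 * counters out of 'task' and back in for 'next' we can simply exchange
 * the two contexts between the tasks and leave the hardware untouched.
 */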
1075
1076/*
1077 * Called with IRQs disabled
1078 */
1079static void __perf_counter_task_sched_out(struct perf_counter_context *ctx)
1080{
1081 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1082
1083 if (!cpuctx->task_ctx)
1084 return;
1085
1086 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1087 return;
1088
1089 __perf_counter_sched_out(ctx, cpuctx);
1090 cpuctx->task_ctx = NULL;
1091}
1092
1093/*
1094 * Called with IRQs disabled
1095 */
1096static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
1097{
1098 __perf_counter_sched_out(&cpuctx->ctx, cpuctx);
1099}
1100
1101static void
1102__perf_counter_sched_in(struct perf_counter_context *ctx,
1103 struct perf_cpu_context *cpuctx, int cpu)
1104{
1105 struct perf_counter *counter;
1106 int can_add_hw = 1;
1107
1108 spin_lock(&ctx->lock);
1109 ctx->is_active = 1;
1110 if (likely(!ctx->nr_counters))
1111 goto out;
1112
1113 ctx->timestamp = perf_clock();
1114
1115 perf_disable();
1116
1117 /*
1118 * First go through the list and put on any pinned groups
1119 * in order to give them the best chance of going on.
1120 */
1121 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1122 if (counter->state <= PERF_COUNTER_STATE_OFF ||
1123 !counter->attr.pinned)
1124 continue;
1125 if (counter->cpu != -1 && counter->cpu != cpu)
1126 continue;
1127
1128 if (counter != counter->group_leader)
1129 counter_sched_in(counter, cpuctx, ctx, cpu);
1130 else {
1131 if (group_can_go_on(counter, cpuctx, 1))
1132 group_sched_in(counter, cpuctx, ctx, cpu);
1133 }
1134
1135 /*
1136 * If this pinned group hasn't been scheduled,
1137 * put it in error state.
1138 */
1139 if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
1140 update_group_times(counter);
1141 counter->state = PERF_COUNTER_STATE_ERROR;
1142 }
1143 }
1144
1145 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1146 /*
1147 * Ignore counters in OFF or ERROR state, and
1148 * ignore pinned counters since we did them already.
1149 */
1150 if (counter->state <= PERF_COUNTER_STATE_OFF ||
1151 counter->attr.pinned)
1152 continue;
1153
1154 /*
1155 * Listen to the 'cpu' scheduling filter constraint
1156 * of counters:
1157 */
1158 if (counter->cpu != -1 && counter->cpu != cpu)
1159 continue;
1160
1161 if (counter != counter->group_leader) {
1162 if (counter_sched_in(counter, cpuctx, ctx, cpu))
1163 can_add_hw = 0;
1164 } else {
1165 if (group_can_go_on(counter, cpuctx, can_add_hw)) {
1166 if (group_sched_in(counter, cpuctx, ctx, cpu))
1167 can_add_hw = 0;
1168 }
1169 }
1170 }
1171 perf_enable();
1172 out:
1173 spin_unlock(&ctx->lock);
1174}
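/*
 * Note the two passes above: pinned groups are scheduled first and are
 * put into error state if they cannot get onto the PMU, while the
 * remaining (flexible) groups are added only as long as can_add_hw
 * says there is still room.
 */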
1175
1176/*
1177 * Called from scheduler to add the counters of the current task
1178 * with interrupts disabled.
1179 *
1180 * We restore the counter value and then enable it.
1181 *
1182 * This does not protect us against NMI, but enable()
1183 * sets the enabled bit in the control field of counter _before_
1184	 * accessing the counter control register. If an NMI hits, then it will
1185 * keep the counter running.
1186 */
1187void perf_counter_task_sched_in(struct task_struct *task, int cpu)
1188{
1189 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1190 struct perf_counter_context *ctx = task->perf_counter_ctxp;
1191
1192 if (likely(!ctx))
1193 return;
1194 if (cpuctx->task_ctx == ctx)
1195 return;
1196 __perf_counter_sched_in(ctx, cpuctx, cpu);
1197 cpuctx->task_ctx = ctx;
1198}
1199
1200static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
1201{
1202 struct perf_counter_context *ctx = &cpuctx->ctx;
1203
1204 __perf_counter_sched_in(ctx, cpuctx, cpu);
1205}
1206
1207#define MAX_INTERRUPTS (~0ULL)
1208
1209static void perf_log_throttle(struct perf_counter *counter, int enable);
1210static void perf_log_period(struct perf_counter *counter, u64 period);
1211
1212static void perf_adjust_period(struct perf_counter *counter, u64 events)
1213{
1214 struct hw_perf_counter *hwc = &counter->hw;
1215 u64 period, sample_period;
1216 s64 delta;
1217
1218 events *= hwc->sample_period;
1219 period = div64_u64(events, counter->attr.sample_freq);
1220
1221 delta = (s64)(period - hwc->sample_period);
1222 delta = (delta + 7) / 8; /* low pass filter */
1223
1224 sample_period = hwc->sample_period + delta;
1225
1226 if (!sample_period)
1227 sample_period = 1;
1228
1229 perf_log_period(counter, sample_period);
1230
1231 hwc->sample_period = sample_period;
1232}
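/*
 * Arithmetic of the adjustment above: the caller passes the
 * extrapolated samples-per-second (freq * interrupts); multiplied by
 * the current sample_period this estimates the event rate, and dividing
 * by attr.sample_freq gives the period that would have produced the
 * requested sampling frequency.  Only about 1/8th of the difference is
 * applied per call, which acts as a low-pass filter and keeps a single
 * noisy tick from swinging the period wildly.
 */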
1233
1234static void perf_ctx_adjust_freq(struct perf_counter_context *ctx)
1235{
1236 struct perf_counter *counter;
1237 struct hw_perf_counter *hwc;
1238 u64 interrupts, freq;
1239
1240 spin_lock(&ctx->lock);
1241 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1242 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
1243 continue;
1244
1245 hwc = &counter->hw;
1246
1247 interrupts = hwc->interrupts;
1248 hwc->interrupts = 0;
1249
1250 /*
1251 * unthrottle counters on the tick
1252 */
1253 if (interrupts == MAX_INTERRUPTS) {
1254 perf_log_throttle(counter, 1);
1255 counter->pmu->unthrottle(counter);
1256 interrupts = 2*sysctl_perf_counter_sample_rate/HZ;
1257 }
1258
1259 if (!counter->attr.freq || !counter->attr.sample_freq)
1260 continue;
1261
1262 /*
1263 * if the specified freq < HZ then we need to skip ticks
1264 */
1265 if (counter->attr.sample_freq < HZ) {
1266 freq = counter->attr.sample_freq;
1267
1268 hwc->freq_count += freq;
1269 hwc->freq_interrupts += interrupts;
1270
1271 if (hwc->freq_count < HZ)
1272 continue;
1273
1274 interrupts = hwc->freq_interrupts;
1275 hwc->freq_interrupts = 0;
1276 hwc->freq_count -= HZ;
1277 } else
1278 freq = HZ;
1279
1280 perf_adjust_period(counter, freq * interrupts);
1281
1282 /*
1283 * In order to avoid being stalled by an (accidental) huge
1284 * sample period, force reset the sample period if we didn't
1285 * get any events in this freq period.
1286 */
1287 if (!interrupts) {
1288 perf_disable();
1289 counter->pmu->disable(counter);
1290 atomic64_set(&hwc->period_left, 0);
1291 counter->pmu->enable(counter);
1292 perf_enable();
1293 }
1294 }
1295 spin_unlock(&ctx->lock);
1296}
1297
1298/*
1299 * Round-robin a context's counters:
1300 */
1301static void rotate_ctx(struct perf_counter_context *ctx)
1302{
1303 struct perf_counter *counter;
1304
1305 if (!ctx->nr_counters)
1306 return;
1307
1308 spin_lock(&ctx->lock);
1309 /*
1310 * Rotate the first entry last (works just fine for group counters too):
1311 */
1312 perf_disable();
1313 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1314 list_move_tail(&counter->list_entry, &ctx->counter_list);
1315 break;
1316 }
1317 perf_enable();
1318
1319 spin_unlock(&ctx->lock);
1320}
1321
1322void perf_counter_task_tick(struct task_struct *curr, int cpu)
1323{
1324 struct perf_cpu_context *cpuctx;
1325 struct perf_counter_context *ctx;
1326
1327 if (!atomic_read(&nr_counters))
1328 return;
1329
1330 cpuctx = &per_cpu(perf_cpu_context, cpu);
1331 ctx = curr->perf_counter_ctxp;
1332
1333 perf_ctx_adjust_freq(&cpuctx->ctx);
1334 if (ctx)
1335 perf_ctx_adjust_freq(ctx);
1336
1337 perf_counter_cpu_sched_out(cpuctx);
1338 if (ctx)
1339 __perf_counter_task_sched_out(ctx);
1340
1341 rotate_ctx(&cpuctx->ctx);
1342 if (ctx)
1343 rotate_ctx(ctx);
1344
1345 perf_counter_cpu_sched_in(cpuctx, cpu);
1346 if (ctx)
1347 perf_counter_task_sched_in(curr, cpu);
1348}
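/*
 * The tick handler above thus: re-tunes sampling frequencies, schedules
 * the cpu context and the task context out, rotates each counter list
 * by one entry and schedules everything back in.  The rotation gives
 * round-robin fairness when more counters exist than the PMU can host
 * at once.
 */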
1349
1350/*
1351 * Cross CPU call to read the hardware counter
1352 */
1353static void __read(void *info)
1354{
1355 struct perf_counter *counter = info;
1356 struct perf_counter_context *ctx = counter->ctx;
1357 unsigned long flags;
1358
1359 local_irq_save(flags);
1360 if (ctx->is_active)
1361 update_context_time(ctx);
1362 counter->pmu->read(counter);
1363 update_counter_times(counter);
1364 local_irq_restore(flags);
1365}
1366
1367static u64 perf_counter_read(struct perf_counter *counter)
1368{
1369 /*
1370 * If counter is enabled and currently active on a CPU, update the
1371 * value in the counter structure:
1372 */
1373 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
1374 smp_call_function_single(counter->oncpu,
1375 __read, counter, 1);
1376 } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
1377 update_counter_times(counter);
1378 }
1379
1380 return atomic64_read(&counter->count);
1381}
1382
1383/*
1384 * Initialize the perf_counter context in a task_struct:
1385 */
1386static void
1387__perf_counter_init_context(struct perf_counter_context *ctx,
1388 struct task_struct *task)
1389{
1390 memset(ctx, 0, sizeof(*ctx));
1391 spin_lock_init(&ctx->lock);
1392 mutex_init(&ctx->mutex);
1393 INIT_LIST_HEAD(&ctx->counter_list);
1394 INIT_LIST_HEAD(&ctx->event_list);
1395 atomic_set(&ctx->refcount, 1);
1396 ctx->task = task;
1397}
1398
1399static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1400{
1401 struct perf_counter_context *parent_ctx;
1402 struct perf_counter_context *ctx;
1403 struct perf_cpu_context *cpuctx;
1404 struct task_struct *task;
1405 unsigned long flags;
1406 int err;
1407
1408 /*
1409 * If cpu is not a wildcard then this is a percpu counter:
1410 */
1411 if (cpu != -1) {
1412 /* Must be root to operate on a CPU counter: */
1413 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
1414 return ERR_PTR(-EACCES);
1415
1416 if (cpu < 0 || cpu > num_possible_cpus())
1417 return ERR_PTR(-EINVAL);
1418
1419 /*
1420	 * We could be clever and allow attaching a counter to an
1421 * offline CPU and activate it when the CPU comes up, but
1422 * that's for later.
1423 */
1424 if (!cpu_isset(cpu, cpu_online_map))
1425 return ERR_PTR(-ENODEV);
1426
1427 cpuctx = &per_cpu(perf_cpu_context, cpu);
1428 ctx = &cpuctx->ctx;
1429 get_ctx(ctx);
1430
1431 return ctx;
1432 }
1433
1434 rcu_read_lock();
1435 if (!pid)
1436 task = current;
1437 else
1438 task = find_task_by_vpid(pid);
1439 if (task)
1440 get_task_struct(task);
1441 rcu_read_unlock();
1442
1443 if (!task)
1444 return ERR_PTR(-ESRCH);
1445
1446 /*
1447 * Can't attach counters to a dying task.
1448 */
1449 err = -ESRCH;
1450 if (task->flags & PF_EXITING)
1451 goto errout;
1452
1453 /* Reuse ptrace permission checks for now. */
1454 err = -EACCES;
1455 if (!ptrace_may_access(task, PTRACE_MODE_READ))
1456 goto errout;
1457
1458 retry:
1459 ctx = perf_lock_task_context(task, &flags);
1460 if (ctx) {
1461 parent_ctx = ctx->parent_ctx;
1462 if (parent_ctx) {
1463 put_ctx(parent_ctx);
1464 ctx->parent_ctx = NULL; /* no longer a clone */
1465 }
1466 spin_unlock_irqrestore(&ctx->lock, flags);
1467 }
1468
1469 if (!ctx) {
1470 ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
1471 err = -ENOMEM;
1472 if (!ctx)
1473 goto errout;
1474 __perf_counter_init_context(ctx, task);
1475 get_ctx(ctx);
1476 if (cmpxchg(&task->perf_counter_ctxp, NULL, ctx)) {
1477 /*
1478 * We raced with some other task; use
1479 * the context they set.
1480 */
1481 kfree(ctx);
1482 goto retry;
1483 }
1484 get_task_struct(task);
1485 }
1486
1487 put_task_struct(task);
1488 return ctx;
1489
1490 errout:
1491 put_task_struct(task);
1492 return ERR_PTR(err);
1493}
1494
1495static void free_counter_rcu(struct rcu_head *head)
1496{
1497 struct perf_counter *counter;
1498
1499 counter = container_of(head, struct perf_counter, rcu_head);
1500 if (counter->ns)
1501 put_pid_ns(counter->ns);
1502 kfree(counter);
1503}
1504
1505static void perf_pending_sync(struct perf_counter *counter);
1506
1507static void free_counter(struct perf_counter *counter)
1508{
1509 perf_pending_sync(counter);
1510
1511 atomic_dec(&nr_counters);
1512 if (counter->attr.mmap)
1513 atomic_dec(&nr_mmap_counters);
1514 if (counter->attr.comm)
1515 atomic_dec(&nr_comm_counters);
1516
1517 if (counter->destroy)
1518 counter->destroy(counter);
1519
1520 put_ctx(counter->ctx);
1521 call_rcu(&counter->rcu_head, free_counter_rcu);
1522}
1523
1524/*
1525 * Called when the last reference to the file is gone.
1526 */
1527static int perf_release(struct inode *inode, struct file *file)
1528{
1529 struct perf_counter *counter = file->private_data;
1530 struct perf_counter_context *ctx = counter->ctx;
1531
1532 file->private_data = NULL;
1533
1534 WARN_ON_ONCE(ctx->parent_ctx);
1535 mutex_lock(&ctx->mutex);
1536 perf_counter_remove_from_context(counter);
1537 mutex_unlock(&ctx->mutex);
1538
1539 mutex_lock(&counter->owner->perf_counter_mutex);
1540 list_del_init(&counter->owner_entry);
1541 mutex_unlock(&counter->owner->perf_counter_mutex);
1542 put_task_struct(counter->owner);
1543
1544 free_counter(counter);
1545
1546 return 0;
1547}
1548
1549/*
1550 * Read the performance counter - simple non blocking version for now
1551 */
1552static ssize_t
1553perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1554{
1555 u64 values[4];
1556 int n;
1557
1558 /*
1559 * Return end-of-file for a read on a counter that is in
1560 * error state (i.e. because it was pinned but it couldn't be
1561 * scheduled on to the CPU at some point).
1562 */
1563 if (counter->state == PERF_COUNTER_STATE_ERROR)
1564 return 0;
1565
1566 WARN_ON_ONCE(counter->ctx->parent_ctx);
1567 mutex_lock(&counter->child_mutex);
1568 values[0] = perf_counter_read(counter);
1569 n = 1;
1570 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1571 values[n++] = counter->total_time_enabled +
1572 atomic64_read(&counter->child_total_time_enabled);
1573 if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1574 values[n++] = counter->total_time_running +
1575 atomic64_read(&counter->child_total_time_running);
1576 if (counter->attr.read_format & PERF_FORMAT_ID)
1577 values[n++] = counter->id;
1578 mutex_unlock(&counter->child_mutex);
1579
1580 if (count < n * sizeof(u64))
1581 return -EINVAL;
1582 count = n * sizeof(u64);
1583
1584 if (copy_to_user(buf, values, count))
1585 return -EFAULT;
1586
1587 return count;
1588}
1589
1590static ssize_t
1591perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
1592{
1593 struct perf_counter *counter = file->private_data;
1594
1595 return perf_read_hw(counter, buf, count);
1596}
1597
1598static unsigned int perf_poll(struct file *file, poll_table *wait)
1599{
1600 struct perf_counter *counter = file->private_data;
1601 struct perf_mmap_data *data;
1602 unsigned int events = POLL_HUP;
1603
1604 rcu_read_lock();
1605 data = rcu_dereference(counter->data);
1606 if (data)
1607 events = atomic_xchg(&data->poll, 0);
1608 rcu_read_unlock();
1609
1610 poll_wait(file, &counter->waitq, wait);
1611
1612 return events;
1613}
1614
1615static void perf_counter_reset(struct perf_counter *counter)
1616{
1617 (void)perf_counter_read(counter);
1618 atomic64_set(&counter->count, 0);
1619 perf_counter_update_userpage(counter);
1620}
1621
1622/*
1623 * Holding the top-level counter's child_mutex means that any
1624 * descendant process that has inherited this counter will block
1625 * in sync_child_counter if it goes to exit, thus satisfying the
1626 * task existence requirements of perf_counter_enable/disable.
1627 */
1628static void perf_counter_for_each_child(struct perf_counter *counter,
1629 void (*func)(struct perf_counter *))
1630{
1631 struct perf_counter *child;
1632
1633 WARN_ON_ONCE(counter->ctx->parent_ctx);
1634 mutex_lock(&counter->child_mutex);
1635 func(counter);
1636 list_for_each_entry(child, &counter->child_list, child_list)
1637 func(child);
1638 mutex_unlock(&counter->child_mutex);
1639}
1640
1641static void perf_counter_for_each(struct perf_counter *counter,
1642 void (*func)(struct perf_counter *))
1643{
1644 struct perf_counter_context *ctx = counter->ctx;
1645 struct perf_counter *sibling;
1646
1647 WARN_ON_ONCE(ctx->parent_ctx);
1648 mutex_lock(&ctx->mutex);
1649 counter = counter->group_leader;
1650
1651 perf_counter_for_each_child(counter, func);
1652 func(counter);
1653 list_for_each_entry(sibling, &counter->sibling_list, list_entry)
1654 perf_counter_for_each_child(counter, func);
1655 mutex_unlock(&ctx->mutex);
1656}
1657
1658static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
1659{
1660 struct perf_counter_context *ctx = counter->ctx;
1661 unsigned long size;
1662 int ret = 0;
1663 u64 value;
1664
1665 if (!counter->attr.sample_period)
1666 return -EINVAL;
1667
1668 size = copy_from_user(&value, arg, sizeof(value));
1669 if (size != sizeof(value))
1670 return -EFAULT;
1671
1672 if (!value)
1673 return -EINVAL;
1674
1675 spin_lock_irq(&ctx->lock);
1676 if (counter->attr.freq) {
1677 if (value > sysctl_perf_counter_sample_rate) {
1678 ret = -EINVAL;
1679 goto unlock;
1680 }
1681
1682 counter->attr.sample_freq = value;
1683 } else {
1684 perf_log_period(counter, value);
1685
1686 counter->attr.sample_period = value;
1687 counter->hw.sample_period = value;
1688 }
1689unlock:
1690 spin_unlock_irq(&ctx->lock);
1691
1692 return ret;
1693}
1694
1695static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1696{
1697 struct perf_counter *counter = file->private_data;
1698 void (*func)(struct perf_counter *);
1699 u32 flags = arg;
1700
1701 switch (cmd) {
1702 case PERF_COUNTER_IOC_ENABLE:
1703 func = perf_counter_enable;
1704 break;
1705 case PERF_COUNTER_IOC_DISABLE:
1706 func = perf_counter_disable;
1707 break;
1708 case PERF_COUNTER_IOC_RESET:
1709 func = perf_counter_reset;
1710 break;
1711
1712 case PERF_COUNTER_IOC_REFRESH:
1713 return perf_counter_refresh(counter, arg);
1714
1715 case PERF_COUNTER_IOC_PERIOD:
1716 return perf_counter_period(counter, (u64 __user *)arg);
1717
1718 default:
1719 return -ENOTTY;
1720 }
1721
1722 if (flags & PERF_IOC_FLAG_GROUP)
1723 perf_counter_for_each(counter, func);
1724 else
1725 perf_counter_for_each_child(counter, func);
1726
1727 return 0;
1728}
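/*
 * Illustrative userspace usage (sketch only, not part of this file):
 * given a perf counter file descriptor fd,
 *
 *	ioctl(fd, PERF_COUNTER_IOC_DISABLE, 0);
 *	ioctl(fd, PERF_COUNTER_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
 *
 * stops just that counter (and its inherited children) and then
 * re-enables its whole group: passing PERF_IOC_FLAG_GROUP in 'arg'
 * selects perf_counter_for_each() over perf_counter_for_each_child()
 * above.
 */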
1729
1730int perf_counter_task_enable(void)
1731{
1732 struct perf_counter *counter;
1733
1734 mutex_lock(&current->perf_counter_mutex);
1735 list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
1736 perf_counter_for_each_child(counter, perf_counter_enable);
1737 mutex_unlock(&current->perf_counter_mutex);
1738
1739 return 0;
1740}
1741
1742int perf_counter_task_disable(void)
1743{
1744 struct perf_counter *counter;
1745
1746 mutex_lock(&current->perf_counter_mutex);
1747 list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
1748 perf_counter_for_each_child(counter, perf_counter_disable);
1749 mutex_unlock(&current->perf_counter_mutex);
1750
1751 return 0;
1752}
1753
1754/*
1755 * Callers need to ensure there can be no nesting of this function, otherwise
1756	 * the seqlock logic goes bad. We cannot serialize this because the arch
1757 * code calls this from NMI context.
1758 */
1759void perf_counter_update_userpage(struct perf_counter *counter)
1760{
1761 struct perf_counter_mmap_page *userpg;
1762 struct perf_mmap_data *data;
1763
1764 rcu_read_lock();
1765 data = rcu_dereference(counter->data);
1766 if (!data)
1767 goto unlock;
1768
1769 userpg = data->user_page;
1770
1771 /*
1772 * Disable preemption so as to not let the corresponding user-space
1773 * spin too long if we get preempted.
1774 */
1775 preempt_disable();
1776 ++userpg->lock;
1777 barrier();
1778 userpg->index = counter->hw.idx;
1779 userpg->offset = atomic64_read(&counter->count);
1780 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
1781 userpg->offset -= atomic64_read(&counter->hw.prev_count);
1782
1783 barrier();
1784 ++userpg->lock;
1785 preempt_enable();
1786unlock:
1787 rcu_read_unlock();
1788}
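/*
 * The ->lock increments above act as a seqcount: the value is odd while
 * an update is in progress and has changed once it completes.  A
 * user-space reader of the mmap()ed control page would do roughly
 * (sketch only; 'barrier' stands for a compiler barrier):
 *
 *	do {
 *		seq = pg->lock;
 *		barrier();
 *		index  = pg->index;
 *		offset = pg->offset;
 *		barrier();
 *	} while (pg->lock != seq || (seq & 1));
 */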
1789
1790static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1791{
1792 struct perf_counter *counter = vma->vm_file->private_data;
1793 struct perf_mmap_data *data;
1794 int ret = VM_FAULT_SIGBUS;
1795
1796 if (vmf->flags & FAULT_FLAG_MKWRITE) {
1797 if (vmf->pgoff == 0)
1798 ret = 0;
1799 return ret;
1800 }
1801
1802 rcu_read_lock();
1803 data = rcu_dereference(counter->data);
1804 if (!data)
1805 goto unlock;
1806
1807 if (vmf->pgoff == 0) {
1808 vmf->page = virt_to_page(data->user_page);
1809 } else {
1810 int nr = vmf->pgoff - 1;
1811
1812 if ((unsigned)nr > data->nr_pages)
1813 goto unlock;
1814
1815 if (vmf->flags & FAULT_FLAG_WRITE)
1816 goto unlock;
1817
1818 vmf->page = virt_to_page(data->data_pages[nr]);
1819 }
1820
1821 get_page(vmf->page);
1822 vmf->page->mapping = vma->vm_file->f_mapping;
1823 vmf->page->index = vmf->pgoff;
1824
1825 ret = 0;
1826unlock:
1827 rcu_read_unlock();
1828
1829 return ret;
1830}
1831
1832static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
1833{
1834 struct perf_mmap_data *data;
1835 unsigned long size;
1836 int i;
1837
1838 WARN_ON(atomic_read(&counter->mmap_count));
1839
1840 size = sizeof(struct perf_mmap_data);
1841 size += nr_pages * sizeof(void *);
1842
1843 data = kzalloc(size, GFP_KERNEL);
1844 if (!data)
1845 goto fail;
1846
1847 data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
1848 if (!data->user_page)
1849 goto fail_user_page;
1850
1851 for (i = 0; i < nr_pages; i++) {
1852 data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
1853 if (!data->data_pages[i])
1854 goto fail_data_pages;
1855 }
1856
1857 data->nr_pages = nr_pages;
1858 atomic_set(&data->lock, -1);
1859
1860 rcu_assign_pointer(counter->data, data);
1861
1862 return 0;
1863
1864fail_data_pages:
1865 for (i--; i >= 0; i--)
1866 free_page((unsigned long)data->data_pages[i]);
1867
1868 free_page((unsigned long)data->user_page);
1869
1870fail_user_page:
1871 kfree(data);
1872
1873fail:
1874 return -ENOMEM;
1875}
1876
1877static void perf_mmap_free_page(unsigned long addr)
1878{
1879 struct page *page = virt_to_page(addr);
1880
1881 page->mapping = NULL;
1882 __free_page(page);
1883}
1884
1885static void __perf_mmap_data_free(struct rcu_head *rcu_head)
1886{
1887 struct perf_mmap_data *data;
1888 int i;
1889
1890 data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
1891
1892 perf_mmap_free_page((unsigned long)data->user_page);
1893 for (i = 0; i < data->nr_pages; i++)
1894 perf_mmap_free_page((unsigned long)data->data_pages[i]);
1895
1896 kfree(data);
1897}
1898
1899static void perf_mmap_data_free(struct perf_counter *counter)
1900{
1901 struct perf_mmap_data *data = counter->data;
1902
1903 WARN_ON(atomic_read(&counter->mmap_count));
1904
1905 rcu_assign_pointer(counter->data, NULL);
1906 call_rcu(&data->rcu_head, __perf_mmap_data_free);
1907}
1908
1909static void perf_mmap_open(struct vm_area_struct *vma)
1910{
1911 struct perf_counter *counter = vma->vm_file->private_data;
1912
1913 atomic_inc(&counter->mmap_count);
1914}
1915
1916static void perf_mmap_close(struct vm_area_struct *vma)
1917{
1918 struct perf_counter *counter = vma->vm_file->private_data;
1919
1920 WARN_ON_ONCE(counter->ctx->parent_ctx);
1921 if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) {
1922 struct user_struct *user = current_user();
1923
1924 atomic_long_sub(counter->data->nr_pages + 1, &user->locked_vm);
1925 vma->vm_mm->locked_vm -= counter->data->nr_locked;
1926 perf_mmap_data_free(counter);
1927 mutex_unlock(&counter->mmap_mutex);
1928 }
1929}
1930
1931static struct vm_operations_struct perf_mmap_vmops = {
1932 .open = perf_mmap_open,
1933 .close = perf_mmap_close,
1934 .fault = perf_mmap_fault,
1935 .page_mkwrite = perf_mmap_fault,
1936};
1937
1938static int perf_mmap(struct file *file, struct vm_area_struct *vma)
1939{
1940 struct perf_counter *counter = file->private_data;
1941 unsigned long user_locked, user_lock_limit;
1942 struct user_struct *user = current_user();
1943 unsigned long locked, lock_limit;
1944 unsigned long vma_size;
1945 unsigned long nr_pages;
1946 long user_extra, extra;
1947 int ret = 0;
1948
1949 if (!(vma->vm_flags & VM_SHARED))
1950 return -EINVAL;
1951
1952 vma_size = vma->vm_end - vma->vm_start;
1953 nr_pages = (vma_size / PAGE_SIZE) - 1;
1954
1955 /*
1956 * If we have data pages ensure they're a power-of-two number, so we
1957 * can do bitmasks instead of modulo.
1958 */
1959 if (nr_pages != 0 && !is_power_of_2(nr_pages))
1960 return -EINVAL;
1961
1962 if (vma_size != PAGE_SIZE * (1 + nr_pages))
1963 return -EINVAL;
1964
1965 if (vma->vm_pgoff != 0)
1966 return -EINVAL;
1967
1968 WARN_ON_ONCE(counter->ctx->parent_ctx);
1969 mutex_lock(&counter->mmap_mutex);
1970 if (atomic_inc_not_zero(&counter->mmap_count)) {
1971 if (nr_pages != counter->data->nr_pages)
1972 ret = -EINVAL;
1973 goto unlock;
1974 }
1975
1976 user_extra = nr_pages + 1;
1977 user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10);
1978
1979 /*
1980 * Increase the limit linearly with more CPUs:
1981 */
1982 user_lock_limit *= num_online_cpus();
1983
1984 user_locked = atomic_long_read(&user->locked_vm) + user_extra;
1985
1986 extra = 0;
1987 if (user_locked > user_lock_limit)
1988 extra = user_locked - user_lock_limit;
1989
1990 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
1991 lock_limit >>= PAGE_SHIFT;
1992 locked = vma->vm_mm->locked_vm + extra;
1993
1994 if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
1995 ret = -EPERM;
1996 goto unlock;
1997 }
1998
1999 WARN_ON(counter->data);
2000 ret = perf_mmap_data_alloc(counter, nr_pages);
2001 if (ret)
2002 goto unlock;
2003
2004 atomic_set(&counter->mmap_count, 1);
2005 atomic_long_add(user_extra, &user->locked_vm);
2006 vma->vm_mm->locked_vm += extra;
2007 counter->data->nr_locked = extra;
2008 if (vma->vm_flags & VM_WRITE)
2009 counter->data->writable = 1;
2010
2011unlock:
2012 mutex_unlock(&counter->mmap_mutex);
2013
2014 vma->vm_flags |= VM_RESERVED;
2015 vma->vm_ops = &perf_mmap_vmops;
2016
2017 return ret;
2018}
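/*
 * Layout of the mapping established above: page 0 is the
 * perf_counter_mmap_page control page and pages 1..nr_pages are the
 * data ring buffer, with nr_pages a power of two so the output code can
 * mask instead of using modulo.  The pages are charged against the
 * per-user sysctl_perf_counter_mlock allowance (scaled by the number of
 * online CPUs) first, and anything beyond that against the task's
 * RLIMIT_MEMLOCK.
 */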
2019
2020static int perf_fasync(int fd, struct file *filp, int on)
2021{
2022 struct inode *inode = filp->f_path.dentry->d_inode;
2023 struct perf_counter *counter = filp->private_data;
2024 int retval;
2025
2026 mutex_lock(&inode->i_mutex);
2027 retval = fasync_helper(fd, filp, on, &counter->fasync);
2028 mutex_unlock(&inode->i_mutex);
2029
2030 if (retval < 0)
2031 return retval;
2032
2033 return 0;
2034}
2035
2036static const struct file_operations perf_fops = {
2037 .release = perf_release,
2038 .read = perf_read,
2039 .poll = perf_poll,
2040 .unlocked_ioctl = perf_ioctl,
2041 .compat_ioctl = perf_ioctl,
2042 .mmap = perf_mmap,
2043 .fasync = perf_fasync,
2044};
2045
2046/*
2047 * Perf counter wakeup
2048 *
2049 * If there's data, ensure we set the poll() state and publish everything
2050 * to user-space before waking everybody up.
2051 */
2052
2053void perf_counter_wakeup(struct perf_counter *counter)
2054{
2055 wake_up_all(&counter->waitq);
2056
2057 if (counter->pending_kill) {
2058 kill_fasync(&counter->fasync, SIGIO, counter->pending_kill);
2059 counter->pending_kill = 0;
2060 }
2061}
2062
2063/*
2064 * Pending wakeups
2065 *
2066	 * Handle the case where we need to wake up from NMI (or rq->lock) context.
2067 *
2068 * The NMI bit means we cannot possibly take locks. Therefore, maintain a
2069 * single linked list and use cmpxchg() to add entries lockless.
2070 */
2071
2072static void perf_pending_counter(struct perf_pending_entry *entry)
2073{
2074 struct perf_counter *counter = container_of(entry,
2075 struct perf_counter, pending);
2076
2077 if (counter->pending_disable) {
2078 counter->pending_disable = 0;
2079 perf_counter_disable(counter);
2080 }
2081
2082 if (counter->pending_wakeup) {
2083 counter->pending_wakeup = 0;
2084 perf_counter_wakeup(counter);
2085 }
2086}
2087
2088#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
2089
2090static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
2091 PENDING_TAIL,
2092};
2093
2094static void perf_pending_queue(struct perf_pending_entry *entry,
2095 void (*func)(struct perf_pending_entry *))
2096{
2097 struct perf_pending_entry **head;
2098
2099 if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
2100 return;
2101
2102 entry->func = func;
2103
2104 head = &get_cpu_var(perf_pending_head);
2105
2106 do {
2107 entry->next = *head;
2108 } while (cmpxchg(head, entry->next, entry) != entry->next);
2109
2110 set_perf_counter_pending();
2111
2112 put_cpu_var(perf_pending_head);
2113}
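/*
 * The queue above is a per-cpu, singly linked, lock-free stack:
 * PENDING_TAIL terminates the list, the cmpxchg() on entry->next makes
 * sure an entry is only queued once, and the cmpxchg() push lets NMI
 * context add entries while normal context is doing the same.  Entries
 * are consumed by __perf_pending_run() below via xchg().
 */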
2114
2115static int __perf_pending_run(void)
2116{
2117 struct perf_pending_entry *list;
2118 int nr = 0;
2119
2120 list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
2121 while (list != PENDING_TAIL) {
2122 void (*func)(struct perf_pending_entry *);
2123 struct perf_pending_entry *entry = list;
2124
2125 list = list->next;
2126
2127 func = entry->func;
2128 entry->next = NULL;
2129 /*
2130 * Ensure we observe the unqueue before we issue the wakeup,
2131 * so that we won't be waiting forever.
2132 * -- see perf_not_pending().
2133 */
2134 smp_wmb();
2135
2136 func(entry);
2137 nr++;
2138 }
2139
2140 return nr;
2141}
2142
2143static inline int perf_not_pending(struct perf_counter *counter)
2144{
2145 /*
2146 * If we flush on whatever cpu we run, there is a chance we don't
2147 * need to wait.
2148 */
2149 get_cpu();
2150 __perf_pending_run();
2151 put_cpu();
2152
2153 /*
2154 * Ensure we see the proper queue state before going to sleep
2155	 * so that we do not miss the wakeup. -- see __perf_pending_run()
2156 */
2157 smp_rmb();
2158 return counter->pending.next == NULL;
2159}
2160
2161static void perf_pending_sync(struct perf_counter *counter)
2162{
2163 wait_event(counter->waitq, perf_not_pending(counter));
2164}
2165
2166void perf_counter_do_pending(void)
2167{
2168 __perf_pending_run();
2169}
2170
2171/*
2172 * Callchain support -- arch specific
2173 */
2174
2175__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2176{
2177 return NULL;
2178}
2179
2180/*
2181 * Output
2182 */
2183
2184struct perf_output_handle {
2185 struct perf_counter *counter;
2186 struct perf_mmap_data *data;
2187 unsigned long head;
2188 unsigned long offset;
2189 int nmi;
2190 int sample;
2191 int locked;
2192 unsigned long flags;
2193};
2194
2195static bool perf_output_space(struct perf_mmap_data *data,
2196 unsigned int offset, unsigned int head)
2197{
2198 unsigned long tail;
2199 unsigned long mask;
2200
2201 if (!data->writable)
2202 return true;
2203
2204 mask = (data->nr_pages << PAGE_SHIFT) - 1;
2205 /*
2206	 * Userspace could choose to issue an mb() before updating the tail
2207	 * pointer, so that all reads are completed before the write is
2208	 * issued.
2209 */
2210 tail = ACCESS_ONCE(data->user_page->data_tail);
2211 smp_rmb();
2212
2213 offset = (offset - tail) & mask;
2214 head = (head - tail) & mask;
2215
2216 if ((int)(head - offset) < 0)
2217 return false;
2218
2219 return true;
2220}
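/*
 * The space check above works in offsets relative to the reader: both
 * the start and the end of the requested region are re-based against
 * the user-visible data_tail and masked to the buffer size, and the
 * write is allowed only if the region does not wrap past data that
 * user-space has not consumed yet.  Counters without a writable mapping
 * are always allowed to overwrite, hence the early 'return true'.
 */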
2221
2222static void perf_output_wakeup(struct perf_output_handle *handle)
2223{
2224 atomic_set(&handle->data->poll, POLL_IN);
2225
2226 if (handle->nmi) {
2227 handle->counter->pending_wakeup = 1;
2228 perf_pending_queue(&handle->counter->pending,
2229 perf_pending_counter);
2230 } else
2231 perf_counter_wakeup(handle->counter);
2232}
2233
2234/*
2235 * Curious locking construct.
2236 *
2237 * We need to ensure a later event doesn't publish a head when a former
2238 * event isn't done writing. However since we need to deal with NMIs we
2239 * cannot fully serialize things.
2240 *
2241 * What we do is serialize between CPUs so we only have to deal with NMI
2242 * nesting on a single CPU.
2243 *
2244 * We only publish the head (and generate a wakeup) when the outer-most
2245 * event completes.
2246 */
2247static void perf_output_lock(struct perf_output_handle *handle)
2248{
2249 struct perf_mmap_data *data = handle->data;
2250 int cpu;
2251
2252 handle->locked = 0;
2253
2254 local_irq_save(handle->flags);
2255 cpu = smp_processor_id();
2256
2257 if (in_nmi() && atomic_read(&data->lock) == cpu)
2258 return;
2259
2260 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2261 cpu_relax();
2262
2263 handle->locked = 1;
2264}
2265
2266static void perf_output_unlock(struct perf_output_handle *handle)
2267{
2268 struct perf_mmap_data *data = handle->data;
2269 unsigned long head;
2270 int cpu;
2271
2272 data->done_head = data->head;
2273
2274 if (!handle->locked)
2275 goto out;
2276
2277again:
2278 /*
2279 * The xchg implies a full barrier that ensures all writes are done
2280 * before we publish the new head, matched by a rmb() in userspace when
2281 * reading this position.
2282 */
2283 while ((head = atomic_long_xchg(&data->done_head, 0)))
2284 data->user_page->data_head = head;
2285
2286 /*
2287 * NMI can happen here, which means we can miss a done_head update.
2288 */
2289
2290 cpu = atomic_xchg(&data->lock, -1);
2291 WARN_ON_ONCE(cpu != smp_processor_id());
2292
2293 /*
2294	 * Therefore we have to check that we did not indeed miss one.
2295 */
2296 if (unlikely(atomic_long_read(&data->done_head))) {
2297 /*
2298 * Since we had it locked, we can lock it again.
2299 */
2300 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2301 cpu_relax();
2302
2303 goto again;
2304 }
2305
2306 if (atomic_xchg(&data->wakeup, 0))
2307 perf_output_wakeup(handle);
2308out:
2309 local_irq_restore(handle->flags);
2310}
2311
2312static void perf_output_copy(struct perf_output_handle *handle,
2313 const void *buf, unsigned int len)
2314{
2315 unsigned int pages_mask;
2316 unsigned int offset;
2317 unsigned int size;
2318 void **pages;
2319
2320 offset = handle->offset;
2321 pages_mask = handle->data->nr_pages - 1;
2322 pages = handle->data->data_pages;
2323
2324 do {
2325 unsigned int page_offset;
2326 int nr;
2327
2328 nr = (offset >> PAGE_SHIFT) & pages_mask;
2329 page_offset = offset & (PAGE_SIZE - 1);
2330 size = min_t(unsigned int, PAGE_SIZE - page_offset, len);
2331
2332 memcpy(pages[nr] + page_offset, buf, size);
2333
2334 len -= size;
2335 buf += size;
2336 offset += size;
2337 } while (len);
2338
2339 handle->offset = offset;
2340
2341 /*
2342 * Check we didn't copy past our reservation window, taking the
2343 * possible unsigned int wrap into account.
2344 */
2345 WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2346}
2347
2348#define perf_output_put(handle, x) \
2349 perf_output_copy((handle), &(x), sizeof(x))
2350
2351static int perf_output_begin(struct perf_output_handle *handle,
2352 struct perf_counter *counter, unsigned int size,
2353 int nmi, int sample)
2354{
2355 struct perf_mmap_data *data;
2356 unsigned int offset, head;
2357 int have_lost;
2358 struct {
2359 struct perf_event_header header;
2360 u64 id;
2361 u64 lost;
2362 } lost_event;
2363
2364 /*
2365 * For inherited counters we send all the output towards the parent.
2366 */
2367 if (counter->parent)
2368 counter = counter->parent;
2369
2370 rcu_read_lock();
2371 data = rcu_dereference(counter->data);
2372 if (!data)
2373 goto out;
2374
2375 handle->data = data;
2376 handle->counter = counter;
2377 handle->nmi = nmi;
2378 handle->sample = sample;
2379
2380 if (!data->nr_pages)
2381 goto fail;
2382
2383 have_lost = atomic_read(&data->lost);
2384 if (have_lost)
2385 size += sizeof(lost_event);
2386
2387 perf_output_lock(handle);
2388
2389 do {
2390 offset = head = atomic_long_read(&data->head);
2391 head += size;
2392 if (unlikely(!perf_output_space(data, offset, head)))
2393 goto fail;
2394 } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
2395
2396 handle->offset = offset;
2397 handle->head = head;
2398
2399 if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
2400 atomic_set(&data->wakeup, 1);
2401
2402 if (have_lost) {
2403 lost_event.header.type = PERF_EVENT_LOST;
2404 lost_event.header.misc = 0;
2405 lost_event.header.size = sizeof(lost_event);
2406 lost_event.id = counter->id;
2407 lost_event.lost = atomic_xchg(&data->lost, 0);
2408
2409 perf_output_put(handle, lost_event);
2410 }
2411
2412 return 0;
2413
2414fail:
2415 atomic_inc(&data->lost);
2416 perf_output_unlock(handle);
2417out:
2418 rcu_read_unlock();
2419
2420 return -ENOSPC;
2421}
2422
2423static void perf_output_end(struct perf_output_handle *handle)
2424{
2425 struct perf_counter *counter = handle->counter;
2426 struct perf_mmap_data *data = handle->data;
2427
2428 int wakeup_events = counter->attr.wakeup_events;
2429
2430 if (handle->sample && wakeup_events) {
2431 int events = atomic_inc_return(&data->events);
2432 if (events >= wakeup_events) {
2433 atomic_sub(wakeup_events, &data->events);
2434 atomic_set(&data->wakeup, 1);
2435 }
2436 }
2437
2438 perf_output_unlock(handle);
2439 rcu_read_unlock();
2440}
2441
2442static u32 perf_counter_pid(struct perf_counter *counter, struct task_struct *p)
2443{
2444 /*
2445 * only top level counters have the pid namespace they were created in
2446 */
2447 if (counter->parent)
2448 counter = counter->parent;
2449
2450 return task_tgid_nr_ns(p, counter->ns);
2451}
2452
2453static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p)
2454{
2455 /*
2456 * only top level counters have the pid namespace they were created in
2457 */
2458 if (counter->parent)
2459 counter = counter->parent;
2460
2461 return task_pid_nr_ns(p, counter->ns);
2462}
2463
2464static void perf_counter_output(struct perf_counter *counter, int nmi,
2465 struct perf_sample_data *data)
2466{
2467 int ret;
2468 u64 sample_type = counter->attr.sample_type;
2469 struct perf_output_handle handle;
2470 struct perf_event_header header;
2471 u64 ip;
2472 struct {
2473 u32 pid, tid;
2474 } tid_entry;
2475 struct {
2476 u64 id;
2477 u64 counter;
2478 } group_entry;
2479 struct perf_callchain_entry *callchain = NULL;
2480 int callchain_size = 0;
2481 u64 time;
2482 struct {
2483 u32 cpu, reserved;
2484 } cpu_entry;
2485
2486 header.type = 0;
2487 header.size = sizeof(header);
2488
2489 header.misc = PERF_EVENT_MISC_OVERFLOW;
2490 header.misc |= perf_misc_flags(data->regs);
2491
2492 if (sample_type & PERF_SAMPLE_IP) {
2493 ip = perf_instruction_pointer(data->regs);
2494 header.type |= PERF_SAMPLE_IP;
2495 header.size += sizeof(ip);
2496 }
2497
2498 if (sample_type & PERF_SAMPLE_TID) {
2499 /* namespace issues */
2500 tid_entry.pid = perf_counter_pid(counter, current);
2501 tid_entry.tid = perf_counter_tid(counter, current);
2502
2503 header.type |= PERF_SAMPLE_TID;
2504 header.size += sizeof(tid_entry);
2505 }
2506
2507 if (sample_type & PERF_SAMPLE_TIME) {
2508 /*
2509 * Maybe do better on x86 and provide cpu_clock_nmi()
2510 */
2511 time = sched_clock();
2512
2513 header.type |= PERF_SAMPLE_TIME;
2514 header.size += sizeof(u64);
2515 }
2516
2517 if (sample_type & PERF_SAMPLE_ADDR) {
2518 header.type |= PERF_SAMPLE_ADDR;
2519 header.size += sizeof(u64);
2520 }
2521
2522 if (sample_type & PERF_SAMPLE_ID) {
2523 header.type |= PERF_SAMPLE_ID;
2524 header.size += sizeof(u64);
2525 }
2526
2527 if (sample_type & PERF_SAMPLE_CPU) {
2528 header.type |= PERF_SAMPLE_CPU;
2529 header.size += sizeof(cpu_entry);
2530
2531 cpu_entry.cpu = raw_smp_processor_id();
2532 }
2533
2534 if (sample_type & PERF_SAMPLE_PERIOD) {
2535 header.type |= PERF_SAMPLE_PERIOD;
2536 header.size += sizeof(u64);
2537 }
2538
2539 if (sample_type & PERF_SAMPLE_GROUP) {
2540 header.type |= PERF_SAMPLE_GROUP;
2541 header.size += sizeof(u64) +
2542 counter->nr_siblings * sizeof(group_entry);
2543 }
2544
2545 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2546 callchain = perf_callchain(data->regs);
2547
2548 if (callchain) {
2549 callchain_size = (1 + callchain->nr) * sizeof(u64);
2550
2551 header.type |= PERF_SAMPLE_CALLCHAIN;
2552 header.size += callchain_size;
2553 }
2554 }
2555
2556 ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
2557 if (ret)
2558 return;
2559
2560 perf_output_put(&handle, header);
2561
2562 if (sample_type & PERF_SAMPLE_IP)
2563 perf_output_put(&handle, ip);
2564
2565 if (sample_type & PERF_SAMPLE_TID)
2566 perf_output_put(&handle, tid_entry);
2567
2568 if (sample_type & PERF_SAMPLE_TIME)
2569 perf_output_put(&handle, time);
2570
2571 if (sample_type & PERF_SAMPLE_ADDR)
2572 perf_output_put(&handle, data->addr);
2573
2574 if (sample_type & PERF_SAMPLE_ID)
2575 perf_output_put(&handle, counter->id);
2576
2577 if (sample_type & PERF_SAMPLE_CPU)
2578 perf_output_put(&handle, cpu_entry);
2579
2580 if (sample_type & PERF_SAMPLE_PERIOD)
2581 perf_output_put(&handle, data->period);
2582
2583 /*
2584 * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult.
2585 */
2586 if (sample_type & PERF_SAMPLE_GROUP) {
2587 struct perf_counter *leader, *sub;
2588 u64 nr = counter->nr_siblings;
2589
2590 perf_output_put(&handle, nr);
2591
2592 leader = counter->group_leader;
2593 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
2594 if (sub != counter)
2595 sub->pmu->read(sub);
2596
2597 group_entry.id = sub->id;
2598 group_entry.counter = atomic64_read(&sub->count);
2599
2600 perf_output_put(&handle, group_entry);
2601 }
2602 }
2603
2604 if (callchain)
2605 perf_output_copy(&handle, callchain, callchain_size);
2606
2607 perf_output_end(&handle);
2608}
2609
2610/*
2611 * fork tracking
2612 */
2613
2614struct perf_fork_event {
2615 struct task_struct *task;
2616
2617 struct {
2618 struct perf_event_header header;
2619
2620 u32 pid;
2621 u32 ppid;
2622 } event;
2623};
2624
2625static void perf_counter_fork_output(struct perf_counter *counter,
2626 struct perf_fork_event *fork_event)
2627{
2628 struct perf_output_handle handle;
2629 int size = fork_event->event.header.size;
2630 struct task_struct *task = fork_event->task;
2631 int ret = perf_output_begin(&handle, counter, size, 0, 0);
2632
2633 if (ret)
2634 return;
2635
2636 fork_event->event.pid = perf_counter_pid(counter, task);
2637 fork_event->event.ppid = perf_counter_pid(counter, task->real_parent);
2638
2639 perf_output_put(&handle, fork_event->event);
2640 perf_output_end(&handle);
2641}
2642
2643static int perf_counter_fork_match(struct perf_counter *counter)
2644{
2645 if (counter->attr.comm || counter->attr.mmap)
2646 return 1;
2647
2648 return 0;
2649}
2650
2651static void perf_counter_fork_ctx(struct perf_counter_context *ctx,
2652 struct perf_fork_event *fork_event)
2653{
2654 struct perf_counter *counter;
2655
2656 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
2657 return;
2658
2659 rcu_read_lock();
2660 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
2661 if (perf_counter_fork_match(counter))
2662 perf_counter_fork_output(counter, fork_event);
2663 }
2664 rcu_read_unlock();
2665}
2666
2667static void perf_counter_fork_event(struct perf_fork_event *fork_event)
2668{
2669 struct perf_cpu_context *cpuctx;
2670 struct perf_counter_context *ctx;
2671
2672 cpuctx = &get_cpu_var(perf_cpu_context);
2673 perf_counter_fork_ctx(&cpuctx->ctx, fork_event);
2674 put_cpu_var(perf_cpu_context);
2675
2676 rcu_read_lock();
2677 /*
2678 * doesn't really matter which of the child contexts the
2679	 * event ends up in.
2680 */
2681 ctx = rcu_dereference(current->perf_counter_ctxp);
2682 if (ctx)
2683 perf_counter_fork_ctx(ctx, fork_event);
2684 rcu_read_unlock();
2685}
2686
2687void perf_counter_fork(struct task_struct *task)
2688{
2689 struct perf_fork_event fork_event;
2690
2691 if (!atomic_read(&nr_comm_counters) &&
2692 !atomic_read(&nr_mmap_counters))
2693 return;
2694
2695 fork_event = (struct perf_fork_event){
2696 .task = task,
2697 .event = {
2698 .header = {
2699 .type = PERF_EVENT_FORK,
2700 .size = sizeof(fork_event.event),
2701 },
2702 },
2703 };
2704
2705 perf_counter_fork_event(&fork_event);
2706}
2707
2708/*
2709 * comm tracking
2710 */
2711
2712struct perf_comm_event {
2713 struct task_struct *task;
2714 char *comm;
2715 int comm_size;
2716
2717 struct {
2718 struct perf_event_header header;
2719
2720 u32 pid;
2721 u32 tid;
2722 } event;
2723};
2724
2725static void perf_counter_comm_output(struct perf_counter *counter,
2726 struct perf_comm_event *comm_event)
2727{
2728 struct perf_output_handle handle;
2729 int size = comm_event->event.header.size;
2730 int ret = perf_output_begin(&handle, counter, size, 0, 0);
2731
2732 if (ret)
2733 return;
2734
2735 comm_event->event.pid = perf_counter_pid(counter, comm_event->task);
2736 comm_event->event.tid = perf_counter_tid(counter, comm_event->task);
2737
2738 perf_output_put(&handle, comm_event->event);
2739 perf_output_copy(&handle, comm_event->comm,
2740 comm_event->comm_size);
2741 perf_output_end(&handle);
2742}
2743
2744static int perf_counter_comm_match(struct perf_counter *counter)
2745{
2746 if (counter->attr.comm)
2747 return 1;
2748
2749 return 0;
2750}
2751
2752static void perf_counter_comm_ctx(struct perf_counter_context *ctx,
2753 struct perf_comm_event *comm_event)
2754{
2755 struct perf_counter *counter;
2756
2757 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
2758 return;
2759
2760 rcu_read_lock();
2761 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
2762 if (perf_counter_comm_match(counter))
2763 perf_counter_comm_output(counter, comm_event);
2764 }
2765 rcu_read_unlock();
2766}
2767
2768static void perf_counter_comm_event(struct perf_comm_event *comm_event)
2769{
2770 struct perf_cpu_context *cpuctx;
2771 struct perf_counter_context *ctx;
2772 unsigned int size;
2773 char *comm = comm_event->task->comm;
2774
2775 size = ALIGN(strlen(comm)+1, sizeof(u64));
2776
2777 comm_event->comm = comm;
2778 comm_event->comm_size = size;
2779
2780 comm_event->event.header.size = sizeof(comm_event->event) + size;
2781
2782 cpuctx = &get_cpu_var(perf_cpu_context);
2783 perf_counter_comm_ctx(&cpuctx->ctx, comm_event);
2784 put_cpu_var(perf_cpu_context);
2785
2786 rcu_read_lock();
2787 /*
2788 * doesn't really matter which of the child contexts the
2789	 * event ends up in.
2790 */
2791 ctx = rcu_dereference(current->perf_counter_ctxp);
2792 if (ctx)
2793 perf_counter_comm_ctx(ctx, comm_event);
2794 rcu_read_unlock();
2795}
2796
2797void perf_counter_comm(struct task_struct *task)
2798{
2799 struct perf_comm_event comm_event;
2800
2801 if (!atomic_read(&nr_comm_counters))
2802 return;
2803
2804 comm_event = (struct perf_comm_event){
2805 .task = task,
2806 .event = {
2807 .header = { .type = PERF_EVENT_COMM, },
2808 },
2809 };
2810
2811 perf_counter_comm_event(&comm_event);
2812}
2813
2814/*
2815 * mmap tracking
2816 */
2817
2818struct perf_mmap_event {
2819 struct vm_area_struct *vma;
2820
2821 const char *file_name;
2822 int file_size;
2823
2824 struct {
2825 struct perf_event_header header;
2826
2827 u32 pid;
2828 u32 tid;
2829 u64 start;
2830 u64 len;
2831 u64 pgoff;
2832 } event;
2833};
2834
2835static void perf_counter_mmap_output(struct perf_counter *counter,
2836 struct perf_mmap_event *mmap_event)
2837{
2838 struct perf_output_handle handle;
2839 int size = mmap_event->event.header.size;
2840 int ret = perf_output_begin(&handle, counter, size, 0, 0);
2841
2842 if (ret)
2843 return;
2844
2845 mmap_event->event.pid = perf_counter_pid(counter, current);
2846 mmap_event->event.tid = perf_counter_tid(counter, current);
2847
2848 perf_output_put(&handle, mmap_event->event);
2849 perf_output_copy(&handle, mmap_event->file_name,
2850 mmap_event->file_size);
2851 perf_output_end(&handle);
2852}
2853
2854static int perf_counter_mmap_match(struct perf_counter *counter,
2855 struct perf_mmap_event *mmap_event)
2856{
2857 if (counter->attr.mmap)
2858 return 1;
2859
2860 return 0;
2861}
2862
2863static void perf_counter_mmap_ctx(struct perf_counter_context *ctx,
2864 struct perf_mmap_event *mmap_event)
2865{
2866 struct perf_counter *counter;
2867
2868 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
2869 return;
2870
2871 rcu_read_lock();
2872 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
2873 if (perf_counter_mmap_match(counter, mmap_event))
2874 perf_counter_mmap_output(counter, mmap_event);
2875 }
2876 rcu_read_unlock();
2877}
2878
2879static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
2880{
2881 struct perf_cpu_context *cpuctx;
2882 struct perf_counter_context *ctx;
2883 struct vm_area_struct *vma = mmap_event->vma;
2884 struct file *file = vma->vm_file;
2885 unsigned int size;
2886 char tmp[16];
2887 char *buf = NULL;
2888 const char *name;
2889
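	/*
	 * Resolve a printable name for the mapping: d_path() for file backed
	 * vmas, then arch_vma_name(), "[vdso]" for mappings without an mm,
	 * and "//anon" as the final fallback.
	 */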
2890 if (file) {
2891 buf = kzalloc(PATH_MAX, GFP_KERNEL);
2892 if (!buf) {
2893 name = strncpy(tmp, "//enomem", sizeof(tmp));
2894 goto got_name;
2895 }
2896 name = d_path(&file->f_path, buf, PATH_MAX);
2897 if (IS_ERR(name)) {
2898 name = strncpy(tmp, "//toolong", sizeof(tmp));
2899 goto got_name;
2900 }
2901 } else {
2902 name = arch_vma_name(mmap_event->vma);
2903 if (name)
2904 goto got_name;
2905
2906 if (!vma->vm_mm) {
2907 name = strncpy(tmp, "[vdso]", sizeof(tmp));
2908 goto got_name;
2909 }
2910
2911 name = strncpy(tmp, "//anon", sizeof(tmp));
2912 goto got_name;
2913 }
2914
2915got_name:
2916 size = ALIGN(strlen(name)+1, sizeof(u64));
2917
2918 mmap_event->file_name = name;
2919 mmap_event->file_size = size;
2920
2921 mmap_event->event.header.size = sizeof(mmap_event->event) + size;
2922
2923 cpuctx = &get_cpu_var(perf_cpu_context);
2924 perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event);
2925 put_cpu_var(perf_cpu_context);
2926
2927 rcu_read_lock();
2928 /*
2929 * doesn't really matter which of the child contexts the
2930	 * event ends up in.
2931 */
2932 ctx = rcu_dereference(current->perf_counter_ctxp);
2933 if (ctx)
2934 perf_counter_mmap_ctx(ctx, mmap_event);
2935 rcu_read_unlock();
2936
2937 kfree(buf);
2938}
2939
2940void __perf_counter_mmap(struct vm_area_struct *vma)
2941{
2942 struct perf_mmap_event mmap_event;
2943
2944 if (!atomic_read(&nr_mmap_counters))
2945 return;
2946
2947 mmap_event = (struct perf_mmap_event){
2948 .vma = vma,
2949 .event = {
2950 .header = { .type = PERF_EVENT_MMAP, },
2951 .start = vma->vm_start,
2952 .len = vma->vm_end - vma->vm_start,
2953 .pgoff = vma->vm_pgoff,
2954 },
2955 };
2956
2957 perf_counter_mmap_event(&mmap_event);
2958}
2959
2960/*
2961 * Log sample_period changes so that analyzing tools can re-normalize the
2962 * event flow.
2963 */
2964
2965struct freq_event {
2966 struct perf_event_header header;
2967 u64 time;
2968 u64 id;
2969 u64 period;
2970};
2971
2972static void perf_log_period(struct perf_counter *counter, u64 period)
2973{
2974 struct perf_output_handle handle;
2975 struct freq_event event;
2976 int ret;
2977
2978 if (counter->hw.sample_period == period)
2979 return;
2980
2981 if (counter->attr.sample_type & PERF_SAMPLE_PERIOD)
2982 return;
2983
2984 event = (struct freq_event) {
2985 .header = {
2986 .type = PERF_EVENT_PERIOD,
2987 .misc = 0,
2988 .size = sizeof(event),
2989 },
2990 .time = sched_clock(),
2991 .id = counter->id,
2992 .period = period,
2993 };
2994
2995 ret = perf_output_begin(&handle, counter, sizeof(event), 1, 0);
2996 if (ret)
2997 return;
2998
2999 perf_output_put(&handle, event);
3000 perf_output_end(&handle);
3001}
3002
3003/*
3004 * IRQ throttle logging
3005 */
3006
3007static void perf_log_throttle(struct perf_counter *counter, int enable)
3008{
3009 struct perf_output_handle handle;
3010 int ret;
3011
3012 struct {
3013 struct perf_event_header header;
3014 u64 time;
3015 u64 id;
3016 } throttle_event = {
3017 .header = {
3018 .type = PERF_EVENT_THROTTLE + 1,
3019 .misc = 0,
3020 .size = sizeof(throttle_event),
3021 },
3022 .time = sched_clock(),
3023 .id = counter->id,
3024 };
3025
3026 ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0);
3027 if (ret)
3028 return;
3029
3030 perf_output_put(&handle, throttle_event);
3031 perf_output_end(&handle);
3032}
3033
3034/*
3035 * Generic counter overflow handling, sampling.
3036 */
3037
3038int perf_counter_overflow(struct perf_counter *counter, int nmi,
3039 struct perf_sample_data *data)
3040{
3041 int events = atomic_read(&counter->event_limit);
3042 int throttle = counter->pmu->unthrottle != NULL;
3043 struct hw_perf_counter *hwc = &counter->hw;
3044 int ret = 0;
3045
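	/*
	 * Interrupt-rate throttling: count overflow interrupts and, once the
	 * rate implies more than sysctl_perf_counter_sample_rate interrupts
	 * per second, mark the counter throttled (MAX_INTERRUPTS), log it,
	 * and return 1 so the caller stops the counter.
	 */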
3046 if (!throttle) {
3047 hwc->interrupts++;
3048 } else {
3049 if (hwc->interrupts != MAX_INTERRUPTS) {
3050 hwc->interrupts++;
3051 if (HZ * hwc->interrupts >
3052 (u64)sysctl_perf_counter_sample_rate) {
3053 hwc->interrupts = MAX_INTERRUPTS;
3054 perf_log_throttle(counter, 0);
3055 ret = 1;
3056 }
3057 } else {
3058 /*
3059			 * Keep re-disabling the counter even though we disabled it
3060			 * on the previous pass - just in case we raced with a
3061			 * sched-in and the counter got enabled again:
3062 */
3063 ret = 1;
3064 }
3065 }
3066
3067 if (counter->attr.freq) {
3068 u64 now = sched_clock();
3069 s64 delta = now - hwc->freq_stamp;
3070
3071 hwc->freq_stamp = now;
3072
3073 if (delta > 0 && delta < TICK_NSEC)
3074 perf_adjust_period(counter, NSEC_PER_SEC / (int)delta);
3075 }
3076
3077 /*
3078 * XXX event_limit might not quite work as expected on inherited
3079 * counters
3080 */
3081
3082 counter->pending_kill = POLL_IN;
3083 if (events && atomic_dec_and_test(&counter->event_limit)) {
3084 ret = 1;
3085 counter->pending_kill = POLL_HUP;
3086 if (nmi) {
3087 counter->pending_disable = 1;
3088 perf_pending_queue(&counter->pending,
3089 perf_pending_counter);
3090 } else
3091 perf_counter_disable(counter);
3092 }
3093
3094 perf_counter_output(counter, nmi, data);
3095 return ret;
3096}
3097
3098/*
3099 * Generic software counter infrastructure
3100 */
3101
3102static void perf_swcounter_update(struct perf_counter *counter)
3103{
3104 struct hw_perf_counter *hwc = &counter->hw;
3105 u64 prev, now;
3106 s64 delta;
3107
3108again:
3109 prev = atomic64_read(&hwc->prev_count);
3110 now = atomic64_read(&hwc->count);
3111 if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev)
3112 goto again;
3113
3114 delta = now - prev;
3115
3116 atomic64_add(delta, &counter->count);
3117 atomic64_sub(delta, &hwc->period_left);
3118}
3119
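/*
 * Re-arm a software counter: fold any leftover period into period_left and
 * start hwc->count at -left, so that the count turns non-negative (and
 * perf_swcounter_add() raises an overflow) once the period has elapsed.
 */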
3120static void perf_swcounter_set_period(struct perf_counter *counter)
3121{
3122 struct hw_perf_counter *hwc = &counter->hw;
3123 s64 left = atomic64_read(&hwc->period_left);
3124 s64 period = hwc->sample_period;
3125
3126 if (unlikely(left <= -period)) {
3127 left = period;
3128 atomic64_set(&hwc->period_left, left);
3129 hwc->last_period = period;
3130 }
3131
3132 if (unlikely(left <= 0)) {
3133 left += period;
3134 atomic64_add(period, &hwc->period_left);
3135 hwc->last_period = period;
3136 }
3137
3138 atomic64_set(&hwc->prev_count, -left);
3139 atomic64_set(&hwc->count, -left);
3140}
3141
3142static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
3143{
3144 enum hrtimer_restart ret = HRTIMER_RESTART;
3145 struct perf_sample_data data;
3146 struct perf_counter *counter;
3147 u64 period;
3148
3149 counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
3150 counter->pmu->read(counter);
3151
3152 data.addr = 0;
3153 data.regs = get_irq_regs();
3154 /*
3155 * In case we exclude kernel IPs or are somehow not in interrupt
3156 * context, provide the next best thing, the user IP.
3157 */
3158 if ((counter->attr.exclude_kernel || !data.regs) &&
3159 !counter->attr.exclude_user)
3160 data.regs = task_pt_regs(current);
3161
3162 if (data.regs) {
3163 if (perf_counter_overflow(counter, 0, &data))
3164 ret = HRTIMER_NORESTART;
3165 }
3166
3167 period = max_t(u64, 10000, counter->hw.sample_period);
3168 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
3169
3170 return ret;
3171}
3172
3173static void perf_swcounter_overflow(struct perf_counter *counter,
3174 int nmi, struct perf_sample_data *data)
3175{
3176 data->period = counter->hw.last_period;
3177
3178 perf_swcounter_update(counter);
3179 perf_swcounter_set_period(counter);
3180 if (perf_counter_overflow(counter, nmi, data))
3181 /* soft-disable the counter */
3182 ;
3183}
3184
3185static int perf_swcounter_is_counting(struct perf_counter *counter)
3186{
3187 struct perf_counter_context *ctx;
3188 unsigned long flags;
3189 int count;
3190
3191 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
3192 return 1;
3193
3194 if (counter->state != PERF_COUNTER_STATE_INACTIVE)
3195 return 0;
3196
3197 /*
3198 * If the counter is inactive, it could be just because
3199 * its task is scheduled out, or because it's in a group
3200 * which could not go on the PMU. We want to count in
3201 * the first case but not the second. If the context is
3202 * currently active then an inactive software counter must
3203 * be the second case. If it's not currently active then
3204 * we need to know whether the counter was active when the
3205 * context was last active, which we can determine by
3206 * comparing counter->tstamp_stopped with ctx->time.
3207 *
3208 * We are within an RCU read-side critical section,
3209 * which protects the existence of *ctx.
3210 */
3211 ctx = counter->ctx;
3212 spin_lock_irqsave(&ctx->lock, flags);
3213 count = 1;
3214 /* Re-check state now we have the lock */
3215 if (counter->state < PERF_COUNTER_STATE_INACTIVE ||
3216 counter->ctx->is_active ||
3217 counter->tstamp_stopped < ctx->time)
3218 count = 0;
3219 spin_unlock_irqrestore(&ctx->lock, flags);
3220 return count;
3221}
3222
3223static int perf_swcounter_match(struct perf_counter *counter,
3224 enum perf_type_id type,
3225 u32 event, struct pt_regs *regs)
3226{
3227 if (!perf_swcounter_is_counting(counter))
3228 return 0;
3229
3230 if (counter->attr.type != type)
3231 return 0;
3232 if (counter->attr.config != event)
3233 return 0;
3234
3235 if (regs) {
3236 if (counter->attr.exclude_user && user_mode(regs))
3237 return 0;
3238
3239 if (counter->attr.exclude_kernel && !user_mode(regs))
3240 return 0;
3241 }
3242
3243 return 1;
3244}
3245
3246static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
3247 int nmi, struct perf_sample_data *data)
3248{
3249 int neg = atomic64_add_negative(nr, &counter->hw.count);
3250
3251 if (counter->hw.sample_period && !neg && data->regs)
3252 perf_swcounter_overflow(counter, nmi, data);
3253}
3254
3255static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
3256 enum perf_type_id type,
3257 u32 event, u64 nr, int nmi,
3258 struct perf_sample_data *data)
3259{
3260 struct perf_counter *counter;
3261
3262 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3263 return;
3264
3265 rcu_read_lock();
3266 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
3267 if (perf_swcounter_match(counter, type, event, data->regs))
3268 perf_swcounter_add(counter, nr, nmi, data);
3269 }
3270 rcu_read_unlock();
3271}
3272
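/*
 * One recursion slot per execution context (process, softirq, hardirq, NMI):
 * a software event raised while another one is being processed in the same
 * context is dropped instead of recursing.
 */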
3273static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
3274{
3275 if (in_nmi())
3276 return &cpuctx->recursion[3];
3277
3278 if (in_irq())
3279 return &cpuctx->recursion[2];
3280
3281 if (in_softirq())
3282 return &cpuctx->recursion[1];
3283
3284 return &cpuctx->recursion[0];
3285}
3286
3287static void do_perf_swcounter_event(enum perf_type_id type, u32 event,
3288 u64 nr, int nmi,
3289 struct perf_sample_data *data)
3290{
3291 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
3292 int *recursion = perf_swcounter_recursion_context(cpuctx);
3293 struct perf_counter_context *ctx;
3294
3295 if (*recursion)
3296 goto out;
3297
3298 (*recursion)++;
3299 barrier();
3300
3301 perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
3302 nr, nmi, data);
3303 rcu_read_lock();
3304 /*
3305 * doesn't really matter which of the child contexts the
3306	 * event ends up in.
3307 */
3308 ctx = rcu_dereference(current->perf_counter_ctxp);
3309 if (ctx)
3310 perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data);
3311 rcu_read_unlock();
3312
3313 barrier();
3314 (*recursion)--;
3315
3316out:
3317 put_cpu_var(perf_cpu_context);
3318}
3319
3320void
3321perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
3322{
3323 struct perf_sample_data data = {
3324 .regs = regs,
3325 .addr = addr,
3326 };
3327
3328 do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, &data);
3329}
3330
3331static void perf_swcounter_read(struct perf_counter *counter)
3332{
3333 perf_swcounter_update(counter);
3334}
3335
3336static int perf_swcounter_enable(struct perf_counter *counter)
3337{
3338 perf_swcounter_set_period(counter);
3339 return 0;
3340}
3341
3342static void perf_swcounter_disable(struct perf_counter *counter)
3343{
3344 perf_swcounter_update(counter);
3345}
3346
3347static const struct pmu perf_ops_generic = {
3348 .enable = perf_swcounter_enable,
3349 .disable = perf_swcounter_disable,
3350 .read = perf_swcounter_read,
3351};
3352
3353/*
3354 * Software counter: cpu wall time clock
3355 */
3356
3357static void cpu_clock_perf_counter_update(struct perf_counter *counter)
3358{
3359 int cpu = raw_smp_processor_id();
3360 s64 prev;
3361 u64 now;
3362
3363 now = cpu_clock(cpu);
3364 prev = atomic64_read(&counter->hw.prev_count);
3365 atomic64_set(&counter->hw.prev_count, now);
3366 atomic64_add(now - prev, &counter->count);
3367}
3368
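/*
 * cpu-clock counters have no hardware overflow, so sampling is emulated with
 * a hrtimer whose period is clamped to at least 10 usecs.
 */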
3369static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
3370{
3371 struct hw_perf_counter *hwc = &counter->hw;
3372 int cpu = raw_smp_processor_id();
3373
3374 atomic64_set(&hwc->prev_count, cpu_clock(cpu));
3375 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3376 hwc->hrtimer.function = perf_swcounter_hrtimer;
3377 if (hwc->sample_period) {
3378 u64 period = max_t(u64, 10000, hwc->sample_period);
3379 __hrtimer_start_range_ns(&hwc->hrtimer,
3380 ns_to_ktime(period), 0,
3381 HRTIMER_MODE_REL, 0);
3382 }
3383
3384 return 0;
3385}
3386
3387static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
3388{
3389 if (counter->hw.sample_period)
3390 hrtimer_cancel(&counter->hw.hrtimer);
3391 cpu_clock_perf_counter_update(counter);
3392}
3393
3394static void cpu_clock_perf_counter_read(struct perf_counter *counter)
3395{
3396 cpu_clock_perf_counter_update(counter);
3397}
3398
3399static const struct pmu perf_ops_cpu_clock = {
3400 .enable = cpu_clock_perf_counter_enable,
3401 .disable = cpu_clock_perf_counter_disable,
3402 .read = cpu_clock_perf_counter_read,
3403};
3404
3405/*
3406 * Software counter: task time clock
3407 */
3408
3409static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
3410{
3411 u64 prev;
3412 s64 delta;
3413
3414 prev = atomic64_xchg(&counter->hw.prev_count, now);
3415 delta = now - prev;
3416 atomic64_add(delta, &counter->count);
3417}
3418
3419static int task_clock_perf_counter_enable(struct perf_counter *counter)
3420{
3421 struct hw_perf_counter *hwc = &counter->hw;
3422 u64 now;
3423
3424 now = counter->ctx->time;
3425
3426 atomic64_set(&hwc->prev_count, now);
3427 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3428 hwc->hrtimer.function = perf_swcounter_hrtimer;
3429 if (hwc->sample_period) {
3430 u64 period = max_t(u64, 10000, hwc->sample_period);
3431 __hrtimer_start_range_ns(&hwc->hrtimer,
3432 ns_to_ktime(period), 0,
3433 HRTIMER_MODE_REL, 0);
3434 }
3435
3436 return 0;
3437}
3438
3439static void task_clock_perf_counter_disable(struct perf_counter *counter)
3440{
3441 if (counter->hw.sample_period)
3442 hrtimer_cancel(&counter->hw.hrtimer);
3443 task_clock_perf_counter_update(counter, counter->ctx->time);
3444
3445}
3446
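/*
 * In NMI context we do not update the shared context time; instead we
 * extrapolate it from the last recorded timestamp.
 */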
3447static void task_clock_perf_counter_read(struct perf_counter *counter)
3448{
3449 u64 time;
3450
3451 if (!in_nmi()) {
3452 update_context_time(counter->ctx);
3453 time = counter->ctx->time;
3454 } else {
3455 u64 now = perf_clock();
3456 u64 delta = now - counter->ctx->timestamp;
3457 time = counter->ctx->time + delta;
3458 }
3459
3460 task_clock_perf_counter_update(counter, time);
3461}
3462
3463static const struct pmu perf_ops_task_clock = {
3464 .enable = task_clock_perf_counter_enable,
3465 .disable = task_clock_perf_counter_disable,
3466 .read = task_clock_perf_counter_read,
3467};
3468
3469#ifdef CONFIG_EVENT_PROFILE
3470void perf_tpcounter_event(int event_id)
3471{
3472 struct perf_sample_data data = {
3473		.regs = get_irq_regs(),
3474 .addr = 0,
3475 };
3476
3477 if (!data.regs)
3478 data.regs = task_pt_regs(current);
3479
3480 do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, &data);
3481}
3482EXPORT_SYMBOL_GPL(perf_tpcounter_event);
3483
3484extern int ftrace_profile_enable(int);
3485extern void ftrace_profile_disable(int);
3486
3487static void tp_perf_counter_destroy(struct perf_counter *counter)
3488{
3489 ftrace_profile_disable(perf_event_id(&counter->attr));
3490}
3491
3492static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
3493{
3494 int event_id = perf_event_id(&counter->attr);
3495 int ret;
3496
3497 ret = ftrace_profile_enable(event_id);
3498 if (ret)
3499 return NULL;
3500
3501 counter->destroy = tp_perf_counter_destroy;
3502
3503 return &perf_ops_generic;
3504}
3505#else
3506static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
3507{
3508 return NULL;
3509}
3510#endif
3511
3512static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
3513{
3514 const struct pmu *pmu = NULL;
3515
3516 /*
3517 * Software counters (currently) can't in general distinguish
3518 * between user, kernel and hypervisor events.
3519 * However, context switches and cpu migrations are considered
3520 * to be kernel events, and page faults are never hypervisor
3521 * events.
3522 */
3523 switch (counter->attr.config) {
3524 case PERF_COUNT_SW_CPU_CLOCK:
3525 pmu = &perf_ops_cpu_clock;
3526
3527 break;
3528 case PERF_COUNT_SW_TASK_CLOCK:
3529 /*
3530 * If the user instantiates this as a per-cpu counter,
3531 * use the cpu_clock counter instead.
3532 */
3533 if (counter->ctx->task)
3534 pmu = &perf_ops_task_clock;
3535 else
3536 pmu = &perf_ops_cpu_clock;
3537
3538 break;
3539 case PERF_COUNT_SW_PAGE_FAULTS:
3540 case PERF_COUNT_SW_PAGE_FAULTS_MIN:
3541 case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
3542 case PERF_COUNT_SW_CONTEXT_SWITCHES:
3543 case PERF_COUNT_SW_CPU_MIGRATIONS:
3544 pmu = &perf_ops_generic;
3545 break;
3546 }
3547
3548 return pmu;
3549}
3550
3551/*
3552 * Allocate and initialize a counter structure
3553 */
3554static struct perf_counter *
3555perf_counter_alloc(struct perf_counter_attr *attr,
3556 int cpu,
3557 struct perf_counter_context *ctx,
3558 struct perf_counter *group_leader,
3559 gfp_t gfpflags)
3560{
3561 const struct pmu *pmu;
3562 struct perf_counter *counter;
3563 struct hw_perf_counter *hwc;
3564 long err;
3565
3566 counter = kzalloc(sizeof(*counter), gfpflags);
3567 if (!counter)
3568 return ERR_PTR(-ENOMEM);
3569
3570 /*
3571 * Single counters are their own group leaders, with an
3572 * empty sibling list:
3573 */
3574 if (!group_leader)
3575 group_leader = counter;
3576
3577 mutex_init(&counter->child_mutex);
3578 INIT_LIST_HEAD(&counter->child_list);
3579
3580 INIT_LIST_HEAD(&counter->list_entry);
3581 INIT_LIST_HEAD(&counter->event_entry);
3582 INIT_LIST_HEAD(&counter->sibling_list);
3583 init_waitqueue_head(&counter->waitq);
3584
3585 mutex_init(&counter->mmap_mutex);
3586
3587 counter->cpu = cpu;
3588 counter->attr = *attr;
3589 counter->group_leader = group_leader;
3590 counter->pmu = NULL;
3591 counter->ctx = ctx;
3592 counter->oncpu = -1;
3593
3594 counter->ns = get_pid_ns(current->nsproxy->pid_ns);
3595 counter->id = atomic64_inc_return(&perf_counter_id);
3596
3597 counter->state = PERF_COUNTER_STATE_INACTIVE;
3598
3599 if (attr->disabled)
3600 counter->state = PERF_COUNTER_STATE_OFF;
3601
3602 pmu = NULL;
3603
3604 hwc = &counter->hw;
3605 hwc->sample_period = attr->sample_period;
3606 if (attr->freq && attr->sample_freq)
3607 hwc->sample_period = 1;
3608
3609 atomic64_set(&hwc->period_left, hwc->sample_period);
3610
3611 /*
3612 * we currently do not support PERF_SAMPLE_GROUP on inherited counters
3613 */
3614 if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP))
3615 goto done;
3616
3617 switch (attr->type) {
3618 case PERF_TYPE_RAW:
3619 case PERF_TYPE_HARDWARE:
3620 case PERF_TYPE_HW_CACHE:
3621 pmu = hw_perf_counter_init(counter);
3622 break;
3623
3624 case PERF_TYPE_SOFTWARE:
3625 pmu = sw_perf_counter_init(counter);
3626 break;
3627
3628 case PERF_TYPE_TRACEPOINT:
3629 pmu = tp_perf_counter_init(counter);
3630 break;
3631
3632 default:
3633 break;
3634 }
3635done:
3636 err = 0;
3637 if (!pmu)
3638 err = -EINVAL;
3639 else if (IS_ERR(pmu))
3640 err = PTR_ERR(pmu);
3641
3642 if (err) {
3643 if (counter->ns)
3644 put_pid_ns(counter->ns);
3645 kfree(counter);
3646 return ERR_PTR(err);
3647 }
3648
3649 counter->pmu = pmu;
3650
3651 atomic_inc(&nr_counters);
3652 if (counter->attr.mmap)
3653 atomic_inc(&nr_mmap_counters);
3654 if (counter->attr.comm)
3655 atomic_inc(&nr_comm_counters);
3656
3657 return counter;
3658}
3659
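/*
 * Copy a perf_counter_attr from user space, coping with both older (smaller)
 * and newer (larger) layouts: a short structure is zero-extended, a larger
 * one is accepted only if every bit beyond the size we know about is zero.
 */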
3660static int perf_copy_attr(struct perf_counter_attr __user *uattr,
3661 struct perf_counter_attr *attr)
3662{
3663 int ret;
3664 u32 size;
3665
3666 if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
3667 return -EFAULT;
3668
3669 /*
3670	 * zero the full structure, so that a short copy leaves the rest zeroed.
3671 */
3672 memset(attr, 0, sizeof(*attr));
3673
3674 ret = get_user(size, &uattr->size);
3675 if (ret)
3676 return ret;
3677
3678 if (size > PAGE_SIZE) /* silly large */
3679 goto err_size;
3680
3681 if (!size) /* abi compat */
3682 size = PERF_ATTR_SIZE_VER0;
3683
3684 if (size < PERF_ATTR_SIZE_VER0)
3685 goto err_size;
3686
3687 /*
3688 * If we're handed a bigger struct than we know of,
3689 * ensure all the unknown bits are 0.
3690 */
3691 if (size > sizeof(*attr)) {
3692 unsigned long val;
3693 unsigned long __user *addr;
3694 unsigned long __user *end;
3695
3696 addr = PTR_ALIGN((void __user *)uattr + sizeof(*attr),
3697 sizeof(unsigned long));
3698 end = PTR_ALIGN((void __user *)uattr + size,
3699 sizeof(unsigned long));
3700
3701 for (; addr < end; addr += sizeof(unsigned long)) {
3702 ret = get_user(val, addr);
3703 if (ret)
3704 return ret;
3705 if (val)
3706 goto err_size;
3707 }
3708 }
3709
3710 ret = copy_from_user(attr, uattr, size);
3711 if (ret)
3712 return -EFAULT;
3713
3714 /*
3715 * If the type exists, the corresponding creation will verify
3716 * the attr->config.
3717 */
3718 if (attr->type >= PERF_TYPE_MAX)
3719 return -EINVAL;
3720
3721 if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
3722 return -EINVAL;
3723
3724 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
3725 return -EINVAL;
3726
3727 if (attr->read_format & ~(PERF_FORMAT_MAX-1))
3728 return -EINVAL;
3729
3730out:
3731 return ret;
3732
3733err_size:
3734 put_user(sizeof(*attr), &uattr->size);
3735 ret = -E2BIG;
3736 goto out;
3737}
3738
3739/**
3740 * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
3741 *
3742 * @attr_uptr: event type attributes for monitoring/sampling
3743 * @pid: target pid
3744 * @cpu: target cpu
3745 * @group_fd: group leader counter fd
3746 */
3747SYSCALL_DEFINE5(perf_counter_open,
3748 struct perf_counter_attr __user *, attr_uptr,
3749 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
3750{
3751 struct perf_counter *counter, *group_leader;
3752 struct perf_counter_attr attr;
3753 struct perf_counter_context *ctx;
3754 struct file *counter_file = NULL;
3755 struct file *group_file = NULL;
3756 int fput_needed = 0;
3757 int fput_needed2 = 0;
3758 int ret;
3759
3760 /* for future expandability... */
3761 if (flags)
3762 return -EINVAL;
3763
3764 ret = perf_copy_attr(attr_uptr, &attr);
3765 if (ret)
3766 return ret;
3767
3768 if (!attr.exclude_kernel) {
3769 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
3770 return -EACCES;
3771 }
3772
3773 if (attr.freq) {
3774 if (attr.sample_freq > sysctl_perf_counter_sample_rate)
3775 return -EINVAL;
3776 }
3777
3778 /*
3779 * Get the target context (task or percpu):
3780 */
3781 ctx = find_get_context(pid, cpu);
3782 if (IS_ERR(ctx))
3783 return PTR_ERR(ctx);
3784
3785 /*
3786 * Look up the group leader (we will attach this counter to it):
3787 */
3788 group_leader = NULL;
3789 if (group_fd != -1) {
3790 ret = -EINVAL;
3791 group_file = fget_light(group_fd, &fput_needed);
3792 if (!group_file)
3793 goto err_put_context;
3794 if (group_file->f_op != &perf_fops)
3795 goto err_put_context;
3796
3797 group_leader = group_file->private_data;
3798 /*
3799 * Do not allow a recursive hierarchy (this new sibling
3800 * becoming part of another group-sibling):
3801 */
3802 if (group_leader->group_leader != group_leader)
3803 goto err_put_context;
3804 /*
3805 * Do not allow to attach to a group in a different
3806 * task or CPU context:
3807 */
3808 if (group_leader->ctx != ctx)
3809 goto err_put_context;
3810 /*
3811 * Only a group leader can be exclusive or pinned
3812 */
3813 if (attr.exclusive || attr.pinned)
3814 goto err_put_context;
3815 }
3816
3817 counter = perf_counter_alloc(&attr, cpu, ctx, group_leader,
3818 GFP_KERNEL);
3819 ret = PTR_ERR(counter);
3820 if (IS_ERR(counter))
3821 goto err_put_context;
3822
3823 ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
3824 if (ret < 0)
3825 goto err_free_put_context;
3826
3827 counter_file = fget_light(ret, &fput_needed2);
3828 if (!counter_file)
3829 goto err_free_put_context;
3830
3831 counter->filp = counter_file;
3832 WARN_ON_ONCE(ctx->parent_ctx);
3833 mutex_lock(&ctx->mutex);
3834 perf_install_in_context(ctx, counter, cpu);
3835 ++ctx->generation;
3836 mutex_unlock(&ctx->mutex);
3837
3838 counter->owner = current;
3839 get_task_struct(current);
3840 mutex_lock(&current->perf_counter_mutex);
3841 list_add_tail(&counter->owner_entry, &current->perf_counter_list);
3842 mutex_unlock(&current->perf_counter_mutex);
3843
3844 fput_light(counter_file, fput_needed2);
3845
3846out_fput:
3847 fput_light(group_file, fput_needed);
3848
3849 return ret;
3850
3851err_free_put_context:
3852 kfree(counter);
3853
3854err_put_context:
3855 put_ctx(ctx);
3856
3857 goto out_fput;
3858}
3859
3860/*
3861 * inherit a counter from parent task to child task:
3862 */
3863static struct perf_counter *
3864inherit_counter(struct perf_counter *parent_counter,
3865 struct task_struct *parent,
3866 struct perf_counter_context *parent_ctx,
3867 struct task_struct *child,
3868 struct perf_counter *group_leader,
3869 struct perf_counter_context *child_ctx)
3870{
3871 struct perf_counter *child_counter;
3872
3873 /*
3874 * Instead of creating recursive hierarchies of counters,
3875 * we link inherited counters back to the original parent,
3876	 * which is sure to have a filp that we use as the reference
3877	 * count:
3878 */
3879 if (parent_counter->parent)
3880 parent_counter = parent_counter->parent;
3881
3882 child_counter = perf_counter_alloc(&parent_counter->attr,
3883 parent_counter->cpu, child_ctx,
3884 group_leader, GFP_KERNEL);
3885 if (IS_ERR(child_counter))
3886 return child_counter;
3887 get_ctx(child_ctx);
3888
3889 /*
3890 * Make the child state follow the state of the parent counter,
3891 * not its attr.disabled bit. We hold the parent's mutex,
3892 * so we won't race with perf_counter_{en, dis}able_family.
3893 */
3894 if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
3895 child_counter->state = PERF_COUNTER_STATE_INACTIVE;
3896 else
3897 child_counter->state = PERF_COUNTER_STATE_OFF;
3898
3899 if (parent_counter->attr.freq)
3900 child_counter->hw.sample_period = parent_counter->hw.sample_period;
3901
3902 /*
3903 * Link it up in the child's context:
3904 */
3905 add_counter_to_ctx(child_counter, child_ctx);
3906
3907 child_counter->parent = parent_counter;
3908 /*
3909 * inherit into child's child as well:
3910 */
3911 child_counter->attr.inherit = 1;
3912
3913 /*
3914 * Get a reference to the parent filp - we will fput it
3915 * when the child counter exits. This is safe to do because
3916 * we are in the parent and we know that the filp still
3917 * exists and has a nonzero count:
3918 */
3919 atomic_long_inc(&parent_counter->filp->f_count);
3920
3921 /*
3922 * Link this into the parent counter's child list
3923 */
3924 WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
3925 mutex_lock(&parent_counter->child_mutex);
3926 list_add_tail(&child_counter->child_list, &parent_counter->child_list);
3927 mutex_unlock(&parent_counter->child_mutex);
3928
3929 return child_counter;
3930}
3931
3932static int inherit_group(struct perf_counter *parent_counter,
3933 struct task_struct *parent,
3934 struct perf_counter_context *parent_ctx,
3935 struct task_struct *child,
3936 struct perf_counter_context *child_ctx)
3937{
3938 struct perf_counter *leader;
3939 struct perf_counter *sub;
3940 struct perf_counter *child_ctr;
3941
3942 leader = inherit_counter(parent_counter, parent, parent_ctx,
3943 child, NULL, child_ctx);
3944 if (IS_ERR(leader))
3945 return PTR_ERR(leader);
3946 list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
3947 child_ctr = inherit_counter(sub, parent, parent_ctx,
3948 child, leader, child_ctx);
3949 if (IS_ERR(child_ctr))
3950 return PTR_ERR(child_ctr);
3951 }
3952 return 0;
3953}
3954
3955static void sync_child_counter(struct perf_counter *child_counter,
3956 struct perf_counter *parent_counter)
3957{
3958 u64 child_val;
3959
3960 child_val = atomic64_read(&child_counter->count);
3961
3962 /*
3963 * Add back the child's count to the parent's count:
3964 */
3965 atomic64_add(child_val, &parent_counter->count);
3966 atomic64_add(child_counter->total_time_enabled,
3967 &parent_counter->child_total_time_enabled);
3968 atomic64_add(child_counter->total_time_running,
3969 &parent_counter->child_total_time_running);
3970
3971 /*
3972 * Remove this counter from the parent's list
3973 */
3974 WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
3975 mutex_lock(&parent_counter->child_mutex);
3976 list_del_init(&child_counter->child_list);
3977 mutex_unlock(&parent_counter->child_mutex);
3978
3979 /*
3980 * Release the parent counter, if this was the last
3981 * reference to it.
3982 */
3983 fput(parent_counter->filp);
3984}
3985
3986static void
3987__perf_counter_exit_task(struct perf_counter *child_counter,
3988 struct perf_counter_context *child_ctx)
3989{
3990 struct perf_counter *parent_counter;
3991
3992 update_counter_times(child_counter);
3993 perf_counter_remove_from_context(child_counter);
3994
3995 parent_counter = child_counter->parent;
3996 /*
3997	 * It can happen that the parent exits first, and has counters
3998 * that are still around due to the child reference. These
3999 * counters need to be zapped - but otherwise linger.
4000 */
4001 if (parent_counter) {
4002 sync_child_counter(child_counter, parent_counter);
4003 free_counter(child_counter);
4004 }
4005}
4006
4007/*
4008 * When a child task exits, feed back counter values to parent counters.
4009 */
4010void perf_counter_exit_task(struct task_struct *child)
4011{
4012 struct perf_counter *child_counter, *tmp;
4013 struct perf_counter_context *child_ctx;
4014 unsigned long flags;
4015
4016 if (likely(!child->perf_counter_ctxp))
4017 return;
4018
4019 local_irq_save(flags);
4020 /*
4021 * We can't reschedule here because interrupts are disabled,
4022 * and either child is current or it is a task that can't be
4023 * scheduled, so we are now safe from rescheduling changing
4024 * our context.
4025 */
4026 child_ctx = child->perf_counter_ctxp;
4027 __perf_counter_task_sched_out(child_ctx);
4028
4029 /*
4030 * Take the context lock here so that if find_get_context is
4031 * reading child->perf_counter_ctxp, we wait until it has
4032 * incremented the context's refcount before we do put_ctx below.
4033 */
4034 spin_lock(&child_ctx->lock);
4035 child->perf_counter_ctxp = NULL;
4036 if (child_ctx->parent_ctx) {
4037 /*
4038 * This context is a clone; unclone it so it can't get
4039 * swapped to another process while we're removing all
4040 * the counters from it.
4041 */
4042 put_ctx(child_ctx->parent_ctx);
4043 child_ctx->parent_ctx = NULL;
4044 }
4045 spin_unlock(&child_ctx->lock);
4046 local_irq_restore(flags);
4047
4048 /*
4049 * We can recurse on the same lock type through:
4050 *
4051 * __perf_counter_exit_task()
4052 * sync_child_counter()
4053 * fput(parent_counter->filp)
4054 * perf_release()
4055 * mutex_lock(&ctx->mutex)
4056 *
4057	 * But since it's the parent context it won't be the same instance.
4058 */
4059 mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
4060
4061again:
4062 list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
4063 list_entry)
4064 __perf_counter_exit_task(child_counter, child_ctx);
4065
4066 /*
4067 * If the last counter was a group counter, it will have appended all
4068 * its siblings to the list, but we obtained 'tmp' before that which
4069 * will still point to the list head terminating the iteration.
4070 */
4071 if (!list_empty(&child_ctx->counter_list))
4072 goto again;
4073
4074 mutex_unlock(&child_ctx->mutex);
4075
4076 put_ctx(child_ctx);
4077}
4078
4079/*
4080 * Free an unexposed, unused context, as created by inheritance by
4081 * perf_counter_init_task() below; used by fork() in case of failure.
4082 */
4083void perf_counter_free_task(struct task_struct *task)
4084{
4085 struct perf_counter_context *ctx = task->perf_counter_ctxp;
4086 struct perf_counter *counter, *tmp;
4087
4088 if (!ctx)
4089 return;
4090
4091 mutex_lock(&ctx->mutex);
4092again:
4093 list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) {
4094 struct perf_counter *parent = counter->parent;
4095
4096 if (WARN_ON_ONCE(!parent))
4097 continue;
4098
4099 mutex_lock(&parent->child_mutex);
4100 list_del_init(&counter->child_list);
4101 mutex_unlock(&parent->child_mutex);
4102
4103 fput(parent->filp);
4104
4105 list_del_counter(counter, ctx);
4106 free_counter(counter);
4107 }
4108
4109 if (!list_empty(&ctx->counter_list))
4110 goto again;
4111
4112 mutex_unlock(&ctx->mutex);
4113
4114 put_ctx(ctx);
4115}
4116
4117/*
4118 * Initialize the perf_counter context in task_struct
4119 */
4120int perf_counter_init_task(struct task_struct *child)
4121{
4122 struct perf_counter_context *child_ctx, *parent_ctx;
4123 struct perf_counter_context *cloned_ctx;
4124 struct perf_counter *counter;
4125 struct task_struct *parent = current;
4126 int inherited_all = 1;
4127 int ret = 0;
4128
4129 child->perf_counter_ctxp = NULL;
4130
4131 mutex_init(&child->perf_counter_mutex);
4132 INIT_LIST_HEAD(&child->perf_counter_list);
4133
4134 if (likely(!parent->perf_counter_ctxp))
4135 return 0;
4136
4137 /*
4138 * This is executed from the parent task context, so inherit
4139 * counters that have been marked for cloning.
4140 * First allocate and initialize a context for the child.
4141 */
4142
4143 child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
4144 if (!child_ctx)
4145 return -ENOMEM;
4146
4147 __perf_counter_init_context(child_ctx, child);
4148 child->perf_counter_ctxp = child_ctx;
4149 get_task_struct(child);
4150
4151 /*
4152 * If the parent's context is a clone, pin it so it won't get
4153 * swapped under us.
4154 */
4155 parent_ctx = perf_pin_task_context(parent);
4156
4157 /*
4158 * No need to check if parent_ctx != NULL here; since we saw
4159 * it non-NULL earlier, the only reason for it to become NULL
4160 * is if we exit, and since we're currently in the middle of
4161 * a fork we can't be exiting at the same time.
4162 */
4163
4164 /*
4165 * Lock the parent list. No need to lock the child - not PID
4166 * hashed yet and not running, so nobody can access it.
4167 */
4168 mutex_lock(&parent_ctx->mutex);
4169
4170 /*
4171	 * We don't have to disable NMIs - we are only looking at
4172 * the list, not manipulating it:
4173 */
4174 list_for_each_entry_rcu(counter, &parent_ctx->event_list, event_entry) {
4175 if (counter != counter->group_leader)
4176 continue;
4177
4178 if (!counter->attr.inherit) {
4179 inherited_all = 0;
4180 continue;
4181 }
4182
4183 ret = inherit_group(counter, parent, parent_ctx,
4184 child, child_ctx);
4185 if (ret) {
4186 inherited_all = 0;
4187 break;
4188 }
4189 }
4190
4191 if (inherited_all) {
4192 /*
4193 * Mark the child context as a clone of the parent
4194 * context, or of whatever the parent is a clone of.
4195 * Note that if the parent is a clone, it could get
4196 * uncloned at any point, but that doesn't matter
4197 * because the list of counters and the generation
4198 * count can't have changed since we took the mutex.
4199 */
4200 cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
4201 if (cloned_ctx) {
4202 child_ctx->parent_ctx = cloned_ctx;
4203 child_ctx->parent_gen = parent_ctx->parent_gen;
4204 } else {
4205 child_ctx->parent_ctx = parent_ctx;
4206 child_ctx->parent_gen = parent_ctx->generation;
4207 }
4208 get_ctx(child_ctx->parent_ctx);
4209 }
4210
4211 mutex_unlock(&parent_ctx->mutex);
4212
4213 perf_unpin_context(parent_ctx);
4214
4215 return ret;
4216}
4217
4218static void __cpuinit perf_counter_init_cpu(int cpu)
4219{
4220 struct perf_cpu_context *cpuctx;
4221
4222 cpuctx = &per_cpu(perf_cpu_context, cpu);
4223 __perf_counter_init_context(&cpuctx->ctx, NULL);
4224
4225 spin_lock(&perf_resource_lock);
4226 cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
4227 spin_unlock(&perf_resource_lock);
4228
4229 hw_perf_counter_setup(cpu);
4230}
4231
4232#ifdef CONFIG_HOTPLUG_CPU
4233static void __perf_counter_exit_cpu(void *info)
4234{
4235 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
4236 struct perf_counter_context *ctx = &cpuctx->ctx;
4237 struct perf_counter *counter, *tmp;
4238
4239 list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
4240 __perf_counter_remove_from_context(counter);
4241}
4242static void perf_counter_exit_cpu(int cpu)
4243{
4244 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
4245 struct perf_counter_context *ctx = &cpuctx->ctx;
4246
4247 mutex_lock(&ctx->mutex);
4248 smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
4249 mutex_unlock(&ctx->mutex);
4250}
4251#else
4252static inline void perf_counter_exit_cpu(int cpu) { }
4253#endif
4254
4255static int __cpuinit
4256perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
4257{
4258 unsigned int cpu = (long)hcpu;
4259
4260 switch (action) {
4261
4262 case CPU_UP_PREPARE:
4263 case CPU_UP_PREPARE_FROZEN:
4264 perf_counter_init_cpu(cpu);
4265 break;
4266
4267 case CPU_DOWN_PREPARE:
4268 case CPU_DOWN_PREPARE_FROZEN:
4269 perf_counter_exit_cpu(cpu);
4270 break;
4271
4272 default:
4273 break;
4274 }
4275
4276 return NOTIFY_OK;
4277}
4278
4279/*
4280 * This has to have a higher priority than migration_notifier in sched.c.
4281 */
4282static struct notifier_block __cpuinitdata perf_cpu_nb = {
4283 .notifier_call = perf_cpu_notify,
4284 .priority = 20,
4285};
4286
4287void __init perf_counter_init(void)
4288{
4289 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
4290 (void *)(long)smp_processor_id());
4291 register_cpu_notifier(&perf_cpu_nb);
4292}
4293
4294static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
4295{
4296 return sprintf(buf, "%d\n", perf_reserved_percpu);
4297}
4298
4299static ssize_t
4300perf_set_reserve_percpu(struct sysdev_class *class,
4301 const char *buf,
4302 size_t count)
4303{
4304 struct perf_cpu_context *cpuctx;
4305 unsigned long val;
4306 int err, cpu, mpt;
4307
4308 err = strict_strtoul(buf, 10, &val);
4309 if (err)
4310 return err;
4311 if (val > perf_max_counters)
4312 return -EINVAL;
4313
4314 spin_lock(&perf_resource_lock);
4315 perf_reserved_percpu = val;
4316 for_each_online_cpu(cpu) {
4317 cpuctx = &per_cpu(perf_cpu_context, cpu);
4318 spin_lock_irq(&cpuctx->ctx.lock);
4319 mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
4320 perf_max_counters - perf_reserved_percpu);
4321 cpuctx->max_pertask = mpt;
4322 spin_unlock_irq(&cpuctx->ctx.lock);
4323 }
4324 spin_unlock(&perf_resource_lock);
4325
4326 return count;
4327}
4328
4329static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
4330{
4331 return sprintf(buf, "%d\n", perf_overcommit);
4332}
4333
4334static ssize_t
4335perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
4336{
4337 unsigned long val;
4338 int err;
4339
4340 err = strict_strtoul(buf, 10, &val);
4341 if (err)
4342 return err;
4343 if (val > 1)
4344 return -EINVAL;
4345
4346 spin_lock(&perf_resource_lock);
4347 perf_overcommit = val;
4348 spin_unlock(&perf_resource_lock);
4349
4350 return count;
4351}
4352
4353static SYSDEV_CLASS_ATTR(
4354 reserve_percpu,
4355 0644,
4356 perf_show_reserve_percpu,
4357 perf_set_reserve_percpu
4358 );
4359
4360static SYSDEV_CLASS_ATTR(
4361 overcommit,
4362 0644,
4363 perf_show_overcommit,
4364 perf_set_overcommit
4365 );
4366
4367static struct attribute *perfclass_attrs[] = {
4368 &attr_reserve_percpu.attr,
4369 &attr_overcommit.attr,
4370 NULL
4371};
4372
4373static struct attribute_group perfclass_attr_group = {
4374 .attrs = perfclass_attrs,
4375 .name = "perf_counters",
4376};
4377
4378static int __init perf_counter_sysfs_init(void)
4379{
4380 return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
4381 &perfclass_attr_group);
4382}
4383device_initcall(perf_counter_sysfs_init);
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
new file mode 100644
index 000000000000..76ac4db405e9
--- /dev/null
+++ b/kernel/perf_event.c
@@ -0,0 +1,5000 @@
1/*
2 * Performance events core code:
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 *
9 * For licensing details see kernel-base/COPYING
10 */
11
12#include <linux/fs.h>
13#include <linux/mm.h>
14#include <linux/cpu.h>
15#include <linux/smp.h>
16#include <linux/file.h>
17#include <linux/poll.h>
18#include <linux/sysfs.h>
19#include <linux/dcache.h>
20#include <linux/percpu.h>
21#include <linux/ptrace.h>
22#include <linux/vmstat.h>
23#include <linux/hardirq.h>
24#include <linux/rculist.h>
25#include <linux/uaccess.h>
26#include <linux/syscalls.h>
27#include <linux/anon_inodes.h>
28#include <linux/kernel_stat.h>
29#include <linux/perf_event.h>
30
31#include <asm/irq_regs.h>
32
33/*
34 * Each CPU has a list of per CPU events:
35 */
36DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
37
38int perf_max_events __read_mostly = 1;
39static int perf_reserved_percpu __read_mostly;
40static int perf_overcommit __read_mostly = 1;
41
42static atomic_t nr_events __read_mostly;
43static atomic_t nr_mmap_events __read_mostly;
44static atomic_t nr_comm_events __read_mostly;
45static atomic_t nr_task_events __read_mostly;
46
47/*
48 * perf event paranoia level:
49 * -1 - not paranoid at all
50 * 0 - disallow raw tracepoint access for unpriv
51 * 1 - disallow cpu events for unpriv
52 * 2 - disallow kernel profiling for unpriv
53 */
54int sysctl_perf_event_paranoid __read_mostly = 1;
55
56static inline bool perf_paranoid_tracepoint_raw(void)
57{
58 return sysctl_perf_event_paranoid > -1;
59}
60
61static inline bool perf_paranoid_cpu(void)
62{
63 return sysctl_perf_event_paranoid > 0;
64}
65
66static inline bool perf_paranoid_kernel(void)
67{
68 return sysctl_perf_event_paranoid > 1;
69}
70
71int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
72
73/*
74 * max perf event sample rate
75 */
76int sysctl_perf_event_sample_rate __read_mostly = 100000;
77
78static atomic64_t perf_event_id;
79
80/*
81 * Lock for (sysadmin-configurable) event reservations:
82 */
83static DEFINE_SPINLOCK(perf_resource_lock);
84
85/*
86 * Architecture provided APIs - weak aliases:
87 */
88extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
89{
90 return NULL;
91}
92
93void __weak hw_perf_disable(void) { barrier(); }
94void __weak hw_perf_enable(void) { barrier(); }
95
96void __weak hw_perf_event_setup(int cpu) { barrier(); }
97void __weak hw_perf_event_setup_online(int cpu) { barrier(); }
98
99int __weak
100hw_perf_group_sched_in(struct perf_event *group_leader,
101 struct perf_cpu_context *cpuctx,
102 struct perf_event_context *ctx, int cpu)
103{
104 return 0;
105}
106
107void __weak perf_event_print_debug(void) { }
108
109static DEFINE_PER_CPU(int, perf_disable_count);
110
111void __perf_disable(void)
112{
113 __get_cpu_var(perf_disable_count)++;
114}
115
116bool __perf_enable(void)
117{
118 return !--__get_cpu_var(perf_disable_count);
119}
120
121void perf_disable(void)
122{
123 __perf_disable();
124 hw_perf_disable();
125}
126
127void perf_enable(void)
128{
129 if (__perf_enable())
130 hw_perf_enable();
131}
132
133static void get_ctx(struct perf_event_context *ctx)
134{
135 WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
136}
137
138static void free_ctx(struct rcu_head *head)
139{
140 struct perf_event_context *ctx;
141
142 ctx = container_of(head, struct perf_event_context, rcu_head);
143 kfree(ctx);
144}
145
146static void put_ctx(struct perf_event_context *ctx)
147{
148 if (atomic_dec_and_test(&ctx->refcount)) {
149 if (ctx->parent_ctx)
150 put_ctx(ctx->parent_ctx);
151 if (ctx->task)
152 put_task_struct(ctx->task);
153 call_rcu(&ctx->rcu_head, free_ctx);
154 }
155}
156
157static void unclone_ctx(struct perf_event_context *ctx)
158{
159 if (ctx->parent_ctx) {
160 put_ctx(ctx->parent_ctx);
161 ctx->parent_ctx = NULL;
162 }
163}
164
165/*
166 * If we inherit events we want to return the parent event id
167 * to userspace.
168 */
169static u64 primary_event_id(struct perf_event *event)
170{
171 u64 id = event->id;
172
173 if (event->parent)
174 id = event->parent->id;
175
176 return id;
177}
178
179/*
180 * Get the perf_event_context for a task and lock it.
181 * This has to cope with the fact that until it is locked,
182 * the context could get moved to another task.
183 */
184static struct perf_event_context *
185perf_lock_task_context(struct task_struct *task, unsigned long *flags)
186{
187 struct perf_event_context *ctx;
188
189 rcu_read_lock();
190 retry:
191 ctx = rcu_dereference(task->perf_event_ctxp);
192 if (ctx) {
193 /*
194 * If this context is a clone of another, it might
195 * get swapped for another underneath us by
196 * perf_event_task_sched_out, though the
197 * rcu_read_lock() protects us from any context
198 * getting freed. Lock the context and check if it
199 * got swapped before we could get the lock, and retry
200 * if so. If we locked the right context, then it
201 * can't get swapped on us any more.
202 */
203 spin_lock_irqsave(&ctx->lock, *flags);
204 if (ctx != rcu_dereference(task->perf_event_ctxp)) {
205 spin_unlock_irqrestore(&ctx->lock, *flags);
206 goto retry;
207 }
208
209 if (!atomic_inc_not_zero(&ctx->refcount)) {
210 spin_unlock_irqrestore(&ctx->lock, *flags);
211 ctx = NULL;
212 }
213 }
214 rcu_read_unlock();
215 return ctx;
216}
217
218/*
219 * Get the context for a task and increment its pin_count so it
220 * can't get swapped to another task. This also increments its
221 * reference count so that the context can't get freed.
222 */
223static struct perf_event_context *perf_pin_task_context(struct task_struct *task)
224{
225 struct perf_event_context *ctx;
226 unsigned long flags;
227
228 ctx = perf_lock_task_context(task, &flags);
229 if (ctx) {
230 ++ctx->pin_count;
231 spin_unlock_irqrestore(&ctx->lock, flags);
232 }
233 return ctx;
234}
235
236static void perf_unpin_context(struct perf_event_context *ctx)
237{
238 unsigned long flags;
239
240 spin_lock_irqsave(&ctx->lock, flags);
241 --ctx->pin_count;
242 spin_unlock_irqrestore(&ctx->lock, flags);
243 put_ctx(ctx);
244}
245
246/*
247 * Add an event to the lists for its context.
248 * Must be called with ctx->mutex and ctx->lock held.
249 */
250static void
251list_add_event(struct perf_event *event, struct perf_event_context *ctx)
252{
253 struct perf_event *group_leader = event->group_leader;
254
255 /*
256 * Depending on whether it is a standalone or sibling event,
257 * add it straight to the context's event list, or to the group
258 * leader's sibling list:
259 */
260 if (group_leader == event)
261 list_add_tail(&event->group_entry, &ctx->group_list);
262 else {
263 list_add_tail(&event->group_entry, &group_leader->sibling_list);
264 group_leader->nr_siblings++;
265 }
266
267 list_add_rcu(&event->event_entry, &ctx->event_list);
268 ctx->nr_events++;
269 if (event->attr.inherit_stat)
270 ctx->nr_stat++;
271}
272
273/*
274 * Remove an event from the lists for its context.
275 * Must be called with ctx->mutex and ctx->lock held.
276 */
277static void
278list_del_event(struct perf_event *event, struct perf_event_context *ctx)
279{
280 struct perf_event *sibling, *tmp;
281
282 if (list_empty(&event->group_entry))
283 return;
284 ctx->nr_events--;
285 if (event->attr.inherit_stat)
286 ctx->nr_stat--;
287
288 list_del_init(&event->group_entry);
289 list_del_rcu(&event->event_entry);
290
291 if (event->group_leader != event)
292 event->group_leader->nr_siblings--;
293
294 /*
295 * If this was a group event with sibling events then
296 * upgrade the siblings to singleton events by adding them
297 * to the context list directly:
298 */
299 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
300
301 list_move_tail(&sibling->group_entry, &ctx->group_list);
302 sibling->group_leader = sibling;
303 }
304}
305
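/*
 * Take an ACTIVE event off the PMU: record the stop time, ask the
 * pmu to disable it, drop it back to INACTIVE (or OFF if a disable
 * was pending) and update the cpu/context active bookkeeping.
 */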
306static void
307event_sched_out(struct perf_event *event,
308 struct perf_cpu_context *cpuctx,
309 struct perf_event_context *ctx)
310{
311 if (event->state != PERF_EVENT_STATE_ACTIVE)
312 return;
313
314 event->state = PERF_EVENT_STATE_INACTIVE;
315 if (event->pending_disable) {
316 event->pending_disable = 0;
317 event->state = PERF_EVENT_STATE_OFF;
318 }
319 event->tstamp_stopped = ctx->time;
320 event->pmu->disable(event);
321 event->oncpu = -1;
322
323 if (!is_software_event(event))
324 cpuctx->active_oncpu--;
325 ctx->nr_active--;
326 if (event->attr.exclusive || !cpuctx->active_oncpu)
327 cpuctx->exclusive = 0;
328}
329
330static void
331group_sched_out(struct perf_event *group_event,
332 struct perf_cpu_context *cpuctx,
333 struct perf_event_context *ctx)
334{
335 struct perf_event *event;
336
337 if (group_event->state != PERF_EVENT_STATE_ACTIVE)
338 return;
339
340 event_sched_out(group_event, cpuctx, ctx);
341
342 /*
343 * Schedule out siblings (if any):
344 */
345 list_for_each_entry(event, &group_event->sibling_list, group_entry)
346 event_sched_out(event, cpuctx, ctx);
347
348 if (group_event->attr.exclusive)
349 cpuctx->exclusive = 0;
350}
351
352/*
353 * Cross CPU call to remove a performance event
354 *
355 * We disable the event on the hardware level first. After that we
356 * remove it from the context list.
357 */
358static void __perf_event_remove_from_context(void *info)
359{
360 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
361 struct perf_event *event = info;
362 struct perf_event_context *ctx = event->ctx;
363
364 /*
365 * If this is a task context, we need to check whether it is
366 * the current task context of this cpu. If not it has been
367 * scheduled out before the smp call arrived.
368 */
369 if (ctx->task && cpuctx->task_ctx != ctx)
370 return;
371
372 spin_lock(&ctx->lock);
373 /*
374 * Protect the list operation against NMI by disabling the
375 * events on a global level.
376 */
377 perf_disable();
378
379 event_sched_out(event, cpuctx, ctx);
380
381 list_del_event(event, ctx);
382
383 if (!ctx->task) {
384 /*
385 * Allow more per task events with respect to the
386 * reservation:
387 */
388 cpuctx->max_pertask =
389 min(perf_max_events - ctx->nr_events,
390 perf_max_events - perf_reserved_percpu);
391 }
392
393 perf_enable();
394 spin_unlock(&ctx->lock);
395}
396
397
398/*
399 * Remove the event from a task's (or a CPU's) list of events.
400 *
401 * Must be called with ctx->mutex held.
402 *
403 * CPU events are removed with a smp call. For task events we only
404 * call when the task is on a CPU.
405 *
406 * If event->ctx is a cloned context, callers must make sure that
407 * every task struct that event->ctx->task could possibly point to
408 * remains valid. This is OK when called from perf_release since
409 * that only calls us on the top-level context, which can't be a clone.
410 * When called from perf_event_exit_task, it's OK because the
411 * context has been detached from its task.
412 */
413static void perf_event_remove_from_context(struct perf_event *event)
414{
415 struct perf_event_context *ctx = event->ctx;
416 struct task_struct *task = ctx->task;
417
418 if (!task) {
419 /*
420 * Per cpu events are removed via an smp call and
421 * the removal is always successful.
422 */
423 smp_call_function_single(event->cpu,
424 __perf_event_remove_from_context,
425 event, 1);
426 return;
427 }
428
429retry:
430 task_oncpu_function_call(task, __perf_event_remove_from_context,
431 event);
432
433 spin_lock_irq(&ctx->lock);
434 /*
435 * If the context is active we need to retry the smp call.
436 */
437 if (ctx->nr_active && !list_empty(&event->group_entry)) {
438 spin_unlock_irq(&ctx->lock);
439 goto retry;
440 }
441
442 /*
443 * The lock prevents this context from being scheduled in, so we
444 * can remove the event safely if the call above did not
445 * succeed.
446 */
447 if (!list_empty(&event->group_entry)) {
448 list_del_event(event, ctx);
449 }
450 spin_unlock_irq(&ctx->lock);
451}
452
453static inline u64 perf_clock(void)
454{
455 return cpu_clock(smp_processor_id());
456}
457
458/*
459 * Update the record of the current time in a context.
460 */
461static void update_context_time(struct perf_event_context *ctx)
462{
463 u64 now = perf_clock();
464
465 ctx->time += now - ctx->timestamp;
466 ctx->timestamp = now;
467}
468
469/*
470 * Update the total_time_enabled and total_time_running fields for an event.
471 */
472static void update_event_times(struct perf_event *event)
473{
474 struct perf_event_context *ctx = event->ctx;
475 u64 run_end;
476
477 if (event->state < PERF_EVENT_STATE_INACTIVE ||
478 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
479 return;
480
481 event->total_time_enabled = ctx->time - event->tstamp_enabled;
482
483 if (event->state == PERF_EVENT_STATE_INACTIVE)
484 run_end = event->tstamp_stopped;
485 else
486 run_end = ctx->time;
487
488 event->total_time_running = run_end - event->tstamp_running;
489}
490
491/*
492 * Update total_time_enabled and total_time_running for all events in a group.
493 */
494static void update_group_times(struct perf_event *leader)
495{
496 struct perf_event *event;
497
498 update_event_times(leader);
499 list_for_each_entry(event, &leader->sibling_list, group_entry)
500 update_event_times(event);
501}
502
503/*
504 * Cross CPU call to disable a performance event
505 */
506static void __perf_event_disable(void *info)
507{
508 struct perf_event *event = info;
509 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
510 struct perf_event_context *ctx = event->ctx;
511
512 /*
513 * If this is a per-task event, we need to check whether this
514 * event's task is the current task on this cpu.
515 */
516 if (ctx->task && cpuctx->task_ctx != ctx)
517 return;
518
519 spin_lock(&ctx->lock);
520
521 /*
522 * If the event is on, turn it off.
523 * If it is in error state, leave it in error state.
524 */
525 if (event->state >= PERF_EVENT_STATE_INACTIVE) {
526 update_context_time(ctx);
527 update_group_times(event);
528 if (event == event->group_leader)
529 group_sched_out(event, cpuctx, ctx);
530 else
531 event_sched_out(event, cpuctx, ctx);
532 event->state = PERF_EVENT_STATE_OFF;
533 }
534
535 spin_unlock(&ctx->lock);
536}
537
538/*
539 * Disable an event.
540 *
541 * If event->ctx is a cloned context, callers must make sure that
542 * every task struct that event->ctx->task could possibly point to
543 * remains valid. This condition is satisfied when called through
544 * perf_event_for_each_child or perf_event_for_each because they
545 * hold the top-level event's child_mutex, so any descendant that
546 * goes to exit will block in sync_child_event.
547 * When called from perf_pending_event it's OK because event->ctx
548 * is the current context on this CPU and preemption is disabled,
549 * hence we can't get into perf_event_task_sched_out for this context.
550 */
551static void perf_event_disable(struct perf_event *event)
552{
553 struct perf_event_context *ctx = event->ctx;
554 struct task_struct *task = ctx->task;
555
556 if (!task) {
557 /*
558 * Disable the event on the cpu that it's on
559 */
560 smp_call_function_single(event->cpu, __perf_event_disable,
561 event, 1);
562 return;
563 }
564
565 retry:
566 task_oncpu_function_call(task, __perf_event_disable, event);
567
568 spin_lock_irq(&ctx->lock);
569 /*
570 * If the event is still active, we need to retry the cross-call.
571 */
572 if (event->state == PERF_EVENT_STATE_ACTIVE) {
573 spin_unlock_irq(&ctx->lock);
574 goto retry;
575 }
576
577 /*
578 * Since we have the lock this context can't be scheduled
579 * in, so we can change the state safely.
580 */
581 if (event->state == PERF_EVENT_STATE_INACTIVE) {
582 update_group_times(event);
583 event->state = PERF_EVENT_STATE_OFF;
584 }
585
586 spin_unlock_irq(&ctx->lock);
587}
588
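/*
 * Try to put an event on the PMU for the given cpu: mark it ACTIVE,
 * make that state visible before calling pmu->enable(), and roll it
 * back to INACTIVE with -EAGAIN if the pmu refuses to take it.
 */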
589static int
590event_sched_in(struct perf_event *event,
591 struct perf_cpu_context *cpuctx,
592 struct perf_event_context *ctx,
593 int cpu)
594{
595 if (event->state <= PERF_EVENT_STATE_OFF)
596 return 0;
597
598 event->state = PERF_EVENT_STATE_ACTIVE;
599 event->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
600 /*
601 * The new state must be visible before we turn it on in the hardware:
602 */
603 smp_wmb();
604
605 if (event->pmu->enable(event)) {
606 event->state = PERF_EVENT_STATE_INACTIVE;
607 event->oncpu = -1;
608 return -EAGAIN;
609 }
610
611 event->tstamp_running += ctx->time - event->tstamp_stopped;
612
613 if (!is_software_event(event))
614 cpuctx->active_oncpu++;
615 ctx->nr_active++;
616
617 if (event->attr.exclusive)
618 cpuctx->exclusive = 1;
619
620 return 0;
621}
622
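/*
 * Schedule a whole group on as a unit: first offer it to the arch
 * code via hw_perf_group_sched_in(); if that declines, schedule the
 * leader and each sibling individually and unwind the lot on any
 * partial failure.
 */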
623static int
624group_sched_in(struct perf_event *group_event,
625 struct perf_cpu_context *cpuctx,
626 struct perf_event_context *ctx,
627 int cpu)
628{
629 struct perf_event *event, *partial_group;
630 int ret;
631
632 if (group_event->state == PERF_EVENT_STATE_OFF)
633 return 0;
634
635 ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu);
636 if (ret)
637 return ret < 0 ? ret : 0;
638
639 if (event_sched_in(group_event, cpuctx, ctx, cpu))
640 return -EAGAIN;
641
642 /*
643 * Schedule in siblings as one group (if any):
644 */
645 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
646 if (event_sched_in(event, cpuctx, ctx, cpu)) {
647 partial_group = event;
648 goto group_error;
649 }
650 }
651
652 return 0;
653
654group_error:
655 /*
656 * Groups can be scheduled in as one unit only, so undo any
657 * partial group before returning:
658 */
659 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
660 if (event == partial_group)
661 break;
662 event_sched_out(event, cpuctx, ctx);
663 }
664 event_sched_out(group_event, cpuctx, ctx);
665
666 return -EAGAIN;
667}
668
669/*
670 * Return 1 for a group consisting entirely of software events,
671 * 0 if the group contains any hardware events.
672 */
673static int is_software_only_group(struct perf_event *leader)
674{
675 struct perf_event *event;
676
677 if (!is_software_event(leader))
678 return 0;
679
680 list_for_each_entry(event, &leader->sibling_list, group_entry)
681 if (!is_software_event(event))
682 return 0;
683
684 return 1;
685}
686
687/*
688 * Work out whether we can put this event group on the CPU now.
689 */
690static int group_can_go_on(struct perf_event *event,
691 struct perf_cpu_context *cpuctx,
692 int can_add_hw)
693{
694 /*
695 * Groups consisting entirely of software events can always go on.
696 */
697 if (is_software_only_group(event))
698 return 1;
699 /*
700 * If an exclusive group is already on, no other hardware
701 * events can go on.
702 */
703 if (cpuctx->exclusive)
704 return 0;
705 /*
706 * If this group is exclusive and there are already
707 * events on the CPU, it can't go on.
708 */
709 if (event->attr.exclusive && cpuctx->active_oncpu)
710 return 0;
711 /*
712 * Otherwise, try to add it if all previous groups were able
713 * to go on.
714 */
715 return can_add_hw;
716}
717
718static void add_event_to_ctx(struct perf_event *event,
719 struct perf_event_context *ctx)
720{
721 list_add_event(event, ctx);
722 event->tstamp_enabled = ctx->time;
723 event->tstamp_running = ctx->time;
724 event->tstamp_stopped = ctx->time;
725}
726
727/*
728 * Cross CPU call to install and enable a performance event
729 *
730 * Must be called with ctx->mutex held
731 */
732static void __perf_install_in_context(void *info)
733{
734 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
735 struct perf_event *event = info;
736 struct perf_event_context *ctx = event->ctx;
737 struct perf_event *leader = event->group_leader;
738 int cpu = smp_processor_id();
739 int err;
740
741 /*
742 * If this is a task context, we need to check whether it is
743 * the current task context of this cpu. If not it has been
744 * scheduled out before the smp call arrived.
745 * Or possibly this is the right context but it isn't
746 * on this cpu because it had no events.
747 */
748 if (ctx->task && cpuctx->task_ctx != ctx) {
749 if (cpuctx->task_ctx || ctx->task != current)
750 return;
751 cpuctx->task_ctx = ctx;
752 }
753
754 spin_lock(&ctx->lock);
755 ctx->is_active = 1;
756 update_context_time(ctx);
757
758 /*
759 * Protect the list operation against NMI by disabling the
760 * events on a global level. NOP for non NMI based events.
761 */
762 perf_disable();
763
764 add_event_to_ctx(event, ctx);
765
766 /*
767 * Don't put the event on if it is disabled or if
768 * it is in a group and the group isn't on.
769 */
770 if (event->state != PERF_EVENT_STATE_INACTIVE ||
771 (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE))
772 goto unlock;
773
774 /*
775 * An exclusive event can't go on if there are already active
776 * hardware events, and no hardware event can go on if there
777 * is already an exclusive event on.
778 */
779 if (!group_can_go_on(event, cpuctx, 1))
780 err = -EEXIST;
781 else
782 err = event_sched_in(event, cpuctx, ctx, cpu);
783
784 if (err) {
785 /*
786 * This event couldn't go on. If it is in a group
787 * then we have to pull the whole group off.
788 * If the event group is pinned then put it in error state.
789 */
790 if (leader != event)
791 group_sched_out(leader, cpuctx, ctx);
792 if (leader->attr.pinned) {
793 update_group_times(leader);
794 leader->state = PERF_EVENT_STATE_ERROR;
795 }
796 }
797
798 if (!err && !ctx->task && cpuctx->max_pertask)
799 cpuctx->max_pertask--;
800
801 unlock:
802 perf_enable();
803
804 spin_unlock(&ctx->lock);
805}
806
807/*
808 * Attach a performance event to a context
809 *
810 * First we add the event to the list with the hardware enable bit
811 * in event->hw_config cleared.
812 *
813 * If the event is attached to a task which is on a CPU we use a smp
814 * call to enable it in the task context. The task might have been
815 * scheduled away, but we check this in the smp call again.
816 *
817 * Must be called with ctx->mutex held.
818 */
819static void
820perf_install_in_context(struct perf_event_context *ctx,
821 struct perf_event *event,
822 int cpu)
823{
824 struct task_struct *task = ctx->task;
825
826 if (!task) {
827 /*
828 * Per cpu events are installed via an smp call and
829 * the install is always successful.
830 */
831 smp_call_function_single(cpu, __perf_install_in_context,
832 event, 1);
833 return;
834 }
835
836retry:
837 task_oncpu_function_call(task, __perf_install_in_context,
838 event);
839
840 spin_lock_irq(&ctx->lock);
841 /*
842 * If the context is active we need to retry the smp call.
843 */
844 if (ctx->is_active && list_empty(&event->group_entry)) {
845 spin_unlock_irq(&ctx->lock);
846 goto retry;
847 }
848
849 /*
850 * The lock prevents this context from being scheduled in, so we
851 * can add the event safely if the call above did not
852 * succeed.
853 */
854 if (list_empty(&event->group_entry))
855 add_event_to_ctx(event, ctx);
856 spin_unlock_irq(&ctx->lock);
857}
858
859/*
860 * Put an event into inactive state and update time fields.
861 * Enabling the leader of a group effectively enables all
862 * the group members that aren't explicitly disabled, so we
863 * have to update their ->tstamp_enabled also.
864 * Note: this works for group members as well as group leaders
865 * since the non-leader members' sibling_lists will be empty.
866 */
867static void __perf_event_mark_enabled(struct perf_event *event,
868 struct perf_event_context *ctx)
869{
870 struct perf_event *sub;
871
872 event->state = PERF_EVENT_STATE_INACTIVE;
873 event->tstamp_enabled = ctx->time - event->total_time_enabled;
874 list_for_each_entry(sub, &event->sibling_list, group_entry)
875 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
876 sub->tstamp_enabled =
877 ctx->time - sub->total_time_enabled;
878}
879
880/*
881 * Cross CPU call to enable a performance event
882 */
883static void __perf_event_enable(void *info)
884{
885 struct perf_event *event = info;
886 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
887 struct perf_event_context *ctx = event->ctx;
888 struct perf_event *leader = event->group_leader;
889 int err;
890
891 /*
892 * If this is a per-task event, need to check whether this
893 * event's task is the current task on this cpu.
894 */
895 if (ctx->task && cpuctx->task_ctx != ctx) {
896 if (cpuctx->task_ctx || ctx->task != current)
897 return;
898 cpuctx->task_ctx = ctx;
899 }
900
901 spin_lock(&ctx->lock);
902 ctx->is_active = 1;
903 update_context_time(ctx);
904
905 if (event->state >= PERF_EVENT_STATE_INACTIVE)
906 goto unlock;
907 __perf_event_mark_enabled(event, ctx);
908
909 /*
910 * If the event is in a group and isn't the group leader,
911 * then don't put it on unless the group is on.
912 */
913 if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
914 goto unlock;
915
916 if (!group_can_go_on(event, cpuctx, 1)) {
917 err = -EEXIST;
918 } else {
919 perf_disable();
920 if (event == leader)
921 err = group_sched_in(event, cpuctx, ctx,
922 smp_processor_id());
923 else
924 err = event_sched_in(event, cpuctx, ctx,
925 smp_processor_id());
926 perf_enable();
927 }
928
929 if (err) {
930 /*
931 * If this event can't go on and it's part of a
932 * group, then the whole group has to come off.
933 */
934 if (leader != event)
935 group_sched_out(leader, cpuctx, ctx);
936 if (leader->attr.pinned) {
937 update_group_times(leader);
938 leader->state = PERF_EVENT_STATE_ERROR;
939 }
940 }
941
942 unlock:
943 spin_unlock(&ctx->lock);
944}
945
946/*
947 * Enable an event.
948 *
949 * If event->ctx is a cloned context, callers must make sure that
950 * every task struct that event->ctx->task could possibly point to
951 * remains valid. This condition is satisfied when called through
952 * perf_event_for_each_child or perf_event_for_each as described
953 * for perf_event_disable.
954 */
955static void perf_event_enable(struct perf_event *event)
956{
957 struct perf_event_context *ctx = event->ctx;
958 struct task_struct *task = ctx->task;
959
960 if (!task) {
961 /*
962 * Enable the event on the cpu that it's on
963 */
964 smp_call_function_single(event->cpu, __perf_event_enable,
965 event, 1);
966 return;
967 }
968
969 spin_lock_irq(&ctx->lock);
970 if (event->state >= PERF_EVENT_STATE_INACTIVE)
971 goto out;
972
973 /*
974 * If the event is in error state, clear that first.
975 * That way, if we see the event in error state below, we
976 * know that it has gone back into error state, as distinct
977 * from the task having been scheduled away before the
978 * cross-call arrived.
979 */
980 if (event->state == PERF_EVENT_STATE_ERROR)
981 event->state = PERF_EVENT_STATE_OFF;
982
983 retry:
984 spin_unlock_irq(&ctx->lock);
985 task_oncpu_function_call(task, __perf_event_enable, event);
986
987 spin_lock_irq(&ctx->lock);
988
989 /*
990 * If the context is active and the event is still off,
991 * we need to retry the cross-call.
992 */
993 if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF)
994 goto retry;
995
996 /*
997 * Since we have the lock this context can't be scheduled
998 * in, so we can change the state safely.
999 */
1000 if (event->state == PERF_EVENT_STATE_OFF)
1001 __perf_event_mark_enabled(event, ctx);
1002
1003 out:
1004 spin_unlock_irq(&ctx->lock);
1005}
1006
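/*
 * Re-arm an event for a bounded number of overflows
 * (PERF_EVENT_IOC_REFRESH): 'refresh' is added to event_limit, which
 * the overflow handler counts down, disabling the event again when
 * it hits zero.  Not supported on inherited events.
 */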
1007static int perf_event_refresh(struct perf_event *event, int refresh)
1008{
1009 /*
1010 * not supported on inherited events
1011 */
1012 if (event->attr.inherit)
1013 return -EINVAL;
1014
1015 atomic_add(refresh, &event->event_limit);
1016 perf_event_enable(event);
1017
1018 return 0;
1019}
1020
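/*
 * Unschedule every active event (groups and singletons) in @ctx from
 * the PMU.  Used on context switch, from the timer tick and when
 * enable-on-exec processing needs a quiescent context.
 */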
1021void __perf_event_sched_out(struct perf_event_context *ctx,
1022 struct perf_cpu_context *cpuctx)
1023{
1024 struct perf_event *event;
1025
1026 spin_lock(&ctx->lock);
1027 ctx->is_active = 0;
1028 if (likely(!ctx->nr_events))
1029 goto out;
1030 update_context_time(ctx);
1031
1032 perf_disable();
1033 if (ctx->nr_active) {
1034 list_for_each_entry(event, &ctx->group_list, group_entry) {
1035 if (event != event->group_leader)
1036 event_sched_out(event, cpuctx, ctx);
1037 else
1038 group_sched_out(event, cpuctx, ctx);
1039 }
1040 }
1041 perf_enable();
1042 out:
1043 spin_unlock(&ctx->lock);
1044}
1045
1046/*
1047 * Test whether two contexts are equivalent, i.e. whether they
1048 * have both been cloned from the same version of the same context
1049 * and they both have the same number of enabled events.
1050 * If the number of enabled events is the same, then the set
1051 * of enabled events should be the same, because these are both
1052 * inherited contexts, therefore we can't access individual events
1053 * in them directly with an fd; we can only enable/disable all
1054 * events via prctl, or enable/disable all events in a family
1055 * via ioctl, which will have the same effect on both contexts.
1056 */
1057static int context_equiv(struct perf_event_context *ctx1,
1058 struct perf_event_context *ctx2)
1059{
1060 return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
1061 && ctx1->parent_gen == ctx2->parent_gen
1062 && !ctx1->pin_count && !ctx2->pin_count;
1063}
1064
1065static void __perf_event_read(void *event);
1066
1067static void __perf_event_sync_stat(struct perf_event *event,
1068 struct perf_event *next_event)
1069{
1070 u64 value;
1071
1072 if (!event->attr.inherit_stat)
1073 return;
1074
1075 /*
1076 * Update the event value, we cannot use perf_event_read()
1077 * because we're in the middle of a context switch and have IRQs
1078 * disabled, which upsets smp_call_function_single(), however
1079 * we know the event must be on the current CPU, therefore we
1080 * don't need to use it.
1081 */
1082 switch (event->state) {
1083 case PERF_EVENT_STATE_ACTIVE:
1084 __perf_event_read(event);
1085 break;
1086
1087 case PERF_EVENT_STATE_INACTIVE:
1088 update_event_times(event);
1089 break;
1090
1091 default:
1092 break;
1093 }
1094
1095 /*
1096 * In order to keep per-task stats reliable we need to flip the event
1097 * values when we flip the contexts.
1098 */
1099 value = atomic64_read(&next_event->count);
1100 value = atomic64_xchg(&event->count, value);
1101 atomic64_set(&next_event->count, value);
1102
1103 swap(event->total_time_enabled, next_event->total_time_enabled);
1104 swap(event->total_time_running, next_event->total_time_running);
1105
1106 /*
1107 * Since we swizzled the values, update the user visible data too.
1108 */
1109 perf_event_update_userpage(event);
1110 perf_event_update_userpage(next_event);
1111}
1112
1113#define list_next_entry(pos, member) \
1114 list_entry(pos->member.next, typeof(*pos), member)
1115
1116static void perf_event_sync_stat(struct perf_event_context *ctx,
1117 struct perf_event_context *next_ctx)
1118{
1119 struct perf_event *event, *next_event;
1120
1121 if (!ctx->nr_stat)
1122 return;
1123
1124 event = list_first_entry(&ctx->event_list,
1125 struct perf_event, event_entry);
1126
1127 next_event = list_first_entry(&next_ctx->event_list,
1128 struct perf_event, event_entry);
1129
1130 while (&event->event_entry != &ctx->event_list &&
1131 &next_event->event_entry != &next_ctx->event_list) {
1132
1133 __perf_event_sync_stat(event, next_event);
1134
1135 event = list_next_entry(event, event_entry);
1136 next_event = list_next_entry(next_event, event_entry);
1137 }
1138}
1139
1140/*
1141 * Called from scheduler to remove the events of the current task,
1142 * with interrupts disabled.
1143 *
1144 * We stop each event and update the event value in event->count.
1145 *
1146 * This does not protect us against NMI, but disable()
1147 * sets the disabled bit in the control field of event _before_
1148 * accessing the event control register. If an NMI hits, then it will
1149 * not restart the event.
1150 */
1151void perf_event_task_sched_out(struct task_struct *task,
1152 struct task_struct *next, int cpu)
1153{
1154 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1155 struct perf_event_context *ctx = task->perf_event_ctxp;
1156 struct perf_event_context *next_ctx;
1157 struct perf_event_context *parent;
1158 struct pt_regs *regs;
1159 int do_switch = 1;
1160
1161 regs = task_pt_regs(task);
1162 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
1163
1164 if (likely(!ctx || !cpuctx->task_ctx))
1165 return;
1166
1167 update_context_time(ctx);
1168
1169 rcu_read_lock();
1170 parent = rcu_dereference(ctx->parent_ctx);
1171 next_ctx = next->perf_event_ctxp;
1172 if (parent && next_ctx &&
1173 rcu_dereference(next_ctx->parent_ctx) == parent) {
1174 /*
1175 * Looks like the two contexts are clones, so we might be
1176 * able to optimize the context switch. We lock both
1177 * contexts and check that they are clones under the
1178 * lock (including re-checking that neither has been
1179 * uncloned in the meantime). It doesn't matter which
1180 * order we take the locks because no other cpu could
1181 * be trying to lock both of these tasks.
1182 */
1183 spin_lock(&ctx->lock);
1184 spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
1185 if (context_equiv(ctx, next_ctx)) {
1186 /*
1187 * XXX do we need a memory barrier of sorts
1188 * wrt to rcu_dereference() of perf_event_ctxp
1189 */
1190 task->perf_event_ctxp = next_ctx;
1191 next->perf_event_ctxp = ctx;
1192 ctx->task = next;
1193 next_ctx->task = task;
1194 do_switch = 0;
1195
1196 perf_event_sync_stat(ctx, next_ctx);
1197 }
1198 spin_unlock(&next_ctx->lock);
1199 spin_unlock(&ctx->lock);
1200 }
1201 rcu_read_unlock();
1202
1203 if (do_switch) {
1204 __perf_event_sched_out(ctx, cpuctx);
1205 cpuctx->task_ctx = NULL;
1206 }
1207}
1208
1209/*
1210 * Called with IRQs disabled
1211 */
1212static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1213{
1214 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1215
1216 if (!cpuctx->task_ctx)
1217 return;
1218
1219 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1220 return;
1221
1222 __perf_event_sched_out(ctx, cpuctx);
1223 cpuctx->task_ctx = NULL;
1224}
1225
1226/*
1227 * Called with IRQs disabled
1228 */
1229static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx)
1230{
1231 __perf_event_sched_out(&cpuctx->ctx, cpuctx);
1232}
1233
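/*
 * Schedule a context's events onto the PMU in two passes: pinned
 * groups first (they go into ERROR state if they cannot get on), then
 * the remaining groups, honouring the per-event cpu filter and
 * stopping once the hardware is full.
 */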
1234static void
1235__perf_event_sched_in(struct perf_event_context *ctx,
1236 struct perf_cpu_context *cpuctx, int cpu)
1237{
1238 struct perf_event *event;
1239 int can_add_hw = 1;
1240
1241 spin_lock(&ctx->lock);
1242 ctx->is_active = 1;
1243 if (likely(!ctx->nr_events))
1244 goto out;
1245
1246 ctx->timestamp = perf_clock();
1247
1248 perf_disable();
1249
1250 /*
1251 * First go through the list and put on any pinned groups
1252 * in order to give them the best chance of going on.
1253 */
1254 list_for_each_entry(event, &ctx->group_list, group_entry) {
1255 if (event->state <= PERF_EVENT_STATE_OFF ||
1256 !event->attr.pinned)
1257 continue;
1258 if (event->cpu != -1 && event->cpu != cpu)
1259 continue;
1260
1261 if (event != event->group_leader)
1262 event_sched_in(event, cpuctx, ctx, cpu);
1263 else {
1264 if (group_can_go_on(event, cpuctx, 1))
1265 group_sched_in(event, cpuctx, ctx, cpu);
1266 }
1267
1268 /*
1269 * If this pinned group hasn't been scheduled,
1270 * put it in error state.
1271 */
1272 if (event->state == PERF_EVENT_STATE_INACTIVE) {
1273 update_group_times(event);
1274 event->state = PERF_EVENT_STATE_ERROR;
1275 }
1276 }
1277
1278 list_for_each_entry(event, &ctx->group_list, group_entry) {
1279 /*
1280 * Ignore events in OFF or ERROR state, and
1281 * ignore pinned events since we did them already.
1282 */
1283 if (event->state <= PERF_EVENT_STATE_OFF ||
1284 event->attr.pinned)
1285 continue;
1286
1287 /*
1288 * Listen to the 'cpu' scheduling filter constraint
1289 * of events:
1290 */
1291 if (event->cpu != -1 && event->cpu != cpu)
1292 continue;
1293
1294 if (event != event->group_leader) {
1295 if (event_sched_in(event, cpuctx, ctx, cpu))
1296 can_add_hw = 0;
1297 } else {
1298 if (group_can_go_on(event, cpuctx, can_add_hw)) {
1299 if (group_sched_in(event, cpuctx, ctx, cpu))
1300 can_add_hw = 0;
1301 }
1302 }
1303 }
1304 perf_enable();
1305 out:
1306 spin_unlock(&ctx->lock);
1307}
1308
1309/*
1310 * Called from scheduler to add the events of the current task
1311 * with interrupts disabled.
1312 *
1313 * We restore the event value and then enable it.
1314 *
1315 * This does not protect us against NMI, but enable()
1316 * sets the enabled bit in the control field of event _before_
1317 * accessing the event control register. If an NMI hits, then it will
1318 * keep the event running.
1319 */
1320void perf_event_task_sched_in(struct task_struct *task, int cpu)
1321{
1322 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1323 struct perf_event_context *ctx = task->perf_event_ctxp;
1324
1325 if (likely(!ctx))
1326 return;
1327 if (cpuctx->task_ctx == ctx)
1328 return;
1329 __perf_event_sched_in(ctx, cpuctx, cpu);
1330 cpuctx->task_ctx = ctx;
1331}
1332
1333static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
1334{
1335 struct perf_event_context *ctx = &cpuctx->ctx;
1336
1337 __perf_event_sched_in(ctx, cpuctx, cpu);
1338}
1339
1340#define MAX_INTERRUPTS (~0ULL)
1341
1342static void perf_log_throttle(struct perf_event *event, int enable);
1343
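/*
 * Move hwc->sample_period towards the value needed to hit
 * attr.sample_freq.  'events' is the estimated overflow rate, so
 * events * sample_period approximates the event rate, and dividing
 * that by sample_freq gives the period that would produce the
 * requested number of samples per second.  Only 1/8th of the
 * correction is applied per invocation ((delta + 7) / 8), which acts
 * as a low-pass filter so the period converges without oscillating.
 */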
1344static void perf_adjust_period(struct perf_event *event, u64 events)
1345{
1346 struct hw_perf_event *hwc = &event->hw;
1347 u64 period, sample_period;
1348 s64 delta;
1349
1350 events *= hwc->sample_period;
1351 period = div64_u64(events, event->attr.sample_freq);
1352
1353 delta = (s64)(period - hwc->sample_period);
1354 delta = (delta + 7) / 8; /* low pass filter */
1355
1356 sample_period = hwc->sample_period + delta;
1357
1358 if (!sample_period)
1359 sample_period = 1;
1360
1361 hwc->sample_period = sample_period;
1362}
1363
1364static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1365{
1366 struct perf_event *event;
1367 struct hw_perf_event *hwc;
1368 u64 interrupts, freq;
1369
1370 spin_lock(&ctx->lock);
1371 list_for_each_entry(event, &ctx->group_list, group_entry) {
1372 if (event->state != PERF_EVENT_STATE_ACTIVE)
1373 continue;
1374
1375 hwc = &event->hw;
1376
1377 interrupts = hwc->interrupts;
1378 hwc->interrupts = 0;
1379
1380 /*
1381 * unthrottle events on the tick
1382 */
1383 if (interrupts == MAX_INTERRUPTS) {
1384 perf_log_throttle(event, 1);
1385 event->pmu->unthrottle(event);
1386 interrupts = 2*sysctl_perf_event_sample_rate/HZ;
1387 }
1388
1389 if (!event->attr.freq || !event->attr.sample_freq)
1390 continue;
1391
1392 /*
1393 * if the specified freq < HZ then we need to skip ticks
1394 */
1395 if (event->attr.sample_freq < HZ) {
1396 freq = event->attr.sample_freq;
1397
1398 hwc->freq_count += freq;
1399 hwc->freq_interrupts += interrupts;
1400
1401 if (hwc->freq_count < HZ)
1402 continue;
1403
1404 interrupts = hwc->freq_interrupts;
1405 hwc->freq_interrupts = 0;
1406 hwc->freq_count -= HZ;
1407 } else
1408 freq = HZ;
1409
1410 perf_adjust_period(event, freq * interrupts);
1411
1412 /*
1413 * In order to avoid being stalled by an (accidental) huge
1414 * sample period, force reset the sample period if we didn't
1415 * get any events in this freq period.
1416 */
1417 if (!interrupts) {
1418 perf_disable();
1419 event->pmu->disable(event);
1420 atomic64_set(&hwc->period_left, 0);
1421 event->pmu->enable(event);
1422 perf_enable();
1423 }
1424 }
1425 spin_unlock(&ctx->lock);
1426}
1427
1428/*
1429 * Round-robin a context's events:
1430 */
1431static void rotate_ctx(struct perf_event_context *ctx)
1432{
1433 struct perf_event *event;
1434
1435 if (!ctx->nr_events)
1436 return;
1437
1438 spin_lock(&ctx->lock);
1439 /*
1440 * Rotate the first entry last (works just fine for group events too):
1441 */
1442 perf_disable();
1443 list_for_each_entry(event, &ctx->group_list, group_entry) {
1444 list_move_tail(&event->group_entry, &ctx->group_list);
1445 break;
1446 }
1447 perf_enable();
1448
1449 spin_unlock(&ctx->lock);
1450}
1451
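/*
 * Per-tick housekeeping: re-tune the period of freq-based events,
 * schedule the cpu and task contexts out, rotate their group lists so
 * that every group eventually gets its turn on the PMU, and schedule
 * everything back in.
 */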
1452void perf_event_task_tick(struct task_struct *curr, int cpu)
1453{
1454 struct perf_cpu_context *cpuctx;
1455 struct perf_event_context *ctx;
1456
1457 if (!atomic_read(&nr_events))
1458 return;
1459
1460 cpuctx = &per_cpu(perf_cpu_context, cpu);
1461 ctx = curr->perf_event_ctxp;
1462
1463 perf_ctx_adjust_freq(&cpuctx->ctx);
1464 if (ctx)
1465 perf_ctx_adjust_freq(ctx);
1466
1467 perf_event_cpu_sched_out(cpuctx);
1468 if (ctx)
1469 __perf_event_task_sched_out(ctx);
1470
1471 rotate_ctx(&cpuctx->ctx);
1472 if (ctx)
1473 rotate_ctx(ctx);
1474
1475 perf_event_cpu_sched_in(cpuctx, cpu);
1476 if (ctx)
1477 perf_event_task_sched_in(curr, cpu);
1478}
1479
1480/*
1481 * Enable all of a task's events that have been marked enable-on-exec.
1482 * This expects task == current.
1483 */
1484static void perf_event_enable_on_exec(struct task_struct *task)
1485{
1486 struct perf_event_context *ctx;
1487 struct perf_event *event;
1488 unsigned long flags;
1489 int enabled = 0;
1490
1491 local_irq_save(flags);
1492 ctx = task->perf_event_ctxp;
1493 if (!ctx || !ctx->nr_events)
1494 goto out;
1495
1496 __perf_event_task_sched_out(ctx);
1497
1498 spin_lock(&ctx->lock);
1499
1500 list_for_each_entry(event, &ctx->group_list, group_entry) {
1501 if (!event->attr.enable_on_exec)
1502 continue;
1503 event->attr.enable_on_exec = 0;
1504 if (event->state >= PERF_EVENT_STATE_INACTIVE)
1505 continue;
1506 __perf_event_mark_enabled(event, ctx);
1507 enabled = 1;
1508 }
1509
1510 /*
1511 * Unclone this context if we enabled any event.
1512 */
1513 if (enabled)
1514 unclone_ctx(ctx);
1515
1516 spin_unlock(&ctx->lock);
1517
1518 perf_event_task_sched_in(task, smp_processor_id());
1519 out:
1520 local_irq_restore(flags);
1521}
1522
1523/*
1524 * Cross CPU call to read the hardware event
1525 */
1526static void __perf_event_read(void *info)
1527{
1528 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1529 struct perf_event *event = info;
1530 struct perf_event_context *ctx = event->ctx;
1531 unsigned long flags;
1532
1533 /*
1534 * If this is a task context, we need to check whether it is
1535 * the current task context of this cpu. If not it has been
1536 * scheduled out before the smp call arrived. In that case
1537 * event->count would have been updated to a recent sample
1538 * when the event was scheduled out.
1539 */
1540 if (ctx->task && cpuctx->task_ctx != ctx)
1541 return;
1542
1543 local_irq_save(flags);
1544 if (ctx->is_active)
1545 update_context_time(ctx);
1546 event->pmu->read(event);
1547 update_event_times(event);
1548 local_irq_restore(flags);
1549}
1550
1551static u64 perf_event_read(struct perf_event *event)
1552{
1553 /*
1554 * If event is enabled and currently active on a CPU, update the
1555 * value in the event structure:
1556 */
1557 if (event->state == PERF_EVENT_STATE_ACTIVE) {
1558 smp_call_function_single(event->oncpu,
1559 __perf_event_read, event, 1);
1560 } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
1561 update_event_times(event);
1562 }
1563
1564 return atomic64_read(&event->count);
1565}
1566
1567/*
1568 * Initialize the perf_event context in a task_struct:
1569 */
1570static void
1571__perf_event_init_context(struct perf_event_context *ctx,
1572 struct task_struct *task)
1573{
1574 memset(ctx, 0, sizeof(*ctx));
1575 spin_lock_init(&ctx->lock);
1576 mutex_init(&ctx->mutex);
1577 INIT_LIST_HEAD(&ctx->group_list);
1578 INIT_LIST_HEAD(&ctx->event_list);
1579 atomic_set(&ctx->refcount, 1);
1580 ctx->task = task;
1581}
1582
1583static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1584{
1585 struct perf_event_context *ctx;
1586 struct perf_cpu_context *cpuctx;
1587 struct task_struct *task;
1588 unsigned long flags;
1589 int err;
1590
1591 /*
1592 * If cpu is not a wildcard then this is a percpu event:
1593 */
1594 if (cpu != -1) {
1595 /* Must be root to operate on a CPU event: */
1596 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
1597 return ERR_PTR(-EACCES);
1598
1599 if (cpu < 0 || cpu > num_possible_cpus())
1600 return ERR_PTR(-EINVAL);
1601
1602 /*
1603 * We could be clever and allow attaching an event to an
1604 * offline CPU and activate it when the CPU comes up, but
1605 * that's for later.
1606 */
1607 if (!cpu_isset(cpu, cpu_online_map))
1608 return ERR_PTR(-ENODEV);
1609
1610 cpuctx = &per_cpu(perf_cpu_context, cpu);
1611 ctx = &cpuctx->ctx;
1612 get_ctx(ctx);
1613
1614 return ctx;
1615 }
1616
1617 rcu_read_lock();
1618 if (!pid)
1619 task = current;
1620 else
1621 task = find_task_by_vpid(pid);
1622 if (task)
1623 get_task_struct(task);
1624 rcu_read_unlock();
1625
1626 if (!task)
1627 return ERR_PTR(-ESRCH);
1628
1629 /*
1630 * Can't attach events to a dying task.
1631 */
1632 err = -ESRCH;
1633 if (task->flags & PF_EXITING)
1634 goto errout;
1635
1636 /* Reuse ptrace permission checks for now. */
1637 err = -EACCES;
1638 if (!ptrace_may_access(task, PTRACE_MODE_READ))
1639 goto errout;
1640
1641 retry:
1642 ctx = perf_lock_task_context(task, &flags);
1643 if (ctx) {
1644 unclone_ctx(ctx);
1645 spin_unlock_irqrestore(&ctx->lock, flags);
1646 }
1647
1648 if (!ctx) {
1649 ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
1650 err = -ENOMEM;
1651 if (!ctx)
1652 goto errout;
1653 __perf_event_init_context(ctx, task);
1654 get_ctx(ctx);
1655 if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) {
1656 /*
1657 * We raced with some other task; use
1658 * the context they set.
1659 */
1660 kfree(ctx);
1661 goto retry;
1662 }
1663 get_task_struct(task);
1664 }
1665
1666 put_task_struct(task);
1667 return ctx;
1668
1669 errout:
1670 put_task_struct(task);
1671 return ERR_PTR(err);
1672}
1673
1674static void free_event_rcu(struct rcu_head *head)
1675{
1676 struct perf_event *event;
1677
1678 event = container_of(head, struct perf_event, rcu_head);
1679 if (event->ns)
1680 put_pid_ns(event->ns);
1681 kfree(event);
1682}
1683
1684static void perf_pending_sync(struct perf_event *event);
1685
1686static void free_event(struct perf_event *event)
1687{
1688 perf_pending_sync(event);
1689
1690 if (!event->parent) {
1691 atomic_dec(&nr_events);
1692 if (event->attr.mmap)
1693 atomic_dec(&nr_mmap_events);
1694 if (event->attr.comm)
1695 atomic_dec(&nr_comm_events);
1696 if (event->attr.task)
1697 atomic_dec(&nr_task_events);
1698 }
1699
1700 if (event->output) {
1701 fput(event->output->filp);
1702 event->output = NULL;
1703 }
1704
1705 if (event->destroy)
1706 event->destroy(event);
1707
1708 put_ctx(event->ctx);
1709 call_rcu(&event->rcu_head, free_event_rcu);
1710}
1711
1712/*
1713 * Called when the last reference to the file is gone.
1714 */
1715static int perf_release(struct inode *inode, struct file *file)
1716{
1717 struct perf_event *event = file->private_data;
1718 struct perf_event_context *ctx = event->ctx;
1719
1720 file->private_data = NULL;
1721
1722 WARN_ON_ONCE(ctx->parent_ctx);
1723 mutex_lock(&ctx->mutex);
1724 perf_event_remove_from_context(event);
1725 mutex_unlock(&ctx->mutex);
1726
1727 mutex_lock(&event->owner->perf_event_mutex);
1728 list_del_init(&event->owner_entry);
1729 mutex_unlock(&event->owner->perf_event_mutex);
1730 put_task_struct(event->owner);
1731
1732 free_event(event);
1733
1734 return 0;
1735}
1736
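/*
 * Upper bound on the number of bytes a read() of this event can
 * return, derived from attr.read_format: one u64 value per event
 * (plus an optional PERF_FORMAT_ID u64), optional TOTAL_TIME_ENABLED
 * and TOTAL_TIME_RUNNING u64s, and for PERF_FORMAT_GROUP a leading
 * u64 event count plus one entry per group sibling.
 */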
1737static int perf_event_read_size(struct perf_event *event)
1738{
1739 int entry = sizeof(u64); /* value */
1740 int size = 0;
1741 int nr = 1;
1742
1743 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1744 size += sizeof(u64);
1745
1746 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1747 size += sizeof(u64);
1748
1749 if (event->attr.read_format & PERF_FORMAT_ID)
1750 entry += sizeof(u64);
1751
1752 if (event->attr.read_format & PERF_FORMAT_GROUP) {
1753 nr += event->group_leader->nr_siblings;
1754 size += sizeof(u64);
1755 }
1756
1757 size += entry * nr;
1758
1759 return size;
1760}
1761
1762static u64 perf_event_read_value(struct perf_event *event)
1763{
1764 struct perf_event *child;
1765 u64 total = 0;
1766
1767 total += perf_event_read(event);
1768 list_for_each_entry(child, &event->child_list, child_list)
1769 total += perf_event_read(child);
1770
1771 return total;
1772}
1773
1774static int perf_event_read_entry(struct perf_event *event,
1775 u64 read_format, char __user *buf)
1776{
1777 int n = 0, count = 0;
1778 u64 values[2];
1779
1780 values[n++] = perf_event_read_value(event);
1781 if (read_format & PERF_FORMAT_ID)
1782 values[n++] = primary_event_id(event);
1783
1784 count = n * sizeof(u64);
1785
1786 if (copy_to_user(buf, values, count))
1787 return -EFAULT;
1788
1789 return count;
1790}
1791
1792static int perf_event_read_group(struct perf_event *event,
1793 u64 read_format, char __user *buf)
1794{
1795 struct perf_event *leader = event->group_leader, *sub;
1796 int n = 0, size = 0, err = -EFAULT;
1797 u64 values[3];
1798
1799 values[n++] = 1 + leader->nr_siblings;
1800 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1801 values[n++] = leader->total_time_enabled +
1802 atomic64_read(&leader->child_total_time_enabled);
1803 }
1804 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1805 values[n++] = leader->total_time_running +
1806 atomic64_read(&leader->child_total_time_running);
1807 }
1808
1809 size = n * sizeof(u64);
1810
1811 if (copy_to_user(buf, values, size))
1812 return -EFAULT;
1813
1814 err = perf_event_read_entry(leader, read_format, buf + size);
1815 if (err < 0)
1816 return err;
1817
1818 size += err;
1819
1820 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
1821 err = perf_event_read_entry(sub, read_format,
1822 buf + size);
1823 if (err < 0)
1824 return err;
1825
1826 size += err;
1827 }
1828
1829 return size;
1830}
1831
1832static int perf_event_read_one(struct perf_event *event,
1833 u64 read_format, char __user *buf)
1834{
1835 u64 values[4];
1836 int n = 0;
1837
1838 values[n++] = perf_event_read_value(event);
1839 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
1840 values[n++] = event->total_time_enabled +
1841 atomic64_read(&event->child_total_time_enabled);
1842 }
1843 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
1844 values[n++] = event->total_time_running +
1845 atomic64_read(&event->child_total_time_running);
1846 }
1847 if (read_format & PERF_FORMAT_ID)
1848 values[n++] = primary_event_id(event);
1849
1850 if (copy_to_user(buf, values, n * sizeof(u64)))
1851 return -EFAULT;
1852
1853 return n * sizeof(u64);
1854}
1855
1856/*
1857 * Read the performance event - simple non blocking version for now
1858 */
1859static ssize_t
1860perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
1861{
1862 u64 read_format = event->attr.read_format;
1863 int ret;
1864
1865 /*
1866 * Return end-of-file for a read on an event that is in
1867 * error state (i.e. because it was pinned but it couldn't be
1868 * scheduled on to the CPU at some point).
1869 */
1870 if (event->state == PERF_EVENT_STATE_ERROR)
1871 return 0;
1872
1873 if (count < perf_event_read_size(event))
1874 return -ENOSPC;
1875
1876 WARN_ON_ONCE(event->ctx->parent_ctx);
1877 mutex_lock(&event->child_mutex);
1878 if (read_format & PERF_FORMAT_GROUP)
1879 ret = perf_event_read_group(event, read_format, buf);
1880 else
1881 ret = perf_event_read_one(event, read_format, buf);
1882 mutex_unlock(&event->child_mutex);
1883
1884 return ret;
1885}
1886
1887static ssize_t
1888perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
1889{
1890 struct perf_event *event = file->private_data;
1891
1892 return perf_read_hw(event, buf, count);
1893}
1894
1895static unsigned int perf_poll(struct file *file, poll_table *wait)
1896{
1897 struct perf_event *event = file->private_data;
1898 struct perf_mmap_data *data;
1899 unsigned int events = POLL_HUP;
1900
1901 rcu_read_lock();
1902 data = rcu_dereference(event->data);
1903 if (data)
1904 events = atomic_xchg(&data->poll, 0);
1905 rcu_read_unlock();
1906
1907 poll_wait(file, &event->waitq, wait);
1908
1909 return events;
1910}
1911
1912static void perf_event_reset(struct perf_event *event)
1913{
1914 (void)perf_event_read(event);
1915 atomic64_set(&event->count, 0);
1916 perf_event_update_userpage(event);
1917}
1918
1919/*
1920 * Holding the top-level event's child_mutex means that any
1921 * descendant process that has inherited this event will block
1922 * in sync_child_event if it goes to exit, thus satisfying the
1923 * task existence requirements of perf_event_enable/disable.
1924 */
1925static void perf_event_for_each_child(struct perf_event *event,
1926 void (*func)(struct perf_event *))
1927{
1928 struct perf_event *child;
1929
1930 WARN_ON_ONCE(event->ctx->parent_ctx);
1931 mutex_lock(&event->child_mutex);
1932 func(event);
1933 list_for_each_entry(child, &event->child_list, child_list)
1934 func(child);
1935 mutex_unlock(&event->child_mutex);
1936}
1937
1938static void perf_event_for_each(struct perf_event *event,
1939 void (*func)(struct perf_event *))
1940{
1941 struct perf_event_context *ctx = event->ctx;
1942 struct perf_event *sibling;
1943
1944 WARN_ON_ONCE(ctx->parent_ctx);
1945 mutex_lock(&ctx->mutex);
1946 event = event->group_leader;
1947
1948 perf_event_for_each_child(event, func);
1950 list_for_each_entry(sibling, &event->sibling_list, group_entry)
1951 perf_event_for_each_child(sibling, func);
1952 mutex_unlock(&ctx->mutex);
1953}
1954
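/*
 * PERF_EVENT_IOC_PERIOD: update the sampling period (or, for
 * attr.freq events, the sampling frequency) of an existing event.
 * The argument is a pointer to the new u64 value, e.g. (sketch):
 *
 *	u64 period = 100000;
 *	ioctl(fd, PERF_EVENT_IOC_PERIOD, &period);
 */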
1955static int perf_event_period(struct perf_event *event, u64 __user *arg)
1956{
1957 struct perf_event_context *ctx = event->ctx;
1958 unsigned long size;
1959 int ret = 0;
1960 u64 value;
1961
1962 if (!event->attr.sample_period)
1963 return -EINVAL;
1964
1965 size = copy_from_user(&value, arg, sizeof(value));
1966 if (size != sizeof(value))
1967 return -EFAULT;
1968
1969 if (!value)
1970 return -EINVAL;
1971
1972 spin_lock_irq(&ctx->lock);
1973 if (event->attr.freq) {
1974 if (value > sysctl_perf_event_sample_rate) {
1975 ret = -EINVAL;
1976 goto unlock;
1977 }
1978
1979 event->attr.sample_freq = value;
1980 } else {
1981 event->attr.sample_period = value;
1982 event->hw.sample_period = value;
1983 }
1984unlock:
1985 spin_unlock_irq(&ctx->lock);
1986
1987 return ret;
1988}
1989
1990int perf_event_set_output(struct perf_event *event, int output_fd);
1991
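/*
 * ioctl() interface of a perf event fd.  A typical (hypothetical)
 * user-space sequence looks like:
 *
 *	ioctl(fd, PERF_EVENT_IOC_RESET, 0);
 *	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
 *	... run the workload ...
 *	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
 *	read(fd, &count, sizeof(count));
 *
 * Passing PERF_IOC_FLAG_GROUP in the argument applies
 * ENABLE/DISABLE/RESET to the whole group rather than a single event.
 */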
1992static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1993{
1994 struct perf_event *event = file->private_data;
1995 void (*func)(struct perf_event *);
1996 u32 flags = arg;
1997
1998 switch (cmd) {
1999 case PERF_EVENT_IOC_ENABLE:
2000 func = perf_event_enable;
2001 break;
2002 case PERF_EVENT_IOC_DISABLE:
2003 func = perf_event_disable;
2004 break;
2005 case PERF_EVENT_IOC_RESET:
2006 func = perf_event_reset;
2007 break;
2008
2009 case PERF_EVENT_IOC_REFRESH:
2010 return perf_event_refresh(event, arg);
2011
2012 case PERF_EVENT_IOC_PERIOD:
2013 return perf_event_period(event, (u64 __user *)arg);
2014
2015 case PERF_EVENT_IOC_SET_OUTPUT:
2016 return perf_event_set_output(event, arg);
2017
2018 default:
2019 return -ENOTTY;
2020 }
2021
2022 if (flags & PERF_IOC_FLAG_GROUP)
2023 perf_event_for_each(event, func);
2024 else
2025 perf_event_for_each_child(event, func);
2026
2027 return 0;
2028}
2029
2030int perf_event_task_enable(void)
2031{
2032 struct perf_event *event;
2033
2034 mutex_lock(&current->perf_event_mutex);
2035 list_for_each_entry(event, &current->perf_event_list, owner_entry)
2036 perf_event_for_each_child(event, perf_event_enable);
2037 mutex_unlock(&current->perf_event_mutex);
2038
2039 return 0;
2040}
2041
2042int perf_event_task_disable(void)
2043{
2044 struct perf_event *event;
2045
2046 mutex_lock(&current->perf_event_mutex);
2047 list_for_each_entry(event, &current->perf_event_list, owner_entry)
2048 perf_event_for_each_child(event, perf_event_disable);
2049 mutex_unlock(&current->perf_event_mutex);
2050
2051 return 0;
2052}
2053
2054#ifndef PERF_EVENT_INDEX_OFFSET
2055# define PERF_EVENT_INDEX_OFFSET 0
2056#endif
2057
2058static int perf_event_index(struct perf_event *event)
2059{
2060 if (event->state != PERF_EVENT_STATE_ACTIVE)
2061 return 0;
2062
2063 return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
2064}
2065
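/*
 * The ->lock field of the user page is used like a sequence count:
 * it is incremented before and after the update below, so it is even
 * while the page is stable and odd while an update is in flight.  A
 * user-space reader could (roughly, as a sketch) do:
 *
 *	do {
 *		seq = pg->lock;
 *		barrier();
 *		index  = pg->index;
 *		offset = pg->offset;
 *		barrier();
 *	} while (pg->lock != seq || (seq & 1));
 */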
2066/*
2067 * Callers need to ensure there can be no nesting of this function, otherwise
2068 * the seqlock logic goes bad. We can not serialize this because the arch
2069 * code calls this from NMI context.
2070 */
2071void perf_event_update_userpage(struct perf_event *event)
2072{
2073 struct perf_event_mmap_page *userpg;
2074 struct perf_mmap_data *data;
2075
2076 rcu_read_lock();
2077 data = rcu_dereference(event->data);
2078 if (!data)
2079 goto unlock;
2080
2081 userpg = data->user_page;
2082
2083 /*
2084 * Disable preemption so as to not let the corresponding user-space
2085 * spin too long if we get preempted.
2086 */
2087 preempt_disable();
2088 ++userpg->lock;
2089 barrier();
2090 userpg->index = perf_event_index(event);
2091 userpg->offset = atomic64_read(&event->count);
2092 if (event->state == PERF_EVENT_STATE_ACTIVE)
2093 userpg->offset -= atomic64_read(&event->hw.prev_count);
2094
2095 userpg->time_enabled = event->total_time_enabled +
2096 atomic64_read(&event->child_total_time_enabled);
2097
2098 userpg->time_running = event->total_time_running +
2099 atomic64_read(&event->child_total_time_running);
2100
2101 barrier();
2102 ++userpg->lock;
2103 preempt_enable();
2104unlock:
2105 rcu_read_unlock();
2106}
2107
2108static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2109{
2110 struct perf_event *event = vma->vm_file->private_data;
2111 struct perf_mmap_data *data;
2112 int ret = VM_FAULT_SIGBUS;
2113
2114 if (vmf->flags & FAULT_FLAG_MKWRITE) {
2115 if (vmf->pgoff == 0)
2116 ret = 0;
2117 return ret;
2118 }
2119
2120 rcu_read_lock();
2121 data = rcu_dereference(event->data);
2122 if (!data)
2123 goto unlock;
2124
2125 if (vmf->pgoff == 0) {
2126 vmf->page = virt_to_page(data->user_page);
2127 } else {
2128 int nr = vmf->pgoff - 1;
2129
2130 if ((unsigned)nr > data->nr_pages)
2131 goto unlock;
2132
2133 if (vmf->flags & FAULT_FLAG_WRITE)
2134 goto unlock;
2135
2136 vmf->page = virt_to_page(data->data_pages[nr]);
2137 }
2138
2139 get_page(vmf->page);
2140 vmf->page->mapping = vma->vm_file->f_mapping;
2141 vmf->page->index = vmf->pgoff;
2142
2143 ret = 0;
2144unlock:
2145 rcu_read_unlock();
2146
2147 return ret;
2148}
2149
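/*
 * Allocate the mmap()ed buffer: one zeroed control page (the
 * perf_event_mmap_page) plus nr_pages zeroed data pages forming the
 * sampling buffer.  'watermark' is how much data may accumulate
 * before a wakeup is sent: attr.wakeup_watermark if requested,
 * otherwise a quarter of the buffer (at least one page).
 */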
2150static int perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2151{
2152 struct perf_mmap_data *data;
2153 unsigned long size;
2154 int i;
2155
2156 WARN_ON(atomic_read(&event->mmap_count));
2157
2158 size = sizeof(struct perf_mmap_data);
2159 size += nr_pages * sizeof(void *);
2160
2161 data = kzalloc(size, GFP_KERNEL);
2162 if (!data)
2163 goto fail;
2164
2165 data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
2166 if (!data->user_page)
2167 goto fail_user_page;
2168
2169 for (i = 0; i < nr_pages; i++) {
2170 data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
2171 if (!data->data_pages[i])
2172 goto fail_data_pages;
2173 }
2174
2175 data->nr_pages = nr_pages;
2176 atomic_set(&data->lock, -1);
2177
2178 if (event->attr.watermark) {
2179 data->watermark = min_t(long, PAGE_SIZE * nr_pages,
2180 event->attr.wakeup_watermark);
2181 }
2182 if (!data->watermark)
2183 data->watermark = max(PAGE_SIZE, PAGE_SIZE * nr_pages / 4);
2184
2185 rcu_assign_pointer(event->data, data);
2186
2187 return 0;
2188
2189fail_data_pages:
2190 for (i--; i >= 0; i--)
2191 free_page((unsigned long)data->data_pages[i]);
2192
2193 free_page((unsigned long)data->user_page);
2194
2195fail_user_page:
2196 kfree(data);
2197
2198fail:
2199 return -ENOMEM;
2200}
2201
2202static void perf_mmap_free_page(unsigned long addr)
2203{
2204 struct page *page = virt_to_page((void *)addr);
2205
2206 page->mapping = NULL;
2207 __free_page(page);
2208}
2209
2210static void __perf_mmap_data_free(struct rcu_head *rcu_head)
2211{
2212 struct perf_mmap_data *data;
2213 int i;
2214
2215 data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
2216
2217 perf_mmap_free_page((unsigned long)data->user_page);
2218 for (i = 0; i < data->nr_pages; i++)
2219 perf_mmap_free_page((unsigned long)data->data_pages[i]);
2220
2221 kfree(data);
2222}
2223
2224static void perf_mmap_data_free(struct perf_event *event)
2225{
2226 struct perf_mmap_data *data = event->data;
2227
2228 WARN_ON(atomic_read(&event->mmap_count));
2229
2230 rcu_assign_pointer(event->data, NULL);
2231 call_rcu(&data->rcu_head, __perf_mmap_data_free);
2232}
2233
2234static void perf_mmap_open(struct vm_area_struct *vma)
2235{
2236 struct perf_event *event = vma->vm_file->private_data;
2237
2238 atomic_inc(&event->mmap_count);
2239}
2240
2241static void perf_mmap_close(struct vm_area_struct *vma)
2242{
2243 struct perf_event *event = vma->vm_file->private_data;
2244
2245 WARN_ON_ONCE(event->ctx->parent_ctx);
2246 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
2247 struct user_struct *user = current_user();
2248
2249 atomic_long_sub(event->data->nr_pages + 1, &user->locked_vm);
2250 vma->vm_mm->locked_vm -= event->data->nr_locked;
2251 perf_mmap_data_free(event);
2252 mutex_unlock(&event->mmap_mutex);
2253 }
2254}
2255
2256static struct vm_operations_struct perf_mmap_vmops = {
2257 .open = perf_mmap_open,
2258 .close = perf_mmap_close,
2259 .fault = perf_mmap_fault,
2260 .page_mkwrite = perf_mmap_fault,
2261};
2262
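/*
 * mmap() of a perf event fd must be MAP_SHARED, start at offset 0 and
 * cover 1 + 2^n pages: page 0 is the control page, the rest the data
 * buffer.  The locked memory is charged against the per-user
 * perf_event_mlock_kb allowance and then RLIMIT_MEMLOCK.  E.g., a
 * user-space mapping with 8 data pages (hypothetical sizes):
 *
 *	len  = (1 + 8) * page_size;
 *	base = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */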
2263static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2264{
2265 struct perf_event *event = file->private_data;
2266 unsigned long user_locked, user_lock_limit;
2267 struct user_struct *user = current_user();
2268 unsigned long locked, lock_limit;
2269 unsigned long vma_size;
2270 unsigned long nr_pages;
2271 long user_extra, extra;
2272 int ret = 0;
2273
2274 if (!(vma->vm_flags & VM_SHARED))
2275 return -EINVAL;
2276
2277 vma_size = vma->vm_end - vma->vm_start;
2278 nr_pages = (vma_size / PAGE_SIZE) - 1;
2279
2280 /*
2281 * If we have data pages ensure they're a power-of-two number, so we
2282 * can do bitmasks instead of modulo.
2283 */
2284 if (nr_pages != 0 && !is_power_of_2(nr_pages))
2285 return -EINVAL;
2286
2287 if (vma_size != PAGE_SIZE * (1 + nr_pages))
2288 return -EINVAL;
2289
2290 if (vma->vm_pgoff != 0)
2291 return -EINVAL;
2292
2293 WARN_ON_ONCE(event->ctx->parent_ctx);
2294 mutex_lock(&event->mmap_mutex);
2295 if (event->output) {
2296 ret = -EINVAL;
2297 goto unlock;
2298 }
2299
2300 if (atomic_inc_not_zero(&event->mmap_count)) {
2301 if (nr_pages != event->data->nr_pages)
2302 ret = -EINVAL;
2303 goto unlock;
2304 }
2305
2306 user_extra = nr_pages + 1;
2307 user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
2308
2309 /*
2310 * Increase the limit linearly with more CPUs:
2311 */
2312 user_lock_limit *= num_online_cpus();
2313
2314 user_locked = atomic_long_read(&user->locked_vm) + user_extra;
2315
2316 extra = 0;
2317 if (user_locked > user_lock_limit)
2318 extra = user_locked - user_lock_limit;
2319
2320 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
2321 lock_limit >>= PAGE_SHIFT;
2322 locked = vma->vm_mm->locked_vm + extra;
2323
2324 if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
2325 !capable(CAP_IPC_LOCK)) {
2326 ret = -EPERM;
2327 goto unlock;
2328 }
2329
2330 WARN_ON(event->data);
2331 ret = perf_mmap_data_alloc(event, nr_pages);
2332 if (ret)
2333 goto unlock;
2334
2335 atomic_set(&event->mmap_count, 1);
2336 atomic_long_add(user_extra, &user->locked_vm);
2337 vma->vm_mm->locked_vm += extra;
2338 event->data->nr_locked = extra;
2339 if (vma->vm_flags & VM_WRITE)
2340 event->data->writable = 1;
2341
2342unlock:
2343 mutex_unlock(&event->mmap_mutex);
2344
2345 vma->vm_flags |= VM_RESERVED;
2346 vma->vm_ops = &perf_mmap_vmops;
2347
2348 return ret;
2349}
2350
2351static int perf_fasync(int fd, struct file *filp, int on)
2352{
2353 struct inode *inode = filp->f_path.dentry->d_inode;
2354 struct perf_event *event = filp->private_data;
2355 int retval;
2356
2357 mutex_lock(&inode->i_mutex);
2358 retval = fasync_helper(fd, filp, on, &event->fasync);
2359 mutex_unlock(&inode->i_mutex);
2360
2361 if (retval < 0)
2362 return retval;
2363
2364 return 0;
2365}
2366
2367static const struct file_operations perf_fops = {
2368 .release = perf_release,
2369 .read = perf_read,
2370 .poll = perf_poll,
2371 .unlocked_ioctl = perf_ioctl,
2372 .compat_ioctl = perf_ioctl,
2373 .mmap = perf_mmap,
2374 .fasync = perf_fasync,
2375};
2376
2377/*
2378 * Perf event wakeup
2379 *
2380 * If there's data, ensure we set the poll() state and publish everything
2381 * to user-space before waking everybody up.
2382 */
2383
2384void perf_event_wakeup(struct perf_event *event)
2385{
2386 wake_up_all(&event->waitq);
2387
2388 if (event->pending_kill) {
2389 kill_fasync(&event->fasync, SIGIO, event->pending_kill);
2390 event->pending_kill = 0;
2391 }
2392}
2393
2394/*
2395 * Pending wakeups
2396 *
2397 * Handle the case where we need to wake up from NMI (or rq->lock) context.
2398 *
2399 * The NMI bit means we cannot possibly take locks. Therefore, maintain a
2400 * singly linked list and use cmpxchg() to add entries locklessly.
2401 */
2402
2403static void perf_pending_event(struct perf_pending_entry *entry)
2404{
2405 struct perf_event *event = container_of(entry,
2406 struct perf_event, pending);
2407
2408 if (event->pending_disable) {
2409 event->pending_disable = 0;
2410 __perf_event_disable(event);
2411 }
2412
2413 if (event->pending_wakeup) {
2414 event->pending_wakeup = 0;
2415 perf_event_wakeup(event);
2416 }
2417}
2418
2419#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
2420
2421static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
2422 PENDING_TAIL,
2423};
2424
2425static void perf_pending_queue(struct perf_pending_entry *entry,
2426 void (*func)(struct perf_pending_entry *))
2427{
2428 struct perf_pending_entry **head;
2429
2430 if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
2431 return;
2432
2433 entry->func = func;
2434
2435 head = &get_cpu_var(perf_pending_head);
2436
2437 do {
2438 entry->next = *head;
2439 } while (cmpxchg(head, entry->next, entry) != entry->next);
2440
2441 set_perf_event_pending();
2442
2443 put_cpu_var(perf_pending_head);
2444}
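
/*
 * [Editor's note -- illustrative analogue, not part of this file] The loop
 * above is the classic lock-free stack push: snapshot the head, point the
 * new entry at it, and cmpxchg() the head pointer; retry if somebody
 * (possibly an NMI) pushed in between.  A hedged user-space sketch of the
 * same idiom using a GCC builtin in place of the kernel's cmpxchg():
 */
struct llnode { struct llnode *next; };

static void llpush(struct llnode **head, struct llnode *n)
{
	struct llnode *old;

	do {
		old = *head;	/* snapshot the current head */
		n->next = old;	/* link the new node in front of it */
	} while (!__sync_bool_compare_and_swap(head, old, n));	/* retry on race */
}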
2445
2446static int __perf_pending_run(void)
2447{
2448 struct perf_pending_entry *list;
2449 int nr = 0;
2450
2451 list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
2452 while (list != PENDING_TAIL) {
2453 void (*func)(struct perf_pending_entry *);
2454 struct perf_pending_entry *entry = list;
2455
2456 list = list->next;
2457
2458 func = entry->func;
2459 entry->next = NULL;
2460 /*
2461 * Ensure we observe the unqueue before we issue the wakeup,
2462 * so that we won't be waiting forever.
2463 * -- see perf_not_pending().
2464 */
2465 smp_wmb();
2466
2467 func(entry);
2468 nr++;
2469 }
2470
2471 return nr;
2472}
2473
2474static inline int perf_not_pending(struct perf_event *event)
2475{
2476 /*
2477 * If we flush on whatever CPU we run on, there is a chance we don't
2478 * need to wait.
2479 */
2480 get_cpu();
2481 __perf_pending_run();
2482 put_cpu();
2483
2484 /*
2485 * Ensure we see the proper queue state before going to sleep
2486 * so that we do not miss the wakeup. -- see __perf_pending_run()
2487 */
2488 smp_rmb();
2489 return event->pending.next == NULL;
2490}
2491
2492static void perf_pending_sync(struct perf_event *event)
2493{
2494 wait_event(event->waitq, perf_not_pending(event));
2495}
2496
2497void perf_event_do_pending(void)
2498{
2499 __perf_pending_run();
2500}
2501
2502/*
2503 * Callchain support -- arch specific
2504 */
2505
2506__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2507{
2508 return NULL;
2509}
2510
2511/*
2512 * Output
2513 */
2514static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
2515 unsigned long offset, unsigned long head)
2516{
2517 unsigned long mask;
2518
2519 if (!data->writable)
2520 return true;
2521
2522 mask = (data->nr_pages << PAGE_SHIFT) - 1;
2523
2524 offset = (offset - tail) & mask;
2525 head = (head - tail) & mask;
2526
2527 if ((int)(head - offset) < 0)
2528 return false;
2529
2530 return true;
2531}
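
/*
 * [Editor's note -- worked example for perf_output_space()] With 4 data
 * pages of 4 KiB (mask = 0x3fff), tail = 0x0100, offset = 0x3f00 and a
 * proposed head = 0x4100:
 *
 *	offset - tail = 0x3e00,   head - tail = 0x4000 & mask = 0x0000
 *
 * so (int)(head - offset) is negative and the function returns false:
 * the write would catch up with the tail, so the caller drops the record
 * and accounts it in data->lost instead of clobbering data the reader
 * has not consumed yet.
 */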
2532
2533static void perf_output_wakeup(struct perf_output_handle *handle)
2534{
2535 atomic_set(&handle->data->poll, POLL_IN);
2536
2537 if (handle->nmi) {
2538 handle->event->pending_wakeup = 1;
2539 perf_pending_queue(&handle->event->pending,
2540 perf_pending_event);
2541 } else
2542 perf_event_wakeup(handle->event);
2543}
2544
2545/*
2546 * Curious locking construct.
2547 *
2548 * We need to ensure a later event doesn't publish a head while a former
2549 * event is still writing. However, since we need to deal with NMIs we
2550 * cannot fully serialize things.
2551 *
2552 * What we do is serialize between CPUs so we only have to deal with NMI
2553 * nesting on a single CPU.
2554 *
2555 * We only publish the head (and generate a wakeup) when the outer-most
2556 * event completes.
2557 */
2558static void perf_output_lock(struct perf_output_handle *handle)
2559{
2560 struct perf_mmap_data *data = handle->data;
2561 int cpu;
2562
2563 handle->locked = 0;
2564
2565 local_irq_save(handle->flags);
2566 cpu = smp_processor_id();
2567
2568 if (in_nmi() && atomic_read(&data->lock) == cpu)
2569 return;
2570
2571 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2572 cpu_relax();
2573
2574 handle->locked = 1;
2575}
2576
2577static void perf_output_unlock(struct perf_output_handle *handle)
2578{
2579 struct perf_mmap_data *data = handle->data;
2580 unsigned long head;
2581 int cpu;
2582
2583 data->done_head = data->head;
2584
2585 if (!handle->locked)
2586 goto out;
2587
2588again:
2589 /*
2590 * The xchg implies a full barrier that ensures all writes are done
2591 * before we publish the new head, matched by a rmb() in userspace when
2592 * reading this position.
2593 */
2594 while ((head = atomic_long_xchg(&data->done_head, 0)))
2595 data->user_page->data_head = head;
2596
2597 /*
2598 * NMI can happen here, which means we can miss a done_head update.
2599 */
2600
2601 cpu = atomic_xchg(&data->lock, -1);
2602 WARN_ON_ONCE(cpu != smp_processor_id());
2603
2604 /*
2605 * Therefore we have to check that we did not indeed miss one.
2606 */
2607 if (unlikely(atomic_long_read(&data->done_head))) {
2608 /*
2609 * Since we had it locked, we can lock it again.
2610 */
2611 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2612 cpu_relax();
2613
2614 goto again;
2615 }
2616
2617 if (atomic_xchg(&data->wakeup, 0))
2618 perf_output_wakeup(handle);
2619out:
2620 local_irq_restore(handle->flags);
2621}
2622
2623void perf_output_copy(struct perf_output_handle *handle,
2624 const void *buf, unsigned int len)
2625{
2626 unsigned int pages_mask;
2627 unsigned int offset;
2628 unsigned int size;
2629 void **pages;
2630
2631 offset = handle->offset;
2632 pages_mask = handle->data->nr_pages - 1;
2633 pages = handle->data->data_pages;
2634
2635 do {
2636 unsigned int page_offset;
2637 int nr;
2638
2639 nr = (offset >> PAGE_SHIFT) & pages_mask;
2640 page_offset = offset & (PAGE_SIZE - 1);
2641 size = min_t(unsigned int, PAGE_SIZE - page_offset, len);
2642
2643 memcpy(pages[nr] + page_offset, buf, size);
2644
2645 len -= size;
2646 buf += size;
2647 offset += size;
2648 } while (len);
2649
2650 handle->offset = offset;
2651
2652 /*
2653 * Check we didn't copy past our reservation window, taking the
2654 * possible unsigned int wrap into account.
2655 */
2656 WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2657}
2658
2659int perf_output_begin(struct perf_output_handle *handle,
2660 struct perf_event *event, unsigned int size,
2661 int nmi, int sample)
2662{
2663 struct perf_event *output_event;
2664 struct perf_mmap_data *data;
2665 unsigned long tail, offset, head;
2666 int have_lost;
2667 struct {
2668 struct perf_event_header header;
2669 u64 id;
2670 u64 lost;
2671 } lost_event;
2672
2673 rcu_read_lock();
2674 /*
2675 * For inherited events we send all the output towards the parent.
2676 */
2677 if (event->parent)
2678 event = event->parent;
2679
2680 output_event = rcu_dereference(event->output);
2681 if (output_event)
2682 event = output_event;
2683
2684 data = rcu_dereference(event->data);
2685 if (!data)
2686 goto out;
2687
2688 handle->data = data;
2689 handle->event = event;
2690 handle->nmi = nmi;
2691 handle->sample = sample;
2692
2693 if (!data->nr_pages)
2694 goto fail;
2695
2696 have_lost = atomic_read(&data->lost);
2697 if (have_lost)
2698 size += sizeof(lost_event);
2699
2700 perf_output_lock(handle);
2701
2702 do {
2703 /*
2704 * Userspace could choose to issue an mb() before updating the
2705 * tail pointer, so that all reads are completed before the
2706 * write is issued.
2707 */
2708 tail = ACCESS_ONCE(data->user_page->data_tail);
2709 smp_rmb();
2710 offset = head = atomic_long_read(&data->head);
2711 head += size;
2712 if (unlikely(!perf_output_space(data, tail, offset, head)))
2713 goto fail;
2714 } while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
2715
2716 handle->offset = offset;
2717 handle->head = head;
2718
2719 if (head - tail > data->watermark)
2720 atomic_set(&data->wakeup, 1);
2721
2722 if (have_lost) {
2723 lost_event.header.type = PERF_RECORD_LOST;
2724 lost_event.header.misc = 0;
2725 lost_event.header.size = sizeof(lost_event);
2726 lost_event.id = event->id;
2727 lost_event.lost = atomic_xchg(&data->lost, 0);
2728
2729 perf_output_put(handle, lost_event);
2730 }
2731
2732 return 0;
2733
2734fail:
2735 atomic_inc(&data->lost);
2736 perf_output_unlock(handle);
2737out:
2738 rcu_read_unlock();
2739
2740 return -ENOSPC;
2741}
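
/*
 * [Editor's note -- worked example for the reservation loop above] If a
 * task-context writer and an NMI both try to reserve 64 bytes while
 * data->head is 0x1000, only one cmpxchg(&data->head, 0x1000, 0x1040)
 * succeeds; the loser re-reads head (now 0x1040), re-checks the available
 * space, and reserves [0x1040, 0x1080) on its next pass.  Each writer then
 * fills its own disjoint region, and perf_output_end() publishes the new
 * head to user-space via perf_output_unlock().
 */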
2742
2743void perf_output_end(struct perf_output_handle *handle)
2744{
2745 struct perf_event *event = handle->event;
2746 struct perf_mmap_data *data = handle->data;
2747
2748 int wakeup_events = event->attr.wakeup_events;
2749
2750 if (handle->sample && wakeup_events) {
2751 int events = atomic_inc_return(&data->events);
2752 if (events >= wakeup_events) {
2753 atomic_sub(wakeup_events, &data->events);
2754 atomic_set(&data->wakeup, 1);
2755 }
2756 }
2757
2758 perf_output_unlock(handle);
2759 rcu_read_unlock();
2760}
2761
2762static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
2763{
2764 /*
2765 * only top level events have the pid namespace they were created in
2766 */
2767 if (event->parent)
2768 event = event->parent;
2769
2770 return task_tgid_nr_ns(p, event->ns);
2771}
2772
2773static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
2774{
2775 /*
2776 * only top level events have the pid namespace they were created in
2777 */
2778 if (event->parent)
2779 event = event->parent;
2780
2781 return task_pid_nr_ns(p, event->ns);
2782}
2783
2784static void perf_output_read_one(struct perf_output_handle *handle,
2785 struct perf_event *event)
2786{
2787 u64 read_format = event->attr.read_format;
2788 u64 values[4];
2789 int n = 0;
2790
2791 values[n++] = atomic64_read(&event->count);
2792 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
2793 values[n++] = event->total_time_enabled +
2794 atomic64_read(&event->child_total_time_enabled);
2795 }
2796 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
2797 values[n++] = event->total_time_running +
2798 atomic64_read(&event->child_total_time_running);
2799 }
2800 if (read_format & PERF_FORMAT_ID)
2801 values[n++] = primary_event_id(event);
2802
2803 perf_output_copy(handle, values, n * sizeof(u64));
2804}
2805
2806/*
2807 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
2808 */
2809static void perf_output_read_group(struct perf_output_handle *handle,
2810 struct perf_event *event)
2811{
2812 struct perf_event *leader = event->group_leader, *sub;
2813 u64 read_format = event->attr.read_format;
2814 u64 values[5];
2815 int n = 0;
2816
2817 values[n++] = 1 + leader->nr_siblings;
2818
2819 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
2820 values[n++] = leader->total_time_enabled;
2821
2822 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
2823 values[n++] = leader->total_time_running;
2824
2825 if (leader != event)
2826 leader->pmu->read(leader);
2827
2828 values[n++] = atomic64_read(&leader->count);
2829 if (read_format & PERF_FORMAT_ID)
2830 values[n++] = primary_event_id(leader);
2831
2832 perf_output_copy(handle, values, n * sizeof(u64));
2833
2834 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
2835 n = 0;
2836
2837 if (sub != event)
2838 sub->pmu->read(sub);
2839
2840 values[n++] = atomic64_read(&sub->count);
2841 if (read_format & PERF_FORMAT_ID)
2842 values[n++] = primary_event_id(sub);
2843
2844 perf_output_copy(handle, values, n * sizeof(u64));
2845 }
2846}
2847
2848static void perf_output_read(struct perf_output_handle *handle,
2849 struct perf_event *event)
2850{
2851 if (event->attr.read_format & PERF_FORMAT_GROUP)
2852 perf_output_read_group(handle, event);
2853 else
2854 perf_output_read_one(handle, event);
2855}
2856
2857void perf_output_sample(struct perf_output_handle *handle,
2858 struct perf_event_header *header,
2859 struct perf_sample_data *data,
2860 struct perf_event *event)
2861{
2862 u64 sample_type = data->type;
2863
2864 perf_output_put(handle, *header);
2865
2866 if (sample_type & PERF_SAMPLE_IP)
2867 perf_output_put(handle, data->ip);
2868
2869 if (sample_type & PERF_SAMPLE_TID)
2870 perf_output_put(handle, data->tid_entry);
2871
2872 if (sample_type & PERF_SAMPLE_TIME)
2873 perf_output_put(handle, data->time);
2874
2875 if (sample_type & PERF_SAMPLE_ADDR)
2876 perf_output_put(handle, data->addr);
2877
2878 if (sample_type & PERF_SAMPLE_ID)
2879 perf_output_put(handle, data->id);
2880
2881 if (sample_type & PERF_SAMPLE_STREAM_ID)
2882 perf_output_put(handle, data->stream_id);
2883
2884 if (sample_type & PERF_SAMPLE_CPU)
2885 perf_output_put(handle, data->cpu_entry);
2886
2887 if (sample_type & PERF_SAMPLE_PERIOD)
2888 perf_output_put(handle, data->period);
2889
2890 if (sample_type & PERF_SAMPLE_READ)
2891 perf_output_read(handle, event);
2892
2893 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2894 if (data->callchain) {
2895 int size = 1;
2896
2897 if (data->callchain)
2898 size += data->callchain->nr;
2899
2900 size *= sizeof(u64);
2901
2902 perf_output_copy(handle, data->callchain, size);
2903 } else {
2904 u64 nr = 0;
2905 perf_output_put(handle, nr);
2906 }
2907 }
2908
2909 if (sample_type & PERF_SAMPLE_RAW) {
2910 if (data->raw) {
2911 perf_output_put(handle, data->raw->size);
2912 perf_output_copy(handle, data->raw->data,
2913 data->raw->size);
2914 } else {
2915 struct {
2916 u32 size;
2917 u32 data;
2918 } raw = {
2919 .size = sizeof(u32),
2920 .data = 0,
2921 };
2922 perf_output_put(handle, raw);
2923 }
2924 }
2925}
2926
2927void perf_prepare_sample(struct perf_event_header *header,
2928 struct perf_sample_data *data,
2929 struct perf_event *event,
2930 struct pt_regs *regs)
2931{
2932 u64 sample_type = event->attr.sample_type;
2933
2934 data->type = sample_type;
2935
2936 header->type = PERF_RECORD_SAMPLE;
2937 header->size = sizeof(*header);
2938
2939 header->misc = 0;
2940 header->misc |= perf_misc_flags(regs);
2941
2942 if (sample_type & PERF_SAMPLE_IP) {
2943 data->ip = perf_instruction_pointer(regs);
2944
2945 header->size += sizeof(data->ip);
2946 }
2947
2948 if (sample_type & PERF_SAMPLE_TID) {
2949 /* namespace issues */
2950 data->tid_entry.pid = perf_event_pid(event, current);
2951 data->tid_entry.tid = perf_event_tid(event, current);
2952
2953 header->size += sizeof(data->tid_entry);
2954 }
2955
2956 if (sample_type & PERF_SAMPLE_TIME) {
2957 data->time = perf_clock();
2958
2959 header->size += sizeof(data->time);
2960 }
2961
2962 if (sample_type & PERF_SAMPLE_ADDR)
2963 header->size += sizeof(data->addr);
2964
2965 if (sample_type & PERF_SAMPLE_ID) {
2966 data->id = primary_event_id(event);
2967
2968 header->size += sizeof(data->id);
2969 }
2970
2971 if (sample_type & PERF_SAMPLE_STREAM_ID) {
2972 data->stream_id = event->id;
2973
2974 header->size += sizeof(data->stream_id);
2975 }
2976
2977 if (sample_type & PERF_SAMPLE_CPU) {
2978 data->cpu_entry.cpu = raw_smp_processor_id();
2979 data->cpu_entry.reserved = 0;
2980
2981 header->size += sizeof(data->cpu_entry);
2982 }
2983
2984 if (sample_type & PERF_SAMPLE_PERIOD)
2985 header->size += sizeof(data->period);
2986
2987 if (sample_type & PERF_SAMPLE_READ)
2988 header->size += perf_event_read_size(event);
2989
2990 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
2991 int size = 1;
2992
2993 data->callchain = perf_callchain(regs);
2994
2995 if (data->callchain)
2996 size += data->callchain->nr;
2997
2998 header->size += size * sizeof(u64);
2999 }
3000
3001 if (sample_type & PERF_SAMPLE_RAW) {
3002 int size = sizeof(u32);
3003
3004 if (data->raw)
3005 size += data->raw->size;
3006 else
3007 size += sizeof(u32);
3008
3009 WARN_ON_ONCE(size & (sizeof(u64)-1));
3010 header->size += size;
3011 }
3012}
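
/*
 * [Editor's note -- illustrative record layout] For
 * sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_TIME,
 * the two functions above emit, in order:
 *
 *	struct perf_event_header header;	// type = PERF_RECORD_SAMPLE
 *	u64 ip;
 *	u32 pid, tid;
 *	u64 time;
 *
 * header.size is the byte total accumulated in perf_prepare_sample(), so
 * a reader can walk the ring buffer record by record using header.size
 * alone.
 */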
3013
3014static void perf_event_output(struct perf_event *event, int nmi,
3015 struct perf_sample_data *data,
3016 struct pt_regs *regs)
3017{
3018 struct perf_output_handle handle;
3019 struct perf_event_header header;
3020
3021 perf_prepare_sample(&header, data, event, regs);
3022
3023 if (perf_output_begin(&handle, event, header.size, nmi, 1))
3024 return;
3025
3026 perf_output_sample(&handle, &header, data, event);
3027
3028 perf_output_end(&handle);
3029}
3030
3031/*
3032 * read event
3033 */
3034
3035struct perf_read_event {
3036 struct perf_event_header header;
3037
3038 u32 pid;
3039 u32 tid;
3040};
3041
3042static void
3043perf_event_read_event(struct perf_event *event,
3044 struct task_struct *task)
3045{
3046 struct perf_output_handle handle;
3047 struct perf_read_event read_event = {
3048 .header = {
3049 .type = PERF_RECORD_READ,
3050 .misc = 0,
3051 .size = sizeof(read_event) + perf_event_read_size(event),
3052 },
3053 .pid = perf_event_pid(event, task),
3054 .tid = perf_event_tid(event, task),
3055 };
3056 int ret;
3057
3058 ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
3059 if (ret)
3060 return;
3061
3062 perf_output_put(&handle, read_event);
3063 perf_output_read(&handle, event);
3064
3065 perf_output_end(&handle);
3066}
3067
3068/*
3069 * task tracking -- fork/exit
3070 *
3071 * enabled by: attr.comm | attr.mmap | attr.task
3072 */
3073
3074struct perf_task_event {
3075 struct task_struct *task;
3076 struct perf_event_context *task_ctx;
3077
3078 struct {
3079 struct perf_event_header header;
3080
3081 u32 pid;
3082 u32 ppid;
3083 u32 tid;
3084 u32 ptid;
3085 u64 time;
3086 } event_id;
3087};
3088
3089static void perf_event_task_output(struct perf_event *event,
3090 struct perf_task_event *task_event)
3091{
3092 struct perf_output_handle handle;
3093 int size;
3094 struct task_struct *task = task_event->task;
3095 int ret;
3096
3097 size = task_event->event_id.header.size;
3098 ret = perf_output_begin(&handle, event, size, 0, 0);
3099
3100 if (ret)
3101 return;
3102
3103 task_event->event_id.pid = perf_event_pid(event, task);
3104 task_event->event_id.ppid = perf_event_pid(event, current);
3105
3106 task_event->event_id.tid = perf_event_tid(event, task);
3107 task_event->event_id.ptid = perf_event_tid(event, current);
3108
3109 task_event->event_id.time = perf_clock();
3110
3111 perf_output_put(&handle, task_event->event_id);
3112
3113 perf_output_end(&handle);
3114}
3115
3116static int perf_event_task_match(struct perf_event *event)
3117{
3118 if (event->attr.comm || event->attr.mmap || event->attr.task)
3119 return 1;
3120
3121 return 0;
3122}
3123
3124static void perf_event_task_ctx(struct perf_event_context *ctx,
3125 struct perf_task_event *task_event)
3126{
3127 struct perf_event *event;
3128
3129 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3130 return;
3131
3132 rcu_read_lock();
3133 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3134 if (perf_event_task_match(event))
3135 perf_event_task_output(event, task_event);
3136 }
3137 rcu_read_unlock();
3138}
3139
3140static void perf_event_task_event(struct perf_task_event *task_event)
3141{
3142 struct perf_cpu_context *cpuctx;
3143 struct perf_event_context *ctx = task_event->task_ctx;
3144
3145 cpuctx = &get_cpu_var(perf_cpu_context);
3146 perf_event_task_ctx(&cpuctx->ctx, task_event);
3147 put_cpu_var(perf_cpu_context);
3148
3149 rcu_read_lock();
3150 if (!ctx)
3151 ctx = rcu_dereference(task_event->task->perf_event_ctxp);
3152 if (ctx)
3153 perf_event_task_ctx(ctx, task_event);
3154 rcu_read_unlock();
3155}
3156
3157static void perf_event_task(struct task_struct *task,
3158 struct perf_event_context *task_ctx,
3159 int new)
3160{
3161 struct perf_task_event task_event;
3162
3163 if (!atomic_read(&nr_comm_events) &&
3164 !atomic_read(&nr_mmap_events) &&
3165 !atomic_read(&nr_task_events))
3166 return;
3167
3168 task_event = (struct perf_task_event){
3169 .task = task,
3170 .task_ctx = task_ctx,
3171 .event_id = {
3172 .header = {
3173 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
3174 .misc = 0,
3175 .size = sizeof(task_event.event_id),
3176 },
3177 /* .pid */
3178 /* .ppid */
3179 /* .tid */
3180 /* .ptid */
3181 },
3182 };
3183
3184 perf_event_task_event(&task_event);
3185}
3186
3187void perf_event_fork(struct task_struct *task)
3188{
3189 perf_event_task(task, NULL, 1);
3190}
3191
3192/*
3193 * comm tracking
3194 */
3195
3196struct perf_comm_event {
3197 struct task_struct *task;
3198 char *comm;
3199 int comm_size;
3200
3201 struct {
3202 struct perf_event_header header;
3203
3204 u32 pid;
3205 u32 tid;
3206 } event_id;
3207};
3208
3209static void perf_event_comm_output(struct perf_event *event,
3210 struct perf_comm_event *comm_event)
3211{
3212 struct perf_output_handle handle;
3213 int size = comm_event->event_id.header.size;
3214 int ret = perf_output_begin(&handle, event, size, 0, 0);
3215
3216 if (ret)
3217 return;
3218
3219 comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
3220 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
3221
3222 perf_output_put(&handle, comm_event->event_id);
3223 perf_output_copy(&handle, comm_event->comm,
3224 comm_event->comm_size);
3225 perf_output_end(&handle);
3226}
3227
3228static int perf_event_comm_match(struct perf_event *event)
3229{
3230 if (event->attr.comm)
3231 return 1;
3232
3233 return 0;
3234}
3235
3236static void perf_event_comm_ctx(struct perf_event_context *ctx,
3237 struct perf_comm_event *comm_event)
3238{
3239 struct perf_event *event;
3240
3241 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3242 return;
3243
3244 rcu_read_lock();
3245 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3246 if (perf_event_comm_match(event))
3247 perf_event_comm_output(event, comm_event);
3248 }
3249 rcu_read_unlock();
3250}
3251
3252static void perf_event_comm_event(struct perf_comm_event *comm_event)
3253{
3254 struct perf_cpu_context *cpuctx;
3255 struct perf_event_context *ctx;
3256 unsigned int size;
3257 char comm[TASK_COMM_LEN];
3258
3259 memset(comm, 0, sizeof(comm));
3260 strncpy(comm, comm_event->task->comm, sizeof(comm));
3261 size = ALIGN(strlen(comm)+1, sizeof(u64));
3262
3263 comm_event->comm = comm;
3264 comm_event->comm_size = size;
3265
3266 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
3267
3268 cpuctx = &get_cpu_var(perf_cpu_context);
3269 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3270 put_cpu_var(perf_cpu_context);
3271
3272 rcu_read_lock();
3273 /*
3274 * doesn't really matter which of the child contexts the
3275 * event ends up in.
3276 */
3277 ctx = rcu_dereference(current->perf_event_ctxp);
3278 if (ctx)
3279 perf_event_comm_ctx(ctx, comm_event);
3280 rcu_read_unlock();
3281}
3282
3283void perf_event_comm(struct task_struct *task)
3284{
3285 struct perf_comm_event comm_event;
3286
3287 if (task->perf_event_ctxp)
3288 perf_event_enable_on_exec(task);
3289
3290 if (!atomic_read(&nr_comm_events))
3291 return;
3292
3293 comm_event = (struct perf_comm_event){
3294 .task = task,
3295 /* .comm */
3296 /* .comm_size */
3297 .event_id = {
3298 .header = {
3299 .type = PERF_RECORD_COMM,
3300 .misc = 0,
3301 /* .size */
3302 },
3303 /* .pid */
3304 /* .tid */
3305 },
3306 };
3307
3308 perf_event_comm_event(&comm_event);
3309}
3310
3311/*
3312 * mmap tracking
3313 */
3314
3315struct perf_mmap_event {
3316 struct vm_area_struct *vma;
3317
3318 const char *file_name;
3319 int file_size;
3320
3321 struct {
3322 struct perf_event_header header;
3323
3324 u32 pid;
3325 u32 tid;
3326 u64 start;
3327 u64 len;
3328 u64 pgoff;
3329 } event_id;
3330};
3331
3332static void perf_event_mmap_output(struct perf_event *event,
3333 struct perf_mmap_event *mmap_event)
3334{
3335 struct perf_output_handle handle;
3336 int size = mmap_event->event_id.header.size;
3337 int ret = perf_output_begin(&handle, event, size, 0, 0);
3338
3339 if (ret)
3340 return;
3341
3342 mmap_event->event_id.pid = perf_event_pid(event, current);
3343 mmap_event->event_id.tid = perf_event_tid(event, current);
3344
3345 perf_output_put(&handle, mmap_event->event_id);
3346 perf_output_copy(&handle, mmap_event->file_name,
3347 mmap_event->file_size);
3348 perf_output_end(&handle);
3349}
3350
3351static int perf_event_mmap_match(struct perf_event *event,
3352 struct perf_mmap_event *mmap_event)
3353{
3354 if (event->attr.mmap)
3355 return 1;
3356
3357 return 0;
3358}
3359
3360static void perf_event_mmap_ctx(struct perf_event_context *ctx,
3361 struct perf_mmap_event *mmap_event)
3362{
3363 struct perf_event *event;
3364
3365 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3366 return;
3367
3368 rcu_read_lock();
3369 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3370 if (perf_event_mmap_match(event, mmap_event))
3371 perf_event_mmap_output(event, mmap_event);
3372 }
3373 rcu_read_unlock();
3374}
3375
3376static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
3377{
3378 struct perf_cpu_context *cpuctx;
3379 struct perf_event_context *ctx;
3380 struct vm_area_struct *vma = mmap_event->vma;
3381 struct file *file = vma->vm_file;
3382 unsigned int size;
3383 char tmp[16];
3384 char *buf = NULL;
3385 const char *name;
3386
3387 memset(tmp, 0, sizeof(tmp));
3388
3389 if (file) {
3390 /*
3391 * d_path works from the end of the buffer backwards, so we
3392 * need to add enough zero bytes after the string to handle
3393 * the 64bit alignment we do later.
3394 */
3395 buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
3396 if (!buf) {
3397 name = strncpy(tmp, "//enomem", sizeof(tmp));
3398 goto got_name;
3399 }
3400 name = d_path(&file->f_path, buf, PATH_MAX);
3401 if (IS_ERR(name)) {
3402 name = strncpy(tmp, "//toolong", sizeof(tmp));
3403 goto got_name;
3404 }
3405 } else {
3406 if (arch_vma_name(mmap_event->vma)) {
3407 name = strncpy(tmp, arch_vma_name(mmap_event->vma),
3408 sizeof(tmp));
3409 goto got_name;
3410 }
3411
3412 if (!vma->vm_mm) {
3413 name = strncpy(tmp, "[vdso]", sizeof(tmp));
3414 goto got_name;
3415 }
3416
3417 name = strncpy(tmp, "//anon", sizeof(tmp));
3418 goto got_name;
3419 }
3420
3421got_name:
3422 size = ALIGN(strlen(name)+1, sizeof(u64));
3423
3424 mmap_event->file_name = name;
3425 mmap_event->file_size = size;
3426
3427 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
3428
3429 cpuctx = &get_cpu_var(perf_cpu_context);
3430 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
3431 put_cpu_var(perf_cpu_context);
3432
3433 rcu_read_lock();
3434 /*
3435 * doesn't really matter which of the child contexts the
3436 * event ends up in.
3437 */
3438 ctx = rcu_dereference(current->perf_event_ctxp);
3439 if (ctx)
3440 perf_event_mmap_ctx(ctx, mmap_event);
3441 rcu_read_unlock();
3442
3443 kfree(buf);
3444}
3445
3446void __perf_event_mmap(struct vm_area_struct *vma)
3447{
3448 struct perf_mmap_event mmap_event;
3449
3450 if (!atomic_read(&nr_mmap_events))
3451 return;
3452
3453 mmap_event = (struct perf_mmap_event){
3454 .vma = vma,
3455 /* .file_name */
3456 /* .file_size */
3457 .event_id = {
3458 .header = {
3459 .type = PERF_RECORD_MMAP,
3460 .misc = 0,
3461 /* .size */
3462 },
3463 /* .pid */
3464 /* .tid */
3465 .start = vma->vm_start,
3466 .len = vma->vm_end - vma->vm_start,
3467 .pgoff = vma->vm_pgoff,
3468 },
3469 };
3470
3471 perf_event_mmap_event(&mmap_event);
3472}
3473
3474/*
3475 * IRQ throttle logging
3476 */
3477
3478static void perf_log_throttle(struct perf_event *event, int enable)
3479{
3480 struct perf_output_handle handle;
3481 int ret;
3482
3483 struct {
3484 struct perf_event_header header;
3485 u64 time;
3486 u64 id;
3487 u64 stream_id;
3488 } throttle_event = {
3489 .header = {
3490 .type = PERF_RECORD_THROTTLE,
3491 .misc = 0,
3492 .size = sizeof(throttle_event),
3493 },
3494 .time = perf_clock(),
3495 .id = primary_event_id(event),
3496 .stream_id = event->id,
3497 };
3498
3499 if (enable)
3500 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
3501
3502 ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0);
3503 if (ret)
3504 return;
3505
3506 perf_output_put(&handle, throttle_event);
3507 perf_output_end(&handle);
3508}
3509
3510/*
3511 * Generic event overflow handling, sampling.
3512 */
3513
3514static int __perf_event_overflow(struct perf_event *event, int nmi,
3515 int throttle, struct perf_sample_data *data,
3516 struct pt_regs *regs)
3517{
3518 int events = atomic_read(&event->event_limit);
3519 struct hw_perf_event *hwc = &event->hw;
3520 int ret = 0;
3521
3522 throttle = (throttle && event->pmu->unthrottle != NULL);
3523
3524 if (!throttle) {
3525 hwc->interrupts++;
3526 } else {
3527 if (hwc->interrupts != MAX_INTERRUPTS) {
3528 hwc->interrupts++;
3529 if (HZ * hwc->interrupts >
3530 (u64)sysctl_perf_event_sample_rate) {
3531 hwc->interrupts = MAX_INTERRUPTS;
3532 perf_log_throttle(event, 0);
3533 ret = 1;
3534 }
3535 } else {
3536 /*
3537 * Keep re-disabling the event even though we disabled it
3538 * on the previous pass - just in case we raced with a
3539 * sched-in and the event got enabled again:
3540 */
3541 ret = 1;
3542 }
3543 }
3544
3545 if (event->attr.freq) {
3546 u64 now = perf_clock();
3547 s64 delta = now - hwc->freq_stamp;
3548
3549 hwc->freq_stamp = now;
3550
3551 if (delta > 0 && delta < TICK_NSEC)
3552 perf_adjust_period(event, NSEC_PER_SEC / (int)delta);
3553 }
3554
3555 /*
3556 * XXX event_limit might not quite work as expected on inherited
3557 * events
3558 */
3559
3560 event->pending_kill = POLL_IN;
3561 if (events && atomic_dec_and_test(&event->event_limit)) {
3562 ret = 1;
3563 event->pending_kill = POLL_HUP;
3564 if (nmi) {
3565 event->pending_disable = 1;
3566 perf_pending_queue(&event->pending,
3567 perf_pending_event);
3568 } else
3569 perf_event_disable(event);
3570 }
3571
3572 perf_event_output(event, nmi, data, regs);
3573 return ret;
3574}
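
/*
 * [Editor's note -- worked example for the throttle check above]
 * Assuming HZ = 1000 and a sample-rate limit of 100000, the condition
 * HZ * hwc->interrupts > sysctl_perf_event_sample_rate trips once the
 * event has taken more than 100 interrupts since its counter was last
 * reset.  The event is then marked MAX_INTERRUPTS, a PERF_RECORD_THROTTLE
 * record is logged, and the caller is told to stop (ret = 1); it stays
 * throttled until hwc->interrupts is reset and perf_log_throttle(event, 1)
 * emits the matching PERF_RECORD_UNTHROTTLE (from the tick path, not
 * shown here).
 */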
3575
3576int perf_event_overflow(struct perf_event *event, int nmi,
3577 struct perf_sample_data *data,
3578 struct pt_regs *regs)
3579{
3580 return __perf_event_overflow(event, nmi, 1, data, regs);
3581}
3582
3583/*
3584 * Generic software event infrastructure
3585 */
3586
3587/*
3588 * We directly increment event->count and keep a second value in
3589 * event->hw.period_left to count intervals. This period value
3590 * is kept in the range [-sample_period, 0] so that we can use the
3591 * sign as trigger.
3592 */
3593
3594static u64 perf_swevent_set_period(struct perf_event *event)
3595{
3596 struct hw_perf_event *hwc = &event->hw;
3597 u64 period = hwc->last_period;
3598 u64 nr, offset;
3599 s64 old, val;
3600
3601 hwc->last_period = hwc->sample_period;
3602
3603again:
3604 old = val = atomic64_read(&hwc->period_left);
3605 if (val < 0)
3606 return 0;
3607
3608 nr = div64_u64(period + val, period);
3609 offset = nr * period;
3610 val -= offset;
3611 if (atomic64_cmpxchg(&hwc->period_left, old, val) != old)
3612 goto again;
3613
3614 return nr;
3615}
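
/*
 * [Editor's note -- worked example for perf_swevent_set_period()] With
 * sample_period = 100 and period_left having climbed to +250 by the time
 * we get here:
 *
 *	nr     = (100 + 250) / 100 = 3		three overflows to report
 *	offset = 3 * 100           = 300
 *	val    = 250 - 300         = -50	50 events until the next one
 *
 * The cmpxchg() retries the whole calculation if a racing update (e.g.
 * from NMI context) moved period_left underneath us.
 */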
3616
3617static void perf_swevent_overflow(struct perf_event *event,
3618 int nmi, struct perf_sample_data *data,
3619 struct pt_regs *regs)
3620{
3621 struct hw_perf_event *hwc = &event->hw;
3622 int throttle = 0;
3623 u64 overflow;
3624
3625 data->period = event->hw.last_period;
3626 overflow = perf_swevent_set_period(event);
3627
3628 if (hwc->interrupts == MAX_INTERRUPTS)
3629 return;
3630
3631 for (; overflow; overflow--) {
3632 if (__perf_event_overflow(event, nmi, throttle,
3633 data, regs)) {
3634 /*
3635 * We inhibit the overflow from happening when
3636 * hwc->interrupts == MAX_INTERRUPTS.
3637 */
3638 break;
3639 }
3640 throttle = 1;
3641 }
3642}
3643
3644static void perf_swevent_unthrottle(struct perf_event *event)
3645{
3646 /*
3647 * Nothing to do, we already reset hwc->interrupts.
3648 */
3649}
3650
3651static void perf_swevent_add(struct perf_event *event, u64 nr,
3652 int nmi, struct perf_sample_data *data,
3653 struct pt_regs *regs)
3654{
3655 struct hw_perf_event *hwc = &event->hw;
3656
3657 atomic64_add(nr, &event->count);
3658
3659 if (!hwc->sample_period)
3660 return;
3661
3662 if (!regs)
3663 return;
3664
3665 if (!atomic64_add_negative(nr, &hwc->period_left))
3666 perf_swevent_overflow(event, nmi, data, regs);
3667}
3668
3669static int perf_swevent_is_counting(struct perf_event *event)
3670{
3671 /*
3672 * The event is active, we're good!
3673 */
3674 if (event->state == PERF_EVENT_STATE_ACTIVE)
3675 return 1;
3676
3677 /*
3678 * The event is off/error, not counting.
3679 */
3680 if (event->state != PERF_EVENT_STATE_INACTIVE)
3681 return 0;
3682
3683 /*
3684 * The event is inactive; if the context is active
3685 * we're part of a group that didn't make it onto the 'pmu',
3686 * so it's not counting.
3687 */
3688 if (event->ctx->is_active)
3689 return 0;
3690
3691 /*
3692 * We're inactive and the context is too; this means the
3693 * task is scheduled out, and we're counting events that happen
3694 * to us, like migration events.
3695 */
3696 return 1;
3697}
3698
3699static int perf_swevent_match(struct perf_event *event,
3700 enum perf_type_id type,
3701 u32 event_id, struct pt_regs *regs)
3702{
3703 if (!perf_swevent_is_counting(event))
3704 return 0;
3705
3706 if (event->attr.type != type)
3707 return 0;
3708 if (event->attr.config != event_id)
3709 return 0;
3710
3711 if (regs) {
3712 if (event->attr.exclude_user && user_mode(regs))
3713 return 0;
3714
3715 if (event->attr.exclude_kernel && !user_mode(regs))
3716 return 0;
3717 }
3718
3719 return 1;
3720}
3721
3722static void perf_swevent_ctx_event(struct perf_event_context *ctx,
3723 enum perf_type_id type,
3724 u32 event_id, u64 nr, int nmi,
3725 struct perf_sample_data *data,
3726 struct pt_regs *regs)
3727{
3728 struct perf_event *event;
3729
3730 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
3731 return;
3732
3733 rcu_read_lock();
3734 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3735 if (perf_swevent_match(event, type, event_id, regs))
3736 perf_swevent_add(event, nr, nmi, data, regs);
3737 }
3738 rcu_read_unlock();
3739}
3740
3741static int *perf_swevent_recursion_context(struct perf_cpu_context *cpuctx)
3742{
3743 if (in_nmi())
3744 return &cpuctx->recursion[3];
3745
3746 if (in_irq())
3747 return &cpuctx->recursion[2];
3748
3749 if (in_softirq())
3750 return &cpuctx->recursion[1];
3751
3752 return &cpuctx->recursion[0];
3753}
3754
3755static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
3756 u64 nr, int nmi,
3757 struct perf_sample_data *data,
3758 struct pt_regs *regs)
3759{
3760 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
3761 int *recursion = perf_swevent_recursion_context(cpuctx);
3762 struct perf_event_context *ctx;
3763
3764 if (*recursion)
3765 goto out;
3766
3767 (*recursion)++;
3768 barrier();
3769
3770 perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
3771 nr, nmi, data, regs);
3772 rcu_read_lock();
3773 /*
3774 * doesn't really matter which of the child contexts the
3775 * events ends up in.
3776 */
3777 ctx = rcu_dereference(current->perf_event_ctxp);
3778 if (ctx)
3779 perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
3780 rcu_read_unlock();
3781
3782 barrier();
3783 (*recursion)--;
3784
3785out:
3786 put_cpu_var(perf_cpu_context);
3787}
3788
3789void __perf_sw_event(u32 event_id, u64 nr, int nmi,
3790 struct pt_regs *regs, u64 addr)
3791{
3792 struct perf_sample_data data = {
3793 .addr = addr,
3794 };
3795
3796 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi,
3797 &data, regs);
3798}
3799
3800static void perf_swevent_read(struct perf_event *event)
3801{
3802}
3803
3804static int perf_swevent_enable(struct perf_event *event)
3805{
3806 struct hw_perf_event *hwc = &event->hw;
3807
3808 if (hwc->sample_period) {
3809 hwc->last_period = hwc->sample_period;
3810 perf_swevent_set_period(event);
3811 }
3812 return 0;
3813}
3814
3815static void perf_swevent_disable(struct perf_event *event)
3816{
3817}
3818
3819static const struct pmu perf_ops_generic = {
3820 .enable = perf_swevent_enable,
3821 .disable = perf_swevent_disable,
3822 .read = perf_swevent_read,
3823 .unthrottle = perf_swevent_unthrottle,
3824};
3825
3826/*
3827 * hrtimer based swevent callback
3828 */
3829
3830static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
3831{
3832 enum hrtimer_restart ret = HRTIMER_RESTART;
3833 struct perf_sample_data data;
3834 struct pt_regs *regs;
3835 struct perf_event *event;
3836 u64 period;
3837
3838 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
3839 event->pmu->read(event);
3840
3841 data.addr = 0;
3842 regs = get_irq_regs();
3843 /*
3844 * In case we exclude kernel IPs or are somehow not in interrupt
3845 * context, provide the next best thing, the user IP.
3846 */
3847 if ((event->attr.exclude_kernel || !regs) &&
3848 !event->attr.exclude_user)
3849 regs = task_pt_regs(current);
3850
3851 if (regs) {
3852 if (perf_event_overflow(event, 0, &data, regs))
3853 ret = HRTIMER_NORESTART;
3854 }
3855
3856 period = max_t(u64, 10000, event->hw.sample_period);
3857 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
3858
3859 return ret;
3860}
3861
3862/*
3863 * Software event: cpu wall time clock
3864 */
3865
3866static void cpu_clock_perf_event_update(struct perf_event *event)
3867{
3868 int cpu = raw_smp_processor_id();
3869 s64 prev;
3870 u64 now;
3871
3872 now = cpu_clock(cpu);
3873 prev = atomic64_read(&event->hw.prev_count);
3874 atomic64_set(&event->hw.prev_count, now);
3875 atomic64_add(now - prev, &event->count);
3876}
3877
3878static int cpu_clock_perf_event_enable(struct perf_event *event)
3879{
3880 struct hw_perf_event *hwc = &event->hw;
3881 int cpu = raw_smp_processor_id();
3882
3883 atomic64_set(&hwc->prev_count, cpu_clock(cpu));
3884 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3885 hwc->hrtimer.function = perf_swevent_hrtimer;
3886 if (hwc->sample_period) {
3887 u64 period = max_t(u64, 10000, hwc->sample_period);
3888 __hrtimer_start_range_ns(&hwc->hrtimer,
3889 ns_to_ktime(period), 0,
3890 HRTIMER_MODE_REL, 0);
3891 }
3892
3893 return 0;
3894}
3895
3896static void cpu_clock_perf_event_disable(struct perf_event *event)
3897{
3898 if (event->hw.sample_period)
3899 hrtimer_cancel(&event->hw.hrtimer);
3900 cpu_clock_perf_event_update(event);
3901}
3902
3903static void cpu_clock_perf_event_read(struct perf_event *event)
3904{
3905 cpu_clock_perf_event_update(event);
3906}
3907
3908static const struct pmu perf_ops_cpu_clock = {
3909 .enable = cpu_clock_perf_event_enable,
3910 .disable = cpu_clock_perf_event_disable,
3911 .read = cpu_clock_perf_event_read,
3912};
3913
3914/*
3915 * Software event: task time clock
3916 */
3917
3918static void task_clock_perf_event_update(struct perf_event *event, u64 now)
3919{
3920 u64 prev;
3921 s64 delta;
3922
3923 prev = atomic64_xchg(&event->hw.prev_count, now);
3924 delta = now - prev;
3925 atomic64_add(delta, &event->count);
3926}
3927
3928static int task_clock_perf_event_enable(struct perf_event *event)
3929{
3930 struct hw_perf_event *hwc = &event->hw;
3931 u64 now;
3932
3933 now = event->ctx->time;
3934
3935 atomic64_set(&hwc->prev_count, now);
3936 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3937 hwc->hrtimer.function = perf_swevent_hrtimer;
3938 if (hwc->sample_period) {
3939 u64 period = max_t(u64, 10000, hwc->sample_period);
3940 __hrtimer_start_range_ns(&hwc->hrtimer,
3941 ns_to_ktime(period), 0,
3942 HRTIMER_MODE_REL, 0);
3943 }
3944
3945 return 0;
3946}
3947
3948static void task_clock_perf_event_disable(struct perf_event *event)
3949{
3950 if (event->hw.sample_period)
3951 hrtimer_cancel(&event->hw.hrtimer);
3952 task_clock_perf_event_update(event, event->ctx->time);
3953
3954}
3955
3956static void task_clock_perf_event_read(struct perf_event *event)
3957{
3958 u64 time;
3959
3960 if (!in_nmi()) {
3961 update_context_time(event->ctx);
3962 time = event->ctx->time;
3963 } else {
3964 u64 now = perf_clock();
3965 u64 delta = now - event->ctx->timestamp;
3966 time = event->ctx->time + delta;
3967 }
3968
3969 task_clock_perf_event_update(event, time);
3970}
3971
3972static const struct pmu perf_ops_task_clock = {
3973 .enable = task_clock_perf_event_enable,
3974 .disable = task_clock_perf_event_disable,
3975 .read = task_clock_perf_event_read,
3976};
3977
3978#ifdef CONFIG_EVENT_PROFILE
3979void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
3980 int entry_size)
3981{
3982 struct perf_raw_record raw = {
3983 .size = entry_size,
3984 .data = record,
3985 };
3986
3987 struct perf_sample_data data = {
3988 .addr = addr,
3989 .raw = &raw,
3990 };
3991
3992 struct pt_regs *regs = get_irq_regs();
3993
3994 if (!regs)
3995 regs = task_pt_regs(current);
3996
3997 do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
3998 &data, regs);
3999}
4000EXPORT_SYMBOL_GPL(perf_tp_event);
4001
4002extern int ftrace_profile_enable(int);
4003extern void ftrace_profile_disable(int);
4004
4005static void tp_perf_event_destroy(struct perf_event *event)
4006{
4007 ftrace_profile_disable(event->attr.config);
4008}
4009
4010static const struct pmu *tp_perf_event_init(struct perf_event *event)
4011{
4012 /*
4013 * Raw tracepoint data is a severe data leak; only allow root to
4014 * have these.
4015 */
4016 if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
4017 perf_paranoid_tracepoint_raw() &&
4018 !capable(CAP_SYS_ADMIN))
4019 return ERR_PTR(-EPERM);
4020
4021 if (ftrace_profile_enable(event->attr.config))
4022 return NULL;
4023
4024 event->destroy = tp_perf_event_destroy;
4025
4026 return &perf_ops_generic;
4027}
4028#else
4029static const struct pmu *tp_perf_event_init(struct perf_event *event)
4030{
4031 return NULL;
4032}
4033#endif
4034
4035atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
4036
4037static void sw_perf_event_destroy(struct perf_event *event)
4038{
4039 u64 event_id = event->attr.config;
4040
4041 WARN_ON(event->parent);
4042
4043 atomic_dec(&perf_swevent_enabled[event_id]);
4044}
4045
4046static const struct pmu *sw_perf_event_init(struct perf_event *event)
4047{
4048 const struct pmu *pmu = NULL;
4049 u64 event_id = event->attr.config;
4050
4051 /*
4052 * Software events (currently) can't in general distinguish
4053 * between user, kernel and hypervisor events.
4054 * However, context switches and cpu migrations are considered
4055 * to be kernel events, and page faults are never hypervisor
4056 * events.
4057 */
4058 switch (event_id) {
4059 case PERF_COUNT_SW_CPU_CLOCK:
4060 pmu = &perf_ops_cpu_clock;
4061
4062 break;
4063 case PERF_COUNT_SW_TASK_CLOCK:
4064 /*
4065 * If the user instantiates this as a per-cpu event,
4066 * use the cpu_clock event instead.
4067 */
4068 if (event->ctx->task)
4069 pmu = &perf_ops_task_clock;
4070 else
4071 pmu = &perf_ops_cpu_clock;
4072
4073 break;
4074 case PERF_COUNT_SW_PAGE_FAULTS:
4075 case PERF_COUNT_SW_PAGE_FAULTS_MIN:
4076 case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
4077 case PERF_COUNT_SW_CONTEXT_SWITCHES:
4078 case PERF_COUNT_SW_CPU_MIGRATIONS:
4079 if (!event->parent) {
4080 atomic_inc(&perf_swevent_enabled[event_id]);
4081 event->destroy = sw_perf_event_destroy;
4082 }
4083 pmu = &perf_ops_generic;
4084 break;
4085 }
4086
4087 return pmu;
4088}
4089
4090/*
4091 * Allocate and initialize an event structure
4092 */
4093static struct perf_event *
4094perf_event_alloc(struct perf_event_attr *attr,
4095 int cpu,
4096 struct perf_event_context *ctx,
4097 struct perf_event *group_leader,
4098 struct perf_event *parent_event,
4099 gfp_t gfpflags)
4100{
4101 const struct pmu *pmu;
4102 struct perf_event *event;
4103 struct hw_perf_event *hwc;
4104 long err;
4105
4106 event = kzalloc(sizeof(*event), gfpflags);
4107 if (!event)
4108 return ERR_PTR(-ENOMEM);
4109
4110 /*
4111 * Single events are their own group leaders, with an
4112 * empty sibling list:
4113 */
4114 if (!group_leader)
4115 group_leader = event;
4116
4117 mutex_init(&event->child_mutex);
4118 INIT_LIST_HEAD(&event->child_list);
4119
4120 INIT_LIST_HEAD(&event->group_entry);
4121 INIT_LIST_HEAD(&event->event_entry);
4122 INIT_LIST_HEAD(&event->sibling_list);
4123 init_waitqueue_head(&event->waitq);
4124
4125 mutex_init(&event->mmap_mutex);
4126
4127 event->cpu = cpu;
4128 event->attr = *attr;
4129 event->group_leader = group_leader;
4130 event->pmu = NULL;
4131 event->ctx = ctx;
4132 event->oncpu = -1;
4133
4134 event->parent = parent_event;
4135
4136 event->ns = get_pid_ns(current->nsproxy->pid_ns);
4137 event->id = atomic64_inc_return(&perf_event_id);
4138
4139 event->state = PERF_EVENT_STATE_INACTIVE;
4140
4141 if (attr->disabled)
4142 event->state = PERF_EVENT_STATE_OFF;
4143
4144 pmu = NULL;
4145
4146 hwc = &event->hw;
4147 hwc->sample_period = attr->sample_period;
4148 if (attr->freq && attr->sample_freq)
4149 hwc->sample_period = 1;
4150 hwc->last_period = hwc->sample_period;
4151
4152 atomic64_set(&hwc->period_left, hwc->sample_period);
4153
4154 /*
4155 * we currently do not support PERF_FORMAT_GROUP on inherited events
4156 */
4157 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
4158 goto done;
4159
4160 switch (attr->type) {
4161 case PERF_TYPE_RAW:
4162 case PERF_TYPE_HARDWARE:
4163 case PERF_TYPE_HW_CACHE:
4164 pmu = hw_perf_event_init(event);
4165 break;
4166
4167 case PERF_TYPE_SOFTWARE:
4168 pmu = sw_perf_event_init(event);
4169 break;
4170
4171 case PERF_TYPE_TRACEPOINT:
4172 pmu = tp_perf_event_init(event);
4173 break;
4174
4175 default:
4176 break;
4177 }
4178done:
4179 err = 0;
4180 if (!pmu)
4181 err = -EINVAL;
4182 else if (IS_ERR(pmu))
4183 err = PTR_ERR(pmu);
4184
4185 if (err) {
4186 if (event->ns)
4187 put_pid_ns(event->ns);
4188 kfree(event);
4189 return ERR_PTR(err);
4190 }
4191
4192 event->pmu = pmu;
4193
4194 if (!event->parent) {
4195 atomic_inc(&nr_events);
4196 if (event->attr.mmap)
4197 atomic_inc(&nr_mmap_events);
4198 if (event->attr.comm)
4199 atomic_inc(&nr_comm_events);
4200 if (event->attr.task)
4201 atomic_inc(&nr_task_events);
4202 }
4203
4204 return event;
4205}
4206
4207static int perf_copy_attr(struct perf_event_attr __user *uattr,
4208 struct perf_event_attr *attr)
4209{
4210 u32 size;
4211 int ret;
4212
4213 if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
4214 return -EFAULT;
4215
4216 /*
4217 * zero the full structure, so that a short copy leaves the rest zeroed.
4218 */
4219 memset(attr, 0, sizeof(*attr));
4220
4221 ret = get_user(size, &uattr->size);
4222 if (ret)
4223 return ret;
4224
4225 if (size > PAGE_SIZE) /* silly large */
4226 goto err_size;
4227
4228 if (!size) /* abi compat */
4229 size = PERF_ATTR_SIZE_VER0;
4230
4231 if (size < PERF_ATTR_SIZE_VER0)
4232 goto err_size;
4233
4234 /*
4235 * If we're handed a bigger struct than we know of,
4236 * ensure all the unknown bits are 0 - i.e. new
4237 * user-space does not rely on any kernel feature
4238 * extensions we don't know about yet.
4239 */
4240 if (size > sizeof(*attr)) {
4241 unsigned char __user *addr;
4242 unsigned char __user *end;
4243 unsigned char val;
4244
4245 addr = (void __user *)uattr + sizeof(*attr);
4246 end = (void __user *)uattr + size;
4247
4248 for (; addr < end; addr++) {
4249 ret = get_user(val, addr);
4250 if (ret)
4251 return ret;
4252 if (val)
4253 goto err_size;
4254 }
4255 size = sizeof(*attr);
4256 }
4257
4258 ret = copy_from_user(attr, uattr, size);
4259 if (ret)
4260 return -EFAULT;
4261
4262 /*
4263 * If the type exists, the corresponding creation code will verify
4264 * attr->config.
4265 */
4266 if (attr->type >= PERF_TYPE_MAX)
4267 return -EINVAL;
4268
4269 if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
4270 return -EINVAL;
4271
4272 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
4273 return -EINVAL;
4274
4275 if (attr->read_format & ~(PERF_FORMAT_MAX-1))
4276 return -EINVAL;
4277
4278out:
4279 return ret;
4280
4281err_size:
4282 put_user(sizeof(*attr), &uattr->size);
4283 ret = -E2BIG;
4284 goto out;
4285}
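
/*
 * [Editor's sketch -- not part of this file] The user-space side of the
 * size handshake above: zero the structure and advertise the size it was
 * compiled against, so an older or newer kernel can interoperate.  The
 * helper name and field choices below are just an example:
 */
#include <string.h>
#include <linux/perf_event.h>

static void init_cycles_attr(struct perf_event_attr *attr)
{
	memset(attr, 0, sizeof(*attr));		/* bits unknown to us must be 0 */
	attr->size = sizeof(*attr);		/* the ABI revision we were built for */
	attr->type = PERF_TYPE_HARDWARE;
	attr->config = PERF_COUNT_HW_CPU_CYCLES;
	attr->disabled = 1;			/* enable explicitly later */
}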
4286
4287int perf_event_set_output(struct perf_event *event, int output_fd)
4288{
4289 struct perf_event *output_event = NULL;
4290 struct file *output_file = NULL;
4291 struct perf_event *old_output;
4292 int fput_needed = 0;
4293 int ret = -EINVAL;
4294
4295 if (!output_fd)
4296 goto set;
4297
4298 output_file = fget_light(output_fd, &fput_needed);
4299 if (!output_file)
4300 return -EBADF;
4301
4302 if (output_file->f_op != &perf_fops)
4303 goto out;
4304
4305 output_event = output_file->private_data;
4306
4307 /* Don't chain output fds */
4308 if (output_event->output)
4309 goto out;
4310
4311 /* Don't set an output fd when we already have an output channel */
4312 if (event->data)
4313 goto out;
4314
4315 atomic_long_inc(&output_file->f_count);
4316
4317set:
4318 mutex_lock(&event->mmap_mutex);
4319 old_output = event->output;
4320 rcu_assign_pointer(event->output, output_event);
4321 mutex_unlock(&event->mmap_mutex);
4322
4323 if (old_output) {
4324 /*
4325 * we need to make sure no existing perf_output_*()
4326 * is still referencing this event.
4327 */
4328 synchronize_rcu();
4329 fput(old_output->filp);
4330 }
4331
4332 ret = 0;
4333out:
4334 fput_light(output_file, fput_needed);
4335 return ret;
4336}
4337
4338/**
4339 * sys_perf_event_open - open a performance event, associate it to a task/cpu
4340 *
4341 * @attr_uptr: event type attributes for monitoring/sampling
4342 * @pid: target pid
4343 * @cpu: target cpu
4344 * @group_fd: group leader event fd
4345 */
4346SYSCALL_DEFINE5(perf_event_open,
4347 struct perf_event_attr __user *, attr_uptr,
4348 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
4349{
4350 struct perf_event *event, *group_leader;
4351 struct perf_event_attr attr;
4352 struct perf_event_context *ctx;
4353 struct file *event_file = NULL;
4354 struct file *group_file = NULL;
4355 int fput_needed = 0;
4356 int fput_needed2 = 0;
4357 int err;
4358
4359 /* for future expandability... */
4360 if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
4361 return -EINVAL;
4362
4363 err = perf_copy_attr(attr_uptr, &attr);
4364 if (err)
4365 return err;
4366
4367 if (!attr.exclude_kernel) {
4368 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
4369 return -EACCES;
4370 }
4371
4372 if (attr.freq) {
4373 if (attr.sample_freq > sysctl_perf_event_sample_rate)
4374 return -EINVAL;
4375 }
4376
4377 /*
4378 * Get the target context (task or percpu):
4379 */
4380 ctx = find_get_context(pid, cpu);
4381 if (IS_ERR(ctx))
4382 return PTR_ERR(ctx);
4383
4384 /*
4385 * Look up the group leader (we will attach this event to it):
4386 */
4387 group_leader = NULL;
4388 if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
4389 err = -EINVAL;
4390 group_file = fget_light(group_fd, &fput_needed);
4391 if (!group_file)
4392 goto err_put_context;
4393 if (group_file->f_op != &perf_fops)
4394 goto err_put_context;
4395
4396 group_leader = group_file->private_data;
4397 /*
4398 * Do not allow a recursive hierarchy (this new sibling
4399 * becoming part of another group-sibling):
4400 */
4401 if (group_leader->group_leader != group_leader)
4402 goto err_put_context;
4403 /*
4404 * Do not allow to attach to a group in a different
4405 * task or CPU context:
4406 */
4407 if (group_leader->ctx != ctx)
4408 goto err_put_context;
4409 /*
4410 * Only a group leader can be exclusive or pinned
4411 */
4412 if (attr.exclusive || attr.pinned)
4413 goto err_put_context;
4414 }
4415
4416 event = perf_event_alloc(&attr, cpu, ctx, group_leader,
4417 NULL, GFP_KERNEL);
4418 err = PTR_ERR(event);
4419 if (IS_ERR(event))
4420 goto err_put_context;
4421
4422 err = anon_inode_getfd("[perf_event]", &perf_fops, event, 0);
4423 if (err < 0)
4424 goto err_free_put_context;
4425
4426 event_file = fget_light(err, &fput_needed2);
4427 if (!event_file)
4428 goto err_free_put_context;
4429
4430 if (flags & PERF_FLAG_FD_OUTPUT) {
4431 err = perf_event_set_output(event, group_fd);
4432 if (err)
4433 goto err_fput_free_put_context;
4434 }
4435
4436 event->filp = event_file;
4437 WARN_ON_ONCE(ctx->parent_ctx);
4438 mutex_lock(&ctx->mutex);
4439 perf_install_in_context(ctx, event, cpu);
4440 ++ctx->generation;
4441 mutex_unlock(&ctx->mutex);
4442
4443 event->owner = current;
4444 get_task_struct(current);
4445 mutex_lock(&current->perf_event_mutex);
4446 list_add_tail(&event->owner_entry, &current->perf_event_list);
4447 mutex_unlock(&current->perf_event_mutex);
4448
4449err_fput_free_put_context:
4450 fput_light(event_file, fput_needed2);
4451
4452err_free_put_context:
4453 if (err < 0)
4454 kfree(event);
4455
4456err_put_context:
4457 if (err < 0)
4458 put_ctx(ctx);
4459
4460 fput_light(group_file, fput_needed);
4461
4462 return err;
4463}
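
/*
 * [Editor's sketch -- not part of this file] There is no libc wrapper for
 * this syscall, so user-space usually goes through syscall(2).  A hedged
 * example of opening a counter on the current task, any CPU, with no
 * group leader and no flags:
 */
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static int perf_event_open(struct perf_event_attr *attr, pid_t pid,
			   int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

/* usage: fd = perf_event_open(&attr, 0, -1, -1, 0);  pid 0 == current task */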
4464
4465/*
4466 * inherit an event from parent task to child task:
4467 */
4468static struct perf_event *
4469inherit_event(struct perf_event *parent_event,
4470 struct task_struct *parent,
4471 struct perf_event_context *parent_ctx,
4472 struct task_struct *child,
4473 struct perf_event *group_leader,
4474 struct perf_event_context *child_ctx)
4475{
4476 struct perf_event *child_event;
4477
4478 /*
4479 * Instead of creating recursive hierarchies of events,
4480 * we link inherited events back to the original parent,
4481 * which has a filp for sure, which we use as the reference
4482 * count:
4483 */
4484 if (parent_event->parent)
4485 parent_event = parent_event->parent;
4486
4487 child_event = perf_event_alloc(&parent_event->attr,
4488 parent_event->cpu, child_ctx,
4489 group_leader, parent_event,
4490 GFP_KERNEL);
4491 if (IS_ERR(child_event))
4492 return child_event;
4493 get_ctx(child_ctx);
4494
4495 /*
4496 * Make the child state follow the state of the parent event,
4497 * not its attr.disabled bit. We hold the parent's mutex,
4498 * so we won't race with perf_event_{en, dis}able_family.
4499 */
4500 if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
4501 child_event->state = PERF_EVENT_STATE_INACTIVE;
4502 else
4503 child_event->state = PERF_EVENT_STATE_OFF;
4504
4505 if (parent_event->attr.freq)
4506 child_event->hw.sample_period = parent_event->hw.sample_period;
4507
4508 /*
4509 * Link it up in the child's context:
4510 */
4511 add_event_to_ctx(child_event, child_ctx);
4512
4513 /*
4514 * Get a reference to the parent filp - we will fput it
4515 * when the child event exits. This is safe to do because
4516 * we are in the parent and we know that the filp still
4517 * exists and has a nonzero count:
4518 */
4519 atomic_long_inc(&parent_event->filp->f_count);
4520
4521 /*
4522 * Link this into the parent event's child list
4523 */
4524 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
4525 mutex_lock(&parent_event->child_mutex);
4526 list_add_tail(&child_event->child_list, &parent_event->child_list);
4527 mutex_unlock(&parent_event->child_mutex);
4528
4529 return child_event;
4530}
4531
4532static int inherit_group(struct perf_event *parent_event,
4533 struct task_struct *parent,
4534 struct perf_event_context *parent_ctx,
4535 struct task_struct *child,
4536 struct perf_event_context *child_ctx)
4537{
4538 struct perf_event *leader;
4539 struct perf_event *sub;
4540 struct perf_event *child_ctr;
4541
4542 leader = inherit_event(parent_event, parent, parent_ctx,
4543 child, NULL, child_ctx);
4544 if (IS_ERR(leader))
4545 return PTR_ERR(leader);
4546 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
4547 child_ctr = inherit_event(sub, parent, parent_ctx,
4548 child, leader, child_ctx);
4549 if (IS_ERR(child_ctr))
4550 return PTR_ERR(child_ctr);
4551 }
4552 return 0;
4553}
4554
4555static void sync_child_event(struct perf_event *child_event,
4556 struct task_struct *child)
4557{
4558 struct perf_event *parent_event = child_event->parent;
4559 u64 child_val;
4560
4561 if (child_event->attr.inherit_stat)
4562 perf_event_read_event(child_event, child);
4563
4564 child_val = atomic64_read(&child_event->count);
4565
4566 /*
4567 * Add back the child's count to the parent's count:
4568 */
4569 atomic64_add(child_val, &parent_event->count);
4570 atomic64_add(child_event->total_time_enabled,
4571 &parent_event->child_total_time_enabled);
4572 atomic64_add(child_event->total_time_running,
4573 &parent_event->child_total_time_running);
4574
4575 /*
4576 * Remove this event from the parent's list
4577 */
4578 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
4579 mutex_lock(&parent_event->child_mutex);
4580 list_del_init(&child_event->child_list);
4581 mutex_unlock(&parent_event->child_mutex);
4582
4583 /*
4584 * Release the parent event, if this was the last
4585 * reference to it.
4586 */
4587 fput(parent_event->filp);
4588}
4589
4590static void
4591__perf_event_exit_task(struct perf_event *child_event,
4592 struct perf_event_context *child_ctx,
4593 struct task_struct *child)
4594{
4595 struct perf_event *parent_event;
4596
4597 update_event_times(child_event);
4598 perf_event_remove_from_context(child_event);
4599
4600 parent_event = child_event->parent;
4601 /*
4602	 * It can happen that the parent exits first and still has events
4603	 * around due to the child reference. These events need to be
4604	 * zapped here - otherwise they would linger.
4605 */
4606 if (parent_event) {
4607 sync_child_event(child_event, child);
4608 free_event(child_event);
4609 }
4610}
4611
4612/*
4613 * When a child task exits, feed back event values to parent events.
4614 */
4615void perf_event_exit_task(struct task_struct *child)
4616{
4617 struct perf_event *child_event, *tmp;
4618 struct perf_event_context *child_ctx;
4619 unsigned long flags;
4620
4621 if (likely(!child->perf_event_ctxp)) {
4622 perf_event_task(child, NULL, 0);
4623 return;
4624 }
4625
4626 local_irq_save(flags);
4627 /*
4628 * We can't reschedule here because interrupts are disabled,
4629 * and either child is current or it is a task that can't be
4630 * scheduled, so we are now safe from rescheduling changing
4631 * our context.
4632 */
4633 child_ctx = child->perf_event_ctxp;
4634 __perf_event_task_sched_out(child_ctx);
4635
4636 /*
4637 * Take the context lock here so that if find_get_context is
4638 * reading child->perf_event_ctxp, we wait until it has
4639 * incremented the context's refcount before we do put_ctx below.
4640 */
4641 spin_lock(&child_ctx->lock);
4642 child->perf_event_ctxp = NULL;
4643 /*
4644 * If this context is a clone; unclone it so it can't get
4645 * swapped to another process while we're removing all
4646 * the events from it.
4647 */
4648 unclone_ctx(child_ctx);
4649 spin_unlock_irqrestore(&child_ctx->lock, flags);
4650
4651 /*
4652 * Report the task dead after unscheduling the events so that we
4653 * won't get any samples after PERF_RECORD_EXIT. We can however still
4654 * get a few PERF_RECORD_READ events.
4655 */
4656 perf_event_task(child, child_ctx, 0);
4657
4658 /*
4659 * We can recurse on the same lock type through:
4660 *
4661 * __perf_event_exit_task()
4662 * sync_child_event()
4663 * fput(parent_event->filp)
4664 * perf_release()
4665 * mutex_lock(&ctx->mutex)
4666 *
4667	 * But since it's the parent context, it won't be the same instance.
4668 */
4669 mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
4670
4671again:
4672 list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list,
4673 group_entry)
4674 __perf_event_exit_task(child_event, child_ctx, child);
4675
4676 /*
4677 * If the last event was a group event, it will have appended all
4678	 * its siblings to the list, but we obtained 'tmp' before that, which
4679 * will still point to the list head terminating the iteration.
4680 */
4681 if (!list_empty(&child_ctx->group_list))
4682 goto again;
4683
4684 mutex_unlock(&child_ctx->mutex);
4685
4686 put_ctx(child_ctx);
4687}
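/*
 * Editor's sketch (not part of the kernel source): the SINGLE_DEPTH_NESTING
 * annotation used above is the generic way to take two mutexes of the same
 * lock class at different levels of a hierarchy without a false lockdep
 * recursion report.  Names are made up:
 */
static void lock_parent_then_child(struct mutex *parent, struct mutex *child)
{
	mutex_lock(parent);				/* outer instance */
	mutex_lock_nested(child, SINGLE_DEPTH_NESTING);	/* inner instance, same class */
	/* ... operate on both objects ... */
	mutex_unlock(child);
	mutex_unlock(parent);
}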
4688
4689/*
4690 * Free an unexposed, unused context as created by inheritance in
4691 * perf_event_init_task() below; used by fork() in case of failure.
4692 */
4693void perf_event_free_task(struct task_struct *task)
4694{
4695 struct perf_event_context *ctx = task->perf_event_ctxp;
4696 struct perf_event *event, *tmp;
4697
4698 if (!ctx)
4699 return;
4700
4701 mutex_lock(&ctx->mutex);
4702again:
4703 list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) {
4704 struct perf_event *parent = event->parent;
4705
4706 if (WARN_ON_ONCE(!parent))
4707 continue;
4708
4709 mutex_lock(&parent->child_mutex);
4710 list_del_init(&event->child_list);
4711 mutex_unlock(&parent->child_mutex);
4712
4713 fput(parent->filp);
4714
4715 list_del_event(event, ctx);
4716 free_event(event);
4717 }
4718
4719 if (!list_empty(&ctx->group_list))
4720 goto again;
4721
4722 mutex_unlock(&ctx->mutex);
4723
4724 put_ctx(ctx);
4725}
4726
4727/*
4728 * Initialize the perf_event context in task_struct
4729 */
4730int perf_event_init_task(struct task_struct *child)
4731{
4732 struct perf_event_context *child_ctx, *parent_ctx;
4733 struct perf_event_context *cloned_ctx;
4734 struct perf_event *event;
4735 struct task_struct *parent = current;
4736 int inherited_all = 1;
4737 int ret = 0;
4738
4739 child->perf_event_ctxp = NULL;
4740
4741 mutex_init(&child->perf_event_mutex);
4742 INIT_LIST_HEAD(&child->perf_event_list);
4743
4744 if (likely(!parent->perf_event_ctxp))
4745 return 0;
4746
4747 /*
4748 * This is executed from the parent task context, so inherit
4749 * events that have been marked for cloning.
4750 * First allocate and initialize a context for the child.
4751 */
4752
4753 child_ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
4754 if (!child_ctx)
4755 return -ENOMEM;
4756
4757 __perf_event_init_context(child_ctx, child);
4758 child->perf_event_ctxp = child_ctx;
4759 get_task_struct(child);
4760
4761 /*
4762 * If the parent's context is a clone, pin it so it won't get
4763 * swapped under us.
4764 */
4765 parent_ctx = perf_pin_task_context(parent);
4766
4767 /*
4768 * No need to check if parent_ctx != NULL here; since we saw
4769 * it non-NULL earlier, the only reason for it to become NULL
4770 * is if we exit, and since we're currently in the middle of
4771 * a fork we can't be exiting at the same time.
4772 */
4773
4774 /*
4775 * Lock the parent list. No need to lock the child - not PID
4776 * hashed yet and not running, so nobody can access it.
4777 */
4778 mutex_lock(&parent_ctx->mutex);
4779
4780 /*
4781	 * We don't have to disable NMIs - we are only looking at
4782 * the list, not manipulating it:
4783 */
4784 list_for_each_entry_rcu(event, &parent_ctx->event_list, event_entry) {
4785 if (event != event->group_leader)
4786 continue;
4787
4788 if (!event->attr.inherit) {
4789 inherited_all = 0;
4790 continue;
4791 }
4792
4793 ret = inherit_group(event, parent, parent_ctx,
4794 child, child_ctx);
4795 if (ret) {
4796 inherited_all = 0;
4797 break;
4798 }
4799 }
4800
4801 if (inherited_all) {
4802 /*
4803 * Mark the child context as a clone of the parent
4804 * context, or of whatever the parent is a clone of.
4805 * Note that if the parent is a clone, it could get
4806 * uncloned at any point, but that doesn't matter
4807 * because the list of events and the generation
4808 * count can't have changed since we took the mutex.
4809 */
4810 cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
4811 if (cloned_ctx) {
4812 child_ctx->parent_ctx = cloned_ctx;
4813 child_ctx->parent_gen = parent_ctx->parent_gen;
4814 } else {
4815 child_ctx->parent_ctx = parent_ctx;
4816 child_ctx->parent_gen = parent_ctx->generation;
4817 }
4818 get_ctx(child_ctx->parent_ctx);
4819 }
4820
4821 mutex_unlock(&parent_ctx->mutex);
4822
4823 perf_unpin_context(parent_ctx);
4824
4825 return ret;
4826}
4827
4828static void __cpuinit perf_event_init_cpu(int cpu)
4829{
4830 struct perf_cpu_context *cpuctx;
4831
4832 cpuctx = &per_cpu(perf_cpu_context, cpu);
4833 __perf_event_init_context(&cpuctx->ctx, NULL);
4834
4835 spin_lock(&perf_resource_lock);
4836 cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
4837 spin_unlock(&perf_resource_lock);
4838
4839 hw_perf_event_setup(cpu);
4840}
4841
4842#ifdef CONFIG_HOTPLUG_CPU
4843static void __perf_event_exit_cpu(void *info)
4844{
4845 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
4846 struct perf_event_context *ctx = &cpuctx->ctx;
4847 struct perf_event *event, *tmp;
4848
4849 list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry)
4850 __perf_event_remove_from_context(event);
4851}
4852static void perf_event_exit_cpu(int cpu)
4853{
4854 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
4855 struct perf_event_context *ctx = &cpuctx->ctx;
4856
4857 mutex_lock(&ctx->mutex);
4858 smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
4859 mutex_unlock(&ctx->mutex);
4860}
4861#else
4862static inline void perf_event_exit_cpu(int cpu) { }
4863#endif
4864
4865static int __cpuinit
4866perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
4867{
4868 unsigned int cpu = (long)hcpu;
4869
4870 switch (action) {
4871
4872 case CPU_UP_PREPARE:
4873 case CPU_UP_PREPARE_FROZEN:
4874 perf_event_init_cpu(cpu);
4875 break;
4876
4877 case CPU_ONLINE:
4878 case CPU_ONLINE_FROZEN:
4879 hw_perf_event_setup_online(cpu);
4880 break;
4881
4882 case CPU_DOWN_PREPARE:
4883 case CPU_DOWN_PREPARE_FROZEN:
4884 perf_event_exit_cpu(cpu);
4885 break;
4886
4887 default:
4888 break;
4889 }
4890
4891 return NOTIFY_OK;
4892}
4893
4894/*
4895 * This has to have a higher priority than migration_notifier in sched.c.
4896 */
4897static struct notifier_block __cpuinitdata perf_cpu_nb = {
4898 .notifier_call = perf_cpu_notify,
4899 .priority = 20,
4900};
4901
4902void __init perf_event_init(void)
4903{
4904 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
4905 (void *)(long)smp_processor_id());
4906 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
4907 (void *)(long)smp_processor_id());
4908 register_cpu_notifier(&perf_cpu_nb);
4909}
4910
4911static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
4912{
4913 return sprintf(buf, "%d\n", perf_reserved_percpu);
4914}
4915
4916static ssize_t
4917perf_set_reserve_percpu(struct sysdev_class *class,
4918 const char *buf,
4919 size_t count)
4920{
4921 struct perf_cpu_context *cpuctx;
4922 unsigned long val;
4923 int err, cpu, mpt;
4924
4925 err = strict_strtoul(buf, 10, &val);
4926 if (err)
4927 return err;
4928 if (val > perf_max_events)
4929 return -EINVAL;
4930
4931 spin_lock(&perf_resource_lock);
4932 perf_reserved_percpu = val;
4933 for_each_online_cpu(cpu) {
4934 cpuctx = &per_cpu(perf_cpu_context, cpu);
4935 spin_lock_irq(&cpuctx->ctx.lock);
4936 mpt = min(perf_max_events - cpuctx->ctx.nr_events,
4937 perf_max_events - perf_reserved_percpu);
4938 cpuctx->max_pertask = mpt;
4939 spin_unlock_irq(&cpuctx->ctx.lock);
4940 }
4941 spin_unlock(&perf_resource_lock);
4942
4943 return count;
4944}
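/*
 * Editor's note, worked example with made-up numbers: with
 * perf_max_events = 1024, perf_reserved_percpu = 128 and a CPU context
 * already holding 200 events, the loop above computes
 *
 *	mpt = min(1024 - 200, 1024 - 128) = min(824, 896) = 824
 *
 * so the reservation only becomes the tighter bound while a CPU holds
 * fewer than perf_reserved_percpu events of its own.
 */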
4945
4946static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
4947{
4948 return sprintf(buf, "%d\n", perf_overcommit);
4949}
4950
4951static ssize_t
4952perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
4953{
4954 unsigned long val;
4955 int err;
4956
4957 err = strict_strtoul(buf, 10, &val);
4958 if (err)
4959 return err;
4960 if (val > 1)
4961 return -EINVAL;
4962
4963 spin_lock(&perf_resource_lock);
4964 perf_overcommit = val;
4965 spin_unlock(&perf_resource_lock);
4966
4967 return count;
4968}
4969
4970static SYSDEV_CLASS_ATTR(
4971 reserve_percpu,
4972 0644,
4973 perf_show_reserve_percpu,
4974 perf_set_reserve_percpu
4975 );
4976
4977static SYSDEV_CLASS_ATTR(
4978 overcommit,
4979 0644,
4980 perf_show_overcommit,
4981 perf_set_overcommit
4982 );
4983
4984static struct attribute *perfclass_attrs[] = {
4985 &attr_reserve_percpu.attr,
4986 &attr_overcommit.attr,
4987 NULL
4988};
4989
4990static struct attribute_group perfclass_attr_group = {
4991 .attrs = perfclass_attrs,
4992 .name = "perf_events",
4993};
4994
4995static int __init perf_event_sysfs_init(void)
4996{
4997 return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
4998 &perfclass_attr_group);
4999}
5000device_initcall(perf_event_sysfs_init);
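/*
 * Editor's sketch (not part of the kernel source): the attribute group above
 * should surface as /sys/devices/system/cpu/perf_events/{reserve_percpu,overcommit}
 * (path assumed from cpu_sysdev_class).  A userspace snippet to adjust the
 * per-CPU reservation might look like this:
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/devices/system/cpu/perf_events/reserve_percpu", "w");

	if (!f)
		return 1;
	fprintf(f, "64\n");	/* ask the kernel to reserve 64 events per CPU */
	fclose(f);
	return 0;
}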
diff --git a/kernel/pid.c b/kernel/pid.c
index 31310b5d3f50..d3f722d20f9c 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -40,7 +40,7 @@
40#define pid_hashfn(nr, ns) \ 40#define pid_hashfn(nr, ns) \
41 hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) 41 hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
42static struct hlist_head *pid_hash; 42static struct hlist_head *pid_hash;
43static int pidhash_shift; 43static unsigned int pidhash_shift = 4;
44struct pid init_struct_pid = INIT_STRUCT_PID; 44struct pid init_struct_pid = INIT_STRUCT_PID;
45 45
46int pid_max = PID_MAX_DEFAULT; 46int pid_max = PID_MAX_DEFAULT;
@@ -499,19 +499,12 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
499void __init pidhash_init(void) 499void __init pidhash_init(void)
500{ 500{
501 int i, pidhash_size; 501 int i, pidhash_size;
502 unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT);
503 502
504 pidhash_shift = max(4, fls(megabytes * 4)); 503 pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
505 pidhash_shift = min(12, pidhash_shift); 504 HASH_EARLY | HASH_SMALL,
505 &pidhash_shift, NULL, 4096);
506 pidhash_size = 1 << pidhash_shift; 506 pidhash_size = 1 << pidhash_shift;
507 507
508 printk("PID hash table entries: %d (order: %d, %Zd bytes)\n",
509 pidhash_size, pidhash_shift,
510 pidhash_size * sizeof(struct hlist_head));
511
512 pid_hash = alloc_bootmem(pidhash_size * sizeof(*(pid_hash)));
513 if (!pid_hash)
514 panic("Could not alloc pidhash!\n");
515 for (i = 0; i < pidhash_size; i++) 508 for (i = 0; i < pidhash_size; i++)
516 INIT_HLIST_HEAD(&pid_hash[i]); 509 INIT_HLIST_HEAD(&pid_hash[i]);
517} 510}
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index bece7c0b67b2..e33a21cb9407 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -521,11 +521,12 @@ void posix_cpu_timers_exit(struct task_struct *tsk)
521} 521}
522void posix_cpu_timers_exit_group(struct task_struct *tsk) 522void posix_cpu_timers_exit_group(struct task_struct *tsk)
523{ 523{
524 struct task_cputime cputime; 524 struct signal_struct *const sig = tsk->signal;
525 525
526 thread_group_cputimer(tsk, &cputime);
527 cleanup_timers(tsk->signal->cpu_timers, 526 cleanup_timers(tsk->signal->cpu_timers,
528 cputime.utime, cputime.stime, cputime.sum_exec_runtime); 527 cputime_add(tsk->utime, sig->utime),
528 cputime_add(tsk->stime, sig->stime),
529 tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
529} 530}
530 531
531static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) 532static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 052ec4d195c7..495440779ce3 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -202,6 +202,12 @@ static int no_timer_create(struct k_itimer *new_timer)
202 return -EOPNOTSUPP; 202 return -EOPNOTSUPP;
203} 203}
204 204
205static int no_nsleep(const clockid_t which_clock, int flags,
206 struct timespec *tsave, struct timespec __user *rmtp)
207{
208 return -EOPNOTSUPP;
209}
210
205/* 211/*
206 * Return nonzero if we know a priori this clockid_t value is bogus. 212 * Return nonzero if we know a priori this clockid_t value is bogus.
207 */ 213 */
@@ -236,6 +242,25 @@ static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp)
236 return 0; 242 return 0;
237} 243}
238 244
245
246static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec *tp)
247{
248 *tp = current_kernel_time();
249 return 0;
250}
251
252static int posix_get_monotonic_coarse(clockid_t which_clock,
253 struct timespec *tp)
254{
255 *tp = get_monotonic_coarse();
256 return 0;
257}
258
259int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp)
260{
261 *tp = ktime_to_timespec(KTIME_LOW_RES);
262 return 0;
263}
239/* 264/*
240 * Initialize everything, well, just everything in Posix clocks/timers ;) 265 * Initialize everything, well, just everything in Posix clocks/timers ;)
241 */ 266 */
@@ -254,11 +279,28 @@ static __init int init_posix_timers(void)
254 .clock_get = posix_get_monotonic_raw, 279 .clock_get = posix_get_monotonic_raw,
255 .clock_set = do_posix_clock_nosettime, 280 .clock_set = do_posix_clock_nosettime,
256 .timer_create = no_timer_create, 281 .timer_create = no_timer_create,
282 .nsleep = no_nsleep,
283 };
284 struct k_clock clock_realtime_coarse = {
285 .clock_getres = posix_get_coarse_res,
286 .clock_get = posix_get_realtime_coarse,
287 .clock_set = do_posix_clock_nosettime,
288 .timer_create = no_timer_create,
289 .nsleep = no_nsleep,
290 };
291 struct k_clock clock_monotonic_coarse = {
292 .clock_getres = posix_get_coarse_res,
293 .clock_get = posix_get_monotonic_coarse,
294 .clock_set = do_posix_clock_nosettime,
295 .timer_create = no_timer_create,
296 .nsleep = no_nsleep,
257 }; 297 };
258 298
259 register_posix_clock(CLOCK_REALTIME, &clock_realtime); 299 register_posix_clock(CLOCK_REALTIME, &clock_realtime);
260 register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); 300 register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
261 register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); 301 register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
302 register_posix_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse);
303 register_posix_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse);
262 304
263 posix_timers_cache = kmem_cache_create("posix_timers_cache", 305 posix_timers_cache = kmem_cache_create("posix_timers_cache",
264 sizeof (struct k_itimer), 0, SLAB_PANIC, 306 sizeof (struct k_itimer), 0, SLAB_PANIC,
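/*
 * Editor's sketch (not part of the patch): illustrative userspace use of the
 * coarse clocks registered above, assuming the libc headers expose the new
 * CLOCK_*_COARSE ids.
 */
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts, res;

	/* Cheap, tick-granularity read of monotonic time. */
	if (clock_gettime(CLOCK_MONOTONIC_COARSE, &ts) == 0)
		printf("coarse monotonic: %ld.%09ld\n", (long)ts.tv_sec, ts.tv_nsec);

	/* Resolution is KTIME_LOW_RES (one tick), not 1 ns. */
	if (clock_getres(CLOCK_MONOTONIC_COARSE, &res) == 0)
		printf("resolution: %ld ns\n", res.tv_nsec);

	return 0;
}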
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 72067cbdb37f..91e09d3b2eb2 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -208,3 +208,17 @@ config APM_EMULATION
208 random kernel OOPSes or reboots that don't seem to be related to 208 random kernel OOPSes or reboots that don't seem to be related to
209 anything, try disabling/enabling this option (or disabling/enabling 209 anything, try disabling/enabling this option (or disabling/enabling
210 APM in your BIOS). 210 APM in your BIOS).
211
212config PM_RUNTIME
213 bool "Run-time PM core functionality"
214 depends on PM
215 ---help---
216 Enable functionality allowing I/O devices to be put into energy-saving
217 (low power) states at run time (or autosuspended) after a specified
218 period of inactivity and woken up in response to a hardware-generated
219 wake-up event or a driver's request.
220
221 Hardware support is generally required for this functionality to work
222 and the bus type drivers of the buses the devices are on are
223 responsible for the actual handling of the autosuspend requests and
224 wake-up events.
diff --git a/kernel/power/console.c b/kernel/power/console.c
index a3961b205de7..5187136fe1de 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -14,56 +14,13 @@
14#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) 14#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
15 15
16static int orig_fgconsole, orig_kmsg; 16static int orig_fgconsole, orig_kmsg;
17static int disable_vt_switch;
18
19/*
20 * Normally during a suspend, we allocate a new console and switch to it.
21 * When we resume, we switch back to the original console. This switch
22 * can be slow, so on systems where the framebuffer can handle restoration
23 * of video registers anyways, there's little point in doing the console
24 * switch. This function allows you to disable it by passing it '0'.
25 */
26void pm_set_vt_switch(int do_switch)
27{
28 acquire_console_sem();
29 disable_vt_switch = !do_switch;
30 release_console_sem();
31}
32EXPORT_SYMBOL(pm_set_vt_switch);
33 17
34int pm_prepare_console(void) 18int pm_prepare_console(void)
35{ 19{
36 acquire_console_sem(); 20 orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1);
37 21 if (orig_fgconsole < 0)
38 if (disable_vt_switch) {
39 release_console_sem();
40 return 0;
41 }
42
43 orig_fgconsole = fg_console;
44
45 if (vc_allocate(SUSPEND_CONSOLE)) {
46 /* we can't have a free VC for now. Too bad,
47 * we don't want to mess the screen for now. */
48 release_console_sem();
49 return 1; 22 return 1;
50 }
51 23
52 if (set_console(SUSPEND_CONSOLE)) {
53 /*
54 * We're unable to switch to the SUSPEND_CONSOLE.
55 * Let the calling function know so it can decide
56 * what to do.
57 */
58 release_console_sem();
59 return 1;
60 }
61 release_console_sem();
62
63 if (vt_waitactive(SUSPEND_CONSOLE)) {
64 pr_debug("Suspend: Can't switch VCs.");
65 return 1;
66 }
67 orig_kmsg = kmsg_redirect; 24 orig_kmsg = kmsg_redirect;
68 kmsg_redirect = SUSPEND_CONSOLE; 25 kmsg_redirect = SUSPEND_CONSOLE;
69 return 0; 26 return 0;
@@ -71,19 +28,9 @@ int pm_prepare_console(void)
71 28
72void pm_restore_console(void) 29void pm_restore_console(void)
73{ 30{
74 acquire_console_sem(); 31 if (orig_fgconsole >= 0) {
75 if (disable_vt_switch) { 32 vt_move_to_console(orig_fgconsole, 0);
76 release_console_sem(); 33 kmsg_redirect = orig_kmsg;
77 return;
78 }
79 set_console(orig_fgconsole);
80 release_console_sem();
81
82 if (vt_waitactive(orig_fgconsole)) {
83 pr_debug("Resume: Can't switch VCs.");
84 return;
85 } 34 }
86
87 kmsg_redirect = orig_kmsg;
88} 35}
89#endif 36#endif
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 81d2e7464893..04b3a83d686f 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -298,8 +298,8 @@ int hibernation_snapshot(int platform_mode)
298 if (error) 298 if (error)
299 return error; 299 return error;
300 300
301 /* Free memory before shutting down devices. */ 301 /* Preallocate image memory before shutting down devices. */
302 error = swsusp_shrink_memory(); 302 error = hibernate_preallocate_memory();
303 if (error) 303 if (error)
304 goto Close; 304 goto Close;
305 305
@@ -315,6 +315,10 @@ int hibernation_snapshot(int platform_mode)
315 /* Control returns here after successful restore */ 315 /* Control returns here after successful restore */
316 316
317 Resume_devices: 317 Resume_devices:
318 /* We may need to release the preallocated image pages here. */
319 if (error || !in_suspend)
320 swsusp_free();
321
318 dpm_resume_end(in_suspend ? 322 dpm_resume_end(in_suspend ?
319 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 323 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
320 resume_console(); 324 resume_console();
@@ -460,11 +464,11 @@ int hibernation_platform_enter(void)
460 464
461 error = hibernation_ops->prepare(); 465 error = hibernation_ops->prepare();
462 if (error) 466 if (error)
463 goto Platofrm_finish; 467 goto Platform_finish;
464 468
465 error = disable_nonboot_cpus(); 469 error = disable_nonboot_cpus();
466 if (error) 470 if (error)
467 goto Platofrm_finish; 471 goto Platform_finish;
468 472
469 local_irq_disable(); 473 local_irq_disable();
470 sysdev_suspend(PMSG_HIBERNATE); 474 sysdev_suspend(PMSG_HIBERNATE);
@@ -476,7 +480,7 @@ int hibernation_platform_enter(void)
476 * We don't need to reenable the nonboot CPUs or resume consoles, since 480 * We don't need to reenable the nonboot CPUs or resume consoles, since
477 * the system is going to be halted anyway. 481 * the system is going to be halted anyway.
478 */ 482 */
479 Platofrm_finish: 483 Platform_finish:
480 hibernation_ops->finish(); 484 hibernation_ops->finish();
481 485
482 dpm_suspend_noirq(PMSG_RESTORE); 486 dpm_suspend_noirq(PMSG_RESTORE);
@@ -578,7 +582,10 @@ int hibernate(void)
578 goto Thaw; 582 goto Thaw;
579 583
580 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); 584 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM);
581 if (in_suspend && !error) { 585 if (error)
586 goto Thaw;
587
588 if (in_suspend) {
582 unsigned int flags = 0; 589 unsigned int flags = 0;
583 590
584 if (hibernation_mode == HIBERNATION_PLATFORM) 591 if (hibernation_mode == HIBERNATION_PLATFORM)
@@ -590,8 +597,8 @@ int hibernate(void)
590 power_down(); 597 power_down();
591 } else { 598 } else {
592 pr_debug("PM: Image restored successfully.\n"); 599 pr_debug("PM: Image restored successfully.\n");
593 swsusp_free();
594 } 600 }
601
595 Thaw: 602 Thaw:
596 thaw_processes(); 603 thaw_processes();
597 Finish: 604 Finish:
diff --git a/kernel/power/main.c b/kernel/power/main.c
index f710e36930cc..347d2cc88cd0 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -11,6 +11,7 @@
11#include <linux/kobject.h> 11#include <linux/kobject.h>
12#include <linux/string.h> 12#include <linux/string.h>
13#include <linux/resume-trace.h> 13#include <linux/resume-trace.h>
14#include <linux/workqueue.h>
14 15
15#include "power.h" 16#include "power.h"
16 17
@@ -217,8 +218,24 @@ static struct attribute_group attr_group = {
217 .attrs = g, 218 .attrs = g,
218}; 219};
219 220
221#ifdef CONFIG_PM_RUNTIME
222struct workqueue_struct *pm_wq;
223
224static int __init pm_start_workqueue(void)
225{
226 pm_wq = create_freezeable_workqueue("pm");
227
228 return pm_wq ? 0 : -ENOMEM;
229}
230#else
231static inline int pm_start_workqueue(void) { return 0; }
232#endif
233
220static int __init pm_init(void) 234static int __init pm_init(void)
221{ 235{
236 int error = pm_start_workqueue();
237 if (error)
238 return error;
222 power_kobj = kobject_create_and_add("power", NULL); 239 power_kobj = kobject_create_and_add("power", NULL);
223 if (!power_kobj) 240 if (!power_kobj)
224 return -ENOMEM; 241 return -ENOMEM;
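/*
 * Editor's sketch (not part of the patch): how a consumer might queue work on
 * the new freezable "pm" workqueue declared above.  Handler and work names
 * are made up.
 */
#include <linux/workqueue.h>

extern struct workqueue_struct *pm_wq;

static void my_pm_request_fn(struct work_struct *work)
{
	/* device-specific runtime-PM handling would go here */
}

static DECLARE_WORK(my_pm_request_work, my_pm_request_fn);

static void kick_pm_request(void)
{
	/* Runs in the freezable workqueue, so it is frozen across suspend. */
	queue_work(pm_wq, &my_pm_request_work);
}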
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 26d5a26f82e3..46c5a26630a3 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -74,7 +74,7 @@ extern asmlinkage int swsusp_arch_resume(void);
74 74
75extern int create_basic_memory_bitmaps(void); 75extern int create_basic_memory_bitmaps(void);
76extern void free_basic_memory_bitmaps(void); 76extern void free_basic_memory_bitmaps(void);
77extern int swsusp_shrink_memory(void); 77extern int hibernate_preallocate_memory(void);
78 78
79/** 79/**
80 * Auxiliary structure used for reading the snapshot image data and 80 * Auxiliary structure used for reading the snapshot image data and
diff --git a/kernel/power/process.c b/kernel/power/process.c
index da2072d73811..cc2e55373b68 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -9,6 +9,7 @@
9#undef DEBUG 9#undef DEBUG
10 10
11#include <linux/interrupt.h> 11#include <linux/interrupt.h>
12#include <linux/oom.h>
12#include <linux/suspend.h> 13#include <linux/suspend.h>
13#include <linux/module.h> 14#include <linux/module.h>
14#include <linux/syscalls.h> 15#include <linux/syscalls.h>
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 523a451b45d3..36cb168e4330 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -233,7 +233,7 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
233 233
234#define BM_END_OF_MAP (~0UL) 234#define BM_END_OF_MAP (~0UL)
235 235
236#define BM_BITS_PER_BLOCK (PAGE_SIZE << 3) 236#define BM_BITS_PER_BLOCK (PAGE_SIZE * BITS_PER_BYTE)
237 237
238struct bm_block { 238struct bm_block {
239 struct list_head hook; /* hook into a list of bitmap blocks */ 239 struct list_head hook; /* hook into a list of bitmap blocks */
@@ -275,7 +275,7 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
275 275
276/** 276/**
277 * create_bm_block_list - create a list of block bitmap objects 277 * create_bm_block_list - create a list of block bitmap objects
278 * @nr_blocks - number of blocks to allocate 278 * @pages - number of pages to track
279 * @list - list to put the allocated blocks into 279 * @list - list to put the allocated blocks into
280 * @ca - chain allocator to be used for allocating memory 280 * @ca - chain allocator to be used for allocating memory
281 */ 281 */
@@ -619,7 +619,7 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn,
619 BUG_ON(!region); 619 BUG_ON(!region);
620 } else 620 } else
621 /* This allocation cannot fail */ 621 /* This allocation cannot fail */
622 region = alloc_bootmem_low(sizeof(struct nosave_region)); 622 region = alloc_bootmem(sizeof(struct nosave_region));
623 region->start_pfn = start_pfn; 623 region->start_pfn = start_pfn;
624 region->end_pfn = end_pfn; 624 region->end_pfn = end_pfn;
625 list_add_tail(&region->list, &nosave_regions); 625 list_add_tail(&region->list, &nosave_regions);
@@ -853,7 +853,7 @@ static unsigned int count_highmem_pages(void)
853 struct zone *zone; 853 struct zone *zone;
854 unsigned int n = 0; 854 unsigned int n = 0;
855 855
856 for_each_zone(zone) { 856 for_each_populated_zone(zone) {
857 unsigned long pfn, max_zone_pfn; 857 unsigned long pfn, max_zone_pfn;
858 858
859 if (!is_highmem(zone)) 859 if (!is_highmem(zone))
@@ -916,7 +916,7 @@ static unsigned int count_data_pages(void)
916 unsigned long pfn, max_zone_pfn; 916 unsigned long pfn, max_zone_pfn;
917 unsigned int n = 0; 917 unsigned int n = 0;
918 918
919 for_each_zone(zone) { 919 for_each_populated_zone(zone) {
920 if (is_highmem(zone)) 920 if (is_highmem(zone))
921 continue; 921 continue;
922 922
@@ -1010,7 +1010,7 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
1010 struct zone *zone; 1010 struct zone *zone;
1011 unsigned long pfn; 1011 unsigned long pfn;
1012 1012
1013 for_each_zone(zone) { 1013 for_each_populated_zone(zone) {
1014 unsigned long max_zone_pfn; 1014 unsigned long max_zone_pfn;
1015 1015
1016 mark_free_pages(zone); 1016 mark_free_pages(zone);
@@ -1033,6 +1033,25 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
1033static unsigned int nr_copy_pages; 1033static unsigned int nr_copy_pages;
1034/* Number of pages needed for saving the original pfns of the image pages */ 1034/* Number of pages needed for saving the original pfns of the image pages */
1035static unsigned int nr_meta_pages; 1035static unsigned int nr_meta_pages;
1036/*
1037 * Numbers of normal and highmem page frames allocated for hibernation image
1038 * before suspending devices.
1039 */
1040unsigned int alloc_normal, alloc_highmem;
1041/*
1042 * Memory bitmap used for marking saveable pages (during hibernation) or
1043 * hibernation image pages (during restore)
1044 */
1045static struct memory_bitmap orig_bm;
1046/*
1047 * Memory bitmap used during hibernation for marking allocated page frames that
1048 * will contain copies of saveable pages. During restore it is initially used
1049 * for marking hibernation image pages, but then the set bits from it are
1050 * duplicated in @orig_bm and it is released. On highmem systems it is next
1051 * used for marking "safe" highmem pages, but it has to be reinitialized for
1052 * this purpose.
1053 */
1054static struct memory_bitmap copy_bm;
1036 1055
1037/** 1056/**
1038 * swsusp_free - free pages allocated for the suspend. 1057 * swsusp_free - free pages allocated for the suspend.
@@ -1046,7 +1065,7 @@ void swsusp_free(void)
1046 struct zone *zone; 1065 struct zone *zone;
1047 unsigned long pfn, max_zone_pfn; 1066 unsigned long pfn, max_zone_pfn;
1048 1067
1049 for_each_zone(zone) { 1068 for_each_populated_zone(zone) {
1050 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 1069 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
1051 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1070 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1052 if (pfn_valid(pfn)) { 1071 if (pfn_valid(pfn)) {
@@ -1064,74 +1083,286 @@ void swsusp_free(void)
1064 nr_meta_pages = 0; 1083 nr_meta_pages = 0;
1065 restore_pblist = NULL; 1084 restore_pblist = NULL;
1066 buffer = NULL; 1085 buffer = NULL;
1086 alloc_normal = 0;
1087 alloc_highmem = 0;
1067} 1088}
1068 1089
1090/* Helper functions used for the shrinking of memory. */
1091
1092#define GFP_IMAGE (GFP_KERNEL | __GFP_NOWARN)
1093
1069/** 1094/**
1070 * swsusp_shrink_memory - Try to free as much memory as needed 1095 * preallocate_image_pages - Allocate a number of pages for hibernation image
1071 * 1096 * @nr_pages: Number of page frames to allocate.
1072 * ... but do not OOM-kill anyone 1097 * @mask: GFP flags to use for the allocation.
1073 * 1098 *
1074 * Notice: all userland should be stopped before it is called, or 1099 * Return value: Number of page frames actually allocated
1075 * livelock is possible. 1100 */
1101static unsigned long preallocate_image_pages(unsigned long nr_pages, gfp_t mask)
1102{
1103 unsigned long nr_alloc = 0;
1104
1105 while (nr_pages > 0) {
1106 struct page *page;
1107
1108 page = alloc_image_page(mask);
1109 if (!page)
1110 break;
1111 memory_bm_set_bit(&copy_bm, page_to_pfn(page));
1112 if (PageHighMem(page))
1113 alloc_highmem++;
1114 else
1115 alloc_normal++;
1116 nr_pages--;
1117 nr_alloc++;
1118 }
1119
1120 return nr_alloc;
1121}
1122
1123static unsigned long preallocate_image_memory(unsigned long nr_pages)
1124{
1125 return preallocate_image_pages(nr_pages, GFP_IMAGE);
1126}
1127
1128#ifdef CONFIG_HIGHMEM
1129static unsigned long preallocate_image_highmem(unsigned long nr_pages)
1130{
1131 return preallocate_image_pages(nr_pages, GFP_IMAGE | __GFP_HIGHMEM);
1132}
1133
1134/**
1135 * __fraction - Compute (an approximation of) x * (multiplier / base)
1076 */ 1136 */
1137static unsigned long __fraction(u64 x, u64 multiplier, u64 base)
1138{
1139 x *= multiplier;
1140 do_div(x, base);
1141 return (unsigned long)x;
1142}
1143
1144static unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
1145 unsigned long highmem,
1146 unsigned long total)
1147{
1148 unsigned long alloc = __fraction(nr_pages, highmem, total);
1077 1149
1078#define SHRINK_BITE 10000 1150 return preallocate_image_pages(alloc, GFP_IMAGE | __GFP_HIGHMEM);
1079static inline unsigned long __shrink_memory(long tmp) 1151}
1152#else /* CONFIG_HIGHMEM */
1153static inline unsigned long preallocate_image_highmem(unsigned long nr_pages)
1080{ 1154{
1081 if (tmp > SHRINK_BITE) 1155 return 0;
1082 tmp = SHRINK_BITE;
1083 return shrink_all_memory(tmp);
1084} 1156}
1085 1157
1086int swsusp_shrink_memory(void) 1158static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
1159 unsigned long highmem,
1160 unsigned long total)
1161{
1162 return 0;
1163}
1164#endif /* CONFIG_HIGHMEM */
1165
1166/**
1167 * free_unnecessary_pages - Release preallocated pages not needed for the image
1168 */
1169static void free_unnecessary_pages(void)
1170{
1171 unsigned long save_highmem, to_free_normal, to_free_highmem;
1172
1173 to_free_normal = alloc_normal - count_data_pages();
1174 save_highmem = count_highmem_pages();
1175 if (alloc_highmem > save_highmem) {
1176 to_free_highmem = alloc_highmem - save_highmem;
1177 } else {
1178 to_free_highmem = 0;
1179 to_free_normal -= save_highmem - alloc_highmem;
1180 }
1181
1182 memory_bm_position_reset(&copy_bm);
1183
1184 while (to_free_normal > 0 && to_free_highmem > 0) {
1185 unsigned long pfn = memory_bm_next_pfn(&copy_bm);
1186 struct page *page = pfn_to_page(pfn);
1187
1188 if (PageHighMem(page)) {
1189 if (!to_free_highmem)
1190 continue;
1191 to_free_highmem--;
1192 alloc_highmem--;
1193 } else {
1194 if (!to_free_normal)
1195 continue;
1196 to_free_normal--;
1197 alloc_normal--;
1198 }
1199 memory_bm_clear_bit(&copy_bm, pfn);
1200 swsusp_unset_page_forbidden(page);
1201 swsusp_unset_page_free(page);
1202 __free_page(page);
1203 }
1204}
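/*
 * Editor's note, worked example with made-up numbers: if alloc_normal = 100000
 * while count_data_pages() now returns 90000, and alloc_highmem = 5000 with
 * count_highmem_pages() = 7000, then
 *
 *	to_free_highmem = 0 (we are 2000 highmem pages short)
 *	to_free_normal  = (100000 - 90000) - 2000 = 8000
 *
 * i.e. the highmem shortfall is covered by keeping extra lowmem pages instead.
 */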
1205
1206/**
1207 * minimum_image_size - Estimate the minimum acceptable size of an image
1208 * @saveable: Number of saveable pages in the system.
1209 *
1210 * We want to avoid pushing memory reclaim too hard, so estimate the
1211 * minimum acceptable size of a hibernation image to use as the lower limit for
1212 * preallocating memory.
1213 *
1214 * We assume that the minimum image size should be proportional to
1215 *
1216 * [number of saveable pages] - [number of pages that can be freed in theory]
1217 *
1218 * where the second term is the sum of (1) reclaimable slab pages, (2) active
1219 * and (3) inactive anonymous pages, (4) active and (5) inactive file pages,
1220 * minus mapped file pages.
1221 */
1222static unsigned long minimum_image_size(unsigned long saveable)
1223{
1224 unsigned long size;
1225
1226 size = global_page_state(NR_SLAB_RECLAIMABLE)
1227 + global_page_state(NR_ACTIVE_ANON)
1228 + global_page_state(NR_INACTIVE_ANON)
1229 + global_page_state(NR_ACTIVE_FILE)
1230 + global_page_state(NR_INACTIVE_FILE)
1231 - global_page_state(NR_FILE_MAPPED);
1232
1233 return saveable <= size ? 0 : saveable - size;
1234}
1235
1236/**
1237 * hibernate_preallocate_memory - Preallocate memory for hibernation image
1238 *
1239 * To create a hibernation image it is necessary to make a copy of every page
1240 * frame in use. We also need a number of page frames to be free during
1241 * hibernation for allocations made while saving the image and for device
1242 * drivers, in case they need to allocate memory from their hibernation
1243 * callbacks (these two numbers are given by PAGES_FOR_IO and SPARE_PAGES,
1244 * respectively, both of which are rough estimates). To make this happen, we
1245 * compute the total number of available page frames and allocate at least
1246 *
1247 * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2 + 2 * SPARE_PAGES
1248 *
1249 * of them, which corresponds to the maximum size of a hibernation image.
1250 *
1251 * If image_size is set below the number following from the above formula,
1252 * the preallocation of memory is continued until the total number of saveable
1253 * pages in the system is below the requested image size or the minimum
1254 * acceptable image size returned by minimum_image_size(), whichever is greater.
1255 */
1256int hibernate_preallocate_memory(void)
1087{ 1257{
1088 long tmp;
1089 struct zone *zone; 1258 struct zone *zone;
1090 unsigned long pages = 0; 1259 unsigned long saveable, size, max_size, count, highmem, pages = 0;
1091 unsigned int i = 0; 1260 unsigned long alloc, save_highmem, pages_highmem;
1092 char *p = "-\\|/";
1093 struct timeval start, stop; 1261 struct timeval start, stop;
1262 int error;
1094 1263
1095 printk(KERN_INFO "PM: Shrinking memory... "); 1264 printk(KERN_INFO "PM: Preallocating image memory... ");
1096 do_gettimeofday(&start); 1265 do_gettimeofday(&start);
1097 do {
1098 long size, highmem_size;
1099
1100 highmem_size = count_highmem_pages();
1101 size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES;
1102 tmp = size;
1103 size += highmem_size;
1104 for_each_populated_zone(zone) {
1105 tmp += snapshot_additional_pages(zone);
1106 if (is_highmem(zone)) {
1107 highmem_size -=
1108 zone_page_state(zone, NR_FREE_PAGES);
1109 } else {
1110 tmp -= zone_page_state(zone, NR_FREE_PAGES);
1111 tmp += zone->lowmem_reserve[ZONE_NORMAL];
1112 }
1113 }
1114 1266
1115 if (highmem_size < 0) 1267 error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY);
1116 highmem_size = 0; 1268 if (error)
1269 goto err_out;
1117 1270
1118 tmp += highmem_size; 1271 error = memory_bm_create(&copy_bm, GFP_IMAGE, PG_ANY);
1119 if (tmp > 0) { 1272 if (error)
1120 tmp = __shrink_memory(tmp); 1273 goto err_out;
1121 if (!tmp) 1274
1122 return -ENOMEM; 1275 alloc_normal = 0;
1123 pages += tmp; 1276 alloc_highmem = 0;
1124 } else if (size > image_size / PAGE_SIZE) { 1277
1125 tmp = __shrink_memory(size - (image_size / PAGE_SIZE)); 1278 /* Count the number of saveable data pages. */
1126 pages += tmp; 1279 save_highmem = count_highmem_pages();
1127 } 1280 saveable = count_data_pages();
1128 printk("\b%c", p[i++%4]); 1281
1129 } while (tmp > 0); 1282 /*
1283 * Compute the total number of page frames we can use (count) and the
1284 * number of pages needed for image metadata (size).
1285 */
1286 count = saveable;
1287 saveable += save_highmem;
1288 highmem = save_highmem;
1289 size = 0;
1290 for_each_populated_zone(zone) {
1291 size += snapshot_additional_pages(zone);
1292 if (is_highmem(zone))
1293 highmem += zone_page_state(zone, NR_FREE_PAGES);
1294 else
1295 count += zone_page_state(zone, NR_FREE_PAGES);
1296 }
1297 count += highmem;
1298 count -= totalreserve_pages;
1299
1300 /* Compute the maximum number of saveable pages to leave in memory. */
1301 max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * SPARE_PAGES;
1302 size = DIV_ROUND_UP(image_size, PAGE_SIZE);
1303 if (size > max_size)
1304 size = max_size;
1305 /*
1306 * If the maximum is not less than the current number of saveable pages
1307 * in memory, allocate page frames for the image and we're done.
1308 */
1309 if (size >= saveable) {
1310 pages = preallocate_image_highmem(save_highmem);
1311 pages += preallocate_image_memory(saveable - pages);
1312 goto out;
1313 }
1314
1315 /* Estimate the minimum size of the image. */
1316 pages = minimum_image_size(saveable);
1317 if (size < pages)
1318 size = min_t(unsigned long, pages, max_size);
1319
1320 /*
1321 * Let the memory management subsystem know that we're going to need a
1322 * large number of page frames to allocate and make it free some memory.
1323 * NOTE: If this is not done, performance will be hurt badly in some
1324 * test cases.
1325 */
1326 shrink_all_memory(saveable - size);
1327
1328 /*
1329 * The number of saveable pages in memory was too high, so apply some
1330 * pressure to decrease it. First, make room for the largest possible
1331 * image and fail if that doesn't work. Next, try to decrease the size
1332 * of the image as much as indicated by 'size' using allocations from
1333 * highmem and non-highmem zones separately.
1334 */
1335 pages_highmem = preallocate_image_highmem(highmem / 2);
1336 alloc = (count - max_size) - pages_highmem;
1337 pages = preallocate_image_memory(alloc);
1338 if (pages < alloc)
1339 goto err_out;
1340 size = max_size - size;
1341 alloc = size;
1342 size = preallocate_highmem_fraction(size, highmem, count);
1343 pages_highmem += size;
1344 alloc -= size;
1345 pages += preallocate_image_memory(alloc);
1346 pages += pages_highmem;
1347
1348 /*
1349 * We only need as many page frames for the image as there are saveable
1350 * pages in memory, but we have allocated more. Release the excessive
1351 * ones now.
1352 */
1353 free_unnecessary_pages();
1354
1355 out:
1130 do_gettimeofday(&stop); 1356 do_gettimeofday(&stop);
1131 printk("\bdone (%lu pages freed)\n", pages); 1357 printk(KERN_CONT "done (allocated %lu pages)\n", pages);
1132 swsusp_show_speed(&start, &stop, pages, "Freed"); 1358 swsusp_show_speed(&start, &stop, pages, "Allocated");
1133 1359
1134 return 0; 1360 return 0;
1361
1362 err_out:
1363 printk(KERN_CONT "\n");
1364 swsusp_free();
1365 return -ENOMEM;
1135} 1366}
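/*
 * Editor's note, worked example with made-up numbers: with count = 250000
 * usable page frames, size (metadata) = 500, and PAGES_FOR_IO = 1024,
 * SPARE_PAGES = 512 taken purely for illustration, the code above allows
 *
 *	max_size = (250000 - (500 + 1024)) / 2 - 2 * 512 = 123214
 *
 * saveable pages to stay in memory; everything beyond that is preallocated
 * for the hibernation image.
 */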
1136 1367
1137#ifdef CONFIG_HIGHMEM 1368#ifdef CONFIG_HIGHMEM
@@ -1142,7 +1373,7 @@ int swsusp_shrink_memory(void)
1142 1373
1143static unsigned int count_pages_for_highmem(unsigned int nr_highmem) 1374static unsigned int count_pages_for_highmem(unsigned int nr_highmem)
1144{ 1375{
1145 unsigned int free_highmem = count_free_highmem_pages(); 1376 unsigned int free_highmem = count_free_highmem_pages() + alloc_highmem;
1146 1377
1147 if (free_highmem >= nr_highmem) 1378 if (free_highmem >= nr_highmem)
1148 nr_highmem = 0; 1379 nr_highmem = 0;
@@ -1164,19 +1395,17 @@ count_pages_for_highmem(unsigned int nr_highmem) { return 0; }
1164static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem) 1395static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
1165{ 1396{
1166 struct zone *zone; 1397 struct zone *zone;
1167 unsigned int free = 0, meta = 0; 1398 unsigned int free = alloc_normal;
1168 1399
1169 for_each_zone(zone) { 1400 for_each_populated_zone(zone)
1170 meta += snapshot_additional_pages(zone);
1171 if (!is_highmem(zone)) 1401 if (!is_highmem(zone))
1172 free += zone_page_state(zone, NR_FREE_PAGES); 1402 free += zone_page_state(zone, NR_FREE_PAGES);
1173 }
1174 1403
1175 nr_pages += count_pages_for_highmem(nr_highmem); 1404 nr_pages += count_pages_for_highmem(nr_highmem);
1176 pr_debug("PM: Normal pages needed: %u + %u + %u, available pages: %u\n", 1405 pr_debug("PM: Normal pages needed: %u + %u, available pages: %u\n",
1177 nr_pages, PAGES_FOR_IO, meta, free); 1406 nr_pages, PAGES_FOR_IO, free);
1178 1407
1179 return free > nr_pages + PAGES_FOR_IO + meta; 1408 return free > nr_pages + PAGES_FOR_IO;
1180} 1409}
1181 1410
1182#ifdef CONFIG_HIGHMEM 1411#ifdef CONFIG_HIGHMEM
@@ -1198,7 +1427,7 @@ static inline int get_highmem_buffer(int safe_needed)
1198 */ 1427 */
1199 1428
1200static inline unsigned int 1429static inline unsigned int
1201alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int nr_highmem) 1430alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
1202{ 1431{
1203 unsigned int to_alloc = count_free_highmem_pages(); 1432 unsigned int to_alloc = count_free_highmem_pages();
1204 1433
@@ -1218,7 +1447,7 @@ alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
1218static inline int get_highmem_buffer(int safe_needed) { return 0; } 1447static inline int get_highmem_buffer(int safe_needed) { return 0; }
1219 1448
1220static inline unsigned int 1449static inline unsigned int
1221alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int n) { return 0; } 1450alloc_highmem_pages(struct memory_bitmap *bm, unsigned int n) { return 0; }
1222#endif /* CONFIG_HIGHMEM */ 1451#endif /* CONFIG_HIGHMEM */
1223 1452
1224/** 1453/**
@@ -1237,51 +1466,36 @@ static int
1237swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, 1466swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
1238 unsigned int nr_pages, unsigned int nr_highmem) 1467 unsigned int nr_pages, unsigned int nr_highmem)
1239{ 1468{
1240 int error; 1469 int error = 0;
1241
1242 error = memory_bm_create(orig_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY);
1243 if (error)
1244 goto Free;
1245
1246 error = memory_bm_create(copy_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY);
1247 if (error)
1248 goto Free;
1249 1470
1250 if (nr_highmem > 0) { 1471 if (nr_highmem > 0) {
1251 error = get_highmem_buffer(PG_ANY); 1472 error = get_highmem_buffer(PG_ANY);
1252 if (error) 1473 if (error)
1253 goto Free; 1474 goto err_out;
1254 1475 if (nr_highmem > alloc_highmem) {
1255 nr_pages += alloc_highmem_image_pages(copy_bm, nr_highmem); 1476 nr_highmem -= alloc_highmem;
1477 nr_pages += alloc_highmem_pages(copy_bm, nr_highmem);
1478 }
1256 } 1479 }
1257 while (nr_pages-- > 0) { 1480 if (nr_pages > alloc_normal) {
1258 struct page *page = alloc_image_page(GFP_ATOMIC | __GFP_COLD); 1481 nr_pages -= alloc_normal;
1259 1482 while (nr_pages-- > 0) {
1260 if (!page) 1483 struct page *page;
1261 goto Free;
1262 1484
1263 memory_bm_set_bit(copy_bm, page_to_pfn(page)); 1485 page = alloc_image_page(GFP_ATOMIC | __GFP_COLD);
1486 if (!page)
1487 goto err_out;
1488 memory_bm_set_bit(copy_bm, page_to_pfn(page));
1489 }
1264 } 1490 }
1491
1265 return 0; 1492 return 0;
1266 1493
1267 Free: 1494 err_out:
1268 swsusp_free(); 1495 swsusp_free();
1269 return -ENOMEM; 1496 return error;
1270} 1497}
1271 1498
1272/* Memory bitmap used for marking saveable pages (during suspend) or the
1273 * suspend image pages (during resume)
1274 */
1275static struct memory_bitmap orig_bm;
1276/* Memory bitmap used on suspend for marking allocated pages that will contain
1277 * the copies of saveable pages. During resume it is initially used for
1278 * marking the suspend image pages, but then its set bits are duplicated in
1279 * @orig_bm and it is released. Next, on systems with high memory, it may be
1280 * used for marking "safe" highmem pages, but it has to be reinitialized for
1281 * this purpose.
1282 */
1283static struct memory_bitmap copy_bm;
1284
1285asmlinkage int swsusp_save(void) 1499asmlinkage int swsusp_save(void)
1286{ 1500{
1287 unsigned int nr_pages, nr_highmem; 1501 unsigned int nr_pages, nr_highmem;
@@ -1474,7 +1688,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm)
1474 unsigned long pfn, max_zone_pfn; 1688 unsigned long pfn, max_zone_pfn;
1475 1689
1476 /* Clear page flags */ 1690 /* Clear page flags */
1477 for_each_zone(zone) { 1691 for_each_populated_zone(zone) {
1478 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 1692 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
1479 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 1693 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
1480 if (pfn_valid(pfn)) 1694 if (pfn_valid(pfn))
diff --git a/kernel/power/user.c b/kernel/power/user.c
index ed97375daae9..bf0014d6a5f0 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -23,7 +23,6 @@
23#include <linux/console.h> 23#include <linux/console.h>
24#include <linux/cpu.h> 24#include <linux/cpu.h>
25#include <linux/freezer.h> 25#include <linux/freezer.h>
26#include <linux/smp_lock.h>
27#include <scsi/scsi_scan.h> 26#include <scsi/scsi_scan.h>
28 27
29#include <asm/uaccess.h> 28#include <asm/uaccess.h>
diff --git a/kernel/printk.c b/kernel/printk.c
index b4d97b54c1ec..602033acd6c7 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -37,6 +37,12 @@
37#include <asm/uaccess.h> 37#include <asm/uaccess.h>
38 38
39/* 39/*
40 * for_each_console() allows you to iterate on each console
41 */
42#define for_each_console(con) \
43 for (con = console_drivers; con != NULL; con = con->next)
44
45/*
40 * Architectures can override it: 46 * Architectures can override it:
41 */ 47 */
42void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) 48void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
@@ -61,6 +67,8 @@ int console_printk[4] = {
61 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ 67 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */
62}; 68};
63 69
70static int saved_console_loglevel = -1;
71
64/* 72/*
65 * Low level drivers may need that to know if they can schedule in 73 * Low level drivers may need that to know if they can schedule in
66 * their unblank() callback or not. So let's export it. 74 * their unblank() callback or not. So let's export it.
@@ -372,10 +380,15 @@ int do_syslog(int type, char __user *buf, int len)
372 logged_chars = 0; 380 logged_chars = 0;
373 break; 381 break;
374 case 6: /* Disable logging to console */ 382 case 6: /* Disable logging to console */
383 if (saved_console_loglevel == -1)
384 saved_console_loglevel = console_loglevel;
375 console_loglevel = minimum_console_loglevel; 385 console_loglevel = minimum_console_loglevel;
376 break; 386 break;
377 case 7: /* Enable logging to console */ 387 case 7: /* Enable logging to console */
378 console_loglevel = default_console_loglevel; 388 if (saved_console_loglevel != -1) {
389 console_loglevel = saved_console_loglevel;
390 saved_console_loglevel = -1;
391 }
379 break; 392 break;
380 case 8: /* Set level of messages printed to console */ 393 case 8: /* Set level of messages printed to console */
381 error = -EINVAL; 394 error = -EINVAL;
@@ -384,6 +397,8 @@ int do_syslog(int type, char __user *buf, int len)
384 if (len < minimum_console_loglevel) 397 if (len < minimum_console_loglevel)
385 len = minimum_console_loglevel; 398 len = minimum_console_loglevel;
386 console_loglevel = len; 399 console_loglevel = len;
400 /* Implicitly re-enable logging to console */
401 saved_console_loglevel = -1;
387 error = 0; 402 error = 0;
388 break; 403 break;
389 case 9: /* Number of chars in the log buffer */ 404 case 9: /* Number of chars in the log buffer */
@@ -412,7 +427,7 @@ static void __call_console_drivers(unsigned start, unsigned end)
412{ 427{
413 struct console *con; 428 struct console *con;
414 429
415 for (con = console_drivers; con; con = con->next) { 430 for_each_console(con) {
416 if ((con->flags & CON_ENABLED) && con->write && 431 if ((con->flags & CON_ENABLED) && con->write &&
417 (cpu_online(smp_processor_id()) || 432 (cpu_online(smp_processor_id()) ||
418 (con->flags & CON_ANYTIME))) 433 (con->flags & CON_ANYTIME)))
@@ -544,7 +559,7 @@ static int have_callable_console(void)
544{ 559{
545 struct console *con; 560 struct console *con;
546 561
547 for (con = console_drivers; con; con = con->next) 562 for_each_console(con)
548 if (con->flags & CON_ANYTIME) 563 if (con->flags & CON_ANYTIME)
549 return 1; 564 return 1;
550 565
@@ -1060,12 +1075,6 @@ void __sched console_conditional_schedule(void)
1060} 1075}
1061EXPORT_SYMBOL(console_conditional_schedule); 1076EXPORT_SYMBOL(console_conditional_schedule);
1062 1077
1063void console_print(const char *s)
1064{
1065 printk(KERN_EMERG "%s", s);
1066}
1067EXPORT_SYMBOL(console_print);
1068
1069void console_unblank(void) 1078void console_unblank(void)
1070{ 1079{
1071 struct console *c; 1080 struct console *c;
@@ -1082,7 +1091,7 @@ void console_unblank(void)
1082 1091
1083 console_locked = 1; 1092 console_locked = 1;
1084 console_may_schedule = 0; 1093 console_may_schedule = 0;
1085 for (c = console_drivers; c != NULL; c = c->next) 1094 for_each_console(c)
1086 if ((c->flags & CON_ENABLED) && c->unblank) 1095 if ((c->flags & CON_ENABLED) && c->unblank)
1087 c->unblank(); 1096 c->unblank();
1088 release_console_sem(); 1097 release_console_sem();
@@ -1097,7 +1106,7 @@ struct tty_driver *console_device(int *index)
1097 struct tty_driver *driver = NULL; 1106 struct tty_driver *driver = NULL;
1098 1107
1099 acquire_console_sem(); 1108 acquire_console_sem();
1100 for (c = console_drivers; c != NULL; c = c->next) { 1109 for_each_console(c) {
1101 if (!c->device) 1110 if (!c->device)
1102 continue; 1111 continue;
1103 driver = c->device(c, index); 1112 driver = c->device(c, index);
@@ -1134,25 +1143,49 @@ EXPORT_SYMBOL(console_start);
1134 * to register the console printing procedure with printk() and to 1143 * to register the console printing procedure with printk() and to
1135 * print any messages that were printed by the kernel before the 1144 * print any messages that were printed by the kernel before the
1136 * console driver was initialized. 1145 * console driver was initialized.
1146 *
 1147 * This can happen pretty early during the boot process (because of
 1148 * early_printk), sometimes before setup_arch() completes. Be careful
 1149 * which kernel features are used - they may not be initialised yet.
1150 *
1151 * There are two types of consoles - bootconsoles (early_printk) and
1152 * "real" consoles (everything which is not a bootconsole) which are
1153 * handled differently.
1154 * - Any number of bootconsoles can be registered at any time.
1155 * - As soon as a "real" console is registered, all bootconsoles
1156 * will be unregistered automatically.
1157 * - Once a "real" console is registered, any attempt to register a
 1158 * bootconsole will be rejected
1137 */ 1159 */
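/*
 * Editor's sketch (not part of the patch): the kind of bootconsole the
 * comment above describes.  Names and the write routine are made up.
 */
static void my_early_write(struct console *con, const char *s, unsigned int n)
{
	/* poke the characters at an already-mapped early UART here */
}

static struct console my_early_console = {
	.name	= "myearly",
	.write	= my_early_write,
	.flags	= CON_PRINTBUFFER | CON_BOOT,	/* CON_BOOT: dropped once a real console registers */
	.index	= -1,
};

/* register_console(&my_early_console) can then be called very early. */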
1138void register_console(struct console *console) 1160void register_console(struct console *newcon)
1139{ 1161{
1140 int i; 1162 int i;
1141 unsigned long flags; 1163 unsigned long flags;
1142 struct console *bootconsole = NULL; 1164 struct console *bcon = NULL;
1143 1165
1144 if (console_drivers) { 1166 /*
1145 if (console->flags & CON_BOOT) 1167 * before we register a new CON_BOOT console, make sure we don't
1146 return; 1168 * already have a valid console
1147 if (console_drivers->flags & CON_BOOT) 1169 */
1148 bootconsole = console_drivers; 1170 if (console_drivers && newcon->flags & CON_BOOT) {
1171 /* find the last or real console */
1172 for_each_console(bcon) {
1173 if (!(bcon->flags & CON_BOOT)) {
1174 printk(KERN_INFO "Too late to register bootconsole %s%d\n",
1175 newcon->name, newcon->index);
1176 return;
1177 }
1178 }
1149 } 1179 }
1150 1180
1151 if (preferred_console < 0 || bootconsole || !console_drivers) 1181 if (console_drivers && console_drivers->flags & CON_BOOT)
1182 bcon = console_drivers;
1183
1184 if (preferred_console < 0 || bcon || !console_drivers)
1152 preferred_console = selected_console; 1185 preferred_console = selected_console;
1153 1186
1154 if (console->early_setup) 1187 if (newcon->early_setup)
1155 console->early_setup(); 1188 newcon->early_setup();
1156 1189
1157 /* 1190 /*
1158 * See if we want to use this console driver. If we 1191 * See if we want to use this console driver. If we
@@ -1160,13 +1193,13 @@ void register_console(struct console *console)
1160 * that registers here. 1193 * that registers here.
1161 */ 1194 */
1162 if (preferred_console < 0) { 1195 if (preferred_console < 0) {
1163 if (console->index < 0) 1196 if (newcon->index < 0)
1164 console->index = 0; 1197 newcon->index = 0;
1165 if (console->setup == NULL || 1198 if (newcon->setup == NULL ||
1166 console->setup(console, NULL) == 0) { 1199 newcon->setup(newcon, NULL) == 0) {
1167 console->flags |= CON_ENABLED; 1200 newcon->flags |= CON_ENABLED;
1168 if (console->device) { 1201 if (newcon->device) {
1169 console->flags |= CON_CONSDEV; 1202 newcon->flags |= CON_CONSDEV;
1170 preferred_console = 0; 1203 preferred_console = 0;
1171 } 1204 }
1172 } 1205 }
@@ -1178,64 +1211,62 @@ void register_console(struct console *console)
1178 */ 1211 */
1179 for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; 1212 for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0];
1180 i++) { 1213 i++) {
1181 if (strcmp(console_cmdline[i].name, console->name) != 0) 1214 if (strcmp(console_cmdline[i].name, newcon->name) != 0)
1182 continue; 1215 continue;
1183 if (console->index >= 0 && 1216 if (newcon->index >= 0 &&
1184 console->index != console_cmdline[i].index) 1217 newcon->index != console_cmdline[i].index)
1185 continue; 1218 continue;
1186 if (console->index < 0) 1219 if (newcon->index < 0)
1187 console->index = console_cmdline[i].index; 1220 newcon->index = console_cmdline[i].index;
1188#ifdef CONFIG_A11Y_BRAILLE_CONSOLE 1221#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
1189 if (console_cmdline[i].brl_options) { 1222 if (console_cmdline[i].brl_options) {
1190 console->flags |= CON_BRL; 1223 newcon->flags |= CON_BRL;
1191 braille_register_console(console, 1224 braille_register_console(newcon,
1192 console_cmdline[i].index, 1225 console_cmdline[i].index,
1193 console_cmdline[i].options, 1226 console_cmdline[i].options,
1194 console_cmdline[i].brl_options); 1227 console_cmdline[i].brl_options);
1195 return; 1228 return;
1196 } 1229 }
1197#endif 1230#endif
1198 if (console->setup && 1231 if (newcon->setup &&
1199 console->setup(console, console_cmdline[i].options) != 0) 1232 newcon->setup(newcon, console_cmdline[i].options) != 0)
1200 break; 1233 break;
1201 console->flags |= CON_ENABLED; 1234 newcon->flags |= CON_ENABLED;
1202 console->index = console_cmdline[i].index; 1235 newcon->index = console_cmdline[i].index;
1203 if (i == selected_console) { 1236 if (i == selected_console) {
1204 console->flags |= CON_CONSDEV; 1237 newcon->flags |= CON_CONSDEV;
1205 preferred_console = selected_console; 1238 preferred_console = selected_console;
1206 } 1239 }
1207 break; 1240 break;
1208 } 1241 }
1209 1242
1210 if (!(console->flags & CON_ENABLED)) 1243 if (!(newcon->flags & CON_ENABLED))
1211 return; 1244 return;
1212 1245
1213 if (bootconsole && (console->flags & CON_CONSDEV)) { 1246 /*
 1214 printk(KERN_INFO "console handover: boot [%s%d] -> real [%s%d]\n", 1247 * If we have a bootconsole and are switching to a real console,
 1215 bootconsole->name, bootconsole->index, 1248 * don't print everything out again, since when the boot console and
 1216 console->name, console->index); 1249 * the real console are the same physical device, it's annoying to
 1217 unregister_console(bootconsole); 1250 * see the beginning boot messages twice.
1218 console->flags &= ~CON_PRINTBUFFER; 1251 */
1219 } else { 1252 if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV))
1220 printk(KERN_INFO "console [%s%d] enabled\n", 1253 newcon->flags &= ~CON_PRINTBUFFER;
1221 console->name, console->index);
1222 }
1223 1254
1224 /* 1255 /*
1225 * Put this console in the list - keep the 1256 * Put this console in the list - keep the
1226 * preferred driver at the head of the list. 1257 * preferred driver at the head of the list.
1227 */ 1258 */
1228 acquire_console_sem(); 1259 acquire_console_sem();
1229 if ((console->flags & CON_CONSDEV) || console_drivers == NULL) { 1260 if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) {
1230 console->next = console_drivers; 1261 newcon->next = console_drivers;
1231 console_drivers = console; 1262 console_drivers = newcon;
1232 if (console->next) 1263 if (newcon->next)
1233 console->next->flags &= ~CON_CONSDEV; 1264 newcon->next->flags &= ~CON_CONSDEV;
1234 } else { 1265 } else {
1235 console->next = console_drivers->next; 1266 newcon->next = console_drivers->next;
1236 console_drivers->next = console; 1267 console_drivers->next = newcon;
1237 } 1268 }
1238 if (console->flags & CON_PRINTBUFFER) { 1269 if (newcon->flags & CON_PRINTBUFFER) {
1239 /* 1270 /*
1240 * release_console_sem() will print out the buffered messages 1271 * release_console_sem() will print out the buffered messages
1241 * for us. 1272 * for us.
@@ -1245,6 +1276,28 @@ void register_console(struct console *console)
1245 spin_unlock_irqrestore(&logbuf_lock, flags); 1276 spin_unlock_irqrestore(&logbuf_lock, flags);
1246 } 1277 }
1247 release_console_sem(); 1278 release_console_sem();
1279
1280 /*
1281 * By unregistering the bootconsoles after we enable the real console
1282 * we get the "console xxx enabled" message on all the consoles -
1283 * boot consoles, real consoles, etc - this is to ensure that end
1284 * users know there might be something in the kernel's log buffer that
1285 * went to the bootconsole (that they do not see on the real console)
1286 */
1287 if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) {
1288 /* we need to iterate through twice, to make sure we print
1289 * everything out, before we unregister the console(s)
1290 */
1291 printk(KERN_INFO "console [%s%d] enabled, bootconsole disabled\n",
1292 newcon->name, newcon->index);
1293 for_each_console(bcon)
1294 if (bcon->flags & CON_BOOT)
1295 unregister_console(bcon);
1296 } else {
1297 printk(KERN_INFO "%sconsole [%s%d] enabled\n",
1298 (newcon->flags & CON_BOOT) ? "boot" : "" ,
1299 newcon->name, newcon->index);
1300 }
1248} 1301}
1249EXPORT_SYMBOL(register_console); 1302EXPORT_SYMBOL(register_console);
1250 1303
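For illustration only, a minimal sketch of how a driver might register a boot console through the interface changed above; the early_uart name and the early_uart_putc() helper are hypothetical, not part of this patch:

#include <linux/console.h>
#include <linux/init.h>

extern void early_uart_putc(char c);	/* hypothetical board-specific helper */

static void early_uart_write(struct console *con, const char *s, unsigned int n)
{
	while (n--)
		early_uart_putc(*s++);
}

static struct console early_uart_console = {
	.name	= "earlyuart",
	.write	= early_uart_write,
	.flags	= CON_PRINTBUFFER | CON_BOOT,	/* replay the log buffer; auto-unregistered later */
	.index	= -1,
};

static int __init early_uart_console_init(void)
{
	/* with the change above, any number of CON_BOOT consoles may coexist
	 * until the first real console registers and unregisters them all */
	register_console(&early_uart_console);
	return 0;
}
console_initcall(early_uart_console_init);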
@@ -1287,11 +1340,13 @@ EXPORT_SYMBOL(unregister_console);
1287 1340
1288static int __init disable_boot_consoles(void) 1341static int __init disable_boot_consoles(void)
1289{ 1342{
1290 if (console_drivers != NULL) { 1343 struct console *con;
1291 if (console_drivers->flags & CON_BOOT) { 1344
1345 for_each_console(con) {
1346 if (con->flags & CON_BOOT) {
1292 printk(KERN_INFO "turn off boot console %s%d\n", 1347 printk(KERN_INFO "turn off boot console %s%d\n",
1293 console_drivers->name, console_drivers->index); 1348 con->name, con->index);
1294 return unregister_console(console_drivers); 1349 unregister_console(con);
1295 } 1350 }
1296 } 1351 }
1297 return 0; 1352 return 0;
diff --git a/kernel/profile.c b/kernel/profile.c
index 69911b5745eb..a55d3a367ae8 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -117,11 +117,12 @@ int __ref profile_init(void)
117 117
118 cpumask_copy(prof_cpu_mask, cpu_possible_mask); 118 cpumask_copy(prof_cpu_mask, cpu_possible_mask);
119 119
120 prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL); 120 prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL|__GFP_NOWARN);
121 if (prof_buffer) 121 if (prof_buffer)
122 return 0; 122 return 0;
123 123
124 prof_buffer = alloc_pages_exact(buffer_bytes, GFP_KERNEL|__GFP_ZERO); 124 prof_buffer = alloc_pages_exact(buffer_bytes,
125 GFP_KERNEL|__GFP_ZERO|__GFP_NOWARN);
125 if (prof_buffer) 126 if (prof_buffer)
126 return 0; 127 return 0;
127 128
@@ -441,48 +442,51 @@ void profile_tick(int type)
441 442
442#ifdef CONFIG_PROC_FS 443#ifdef CONFIG_PROC_FS
443#include <linux/proc_fs.h> 444#include <linux/proc_fs.h>
445#include <linux/seq_file.h>
444#include <asm/uaccess.h> 446#include <asm/uaccess.h>
445 447
446static int prof_cpu_mask_read_proc(char *page, char **start, off_t off, 448static int prof_cpu_mask_proc_show(struct seq_file *m, void *v)
447 int count, int *eof, void *data)
448{ 449{
449 int len = cpumask_scnprintf(page, count, data); 450 seq_cpumask(m, prof_cpu_mask);
450 if (count - len < 2) 451 seq_putc(m, '\n');
451 return -EINVAL; 452 return 0;
452 len += sprintf(page + len, "\n");
453 return len;
454} 453}
455 454
456static int prof_cpu_mask_write_proc(struct file *file, 455static int prof_cpu_mask_proc_open(struct inode *inode, struct file *file)
457 const char __user *buffer, unsigned long count, void *data) 456{
457 return single_open(file, prof_cpu_mask_proc_show, NULL);
458}
459
460static ssize_t prof_cpu_mask_proc_write(struct file *file,
461 const char __user *buffer, size_t count, loff_t *pos)
458{ 462{
459 struct cpumask *mask = data;
460 unsigned long full_count = count, err;
461 cpumask_var_t new_value; 463 cpumask_var_t new_value;
464 int err;
462 465
463 if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) 466 if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
464 return -ENOMEM; 467 return -ENOMEM;
465 468
466 err = cpumask_parse_user(buffer, count, new_value); 469 err = cpumask_parse_user(buffer, count, new_value);
467 if (!err) { 470 if (!err) {
468 cpumask_copy(mask, new_value); 471 cpumask_copy(prof_cpu_mask, new_value);
469 err = full_count; 472 err = count;
470 } 473 }
471 free_cpumask_var(new_value); 474 free_cpumask_var(new_value);
472 return err; 475 return err;
473} 476}
474 477
478static const struct file_operations prof_cpu_mask_proc_fops = {
479 .open = prof_cpu_mask_proc_open,
480 .read = seq_read,
481 .llseek = seq_lseek,
482 .release = single_release,
483 .write = prof_cpu_mask_proc_write,
484};
485
475void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir) 486void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir)
476{ 487{
477 struct proc_dir_entry *entry;
478
479 /* create /proc/irq/prof_cpu_mask */ 488 /* create /proc/irq/prof_cpu_mask */
480 entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir); 489 proc_create("prof_cpu_mask", 0600, root_irq_dir, &prof_cpu_mask_proc_fops);
481 if (!entry)
482 return;
483 entry->data = prof_cpu_mask;
484 entry->read_proc = prof_cpu_mask_read_proc;
485 entry->write_proc = prof_cpu_mask_write_proc;
486} 490}
487 491
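As background for the conversion above, a minimal sketch of the proc_create()/single_open() pattern that replaces the old read_proc/write_proc hooks; the file name and the printed value are made up:

#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/init.h>

static int example_proc_show(struct seq_file *m, void *v)
{
	/* single_open() arranges for this to run once per open/read cycle */
	seq_printf(m, "%d\n", 42);	/* placeholder value */
	return 0;
}

static int example_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, example_proc_show, NULL);
}

static const struct file_operations example_proc_fops = {
	.open		= example_proc_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static int __init example_proc_init(void)
{
	/* read-only file; mirrors what create_prof_cpu_mask() now does */
	proc_create("example", 0444, NULL, &example_proc_fops);
	return 0;
}
late_initcall(example_proc_init);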
488/* 492/*
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 61c78b2c07ba..307c285af59e 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -152,7 +152,7 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
152 if (!dumpable && !capable(CAP_SYS_PTRACE)) 152 if (!dumpable && !capable(CAP_SYS_PTRACE))
153 return -EPERM; 153 return -EPERM;
154 154
155 return security_ptrace_may_access(task, mode); 155 return security_ptrace_access_check(task, mode);
156} 156}
157 157
158bool ptrace_may_access(struct task_struct *task, unsigned int mode) 158bool ptrace_may_access(struct task_struct *task, unsigned int mode)
@@ -181,8 +181,8 @@ int ptrace_attach(struct task_struct *task)
181 * interference; SUID, SGID and LSM creds get determined differently 181 * interference; SUID, SGID and LSM creds get determined differently
182 * under ptrace. 182 * under ptrace.
183 */ 183 */
184 retval = mutex_lock_interruptible(&task->cred_guard_mutex); 184 retval = -ERESTARTNOINTR;
185 if (retval < 0) 185 if (mutex_lock_interruptible(&task->cred_guard_mutex))
186 goto out; 186 goto out;
187 187
188 task_lock(task); 188 task_lock(task);
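A small sketch of the locking idiom the hunk above switches to; the mutex and function names are made up, the point being that mutex_lock_interruptible() returns non-zero when interrupted and the caller picks the errno (-ERESTARTNOINTR) itself:

#include <linux/mutex.h>
#include <linux/errno.h>

static DEFINE_MUTEX(example_mutex);	/* stand-in for task->cred_guard_mutex */

static int example_attach(void)
{
	int retval = -ERESTARTNOINTR;	/* errno chosen up front by the caller */

	if (mutex_lock_interruptible(&example_mutex))
		return retval;		/* a signal interrupted the sleep */

	/* ... work that must be done under the lock ... */

	mutex_unlock(&example_mutex);
	return 0;
}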
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
deleted file mode 100644
index 0f2b0b311304..000000000000
--- a/kernel/rcuclassic.c
+++ /dev/null
@@ -1,807 +0,0 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2001
19 *
20 * Authors: Dipankar Sarma <dipankar@in.ibm.com>
21 * Manfred Spraul <manfred@colorfullife.com>
22 *
23 * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
24 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
25 * Papers:
26 * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
27 * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
28 *
29 * For detailed explanation of Read-Copy Update mechanism see -
30 * Documentation/RCU
31 *
32 */
33#include <linux/types.h>
34#include <linux/kernel.h>
35#include <linux/init.h>
36#include <linux/spinlock.h>
37#include <linux/smp.h>
38#include <linux/rcupdate.h>
39#include <linux/interrupt.h>
40#include <linux/sched.h>
41#include <asm/atomic.h>
42#include <linux/bitops.h>
43#include <linux/module.h>
44#include <linux/completion.h>
45#include <linux/moduleparam.h>
46#include <linux/percpu.h>
47#include <linux/notifier.h>
48#include <linux/cpu.h>
49#include <linux/mutex.h>
50#include <linux/time.h>
51
52#ifdef CONFIG_DEBUG_LOCK_ALLOC
53static struct lock_class_key rcu_lock_key;
54struct lockdep_map rcu_lock_map =
55 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
56EXPORT_SYMBOL_GPL(rcu_lock_map);
57#endif
58
59
60/* Definition for rcupdate control block. */
61static struct rcu_ctrlblk rcu_ctrlblk = {
62 .cur = -300,
63 .completed = -300,
64 .pending = -300,
65 .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
66 .cpumask = CPU_BITS_NONE,
67};
68
69static struct rcu_ctrlblk rcu_bh_ctrlblk = {
70 .cur = -300,
71 .completed = -300,
72 .pending = -300,
73 .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
74 .cpumask = CPU_BITS_NONE,
75};
76
77static DEFINE_PER_CPU(struct rcu_data, rcu_data);
78static DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
79
80/*
81 * Increment the quiescent state counter.
 82 * The counter is a bit degenerate: we do not need to know
83 * how many quiescent states passed, just if there was at least
84 * one since the start of the grace period. Thus just a flag.
85 */
86void rcu_qsctr_inc(int cpu)
87{
88 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
89 rdp->passed_quiesc = 1;
90}
91
92void rcu_bh_qsctr_inc(int cpu)
93{
94 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
95 rdp->passed_quiesc = 1;
96}
97
98static int blimit = 10;
99static int qhimark = 10000;
100static int qlowmark = 100;
101
102#ifdef CONFIG_SMP
103static void force_quiescent_state(struct rcu_data *rdp,
104 struct rcu_ctrlblk *rcp)
105{
106 int cpu;
107 unsigned long flags;
108
109 set_need_resched();
110 spin_lock_irqsave(&rcp->lock, flags);
111 if (unlikely(!rcp->signaled)) {
112 rcp->signaled = 1;
113 /*
114 * Don't send IPI to itself. With irqs disabled,
115 * rdp->cpu is the current cpu.
116 *
117 * cpu_online_mask is updated by the _cpu_down()
 118 * using __stop_machine(). Since we're in an irqs-disabled
 119 * section, __stop_machine() is not executing, hence
120 * the cpu_online_mask is stable.
121 *
122 * However, a cpu might have been offlined _just_ before
123 * we disabled irqs while entering here.
124 * And rcu subsystem might not yet have handled the CPU_DEAD
125 * notification, leading to the offlined cpu's bit
126 * being set in the rcp->cpumask.
127 *
128 * Hence cpumask = (rcp->cpumask & cpu_online_mask) to prevent
 129 * sending smp_send_reschedule() to an offlined CPU.
130 */
131 for_each_cpu_and(cpu,
132 to_cpumask(rcp->cpumask), cpu_online_mask) {
133 if (cpu != rdp->cpu)
134 smp_send_reschedule(cpu);
135 }
136 }
137 spin_unlock_irqrestore(&rcp->lock, flags);
138}
139#else
140static inline void force_quiescent_state(struct rcu_data *rdp,
141 struct rcu_ctrlblk *rcp)
142{
143 set_need_resched();
144}
145#endif
146
147static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp,
148 struct rcu_data *rdp)
149{
150 long batch;
151
152 head->next = NULL;
153 smp_mb(); /* Read of rcu->cur must happen after any change by caller. */
154
155 /*
156 * Determine the batch number of this callback.
157 *
158 * Using ACCESS_ONCE to avoid the following error when gcc eliminates
159 * local variable "batch" and emits codes like this:
160 * 1) rdp->batch = rcp->cur + 1 # gets old value
161 * ......
162 * 2)rcu_batch_after(rcp->cur + 1, rdp->batch) # gets new value
163 * then [*nxttail[0], *nxttail[1]) may contain callbacks
164 * that batch# = rdp->batch, see the comment of struct rcu_data.
165 */
166 batch = ACCESS_ONCE(rcp->cur) + 1;
167
168 if (rdp->nxtlist && rcu_batch_after(batch, rdp->batch)) {
169 /* process callbacks */
170 rdp->nxttail[0] = rdp->nxttail[1];
171 rdp->nxttail[1] = rdp->nxttail[2];
172 if (rcu_batch_after(batch - 1, rdp->batch))
173 rdp->nxttail[0] = rdp->nxttail[2];
174 }
175
176 rdp->batch = batch;
177 *rdp->nxttail[2] = head;
178 rdp->nxttail[2] = &head->next;
179
180 if (unlikely(++rdp->qlen > qhimark)) {
181 rdp->blimit = INT_MAX;
182 force_quiescent_state(rdp, &rcu_ctrlblk);
183 }
184}
185
186#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
187
188static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
189{
190 rcp->gp_start = jiffies;
191 rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK;
192}
193
194static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
195{
196 int cpu;
197 long delta;
198 unsigned long flags;
199
200 /* Only let one CPU complain about others per time interval. */
201
202 spin_lock_irqsave(&rcp->lock, flags);
203 delta = jiffies - rcp->jiffies_stall;
204 if (delta < 2 || rcp->cur != rcp->completed) {
205 spin_unlock_irqrestore(&rcp->lock, flags);
206 return;
207 }
208 rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
209 spin_unlock_irqrestore(&rcp->lock, flags);
210
211 /* OK, time to rat on our buddy... */
212
213 printk(KERN_ERR "INFO: RCU detected CPU stalls:");
214 for_each_possible_cpu(cpu) {
215 if (cpumask_test_cpu(cpu, to_cpumask(rcp->cpumask)))
216 printk(" %d", cpu);
217 }
218 printk(" (detected by %d, t=%ld jiffies)\n",
219 smp_processor_id(), (long)(jiffies - rcp->gp_start));
220}
221
222static void print_cpu_stall(struct rcu_ctrlblk *rcp)
223{
224 unsigned long flags;
225
226 printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu/%lu jiffies)\n",
227 smp_processor_id(), jiffies,
228 jiffies - rcp->gp_start);
229 dump_stack();
230 spin_lock_irqsave(&rcp->lock, flags);
231 if ((long)(jiffies - rcp->jiffies_stall) >= 0)
232 rcp->jiffies_stall =
233 jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
234 spin_unlock_irqrestore(&rcp->lock, flags);
235 set_need_resched(); /* kick ourselves to get things going. */
236}
237
238static void check_cpu_stall(struct rcu_ctrlblk *rcp)
239{
240 long delta;
241
242 delta = jiffies - rcp->jiffies_stall;
243 if (cpumask_test_cpu(smp_processor_id(), to_cpumask(rcp->cpumask)) &&
244 delta >= 0) {
245
246 /* We haven't checked in, so go dump stack. */
247 print_cpu_stall(rcp);
248
249 } else if (rcp->cur != rcp->completed && delta >= 2) {
250
251 /* They had two seconds to dump stack, so complain. */
252 print_other_cpu_stall(rcp);
253 }
254}
255
256#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
257
258static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
259{
260}
261
262static inline void check_cpu_stall(struct rcu_ctrlblk *rcp)
263{
264}
265
266#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
267
268/**
269 * call_rcu - Queue an RCU callback for invocation after a grace period.
270 * @head: structure to be used for queueing the RCU updates.
271 * @func: actual update function to be invoked after the grace period
272 *
273 * The update function will be invoked some time after a full grace
274 * period elapses, in other words after all currently executing RCU
275 * read-side critical sections have completed. RCU read-side critical
276 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
277 * and may be nested.
278 */
279void call_rcu(struct rcu_head *head,
280 void (*func)(struct rcu_head *rcu))
281{
282 unsigned long flags;
283
284 head->func = func;
285 local_irq_save(flags);
286 __call_rcu(head, &rcu_ctrlblk, &__get_cpu_var(rcu_data));
287 local_irq_restore(flags);
288}
289EXPORT_SYMBOL_GPL(call_rcu);
290
291/**
292 * call_rcu_bh - Queue an RCU for invocation after a quicker grace period.
293 * @head: structure to be used for queueing the RCU updates.
294 * @func: actual update function to be invoked after the grace period
295 *
296 * The update function will be invoked some time after a full grace
297 * period elapses, in other words after all currently executing RCU
298 * read-side critical sections have completed. call_rcu_bh() assumes
299 * that the read-side critical sections end on completion of a softirq
300 * handler. This means that read-side critical sections in process
301 * context must not be interrupted by softirqs. This interface is to be
302 * used when most of the read-side critical sections are in softirq context.
303 * RCU read-side critical sections are delimited by rcu_read_lock() and
 304 * rcu_read_unlock(), if in interrupt context, or rcu_read_lock_bh()
305 * and rcu_read_unlock_bh(), if in process context. These may be nested.
306 */
307void call_rcu_bh(struct rcu_head *head,
308 void (*func)(struct rcu_head *rcu))
309{
310 unsigned long flags;
311
312 head->func = func;
313 local_irq_save(flags);
314 __call_rcu(head, &rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
315 local_irq_restore(flags);
316}
317EXPORT_SYMBOL_GPL(call_rcu_bh);
318
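For reference, a minimal sketch of the usual updater-side use of call_rcu() as exported by the file being removed; struct foo and its fields are illustrative only:

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
	int data;
	struct rcu_head rcu;	/* callback head embedded in the protected object */
};

static void foo_free_rcu(struct rcu_head *head)
{
	/* runs after a grace period, when no reader can still hold a reference */
	kfree(container_of(head, struct foo, rcu));
}

static void foo_release(struct foo *f)
{
	/* defer the kfree() until all pre-existing readers have finished */
	call_rcu(&f->rcu, foo_free_rcu);
}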
319/*
320 * Return the number of RCU batches processed thus far. Useful
321 * for debug and statistics.
322 */
323long rcu_batches_completed(void)
324{
325 return rcu_ctrlblk.completed;
326}
327EXPORT_SYMBOL_GPL(rcu_batches_completed);
328
329/*
330 * Return the number of RCU batches processed thus far. Useful
331 * for debug and statistics.
332 */
333long rcu_batches_completed_bh(void)
334{
335 return rcu_bh_ctrlblk.completed;
336}
337EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
338
339/* Raises the softirq for processing rcu_callbacks. */
340static inline void raise_rcu_softirq(void)
341{
342 raise_softirq(RCU_SOFTIRQ);
343}
344
345/*
346 * Invoke the completed RCU callbacks. They are expected to be in
347 * a per-cpu list.
348 */
349static void rcu_do_batch(struct rcu_data *rdp)
350{
351 unsigned long flags;
352 struct rcu_head *next, *list;
353 int count = 0;
354
355 list = rdp->donelist;
356 while (list) {
357 next = list->next;
358 prefetch(next);
359 list->func(list);
360 list = next;
361 if (++count >= rdp->blimit)
362 break;
363 }
364 rdp->donelist = list;
365
366 local_irq_save(flags);
367 rdp->qlen -= count;
368 local_irq_restore(flags);
369 if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
370 rdp->blimit = blimit;
371
372 if (!rdp->donelist)
373 rdp->donetail = &rdp->donelist;
374 else
375 raise_rcu_softirq();
376}
377
378/*
379 * Grace period handling:
 380 * The grace period handling consists of two steps:
 381 * - A new grace period is started.
 382 * This is done by rcu_start_batch. The start is not broadcast to
 383 * all cpus; they must pick this up by comparing rcp->cur with
 384 * rdp->quiescbatch. All cpus are recorded in the
 385 * rcu_ctrlblk.cpumask bitmap.
 386 * - All cpus must go through a quiescent state.
 387 * Since the start of the grace period is not broadcast, at least two
388 * calls to rcu_check_quiescent_state are required:
389 * The first call just notices that a new grace period is running. The
390 * following calls check if there was a quiescent state since the beginning
391 * of the grace period. If so, it updates rcu_ctrlblk.cpumask. If
392 * the bitmap is empty, then the grace period is completed.
393 * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace
394 * period (if necessary).
395 */
396
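The batch comparisons used throughout this file come from helpers declared in the (likewise removed) rcuclassic.h; a sketch of their assumed shape, where signed subtraction keeps the ordering correct across counter wraparound:

static inline int rcu_batch_before(long a, long b)
{
	return (a - b) < 0;	/* batch a is older than batch b */
}

static inline int rcu_batch_after(long a, long b)
{
	return (a - b) > 0;	/* batch a is newer than batch b */
}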
397/*
398 * Register a new batch of callbacks, and start it up if there is currently no
399 * active batch and the batch to be registered has not already occurred.
400 * Caller must hold rcu_ctrlblk.lock.
401 */
402static void rcu_start_batch(struct rcu_ctrlblk *rcp)
403{
404 if (rcp->cur != rcp->pending &&
405 rcp->completed == rcp->cur) {
406 rcp->cur++;
407 record_gp_stall_check_time(rcp);
408
409 /*
410 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
 411 * barrier. Otherwise it can cause tickless idle CPUs to be
 412 * included in rcp->cpumask, which will extend grace periods
413 * unnecessarily.
414 */
415 smp_mb();
416 cpumask_andnot(to_cpumask(rcp->cpumask),
417 cpu_online_mask, nohz_cpu_mask);
418
419 rcp->signaled = 0;
420 }
421}
422
423/*
424 * cpu went through a quiescent state since the beginning of the grace period.
425 * Clear it from the cpu mask and complete the grace period if it was the last
426 * cpu. Start another grace period if someone has further entries pending
427 */
428static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
429{
430 cpumask_clear_cpu(cpu, to_cpumask(rcp->cpumask));
431 if (cpumask_empty(to_cpumask(rcp->cpumask))) {
432 /* batch completed ! */
433 rcp->completed = rcp->cur;
434 rcu_start_batch(rcp);
435 }
436}
437
438/*
439 * Check if the cpu has gone through a quiescent state (say context
 440 * switch). If so, and if it hasn't already done so in this RCU
441 * quiescent cycle, then indicate that it has done so.
442 */
443static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
444 struct rcu_data *rdp)
445{
446 unsigned long flags;
447
448 if (rdp->quiescbatch != rcp->cur) {
449 /* start new grace period: */
450 rdp->qs_pending = 1;
451 rdp->passed_quiesc = 0;
452 rdp->quiescbatch = rcp->cur;
453 return;
454 }
455
456 /* Grace period already completed for this cpu?
457 * qs_pending is checked instead of the actual bitmap to avoid
 458 * cacheline thrashing.
459 */
460 if (!rdp->qs_pending)
461 return;
462
463 /*
464 * Was there a quiescent state since the beginning of the grace
465 * period? If no, then exit and wait for the next call.
466 */
467 if (!rdp->passed_quiesc)
468 return;
469 rdp->qs_pending = 0;
470
471 spin_lock_irqsave(&rcp->lock, flags);
472 /*
473 * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
474 * during cpu startup. Ignore the quiescent state.
475 */
476 if (likely(rdp->quiescbatch == rcp->cur))
477 cpu_quiet(rdp->cpu, rcp);
478
479 spin_unlock_irqrestore(&rcp->lock, flags);
480}
481
482
483#ifdef CONFIG_HOTPLUG_CPU
484
485/* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing
 486 * the locking requirements; the list it's pulling from has to belong to a cpu
487 * which is dead and hence not processing interrupts.
488 */
489static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
490 struct rcu_head **tail, long batch)
491{
492 unsigned long flags;
493
494 if (list) {
495 local_irq_save(flags);
496 this_rdp->batch = batch;
497 *this_rdp->nxttail[2] = list;
498 this_rdp->nxttail[2] = tail;
499 local_irq_restore(flags);
500 }
501}
502
503static void __rcu_offline_cpu(struct rcu_data *this_rdp,
504 struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
505{
506 unsigned long flags;
507
508 /*
509 * if the cpu going offline owns the grace period
510 * we can block indefinitely waiting for it, so flush
511 * it here
512 */
513 spin_lock_irqsave(&rcp->lock, flags);
514 if (rcp->cur != rcp->completed)
515 cpu_quiet(rdp->cpu, rcp);
516 rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1);
517 rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1);
518 spin_unlock(&rcp->lock);
519
520 this_rdp->qlen += rdp->qlen;
521 local_irq_restore(flags);
522}
523
524static void rcu_offline_cpu(int cpu)
525{
526 struct rcu_data *this_rdp = &get_cpu_var(rcu_data);
527 struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data);
528
529 __rcu_offline_cpu(this_rdp, &rcu_ctrlblk,
530 &per_cpu(rcu_data, cpu));
531 __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk,
532 &per_cpu(rcu_bh_data, cpu));
533 put_cpu_var(rcu_data);
534 put_cpu_var(rcu_bh_data);
535}
536
537#else
538
539static void rcu_offline_cpu(int cpu)
540{
541}
542
543#endif
544
545/*
546 * This does the RCU processing work from softirq context.
547 */
548static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
549 struct rcu_data *rdp)
550{
551 unsigned long flags;
552 long completed_snap;
553
554 if (rdp->nxtlist) {
555 local_irq_save(flags);
556 completed_snap = ACCESS_ONCE(rcp->completed);
557
558 /*
559 * move the other grace-period-completed entries to
560 * [rdp->nxtlist, *rdp->nxttail[0]) temporarily
561 */
562 if (!rcu_batch_before(completed_snap, rdp->batch))
563 rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2];
564 else if (!rcu_batch_before(completed_snap, rdp->batch - 1))
565 rdp->nxttail[0] = rdp->nxttail[1];
566
567 /*
568 * the grace period for entries in
569 * [rdp->nxtlist, *rdp->nxttail[0]) has completed and
570 * move these entries to donelist
571 */
572 if (rdp->nxttail[0] != &rdp->nxtlist) {
573 *rdp->donetail = rdp->nxtlist;
574 rdp->donetail = rdp->nxttail[0];
575 rdp->nxtlist = *rdp->nxttail[0];
576 *rdp->donetail = NULL;
577
578 if (rdp->nxttail[1] == rdp->nxttail[0])
579 rdp->nxttail[1] = &rdp->nxtlist;
580 if (rdp->nxttail[2] == rdp->nxttail[0])
581 rdp->nxttail[2] = &rdp->nxtlist;
582 rdp->nxttail[0] = &rdp->nxtlist;
583 }
584
585 local_irq_restore(flags);
586
587 if (rcu_batch_after(rdp->batch, rcp->pending)) {
588 unsigned long flags2;
589
590 /* and start it/schedule start if it's a new batch */
591 spin_lock_irqsave(&rcp->lock, flags2);
592 if (rcu_batch_after(rdp->batch, rcp->pending)) {
593 rcp->pending = rdp->batch;
594 rcu_start_batch(rcp);
595 }
596 spin_unlock_irqrestore(&rcp->lock, flags2);
597 }
598 }
599
600 rcu_check_quiescent_state(rcp, rdp);
601 if (rdp->donelist)
602 rcu_do_batch(rdp);
603}
604
605static void rcu_process_callbacks(struct softirq_action *unused)
606{
607 /*
608 * Memory references from any prior RCU read-side critical sections
 609 * executed by the interrupted code must be seen before any RCU
 610 * grace-period manipulations below.
611 */
612
613 smp_mb(); /* See above block comment. */
614
615 __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
616 __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
617
618 /*
619 * Memory references from any later RCU read-side critical sections
 620 * executed by the interrupted code must be seen after any RCU
 621 * grace-period manipulations above.
622 */
623
624 smp_mb(); /* See above block comment. */
625}
626
627static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
628{
629 /* Check for CPU stalls, if enabled. */
630 check_cpu_stall(rcp);
631
632 if (rdp->nxtlist) {
633 long completed_snap = ACCESS_ONCE(rcp->completed);
634
635 /*
636 * This cpu has pending rcu entries and the grace period
637 * for them has completed.
638 */
639 if (!rcu_batch_before(completed_snap, rdp->batch))
640 return 1;
641 if (!rcu_batch_before(completed_snap, rdp->batch - 1) &&
642 rdp->nxttail[0] != rdp->nxttail[1])
643 return 1;
644 if (rdp->nxttail[0] != &rdp->nxtlist)
645 return 1;
646
647 /*
648 * This cpu has pending rcu entries and the new batch
 649 * for them hasn't been started, nor has its start been scheduled.
650 */
651 if (rcu_batch_after(rdp->batch, rcp->pending))
652 return 1;
653 }
654
655 /* This cpu has finished callbacks to invoke */
656 if (rdp->donelist)
657 return 1;
658
659 /* The rcu core waits for a quiescent state from the cpu */
660 if (rdp->quiescbatch != rcp->cur || rdp->qs_pending)
661 return 1;
662
663 /* nothing to do */
664 return 0;
665}
666
667/*
668 * Check to see if there is any immediate RCU-related work to be done
669 * by the current CPU, returning 1 if so. This function is part of the
670 * RCU implementation; it is -not- an exported member of the RCU API.
671 */
672int rcu_pending(int cpu)
673{
674 return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) ||
675 __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu));
676}
677
678/*
679 * Check to see if any future RCU-related work will need to be done
680 * by the current CPU, even if none need be done immediately, returning
681 * 1 if so. This function is part of the RCU implementation; it is -not-
682 * an exported member of the RCU API.
683 */
684int rcu_needs_cpu(int cpu)
685{
686 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
687 struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
688
689 return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu);
690}
691
692/*
693 * Top-level function driving RCU grace-period detection, normally
694 * invoked from the scheduler-clock interrupt. This function simply
695 * increments counters that are read only from softirq by this same
696 * CPU, so there are no memory barriers required.
697 */
698void rcu_check_callbacks(int cpu, int user)
699{
700 if (user ||
701 (idle_cpu(cpu) && rcu_scheduler_active &&
702 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
703
704 /*
705 * Get here if this CPU took its interrupt from user
706 * mode or from the idle loop, and if this is not a
707 * nested interrupt. In this case, the CPU is in
708 * a quiescent state, so count it.
709 *
710 * Also do a memory barrier. This is needed to handle
711 * the case where writes from a preempt-disable section
712 * of code get reordered into schedule() by this CPU's
713 * write buffer. The memory barrier makes sure that
 714 * the rcu_qsctr_inc() and rcu_bh_qsctr_inc() are seen
715 * by other CPUs to happen after any such write.
716 */
717
718 smp_mb(); /* See above block comment. */
719 rcu_qsctr_inc(cpu);
720 rcu_bh_qsctr_inc(cpu);
721
722 } else if (!in_softirq()) {
723
724 /*
725 * Get here if this CPU did not take its interrupt from
726 * softirq, in other words, if it is not interrupting
727 * a rcu_bh read-side critical section. This is an _bh
728 * critical section, so count it. The memory barrier
729 * is needed for the same reason as is the above one.
730 */
731
732 smp_mb(); /* See above block comment. */
733 rcu_bh_qsctr_inc(cpu);
734 }
735 raise_rcu_softirq();
736}
737
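As a rough orientation, a simplified sketch of how the scheduler-clock path is expected to feed rcu_check_callbacks(); the wrapper below is illustrative only, not the real per-tick code:

#include <linux/rcupdate.h>
#include <linux/smp.h>

static void example_tick_hook(int user_tick)
{
	int cpu = smp_processor_id();

	/* user_tick != 0 means the tick interrupted user mode, which is
	 * itself a quiescent state for this CPU */
	rcu_check_callbacks(cpu, user_tick);
}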
738static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
739 struct rcu_data *rdp)
740{
741 unsigned long flags;
742
743 spin_lock_irqsave(&rcp->lock, flags);
744 memset(rdp, 0, sizeof(*rdp));
745 rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist;
746 rdp->donetail = &rdp->donelist;
747 rdp->quiescbatch = rcp->completed;
748 rdp->qs_pending = 0;
749 rdp->cpu = cpu;
750 rdp->blimit = blimit;
751 spin_unlock_irqrestore(&rcp->lock, flags);
752}
753
754static void __cpuinit rcu_online_cpu(int cpu)
755{
756 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
757 struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu);
758
759 rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp);
760 rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp);
761 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
762}
763
764static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
765 unsigned long action, void *hcpu)
766{
767 long cpu = (long)hcpu;
768
769 switch (action) {
770 case CPU_UP_PREPARE:
771 case CPU_UP_PREPARE_FROZEN:
772 rcu_online_cpu(cpu);
773 break;
774 case CPU_DEAD:
775 case CPU_DEAD_FROZEN:
776 rcu_offline_cpu(cpu);
777 break;
778 default:
779 break;
780 }
781 return NOTIFY_OK;
782}
783
784static struct notifier_block __cpuinitdata rcu_nb = {
785 .notifier_call = rcu_cpu_notify,
786};
787
788/*
789 * Initializes rcu mechanism. Assumed to be called early.
 790 * That is, before the local timer (SMP) or jiffies timer (uniprocessor) is set up.
791 * Note that rcu_qsctr and friends are implicitly
792 * initialized due to the choice of ``0'' for RCU_CTR_INVALID.
793 */
794void __init __rcu_init(void)
795{
796#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
797 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
798#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
799 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
800 (void *)(long)smp_processor_id());
801 /* Register notifier for non-boot CPUs */
802 register_cpu_notifier(&rcu_nb);
803}
804
805module_param(blimit, int, 0);
806module_param(qhimark, int, 0);
807module_param(qlowmark, int, 0);
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index a967c9feb90a..37ac45483082 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -19,7 +19,7 @@
19 * 19 *
20 * Authors: Dipankar Sarma <dipankar@in.ibm.com> 20 * Authors: Dipankar Sarma <dipankar@in.ibm.com>
21 * Manfred Spraul <manfred@colorfullife.com> 21 * Manfred Spraul <manfred@colorfullife.com>
22 * 22 *
23 * Based on the original work by Paul McKenney <paulmck@us.ibm.com> 23 * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
24 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. 24 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
25 * Papers: 25 * Papers:
@@ -27,7 +27,7 @@
27 * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) 27 * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
28 * 28 *
29 * For detailed explanation of Read-Copy Update mechanism see - 29 * For detailed explanation of Read-Copy Update mechanism see -
30 * http://lse.sourceforge.net/locking/rcupdate.html 30 * http://lse.sourceforge.net/locking/rcupdate.html
31 * 31 *
32 */ 32 */
33#include <linux/types.h> 33#include <linux/types.h>
@@ -74,6 +74,8 @@ void wakeme_after_rcu(struct rcu_head *head)
74 complete(&rcu->completion); 74 complete(&rcu->completion);
75} 75}
76 76
77#ifdef CONFIG_TREE_PREEMPT_RCU
78
77/** 79/**
78 * synchronize_rcu - wait until a grace period has elapsed. 80 * synchronize_rcu - wait until a grace period has elapsed.
79 * 81 *
@@ -87,7 +89,7 @@ void synchronize_rcu(void)
87{ 89{
88 struct rcu_synchronize rcu; 90 struct rcu_synchronize rcu;
89 91
90 if (rcu_blocking_is_gp()) 92 if (!rcu_scheduler_active)
91 return; 93 return;
92 94
93 init_completion(&rcu.completion); 95 init_completion(&rcu.completion);
@@ -98,6 +100,70 @@ void synchronize_rcu(void)
98} 100}
99EXPORT_SYMBOL_GPL(synchronize_rcu); 101EXPORT_SYMBOL_GPL(synchronize_rcu);
100 102
103#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
104
105/**
106 * synchronize_sched - wait until an rcu-sched grace period has elapsed.
107 *
108 * Control will return to the caller some time after a full rcu-sched
109 * grace period has elapsed, in other words after all currently executing
110 * rcu-sched read-side critical sections have completed. These read-side
111 * critical sections are delimited by rcu_read_lock_sched() and
112 * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(),
113 * local_irq_disable(), and so on may be used in place of
114 * rcu_read_lock_sched().
115 *
116 * This means that all preempt_disable code sequences, including NMI and
117 * hardware-interrupt handlers, in progress on entry will have completed
118 * before this primitive returns. However, this does not guarantee that
119 * softirq handlers will have completed, since in some kernels, these
120 * handlers can run in process context, and can block.
121 *
122 * This primitive provides the guarantees made by the (now removed)
123 * synchronize_kernel() API. In contrast, synchronize_rcu() only
124 * guarantees that rcu_read_lock() sections will have completed.
125 * In "classic RCU", these two guarantees happen to be one and
126 * the same, but can differ in realtime RCU implementations.
127 */
128void synchronize_sched(void)
129{
130 struct rcu_synchronize rcu;
131
132 if (rcu_blocking_is_gp())
133 return;
134
135 init_completion(&rcu.completion);
136 /* Will wake me after RCU finished. */
137 call_rcu_sched(&rcu.head, wakeme_after_rcu);
138 /* Wait for it. */
139 wait_for_completion(&rcu.completion);
140}
141EXPORT_SYMBOL_GPL(synchronize_sched);
142
143/**
144 * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed.
145 *
146 * Control will return to the caller some time after a full rcu_bh grace
147 * period has elapsed, in other words after all currently executing rcu_bh
148 * read-side critical sections have completed. RCU read-side critical
149 * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(),
150 * and may be nested.
151 */
152void synchronize_rcu_bh(void)
153{
154 struct rcu_synchronize rcu;
155
156 if (rcu_blocking_is_gp())
157 return;
158
159 init_completion(&rcu.completion);
160 /* Will wake me after RCU finished. */
161 call_rcu_bh(&rcu.head, wakeme_after_rcu);
162 /* Wait for it. */
163 wait_for_completion(&rcu.completion);
164}
165EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
166
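A minimal sketch of the classic updater pattern these synchronize_*() primitives support; the global pointer and struct config are illustrative only:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct config {
	int value;
};

static struct config *cur_config;

static void config_replace(struct config *newc)
{
	struct config *old = cur_config;

	rcu_assign_pointer(cur_config, newc);	/* publish the new version to readers */
	synchronize_rcu();			/* wait for all pre-existing readers to finish */
	kfree(old);				/* now nothing can still be using the old copy */
}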
101static void rcu_barrier_callback(struct rcu_head *notused) 167static void rcu_barrier_callback(struct rcu_head *notused)
102{ 168{
103 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) 169 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
@@ -129,6 +195,7 @@ static void rcu_barrier_func(void *type)
129static inline void wait_migrated_callbacks(void) 195static inline void wait_migrated_callbacks(void)
130{ 196{
131 wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count)); 197 wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count));
198 smp_mb(); /* In case we didn't sleep. */
132} 199}
133 200
134/* 201/*
@@ -192,9 +259,13 @@ static void rcu_migrate_callback(struct rcu_head *notused)
192 wake_up(&rcu_migrate_wq); 259 wake_up(&rcu_migrate_wq);
193} 260}
194 261
262extern int rcu_cpu_notify(struct notifier_block *self,
263 unsigned long action, void *hcpu);
264
195static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self, 265static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
196 unsigned long action, void *hcpu) 266 unsigned long action, void *hcpu)
197{ 267{
268 rcu_cpu_notify(self, action, hcpu);
198 if (action == CPU_DYING) { 269 if (action == CPU_DYING) {
199 /* 270 /*
200 * preempt_disable() in on_each_cpu() prevents stop_machine(), 271 * preempt_disable() in on_each_cpu() prevents stop_machine(),
@@ -209,7 +280,8 @@ static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
209 call_rcu_bh(rcu_migrate_head, rcu_migrate_callback); 280 call_rcu_bh(rcu_migrate_head, rcu_migrate_callback);
210 call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback); 281 call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback);
211 call_rcu(rcu_migrate_head + 2, rcu_migrate_callback); 282 call_rcu(rcu_migrate_head + 2, rcu_migrate_callback);
212 } else if (action == CPU_POST_DEAD) { 283 } else if (action == CPU_DOWN_PREPARE) {
284 /* Don't need to wait until next removal operation. */
213 /* rcu_migrate_head is protected by cpu_add_remove_lock */ 285 /* rcu_migrate_head is protected by cpu_add_remove_lock */
214 wait_migrated_callbacks(); 286 wait_migrated_callbacks();
215 } 287 }
@@ -219,8 +291,18 @@ static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
219 291
220void __init rcu_init(void) 292void __init rcu_init(void)
221{ 293{
294 int i;
295
222 __rcu_init(); 296 __rcu_init();
223 hotcpu_notifier(rcu_barrier_cpu_hotplug, 0); 297 cpu_notifier(rcu_barrier_cpu_hotplug, 0);
298
299 /*
300 * We don't need protection against CPU-hotplug here because
301 * this is called early in boot, before either interrupts
302 * or the scheduler are operational.
303 */
304 for_each_online_cpu(i)
305 rcu_barrier_cpu_hotplug(NULL, CPU_UP_PREPARE, (void *)(long)i);
224} 306}
225 307
226void rcu_scheduler_starting(void) 308void rcu_scheduler_starting(void)
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
deleted file mode 100644
index beb0e659adcc..000000000000
--- a/kernel/rcupreempt.c
+++ /dev/null
@@ -1,1539 +0,0 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion, realtime implementation
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2006
19 *
20 * Authors: Paul E. McKenney <paulmck@us.ibm.com>
21 * With thanks to Esben Nielsen, Bill Huey, and Ingo Molnar
22 * for pushing me away from locks and towards counters, and
23 * to Suparna Bhattacharya for pushing me completely away
24 * from atomic instructions on the read side.
25 *
26 * - Added handling of Dynamic Ticks
27 * Copyright 2007 - Paul E. Mckenney <paulmck@us.ibm.com>
28 * - Steven Rostedt <srostedt@redhat.com>
29 *
30 * Papers: http://www.rdrop.com/users/paulmck/RCU
31 *
32 * Design Document: http://lwn.net/Articles/253651/
33 *
34 * For detailed explanation of Read-Copy Update mechanism see -
35 * Documentation/RCU/ *.txt
36 *
37 */
38#include <linux/types.h>
39#include <linux/kernel.h>
40#include <linux/init.h>
41#include <linux/spinlock.h>
42#include <linux/smp.h>
43#include <linux/rcupdate.h>
44#include <linux/interrupt.h>
45#include <linux/sched.h>
46#include <asm/atomic.h>
47#include <linux/bitops.h>
48#include <linux/module.h>
49#include <linux/kthread.h>
50#include <linux/completion.h>
51#include <linux/moduleparam.h>
52#include <linux/percpu.h>
53#include <linux/notifier.h>
54#include <linux/cpu.h>
55#include <linux/random.h>
56#include <linux/delay.h>
57#include <linux/cpumask.h>
58#include <linux/rcupreempt_trace.h>
59#include <asm/byteorder.h>
60
61/*
62 * PREEMPT_RCU data structures.
63 */
64
65/*
66 * GP_STAGES specifies the number of times the state machine has
 67 * to go through all the rcu_try_flip_states (see below)
68 * in a single Grace Period.
69 *
70 * GP in GP_STAGES stands for Grace Period ;)
71 */
72#define GP_STAGES 2
73struct rcu_data {
74 spinlock_t lock; /* Protect rcu_data fields. */
75 long completed; /* Number of last completed batch. */
76 int waitlistcount;
77 struct rcu_head *nextlist;
78 struct rcu_head **nexttail;
79 struct rcu_head *waitlist[GP_STAGES];
80 struct rcu_head **waittail[GP_STAGES];
81 struct rcu_head *donelist; /* from waitlist & waitschedlist */
82 struct rcu_head **donetail;
83 long rcu_flipctr[2];
84 struct rcu_head *nextschedlist;
85 struct rcu_head **nextschedtail;
86 struct rcu_head *waitschedlist;
87 struct rcu_head **waitschedtail;
88 int rcu_sched_sleeping;
89#ifdef CONFIG_RCU_TRACE
90 struct rcupreempt_trace trace;
91#endif /* #ifdef CONFIG_RCU_TRACE */
92};
93
94/*
95 * States for rcu_try_flip() and friends.
96 */
97
98enum rcu_try_flip_states {
99
100 /*
 101 * Stay here if nothing is happening. Flip the counter if something
102 * starts happening. Denoted by "I"
103 */
104 rcu_try_flip_idle_state,
105
106 /*
107 * Wait here for all CPUs to notice that the counter has flipped. This
108 * prevents the old set of counters from ever being incremented once
109 * we leave this state, which in turn is necessary because we cannot
110 * test any individual counter for zero -- we can only check the sum.
111 * Denoted by "A".
112 */
113 rcu_try_flip_waitack_state,
114
115 /*
116 * Wait here for the sum of the old per-CPU counters to reach zero.
117 * Denoted by "Z".
118 */
119 rcu_try_flip_waitzero_state,
120
121 /*
122 * Wait here for each of the other CPUs to execute a memory barrier.
123 * This is necessary to ensure that these other CPUs really have
124 * completed executing their RCU read-side critical sections, despite
125 * their CPUs wildly reordering memory. Denoted by "M".
126 */
127 rcu_try_flip_waitmb_state,
128};
129
130/*
131 * States for rcu_ctrlblk.rcu_sched_sleep.
132 */
133
134enum rcu_sched_sleep_states {
135 rcu_sched_not_sleeping, /* Not sleeping, callbacks need GP. */
136 rcu_sched_sleep_prep, /* Thinking of sleeping, rechecking. */
137 rcu_sched_sleeping, /* Sleeping, awaken if GP needed. */
138};
139
140struct rcu_ctrlblk {
141 spinlock_t fliplock; /* Protect state-machine transitions. */
142 long completed; /* Number of last completed batch. */
143 enum rcu_try_flip_states rcu_try_flip_state; /* The current state of
144 the rcu state machine */
145 spinlock_t schedlock; /* Protect rcu_sched sleep state. */
146 enum rcu_sched_sleep_states sched_sleep; /* rcu_sched state. */
147 wait_queue_head_t sched_wq; /* Place for rcu_sched to sleep. */
148};
149
150struct rcu_dyntick_sched {
151 int dynticks;
152 int dynticks_snap;
153 int sched_qs;
154 int sched_qs_snap;
155 int sched_dynticks_snap;
156};
157
158static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = {
159 .dynticks = 1,
160};
161
162void rcu_qsctr_inc(int cpu)
163{
164 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
165
166 rdssp->sched_qs++;
167}
168
169#ifdef CONFIG_NO_HZ
170
171void rcu_enter_nohz(void)
172{
173 static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
174
175 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
176 __get_cpu_var(rcu_dyntick_sched).dynticks++;
177 WARN_ON_RATELIMIT(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1, &rs);
178}
179
180void rcu_exit_nohz(void)
181{
182 static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
183
184 __get_cpu_var(rcu_dyntick_sched).dynticks++;
185 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
186 WARN_ON_RATELIMIT(!(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1),
187 &rs);
188}
189
190#endif /* CONFIG_NO_HZ */
191
192
193static DEFINE_PER_CPU(struct rcu_data, rcu_data);
194
195static struct rcu_ctrlblk rcu_ctrlblk = {
196 .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock),
197 .completed = 0,
198 .rcu_try_flip_state = rcu_try_flip_idle_state,
199 .schedlock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.schedlock),
200 .sched_sleep = rcu_sched_not_sleeping,
201 .sched_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rcu_ctrlblk.sched_wq),
202};
203
204static struct task_struct *rcu_sched_grace_period_task;
205
206#ifdef CONFIG_RCU_TRACE
207static char *rcu_try_flip_state_names[] =
208 { "idle", "waitack", "waitzero", "waitmb" };
209#endif /* #ifdef CONFIG_RCU_TRACE */
210
211static DECLARE_BITMAP(rcu_cpu_online_map, NR_CPUS) __read_mostly
212 = CPU_BITS_NONE;
213
214/*
215 * Enum and per-CPU flag to determine when each CPU has seen
216 * the most recent counter flip.
217 */
218
219enum rcu_flip_flag_values {
220 rcu_flip_seen, /* Steady/initial state, last flip seen. */
221 /* Only GP detector can update. */
222 rcu_flipped /* Flip just completed, need confirmation. */
223 /* Only corresponding CPU can update. */
224};
225static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_flip_flag_values, rcu_flip_flag)
226 = rcu_flip_seen;
227
228/*
229 * Enum and per-CPU flag to determine when each CPU has executed the
230 * needed memory barrier to fence in memory references from its last RCU
231 * read-side critical section in the just-completed grace period.
232 */
233
234enum rcu_mb_flag_values {
235 rcu_mb_done, /* Steady/initial state, no mb()s required. */
236 /* Only GP detector can update. */
237 rcu_mb_needed /* Flip just completed, need an mb(). */
238 /* Only corresponding CPU can update. */
239};
240static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag)
241 = rcu_mb_done;
242
243/*
244 * RCU_DATA_ME: find the current CPU's rcu_data structure.
245 * RCU_DATA_CPU: find the specified CPU's rcu_data structure.
246 */
247#define RCU_DATA_ME() (&__get_cpu_var(rcu_data))
248#define RCU_DATA_CPU(cpu) (&per_cpu(rcu_data, cpu))
249
250/*
251 * Helper macro for tracing when the appropriate rcu_data is not
252 * cached in a local variable, but where the CPU number is so cached.
253 */
254#define RCU_TRACE_CPU(f, cpu) RCU_TRACE(f, &(RCU_DATA_CPU(cpu)->trace));
255
256/*
257 * Helper macro for tracing when the appropriate rcu_data is not
258 * cached in a local variable.
259 */
260#define RCU_TRACE_ME(f) RCU_TRACE(f, &(RCU_DATA_ME()->trace));
261
262/*
263 * Helper macro for tracing when the appropriate rcu_data is pointed
264 * to by a local variable.
265 */
266#define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace));
267
268#define RCU_SCHED_BATCH_TIME (HZ / 50)
269
270/*
271 * Return the number of RCU batches processed thus far. Useful
272 * for debug and statistics.
273 */
274long rcu_batches_completed(void)
275{
276 return rcu_ctrlblk.completed;
277}
278EXPORT_SYMBOL_GPL(rcu_batches_completed);
279
280void __rcu_read_lock(void)
281{
282 int idx;
283 struct task_struct *t = current;
284 int nesting;
285
286 nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
287 if (nesting != 0) {
288
289 /* An earlier rcu_read_lock() covers us, just count it. */
290
291 t->rcu_read_lock_nesting = nesting + 1;
292
293 } else {
294 unsigned long flags;
295
296 /*
297 * We disable interrupts for the following reasons:
 298 * - If we get a scheduling clock interrupt here, and we
299 * end up acking the counter flip, it's like a promise
300 * that we will never increment the old counter again.
301 * Thus we will break that promise if that
302 * scheduling clock interrupt happens between the time
 303 * we pick up the .completed field and the time that we
304 * increment our counter.
305 *
306 * - We don't want to be preempted out here.
307 *
308 * NMIs can still occur, of course, and might themselves
309 * contain rcu_read_lock().
310 */
311
312 local_irq_save(flags);
313
314 /*
315 * Outermost nesting of rcu_read_lock(), so increment
316 * the current counter for the current CPU. Use volatile
317 * casts to prevent the compiler from reordering.
318 */
319
320 idx = ACCESS_ONCE(rcu_ctrlblk.completed) & 0x1;
321 ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])++;
322
323 /*
324 * Now that the per-CPU counter has been incremented, we
325 * are protected from races with rcu_read_lock() invoked
326 * from NMI handlers on this CPU. We can therefore safely
327 * increment the nesting counter, relieving further NMIs
328 * of the need to increment the per-CPU counter.
329 */
330
331 ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting + 1;
332
333 /*
 334 * Now that we have prevented any NMIs from storing
335 * to the ->rcu_flipctr_idx, we can safely use it to
336 * remember which counter to decrement in the matching
337 * rcu_read_unlock().
338 */
339
340 ACCESS_ONCE(t->rcu_flipctr_idx) = idx;
341 local_irq_restore(flags);
342 }
343}
344EXPORT_SYMBOL_GPL(__rcu_read_lock);
345
346void __rcu_read_unlock(void)
347{
348 int idx;
349 struct task_struct *t = current;
350 int nesting;
351
352 nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
353 if (nesting > 1) {
354
355 /*
356 * We are still protected by the enclosing rcu_read_lock(),
357 * so simply decrement the counter.
358 */
359
360 t->rcu_read_lock_nesting = nesting - 1;
361
362 } else {
363 unsigned long flags;
364
365 /*
366 * Disable local interrupts to prevent the grace-period
367 * detection state machine from seeing us half-done.
368 * NMIs can still occur, of course, and might themselves
369 * contain rcu_read_lock() and rcu_read_unlock().
370 */
371
372 local_irq_save(flags);
373
374 /*
375 * Outermost nesting of rcu_read_unlock(), so we must
376 * decrement the current counter for the current CPU.
377 * This must be done carefully, because NMIs can
378 * occur at any point in this code, and any rcu_read_lock()
379 * and rcu_read_unlock() pairs in the NMI handlers
380 * must interact non-destructively with this code.
381 * Lots of volatile casts, and -very- careful ordering.
382 *
383 * Changes to this code, including this one, must be
384 * inspected, validated, and tested extremely carefully!!!
385 */
386
387 /*
388 * First, pick up the index.
389 */
390
391 idx = ACCESS_ONCE(t->rcu_flipctr_idx);
392
393 /*
394 * Now that we have fetched the counter index, it is
395 * safe to decrement the per-task RCU nesting counter.
396 * After this, any interrupts or NMIs will increment and
397 * decrement the per-CPU counters.
398 */
399 ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting - 1;
400
401 /*
402 * It is now safe to decrement this task's nesting count.
403 * NMIs that occur after this statement will route their
404 * rcu_read_lock() calls through this "else" clause, and
405 * will thus start incrementing the per-CPU counter on
406 * their own. They will also clobber ->rcu_flipctr_idx,
407 * but that is OK, since we have already fetched it.
408 */
409
410 ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])--;
411 local_irq_restore(flags);
412 }
413}
414EXPORT_SYMBOL_GPL(__rcu_read_unlock);
415
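For context, a sketch of the reader-side pairing that __rcu_read_lock()/__rcu_read_unlock() implement underneath rcu_read_lock(); the global pointer and struct item are illustrative:

#include <linux/rcupdate.h>

struct item {
	int key;
};

static struct item *gbl_ptr;	/* updated elsewhere with rcu_assign_pointer() */

static int read_key(void)
{
	struct item *p;
	int key = -1;

	rcu_read_lock();		/* maps to __rcu_read_lock() on this implementation */
	p = rcu_dereference(gbl_ptr);	/* safe snapshot of the pointer */
	if (p)
		key = p->key;
	rcu_read_unlock();		/* maps to __rcu_read_unlock() */

	return key;
}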
416/*
417 * If a global counter flip has occurred since the last time that we
418 * advanced callbacks, advance them. Hardware interrupts must be
419 * disabled when calling this function.
420 */
421static void __rcu_advance_callbacks(struct rcu_data *rdp)
422{
423 int cpu;
424 int i;
425 int wlc = 0;
426
427 if (rdp->completed != rcu_ctrlblk.completed) {
428 if (rdp->waitlist[GP_STAGES - 1] != NULL) {
429 *rdp->donetail = rdp->waitlist[GP_STAGES - 1];
430 rdp->donetail = rdp->waittail[GP_STAGES - 1];
431 RCU_TRACE_RDP(rcupreempt_trace_move2done, rdp);
432 }
433 for (i = GP_STAGES - 2; i >= 0; i--) {
434 if (rdp->waitlist[i] != NULL) {
435 rdp->waitlist[i + 1] = rdp->waitlist[i];
436 rdp->waittail[i + 1] = rdp->waittail[i];
437 wlc++;
438 } else {
439 rdp->waitlist[i + 1] = NULL;
440 rdp->waittail[i + 1] =
441 &rdp->waitlist[i + 1];
442 }
443 }
444 if (rdp->nextlist != NULL) {
445 rdp->waitlist[0] = rdp->nextlist;
446 rdp->waittail[0] = rdp->nexttail;
447 wlc++;
448 rdp->nextlist = NULL;
449 rdp->nexttail = &rdp->nextlist;
450 RCU_TRACE_RDP(rcupreempt_trace_move2wait, rdp);
451 } else {
452 rdp->waitlist[0] = NULL;
453 rdp->waittail[0] = &rdp->waitlist[0];
454 }
455 rdp->waitlistcount = wlc;
456 rdp->completed = rcu_ctrlblk.completed;
457 }
458
459 /*
460 * Check to see if this CPU needs to report that it has seen
461 * the most recent counter flip, thereby declaring that all
462 * subsequent rcu_read_lock() invocations will respect this flip.
463 */
464
465 cpu = raw_smp_processor_id();
466 if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
467 smp_mb(); /* Subsequent counter accesses must see new value */
468 per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
469 smp_mb(); /* Subsequent RCU read-side critical sections */
470 /* seen -after- acknowledgement. */
471 }
472}
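
/*
 * Illustrative user-space sketch (not from the kernel sources): a
 * minimal model of the callback pipeline advanced above.  Each
 * observed counter flip shifts every wait stage down by one; callbacks
 * falling off the oldest stage become "done" and may be invoked.  The
 * model_ names and the integer "lists" are hypothetical
 * simplifications of the real rcu_head lists.
 */
#include <stdio.h>

#define MODEL_GP_STAGES 2

static int model_next;			/* callbacks queued since last flip */
static int model_wait[MODEL_GP_STAGES];	/* callbacks in each wait stage */
static int model_done;			/* callbacks ready to invoke */

static void model_advance_on_flip(void)
{
	int i;

	model_done += model_wait[MODEL_GP_STAGES - 1];	/* oldest stage done */
	for (i = MODEL_GP_STAGES - 2; i >= 0; i--)
		model_wait[i + 1] = model_wait[i];	/* shift stages down */
	model_wait[0] = model_next;			/* newest enter stage 0 */
	model_next = 0;
}

int main(void)
{
	model_next = 3;			/* three call_rcu() invocations */
	model_advance_on_flip();	/* flip 1: waiting in stage 0 */
	model_advance_on_flip();	/* flip 2: waiting in stage 1 */
	model_advance_on_flip();	/* flip 3: done */
	printf("done callbacks after 3 flips: %d\n", model_done);
	return 0;
}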
473
474#ifdef CONFIG_NO_HZ
475static DEFINE_PER_CPU(int, rcu_update_flag);
476
477/**
478 * rcu_irq_enter - Called from hard irq handlers and NMI/SMI.
479 *
480 * If the CPU was idle with dynamic ticks active, this updates the
481 * rcu_dyntick_sched.dynticks to let the RCU handling know that the
482 * CPU is active.
483 */
484void rcu_irq_enter(void)
485{
486 int cpu = smp_processor_id();
487 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
488
489 if (per_cpu(rcu_update_flag, cpu))
490 per_cpu(rcu_update_flag, cpu)++;
491
492 /*
493 * Only update if we are coming from a stopped ticks mode
494 * (rcu_dyntick_sched.dynticks is even).
495 */
496 if (!in_interrupt() &&
497 (rdssp->dynticks & 0x1) == 0) {
498 /*
499 * The following might seem like we could have a race
500 * with NMI/SMIs. But this really isn't a problem.
501 * Here we do a read/modify/write, and the race happens
502 * when an NMI/SMI comes in after the read and before
503 * the write. But NMI/SMIs will increment this counter
504 * twice before returning, so the zero bit will not
505		 * be corrupted by the NMI/SMI, which is the most important
506 * part.
507 *
508		 * The only thing is that we would bring the counter
509		 * back to a position it was in during the NMI/SMI.
510 * But the zero bit would be set, so the rest of the
511 * counter would again be ignored.
512 *
513		 * On return from the IRQ, the counter's zero bit may be
514		 * 0 and the counter may hold the same value as on return
515		 * from the NMI/SMI. If the state machine is unlucky enough to
516 * see that, it still doesn't matter, since all
517 * RCU read-side critical sections on this CPU would
518 * have already completed.
519 */
520 rdssp->dynticks++;
521 /*
522 * The following memory barrier ensures that any
523 * rcu_read_lock() primitives in the irq handler
524 * are seen by other CPUs to follow the above
525 * increment to rcu_dyntick_sched.dynticks. This is
526 * required in order for other CPUs to correctly
527 * determine when it is safe to advance the RCU
528 * grace-period state machine.
529 */
530 smp_mb(); /* see above block comment. */
531 /*
532 * Since we can't determine the dynamic tick mode from
533 * the rcu_dyntick_sched.dynticks after this routine,
534 * we use a second flag to acknowledge that we came
535 * from an idle state with ticks stopped.
536 */
537 per_cpu(rcu_update_flag, cpu)++;
538 /*
539 * If we take an NMI/SMI now, they will also increment
540 * the rcu_update_flag, and will not update the
541 * rcu_dyntick_sched.dynticks on exit. That is for
542 * this IRQ to do.
543 */
544 }
545}
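
/*
 * Illustrative user-space sketch (not from the kernel sources): the
 * even/odd convention on ->dynticks that the comments above rely on.
 * The counter stays even while the CPU is dyntick-idle (no RCU readers
 * possible) and odd while it is handling an interrupt taken from that
 * idle state, so other CPUs can infer idleness from the low-order bit.
 * This model is deliberately simplified: it tracks nesting for every
 * interrupt, whereas the real code does so only for interrupts taken
 * from dyntick idle.  All names are hypothetical.
 */
#include <stdio.h>

struct model_dynticks {
	long dynticks;		/* even: dyntick idle, odd: in irq from idle */
	int nesting;		/* stands in for rcu_update_flag */
};

static void model_irq_enter(struct model_dynticks *d)
{
	if (d->nesting++)
		return;			/* nested irq/NMI: nothing to do */
	d->dynticks++;			/* even -> odd: CPU is now non-idle */
}

static void model_irq_exit(struct model_dynticks *d)
{
	if (--d->nesting)
		return;			/* still nested */
	d->dynticks++;			/* odd -> even: back to dyntick idle */
}

int main(void)
{
	struct model_dynticks d = { .dynticks = 0, .nesting = 0 };

	model_irq_enter(&d);		/* irq taken from idle */
	model_irq_enter(&d);		/* nested NMI */
	model_irq_exit(&d);
	model_irq_exit(&d);
	printf("dynticks=%ld (even again => CPU looks idle)\n", d.dynticks);
	return 0;
}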
546
547/**
548 * rcu_irq_exit - Called when exiting hard irq context.
549 *
550 * If the CPU was idle with dynamic ticks active, update the
551 * rcu_dyntick_sched.dynticks to let the RCU handling be
552 * aware that the CPU is going back to idle with no ticks.
553 */
554void rcu_irq_exit(void)
555{
556 int cpu = smp_processor_id();
557 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
558
559 /*
560 * rcu_update_flag is set if we interrupted the CPU
561 * when it was idle with ticks stopped.
562 * Once this occurs, we keep track of interrupt nesting
563	 * because an NMI/SMI could also come in, and we still
564 * only want the IRQ that started the increment of the
565 * rcu_dyntick_sched.dynticks to be the one that modifies
566 * it on exit.
567 */
568 if (per_cpu(rcu_update_flag, cpu)) {
569 if (--per_cpu(rcu_update_flag, cpu))
570 return;
571
572 /* This must match the interrupt nesting */
573 WARN_ON(in_interrupt());
574
575 /*
576 * If an NMI/SMI happens now we are still
577 * protected by the rcu_dyntick_sched.dynticks being odd.
578 */
579
580 /*
581 * The following memory barrier ensures that any
582 * rcu_read_unlock() primitives in the irq handler
583		 * are seen by other CPUs to precede the following
584 * increment to rcu_dyntick_sched.dynticks. This
585 * is required in order for other CPUs to determine
586 * when it is safe to advance the RCU grace-period
587 * state machine.
588 */
589 smp_mb(); /* see above block comment. */
590 rdssp->dynticks++;
591 WARN_ON(rdssp->dynticks & 0x1);
592 }
593}
594
595void rcu_nmi_enter(void)
596{
597 rcu_irq_enter();
598}
599
600void rcu_nmi_exit(void)
601{
602 rcu_irq_exit();
603}
604
605static void dyntick_save_progress_counter(int cpu)
606{
607 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
608
609 rdssp->dynticks_snap = rdssp->dynticks;
610}
611
612static inline int
613rcu_try_flip_waitack_needed(int cpu)
614{
615 long curr;
616 long snap;
617 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
618
619 curr = rdssp->dynticks;
620 snap = rdssp->dynticks_snap;
621 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
622
623 /*
624 * If the CPU remained in dynticks mode for the entire time
625 * and didn't take any interrupts, NMIs, SMIs, or whatever,
626 * then it cannot be in the middle of an rcu_read_lock(), so
627 * the next rcu_read_lock() it executes must use the new value
628 * of the counter. So we can safely pretend that this CPU
629 * already acknowledged the counter.
630 */
631
632 if ((curr == snap) && ((curr & 0x1) == 0))
633 return 0;
634
635 /*
636 * If the CPU passed through or entered a dynticks idle phase with
637 * no active irq handlers, then, as above, we can safely pretend
638 * that this CPU already acknowledged the counter.
639 */
640
641 if ((curr - snap) > 2 || (curr & 0x1) == 0)
642 return 0;
643
644 /* We need this CPU to explicitly acknowledge the counter flip. */
645
646 return 1;
647}
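
/*
 * Illustrative user-space sketch (not from the kernel sources): the
 * (curr, snap) test performed just above, pulled out on its own.  An
 * unchanged even value means the CPU stayed dyntick-idle the whole
 * time, and a jump of more than one irq round-trip, or a currently
 * even value, means it passed through idle, so in either case the CPU
 * need not explicitly acknowledge the flip.  The sample values in
 * main() are made up.
 */
#include <stdio.h>

/* Return 0 if the CPU may be exempted, 1 if an explicit ack is needed. */
static int model_ack_needed(long curr, long snap)
{
	if (curr == snap && (curr & 0x1) == 0)
		return 0;	/* stayed dyntick-idle the whole time */
	if (curr - snap > 2 || (curr & 0x1) == 0)
		return 0;	/* passed through or entered dyntick idle */
	return 1;		/* must acknowledge the flip itself */
}

int main(void)
{
	printf("%d\n", model_ack_needed(8, 8));		/* 0: idle throughout */
	printf("%d\n", model_ack_needed(9, 8));		/* 1: still in that irq */
	printf("%d\n", model_ack_needed(12, 8));	/* 0: went idle meanwhile */
	return 0;
}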
648
649static inline int
650rcu_try_flip_waitmb_needed(int cpu)
651{
652 long curr;
653 long snap;
654 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
655
656 curr = rdssp->dynticks;
657 snap = rdssp->dynticks_snap;
658 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
659
660 /*
661 * If the CPU remained in dynticks mode for the entire time
662 * and didn't take any interrupts, NMIs, SMIs, or whatever,
663 * then it cannot have executed an RCU read-side critical section
664 * during that time, so there is no need for it to execute a
665 * memory barrier.
666 */
667
668 if ((curr == snap) && ((curr & 0x1) == 0))
669 return 0;
670
671 /*
672 * If the CPU either entered or exited an outermost interrupt,
673 * SMI, NMI, or whatever handler, then we know that it executed
674 * a memory barrier when doing so. So we don't need another one.
675 */
676 if (curr != snap)
677 return 0;
678
679 /* We need the CPU to execute a memory barrier. */
680
681 return 1;
682}
683
684static void dyntick_save_progress_counter_sched(int cpu)
685{
686 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
687
688 rdssp->sched_dynticks_snap = rdssp->dynticks;
689}
690
691static int rcu_qsctr_inc_needed_dyntick(int cpu)
692{
693 long curr;
694 long snap;
695 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
696
697 curr = rdssp->dynticks;
698 snap = rdssp->sched_dynticks_snap;
699 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
700
701 /*
702 * If the CPU remained in dynticks mode for the entire time
703 * and didn't take any interrupts, NMIs, SMIs, or whatever,
704 * then it cannot be in the middle of an rcu_read_lock(), so
705 * the next rcu_read_lock() it executes must use the new value
706 * of the counter. Therefore, this CPU has been in a quiescent
707 * state the entire time, and we don't need to wait for it.
708 */
709
710 if ((curr == snap) && ((curr & 0x1) == 0))
711 return 0;
712
713 /*
714 * If the CPU passed through or entered a dynticks idle phase with
715 * no active irq handlers, then, as above, this CPU has already
716 * passed through a quiescent state.
717 */
718
719 if ((curr - snap) > 2 || (snap & 0x1) == 0)
720 return 0;
721
722 /* We need this CPU to go through a quiescent state. */
723
724 return 1;
725}
726
727#else /* !CONFIG_NO_HZ */
728
729# define dyntick_save_progress_counter(cpu) do { } while (0)
730# define rcu_try_flip_waitack_needed(cpu) (1)
731# define rcu_try_flip_waitmb_needed(cpu) (1)
732
733# define dyntick_save_progress_counter_sched(cpu) do { } while (0)
734# define rcu_qsctr_inc_needed_dyntick(cpu) (1)
735
736#endif /* CONFIG_NO_HZ */
737
738static void save_qsctr_sched(int cpu)
739{
740 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
741
742 rdssp->sched_qs_snap = rdssp->sched_qs;
743}
744
745static inline int rcu_qsctr_inc_needed(int cpu)
746{
747 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
748
749 /*
750 * If there has been a quiescent state, no more need to wait
751 * on this CPU.
752 */
753
754 if (rdssp->sched_qs != rdssp->sched_qs_snap) {
755 smp_mb(); /* force ordering with cpu entering schedule(). */
756 return 0;
757 }
758
759 /* We need this CPU to go through a quiescent state. */
760
761 return 1;
762}
763
764/*
765 * Get here when RCU is idle. Decide whether we need to
766 * move out of idle state, and return non-zero if so.
767 * "Straightforward" approach for the moment, might later
768 * use callback-list lengths, grace-period duration, or
769 * some such to determine when to exit idle state.
770 * Might also need a pre-idle test that does not acquire
771 * the lock, but let's get the simple case working first...
772 */
773
774static int
775rcu_try_flip_idle(void)
776{
777 int cpu;
778
779 RCU_TRACE_ME(rcupreempt_trace_try_flip_i1);
780 if (!rcu_pending(smp_processor_id())) {
781 RCU_TRACE_ME(rcupreempt_trace_try_flip_ie1);
782 return 0;
783 }
784
785 /*
786 * Do the flip.
787 */
788
789 RCU_TRACE_ME(rcupreempt_trace_try_flip_g1);
790 rcu_ctrlblk.completed++; /* stands in for rcu_try_flip_g2 */
791
792 /*
793 * Need a memory barrier so that other CPUs see the new
794 * counter value before they see the subsequent change of all
795 * the rcu_flip_flag instances to rcu_flipped.
796 */
797
798 smp_mb(); /* see above block comment. */
799
800 /* Now ask each CPU for acknowledgement of the flip. */
801
802 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) {
803 per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
804 dyntick_save_progress_counter(cpu);
805 }
806
807 return 1;
808}
809
810/*
811 * Wait for CPUs to acknowledge the flip.
812 */
813
814static int
815rcu_try_flip_waitack(void)
816{
817 int cpu;
818
819 RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
820 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
821 if (rcu_try_flip_waitack_needed(cpu) &&
822 per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
823 RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
824 return 0;
825 }
826
827 /*
828 * Make sure our checks above don't bleed into subsequent
829 * waiting for the sum of the counters to reach zero.
830 */
831
832 smp_mb(); /* see above block comment. */
833 RCU_TRACE_ME(rcupreempt_trace_try_flip_a2);
834 return 1;
835}
836
837/*
838 * Wait for collective ``last'' counter to reach zero,
839 * then tell all CPUs to do an end-of-grace-period memory barrier.
840 */
841
842static int
843rcu_try_flip_waitzero(void)
844{
845 int cpu;
846 int lastidx = !(rcu_ctrlblk.completed & 0x1);
847 int sum = 0;
848
849 /* Check to see if the sum of the "last" counters is zero. */
850
851 RCU_TRACE_ME(rcupreempt_trace_try_flip_z1);
852 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
853 sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx];
854 if (sum != 0) {
855 RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1);
856 return 0;
857 }
858
859 /*
860 * This ensures that the other CPUs see the call for
861 * memory barriers -after- the sum to zero has been
862 * detected here
863 */
864 smp_mb(); /* ^^^^^^^^^^^^ */
865
866 /* Call for a memory barrier from each CPU. */
867 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map)) {
868 per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
869 dyntick_save_progress_counter(cpu);
870 }
871
872 RCU_TRACE_ME(rcupreempt_trace_try_flip_z2);
873 return 1;
874}
875
876/*
877 * Wait for all CPUs to do their end-of-grace-period memory barrier.
878 * Return 0 once all CPUs have done so.
879 */
880
881static int
882rcu_try_flip_waitmb(void)
883{
884 int cpu;
885
886 RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
887 for_each_cpu(cpu, to_cpumask(rcu_cpu_online_map))
888 if (rcu_try_flip_waitmb_needed(cpu) &&
889 per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
890 RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
891 return 0;
892 }
893
894 smp_mb(); /* Ensure that the above checks precede any following flip. */
895 RCU_TRACE_ME(rcupreempt_trace_try_flip_m2);
896 return 1;
897}
898
899/*
900 * Attempt a single flip of the counters. Remember, a single flip does
901 * -not- constitute a grace period. Instead, the interval between
902 * at least GP_STAGES consecutive flips is a grace period.
903 *
904 * If anyone is nuts enough to run this CONFIG_PREEMPT_RCU implementation
905 * on a large SMP, they might want to use a hierarchical organization of
906 * the per-CPU-counter pairs.
907 */
908static void rcu_try_flip(void)
909{
910 unsigned long flags;
911
912 RCU_TRACE_ME(rcupreempt_trace_try_flip_1);
913 if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, flags))) {
914 RCU_TRACE_ME(rcupreempt_trace_try_flip_e1);
915 return;
916 }
917
918 /*
919 * Take the next transition(s) through the RCU grace-period
920 * flip-counter state machine.
921 */
922
923 switch (rcu_ctrlblk.rcu_try_flip_state) {
924 case rcu_try_flip_idle_state:
925 if (rcu_try_flip_idle())
926 rcu_ctrlblk.rcu_try_flip_state =
927 rcu_try_flip_waitack_state;
928 break;
929 case rcu_try_flip_waitack_state:
930 if (rcu_try_flip_waitack())
931 rcu_ctrlblk.rcu_try_flip_state =
932 rcu_try_flip_waitzero_state;
933 break;
934 case rcu_try_flip_waitzero_state:
935 if (rcu_try_flip_waitzero())
936 rcu_ctrlblk.rcu_try_flip_state =
937 rcu_try_flip_waitmb_state;
938 break;
939 case rcu_try_flip_waitmb_state:
940 if (rcu_try_flip_waitmb())
941 rcu_ctrlblk.rcu_try_flip_state =
942 rcu_try_flip_idle_state;
943 }
944 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
945}
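
/*
 * Illustrative user-space sketch (not from the kernel sources): the
 * four-stage cycle that rcu_try_flip() above steps through, reduced to
 * a bare state machine.  Each stage function returns nonzero once its
 * condition holds (flip performed, all CPUs acknowledged, old counters
 * drained to zero, all memory barriers executed); the stubs here
 * always succeed and are hypothetical.
 */
#include <stdio.h>

enum model_flip_state { M_IDLE, M_WAITACK, M_WAITZERO, M_WAITMB };

static int m_idle(void)     { return 1; }	/* stub: flip performed */
static int m_waitack(void)  { return 1; }	/* stub: all CPUs acked */
static int m_waitzero(void) { return 1; }	/* stub: old counters at 0 */
static int m_waitmb(void)   { return 1; }	/* stub: all barriers done */

static enum model_flip_state model_step(enum model_flip_state s)
{
	switch (s) {
	case M_IDLE:     return m_idle()     ? M_WAITACK  : s;
	case M_WAITACK:  return m_waitack()  ? M_WAITZERO : s;
	case M_WAITZERO: return m_waitzero() ? M_WAITMB   : s;
	case M_WAITMB:   return m_waitmb()   ? M_IDLE     : s;
	}
	return s;
}

int main(void)
{
	enum model_flip_state s = M_IDLE;
	int i;

	for (i = 0; i < 4; i++)
		s = model_step(s);	/* idle -> ack -> zero -> mb -> idle */
	printf("back to idle: %d\n", s == M_IDLE);
	return 0;
}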
946
947/*
948 * Check to see if this CPU needs to do a memory barrier in order to
949 * ensure that any prior RCU read-side critical sections have committed
950 * their counter manipulations and critical-section memory references
951 * before declaring the grace period to be completed.
952 */
953static void rcu_check_mb(int cpu)
954{
955 if (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed) {
956 smp_mb(); /* Ensure RCU read-side accesses are visible. */
957 per_cpu(rcu_mb_flag, cpu) = rcu_mb_done;
958 }
959}
960
961void rcu_check_callbacks(int cpu, int user)
962{
963 unsigned long flags;
964 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
965
966 /*
967 * If this CPU took its interrupt from user mode or from the
968 * idle loop, and this is not a nested interrupt, then
969	 * this CPU has to have exited all prior preempt-disable
970 * sections of code. So increment the counter to note this.
971 *
972 * The memory barrier is needed to handle the case where
973 * writes from a preempt-disable section of code get reordered
974 * into schedule() by this CPU's write buffer. So the memory
975 * barrier makes sure that the rcu_qsctr_inc() is seen by other
976 * CPUs to happen after any such write.
977 */
978
979 if (user ||
980 (idle_cpu(cpu) && !in_softirq() &&
981 hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
982 smp_mb(); /* Guard against aggressive schedule(). */
983 rcu_qsctr_inc(cpu);
984 }
985
986 rcu_check_mb(cpu);
987 if (rcu_ctrlblk.completed == rdp->completed)
988 rcu_try_flip();
989 spin_lock_irqsave(&rdp->lock, flags);
990 RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
991 __rcu_advance_callbacks(rdp);
992 if (rdp->donelist == NULL) {
993 spin_unlock_irqrestore(&rdp->lock, flags);
994 } else {
995 spin_unlock_irqrestore(&rdp->lock, flags);
996 raise_softirq(RCU_SOFTIRQ);
997 }
998}
999
1000/*
1001 * Needed by dynticks, to make sure all RCU processing has finished
1002 * when we go idle:
1003 */
1004void rcu_advance_callbacks(int cpu, int user)
1005{
1006 unsigned long flags;
1007 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1008
1009 if (rcu_ctrlblk.completed == rdp->completed) {
1010 rcu_try_flip();
1011 if (rcu_ctrlblk.completed == rdp->completed)
1012 return;
1013 }
1014 spin_lock_irqsave(&rdp->lock, flags);
1015 RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
1016 __rcu_advance_callbacks(rdp);
1017 spin_unlock_irqrestore(&rdp->lock, flags);
1018}
1019
1020#ifdef CONFIG_HOTPLUG_CPU
1021#define rcu_offline_cpu_enqueue(srclist, srctail, dstlist, dsttail) do { \
1022 *dsttail = srclist; \
1023 if (srclist != NULL) { \
1024 dsttail = srctail; \
1025 srclist = NULL; \
1026 srctail = &srclist;\
1027 } \
1028 } while (0)
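
/*
 * Illustrative user-space sketch (not from the kernel sources): the
 * pointer-to-tail-pointer idiom that the macro above splices with.
 * Keeping a pointer to the list's final ->next slot lets appends (and
 * whole-list splices) preserve order and work identically whether or
 * not the list is currently empty.  The model_node type is
 * hypothetical.
 */
#include <stddef.h>
#include <stdio.h>

struct model_node {
	int val;
	struct model_node *next;
};

int main(void)
{
	struct model_node a = { 1, NULL }, b = { 2, NULL };
	struct model_node *list = NULL;
	struct model_node **tail = &list;	/* points at the empty slot */
	struct model_node *p;

	/* Append in order; no special case for the empty list. */
	*tail = &a;
	tail = &a.next;
	*tail = &b;
	tail = &b.next;

	for (p = list; p != NULL; p = p->next)
		printf("%d ", p->val);		/* prints: 1 2 */
	printf("\n");
	return 0;
}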
1029
1030void rcu_offline_cpu(int cpu)
1031{
1032 int i;
1033 struct rcu_head *list = NULL;
1034 unsigned long flags;
1035 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1036 struct rcu_head *schedlist = NULL;
1037 struct rcu_head **schedtail = &schedlist;
1038 struct rcu_head **tail = &list;
1039
1040 /*
1041 * Remove all callbacks from the newly dead CPU, retaining order.
1042	 * Otherwise rcu_barrier() will fail.
1043 */
1044
1045 spin_lock_irqsave(&rdp->lock, flags);
1046 rcu_offline_cpu_enqueue(rdp->donelist, rdp->donetail, list, tail);
1047 for (i = GP_STAGES - 1; i >= 0; i--)
1048 rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i],
1049 list, tail);
1050 rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail);
1051 rcu_offline_cpu_enqueue(rdp->waitschedlist, rdp->waitschedtail,
1052 schedlist, schedtail);
1053 rcu_offline_cpu_enqueue(rdp->nextschedlist, rdp->nextschedtail,
1054 schedlist, schedtail);
1055 rdp->rcu_sched_sleeping = 0;
1056 spin_unlock_irqrestore(&rdp->lock, flags);
1057 rdp->waitlistcount = 0;
1058
1059 /* Disengage the newly dead CPU from the grace-period computation. */
1060
1061 spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
1062 rcu_check_mb(cpu);
1063 if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
1064 smp_mb(); /* Subsequent counter accesses must see new value */
1065 per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
1066 smp_mb(); /* Subsequent RCU read-side critical sections */
1067 /* seen -after- acknowledgement. */
1068 }
1069
1070 RCU_DATA_ME()->rcu_flipctr[0] += RCU_DATA_CPU(cpu)->rcu_flipctr[0];
1071 RCU_DATA_ME()->rcu_flipctr[1] += RCU_DATA_CPU(cpu)->rcu_flipctr[1];
1072
1073 RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0;
1074 RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0;
1075
1076 cpumask_clear_cpu(cpu, to_cpumask(rcu_cpu_online_map));
1077
1078 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
1079
1080 /*
1081 * Place the removed callbacks on the current CPU's queue.
1082 * Make them all start a new grace period: simple approach,
1083 * in theory could starve a given set of callbacks, but
1084 * you would need to be doing some serious CPU hotplugging
1085 * to make this happen. If this becomes a problem, adding
1086 * a synchronize_rcu() to the hotplug path would be a simple
1087 * fix.
1088 */
1089
1090 local_irq_save(flags); /* disable preempt till we know what lock. */
1091 rdp = RCU_DATA_ME();
1092 spin_lock(&rdp->lock);
1093 *rdp->nexttail = list;
1094 if (list)
1095 rdp->nexttail = tail;
1096 *rdp->nextschedtail = schedlist;
1097 if (schedlist)
1098 rdp->nextschedtail = schedtail;
1099 spin_unlock_irqrestore(&rdp->lock, flags);
1100}
1101
1102#else /* #ifdef CONFIG_HOTPLUG_CPU */
1103
1104void rcu_offline_cpu(int cpu)
1105{
1106}
1107
1108#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
1109
1110void __cpuinit rcu_online_cpu(int cpu)
1111{
1112 unsigned long flags;
1113 struct rcu_data *rdp;
1114
1115 spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
1116 cpumask_set_cpu(cpu, to_cpumask(rcu_cpu_online_map));
1117 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
1118
1119 /*
1120 * The rcu_sched grace-period processing might have bypassed
1121 * this CPU, given that it was not in the rcu_cpu_online_map
1122 * when the grace-period scan started. This means that the
1123 * grace-period task might sleep. So make sure that if this
1124 * should happen, the first callback posted to this CPU will
1125 * wake up the grace-period task if need be.
1126 */
1127
1128 rdp = RCU_DATA_CPU(cpu);
1129 spin_lock_irqsave(&rdp->lock, flags);
1130 rdp->rcu_sched_sleeping = 1;
1131 spin_unlock_irqrestore(&rdp->lock, flags);
1132}
1133
1134static void rcu_process_callbacks(struct softirq_action *unused)
1135{
1136 unsigned long flags;
1137 struct rcu_head *next, *list;
1138 struct rcu_data *rdp;
1139
1140 local_irq_save(flags);
1141 rdp = RCU_DATA_ME();
1142 spin_lock(&rdp->lock);
1143 list = rdp->donelist;
1144 if (list == NULL) {
1145 spin_unlock_irqrestore(&rdp->lock, flags);
1146 return;
1147 }
1148 rdp->donelist = NULL;
1149 rdp->donetail = &rdp->donelist;
1150 RCU_TRACE_RDP(rcupreempt_trace_done_remove, rdp);
1151 spin_unlock_irqrestore(&rdp->lock, flags);
1152 while (list) {
1153 next = list->next;
1154 list->func(list);
1155 list = next;
1156 RCU_TRACE_ME(rcupreempt_trace_invoke);
1157 }
1158}
1159
1160void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1161{
1162 unsigned long flags;
1163 struct rcu_data *rdp;
1164
1165 head->func = func;
1166 head->next = NULL;
1167 local_irq_save(flags);
1168 rdp = RCU_DATA_ME();
1169 spin_lock(&rdp->lock);
1170 __rcu_advance_callbacks(rdp);
1171 *rdp->nexttail = head;
1172 rdp->nexttail = &head->next;
1173 RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp);
1174 spin_unlock_irqrestore(&rdp->lock, flags);
1175}
1176EXPORT_SYMBOL_GPL(call_rcu);
1177
1178void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1179{
1180 unsigned long flags;
1181 struct rcu_data *rdp;
1182 int wake_gp = 0;
1183
1184 head->func = func;
1185 head->next = NULL;
1186 local_irq_save(flags);
1187 rdp = RCU_DATA_ME();
1188 spin_lock(&rdp->lock);
1189 *rdp->nextschedtail = head;
1190 rdp->nextschedtail = &head->next;
1191 if (rdp->rcu_sched_sleeping) {
1192
1193 /* Grace-period processing might be sleeping... */
1194
1195 rdp->rcu_sched_sleeping = 0;
1196 wake_gp = 1;
1197 }
1198 spin_unlock_irqrestore(&rdp->lock, flags);
1199 if (wake_gp) {
1200
1201 /* Wake up grace-period processing, unless someone beat us. */
1202
1203 spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
1204 if (rcu_ctrlblk.sched_sleep != rcu_sched_sleeping)
1205 wake_gp = 0;
1206 rcu_ctrlblk.sched_sleep = rcu_sched_not_sleeping;
1207 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1208 if (wake_gp)
1209 wake_up_interruptible(&rcu_ctrlblk.sched_wq);
1210 }
1211}
1212EXPORT_SYMBOL_GPL(call_rcu_sched);
1213
1214/*
1215 * Wait until all currently running preempt_disable() code segments
1216 * (including hardware-irq-disable segments) complete. Note that
1217 * in -rt this does -not- necessarily result in all currently executing
1218 * interrupt -handlers- having completed.
1219 */
1220void __synchronize_sched(void)
1221{
1222 struct rcu_synchronize rcu;
1223
1224 if (num_online_cpus() == 1)
1225 return; /* blocking is gp if only one CPU! */
1226
1227 init_completion(&rcu.completion);
1228 /* Will wake me after RCU finished. */
1229 call_rcu_sched(&rcu.head, wakeme_after_rcu);
1230 /* Wait for it. */
1231 wait_for_completion(&rcu.completion);
1232}
1233EXPORT_SYMBOL_GPL(__synchronize_sched);
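
/*
 * Illustrative user-space sketch (not from the kernel sources): the
 * "queue a callback that completes a completion, then block on it"
 * pattern used by __synchronize_sched() above, modelled with POSIX
 * threads.  A second thread stands in for the grace-period machinery
 * that eventually invokes the queued callback; all model_ names are
 * hypothetical.  (Link with -lpthread.)
 */
#include <pthread.h>
#include <stdio.h>

struct model_completion {
	pthread_mutex_t lock;
	pthread_cond_t cond;
	int done;
};

static struct model_completion mc = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0
};

static void model_complete(struct model_completion *c)
{
	pthread_mutex_lock(&c->lock);
	c->done = 1;			/* the "wakeme_after_rcu" step */
	pthread_cond_signal(&c->cond);
	pthread_mutex_unlock(&c->lock);
}

static void model_wait_for_completion(struct model_completion *c)
{
	pthread_mutex_lock(&c->lock);
	while (!c->done)
		pthread_cond_wait(&c->cond, &c->lock);
	pthread_mutex_unlock(&c->lock);
}

static void *model_gp_thread(void *arg)
{
	(void)arg;
	model_complete(&mc);		/* grace period "ends" */
	return NULL;
}

int main(void)
{
	pthread_t gp;

	pthread_create(&gp, NULL, model_gp_thread, NULL);
	model_wait_for_completion(&mc);	/* the __synchronize_sched() wait */
	pthread_join(gp, NULL);
	printf("grace period observed\n");
	return 0;
}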
1234
1235/*
1236 * kthread function that manages call_rcu_sched grace periods.
1237 */
1238static int rcu_sched_grace_period(void *arg)
1239{
1240 int couldsleep; /* might sleep after current pass. */
1241 int couldsleepnext = 0; /* might sleep after next pass. */
1242 int cpu;
1243 unsigned long flags;
1244 struct rcu_data *rdp;
1245 int ret;
1246
1247 /*
1248 * Each pass through the following loop handles one
1249 * rcu_sched grace period cycle.
1250 */
1251 do {
1252 /* Save each CPU's current state. */
1253
1254 for_each_online_cpu(cpu) {
1255 dyntick_save_progress_counter_sched(cpu);
1256 save_qsctr_sched(cpu);
1257 }
1258
1259 /*
1260 * Sleep for about an RCU grace-period's worth to
1261 * allow better batching and to consume less CPU.
1262 */
1263 schedule_timeout_interruptible(RCU_SCHED_BATCH_TIME);
1264
1265 /*
1266 * If there was nothing to do last time, prepare to
1267 * sleep at the end of the current grace period cycle.
1268 */
1269 couldsleep = couldsleepnext;
1270 couldsleepnext = 1;
1271 if (couldsleep) {
1272 spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
1273 rcu_ctrlblk.sched_sleep = rcu_sched_sleep_prep;
1274 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1275 }
1276
1277 /*
1278 * Wait on each CPU in turn to have either visited
1279 * a quiescent state or been in dynticks-idle mode.
1280 */
1281 for_each_online_cpu(cpu) {
1282 while (rcu_qsctr_inc_needed(cpu) &&
1283 rcu_qsctr_inc_needed_dyntick(cpu)) {
1284 /* resched_cpu(cpu); @@@ */
1285 schedule_timeout_interruptible(1);
1286 }
1287 }
1288
1289 /* Advance callbacks for each CPU. */
1290
1291 for_each_online_cpu(cpu) {
1292
1293 rdp = RCU_DATA_CPU(cpu);
1294 spin_lock_irqsave(&rdp->lock, flags);
1295
1296 /*
1297 * We are running on this CPU irq-disabled, so no
1298 * CPU can go offline until we re-enable irqs.
1299			 * The CPU we are looking at might have already gone
1300			 * offline (between the for_each_online_cpu and
1301			 * the spin_lock_irqsave), but in that case all its
1302 * callback lists will be empty, so no harm done.
1303 *
1304 * Advance the callbacks! We share normal RCU's
1305 * donelist, since callbacks are invoked the
1306 * same way in either case.
1307 */
1308 if (rdp->waitschedlist != NULL) {
1309 *rdp->donetail = rdp->waitschedlist;
1310 rdp->donetail = rdp->waitschedtail;
1311
1312 /*
1313 * Next rcu_check_callbacks() will
1314 * do the required raise_softirq().
1315 */
1316 }
1317 if (rdp->nextschedlist != NULL) {
1318 rdp->waitschedlist = rdp->nextschedlist;
1319 rdp->waitschedtail = rdp->nextschedtail;
1320 couldsleep = 0;
1321 couldsleepnext = 0;
1322 } else {
1323 rdp->waitschedlist = NULL;
1324 rdp->waitschedtail = &rdp->waitschedlist;
1325 }
1326 rdp->nextschedlist = NULL;
1327 rdp->nextschedtail = &rdp->nextschedlist;
1328
1329 /* Mark sleep intention. */
1330
1331 rdp->rcu_sched_sleeping = couldsleep;
1332
1333 spin_unlock_irqrestore(&rdp->lock, flags);
1334 }
1335
1336 /* If we saw callbacks on the last scan, go deal with them. */
1337
1338 if (!couldsleep)
1339 continue;
1340
1341 /* Attempt to block... */
1342
1343 spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
1344 if (rcu_ctrlblk.sched_sleep != rcu_sched_sleep_prep) {
1345
1346 /*
1347 * Someone posted a callback after we scanned.
1348 * Go take care of it.
1349 */
1350 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1351 couldsleepnext = 0;
1352 continue;
1353 }
1354
1355 /* Block until the next person posts a callback. */
1356
1357 rcu_ctrlblk.sched_sleep = rcu_sched_sleeping;
1358 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1359 ret = 0; /* unused */
1360 __wait_event_interruptible(rcu_ctrlblk.sched_wq,
1361 rcu_ctrlblk.sched_sleep != rcu_sched_sleeping,
1362 ret);
1363
1364 couldsleepnext = 0;
1365
1366 } while (!kthread_should_stop());
1367
1368 return (0);
1369}
1370
1371/*
1372 * Check to see if any future RCU-related work will need to be done
1373 * by the current CPU, even if none need be done immediately, returning
1374 * 1 if so. Assumes that notifiers would take care of handling any
1375 * outstanding requests from the RCU core.
1376 *
1377 * This function is part of the RCU implementation; it is -not-
1378 * an exported member of the RCU API.
1379 */
1380int rcu_needs_cpu(int cpu)
1381{
1382 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1383
1384 return (rdp->donelist != NULL ||
1385 !!rdp->waitlistcount ||
1386 rdp->nextlist != NULL ||
1387 rdp->nextschedlist != NULL ||
1388 rdp->waitschedlist != NULL);
1389}
1390
1391int rcu_pending(int cpu)
1392{
1393 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1394
1395 /* The CPU has at least one callback queued somewhere. */
1396
1397 if (rdp->donelist != NULL ||
1398 !!rdp->waitlistcount ||
1399 rdp->nextlist != NULL ||
1400 rdp->nextschedlist != NULL ||
1401 rdp->waitschedlist != NULL)
1402 return 1;
1403
1404 /* The RCU core needs an acknowledgement from this CPU. */
1405
1406 if ((per_cpu(rcu_flip_flag, cpu) == rcu_flipped) ||
1407 (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed))
1408 return 1;
1409
1410 /* This CPU has fallen behind the global grace-period number. */
1411
1412 if (rdp->completed != rcu_ctrlblk.completed)
1413 return 1;
1414
1415 /* Nothing needed from this CPU. */
1416
1417 return 0;
1418}
1419
1420static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1421 unsigned long action, void *hcpu)
1422{
1423 long cpu = (long)hcpu;
1424
1425 switch (action) {
1426 case CPU_UP_PREPARE:
1427 case CPU_UP_PREPARE_FROZEN:
1428 rcu_online_cpu(cpu);
1429 break;
1430 case CPU_UP_CANCELED:
1431 case CPU_UP_CANCELED_FROZEN:
1432 case CPU_DEAD:
1433 case CPU_DEAD_FROZEN:
1434 rcu_offline_cpu(cpu);
1435 break;
1436 default:
1437 break;
1438 }
1439 return NOTIFY_OK;
1440}
1441
1442static struct notifier_block __cpuinitdata rcu_nb = {
1443 .notifier_call = rcu_cpu_notify,
1444};
1445
1446void __init __rcu_init(void)
1447{
1448 int cpu;
1449 int i;
1450 struct rcu_data *rdp;
1451
1452 printk(KERN_NOTICE "Preemptible RCU implementation.\n");
1453 for_each_possible_cpu(cpu) {
1454 rdp = RCU_DATA_CPU(cpu);
1455 spin_lock_init(&rdp->lock);
1456 rdp->completed = 0;
1457 rdp->waitlistcount = 0;
1458 rdp->nextlist = NULL;
1459 rdp->nexttail = &rdp->nextlist;
1460 for (i = 0; i < GP_STAGES; i++) {
1461 rdp->waitlist[i] = NULL;
1462 rdp->waittail[i] = &rdp->waitlist[i];
1463 }
1464 rdp->donelist = NULL;
1465 rdp->donetail = &rdp->donelist;
1466 rdp->rcu_flipctr[0] = 0;
1467 rdp->rcu_flipctr[1] = 0;
1468 rdp->nextschedlist = NULL;
1469 rdp->nextschedtail = &rdp->nextschedlist;
1470 rdp->waitschedlist = NULL;
1471 rdp->waitschedtail = &rdp->waitschedlist;
1472 rdp->rcu_sched_sleeping = 0;
1473 }
1474 register_cpu_notifier(&rcu_nb);
1475
1476 /*
1477 * We don't need protection against CPU-Hotplug here
1478 * since
1479 * a) If a CPU comes online while we are iterating over the
1480 * cpu_online_mask below, we would only end up making a
1481 * duplicate call to rcu_online_cpu() which sets the corresponding
1482 * CPU's mask in the rcu_cpu_online_map.
1483 *
1484 * b) A CPU cannot go offline at this point in time since the user
1485 * does not have access to the sysfs interface, nor do we
1486 * suspend the system.
1487 */
1488 for_each_online_cpu(cpu)
1489 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long) cpu);
1490
1491 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1492}
1493
1494/*
1495 * Late-boot-time RCU initialization that must wait until after scheduler
1496 * has been initialized.
1497 */
1498void __init rcu_init_sched(void)
1499{
1500 rcu_sched_grace_period_task = kthread_run(rcu_sched_grace_period,
1501 NULL,
1502 "rcu_sched_grace_period");
1503 WARN_ON(IS_ERR(rcu_sched_grace_period_task));
1504}
1505
1506#ifdef CONFIG_RCU_TRACE
1507long *rcupreempt_flipctr(int cpu)
1508{
1509 return &RCU_DATA_CPU(cpu)->rcu_flipctr[0];
1510}
1511EXPORT_SYMBOL_GPL(rcupreempt_flipctr);
1512
1513int rcupreempt_flip_flag(int cpu)
1514{
1515 return per_cpu(rcu_flip_flag, cpu);
1516}
1517EXPORT_SYMBOL_GPL(rcupreempt_flip_flag);
1518
1519int rcupreempt_mb_flag(int cpu)
1520{
1521 return per_cpu(rcu_mb_flag, cpu);
1522}
1523EXPORT_SYMBOL_GPL(rcupreempt_mb_flag);
1524
1525char *rcupreempt_try_flip_state_name(void)
1526{
1527 return rcu_try_flip_state_names[rcu_ctrlblk.rcu_try_flip_state];
1528}
1529EXPORT_SYMBOL_GPL(rcupreempt_try_flip_state_name);
1530
1531struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu)
1532{
1533 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
1534
1535 return &rdp->trace;
1536}
1537EXPORT_SYMBOL_GPL(rcupreempt_trace_cpu);
1538
1539#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c
deleted file mode 100644
index 7c2665cac172..000000000000
--- a/kernel/rcupreempt_trace.c
+++ /dev/null
@@ -1,334 +0,0 @@
1/*
2 * Read-Copy Update tracing for realtime implementation
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2006
19 *
20 * Papers: http://www.rdrop.com/users/paulmck/RCU
21 *
22 * For detailed explanation of Read-Copy Update mechanism see -
23 * Documentation/RCU/ *.txt
24 *
25 */
26#include <linux/types.h>
27#include <linux/kernel.h>
28#include <linux/init.h>
29#include <linux/spinlock.h>
30#include <linux/smp.h>
31#include <linux/rcupdate.h>
32#include <linux/interrupt.h>
33#include <linux/sched.h>
34#include <asm/atomic.h>
35#include <linux/bitops.h>
36#include <linux/module.h>
37#include <linux/completion.h>
38#include <linux/moduleparam.h>
39#include <linux/percpu.h>
40#include <linux/notifier.h>
41#include <linux/cpu.h>
42#include <linux/mutex.h>
43#include <linux/rcupreempt_trace.h>
44#include <linux/debugfs.h>
45
46static struct mutex rcupreempt_trace_mutex;
47static char *rcupreempt_trace_buf;
48#define RCUPREEMPT_TRACE_BUF_SIZE 4096
49
50void rcupreempt_trace_move2done(struct rcupreempt_trace *trace)
51{
52 trace->done_length += trace->wait_length;
53 trace->done_add += trace->wait_length;
54 trace->wait_length = 0;
55}
56void rcupreempt_trace_move2wait(struct rcupreempt_trace *trace)
57{
58 trace->wait_length += trace->next_length;
59 trace->wait_add += trace->next_length;
60 trace->next_length = 0;
61}
62void rcupreempt_trace_try_flip_1(struct rcupreempt_trace *trace)
63{
64 atomic_inc(&trace->rcu_try_flip_1);
65}
66void rcupreempt_trace_try_flip_e1(struct rcupreempt_trace *trace)
67{
68 atomic_inc(&trace->rcu_try_flip_e1);
69}
70void rcupreempt_trace_try_flip_i1(struct rcupreempt_trace *trace)
71{
72 trace->rcu_try_flip_i1++;
73}
74void rcupreempt_trace_try_flip_ie1(struct rcupreempt_trace *trace)
75{
76 trace->rcu_try_flip_ie1++;
77}
78void rcupreempt_trace_try_flip_g1(struct rcupreempt_trace *trace)
79{
80 trace->rcu_try_flip_g1++;
81}
82void rcupreempt_trace_try_flip_a1(struct rcupreempt_trace *trace)
83{
84 trace->rcu_try_flip_a1++;
85}
86void rcupreempt_trace_try_flip_ae1(struct rcupreempt_trace *trace)
87{
88 trace->rcu_try_flip_ae1++;
89}
90void rcupreempt_trace_try_flip_a2(struct rcupreempt_trace *trace)
91{
92 trace->rcu_try_flip_a2++;
93}
94void rcupreempt_trace_try_flip_z1(struct rcupreempt_trace *trace)
95{
96 trace->rcu_try_flip_z1++;
97}
98void rcupreempt_trace_try_flip_ze1(struct rcupreempt_trace *trace)
99{
100 trace->rcu_try_flip_ze1++;
101}
102void rcupreempt_trace_try_flip_z2(struct rcupreempt_trace *trace)
103{
104 trace->rcu_try_flip_z2++;
105}
106void rcupreempt_trace_try_flip_m1(struct rcupreempt_trace *trace)
107{
108 trace->rcu_try_flip_m1++;
109}
110void rcupreempt_trace_try_flip_me1(struct rcupreempt_trace *trace)
111{
112 trace->rcu_try_flip_me1++;
113}
114void rcupreempt_trace_try_flip_m2(struct rcupreempt_trace *trace)
115{
116 trace->rcu_try_flip_m2++;
117}
118void rcupreempt_trace_check_callbacks(struct rcupreempt_trace *trace)
119{
120 trace->rcu_check_callbacks++;
121}
122void rcupreempt_trace_done_remove(struct rcupreempt_trace *trace)
123{
124 trace->done_remove += trace->done_length;
125 trace->done_length = 0;
126}
127void rcupreempt_trace_invoke(struct rcupreempt_trace *trace)
128{
129 atomic_inc(&trace->done_invoked);
130}
131void rcupreempt_trace_next_add(struct rcupreempt_trace *trace)
132{
133 trace->next_add++;
134 trace->next_length++;
135}
136
137static void rcupreempt_trace_sum(struct rcupreempt_trace *sp)
138{
139 struct rcupreempt_trace *cp;
140 int cpu;
141
142 memset(sp, 0, sizeof(*sp));
143 for_each_possible_cpu(cpu) {
144 cp = rcupreempt_trace_cpu(cpu);
145 sp->next_length += cp->next_length;
146 sp->next_add += cp->next_add;
147 sp->wait_length += cp->wait_length;
148 sp->wait_add += cp->wait_add;
149 sp->done_length += cp->done_length;
150 sp->done_add += cp->done_add;
151 sp->done_remove += cp->done_remove;
152 atomic_add(atomic_read(&cp->done_invoked), &sp->done_invoked);
153 sp->rcu_check_callbacks += cp->rcu_check_callbacks;
154 atomic_add(atomic_read(&cp->rcu_try_flip_1),
155 &sp->rcu_try_flip_1);
156 atomic_add(atomic_read(&cp->rcu_try_flip_e1),
157 &sp->rcu_try_flip_e1);
158 sp->rcu_try_flip_i1 += cp->rcu_try_flip_i1;
159 sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1;
160 sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1;
161 sp->rcu_try_flip_a1 += cp->rcu_try_flip_a1;
162 sp->rcu_try_flip_ae1 += cp->rcu_try_flip_ae1;
163 sp->rcu_try_flip_a2 += cp->rcu_try_flip_a2;
164 sp->rcu_try_flip_z1 += cp->rcu_try_flip_z1;
165 sp->rcu_try_flip_ze1 += cp->rcu_try_flip_ze1;
166 sp->rcu_try_flip_z2 += cp->rcu_try_flip_z2;
167 sp->rcu_try_flip_m1 += cp->rcu_try_flip_m1;
168 sp->rcu_try_flip_me1 += cp->rcu_try_flip_me1;
169 sp->rcu_try_flip_m2 += cp->rcu_try_flip_m2;
170 }
171}
172
173static ssize_t rcustats_read(struct file *filp, char __user *buffer,
174 size_t count, loff_t *ppos)
175{
176 struct rcupreempt_trace trace;
177 ssize_t bcount;
178 int cnt = 0;
179
180 rcupreempt_trace_sum(&trace);
181 mutex_lock(&rcupreempt_trace_mutex);
182 snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt,
183 "ggp=%ld rcc=%ld\n",
184 rcu_batches_completed(),
185 trace.rcu_check_callbacks);
186 snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt,
187 "na=%ld nl=%ld wa=%ld wl=%ld da=%ld dl=%ld dr=%ld di=%d\n"
188 "1=%d e1=%d i1=%ld ie1=%ld g1=%ld a1=%ld ae1=%ld a2=%ld\n"
189 "z1=%ld ze1=%ld z2=%ld m1=%ld me1=%ld m2=%ld\n",
190
191 trace.next_add, trace.next_length,
192 trace.wait_add, trace.wait_length,
193 trace.done_add, trace.done_length,
194 trace.done_remove, atomic_read(&trace.done_invoked),
195 atomic_read(&trace.rcu_try_flip_1),
196 atomic_read(&trace.rcu_try_flip_e1),
197 trace.rcu_try_flip_i1, trace.rcu_try_flip_ie1,
198 trace.rcu_try_flip_g1,
199 trace.rcu_try_flip_a1, trace.rcu_try_flip_ae1,
200 trace.rcu_try_flip_a2,
201 trace.rcu_try_flip_z1, trace.rcu_try_flip_ze1,
202 trace.rcu_try_flip_z2,
203 trace.rcu_try_flip_m1, trace.rcu_try_flip_me1,
204 trace.rcu_try_flip_m2);
205 bcount = simple_read_from_buffer(buffer, count, ppos,
206 rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
207 mutex_unlock(&rcupreempt_trace_mutex);
208 return bcount;
209}
210
211static ssize_t rcugp_read(struct file *filp, char __user *buffer,
212 size_t count, loff_t *ppos)
213{
214 long oldgp = rcu_batches_completed();
215 ssize_t bcount;
216
217 mutex_lock(&rcupreempt_trace_mutex);
218 synchronize_rcu();
219 snprintf(rcupreempt_trace_buf, RCUPREEMPT_TRACE_BUF_SIZE,
220 "oldggp=%ld newggp=%ld\n", oldgp, rcu_batches_completed());
221 bcount = simple_read_from_buffer(buffer, count, ppos,
222 rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
223 mutex_unlock(&rcupreempt_trace_mutex);
224 return bcount;
225}
226
227static ssize_t rcuctrs_read(struct file *filp, char __user *buffer,
228 size_t count, loff_t *ppos)
229{
230 int cnt = 0;
231 int cpu;
232 int f = rcu_batches_completed() & 0x1;
233 ssize_t bcount;
234
235 mutex_lock(&rcupreempt_trace_mutex);
236
237 cnt += snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE,
238 "CPU last cur F M\n");
239 for_each_online_cpu(cpu) {
240 long *flipctr = rcupreempt_flipctr(cpu);
241 cnt += snprintf(&rcupreempt_trace_buf[cnt],
242 RCUPREEMPT_TRACE_BUF_SIZE - cnt,
243 "%3d %4ld %3ld %d %d\n",
244 cpu,
245 flipctr[!f],
246 flipctr[f],
247 rcupreempt_flip_flag(cpu),
248 rcupreempt_mb_flag(cpu));
249 }
250 cnt += snprintf(&rcupreempt_trace_buf[cnt],
251 RCUPREEMPT_TRACE_BUF_SIZE - cnt,
252 "ggp = %ld, state = %s\n",
253 rcu_batches_completed(),
254 rcupreempt_try_flip_state_name());
255 cnt += snprintf(&rcupreempt_trace_buf[cnt],
256 RCUPREEMPT_TRACE_BUF_SIZE - cnt,
257 "\n");
258 bcount = simple_read_from_buffer(buffer, count, ppos,
259 rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
260 mutex_unlock(&rcupreempt_trace_mutex);
261 return bcount;
262}
263
264static struct file_operations rcustats_fops = {
265 .owner = THIS_MODULE,
266 .read = rcustats_read,
267};
268
269static struct file_operations rcugp_fops = {
270 .owner = THIS_MODULE,
271 .read = rcugp_read,
272};
273
274static struct file_operations rcuctrs_fops = {
275 .owner = THIS_MODULE,
276 .read = rcuctrs_read,
277};
278
279static struct dentry *rcudir, *statdir, *ctrsdir, *gpdir;
280static int rcupreempt_debugfs_init(void)
281{
282 rcudir = debugfs_create_dir("rcu", NULL);
283 if (!rcudir)
284 goto out;
285 statdir = debugfs_create_file("rcustats", 0444, rcudir,
286 NULL, &rcustats_fops);
287 if (!statdir)
288 goto free_out;
289
290 gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
291 if (!gpdir)
292 goto free_out;
293
294 ctrsdir = debugfs_create_file("rcuctrs", 0444, rcudir,
295 NULL, &rcuctrs_fops);
296 if (!ctrsdir)
297 goto free_out;
298 return 0;
299free_out:
300 if (statdir)
301 debugfs_remove(statdir);
302 if (gpdir)
303 debugfs_remove(gpdir);
304 debugfs_remove(rcudir);
305out:
306 return 1;
307}
308
309static int __init rcupreempt_trace_init(void)
310{
311 int ret;
312
313 mutex_init(&rcupreempt_trace_mutex);
314 rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL);
315 if (!rcupreempt_trace_buf)
316 return 1;
317 ret = rcupreempt_debugfs_init();
318 if (ret)
319 kfree(rcupreempt_trace_buf);
320 return ret;
321}
322
323static void __exit rcupreempt_trace_cleanup(void)
324{
325 debugfs_remove(statdir);
326 debugfs_remove(gpdir);
327 debugfs_remove(ctrsdir);
328 debugfs_remove(rcudir);
329 kfree(rcupreempt_trace_buf);
330}
331
332
333module_init(rcupreempt_trace_init);
334module_exit(rcupreempt_trace_cleanup);
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 9b4a975a4b4a..233768f21f97 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -18,7 +18,7 @@
18 * Copyright (C) IBM Corporation, 2005, 2006 18 * Copyright (C) IBM Corporation, 2005, 2006
19 * 19 *
20 * Authors: Paul E. McKenney <paulmck@us.ibm.com> 20 * Authors: Paul E. McKenney <paulmck@us.ibm.com>
21 * Josh Triplett <josh@freedesktop.org> 21 * Josh Triplett <josh@freedesktop.org>
22 * 22 *
23 * See also: Documentation/RCU/torture.txt 23 * See also: Documentation/RCU/torture.txt
24 */ 24 */
@@ -50,7 +50,7 @@
50 50
51MODULE_LICENSE("GPL"); 51MODULE_LICENSE("GPL");
52MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " 52MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
53 "Josh Triplett <josh@freedesktop.org>"); 53 "Josh Triplett <josh@freedesktop.org>");
54 54
55static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ 55static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */
56static int nfakewriters = 4; /* # fake writer threads */ 56static int nfakewriters = 4; /* # fake writer threads */
@@ -110,8 +110,8 @@ struct rcu_torture {
110}; 110};
111 111
112static LIST_HEAD(rcu_torture_freelist); 112static LIST_HEAD(rcu_torture_freelist);
113static struct rcu_torture *rcu_torture_current = NULL; 113static struct rcu_torture *rcu_torture_current;
114static long rcu_torture_current_version = 0; 114static long rcu_torture_current_version;
115static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; 115static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
116static DEFINE_SPINLOCK(rcu_torture_lock); 116static DEFINE_SPINLOCK(rcu_torture_lock);
117static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = 117static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) =
@@ -124,11 +124,11 @@ static atomic_t n_rcu_torture_alloc_fail;
124static atomic_t n_rcu_torture_free; 124static atomic_t n_rcu_torture_free;
125static atomic_t n_rcu_torture_mberror; 125static atomic_t n_rcu_torture_mberror;
126static atomic_t n_rcu_torture_error; 126static atomic_t n_rcu_torture_error;
127static long n_rcu_torture_timers = 0; 127static long n_rcu_torture_timers;
128static struct list_head rcu_torture_removed; 128static struct list_head rcu_torture_removed;
129static cpumask_var_t shuffle_tmp_mask; 129static cpumask_var_t shuffle_tmp_mask;
130 130
131static int stutter_pause_test = 0; 131static int stutter_pause_test;
132 132
133#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) 133#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE)
134#define RCUTORTURE_RUNNABLE_INIT 1 134#define RCUTORTURE_RUNNABLE_INIT 1
@@ -257,17 +257,18 @@ struct rcu_torture_ops {
257 void (*init)(void); 257 void (*init)(void);
258 void (*cleanup)(void); 258 void (*cleanup)(void);
259 int (*readlock)(void); 259 int (*readlock)(void);
260 void (*readdelay)(struct rcu_random_state *rrsp); 260 void (*read_delay)(struct rcu_random_state *rrsp);
261 void (*readunlock)(int idx); 261 void (*readunlock)(int idx);
262 int (*completed)(void); 262 int (*completed)(void);
263 void (*deferredfree)(struct rcu_torture *p); 263 void (*deferred_free)(struct rcu_torture *p);
264 void (*sync)(void); 264 void (*sync)(void);
265 void (*cb_barrier)(void); 265 void (*cb_barrier)(void);
266 int (*stats)(char *page); 266 int (*stats)(char *page);
267 int irqcapable; 267 int irq_capable;
268 char *name; 268 char *name;
269}; 269};
270static struct rcu_torture_ops *cur_ops = NULL; 270
271static struct rcu_torture_ops *cur_ops;
271 272
272/* 273/*
273 * Definitions for rcu torture testing. 274 * Definitions for rcu torture testing.
@@ -281,14 +282,17 @@ static int rcu_torture_read_lock(void) __acquires(RCU)
281 282
282static void rcu_read_delay(struct rcu_random_state *rrsp) 283static void rcu_read_delay(struct rcu_random_state *rrsp)
283{ 284{
284 long delay; 285 const unsigned long shortdelay_us = 200;
285 const long longdelay = 200; 286 const unsigned long longdelay_ms = 50;
286 287
287 /* We want there to be long-running readers, but not all the time. */ 288 /* We want a short delay sometimes to make a reader delay the grace
289 * period, and we want a long delay occasionally to trigger
290 * force_quiescent_state. */
288 291
289 delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay); 292 if (!(rcu_random(rrsp) % (nrealreaders * 2000 * longdelay_ms)))
290 if (!delay) 293 mdelay(longdelay_ms);
291 udelay(longdelay); 294 if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us)))
295 udelay(shortdelay_us);
292} 296}
293 297
294static void rcu_torture_read_unlock(int idx) __releases(RCU) 298static void rcu_torture_read_unlock(int idx) __releases(RCU)
@@ -320,7 +324,7 @@ rcu_torture_cb(struct rcu_head *p)
320 rp->rtort_mbtest = 0; 324 rp->rtort_mbtest = 0;
321 rcu_torture_free(rp); 325 rcu_torture_free(rp);
322 } else 326 } else
323 cur_ops->deferredfree(rp); 327 cur_ops->deferred_free(rp);
324} 328}
325 329
326static void rcu_torture_deferred_free(struct rcu_torture *p) 330static void rcu_torture_deferred_free(struct rcu_torture *p)
@@ -329,18 +333,18 @@ static void rcu_torture_deferred_free(struct rcu_torture *p)
329} 333}
330 334
331static struct rcu_torture_ops rcu_ops = { 335static struct rcu_torture_ops rcu_ops = {
332 .init = NULL, 336 .init = NULL,
333 .cleanup = NULL, 337 .cleanup = NULL,
334 .readlock = rcu_torture_read_lock, 338 .readlock = rcu_torture_read_lock,
335 .readdelay = rcu_read_delay, 339 .read_delay = rcu_read_delay,
336 .readunlock = rcu_torture_read_unlock, 340 .readunlock = rcu_torture_read_unlock,
337 .completed = rcu_torture_completed, 341 .completed = rcu_torture_completed,
338 .deferredfree = rcu_torture_deferred_free, 342 .deferred_free = rcu_torture_deferred_free,
339 .sync = synchronize_rcu, 343 .sync = synchronize_rcu,
340 .cb_barrier = rcu_barrier, 344 .cb_barrier = rcu_barrier,
341 .stats = NULL, 345 .stats = NULL,
342 .irqcapable = 1, 346 .irq_capable = 1,
343 .name = "rcu" 347 .name = "rcu"
344}; 348};
345 349
346static void rcu_sync_torture_deferred_free(struct rcu_torture *p) 350static void rcu_sync_torture_deferred_free(struct rcu_torture *p)
@@ -370,18 +374,18 @@ static void rcu_sync_torture_init(void)
370} 374}
371 375
372static struct rcu_torture_ops rcu_sync_ops = { 376static struct rcu_torture_ops rcu_sync_ops = {
373 .init = rcu_sync_torture_init, 377 .init = rcu_sync_torture_init,
374 .cleanup = NULL, 378 .cleanup = NULL,
375 .readlock = rcu_torture_read_lock, 379 .readlock = rcu_torture_read_lock,
376 .readdelay = rcu_read_delay, 380 .read_delay = rcu_read_delay,
377 .readunlock = rcu_torture_read_unlock, 381 .readunlock = rcu_torture_read_unlock,
378 .completed = rcu_torture_completed, 382 .completed = rcu_torture_completed,
379 .deferredfree = rcu_sync_torture_deferred_free, 383 .deferred_free = rcu_sync_torture_deferred_free,
380 .sync = synchronize_rcu, 384 .sync = synchronize_rcu,
381 .cb_barrier = NULL, 385 .cb_barrier = NULL,
382 .stats = NULL, 386 .stats = NULL,
383 .irqcapable = 1, 387 .irq_capable = 1,
384 .name = "rcu_sync" 388 .name = "rcu_sync"
385}; 389};
386 390
387/* 391/*
@@ -432,33 +436,33 @@ static void rcu_bh_torture_synchronize(void)
432} 436}
433 437
434static struct rcu_torture_ops rcu_bh_ops = { 438static struct rcu_torture_ops rcu_bh_ops = {
435 .init = NULL, 439 .init = NULL,
436 .cleanup = NULL, 440 .cleanup = NULL,
437 .readlock = rcu_bh_torture_read_lock, 441 .readlock = rcu_bh_torture_read_lock,
438 .readdelay = rcu_read_delay, /* just reuse rcu's version. */ 442 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
439 .readunlock = rcu_bh_torture_read_unlock, 443 .readunlock = rcu_bh_torture_read_unlock,
440 .completed = rcu_bh_torture_completed, 444 .completed = rcu_bh_torture_completed,
441 .deferredfree = rcu_bh_torture_deferred_free, 445 .deferred_free = rcu_bh_torture_deferred_free,
442 .sync = rcu_bh_torture_synchronize, 446 .sync = rcu_bh_torture_synchronize,
443 .cb_barrier = rcu_barrier_bh, 447 .cb_barrier = rcu_barrier_bh,
444 .stats = NULL, 448 .stats = NULL,
445 .irqcapable = 1, 449 .irq_capable = 1,
446 .name = "rcu_bh" 450 .name = "rcu_bh"
447}; 451};
448 452
449static struct rcu_torture_ops rcu_bh_sync_ops = { 453static struct rcu_torture_ops rcu_bh_sync_ops = {
450 .init = rcu_sync_torture_init, 454 .init = rcu_sync_torture_init,
451 .cleanup = NULL, 455 .cleanup = NULL,
452 .readlock = rcu_bh_torture_read_lock, 456 .readlock = rcu_bh_torture_read_lock,
453 .readdelay = rcu_read_delay, /* just reuse rcu's version. */ 457 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
454 .readunlock = rcu_bh_torture_read_unlock, 458 .readunlock = rcu_bh_torture_read_unlock,
455 .completed = rcu_bh_torture_completed, 459 .completed = rcu_bh_torture_completed,
456 .deferredfree = rcu_sync_torture_deferred_free, 460 .deferred_free = rcu_sync_torture_deferred_free,
457 .sync = rcu_bh_torture_synchronize, 461 .sync = rcu_bh_torture_synchronize,
458 .cb_barrier = NULL, 462 .cb_barrier = NULL,
459 .stats = NULL, 463 .stats = NULL,
460 .irqcapable = 1, 464 .irq_capable = 1,
461 .name = "rcu_bh_sync" 465 .name = "rcu_bh_sync"
462}; 466};
463 467
464/* 468/*
@@ -530,17 +534,17 @@ static int srcu_torture_stats(char *page)
530} 534}
531 535
532static struct rcu_torture_ops srcu_ops = { 536static struct rcu_torture_ops srcu_ops = {
533 .init = srcu_torture_init, 537 .init = srcu_torture_init,
534 .cleanup = srcu_torture_cleanup, 538 .cleanup = srcu_torture_cleanup,
535 .readlock = srcu_torture_read_lock, 539 .readlock = srcu_torture_read_lock,
536 .readdelay = srcu_read_delay, 540 .read_delay = srcu_read_delay,
537 .readunlock = srcu_torture_read_unlock, 541 .readunlock = srcu_torture_read_unlock,
538 .completed = srcu_torture_completed, 542 .completed = srcu_torture_completed,
539 .deferredfree = rcu_sync_torture_deferred_free, 543 .deferred_free = rcu_sync_torture_deferred_free,
540 .sync = srcu_torture_synchronize, 544 .sync = srcu_torture_synchronize,
541 .cb_barrier = NULL, 545 .cb_barrier = NULL,
542 .stats = srcu_torture_stats, 546 .stats = srcu_torture_stats,
543 .name = "srcu" 547 .name = "srcu"
544}; 548};
545 549
546/* 550/*
@@ -574,32 +578,49 @@ static void sched_torture_synchronize(void)
574} 578}
575 579
576static struct rcu_torture_ops sched_ops = { 580static struct rcu_torture_ops sched_ops = {
577 .init = rcu_sync_torture_init, 581 .init = rcu_sync_torture_init,
578 .cleanup = NULL, 582 .cleanup = NULL,
579 .readlock = sched_torture_read_lock, 583 .readlock = sched_torture_read_lock,
580 .readdelay = rcu_read_delay, /* just reuse rcu's version. */ 584 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
581 .readunlock = sched_torture_read_unlock, 585 .readunlock = sched_torture_read_unlock,
582 .completed = sched_torture_completed, 586 .completed = sched_torture_completed,
583 .deferredfree = rcu_sched_torture_deferred_free, 587 .deferred_free = rcu_sched_torture_deferred_free,
584 .sync = sched_torture_synchronize, 588 .sync = sched_torture_synchronize,
585 .cb_barrier = rcu_barrier_sched, 589 .cb_barrier = rcu_barrier_sched,
586 .stats = NULL, 590 .stats = NULL,
587 .irqcapable = 1, 591 .irq_capable = 1,
588 .name = "sched" 592 .name = "sched"
589}; 593};
590 594
591static struct rcu_torture_ops sched_ops_sync = { 595static struct rcu_torture_ops sched_ops_sync = {
592 .init = rcu_sync_torture_init, 596 .init = rcu_sync_torture_init,
593 .cleanup = NULL, 597 .cleanup = NULL,
594 .readlock = sched_torture_read_lock, 598 .readlock = sched_torture_read_lock,
595 .readdelay = rcu_read_delay, /* just reuse rcu's version. */ 599 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
596 .readunlock = sched_torture_read_unlock, 600 .readunlock = sched_torture_read_unlock,
597 .completed = sched_torture_completed, 601 .completed = sched_torture_completed,
598 .deferredfree = rcu_sync_torture_deferred_free, 602 .deferred_free = rcu_sync_torture_deferred_free,
599 .sync = sched_torture_synchronize, 603 .sync = sched_torture_synchronize,
600 .cb_barrier = NULL, 604 .cb_barrier = NULL,
601 .stats = NULL, 605 .stats = NULL,
602 .name = "sched_sync" 606 .name = "sched_sync"
607};
608
609extern int rcu_expedited_torture_stats(char *page);
610
611static struct rcu_torture_ops sched_expedited_ops = {
612 .init = rcu_sync_torture_init,
613 .cleanup = NULL,
614 .readlock = sched_torture_read_lock,
615 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
616 .readunlock = sched_torture_read_unlock,
617 .completed = sched_torture_completed,
618 .deferred_free = rcu_sync_torture_deferred_free,
619 .sync = synchronize_sched_expedited,
620 .cb_barrier = NULL,
621 .stats = rcu_expedited_torture_stats,
622 .irq_capable = 1,
623 .name = "sched_expedited"
603}; 624};
604 625
605/* 626/*
@@ -621,7 +642,8 @@ rcu_torture_writer(void *arg)
621 642
622 do { 643 do {
623 schedule_timeout_uninterruptible(1); 644 schedule_timeout_uninterruptible(1);
624 if ((rp = rcu_torture_alloc()) == NULL) 645 rp = rcu_torture_alloc();
646 if (rp == NULL)
625 continue; 647 continue;
626 rp->rtort_pipe_count = 0; 648 rp->rtort_pipe_count = 0;
627 udelay(rcu_random(&rand) & 0x3ff); 649 udelay(rcu_random(&rand) & 0x3ff);
@@ -635,7 +657,7 @@ rcu_torture_writer(void *arg)
635 i = RCU_TORTURE_PIPE_LEN; 657 i = RCU_TORTURE_PIPE_LEN;
636 atomic_inc(&rcu_torture_wcount[i]); 658 atomic_inc(&rcu_torture_wcount[i]);
637 old_rp->rtort_pipe_count++; 659 old_rp->rtort_pipe_count++;
638 cur_ops->deferredfree(old_rp); 660 cur_ops->deferred_free(old_rp);
639 } 661 }
640 rcu_torture_current_version++; 662 rcu_torture_current_version++;
641 oldbatch = cur_ops->completed(); 663 oldbatch = cur_ops->completed();
@@ -700,7 +722,7 @@ static void rcu_torture_timer(unsigned long unused)
700 if (p->rtort_mbtest == 0) 722 if (p->rtort_mbtest == 0)
701 atomic_inc(&n_rcu_torture_mberror); 723 atomic_inc(&n_rcu_torture_mberror);
702 spin_lock(&rand_lock); 724 spin_lock(&rand_lock);
703 cur_ops->readdelay(&rand); 725 cur_ops->read_delay(&rand);
704 n_rcu_torture_timers++; 726 n_rcu_torture_timers++;
705 spin_unlock(&rand_lock); 727 spin_unlock(&rand_lock);
706 preempt_disable(); 728 preempt_disable();
@@ -738,11 +760,11 @@ rcu_torture_reader(void *arg)
738 760
739 VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); 761 VERBOSE_PRINTK_STRING("rcu_torture_reader task started");
740 set_user_nice(current, 19); 762 set_user_nice(current, 19);
741 if (irqreader && cur_ops->irqcapable) 763 if (irqreader && cur_ops->irq_capable)
742 setup_timer_on_stack(&t, rcu_torture_timer, 0); 764 setup_timer_on_stack(&t, rcu_torture_timer, 0);
743 765
744 do { 766 do {
745 if (irqreader && cur_ops->irqcapable) { 767 if (irqreader && cur_ops->irq_capable) {
746 if (!timer_pending(&t)) 768 if (!timer_pending(&t))
747 mod_timer(&t, 1); 769 mod_timer(&t, 1);
748 } 770 }
@@ -757,7 +779,7 @@ rcu_torture_reader(void *arg)
757 } 779 }
758 if (p->rtort_mbtest == 0) 780 if (p->rtort_mbtest == 0)
759 atomic_inc(&n_rcu_torture_mberror); 781 atomic_inc(&n_rcu_torture_mberror);
760 cur_ops->readdelay(&rand); 782 cur_ops->read_delay(&rand);
761 preempt_disable(); 783 preempt_disable();
762 pipe_count = p->rtort_pipe_count; 784 pipe_count = p->rtort_pipe_count;
763 if (pipe_count > RCU_TORTURE_PIPE_LEN) { 785 if (pipe_count > RCU_TORTURE_PIPE_LEN) {
@@ -778,7 +800,7 @@ rcu_torture_reader(void *arg)
778 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); 800 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
779 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); 801 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
780 rcutorture_shutdown_absorb("rcu_torture_reader"); 802 rcutorture_shutdown_absorb("rcu_torture_reader");
781 if (irqreader && cur_ops->irqcapable) 803 if (irqreader && cur_ops->irq_capable)
782 del_timer_sync(&t); 804 del_timer_sync(&t);
783 while (!kthread_should_stop()) 805 while (!kthread_should_stop())
784 schedule_timeout_uninterruptible(1); 806 schedule_timeout_uninterruptible(1);
@@ -1078,6 +1100,7 @@ rcu_torture_init(void)
1078 int firsterr = 0; 1100 int firsterr = 0;
1079 static struct rcu_torture_ops *torture_ops[] = 1101 static struct rcu_torture_ops *torture_ops[] =
1080 { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, 1102 { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops,
1103 &sched_expedited_ops,
1081 &srcu_ops, &sched_ops, &sched_ops_sync, }; 1104 &srcu_ops, &sched_ops, &sched_ops_sync, };
1082 1105
1083 mutex_lock(&fullstop_mutex); 1106 mutex_lock(&fullstop_mutex);
@@ -1092,7 +1115,7 @@ rcu_torture_init(void)
1092 printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n", 1115 printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n",
1093 torture_type); 1116 torture_type);
1094 mutex_unlock(&fullstop_mutex); 1117 mutex_unlock(&fullstop_mutex);
1095 return (-EINVAL); 1118 return -EINVAL;
1096 } 1119 }
1097 if (cur_ops->init) 1120 if (cur_ops->init)
1098 cur_ops->init(); /* no "goto unwind" prior to this point!!! */ 1121 cur_ops->init(); /* no "goto unwind" prior to this point!!! */
@@ -1143,7 +1166,7 @@ rcu_torture_init(void)
1143 goto unwind; 1166 goto unwind;
1144 } 1167 }
1145 fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), 1168 fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]),
1146 GFP_KERNEL); 1169 GFP_KERNEL);
1147 if (fakewriter_tasks == NULL) { 1170 if (fakewriter_tasks == NULL) {
1148 VERBOSE_PRINTK_ERRSTRING("out of memory"); 1171 VERBOSE_PRINTK_ERRSTRING("out of memory");
1149 firsterr = -ENOMEM; 1172 firsterr = -ENOMEM;
@@ -1152,7 +1175,7 @@ rcu_torture_init(void)
1152 for (i = 0; i < nfakewriters; i++) { 1175 for (i = 0; i < nfakewriters; i++) {
1153 VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task"); 1176 VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task");
1154 fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL, 1177 fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL,
1155 "rcu_torture_fakewriter"); 1178 "rcu_torture_fakewriter");
1156 if (IS_ERR(fakewriter_tasks[i])) { 1179 if (IS_ERR(fakewriter_tasks[i])) {
1157 firsterr = PTR_ERR(fakewriter_tasks[i]); 1180 firsterr = PTR_ERR(fakewriter_tasks[i]);
1158 VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter"); 1181 VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter");
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 0dccfbba6d26..52b06f6e158c 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -25,7 +25,7 @@
25 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. 25 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
26 * 26 *
27 * For detailed explanation of Read-Copy Update mechanism see - 27 * For detailed explanation of Read-Copy Update mechanism see -
28 * Documentation/RCU 28 * Documentation/RCU
29 */ 29 */
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/kernel.h> 31#include <linux/kernel.h>
@@ -35,6 +35,7 @@
35#include <linux/rcupdate.h> 35#include <linux/rcupdate.h>
36#include <linux/interrupt.h> 36#include <linux/interrupt.h>
37#include <linux/sched.h> 37#include <linux/sched.h>
38#include <linux/nmi.h>
38#include <asm/atomic.h> 39#include <asm/atomic.h>
39#include <linux/bitops.h> 40#include <linux/bitops.h>
40#include <linux/module.h> 41#include <linux/module.h>
@@ -46,6 +47,8 @@
46#include <linux/mutex.h> 47#include <linux/mutex.h>
47#include <linux/time.h> 48#include <linux/time.h>
48 49
50#include "rcutree.h"
51
49#ifdef CONFIG_DEBUG_LOCK_ALLOC 52#ifdef CONFIG_DEBUG_LOCK_ALLOC
50static struct lock_class_key rcu_lock_key; 53static struct lock_class_key rcu_lock_key;
51struct lockdep_map rcu_lock_map = 54struct lockdep_map rcu_lock_map =
@@ -72,30 +75,55 @@ EXPORT_SYMBOL_GPL(rcu_lock_map);
72 .n_force_qs_ngp = 0, \ 75 .n_force_qs_ngp = 0, \
73} 76}
74 77
75struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state); 78struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state);
76DEFINE_PER_CPU(struct rcu_data, rcu_data); 79DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
77 80
78struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 81struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
79DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 82DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
80 83
84extern long rcu_batches_completed_sched(void);
85static struct rcu_node *rcu_get_root(struct rcu_state *rsp);
86static void cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp,
87 struct rcu_node *rnp, unsigned long flags);
88static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags);
89#ifdef CONFIG_HOTPLUG_CPU
90static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp);
91#endif /* #ifdef CONFIG_HOTPLUG_CPU */
92static void __rcu_process_callbacks(struct rcu_state *rsp,
93 struct rcu_data *rdp);
94static void __call_rcu(struct rcu_head *head,
95 void (*func)(struct rcu_head *rcu),
96 struct rcu_state *rsp);
97static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp);
98static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_state *rsp,
99 int preemptable);
100
101#include "rcutree_plugin.h"
102
81/* 103/*
82 * Increment the quiescent state counter. 104 * Note a quiescent state. Because we do not need to know
83 * The counter is a bit degenerated: We do not need to know
84 * how many quiescent states passed, just if there was at least 105 * how many quiescent states passed, just if there was at least
85 * one since the start of the grace period. Thus just a flag. 106 * one since the start of the grace period, this just sets a flag.
86 */ 107 */
87void rcu_qsctr_inc(int cpu) 108void rcu_sched_qs(int cpu)
88{ 109{
89 struct rcu_data *rdp = &per_cpu(rcu_data, cpu); 110 struct rcu_data *rdp;
90 rdp->passed_quiesc = 1; 111
112 rdp = &per_cpu(rcu_sched_data, cpu);
91 rdp->passed_quiesc_completed = rdp->completed; 113 rdp->passed_quiesc_completed = rdp->completed;
114 barrier();
115 rdp->passed_quiesc = 1;
116 rcu_preempt_note_context_switch(cpu);
92} 117}
93 118
94void rcu_bh_qsctr_inc(int cpu) 119void rcu_bh_qs(int cpu)
95{ 120{
96 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); 121 struct rcu_data *rdp;
97 rdp->passed_quiesc = 1; 122
123 rdp = &per_cpu(rcu_bh_data, cpu);
98 rdp->passed_quiesc_completed = rdp->completed; 124 rdp->passed_quiesc_completed = rdp->completed;
125 barrier();
126 rdp->passed_quiesc = 1;
99} 127}
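
rcu_sched_qs() and rcu_bh_qs() now snapshot ->completed first and only then set ->passed_quiesc, with barrier() keeping the compiler from reordering the two stores, so the flag can never be observed before the snapshot it refers to. A hedged user-space sketch of that ordering follows; the type and helper names are made up, and barrier() is spelled as a GCC-style compiler barrier.

#include <stdio.h>

#define barrier() __asm__ __volatile__("" ::: "memory")	/* compiler barrier only */

struct qs_record {
	long passed_quiesc_completed;	/* which grace period the QS belongs to */
	int  passed_quiesc;		/* flag polled by the grace-period machinery */
};

/*
 * Record a quiescent state: snapshot the current "completed" count,
 * then publish the flag.  The barrier keeps the compiler from moving
 * the flag store ahead of the snapshot store.
 */
static void note_qs(struct qs_record *r, long completed)
{
	r->passed_quiesc_completed = completed;
	barrier();
	r->passed_quiesc = 1;
}

int main(void)
{
	struct qs_record r = { 0, 0 };

	note_qs(&r, 42);
	printf("flag=%d for completed=%ld\n", r.passed_quiesc,
	       r.passed_quiesc_completed);
	return 0;
}
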
100 128
101#ifdef CONFIG_NO_HZ 129#ifdef CONFIG_NO_HZ
@@ -110,15 +138,16 @@ static int qhimark = 10000; /* If this many pending, ignore blimit. */
110static int qlowmark = 100; /* Once only this many pending, use blimit. */ 138static int qlowmark = 100; /* Once only this many pending, use blimit. */
111 139
112static void force_quiescent_state(struct rcu_state *rsp, int relaxed); 140static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
141static int rcu_pending(int cpu);
113 142
114/* 143/*
115 * Return the number of RCU batches processed thus far for debug & stats. 144 * Return the number of RCU-sched batches processed thus far for debug & stats.
116 */ 145 */
117long rcu_batches_completed(void) 146long rcu_batches_completed_sched(void)
118{ 147{
119 return rcu_state.completed; 148 return rcu_sched_state.completed;
120} 149}
121EXPORT_SYMBOL_GPL(rcu_batches_completed); 150EXPORT_SYMBOL_GPL(rcu_batches_completed_sched);
122 151
123/* 152/*
124 * Return the number of RCU BH batches processed thus far for debug & stats. 153 * Return the number of RCU BH batches processed thus far for debug & stats.
@@ -181,6 +210,10 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
181 return 1; 210 return 1;
182 } 211 }
183 212
213 /* If preemptable RCU, no point in sending reschedule IPI. */
214 if (rdp->preemptable)
215 return 0;
216
184 /* The CPU is online, so send it a reschedule IPI. */ 217 /* The CPU is online, so send it a reschedule IPI. */
185 if (rdp->cpu != smp_processor_id()) 218 if (rdp->cpu != smp_processor_id())
186 smp_send_reschedule(rdp->cpu); 219 smp_send_reschedule(rdp->cpu);
@@ -193,7 +226,6 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
193#endif /* #ifdef CONFIG_SMP */ 226#endif /* #ifdef CONFIG_SMP */
194 227
195#ifdef CONFIG_NO_HZ 228#ifdef CONFIG_NO_HZ
196static DEFINE_RATELIMIT_STATE(rcu_rs, 10 * HZ, 5);
197 229
198/** 230/**
199 * rcu_enter_nohz - inform RCU that current CPU is entering nohz 231 * rcu_enter_nohz - inform RCU that current CPU is entering nohz
@@ -213,7 +245,7 @@ void rcu_enter_nohz(void)
213 rdtp = &__get_cpu_var(rcu_dynticks); 245 rdtp = &__get_cpu_var(rcu_dynticks);
214 rdtp->dynticks++; 246 rdtp->dynticks++;
215 rdtp->dynticks_nesting--; 247 rdtp->dynticks_nesting--;
216 WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs); 248 WARN_ON_ONCE(rdtp->dynticks & 0x1);
217 local_irq_restore(flags); 249 local_irq_restore(flags);
218} 250}
219 251
@@ -232,7 +264,7 @@ void rcu_exit_nohz(void)
232 rdtp = &__get_cpu_var(rcu_dynticks); 264 rdtp = &__get_cpu_var(rcu_dynticks);
233 rdtp->dynticks++; 265 rdtp->dynticks++;
234 rdtp->dynticks_nesting++; 266 rdtp->dynticks_nesting++;
235 WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs); 267 WARN_ON_ONCE(!(rdtp->dynticks & 0x1));
236 local_irq_restore(flags); 268 local_irq_restore(flags);
237 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ 269 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
238} 270}
@@ -251,7 +283,7 @@ void rcu_nmi_enter(void)
251 if (rdtp->dynticks & 0x1) 283 if (rdtp->dynticks & 0x1)
252 return; 284 return;
253 rdtp->dynticks_nmi++; 285 rdtp->dynticks_nmi++;
254 WARN_ON_RATELIMIT(!(rdtp->dynticks_nmi & 0x1), &rcu_rs); 286 WARN_ON_ONCE(!(rdtp->dynticks_nmi & 0x1));
255 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ 287 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
256} 288}
257 289
@@ -270,7 +302,7 @@ void rcu_nmi_exit(void)
270 return; 302 return;
271 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ 303 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
272 rdtp->dynticks_nmi++; 304 rdtp->dynticks_nmi++;
273 WARN_ON_RATELIMIT(rdtp->dynticks_nmi & 0x1, &rcu_rs); 305 WARN_ON_ONCE(rdtp->dynticks_nmi & 0x1);
274} 306}
275 307
276/** 308/**
@@ -286,7 +318,7 @@ void rcu_irq_enter(void)
286 if (rdtp->dynticks_nesting++) 318 if (rdtp->dynticks_nesting++)
287 return; 319 return;
288 rdtp->dynticks++; 320 rdtp->dynticks++;
289 WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs); 321 WARN_ON_ONCE(!(rdtp->dynticks & 0x1));
290 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ 322 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
291} 323}
292 324
@@ -305,10 +337,10 @@ void rcu_irq_exit(void)
305 return; 337 return;
306 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ 338 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
307 rdtp->dynticks++; 339 rdtp->dynticks++;
308 WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs); 340 WARN_ON_ONCE(rdtp->dynticks & 0x1);
309 341
310 /* If the interrupt queued a callback, get out of dyntick mode. */ 342 /* If the interrupt queued a callback, get out of dyntick mode. */
311 if (__get_cpu_var(rcu_data).nxtlist || 343 if (__get_cpu_var(rcu_sched_data).nxtlist ||
312 __get_cpu_var(rcu_bh_data).nxtlist) 344 __get_cpu_var(rcu_bh_data).nxtlist)
313 set_need_resched(); 345 set_need_resched();
314} 346}
@@ -461,6 +493,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
461 493
462 printk(KERN_ERR "INFO: RCU detected CPU stalls:"); 494 printk(KERN_ERR "INFO: RCU detected CPU stalls:");
463 for (; rnp_cur < rnp_end; rnp_cur++) { 495 for (; rnp_cur < rnp_end; rnp_cur++) {
496 rcu_print_task_stall(rnp);
464 if (rnp_cur->qsmask == 0) 497 if (rnp_cur->qsmask == 0)
465 continue; 498 continue;
466 for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++) 499 for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++)
@@ -469,6 +502,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
469 } 502 }
470 printk(" (detected by %d, t=%ld jiffies)\n", 503 printk(" (detected by %d, t=%ld jiffies)\n",
471 smp_processor_id(), (long)(jiffies - rsp->gp_start)); 504 smp_processor_id(), (long)(jiffies - rsp->gp_start));
505 trigger_all_cpu_backtrace();
506
472 force_quiescent_state(rsp, 0); /* Kick them all. */ 507 force_quiescent_state(rsp, 0); /* Kick them all. */
473} 508}
474 509
@@ -479,12 +514,14 @@ static void print_cpu_stall(struct rcu_state *rsp)
479 514
480 printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n", 515 printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n",
481 smp_processor_id(), jiffies - rsp->gp_start); 516 smp_processor_id(), jiffies - rsp->gp_start);
482 dump_stack(); 517 trigger_all_cpu_backtrace();
518
483 spin_lock_irqsave(&rnp->lock, flags); 519 spin_lock_irqsave(&rnp->lock, flags);
484 if ((long)(jiffies - rsp->jiffies_stall) >= 0) 520 if ((long)(jiffies - rsp->jiffies_stall) >= 0)
485 rsp->jiffies_stall = 521 rsp->jiffies_stall =
486 jiffies + RCU_SECONDS_TILL_STALL_RECHECK; 522 jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
487 spin_unlock_irqrestore(&rnp->lock, flags); 523 spin_unlock_irqrestore(&rnp->lock, flags);
524
488 set_need_resched(); /* kick ourselves to get things going. */ 525 set_need_resched(); /* kick ourselves to get things going. */
489} 526}
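
print_cpu_stall() rearms the stall check with (long)(jiffies - rsp->jiffies_stall) >= 0, the usual wraparound-safe way to compare a free-running counter against a deadline. A small stand-alone sketch of why the signed subtraction works across overflow (assuming the usual two's-complement targets; the counter values below are arbitrary):

#include <stdio.h>
#include <limits.h>

/* "deadline reached?" test that survives counter wraparound. */
static int deadline_passed(unsigned long now, unsigned long deadline)
{
	return (long)(now - deadline) >= 0;
}

int main(void)
{
	unsigned long deadline = ULONG_MAX - 5;		/* just before wraparound */
	unsigned long before   = ULONG_MAX - 10;	/* 5 ticks early */
	unsigned long after    = 7;			/* 13 ticks later, post-wrap */

	printf("before deadline: %d\n", deadline_passed(before, deadline));	/* 0 */
	printf("after  deadline: %d\n", deadline_passed(after,  deadline));	/* 1 */
	return 0;
}
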
490 527
@@ -564,8 +601,6 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
564{ 601{
565 struct rcu_data *rdp = rsp->rda[smp_processor_id()]; 602 struct rcu_data *rdp = rsp->rda[smp_processor_id()];
566 struct rcu_node *rnp = rcu_get_root(rsp); 603 struct rcu_node *rnp = rcu_get_root(rsp);
567 struct rcu_node *rnp_cur;
568 struct rcu_node *rnp_end;
569 604
570 if (!cpu_needs_another_gp(rsp, rdp)) { 605 if (!cpu_needs_another_gp(rsp, rdp)) {
571 spin_unlock_irqrestore(&rnp->lock, flags); 606 spin_unlock_irqrestore(&rnp->lock, flags);
@@ -574,6 +609,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
574 609
575 /* Advance to a new grace period and initialize state. */ 610 /* Advance to a new grace period and initialize state. */
576 rsp->gpnum++; 611 rsp->gpnum++;
612 WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT);
577 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ 613 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */
578 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 614 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
579 record_gp_stall_check_time(rsp); 615 record_gp_stall_check_time(rsp);
@@ -590,7 +626,9 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
590 626
591 /* Special-case the common single-level case. */ 627 /* Special-case the common single-level case. */
592 if (NUM_RCU_NODES == 1) { 628 if (NUM_RCU_NODES == 1) {
629 rcu_preempt_check_blocked_tasks(rnp);
593 rnp->qsmask = rnp->qsmaskinit; 630 rnp->qsmask = rnp->qsmaskinit;
631 rnp->gpnum = rsp->gpnum;
594 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ 632 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
595 spin_unlock_irqrestore(&rnp->lock, flags); 633 spin_unlock_irqrestore(&rnp->lock, flags);
596 return; 634 return;
@@ -603,42 +641,28 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
603 spin_lock(&rsp->onofflock); /* irqs already disabled. */ 641 spin_lock(&rsp->onofflock); /* irqs already disabled. */
604 642
605 /* 643 /*
606 * Set the quiescent-state-needed bits in all the non-leaf RCU 644 * Set the quiescent-state-needed bits in all the rcu_node
607 * nodes for all currently online CPUs. This operation relies 645 * structures for all currently online CPUs in breadth-first
608 * on the layout of the hierarchy within the rsp->node[] array. 646 * order, starting from the root rcu_node structure. This
609 * Note that other CPUs will access only the leaves of the 647 * operation relies on the layout of the hierarchy within the
610 * hierarchy, which still indicate that no grace period is in 648 * rsp->node[] array. Note that other CPUs will access only
611 * progress. In addition, we have excluded CPU-hotplug operations. 649 * the leaves of the hierarchy, which still indicate that no
612 * 650 * grace period is in progress, at least until the corresponding
613 * We therefore do not need to hold any locks. Any required 651 * leaf node has been initialized. In addition, we have excluded
614 * memory barriers will be supplied by the locks guarding the 652 * CPU-hotplug operations.
615 * leaf rcu_nodes in the hierarchy.
616 */
617
618 rnp_end = rsp->level[NUM_RCU_LVLS - 1];
619 for (rnp_cur = &rsp->node[0]; rnp_cur < rnp_end; rnp_cur++)
620 rnp_cur->qsmask = rnp_cur->qsmaskinit;
621
622 /*
623 * Now set up the leaf nodes. Here we must be careful. First,
624 * we need to hold the lock in order to exclude other CPUs, which
625 * might be contending for the leaf nodes' locks. Second, as
626 * soon as we initialize a given leaf node, its CPUs might run
627 * up the rest of the hierarchy. We must therefore acquire locks
628 * for each node that we touch during this stage. (But we still
629 * are excluding CPU-hotplug operations.)
630 * 653 *
631 * Note that the grace period cannot complete until we finish 654 * Note that the grace period cannot complete until we finish
632 * the initialization process, as there will be at least one 655 * the initialization process, as there will be at least one
633 * qsmask bit set in the root node until that time, namely the 656 * qsmask bit set in the root node until that time, namely the
634 * one corresponding to this CPU. 657 * one corresponding to this CPU, due to the fact that we have
658 * irqs disabled.
635 */ 659 */
636 rnp_end = &rsp->node[NUM_RCU_NODES]; 660 for (rnp = &rsp->node[0]; rnp < &rsp->node[NUM_RCU_NODES]; rnp++) {
637 rnp_cur = rsp->level[NUM_RCU_LVLS - 1]; 661 spin_lock(&rnp->lock); /* irqs already disabled. */
638 for (; rnp_cur < rnp_end; rnp_cur++) { 662 rcu_preempt_check_blocked_tasks(rnp);
639 spin_lock(&rnp_cur->lock); /* irqs already disabled. */ 663 rnp->qsmask = rnp->qsmaskinit;
640 rnp_cur->qsmask = rnp_cur->qsmaskinit; 664 rnp->gpnum = rsp->gpnum;
641 spin_unlock(&rnp_cur->lock); /* irqs already disabled. */ 665 spin_unlock(&rnp->lock); /* irqs already disabled. */
642 } 666 }
643 667
644 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ 668 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
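
The rewritten initialization loop depends on rsp->node[] being laid out root first, so a single linear pass touches every parent before any of its children; the walk is breadth-first, as the new comment says. A user-space model of that layout and pass is sketched below (the node count and fields are simplified for illustration).

#include <stdio.h>

#define NUM_NODES 5			/* 1 root + 4 leaves, heap-style layout */

struct node {
	int level;			/* 0 == root */
	unsigned long qsmask;		/* CPUs/groups still to report */
	unsigned long qsmaskinit;	/* template copied at GP start */
	long gpnum;			/* grace period this node has seen */
};

/* node[0] is the root; node[1..4] are its leaves, in array order. */
static struct node nodes[NUM_NODES] = {
	{ 0, 0, 0xf, 0 },		/* root waits on 4 leaf groups */
	{ 1, 0, 0x3, 0 }, { 1, 0, 0x3, 0 },
	{ 1, 0, 0x3, 0 }, { 1, 0, 0x3, 0 },
};

/* One linear pass is breadth-first because of the array layout. */
static void start_grace_period(long gpnum)
{
	int i;

	for (i = 0; i < NUM_NODES; i++) {
		nodes[i].qsmask = nodes[i].qsmaskinit;
		nodes[i].gpnum = gpnum;	/* leaves advertise the new GP last */
	}
}

int main(void)
{
	start_grace_period(1);
	printf("root qsmask=%#lx gpnum=%ld\n", nodes[0].qsmask, nodes[0].gpnum);
	return 0;
}
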
@@ -674,6 +698,20 @@ rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
674} 698}
675 699
676/* 700/*
701 * Clean up after the prior grace period and let rcu_start_gp() start up
702 * the next grace period if one is needed. Note that the caller must
703 * hold rnp->lock, as required by rcu_start_gp(), which will release it.
704 */
705static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags)
706 __releases(rnp->lock)
707{
708 WARN_ON_ONCE(rsp->completed == rsp->gpnum);
709 rsp->completed = rsp->gpnum;
710 rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
711 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
712}
713
714/*
677 * Similar to cpu_quiet(), for which it is a helper function. Allows 715 * Similar to cpu_quiet(), for which it is a helper function. Allows
678 * a group of CPUs to be quieted at one go, though all the CPUs in the 716 * a group of CPUs to be quieted at one go, though all the CPUs in the
679 * group must be represented by the same leaf rcu_node structure. 717 * group must be represented by the same leaf rcu_node structure.
@@ -685,6 +723,8 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
685 unsigned long flags) 723 unsigned long flags)
686 __releases(rnp->lock) 724 __releases(rnp->lock)
687{ 725{
726 struct rcu_node *rnp_c;
727
688 /* Walk up the rcu_node hierarchy. */ 728 /* Walk up the rcu_node hierarchy. */
689 for (;;) { 729 for (;;) {
690 if (!(rnp->qsmask & mask)) { 730 if (!(rnp->qsmask & mask)) {
@@ -694,7 +734,7 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
694 return; 734 return;
695 } 735 }
696 rnp->qsmask &= ~mask; 736 rnp->qsmask &= ~mask;
697 if (rnp->qsmask != 0) { 737 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
698 738
699 /* Other bits still set at this level, so done. */ 739 /* Other bits still set at this level, so done. */
700 spin_unlock_irqrestore(&rnp->lock, flags); 740 spin_unlock_irqrestore(&rnp->lock, flags);
@@ -708,28 +748,26 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
708 break; 748 break;
709 } 749 }
710 spin_unlock_irqrestore(&rnp->lock, flags); 750 spin_unlock_irqrestore(&rnp->lock, flags);
751 rnp_c = rnp;
711 rnp = rnp->parent; 752 rnp = rnp->parent;
712 spin_lock_irqsave(&rnp->lock, flags); 753 spin_lock_irqsave(&rnp->lock, flags);
754 WARN_ON_ONCE(rnp_c->qsmask);
713 } 755 }
714 756
715 /* 757 /*
716 * Get here if we are the last CPU to pass through a quiescent 758 * Get here if we are the last CPU to pass through a quiescent
717 * state for this grace period. Clean up and let rcu_start_gp() 759 * state for this grace period. Invoke cpu_quiet_msk_finish()
718 * start up the next grace period if one is needed. Note that 760 * to clean up and start the next grace period if one is needed.
719 * we still hold rnp->lock, as required by rcu_start_gp(), which
720 * will release it.
721 */ 761 */
722 rsp->completed = rsp->gpnum; 762 cpu_quiet_msk_finish(rsp, flags); /* releases rnp->lock. */
723 rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
724 rcu_start_gp(rsp, flags); /* releases rnp->lock. */
725} 763}
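
cpu_quiet_msk() clears the reporting group's bit in its leaf, stops as soon as other bits (or, with the new check, preempted readers) remain at that level, and otherwise repeats one level up; only the caller that empties the root invokes cpu_quiet_msk_finish(). A compact user-space sketch of that upward walk over a two-level tree (names invented, locking omitted):

#include <stdio.h>

struct node {
	struct node *parent;
	unsigned long qsmask;		/* children still to report */
	unsigned long grpmask;		/* this node's bit in its parent */
};

static struct node root = { NULL, 0x3, 0 };
static struct node leaf[2] = {
	{ &root, 0x3, 0x1 },		/* CPUs 0,1 */
	{ &root, 0x3, 0x2 },		/* CPUs 2,3 */
};
static int gp_completed;

/* Report that the CPUs in @mask below @rnp passed a quiescent state. */
static void report_qs(struct node *rnp, unsigned long mask)
{
	for (;;) {
		rnp->qsmask &= ~mask;
		if (rnp->qsmask != 0)
			return;		/* siblings still pending at this level */
		if (rnp->parent == NULL)
			break;		/* root just emptied */
		mask = rnp->grpmask;	/* clear our group's bit one level up */
		rnp = rnp->parent;
	}
	gp_completed = 1;		/* last reporter ends the grace period */
}

int main(void)
{
	report_qs(&leaf[0], 0x3);	/* CPUs 0 and 1 */
	report_qs(&leaf[1], 0x1);	/* CPU 2 */
	printf("after three CPUs: completed=%d\n", gp_completed);	/* 0 */
	report_qs(&leaf[1], 0x2);	/* CPU 3, the last one */
	printf("after all CPUs:   completed=%d\n", gp_completed);	/* 1 */
	return 0;
}
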
726 764
727/* 765/*
728 * Record a quiescent state for the specified CPU, which must either be 766 * Record a quiescent state for the specified CPU, which must either be
729 * the current CPU or an offline CPU. The lastcomp argument is used to 767 * the current CPU. The lastcomp argument is used to make sure we are
730 * make sure we are still in the grace period of interest. We don't want 768 * still in the grace period of interest. We don't want to end the current
731 * to end the current grace period based on quiescent states detected in 769 * grace period based on quiescent states detected in an earlier grace
732 * an earlier grace period! 770 * period!
733 */ 771 */
734static void 772static void
735cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp) 773cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
@@ -764,7 +802,6 @@ cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
764 * This GP can't end until cpu checks in, so all of our 802 * This GP can't end until cpu checks in, so all of our
765 * callbacks can be processed during the next GP. 803 * callbacks can be processed during the next GP.
766 */ 804 */
767 rdp = rsp->rda[smp_processor_id()];
768 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; 805 rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
769 806
770 cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */ 807 cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */
@@ -822,30 +859,28 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
822 spin_lock_irqsave(&rsp->onofflock, flags); 859 spin_lock_irqsave(&rsp->onofflock, flags);
823 860
824 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ 861 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
825 rnp = rdp->mynode; 862 rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */
826 mask = rdp->grpmask; /* rnp->grplo is constant. */ 863 mask = rdp->grpmask; /* rnp->grplo is constant. */
827 do { 864 do {
828 spin_lock(&rnp->lock); /* irqs already disabled. */ 865 spin_lock(&rnp->lock); /* irqs already disabled. */
829 rnp->qsmaskinit &= ~mask; 866 rnp->qsmaskinit &= ~mask;
830 if (rnp->qsmaskinit != 0) { 867 if (rnp->qsmaskinit != 0) {
831 spin_unlock(&rnp->lock); /* irqs already disabled. */ 868 spin_unlock(&rnp->lock); /* irqs remain disabled. */
832 break; 869 break;
833 } 870 }
871 rcu_preempt_offline_tasks(rsp, rnp, rdp);
834 mask = rnp->grpmask; 872 mask = rnp->grpmask;
835 spin_unlock(&rnp->lock); /* irqs already disabled. */ 873 spin_unlock(&rnp->lock); /* irqs remain disabled. */
836 rnp = rnp->parent; 874 rnp = rnp->parent;
837 } while (rnp != NULL); 875 } while (rnp != NULL);
838 lastcomp = rsp->completed; 876 lastcomp = rsp->completed;
839 877
840 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ 878 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
841 879
842 /* Being offline is a quiescent state, so go record it. */
843 cpu_quiet(cpu, rsp, rdp, lastcomp);
844
845 /* 880 /*
846 * Move callbacks from the outgoing CPU to the running CPU. 881 * Move callbacks from the outgoing CPU to the running CPU.
847 * Note that the outgoing CPU is now quiscent, so it is now 882 * Note that the outgoing CPU is now quiscent, so it is now
848 * (uncharacteristically) safe to access it rcu_data structure. 883 * (uncharacteristically) safe to access its rcu_data structure.
849 * Note also that we must carefully retain the order of the 884 * Note also that we must carefully retain the order of the
850 * outgoing CPU's callbacks in order for rcu_barrier() to work 885 * outgoing CPU's callbacks in order for rcu_barrier() to work
851 * correctly. Finally, note that we start all the callbacks 886 * correctly. Finally, note that we start all the callbacks
@@ -876,8 +911,9 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
876 */ 911 */
877static void rcu_offline_cpu(int cpu) 912static void rcu_offline_cpu(int cpu)
878{ 913{
879 __rcu_offline_cpu(cpu, &rcu_state); 914 __rcu_offline_cpu(cpu, &rcu_sched_state);
880 __rcu_offline_cpu(cpu, &rcu_bh_state); 915 __rcu_offline_cpu(cpu, &rcu_bh_state);
916 rcu_preempt_offline_cpu(cpu);
881} 917}
882 918
883#else /* #ifdef CONFIG_HOTPLUG_CPU */ 919#else /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -963,6 +999,8 @@ static void rcu_do_batch(struct rcu_data *rdp)
963 */ 999 */
964void rcu_check_callbacks(int cpu, int user) 1000void rcu_check_callbacks(int cpu, int user)
965{ 1001{
1002 if (!rcu_pending(cpu))
1003 return; /* if nothing for RCU to do. */
966 if (user || 1004 if (user ||
967 (idle_cpu(cpu) && rcu_scheduler_active && 1005 (idle_cpu(cpu) && rcu_scheduler_active &&
968 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { 1006 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
@@ -971,17 +1009,16 @@ void rcu_check_callbacks(int cpu, int user)
971 * Get here if this CPU took its interrupt from user 1009 * Get here if this CPU took its interrupt from user
972 * mode or from the idle loop, and if this is not a 1010 * mode or from the idle loop, and if this is not a
973 * nested interrupt. In this case, the CPU is in 1011 * nested interrupt. In this case, the CPU is in
974 * a quiescent state, so count it. 1012 * a quiescent state, so note it.
975 * 1013 *
976 * No memory barrier is required here because both 1014 * No memory barrier is required here because both
977 * rcu_qsctr_inc() and rcu_bh_qsctr_inc() reference 1015 * rcu_sched_qs() and rcu_bh_qs() reference only CPU-local
978 * only CPU-local variables that other CPUs neither 1016 * variables that other CPUs neither access nor modify,
979 * access nor modify, at least not while the corresponding 1017 * at least not while the corresponding CPU is online.
980 * CPU is online.
981 */ 1018 */
982 1019
983 rcu_qsctr_inc(cpu); 1020 rcu_sched_qs(cpu);
984 rcu_bh_qsctr_inc(cpu); 1021 rcu_bh_qs(cpu);
985 1022
986 } else if (!in_softirq()) { 1023 } else if (!in_softirq()) {
987 1024
@@ -989,11 +1026,12 @@ void rcu_check_callbacks(int cpu, int user)
989 * Get here if this CPU did not take its interrupt from 1026 * Get here if this CPU did not take its interrupt from
990 * softirq, in other words, if it is not interrupting 1027 * softirq, in other words, if it is not interrupting
991 * a rcu_bh read-side critical section. This is an _bh 1028 * a rcu_bh read-side critical section. This is an _bh
992 * critical section, so count it. 1029 * critical section, so note it.
993 */ 1030 */
994 1031
995 rcu_bh_qsctr_inc(cpu); 1032 rcu_bh_qs(cpu);
996 } 1033 }
1034 rcu_preempt_check_callbacks(cpu);
997 raise_softirq(RCU_SOFTIRQ); 1035 raise_softirq(RCU_SOFTIRQ);
998} 1036}
999 1037
@@ -1132,6 +1170,8 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1132{ 1170{
1133 unsigned long flags; 1171 unsigned long flags;
1134 1172
1173 WARN_ON_ONCE(rdp->beenonline == 0);
1174
1135 /* 1175 /*
1136 * If an RCU GP has gone long enough, go check for dyntick 1176 * If an RCU GP has gone long enough, go check for dyntick
1137 * idle CPUs and, if needed, send resched IPIs. 1177 * idle CPUs and, if needed, send resched IPIs.
@@ -1170,8 +1210,10 @@ static void rcu_process_callbacks(struct softirq_action *unused)
1170 */ 1210 */
1171 smp_mb(); /* See above block comment. */ 1211 smp_mb(); /* See above block comment. */
1172 1212
1173 __rcu_process_callbacks(&rcu_state, &__get_cpu_var(rcu_data)); 1213 __rcu_process_callbacks(&rcu_sched_state,
1214 &__get_cpu_var(rcu_sched_data));
1174 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); 1215 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
1216 rcu_preempt_process_callbacks();
1175 1217
1176 /* 1218 /*
1177 * Memory references from any later RCU read-side critical sections 1219 * Memory references from any later RCU read-side critical sections
@@ -1227,13 +1269,13 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1227} 1269}
1228 1270
1229/* 1271/*
1230 * Queue an RCU callback for invocation after a grace period. 1272 * Queue an RCU-sched callback for invocation after a grace period.
1231 */ 1273 */
1232void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 1274void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1233{ 1275{
1234 __call_rcu(head, func, &rcu_state); 1276 __call_rcu(head, func, &rcu_sched_state);
1235} 1277}
1236EXPORT_SYMBOL_GPL(call_rcu); 1278EXPORT_SYMBOL_GPL(call_rcu_sched);
1237 1279
1238/* 1280/*
1239 * Queue an RCU for invocation after a quicker grace period. 1281 * Queue an RCU for invocation after a quicker grace period.
@@ -1305,10 +1347,11 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1305 * by the current CPU, returning 1 if so. This function is part of the 1347 * by the current CPU, returning 1 if so. This function is part of the
1306 * RCU implementation; it is -not- an exported member of the RCU API. 1348 * RCU implementation; it is -not- an exported member of the RCU API.
1307 */ 1349 */
1308int rcu_pending(int cpu) 1350static int rcu_pending(int cpu)
1309{ 1351{
1310 return __rcu_pending(&rcu_state, &per_cpu(rcu_data, cpu)) || 1352 return __rcu_pending(&rcu_sched_state, &per_cpu(rcu_sched_data, cpu)) ||
1311 __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)); 1353 __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)) ||
1354 rcu_preempt_pending(cpu);
1312} 1355}
1313 1356
1314/* 1357/*
@@ -1320,27 +1363,46 @@ int rcu_pending(int cpu)
1320int rcu_needs_cpu(int cpu) 1363int rcu_needs_cpu(int cpu)
1321{ 1364{
1322 /* RCU callbacks either ready or pending? */ 1365 /* RCU callbacks either ready or pending? */
1323 return per_cpu(rcu_data, cpu).nxtlist || 1366 return per_cpu(rcu_sched_data, cpu).nxtlist ||
1324 per_cpu(rcu_bh_data, cpu).nxtlist; 1367 per_cpu(rcu_bh_data, cpu).nxtlist ||
1368 rcu_preempt_needs_cpu(cpu);
1325} 1369}
1326 1370
1327/* 1371/*
1328 * Initialize a CPU's per-CPU RCU data. We take this "scorched earth" 1372 * Do boot-time initialization of a CPU's per-CPU RCU data.
1329 * approach so that we don't have to worry about how long the CPU has
1330 * been gone, or whether it ever was online previously. We do trust the
1331 * ->mynode field, as it is constant for a given struct rcu_data and
1332 * initialized during early boot.
1333 *
1334 * Note that only one online or offline event can be happening at a given
1335 * time. Note also that we can accept some slop in the rsp->completed
1336 * access due to the fact that this CPU cannot possibly have any RCU
1337 * callbacks in flight yet.
1338 */ 1373 */
1339static void __cpuinit 1374static void __init
1340rcu_init_percpu_data(int cpu, struct rcu_state *rsp) 1375rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1341{ 1376{
1342 unsigned long flags; 1377 unsigned long flags;
1343 int i; 1378 int i;
1379 struct rcu_data *rdp = rsp->rda[cpu];
1380 struct rcu_node *rnp = rcu_get_root(rsp);
1381
1382 /* Set up local state, ensuring consistent view of global state. */
1383 spin_lock_irqsave(&rnp->lock, flags);
1384 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
1385 rdp->nxtlist = NULL;
1386 for (i = 0; i < RCU_NEXT_SIZE; i++)
1387 rdp->nxttail[i] = &rdp->nxtlist;
1388 rdp->qlen = 0;
1389#ifdef CONFIG_NO_HZ
1390 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
1391#endif /* #ifdef CONFIG_NO_HZ */
1392 rdp->cpu = cpu;
1393 spin_unlock_irqrestore(&rnp->lock, flags);
1394}
1395
1396/*
1397 * Initialize a CPU's per-CPU RCU data. Note that only one online or
1398 * offline event can be happening at a given time. Note also that we
1399 * can accept some slop in the rsp->completed access due to the fact
1400 * that this CPU cannot possibly have any RCU callbacks in flight yet.
1401 */
1402static void __cpuinit
1403rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1404{
1405 unsigned long flags;
1344 long lastcomp; 1406 long lastcomp;
1345 unsigned long mask; 1407 unsigned long mask;
1346 struct rcu_data *rdp = rsp->rda[cpu]; 1408 struct rcu_data *rdp = rsp->rda[cpu];
@@ -1354,17 +1416,9 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
1354 rdp->passed_quiesc = 0; /* We could be racing with new GP, */ 1416 rdp->passed_quiesc = 0; /* We could be racing with new GP, */
1355 rdp->qs_pending = 1; /* so set up to respond to current GP. */ 1417 rdp->qs_pending = 1; /* so set up to respond to current GP. */
1356 rdp->beenonline = 1; /* We have now been online. */ 1418 rdp->beenonline = 1; /* We have now been online. */
1419 rdp->preemptable = preemptable;
1357 rdp->passed_quiesc_completed = lastcomp - 1; 1420 rdp->passed_quiesc_completed = lastcomp - 1;
1358 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
1359 rdp->nxtlist = NULL;
1360 for (i = 0; i < RCU_NEXT_SIZE; i++)
1361 rdp->nxttail[i] = &rdp->nxtlist;
1362 rdp->qlen = 0;
1363 rdp->blimit = blimit; 1421 rdp->blimit = blimit;
1364#ifdef CONFIG_NO_HZ
1365 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
1366#endif /* #ifdef CONFIG_NO_HZ */
1367 rdp->cpu = cpu;
1368 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1422 spin_unlock(&rnp->lock); /* irqs remain disabled. */
1369 1423
1370 /* 1424 /*
@@ -1387,34 +1441,21 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
1387 rnp = rnp->parent; 1441 rnp = rnp->parent;
1388 } while (rnp != NULL && !(rnp->qsmaskinit & mask)); 1442 } while (rnp != NULL && !(rnp->qsmaskinit & mask));
1389 1443
1390 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ 1444 spin_unlock_irqrestore(&rsp->onofflock, flags);
1391
1392 /*
1393 * A new grace period might start here. If so, we will be part of
1394 * it, and its gpnum will be greater than ours, so we will
1395 * participate. It is also possible for the gpnum to have been
1396 * incremented before this function was called, and the bitmasks
1397 * to not be filled out until now, in which case we will also
1398 * participate due to our gpnum being behind.
1399 */
1400
1401 /* Since it is coming online, the CPU is in a quiescent state. */
1402 cpu_quiet(cpu, rsp, rdp, lastcomp);
1403 local_irq_restore(flags);
1404} 1445}
1405 1446
1406static void __cpuinit rcu_online_cpu(int cpu) 1447static void __cpuinit rcu_online_cpu(int cpu)
1407{ 1448{
1408 rcu_init_percpu_data(cpu, &rcu_state); 1449 rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
1409 rcu_init_percpu_data(cpu, &rcu_bh_state); 1450 rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
1410 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 1451 rcu_preempt_init_percpu_data(cpu);
1411} 1452}
1412 1453
1413/* 1454/*
1414 * Handle CPU online/offline notifcation events. 1455 * Handle CPU online/offline notification events.
1415 */ 1456 */
1416static int __cpuinit rcu_cpu_notify(struct notifier_block *self, 1457int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1417 unsigned long action, void *hcpu) 1458 unsigned long action, void *hcpu)
1418{ 1459{
1419 long cpu = (long)hcpu; 1460 long cpu = (long)hcpu;
1420 1461
@@ -1486,6 +1527,7 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1486 rnp = rsp->level[i]; 1527 rnp = rsp->level[i];
1487 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { 1528 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
1488 spin_lock_init(&rnp->lock); 1529 spin_lock_init(&rnp->lock);
1530 rnp->gpnum = 0;
1489 rnp->qsmask = 0; 1531 rnp->qsmask = 0;
1490 rnp->qsmaskinit = 0; 1532 rnp->qsmaskinit = 0;
1491 rnp->grplo = j * cpustride; 1533 rnp->grplo = j * cpustride;
@@ -1503,16 +1545,20 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1503 j / rsp->levelspread[i - 1]; 1545 j / rsp->levelspread[i - 1];
1504 } 1546 }
1505 rnp->level = i; 1547 rnp->level = i;
1548 INIT_LIST_HEAD(&rnp->blocked_tasks[0]);
1549 INIT_LIST_HEAD(&rnp->blocked_tasks[1]);
1506 } 1550 }
1507 } 1551 }
1508} 1552}
1509 1553
1510/* 1554/*
1511 * Helper macro for __rcu_init(). To be used nowhere else! 1555 * Helper macro for __rcu_init() and __rcu_init_preempt(). To be used
1512 * Assigns leaf node pointers into each CPU's rcu_data structure. 1556 * nowhere else! Assigns leaf node pointers into each CPU's rcu_data
1557 * structure.
1513 */ 1558 */
1514#define RCU_DATA_PTR_INIT(rsp, rcu_data) \ 1559#define RCU_INIT_FLAVOR(rsp, rcu_data) \
1515do { \ 1560do { \
1561 rcu_init_one(rsp); \
1516 rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \ 1562 rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \
1517 j = 0; \ 1563 j = 0; \
1518 for_each_possible_cpu(i) { \ 1564 for_each_possible_cpu(i) { \
@@ -1520,33 +1566,43 @@ do { \
1520 j++; \ 1566 j++; \
1521 per_cpu(rcu_data, i).mynode = &rnp[j]; \ 1567 per_cpu(rcu_data, i).mynode = &rnp[j]; \
1522 (rsp)->rda[i] = &per_cpu(rcu_data, i); \ 1568 (rsp)->rda[i] = &per_cpu(rcu_data, i); \
1569 rcu_boot_init_percpu_data(i, rsp); \
1523 } \ 1570 } \
1524} while (0) 1571} while (0)
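
RCU_INIT_FLAVOR() points each possible CPU's rcu_data at the leaf rcu_node whose grplo..grphi range covers it, advancing to the next leaf once the current range is exhausted, and now also runs rcu_boot_init_percpu_data() for that CPU. A user-space sketch of just the CPU-to-leaf assignment (array sizes and names are illustrative):

#include <stdio.h>

#define NCPUS   8
#define NLEAVES 4			/* fanout of 2 CPUs per leaf */

struct leaf {
	int grplo, grphi;		/* CPU range covered by this leaf */
};

static struct leaf leaves[NLEAVES] = {
	{ 0, 1 }, { 2, 3 }, { 4, 5 }, { 6, 7 },
};
static struct leaf *mynode[NCPUS];	/* stand-in for per-CPU rcu_data->mynode */

static void assign_leaves(void)
{
	int cpu, j = 0;

	for (cpu = 0; cpu < NCPUS; cpu++) {
		if (cpu > leaves[j].grphi)
			j++;		/* current leaf exhausted, move on */
		mynode[cpu] = &leaves[j];
	}
}

int main(void)
{
	int cpu;

	assign_leaves();
	for (cpu = 0; cpu < NCPUS; cpu++)
		printf("cpu %d -> leaf [%d..%d]\n",
		       cpu, mynode[cpu]->grplo, mynode[cpu]->grphi);
	return 0;
}
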
1525 1572
1526static struct notifier_block __cpuinitdata rcu_nb = { 1573#ifdef CONFIG_TREE_PREEMPT_RCU
1527 .notifier_call = rcu_cpu_notify, 1574
1528}; 1575void __init __rcu_init_preempt(void)
1576{
1577 int i; /* All used by RCU_INIT_FLAVOR(). */
1578 int j;
1579 struct rcu_node *rnp;
1580
1581 RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data);
1582}
1583
1584#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1585
1586void __init __rcu_init_preempt(void)
1587{
1588}
1589
1590#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
1529 1591
1530void __init __rcu_init(void) 1592void __init __rcu_init(void)
1531{ 1593{
1532 int i; /* All used by RCU_DATA_PTR_INIT(). */ 1594 int i; /* All used by RCU_INIT_FLAVOR(). */
1533 int j; 1595 int j;
1534 struct rcu_node *rnp; 1596 struct rcu_node *rnp;
1535 1597
1536 printk(KERN_WARNING "Experimental hierarchical RCU implementation.\n"); 1598 rcu_bootup_announce();
1537#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 1599#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
1538 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); 1600 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
1539#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 1601#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
1540 rcu_init_one(&rcu_state); 1602 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data);
1541 RCU_DATA_PTR_INIT(&rcu_state, rcu_data); 1603 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data);
1542 rcu_init_one(&rcu_bh_state); 1604 __rcu_init_preempt();
1543 RCU_DATA_PTR_INIT(&rcu_bh_state, rcu_bh_data); 1605 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1544
1545 for_each_online_cpu(i)
1546 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i);
1547 /* Register notifier for non-boot CPUs */
1548 register_cpu_notifier(&rcu_nb);
1549 printk(KERN_WARNING "Experimental hierarchical RCU init done.\n");
1550} 1606}
1551 1607
1552module_param(blimit, int, 0); 1608module_param(blimit, int, 0);
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 5e872bbf07f5..8e8287a983c2 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -1,10 +1,259 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
3 * Internal non-public definitions.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18 *
19 * Copyright IBM Corporation, 2008
20 *
21 * Author: Ingo Molnar <mingo@elte.hu>
22 * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
23 */
24
25#include <linux/cache.h>
26#include <linux/spinlock.h>
27#include <linux/threads.h>
28#include <linux/cpumask.h>
29#include <linux/seqlock.h>
30
31/*
32 * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
33 * In theory, it should be possible to add more levels straightforwardly.
34 * In practice, this has not been tested, so there is probably some
35 * bug somewhere.
36 */
37#define MAX_RCU_LVLS 3
38#define RCU_FANOUT (CONFIG_RCU_FANOUT)
39#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT)
40#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT)
41
42#if NR_CPUS <= RCU_FANOUT
43# define NUM_RCU_LVLS 1
44# define NUM_RCU_LVL_0 1
45# define NUM_RCU_LVL_1 (NR_CPUS)
46# define NUM_RCU_LVL_2 0
47# define NUM_RCU_LVL_3 0
48#elif NR_CPUS <= RCU_FANOUT_SQ
49# define NUM_RCU_LVLS 2
50# define NUM_RCU_LVL_0 1
51# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT)
52# define NUM_RCU_LVL_2 (NR_CPUS)
53# define NUM_RCU_LVL_3 0
54#elif NR_CPUS <= RCU_FANOUT_CUBE
55# define NUM_RCU_LVLS 3
56# define NUM_RCU_LVL_0 1
57# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT_SQ - 1) / RCU_FANOUT_SQ)
58# define NUM_RCU_LVL_2 (((NR_CPUS) + (RCU_FANOUT) - 1) / (RCU_FANOUT))
59# define NUM_RCU_LVL_3 NR_CPUS
60#else
61# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
62#endif /* #if (NR_CPUS) <= RCU_FANOUT */
63
64#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
65#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
66
67/*
68 * Dynticks per-CPU state.
69 */
70struct rcu_dynticks {
71 int dynticks_nesting; /* Track nesting level, sort of. */
72 int dynticks; /* Even value for dynticks-idle, else odd. */
73 int dynticks_nmi; /* Even value for either dynticks-idle or */
74 /* not in nmi handler, else odd. So this */
75 /* remains even for nmi from irq handler. */
76};
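
The dynticks counter stays even while the CPU is in dynticks-idle and odd otherwise, so a remote CPU can treat an even snapshot, or any change between two snapshots, as evidence of a quiescent state. A single-threaded user-space sketch of that even/odd protocol follows (the kernel pairs it with memory barriers and a separate dynticks_nmi counter, both omitted here):

#include <stdio.h>

static int dynticks = 1;		/* odd: CPU starts out non-idle */

static void enter_idle(void) { dynticks++; }	/* odd -> even */
static void exit_idle(void)  { dynticks++; }	/* even -> odd */

/*
 * Remote view: an even counter means the CPU is in dynticks idle right
 * now; a changed counter means it passed through an idle transition
 * since the snapshot.  Either way it cannot still be in an RCU
 * read-side critical section that began before the snapshot.
 */
static int counts_as_quiescent(int snap, int curr)
{
	return (curr & 0x1) == 0 || curr != snap;
}

int main(void)
{
	int snap = dynticks;		/* odd: busy at snapshot time */

	printf("immediately:     %d\n", counts_as_quiescent(snap, dynticks));	/* 0 */
	enter_idle();
	exit_idle();			/* counter advanced by 2, still odd */
	printf("after idle pass: %d\n", counts_as_quiescent(snap, dynticks));	/* 1 */
	return 0;
}
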
77
78/*
79 * Definition for node within the RCU grace-period-detection hierarchy.
80 */
81struct rcu_node {
82 spinlock_t lock;
83 long gpnum; /* Current grace period for this node. */
84 /* This will either be equal to or one */
85 /* behind the root rcu_node's gpnum. */
86 unsigned long qsmask; /* CPUs or groups that need to switch in */
87 /* order for current grace period to proceed.*/
88 unsigned long qsmaskinit;
89 /* Per-GP initialization for qsmask. */
90 unsigned long grpmask; /* Mask to apply to parent qsmask. */
91 int grplo; /* lowest-numbered CPU or group here. */
92 int grphi; /* highest-numbered CPU or group here. */
93 u8 grpnum; /* CPU/group number for next level up. */
94 u8 level; /* root is at level 0. */
95 struct rcu_node *parent;
96 struct list_head blocked_tasks[2];
97 /* Tasks blocked in RCU read-side critsect. */
98} ____cacheline_internodealigned_in_smp;
99
100/* Index values for nxttail array in struct rcu_data. */
101#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */
102#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */
103#define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */
104#define RCU_NEXT_TAIL 3
105#define RCU_NEXT_SIZE 4
106
107/* Per-CPU data for read-copy update. */
108struct rcu_data {
109 /* 1) quiescent-state and grace-period handling : */
110 long completed; /* Track rsp->completed gp number */
111 /* in order to detect GP end. */
112 long gpnum; /* Highest gp number that this CPU */
113 /* is aware of having started. */
114 long passed_quiesc_completed;
115 /* Value of completed at time of qs. */
116 bool passed_quiesc; /* User-mode/idle loop etc. */
117 bool qs_pending; /* Core waits for quiesc state. */
118 bool beenonline; /* CPU online at least once. */
119 bool preemptable; /* Preemptable RCU? */
120 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
121 unsigned long grpmask; /* Mask to apply to leaf qsmask. */
122
123 /* 2) batch handling */
124 /*
125 * If nxtlist is not NULL, it is partitioned as follows.
126 * Any of the partitions might be empty, in which case the
127 * pointer to that partition will be equal to the pointer for
128 * the following partition. When the list is empty, all of
129 * the nxttail elements point to nxtlist, which is NULL.
130 *
131 * [*nxttail[RCU_NEXT_READY_TAIL], NULL = *nxttail[RCU_NEXT_TAIL]):
132 * Entries that might have arrived after current GP ended
133 * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]):
134 * Entries known to have arrived before current GP ended
135 * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]):
136 * Entries that batch # <= ->completed - 1: waiting for current GP
137 * [nxtlist, *nxttail[RCU_DONE_TAIL]):
138 * Entries that batch # <= ->completed
139 * The grace period for these entries has completed, and
140 * the other grace-period-completed entries may be moved
141 * here temporarily in rcu_process_callbacks().
142 */
143 struct rcu_head *nxtlist;
144 struct rcu_head **nxttail[RCU_NEXT_SIZE];
145 long qlen; /* # of queued callbacks */
146 long blimit; /* Upper limit on a processed batch */
147
148#ifdef CONFIG_NO_HZ
149 /* 3) dynticks interface. */
150 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */
151 int dynticks_snap; /* Per-GP tracking for dynticks. */
152 int dynticks_nmi_snap; /* Per-GP tracking for dynticks_nmi. */
153#endif /* #ifdef CONFIG_NO_HZ */
154
155 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
156#ifdef CONFIG_NO_HZ
157 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */
158#endif /* #ifdef CONFIG_NO_HZ */
159 unsigned long offline_fqs; /* Kicked due to being offline. */
160 unsigned long resched_ipi; /* Sent a resched IPI. */
161
162 /* 5) __rcu_pending() statistics. */
163 long n_rcu_pending; /* rcu_pending() calls since boot. */
164 long n_rp_qs_pending;
165 long n_rp_cb_ready;
166 long n_rp_cpu_needs_gp;
167 long n_rp_gp_completed;
168 long n_rp_gp_started;
169 long n_rp_need_fqs;
170 long n_rp_need_nothing;
171
172 int cpu;
173};
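
The comment above describes nxtlist as one singly linked list carved into four segments by the nxttail pointers, with every tail pointer collapsing onto &nxtlist when the list is empty. The user-space sketch below reuses the segment names only for clarity; the advancement step is hand-driven and much simpler than the real ->completed-based logic.

#include <stdio.h>

struct cb {
	struct cb *next;
	int id;
};

#define RCU_DONE_TAIL		0
#define RCU_WAIT_TAIL		1
#define RCU_NEXT_READY_TAIL	2
#define RCU_NEXT_TAIL		3
#define RCU_NEXT_SIZE		4

static struct cb *nxtlist;
static struct cb **nxttail[RCU_NEXT_SIZE];

static void init_list(void)
{
	int i;

	nxtlist = NULL;
	for (i = 0; i < RCU_NEXT_SIZE; i++)
		nxttail[i] = &nxtlist;	/* empty: all segments collapse onto nxtlist */
}

/* Queue a new callback at the very end (the NEXT segment). */
static void enqueue(struct cb *p)
{
	p->next = NULL;
	*nxttail[RCU_NEXT_TAIL] = p;
	nxttail[RCU_NEXT_TAIL] = &p->next;
}

/* A grace period ended: everything up to NEXT_READY is now DONE. */
static void advance_after_gp(void)
{
	nxttail[RCU_DONE_TAIL] = nxttail[RCU_NEXT_READY_TAIL];
	nxttail[RCU_WAIT_TAIL] = nxttail[RCU_NEXT_READY_TAIL];
}

int main(void)
{
	struct cb a = { NULL, 1 }, b = { NULL, 2 };
	struct cb *p;

	init_list();
	enqueue(&a);
	enqueue(&b);
	/* Pretend a grace period elapsed for everything queued so far. */
	nxttail[RCU_NEXT_READY_TAIL] = nxttail[RCU_NEXT_TAIL];
	advance_after_gp();
	for (p = nxtlist; p != *nxttail[RCU_DONE_TAIL]; p = p->next)
		printf("callback %d ready to invoke\n", p->id);
	return 0;
}
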
174
175/* Values for signaled field in struct rcu_state. */
176#define RCU_GP_INIT 0 /* Grace period being initialized. */
177#define RCU_SAVE_DYNTICK 1 /* Need to scan dyntick state. */
178#define RCU_FORCE_QS 2 /* Need to force quiescent state. */
179#ifdef CONFIG_NO_HZ
180#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
181#else /* #ifdef CONFIG_NO_HZ */
182#define RCU_SIGNAL_INIT RCU_FORCE_QS
183#endif /* #else #ifdef CONFIG_NO_HZ */
184
185#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
186#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
187#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ) /* for rsp->jiffies_stall */
188#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rsp->jiffies_stall */
189#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
190 /* to take at least one */
191 /* scheduling clock irq */
192 /* before ratting on them. */
193
194#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
195
196/*
197 * RCU global state, including node hierarchy. This hierarchy is
198 * represented in "heap" form in a dense array. The root (first level)
199 * of the hierarchy is in ->node[0] (referenced by ->level[0]), the second
200 * level in ->node[1] through ->node[m] (->node[1] referenced by ->level[1]),
201 * and the third level in ->node[m+1] and following (->node[m+1] referenced
202 * by ->level[2]). The number of levels is determined by the number of
203 * CPUs and by CONFIG_RCU_FANOUT. Small systems will have a "hierarchy"
204 * consisting of a single rcu_node.
205 */
206struct rcu_state {
207 struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */
208 struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */
209 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
210 u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */
211 struct rcu_data *rda[NR_CPUS]; /* array of rdp pointers. */
212
213 /* The following fields are guarded by the root rcu_node's lock. */
214
215 u8 signaled ____cacheline_internodealigned_in_smp;
216 /* Force QS state. */
217 long gpnum; /* Current gp number. */
218 long completed; /* # of last completed gp. */
219 spinlock_t onofflock; /* exclude on/offline and */
220 /* starting new GP. */
221 spinlock_t fqslock; /* Only one task forcing */
222 /* quiescent states. */
223 unsigned long jiffies_force_qs; /* Time at which to invoke */
224 /* force_quiescent_state(). */
225 unsigned long n_force_qs; /* Number of calls to */
226 /* force_quiescent_state(). */
227 unsigned long n_force_qs_lh; /* ~Number of calls leaving */
228 /* due to lock unavailable. */
229 unsigned long n_force_qs_ngp; /* Number of calls leaving */
230 /* due to no GP active. */
231#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
232 unsigned long gp_start; /* Time at which GP started, */
233 /* but in jiffies. */
234 unsigned long jiffies_stall; /* Time at which to check */
235 /* for CPU stalls. */
236#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
237#ifdef CONFIG_NO_HZ
238 long dynticks_completed; /* Value of completed @ snap. */
239#endif /* #ifdef CONFIG_NO_HZ */
240};
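
The ->level[] pointers described in the comment above struct rcu_state index into the same dense node[] array: level[0] is the root, level[1] the first node of the next row, and so on. A tiny user-space sketch of that indexing (sizes are illustrative):

#include <stdio.h>

#define NUM_RCU_LVLS	2
#define NUM_LVL_0	1
#define NUM_LVL_1	4
#define NUM_NODES	(NUM_LVL_0 + NUM_LVL_1)

struct node_model { int level; };

/* Dense "heap" array: root first, then its children in order. */
static struct node_model node[NUM_NODES];
static struct node_model *level[NUM_RCU_LVLS];

static void init_levels(void)
{
	int i;

	level[0] = &node[0];		/* the root */
	level[1] = &node[NUM_LVL_0];	/* first node of the second level */
	for (i = 0; i < NUM_NODES; i++)
		node[i].level = (i < NUM_LVL_0) ? 0 : 1;
}

int main(void)
{
	init_levels();
	printf("root is node[%ld], leaves start at node[%ld]\n",
	       (long)(level[0] - node), (long)(level[1] - node));
	return 0;
}
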
241
242#ifdef RCU_TREE_NONCORE
1 243
2/* 244/*
3 * RCU implementation internal declarations: 245 * RCU implementation internal declarations:
4 */ 246 */
5extern struct rcu_state rcu_state; 247extern struct rcu_state rcu_sched_state;
6DECLARE_PER_CPU(struct rcu_data, rcu_data); 248DECLARE_PER_CPU(struct rcu_data, rcu_sched_data);
7 249
8extern struct rcu_state rcu_bh_state; 250extern struct rcu_state rcu_bh_state;
9DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); 251DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
10 252
253#ifdef CONFIG_TREE_PREEMPT_RCU
254extern struct rcu_state rcu_preempt_state;
255DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
256#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
257
258#endif /* #ifdef RCU_TREE_NONCORE */
259
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
new file mode 100644
index 000000000000..1cee04f627eb
--- /dev/null
+++ b/kernel/rcutree_plugin.h
@@ -0,0 +1,566 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
3 * Internal non-public definitions that provide either classic
4 * or preemptable semantics.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 *
20 * Copyright Red Hat, 2009
21 * Copyright IBM Corporation, 2009
22 *
23 * Author: Ingo Molnar <mingo@elte.hu>
24 * Paul E. McKenney <paulmck@linux.vnet.ibm.com>
25 */
26
27
28#ifdef CONFIG_TREE_PREEMPT_RCU
29
30struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
31DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
32
33/*
34 * Tell them what RCU they are running.
35 */
36static inline void rcu_bootup_announce(void)
37{
38 printk(KERN_INFO
39 "Experimental preemptable hierarchical RCU implementation.\n");
40}
41
42/*
43 * Return the number of RCU-preempt batches processed thus far
44 * for debug and statistics.
45 */
46long rcu_batches_completed_preempt(void)
47{
48 return rcu_preempt_state.completed;
49}
50EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt);
51
52/*
53 * Return the number of RCU batches processed thus far for debug & stats.
54 */
55long rcu_batches_completed(void)
56{
57 return rcu_batches_completed_preempt();
58}
59EXPORT_SYMBOL_GPL(rcu_batches_completed);
60
61/*
62 * Record a preemptable-RCU quiescent state for the specified CPU. Note
63 * that this just means that the task currently running on the CPU is
64 * not in a quiescent state. There might be any number of tasks blocked
65 * while in an RCU read-side critical section.
66 */
67static void rcu_preempt_qs(int cpu)
68{
69 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
70 rdp->passed_quiesc_completed = rdp->completed;
71 barrier();
72 rdp->passed_quiesc = 1;
73}
74
75/*
76 * We have entered the scheduler, and the current task might soon be
77 * context-switched away from. If this task is in an RCU read-side
78 * critical section, we will no longer be able to rely on the CPU to
79 * record that fact, so we enqueue the task on the appropriate entry
80 * of the blocked_tasks[] array. The task will dequeue itself when
81 * it exits the outermost enclosing RCU read-side critical section.
82 * Therefore, the current grace period cannot be permitted to complete
83 * until the blocked_tasks[] entry indexed by the low-order bit of
84 * rnp->gpnum empties.
85 *
86 * Caller must disable preemption.
87 */
88static void rcu_preempt_note_context_switch(int cpu)
89{
90 struct task_struct *t = current;
91 unsigned long flags;
92 int phase;
93 struct rcu_data *rdp;
94 struct rcu_node *rnp;
95
96 if (t->rcu_read_lock_nesting &&
97 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
98
99 /* Possibly blocking in an RCU read-side critical section. */
100 rdp = rcu_preempt_state.rda[cpu];
101 rnp = rdp->mynode;
102 spin_lock_irqsave(&rnp->lock, flags);
103 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
104 t->rcu_blocked_node = rnp;
105
106 /*
107 * If this CPU has already checked in, then this task
108 * will hold up the next grace period rather than the
109 * current grace period. Queue the task accordingly.
110 * If the task is queued for the current grace period
111 * (i.e., this CPU has not yet passed through a quiescent
112 * state for the current grace period), then as long
113 * as that task remains queued, the current grace period
114 * cannot end.
115 *
116 * But first, note that the current CPU must still be
117 * on line!
118 */
119 WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0);
120 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
121 phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1;
122 list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]);
123 spin_unlock_irqrestore(&rnp->lock, flags);
124 }
125
126 /*
127 * Either we were not in an RCU read-side critical section to
128 * begin with, or we have now recorded that critical section
129 * globally. Either way, we can now note a quiescent state
130 * for this CPU. Again, if we were in an RCU read-side critical
131 * section, and if that critical section was blocking the current
132 * grace period, then the fact that the task has been enqueued
133 * means that we continue to block the current grace period.
134 */
135 rcu_preempt_qs(cpu);
136 local_irq_save(flags);
137 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
138 local_irq_restore(flags);
139}
140
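
/*
 * Illustrative sketch (not from the patch above): the blocked_tasks[]
 * index ("phase") computed in rcu_preempt_note_context_switch(), reduced
 * to a stand-alone user-space function. If the CPU still owes a
 * quiescent state for the current grace period (its bit is set in
 * ->qsmask), the task is queued on the list indexed by gpnum & 0x1 and
 * blocks the current grace period; otherwise it goes on the other list
 * and can only delay the next one. All names below are illustrative,
 * not kernel symbols.
 */
#include <stdio.h>

static int blocked_list_index(unsigned long gpnum,
			      unsigned long qsmask, unsigned long grpmask)
{
	/* Same expression as in rcu_preempt_note_context_switch(). */
	return (int)((gpnum + !(qsmask & grpmask)) & 0x1);
}

int main(void)
{
	/* CPU has not yet checked in: task holds up the current GP (gpnum 4). */
	printf("%d\n", blocked_list_index(4, 0x3, 0x1));	/* prints 0 */
	/* CPU already checked in: task can only delay the next GP. */
	printf("%d\n", blocked_list_index(4, 0x2, 0x1));	/* prints 1 */
	return 0;
}
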
141/*
142 * Tree-preemptable RCU implementation for rcu_read_lock().
143 * Just increment ->rcu_read_lock_nesting, shared state will be updated
144 * if we block.
145 */
146void __rcu_read_lock(void)
147{
148 ACCESS_ONCE(current->rcu_read_lock_nesting)++;
149 barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */
150}
151EXPORT_SYMBOL_GPL(__rcu_read_lock);
152
153static void rcu_read_unlock_special(struct task_struct *t)
154{
155 int empty;
156 unsigned long flags;
157 unsigned long mask;
158 struct rcu_node *rnp;
159 int special;
160
161 /* NMI handlers cannot block and cannot safely manipulate state. */
162 if (in_nmi())
163 return;
164
165 local_irq_save(flags);
166
167 /*
168 * If RCU core is waiting for this CPU to exit critical section,
169 * let it know that we have done so.
170 */
171 special = t->rcu_read_unlock_special;
172 if (special & RCU_READ_UNLOCK_NEED_QS) {
173 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
174 rcu_preempt_qs(smp_processor_id());
175 }
176
177 /* Hardware IRQ handlers cannot block. */
178 if (in_irq()) {
179 local_irq_restore(flags);
180 return;
181 }
182
183 /* Clean up if blocked during RCU read-side critical section. */
184 if (special & RCU_READ_UNLOCK_BLOCKED) {
185 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED;
186
187 /*
188 * Remove this task from the list it blocked on. The
189 * task can migrate while we acquire the lock, but at
190 * most one time. So at most two passes through loop.
191 */
192 for (;;) {
193 rnp = t->rcu_blocked_node;
194 spin_lock(&rnp->lock); /* irqs already disabled. */
195 if (rnp == t->rcu_blocked_node)
196 break;
197 spin_unlock(&rnp->lock); /* irqs remain disabled. */
198 }
199 empty = list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]);
200 list_del_init(&t->rcu_node_entry);
201 t->rcu_blocked_node = NULL;
202
203 /*
204 * If this was the last task on the current list, and if
205 * we aren't waiting on any CPUs, report the quiescent state.
206 * Note that both cpu_quiet_msk_finish() and cpu_quiet_msk()
207 * drop rnp->lock and restore irq.
208 */
209 if (!empty && rnp->qsmask == 0 &&
210 list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1])) {
211 struct rcu_node *rnp_p;
212
213 if (rnp->parent == NULL) {
214 /* Only one rcu_node in the tree. */
215 cpu_quiet_msk_finish(&rcu_preempt_state, flags);
216 return;
217 }
218 /* Report up the rest of the hierarchy. */
219 mask = rnp->grpmask;
220 spin_unlock_irqrestore(&rnp->lock, flags);
221 rnp_p = rnp->parent;
222 spin_lock_irqsave(&rnp_p->lock, flags);
223 WARN_ON_ONCE(rnp->qsmask);
224 cpu_quiet_msk(mask, &rcu_preempt_state, rnp_p, flags);
225 return;
226 }
227 spin_unlock(&rnp->lock);
228 }
229 local_irq_restore(flags);
230}
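
/*
 * Illustrative sketch (not from the patch above): the lock-and-recheck
 * pattern used by rcu_read_unlock_special() to pin down a node whose
 * pointer may change while its lock is being acquired, shown here with
 * pthreads. The kernel version needs at most two passes because a
 * blocked task can be migrated at most once; the generic form below
 * simply loops until the pointer is stable. All names are illustrative.
 */
#include <pthread.h>

struct node {
	pthread_mutex_t lock;
};

/* Return the node currently pointed to by *nodep, with its lock held. */
static struct node *lock_current_node(struct node *volatile *nodep)
{
	struct node *n;

	for (;;) {
		n = *nodep;
		pthread_mutex_lock(&n->lock);
		if (n == *nodep)
			return n;			/* pointer stable */
		pthread_mutex_unlock(&n->lock);		/* raced, retry */
	}
}
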
231
232/*
233 * Tree-preemptable RCU implementation for rcu_read_unlock().
234 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
235 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
236 * invoke rcu_read_unlock_special() to clean up after a context switch
237 * in an RCU read-side critical section and other special cases.
238 */
239void __rcu_read_unlock(void)
240{
241 struct task_struct *t = current;
242
243 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */
244 if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 &&
245 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
246 rcu_read_unlock_special(t);
247}
248EXPORT_SYMBOL_GPL(__rcu_read_unlock);
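
/*
 * Illustrative sketch (not from the patch above): the nesting-counter
 * scheme behind __rcu_read_lock()/__rcu_read_unlock(), reduced to plain
 * C. The lock side only increments a per-task counter; only the
 * outermost unlock, and only when "special" work is pending (e.g. the
 * task blocked inside the critical section), takes a slow path. Names
 * are illustrative, not kernel symbols.
 */
static int nesting;		/* per-task field in the kernel */
static int special;		/* e.g. "blocked while in critical section" */

static void toy_read_lock(void)
{
	nesting++;
}

static void toy_read_unlock(void (*slowpath)(void))
{
	if (--nesting == 0 && special)
		slowpath();	/* dequeue from blocked_tasks[], report QS */
}
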
249
250#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
251
252/*
253 * Scan the current list of tasks blocked within RCU read-side critical
254 * sections, printing out the tid of each.
255 */
256static void rcu_print_task_stall(struct rcu_node *rnp)
257{
258 unsigned long flags;
259 struct list_head *lp;
260 int phase = rnp->gpnum & 0x1;
261 struct task_struct *t;
262
263 if (!list_empty(&rnp->blocked_tasks[phase])) {
264 spin_lock_irqsave(&rnp->lock, flags);
265 phase = rnp->gpnum & 0x1; /* re-read under lock. */
266 lp = &rnp->blocked_tasks[phase];
267 list_for_each_entry(t, lp, rcu_node_entry)
268 printk(" P%d", t->pid);
269 spin_unlock_irqrestore(&rnp->lock, flags);
270 }
271}
272
273#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
274
275/*
276 * Check that the list of blocked tasks for the newly completed grace
277 * period is in fact empty. It is a serious bug to complete a grace
278 * period that still has RCU readers blocked! This function must be
279 * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock
280 * must be held by the caller.
281 */
282static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
283{
284 WARN_ON_ONCE(!list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]));
285 WARN_ON_ONCE(rnp->qsmask);
286}
287
288/*
289 * Check for preempted RCU readers for the specified rcu_node structure.
290 * If the caller needs a reliable answer, it must hold the rcu_node's
291 * ->lock.
292 */
293static int rcu_preempted_readers(struct rcu_node *rnp)
294{
295 return !list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]);
296}
297
298#ifdef CONFIG_HOTPLUG_CPU
299
300/*
301 * Handle tasklist migration for case in which all CPUs covered by the
302 * specified rcu_node have gone offline. Move them up to the root
303 * rcu_node. The reason for not just moving them to the immediate
304 * parent is to remove the need for rcu_read_unlock_special() to
305 * make more than two attempts to acquire the target rcu_node's lock.
306 *
307 * The caller must hold rnp->lock with irqs disabled.
308 */
309static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
310 struct rcu_node *rnp,
311 struct rcu_data *rdp)
312{
313 int i;
314 struct list_head *lp;
315 struct list_head *lp_root;
316 struct rcu_node *rnp_root = rcu_get_root(rsp);
317 struct task_struct *tp;
318
319 if (rnp == rnp_root) {
320 WARN_ONCE(1, "Last CPU thought to be offlined?");
321 return; /* Shouldn't happen: at least one CPU online. */
322 }
323 WARN_ON_ONCE(rnp != rdp->mynode &&
324 (!list_empty(&rnp->blocked_tasks[0]) ||
325 !list_empty(&rnp->blocked_tasks[1])));
326
327 /*
328 * Move tasks up to root rcu_node. Rely on the fact that the
329 * root rcu_node can be at most one ahead of the rest of the
330 * rcu_nodes in terms of gp_num value. This fact allows us to
331 * move the blocked_tasks[] array directly, element by element.
332 */
333 for (i = 0; i < 2; i++) {
334 lp = &rnp->blocked_tasks[i];
335 lp_root = &rnp_root->blocked_tasks[i];
336 while (!list_empty(lp)) {
337 tp = list_entry(lp->next, typeof(*tp), rcu_node_entry);
338 spin_lock(&rnp_root->lock); /* irqs already disabled */
339 list_del(&tp->rcu_node_entry);
340 tp->rcu_blocked_node = rnp_root;
341 list_add(&tp->rcu_node_entry, lp_root);
342 spin_unlock(&rnp_root->lock); /* irqs remain disabled */
343 }
344 }
345}
346
347/*
348 * Do CPU-offline processing for preemptable RCU.
349 */
350static void rcu_preempt_offline_cpu(int cpu)
351{
352 __rcu_offline_cpu(cpu, &rcu_preempt_state);
353}
354
355#endif /* #ifdef CONFIG_HOTPLUG_CPU */
356
357/*
358 * Check for a quiescent state from the current CPU. When a task blocks,
359 * the task is recorded in the corresponding CPU's rcu_node structure,
360 * which is checked elsewhere.
361 *
362 * Caller must disable hard irqs.
363 */
364static void rcu_preempt_check_callbacks(int cpu)
365{
366 struct task_struct *t = current;
367
368 if (t->rcu_read_lock_nesting == 0) {
369 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
370 rcu_preempt_qs(cpu);
371 return;
372 }
373 if (per_cpu(rcu_preempt_data, cpu).qs_pending)
374 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
375}
376
377/*
378 * Process callbacks for preemptable RCU.
379 */
380static void rcu_preempt_process_callbacks(void)
381{
382 __rcu_process_callbacks(&rcu_preempt_state,
383 &__get_cpu_var(rcu_preempt_data));
384}
385
386/*
387 * Queue a preemptable-RCU callback for invocation after a grace period.
388 */
389void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
390{
391 __call_rcu(head, func, &rcu_preempt_state);
392}
393EXPORT_SYMBOL_GPL(call_rcu);
394
395/*
396 * Check to see if there is any immediate preemptable-RCU-related work
397 * to be done.
398 */
399static int rcu_preempt_pending(int cpu)
400{
401 return __rcu_pending(&rcu_preempt_state,
402 &per_cpu(rcu_preempt_data, cpu));
403}
404
405/*
406 * Does preemptable RCU need the CPU to stay out of dynticks mode?
407 */
408static int rcu_preempt_needs_cpu(int cpu)
409{
410 return !!per_cpu(rcu_preempt_data, cpu).nxtlist;
411}
412
413/*
414 * Initialize preemptable RCU's per-CPU data.
415 */
416static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
417{
418 rcu_init_percpu_data(cpu, &rcu_preempt_state, 1);
419}
420
421/*
422 * Check for a task exiting while in a preemptable-RCU read-side
423 * critical section, clean up if so. No need to issue warnings,
424 * as debug_check_no_locks_held() already does this if lockdep
425 * is enabled.
426 */
427void exit_rcu(void)
428{
429 struct task_struct *t = current;
430
431 if (t->rcu_read_lock_nesting == 0)
432 return;
433 t->rcu_read_lock_nesting = 1;
434 rcu_read_unlock();
435}
436
437#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
438
439/*
440 * Tell them what RCU they are running.
441 */
442static inline void rcu_bootup_announce(void)
443{
444 printk(KERN_INFO "Hierarchical RCU implementation.\n");
445}
446
447/*
448 * Return the number of RCU batches processed thus far for debug & stats.
449 */
450long rcu_batches_completed(void)
451{
452 return rcu_batches_completed_sched();
453}
454EXPORT_SYMBOL_GPL(rcu_batches_completed);
455
456/*
457 * Because preemptable RCU does not exist, we never have to check for
458 * CPUs being in quiescent states.
459 */
460static void rcu_preempt_note_context_switch(int cpu)
461{
462}
463
464#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
465
466/*
467 * Because preemptable RCU does not exist, we never have to check for
468 * tasks blocked within RCU read-side critical sections.
469 */
470static void rcu_print_task_stall(struct rcu_node *rnp)
471{
472}
473
474#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
475
476/*
477 * Because there is no preemptable RCU, there can be no readers blocked,
478 * so there is no need to check for blocked tasks. So check only for
479 * bogus qsmask values.
480 */
481static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
482{
483 WARN_ON_ONCE(rnp->qsmask);
484}
485
486/*
487 * Because preemptable RCU does not exist, there are never any preempted
488 * RCU readers.
489 */
490static int rcu_preempted_readers(struct rcu_node *rnp)
491{
492 return 0;
493}
494
495#ifdef CONFIG_HOTPLUG_CPU
496
497/*
498 * Because preemptable RCU does not exist, it never needs to migrate
499 * tasks that were blocked within RCU read-side critical sections.
500 */
501static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
502 struct rcu_node *rnp,
503 struct rcu_data *rdp)
504{
505}
506
507/*
508 * Because preemptable RCU does not exist, it never needs CPU-offline
509 * processing.
510 */
511static void rcu_preempt_offline_cpu(int cpu)
512{
513}
514
515#endif /* #ifdef CONFIG_HOTPLUG_CPU */
516
517/*
518 * Because preemptable RCU does not exist, it never has any callbacks
519 * to check.
520 */
521void rcu_preempt_check_callbacks(int cpu)
522{
523}
524
525/*
526 * Because preemptable RCU does not exist, it never has any callbacks
527 * to process.
528 */
529void rcu_preempt_process_callbacks(void)
530{
531}
532
533/*
534 * In classic RCU, call_rcu() is just call_rcu_sched().
535 */
536void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
537{
538 call_rcu_sched(head, func);
539}
540EXPORT_SYMBOL_GPL(call_rcu);
541
542/*
543 * Because preemptable RCU does not exist, it never has any work to do.
544 */
545static int rcu_preempt_pending(int cpu)
546{
547 return 0;
548}
549
550/*
551 * Because preemptable RCU does not exist, it never needs any CPU.
552 */
553static int rcu_preempt_needs_cpu(int cpu)
554{
555 return 0;
556}
557
558/*
559 * Because preemptable RCU does not exist, there is no per-CPU
560 * data to initialize.
561 */
562static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
563{
564}
565
566#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index fe1dcdbf1ca3..c89f5e9fd173 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -20,7 +20,7 @@
20 * Papers: http://www.rdrop.com/users/paulmck/RCU 20 * Papers: http://www.rdrop.com/users/paulmck/RCU
21 * 21 *
22 * For detailed explanation of Read-Copy Update mechanism see - 22 * For detailed explanation of Read-Copy Update mechanism see -
23 * Documentation/RCU 23 * Documentation/RCU
24 * 24 *
25 */ 25 */
26#include <linux/types.h> 26#include <linux/types.h>
@@ -43,6 +43,7 @@
43#include <linux/debugfs.h> 43#include <linux/debugfs.h>
44#include <linux/seq_file.h> 44#include <linux/seq_file.h>
45 45
46#define RCU_TREE_NONCORE
46#include "rcutree.h" 47#include "rcutree.h"
47 48
48static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) 49static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
@@ -76,8 +77,12 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
76 77
77static int show_rcudata(struct seq_file *m, void *unused) 78static int show_rcudata(struct seq_file *m, void *unused)
78{ 79{
79 seq_puts(m, "rcu:\n"); 80#ifdef CONFIG_TREE_PREEMPT_RCU
80 PRINT_RCU_DATA(rcu_data, print_one_rcu_data, m); 81 seq_puts(m, "rcu_preempt:\n");
82 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data, m);
83#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
84 seq_puts(m, "rcu_sched:\n");
85 PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data, m);
81 seq_puts(m, "rcu_bh:\n"); 86 seq_puts(m, "rcu_bh:\n");
82 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m); 87 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m);
83 return 0; 88 return 0;
@@ -102,7 +107,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
102 return; 107 return;
103 seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d", 108 seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d",
104 rdp->cpu, 109 rdp->cpu,
105 cpu_is_offline(rdp->cpu) ? "\"Y\"" : "\"N\"", 110 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"",
106 rdp->completed, rdp->gpnum, 111 rdp->completed, rdp->gpnum,
107 rdp->passed_quiesc, rdp->passed_quiesc_completed, 112 rdp->passed_quiesc, rdp->passed_quiesc_completed,
108 rdp->qs_pending); 113 rdp->qs_pending);
@@ -124,8 +129,12 @@ static int show_rcudata_csv(struct seq_file *m, void *unused)
124 seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); 129 seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\",");
125#endif /* #ifdef CONFIG_NO_HZ */ 130#endif /* #ifdef CONFIG_NO_HZ */
126 seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n"); 131 seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n");
127 seq_puts(m, "\"rcu:\"\n"); 132#ifdef CONFIG_TREE_PREEMPT_RCU
128 PRINT_RCU_DATA(rcu_data, print_one_rcu_data_csv, m); 133 seq_puts(m, "\"rcu_preempt:\"\n");
134 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m);
135#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
136 seq_puts(m, "\"rcu_sched:\"\n");
137 PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data_csv, m);
129 seq_puts(m, "\"rcu_bh:\"\n"); 138 seq_puts(m, "\"rcu_bh:\"\n");
130 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m); 139 PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m);
131 return 0; 140 return 0;
@@ -171,8 +180,12 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
171 180
172static int show_rcuhier(struct seq_file *m, void *unused) 181static int show_rcuhier(struct seq_file *m, void *unused)
173{ 182{
174 seq_puts(m, "rcu:\n"); 183#ifdef CONFIG_TREE_PREEMPT_RCU
175 print_one_rcu_state(m, &rcu_state); 184 seq_puts(m, "rcu_preempt:\n");
185 print_one_rcu_state(m, &rcu_preempt_state);
186#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
187 seq_puts(m, "rcu_sched:\n");
188 print_one_rcu_state(m, &rcu_sched_state);
176 seq_puts(m, "rcu_bh:\n"); 189 seq_puts(m, "rcu_bh:\n");
177 print_one_rcu_state(m, &rcu_bh_state); 190 print_one_rcu_state(m, &rcu_bh_state);
178 return 0; 191 return 0;
@@ -193,8 +206,12 @@ static struct file_operations rcuhier_fops = {
193 206
194static int show_rcugp(struct seq_file *m, void *unused) 207static int show_rcugp(struct seq_file *m, void *unused)
195{ 208{
196 seq_printf(m, "rcu: completed=%ld gpnum=%ld\n", 209#ifdef CONFIG_TREE_PREEMPT_RCU
197 rcu_state.completed, rcu_state.gpnum); 210 seq_printf(m, "rcu_preempt: completed=%ld gpnum=%ld\n",
211 rcu_preempt_state.completed, rcu_preempt_state.gpnum);
212#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
213 seq_printf(m, "rcu_sched: completed=%ld gpnum=%ld\n",
214 rcu_sched_state.completed, rcu_sched_state.gpnum);
198 seq_printf(m, "rcu_bh: completed=%ld gpnum=%ld\n", 215 seq_printf(m, "rcu_bh: completed=%ld gpnum=%ld\n",
199 rcu_bh_state.completed, rcu_bh_state.gpnum); 216 rcu_bh_state.completed, rcu_bh_state.gpnum);
200 return 0; 217 return 0;
@@ -243,8 +260,12 @@ static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp)
243 260
244static int show_rcu_pending(struct seq_file *m, void *unused) 261static int show_rcu_pending(struct seq_file *m, void *unused)
245{ 262{
246 seq_puts(m, "rcu:\n"); 263#ifdef CONFIG_TREE_PREEMPT_RCU
247 print_rcu_pendings(m, &rcu_state); 264 seq_puts(m, "rcu_preempt:\n");
265 print_rcu_pendings(m, &rcu_preempt_state);
266#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
267 seq_puts(m, "rcu_sched:\n");
268 print_rcu_pendings(m, &rcu_sched_state);
248 seq_puts(m, "rcu_bh:\n"); 269 seq_puts(m, "rcu_bh:\n");
249 print_rcu_pendings(m, &rcu_bh_state); 270 print_rcu_pendings(m, &rcu_bh_state);
250 return 0; 271 return 0;
@@ -264,62 +285,47 @@ static struct file_operations rcu_pending_fops = {
264}; 285};
265 286
266static struct dentry *rcudir; 287static struct dentry *rcudir;
267static struct dentry *datadir;
268static struct dentry *datadir_csv;
269static struct dentry *gpdir;
270static struct dentry *hierdir;
271static struct dentry *rcu_pendingdir;
272 288
273static int __init rcuclassic_trace_init(void) 289static int __init rcuclassic_trace_init(void)
274{ 290{
291 struct dentry *retval;
292
275 rcudir = debugfs_create_dir("rcu", NULL); 293 rcudir = debugfs_create_dir("rcu", NULL);
276 if (!rcudir) 294 if (!rcudir)
277 goto out; 295 goto free_out;
278 296
279 datadir = debugfs_create_file("rcudata", 0444, rcudir, 297 retval = debugfs_create_file("rcudata", 0444, rcudir,
280 NULL, &rcudata_fops); 298 NULL, &rcudata_fops);
281 if (!datadir) 299 if (!retval)
282 goto free_out; 300 goto free_out;
283 301
284 datadir_csv = debugfs_create_file("rcudata.csv", 0444, rcudir, 302 retval = debugfs_create_file("rcudata.csv", 0444, rcudir,
285 NULL, &rcudata_csv_fops); 303 NULL, &rcudata_csv_fops);
286 if (!datadir_csv) 304 if (!retval)
287 goto free_out; 305 goto free_out;
288 306
289 gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); 307 retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
290 if (!gpdir) 308 if (!retval)
291 goto free_out; 309 goto free_out;
292 310
293 hierdir = debugfs_create_file("rcuhier", 0444, rcudir, 311 retval = debugfs_create_file("rcuhier", 0444, rcudir,
294 NULL, &rcuhier_fops); 312 NULL, &rcuhier_fops);
295 if (!hierdir) 313 if (!retval)
296 goto free_out; 314 goto free_out;
297 315
298 rcu_pendingdir = debugfs_create_file("rcu_pending", 0444, rcudir, 316 retval = debugfs_create_file("rcu_pending", 0444, rcudir,
299 NULL, &rcu_pending_fops); 317 NULL, &rcu_pending_fops);
300 if (!rcu_pendingdir) 318 if (!retval)
301 goto free_out; 319 goto free_out;
302 return 0; 320 return 0;
303free_out: 321free_out:
304 if (datadir) 322 debugfs_remove_recursive(rcudir);
305 debugfs_remove(datadir);
306 if (datadir_csv)
307 debugfs_remove(datadir_csv);
308 if (gpdir)
309 debugfs_remove(gpdir);
310 debugfs_remove(rcudir);
311out:
312 return 1; 323 return 1;
313} 324}
314 325
315static void __exit rcuclassic_trace_cleanup(void) 326static void __exit rcuclassic_trace_cleanup(void)
316{ 327{
317 debugfs_remove(datadir); 328 debugfs_remove_recursive(rcudir);
318 debugfs_remove(datadir_csv);
319 debugfs_remove(gpdir);
320 debugfs_remove(hierdir);
321 debugfs_remove(rcu_pendingdir);
322 debugfs_remove(rcudir);
323} 329}
324 330
325 331
diff --git a/kernel/resource.c b/kernel/resource.c
index ac5f3a36923f..78b087221c15 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -787,7 +787,7 @@ static int __init reserve_setup(char *str)
787 static struct resource reserve[MAXRESERVE]; 787 static struct resource reserve[MAXRESERVE];
788 788
789 for (;;) { 789 for (;;) {
790 int io_start, io_num; 790 unsigned int io_start, io_num;
791 int x = reserved; 791 int x = reserved;
792 792
793 if (get_option (&str, &io_start) != 2) 793 if (get_option (&str, &io_start) != 2)
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index fcd107a78c5a..29bd4baf9e75 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -1039,16 +1039,14 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1039 if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) { 1039 if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) {
1040 /* We got the lock for task. */ 1040 /* We got the lock for task. */
1041 debug_rt_mutex_lock(lock); 1041 debug_rt_mutex_lock(lock);
1042
1043 rt_mutex_set_owner(lock, task, 0); 1042 rt_mutex_set_owner(lock, task, 0);
1044 1043 spin_unlock(&lock->wait_lock);
1045 rt_mutex_deadlock_account_lock(lock, task); 1044 rt_mutex_deadlock_account_lock(lock, task);
1046 return 1; 1045 return 1;
1047 } 1046 }
1048 1047
1049 ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); 1048 ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock);
1050 1049
1051
1052 if (ret && !waiter->task) { 1050 if (ret && !waiter->task) {
1053 /* 1051 /*
1054 * Reset the return value. We might have 1052 * Reset the return value. We might have
diff --git a/kernel/sched.c b/kernel/sched.c
index 7c9098d186e6..0ac9053c21d6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -39,7 +39,7 @@
39#include <linux/completion.h> 39#include <linux/completion.h>
40#include <linux/kernel_stat.h> 40#include <linux/kernel_stat.h>
41#include <linux/debug_locks.h> 41#include <linux/debug_locks.h>
42#include <linux/perf_counter.h> 42#include <linux/perf_event.h>
43#include <linux/security.h> 43#include <linux/security.h>
44#include <linux/notifier.h> 44#include <linux/notifier.h>
45#include <linux/profile.h> 45#include <linux/profile.h>
@@ -64,7 +64,6 @@
64#include <linux/tsacct_kern.h> 64#include <linux/tsacct_kern.h>
65#include <linux/kprobes.h> 65#include <linux/kprobes.h>
66#include <linux/delayacct.h> 66#include <linux/delayacct.h>
67#include <linux/reciprocal_div.h>
68#include <linux/unistd.h> 67#include <linux/unistd.h>
69#include <linux/pagemap.h> 68#include <linux/pagemap.h>
70#include <linux/hrtimer.h> 69#include <linux/hrtimer.h>
@@ -120,30 +119,6 @@
120 */ 119 */
121#define RUNTIME_INF ((u64)~0ULL) 120#define RUNTIME_INF ((u64)~0ULL)
122 121
123#ifdef CONFIG_SMP
124
125static void double_rq_lock(struct rq *rq1, struct rq *rq2);
126
127/*
128 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
129 * Since cpu_power is a 'constant', we can use a reciprocal divide.
130 */
131static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
132{
133 return reciprocal_divide(load, sg->reciprocal_cpu_power);
134}
135
136/*
137 * Each time a sched group cpu_power is changed,
138 * we must compute its reciprocal value
139 */
140static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
141{
142 sg->__cpu_power += val;
143 sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
144}
145#endif
146
147static inline int rt_policy(int policy) 122static inline int rt_policy(int policy)
148{ 123{
149 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) 124 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
@@ -309,8 +284,8 @@ void set_tg_uid(struct user_struct *user)
309 284
310/* 285/*
311 * Root task group. 286 * Root task group.
312 * Every UID task group (including init_task_group aka UID-0) will 287 * Every UID task group (including init_task_group aka UID-0) will
313 * be a child to this group. 288 * be a child to this group.
314 */ 289 */
315struct task_group root_task_group; 290struct task_group root_task_group;
316 291
@@ -318,12 +293,12 @@ struct task_group root_task_group;
318/* Default task group's sched entity on each cpu */ 293/* Default task group's sched entity on each cpu */
319static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); 294static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
320/* Default task group's cfs_rq on each cpu */ 295/* Default task group's cfs_rq on each cpu */
321static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; 296static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
322#endif /* CONFIG_FAIR_GROUP_SCHED */ 297#endif /* CONFIG_FAIR_GROUP_SCHED */
323 298
324#ifdef CONFIG_RT_GROUP_SCHED 299#ifdef CONFIG_RT_GROUP_SCHED
325static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); 300static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
326static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; 301static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq);
327#endif /* CONFIG_RT_GROUP_SCHED */ 302#endif /* CONFIG_RT_GROUP_SCHED */
328#else /* !CONFIG_USER_SCHED */ 303#else /* !CONFIG_USER_SCHED */
329#define root_task_group init_task_group 304#define root_task_group init_task_group
@@ -401,13 +376,6 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
401 376
402#else 377#else
403 378
404#ifdef CONFIG_SMP
405static int root_task_group_empty(void)
406{
407 return 1;
408}
409#endif
410
411static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } 379static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
412static inline struct task_group *task_group(struct task_struct *p) 380static inline struct task_group *task_group(struct task_struct *p)
413{ 381{
@@ -493,6 +461,7 @@ struct rt_rq {
493#endif 461#endif
494#ifdef CONFIG_SMP 462#ifdef CONFIG_SMP
495 unsigned long rt_nr_migratory; 463 unsigned long rt_nr_migratory;
464 unsigned long rt_nr_total;
496 int overloaded; 465 int overloaded;
497 struct plist_head pushable_tasks; 466 struct plist_head pushable_tasks;
498#endif 467#endif
@@ -536,14 +505,6 @@ struct root_domain {
536#ifdef CONFIG_SMP 505#ifdef CONFIG_SMP
537 struct cpupri cpupri; 506 struct cpupri cpupri;
538#endif 507#endif
539#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
540 /*
541 * Preferred wake up cpu nominated by sched_mc balance that will be
542 * used when most cpus are idle in the system indicating overall very
543 * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2)
544 */
545 unsigned int sched_mc_preferred_wakeup_cpu;
546#endif
547}; 508};
548 509
549/* 510/*
@@ -615,6 +576,7 @@ struct rq {
615 576
616 unsigned char idle_at_tick; 577 unsigned char idle_at_tick;
617 /* For active balancing */ 578 /* For active balancing */
579 int post_schedule;
618 int active_balance; 580 int active_balance;
619 int push_cpu; 581 int push_cpu;
620 /* cpu of this runqueue: */ 582 /* cpu of this runqueue: */
@@ -625,6 +587,9 @@ struct rq {
625 587
626 struct task_struct *migration_thread; 588 struct task_struct *migration_thread;
627 struct list_head migration_queue; 589 struct list_head migration_queue;
590
591 u64 rt_avg;
592 u64 age_stamp;
628#endif 593#endif
629 594
630 /* calc_load related fields */ 595 /* calc_load related fields */
@@ -664,9 +629,10 @@ struct rq {
664 629
665static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 630static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
666 631
667static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) 632static inline
633void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
668{ 634{
669 rq->curr->sched_class->check_preempt_curr(rq, p, sync); 635 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
670} 636}
671 637
672static inline int cpu_of(struct rq *rq) 638static inline int cpu_of(struct rq *rq)
@@ -692,6 +658,7 @@ static inline int cpu_of(struct rq *rq)
692#define this_rq() (&__get_cpu_var(runqueues)) 658#define this_rq() (&__get_cpu_var(runqueues))
693#define task_rq(p) cpu_rq(task_cpu(p)) 659#define task_rq(p) cpu_rq(task_cpu(p))
694#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 660#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
661#define raw_rq() (&__raw_get_cpu_var(runqueues))
695 662
696inline void update_rq_clock(struct rq *rq) 663inline void update_rq_clock(struct rq *rq)
697{ 664{
@@ -714,15 +681,9 @@ inline void update_rq_clock(struct rq *rq)
714 * This interface allows printk to be called with the runqueue lock 681 * This interface allows printk to be called with the runqueue lock
715 * held and know whether or not it is OK to wake up the klogd. 682 * held and know whether or not it is OK to wake up the klogd.
716 */ 683 */
717int runqueue_is_locked(void) 684int runqueue_is_locked(int cpu)
718{ 685{
719 int cpu = get_cpu(); 686 return spin_is_locked(&cpu_rq(cpu)->lock);
720 struct rq *rq = cpu_rq(cpu);
721 int ret;
722
723 ret = spin_is_locked(&rq->lock);
724 put_cpu();
725 return ret;
726} 687}
727 688
728/* 689/*
@@ -860,6 +821,14 @@ unsigned int sysctl_sched_shares_ratelimit = 250000;
860unsigned int sysctl_sched_shares_thresh = 4; 821unsigned int sysctl_sched_shares_thresh = 4;
861 822
862/* 823/*
824 * period over which we average the RT time consumption, measured
825 * in ms.
826 *
827 * default: 1s
828 */
829const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
830
831/*
863 * period over which we measure -rt task cpu usage in us. 832 * period over which we measure -rt task cpu usage in us.
864 * default: 1s 833 * default: 1s
865 */ 834 */
@@ -1277,12 +1246,37 @@ void wake_up_idle_cpu(int cpu)
1277} 1246}
1278#endif /* CONFIG_NO_HZ */ 1247#endif /* CONFIG_NO_HZ */
1279 1248
1249static u64 sched_avg_period(void)
1250{
1251 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
1252}
1253
1254static void sched_avg_update(struct rq *rq)
1255{
1256 s64 period = sched_avg_period();
1257
1258 while ((s64)(rq->clock - rq->age_stamp) > period) {
1259 rq->age_stamp += period;
1260 rq->rt_avg /= 2;
1261 }
1262}
1263
1264static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1265{
1266 rq->rt_avg += rt_delta;
1267 sched_avg_update(rq);
1268}
1269
1280#else /* !CONFIG_SMP */ 1270#else /* !CONFIG_SMP */
1281static void resched_task(struct task_struct *p) 1271static void resched_task(struct task_struct *p)
1282{ 1272{
1283 assert_spin_locked(&task_rq(p)->lock); 1273 assert_spin_locked(&task_rq(p)->lock);
1284 set_tsk_need_resched(p); 1274 set_tsk_need_resched(p);
1285} 1275}
1276
1277static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1278{
1279}
1286#endif /* CONFIG_SMP */ 1280#endif /* CONFIG_SMP */
1287 1281
1288#if BITS_PER_LONG == 32 1282#if BITS_PER_LONG == 32
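
/*
 * Illustrative sketch (not from the patch above): the geometric decay
 * performed by sched_avg_update() in the hunk above, in user-space form.
 * rt_avg is halved once per elapsed half-period, so old RT runtime fades
 * away while recent runtime dominates. Names and values are illustrative.
 */
#include <stdio.h>
#include <stdint.h>

#define PERIOD_NS (500 * 1000 * 1000ULL)	/* 1s averaging period / 2 */

static uint64_t rt_avg, age_stamp;

static void toy_rt_avg_update(uint64_t now, uint64_t rt_delta)
{
	rt_avg += rt_delta;
	while (now - age_stamp > PERIOD_NS) {
		age_stamp += PERIOD_NS;
		rt_avg /= 2;
	}
}

int main(void)
{
	toy_rt_avg_update(100, 1000);			/* recent RT time */
	toy_rt_avg_update(3 * PERIOD_NS, 0);		/* ~1.5 periods later */
	printf("%llu\n", (unsigned long long)rt_avg);	/* decayed: 250 */
	return 0;
}
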
@@ -1493,8 +1487,65 @@ static int tg_nop(struct task_group *tg, void *data)
1493#endif 1487#endif
1494 1488
1495#ifdef CONFIG_SMP 1489#ifdef CONFIG_SMP
1496static unsigned long source_load(int cpu, int type); 1490/* Used instead of source_load when we know the type == 0 */
1497static unsigned long target_load(int cpu, int type); 1491static unsigned long weighted_cpuload(const int cpu)
1492{
1493 return cpu_rq(cpu)->load.weight;
1494}
1495
1496/*
1497 * Return a low guess at the load of a migration-source cpu weighted
1498 * according to the scheduling class and "nice" value.
1499 *
1500 * We want to under-estimate the load of migration sources, to
1501 * balance conservatively.
1502 */
1503static unsigned long source_load(int cpu, int type)
1504{
1505 struct rq *rq = cpu_rq(cpu);
1506 unsigned long total = weighted_cpuload(cpu);
1507
1508 if (type == 0 || !sched_feat(LB_BIAS))
1509 return total;
1510
1511 return min(rq->cpu_load[type-1], total);
1512}
1513
1514/*
1515 * Return a high guess at the load of a migration-target cpu weighted
1516 * according to the scheduling class and "nice" value.
1517 */
1518static unsigned long target_load(int cpu, int type)
1519{
1520 struct rq *rq = cpu_rq(cpu);
1521 unsigned long total = weighted_cpuload(cpu);
1522
1523 if (type == 0 || !sched_feat(LB_BIAS))
1524 return total;
1525
1526 return max(rq->cpu_load[type-1], total);
1527}
1528
1529static struct sched_group *group_of(int cpu)
1530{
1531 struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
1532
1533 if (!sd)
1534 return NULL;
1535
1536 return sd->groups;
1537}
1538
1539static unsigned long power_of(int cpu)
1540{
1541 struct sched_group *group = group_of(cpu);
1542
1543 if (!group)
1544 return SCHED_LOAD_SCALE;
1545
1546 return group->cpu_power;
1547}
1548
1498static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); 1549static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1499 1550
1500static unsigned long cpu_avg_load_per_task(int cpu) 1551static unsigned long cpu_avg_load_per_task(int cpu)
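
/*
 * Illustrative sketch (not from the patch above): why source_load() uses
 * min() and target_load() uses max(). Estimating the migration source
 * low and the target high makes the balancer conservative: load is moved
 * only if the imbalance survives the pessimistic estimate. Plain-C
 * helpers with illustrative names.
 */
static unsigned long toy_source_load(unsigned long instant, unsigned long hist)
{
	return instant < hist ? instant : hist;		/* low guess */
}

static unsigned long toy_target_load(unsigned long instant, unsigned long hist)
{
	return instant > hist ? instant : hist;		/* high guess */
}
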
@@ -1512,28 +1563,35 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1512 1563
1513#ifdef CONFIG_FAIR_GROUP_SCHED 1564#ifdef CONFIG_FAIR_GROUP_SCHED
1514 1565
1566struct update_shares_data {
1567 unsigned long rq_weight[NR_CPUS];
1568};
1569
1570static DEFINE_PER_CPU(struct update_shares_data, update_shares_data);
1571
1515static void __set_se_shares(struct sched_entity *se, unsigned long shares); 1572static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1516 1573
1517/* 1574/*
1518 * Calculate and set the cpu's group shares. 1575 * Calculate and set the cpu's group shares.
1519 */ 1576 */
1520static void 1577static void update_group_shares_cpu(struct task_group *tg, int cpu,
1521update_group_shares_cpu(struct task_group *tg, int cpu, 1578 unsigned long sd_shares,
1522 unsigned long sd_shares, unsigned long sd_rq_weight) 1579 unsigned long sd_rq_weight,
1580 struct update_shares_data *usd)
1523{ 1581{
1524 unsigned long shares; 1582 unsigned long shares, rq_weight;
1525 unsigned long rq_weight; 1583 int boost = 0;
1526 1584
1527 if (!tg->se[cpu]) 1585 rq_weight = usd->rq_weight[cpu];
1528 return; 1586 if (!rq_weight) {
1529 1587 boost = 1;
1530 rq_weight = tg->cfs_rq[cpu]->rq_weight; 1588 rq_weight = NICE_0_LOAD;
1589 }
1531 1590
1532 /* 1591 /*
1533 * \Sum shares * rq_weight 1592 * \Sum_j shares_j * rq_weight_i
1534 * shares = ----------------------- 1593 * shares_i = -----------------------------
1535 * \Sum rq_weight 1594 * \Sum_j rq_weight_j
1536 *
1537 */ 1595 */
1538 shares = (sd_shares * rq_weight) / sd_rq_weight; 1596 shares = (sd_shares * rq_weight) / sd_rq_weight;
1539 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); 1597 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
@@ -1544,8 +1602,8 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
1544 unsigned long flags; 1602 unsigned long flags;
1545 1603
1546 spin_lock_irqsave(&rq->lock, flags); 1604 spin_lock_irqsave(&rq->lock, flags);
1547 tg->cfs_rq[cpu]->shares = shares; 1605 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1548 1606 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1549 __set_se_shares(tg->se[cpu], shares); 1607 __set_se_shares(tg->se[cpu], shares);
1550 spin_unlock_irqrestore(&rq->lock, flags); 1608 spin_unlock_irqrestore(&rq->lock, flags);
1551 } 1609 }
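
/*
 * Illustrative sketch (not from the patch above): the per-cpu share
 * computed in update_group_shares_cpu() above,
 * shares_i = sd_shares * rq_weight_i / sum_j rq_weight_j, then clamped.
 * The clamp bounds below are chosen for illustration; all names are
 * illustrative, not kernel symbols.
 */
#include <stdio.h>

#define TOY_MIN_SHARES 2
#define TOY_MAX_SHARES (1UL << 18)

static unsigned long toy_group_shares_cpu(unsigned long sd_shares,
					  unsigned long rq_weight,
					  unsigned long sd_rq_weight)
{
	unsigned long shares = sd_shares * rq_weight / sd_rq_weight;

	if (shares < TOY_MIN_SHARES)
		shares = TOY_MIN_SHARES;
	if (shares > TOY_MAX_SHARES)
		shares = TOY_MAX_SHARES;
	return shares;
}

int main(void)
{
	/* A group with 1024 shares, split over cpus weighted 3072 and 1024. */
	printf("%lu\n", toy_group_shares_cpu(1024, 3072, 4096));	/* 768 */
	printf("%lu\n", toy_group_shares_cpu(1024, 1024, 4096));	/* 256 */
	return 0;
}
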
@@ -1558,22 +1616,30 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
1558 */ 1616 */
1559static int tg_shares_up(struct task_group *tg, void *data) 1617static int tg_shares_up(struct task_group *tg, void *data)
1560{ 1618{
1561 unsigned long weight, rq_weight = 0; 1619 unsigned long weight, rq_weight = 0, shares = 0;
1562 unsigned long shares = 0; 1620 struct update_shares_data *usd;
1563 struct sched_domain *sd = data; 1621 struct sched_domain *sd = data;
1622 unsigned long flags;
1564 int i; 1623 int i;
1565 1624
1625 if (!tg->se[0])
1626 return 0;
1627
1628 local_irq_save(flags);
1629 usd = &__get_cpu_var(update_shares_data);
1630
1566 for_each_cpu(i, sched_domain_span(sd)) { 1631 for_each_cpu(i, sched_domain_span(sd)) {
1632 weight = tg->cfs_rq[i]->load.weight;
1633 usd->rq_weight[i] = weight;
1634
1567 /* 1635 /*
1568 * If there are currently no tasks on the cpu pretend there 1636 * If there are currently no tasks on the cpu pretend there
1569 * is one of average load so that when a new task gets to 1637 * is one of average load so that when a new task gets to
1570 * run here it will not get delayed by group starvation. 1638 * run here it will not get delayed by group starvation.
1571 */ 1639 */
1572 weight = tg->cfs_rq[i]->load.weight;
1573 if (!weight) 1640 if (!weight)
1574 weight = NICE_0_LOAD; 1641 weight = NICE_0_LOAD;
1575 1642
1576 tg->cfs_rq[i]->rq_weight = weight;
1577 rq_weight += weight; 1643 rq_weight += weight;
1578 shares += tg->cfs_rq[i]->shares; 1644 shares += tg->cfs_rq[i]->shares;
1579 } 1645 }
@@ -1585,7 +1651,9 @@ static int tg_shares_up(struct task_group *tg, void *data)
1585 shares = tg->shares; 1651 shares = tg->shares;
1586 1652
1587 for_each_cpu(i, sched_domain_span(sd)) 1653 for_each_cpu(i, sched_domain_span(sd))
1588 update_group_shares_cpu(tg, i, shares, rq_weight); 1654 update_group_shares_cpu(tg, i, shares, rq_weight, usd);
1655
1656 local_irq_restore(flags);
1589 1657
1590 return 0; 1658 return 0;
1591} 1659}
@@ -1615,8 +1683,14 @@ static int tg_load_down(struct task_group *tg, void *data)
1615 1683
1616static void update_shares(struct sched_domain *sd) 1684static void update_shares(struct sched_domain *sd)
1617{ 1685{
1618 u64 now = cpu_clock(raw_smp_processor_id()); 1686 s64 elapsed;
1619 s64 elapsed = now - sd->last_update; 1687 u64 now;
1688
1689 if (root_task_group_empty())
1690 return;
1691
1692 now = cpu_clock(raw_smp_processor_id());
1693 elapsed = now - sd->last_update;
1620 1694
1621 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { 1695 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1622 sd->last_update = now; 1696 sd->last_update = now;
@@ -1626,6 +1700,9 @@ static void update_shares(struct sched_domain *sd)
1626 1700
1627static void update_shares_locked(struct rq *rq, struct sched_domain *sd) 1701static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1628{ 1702{
1703 if (root_task_group_empty())
1704 return;
1705
1629 spin_unlock(&rq->lock); 1706 spin_unlock(&rq->lock);
1630 update_shares(sd); 1707 update_shares(sd);
1631 spin_lock(&rq->lock); 1708 spin_lock(&rq->lock);
@@ -1633,6 +1710,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1633 1710
1634static void update_h_load(long cpu) 1711static void update_h_load(long cpu)
1635{ 1712{
1713 if (root_task_group_empty())
1714 return;
1715
1636 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 1716 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1637} 1717}
1638 1718
@@ -1650,6 +1730,8 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1650 1730
1651#ifdef CONFIG_PREEMPT 1731#ifdef CONFIG_PREEMPT
1652 1732
1733static void double_rq_lock(struct rq *rq1, struct rq *rq2);
1734
1653/* 1735/*
1654 * fair double_lock_balance: Safely acquires both rq->locks in a fair 1736 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1655 * way at the expense of forcing extra atomic operations in all 1737 * way at the expense of forcing extra atomic operations in all
@@ -1914,13 +1996,6 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1914} 1996}
1915 1997
1916#ifdef CONFIG_SMP 1998#ifdef CONFIG_SMP
1917
1918/* Used instead of source_load when we know the type == 0 */
1919static unsigned long weighted_cpuload(const int cpu)
1920{
1921 return cpu_rq(cpu)->load.weight;
1922}
1923
1924/* 1999/*
1925 * Is this task likely cache-hot: 2000 * Is this task likely cache-hot:
1926 */ 2001 */
@@ -1978,7 +2053,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1978 if (task_hot(p, old_rq->clock, NULL)) 2053 if (task_hot(p, old_rq->clock, NULL))
1979 schedstat_inc(p, se.nr_forced2_migrations); 2054 schedstat_inc(p, se.nr_forced2_migrations);
1980#endif 2055#endif
1981 perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS, 2056 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS,
1982 1, 1, NULL, 0); 2057 1, 1, NULL, 0);
1983 } 2058 }
1984 p->se.vruntime -= old_cfsrq->min_vruntime - 2059 p->se.vruntime -= old_cfsrq->min_vruntime -
@@ -2194,186 +2269,6 @@ void kick_process(struct task_struct *p)
2194 preempt_enable(); 2269 preempt_enable();
2195} 2270}
2196EXPORT_SYMBOL_GPL(kick_process); 2271EXPORT_SYMBOL_GPL(kick_process);
2197
2198/*
2199 * Return a low guess at the load of a migration-source cpu weighted
2200 * according to the scheduling class and "nice" value.
2201 *
2202 * We want to under-estimate the load of migration sources, to
2203 * balance conservatively.
2204 */
2205static unsigned long source_load(int cpu, int type)
2206{
2207 struct rq *rq = cpu_rq(cpu);
2208 unsigned long total = weighted_cpuload(cpu);
2209
2210 if (type == 0 || !sched_feat(LB_BIAS))
2211 return total;
2212
2213 return min(rq->cpu_load[type-1], total);
2214}
2215
2216/*
2217 * Return a high guess at the load of a migration-target cpu weighted
2218 * according to the scheduling class and "nice" value.
2219 */
2220static unsigned long target_load(int cpu, int type)
2221{
2222 struct rq *rq = cpu_rq(cpu);
2223 unsigned long total = weighted_cpuload(cpu);
2224
2225 if (type == 0 || !sched_feat(LB_BIAS))
2226 return total;
2227
2228 return max(rq->cpu_load[type-1], total);
2229}
2230
2231/*
2232 * find_idlest_group finds and returns the least busy CPU group within the
2233 * domain.
2234 */
2235static struct sched_group *
2236find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
2237{
2238 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
2239 unsigned long min_load = ULONG_MAX, this_load = 0;
2240 int load_idx = sd->forkexec_idx;
2241 int imbalance = 100 + (sd->imbalance_pct-100)/2;
2242
2243 do {
2244 unsigned long load, avg_load;
2245 int local_group;
2246 int i;
2247
2248 /* Skip over this group if it has no CPUs allowed */
2249 if (!cpumask_intersects(sched_group_cpus(group),
2250 &p->cpus_allowed))
2251 continue;
2252
2253 local_group = cpumask_test_cpu(this_cpu,
2254 sched_group_cpus(group));
2255
2256 /* Tally up the load of all CPUs in the group */
2257 avg_load = 0;
2258
2259 for_each_cpu(i, sched_group_cpus(group)) {
2260 /* Bias balancing toward cpus of our domain */
2261 if (local_group)
2262 load = source_load(i, load_idx);
2263 else
2264 load = target_load(i, load_idx);
2265
2266 avg_load += load;
2267 }
2268
2269 /* Adjust by relative CPU power of the group */
2270 avg_load = sg_div_cpu_power(group,
2271 avg_load * SCHED_LOAD_SCALE);
2272
2273 if (local_group) {
2274 this_load = avg_load;
2275 this = group;
2276 } else if (avg_load < min_load) {
2277 min_load = avg_load;
2278 idlest = group;
2279 }
2280 } while (group = group->next, group != sd->groups);
2281
2282 if (!idlest || 100*this_load < imbalance*min_load)
2283 return NULL;
2284 return idlest;
2285}
2286
2287/*
2288 * find_idlest_cpu - find the idlest cpu among the cpus in group.
2289 */
2290static int
2291find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
2292{
2293 unsigned long load, min_load = ULONG_MAX;
2294 int idlest = -1;
2295 int i;
2296
2297 /* Traverse only the allowed CPUs */
2298 for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
2299 load = weighted_cpuload(i);
2300
2301 if (load < min_load || (load == min_load && i == this_cpu)) {
2302 min_load = load;
2303 idlest = i;
2304 }
2305 }
2306
2307 return idlest;
2308}
2309
2310/*
2311 * sched_balance_self: balance the current task (running on cpu) in domains
2312 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
2313 * SD_BALANCE_EXEC.
2314 *
2315 * Balance, ie. select the least loaded group.
2316 *
2317 * Returns the target CPU number, or the same CPU if no balancing is needed.
2318 *
2319 * preempt must be disabled.
2320 */
2321static int sched_balance_self(int cpu, int flag)
2322{
2323 struct task_struct *t = current;
2324 struct sched_domain *tmp, *sd = NULL;
2325
2326 for_each_domain(cpu, tmp) {
2327 /*
2328 * If power savings logic is enabled for a domain, stop there.
2329 */
2330 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
2331 break;
2332 if (tmp->flags & flag)
2333 sd = tmp;
2334 }
2335
2336 if (sd)
2337 update_shares(sd);
2338
2339 while (sd) {
2340 struct sched_group *group;
2341 int new_cpu, weight;
2342
2343 if (!(sd->flags & flag)) {
2344 sd = sd->child;
2345 continue;
2346 }
2347
2348 group = find_idlest_group(sd, t, cpu);
2349 if (!group) {
2350 sd = sd->child;
2351 continue;
2352 }
2353
2354 new_cpu = find_idlest_cpu(group, t, cpu);
2355 if (new_cpu == -1 || new_cpu == cpu) {
2356 /* Now try balancing at a lower domain level of cpu */
2357 sd = sd->child;
2358 continue;
2359 }
2360
2361 /* Now try balancing at a lower domain level of new_cpu */
2362 cpu = new_cpu;
2363 weight = cpumask_weight(sched_domain_span(sd));
2364 sd = NULL;
2365 for_each_domain(cpu, tmp) {
2366 if (weight <= cpumask_weight(sched_domain_span(tmp)))
2367 break;
2368 if (tmp->flags & flag)
2369 sd = tmp;
2370 }
2371 /* while loop will break here if sd == NULL */
2372 }
2373
2374 return cpu;
2375}
2376
2377#endif /* CONFIG_SMP */ 2272#endif /* CONFIG_SMP */
2378 2273
2379/** 2274/**
@@ -2411,37 +2306,22 @@ void task_oncpu_function_call(struct task_struct *p,
2411 * 2306 *
2412 * returns failure only if the task is already active. 2307 * returns failure only if the task is already active.
2413 */ 2308 */
2414static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) 2309static int try_to_wake_up(struct task_struct *p, unsigned int state,
2310 int wake_flags)
2415{ 2311{
2416 int cpu, orig_cpu, this_cpu, success = 0; 2312 int cpu, orig_cpu, this_cpu, success = 0;
2417 unsigned long flags; 2313 unsigned long flags;
2418 long old_state;
2419 struct rq *rq; 2314 struct rq *rq;
2420 2315
2421 if (!sched_feat(SYNC_WAKEUPS)) 2316 if (!sched_feat(SYNC_WAKEUPS))
2422 sync = 0; 2317 wake_flags &= ~WF_SYNC;
2423 2318
2424#ifdef CONFIG_SMP 2319 this_cpu = get_cpu();
2425 if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {
2426 struct sched_domain *sd;
2427
2428 this_cpu = raw_smp_processor_id();
2429 cpu = task_cpu(p);
2430
2431 for_each_domain(this_cpu, sd) {
2432 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2433 update_shares(sd);
2434 break;
2435 }
2436 }
2437 }
2438#endif
2439 2320
2440 smp_wmb(); 2321 smp_wmb();
2441 rq = task_rq_lock(p, &flags); 2322 rq = task_rq_lock(p, &flags);
2442 update_rq_clock(rq); 2323 update_rq_clock(rq);
2443 old_state = p->state; 2324 if (!(p->state & state))
2444 if (!(old_state & state))
2445 goto out; 2325 goto out;
2446 2326
2447 if (p->se.on_rq) 2327 if (p->se.on_rq)
@@ -2449,27 +2329,29 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2449 2329
2450 cpu = task_cpu(p); 2330 cpu = task_cpu(p);
2451 orig_cpu = cpu; 2331 orig_cpu = cpu;
2452 this_cpu = smp_processor_id();
2453 2332
2454#ifdef CONFIG_SMP 2333#ifdef CONFIG_SMP
2455 if (unlikely(task_running(rq, p))) 2334 if (unlikely(task_running(rq, p)))
2456 goto out_activate; 2335 goto out_activate;
2457 2336
2458 cpu = p->sched_class->select_task_rq(p, sync); 2337 /*
2459 if (cpu != orig_cpu) { 2338 * In order to handle concurrent wakeups and release the rq->lock
2339 * we put the task in TASK_WAKING state.
2340 *
2341 * First fix up the nr_uninterruptible count:
2342 */
2343 if (task_contributes_to_load(p))
2344 rq->nr_uninterruptible--;
2345 p->state = TASK_WAKING;
2346 task_rq_unlock(rq, &flags);
2347
2348 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2349 if (cpu != orig_cpu)
2460 set_task_cpu(p, cpu); 2350 set_task_cpu(p, cpu);
2461 task_rq_unlock(rq, &flags);
2462 /* might preempt at this point */
2463 rq = task_rq_lock(p, &flags);
2464 old_state = p->state;
2465 if (!(old_state & state))
2466 goto out;
2467 if (p->se.on_rq)
2468 goto out_running;
2469 2351
2470 this_cpu = smp_processor_id(); 2352 rq = task_rq_lock(p, &flags);
2471 cpu = task_cpu(p); 2353 WARN_ON(p->state != TASK_WAKING);
2472 } 2354 cpu = task_cpu(p);
2473 2355
2474#ifdef CONFIG_SCHEDSTATS 2356#ifdef CONFIG_SCHEDSTATS
2475 schedstat_inc(rq, ttwu_count); 2357 schedstat_inc(rq, ttwu_count);
@@ -2489,7 +2371,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2489out_activate: 2371out_activate:
2490#endif /* CONFIG_SMP */ 2372#endif /* CONFIG_SMP */
2491 schedstat_inc(p, se.nr_wakeups); 2373 schedstat_inc(p, se.nr_wakeups);
2492 if (sync) 2374 if (wake_flags & WF_SYNC)
2493 schedstat_inc(p, se.nr_wakeups_sync); 2375 schedstat_inc(p, se.nr_wakeups_sync);
2494 if (orig_cpu != cpu) 2376 if (orig_cpu != cpu)
2495 schedstat_inc(p, se.nr_wakeups_migrate); 2377 schedstat_inc(p, se.nr_wakeups_migrate);
@@ -2518,7 +2400,7 @@ out_activate:
2518 2400
2519out_running: 2401out_running:
2520 trace_sched_wakeup(rq, p, success); 2402 trace_sched_wakeup(rq, p, success);
2521 check_preempt_curr(rq, p, sync); 2403 check_preempt_curr(rq, p, wake_flags);
2522 2404
2523 p->state = TASK_RUNNING; 2405 p->state = TASK_RUNNING;
2524#ifdef CONFIG_SMP 2406#ifdef CONFIG_SMP
@@ -2527,6 +2409,7 @@ out_running:
2527#endif 2409#endif
2528out: 2410out:
2529 task_rq_unlock(rq, &flags); 2411 task_rq_unlock(rq, &flags);
2412 put_cpu();
2530 2413
2531 return success; 2414 return success;
2532} 2415}
@@ -2569,17 +2452,40 @@ static void __sched_fork(struct task_struct *p)
2569 p->se.avg_overlap = 0; 2452 p->se.avg_overlap = 0;
2570 p->se.start_runtime = 0; 2453 p->se.start_runtime = 0;
2571 p->se.avg_wakeup = sysctl_sched_wakeup_granularity; 2454 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2455 p->se.avg_running = 0;
2572 2456
2573#ifdef CONFIG_SCHEDSTATS 2457#ifdef CONFIG_SCHEDSTATS
2574 p->se.wait_start = 0; 2458 p->se.wait_start = 0;
2575 p->se.sum_sleep_runtime = 0; 2459 p->se.wait_max = 0;
2576 p->se.sleep_start = 0; 2460 p->se.wait_count = 0;
2577 p->se.block_start = 0; 2461 p->se.wait_sum = 0;
2578 p->se.sleep_max = 0; 2462
2579 p->se.block_max = 0; 2463 p->se.sleep_start = 0;
2580 p->se.exec_max = 0; 2464 p->se.sleep_max = 0;
2581 p->se.slice_max = 0; 2465 p->se.sum_sleep_runtime = 0;
2582 p->se.wait_max = 0; 2466
2467 p->se.block_start = 0;
2468 p->se.block_max = 0;
2469 p->se.exec_max = 0;
2470 p->se.slice_max = 0;
2471
2472 p->se.nr_migrations_cold = 0;
2473 p->se.nr_failed_migrations_affine = 0;
2474 p->se.nr_failed_migrations_running = 0;
2475 p->se.nr_failed_migrations_hot = 0;
2476 p->se.nr_forced_migrations = 0;
2477 p->se.nr_forced2_migrations = 0;
2478
2479 p->se.nr_wakeups = 0;
2480 p->se.nr_wakeups_sync = 0;
2481 p->se.nr_wakeups_migrate = 0;
2482 p->se.nr_wakeups_local = 0;
2483 p->se.nr_wakeups_remote = 0;
2484 p->se.nr_wakeups_affine = 0;
2485 p->se.nr_wakeups_affine_attempts = 0;
2486 p->se.nr_wakeups_passive = 0;
2487 p->se.nr_wakeups_idle = 0;
2488
2583#endif 2489#endif
2584 2490
2585 INIT_LIST_HEAD(&p->rt.run_list); 2491 INIT_LIST_HEAD(&p->rt.run_list);
@@ -2608,18 +2514,41 @@ void sched_fork(struct task_struct *p, int clone_flags)
2608 2514
2609 __sched_fork(p); 2515 __sched_fork(p);
2610 2516
2611#ifdef CONFIG_SMP
2612 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
2613#endif
2614 set_task_cpu(p, cpu);
2615
2616 /* 2517 /*
2617 * Make sure we do not leak PI boosting priority to the child: 2518 * Make sure we do not leak PI boosting priority to the child.
2618 */ 2519 */
2619 p->prio = current->normal_prio; 2520 p->prio = current->normal_prio;
2521
2522 /*
2523 * Revert to default priority/policy on fork if requested.
2524 */
2525 if (unlikely(p->sched_reset_on_fork)) {
2526 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR)
2527 p->policy = SCHED_NORMAL;
2528
2529 if (p->normal_prio < DEFAULT_PRIO)
2530 p->prio = DEFAULT_PRIO;
2531
2532 if (PRIO_TO_NICE(p->static_prio) < 0) {
2533 p->static_prio = NICE_TO_PRIO(0);
2534 set_load_weight(p);
2535 }
2536
2537 /*
2538 * We don't need the reset flag anymore after the fork. It has
2539 * fulfilled its duty:
2540 */
2541 p->sched_reset_on_fork = 0;
2542 }
2543
2620 if (!rt_prio(p->prio)) 2544 if (!rt_prio(p->prio))
2621 p->sched_class = &fair_sched_class; 2545 p->sched_class = &fair_sched_class;
2622 2546
2547#ifdef CONFIG_SMP
2548 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
2549#endif
2550 set_task_cpu(p, cpu);
2551
2623#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2552#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2624 if (likely(sched_info_on())) 2553 if (likely(sched_info_on()))
2625 memset(&p->sched_info, 0, sizeof(p->sched_info)); 2554 memset(&p->sched_info, 0, sizeof(p->sched_info));
@@ -2665,7 +2594,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2665 inc_nr_running(rq); 2594 inc_nr_running(rq);
2666 } 2595 }
2667 trace_sched_wakeup_new(rq, p, 1); 2596 trace_sched_wakeup_new(rq, p, 1);
2668 check_preempt_curr(rq, p, 0); 2597 check_preempt_curr(rq, p, WF_FORK);
2669#ifdef CONFIG_SMP 2598#ifdef CONFIG_SMP
2670 if (p->sched_class->task_wake_up) 2599 if (p->sched_class->task_wake_up)
2671 p->sched_class->task_wake_up(rq, p); 2600 p->sched_class->task_wake_up(rq, p);
@@ -2773,12 +2702,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2773{ 2702{
2774 struct mm_struct *mm = rq->prev_mm; 2703 struct mm_struct *mm = rq->prev_mm;
2775 long prev_state; 2704 long prev_state;
2776#ifdef CONFIG_SMP
2777 int post_schedule = 0;
2778
2779 if (current->sched_class->needs_post_schedule)
2780 post_schedule = current->sched_class->needs_post_schedule(rq);
2781#endif
2782 2705
2783 rq->prev_mm = NULL; 2706 rq->prev_mm = NULL;
2784 2707
@@ -2795,12 +2718,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2795 */ 2718 */
2796 prev_state = prev->state; 2719 prev_state = prev->state;
2797 finish_arch_switch(prev); 2720 finish_arch_switch(prev);
2798 perf_counter_task_sched_in(current, cpu_of(rq)); 2721 perf_event_task_sched_in(current, cpu_of(rq));
2799 finish_lock_switch(rq, prev); 2722 finish_lock_switch(rq, prev);
2800#ifdef CONFIG_SMP
2801 if (post_schedule)
2802 current->sched_class->post_schedule(rq);
2803#endif
2804 2723
2805 fire_sched_in_preempt_notifiers(current); 2724 fire_sched_in_preempt_notifiers(current);
2806 if (mm) 2725 if (mm)
@@ -2815,6 +2734,42 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2815 } 2734 }
2816} 2735}
2817 2736
2737#ifdef CONFIG_SMP
2738
2739/* assumes rq->lock is held */
2740static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
2741{
2742 if (prev->sched_class->pre_schedule)
2743 prev->sched_class->pre_schedule(rq, prev);
2744}
2745
2746/* rq->lock is NOT held, but preemption is disabled */
2747static inline void post_schedule(struct rq *rq)
2748{
2749 if (rq->post_schedule) {
2750 unsigned long flags;
2751
2752 spin_lock_irqsave(&rq->lock, flags);
2753 if (rq->curr->sched_class->post_schedule)
2754 rq->curr->sched_class->post_schedule(rq);
2755 spin_unlock_irqrestore(&rq->lock, flags);
2756
2757 rq->post_schedule = 0;
2758 }
2759}
2760
2761#else
2762
2763static inline void pre_schedule(struct rq *rq, struct task_struct *p)
2764{
2765}
2766
2767static inline void post_schedule(struct rq *rq)
2768{
2769}
2770
2771#endif
2772
2818/** 2773/**
2819 * schedule_tail - first thing a freshly forked thread must call. 2774 * schedule_tail - first thing a freshly forked thread must call.
2820 * @prev: the thread we just switched away from. 2775 * @prev: the thread we just switched away from.
@@ -2825,6 +2780,13 @@ asmlinkage void schedule_tail(struct task_struct *prev)
2825 struct rq *rq = this_rq(); 2780 struct rq *rq = this_rq();
2826 2781
2827 finish_task_switch(rq, prev); 2782 finish_task_switch(rq, prev);
2783
2784 /*
2785 * FIXME: do we need to worry about rq being invalidated by the
2786 * task_switch?
2787 */
2788 post_schedule(rq);
2789
2828#ifdef __ARCH_WANT_UNLOCKED_CTXSW 2790#ifdef __ARCH_WANT_UNLOCKED_CTXSW
2829 /* In this case, finish_task_switch does not reenable preemption */ 2791 /* In this case, finish_task_switch does not reenable preemption */
2830 preempt_enable(); 2792 preempt_enable();
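
The post_schedule rework above replaces the needs_post_schedule()/post_schedule() class queries inside finish_task_switch() with an rq->post_schedule flag that is tested without the lock and only re-taken under rq->lock when work is actually pending. A minimal userspace sketch of that flag-then-lock pattern, with invented names (struct runq, balance_cb) and a pthread spinlock standing in for rq->lock:

#include <pthread.h>
#include <stdio.h>

struct runq {
	pthread_spinlock_t lock;
	int post_schedule;              /* set by the scheduling class when balancing is pending */
	void (*balance_cb)(struct runq *);
};

static void push_rt_tasks(struct runq *rq)
{
	printf("balancing after the switch, flag was %d\n", rq->post_schedule);
}

/* called with rq->lock NOT held, mirroring the new post_schedule() above */
static void post_schedule(struct runq *rq)
{
	if (rq->post_schedule) {
		pthread_spin_lock(&rq->lock);
		if (rq->balance_cb)
			rq->balance_cb(rq);
		pthread_spin_unlock(&rq->lock);

		rq->post_schedule = 0;
	}
}

int main(void)
{
	struct runq rq = { .post_schedule = 1, .balance_cb = push_rt_tasks };

	pthread_spin_init(&rq.lock, PTHREAD_PROCESS_PRIVATE);
	post_schedule(&rq);     /* runs the callback once */
	post_schedule(&rq);     /* flag already cleared: lock is never touched */
	return 0;
}

The shape matters: the common case (no balancing work) stays lock-free, while the rare case still runs the class hook with the runqueue locked, which is why schedule_tail() and schedule() can call post_schedule() after dropping rq->lock.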
@@ -2942,6 +2904,19 @@ unsigned long nr_iowait(void)
2942 return sum; 2904 return sum;
2943} 2905}
2944 2906
2907unsigned long nr_iowait_cpu(void)
2908{
2909 struct rq *this = this_rq();
2910 return atomic_read(&this->nr_iowait);
2911}
2912
2913unsigned long this_cpu_load(void)
2914{
2915 struct rq *this = this_rq();
2916 return this->cpu_load[0];
2917}
2918
2919
2945/* Variables and functions for calc_load */ 2920/* Variables and functions for calc_load */
2946static atomic_long_t calc_load_tasks; 2921static atomic_long_t calc_load_tasks;
2947static unsigned long calc_load_update; 2922static unsigned long calc_load_update;
@@ -3141,7 +3116,7 @@ out:
3141void sched_exec(void) 3116void sched_exec(void)
3142{ 3117{
3143 int new_cpu, this_cpu = get_cpu(); 3118 int new_cpu, this_cpu = get_cpu();
3144 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); 3119 new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0);
3145 put_cpu(); 3120 put_cpu();
3146 if (new_cpu != this_cpu) 3121 if (new_cpu != this_cpu)
3147 sched_migrate_task(current, new_cpu); 3122 sched_migrate_task(current, new_cpu);
@@ -3356,9 +3331,10 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3356{ 3331{
3357 const struct sched_class *class; 3332 const struct sched_class *class;
3358 3333
3359 for (class = sched_class_highest; class; class = class->next) 3334 for_each_class(class) {
3360 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) 3335 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
3361 return 1; 3336 return 1;
3337 }
3362 3338
3363 return 0; 3339 return 0;
3364} 3340}
@@ -3521,7 +3497,7 @@ static inline void update_sd_power_savings_stats(struct sched_group *group,
3521 * capacity but still has some space to pick up some load 3497 * capacity but still has some space to pick up some load
3522 * from other group and save more power 3498 * from other group and save more power
3523 */ 3499 */
3524 if (sgs->sum_nr_running > sgs->group_capacity - 1) 3500 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
3525 return; 3501 return;
3526 3502
3527 if (sgs->sum_nr_running > sds->leader_nr_running || 3503 if (sgs->sum_nr_running > sds->leader_nr_running ||
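
The rewritten comparison in this hunk is more than style: sum_nr_running and group_capacity are unsigned, so when group_capacity is 0 the old "capacity - 1" form wraps to ULONG_MAX and the early return can never fire, and the rounded capacities introduced later in this patch can leave a group with capacity 0. A small standalone check of both forms:

#include <stdio.h>

int main(void)
{
	unsigned long running = 3, capacity = 0;

	/* old form: capacity - 1 underflows to ULONG_MAX, test is always false */
	printf("old: %d\n", running > capacity - 1);
	/* new form: behaves as the intended "running >= capacity" */
	printf("new: %d\n", running + 1 > capacity);
	return 0;
}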
@@ -3560,11 +3536,6 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3560 *imbalance = sds->min_load_per_task; 3536 *imbalance = sds->min_load_per_task;
3561 sds->busiest = sds->group_min; 3537 sds->busiest = sds->group_min;
3562 3538
3563 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
3564 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
3565 group_first_cpu(sds->group_leader);
3566 }
3567
3568 return 1; 3539 return 1;
3569 3540
3570} 3541}
@@ -3589,6 +3560,102 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3589#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ 3560#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3590 3561
3591 3562
3563unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
3564{
3565 return SCHED_LOAD_SCALE;
3566}
3567
3568unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
3569{
3570 return default_scale_freq_power(sd, cpu);
3571}
3572
3573unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
3574{
3575 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3576 unsigned long smt_gain = sd->smt_gain;
3577
3578 smt_gain /= weight;
3579
3580 return smt_gain;
3581}
3582
3583unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3584{
3585 return default_scale_smt_power(sd, cpu);
3586}
3587
3588unsigned long scale_rt_power(int cpu)
3589{
3590 struct rq *rq = cpu_rq(cpu);
3591 u64 total, available;
3592
3593 sched_avg_update(rq);
3594
3595 total = sched_avg_period() + (rq->clock - rq->age_stamp);
3596 available = total - rq->rt_avg;
3597
3598 if (unlikely((s64)total < SCHED_LOAD_SCALE))
3599 total = SCHED_LOAD_SCALE;
3600
3601 total >>= SCHED_LOAD_SHIFT;
3602
3603 return div_u64(available, total);
3604}
3605
3606static void update_cpu_power(struct sched_domain *sd, int cpu)
3607{
3608 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3609 unsigned long power = SCHED_LOAD_SCALE;
3610 struct sched_group *sdg = sd->groups;
3611
3612 if (sched_feat(ARCH_POWER))
3613 power *= arch_scale_freq_power(sd, cpu);
3614 else
3615 power *= default_scale_freq_power(sd, cpu);
3616
3617 power >>= SCHED_LOAD_SHIFT;
3618
3619 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
3620 if (sched_feat(ARCH_POWER))
3621 power *= arch_scale_smt_power(sd, cpu);
3622 else
3623 power *= default_scale_smt_power(sd, cpu);
3624
3625 power >>= SCHED_LOAD_SHIFT;
3626 }
3627
3628 power *= scale_rt_power(cpu);
3629 power >>= SCHED_LOAD_SHIFT;
3630
3631 if (!power)
3632 power = 1;
3633
3634 sdg->cpu_power = power;
3635}
3636
3637static void update_group_power(struct sched_domain *sd, int cpu)
3638{
3639 struct sched_domain *child = sd->child;
3640 struct sched_group *group, *sdg = sd->groups;
3641 unsigned long power;
3642
3643 if (!child) {
3644 update_cpu_power(sd, cpu);
3645 return;
3646 }
3647
3648 power = 0;
3649
3650 group = child->groups;
3651 do {
3652 power += group->cpu_power;
3653 group = group->next;
3654 } while (group != child->groups);
3655
3656 sdg->cpu_power = power;
3657}
3658
3592/** 3659/**
3593 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 3660 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3594 * @group: sched_group whose statistics are to be updated. 3661 * @group: sched_group whose statistics are to be updated.
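
The new update_cpu_power() chain treats every factor as a fraction of SCHED_LOAD_SCALE: start from 1024, multiply in the frequency factor, the SMT factor and the RT-time factor in turn, and shift right by SCHED_LOAD_SHIFT after each multiplication so the result stays on the same 0..1024 scale. A standalone rendering of that arithmetic with made-up inputs (the smt_gain of 1178 matches the default sibling-domain value of this era; the RT figure is purely illustrative):

#include <stdio.h>

#define SCHED_LOAD_SHIFT 10
#define SCHED_LOAD_SCALE (1UL << SCHED_LOAD_SHIFT)   /* 1024 == "one full CPU" */

int main(void)
{
	/* illustrative inputs, not taken from a real machine */
	unsigned long freq_factor = SCHED_LOAD_SCALE;    /* arch_scale_freq_power(): no throttling */
	unsigned long smt_gain = 1178, smt_threads = 2;  /* sd->smt_gain shared by two siblings */
	unsigned long rt_factor = 820;                   /* scale_rt_power(): ~80% of time left for CFS */

	unsigned long power = SCHED_LOAD_SCALE;

	power = (power * freq_factor) >> SCHED_LOAD_SHIFT;
	power = (power * (smt_gain / smt_threads)) >> SCHED_LOAD_SHIFT;
	power = (power * rt_factor) >> SCHED_LOAD_SHIFT;
	if (!power)
		power = 1;

	printf("cpu_power = %lu (of %lu)\n", power, SCHED_LOAD_SCALE);
	return 0;
}

update_group_power() then simply sums these per-CPU figures over the child groups, which is the sdg->cpu_power accumulation at the end of the new block.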
@@ -3601,7 +3668,8 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3601 * @balance: Should we balance. 3668 * @balance: Should we balance.
3602 * @sgs: variable to hold the statistics for this group. 3669 * @sgs: variable to hold the statistics for this group.
3603 */ 3670 */
3604static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, 3671static inline void update_sg_lb_stats(struct sched_domain *sd,
3672 struct sched_group *group, int this_cpu,
3605 enum cpu_idle_type idle, int load_idx, int *sd_idle, 3673 enum cpu_idle_type idle, int load_idx, int *sd_idle,
3606 int local_group, const struct cpumask *cpus, 3674 int local_group, const struct cpumask *cpus,
3607 int *balance, struct sg_lb_stats *sgs) 3675 int *balance, struct sg_lb_stats *sgs)
@@ -3612,8 +3680,11 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
3612 unsigned long sum_avg_load_per_task; 3680 unsigned long sum_avg_load_per_task;
3613 unsigned long avg_load_per_task; 3681 unsigned long avg_load_per_task;
3614 3682
3615 if (local_group) 3683 if (local_group) {
3616 balance_cpu = group_first_cpu(group); 3684 balance_cpu = group_first_cpu(group);
3685 if (balance_cpu == this_cpu)
3686 update_group_power(sd, this_cpu);
3687 }
3617 3688
3618 /* Tally up the load of all CPUs in the group */ 3689 /* Tally up the load of all CPUs in the group */
3619 sum_avg_load_per_task = avg_load_per_task = 0; 3690 sum_avg_load_per_task = avg_load_per_task = 0;
@@ -3662,8 +3733,7 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
3662 } 3733 }
3663 3734
3664 /* Adjust by relative CPU power of the group */ 3735 /* Adjust by relative CPU power of the group */
3665 sgs->avg_load = sg_div_cpu_power(group, 3736 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
3666 sgs->group_load * SCHED_LOAD_SCALE);
3667 3737
3668 3738
3669 /* 3739 /*
@@ -3675,14 +3745,14 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
3675 * normalized nr_running number somewhere that negates 3745 * normalized nr_running number somewhere that negates
3676 * the hierarchy? 3746 * the hierarchy?
3677 */ 3747 */
3678 avg_load_per_task = sg_div_cpu_power(group, 3748 avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
3679 sum_avg_load_per_task * SCHED_LOAD_SCALE); 3749 group->cpu_power;
3680 3750
3681 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) 3751 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3682 sgs->group_imb = 1; 3752 sgs->group_imb = 1;
3683 3753
3684 sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; 3754 sgs->group_capacity =
3685 3755 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
3686} 3756}
3687 3757
3688/** 3758/**
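
With cpu_power now a genuinely fractional quantity, group_capacity switches from truncating division to DIV_ROUND_CLOSEST(), so a group whose power works out to roughly half a CPU still advertises capacity 1 instead of 0. For unsigned operands the kernel macro reduces to the expression below; the sample power values are illustrative only:

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL
/* same arithmetic as the kernel macro, for unsigned operands */
#define DIV_ROUND_CLOSEST(x, d) (((x) + ((d) / 2)) / (d))

int main(void)
{
	unsigned long powers[] = { 589, 1024, 1536 };  /* e.g. one SMT sibling, one core, 1.5 cores */

	for (int i = 0; i < 3; i++)
		printf("cpu_power %4lu -> truncated %lu, rounded %lu\n",
		       powers[i], powers[i] / SCHED_LOAD_SCALE,
		       DIV_ROUND_CLOSEST(powers[i], SCHED_LOAD_SCALE));
	return 0;
}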
@@ -3700,9 +3770,13 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3700 const struct cpumask *cpus, int *balance, 3770 const struct cpumask *cpus, int *balance,
3701 struct sd_lb_stats *sds) 3771 struct sd_lb_stats *sds)
3702{ 3772{
3773 struct sched_domain *child = sd->child;
3703 struct sched_group *group = sd->groups; 3774 struct sched_group *group = sd->groups;
3704 struct sg_lb_stats sgs; 3775 struct sg_lb_stats sgs;
3705 int load_idx; 3776 int load_idx, prefer_sibling = 0;
3777
3778 if (child && child->flags & SD_PREFER_SIBLING)
3779 prefer_sibling = 1;
3706 3780
3707 init_sd_power_savings_stats(sd, sds, idle); 3781 init_sd_power_savings_stats(sd, sds, idle);
3708 load_idx = get_sd_load_idx(sd, idle); 3782 load_idx = get_sd_load_idx(sd, idle);
@@ -3713,14 +3787,22 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3713 local_group = cpumask_test_cpu(this_cpu, 3787 local_group = cpumask_test_cpu(this_cpu,
3714 sched_group_cpus(group)); 3788 sched_group_cpus(group));
3715 memset(&sgs, 0, sizeof(sgs)); 3789 memset(&sgs, 0, sizeof(sgs));
3716 update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle, 3790 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
3717 local_group, cpus, balance, &sgs); 3791 local_group, cpus, balance, &sgs);
3718 3792
3719 if (local_group && balance && !(*balance)) 3793 if (local_group && balance && !(*balance))
3720 return; 3794 return;
3721 3795
3722 sds->total_load += sgs.group_load; 3796 sds->total_load += sgs.group_load;
3723 sds->total_pwr += group->__cpu_power; 3797 sds->total_pwr += group->cpu_power;
3798
3799 /*
3800 * In case the child domain prefers tasks go to siblings
3801 * first, lower the group capacity to one so that we'll try
3802 * and move all the excess tasks away.
3803 */
3804 if (prefer_sibling)
3805 sgs.group_capacity = min(sgs.group_capacity, 1UL);
3724 3806
3725 if (local_group) { 3807 if (local_group) {
3726 sds->this_load = sgs.avg_load; 3808 sds->this_load = sgs.avg_load;
@@ -3740,7 +3822,6 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3740 update_sd_power_savings_stats(group, sds, local_group, &sgs); 3822 update_sd_power_savings_stats(group, sds, local_group, &sgs);
3741 group = group->next; 3823 group = group->next;
3742 } while (group != sd->groups); 3824 } while (group != sd->groups);
3743
3744} 3825}
3745 3826
3746/** 3827/**
@@ -3778,28 +3859,28 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
3778 * moving them. 3859 * moving them.
3779 */ 3860 */
3780 3861
3781 pwr_now += sds->busiest->__cpu_power * 3862 pwr_now += sds->busiest->cpu_power *
3782 min(sds->busiest_load_per_task, sds->max_load); 3863 min(sds->busiest_load_per_task, sds->max_load);
3783 pwr_now += sds->this->__cpu_power * 3864 pwr_now += sds->this->cpu_power *
3784 min(sds->this_load_per_task, sds->this_load); 3865 min(sds->this_load_per_task, sds->this_load);
3785 pwr_now /= SCHED_LOAD_SCALE; 3866 pwr_now /= SCHED_LOAD_SCALE;
3786 3867
3787 /* Amount of load we'd subtract */ 3868 /* Amount of load we'd subtract */
3788 tmp = sg_div_cpu_power(sds->busiest, 3869 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3789 sds->busiest_load_per_task * SCHED_LOAD_SCALE); 3870 sds->busiest->cpu_power;
3790 if (sds->max_load > tmp) 3871 if (sds->max_load > tmp)
3791 pwr_move += sds->busiest->__cpu_power * 3872 pwr_move += sds->busiest->cpu_power *
3792 min(sds->busiest_load_per_task, sds->max_load - tmp); 3873 min(sds->busiest_load_per_task, sds->max_load - tmp);
3793 3874
3794 /* Amount of load we'd add */ 3875 /* Amount of load we'd add */
3795 if (sds->max_load * sds->busiest->__cpu_power < 3876 if (sds->max_load * sds->busiest->cpu_power <
3796 sds->busiest_load_per_task * SCHED_LOAD_SCALE) 3877 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
3797 tmp = sg_div_cpu_power(sds->this, 3878 tmp = (sds->max_load * sds->busiest->cpu_power) /
3798 sds->max_load * sds->busiest->__cpu_power); 3879 sds->this->cpu_power;
3799 else 3880 else
3800 tmp = sg_div_cpu_power(sds->this, 3881 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3801 sds->busiest_load_per_task * SCHED_LOAD_SCALE); 3882 sds->this->cpu_power;
3802 pwr_move += sds->this->__cpu_power * 3883 pwr_move += sds->this->cpu_power *
3803 min(sds->this_load_per_task, sds->this_load + tmp); 3884 min(sds->this_load_per_task, sds->this_load + tmp);
3804 pwr_move /= SCHED_LOAD_SCALE; 3885 pwr_move /= SCHED_LOAD_SCALE;
3805 3886
@@ -3834,8 +3915,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3834 sds->max_load - sds->busiest_load_per_task); 3915 sds->max_load - sds->busiest_load_per_task);
3835 3916
3836 /* How much load to actually move to equalise the imbalance */ 3917 /* How much load to actually move to equalise the imbalance */
3837 *imbalance = min(max_pull * sds->busiest->__cpu_power, 3918 *imbalance = min(max_pull * sds->busiest->cpu_power,
3838 (sds->avg_load - sds->this_load) * sds->this->__cpu_power) 3919 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
3839 / SCHED_LOAD_SCALE; 3920 / SCHED_LOAD_SCALE;
3840 3921
3841 /* 3922 /*
@@ -3965,15 +4046,18 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3965 int i; 4046 int i;
3966 4047
3967 for_each_cpu(i, sched_group_cpus(group)) { 4048 for_each_cpu(i, sched_group_cpus(group)) {
4049 unsigned long power = power_of(i);
4050 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
3968 unsigned long wl; 4051 unsigned long wl;
3969 4052
3970 if (!cpumask_test_cpu(i, cpus)) 4053 if (!cpumask_test_cpu(i, cpus))
3971 continue; 4054 continue;
3972 4055
3973 rq = cpu_rq(i); 4056 rq = cpu_rq(i);
3974 wl = weighted_cpuload(i); 4057 wl = weighted_cpuload(i) * SCHED_LOAD_SCALE;
4058 wl /= power;
3975 4059
3976 if (rq->nr_running == 1 && wl > imbalance) 4060 if (capacity && rq->nr_running == 1 && wl > imbalance)
3977 continue; 4061 continue;
3978 4062
3979 if (wl > max_load) { 4063 if (wl > max_load) {
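
find_busiest_queue() now rescales each runqueue's raw weighted load by that CPU's power before comparing, so a heavily loaded but weak (throttled or SMT-shared) CPU is correctly seen as busier than a stronger CPU carrying slightly more raw load. The arithmetic, with two invented runqueues:

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL

int main(void)
{
	/* two hypothetical runqueues: raw weighted load and per-cpu power (1024 == full CPU) */
	unsigned long load[2]  = { 2048, 1536 };
	unsigned long power[2] = { 1024,  589 };

	for (int i = 0; i < 2; i++) {
		unsigned long wl = load[i] * SCHED_LOAD_SCALE / power[i];
		printf("cpu%d: raw load %lu, power %lu, scaled load %lu\n",
		       i, load[i], power[i], wl);
	}
	return 0;
}

With these numbers the second CPU wins despite the lower raw load, which is exactly the case the old unscaled comparison got wrong.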
@@ -5122,7 +5206,7 @@ void scheduler_tick(void)
5122 curr->sched_class->task_tick(rq, curr, 0); 5206 curr->sched_class->task_tick(rq, curr, 0);
5123 spin_unlock(&rq->lock); 5207 spin_unlock(&rq->lock);
5124 5208
5125 perf_counter_task_tick(curr, cpu); 5209 perf_event_task_tick(curr, cpu);
5126 5210
5127#ifdef CONFIG_SMP 5211#ifdef CONFIG_SMP
5128 rq->idle_at_tick = idle_cpu(cpu); 5212 rq->idle_at_tick = idle_cpu(cpu);
@@ -5234,14 +5318,13 @@ static inline void schedule_debug(struct task_struct *prev)
5234#endif 5318#endif
5235} 5319}
5236 5320
5237static void put_prev_task(struct rq *rq, struct task_struct *prev) 5321static void put_prev_task(struct rq *rq, struct task_struct *p)
5238{ 5322{
5239 if (prev->state == TASK_RUNNING) { 5323 u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime;
5240 u64 runtime = prev->se.sum_exec_runtime;
5241 5324
5242 runtime -= prev->se.prev_sum_exec_runtime; 5325 update_avg(&p->se.avg_running, runtime);
5243 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
5244 5326
5327 if (p->state == TASK_RUNNING) {
5245 /* 5328 /*
5246 * In order to avoid avg_overlap growing stale when we are 5329 * In order to avoid avg_overlap growing stale when we are
5247 * indeed overlapping and hence not getting put to sleep, grow 5330 * indeed overlapping and hence not getting put to sleep, grow
@@ -5251,9 +5334,12 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev)
5251 * correlates to the amount of cache footprint a task can 5334 * correlates to the amount of cache footprint a task can
5252 * build up. 5335 * build up.
5253 */ 5336 */
5254 update_avg(&prev->se.avg_overlap, runtime); 5337 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
5338 update_avg(&p->se.avg_overlap, runtime);
5339 } else {
5340 update_avg(&p->se.avg_running, 0);
5255 } 5341 }
5256 prev->sched_class->put_prev_task(rq, prev); 5342 p->sched_class->put_prev_task(rq, p);
5257} 5343}
5258 5344
5259/* 5345/*
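
put_prev_task() now feeds every descheduling interval into p->se.avg_running, pulling the average toward zero when the task actually blocks, while the avg_overlap update keeps its 2*sysctl_sched_migration_cost clamp and only runs for tasks that are still runnable. update_avg() elsewhere in sched.c is, at this point in the tree, a simple exponential average of the form avg += (sample - avg) >> 3; a userspace copy to see how the numbers settle:

#include <stdio.h>
#include <stdint.h>

/* same shape as sched.c's update_avg(): an exponential average with 1/8 weight */
static void update_avg(uint64_t *avg, uint64_t sample)
{
	int64_t diff = sample - *avg;   /* relies on two's-complement wraparound, as the kernel does */
	*avg += diff >> 3;
}

int main(void)
{
	uint64_t avg_running = 0;
	uint64_t samples[] = { 800000, 900000, 100000, 0 };  /* runtimes in ns, 0 == task slept */

	for (int i = 0; i < 4; i++) {
		update_avg(&avg_running, samples[i]);
		printf("sample %7llu -> avg_running %llu\n",
		       (unsigned long long)samples[i], (unsigned long long)avg_running);
	}
	return 0;
}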
@@ -5302,7 +5388,7 @@ need_resched:
5302 preempt_disable(); 5388 preempt_disable();
5303 cpu = smp_processor_id(); 5389 cpu = smp_processor_id();
5304 rq = cpu_rq(cpu); 5390 rq = cpu_rq(cpu);
5305 rcu_qsctr_inc(cpu); 5391 rcu_sched_qs(cpu);
5306 prev = rq->curr; 5392 prev = rq->curr;
5307 switch_count = &prev->nivcsw; 5393 switch_count = &prev->nivcsw;
5308 5394
@@ -5326,10 +5412,7 @@ need_resched_nonpreemptible:
5326 switch_count = &prev->nvcsw; 5412 switch_count = &prev->nvcsw;
5327 } 5413 }
5328 5414
5329#ifdef CONFIG_SMP 5415 pre_schedule(rq, prev);
5330 if (prev->sched_class->pre_schedule)
5331 prev->sched_class->pre_schedule(rq, prev);
5332#endif
5333 5416
5334 if (unlikely(!rq->nr_running)) 5417 if (unlikely(!rq->nr_running))
5335 idle_balance(cpu, rq); 5418 idle_balance(cpu, rq);
@@ -5339,7 +5422,7 @@ need_resched_nonpreemptible:
5339 5422
5340 if (likely(prev != next)) { 5423 if (likely(prev != next)) {
5341 sched_info_switch(prev, next); 5424 sched_info_switch(prev, next);
5342 perf_counter_task_sched_out(prev, next, cpu); 5425 perf_event_task_sched_out(prev, next, cpu);
5343 5426
5344 rq->nr_switches++; 5427 rq->nr_switches++;
5345 rq->curr = next; 5428 rq->curr = next;
@@ -5355,6 +5438,8 @@ need_resched_nonpreemptible:
5355 } else 5438 } else
5356 spin_unlock_irq(&rq->lock); 5439 spin_unlock_irq(&rq->lock);
5357 5440
5441 post_schedule(rq);
5442
5358 if (unlikely(reacquire_kernel_lock(current) < 0)) 5443 if (unlikely(reacquire_kernel_lock(current) < 0))
5359 goto need_resched_nonpreemptible; 5444 goto need_resched_nonpreemptible;
5360 5445
@@ -5486,10 +5571,10 @@ asmlinkage void __sched preempt_schedule_irq(void)
5486 5571
5487#endif /* CONFIG_PREEMPT */ 5572#endif /* CONFIG_PREEMPT */
5488 5573
5489int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, 5574int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
5490 void *key) 5575 void *key)
5491{ 5576{
5492 return try_to_wake_up(curr->private, mode, sync); 5577 return try_to_wake_up(curr->private, mode, wake_flags);
5493} 5578}
5494EXPORT_SYMBOL(default_wake_function); 5579EXPORT_SYMBOL(default_wake_function);
5495 5580
@@ -5503,14 +5588,14 @@ EXPORT_SYMBOL(default_wake_function);
5503 * zero in this (rare) case, and we handle it by continuing to scan the queue. 5588 * zero in this (rare) case, and we handle it by continuing to scan the queue.
5504 */ 5589 */
5505static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 5590static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
5506 int nr_exclusive, int sync, void *key) 5591 int nr_exclusive, int wake_flags, void *key)
5507{ 5592{
5508 wait_queue_t *curr, *next; 5593 wait_queue_t *curr, *next;
5509 5594
5510 list_for_each_entry_safe(curr, next, &q->task_list, task_list) { 5595 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
5511 unsigned flags = curr->flags; 5596 unsigned flags = curr->flags;
5512 5597
5513 if (curr->func(curr, mode, sync, key) && 5598 if (curr->func(curr, mode, wake_flags, key) &&
5514 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) 5599 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
5515 break; 5600 break;
5516 } 5601 }
@@ -5571,16 +5656,16 @@ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
5571 int nr_exclusive, void *key) 5656 int nr_exclusive, void *key)
5572{ 5657{
5573 unsigned long flags; 5658 unsigned long flags;
5574 int sync = 1; 5659 int wake_flags = WF_SYNC;
5575 5660
5576 if (unlikely(!q)) 5661 if (unlikely(!q))
5577 return; 5662 return;
5578 5663
5579 if (unlikely(!nr_exclusive)) 5664 if (unlikely(!nr_exclusive))
5580 sync = 0; 5665 wake_flags = 0;
5581 5666
5582 spin_lock_irqsave(&q->lock, flags); 5667 spin_lock_irqsave(&q->lock, flags);
5583 __wake_up_common(q, mode, nr_exclusive, sync, key); 5668 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
5584 spin_unlock_irqrestore(&q->lock, flags); 5669 spin_unlock_irqrestore(&q->lock, flags);
5585} 5670}
5586EXPORT_SYMBOL_GPL(__wake_up_sync_key); 5671EXPORT_SYMBOL_GPL(__wake_up_sync_key);
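
The old int sync argument threaded through default_wake_function(), __wake_up_common() and try_to_wake_up() becomes a wake_flags bitmask, which is what allows wake_up_new_task() to pass WF_FORK through check_preempt_curr() earlier in this diff. A small userspace model of the flag plumbing (the WF_SYNC/WF_FORK values mirror the sched.h definitions introduced by this series):

#include <stdio.h>

#define WF_SYNC  0x01   /* waker goes to sleep after the wakeup */
#define WF_FORK  0x02   /* child wakeup after fork */

static void try_to_wake_up(const char *who, unsigned int mode, int wake_flags)
{
	(void)mode;     /* task state mask, unused in this sketch */
	printf("%s: sync=%d fork=%d\n", who,
	       !!(wake_flags & WF_SYNC), !!(wake_flags & WF_FORK));
}

int main(void)
{
	try_to_wake_up("plain wakeup", 0, 0);
	try_to_wake_up("sync wakeup", 0, WF_SYNC);   /* the old "sync = 1" case */
	try_to_wake_up("new child", 0, WF_FORK);     /* check_preempt_curr(rq, p, WF_FORK) */
	return 0;
}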
@@ -6100,17 +6185,25 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
6100 unsigned long flags; 6185 unsigned long flags;
6101 const struct sched_class *prev_class = p->sched_class; 6186 const struct sched_class *prev_class = p->sched_class;
6102 struct rq *rq; 6187 struct rq *rq;
6188 int reset_on_fork;
6103 6189
6104 /* may grab non-irq protected spin_locks */ 6190 /* may grab non-irq protected spin_locks */
6105 BUG_ON(in_interrupt()); 6191 BUG_ON(in_interrupt());
6106recheck: 6192recheck:
6107 /* double check policy once rq lock held */ 6193 /* double check policy once rq lock held */
6108 if (policy < 0) 6194 if (policy < 0) {
6195 reset_on_fork = p->sched_reset_on_fork;
6109 policy = oldpolicy = p->policy; 6196 policy = oldpolicy = p->policy;
6110 else if (policy != SCHED_FIFO && policy != SCHED_RR && 6197 } else {
6111 policy != SCHED_NORMAL && policy != SCHED_BATCH && 6198 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
6112 policy != SCHED_IDLE) 6199 policy &= ~SCHED_RESET_ON_FORK;
6113 return -EINVAL; 6200
6201 if (policy != SCHED_FIFO && policy != SCHED_RR &&
6202 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
6203 policy != SCHED_IDLE)
6204 return -EINVAL;
6205 }
6206
6114 /* 6207 /*
6115 * Valid priorities for SCHED_FIFO and SCHED_RR are 6208 * Valid priorities for SCHED_FIFO and SCHED_RR are
6116 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 6209 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
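
__sched_setscheduler() now lets callers OR SCHED_RESET_ON_FORK into the policy argument: the bit is latched into reset_on_fork, stripped off, and the remaining value is validated exactly as before (sched_getscheduler() later ORs it back into its return value). The same masking seen from the userspace side, using the flag value from linux/sched.h of this era:

#include <stdio.h>
#include <sched.h>

/* userspace headers of this era may not carry the definition yet */
#ifndef SCHED_RESET_ON_FORK
#define SCHED_RESET_ON_FORK 0x40000000
#endif

int main(void)
{
	int policy = SCHED_FIFO | SCHED_RESET_ON_FORK;   /* what a caller would pass in */

	int reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
	policy &= ~SCHED_RESET_ON_FORK;                  /* what the syscall strips off */

	printf("policy %d (SCHED_FIFO=%d), reset_on_fork=%d\n",
	       policy, SCHED_FIFO, reset_on_fork);
	return 0;
}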
@@ -6154,6 +6247,10 @@ recheck:
6154 /* can't change other user's priorities */ 6247 /* can't change other user's priorities */
6155 if (!check_same_owner(p)) 6248 if (!check_same_owner(p))
6156 return -EPERM; 6249 return -EPERM;
6250
6251 /* Normal users shall not reset the sched_reset_on_fork flag */
6252 if (p->sched_reset_on_fork && !reset_on_fork)
6253 return -EPERM;
6157 } 6254 }
6158 6255
6159 if (user) { 6256 if (user) {
@@ -6197,6 +6294,8 @@ recheck:
6197 if (running) 6294 if (running)
6198 p->sched_class->put_prev_task(rq, p); 6295 p->sched_class->put_prev_task(rq, p);
6199 6296
6297 p->sched_reset_on_fork = reset_on_fork;
6298
6200 oldprio = p->prio; 6299 oldprio = p->prio;
6201 __setscheduler(rq, p, policy, param->sched_priority); 6300 __setscheduler(rq, p, policy, param->sched_priority);
6202 6301
@@ -6313,14 +6412,15 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
6313 if (p) { 6412 if (p) {
6314 retval = security_task_getscheduler(p); 6413 retval = security_task_getscheduler(p);
6315 if (!retval) 6414 if (!retval)
6316 retval = p->policy; 6415 retval = p->policy
6416 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
6317 } 6417 }
6318 read_unlock(&tasklist_lock); 6418 read_unlock(&tasklist_lock);
6319 return retval; 6419 return retval;
6320} 6420}
6321 6421
6322/** 6422/**
6323 * sys_sched_getscheduler - get the RT priority of a thread 6423 * sys_sched_getparam - get the RT priority of a thread
6324 * @pid: the pid in question. 6424 * @pid: the pid in question.
6325 * @param: structure containing the RT priority. 6425 * @param: structure containing the RT priority.
6326 */ 6426 */
@@ -6541,27 +6641,21 @@ SYSCALL_DEFINE0(sched_yield)
6541 return 0; 6641 return 0;
6542} 6642}
6543 6643
6644static inline int should_resched(void)
6645{
6646 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
6647}
6648
6544static void __cond_resched(void) 6649static void __cond_resched(void)
6545{ 6650{
6546#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 6651 add_preempt_count(PREEMPT_ACTIVE);
6547 __might_sleep(__FILE__, __LINE__); 6652 schedule();
6548#endif 6653 sub_preempt_count(PREEMPT_ACTIVE);
6549 /*
6550 * The BKS might be reacquired before we have dropped
6551 * PREEMPT_ACTIVE, which could trigger a second
6552 * cond_resched() call.
6553 */
6554 do {
6555 add_preempt_count(PREEMPT_ACTIVE);
6556 schedule();
6557 sub_preempt_count(PREEMPT_ACTIVE);
6558 } while (need_resched());
6559} 6654}
6560 6655
6561int __sched _cond_resched(void) 6656int __sched _cond_resched(void)
6562{ 6657{
6563 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && 6658 if (should_resched()) {
6564 system_state == SYSTEM_RUNNING) {
6565 __cond_resched(); 6659 __cond_resched();
6566 return 1; 6660 return 1;
6567 } 6661 }
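
_cond_resched(), cond_resched_lock() and cond_resched_softirq() now share a single should_resched() helper, dropping both the SYSTEM_RUNNING test and the old re-check loop in __cond_resched(). The PREEMPT_ACTIVE bit in preempt_count is what keeps a cond_resched()-initiated schedule() from being treated as a voluntary sleep and from recursing through a nested should_resched(). A toy userspace model of just that control flow (every name here, including the PREEMPT_ACTIVE value, is a stand-in, not kernel code):

#include <stdio.h>

#define PREEMPT_ACTIVE 0x10000000
static unsigned int preempt_count;
static int need_resched_flag;

static int should_resched(void)
{
	return need_resched_flag && !(preempt_count & PREEMPT_ACTIVE);
}

static void schedule(void)
{
	printf("schedule() called\n");
	need_resched_flag = 0;
}

static void __cond_resched(void)
{
	/* while PREEMPT_ACTIVE is set, a nested should_resched() returns false */
	preempt_count += PREEMPT_ACTIVE;
	schedule();
	preempt_count -= PREEMPT_ACTIVE;
}

int main(void)
{
	need_resched_flag = 1;
	if (should_resched())
		__cond_resched();
	printf("after: should_resched() = %d\n", should_resched());
	return 0;
}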
@@ -6570,21 +6664,23 @@ int __sched _cond_resched(void)
6570EXPORT_SYMBOL(_cond_resched); 6664EXPORT_SYMBOL(_cond_resched);
6571 6665
6572/* 6666/*
6573 * cond_resched_lock() - if a reschedule is pending, drop the given lock, 6667 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
6574 * call schedule, and on return reacquire the lock. 6668 * call schedule, and on return reacquire the lock.
6575 * 6669 *
6576 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level 6670 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
6577 * operations here to prevent schedule() from being called twice (once via 6671 * operations here to prevent schedule() from being called twice (once via
6578 * spin_unlock(), once by hand). 6672 * spin_unlock(), once by hand).
6579 */ 6673 */
6580int cond_resched_lock(spinlock_t *lock) 6674int __cond_resched_lock(spinlock_t *lock)
6581{ 6675{
6582 int resched = need_resched() && system_state == SYSTEM_RUNNING; 6676 int resched = should_resched();
6583 int ret = 0; 6677 int ret = 0;
6584 6678
6679 lockdep_assert_held(lock);
6680
6585 if (spin_needbreak(lock) || resched) { 6681 if (spin_needbreak(lock) || resched) {
6586 spin_unlock(lock); 6682 spin_unlock(lock);
6587 if (resched && need_resched()) 6683 if (resched)
6588 __cond_resched(); 6684 __cond_resched();
6589 else 6685 else
6590 cpu_relax(); 6686 cpu_relax();
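
__cond_resched_lock() keeps its established shape, now with a lockdep_assert_held() sanity check and without the redundant need_resched() retest: drop the lock, reschedule (or just cpu_relax() if only the lock was contended), then take the lock back before returning 1. A crude pthread analogue of the drop/yield/reacquire step, to show how a caller holding a lock across a long loop would use it (all names invented):

#include <pthread.h>
#include <sched.h>
#include <stdio.h>

static pthread_mutex_t big_lock = PTHREAD_MUTEX_INITIALIZER;

/* simplified analogue of __cond_resched_lock(): drop the lock, give the
 * CPU away, take the lock back.  Returns 1 because it always yields. */
static int cond_resched_lock(pthread_mutex_t *lock)
{
	pthread_mutex_unlock(lock);
	sched_yield();                  /* stands in for __cond_resched() */
	pthread_mutex_lock(lock);
	return 1;
}

int main(void)
{
	pthread_mutex_lock(&big_lock);
	for (int i = 0; i < 1000; i++) {
		/* long lock-held loop; break the latency every so often */
		if (i % 100 == 99)
			cond_resched_lock(&big_lock);
	}
	pthread_mutex_unlock(&big_lock);
	printf("done\n");
	return 0;
}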
@@ -6593,13 +6689,13 @@ int cond_resched_lock(spinlock_t *lock)
6593 } 6689 }
6594 return ret; 6690 return ret;
6595} 6691}
6596EXPORT_SYMBOL(cond_resched_lock); 6692EXPORT_SYMBOL(__cond_resched_lock);
6597 6693
6598int __sched cond_resched_softirq(void) 6694int __sched __cond_resched_softirq(void)
6599{ 6695{
6600 BUG_ON(!in_softirq()); 6696 BUG_ON(!in_softirq());
6601 6697
6602 if (need_resched() && system_state == SYSTEM_RUNNING) { 6698 if (should_resched()) {
6603 local_bh_enable(); 6699 local_bh_enable();
6604 __cond_resched(); 6700 __cond_resched();
6605 local_bh_disable(); 6701 local_bh_disable();
@@ -6607,7 +6703,7 @@ int __sched cond_resched_softirq(void)
6607 } 6703 }
6608 return 0; 6704 return 0;
6609} 6705}
6610EXPORT_SYMBOL(cond_resched_softirq); 6706EXPORT_SYMBOL(__cond_resched_softirq);
6611 6707
6612/** 6708/**
6613 * yield - yield the current processor to other threads. 6709 * yield - yield the current processor to other threads.
@@ -6631,11 +6727,13 @@ EXPORT_SYMBOL(yield);
6631 */ 6727 */
6632void __sched io_schedule(void) 6728void __sched io_schedule(void)
6633{ 6729{
6634 struct rq *rq = &__raw_get_cpu_var(runqueues); 6730 struct rq *rq = raw_rq();
6635 6731
6636 delayacct_blkio_start(); 6732 delayacct_blkio_start();
6637 atomic_inc(&rq->nr_iowait); 6733 atomic_inc(&rq->nr_iowait);
6734 current->in_iowait = 1;
6638 schedule(); 6735 schedule();
6736 current->in_iowait = 0;
6639 atomic_dec(&rq->nr_iowait); 6737 atomic_dec(&rq->nr_iowait);
6640 delayacct_blkio_end(); 6738 delayacct_blkio_end();
6641} 6739}
@@ -6643,12 +6741,14 @@ EXPORT_SYMBOL(io_schedule);
6643 6741
6644long __sched io_schedule_timeout(long timeout) 6742long __sched io_schedule_timeout(long timeout)
6645{ 6743{
6646 struct rq *rq = &__raw_get_cpu_var(runqueues); 6744 struct rq *rq = raw_rq();
6647 long ret; 6745 long ret;
6648 6746
6649 delayacct_blkio_start(); 6747 delayacct_blkio_start();
6650 atomic_inc(&rq->nr_iowait); 6748 atomic_inc(&rq->nr_iowait);
6749 current->in_iowait = 1;
6651 ret = schedule_timeout(timeout); 6750 ret = schedule_timeout(timeout);
6751 current->in_iowait = 0;
6652 atomic_dec(&rq->nr_iowait); 6752 atomic_dec(&rq->nr_iowait);
6653 delayacct_blkio_end(); 6753 delayacct_blkio_end();
6654 return ret; 6754 return ret;
@@ -6732,23 +6832,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6732 if (retval) 6832 if (retval)
6733 goto out_unlock; 6833 goto out_unlock;
6734 6834
6735 /* 6835 time_slice = p->sched_class->get_rr_interval(p);
6736 * Time slice is 0 for SCHED_FIFO tasks and for SCHED_OTHER
6737 * tasks that are on an otherwise idle runqueue:
6738 */
6739 time_slice = 0;
6740 if (p->policy == SCHED_RR) {
6741 time_slice = DEF_TIMESLICE;
6742 } else if (p->policy != SCHED_FIFO) {
6743 struct sched_entity *se = &p->se;
6744 unsigned long flags;
6745 struct rq *rq;
6746 6836
6747 rq = task_rq_lock(p, &flags);
6748 if (rq->cfs.load.weight)
6749 time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
6750 task_rq_unlock(rq, &flags);
6751 }
6752 read_unlock(&tasklist_lock); 6837 read_unlock(&tasklist_lock);
6753 jiffies_to_timespec(time_slice, &t); 6838 jiffies_to_timespec(time_slice, &t);
6754 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 6839 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
@@ -6965,8 +7050,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
6965 7050
6966 if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { 7051 if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
6967 /* Need help from migration thread: drop lock and wait. */ 7052 /* Need help from migration thread: drop lock and wait. */
7053 struct task_struct *mt = rq->migration_thread;
7054
7055 get_task_struct(mt);
6968 task_rq_unlock(rq, &flags); 7056 task_rq_unlock(rq, &flags);
6969 wake_up_process(rq->migration_thread); 7057 wake_up_process(rq->migration_thread);
7058 put_task_struct(mt);
6970 wait_for_completion(&req.done); 7059 wait_for_completion(&req.done);
6971 tlb_migrate_finish(p->mm); 7060 tlb_migrate_finish(p->mm);
6972 return 0; 7061 return 0;
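
The set_cpus_allowed_ptr() hunk pins rq->migration_thread with get_task_struct() while rq->lock still guarantees the thread exists, so the wake_up_process() issued after task_rq_unlock() cannot race with the thread being torn down on CPU unplug. A toy refcount showing the take-reference-before-unlock ordering (the struct and counter here are stand-ins, not the real task_struct):

#include <stdio.h>

struct task_ref {
	int usage;
	const char *comm;
};

static void get_task_struct(struct task_ref *t) { t->usage++; }

static void put_task_struct(struct task_ref *t)
{
	if (--t->usage == 0)
		printf("%s: last reference dropped, freeing\n", t->comm);
}

int main(void)
{
	struct task_ref migration_thread = { .usage = 1, .comm = "migration/0" };

	get_task_struct(&migration_thread);   /* taken while the lock still protects the pointer */
	/* ... task_rq_unlock(); wake_up_process(mt); ... */
	put_task_struct(&migration_thread);   /* drop our pin */
	put_task_struct(&migration_thread);   /* e.g. the hotplug path's final put */
	return 0;
}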
@@ -7024,6 +7113,11 @@ fail:
7024 return ret; 7113 return ret;
7025} 7114}
7026 7115
7116#define RCU_MIGRATION_IDLE 0
7117#define RCU_MIGRATION_NEED_QS 1
7118#define RCU_MIGRATION_GOT_QS 2
7119#define RCU_MIGRATION_MUST_SYNC 3
7120
7027/* 7121/*
7028 * migration_thread - this is a highprio system thread that performs 7122 * migration_thread - this is a highprio system thread that performs
7029 * thread migration by bumping thread off CPU then 'pushing' onto 7123 * thread migration by bumping thread off CPU then 'pushing' onto
@@ -7031,6 +7125,7 @@ fail:
7031 */ 7125 */
7032static int migration_thread(void *data) 7126static int migration_thread(void *data)
7033{ 7127{
7128 int badcpu;
7034 int cpu = (long)data; 7129 int cpu = (long)data;
7035 struct rq *rq; 7130 struct rq *rq;
7036 7131
@@ -7065,8 +7160,17 @@ static int migration_thread(void *data)
7065 req = list_entry(head->next, struct migration_req, list); 7160 req = list_entry(head->next, struct migration_req, list);
7066 list_del_init(head->next); 7161 list_del_init(head->next);
7067 7162
7068 spin_unlock(&rq->lock); 7163 if (req->task != NULL) {
7069 __migrate_task(req->task, cpu, req->dest_cpu); 7164 spin_unlock(&rq->lock);
7165 __migrate_task(req->task, cpu, req->dest_cpu);
7166 } else if (likely(cpu == (badcpu = smp_processor_id()))) {
7167 req->dest_cpu = RCU_MIGRATION_GOT_QS;
7168 spin_unlock(&rq->lock);
7169 } else {
7170 req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
7171 spin_unlock(&rq->lock);
7172 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
7173 }
7070 local_irq_enable(); 7174 local_irq_enable();
7071 7175
7072 complete(&req->done); 7176 complete(&req->done);
@@ -7262,6 +7366,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
7262static void calc_global_load_remove(struct rq *rq) 7366static void calc_global_load_remove(struct rq *rq)
7263{ 7367{
7264 atomic_long_sub(rq->calc_load_active, &calc_load_tasks); 7368 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
7369 rq->calc_load_active = 0;
7265} 7370}
7266#endif /* CONFIG_HOTPLUG_CPU */ 7371#endif /* CONFIG_HOTPLUG_CPU */
7267 7372
@@ -7488,6 +7593,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7488 task_rq_unlock(rq, &flags); 7593 task_rq_unlock(rq, &flags);
7489 get_task_struct(p); 7594 get_task_struct(p);
7490 cpu_rq(cpu)->migration_thread = p; 7595 cpu_rq(cpu)->migration_thread = p;
7596 rq->calc_load_update = calc_load_update;
7491 break; 7597 break;
7492 7598
7493 case CPU_ONLINE: 7599 case CPU_ONLINE:
@@ -7498,8 +7604,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7498 /* Update our root-domain */ 7604 /* Update our root-domain */
7499 rq = cpu_rq(cpu); 7605 rq = cpu_rq(cpu);
7500 spin_lock_irqsave(&rq->lock, flags); 7606 spin_lock_irqsave(&rq->lock, flags);
7501 rq->calc_load_update = calc_load_update;
7502 rq->calc_load_active = 0;
7503 if (rq->rd) { 7607 if (rq->rd) {
7504 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 7608 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
7505 7609
@@ -7580,7 +7684,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7580/* 7684/*
7581 * Register at high priority so that task migration (migrate_all_tasks) 7685 * Register at high priority so that task migration (migrate_all_tasks)
7582 * happens before everything else. This has to be lower priority than 7686 * happens before everything else. This has to be lower priority than
7583 * the notifier in the perf_counter subsystem, though. 7687 * the notifier in the perf_event subsystem, though.
7584 */ 7688 */
7585static struct notifier_block __cpuinitdata migration_notifier = { 7689static struct notifier_block __cpuinitdata migration_notifier = {
7586 .notifier_call = migration_call, 7690 .notifier_call = migration_call,
@@ -7598,7 +7702,7 @@ static int __init migration_init(void)
7598 migration_call(&migration_notifier, CPU_ONLINE, cpu); 7702 migration_call(&migration_notifier, CPU_ONLINE, cpu);
7599 register_cpu_notifier(&migration_notifier); 7703 register_cpu_notifier(&migration_notifier);
7600 7704
7601 return err; 7705 return 0;
7602} 7706}
7603early_initcall(migration_init); 7707early_initcall(migration_init);
7604#endif 7708#endif
@@ -7645,7 +7749,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
7645 break; 7749 break;
7646 } 7750 }
7647 7751
7648 if (!group->__cpu_power) { 7752 if (!group->cpu_power) {
7649 printk(KERN_CONT "\n"); 7753 printk(KERN_CONT "\n");
7650 printk(KERN_ERR "ERROR: domain->cpu_power not " 7754 printk(KERN_ERR "ERROR: domain->cpu_power not "
7651 "set\n"); 7755 "set\n");
@@ -7669,9 +7773,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
7669 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 7773 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
7670 7774
7671 printk(KERN_CONT " %s", str); 7775 printk(KERN_CONT " %s", str);
7672 if (group->__cpu_power != SCHED_LOAD_SCALE) { 7776 if (group->cpu_power != SCHED_LOAD_SCALE) {
7673 printk(KERN_CONT " (__cpu_power = %d)", 7777 printk(KERN_CONT " (cpu_power = %d)",
7674 group->__cpu_power); 7778 group->cpu_power);
7675 } 7779 }
7676 7780
7677 group = group->next; 7781 group = group->next;
@@ -7736,9 +7840,7 @@ static int sd_degenerate(struct sched_domain *sd)
7736 } 7840 }
7737 7841
7738 /* Following flags don't use groups */ 7842 /* Following flags don't use groups */
7739 if (sd->flags & (SD_WAKE_IDLE | 7843 if (sd->flags & (SD_WAKE_AFFINE))
7740 SD_WAKE_AFFINE |
7741 SD_WAKE_BALANCE))
7742 return 0; 7844 return 0;
7743 7845
7744 return 1; 7846 return 1;
@@ -7755,10 +7857,6 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
7755 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) 7857 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
7756 return 0; 7858 return 0;
7757 7859
7758 /* Does parent contain flags not in child? */
7759 /* WAKE_BALANCE is a subset of WAKE_AFFINE */
7760 if (cflags & SD_WAKE_AFFINE)
7761 pflags &= ~SD_WAKE_BALANCE;
7762 /* Flags needing groups don't count if only 1 group in parent */ 7860 /* Flags needing groups don't count if only 1 group in parent */
7763 if (parent->groups == parent->groups->next) { 7861 if (parent->groups == parent->groups->next) {
7764 pflags &= ~(SD_LOAD_BALANCE | 7862 pflags &= ~(SD_LOAD_BALANCE |
@@ -7814,7 +7912,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
7814 rq->rd = rd; 7912 rq->rd = rd;
7815 7913
7816 cpumask_set_cpu(rq->cpu, rd->span); 7914 cpumask_set_cpu(rq->cpu, rd->span);
7817 if (cpumask_test_cpu(rq->cpu, cpu_online_mask)) 7915 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
7818 set_rq_online(rq); 7916 set_rq_online(rq);
7819 7917
7820 spin_unlock_irqrestore(&rq->lock, flags); 7918 spin_unlock_irqrestore(&rq->lock, flags);
@@ -7956,7 +8054,7 @@ init_sched_build_groups(const struct cpumask *span,
7956 continue; 8054 continue;
7957 8055
7958 cpumask_clear(sched_group_cpus(sg)); 8056 cpumask_clear(sched_group_cpus(sg));
7959 sg->__cpu_power = 0; 8057 sg->cpu_power = 0;
7960 8058
7961 for_each_cpu(j, span) { 8059 for_each_cpu(j, span) {
7962 if (group_fn(j, cpu_map, NULL, tmpmask) != group) 8060 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
@@ -8064,6 +8162,39 @@ struct static_sched_domain {
8064 DECLARE_BITMAP(span, CONFIG_NR_CPUS); 8162 DECLARE_BITMAP(span, CONFIG_NR_CPUS);
8065}; 8163};
8066 8164
8165struct s_data {
8166#ifdef CONFIG_NUMA
8167 int sd_allnodes;
8168 cpumask_var_t domainspan;
8169 cpumask_var_t covered;
8170 cpumask_var_t notcovered;
8171#endif
8172 cpumask_var_t nodemask;
8173 cpumask_var_t this_sibling_map;
8174 cpumask_var_t this_core_map;
8175 cpumask_var_t send_covered;
8176 cpumask_var_t tmpmask;
8177 struct sched_group **sched_group_nodes;
8178 struct root_domain *rd;
8179};
8180
8181enum s_alloc {
8182 sa_sched_groups = 0,
8183 sa_rootdomain,
8184 sa_tmpmask,
8185 sa_send_covered,
8186 sa_this_core_map,
8187 sa_this_sibling_map,
8188 sa_nodemask,
8189 sa_sched_group_nodes,
8190#ifdef CONFIG_NUMA
8191 sa_notcovered,
8192 sa_covered,
8193 sa_domainspan,
8194#endif
8195 sa_none,
8196};
8197
8067/* 8198/*
8068 * SMT sched-domains: 8199 * SMT sched-domains:
8069 */ 8200 */
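
The new struct s_data and enum s_alloc turn domain construction into a staged allocation: the allocator returns the deepest stage it completed, and the matching teardown helper (added further down as __free_domain_allocs()) switches on that stage and falls through, freeing everything in reverse order from a single error path. A compilable miniature of the same pattern, reduced to two allocations and invented sa_* names:

#include <stdio.h>
#include <stdlib.h>

enum stage { sa_second = 0, sa_first, sa_none };

struct ctx { void *first, *second; };

/* returns the deepest stage that is allocated and must be undone */
static enum stage alloc_stages(struct ctx *c)
{
	if (!(c->first = malloc(64)))
		return sa_none;                 /* nothing to undo */
	if (!(c->second = malloc(64)))
		return sa_first;                /* undo "first" only */
	return sa_second;                       /* everything allocated */
}

static void free_stages(struct ctx *c, enum stage what)
{
	switch (what) {
	case sa_second:
		free(c->second);                /* fall through */
	case sa_first:
		free(c->first);                 /* fall through */
	case sa_none:
		break;
	}
}

int main(void)
{
	struct ctx c = { NULL, NULL };
	enum stage got = alloc_stages(&c);

	if (got == sa_second)
		printf("all stages allocated, doing work\n");
	else
		printf("failed at stage %d, unwinding\n", got);

	free_stages(&c, got);
	return 0;
}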
@@ -8181,11 +8312,76 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
8181 continue; 8312 continue;
8182 } 8313 }
8183 8314
8184 sg_inc_cpu_power(sg, sd->groups->__cpu_power); 8315 sg->cpu_power += sd->groups->cpu_power;
8185 } 8316 }
8186 sg = sg->next; 8317 sg = sg->next;
8187 } while (sg != group_head); 8318 } while (sg != group_head);
8188} 8319}
8320
8321static int build_numa_sched_groups(struct s_data *d,
8322 const struct cpumask *cpu_map, int num)
8323{
8324 struct sched_domain *sd;
8325 struct sched_group *sg, *prev;
8326 int n, j;
8327
8328 cpumask_clear(d->covered);
8329 cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
8330 if (cpumask_empty(d->nodemask)) {
8331 d->sched_group_nodes[num] = NULL;
8332 goto out;
8333 }
8334
8335 sched_domain_node_span(num, d->domainspan);
8336 cpumask_and(d->domainspan, d->domainspan, cpu_map);
8337
8338 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
8339 GFP_KERNEL, num);
8340 if (!sg) {
8341 printk(KERN_WARNING "Can not alloc domain group for node %d\n",
8342 num);
8343 return -ENOMEM;
8344 }
8345 d->sched_group_nodes[num] = sg;
8346
8347 for_each_cpu(j, d->nodemask) {
8348 sd = &per_cpu(node_domains, j).sd;
8349 sd->groups = sg;
8350 }
8351
8352 sg->cpu_power = 0;
8353 cpumask_copy(sched_group_cpus(sg), d->nodemask);
8354 sg->next = sg;
8355 cpumask_or(d->covered, d->covered, d->nodemask);
8356
8357 prev = sg;
8358 for (j = 0; j < nr_node_ids; j++) {
8359 n = (num + j) % nr_node_ids;
8360 cpumask_complement(d->notcovered, d->covered);
8361 cpumask_and(d->tmpmask, d->notcovered, cpu_map);
8362 cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
8363 if (cpumask_empty(d->tmpmask))
8364 break;
8365 cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
8366 if (cpumask_empty(d->tmpmask))
8367 continue;
8368 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
8369 GFP_KERNEL, num);
8370 if (!sg) {
8371 printk(KERN_WARNING
8372 "Can not alloc domain group for node %d\n", j);
8373 return -ENOMEM;
8374 }
8375 sg->cpu_power = 0;
8376 cpumask_copy(sched_group_cpus(sg), d->tmpmask);
8377 sg->next = prev->next;
8378 cpumask_or(d->covered, d->covered, d->tmpmask);
8379 prev->next = sg;
8380 prev = sg;
8381 }
8382out:
8383 return 0;
8384}
8189#endif /* CONFIG_NUMA */ 8385#endif /* CONFIG_NUMA */
8190 8386
8191#ifdef CONFIG_NUMA 8387#ifdef CONFIG_NUMA
@@ -8239,15 +8435,13 @@ static void free_sched_groups(const struct cpumask *cpu_map,
8239 * there are asymmetries in the topology. If there are asymmetries, group 8435 * there are asymmetries in the topology. If there are asymmetries, group
8240 * having more cpu_power will pickup more load compared to the group having 8436 * having more cpu_power will pickup more load compared to the group having
8241 * less cpu_power. 8437 * less cpu_power.
8242 *
8243 * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
8244 * the maximum number of tasks a group can handle in the presence of other idle
8245 * or lightly loaded groups in the same sched domain.
8246 */ 8438 */
8247static void init_sched_groups_power(int cpu, struct sched_domain *sd) 8439static void init_sched_groups_power(int cpu, struct sched_domain *sd)
8248{ 8440{
8249 struct sched_domain *child; 8441 struct sched_domain *child;
8250 struct sched_group *group; 8442 struct sched_group *group;
8443 long power;
8444 int weight;
8251 8445
8252 WARN_ON(!sd || !sd->groups); 8446 WARN_ON(!sd || !sd->groups);
8253 8447
@@ -8256,28 +8450,32 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
8256 8450
8257 child = sd->child; 8451 child = sd->child;
8258 8452
8259 sd->groups->__cpu_power = 0; 8453 sd->groups->cpu_power = 0;
8260 8454
8261 /* 8455 if (!child) {
8262 * For perf policy, if the groups in child domain share resources 8456 power = SCHED_LOAD_SCALE;
8263 * (for example cores sharing some portions of the cache hierarchy 8457 weight = cpumask_weight(sched_domain_span(sd));
8264 * or SMT), then set this domain groups cpu_power such that each group 8458 /*
8265 * can handle only one task, when there are other idle groups in the 8459 * SMT siblings share the power of a single core.
8266 * same sched domain. 8460 * Usually multiple threads get a better yield out of
8267 */ 8461 * that one core than a single thread would have,
8268 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && 8462 * reflect that in sd->smt_gain.
8269 (child->flags & 8463 */
8270 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { 8464 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
8271 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE); 8465 power *= sd->smt_gain;
8466 power /= weight;
8467 power >>= SCHED_LOAD_SHIFT;
8468 }
8469 sd->groups->cpu_power += power;
8272 return; 8470 return;
8273 } 8471 }
8274 8472
8275 /* 8473 /*
8276 * add cpu_power of each child group to this groups cpu_power 8474 * Add cpu_power of each child group to this groups cpu_power.
8277 */ 8475 */
8278 group = child->groups; 8476 group = child->groups;
8279 do { 8477 do {
8280 sg_inc_cpu_power(sd->groups, group->__cpu_power); 8478 sd->groups->cpu_power += group->cpu_power;
8281 group = group->next; 8479 group = group->next;
8282 } while (group != child->groups); 8480 } while (group != child->groups);
8283} 8481}
@@ -8344,287 +8542,292 @@ static void set_domain_attribute(struct sched_domain *sd,
8344 request = attr->relax_domain_level; 8542 request = attr->relax_domain_level;
8345 if (request < sd->level) { 8543 if (request < sd->level) {
8346 /* turn off idle balance on this domain */ 8544 /* turn off idle balance on this domain */
8347 sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE); 8545 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
8348 } else { 8546 } else {
8349 /* turn on idle balance on this domain */ 8547 /* turn on idle balance on this domain */
8350 sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE); 8548 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
8549 }
8550}
8551
8552static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
8553 const struct cpumask *cpu_map)
8554{
8555 switch (what) {
8556 case sa_sched_groups:
8557 free_sched_groups(cpu_map, d->tmpmask); /* fall through */
8558 d->sched_group_nodes = NULL;
8559 case sa_rootdomain:
8560 free_rootdomain(d->rd); /* fall through */
8561 case sa_tmpmask:
8562 free_cpumask_var(d->tmpmask); /* fall through */
8563 case sa_send_covered:
8564 free_cpumask_var(d->send_covered); /* fall through */
8565 case sa_this_core_map:
8566 free_cpumask_var(d->this_core_map); /* fall through */
8567 case sa_this_sibling_map:
8568 free_cpumask_var(d->this_sibling_map); /* fall through */
8569 case sa_nodemask:
8570 free_cpumask_var(d->nodemask); /* fall through */
8571 case sa_sched_group_nodes:
8572#ifdef CONFIG_NUMA
8573 kfree(d->sched_group_nodes); /* fall through */
8574 case sa_notcovered:
8575 free_cpumask_var(d->notcovered); /* fall through */
8576 case sa_covered:
8577 free_cpumask_var(d->covered); /* fall through */
8578 case sa_domainspan:
8579 free_cpumask_var(d->domainspan); /* fall through */
8580#endif
8581 case sa_none:
8582 break;
8351 } 8583 }
8352} 8584}
8353 8585
8354/* 8586static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
8355 * Build sched domains for a given set of cpus and attach the sched domains 8587 const struct cpumask *cpu_map)
8356 * to the individual cpus
8357 */
8358static int __build_sched_domains(const struct cpumask *cpu_map,
8359 struct sched_domain_attr *attr)
8360{ 8588{
8361 int i, err = -ENOMEM;
8362 struct root_domain *rd;
8363 cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered,
8364 tmpmask;
8365#ifdef CONFIG_NUMA 8589#ifdef CONFIG_NUMA
8366 cpumask_var_t domainspan, covered, notcovered; 8590 if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL))
8367 struct sched_group **sched_group_nodes = NULL; 8591 return sa_none;
8368 int sd_allnodes = 0; 8592 if (!alloc_cpumask_var(&d->covered, GFP_KERNEL))
8369 8593 return sa_domainspan;
8370 if (!alloc_cpumask_var(&domainspan, GFP_KERNEL)) 8594 if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL))
8371 goto out; 8595 return sa_covered;
8372 if (!alloc_cpumask_var(&covered, GFP_KERNEL)) 8596 /* Allocate the per-node list of sched groups */
8373 goto free_domainspan; 8597 d->sched_group_nodes = kcalloc(nr_node_ids,
8374 if (!alloc_cpumask_var(&notcovered, GFP_KERNEL)) 8598 sizeof(struct sched_group *), GFP_KERNEL);
8375 goto free_covered; 8599 if (!d->sched_group_nodes) {
8376#endif
8377
8378 if (!alloc_cpumask_var(&nodemask, GFP_KERNEL))
8379 goto free_notcovered;
8380 if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL))
8381 goto free_nodemask;
8382 if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL))
8383 goto free_this_sibling_map;
8384 if (!alloc_cpumask_var(&send_covered, GFP_KERNEL))
8385 goto free_this_core_map;
8386 if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
8387 goto free_send_covered;
8388
8389#ifdef CONFIG_NUMA
8390 /*
8391 * Allocate the per-node list of sched groups
8392 */
8393 sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *),
8394 GFP_KERNEL);
8395 if (!sched_group_nodes) {
8396 printk(KERN_WARNING "Can not alloc sched group node list\n"); 8600 printk(KERN_WARNING "Can not alloc sched group node list\n");
8397 goto free_tmpmask; 8601 return sa_notcovered;
8398 } 8602 }
8399#endif 8603 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
8400 8604#endif
8401 rd = alloc_rootdomain(); 8605 if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
8402 if (!rd) { 8606 return sa_sched_group_nodes;
8607 if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
8608 return sa_nodemask;
8609 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
8610 return sa_this_sibling_map;
8611 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
8612 return sa_this_core_map;
8613 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
8614 return sa_send_covered;
8615 d->rd = alloc_rootdomain();
8616 if (!d->rd) {
8403 printk(KERN_WARNING "Cannot alloc root domain\n"); 8617 printk(KERN_WARNING "Cannot alloc root domain\n");
8404 goto free_sched_groups; 8618 return sa_tmpmask;
8405 } 8619 }
8620 return sa_rootdomain;
8621}
8406 8622
8623static struct sched_domain *__build_numa_sched_domains(struct s_data *d,
8624 const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i)
8625{
8626 struct sched_domain *sd = NULL;
8407#ifdef CONFIG_NUMA 8627#ifdef CONFIG_NUMA
8408 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes; 8628 struct sched_domain *parent;
8409#endif
8410
8411 /*
8412 * Set up domains for cpus specified by the cpu_map.
8413 */
8414 for_each_cpu(i, cpu_map) {
8415 struct sched_domain *sd = NULL, *p;
8416
8417 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map);
8418
8419#ifdef CONFIG_NUMA
8420 if (cpumask_weight(cpu_map) >
8421 SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) {
8422 sd = &per_cpu(allnodes_domains, i).sd;
8423 SD_INIT(sd, ALLNODES);
8424 set_domain_attribute(sd, attr);
8425 cpumask_copy(sched_domain_span(sd), cpu_map);
8426 cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
8427 p = sd;
8428 sd_allnodes = 1;
8429 } else
8430 p = NULL;
8431 8629
8432 sd = &per_cpu(node_domains, i).sd; 8630 d->sd_allnodes = 0;
8433 SD_INIT(sd, NODE); 8631 if (cpumask_weight(cpu_map) >
8632 SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
8633 sd = &per_cpu(allnodes_domains, i).sd;
8634 SD_INIT(sd, ALLNODES);
8434 set_domain_attribute(sd, attr); 8635 set_domain_attribute(sd, attr);
8435 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); 8636 cpumask_copy(sched_domain_span(sd), cpu_map);
8436 sd->parent = p; 8637 cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
8437 if (p) 8638 d->sd_allnodes = 1;
8438 p->child = sd; 8639 }
8439 cpumask_and(sched_domain_span(sd), 8640 parent = sd;
8440 sched_domain_span(sd), cpu_map); 8641
8642 sd = &per_cpu(node_domains, i).sd;
8643 SD_INIT(sd, NODE);
8644 set_domain_attribute(sd, attr);
8645 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
8646 sd->parent = parent;
8647 if (parent)
8648 parent->child = sd;
8649 cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
8441#endif 8650#endif
8651 return sd;
8652}
8442 8653
8443 p = sd; 8654static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
8444 sd = &per_cpu(phys_domains, i).sd; 8655 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
8445 SD_INIT(sd, CPU); 8656 struct sched_domain *parent, int i)
8446 set_domain_attribute(sd, attr); 8657{
8447 cpumask_copy(sched_domain_span(sd), nodemask); 8658 struct sched_domain *sd;
8448 sd->parent = p; 8659 sd = &per_cpu(phys_domains, i).sd;
8449 if (p) 8660 SD_INIT(sd, CPU);
8450 p->child = sd; 8661 set_domain_attribute(sd, attr);
8451 cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask); 8662 cpumask_copy(sched_domain_span(sd), d->nodemask);
8663 sd->parent = parent;
8664 if (parent)
8665 parent->child = sd;
8666 cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
8667 return sd;
8668}
8452 8669
8670static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
8671 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
8672 struct sched_domain *parent, int i)
8673{
8674 struct sched_domain *sd = parent;
8453#ifdef CONFIG_SCHED_MC 8675#ifdef CONFIG_SCHED_MC
8454 p = sd; 8676 sd = &per_cpu(core_domains, i).sd;
8455 sd = &per_cpu(core_domains, i).sd; 8677 SD_INIT(sd, MC);
8456 SD_INIT(sd, MC); 8678 set_domain_attribute(sd, attr);
8457 set_domain_attribute(sd, attr); 8679 cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
8458 cpumask_and(sched_domain_span(sd), cpu_map, 8680 sd->parent = parent;
8459 cpu_coregroup_mask(i)); 8681 parent->child = sd;
8460 sd->parent = p; 8682 cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
8461 p->child = sd;
8462 cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
8463#endif 8683#endif
8684 return sd;
8685}
8464 8686
8687static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
8688 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
8689 struct sched_domain *parent, int i)
8690{
8691 struct sched_domain *sd = parent;
8465#ifdef CONFIG_SCHED_SMT 8692#ifdef CONFIG_SCHED_SMT
8466 p = sd; 8693 sd = &per_cpu(cpu_domains, i).sd;
8467 sd = &per_cpu(cpu_domains, i).sd; 8694 SD_INIT(sd, SIBLING);
8468 SD_INIT(sd, SIBLING); 8695 set_domain_attribute(sd, attr);
8469 set_domain_attribute(sd, attr); 8696 cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
8470 cpumask_and(sched_domain_span(sd), 8697 sd->parent = parent;
8471 topology_thread_cpumask(i), cpu_map); 8698 parent->child = sd;
8472 sd->parent = p; 8699 cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
8473 p->child = sd;
8474 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
8475#endif 8700#endif
8476 } 8701 return sd;
8702}
8477 8703
8704static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
8705 const struct cpumask *cpu_map, int cpu)
8706{
8707 switch (l) {
8478#ifdef CONFIG_SCHED_SMT 8708#ifdef CONFIG_SCHED_SMT
8479 /* Set up CPU (sibling) groups */ 8709 case SD_LV_SIBLING: /* set up CPU (sibling) groups */
8480 for_each_cpu(i, cpu_map) { 8710 cpumask_and(d->this_sibling_map, cpu_map,
8481 cpumask_and(this_sibling_map, 8711 topology_thread_cpumask(cpu));
8482 topology_thread_cpumask(i), cpu_map); 8712 if (cpu == cpumask_first(d->this_sibling_map))
8483 if (i != cpumask_first(this_sibling_map)) 8713 init_sched_build_groups(d->this_sibling_map, cpu_map,
8484 continue; 8714 &cpu_to_cpu_group,
8485 8715 d->send_covered, d->tmpmask);
8486 init_sched_build_groups(this_sibling_map, cpu_map, 8716 break;
8487 &cpu_to_cpu_group,
8488 send_covered, tmpmask);
8489 }
8490#endif 8717#endif
8491
8492#ifdef CONFIG_SCHED_MC 8718#ifdef CONFIG_SCHED_MC
8493 /* Set up multi-core groups */ 8719 case SD_LV_MC: /* set up multi-core groups */
8494 for_each_cpu(i, cpu_map) { 8720 cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
8495 cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map); 8721 if (cpu == cpumask_first(d->this_core_map))
8496 if (i != cpumask_first(this_core_map)) 8722 init_sched_build_groups(d->this_core_map, cpu_map,
8497 continue; 8723 &cpu_to_core_group,
8498 8724 d->send_covered, d->tmpmask);
8499 init_sched_build_groups(this_core_map, cpu_map, 8725 break;
8500 &cpu_to_core_group,
8501 send_covered, tmpmask);
8502 }
8503#endif 8726#endif
8504 8727 case SD_LV_CPU: /* set up physical groups */
8505 /* Set up physical groups */ 8728 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
8506 for (i = 0; i < nr_node_ids; i++) { 8729 if (!cpumask_empty(d->nodemask))
8507 cpumask_and(nodemask, cpumask_of_node(i), cpu_map); 8730 init_sched_build_groups(d->nodemask, cpu_map,
8508 if (cpumask_empty(nodemask)) 8731 &cpu_to_phys_group,
8509 continue; 8732 d->send_covered, d->tmpmask);
8510 8733 break;
8511 init_sched_build_groups(nodemask, cpu_map,
8512 &cpu_to_phys_group,
8513 send_covered, tmpmask);
8514 }
8515
8516#ifdef CONFIG_NUMA 8734#ifdef CONFIG_NUMA
8517 /* Set up node groups */ 8735 case SD_LV_ALLNODES:
8518 if (sd_allnodes) { 8736 init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group,
8519 init_sched_build_groups(cpu_map, cpu_map, 8737 d->send_covered, d->tmpmask);
8520 &cpu_to_allnodes_group, 8738 break;
8521 send_covered, tmpmask); 8739#endif
8740 default:
8741 break;
8522 } 8742 }
8743}
8523 8744
8524 for (i = 0; i < nr_node_ids; i++) { 8745/*
8525 /* Set up node groups */ 8746 * Build sched domains for a given set of cpus and attach the sched domains
8526 struct sched_group *sg, *prev; 8747 * to the individual cpus
8527 int j; 8748 */
8528 8749static int __build_sched_domains(const struct cpumask *cpu_map,
8529 cpumask_clear(covered); 8750 struct sched_domain_attr *attr)
8530 cpumask_and(nodemask, cpumask_of_node(i), cpu_map); 8751{
8531 if (cpumask_empty(nodemask)) { 8752 enum s_alloc alloc_state = sa_none;
8532 sched_group_nodes[i] = NULL; 8753 struct s_data d;
8533 continue; 8754 struct sched_domain *sd;
8534 } 8755 int i;
8756#ifdef CONFIG_NUMA
8757 d.sd_allnodes = 0;
8758#endif
8535 8759
8536 sched_domain_node_span(i, domainspan); 8760 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
8537 cpumask_and(domainspan, domainspan, cpu_map); 8761 if (alloc_state != sa_rootdomain)
8762 goto error;
8763 alloc_state = sa_sched_groups;
8538 8764
8539 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), 8765 /*
8540 GFP_KERNEL, i); 8766 * Set up domains for cpus specified by the cpu_map.
8541 if (!sg) { 8767 */
8542 printk(KERN_WARNING "Can not alloc domain group for " 8768 for_each_cpu(i, cpu_map) {
8543 "node %d\n", i); 8769 cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
8544 goto error; 8770 cpu_map);
8545 }
8546 sched_group_nodes[i] = sg;
8547 for_each_cpu(j, nodemask) {
8548 struct sched_domain *sd;
8549 8771
8550 sd = &per_cpu(node_domains, j).sd; 8772 sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
8551 sd->groups = sg; 8773 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
8552 } 8774 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
8553 sg->__cpu_power = 0; 8775 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
8554 cpumask_copy(sched_group_cpus(sg), nodemask); 8776 }
8555 sg->next = sg;
8556 cpumask_or(covered, covered, nodemask);
8557 prev = sg;
8558 8777
8559 for (j = 0; j < nr_node_ids; j++) { 8778 for_each_cpu(i, cpu_map) {
8560 int n = (i + j) % nr_node_ids; 8779 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
8780 build_sched_groups(&d, SD_LV_MC, cpu_map, i);
8781 }
8561 8782
8562 cpumask_complement(notcovered, covered); 8783 /* Set up physical groups */
8563 cpumask_and(tmpmask, notcovered, cpu_map); 8784 for (i = 0; i < nr_node_ids; i++)
8564 cpumask_and(tmpmask, tmpmask, domainspan); 8785 build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
8565 if (cpumask_empty(tmpmask))
8566 break;
8567 8786
8568 cpumask_and(tmpmask, tmpmask, cpumask_of_node(n)); 8787#ifdef CONFIG_NUMA
8569 if (cpumask_empty(tmpmask)) 8788 /* Set up node groups */
8570 continue; 8789 if (d.sd_allnodes)
8790 build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
8571 8791
8572 sg = kmalloc_node(sizeof(struct sched_group) + 8792 for (i = 0; i < nr_node_ids; i++)
8573 cpumask_size(), 8793 if (build_numa_sched_groups(&d, cpu_map, i))
8574 GFP_KERNEL, i); 8794 goto error;
8575 if (!sg) {
8576 printk(KERN_WARNING
8577 "Can not alloc domain group for node %d\n", j);
8578 goto error;
8579 }
8580 sg->__cpu_power = 0;
8581 cpumask_copy(sched_group_cpus(sg), tmpmask);
8582 sg->next = prev->next;
8583 cpumask_or(covered, covered, tmpmask);
8584 prev->next = sg;
8585 prev = sg;
8586 }
8587 }
8588#endif 8795#endif
8589 8796
8590 /* Calculate CPU power for physical packages and nodes */ 8797 /* Calculate CPU power for physical packages and nodes */
8591#ifdef CONFIG_SCHED_SMT 8798#ifdef CONFIG_SCHED_SMT
8592 for_each_cpu(i, cpu_map) { 8799 for_each_cpu(i, cpu_map) {
8593 struct sched_domain *sd = &per_cpu(cpu_domains, i).sd; 8800 sd = &per_cpu(cpu_domains, i).sd;
8594
8595 init_sched_groups_power(i, sd); 8801 init_sched_groups_power(i, sd);
8596 } 8802 }
8597#endif 8803#endif
8598#ifdef CONFIG_SCHED_MC 8804#ifdef CONFIG_SCHED_MC
8599 for_each_cpu(i, cpu_map) { 8805 for_each_cpu(i, cpu_map) {
8600 struct sched_domain *sd = &per_cpu(core_domains, i).sd; 8806 sd = &per_cpu(core_domains, i).sd;
8601
8602 init_sched_groups_power(i, sd); 8807 init_sched_groups_power(i, sd);
8603 } 8808 }
8604#endif 8809#endif
8605 8810
8606 for_each_cpu(i, cpu_map) { 8811 for_each_cpu(i, cpu_map) {
8607 struct sched_domain *sd = &per_cpu(phys_domains, i).sd; 8812 sd = &per_cpu(phys_domains, i).sd;
8608
8609 init_sched_groups_power(i, sd); 8813 init_sched_groups_power(i, sd);
8610 } 8814 }
8611 8815
8612#ifdef CONFIG_NUMA 8816#ifdef CONFIG_NUMA
8613 for (i = 0; i < nr_node_ids; i++) 8817 for (i = 0; i < nr_node_ids; i++)
8614 init_numa_sched_groups_power(sched_group_nodes[i]); 8818 init_numa_sched_groups_power(d.sched_group_nodes[i]);
8615 8819
8616 if (sd_allnodes) { 8820 if (d.sd_allnodes) {
8617 struct sched_group *sg; 8821 struct sched_group *sg;
8618 8822
8619 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, 8823 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
8620 tmpmask); 8824 d.tmpmask);
8621 init_numa_sched_groups_power(sg); 8825 init_numa_sched_groups_power(sg);
8622 } 8826 }
8623#endif 8827#endif
8624 8828
8625 /* Attach the domains */ 8829 /* Attach the domains */
8626 for_each_cpu(i, cpu_map) { 8830 for_each_cpu(i, cpu_map) {
8627 struct sched_domain *sd;
8628#ifdef CONFIG_SCHED_SMT 8831#ifdef CONFIG_SCHED_SMT
8629 sd = &per_cpu(cpu_domains, i).sd; 8832 sd = &per_cpu(cpu_domains, i).sd;
8630#elif defined(CONFIG_SCHED_MC) 8833#elif defined(CONFIG_SCHED_MC)
@@ -8632,44 +8835,16 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
8632#else 8835#else
8633 sd = &per_cpu(phys_domains, i).sd; 8836 sd = &per_cpu(phys_domains, i).sd;
8634#endif 8837#endif
8635 cpu_attach_domain(sd, rd, i); 8838 cpu_attach_domain(sd, d.rd, i);
8636 } 8839 }
8637 8840
8638 err = 0; 8841 d.sched_group_nodes = NULL; /* don't free this we still need it */
8639 8842 __free_domain_allocs(&d, sa_tmpmask, cpu_map);
8640free_tmpmask: 8843 return 0;
8641 free_cpumask_var(tmpmask);
8642free_send_covered:
8643 free_cpumask_var(send_covered);
8644free_this_core_map:
8645 free_cpumask_var(this_core_map);
8646free_this_sibling_map:
8647 free_cpumask_var(this_sibling_map);
8648free_nodemask:
8649 free_cpumask_var(nodemask);
8650free_notcovered:
8651#ifdef CONFIG_NUMA
8652 free_cpumask_var(notcovered);
8653free_covered:
8654 free_cpumask_var(covered);
8655free_domainspan:
8656 free_cpumask_var(domainspan);
8657out:
8658#endif
8659 return err;
8660
8661free_sched_groups:
8662#ifdef CONFIG_NUMA
8663 kfree(sched_group_nodes);
8664#endif
8665 goto free_tmpmask;
8666 8844
8667#ifdef CONFIG_NUMA
8668error: 8845error:
8669 free_sched_groups(cpu_map, tmpmask); 8846 __free_domain_allocs(&d, alloc_state, cpu_map);
8670 free_rootdomain(rd); 8847 return -ENOMEM;
8671 goto free_tmpmask;
8672#endif
8673} 8848}
8674 8849
8675static int build_sched_domains(const struct cpumask *cpu_map) 8850static int build_sched_domains(const struct cpumask *cpu_map)
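Editor's note: the rewrite above threads every intermediate cpumask and group pointer through struct s_data so a single error label can unwind exactly the stages that succeeded. A minimal userspace sketch of that staged allocate-then-unwind pattern follows; all names (ST_*, build_data, free_up_to) are illustrative, not the kernel's.

#include <stdio.h>
#include <stdlib.h>

/* Stages loosely mirroring enum s_alloc in the patch; purely illustrative. */
enum alloc_stage { ST_NONE, ST_MASKS, ST_GROUPS, ST_ROOT };

struct build_data {
	int *masks;
	int *groups;
	int *root;
};

/* Free everything allocated up to and including the stage we reached. */
static void free_up_to(struct build_data *d, enum alloc_stage reached)
{
	switch (reached) {
	case ST_ROOT:   free(d->root);   /* fall through */
	case ST_GROUPS: free(d->groups); /* fall through */
	case ST_MASKS:  free(d->masks);  /* fall through */
	case ST_NONE:   break;
	}
}

static int build(struct build_data *d)
{
	enum alloc_stage stage = ST_NONE;

	if (!(d->masks = calloc(16, sizeof(int))))
		goto err;
	stage = ST_MASKS;
	if (!(d->groups = calloc(16, sizeof(int))))
		goto err;
	stage = ST_GROUPS;
	if (!(d->root = calloc(1, sizeof(int))))
		goto err;
	return 0;
err:
	free_up_to(d, stage);	/* unwind only what actually succeeded */
	return -1;
}

int main(void)
{
	struct build_data d = { NULL, NULL, NULL };

	if (build(&d) == 0) {
		puts("all stages allocated");
		free_up_to(&d, ST_ROOT);
	}
	return 0;
}

The patch uses the same idea with __visit_domain_allocation_hell()/__free_domain_allocs(): alloc_state records how far setup got, and the error path frees only that much.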
@@ -8988,6 +9163,7 @@ void __init sched_init_smp(void)
8988 cpumask_var_t non_isolated_cpus; 9163 cpumask_var_t non_isolated_cpus;
8989 9164
8990 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 9165 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
9166 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
8991 9167
8992#if defined(CONFIG_NUMA) 9168#if defined(CONFIG_NUMA)
8993 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), 9169 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
@@ -9019,7 +9195,6 @@ void __init sched_init_smp(void)
9019 sched_init_granularity(); 9195 sched_init_granularity();
9020 free_cpumask_var(non_isolated_cpus); 9196 free_cpumask_var(non_isolated_cpus);
9021 9197
9022 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
9023 init_sched_rt_class(); 9198 init_sched_rt_class();
9024} 9199}
9025#else 9200#else
@@ -9070,7 +9245,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
9070#ifdef CONFIG_SMP 9245#ifdef CONFIG_SMP
9071 rt_rq->rt_nr_migratory = 0; 9246 rt_rq->rt_nr_migratory = 0;
9072 rt_rq->overloaded = 0; 9247 rt_rq->overloaded = 0;
9073 plist_head_init(&rq->rt.pushable_tasks, &rq->lock); 9248 plist_head_init(&rt_rq->pushable_tasks, &rq->lock);
9074#endif 9249#endif
9075 9250
9076 rt_rq->rt_time = 0; 9251 rt_rq->rt_time = 0;
@@ -9277,11 +9452,11 @@ void __init sched_init(void)
9277 * system cpu resource, based on the weight assigned to root 9452 * system cpu resource, based on the weight assigned to root
9278 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished 9453 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
9279 * by letting tasks of init_task_group sit in a separate cfs_rq 9454 * by letting tasks of init_task_group sit in a separate cfs_rq
9280 * (init_cfs_rq) and having one entity represent this group of 9455 * (init_tg_cfs_rq) and having one entity represent this group of
9281 * tasks in rq->cfs (i.e init_task_group->se[] != NULL). 9456 * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
9282 */ 9457 */
9283 init_tg_cfs_entry(&init_task_group, 9458 init_tg_cfs_entry(&init_task_group,
9284 &per_cpu(init_cfs_rq, i), 9459 &per_cpu(init_tg_cfs_rq, i),
9285 &per_cpu(init_sched_entity, i), i, 1, 9460 &per_cpu(init_sched_entity, i), i, 1,
9286 root_task_group.se[i]); 9461 root_task_group.se[i]);
9287 9462
@@ -9307,6 +9482,7 @@ void __init sched_init(void)
9307#ifdef CONFIG_SMP 9482#ifdef CONFIG_SMP
9308 rq->sd = NULL; 9483 rq->sd = NULL;
9309 rq->rd = NULL; 9484 rq->rd = NULL;
9485 rq->post_schedule = 0;
9310 rq->active_balance = 0; 9486 rq->active_balance = 0;
9311 rq->next_balance = jiffies; 9487 rq->next_balance = jiffies;
9312 rq->push_cpu = 0; 9488 rq->push_cpu = 0;
@@ -9365,19 +9541,26 @@ void __init sched_init(void)
9365 alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 9541 alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
9366#endif /* SMP */ 9542#endif /* SMP */
9367 9543
9368 perf_counter_init(); 9544 perf_event_init();
9369 9545
9370 scheduler_running = 1; 9546 scheduler_running = 1;
9371} 9547}
9372 9548
9373#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 9549#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
9374void __might_sleep(char *file, int line) 9550static inline int preempt_count_equals(int preempt_offset)
9551{
9552 int nested = preempt_count() & ~PREEMPT_ACTIVE;
9553
9554 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
9555}
9556
9557void __might_sleep(char *file, int line, int preempt_offset)
9375{ 9558{
9376#ifdef in_atomic 9559#ifdef in_atomic
9377 static unsigned long prev_jiffy; /* ratelimiting */ 9560 static unsigned long prev_jiffy; /* ratelimiting */
9378 9561
9379 if ((!in_atomic() && !irqs_disabled()) || 9562 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
9380 system_state != SYSTEM_RUNNING || oops_in_progress) 9563 system_state != SYSTEM_RUNNING || oops_in_progress)
9381 return; 9564 return;
9382 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 9565 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
9383 return; 9566 return;
@@ -10554,3 +10737,113 @@ struct cgroup_subsys cpuacct_subsys = {
10554 .subsys_id = cpuacct_subsys_id, 10737 .subsys_id = cpuacct_subsys_id,
10555}; 10738};
10556#endif /* CONFIG_CGROUP_CPUACCT */ 10739#endif /* CONFIG_CGROUP_CPUACCT */
10740
10741#ifndef CONFIG_SMP
10742
10743int rcu_expedited_torture_stats(char *page)
10744{
10745 return 0;
10746}
10747EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
10748
10749void synchronize_sched_expedited(void)
10750{
10751}
10752EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
10753
10754#else /* #ifndef CONFIG_SMP */
10755
10756static DEFINE_PER_CPU(struct migration_req, rcu_migration_req);
10757static DEFINE_MUTEX(rcu_sched_expedited_mutex);
10758
10759#define RCU_EXPEDITED_STATE_POST -2
10760#define RCU_EXPEDITED_STATE_IDLE -1
10761
10762static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
10763
10764int rcu_expedited_torture_stats(char *page)
10765{
10766 int cnt = 0;
10767 int cpu;
10768
10769 cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state);
10770 for_each_online_cpu(cpu) {
10771 cnt += sprintf(&page[cnt], " %d:%d",
10772 cpu, per_cpu(rcu_migration_req, cpu).dest_cpu);
10773 }
10774 cnt += sprintf(&page[cnt], "\n");
10775 return cnt;
10776}
10777EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
10778
10779static long synchronize_sched_expedited_count;
10780
10781/*
10782 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
10783 * approach to force grace period to end quickly. This consumes
10784 * significant time on all CPUs, and is thus not recommended for
10785 * any sort of common-case code.
10786 *
10787 * Note that it is illegal to call this function while holding any
10788 * lock that is acquired by a CPU-hotplug notifier. Failing to
10789 * observe this restriction will result in deadlock.
10790 */
10791void synchronize_sched_expedited(void)
10792{
10793 int cpu;
10794 unsigned long flags;
10795 bool need_full_sync = 0;
10796 struct rq *rq;
10797 struct migration_req *req;
10798 long snap;
10799 int trycount = 0;
10800
10801 smp_mb(); /* ensure prior mod happens before capturing snap. */
10802 snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1;
10803 get_online_cpus();
10804 while (!mutex_trylock(&rcu_sched_expedited_mutex)) {
10805 put_online_cpus();
10806 if (trycount++ < 10)
10807 udelay(trycount * num_online_cpus());
10808 else {
10809 synchronize_sched();
10810 return;
10811 }
10812 if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) {
10813 smp_mb(); /* ensure test happens before caller kfree */
10814 return;
10815 }
10816 get_online_cpus();
10817 }
10818 rcu_expedited_state = RCU_EXPEDITED_STATE_POST;
10819 for_each_online_cpu(cpu) {
10820 rq = cpu_rq(cpu);
10821 req = &per_cpu(rcu_migration_req, cpu);
10822 init_completion(&req->done);
10823 req->task = NULL;
10824 req->dest_cpu = RCU_MIGRATION_NEED_QS;
10825 spin_lock_irqsave(&rq->lock, flags);
10826 list_add(&req->list, &rq->migration_queue);
10827 spin_unlock_irqrestore(&rq->lock, flags);
10828 wake_up_process(rq->migration_thread);
10829 }
10830 for_each_online_cpu(cpu) {
10831 rcu_expedited_state = cpu;
10832 req = &per_cpu(rcu_migration_req, cpu);
10833 rq = cpu_rq(cpu);
10834 wait_for_completion(&req->done);
10835 spin_lock_irqsave(&rq->lock, flags);
10836 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
10837 need_full_sync = 1;
10838 req->dest_cpu = RCU_MIGRATION_IDLE;
10839 spin_unlock_irqrestore(&rq->lock, flags);
10840 }
10841 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
10842 mutex_unlock(&rcu_sched_expedited_mutex);
10843 put_online_cpus();
10844 if (need_full_sync)
10845 synchronize_sched();
10846}
10847EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
10848
10849#endif /* #else #ifndef CONFIG_SMP */
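Editor's note: the synchronize_sched_expedited() code added at the end of this file snapshots a pass counter before racing for the mutex, so a caller that loses the trylock race can still return early once somebody else's complete pass has covered it. A rough userspace analogue of that snapshot-and-retry shape is sketched below; the pthread primitives, delays, and helper names are stand-ins, not the kernel API.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t expedite_lock = PTHREAD_MUTEX_INITIALIZER;
static volatile long expedite_count;

static void slow_path(void)
{
	usleep(1000);	/* stand-in for falling back to synchronize_sched() */
}

/*
 * Snapshot the pass counter before racing for the lock.  A loser of the
 * trylock race can still return early once a pass that started after
 * its snapshot has completed, because that pass covers it too -- the
 * same trick the patch plays with synchronize_sched_expedited_count.
 */
static void expedite(void)
{
	long snap = expedite_count + 1;
	int tries = 0;

	while (pthread_mutex_trylock(&expedite_lock) != 0) {
		if (tries++ >= 10) {
			slow_path();
			return;
		}
		if (expedite_count - snap > 0)
			return;		/* somebody else's pass covered us */
		usleep(10 * tries);
	}
	/* ...the heavy per-CPU migration work would go here... */
	expedite_count++;
	pthread_mutex_unlock(&expedite_lock);
}

int main(void)
{
	expedite();
	printf("expedited passes: %ld\n", expedite_count);
	return 0;
}

(Compile with -pthread.) In the kernel, the "heavy work" is queueing a migration request on every online CPU's runqueue, which forces each CPU through the scheduler and thus through a quiescent state.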
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index e1d16c9a7680..ac2e1dc708bd 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -48,13 +48,6 @@ static __read_mostly int sched_clock_running;
48__read_mostly int sched_clock_stable; 48__read_mostly int sched_clock_stable;
49 49
50struct sched_clock_data { 50struct sched_clock_data {
51 /*
52 * Raw spinlock - this is a special case: this might be called
53 * from within instrumentation code so we dont want to do any
54 * instrumentation ourselves.
55 */
56 raw_spinlock_t lock;
57
58 u64 tick_raw; 51 u64 tick_raw;
59 u64 tick_gtod; 52 u64 tick_gtod;
60 u64 clock; 53 u64 clock;
@@ -80,7 +73,6 @@ void sched_clock_init(void)
80 for_each_possible_cpu(cpu) { 73 for_each_possible_cpu(cpu) {
81 struct sched_clock_data *scd = cpu_sdc(cpu); 74 struct sched_clock_data *scd = cpu_sdc(cpu);
82 75
83 scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
84 scd->tick_raw = 0; 76 scd->tick_raw = 0;
85 scd->tick_gtod = ktime_now; 77 scd->tick_gtod = ktime_now;
86 scd->clock = ktime_now; 78 scd->clock = ktime_now;
@@ -109,14 +101,19 @@ static inline u64 wrap_max(u64 x, u64 y)
109 * - filter out backward motion 101 * - filter out backward motion
110 * - use the GTOD tick value to create a window to filter crazy TSC values 102 * - use the GTOD tick value to create a window to filter crazy TSC values
111 */ 103 */
112static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now) 104static u64 sched_clock_local(struct sched_clock_data *scd)
113{ 105{
114 s64 delta = now - scd->tick_raw; 106 u64 now, clock, old_clock, min_clock, max_clock;
115 u64 clock, min_clock, max_clock; 107 s64 delta;
116 108
109again:
110 now = sched_clock();
111 delta = now - scd->tick_raw;
117 if (unlikely(delta < 0)) 112 if (unlikely(delta < 0))
118 delta = 0; 113 delta = 0;
119 114
115 old_clock = scd->clock;
116
120 /* 117 /*
121 * scd->clock = clamp(scd->tick_gtod + delta, 118 * scd->clock = clamp(scd->tick_gtod + delta,
122 * max(scd->tick_gtod, scd->clock), 119 * max(scd->tick_gtod, scd->clock),
@@ -124,84 +121,73 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
124 */ 121 */
125 122
126 clock = scd->tick_gtod + delta; 123 clock = scd->tick_gtod + delta;
127 min_clock = wrap_max(scd->tick_gtod, scd->clock); 124 min_clock = wrap_max(scd->tick_gtod, old_clock);
128 max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC); 125 max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC);
129 126
130 clock = wrap_max(clock, min_clock); 127 clock = wrap_max(clock, min_clock);
131 clock = wrap_min(clock, max_clock); 128 clock = wrap_min(clock, max_clock);
132 129
133 scd->clock = clock; 130 if (cmpxchg(&scd->clock, old_clock, clock) != old_clock)
131 goto again;
134 132
135 return scd->clock; 133 return clock;
136} 134}
137 135
138static void lock_double_clock(struct sched_clock_data *data1, 136static u64 sched_clock_remote(struct sched_clock_data *scd)
139 struct sched_clock_data *data2)
140{ 137{
141 if (data1 < data2) { 138 struct sched_clock_data *my_scd = this_scd();
142 __raw_spin_lock(&data1->lock); 139 u64 this_clock, remote_clock;
143 __raw_spin_lock(&data2->lock); 140 u64 *ptr, old_val, val;
141
142 sched_clock_local(my_scd);
143again:
144 this_clock = my_scd->clock;
145 remote_clock = scd->clock;
146
147 /*
148 * Use the opportunity that we have both locks
149 * taken to couple the two clocks: we take the
150 * larger time as the latest time for both
151 * runqueues. (this creates monotonic movement)
152 */
153 if (likely((s64)(remote_clock - this_clock) < 0)) {
154 ptr = &scd->clock;
155 old_val = remote_clock;
156 val = this_clock;
144 } else { 157 } else {
145 __raw_spin_lock(&data2->lock); 158 /*
146 __raw_spin_lock(&data1->lock); 159 * Should be rare, but possible:
160 */
161 ptr = &my_scd->clock;
162 old_val = this_clock;
163 val = remote_clock;
147 } 164 }
165
166 if (cmpxchg(ptr, old_val, val) != old_val)
167 goto again;
168
169 return val;
148} 170}
149 171
150u64 sched_clock_cpu(int cpu) 172u64 sched_clock_cpu(int cpu)
151{ 173{
152 u64 now, clock, this_clock, remote_clock;
153 struct sched_clock_data *scd; 174 struct sched_clock_data *scd;
175 u64 clock;
176
177 WARN_ON_ONCE(!irqs_disabled());
154 178
155 if (sched_clock_stable) 179 if (sched_clock_stable)
156 return sched_clock(); 180 return sched_clock();
157 181
158 scd = cpu_sdc(cpu);
159
160 /*
161 * Normally this is not called in NMI context - but if it is,
162 * trying to do any locking here is totally lethal.
163 */
164 if (unlikely(in_nmi()))
165 return scd->clock;
166
167 if (unlikely(!sched_clock_running)) 182 if (unlikely(!sched_clock_running))
168 return 0ull; 183 return 0ull;
169 184
170 WARN_ON_ONCE(!irqs_disabled()); 185 scd = cpu_sdc(cpu);
171 now = sched_clock();
172
173 if (cpu != raw_smp_processor_id()) {
174 struct sched_clock_data *my_scd = this_scd();
175
176 lock_double_clock(scd, my_scd);
177
178 this_clock = __update_sched_clock(my_scd, now);
179 remote_clock = scd->clock;
180
181 /*
182 * Use the opportunity that we have both locks
183 * taken to couple the two clocks: we take the
184 * larger time as the latest time for both
185 * runqueues. (this creates monotonic movement)
186 */
187 if (likely((s64)(remote_clock - this_clock) < 0)) {
188 clock = this_clock;
189 scd->clock = clock;
190 } else {
191 /*
192 * Should be rare, but possible:
193 */
194 clock = remote_clock;
195 my_scd->clock = remote_clock;
196 }
197
198 __raw_spin_unlock(&my_scd->lock);
199 } else {
200 __raw_spin_lock(&scd->lock);
201 clock = __update_sched_clock(scd, now);
202 }
203 186
204 __raw_spin_unlock(&scd->lock); 187 if (cpu != smp_processor_id())
188 clock = sched_clock_remote(scd);
189 else
190 clock = sched_clock_local(scd);
205 191
206 return clock; 192 return clock;
207} 193}
@@ -223,11 +209,9 @@ void sched_clock_tick(void)
223 now_gtod = ktime_to_ns(ktime_get()); 209 now_gtod = ktime_to_ns(ktime_get());
224 now = sched_clock(); 210 now = sched_clock();
225 211
226 __raw_spin_lock(&scd->lock);
227 scd->tick_raw = now; 212 scd->tick_raw = now;
228 scd->tick_gtod = now_gtod; 213 scd->tick_gtod = now_gtod;
229 __update_sched_clock(scd, now); 214 sched_clock_local(scd);
230 __raw_spin_unlock(&scd->lock);
231} 215}
232 216
233/* 217/*
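Editor's note: the sched_clock.c changes drop the per-CPU raw spinlock and publish the new clock with cmpxchg(), retrying if another updater wins the race. A compressed userspace sketch of that clamp-then-compare-and-swap loop follows; the GCC/Clang __sync builtin stands in for the kernel's cmpxchg(), and the window argument replaces TICK_NSEC.

#include <stdint.h>
#include <stdio.h>

static uint64_t clock_val;	/* shared value, updated without a lock */

static uint64_t clamp64(uint64_t v, uint64_t lo, uint64_t hi)
{
	if (v < lo)
		return lo;
	if (v > hi)
		return hi;
	return v;
}

/*
 * Clamp the raw reading into [max(old, tick), max(old, tick + window)]
 * and publish it with a compare-and-swap, retrying if another updater
 * won the race -- the shape of sched_clock_local() after this patch.
 */
static uint64_t update_clock(uint64_t raw, uint64_t tick, uint64_t window)
{
	uint64_t old, val;

	do {
		old = clock_val;
		val = clamp64(raw,
			      old > tick ? old : tick,
			      old > tick + window ? old : tick + window);
	} while (__sync_val_compare_and_swap(&clock_val, old, val) != old);

	return val;
}

int main(void)
{
	clock_val = 100;
	printf("%llu\n", (unsigned long long)update_clock(90, 95, 50));  /* 100: never backwards */
	printf("%llu\n", (unsigned long long)update_clock(500, 120, 50)); /* 170: capped to the window */
	return 0;
}

sched_clock_remote() uses the same loop to couple two CPUs' clocks: whichever value is larger is installed on both, again via cmpxchg with retry.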
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index e6c251790dde..0f052fc674d5 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -81,8 +81,21 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
81 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) 81 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
82 continue; 82 continue;
83 83
84 if (lowest_mask) 84 if (lowest_mask) {
85 cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); 85 cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
86
87 /*
88 * We have to ensure that we have at least one bit
89 * still set in the array, since the map could have
90 * been concurrently emptied between the first and
91 * second reads of vec->mask. If we hit this
92 * condition, simply act as though we never hit this
93 * priority level and continue on.
94 */
95 if (cpumask_any(lowest_mask) >= nr_cpu_ids)
96 continue;
97 }
98
86 return 1; 99 return 1;
87 } 100 }
88 101
@@ -114,21 +127,11 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
114 127
115 /* 128 /*
116 * If the cpu was currently mapped to a different value, we 129 * If the cpu was currently mapped to a different value, we
117 * first need to unmap the old value 130 * need to map it to the new value then remove the old value.
131 * Note, we must add the new value first, otherwise we risk the
132 * cpu being cleared from pri_active, and this cpu could be
133 * missed for a push or pull.
118 */ 134 */
119 if (likely(oldpri != CPUPRI_INVALID)) {
120 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
121
122 spin_lock_irqsave(&vec->lock, flags);
123
124 vec->count--;
125 if (!vec->count)
126 clear_bit(oldpri, cp->pri_active);
127 cpumask_clear_cpu(cpu, vec->mask);
128
129 spin_unlock_irqrestore(&vec->lock, flags);
130 }
131
132 if (likely(newpri != CPUPRI_INVALID)) { 135 if (likely(newpri != CPUPRI_INVALID)) {
133 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; 136 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
134 137
@@ -141,6 +144,18 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
141 144
142 spin_unlock_irqrestore(&vec->lock, flags); 145 spin_unlock_irqrestore(&vec->lock, flags);
143 } 146 }
147 if (likely(oldpri != CPUPRI_INVALID)) {
148 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
149
150 spin_lock_irqsave(&vec->lock, flags);
151
152 vec->count--;
153 if (!vec->count)
154 clear_bit(oldpri, cp->pri_active);
155 cpumask_clear_cpu(cpu, vec->mask);
156
157 spin_unlock_irqrestore(&vec->lock, flags);
158 }
144 159
145 *currpri = newpri; 160 *currpri = newpri;
146} 161}
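Editor's note: the reordering in cpupri_set() exists so the cpu is added to the new priority vector before it is removed from the old one, leaving no window in which a scanner misses it entirely. A toy, non-atomic sketch of that ordering follows (bitmask names and sizes are made up; the kernel does this under per-vector spinlocks).

#include <stdio.h>

#define NR_PRIO 4

/* One bit per CPU for each priority level; toy stand-in for cpupri_vec. */
static unsigned long pri_mask[NR_PRIO];

/*
 * Move cpu from oldpri to newpri.  Set the new bit before clearing the
 * old one so a concurrent scanner walking pri_mask[] always finds the
 * cpu in at least one vector -- the ordering the patch switches to.
 */
static void cpupri_set_sketch(int cpu, int oldpri, int newpri)
{
	if (newpri >= 0)
		pri_mask[newpri] |= 1UL << cpu;		/* add first */
	if (oldpri >= 0)
		pri_mask[oldpri] &= ~(1UL << cpu);	/* then remove */
}

int main(void)
{
	cpupri_set_sketch(2, -1, 1);	/* cpu 2 enters prio 1 */
	cpupri_set_sketch(2, 1, 3);	/* cpu 2 moves to prio 3 */
	for (int p = 0; p < NR_PRIO; p++)
		printf("prio %d: %#lx\n", p, pri_mask[p]);
	return 0;
}

The flip side of this ordering is that readers can briefly see the cpu in two vectors, which is why the cpupri_find() hunk above re-checks that lowest_mask is still non-empty before trusting a hit.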
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 70c7e0b79946..efb84409bc43 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -395,6 +395,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
395 PN(se.sum_exec_runtime); 395 PN(se.sum_exec_runtime);
396 PN(se.avg_overlap); 396 PN(se.avg_overlap);
397 PN(se.avg_wakeup); 397 PN(se.avg_wakeup);
398 PN(se.avg_running);
398 399
399 nr_switches = p->nvcsw + p->nivcsw; 400 nr_switches = p->nvcsw + p->nivcsw;
400 401
@@ -409,6 +410,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
409 PN(se.wait_max); 410 PN(se.wait_max);
410 PN(se.wait_sum); 411 PN(se.wait_sum);
411 P(se.wait_count); 412 P(se.wait_count);
413 PN(se.iowait_sum);
414 P(se.iowait_count);
412 P(sched_info.bkl_count); 415 P(sched_info.bkl_count);
413 P(se.nr_migrations); 416 P(se.nr_migrations);
414 P(se.nr_migrations_cold); 417 P(se.nr_migrations_cold);
@@ -479,6 +482,8 @@ void proc_sched_set_task(struct task_struct *p)
479 p->se.wait_max = 0; 482 p->se.wait_max = 0;
480 p->se.wait_sum = 0; 483 p->se.wait_sum = 0;
481 p->se.wait_count = 0; 484 p->se.wait_count = 0;
485 p->se.iowait_sum = 0;
486 p->se.iowait_count = 0;
482 p->se.sleep_max = 0; 487 p->se.sleep_max = 0;
483 p->se.sum_sleep_runtime = 0; 488 p->se.sum_sleep_runtime = 0;
484 p->se.block_max = 0; 489 p->se.block_max = 0;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index ba7fd6e9556f..ecc637a0d591 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -24,7 +24,7 @@
24 24
25/* 25/*
26 * Targeted preemption latency for CPU-bound tasks: 26 * Targeted preemption latency for CPU-bound tasks:
27 * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds) 27 * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds)
28 * 28 *
29 * NOTE: this latency value is not the same as the concept of 29 * NOTE: this latency value is not the same as the concept of
30 * 'timeslice length' - timeslices in CFS are of variable length 30 * 'timeslice length' - timeslices in CFS are of variable length
@@ -34,13 +34,13 @@
34 * (to see the precise effective timeslice length of your workload, 34 * (to see the precise effective timeslice length of your workload,
35 * run vmstat and monitor the context-switches (cs) field) 35 * run vmstat and monitor the context-switches (cs) field)
36 */ 36 */
37unsigned int sysctl_sched_latency = 20000000ULL; 37unsigned int sysctl_sched_latency = 5000000ULL;
38 38
39/* 39/*
40 * Minimal preemption granularity for CPU-bound tasks: 40 * Minimal preemption granularity for CPU-bound tasks:
41 * (default: 4 msec * (1 + ilog(ncpus)), units: nanoseconds) 41 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
42 */ 42 */
43unsigned int sysctl_sched_min_granularity = 4000000ULL; 43unsigned int sysctl_sched_min_granularity = 1000000ULL;
44 44
45/* 45/*
46 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity 46 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
@@ -48,10 +48,10 @@ unsigned int sysctl_sched_min_granularity = 4000000ULL;
48static unsigned int sched_nr_latency = 5; 48static unsigned int sched_nr_latency = 5;
49 49
50/* 50/*
51 * After fork, child runs first. (default) If set to 0 then 51 * After fork, child runs first. If set to 0 (default) then
52 * parent will (try to) run first. 52 * parent will (try to) run first.
53 */ 53 */
54const_debug unsigned int sysctl_sched_child_runs_first = 1; 54unsigned int sysctl_sched_child_runs_first __read_mostly;
55 55
56/* 56/*
57 * sys_sched_yield() compat mode 57 * sys_sched_yield() compat mode
@@ -63,13 +63,13 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
63 63
64/* 64/*
65 * SCHED_OTHER wake-up granularity. 65 * SCHED_OTHER wake-up granularity.
66 * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds) 66 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
67 * 67 *
68 * This option delays the preemption effects of decoupled workloads 68 * This option delays the preemption effects of decoupled workloads
69 * and reduces their over-scheduling. Synchronous workloads will still 69 * and reduces their over-scheduling. Synchronous workloads will still
70 * have immediate wakeup/sleep latencies. 70 * have immediate wakeup/sleep latencies.
71 */ 71 */
72unsigned int sysctl_sched_wakeup_granularity = 5000000UL; 72unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
73 73
74const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 74const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
75 75
@@ -79,11 +79,6 @@ static const struct sched_class fair_sched_class;
79 * CFS operations on generic schedulable entities: 79 * CFS operations on generic schedulable entities:
80 */ 80 */
81 81
82static inline struct task_struct *task_of(struct sched_entity *se)
83{
84 return container_of(se, struct task_struct, se);
85}
86
87#ifdef CONFIG_FAIR_GROUP_SCHED 82#ifdef CONFIG_FAIR_GROUP_SCHED
88 83
89/* cpu runqueue to which this cfs_rq is attached */ 84/* cpu runqueue to which this cfs_rq is attached */
@@ -95,6 +90,14 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
95/* An entity is a task if it doesn't "own" a runqueue */ 90/* An entity is a task if it doesn't "own" a runqueue */
96#define entity_is_task(se) (!se->my_q) 91#define entity_is_task(se) (!se->my_q)
97 92
93static inline struct task_struct *task_of(struct sched_entity *se)
94{
95#ifdef CONFIG_SCHED_DEBUG
96 WARN_ON_ONCE(!entity_is_task(se));
97#endif
98 return container_of(se, struct task_struct, se);
99}
100
98/* Walk up scheduling entities hierarchy */ 101/* Walk up scheduling entities hierarchy */
99#define for_each_sched_entity(se) \ 102#define for_each_sched_entity(se) \
100 for (; se; se = se->parent) 103 for (; se; se = se->parent)
@@ -186,7 +189,12 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
186 } 189 }
187} 190}
188 191
189#else /* CONFIG_FAIR_GROUP_SCHED */ 192#else /* !CONFIG_FAIR_GROUP_SCHED */
193
194static inline struct task_struct *task_of(struct sched_entity *se)
195{
196 return container_of(se, struct task_struct, se);
197}
190 198
191static inline struct rq *rq_of(struct cfs_rq *cfs_rq) 199static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
192{ 200{
@@ -266,6 +274,12 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
266 return min_vruntime; 274 return min_vruntime;
267} 275}
268 276
277static inline int entity_before(struct sched_entity *a,
278 struct sched_entity *b)
279{
280 return (s64)(a->vruntime - b->vruntime) < 0;
281}
282
269static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) 283static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
270{ 284{
271 return se->vruntime - cfs_rq->min_vruntime; 285 return se->vruntime - cfs_rq->min_vruntime;
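Editor's note: entity_before(), added just above, orders two vruntimes by the sign of their 64-bit difference instead of a direct '<', which keeps the comparison correct even after the u64 vruntime wraps. A tiny standalone check of that idiom:

#include <stdint.h>
#include <stdio.h>

/* Wrap-safe "a runs before b" test, same shape as entity_before(). */
static int before(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) < 0;
}

int main(void)
{
	uint64_t a = UINT64_MAX - 10;	/* about to wrap */
	uint64_t b = a + 20;		/* logically later, numerically 9 */

	printf("naive a < b : %d\n", a < b);		/* 0: wrong after wrap */
	printf("before(a, b): %d\n", before(a, b));	/* 1: still correct */
	return 0;
}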
@@ -499,6 +513,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
499 if (entity_is_task(curr)) { 513 if (entity_is_task(curr)) {
500 struct task_struct *curtask = task_of(curr); 514 struct task_struct *curtask = task_of(curr);
501 515
516 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
502 cpuacct_charge(curtask, delta_exec); 517 cpuacct_charge(curtask, delta_exec);
503 account_group_exec_runtime(curtask, delta_exec); 518 account_group_exec_runtime(curtask, delta_exec);
504 } 519 }
@@ -531,6 +546,12 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
531 schedstat_set(se->wait_count, se->wait_count + 1); 546 schedstat_set(se->wait_count, se->wait_count + 1);
532 schedstat_set(se->wait_sum, se->wait_sum + 547 schedstat_set(se->wait_sum, se->wait_sum +
533 rq_of(cfs_rq)->clock - se->wait_start); 548 rq_of(cfs_rq)->clock - se->wait_start);
549#ifdef CONFIG_SCHEDSTATS
550 if (entity_is_task(se)) {
551 trace_sched_stat_wait(task_of(se),
552 rq_of(cfs_rq)->clock - se->wait_start);
553 }
554#endif
534 schedstat_set(se->wait_start, 0); 555 schedstat_set(se->wait_start, 0);
535} 556}
536 557
@@ -605,9 +626,13 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
605static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 626static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
606{ 627{
607#ifdef CONFIG_SCHEDSTATS 628#ifdef CONFIG_SCHEDSTATS
629 struct task_struct *tsk = NULL;
630
631 if (entity_is_task(se))
632 tsk = task_of(se);
633
608 if (se->sleep_start) { 634 if (se->sleep_start) {
609 u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; 635 u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
610 struct task_struct *tsk = task_of(se);
611 636
612 if ((s64)delta < 0) 637 if ((s64)delta < 0)
613 delta = 0; 638 delta = 0;
@@ -618,11 +643,13 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
618 se->sleep_start = 0; 643 se->sleep_start = 0;
619 se->sum_sleep_runtime += delta; 644 se->sum_sleep_runtime += delta;
620 645
621 account_scheduler_latency(tsk, delta >> 10, 1); 646 if (tsk) {
647 account_scheduler_latency(tsk, delta >> 10, 1);
648 trace_sched_stat_sleep(tsk, delta);
649 }
622 } 650 }
623 if (se->block_start) { 651 if (se->block_start) {
624 u64 delta = rq_of(cfs_rq)->clock - se->block_start; 652 u64 delta = rq_of(cfs_rq)->clock - se->block_start;
625 struct task_struct *tsk = task_of(se);
626 653
627 if ((s64)delta < 0) 654 if ((s64)delta < 0)
628 delta = 0; 655 delta = 0;
@@ -633,17 +660,25 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
633 se->block_start = 0; 660 se->block_start = 0;
634 se->sum_sleep_runtime += delta; 661 se->sum_sleep_runtime += delta;
635 662
636 /* 663 if (tsk) {
637 * Blocking time is in units of nanosecs, so shift by 20 to 664 if (tsk->in_iowait) {
638 * get a milliseconds-range estimation of the amount of 665 se->iowait_sum += delta;
639 * time that the task spent sleeping: 666 se->iowait_count++;
640 */ 667 trace_sched_stat_iowait(tsk, delta);
641 if (unlikely(prof_on == SLEEP_PROFILING)) { 668 }
642 669
643 profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), 670 /*
644 delta >> 20); 671 * Blocking time is in units of nanosecs, so shift by
672 * 20 to get a milliseconds-range estimation of the
673 * amount of time that the task spent sleeping:
674 */
675 if (unlikely(prof_on == SLEEP_PROFILING)) {
676 profile_hits(SLEEP_PROFILING,
677 (void *)get_wchan(tsk),
678 delta >> 20);
679 }
680 account_scheduler_latency(tsk, delta >> 10, 0);
645 } 681 }
646 account_scheduler_latency(tsk, delta >> 10, 0);
647 } 682 }
648#endif 683#endif
649} 684}
@@ -675,28 +710,33 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
675 if (initial && sched_feat(START_DEBIT)) 710 if (initial && sched_feat(START_DEBIT))
676 vruntime += sched_vslice(cfs_rq, se); 711 vruntime += sched_vslice(cfs_rq, se);
677 712
678 if (!initial) { 713 /* sleeps up to a single latency don't count. */
679 /* sleeps upto a single latency don't count. */ 714 if (!initial && sched_feat(FAIR_SLEEPERS)) {
680 if (sched_feat(NEW_FAIR_SLEEPERS)) { 715 unsigned long thresh = sysctl_sched_latency;
681 unsigned long thresh = sysctl_sched_latency;
682 716
683 /* 717 /*
684 * Convert the sleeper threshold into virtual time. 718 * Convert the sleeper threshold into virtual time.
685 * SCHED_IDLE is a special sub-class. We care about 719 * SCHED_IDLE is a special sub-class. We care about
686 * fairness only relative to other SCHED_IDLE tasks, 720 * fairness only relative to other SCHED_IDLE tasks,
687 * all of which have the same weight. 721 * all of which have the same weight.
688 */ 722 */
689 if (sched_feat(NORMALIZED_SLEEPER) && 723 if (sched_feat(NORMALIZED_SLEEPER) && (!entity_is_task(se) ||
690 task_of(se)->policy != SCHED_IDLE) 724 task_of(se)->policy != SCHED_IDLE))
691 thresh = calc_delta_fair(thresh, se); 725 thresh = calc_delta_fair(thresh, se);
692 726
693 vruntime -= thresh; 727 /*
694 } 728 * Halve their sleep time's effect, to allow
729 * for a gentler effect of sleepers:
730 */
731 if (sched_feat(GENTLE_FAIR_SLEEPERS))
732 thresh >>= 1;
695 733
696 /* ensure we never gain time by being placed backwards. */ 734 vruntime -= thresh;
697 vruntime = max_vruntime(se->vruntime, vruntime);
698 } 735 }
699 736
737 /* ensure we never gain time by being placed backwards. */
738 vruntime = max_vruntime(se->vruntime, vruntime);
739
700 se->vruntime = vruntime; 740 se->vruntime = vruntime;
701} 741}
702 742
@@ -722,10 +762,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
722 762
723static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) 763static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
724{ 764{
725 if (cfs_rq->last == se) 765 if (!se || cfs_rq->last == se)
726 cfs_rq->last = NULL; 766 cfs_rq->last = NULL;
727 767
728 if (cfs_rq->next == se) 768 if (!se || cfs_rq->next == se)
729 cfs_rq->next = NULL; 769 cfs_rq->next = NULL;
730} 770}
731 771
@@ -1016,7 +1056,7 @@ static void yield_task_fair(struct rq *rq)
1016 /* 1056 /*
1017 * Already in the rightmost position? 1057 * Already in the rightmost position?
1018 */ 1058 */
1019 if (unlikely(!rightmost || rightmost->vruntime < se->vruntime)) 1059 if (unlikely(!rightmost || entity_before(rightmost, se)))
1020 return; 1060 return;
1021 1061
1022 /* 1062 /*
@@ -1027,79 +1067,6 @@ static void yield_task_fair(struct rq *rq)
1027 se->vruntime = rightmost->vruntime + 1; 1067 se->vruntime = rightmost->vruntime + 1;
1028} 1068}
1029 1069
1030/*
1031 * wake_idle() will wake a task on an idle cpu if task->cpu is
1032 * not idle and an idle cpu is available. The span of cpus to
1033 * search starts with cpus closest then further out as needed,
1034 * so we always favor a closer, idle cpu.
1035 * Domains may include CPUs that are not usable for migration,
1036 * hence we need to mask them out (cpu_active_mask)
1037 *
1038 * Returns the CPU we should wake onto.
1039 */
1040#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1041static int wake_idle(int cpu, struct task_struct *p)
1042{
1043 struct sched_domain *sd;
1044 int i;
1045 unsigned int chosen_wakeup_cpu;
1046 int this_cpu;
1047
1048 /*
1049 * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
1050 * are idle and this is not a kernel thread and this task's affinity
1051 * allows it to be moved to preferred cpu, then just move!
1052 */
1053
1054 this_cpu = smp_processor_id();
1055 chosen_wakeup_cpu =
1056 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu;
1057
1058 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP &&
1059 idle_cpu(cpu) && idle_cpu(this_cpu) &&
1060 p->mm && !(p->flags & PF_KTHREAD) &&
1061 cpu_isset(chosen_wakeup_cpu, p->cpus_allowed))
1062 return chosen_wakeup_cpu;
1063
1064 /*
1065 * If it is idle, then it is the best cpu to run this task.
1066 *
1067 * This cpu is also the best, if it has more than one task already.
1068 * Siblings must be also busy(in most cases) as they didn't already
1069 * pickup the extra load from this cpu and hence we need not check
1070 * sibling runqueue info. This will avoid the checks and cache miss
1071 * penalities associated with that.
1072 */
1073 if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1)
1074 return cpu;
1075
1076 for_each_domain(cpu, sd) {
1077 if ((sd->flags & SD_WAKE_IDLE)
1078 || ((sd->flags & SD_WAKE_IDLE_FAR)
1079 && !task_hot(p, task_rq(p)->clock, sd))) {
1080 for_each_cpu_and(i, sched_domain_span(sd),
1081 &p->cpus_allowed) {
1082 if (cpu_active(i) && idle_cpu(i)) {
1083 if (i != task_cpu(p)) {
1084 schedstat_inc(p,
1085 se.nr_wakeups_idle);
1086 }
1087 return i;
1088 }
1089 }
1090 } else {
1091 break;
1092 }
1093 }
1094 return cpu;
1095}
1096#else /* !ARCH_HAS_SCHED_WAKE_IDLE*/
1097static inline int wake_idle(int cpu, struct task_struct *p)
1098{
1099 return cpu;
1100}
1101#endif
1102
1103#ifdef CONFIG_SMP 1070#ifdef CONFIG_SMP
1104 1071
1105#ifdef CONFIG_FAIR_GROUP_SCHED 1072#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1186,25 +1153,34 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
1186 1153
1187#endif 1154#endif
1188 1155
1189static int 1156static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1190wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1191 struct task_struct *p, int prev_cpu, int this_cpu, int sync,
1192 int idx, unsigned long load, unsigned long this_load,
1193 unsigned int imbalance)
1194{ 1157{
1195 struct task_struct *curr = this_rq->curr; 1158 struct task_struct *curr = current;
1196 struct task_group *tg; 1159 unsigned long this_load, load;
1197 unsigned long tl = this_load; 1160 int idx, this_cpu, prev_cpu;
1198 unsigned long tl_per_task; 1161 unsigned long tl_per_task;
1162 unsigned int imbalance;
1163 struct task_group *tg;
1199 unsigned long weight; 1164 unsigned long weight;
1200 int balanced; 1165 int balanced;
1201 1166
1202 if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) 1167 idx = sd->wake_idx;
1203 return 0; 1168 this_cpu = smp_processor_id();
1169 prev_cpu = task_cpu(p);
1170 load = source_load(prev_cpu, idx);
1171 this_load = target_load(this_cpu, idx);
1204 1172
1205 if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost || 1173 if (sync) {
1206 p->se.avg_overlap > sysctl_sched_migration_cost)) 1174 if (sched_feat(SYNC_LESS) &&
1207 sync = 0; 1175 (curr->se.avg_overlap > sysctl_sched_migration_cost ||
1176 p->se.avg_overlap > sysctl_sched_migration_cost))
1177 sync = 0;
1178 } else {
1179 if (sched_feat(SYNC_MORE) &&
1180 (curr->se.avg_overlap < sysctl_sched_migration_cost &&
1181 p->se.avg_overlap < sysctl_sched_migration_cost))
1182 sync = 1;
1183 }
1208 1184
1209 /* 1185 /*
1210 * If sync wakeup then subtract the (maximum possible) 1186 * If sync wakeup then subtract the (maximum possible)
@@ -1215,14 +1191,26 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1215 tg = task_group(current); 1191 tg = task_group(current);
1216 weight = current->se.load.weight; 1192 weight = current->se.load.weight;
1217 1193
1218 tl += effective_load(tg, this_cpu, -weight, -weight); 1194 this_load += effective_load(tg, this_cpu, -weight, -weight);
1219 load += effective_load(tg, prev_cpu, 0, -weight); 1195 load += effective_load(tg, prev_cpu, 0, -weight);
1220 } 1196 }
1221 1197
1222 tg = task_group(p); 1198 tg = task_group(p);
1223 weight = p->se.load.weight; 1199 weight = p->se.load.weight;
1224 1200
1225 balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <= 1201 imbalance = 100 + (sd->imbalance_pct - 100) / 2;
1202
1203 /*
1204 * In low-load situations, where prev_cpu is idle and this_cpu is idle
1205 * due to the sync cause above having dropped this_load to 0, we'll
1206 * always have an imbalance, but there's really nothing you can do
1207 * about that, so that's good too.
1208 *
1209 * Otherwise check if either cpus are near enough in load to allow this
1210 * task to be woken on this_cpu.
1211 */
1212 balanced = !this_load ||
1213 100*(this_load + effective_load(tg, this_cpu, weight, weight)) <=
1226 imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); 1214 imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
1227 1215
1228 /* 1216 /*
@@ -1236,14 +1224,15 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1236 schedstat_inc(p, se.nr_wakeups_affine_attempts); 1224 schedstat_inc(p, se.nr_wakeups_affine_attempts);
1237 tl_per_task = cpu_avg_load_per_task(this_cpu); 1225 tl_per_task = cpu_avg_load_per_task(this_cpu);
1238 1226
1239 if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <= 1227 if (balanced ||
1240 tl_per_task)) { 1228 (this_load <= load &&
1229 this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
1241 /* 1230 /*
1242 * This domain has SD_WAKE_AFFINE and 1231 * This domain has SD_WAKE_AFFINE and
1243 * p is cache cold in this domain, and 1232 * p is cache cold in this domain, and
1244 * there is no bad imbalance. 1233 * there is no bad imbalance.
1245 */ 1234 */
1246 schedstat_inc(this_sd, ttwu_move_affine); 1235 schedstat_inc(sd, ttwu_move_affine);
1247 schedstat_inc(p, se.nr_wakeups_affine); 1236 schedstat_inc(p, se.nr_wakeups_affine);
1248 1237
1249 return 1; 1238 return 1;
@@ -1251,67 +1240,216 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1251 return 0; 1240 return 0;
1252} 1241}
1253 1242
1254static int select_task_rq_fair(struct task_struct *p, int sync) 1243/*
1244 * find_idlest_group finds and returns the least busy CPU group within the
1245 * domain.
1246 */
1247static struct sched_group *
1248find_idlest_group(struct sched_domain *sd, struct task_struct *p,
1249 int this_cpu, int load_idx)
1255{ 1250{
1256 struct sched_domain *sd, *this_sd = NULL; 1251 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
1257 int prev_cpu, this_cpu, new_cpu; 1252 unsigned long min_load = ULONG_MAX, this_load = 0;
1258 unsigned long load, this_load; 1253 int imbalance = 100 + (sd->imbalance_pct-100)/2;
1259 struct rq *this_rq;
1260 unsigned int imbalance;
1261 int idx;
1262 1254
1263 prev_cpu = task_cpu(p); 1255 do {
1264 this_cpu = smp_processor_id(); 1256 unsigned long load, avg_load;
1265 this_rq = cpu_rq(this_cpu); 1257 int local_group;
1266 new_cpu = prev_cpu; 1258 int i;
1267 1259
1268 if (prev_cpu == this_cpu) 1260 /* Skip over this group if it has no CPUs allowed */
1269 goto out; 1261 if (!cpumask_intersects(sched_group_cpus(group),
1270 /* 1262 &p->cpus_allowed))
1271 * 'this_sd' is the first domain that both 1263 continue;
1272 * this_cpu and prev_cpu are present in: 1264
1273 */ 1265 local_group = cpumask_test_cpu(this_cpu,
1274 for_each_domain(this_cpu, sd) { 1266 sched_group_cpus(group));
1275 if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) { 1267
1276 this_sd = sd; 1268 /* Tally up the load of all CPUs in the group */
1277 break; 1269 avg_load = 0;
1270
1271 for_each_cpu(i, sched_group_cpus(group)) {
1272 /* Bias balancing toward cpus of our domain */
1273 if (local_group)
1274 load = source_load(i, load_idx);
1275 else
1276 load = target_load(i, load_idx);
1277
1278 avg_load += load;
1279 }
1280
1281 /* Adjust by relative CPU power of the group */
1282 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
1283
1284 if (local_group) {
1285 this_load = avg_load;
1286 this = group;
1287 } else if (avg_load < min_load) {
1288 min_load = avg_load;
1289 idlest = group;
1290 }
1291 } while (group = group->next, group != sd->groups);
1292
1293 if (!idlest || 100*this_load < imbalance*min_load)
1294 return NULL;
1295 return idlest;
1296}
1297
1298/*
1299 * find_idlest_cpu - find the idlest cpu among the cpus in group.
1300 */
1301static int
1302find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1303{
1304 unsigned long load, min_load = ULONG_MAX;
1305 int idlest = -1;
1306 int i;
1307
1308 /* Traverse only the allowed CPUs */
1309 for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
1310 load = weighted_cpuload(i);
1311
1312 if (load < min_load || (load == min_load && i == this_cpu)) {
1313 min_load = load;
1314 idlest = i;
1278 } 1315 }
1279 } 1316 }
1280 1317
1281 if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed))) 1318 return idlest;
1282 goto out; 1319}
1283 1320
1284 /* 1321/*
1285 * Check for affine wakeup and passive balancing possibilities. 1322 * sched_balance_self: balance the current task (running on cpu) in domains
1286 */ 1323 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
1287 if (!this_sd) 1324 * SD_BALANCE_EXEC.
1325 *
1326 * Balance, ie. select the least loaded group.
1327 *
1328 * Returns the target CPU number, or the same CPU if no balancing is needed.
1329 *
1330 * preempt must be disabled.
1331 */
1332static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
1333{
1334 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
1335 int cpu = smp_processor_id();
1336 int prev_cpu = task_cpu(p);
1337 int new_cpu = cpu;
1338 int want_affine = 0;
1339 int want_sd = 1;
1340 int sync = wake_flags & WF_SYNC;
1341
1342 if (sd_flag & SD_BALANCE_WAKE) {
1343 if (sched_feat(AFFINE_WAKEUPS) &&
1344 cpumask_test_cpu(cpu, &p->cpus_allowed))
1345 want_affine = 1;
1346 new_cpu = prev_cpu;
1347 }
1348
1349 rcu_read_lock();
1350 for_each_domain(cpu, tmp) {
1351 /*
1352 * If power savings logic is enabled for a domain, see if we
1353 * are not overloaded, if so, don't balance wider.
1354 */
1355 if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) {
1356 unsigned long power = 0;
1357 unsigned long nr_running = 0;
1358 unsigned long capacity;
1359 int i;
1360
1361 for_each_cpu(i, sched_domain_span(tmp)) {
1362 power += power_of(i);
1363 nr_running += cpu_rq(i)->cfs.nr_running;
1364 }
1365
1366 capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
1367
1368 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1369 nr_running /= 2;
1370
1371 if (nr_running < capacity)
1372 want_sd = 0;
1373 }
1374
1375 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
1376 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
1377
1378 affine_sd = tmp;
1379 want_affine = 0;
1380 }
1381
1382 if (!want_sd && !want_affine)
1383 break;
1384
1385 if (!(tmp->flags & sd_flag))
1386 continue;
1387
1388 if (want_sd)
1389 sd = tmp;
1390 }
1391
1392 if (sched_feat(LB_SHARES_UPDATE)) {
1393 /*
1394 * Pick the largest domain to update shares over
1395 */
1396 tmp = sd;
1397 if (affine_sd && (!tmp ||
1398 cpumask_weight(sched_domain_span(affine_sd)) >
1399 cpumask_weight(sched_domain_span(sd))))
1400 tmp = affine_sd;
1401
1402 if (tmp)
1403 update_shares(tmp);
1404 }
1405
1406 if (affine_sd && wake_affine(affine_sd, p, sync)) {
1407 new_cpu = cpu;
1288 goto out; 1408 goto out;
1409 }
1289 1410
1290 idx = this_sd->wake_idx; 1411 while (sd) {
1412 int load_idx = sd->forkexec_idx;
1413 struct sched_group *group;
1414 int weight;
1291 1415
1292 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; 1416 if (!(sd->flags & sd_flag)) {
1417 sd = sd->child;
1418 continue;
1419 }
1293 1420
1294 load = source_load(prev_cpu, idx); 1421 if (sd_flag & SD_BALANCE_WAKE)
1295 this_load = target_load(this_cpu, idx); 1422 load_idx = sd->wake_idx;
1423
1424 group = find_idlest_group(sd, p, cpu, load_idx);
1425 if (!group) {
1426 sd = sd->child;
1427 continue;
1428 }
1296 1429
1297 if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, 1430 new_cpu = find_idlest_cpu(group, p, cpu);
1298 load, this_load, imbalance)) 1431 if (new_cpu == -1 || new_cpu == cpu) {
1299 return this_cpu; 1432 /* Now try balancing at a lower domain level of cpu */
1433 sd = sd->child;
1434 continue;
1435 }
1300 1436
1301 /* 1437 /* Now try balancing at a lower domain level of new_cpu */
1302 * Start passive balancing when half the imbalance_pct 1438 cpu = new_cpu;
1303 * limit is reached. 1439 weight = cpumask_weight(sched_domain_span(sd));
1304 */ 1440 sd = NULL;
1305 if (this_sd->flags & SD_WAKE_BALANCE) { 1441 for_each_domain(cpu, tmp) {
1306 if (imbalance*this_load <= 100*load) { 1442 if (weight <= cpumask_weight(sched_domain_span(tmp)))
1307 schedstat_inc(this_sd, ttwu_move_balance); 1443 break;
1308 schedstat_inc(p, se.nr_wakeups_passive); 1444 if (tmp->flags & sd_flag)
1309 return this_cpu; 1445 sd = tmp;
1310 } 1446 }
1447 /* while loop will break here if sd == NULL */
1311 } 1448 }
1312 1449
1313out: 1450out:
1314 return wake_idle(new_cpu, p); 1451 rcu_read_unlock();
1452 return new_cpu;
1315} 1453}
1316#endif /* CONFIG_SMP */ 1454#endif /* CONFIG_SMP */
1317 1455
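Editor's note: the new select_task_rq_fair() walks the domain hierarchy itself: it remembers the widest domain usable for an affine wakeup, and otherwise repeatedly picks the least loaded group via find_idlest_group() and the least loaded allowed CPU inside it via find_idlest_cpu(), descending one level per iteration. Below is a deliberately flattened sketch of that inner group-then-CPU selection on a made-up topology, without the imbalance_pct bias toward the local group or the cpus_allowed filtering.

#include <limits.h>
#include <stdio.h>

/* Toy topology: one domain with three groups of two CPUs each. */
static const int group_cpu[3][2] = { {0, 1}, {2, 3}, {4, 5} };
static const unsigned long cpu_load[6] = { 70, 90, 10, 20, 40, 45 };

/* Pick the least loaded group, then the least loaded CPU inside it. */
static int pick_cpu(void)
{
	unsigned long best_load = ULONG_MAX;
	int best_group = -1;

	for (int g = 0; g < 3; g++) {
		unsigned long sum = cpu_load[group_cpu[g][0]] +
				    cpu_load[group_cpu[g][1]];
		if (sum < best_load) {
			best_load = sum;
			best_group = g;
		}
	}

	int best_cpu = group_cpu[best_group][0];
	if (cpu_load[group_cpu[best_group][1]] < cpu_load[best_cpu])
		best_cpu = group_cpu[best_group][1];

	return best_cpu;
}

int main(void)
{
	printf("fork/exec would land on cpu %d\n", pick_cpu());
	return 0;
}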
@@ -1424,11 +1562,12 @@ static void set_next_buddy(struct sched_entity *se)
1424/* 1562/*
1425 * Preempt the current task with a newly woken task if needed: 1563 * Preempt the current task with a newly woken task if needed:
1426 */ 1564 */
1427static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) 1565static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1428{ 1566{
1429 struct task_struct *curr = rq->curr; 1567 struct task_struct *curr = rq->curr;
1430 struct sched_entity *se = &curr->se, *pse = &p->se; 1568 struct sched_entity *se = &curr->se, *pse = &p->se;
1431 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1569 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1570 int sync = wake_flags & WF_SYNC;
1432 1571
1433 update_curr(cfs_rq); 1572 update_curr(cfs_rq);
1434 1573
@@ -1454,7 +1593,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
1454 */ 1593 */
1455 if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle)) 1594 if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
1456 set_last_buddy(se); 1595 set_last_buddy(se);
1457 set_next_buddy(pse); 1596 if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK))
1597 set_next_buddy(pse);
1458 1598
1459 /* 1599 /*
1460 * We can come here with TIF_NEED_RESCHED already set from new task 1600 * We can come here with TIF_NEED_RESCHED already set from new task
@@ -1476,16 +1616,25 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
1476 return; 1616 return;
1477 } 1617 }
1478 1618
1479 if (!sched_feat(WAKEUP_PREEMPT)) 1619 if ((sched_feat(WAKEUP_SYNC) && sync) ||
1480 return; 1620 (sched_feat(WAKEUP_OVERLAP) &&
1481 1621 (se->avg_overlap < sysctl_sched_migration_cost &&
1482 if (sched_feat(WAKEUP_OVERLAP) && (sync || 1622 pse->avg_overlap < sysctl_sched_migration_cost))) {
1483 (se->avg_overlap < sysctl_sched_migration_cost &&
1484 pse->avg_overlap < sysctl_sched_migration_cost))) {
1485 resched_task(curr); 1623 resched_task(curr);
1486 return; 1624 return;
1487 } 1625 }
1488 1626
1627 if (sched_feat(WAKEUP_RUNNING)) {
1628 if (pse->avg_running < se->avg_running) {
1629 set_next_buddy(pse);
1630 resched_task(curr);
1631 return;
1632 }
1633 }
1634
1635 if (!sched_feat(WAKEUP_PREEMPT))
1636 return;
1637
1489 find_matching_se(&se, &pse); 1638 find_matching_se(&se, &pse);
1490 1639
1491 BUG_ON(!pse); 1640 BUG_ON(!pse);
@@ -1508,8 +1657,13 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
1508 /* 1657 /*
1509 * If se was a buddy, clear it so that it will have to earn 1658 * If se was a buddy, clear it so that it will have to earn
1510 * the favour again. 1659 * the favour again.
1660 *
1661 * If se was not a buddy, clear the buddies because neither
1662 * was eligible to run, let them earn it again.
1663 *
1664 * IOW. unconditionally clear buddies.
1511 */ 1665 */
1512 __clear_buddies(cfs_rq, se); 1666 __clear_buddies(cfs_rq, NULL);
1513 set_next_entity(cfs_rq, se); 1667 set_next_entity(cfs_rq, se);
1514 cfs_rq = group_cfs_rq(se); 1668 cfs_rq = group_cfs_rq(se);
1515 } while (cfs_rq); 1669 } while (cfs_rq);
@@ -1708,11 +1862,13 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
1708 sched_info_queued(p); 1862 sched_info_queued(p);
1709 1863
1710 update_curr(cfs_rq); 1864 update_curr(cfs_rq);
1865 if (curr)
1866 se->vruntime = curr->vruntime;
1711 place_entity(cfs_rq, se, 1); 1867 place_entity(cfs_rq, se, 1);
1712 1868
1713 /* 'curr' will be NULL if the child belongs to a different group */ 1869 /* 'curr' will be NULL if the child belongs to a different group */
1714 if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) && 1870 if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
1715 curr && curr->vruntime < se->vruntime) { 1871 curr && entity_before(curr, se)) {
1716 /* 1872 /*
1717 * Upon rescheduling, sched_class::put_prev_task() will place 1873 * Upon rescheduling, sched_class::put_prev_task() will place
1718 * 'current' within the tree based on its new key value. 1874 * 'current' within the tree based on its new key value.
@@ -1783,6 +1939,25 @@ static void moved_group_fair(struct task_struct *p)
1783} 1939}
1784#endif 1940#endif
1785 1941
1942unsigned int get_rr_interval_fair(struct task_struct *task)
1943{
1944 struct sched_entity *se = &task->se;
1945 unsigned long flags;
1946 struct rq *rq;
1947 unsigned int rr_interval = 0;
1948
1949 /*
1950 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
1951 * idle runqueue:
1952 */
1953 rq = task_rq_lock(task, &flags);
1954 if (rq->cfs.load.weight)
1955 rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
1956 task_rq_unlock(rq, &flags);
1957
1958 return rr_interval;
1959}
1960
1786/* 1961/*
1787 * All the scheduling class methods: 1962 * All the scheduling class methods:
1788 */ 1963 */
@@ -1811,6 +1986,8 @@ static const struct sched_class fair_sched_class = {
1811 .prio_changed = prio_changed_fair, 1986 .prio_changed = prio_changed_fair,
1812 .switched_to = switched_to_fair, 1987 .switched_to = switched_to_fair,
1813 1988
1989 .get_rr_interval = get_rr_interval_fair,
1990
1814#ifdef CONFIG_FAIR_GROUP_SCHED 1991#ifdef CONFIG_FAIR_GROUP_SCHED
1815 .moved_group = moved_group_fair, 1992 .moved_group = moved_group_fair,
1816#endif 1993#endif
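Editor's note: the new .get_rr_interval hook is what backs sys_sched_rr_get_interval() for CFS tasks, so on kernels with this change the syscall reports the task's current CFS slice instead of 0 for SCHED_OTHER. A small userspace probe of that behaviour:

#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;

	/* pid 0 means the calling thread. */
	if (sched_rr_get_interval(0, &ts) != 0) {
		perror("sched_rr_get_interval");
		return 1;
	}
	printf("timeslice: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}

On an otherwise idle runqueue the reported slice is 0, matching the comment in get_rr_interval_fair() above.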
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 4569bfa7df9b..0d94083582c7 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -1,17 +1,123 @@
1SCHED_FEAT(NEW_FAIR_SLEEPERS, 1) 1/*
2 * Disregards a certain amount of sleep time (sched_latency_ns) and
3 * considers the task to be running during that period. This gives it
4 * a service deficit on wakeup, allowing it to run sooner.
5 */
6SCHED_FEAT(FAIR_SLEEPERS, 1)
7
8/*
9 * Only give sleepers 50% of their service deficit. This allows
10 * them to run sooner, but does not allow tons of sleepers to
11 * rip the spread apart.
12 */
13SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1)
14
15/*
16 * By not normalizing the sleep time, heavy tasks get an effective
17 * longer period, and lighter tasks an effective shorter period
18 * during which they are considered running.
19 */
2SCHED_FEAT(NORMALIZED_SLEEPER, 0) 20SCHED_FEAT(NORMALIZED_SLEEPER, 0)
3SCHED_FEAT(ADAPTIVE_GRAN, 1) 21
4SCHED_FEAT(WAKEUP_PREEMPT, 1) 22/*
23 * Place new tasks ahead so that they do not starve already running
24 * tasks
25 */
5SCHED_FEAT(START_DEBIT, 1) 26SCHED_FEAT(START_DEBIT, 1)
27
28/*
29 * Should wakeups try to preempt running tasks.
30 */
31SCHED_FEAT(WAKEUP_PREEMPT, 1)
32
33/*
34 * Compute wakeup_gran based on task behaviour, clipped to
35 * [0, sched_wakeup_gran_ns]
36 */
37SCHED_FEAT(ADAPTIVE_GRAN, 1)
38
39/*
40 * When converting the wakeup granularity to virtual time, do it such
41 * that heavier tasks preempting a lighter task have an edge.
42 */
43SCHED_FEAT(ASYM_GRAN, 1)
44
45/*
46 * Always wakeup-preempt SYNC wakeups, see SYNC_WAKEUPS.
47 */
48SCHED_FEAT(WAKEUP_SYNC, 0)
49
50/*
51 * Wakeup preempt based on task behaviour. Tasks that do not overlap
52 * don't get preempted.
53 */
54SCHED_FEAT(WAKEUP_OVERLAP, 0)
55
56/*
57 * Wakeup preemption towards tasks that run short
58 */
59SCHED_FEAT(WAKEUP_RUNNING, 0)
60
61/*
62 * Use the SYNC wakeup hint, pipes and the likes use this to indicate
63 * the remote end is likely to consume the data we just wrote, and
64 * therefore has cache benefit from being placed on the same cpu, see
65 * also AFFINE_WAKEUPS.
66 */
67SCHED_FEAT(SYNC_WAKEUPS, 1)
68
69/*
70 * Based on load and program behaviour, see if it makes sense to place
71 * a newly woken task on the same cpu as the task that woke it --
72 * improve cache locality. Typically used with SYNC wakeups as
73 * generated by pipes and the like, see also SYNC_WAKEUPS.
74 */
6SCHED_FEAT(AFFINE_WAKEUPS, 1) 75SCHED_FEAT(AFFINE_WAKEUPS, 1)
76
77/*
78 * Weaken SYNC hint based on overlap
79 */
80SCHED_FEAT(SYNC_LESS, 1)
81
82/*
83 * Add SYNC hint based on overlap
84 */
85SCHED_FEAT(SYNC_MORE, 0)
86
87/*
88 * Prefer to schedule the task we woke last (assuming it failed
 89 * wakeup-preemption), since it's likely going to consume data we
90 * touched, increases cache locality.
91 */
92SCHED_FEAT(NEXT_BUDDY, 0)
93
94/*
95 * Prefer to schedule the task that ran last (when we did
96 * wake-preempt) as that likely will touch the same data, increases
97 * cache locality.
98 */
99SCHED_FEAT(LAST_BUDDY, 1)
100
101/*
 102 * Consider buddies to be cache hot, decreases the likelihood of a
103 * cache buddy being migrated away, increases cache locality.
104 */
7SCHED_FEAT(CACHE_HOT_BUDDY, 1) 105SCHED_FEAT(CACHE_HOT_BUDDY, 1)
8SCHED_FEAT(SYNC_WAKEUPS, 1) 106
107/*
108 * Use arch dependent cpu power functions
109 */
110SCHED_FEAT(ARCH_POWER, 0)
111
9SCHED_FEAT(HRTICK, 0) 112SCHED_FEAT(HRTICK, 0)
10SCHED_FEAT(DOUBLE_TICK, 0) 113SCHED_FEAT(DOUBLE_TICK, 0)
11SCHED_FEAT(ASYM_GRAN, 1)
12SCHED_FEAT(LB_BIAS, 1) 114SCHED_FEAT(LB_BIAS, 1)
13SCHED_FEAT(LB_WAKEUP_UPDATE, 1) 115SCHED_FEAT(LB_SHARES_UPDATE, 1)
14SCHED_FEAT(ASYM_EFF_LOAD, 1) 116SCHED_FEAT(ASYM_EFF_LOAD, 1)
15SCHED_FEAT(WAKEUP_OVERLAP, 0) 117
16SCHED_FEAT(LAST_BUDDY, 1) 118/*
119 * Spin-wait on mutex acquisition when the mutex owner is running on
120 * another cpu -- assumes that when the owner is running, it will soon
121 * release the lock. Decreases scheduling overhead.
122 */
17SCHED_FEAT(OWNER_SPIN, 1) 123SCHED_FEAT(OWNER_SPIN, 1)
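
Each SCHED_FEAT(name, enabled) line doubles as a bit definition and a default value. A standalone sketch of the usual X-macro consumption (the real consumer lives in kernel/sched.c, which is not shown in this diff; the names and the stdio wrapper are illustrative, and sched_features.h is assumed to be on the include path):

/*
 * Standalone sketch: the header is included twice with different
 * SCHED_FEAT() definitions -- once to enumerate bit positions, once to
 * build the default feature mask.
 */
#include <stdio.h>

#define SCHED_FEAT(name, enabled)	__FEAT_##name,
enum {
#include "sched_features.h"
	__FEAT_NR
};
#undef SCHED_FEAT

#define SCHED_FEAT(name, enabled)	((enabled) << __FEAT_##name) |
static const unsigned int default_features =
#include "sched_features.h"
	0;
#undef SCHED_FEAT

int main(void)
{
	printf("default feature mask: %#x (%d features)\n",
	       default_features, __FEAT_NR);
	return 0;
}
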
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 499672c10cbd..b133a28fcde3 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -6,7 +6,7 @@
6 */ 6 */
7 7
8#ifdef CONFIG_SMP 8#ifdef CONFIG_SMP
9static int select_task_rq_idle(struct task_struct *p, int sync) 9static int select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
10{ 10{
 11 return task_cpu(p); /* IDLE tasks are never migrated */ 11 return task_cpu(p); /* IDLE tasks are never migrated */
12} 12}
@@ -14,7 +14,7 @@ static int select_task_rq_idle(struct task_struct *p, int sync)
14/* 14/*
15 * Idle tasks are unconditionally rescheduled: 15 * Idle tasks are unconditionally rescheduled:
16 */ 16 */
17static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync) 17static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
18{ 18{
19 resched_task(rq->idle); 19 resched_task(rq->idle);
20} 20}
@@ -97,6 +97,11 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
97 check_preempt_curr(rq, p, 0); 97 check_preempt_curr(rq, p, 0);
98} 98}
99 99
100unsigned int get_rr_interval_idle(struct task_struct *task)
101{
102 return 0;
103}
104
100/* 105/*
101 * Simple, special scheduling class for the per-CPU idle tasks: 106 * Simple, special scheduling class for the per-CPU idle tasks:
102 */ 107 */
@@ -122,6 +127,8 @@ static const struct sched_class idle_sched_class = {
122 .set_curr_task = set_curr_task_idle, 127 .set_curr_task = set_curr_task_idle,
123 .task_tick = task_tick_idle, 128 .task_tick = task_tick_idle,
124 129
130 .get_rr_interval = get_rr_interval_idle,
131
125 .prio_changed = prio_changed_idle, 132 .prio_changed = prio_changed_idle,
126 .switched_to = switched_to_idle, 133 .switched_to = switched_to_idle,
127 134
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 9bf0d2a73045..a4d790cddb19 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -3,13 +3,18 @@
3 * policies) 3 * policies)
4 */ 4 */
5 5
6#ifdef CONFIG_RT_GROUP_SCHED
7
8#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
9
6static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) 10static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
7{ 11{
12#ifdef CONFIG_SCHED_DEBUG
13 WARN_ON_ONCE(!rt_entity_is_task(rt_se));
14#endif
8 return container_of(rt_se, struct task_struct, rt); 15 return container_of(rt_se, struct task_struct, rt);
9} 16}
10 17
11#ifdef CONFIG_RT_GROUP_SCHED
12
13static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) 18static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
14{ 19{
15 return rt_rq->rq; 20 return rt_rq->rq;
@@ -22,6 +27,13 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
22 27
23#else /* CONFIG_RT_GROUP_SCHED */ 28#else /* CONFIG_RT_GROUP_SCHED */
24 29
30#define rt_entity_is_task(rt_se) (1)
31
32static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
33{
34 return container_of(rt_se, struct task_struct, rt);
35}
36
25static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) 37static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
26{ 38{
27 return container_of(rt_rq, struct rq, rt); 39 return container_of(rt_rq, struct rq, rt);
@@ -73,7 +85,7 @@ static inline void rt_clear_overload(struct rq *rq)
73 85
74static void update_rt_migration(struct rt_rq *rt_rq) 86static void update_rt_migration(struct rt_rq *rt_rq)
75{ 87{
76 if (rt_rq->rt_nr_migratory && (rt_rq->rt_nr_running > 1)) { 88 if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
77 if (!rt_rq->overloaded) { 89 if (!rt_rq->overloaded) {
78 rt_set_overload(rq_of_rt_rq(rt_rq)); 90 rt_set_overload(rq_of_rt_rq(rt_rq));
79 rt_rq->overloaded = 1; 91 rt_rq->overloaded = 1;
@@ -86,6 +98,12 @@ static void update_rt_migration(struct rt_rq *rt_rq)
86 98
87static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 99static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
88{ 100{
101 if (!rt_entity_is_task(rt_se))
102 return;
103
104 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
105
106 rt_rq->rt_nr_total++;
89 if (rt_se->nr_cpus_allowed > 1) 107 if (rt_se->nr_cpus_allowed > 1)
90 rt_rq->rt_nr_migratory++; 108 rt_rq->rt_nr_migratory++;
91 109
@@ -94,6 +112,12 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
94 112
95static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 113static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
96{ 114{
115 if (!rt_entity_is_task(rt_se))
116 return;
117
118 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
119
120 rt_rq->rt_nr_total--;
97 if (rt_se->nr_cpus_allowed > 1) 121 if (rt_se->nr_cpus_allowed > 1)
98 rt_rq->rt_nr_migratory--; 122 rt_rq->rt_nr_migratory--;
99 123
@@ -112,6 +136,11 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
112 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); 136 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
113} 137}
114 138
139static inline int has_pushable_tasks(struct rq *rq)
140{
141 return !plist_head_empty(&rq->rt.pushable_tasks);
142}
143
115#else 144#else
116 145
117static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p) 146static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
@@ -586,6 +615,8 @@ static void update_curr_rt(struct rq *rq)
586 curr->se.exec_start = rq->clock; 615 curr->se.exec_start = rq->clock;
587 cpuacct_charge(curr, delta_exec); 616 cpuacct_charge(curr, delta_exec);
588 617
618 sched_rt_avg_update(rq, delta_exec);
619
589 if (!rt_bandwidth_enabled()) 620 if (!rt_bandwidth_enabled())
590 return; 621 return;
591 622
@@ -858,8 +889,6 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
858 889
859 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 890 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
860 enqueue_pushable_task(rq, p); 891 enqueue_pushable_task(rq, p);
861
862 inc_cpu_load(rq, p->se.load.weight);
863} 892}
864 893
865static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) 894static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
@@ -870,8 +899,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
870 dequeue_rt_entity(rt_se); 899 dequeue_rt_entity(rt_se);
871 900
872 dequeue_pushable_task(rq, p); 901 dequeue_pushable_task(rq, p);
873
874 dec_cpu_load(rq, p->se.load.weight);
875} 902}
876 903
877/* 904/*
@@ -911,10 +938,13 @@ static void yield_task_rt(struct rq *rq)
911#ifdef CONFIG_SMP 938#ifdef CONFIG_SMP
912static int find_lowest_rq(struct task_struct *task); 939static int find_lowest_rq(struct task_struct *task);
913 940
914static int select_task_rq_rt(struct task_struct *p, int sync) 941static int select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
915{ 942{
916 struct rq *rq = task_rq(p); 943 struct rq *rq = task_rq(p);
917 944
945 if (sd_flag != SD_BALANCE_WAKE)
946 return smp_processor_id();
947
918 /* 948 /*
919 * If the current task is an RT task, then 949 * If the current task is an RT task, then
920 * try to see if we can wake this RT task up on another 950 * try to see if we can wake this RT task up on another
@@ -972,7 +1002,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
972/* 1002/*
973 * Preempt the current task with a newly woken task if needed: 1003 * Preempt the current task with a newly woken task if needed:
974 */ 1004 */
975static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync) 1005static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
976{ 1006{
977 if (p->prio < rq->curr->prio) { 1007 if (p->prio < rq->curr->prio) {
978 resched_task(rq->curr); 1008 resched_task(rq->curr);
@@ -1048,6 +1078,14 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
1048 if (p) 1078 if (p)
1049 dequeue_pushable_task(rq, p); 1079 dequeue_pushable_task(rq, p);
1050 1080
1081#ifdef CONFIG_SMP
1082 /*
1083 * We detect this state here so that we can avoid taking the RQ
1084 * lock again later if there is no need to push
1085 */
1086 rq->post_schedule = has_pushable_tasks(rq);
1087#endif
1088
1051 return p; 1089 return p;
1052} 1090}
1053 1091
@@ -1146,13 +1184,6 @@ static int find_lowest_rq(struct task_struct *task)
1146 return -1; /* No targets found */ 1184 return -1; /* No targets found */
1147 1185
1148 /* 1186 /*
1149 * Only consider CPUs that are usable for migration.
1150 * I guess we might want to change cpupri_find() to ignore those
1151 * in the first place.
1152 */
1153 cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);
1154
1155 /*
1156 * At this point we have built a mask of cpus representing the 1187 * At this point we have built a mask of cpus representing the
1157 * lowest priority tasks in the system. Now we want to elect 1188 * lowest priority tasks in the system. Now we want to elect
1158 * the best one based on our affinity and topology. 1189 * the best one based on our affinity and topology.
@@ -1246,11 +1277,6 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1246 return lowest_rq; 1277 return lowest_rq;
1247} 1278}
1248 1279
1249static inline int has_pushable_tasks(struct rq *rq)
1250{
1251 return !plist_head_empty(&rq->rt.pushable_tasks);
1252}
1253
1254static struct task_struct *pick_next_pushable_task(struct rq *rq) 1280static struct task_struct *pick_next_pushable_task(struct rq *rq)
1255{ 1281{
1256 struct task_struct *p; 1282 struct task_struct *p;
@@ -1450,23 +1476,9 @@ static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
1450 pull_rt_task(rq); 1476 pull_rt_task(rq);
1451} 1477}
1452 1478
1453/*
1454 * assumes rq->lock is held
1455 */
1456static int needs_post_schedule_rt(struct rq *rq)
1457{
1458 return has_pushable_tasks(rq);
1459}
1460
1461static void post_schedule_rt(struct rq *rq) 1479static void post_schedule_rt(struct rq *rq)
1462{ 1480{
1463 /*
1464 * This is only called if needs_post_schedule_rt() indicates that
1465 * we need to push tasks away
1466 */
1467 spin_lock_irq(&rq->lock);
1468 push_rt_tasks(rq); 1481 push_rt_tasks(rq);
1469 spin_unlock_irq(&rq->lock);
1470} 1482}
1471 1483
1472/* 1484/*
@@ -1722,6 +1734,17 @@ static void set_curr_task_rt(struct rq *rq)
1722 dequeue_pushable_task(rq, p); 1734 dequeue_pushable_task(rq, p);
1723} 1735}
1724 1736
1737unsigned int get_rr_interval_rt(struct task_struct *task)
1738{
1739 /*
1740 * Time slice is 0 for SCHED_FIFO tasks
1741 */
1742 if (task->policy == SCHED_RR)
1743 return DEF_TIMESLICE;
1744 else
1745 return 0;
1746}
1747
1725static const struct sched_class rt_sched_class = { 1748static const struct sched_class rt_sched_class = {
1726 .next = &fair_sched_class, 1749 .next = &fair_sched_class,
1727 .enqueue_task = enqueue_task_rt, 1750 .enqueue_task = enqueue_task_rt,
@@ -1742,7 +1765,6 @@ static const struct sched_class rt_sched_class = {
1742 .rq_online = rq_online_rt, 1765 .rq_online = rq_online_rt,
1743 .rq_offline = rq_offline_rt, 1766 .rq_offline = rq_offline_rt,
1744 .pre_schedule = pre_schedule_rt, 1767 .pre_schedule = pre_schedule_rt,
1745 .needs_post_schedule = needs_post_schedule_rt,
1746 .post_schedule = post_schedule_rt, 1768 .post_schedule = post_schedule_rt,
1747 .task_wake_up = task_wake_up_rt, 1769 .task_wake_up = task_wake_up_rt,
1748 .switched_from = switched_from_rt, 1770 .switched_from = switched_from_rt,
@@ -1751,6 +1773,8 @@ static const struct sched_class rt_sched_class = {
1751 .set_curr_task = set_curr_task_rt, 1773 .set_curr_task = set_curr_task_rt,
1752 .task_tick = task_tick_rt, 1774 .task_tick = task_tick_rt,
1753 1775
1776 .get_rr_interval = get_rr_interval_rt,
1777
1754 .prio_changed = prio_changed_rt, 1778 .prio_changed = prio_changed_rt,
1755 .switched_to = switched_to_rt, 1779 .switched_to = switched_to_rt,
1756}; 1780};
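
The new rt_nr_total counter tracks runnable RT tasks (as opposed to group entities) on the root rt_rq, and a runqueue is flagged overloaded once more than one RT task is runnable and at least one of them may migrate. A standalone sketch of that bookkeeping, with illustrative names:

/*
 * Standalone sketch of the overload bookkeeping: rt_nr_total counts
 * runnable RT tasks; rt_nr_migratory counts those allowed on more than
 * one CPU. Types and names are illustrative only.
 */
#include <stdbool.h>
#include <stdio.h>

struct rt_rq_sketch {
	unsigned int rt_nr_total;
	unsigned int rt_nr_migratory;
	bool overloaded;
};

static void update_overload(struct rt_rq_sketch *rt_rq)
{
	/* Same condition as update_rt_migration() above. */
	rt_rq->overloaded = rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1;
}

int main(void)
{
	struct rt_rq_sketch rq = { .rt_nr_total = 2, .rt_nr_migratory = 1 };

	update_overload(&rq);
	printf("overloaded: %d\n", rq.overloaded);	/* 1: a task can be pushed */
	return 0;
}
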
diff --git a/kernel/signal.c b/kernel/signal.c
index ccf1ceedaebe..64c5deeaca5d 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2454,11 +2454,9 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
2454 stack_t oss; 2454 stack_t oss;
2455 int error; 2455 int error;
2456 2456
2457 if (uoss) { 2457 oss.ss_sp = (void __user *) current->sas_ss_sp;
2458 oss.ss_sp = (void __user *) current->sas_ss_sp; 2458 oss.ss_size = current->sas_ss_size;
2459 oss.ss_size = current->sas_ss_size; 2459 oss.ss_flags = sas_ss_flags(sp);
2460 oss.ss_flags = sas_ss_flags(sp);
2461 }
2462 2460
2463 if (uss) { 2461 if (uss) {
2464 void __user *ss_sp; 2462 void __user *ss_sp;
@@ -2466,10 +2464,12 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
2466 int ss_flags; 2464 int ss_flags;
2467 2465
2468 error = -EFAULT; 2466 error = -EFAULT;
2469 if (!access_ok(VERIFY_READ, uss, sizeof(*uss)) 2467 if (!access_ok(VERIFY_READ, uss, sizeof(*uss)))
2470 || __get_user(ss_sp, &uss->ss_sp) 2468 goto out;
2471 || __get_user(ss_flags, &uss->ss_flags) 2469 error = __get_user(ss_sp, &uss->ss_sp) |
2472 || __get_user(ss_size, &uss->ss_size)) 2470 __get_user(ss_flags, &uss->ss_flags) |
2471 __get_user(ss_size, &uss->ss_size);
2472 if (error)
2473 goto out; 2473 goto out;
2474 2474
2475 error = -EPERM; 2475 error = -EPERM;
@@ -2501,13 +2501,16 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
2501 current->sas_ss_size = ss_size; 2501 current->sas_ss_size = ss_size;
2502 } 2502 }
2503 2503
2504 error = 0;
2504 if (uoss) { 2505 if (uoss) {
2505 error = -EFAULT; 2506 error = -EFAULT;
2506 if (copy_to_user(uoss, &oss, sizeof(oss))) 2507 if (!access_ok(VERIFY_WRITE, uoss, sizeof(*uoss)))
2507 goto out; 2508 goto out;
2509 error = __put_user(oss.ss_sp, &uoss->ss_sp) |
2510 __put_user(oss.ss_size, &uoss->ss_size) |
2511 __put_user(oss.ss_flags, &uoss->ss_flags);
2508 } 2512 }
2509 2513
2510 error = 0;
2511out: 2514out:
2512 return error; 2515 return error;
2513} 2516}
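
The rewritten do_sigaltstack() copies the stack_t field by field and ORs the __get_user()/__put_user() return values, which works because every failure yields the same -EFAULT. A standalone sketch of the idiom, with stand-in helpers:

/*
 * Standalone sketch of the error-combining idiom: each field copy returns
 * 0 or -EFAULT, so OR-ing the results is 0 only if every copy succeeded.
 * copy_field() is a stand-in for __get_user()/__put_user().
 */
#include <stdio.h>

#define EFAULT 14

static int copy_field(int fail)
{
	return fail ? -EFAULT : 0;
}

int main(void)
{
	int error = copy_field(0) | copy_field(0) | copy_field(1);

	printf("combined error: %d\n", error);	/* non-zero: bail out once */
	return 0;
}
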
diff --git a/kernel/smp.c b/kernel/smp.c
index ad63d8501207..8e218500ab14 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -57,7 +57,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
57 return NOTIFY_BAD; 57 return NOTIFY_BAD;
58 break; 58 break;
59 59
60#ifdef CONFIG_CPU_HOTPLUG 60#ifdef CONFIG_HOTPLUG_CPU
61 case CPU_UP_CANCELED: 61 case CPU_UP_CANCELED:
62 case CPU_UP_CANCELED_FROZEN: 62 case CPU_UP_CANCELED_FROZEN:
63 63
@@ -177,6 +177,11 @@ void generic_smp_call_function_interrupt(void)
177 int cpu = get_cpu(); 177 int cpu = get_cpu();
178 178
179 /* 179 /*
180 * Shouldn't receive this interrupt on a cpu that is not yet online.
181 */
182 WARN_ON_ONCE(!cpu_online(cpu));
183
184 /*
180 * Ensure entry is visible on call_function_queue after we have 185 * Ensure entry is visible on call_function_queue after we have
181 * entered the IPI. See comment in smp_call_function_many. 186 * entered the IPI. See comment in smp_call_function_many.
182 * If we don't have this, then we may miss an entry on the list 187 * If we don't have this, then we may miss an entry on the list
@@ -230,6 +235,11 @@ void generic_smp_call_function_single_interrupt(void)
230 unsigned int data_flags; 235 unsigned int data_flags;
231 LIST_HEAD(list); 236 LIST_HEAD(list);
232 237
238 /*
239 * Shouldn't receive this interrupt on a cpu that is not yet online.
240 */
241 WARN_ON_ONCE(!cpu_online(smp_processor_id()));
242
233 spin_lock(&q->lock); 243 spin_lock(&q->lock);
234 list_replace_init(&q->list, &list); 244 list_replace_init(&q->list, &list);
235 spin_unlock(&q->lock); 245 spin_unlock(&q->lock);
@@ -285,8 +295,14 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
285 */ 295 */
286 this_cpu = get_cpu(); 296 this_cpu = get_cpu();
287 297
288 /* Can deadlock when called with interrupts disabled */ 298 /*
289 WARN_ON_ONCE(irqs_disabled() && !oops_in_progress); 299 * Can deadlock when called with interrupts disabled.
300 * We allow cpu's that are not yet online though, as no one else can
301 * send smp call function interrupt to this cpu and as such deadlocks
302 * can't happen.
303 */
304 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
305 && !oops_in_progress);
290 306
291 if (cpu == this_cpu) { 307 if (cpu == this_cpu) {
292 local_irq_save(flags); 308 local_irq_save(flags);
@@ -329,8 +345,14 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
329{ 345{
330 csd_lock(data); 346 csd_lock(data);
331 347
332 /* Can deadlock when called with interrupts disabled */ 348 /*
333 WARN_ON_ONCE(wait && irqs_disabled() && !oops_in_progress); 349 * Can deadlock when called with interrupts disabled.
350 * We allow cpu's that are not yet online though, as no one else can
351 * send smp call function interrupt to this cpu and as such deadlocks
352 * can't happen.
353 */
354 WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled()
355 && !oops_in_progress);
334 356
335 generic_exec_single(cpu, data, wait); 357 generic_exec_single(cpu, data, wait);
336} 358}
@@ -365,8 +387,14 @@ void smp_call_function_many(const struct cpumask *mask,
365 unsigned long flags; 387 unsigned long flags;
366 int cpu, next_cpu, this_cpu = smp_processor_id(); 388 int cpu, next_cpu, this_cpu = smp_processor_id();
367 389
368 /* Can deadlock when called with interrupts disabled */ 390 /*
369 WARN_ON_ONCE(irqs_disabled() && !oops_in_progress); 391 * Can deadlock when called with interrupts disabled.
392 * We allow cpu's that are not yet online though, as no one else can
393 * send smp call function interrupt to this cpu and as such deadlocks
394 * can't happen.
395 */
396 WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
397 && !oops_in_progress);
370 398
371 /* So, what's a CPU they want? Ignoring this one. */ 399 /* So, what's a CPU they want? Ignoring this one. */
372 cpu = cpumask_first_and(mask, cpu_online_mask); 400 cpu = cpumask_first_and(mask, cpu_online_mask);
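
The relaxed WARN_ON_ONCE() only complains about disabled interrupts once the calling CPU is online, since an offline CPU cannot yet be the target of call-function IPIs and so cannot deadlock waiting for one. A kernel-context sketch of the predicate; the helper name is illustrative:

/*
 * Kernel-context sketch; the helper name is made up. The warning is
 * suppressed while the calling CPU is still offline, because no other CPU
 * can send it a call-function IPI yet.
 */
static inline void warn_if_cfd_deadlock_possible(int this_cpu, int wait)
{
	WARN_ON_ONCE(cpu_online(this_cpu) && wait && irqs_disabled()
		     && !oops_in_progress);
}
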
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 3a94905fa5d2..f8749e5216e0 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -57,7 +57,7 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp
57static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); 57static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
58 58
59char *softirq_to_name[NR_SOFTIRQS] = { 59char *softirq_to_name[NR_SOFTIRQS] = {
60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", 60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
61 "TASKLET", "SCHED", "HRTIMER", "RCU" 61 "TASKLET", "SCHED", "HRTIMER", "RCU"
62}; 62};
63 63
@@ -227,7 +227,7 @@ restart:
227 preempt_count() = prev_count; 227 preempt_count() = prev_count;
228 } 228 }
229 229
230 rcu_bh_qsctr_inc(cpu); 230 rcu_bh_qs(cpu);
231 } 231 }
232 h++; 232 h++;
233 pending >>= 1; 233 pending >>= 1;
@@ -345,7 +345,9 @@ void open_softirq(int nr, void (*action)(struct softirq_action *))
345 softirq_vec[nr].action = action; 345 softirq_vec[nr].action = action;
346} 346}
347 347
348/* Tasklets */ 348/*
349 * Tasklets
350 */
349struct tasklet_head 351struct tasklet_head
350{ 352{
351 struct tasklet_struct *head; 353 struct tasklet_struct *head;
@@ -493,6 +495,66 @@ void tasklet_kill(struct tasklet_struct *t)
493 495
494EXPORT_SYMBOL(tasklet_kill); 496EXPORT_SYMBOL(tasklet_kill);
495 497
498/*
499 * tasklet_hrtimer
500 */
501
502/*
503 * The trampoline is called when the hrtimer expires. If this is
504 * called from the hrtimer interrupt then we schedule the tasklet as
505 * the timer callback function expects to run in softirq context. If
506 * it's called in softirq context anyway (i.e. high resolution timers
507 * disabled) then the hrtimer callback is called right away.
508 */
509static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer)
510{
511 struct tasklet_hrtimer *ttimer =
512 container_of(timer, struct tasklet_hrtimer, timer);
513
514 if (hrtimer_is_hres_active(timer)) {
515 tasklet_hi_schedule(&ttimer->tasklet);
516 return HRTIMER_NORESTART;
517 }
518 return ttimer->function(timer);
519}
520
521/*
522 * Helper function which calls the hrtimer callback from
523 * tasklet/softirq context
524 */
525static void __tasklet_hrtimer_trampoline(unsigned long data)
526{
527 struct tasklet_hrtimer *ttimer = (void *)data;
528 enum hrtimer_restart restart;
529
530 restart = ttimer->function(&ttimer->timer);
531 if (restart != HRTIMER_NORESTART)
532 hrtimer_restart(&ttimer->timer);
533}
534
535/**
536 * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks
537 * @ttimer: tasklet_hrtimer which is initialized
 538 * @function: hrtimer callback function which gets called from softirq context
539 * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME)
540 * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL)
541 */
542void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer,
543 enum hrtimer_restart (*function)(struct hrtimer *),
544 clockid_t which_clock, enum hrtimer_mode mode)
545{
546 hrtimer_init(&ttimer->timer, which_clock, mode);
547 ttimer->timer.function = __hrtimer_tasklet_trampoline;
548 tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline,
549 (unsigned long)ttimer);
550 ttimer->function = function;
551}
552EXPORT_SYMBOL_GPL(tasklet_hrtimer_init);
553
554/*
555 * Remote softirq bits
556 */
557
496DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list); 558DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list);
497EXPORT_PER_CPU_SYMBOL(softirq_work_list); 559EXPORT_PER_CPU_SYMBOL(softirq_work_list);
498 560
@@ -659,7 +721,7 @@ static int ksoftirqd(void * __bind_cpu)
659 preempt_enable_no_resched(); 721 preempt_enable_no_resched();
660 cond_resched(); 722 cond_resched();
661 preempt_disable(); 723 preempt_disable();
662 rcu_qsctr_inc((long)__bind_cpu); 724 rcu_sched_qs((long)__bind_cpu);
663 } 725 }
664 preempt_enable(); 726 preempt_enable();
665 set_current_state(TASK_INTERRUPTIBLE); 727 set_current_state(TASK_INTERRUPTIBLE);
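
The new tasklet_hrtimer combo arms an hrtimer whose callback is guaranteed to run in softirq context. A hedged usage sketch, assuming kernel module context and the companion tasklet_hrtimer_start() inline from the same infrastructure; the callback and the 100 ms period are illustrative:

/*
 * Hedged usage sketch -- kernel module context assumed; names and the
 * period are illustrative.
 */
#include <linux/interrupt.h>
#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct tasklet_hrtimer my_ttimer;

static enum hrtimer_restart my_timer_fn(struct hrtimer *timer)
{
	/* Runs in softirq context even when hrtimers fire from hard irq. */
	return HRTIMER_NORESTART;
}

static void my_timer_arm(void)
{
	tasklet_hrtimer_init(&my_ttimer, my_timer_fn,
			     CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	tasklet_hrtimer_start(&my_ttimer, ktime_set(0, 100 * NSEC_PER_MSEC),
			      HRTIMER_MODE_REL);
}
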
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 7932653c4ebd..5ddab730cb2f 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -21,44 +21,29 @@
21#include <linux/debug_locks.h> 21#include <linux/debug_locks.h>
22#include <linux/module.h> 22#include <linux/module.h>
23 23
24#ifndef _spin_trylock
24int __lockfunc _spin_trylock(spinlock_t *lock) 25int __lockfunc _spin_trylock(spinlock_t *lock)
25{ 26{
26 preempt_disable(); 27 return __spin_trylock(lock);
27 if (_raw_spin_trylock(lock)) {
28 spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
29 return 1;
30 }
31
32 preempt_enable();
33 return 0;
34} 28}
35EXPORT_SYMBOL(_spin_trylock); 29EXPORT_SYMBOL(_spin_trylock);
30#endif
36 31
32#ifndef _read_trylock
37int __lockfunc _read_trylock(rwlock_t *lock) 33int __lockfunc _read_trylock(rwlock_t *lock)
38{ 34{
39 preempt_disable(); 35 return __read_trylock(lock);
40 if (_raw_read_trylock(lock)) {
41 rwlock_acquire_read(&lock->dep_map, 0, 1, _RET_IP_);
42 return 1;
43 }
44
45 preempt_enable();
46 return 0;
47} 36}
48EXPORT_SYMBOL(_read_trylock); 37EXPORT_SYMBOL(_read_trylock);
38#endif
49 39
40#ifndef _write_trylock
50int __lockfunc _write_trylock(rwlock_t *lock) 41int __lockfunc _write_trylock(rwlock_t *lock)
51{ 42{
52 preempt_disable(); 43 return __write_trylock(lock);
53 if (_raw_write_trylock(lock)) {
54 rwlock_acquire(&lock->dep_map, 0, 1, _RET_IP_);
55 return 1;
56 }
57
58 preempt_enable();
59 return 0;
60} 44}
61EXPORT_SYMBOL(_write_trylock); 45EXPORT_SYMBOL(_write_trylock);
46#endif
62 47
63/* 48/*
64 * If lockdep is enabled then we use the non-preemption spin-ops 49 * If lockdep is enabled then we use the non-preemption spin-ops
@@ -67,132 +52,101 @@ EXPORT_SYMBOL(_write_trylock);
67 */ 52 */
68#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) 53#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC)
69 54
55#ifndef _read_lock
70void __lockfunc _read_lock(rwlock_t *lock) 56void __lockfunc _read_lock(rwlock_t *lock)
71{ 57{
72 preempt_disable(); 58 __read_lock(lock);
73 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
74 LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
75} 59}
76EXPORT_SYMBOL(_read_lock); 60EXPORT_SYMBOL(_read_lock);
61#endif
77 62
63#ifndef _spin_lock_irqsave
78unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) 64unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
79{ 65{
80 unsigned long flags; 66 return __spin_lock_irqsave(lock);
81
82 local_irq_save(flags);
83 preempt_disable();
84 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
85 /*
86 * On lockdep we dont want the hand-coded irq-enable of
87 * _raw_spin_lock_flags() code, because lockdep assumes
88 * that interrupts are not re-enabled during lock-acquire:
89 */
90#ifdef CONFIG_LOCKDEP
91 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
92#else
93 _raw_spin_lock_flags(lock, &flags);
94#endif
95 return flags;
96} 67}
97EXPORT_SYMBOL(_spin_lock_irqsave); 68EXPORT_SYMBOL(_spin_lock_irqsave);
69#endif
98 70
71#ifndef _spin_lock_irq
99void __lockfunc _spin_lock_irq(spinlock_t *lock) 72void __lockfunc _spin_lock_irq(spinlock_t *lock)
100{ 73{
101 local_irq_disable(); 74 __spin_lock_irq(lock);
102 preempt_disable();
103 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
104 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
105} 75}
106EXPORT_SYMBOL(_spin_lock_irq); 76EXPORT_SYMBOL(_spin_lock_irq);
77#endif
107 78
79#ifndef _spin_lock_bh
108void __lockfunc _spin_lock_bh(spinlock_t *lock) 80void __lockfunc _spin_lock_bh(spinlock_t *lock)
109{ 81{
110 local_bh_disable(); 82 __spin_lock_bh(lock);
111 preempt_disable();
112 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
113 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
114} 83}
115EXPORT_SYMBOL(_spin_lock_bh); 84EXPORT_SYMBOL(_spin_lock_bh);
85#endif
116 86
87#ifndef _read_lock_irqsave
117unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) 88unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
118{ 89{
119 unsigned long flags; 90 return __read_lock_irqsave(lock);
120
121 local_irq_save(flags);
122 preempt_disable();
123 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
124 LOCK_CONTENDED_FLAGS(lock, _raw_read_trylock, _raw_read_lock,
125 _raw_read_lock_flags, &flags);
126 return flags;
127} 91}
128EXPORT_SYMBOL(_read_lock_irqsave); 92EXPORT_SYMBOL(_read_lock_irqsave);
93#endif
129 94
95#ifndef _read_lock_irq
130void __lockfunc _read_lock_irq(rwlock_t *lock) 96void __lockfunc _read_lock_irq(rwlock_t *lock)
131{ 97{
132 local_irq_disable(); 98 __read_lock_irq(lock);
133 preempt_disable();
134 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
135 LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
136} 99}
137EXPORT_SYMBOL(_read_lock_irq); 100EXPORT_SYMBOL(_read_lock_irq);
101#endif
138 102
103#ifndef _read_lock_bh
139void __lockfunc _read_lock_bh(rwlock_t *lock) 104void __lockfunc _read_lock_bh(rwlock_t *lock)
140{ 105{
141 local_bh_disable(); 106 __read_lock_bh(lock);
142 preempt_disable();
143 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
144 LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
145} 107}
146EXPORT_SYMBOL(_read_lock_bh); 108EXPORT_SYMBOL(_read_lock_bh);
109#endif
147 110
111#ifndef _write_lock_irqsave
148unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) 112unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
149{ 113{
150 unsigned long flags; 114 return __write_lock_irqsave(lock);
151
152 local_irq_save(flags);
153 preempt_disable();
154 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
155 LOCK_CONTENDED_FLAGS(lock, _raw_write_trylock, _raw_write_lock,
156 _raw_write_lock_flags, &flags);
157 return flags;
158} 115}
159EXPORT_SYMBOL(_write_lock_irqsave); 116EXPORT_SYMBOL(_write_lock_irqsave);
117#endif
160 118
119#ifndef _write_lock_irq
161void __lockfunc _write_lock_irq(rwlock_t *lock) 120void __lockfunc _write_lock_irq(rwlock_t *lock)
162{ 121{
163 local_irq_disable(); 122 __write_lock_irq(lock);
164 preempt_disable();
165 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
166 LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
167} 123}
168EXPORT_SYMBOL(_write_lock_irq); 124EXPORT_SYMBOL(_write_lock_irq);
125#endif
169 126
127#ifndef _write_lock_bh
170void __lockfunc _write_lock_bh(rwlock_t *lock) 128void __lockfunc _write_lock_bh(rwlock_t *lock)
171{ 129{
172 local_bh_disable(); 130 __write_lock_bh(lock);
173 preempt_disable();
174 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
175 LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
176} 131}
177EXPORT_SYMBOL(_write_lock_bh); 132EXPORT_SYMBOL(_write_lock_bh);
133#endif
178 134
135#ifndef _spin_lock
179void __lockfunc _spin_lock(spinlock_t *lock) 136void __lockfunc _spin_lock(spinlock_t *lock)
180{ 137{
181 preempt_disable(); 138 __spin_lock(lock);
182 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
183 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
184} 139}
185
186EXPORT_SYMBOL(_spin_lock); 140EXPORT_SYMBOL(_spin_lock);
141#endif
187 142
143#ifndef _write_lock
188void __lockfunc _write_lock(rwlock_t *lock) 144void __lockfunc _write_lock(rwlock_t *lock)
189{ 145{
190 preempt_disable(); 146 __write_lock(lock);
191 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
192 LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
193} 147}
194
195EXPORT_SYMBOL(_write_lock); 148EXPORT_SYMBOL(_write_lock);
149#endif
196 150
197#else /* CONFIG_PREEMPT: */ 151#else /* CONFIG_PREEMPT: */
198 152
@@ -318,125 +272,109 @@ EXPORT_SYMBOL(_spin_lock_nest_lock);
318 272
319#endif 273#endif
320 274
275#ifndef _spin_unlock
321void __lockfunc _spin_unlock(spinlock_t *lock) 276void __lockfunc _spin_unlock(spinlock_t *lock)
322{ 277{
323 spin_release(&lock->dep_map, 1, _RET_IP_); 278 __spin_unlock(lock);
324 _raw_spin_unlock(lock);
325 preempt_enable();
326} 279}
327EXPORT_SYMBOL(_spin_unlock); 280EXPORT_SYMBOL(_spin_unlock);
281#endif
328 282
283#ifndef _write_unlock
329void __lockfunc _write_unlock(rwlock_t *lock) 284void __lockfunc _write_unlock(rwlock_t *lock)
330{ 285{
331 rwlock_release(&lock->dep_map, 1, _RET_IP_); 286 __write_unlock(lock);
332 _raw_write_unlock(lock);
333 preempt_enable();
334} 287}
335EXPORT_SYMBOL(_write_unlock); 288EXPORT_SYMBOL(_write_unlock);
289#endif
336 290
291#ifndef _read_unlock
337void __lockfunc _read_unlock(rwlock_t *lock) 292void __lockfunc _read_unlock(rwlock_t *lock)
338{ 293{
339 rwlock_release(&lock->dep_map, 1, _RET_IP_); 294 __read_unlock(lock);
340 _raw_read_unlock(lock);
341 preempt_enable();
342} 295}
343EXPORT_SYMBOL(_read_unlock); 296EXPORT_SYMBOL(_read_unlock);
297#endif
344 298
299#ifndef _spin_unlock_irqrestore
345void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) 300void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
346{ 301{
347 spin_release(&lock->dep_map, 1, _RET_IP_); 302 __spin_unlock_irqrestore(lock, flags);
348 _raw_spin_unlock(lock);
349 local_irq_restore(flags);
350 preempt_enable();
351} 303}
352EXPORT_SYMBOL(_spin_unlock_irqrestore); 304EXPORT_SYMBOL(_spin_unlock_irqrestore);
305#endif
353 306
307#ifndef _spin_unlock_irq
354void __lockfunc _spin_unlock_irq(spinlock_t *lock) 308void __lockfunc _spin_unlock_irq(spinlock_t *lock)
355{ 309{
356 spin_release(&lock->dep_map, 1, _RET_IP_); 310 __spin_unlock_irq(lock);
357 _raw_spin_unlock(lock);
358 local_irq_enable();
359 preempt_enable();
360} 311}
361EXPORT_SYMBOL(_spin_unlock_irq); 312EXPORT_SYMBOL(_spin_unlock_irq);
313#endif
362 314
315#ifndef _spin_unlock_bh
363void __lockfunc _spin_unlock_bh(spinlock_t *lock) 316void __lockfunc _spin_unlock_bh(spinlock_t *lock)
364{ 317{
365 spin_release(&lock->dep_map, 1, _RET_IP_); 318 __spin_unlock_bh(lock);
366 _raw_spin_unlock(lock);
367 preempt_enable_no_resched();
368 local_bh_enable_ip((unsigned long)__builtin_return_address(0));
369} 319}
370EXPORT_SYMBOL(_spin_unlock_bh); 320EXPORT_SYMBOL(_spin_unlock_bh);
321#endif
371 322
323#ifndef _read_unlock_irqrestore
372void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) 324void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
373{ 325{
374 rwlock_release(&lock->dep_map, 1, _RET_IP_); 326 __read_unlock_irqrestore(lock, flags);
375 _raw_read_unlock(lock);
376 local_irq_restore(flags);
377 preempt_enable();
378} 327}
379EXPORT_SYMBOL(_read_unlock_irqrestore); 328EXPORT_SYMBOL(_read_unlock_irqrestore);
329#endif
380 330
331#ifndef _read_unlock_irq
381void __lockfunc _read_unlock_irq(rwlock_t *lock) 332void __lockfunc _read_unlock_irq(rwlock_t *lock)
382{ 333{
383 rwlock_release(&lock->dep_map, 1, _RET_IP_); 334 __read_unlock_irq(lock);
384 _raw_read_unlock(lock);
385 local_irq_enable();
386 preempt_enable();
387} 335}
388EXPORT_SYMBOL(_read_unlock_irq); 336EXPORT_SYMBOL(_read_unlock_irq);
337#endif
389 338
339#ifndef _read_unlock_bh
390void __lockfunc _read_unlock_bh(rwlock_t *lock) 340void __lockfunc _read_unlock_bh(rwlock_t *lock)
391{ 341{
392 rwlock_release(&lock->dep_map, 1, _RET_IP_); 342 __read_unlock_bh(lock);
393 _raw_read_unlock(lock);
394 preempt_enable_no_resched();
395 local_bh_enable_ip((unsigned long)__builtin_return_address(0));
396} 343}
397EXPORT_SYMBOL(_read_unlock_bh); 344EXPORT_SYMBOL(_read_unlock_bh);
345#endif
398 346
347#ifndef _write_unlock_irqrestore
399void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) 348void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
400{ 349{
401 rwlock_release(&lock->dep_map, 1, _RET_IP_); 350 __write_unlock_irqrestore(lock, flags);
402 _raw_write_unlock(lock);
403 local_irq_restore(flags);
404 preempt_enable();
405} 351}
406EXPORT_SYMBOL(_write_unlock_irqrestore); 352EXPORT_SYMBOL(_write_unlock_irqrestore);
353#endif
407 354
355#ifndef _write_unlock_irq
408void __lockfunc _write_unlock_irq(rwlock_t *lock) 356void __lockfunc _write_unlock_irq(rwlock_t *lock)
409{ 357{
410 rwlock_release(&lock->dep_map, 1, _RET_IP_); 358 __write_unlock_irq(lock);
411 _raw_write_unlock(lock);
412 local_irq_enable();
413 preempt_enable();
414} 359}
415EXPORT_SYMBOL(_write_unlock_irq); 360EXPORT_SYMBOL(_write_unlock_irq);
361#endif
416 362
363#ifndef _write_unlock_bh
417void __lockfunc _write_unlock_bh(rwlock_t *lock) 364void __lockfunc _write_unlock_bh(rwlock_t *lock)
418{ 365{
419 rwlock_release(&lock->dep_map, 1, _RET_IP_); 366 __write_unlock_bh(lock);
420 _raw_write_unlock(lock);
421 preempt_enable_no_resched();
422 local_bh_enable_ip((unsigned long)__builtin_return_address(0));
423} 367}
424EXPORT_SYMBOL(_write_unlock_bh); 368EXPORT_SYMBOL(_write_unlock_bh);
369#endif
425 370
371#ifndef _spin_trylock_bh
426int __lockfunc _spin_trylock_bh(spinlock_t *lock) 372int __lockfunc _spin_trylock_bh(spinlock_t *lock)
427{ 373{
428 local_bh_disable(); 374 return __spin_trylock_bh(lock);
429 preempt_disable();
430 if (_raw_spin_trylock(lock)) {
431 spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
432 return 1;
433 }
434
435 preempt_enable_no_resched();
436 local_bh_enable_ip((unsigned long)__builtin_return_address(0));
437 return 0;
438} 375}
439EXPORT_SYMBOL(_spin_trylock_bh); 376EXPORT_SYMBOL(_spin_trylock_bh);
377#endif
440 378
441notrace int in_lock_functions(unsigned long addr) 379notrace int in_lock_functions(unsigned long addr)
442{ 380{
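
Wrapping each generic lock function in #ifndef lets an architecture (or a config option) provide its own, typically inlined, definition and compile the out-of-line fallback away. A standalone sketch of that override pattern, with illustrative names:

/*
 * Standalone sketch of the override pattern: an "architecture" header may
 * #define my_op to an inlined variant, and the generic out-of-line
 * fallback below is then compiled out.
 */
#include <stdio.h>

/* #define my_op my_op_inlined	<- an arch override would look like this */

#ifndef my_op
static int my_op(int x)		/* generic fallback, kept out of line */
{
	return x + 1;
}
#endif

int main(void)
{
	printf("%d\n", my_op(41));
	return 0;
}
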
diff --git a/kernel/sys.c b/kernel/sys.c
index b3f1097c76fa..ea5c3bcac881 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -14,7 +14,7 @@
14#include <linux/prctl.h> 14#include <linux/prctl.h>
15#include <linux/highuid.h> 15#include <linux/highuid.h>
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/perf_counter.h> 17#include <linux/perf_event.h>
18#include <linux/resource.h> 18#include <linux/resource.h>
19#include <linux/kernel.h> 19#include <linux/kernel.h>
20#include <linux/kexec.h> 20#include <linux/kexec.h>
@@ -1511,11 +1511,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1511 case PR_SET_TSC: 1511 case PR_SET_TSC:
1512 error = SET_TSC_CTL(arg2); 1512 error = SET_TSC_CTL(arg2);
1513 break; 1513 break;
1514 case PR_TASK_PERF_COUNTERS_DISABLE: 1514 case PR_TASK_PERF_EVENTS_DISABLE:
1515 error = perf_counter_task_disable(); 1515 error = perf_event_task_disable();
1516 break; 1516 break;
1517 case PR_TASK_PERF_COUNTERS_ENABLE: 1517 case PR_TASK_PERF_EVENTS_ENABLE:
1518 error = perf_counter_task_enable(); 1518 error = perf_event_task_enable();
1519 break; 1519 break;
1520 case PR_GET_TIMERSLACK: 1520 case PR_GET_TIMERSLACK:
1521 error = current->timer_slack_ns; 1521 error = current->timer_slack_ns;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 68320f6b07b5..515bc230ac2a 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -177,4 +177,4 @@ cond_syscall(sys_eventfd);
177cond_syscall(sys_eventfd2); 177cond_syscall(sys_eventfd2);
178 178
179/* performance counters: */ 179/* performance counters: */
180cond_syscall(sys_perf_counter_open); 180cond_syscall(sys_perf_event_open);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 98e02328c67d..6ba49c7cb128 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -50,7 +50,7 @@
50#include <linux/reboot.h> 50#include <linux/reboot.h>
51#include <linux/ftrace.h> 51#include <linux/ftrace.h>
52#include <linux/slow-work.h> 52#include <linux/slow-work.h>
53#include <linux/perf_counter.h> 53#include <linux/perf_event.h>
54 54
55#include <asm/uaccess.h> 55#include <asm/uaccess.h>
56#include <asm/processor.h> 56#include <asm/processor.h>
@@ -91,6 +91,9 @@ extern int sysctl_nr_trim_pages;
91#ifdef CONFIG_RCU_TORTURE_TEST 91#ifdef CONFIG_RCU_TORTURE_TEST
92extern int rcutorture_runnable; 92extern int rcutorture_runnable;
93#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ 93#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
94#ifdef CONFIG_BLOCK
95extern int blk_iopoll_enabled;
96#endif
94 97
95/* Constants used for minimum and maximum */ 98/* Constants used for minimum and maximum */
96#ifdef CONFIG_DETECT_SOFTLOCKUP 99#ifdef CONFIG_DETECT_SOFTLOCKUP
@@ -245,6 +248,14 @@ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
245#endif 248#endif
246 249
247static struct ctl_table kern_table[] = { 250static struct ctl_table kern_table[] = {
251 {
252 .ctl_name = CTL_UNNUMBERED,
253 .procname = "sched_child_runs_first",
254 .data = &sysctl_sched_child_runs_first,
255 .maxlen = sizeof(unsigned int),
256 .mode = 0644,
257 .proc_handler = &proc_dointvec,
258 },
248#ifdef CONFIG_SCHED_DEBUG 259#ifdef CONFIG_SCHED_DEBUG
249 { 260 {
250 .ctl_name = CTL_UNNUMBERED, 261 .ctl_name = CTL_UNNUMBERED,
@@ -299,14 +310,6 @@ static struct ctl_table kern_table[] = {
299 }, 310 },
300 { 311 {
301 .ctl_name = CTL_UNNUMBERED, 312 .ctl_name = CTL_UNNUMBERED,
302 .procname = "sched_child_runs_first",
303 .data = &sysctl_sched_child_runs_first,
304 .maxlen = sizeof(unsigned int),
305 .mode = 0644,
306 .proc_handler = &proc_dointvec,
307 },
308 {
309 .ctl_name = CTL_UNNUMBERED,
310 .procname = "sched_features", 313 .procname = "sched_features",
311 .data = &sysctl_sched_features, 314 .data = &sysctl_sched_features,
312 .maxlen = sizeof(unsigned int), 315 .maxlen = sizeof(unsigned int),
@@ -331,6 +334,14 @@ static struct ctl_table kern_table[] = {
331 }, 334 },
332 { 335 {
333 .ctl_name = CTL_UNNUMBERED, 336 .ctl_name = CTL_UNNUMBERED,
337 .procname = "sched_time_avg",
338 .data = &sysctl_sched_time_avg,
339 .maxlen = sizeof(unsigned int),
340 .mode = 0644,
341 .proc_handler = &proc_dointvec,
342 },
343 {
344 .ctl_name = CTL_UNNUMBERED,
334 .procname = "timer_migration", 345 .procname = "timer_migration",
335 .data = &sysctl_timer_migration, 346 .data = &sysctl_timer_migration,
336 .maxlen = sizeof(unsigned int), 347 .maxlen = sizeof(unsigned int),
@@ -953,28 +964,28 @@ static struct ctl_table kern_table[] = {
953 .child = slow_work_sysctls, 964 .child = slow_work_sysctls,
954 }, 965 },
955#endif 966#endif
956#ifdef CONFIG_PERF_COUNTERS 967#ifdef CONFIG_PERF_EVENTS
957 { 968 {
958 .ctl_name = CTL_UNNUMBERED, 969 .ctl_name = CTL_UNNUMBERED,
959 .procname = "perf_counter_paranoid", 970 .procname = "perf_event_paranoid",
960 .data = &sysctl_perf_counter_paranoid, 971 .data = &sysctl_perf_event_paranoid,
961 .maxlen = sizeof(sysctl_perf_counter_paranoid), 972 .maxlen = sizeof(sysctl_perf_event_paranoid),
962 .mode = 0644, 973 .mode = 0644,
963 .proc_handler = &proc_dointvec, 974 .proc_handler = &proc_dointvec,
964 }, 975 },
965 { 976 {
966 .ctl_name = CTL_UNNUMBERED, 977 .ctl_name = CTL_UNNUMBERED,
967 .procname = "perf_counter_mlock_kb", 978 .procname = "perf_event_mlock_kb",
968 .data = &sysctl_perf_counter_mlock, 979 .data = &sysctl_perf_event_mlock,
969 .maxlen = sizeof(sysctl_perf_counter_mlock), 980 .maxlen = sizeof(sysctl_perf_event_mlock),
970 .mode = 0644, 981 .mode = 0644,
971 .proc_handler = &proc_dointvec, 982 .proc_handler = &proc_dointvec,
972 }, 983 },
973 { 984 {
974 .ctl_name = CTL_UNNUMBERED, 985 .ctl_name = CTL_UNNUMBERED,
975 .procname = "perf_counter_max_sample_rate", 986 .procname = "perf_event_max_sample_rate",
976 .data = &sysctl_perf_counter_sample_rate, 987 .data = &sysctl_perf_event_sample_rate,
977 .maxlen = sizeof(sysctl_perf_counter_sample_rate), 988 .maxlen = sizeof(sysctl_perf_event_sample_rate),
978 .mode = 0644, 989 .mode = 0644,
979 .proc_handler = &proc_dointvec, 990 .proc_handler = &proc_dointvec,
980 }, 991 },
@@ -989,7 +1000,16 @@ static struct ctl_table kern_table[] = {
989 .proc_handler = &proc_dointvec, 1000 .proc_handler = &proc_dointvec,
990 }, 1001 },
991#endif 1002#endif
992 1003#ifdef CONFIG_BLOCK
1004 {
1005 .ctl_name = CTL_UNNUMBERED,
1006 .procname = "blk_iopoll",
1007 .data = &blk_iopoll_enabled,
1008 .maxlen = sizeof(int),
1009 .mode = 0644,
1010 .proc_handler = &proc_dointvec,
1011 },
1012#endif
993/* 1013/*
994 * NOTE: do not add new entries to this table unless you have read 1014 * NOTE: do not add new entries to this table unless you have read
995 * Documentation/sysctl/ctl_unnumbered.txt 1015 * Documentation/sysctl/ctl_unnumbered.txt
@@ -1306,10 +1326,10 @@ static struct ctl_table vm_table[] = {
1306 { 1326 {
1307 .ctl_name = CTL_UNNUMBERED, 1327 .ctl_name = CTL_UNNUMBERED,
1308 .procname = "mmap_min_addr", 1328 .procname = "mmap_min_addr",
1309 .data = &mmap_min_addr, 1329 .data = &dac_mmap_min_addr,
1310 .maxlen = sizeof(unsigned long), 1330 .maxlen = sizeof(unsigned long),
1311 .mode = 0644, 1331 .mode = 0644,
1312 .proc_handler = &proc_doulongvec_minmax, 1332 .proc_handler = &mmap_min_addr_handler,
1313 }, 1333 },
1314#ifdef CONFIG_NUMA 1334#ifdef CONFIG_NUMA
1315 { 1335 {
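
The new kern_table entries all follow the same shape: a CTL_UNNUMBERED ctl_table slot exposing an int through /proc/sys via proc_dointvec. A kernel-context sketch of that pattern; the variable, procname and table below are illustrative and not part of this diff:

/*
 * Kernel-context sketch; names are illustrative. Such a table would be
 * hung off a parent via .child or registered with register_sysctl_table().
 */
#include <linux/sysctl.h>

static int example_tunable;

static struct ctl_table example_table[] = {
	{
		.ctl_name	= CTL_UNNUMBERED,
		.procname	= "example_tunable",
		.data		= &example_tunable,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{ .ctl_name = 0 }	/* table terminator */
};
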
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 888adbcca30c..ea8384d3caa7 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -108,7 +108,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
108/* 108/*
109 * Send taskstats data in @skb to listener with nl_pid @pid 109 * Send taskstats data in @skb to listener with nl_pid @pid
110 */ 110 */
111static int send_reply(struct sk_buff *skb, pid_t pid) 111static int send_reply(struct sk_buff *skb, struct genl_info *info)
112{ 112{
113 struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); 113 struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb));
114 void *reply = genlmsg_data(genlhdr); 114 void *reply = genlmsg_data(genlhdr);
@@ -120,7 +120,7 @@ static int send_reply(struct sk_buff *skb, pid_t pid)
120 return rc; 120 return rc;
121 } 121 }
122 122
123 return genlmsg_unicast(skb, pid); 123 return genlmsg_reply(skb, info);
124} 124}
125 125
126/* 126/*
@@ -150,7 +150,7 @@ static void send_cpu_listeners(struct sk_buff *skb,
150 if (!skb_next) 150 if (!skb_next)
151 break; 151 break;
152 } 152 }
153 rc = genlmsg_unicast(skb_cur, s->pid); 153 rc = genlmsg_unicast(&init_net, skb_cur, s->pid);
154 if (rc == -ECONNREFUSED) { 154 if (rc == -ECONNREFUSED) {
155 s->valid = 0; 155 s->valid = 0;
156 delcount++; 156 delcount++;
@@ -418,7 +418,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
418 goto err; 418 goto err;
419 } 419 }
420 420
421 rc = send_reply(rep_skb, info->snd_pid); 421 rc = send_reply(rep_skb, info);
422 422
423err: 423err:
424 fput_light(file, fput_needed); 424 fput_light(file, fput_needed);
@@ -487,7 +487,7 @@ free_return_rc:
487 } else 487 } else
488 goto err; 488 goto err;
489 489
490 return send_reply(rep_skb, info->snd_pid); 490 return send_reply(rep_skb, info);
491err: 491err:
492 nlmsg_free(rep_skb); 492 nlmsg_free(rep_skb);
493 return rc; 493 return rc;
diff --git a/kernel/time.c b/kernel/time.c
index 29511943871a..2e2e469a7fec 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -370,13 +370,20 @@ EXPORT_SYMBOL(mktime);
370 * 0 <= tv_nsec < NSEC_PER_SEC 370 * 0 <= tv_nsec < NSEC_PER_SEC
371 * For negative values only the tv_sec field is negative ! 371 * For negative values only the tv_sec field is negative !
372 */ 372 */
373void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec) 373void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec)
374{ 374{
375 while (nsec >= NSEC_PER_SEC) { 375 while (nsec >= NSEC_PER_SEC) {
376 /*
377 * The following asm() prevents the compiler from
378 * optimising this loop into a modulo operation. See
379 * also __iter_div_u64_rem() in include/linux/time.h
380 */
381 asm("" : "+rm"(nsec));
376 nsec -= NSEC_PER_SEC; 382 nsec -= NSEC_PER_SEC;
377 ++sec; 383 ++sec;
378 } 384 }
379 while (nsec < 0) { 385 while (nsec < 0) {
386 asm("" : "+rm"(nsec));
380 nsec += NSEC_PER_SEC; 387 nsec += NSEC_PER_SEC;
381 --sec; 388 --sec;
382 } 389 }
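
The empty asm with a "+rm" constraint makes nsec look modified on every pass, so the compiler keeps the subtraction loop rather than folding it into a 64-bit division/modulo. A standalone demonstration of the same normalization, compilable with GCC; names are illustrative:

/*
 * Standalone sketch of set_normalized_timespec()'s loop and its
 * optimization barrier.
 */
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000L

static void normalize(int64_t *sec, int64_t *nsec)
{
	while (*nsec >= NSEC_PER_SEC) {
		asm("" : "+rm"(*nsec));	/* keep the loop a loop */
		*nsec -= NSEC_PER_SEC;
		++*sec;
	}
	while (*nsec < 0) {
		asm("" : "+rm"(*nsec));
		*nsec += NSEC_PER_SEC;
		--*sec;
	}
}

int main(void)
{
	int64_t sec = 0, nsec = 3500000000LL;

	normalize(&sec, &nsec);
	printf("%lld.%09lld\n", (long long)sec, (long long)nsec);	/* 3.500000000 */
	return 0;
}
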
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 1ad6dd461119..620b58abdc32 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -137,11 +137,12 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
137 */ 137 */
138int clockevents_register_notifier(struct notifier_block *nb) 138int clockevents_register_notifier(struct notifier_block *nb)
139{ 139{
140 unsigned long flags;
140 int ret; 141 int ret;
141 142
142 spin_lock(&clockevents_lock); 143 spin_lock_irqsave(&clockevents_lock, flags);
143 ret = raw_notifier_chain_register(&clockevents_chain, nb); 144 ret = raw_notifier_chain_register(&clockevents_chain, nb);
144 spin_unlock(&clockevents_lock); 145 spin_unlock_irqrestore(&clockevents_lock, flags);
145 146
146 return ret; 147 return ret;
147} 148}
@@ -178,16 +179,18 @@ static void clockevents_notify_released(void)
178 */ 179 */
179void clockevents_register_device(struct clock_event_device *dev) 180void clockevents_register_device(struct clock_event_device *dev)
180{ 181{
182 unsigned long flags;
183
181 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); 184 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
182 BUG_ON(!dev->cpumask); 185 BUG_ON(!dev->cpumask);
183 186
184 spin_lock(&clockevents_lock); 187 spin_lock_irqsave(&clockevents_lock, flags);
185 188
186 list_add(&dev->list, &clockevent_devices); 189 list_add(&dev->list, &clockevent_devices);
187 clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); 190 clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev);
188 clockevents_notify_released(); 191 clockevents_notify_released();
189 192
190 spin_unlock(&clockevents_lock); 193 spin_unlock_irqrestore(&clockevents_lock, flags);
191} 194}
192EXPORT_SYMBOL_GPL(clockevents_register_device); 195EXPORT_SYMBOL_GPL(clockevents_register_device);
193 196
@@ -235,8 +238,9 @@ void clockevents_exchange_device(struct clock_event_device *old,
235void clockevents_notify(unsigned long reason, void *arg) 238void clockevents_notify(unsigned long reason, void *arg)
236{ 239{
237 struct list_head *node, *tmp; 240 struct list_head *node, *tmp;
241 unsigned long flags;
238 242
239 spin_lock(&clockevents_lock); 243 spin_lock_irqsave(&clockevents_lock, flags);
240 clockevents_do_notify(reason, arg); 244 clockevents_do_notify(reason, arg);
241 245
242 switch (reason) { 246 switch (reason) {
@@ -251,18 +255,7 @@ void clockevents_notify(unsigned long reason, void *arg)
251 default: 255 default:
252 break; 256 break;
253 } 257 }
254 spin_unlock(&clockevents_lock); 258 spin_unlock_irqrestore(&clockevents_lock, flags);
255} 259}
256EXPORT_SYMBOL_GPL(clockevents_notify); 260EXPORT_SYMBOL_GPL(clockevents_notify);
257
258ktime_t clockevents_get_next_event(int cpu)
259{
260 struct tick_device *td;
261 struct clock_event_device *dev;
262
263 td = &per_cpu(tick_cpu_device, cpu);
264 dev = td->evtdev;
265
266 return dev->next_event;
267}
268#endif 261#endif
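
Switching clockevents_lock to the irqsave variants lets the same critical sections run safely whether or not the caller already has interrupts disabled. A minimal kernel-context sketch of the pattern; the lock, list and helper below are illustrative:

/*
 * Minimal kernel-context sketch; names are illustrative. Saving and
 * restoring the interrupt state makes the critical section safe to enter
 * from both irq-enabled and irq-disabled callers.
 */
#include <linux/spinlock.h>
#include <linux/list.h>

static DEFINE_SPINLOCK(example_lock);
static LIST_HEAD(example_list);

static void example_add(struct list_head *item)
{
	unsigned long flags;

	spin_lock_irqsave(&example_lock, flags);
	list_add(item, &example_list);		/* protected section */
	spin_unlock_irqrestore(&example_lock, flags);
}
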
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 592bf584d1d2..09113347d328 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -21,7 +21,6 @@
21 * 21 *
22 * TODO WishList: 22 * TODO WishList:
23 * o Allow clocksource drivers to be unregistered 23 * o Allow clocksource drivers to be unregistered
24 * o get rid of clocksource_jiffies extern
25 */ 24 */
26 25
27#include <linux/clocksource.h> 26#include <linux/clocksource.h>
@@ -30,6 +29,7 @@
30#include <linux/module.h> 29#include <linux/module.h>
31#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ 30#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
32#include <linux/tick.h> 31#include <linux/tick.h>
32#include <linux/kthread.h>
33 33
34void timecounter_init(struct timecounter *tc, 34void timecounter_init(struct timecounter *tc,
35 const struct cyclecounter *cc, 35 const struct cyclecounter *cc,
@@ -107,50 +107,35 @@ u64 timecounter_cyc2time(struct timecounter *tc,
107} 107}
108EXPORT_SYMBOL(timecounter_cyc2time); 108EXPORT_SYMBOL(timecounter_cyc2time);
109 109
110/* XXX - Would like a better way for initializing curr_clocksource */
111extern struct clocksource clocksource_jiffies;
112
113/*[Clocksource internal variables]--------- 110/*[Clocksource internal variables]---------
114 * curr_clocksource: 111 * curr_clocksource:
115 * currently selected clocksource. Initialized to clocksource_jiffies. 112 * currently selected clocksource.
116 * next_clocksource:
117 * pending next selected clocksource.
118 * clocksource_list: 113 * clocksource_list:
119 * linked list with the registered clocksources 114 * linked list with the registered clocksources
120 * clocksource_lock: 115 * clocksource_mutex:
121 * protects manipulations to curr_clocksource and next_clocksource 116 * protects manipulations to curr_clocksource and the clocksource_list
122 * and the clocksource_list
123 * override_name: 117 * override_name:
124 * Name of the user-specified clocksource. 118 * Name of the user-specified clocksource.
125 */ 119 */
126static struct clocksource *curr_clocksource = &clocksource_jiffies; 120static struct clocksource *curr_clocksource;
127static struct clocksource *next_clocksource;
128static struct clocksource *clocksource_override;
129static LIST_HEAD(clocksource_list); 121static LIST_HEAD(clocksource_list);
130static DEFINE_SPINLOCK(clocksource_lock); 122static DEFINE_MUTEX(clocksource_mutex);
131static char override_name[32]; 123static char override_name[32];
132static int finished_booting; 124static int finished_booting;
133 125
134/* clocksource_done_booting - Called near the end of core bootup
135 *
136 * Hack to avoid lots of clocksource churn at boot time.
137 * We use fs_initcall because we want this to start before
138 * device_initcall but after subsys_initcall.
139 */
140static int __init clocksource_done_booting(void)
141{
142 finished_booting = 1;
143 return 0;
144}
145fs_initcall(clocksource_done_booting);
146
147#ifdef CONFIG_CLOCKSOURCE_WATCHDOG 126#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
127static void clocksource_watchdog_work(struct work_struct *work);
128
148static LIST_HEAD(watchdog_list); 129static LIST_HEAD(watchdog_list);
149static struct clocksource *watchdog; 130static struct clocksource *watchdog;
150static struct timer_list watchdog_timer; 131static struct timer_list watchdog_timer;
132static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
151static DEFINE_SPINLOCK(watchdog_lock); 133static DEFINE_SPINLOCK(watchdog_lock);
152static cycle_t watchdog_last; 134static cycle_t watchdog_last;
153static unsigned long watchdog_resumed; 135static int watchdog_running;
136
137static int clocksource_watchdog_kthread(void *data);
138static void __clocksource_change_rating(struct clocksource *cs, int rating);
154 139
155/* 140/*
156 * Interval: 0.5sec Threshold: 0.0625s 141 * Interval: 0.5sec Threshold: 0.0625s
@@ -158,135 +143,249 @@ static unsigned long watchdog_resumed;
158#define WATCHDOG_INTERVAL (HZ >> 1) 143#define WATCHDOG_INTERVAL (HZ >> 1)
159#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4) 144#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)
160 145
161static void clocksource_ratewd(struct clocksource *cs, int64_t delta) 146static void clocksource_watchdog_work(struct work_struct *work)
162{ 147{
163 if (delta > -WATCHDOG_THRESHOLD && delta < WATCHDOG_THRESHOLD) 148 /*
164 return; 149 * If kthread_run fails the next watchdog scan over the
150 * watchdog_list will find the unstable clock again.
151 */
152 kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog");
153}
165 154
155static void __clocksource_unstable(struct clocksource *cs)
156{
157 cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
158 cs->flags |= CLOCK_SOURCE_UNSTABLE;
159 if (finished_booting)
160 schedule_work(&watchdog_work);
161}
162
163static void clocksource_unstable(struct clocksource *cs, int64_t delta)
164{
166 printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", 165 printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n",
167 cs->name, delta); 166 cs->name, delta);
168 cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); 167 __clocksource_unstable(cs);
169 clocksource_change_rating(cs, 0); 168}
170 list_del(&cs->wd_list); 169
170/**
171 * clocksource_mark_unstable - mark clocksource unstable via watchdog
172 * @cs: clocksource to be marked unstable
173 *
174 * This function is called instead of clocksource_change_rating from
175 * cpu hotplug code to avoid a deadlock between the clocksource mutex
176 * and the cpu hotplug mutex. It defers the update of the clocksource
177 * to the watchdog thread.
178 */
179void clocksource_mark_unstable(struct clocksource *cs)
180{
181 unsigned long flags;
182
183 spin_lock_irqsave(&watchdog_lock, flags);
184 if (!(cs->flags & CLOCK_SOURCE_UNSTABLE)) {
185 if (list_empty(&cs->wd_list))
186 list_add(&cs->wd_list, &watchdog_list);
187 __clocksource_unstable(cs);
188 }
189 spin_unlock_irqrestore(&watchdog_lock, flags);
171} 190}
172 191
173static void clocksource_watchdog(unsigned long data) 192static void clocksource_watchdog(unsigned long data)
174{ 193{
175 struct clocksource *cs, *tmp; 194 struct clocksource *cs;
176 cycle_t csnow, wdnow; 195 cycle_t csnow, wdnow;
177 int64_t wd_nsec, cs_nsec; 196 int64_t wd_nsec, cs_nsec;
178 int resumed; 197 int next_cpu;
179 198
180 spin_lock(&watchdog_lock); 199 spin_lock(&watchdog_lock);
181 200 if (!watchdog_running)
182 resumed = test_and_clear_bit(0, &watchdog_resumed); 201 goto out;
183 202
184 wdnow = watchdog->read(watchdog); 203 wdnow = watchdog->read(watchdog);
185 wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask); 204 wd_nsec = clocksource_cyc2ns((wdnow - watchdog_last) & watchdog->mask,
205 watchdog->mult, watchdog->shift);
186 watchdog_last = wdnow; 206 watchdog_last = wdnow;
187 207
188 list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { 208 list_for_each_entry(cs, &watchdog_list, wd_list) {
189 csnow = cs->read(cs);
190 209
191 if (unlikely(resumed)) { 210 /* Clocksource already marked unstable? */
192 cs->wd_last = csnow; 211 if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
212 if (finished_booting)
213 schedule_work(&watchdog_work);
193 continue; 214 continue;
194 } 215 }
195 216
196 /* Initialized ? */ 217 csnow = cs->read(cs);
218
219 /* Clocksource initialized ? */
197 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) { 220 if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) {
198 if ((cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
199 (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
200 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
201 /*
202 * We just marked the clocksource as
203 * highres-capable, notify the rest of the
204 * system as well so that we transition
205 * into high-res mode:
206 */
207 tick_clock_notify();
208 }
209 cs->flags |= CLOCK_SOURCE_WATCHDOG; 221 cs->flags |= CLOCK_SOURCE_WATCHDOG;
210 cs->wd_last = csnow; 222 cs->wd_last = csnow;
211 } else { 223 continue;
212 cs_nsec = cyc2ns(cs, (csnow - cs->wd_last) & cs->mask);
213 cs->wd_last = csnow;
214 /* Check the delta. Might remove from the list ! */
215 clocksource_ratewd(cs, cs_nsec - wd_nsec);
216 } 224 }
217 }
218 225
219 if (!list_empty(&watchdog_list)) { 226 /* Check the deviation from the watchdog clocksource. */
220 /* 227 cs_nsec = clocksource_cyc2ns((csnow - cs->wd_last) &
221 * Cycle through CPUs to check if the CPUs stay 228 cs->mask, cs->mult, cs->shift);
222 * synchronized to each other. 229 cs->wd_last = csnow;
223 */ 230 if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
224 int next_cpu = cpumask_next(raw_smp_processor_id(), 231 clocksource_unstable(cs, cs_nsec - wd_nsec);
225 cpu_online_mask); 232 continue;
233 }
226 234
227 if (next_cpu >= nr_cpu_ids) 235 if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
228 next_cpu = cpumask_first(cpu_online_mask); 236 (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
229 watchdog_timer.expires += WATCHDOG_INTERVAL; 237 (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
230 add_timer_on(&watchdog_timer, next_cpu); 238 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
239 /*
240 * We just marked the clocksource as highres-capable,
241 * notify the rest of the system as well so that we
242 * transition into high-res mode:
243 */
244 tick_clock_notify();
245 }
231 } 246 }
247
248 /*
249 * Cycle through CPUs to check if the CPUs stay synchronized
250 * to each other.
251 */
252 next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
253 if (next_cpu >= nr_cpu_ids)
254 next_cpu = cpumask_first(cpu_online_mask);
255 watchdog_timer.expires += WATCHDOG_INTERVAL;
256 add_timer_on(&watchdog_timer, next_cpu);
257out:
232 spin_unlock(&watchdog_lock); 258 spin_unlock(&watchdog_lock);
233} 259}
260
261static inline void clocksource_start_watchdog(void)
262{
263 if (watchdog_running || !watchdog || list_empty(&watchdog_list))
264 return;
265 init_timer(&watchdog_timer);
266 watchdog_timer.function = clocksource_watchdog;
267 watchdog_last = watchdog->read(watchdog);
268 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
269 add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
270 watchdog_running = 1;
271}
272
273static inline void clocksource_stop_watchdog(void)
274{
275 if (!watchdog_running || (watchdog && !list_empty(&watchdog_list)))
276 return;
277 del_timer(&watchdog_timer);
278 watchdog_running = 0;
279}
280
281static inline void clocksource_reset_watchdog(void)
282{
283 struct clocksource *cs;
284
285 list_for_each_entry(cs, &watchdog_list, wd_list)
286 cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
287}
288
234static void clocksource_resume_watchdog(void) 289static void clocksource_resume_watchdog(void)
235{ 290{
236 set_bit(0, &watchdog_resumed); 291 unsigned long flags;
292
293 spin_lock_irqsave(&watchdog_lock, flags);
294 clocksource_reset_watchdog();
295 spin_unlock_irqrestore(&watchdog_lock, flags);
237} 296}
238 297
239static void clocksource_check_watchdog(struct clocksource *cs) 298static void clocksource_enqueue_watchdog(struct clocksource *cs)
240{ 299{
241 struct clocksource *cse;
242 unsigned long flags; 300 unsigned long flags;
243 301
244 spin_lock_irqsave(&watchdog_lock, flags); 302 spin_lock_irqsave(&watchdog_lock, flags);
245 if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) { 303 if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
246 int started = !list_empty(&watchdog_list); 304 /* cs is a clocksource to be watched. */
247
248 list_add(&cs->wd_list, &watchdog_list); 305 list_add(&cs->wd_list, &watchdog_list);
249 if (!started && watchdog) { 306 cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
250 watchdog_last = watchdog->read(watchdog);
251 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
252 add_timer_on(&watchdog_timer,
253 cpumask_first(cpu_online_mask));
254 }
255 } else { 307 } else {
308 /* cs is a watchdog. */
256 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) 309 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
257 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; 310 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
258 311 /* Pick the best watchdog. */
259 if (!watchdog || cs->rating > watchdog->rating) { 312 if (!watchdog || cs->rating > watchdog->rating) {
260 if (watchdog)
261 del_timer(&watchdog_timer);
262 watchdog = cs; 313 watchdog = cs;
263 init_timer(&watchdog_timer);
264 watchdog_timer.function = clocksource_watchdog;
265
266 /* Reset watchdog cycles */ 314 /* Reset watchdog cycles */
267 list_for_each_entry(cse, &watchdog_list, wd_list) 315 clocksource_reset_watchdog();
268 cse->flags &= ~CLOCK_SOURCE_WATCHDOG; 316 }
269 /* Start if list is not empty */ 317 }
270 if (!list_empty(&watchdog_list)) { 318 /* Check if the watchdog timer needs to be started. */
271 watchdog_last = watchdog->read(watchdog); 319 clocksource_start_watchdog();
272 watchdog_timer.expires = 320 spin_unlock_irqrestore(&watchdog_lock, flags);
273 jiffies + WATCHDOG_INTERVAL; 321}
274 add_timer_on(&watchdog_timer, 322
275 cpumask_first(cpu_online_mask)); 323static void clocksource_dequeue_watchdog(struct clocksource *cs)
276 } 324{
325 struct clocksource *tmp;
326 unsigned long flags;
327
328 spin_lock_irqsave(&watchdog_lock, flags);
329 if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
330 /* cs is a watched clocksource. */
331 list_del_init(&cs->wd_list);
332 } else if (cs == watchdog) {
333 /* Reset watchdog cycles */
334 clocksource_reset_watchdog();
335 /* Current watchdog is removed. Find an alternative. */
336 watchdog = NULL;
337 list_for_each_entry(tmp, &clocksource_list, list) {
338 if (tmp == cs || tmp->flags & CLOCK_SOURCE_MUST_VERIFY)
339 continue;
340 if (!watchdog || tmp->rating > watchdog->rating)
341 watchdog = tmp;
277 } 342 }
278 } 343 }
344 cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
345 /* Check if the watchdog timer needs to be stopped. */
346 clocksource_stop_watchdog();
279 spin_unlock_irqrestore(&watchdog_lock, flags); 347 spin_unlock_irqrestore(&watchdog_lock, flags);
280} 348}
281#else 349
282static void clocksource_check_watchdog(struct clocksource *cs) 350static int clocksource_watchdog_kthread(void *data)
351{
352 struct clocksource *cs, *tmp;
353 unsigned long flags;
354 LIST_HEAD(unstable);
355
356 mutex_lock(&clocksource_mutex);
357 spin_lock_irqsave(&watchdog_lock, flags);
358 list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list)
359 if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
360 list_del_init(&cs->wd_list);
361 list_add(&cs->wd_list, &unstable);
362 }
363 /* Check if the watchdog timer needs to be stopped. */
364 clocksource_stop_watchdog();
365 spin_unlock_irqrestore(&watchdog_lock, flags);
366
367 /* Needs to be done outside of watchdog lock */
368 list_for_each_entry_safe(cs, tmp, &unstable, wd_list) {
369 list_del_init(&cs->wd_list);
370 __clocksource_change_rating(cs, 0);
371 }
372 mutex_unlock(&clocksource_mutex);
373 return 0;
374}
375
376#else /* CONFIG_CLOCKSOURCE_WATCHDOG */
377
378static void clocksource_enqueue_watchdog(struct clocksource *cs)
283{ 379{
284 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) 380 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
285 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; 381 cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
286} 382}
287 383
384static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
288static inline void clocksource_resume_watchdog(void) { } 385static inline void clocksource_resume_watchdog(void) { }
289#endif 386static inline int clocksource_watchdog_kthread(void *data) { return 0; }
387
388#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
290 389
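Editor's note: the watchdog logic above reduces to fixed-point arithmetic. Both the watched clocksource and the watchdog are read, the wrapped cycle deltas are scaled with each clock's mult/shift pair via clocksource_cyc2ns(), and a divergence larger than WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4, i.e. 62.5 ms per 0.5 s interval) marks the clocksource unstable. A minimal userspace sketch of that check, with made-up mask, mult, shift and counter values rather than real hardware ones:

#include <stdint.h>
#include <stdio.h>

/* Same fixed-point conversion clocksource_cyc2ns() performs:
 * ns = (cycles * mult) >> shift. */
static int64_t cyc2ns(uint64_t cycles, uint32_t mult, uint32_t shift)
{
	return (int64_t)((cycles * mult) >> shift);
}

int main(void)
{
	/* Hypothetical 32-bit counters; mult/shift chosen so 1 cycle ~= 1 ns. */
	const uint64_t mask = 0xffffffffULL;
	const uint32_t mult = 1 << 22, shift = 22;
	const int64_t threshold_ns = 1000000000LL >> 4;	/* 62.5 ms */

	/* Two readouts of the watched clock and of the watchdog clock. */
	uint64_t cs_last = 0xfffffff0ULL, cs_now = 0x00013880ULL; /* wraps */
	uint64_t wd_last = 1000, wd_now = 501000;

	int64_t cs_ns = cyc2ns((cs_now - cs_last) & mask, mult, shift);
	int64_t wd_ns = cyc2ns((wd_now - wd_last) & mask, mult, shift);
	int64_t delta = cs_ns - wd_ns;

	if (delta < 0)
		delta = -delta;
	printf("delta = %lld ns -> %s\n", (long long)(cs_ns - wd_ns),
	       delta > threshold_ns ? "mark unstable" : "still trusted");
	return 0;
}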
291/** 390/**
292 * clocksource_resume - resume the clocksource(s) 391 * clocksource_resume - resume the clocksource(s)
@@ -294,18 +393,16 @@ static inline void clocksource_resume_watchdog(void) { }
294void clocksource_resume(void) 393void clocksource_resume(void)
295{ 394{
296 struct clocksource *cs; 395 struct clocksource *cs;
297 unsigned long flags;
298 396
299 spin_lock_irqsave(&clocksource_lock, flags); 397 mutex_lock(&clocksource_mutex);
300 398
301 list_for_each_entry(cs, &clocksource_list, list) { 399 list_for_each_entry(cs, &clocksource_list, list)
302 if (cs->resume) 400 if (cs->resume)
303 cs->resume(); 401 cs->resume();
304 }
305 402
306 clocksource_resume_watchdog(); 403 clocksource_resume_watchdog();
307 404
308 spin_unlock_irqrestore(&clocksource_lock, flags); 405 mutex_unlock(&clocksource_mutex);
309} 406}
310 407
311/** 408/**
@@ -320,75 +417,94 @@ void clocksource_touch_watchdog(void)
320 clocksource_resume_watchdog(); 417 clocksource_resume_watchdog();
321} 418}
322 419
420#ifdef CONFIG_GENERIC_TIME
421
323/** 422/**
324 * clocksource_get_next - Returns the selected clocksource 423 * clocksource_select - Select the best clocksource available
424 *
425 * Private function. Must hold clocksource_mutex when called.
325 * 426 *
427 * Select the clocksource with the best rating, or the clocksource,
428 * which is selected by userspace override.
326 */ 429 */
327struct clocksource *clocksource_get_next(void) 430static void clocksource_select(void)
328{ 431{
329 unsigned long flags; 432 struct clocksource *best, *cs;
330 433
331 spin_lock_irqsave(&clocksource_lock, flags); 434 if (!finished_booting || list_empty(&clocksource_list))
332 if (next_clocksource && finished_booting) { 435 return;
333 curr_clocksource = next_clocksource; 436 /* First clocksource on the list has the best rating. */
334 next_clocksource = NULL; 437 best = list_first_entry(&clocksource_list, struct clocksource, list);
438 /* Check for the override clocksource. */
439 list_for_each_entry(cs, &clocksource_list, list) {
440 if (strcmp(cs->name, override_name) != 0)
441 continue;
442 /*
443 * Check to make sure we don't switch to a non-highres
444 * capable clocksource if the tick code is in oneshot
445 * mode (highres or nohz)
446 */
447 if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
448 tick_oneshot_mode_active()) {
449 /* Override clocksource cannot be used. */
450 printk(KERN_WARNING "Override clocksource %s is not "
451 "HRT compatible. Cannot switch while in "
452 "HRT/NOHZ mode\n", cs->name);
453 override_name[0] = 0;
454 } else
455 /* Override clocksource can be used. */
456 best = cs;
457 break;
458 }
459 if (curr_clocksource != best) {
460 printk(KERN_INFO "Switching to clocksource %s\n", best->name);
461 curr_clocksource = best;
462 timekeeping_notify(curr_clocksource);
335 } 463 }
336 spin_unlock_irqrestore(&clocksource_lock, flags);
337
338 return curr_clocksource;
339} 464}
340 465
341/** 466#else /* CONFIG_GENERIC_TIME */
342 * select_clocksource - Selects the best registered clocksource. 467
343 * 468static inline void clocksource_select(void) { }
344 * Private function. Must hold clocksource_lock when called. 469
470#endif
471
472/*
473 * clocksource_done_booting - Called near the end of core bootup
345 * 474 *
346 * Select the clocksource with the best rating, or the clocksource, 475 * Hack to avoid lots of clocksource churn at boot time.
347 * which is selected by userspace override. 476 * We use fs_initcall because we want this to start before
477 * device_initcall but after subsys_initcall.
348 */ 478 */
349static struct clocksource *select_clocksource(void) 479static int __init clocksource_done_booting(void)
350{ 480{
351 struct clocksource *next; 481 finished_booting = 1;
352
353 if (list_empty(&clocksource_list))
354 return NULL;
355
356 if (clocksource_override)
357 next = clocksource_override;
358 else
359 next = list_entry(clocksource_list.next, struct clocksource,
360 list);
361 482
362 if (next == curr_clocksource) 483 /*
363 return NULL; 484 * Run the watchdog first to eliminate unstable clock sources
485 */
486 clocksource_watchdog_kthread(NULL);
364 487
365 return next; 488 mutex_lock(&clocksource_mutex);
489 clocksource_select();
490 mutex_unlock(&clocksource_mutex);
491 return 0;
366} 492}
493fs_initcall(clocksource_done_booting);
367 494
368/* 495/*
369 * Enqueue the clocksource sorted by rating 496 * Enqueue the clocksource sorted by rating
370 */ 497 */
371static int clocksource_enqueue(struct clocksource *c) 498static void clocksource_enqueue(struct clocksource *cs)
372{ 499{
373 struct list_head *tmp, *entry = &clocksource_list; 500 struct list_head *entry = &clocksource_list;
501 struct clocksource *tmp;
374 502
375 list_for_each(tmp, &clocksource_list) { 503 list_for_each_entry(tmp, &clocksource_list, list)
376 struct clocksource *cs;
377
378 cs = list_entry(tmp, struct clocksource, list);
379 if (cs == c)
380 return -EBUSY;
 381 /* Keep track of the place where to insert */ 504 /* Keep track of the place where to insert */
382 if (cs->rating >= c->rating) 505 if (tmp->rating >= cs->rating)
383 entry = tmp; 506 entry = &tmp->list;
384 } 507 list_add(&cs->list, entry);
385 list_add(&c->list, entry);
386
387 if (strlen(c->name) == strlen(override_name) &&
388 !strcmp(c->name, override_name))
389 clocksource_override = c;
390
391 return 0;
392} 508}
393 509
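Editor's note: clocksource_enqueue() keeps clocksource_list sorted by descending rating, so clocksource_select() can treat the first entry as the best candidate unless the user override names another clock. The same insertion policy in a self-contained sketch that uses a plain singly linked list instead of the kernel's list_head (the struct and sample ratings are illustrative):

#include <stdio.h>

struct cs {
	const char *name;
	int rating;
	struct cs *next;
};

/* Insert so the list stays sorted by descending rating;
 * the head is then always the best candidate. */
static void enqueue(struct cs **head, struct cs *cs)
{
	struct cs **pos = head;

	while (*pos && (*pos)->rating >= cs->rating)
		pos = &(*pos)->next;
	cs->next = *pos;
	*pos = cs;
}

int main(void)
{
	struct cs jiffies = { "jiffies", 1,   NULL };
	struct cs hpet    = { "hpet",    250, NULL };
	struct cs tsc     = { "tsc",     300, NULL };
	struct cs *head = NULL, *p;

	enqueue(&head, &jiffies);
	enqueue(&head, &hpet);
	enqueue(&head, &tsc);

	/* "Selection" without an override is just the head of the list. */
	printf("best: %s\n", head->name);
	for (p = head; p; p = p->next)
		printf("  %s (%d)\n", p->name, p->rating);
	return 0;
}

Paying the ordering cost at registration time is what lets the select path above stay a constant-time list_first_entry() lookup.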
394/** 510/**
@@ -397,52 +513,48 @@ static int clocksource_enqueue(struct clocksource *c)
397 * 513 *
398 * Returns -EBUSY if registration fails, zero otherwise. 514 * Returns -EBUSY if registration fails, zero otherwise.
399 */ 515 */
400int clocksource_register(struct clocksource *c) 516int clocksource_register(struct clocksource *cs)
401{ 517{
402 unsigned long flags; 518 mutex_lock(&clocksource_mutex);
403 int ret; 519 clocksource_enqueue(cs);
404 520 clocksource_select();
405 spin_lock_irqsave(&clocksource_lock, flags); 521 clocksource_enqueue_watchdog(cs);
406 ret = clocksource_enqueue(c); 522 mutex_unlock(&clocksource_mutex);
407 if (!ret) 523 return 0;
408 next_clocksource = select_clocksource();
409 spin_unlock_irqrestore(&clocksource_lock, flags);
410 if (!ret)
411 clocksource_check_watchdog(c);
412 return ret;
413} 524}
414EXPORT_SYMBOL(clocksource_register); 525EXPORT_SYMBOL(clocksource_register);
415 526
527static void __clocksource_change_rating(struct clocksource *cs, int rating)
528{
529 list_del(&cs->list);
530 cs->rating = rating;
531 clocksource_enqueue(cs);
532 clocksource_select();
533}
534
416/** 535/**
417 * clocksource_change_rating - Change the rating of a registered clocksource 536 * clocksource_change_rating - Change the rating of a registered clocksource
418 *
419 */ 537 */
420void clocksource_change_rating(struct clocksource *cs, int rating) 538void clocksource_change_rating(struct clocksource *cs, int rating)
421{ 539{
422 unsigned long flags; 540 mutex_lock(&clocksource_mutex);
423 541 __clocksource_change_rating(cs, rating);
424 spin_lock_irqsave(&clocksource_lock, flags); 542 mutex_unlock(&clocksource_mutex);
425 list_del(&cs->list);
426 cs->rating = rating;
427 clocksource_enqueue(cs);
428 next_clocksource = select_clocksource();
429 spin_unlock_irqrestore(&clocksource_lock, flags);
430} 543}
544EXPORT_SYMBOL(clocksource_change_rating);
431 545
432/** 546/**
433 * clocksource_unregister - remove a registered clocksource 547 * clocksource_unregister - remove a registered clocksource
434 */ 548 */
435void clocksource_unregister(struct clocksource *cs) 549void clocksource_unregister(struct clocksource *cs)
436{ 550{
437 unsigned long flags; 551 mutex_lock(&clocksource_mutex);
438 552 clocksource_dequeue_watchdog(cs);
439 spin_lock_irqsave(&clocksource_lock, flags);
440 list_del(&cs->list); 553 list_del(&cs->list);
441 if (clocksource_override == cs) 554 clocksource_select();
442 clocksource_override = NULL; 555 mutex_unlock(&clocksource_mutex);
443 next_clocksource = select_clocksource();
444 spin_unlock_irqrestore(&clocksource_lock, flags);
445} 556}
557EXPORT_SYMBOL(clocksource_unregister);
446 558
447#ifdef CONFIG_SYSFS 559#ifdef CONFIG_SYSFS
448/** 560/**
@@ -458,9 +570,9 @@ sysfs_show_current_clocksources(struct sys_device *dev,
458{ 570{
459 ssize_t count = 0; 571 ssize_t count = 0;
460 572
461 spin_lock_irq(&clocksource_lock); 573 mutex_lock(&clocksource_mutex);
462 count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name); 574 count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name);
463 spin_unlock_irq(&clocksource_lock); 575 mutex_unlock(&clocksource_mutex);
464 576
465 return count; 577 return count;
466} 578}
@@ -478,9 +590,7 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
478 struct sysdev_attribute *attr, 590 struct sysdev_attribute *attr,
479 const char *buf, size_t count) 591 const char *buf, size_t count)
480{ 592{
481 struct clocksource *ovr = NULL;
482 size_t ret = count; 593 size_t ret = count;
483 int len;
484 594
485 /* strings from sysfs write are not 0 terminated! */ 595 /* strings from sysfs write are not 0 terminated! */
486 if (count >= sizeof(override_name)) 596 if (count >= sizeof(override_name))
@@ -490,44 +600,14 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
490 if (buf[count-1] == '\n') 600 if (buf[count-1] == '\n')
491 count--; 601 count--;
492 602
493 spin_lock_irq(&clocksource_lock); 603 mutex_lock(&clocksource_mutex);
494 604
495 if (count > 0) 605 if (count > 0)
496 memcpy(override_name, buf, count); 606 memcpy(override_name, buf, count);
497 override_name[count] = 0; 607 override_name[count] = 0;
608 clocksource_select();
498 609
499 len = strlen(override_name); 610 mutex_unlock(&clocksource_mutex);
500 if (len) {
501 struct clocksource *cs;
502
503 ovr = clocksource_override;
504 /* try to select it: */
505 list_for_each_entry(cs, &clocksource_list, list) {
506 if (strlen(cs->name) == len &&
507 !strcmp(cs->name, override_name))
508 ovr = cs;
509 }
510 }
511
512 /*
513 * Check to make sure we don't switch to a non-highres capable
514 * clocksource if the tick code is in oneshot mode (highres or nohz)
515 */
516 if (tick_oneshot_mode_active() &&
517 !(ovr->flags & CLOCK_SOURCE_VALID_FOR_HRES)) {
518 printk(KERN_WARNING "%s clocksource is not HRT compatible. "
519 "Cannot switch while in HRT/NOHZ mode\n", ovr->name);
520 ovr = NULL;
521 override_name[0] = 0;
522 }
523
524 /* Reselect, when the override name has changed */
525 if (ovr != clocksource_override) {
526 clocksource_override = ovr;
527 next_clocksource = select_clocksource();
528 }
529
530 spin_unlock_irq(&clocksource_lock);
531 611
532 return ret; 612 return ret;
533} 613}
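Editor's note: the store path above has to cope with sysfs semantics: the incoming buffer is not NUL-terminated and usually ends with a newline, so the handler bounds the copy, strips the newline, terminates the string and only then reselects. A stand-alone sketch of that buffer handling; store_override() is a made-up name, not the kernel function:

#include <stdio.h>
#include <string.h>

static char override_name[32];

/* Mimics the sysfs store handler: 'buf' is not NUL-terminated,
 * 'count' is the number of bytes written by userspace. */
static long store_override(const char *buf, size_t count)
{
	if (count >= sizeof(override_name))
		return -1;			/* would not fit */

	if (count > 0 && buf[count - 1] == '\n')
		count--;			/* drop trailing newline */

	memcpy(override_name, buf, count);
	override_name[count] = '\0';
	/* ...the kernel would call clocksource_select() at this point... */
	return (long)count;
}

int main(void)
{
	const char input[] = "acpi_pm\n";	/* as a shell echo would deliver it */

	store_override(input, sizeof(input) - 1);
	printf("override set to '%s'\n", override_name);
	return 0;
}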
@@ -547,7 +627,7 @@ sysfs_show_available_clocksources(struct sys_device *dev,
547 struct clocksource *src; 627 struct clocksource *src;
548 ssize_t count = 0; 628 ssize_t count = 0;
549 629
550 spin_lock_irq(&clocksource_lock); 630 mutex_lock(&clocksource_mutex);
551 list_for_each_entry(src, &clocksource_list, list) { 631 list_for_each_entry(src, &clocksource_list, list) {
552 /* 632 /*
553 * Don't show non-HRES clocksource if the tick code is 633 * Don't show non-HRES clocksource if the tick code is
@@ -559,7 +639,7 @@ sysfs_show_available_clocksources(struct sys_device *dev,
559 max((ssize_t)PAGE_SIZE - count, (ssize_t)0), 639 max((ssize_t)PAGE_SIZE - count, (ssize_t)0),
560 "%s ", src->name); 640 "%s ", src->name);
561 } 641 }
562 spin_unlock_irq(&clocksource_lock); 642 mutex_unlock(&clocksource_mutex);
563 643
564 count += snprintf(buf + count, 644 count += snprintf(buf + count,
565 max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "\n"); 645 max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "\n");
@@ -614,11 +694,10 @@ device_initcall(init_clocksource_sysfs);
614 */ 694 */
615static int __init boot_override_clocksource(char* str) 695static int __init boot_override_clocksource(char* str)
616{ 696{
617 unsigned long flags; 697 mutex_lock(&clocksource_mutex);
618 spin_lock_irqsave(&clocksource_lock, flags);
619 if (str) 698 if (str)
620 strlcpy(override_name, str, sizeof(override_name)); 699 strlcpy(override_name, str, sizeof(override_name));
621 spin_unlock_irqrestore(&clocksource_lock, flags); 700 mutex_unlock(&clocksource_mutex);
622 return 1; 701 return 1;
623} 702}
624 703
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index c3f6c30816e3..5404a8456909 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -61,7 +61,6 @@ struct clocksource clocksource_jiffies = {
61 .read = jiffies_read, 61 .read = jiffies_read,
62 .mask = 0xffffffff, /*32bits*/ 62 .mask = 0xffffffff, /*32bits*/
63 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ 63 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
64 .mult_orig = NSEC_PER_JIFFY << JIFFIES_SHIFT,
65 .shift = JIFFIES_SHIFT, 64 .shift = JIFFIES_SHIFT,
66}; 65};
67 66
@@ -71,3 +70,8 @@ static int __init init_jiffies_clocksource(void)
71} 70}
72 71
73core_initcall(init_jiffies_clocksource); 72core_initcall(init_jiffies_clocksource);
73
74struct clocksource * __init __weak clocksource_default_clock(void)
75{
76 return &clocksource_jiffies;
77}
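Editor's note: clocksource_default_clock() is deliberately __weak: timekeeping_init() always gets a safe fallback (the jiffies clock), while an architecture can supply a strong definition that returns a real hardware clock. A tiny illustration of that default-plus-override linkage pattern, assuming GCC/Clang attribute syntax:

#include <stdio.h>

/* Weak default: used only if no other translation unit provides
 * a strong definition of the same symbol. */
__attribute__((weak)) const char *default_clock(void)
{
	return "jiffies";
}

/* An architecture (another .c file in practice) could define:
 *
 *   const char *default_clock(void) { return "tod-clock"; }
 *
 * and the linker would pick that strong definition instead. */

int main(void)
{
	printf("boot clocksource: %s\n", default_clock());
	return 0;
}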
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 7fc64375ff43..4800f933910e 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -194,8 +194,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
194 case TIME_OK: 194 case TIME_OK:
195 break; 195 break;
196 case TIME_INS: 196 case TIME_INS:
197 xtime.tv_sec--; 197 timekeeping_leap_insert(-1);
198 wall_to_monotonic.tv_sec++;
199 time_state = TIME_OOP; 198 time_state = TIME_OOP;
200 printk(KERN_NOTICE 199 printk(KERN_NOTICE
201 "Clock: inserting leap second 23:59:60 UTC\n"); 200 "Clock: inserting leap second 23:59:60 UTC\n");
@@ -203,9 +202,8 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
203 res = HRTIMER_RESTART; 202 res = HRTIMER_RESTART;
204 break; 203 break;
205 case TIME_DEL: 204 case TIME_DEL:
206 xtime.tv_sec++; 205 timekeeping_leap_insert(1);
207 time_tai--; 206 time_tai--;
208 wall_to_monotonic.tv_sec--;
209 time_state = TIME_WAIT; 207 time_state = TIME_WAIT;
210 printk(KERN_NOTICE 208 printk(KERN_NOTICE
211 "Clock: deleting leap second 23:59:59 UTC\n"); 209 "Clock: deleting leap second 23:59:59 UTC\n");
@@ -219,7 +217,6 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
219 time_state = TIME_OK; 217 time_state = TIME_OK;
220 break; 218 break;
221 } 219 }
222 update_vsyscall(&xtime, clock);
223 220
224 write_sequnlock(&xtime_lock); 221 write_sequnlock(&xtime_lock);
225 222
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 877dbedc3118..c2ec25087a35 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -205,11 +205,11 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
205 * Powerstate information: The system enters/leaves a state, where 205 * Powerstate information: The system enters/leaves a state, where
206 * affected devices might stop 206 * affected devices might stop
207 */ 207 */
208static void tick_do_broadcast_on_off(void *why) 208static void tick_do_broadcast_on_off(unsigned long *reason)
209{ 209{
210 struct clock_event_device *bc, *dev; 210 struct clock_event_device *bc, *dev;
211 struct tick_device *td; 211 struct tick_device *td;
212 unsigned long flags, *reason = why; 212 unsigned long flags;
213 int cpu, bc_stopped; 213 int cpu, bc_stopped;
214 214
215 spin_lock_irqsave(&tick_broadcast_lock, flags); 215 spin_lock_irqsave(&tick_broadcast_lock, flags);
@@ -276,8 +276,7 @@ void tick_broadcast_on_off(unsigned long reason, int *oncpu)
276 printk(KERN_ERR "tick-broadcast: ignoring broadcast for " 276 printk(KERN_ERR "tick-broadcast: ignoring broadcast for "
277 "offline CPU #%d\n", *oncpu); 277 "offline CPU #%d\n", *oncpu);
278 else 278 else
279 smp_call_function_single(*oncpu, tick_do_broadcast_on_off, 279 tick_do_broadcast_on_off(&reason);
280 &reason, 1);
281} 280}
282 281
283/* 282/*
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index e8c77d9c633a..fb0f46fa1ecd 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -18,7 +18,117 @@
18#include <linux/jiffies.h> 18#include <linux/jiffies.h>
19#include <linux/time.h> 19#include <linux/time.h>
20#include <linux/tick.h> 20#include <linux/tick.h>
21#include <linux/stop_machine.h>
22
23/* Structure holding internal timekeeping values. */
24struct timekeeper {
25 /* Current clocksource used for timekeeping. */
26 struct clocksource *clock;
27 /* The shift value of the current clocksource. */
28 int shift;
29
30 /* Number of clock cycles in one NTP interval. */
31 cycle_t cycle_interval;
32 /* Number of clock shifted nano seconds in one NTP interval. */
33 u64 xtime_interval;
34 /* Raw nano seconds accumulated per NTP interval. */
35 u32 raw_interval;
36
37 /* Clock shifted nano seconds remainder not stored in xtime.tv_nsec. */
38 u64 xtime_nsec;
39 /* Difference between accumulated time and NTP time in ntp
40 * shifted nano seconds. */
41 s64 ntp_error;
42 /* Shift conversion between clock shifted nano seconds and
43 * ntp shifted nano seconds. */
44 int ntp_error_shift;
45 /* NTP adjusted clock multiplier */
46 u32 mult;
47};
48
49struct timekeeper timekeeper;
50
51/**
52 * timekeeper_setup_internals - Set up internals to use clocksource clock.
53 *
54 * @clock: Pointer to clocksource.
55 *
56 * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment
57 * pair and interval request.
58 *
59 * Unless you're the timekeeping code, you should not be using this!
60 */
61static void timekeeper_setup_internals(struct clocksource *clock)
62{
63 cycle_t interval;
64 u64 tmp;
65
66 timekeeper.clock = clock;
67 clock->cycle_last = clock->read(clock);
21 68
69 /* Do the ns -> cycle conversion first, using original mult */
70 tmp = NTP_INTERVAL_LENGTH;
71 tmp <<= clock->shift;
72 tmp += clock->mult/2;
73 do_div(tmp, clock->mult);
74 if (tmp == 0)
75 tmp = 1;
76
77 interval = (cycle_t) tmp;
78 timekeeper.cycle_interval = interval;
79
80 /* Go back from cycles -> shifted ns */
81 timekeeper.xtime_interval = (u64) interval * clock->mult;
82 timekeeper.raw_interval =
83 ((u64) interval * clock->mult) >> clock->shift;
84
85 timekeeper.xtime_nsec = 0;
86 timekeeper.shift = clock->shift;
87
88 timekeeper.ntp_error = 0;
89 timekeeper.ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
90
91 /*
92 * The timekeeper keeps its own mult values for the currently
93 * active clocksource. These value will be adjusted via NTP
94 * to counteract clock drifting.
95 */
96 timekeeper.mult = clock->mult;
97}
98
99/* Timekeeper helper functions. */
100static inline s64 timekeeping_get_ns(void)
101{
102 cycle_t cycle_now, cycle_delta;
103 struct clocksource *clock;
104
105 /* read clocksource: */
106 clock = timekeeper.clock;
107 cycle_now = clock->read(clock);
108
109 /* calculate the delta since the last update_wall_time: */
110 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
111
112 /* return delta convert to nanoseconds using ntp adjusted mult. */
113 return clocksource_cyc2ns(cycle_delta, timekeeper.mult,
114 timekeeper.shift);
115}
116
117static inline s64 timekeeping_get_ns_raw(void)
118{
119 cycle_t cycle_now, cycle_delta;
120 struct clocksource *clock;
121
122 /* read clocksource: */
123 clock = timekeeper.clock;
124 cycle_now = clock->read(clock);
125
126 /* calculate the delta since the last update_wall_time: */
127 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
128
129 /* return delta convert to nanoseconds using ntp adjusted mult. */
130 return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
131}
22 132
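Editor's note: timekeeper_setup_internals() above precomputes the per-tick quantities: the NTP interval length in nanoseconds is converted to a rounded number of clock cycles (cycles = (ns << shift + mult/2) / mult), and that cycle count is converted back into the shifted-nanosecond and raw-nanosecond increments used by the accumulation loop. A small sketch of that round trip with invented mult, shift and interval values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Hypothetical clocksource: mult/shift chosen so ~1 cycle == 1 ns. */
	const uint32_t mult = 1 << 22;
	const uint32_t shift = 22;
	const uint64_t ntp_interval_ns = 999999ULL;	/* one tick, illustrative */

	/* ns -> cycles, rounded to nearest: (ns << shift + mult/2) / mult */
	uint64_t tmp = (ntp_interval_ns << shift) + mult / 2;
	uint64_t cycle_interval = tmp / mult;
	if (cycle_interval == 0)
		cycle_interval = 1;

	/* cycles -> shifted ns and plain ns for the two accumulators. */
	uint64_t xtime_interval = cycle_interval * mult;	/* 2^-shift ns units */
	uint64_t raw_interval   = xtime_interval >> shift;	/* whole ns */

	printf("cycle_interval = %llu cycles\n", (unsigned long long)cycle_interval);
	printf("xtime_interval = %llu (shifted ns)\n", (unsigned long long)xtime_interval);
	printf("raw_interval   = %llu ns\n", (unsigned long long)raw_interval);
	return 0;
}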
23/* 133/*
24 * This read-write spinlock protects us from races in SMP while 134 * This read-write spinlock protects us from races in SMP while
@@ -44,7 +154,12 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
44 */ 154 */
45struct timespec xtime __attribute__ ((aligned (16))); 155struct timespec xtime __attribute__ ((aligned (16)));
46struct timespec wall_to_monotonic __attribute__ ((aligned (16))); 156struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
47static unsigned long total_sleep_time; /* seconds */ 157static struct timespec total_sleep_time;
158
159/*
160 * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock.
161 */
162struct timespec raw_time;
48 163
49/* flag for if timekeeping is suspended */ 164/* flag for if timekeeping is suspended */
50int __read_mostly timekeeping_suspended; 165int __read_mostly timekeeping_suspended;
@@ -56,35 +171,44 @@ void update_xtime_cache(u64 nsec)
56 timespec_add_ns(&xtime_cache, nsec); 171 timespec_add_ns(&xtime_cache, nsec);
57} 172}
58 173
59struct clocksource *clock; 174/* must hold xtime_lock */
60 175void timekeeping_leap_insert(int leapsecond)
176{
177 xtime.tv_sec += leapsecond;
178 wall_to_monotonic.tv_sec -= leapsecond;
179 update_vsyscall(&xtime, timekeeper.clock);
180}
61 181
62#ifdef CONFIG_GENERIC_TIME 182#ifdef CONFIG_GENERIC_TIME
183
63/** 184/**
64 * clocksource_forward_now - update clock to the current time 185 * timekeeping_forward_now - update clock to the current time
65 * 186 *
66 * Forward the current clock to update its state since the last call to 187 * Forward the current clock to update its state since the last call to
67 * update_wall_time(). This is useful before significant clock changes, 188 * update_wall_time(). This is useful before significant clock changes,
68 * as it avoids having to deal with this time offset explicitly. 189 * as it avoids having to deal with this time offset explicitly.
69 */ 190 */
70static void clocksource_forward_now(void) 191static void timekeeping_forward_now(void)
71{ 192{
72 cycle_t cycle_now, cycle_delta; 193 cycle_t cycle_now, cycle_delta;
194 struct clocksource *clock;
73 s64 nsec; 195 s64 nsec;
74 196
75 cycle_now = clocksource_read(clock); 197 clock = timekeeper.clock;
198 cycle_now = clock->read(clock);
76 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 199 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
77 clock->cycle_last = cycle_now; 200 clock->cycle_last = cycle_now;
78 201
79 nsec = cyc2ns(clock, cycle_delta); 202 nsec = clocksource_cyc2ns(cycle_delta, timekeeper.mult,
203 timekeeper.shift);
80 204
81 /* If arch requires, add in gettimeoffset() */ 205 /* If arch requires, add in gettimeoffset() */
82 nsec += arch_gettimeoffset(); 206 nsec += arch_gettimeoffset();
83 207
84 timespec_add_ns(&xtime, nsec); 208 timespec_add_ns(&xtime, nsec);
85 209
86 nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift; 210 nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
87 clock->raw_time.tv_nsec += nsec; 211 timespec_add_ns(&raw_time, nsec);
88} 212}
89 213
90/** 214/**
@@ -95,7 +219,6 @@ static void clocksource_forward_now(void)
95 */ 219 */
96void getnstimeofday(struct timespec *ts) 220void getnstimeofday(struct timespec *ts)
97{ 221{
98 cycle_t cycle_now, cycle_delta;
99 unsigned long seq; 222 unsigned long seq;
100 s64 nsecs; 223 s64 nsecs;
101 224
@@ -105,15 +228,7 @@ void getnstimeofday(struct timespec *ts)
105 seq = read_seqbegin(&xtime_lock); 228 seq = read_seqbegin(&xtime_lock);
106 229
107 *ts = xtime; 230 *ts = xtime;
108 231 nsecs = timekeeping_get_ns();
109 /* read clocksource: */
110 cycle_now = clocksource_read(clock);
111
112 /* calculate the delta since the last update_wall_time: */
113 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
114
115 /* convert to nanoseconds: */
116 nsecs = cyc2ns(clock, cycle_delta);
117 232
118 /* If arch requires, add in gettimeoffset() */ 233 /* If arch requires, add in gettimeoffset() */
119 nsecs += arch_gettimeoffset(); 234 nsecs += arch_gettimeoffset();
@@ -125,6 +240,57 @@ void getnstimeofday(struct timespec *ts)
125 240
126EXPORT_SYMBOL(getnstimeofday); 241EXPORT_SYMBOL(getnstimeofday);
127 242
243ktime_t ktime_get(void)
244{
245 unsigned int seq;
246 s64 secs, nsecs;
247
248 WARN_ON(timekeeping_suspended);
249
250 do {
251 seq = read_seqbegin(&xtime_lock);
252 secs = xtime.tv_sec + wall_to_monotonic.tv_sec;
253 nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec;
254 nsecs += timekeeping_get_ns();
255
256 } while (read_seqretry(&xtime_lock, seq));
257 /*
258 * Use ktime_set/ktime_add_ns to create a proper ktime on
259 * 32-bit architectures without CONFIG_KTIME_SCALAR.
260 */
261 return ktime_add_ns(ktime_set(secs, 0), nsecs);
262}
263EXPORT_SYMBOL_GPL(ktime_get);
264
265/**
266 * ktime_get_ts - get the monotonic clock in timespec format
267 * @ts: pointer to timespec variable
268 *
269 * The function calculates the monotonic clock from the realtime
270 * clock and the wall_to_monotonic offset and stores the result
271 * in normalized timespec format in the variable pointed to by @ts.
272 */
273void ktime_get_ts(struct timespec *ts)
274{
275 struct timespec tomono;
276 unsigned int seq;
277 s64 nsecs;
278
279 WARN_ON(timekeeping_suspended);
280
281 do {
282 seq = read_seqbegin(&xtime_lock);
283 *ts = xtime;
284 tomono = wall_to_monotonic;
285 nsecs = timekeeping_get_ns();
286
287 } while (read_seqretry(&xtime_lock, seq));
288
289 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
290 ts->tv_nsec + tomono.tv_nsec + nsecs);
291}
292EXPORT_SYMBOL_GPL(ktime_get_ts);
293
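Editor's note: both ktime_get() and ktime_get_ts() build the monotonic clock the same way: wall time (xtime) plus the wall_to_monotonic offset plus the nanoseconds elapsed since the last update_wall_time(), normalized so 0 <= nsec < NSEC_PER_SEC. A userspace sketch of that composition; the sample values are arbitrary:

#include <stdint.h>
#include <stdio.h>

struct ts { int64_t sec; int64_t nsec; };

#define NSEC_PER_SEC 1000000000LL

/* Fold any excess or negative nanoseconds into the seconds field. */
static struct ts normalize(int64_t sec, int64_t nsec)
{
	struct ts r;

	sec  += nsec / NSEC_PER_SEC;
	nsec %= NSEC_PER_SEC;
	if (nsec < 0) {
		nsec += NSEC_PER_SEC;
		sec  -= 1;
	}
	r.sec = sec;
	r.nsec = nsec;
	return r;
}

int main(void)
{
	struct ts xtime   = { 1253600000,  900000000 };	/* wall time */
	struct ts to_mono = { -1253000000, -250000000 };/* wall_to_monotonic */
	int64_t since_update_ns = 350000000;		/* timekeeping_get_ns() */

	struct ts mono = normalize(xtime.sec + to_mono.sec,
				   xtime.nsec + to_mono.nsec + since_update_ns);

	printf("monotonic: %lld.%09lld\n",
	       (long long)mono.sec, (long long)mono.nsec);
	return 0;
}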
128/** 294/**
129 * do_gettimeofday - Returns the time of day in a timeval 295 * do_gettimeofday - Returns the time of day in a timeval
130 * @tv: pointer to the timeval to be set 296 * @tv: pointer to the timeval to be set
@@ -157,7 +323,7 @@ int do_settimeofday(struct timespec *tv)
157 323
158 write_seqlock_irqsave(&xtime_lock, flags); 324 write_seqlock_irqsave(&xtime_lock, flags);
159 325
160 clocksource_forward_now(); 326 timekeeping_forward_now();
161 327
162 ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec; 328 ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec;
163 ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec; 329 ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec;
@@ -167,10 +333,10 @@ int do_settimeofday(struct timespec *tv)
167 333
168 update_xtime_cache(0); 334 update_xtime_cache(0);
169 335
170 clock->error = 0; 336 timekeeper.ntp_error = 0;
171 ntp_clear(); 337 ntp_clear();
172 338
173 update_vsyscall(&xtime, clock); 339 update_vsyscall(&xtime, timekeeper.clock);
174 340
175 write_sequnlock_irqrestore(&xtime_lock, flags); 341 write_sequnlock_irqrestore(&xtime_lock, flags);
176 342
@@ -187,44 +353,97 @@ EXPORT_SYMBOL(do_settimeofday);
187 * 353 *
188 * Accumulates current time interval and initializes new clocksource 354 * Accumulates current time interval and initializes new clocksource
189 */ 355 */
190static void change_clocksource(void) 356static int change_clocksource(void *data)
191{ 357{
192 struct clocksource *new, *old; 358 struct clocksource *new, *old;
193 359
194 new = clocksource_get_next(); 360 new = (struct clocksource *) data;
361
362 timekeeping_forward_now();
363 if (!new->enable || new->enable(new) == 0) {
364 old = timekeeper.clock;
365 timekeeper_setup_internals(new);
366 if (old->disable)
367 old->disable(old);
368 }
369 return 0;
370}
195 371
196 if (clock == new) 372/**
373 * timekeeping_notify - Install a new clock source
374 * @clock: pointer to the clock source
375 *
376 * This function is called from clocksource.c after a new, better clock
377 * source has been registered. The caller holds the clocksource_mutex.
378 */
379void timekeeping_notify(struct clocksource *clock)
380{
381 if (timekeeper.clock == clock)
197 return; 382 return;
383 stop_machine(change_clocksource, clock, NULL);
384 tick_clock_notify();
385}
198 386
199 clocksource_forward_now(); 387#else /* GENERIC_TIME */
200 388
201 if (clocksource_enable(new)) 389static inline void timekeeping_forward_now(void) { }
202 return;
203 390
204 new->raw_time = clock->raw_time; 391/**
205 old = clock; 392 * ktime_get - get the monotonic time in ktime_t format
206 clock = new; 393 *
207 clocksource_disable(old); 394 * returns the time in ktime_t format
395 */
396ktime_t ktime_get(void)
397{
398 struct timespec now;
208 399
209 clock->cycle_last = 0; 400 ktime_get_ts(&now);
210 clock->cycle_last = clocksource_read(clock);
211 clock->error = 0;
212 clock->xtime_nsec = 0;
213 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
214 401
215 tick_clock_notify(); 402 return timespec_to_ktime(now);
403}
404EXPORT_SYMBOL_GPL(ktime_get);
216 405
217 /* 406/**
218 * We're holding xtime lock and waking up klogd would deadlock 407 * ktime_get_ts - get the monotonic clock in timespec format
219 * us on enqueue. So no printing! 408 * @ts: pointer to timespec variable
220 printk(KERN_INFO "Time: %s clocksource has been installed.\n", 409 *
221 clock->name); 410 * The function calculates the monotonic clock from the realtime
222 */ 411 * clock and the wall_to_monotonic offset and stores the result
412 * in normalized timespec format in the variable pointed to by @ts.
413 */
414void ktime_get_ts(struct timespec *ts)
415{
416 struct timespec tomono;
417 unsigned long seq;
418
419 do {
420 seq = read_seqbegin(&xtime_lock);
421 getnstimeofday(ts);
422 tomono = wall_to_monotonic;
423
424 } while (read_seqretry(&xtime_lock, seq));
425
426 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
427 ts->tv_nsec + tomono.tv_nsec);
223} 428}
224#else 429EXPORT_SYMBOL_GPL(ktime_get_ts);
225static inline void clocksource_forward_now(void) { } 430
226static inline void change_clocksource(void) { } 431#endif /* !GENERIC_TIME */
227#endif 432
433/**
434 * ktime_get_real - get the real (wall-) time in ktime_t format
435 *
436 * returns the time in ktime_t format
437 */
438ktime_t ktime_get_real(void)
439{
440 struct timespec now;
441
442 getnstimeofday(&now);
443
444 return timespec_to_ktime(now);
445}
446EXPORT_SYMBOL_GPL(ktime_get_real);
228 447
229/** 448/**
230 * getrawmonotonic - Returns the raw monotonic time in a timespec 449 * getrawmonotonic - Returns the raw monotonic time in a timespec
@@ -236,21 +455,11 @@ void getrawmonotonic(struct timespec *ts)
236{ 455{
237 unsigned long seq; 456 unsigned long seq;
238 s64 nsecs; 457 s64 nsecs;
239 cycle_t cycle_now, cycle_delta;
240 458
241 do { 459 do {
242 seq = read_seqbegin(&xtime_lock); 460 seq = read_seqbegin(&xtime_lock);
243 461 nsecs = timekeeping_get_ns_raw();
244 /* read clocksource: */ 462 *ts = raw_time;
245 cycle_now = clocksource_read(clock);
246
247 /* calculate the delta since the last update_wall_time: */
248 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
249
250 /* convert to nanoseconds: */
251 nsecs = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
252
253 *ts = clock->raw_time;
254 463
255 } while (read_seqretry(&xtime_lock, seq)); 464 } while (read_seqretry(&xtime_lock, seq));
256 465
@@ -270,7 +479,7 @@ int timekeeping_valid_for_hres(void)
270 do { 479 do {
271 seq = read_seqbegin(&xtime_lock); 480 seq = read_seqbegin(&xtime_lock);
272 481
273 ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; 482 ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
274 483
275 } while (read_seqretry(&xtime_lock, seq)); 484 } while (read_seqretry(&xtime_lock, seq));
276 485
@@ -278,17 +487,33 @@ int timekeeping_valid_for_hres(void)
278} 487}
279 488
280/** 489/**
281 * read_persistent_clock - Return time in seconds from the persistent clock. 490 * read_persistent_clock - Return time from the persistent clock.
282 * 491 *
283 * Weak dummy function for arches that do not yet support it. 492 * Weak dummy function for arches that do not yet support it.
284 * Returns seconds from epoch using the battery backed persistent clock. 493 * Reads the time from the battery backed persistent clock.
285 * Returns zero if unsupported. 494 * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported.
286 * 495 *
287 * XXX - Do be sure to remove it once all arches implement it. 496 * XXX - Do be sure to remove it once all arches implement it.
288 */ 497 */
289unsigned long __attribute__((weak)) read_persistent_clock(void) 498void __attribute__((weak)) read_persistent_clock(struct timespec *ts)
290{ 499{
291 return 0; 500 ts->tv_sec = 0;
501 ts->tv_nsec = 0;
502}
503
504/**
505 * read_boot_clock - Return time of the system start.
506 *
507 * Weak dummy function for arches that do not yet support it.
508 * Function to read the exact time the system has been started.
509 * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported.
510 *
511 * XXX - Do be sure to remove it once all arches implement it.
512 */
513void __attribute__((weak)) read_boot_clock(struct timespec *ts)
514{
515 ts->tv_sec = 0;
516 ts->tv_nsec = 0;
292} 517}
293 518
294/* 519/*
@@ -296,29 +521,40 @@ unsigned long __attribute__((weak)) read_persistent_clock(void)
296 */ 521 */
297void __init timekeeping_init(void) 522void __init timekeeping_init(void)
298{ 523{
524 struct clocksource *clock;
299 unsigned long flags; 525 unsigned long flags;
300 unsigned long sec = read_persistent_clock(); 526 struct timespec now, boot;
527
528 read_persistent_clock(&now);
529 read_boot_clock(&boot);
301 530
302 write_seqlock_irqsave(&xtime_lock, flags); 531 write_seqlock_irqsave(&xtime_lock, flags);
303 532
304 ntp_init(); 533 ntp_init();
305 534
306 clock = clocksource_get_next(); 535 clock = clocksource_default_clock();
307 clocksource_enable(clock); 536 if (clock->enable)
308 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); 537 clock->enable(clock);
309 clock->cycle_last = clocksource_read(clock); 538 timekeeper_setup_internals(clock);
310 539
311 xtime.tv_sec = sec; 540 xtime.tv_sec = now.tv_sec;
312 xtime.tv_nsec = 0; 541 xtime.tv_nsec = now.tv_nsec;
542 raw_time.tv_sec = 0;
543 raw_time.tv_nsec = 0;
544 if (boot.tv_sec == 0 && boot.tv_nsec == 0) {
545 boot.tv_sec = xtime.tv_sec;
546 boot.tv_nsec = xtime.tv_nsec;
547 }
313 set_normalized_timespec(&wall_to_monotonic, 548 set_normalized_timespec(&wall_to_monotonic,
314 -xtime.tv_sec, -xtime.tv_nsec); 549 -boot.tv_sec, -boot.tv_nsec);
315 update_xtime_cache(0); 550 update_xtime_cache(0);
316 total_sleep_time = 0; 551 total_sleep_time.tv_sec = 0;
552 total_sleep_time.tv_nsec = 0;
317 write_sequnlock_irqrestore(&xtime_lock, flags); 553 write_sequnlock_irqrestore(&xtime_lock, flags);
318} 554}
319 555
320/* time in seconds when suspend began */ 556/* time in seconds when suspend began */
321static unsigned long timekeeping_suspend_time; 557static struct timespec timekeeping_suspend_time;
322 558
323/** 559/**
324 * timekeeping_resume - Resumes the generic timekeeping subsystem. 560 * timekeeping_resume - Resumes the generic timekeeping subsystem.
@@ -331,24 +567,24 @@ static unsigned long timekeeping_suspend_time;
331static int timekeeping_resume(struct sys_device *dev) 567static int timekeeping_resume(struct sys_device *dev)
332{ 568{
333 unsigned long flags; 569 unsigned long flags;
334 unsigned long now = read_persistent_clock(); 570 struct timespec ts;
571
572 read_persistent_clock(&ts);
335 573
336 clocksource_resume(); 574 clocksource_resume();
337 575
338 write_seqlock_irqsave(&xtime_lock, flags); 576 write_seqlock_irqsave(&xtime_lock, flags);
339 577
340 if (now && (now > timekeeping_suspend_time)) { 578 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
341 unsigned long sleep_length = now - timekeeping_suspend_time; 579 ts = timespec_sub(ts, timekeeping_suspend_time);
342 580 xtime = timespec_add_safe(xtime, ts);
343 xtime.tv_sec += sleep_length; 581 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
344 wall_to_monotonic.tv_sec -= sleep_length; 582 total_sleep_time = timespec_add_safe(total_sleep_time, ts);
345 total_sleep_time += sleep_length;
346 } 583 }
347 update_xtime_cache(0); 584 update_xtime_cache(0);
348 /* re-base the last cycle value */ 585 /* re-base the last cycle value */
349 clock->cycle_last = 0; 586 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
350 clock->cycle_last = clocksource_read(clock); 587 timekeeper.ntp_error = 0;
351 clock->error = 0;
352 timekeeping_suspended = 0; 588 timekeeping_suspended = 0;
353 write_sequnlock_irqrestore(&xtime_lock, flags); 589 write_sequnlock_irqrestore(&xtime_lock, flags);
354 590
@@ -366,10 +602,10 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
366{ 602{
367 unsigned long flags; 603 unsigned long flags;
368 604
369 timekeeping_suspend_time = read_persistent_clock(); 605 read_persistent_clock(&timekeeping_suspend_time);
370 606
371 write_seqlock_irqsave(&xtime_lock, flags); 607 write_seqlock_irqsave(&xtime_lock, flags);
372 clocksource_forward_now(); 608 timekeeping_forward_now();
373 timekeeping_suspended = 1; 609 timekeeping_suspended = 1;
374 write_sequnlock_irqrestore(&xtime_lock, flags); 610 write_sequnlock_irqrestore(&xtime_lock, flags);
375 611
@@ -404,7 +640,7 @@ device_initcall(timekeeping_init_device);
404 * If the error is already larger, we look ahead even further 640 * If the error is already larger, we look ahead even further
405 * to compensate for late or lost adjustments. 641 * to compensate for late or lost adjustments.
406 */ 642 */
407static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, 643static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval,
408 s64 *offset) 644 s64 *offset)
409{ 645{
410 s64 tick_error, i; 646 s64 tick_error, i;
@@ -420,7 +656,7 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
420 * here. This is tuned so that an error of about 1 msec is adjusted 656 * here. This is tuned so that an error of about 1 msec is adjusted
421 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks). 657 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
422 */ 658 */
423 error2 = clock->error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ); 659 error2 = timekeeper.ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
424 error2 = abs(error2); 660 error2 = abs(error2);
425 for (look_ahead = 0; error2 > 0; look_ahead++) 661 for (look_ahead = 0; error2 > 0; look_ahead++)
426 error2 >>= 2; 662 error2 >>= 2;
@@ -429,8 +665,8 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
429 * Now calculate the error in (1 << look_ahead) ticks, but first 665 * Now calculate the error in (1 << look_ahead) ticks, but first
430 * remove the single look ahead already included in the error. 666 * remove the single look ahead already included in the error.
431 */ 667 */
432 tick_error = tick_length >> (NTP_SCALE_SHIFT - clock->shift + 1); 668 tick_error = tick_length >> (timekeeper.ntp_error_shift + 1);
433 tick_error -= clock->xtime_interval >> 1; 669 tick_error -= timekeeper.xtime_interval >> 1;
434 error = ((error - tick_error) >> look_ahead) + tick_error; 670 error = ((error - tick_error) >> look_ahead) + tick_error;
435 671
436 /* Finally calculate the adjustment shift value. */ 672 /* Finally calculate the adjustment shift value. */
@@ -455,18 +691,18 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
455 * this is optimized for the most common adjustments of -1,0,1, 691 * this is optimized for the most common adjustments of -1,0,1,
456 * for other values we can do a bit more work. 692 * for other values we can do a bit more work.
457 */ 693 */
458static void clocksource_adjust(s64 offset) 694static void timekeeping_adjust(s64 offset)
459{ 695{
460 s64 error, interval = clock->cycle_interval; 696 s64 error, interval = timekeeper.cycle_interval;
461 int adj; 697 int adj;
462 698
463 error = clock->error >> (NTP_SCALE_SHIFT - clock->shift - 1); 699 error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1);
464 if (error > interval) { 700 if (error > interval) {
465 error >>= 2; 701 error >>= 2;
466 if (likely(error <= interval)) 702 if (likely(error <= interval))
467 adj = 1; 703 adj = 1;
468 else 704 else
469 adj = clocksource_bigadjust(error, &interval, &offset); 705 adj = timekeeping_bigadjust(error, &interval, &offset);
470 } else if (error < -interval) { 706 } else if (error < -interval) {
471 error >>= 2; 707 error >>= 2;
472 if (likely(error >= -interval)) { 708 if (likely(error >= -interval)) {
@@ -474,15 +710,15 @@ static void clocksource_adjust(s64 offset)
474 interval = -interval; 710 interval = -interval;
475 offset = -offset; 711 offset = -offset;
476 } else 712 } else
477 adj = clocksource_bigadjust(error, &interval, &offset); 713 adj = timekeeping_bigadjust(error, &interval, &offset);
478 } else 714 } else
479 return; 715 return;
480 716
481 clock->mult += adj; 717 timekeeper.mult += adj;
482 clock->xtime_interval += interval; 718 timekeeper.xtime_interval += interval;
483 clock->xtime_nsec -= offset; 719 timekeeper.xtime_nsec -= offset;
484 clock->error -= (interval - offset) << 720 timekeeper.ntp_error -= (interval - offset) <<
485 (NTP_SCALE_SHIFT - clock->shift); 721 timekeeper.ntp_error_shift;
486} 722}
487 723
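Editor's note: timekeeping_adjust() is a feedback loop: the accumulated NTP error (kept in 2^-ntp_error_shift ns units) nudges the multiplier by +/-1 in the common case (timekeeping_bigadjust() handles larger errors), and the correction just applied is subtracted back out of the error. A stripped-down sketch of the +/-1 case only, with invented values and without the extra error >>= 2 refinement the kernel performs:

#include <stdint.h>
#include <stdio.h>

struct tk {
	uint32_t mult;		/* NTP-adjusted multiplier */
	int64_t  ntp_error;	/* accumulated error, shifted-ns units */
	int	 ntp_error_shift;
	int64_t  xtime_interval;/* shifted ns added per tick */
	int64_t  cycle_interval;/* cycles per tick */
};

/* Common case of the kernel loop: bump mult by +/-1 when the error,
 * expressed in cycles, exceeds one interval. */
static void adjust(struct tk *tk, int64_t offset)
{
	int64_t interval = tk->cycle_interval;
	int64_t error = tk->ntp_error >> (tk->ntp_error_shift - 1);
	int adj;

	if (error > interval) {
		adj = 1;
	} else if (error < -interval) {
		adj = -1;
		interval = -interval;
		offset = -offset;
	} else {
		return;
	}

	tk->mult += adj;
	tk->xtime_interval += interval;
	tk->ntp_error -= (interval - offset) << tk->ntp_error_shift;
}

int main(void)
{
	struct tk tk = {
		.mult = 1 << 22, .ntp_error = 1 << 20, .ntp_error_shift = 10,
		.xtime_interval = 1 << 22, .cycle_interval = 1,
	};

	adjust(&tk, 0);
	printf("mult=%u xtime_interval=%lld ntp_error=%lld\n",
	       tk.mult, (long long)tk.xtime_interval, (long long)tk.ntp_error);
	return 0;
}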
488/** 724/**
@@ -492,53 +728,59 @@ static void clocksource_adjust(s64 offset)
492 */ 728 */
493void update_wall_time(void) 729void update_wall_time(void)
494{ 730{
731 struct clocksource *clock;
495 cycle_t offset; 732 cycle_t offset;
733 u64 nsecs;
496 734
497 /* Make sure we're fully resumed: */ 735 /* Make sure we're fully resumed: */
498 if (unlikely(timekeeping_suspended)) 736 if (unlikely(timekeeping_suspended))
499 return; 737 return;
500 738
739 clock = timekeeper.clock;
501#ifdef CONFIG_GENERIC_TIME 740#ifdef CONFIG_GENERIC_TIME
502 offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask; 741 offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
503#else 742#else
504 offset = clock->cycle_interval; 743 offset = timekeeper.cycle_interval;
505#endif 744#endif
506 clock->xtime_nsec = (s64)xtime.tv_nsec << clock->shift; 745 timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift;
507 746
 508 /* normally this loop will run just once; however, in the 747 /* normally this loop will run just once; however, in the
509 * case of lost or late ticks, it will accumulate correctly. 748 * case of lost or late ticks, it will accumulate correctly.
510 */ 749 */
511 while (offset >= clock->cycle_interval) { 750 while (offset >= timekeeper.cycle_interval) {
751 u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
752
512 /* accumulate one interval */ 753 /* accumulate one interval */
513 offset -= clock->cycle_interval; 754 offset -= timekeeper.cycle_interval;
514 clock->cycle_last += clock->cycle_interval; 755 clock->cycle_last += timekeeper.cycle_interval;
515 756
516 clock->xtime_nsec += clock->xtime_interval; 757 timekeeper.xtime_nsec += timekeeper.xtime_interval;
517 if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) { 758 if (timekeeper.xtime_nsec >= nsecps) {
518 clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift; 759 timekeeper.xtime_nsec -= nsecps;
519 xtime.tv_sec++; 760 xtime.tv_sec++;
520 second_overflow(); 761 second_overflow();
521 } 762 }
522 763
523 clock->raw_time.tv_nsec += clock->raw_interval; 764 raw_time.tv_nsec += timekeeper.raw_interval;
524 if (clock->raw_time.tv_nsec >= NSEC_PER_SEC) { 765 if (raw_time.tv_nsec >= NSEC_PER_SEC) {
525 clock->raw_time.tv_nsec -= NSEC_PER_SEC; 766 raw_time.tv_nsec -= NSEC_PER_SEC;
526 clock->raw_time.tv_sec++; 767 raw_time.tv_sec++;
527 } 768 }
528 769
529 /* accumulate error between NTP and clock interval */ 770 /* accumulate error between NTP and clock interval */
530 clock->error += tick_length; 771 timekeeper.ntp_error += tick_length;
531 clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift); 772 timekeeper.ntp_error -= timekeeper.xtime_interval <<
773 timekeeper.ntp_error_shift;
532 } 774 }
533 775
534 /* correct the clock when NTP error is too big */ 776 /* correct the clock when NTP error is too big */
535 clocksource_adjust(offset); 777 timekeeping_adjust(offset);
536 778
537 /* 779 /*
538 * Since in the loop above, we accumulate any amount of time 780 * Since in the loop above, we accumulate any amount of time
 539 * in xtime_nsec over a second into xtime.tv_sec, it's possible for 781 * in xtime_nsec over a second into xtime.tv_sec, it's possible for
540 * xtime_nsec to be fairly small after the loop. Further, if we're 782 * xtime_nsec to be fairly small after the loop. Further, if we're
541 * slightly speeding the clocksource up in clocksource_adjust(), 783 * slightly speeding the clocksource up in timekeeping_adjust(),
 542 * it's possible the required corrective factor to xtime_nsec could 784 * it's possible the required corrective factor to xtime_nsec could
543 * cause it to underflow. 785 * cause it to underflow.
544 * 786 *
@@ -550,24 +792,25 @@ void update_wall_time(void)
550 * We'll correct this error next time through this function, when 792 * We'll correct this error next time through this function, when
551 * xtime_nsec is not as small. 793 * xtime_nsec is not as small.
552 */ 794 */
553 if (unlikely((s64)clock->xtime_nsec < 0)) { 795 if (unlikely((s64)timekeeper.xtime_nsec < 0)) {
554 s64 neg = -(s64)clock->xtime_nsec; 796 s64 neg = -(s64)timekeeper.xtime_nsec;
555 clock->xtime_nsec = 0; 797 timekeeper.xtime_nsec = 0;
556 clock->error += neg << (NTP_SCALE_SHIFT - clock->shift); 798 timekeeper.ntp_error += neg << timekeeper.ntp_error_shift;
557 } 799 }
558 800
559 /* store full nanoseconds into xtime after rounding it up and 801 /* store full nanoseconds into xtime after rounding it up and
560 * add the remainder to the error difference. 802 * add the remainder to the error difference.
561 */ 803 */
562 xtime.tv_nsec = ((s64)clock->xtime_nsec >> clock->shift) + 1; 804 xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1;
563 clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; 805 timekeeper.xtime_nsec -= (s64) xtime.tv_nsec << timekeeper.shift;
564 clock->error += clock->xtime_nsec << (NTP_SCALE_SHIFT - clock->shift); 806 timekeeper.ntp_error += timekeeper.xtime_nsec <<
807 timekeeper.ntp_error_shift;
565 808
566 update_xtime_cache(cyc2ns(clock, offset)); 809 nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift);
810 update_xtime_cache(nsecs);
567 811
568 /* check to see if there is a new clocksource to use */ 812 /* check to see if there is a new clocksource to use */
569 change_clocksource(); 813 update_vsyscall(&xtime, timekeeper.clock);
570 update_vsyscall(&xtime, clock);
571} 814}
572 815
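Editor's note: the core of update_wall_time() is the accumulation loop in shifted nanoseconds: while at least one cycle_interval has elapsed, advance cycle_last, add xtime_interval to the accumulator and carry whole seconds into xtime. A self-contained sketch of just that loop, leaving out the NTP error bookkeeping and raw-time accounting; all numbers are invented:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Hypothetical clocksource: 1 cycle ~= 1 ns, shift = 22. */
	const int shift = 22;
	const uint64_t cycle_interval = 1000000;		  /* cycles per tick */
	const uint64_t xtime_interval = cycle_interval << shift;  /* shifted ns */
	const uint64_t nsecps = (uint64_t)1000000000 << shift;	  /* one second */

	uint64_t cycle_last = 0, cycle_now = 2503000000ULL;	  /* ~2.5 s elapsed */
	uint64_t offset = cycle_now - cycle_last;

	uint64_t xtime_sec = 0, xtime_nsec = 0;			  /* shifted ns */

	/* Normally runs once per tick; after lost ticks it catches up. */
	while (offset >= cycle_interval) {
		offset -= cycle_interval;
		cycle_last += cycle_interval;

		xtime_nsec += xtime_interval;
		if (xtime_nsec >= nsecps) {
			xtime_nsec -= nsecps;
			xtime_sec++;	/* second_overflow() would run here */
		}
	}

	printf("accumulated %llu s + %llu ns, %llu cycles left over\n",
	       (unsigned long long)xtime_sec,
	       (unsigned long long)(xtime_nsec >> shift),
	       (unsigned long long)offset);
	return 0;
}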
573/** 816/**
@@ -583,9 +826,12 @@ void update_wall_time(void)
583 */ 826 */
584void getboottime(struct timespec *ts) 827void getboottime(struct timespec *ts)
585{ 828{
586 set_normalized_timespec(ts, 829 struct timespec boottime = {
587 - (wall_to_monotonic.tv_sec + total_sleep_time), 830 .tv_sec = wall_to_monotonic.tv_sec + total_sleep_time.tv_sec,
588 - wall_to_monotonic.tv_nsec); 831 .tv_nsec = wall_to_monotonic.tv_nsec + total_sleep_time.tv_nsec
832 };
833
834 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
589} 835}
590 836
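getboottime() now sums the two struct timespec offsets field by field and hands the negated result to set_normalized_timespec(), which folds any nanosecond overflow or underflow back into the seconds. A minimal userspace model of that normalization step, with toy values rather than the kernel helpers:

#include <stdio.h>
#include <time.h>

#define NSEC_PER_SEC 1000000000L

/* Toy set_normalized_timespec(): keep tv_nsec in [0, NSEC_PER_SEC). */
static struct timespec normalized(long long sec, long long nsec)
{
        struct timespec ts;

        while (nsec >= NSEC_PER_SEC) { nsec -= NSEC_PER_SEC; sec++; }
        while (nsec < 0)             { nsec += NSEC_PER_SEC; sec--; }
        ts.tv_sec  = sec;
        ts.tv_nsec = nsec;
        return ts;
}

int main(void)
{
        /* Summing the per-field values can push tv_nsec past one second;
         * negating the sum can push it below zero. Normalization fixes both. */
        long long sec  = -(100LL + 3LL);                  /* made-up offsets */
        long long nsec = -(900000000LL + 400000000LL);
        struct timespec boot = normalized(sec, nsec);

        printf("%lld.%09ld\n", (long long)boot.tv_sec, boot.tv_nsec);
        return 0;
}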
591/** 837/**
@@ -594,7 +840,7 @@ void getboottime(struct timespec *ts)
594 */ 840 */
595void monotonic_to_bootbased(struct timespec *ts) 841void monotonic_to_bootbased(struct timespec *ts)
596{ 842{
597 ts->tv_sec += total_sleep_time; 843 *ts = timespec_add_safe(*ts, total_sleep_time);
598} 844}
599 845
600unsigned long get_seconds(void) 846unsigned long get_seconds(void)
@@ -603,6 +849,10 @@ unsigned long get_seconds(void)
603} 849}
604EXPORT_SYMBOL(get_seconds); 850EXPORT_SYMBOL(get_seconds);
605 851
852struct timespec __current_kernel_time(void)
853{
854 return xtime_cache;
855}
606 856
607struct timespec current_kernel_time(void) 857struct timespec current_kernel_time(void)
608{ 858{
@@ -618,3 +868,20 @@ struct timespec current_kernel_time(void)
618 return now; 868 return now;
619} 869}
620EXPORT_SYMBOL(current_kernel_time); 870EXPORT_SYMBOL(current_kernel_time);
871
872struct timespec get_monotonic_coarse(void)
873{
874 struct timespec now, mono;
875 unsigned long seq;
876
877 do {
878 seq = read_seqbegin(&xtime_lock);
879
880 now = xtime_cache;
881 mono = wall_to_monotonic;
882 } while (read_seqretry(&xtime_lock, seq));
883
884 set_normalized_timespec(&now, now.tv_sec + mono.tv_sec,
885 now.tv_nsec + mono.tv_nsec);
886 return now;
887}
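get_monotonic_coarse() reuses the standard xtime_lock read pattern: copy the shared values, then retry if the sequence counter shows a writer ran in between. The sketch below mimics that reader loop with a plain C11 atomic counter; it is an analogy only, not the kernel's seqlock API.

#include <stdatomic.h>
#include <stdio.h>

/* Toy seqcount reader modelled on read_seqbegin()/read_seqretry():
 * wait out a writer in flight (odd sequence), copy the data, and
 * retry if the sequence changed underneath us. */
struct sample { long sec; long nsec; };

static _Atomic unsigned int seq;
static struct sample shared = { 42, 7 };

static struct sample read_consistent(void)
{
        struct sample copy;
        unsigned int start;

        do {
                do {
                        start = atomic_load(&seq);
                } while (start & 1);              /* writer in progress */
                copy = shared;                    /* plain copy of both fields */
        } while (atomic_load(&seq) != start);     /* a writer ran: try again */

        return copy;
}

int main(void)
{
        struct sample s = read_consistent();
        printf("%ld.%09ld\n", s.sec, s.nsec);
        return 0;
}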
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index a999b92a1277..fddd69d16e03 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -286,7 +286,7 @@ static int __init init_timer_list_procfs(void)
286{ 286{
287 struct proc_dir_entry *pe; 287 struct proc_dir_entry *pe;
288 288
289 pe = proc_create("timer_list", 0644, NULL, &timer_list_fops); 289 pe = proc_create("timer_list", 0444, NULL, &timer_list_fops);
290 if (!pe) 290 if (!pe)
291 return -ENOMEM; 291 return -ENOMEM;
292 return 0; 292 return 0;
diff --git a/kernel/timer.c b/kernel/timer.c
index 0b36b9e5cc8b..811e5c391456 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -37,7 +37,7 @@
37#include <linux/delay.h> 37#include <linux/delay.h>
38#include <linux/tick.h> 38#include <linux/tick.h>
39#include <linux/kallsyms.h> 39#include <linux/kallsyms.h>
40#include <linux/perf_counter.h> 40#include <linux/perf_event.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42 42
43#include <asm/uaccess.h> 43#include <asm/uaccess.h>
@@ -72,6 +72,7 @@ struct tvec_base {
72 spinlock_t lock; 72 spinlock_t lock;
73 struct timer_list *running_timer; 73 struct timer_list *running_timer;
74 unsigned long timer_jiffies; 74 unsigned long timer_jiffies;
75 unsigned long next_timer;
75 struct tvec_root tv1; 76 struct tvec_root tv1;
76 struct tvec tv2; 77 struct tvec tv2;
77 struct tvec tv3; 78 struct tvec tv3;
@@ -622,6 +623,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
622 623
623 if (timer_pending(timer)) { 624 if (timer_pending(timer)) {
624 detach_timer(timer, 0); 625 detach_timer(timer, 0);
626 if (timer->expires == base->next_timer &&
627 !tbase_get_deferrable(timer->base))
628 base->next_timer = base->timer_jiffies;
625 ret = 1; 629 ret = 1;
626 } else { 630 } else {
627 if (pending_only) 631 if (pending_only)
@@ -663,6 +667,9 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
663 } 667 }
664 668
665 timer->expires = expires; 669 timer->expires = expires;
670 if (time_before(timer->expires, base->next_timer) &&
671 !tbase_get_deferrable(timer->base))
672 base->next_timer = timer->expires;
666 internal_add_timer(base, timer); 673 internal_add_timer(base, timer);
667 674
668out_unlock: 675out_unlock:
@@ -714,7 +721,7 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
714 * networking code - if the timer is re-modified 721 * networking code - if the timer is re-modified
715 * to be the same thing then just return: 722 * to be the same thing then just return:
716 */ 723 */
717 if (timer->expires == expires && timer_pending(timer)) 724 if (timer_pending(timer) && timer->expires == expires)
718 return 1; 725 return 1;
719 726
720 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); 727 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
@@ -781,6 +788,9 @@ void add_timer_on(struct timer_list *timer, int cpu)
781 spin_lock_irqsave(&base->lock, flags); 788 spin_lock_irqsave(&base->lock, flags);
782 timer_set_base(timer, base); 789 timer_set_base(timer, base);
783 debug_timer_activate(timer); 790 debug_timer_activate(timer);
791 if (time_before(timer->expires, base->next_timer) &&
792 !tbase_get_deferrable(timer->base))
793 base->next_timer = timer->expires;
784 internal_add_timer(base, timer); 794 internal_add_timer(base, timer);
785 /* 795 /*
786 * Check whether the other CPU is idle and needs to be 796 * Check whether the other CPU is idle and needs to be
@@ -817,6 +827,9 @@ int del_timer(struct timer_list *timer)
817 base = lock_timer_base(timer, &flags); 827 base = lock_timer_base(timer, &flags);
818 if (timer_pending(timer)) { 828 if (timer_pending(timer)) {
819 detach_timer(timer, 1); 829 detach_timer(timer, 1);
830 if (timer->expires == base->next_timer &&
831 !tbase_get_deferrable(timer->base))
832 base->next_timer = base->timer_jiffies;
820 ret = 1; 833 ret = 1;
821 } 834 }
822 spin_unlock_irqrestore(&base->lock, flags); 835 spin_unlock_irqrestore(&base->lock, flags);
@@ -850,6 +863,9 @@ int try_to_del_timer_sync(struct timer_list *timer)
850 ret = 0; 863 ret = 0;
851 if (timer_pending(timer)) { 864 if (timer_pending(timer)) {
852 detach_timer(timer, 1); 865 detach_timer(timer, 1);
866 if (timer->expires == base->next_timer &&
867 !tbase_get_deferrable(timer->base))
868 base->next_timer = base->timer_jiffies;
853 ret = 1; 869 ret = 1;
854 } 870 }
855out: 871out:
@@ -1007,8 +1023,8 @@ static inline void __run_timers(struct tvec_base *base)
1007#ifdef CONFIG_NO_HZ 1023#ifdef CONFIG_NO_HZ
1008/* 1024/*
1009 * Find out when the next timer event is due to happen. This 1025 * Find out when the next timer event is due to happen. This
1010 * is used on S/390 to stop all activity when a cpus is idle. 1026 * is used on S/390 to stop all activity when a CPU is idle.
1011 * This functions needs to be called disabled. 1027 * This function needs to be called with interrupts disabled.
1012 */ 1028 */
1013static unsigned long __next_timer_interrupt(struct tvec_base *base) 1029static unsigned long __next_timer_interrupt(struct tvec_base *base)
1014{ 1030{
@@ -1134,7 +1150,9 @@ unsigned long get_next_timer_interrupt(unsigned long now)
1134 unsigned long expires; 1150 unsigned long expires;
1135 1151
1136 spin_lock(&base->lock); 1152 spin_lock(&base->lock);
1137 expires = __next_timer_interrupt(base); 1153 if (time_before_eq(base->next_timer, base->timer_jiffies))
1154 base->next_timer = __next_timer_interrupt(base);
1155 expires = base->next_timer;
1138 spin_unlock(&base->lock); 1156 spin_unlock(&base->lock);
1139 1157
1140 if (time_before_eq(expires, now)) 1158 if (time_before_eq(expires, now))
@@ -1156,8 +1174,7 @@ void update_process_times(int user_tick)
1156 /* Note: this timer irq context must be accounted for as well. */ 1174 /* Note: this timer irq context must be accounted for as well. */
1157 account_process_tick(p, user_tick); 1175 account_process_tick(p, user_tick);
1158 run_local_timers(); 1176 run_local_timers();
1159 if (rcu_pending(cpu)) 1177 rcu_check_callbacks(cpu, user_tick);
1160 rcu_check_callbacks(cpu, user_tick);
1161 printk_tick(); 1178 printk_tick();
1162 scheduler_tick(); 1179 scheduler_tick();
1163 run_posix_cpu_timers(p); 1180 run_posix_cpu_timers(p);
@@ -1170,7 +1187,7 @@ static void run_timer_softirq(struct softirq_action *h)
1170{ 1187{
1171 struct tvec_base *base = __get_cpu_var(tvec_bases); 1188 struct tvec_base *base = __get_cpu_var(tvec_bases);
1172 1189
1173 perf_counter_do_pending(); 1190 perf_event_do_pending();
1174 1191
1175 hrtimer_run_pending(); 1192 hrtimer_run_pending();
1176 1193
@@ -1523,6 +1540,7 @@ static int __cpuinit init_timers_cpu(int cpu)
1523 INIT_LIST_HEAD(base->tv1.vec + j); 1540 INIT_LIST_HEAD(base->tv1.vec + j);
1524 1541
1525 base->timer_jiffies = jiffies; 1542 base->timer_jiffies = jiffies;
1543 base->next_timer = base->timer_jiffies;
1526 return 0; 1544 return 0;
1527} 1545}
1528 1546
@@ -1535,6 +1553,9 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea
1535 timer = list_first_entry(head, struct timer_list, entry); 1553 timer = list_first_entry(head, struct timer_list, entry);
1536 detach_timer(timer, 0); 1554 detach_timer(timer, 0);
1537 timer_set_base(timer, new_base); 1555 timer_set_base(timer, new_base);
1556 if (time_before(timer->expires, new_base->next_timer) &&
1557 !tbase_get_deferrable(timer->base))
1558 new_base->next_timer = timer->expires;
1538 internal_add_timer(new_base, timer); 1559 internal_add_timer(new_base, timer);
1539 } 1560 }
1540} 1561}
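Several of the timer.c hunks above maintain the new base->next_timer cache: every non-deferrable add lowers it with time_before(), and removing the cached timer resets it to timer_jiffies so that get_next_timer_interrupt() sees the value as stale and rescans the wheel. A small standalone model of that bookkeeping; the names here are illustrative, not the kernel's.

#include <stdio.h>

/* Toy model of the next_timer cache: track the earliest pending expiry
 * and invalidate it (set it to "now") when that timer goes away, which
 * forces the next lookup to do a full wheel scan. */
struct toy_base {
        unsigned long timer_jiffies;    /* "now" for this wheel */
        unsigned long next_timer;       /* cached earliest expiry */
};

/* same wrap-safe comparison as time_before(a, b) */
static int before(unsigned long a, unsigned long b)
{
        return (long)(a - b) < 0;
}

static void toy_add(struct toy_base *b, unsigned long expires)
{
        if (before(expires, b->next_timer))
                b->next_timer = expires;
}

static void toy_del(struct toy_base *b, unsigned long expires)
{
        if (expires == b->next_timer)
                b->next_timer = b->timer_jiffies;   /* mark the cache stale */
}

int main(void)
{
        /* the cache already holds a value from a previous wheel scan */
        struct toy_base b = { .timer_jiffies = 1000, .next_timer = 2000 };

        toy_add(&b, 1500);
        printf("next=%lu\n", b.next_timer);          /* 1500 */
        toy_del(&b, 1500);
        printf("next=%lu (stale)\n", b.next_timer);  /* 1000 -> rescan */
        return 0;
}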
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 1551f47e7669..b416512ad17f 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -11,12 +11,18 @@ config NOP_TRACER
11 11
12config HAVE_FTRACE_NMI_ENTER 12config HAVE_FTRACE_NMI_ENTER
13 bool 13 bool
14 help
15 See Documentation/trace/ftrace-implementation.txt
14 16
15config HAVE_FUNCTION_TRACER 17config HAVE_FUNCTION_TRACER
16 bool 18 bool
19 help
20 See Documentation/trace/ftrace-implementation.txt
17 21
18config HAVE_FUNCTION_GRAPH_TRACER 22config HAVE_FUNCTION_GRAPH_TRACER
19 bool 23 bool
24 help
25 See Documentation/trace/ftrace-implementation.txt
20 26
21config HAVE_FUNCTION_GRAPH_FP_TEST 27config HAVE_FUNCTION_GRAPH_FP_TEST
22 bool 28 bool
@@ -28,21 +34,25 @@ config HAVE_FUNCTION_GRAPH_FP_TEST
28config HAVE_FUNCTION_TRACE_MCOUNT_TEST 34config HAVE_FUNCTION_TRACE_MCOUNT_TEST
29 bool 35 bool
30 help 36 help
31 This gets selected when the arch tests the function_trace_stop 37 See Documentation/trace/ftrace-implementation.txt
32 variable at the mcount call site. Otherwise, this variable
33 is tested by the called function.
34 38
35config HAVE_DYNAMIC_FTRACE 39config HAVE_DYNAMIC_FTRACE
36 bool 40 bool
41 help
42 See Documentation/trace/ftrace-implementation.txt
37 43
38config HAVE_FTRACE_MCOUNT_RECORD 44config HAVE_FTRACE_MCOUNT_RECORD
39 bool 45 bool
46 help
47 See Documentation/trace/ftrace-implementation.txt
40 48
41config HAVE_HW_BRANCH_TRACER 49config HAVE_HW_BRANCH_TRACER
42 bool 50 bool
43 51
44config HAVE_FTRACE_SYSCALLS 52config HAVE_SYSCALL_TRACEPOINTS
45 bool 53 bool
54 help
55 See Documentation/trace/ftrace-implementation.txt
46 56
47config TRACER_MAX_TRACE 57config TRACER_MAX_TRACE
48 bool 58 bool
@@ -60,15 +70,20 @@ config EVENT_TRACING
60 bool 70 bool
61 71
62config CONTEXT_SWITCH_TRACER 72config CONTEXT_SWITCH_TRACER
63 select MARKERS
64 bool 73 bool
65 74
75config RING_BUFFER_ALLOW_SWAP
76 bool
77 help
78 Allow the use of ring_buffer_swap_cpu.
79 Adds a very slight overhead to tracing when enabled.
80
66# All tracer options should select GENERIC_TRACER. For those options that are 81# All tracer options should select GENERIC_TRACER. For those options that are
67# enabled by all tracers (context switch and event tracer) they select TRACING. 82# enabled by all tracers (context switch and event tracer) they select TRACING.
68# This allows those options to appear when no other tracer is selected. But the 83# This allows those options to appear when no other tracer is selected. But the
69# options do not appear when something else selects it. We need the two options 84# options do not appear when something else selects it. We need the two options
70# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the 85# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the
71# hiding of the automatic options options. 86# hiding of the automatic options.
72 87
73config TRACING 88config TRACING
74 bool 89 bool
@@ -147,6 +162,7 @@ config IRQSOFF_TRACER
147 select TRACE_IRQFLAGS 162 select TRACE_IRQFLAGS
148 select GENERIC_TRACER 163 select GENERIC_TRACER
149 select TRACER_MAX_TRACE 164 select TRACER_MAX_TRACE
165 select RING_BUFFER_ALLOW_SWAP
150 help 166 help
151 This option measures the time spent in irqs-off critical 167 This option measures the time spent in irqs-off critical
152 sections, with microsecond accuracy. 168 sections, with microsecond accuracy.
@@ -168,6 +184,7 @@ config PREEMPT_TRACER
168 depends on PREEMPT 184 depends on PREEMPT
169 select GENERIC_TRACER 185 select GENERIC_TRACER
170 select TRACER_MAX_TRACE 186 select TRACER_MAX_TRACE
187 select RING_BUFFER_ALLOW_SWAP
171 help 188 help
172 This option measures the time spent in preemption off critical 189 This option measures the time spent in preemption off critical
173 sections, with microsecond accuracy. 190 sections, with microsecond accuracy.
@@ -211,7 +228,7 @@ config ENABLE_DEFAULT_TRACERS
211 228
212config FTRACE_SYSCALLS 229config FTRACE_SYSCALLS
213 bool "Trace syscalls" 230 bool "Trace syscalls"
214 depends on HAVE_FTRACE_SYSCALLS 231 depends on HAVE_SYSCALL_TRACEPOINTS
215 select GENERIC_TRACER 232 select GENERIC_TRACER
216 select KALLSYMS 233 select KALLSYMS
217 help 234 help
@@ -226,13 +243,13 @@ config BOOT_TRACER
226 the timings of the initcalls and traces key events and the identity 243 the timings of the initcalls and traces key events and the identity
227 of tasks that can cause boot delays, such as context-switches. 244 of tasks that can cause boot delays, such as context-switches.
228 245
229 Its aim is to be parsed by the /scripts/bootgraph.pl tool to 246 Its aim is to be parsed by the scripts/bootgraph.pl tool to
230 produce pretty graphics about boot inefficiencies, giving a visual 247 produce pretty graphics about boot inefficiencies, giving a visual
231 representation of the delays during initcalls - but the raw 248 representation of the delays during initcalls - but the raw
232 /debug/tracing/trace text output is readable too. 249 /debug/tracing/trace text output is readable too.
233 250
234 You must pass in ftrace=initcall to the kernel command line 251 You must pass in initcall_debug and ftrace=initcall to the kernel
235 to enable this on bootup. 252 command line to enable this on bootup.
236 253
237config TRACE_BRANCH_PROFILING 254config TRACE_BRANCH_PROFILING
238 bool 255 bool
@@ -462,6 +479,18 @@ config FTRACE_STARTUP_TEST
462 functioning properly. It will do tests on all the configured 479 functioning properly. It will do tests on all the configured
463 tracers of ftrace. 480 tracers of ftrace.
464 481
482config EVENT_TRACE_TEST_SYSCALLS
483 bool "Run selftest on syscall events"
484 depends on FTRACE_STARTUP_TEST
485 help
486 This option will also enable testing every syscall event.
 487 It simply enables each event, runs various loads with the event
 488 enabled, and then disables it again. This adds a bit more time for kernel boot
489 up since it runs this on every system call defined.
490
491 TBD - enable a way to actually call the syscalls as we test their
492 events
493
465config MMIOTRACE 494config MMIOTRACE
466 bool "Memory mapped IO tracing" 495 bool "Memory mapped IO tracing"
467 depends on HAVE_MMIOTRACE_SUPPORT && PCI 496 depends on HAVE_MMIOTRACE_SUPPORT && PCI
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 844164dca90a..26f03ac07c2b 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -42,7 +42,6 @@ obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
42obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o 42obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
43obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o 43obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
44obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o 44obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
45obj-$(CONFIG_POWER_TRACER) += trace_power.o
46obj-$(CONFIG_KMEMTRACE) += kmemtrace.o 45obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
47obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o 46obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
48obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o 47obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
@@ -54,5 +53,6 @@ obj-$(CONFIG_EVENT_TRACING) += trace_export.o
54obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o 53obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
55obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o 54obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
56obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 55obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
56obj-$(CONFIG_EVENT_TRACING) += power-traces.o
57 57
58libftrace-y := ftrace.o 58libftrace-y := ftrace.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 39af8af6fc30..3eb159c277c8 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -22,6 +22,7 @@
22#include <linux/init.h> 22#include <linux/init.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/debugfs.h> 24#include <linux/debugfs.h>
25#include <linux/smp_lock.h>
25#include <linux/time.h> 26#include <linux/time.h>
26#include <linux/uaccess.h> 27#include <linux/uaccess.h>
27 28
@@ -64,13 +65,15 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
64{ 65{
65 struct blk_io_trace *t; 66 struct blk_io_trace *t;
66 struct ring_buffer_event *event = NULL; 67 struct ring_buffer_event *event = NULL;
68 struct ring_buffer *buffer = NULL;
67 int pc = 0; 69 int pc = 0;
68 int cpu = smp_processor_id(); 70 int cpu = smp_processor_id();
69 bool blk_tracer = blk_tracer_enabled; 71 bool blk_tracer = blk_tracer_enabled;
70 72
71 if (blk_tracer) { 73 if (blk_tracer) {
74 buffer = blk_tr->buffer;
72 pc = preempt_count(); 75 pc = preempt_count();
73 event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK, 76 event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
74 sizeof(*t) + len, 77 sizeof(*t) + len,
75 0, pc); 78 0, pc);
76 if (!event) 79 if (!event)
@@ -95,7 +98,7 @@ record_it:
95 memcpy((void *) t + sizeof(*t), data, len); 98 memcpy((void *) t + sizeof(*t), data, len);
96 99
97 if (blk_tracer) 100 if (blk_tracer)
98 trace_buffer_unlock_commit(blk_tr, event, 0, pc); 101 trace_buffer_unlock_commit(buffer, event, 0, pc);
99 } 102 }
100} 103}
101 104
@@ -178,6 +181,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
178{ 181{
179 struct task_struct *tsk = current; 182 struct task_struct *tsk = current;
180 struct ring_buffer_event *event = NULL; 183 struct ring_buffer_event *event = NULL;
184 struct ring_buffer *buffer = NULL;
181 struct blk_io_trace *t; 185 struct blk_io_trace *t;
182 unsigned long flags = 0; 186 unsigned long flags = 0;
183 unsigned long *sequence; 187 unsigned long *sequence;
@@ -203,8 +207,9 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
203 if (blk_tracer) { 207 if (blk_tracer) {
204 tracing_record_cmdline(current); 208 tracing_record_cmdline(current);
205 209
210 buffer = blk_tr->buffer;
206 pc = preempt_count(); 211 pc = preempt_count();
207 event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK, 212 event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
208 sizeof(*t) + pdu_len, 213 sizeof(*t) + pdu_len,
209 0, pc); 214 0, pc);
210 if (!event) 215 if (!event)
@@ -251,7 +256,7 @@ record_it:
251 memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); 256 memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
252 257
253 if (blk_tracer) { 258 if (blk_tracer) {
254 trace_buffer_unlock_commit(blk_tr, event, 0, pc); 259 trace_buffer_unlock_commit(buffer, event, 0, pc);
255 return; 260 return;
256 } 261 }
257 } 262 }
@@ -266,8 +271,8 @@ static void blk_trace_free(struct blk_trace *bt)
266{ 271{
267 debugfs_remove(bt->msg_file); 272 debugfs_remove(bt->msg_file);
268 debugfs_remove(bt->dropped_file); 273 debugfs_remove(bt->dropped_file);
269 debugfs_remove(bt->dir);
270 relay_close(bt->rchan); 274 relay_close(bt->rchan);
275 debugfs_remove(bt->dir);
271 free_percpu(bt->sequence); 276 free_percpu(bt->sequence);
272 free_percpu(bt->msg_data); 277 free_percpu(bt->msg_data);
273 kfree(bt); 278 kfree(bt);
@@ -377,18 +382,8 @@ static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
377 382
378static int blk_remove_buf_file_callback(struct dentry *dentry) 383static int blk_remove_buf_file_callback(struct dentry *dentry)
379{ 384{
380 struct dentry *parent = dentry->d_parent;
381 debugfs_remove(dentry); 385 debugfs_remove(dentry);
382 386
383 /*
384 * this will fail for all but the last file, but that is ok. what we
385 * care about is the top level buts->name directory going away, when
386 * the last trace file is gone. Then we don't have to rmdir() that
387 * manually on trace stop, so it nicely solves the issue with
388 * force killing of running traces.
389 */
390
391 debugfs_remove(parent);
392 return 0; 387 return 0;
393} 388}
394 389
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f3716bf04df6..c71e91bf7372 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -768,7 +768,7 @@ static struct tracer_stat function_stats __initdata = {
768 .stat_show = function_stat_show 768 .stat_show = function_stat_show
769}; 769};
770 770
771static void ftrace_profile_debugfs(struct dentry *d_tracer) 771static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
772{ 772{
773 struct ftrace_profile_stat *stat; 773 struct ftrace_profile_stat *stat;
774 struct dentry *entry; 774 struct dentry *entry;
@@ -786,7 +786,6 @@ static void ftrace_profile_debugfs(struct dentry *d_tracer)
786 * The files created are permanent, if something happens 786 * The files created are permanent, if something happens
787 * we still do not free memory. 787 * we still do not free memory.
788 */ 788 */
789 kfree(stat);
790 WARN(1, 789 WARN(1,
791 "Could not allocate stat file for cpu %d\n", 790 "Could not allocate stat file for cpu %d\n",
792 cpu); 791 cpu);
@@ -813,7 +812,7 @@ static void ftrace_profile_debugfs(struct dentry *d_tracer)
813} 812}
814 813
815#else /* CONFIG_FUNCTION_PROFILER */ 814#else /* CONFIG_FUNCTION_PROFILER */
816static void ftrace_profile_debugfs(struct dentry *d_tracer) 815static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
817{ 816{
818} 817}
819#endif /* CONFIG_FUNCTION_PROFILER */ 818#endif /* CONFIG_FUNCTION_PROFILER */
@@ -1017,71 +1016,35 @@ static int
1017__ftrace_replace_code(struct dyn_ftrace *rec, int enable) 1016__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1018{ 1017{
1019 unsigned long ftrace_addr; 1018 unsigned long ftrace_addr;
1020 unsigned long ip, fl; 1019 unsigned long flag = 0UL;
1021 1020
1022 ftrace_addr = (unsigned long)FTRACE_ADDR; 1021 ftrace_addr = (unsigned long)FTRACE_ADDR;
1023 1022
1024 ip = rec->ip;
1025
1026 /* 1023 /*
1027 * If this record is not to be traced and 1024 * If this record is not to be traced or we want to disable it,
1028 * it is not enabled then do nothing. 1025 * then disable it.
1029 * 1026 *
1030 * If this record is not to be traced and 1027 * If we want to enable it and filtering is off, then enable it.
1031 * it is enabled then disable it.
1032 * 1028 *
1029 * If we want to enable it and filtering is on, enable it only if
1030 * it's filtered
1033 */ 1031 */
1034 if (rec->flags & FTRACE_FL_NOTRACE) { 1032 if (enable && !(rec->flags & FTRACE_FL_NOTRACE)) {
1035 if (rec->flags & FTRACE_FL_ENABLED) 1033 if (!ftrace_filtered || (rec->flags & FTRACE_FL_FILTER))
1036 rec->flags &= ~FTRACE_FL_ENABLED; 1034 flag = FTRACE_FL_ENABLED;
1037 else 1035 }
1038 return 0;
1039
1040 } else if (ftrace_filtered && enable) {
1041 /*
1042 * Filtering is on:
1043 */
1044
1045 fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED);
1046
1047 /* Record is filtered and enabled, do nothing */
1048 if (fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED))
1049 return 0;
1050
1051 /* Record is not filtered or enabled, do nothing */
1052 if (!fl)
1053 return 0;
1054
1055 /* Record is not filtered but enabled, disable it */
1056 if (fl == FTRACE_FL_ENABLED)
1057 rec->flags &= ~FTRACE_FL_ENABLED;
1058 else
1059 /* Otherwise record is filtered but not enabled, enable it */
1060 rec->flags |= FTRACE_FL_ENABLED;
1061 } else {
1062 /* Disable or not filtered */
1063
1064 if (enable) {
1065 /* if record is enabled, do nothing */
1066 if (rec->flags & FTRACE_FL_ENABLED)
1067 return 0;
1068
1069 rec->flags |= FTRACE_FL_ENABLED;
1070
1071 } else {
1072 1036
1073 /* if record is not enabled, do nothing */ 1037 /* If the state of this record hasn't changed, then do nothing */
1074 if (!(rec->flags & FTRACE_FL_ENABLED)) 1038 if ((rec->flags & FTRACE_FL_ENABLED) == flag)
1075 return 0; 1039 return 0;
1076 1040
1077 rec->flags &= ~FTRACE_FL_ENABLED; 1041 if (flag) {
1078 } 1042 rec->flags |= FTRACE_FL_ENABLED;
1043 return ftrace_make_call(rec, ftrace_addr);
1079 } 1044 }
1080 1045
1081 if (rec->flags & FTRACE_FL_ENABLED) 1046 rec->flags &= ~FTRACE_FL_ENABLED;
1082 return ftrace_make_call(rec, ftrace_addr); 1047 return ftrace_make_nop(NULL, rec, ftrace_addr);
1083 else
1084 return ftrace_make_nop(NULL, rec, ftrace_addr);
1085} 1048}
1086 1049
1087static void ftrace_replace_code(int enable) 1050static void ftrace_replace_code(int enable)
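The rewritten __ftrace_replace_code() replaces the old nested NOTRACE/FILTER branches with two steps: compute the desired ENABLED flag, then patch the call site only when it differs from the record's current state. A condensed userspace model of that decision, with invented flag values and names:

#include <stdio.h>

/* Condensed model of the simplified decision: work out whether the
 * record should be enabled, then act only on a state change. */
#define FL_FILTER   0x1
#define FL_ENABLED  0x2
#define FL_NOTRACE  0x4

static unsigned int desired_flag(int enable, int ftrace_filtered,
                                 unsigned int rec_flags)
{
        if (enable && !(rec_flags & FL_NOTRACE))
                if (!ftrace_filtered || (rec_flags & FL_FILTER))
                        return FL_ENABLED;
        return 0;
}

int main(void)
{
        unsigned int rec  = FL_FILTER;            /* filtered, currently a nop */
        unsigned int flag = desired_flag(1, 1, rec);

        if ((rec & FL_ENABLED) == flag)
                printf("no change\n");
        else if (flag)
                printf("ftrace_make_call()  /* enable the site */\n");
        else
                printf("ftrace_make_nop()   /* disable the site */\n");
        return 0;
}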
@@ -1360,11 +1323,10 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
1360 1323
1361enum { 1324enum {
1362 FTRACE_ITER_FILTER = (1 << 0), 1325 FTRACE_ITER_FILTER = (1 << 0),
1363 FTRACE_ITER_CONT = (1 << 1), 1326 FTRACE_ITER_NOTRACE = (1 << 1),
1364 FTRACE_ITER_NOTRACE = (1 << 2), 1327 FTRACE_ITER_FAILURES = (1 << 2),
1365 FTRACE_ITER_FAILURES = (1 << 3), 1328 FTRACE_ITER_PRINTALL = (1 << 3),
1366 FTRACE_ITER_PRINTALL = (1 << 4), 1329 FTRACE_ITER_HASH = (1 << 4),
1367 FTRACE_ITER_HASH = (1 << 5),
1368}; 1330};
1369 1331
1370#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ 1332#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
@@ -1374,9 +1336,7 @@ struct ftrace_iterator {
1374 int hidx; 1336 int hidx;
1375 int idx; 1337 int idx;
1376 unsigned flags; 1338 unsigned flags;
1377 unsigned char buffer[FTRACE_BUFF_MAX+1]; 1339 struct trace_parser parser;
1378 unsigned buffer_idx;
1379 unsigned filtered;
1380}; 1340};
1381 1341
1382static void * 1342static void *
@@ -1439,18 +1399,13 @@ static int t_hash_show(struct seq_file *m, void *v)
1439{ 1399{
1440 struct ftrace_func_probe *rec; 1400 struct ftrace_func_probe *rec;
1441 struct hlist_node *hnd = v; 1401 struct hlist_node *hnd = v;
1442 char str[KSYM_SYMBOL_LEN];
1443 1402
1444 rec = hlist_entry(hnd, struct ftrace_func_probe, node); 1403 rec = hlist_entry(hnd, struct ftrace_func_probe, node);
1445 1404
1446 if (rec->ops->print) 1405 if (rec->ops->print)
1447 return rec->ops->print(m, rec->ip, rec->ops, rec->data); 1406 return rec->ops->print(m, rec->ip, rec->ops, rec->data);
1448 1407
1449 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); 1408 seq_printf(m, "%ps:%ps", (void *)rec->ip, (void *)rec->ops->func);
1450 seq_printf(m, "%s:", str);
1451
1452 kallsyms_lookup((unsigned long)rec->ops->func, NULL, NULL, NULL, str);
1453 seq_printf(m, "%s", str);
1454 1409
1455 if (rec->data) 1410 if (rec->data)
1456 seq_printf(m, ":%p", rec->data); 1411 seq_printf(m, ":%p", rec->data);
@@ -1548,7 +1503,6 @@ static int t_show(struct seq_file *m, void *v)
1548{ 1503{
1549 struct ftrace_iterator *iter = m->private; 1504 struct ftrace_iterator *iter = m->private;
1550 struct dyn_ftrace *rec = v; 1505 struct dyn_ftrace *rec = v;
1551 char str[KSYM_SYMBOL_LEN];
1552 1506
1553 if (iter->flags & FTRACE_ITER_HASH) 1507 if (iter->flags & FTRACE_ITER_HASH)
1554 return t_hash_show(m, v); 1508 return t_hash_show(m, v);
@@ -1561,9 +1515,7 @@ static int t_show(struct seq_file *m, void *v)
1561 if (!rec) 1515 if (!rec)
1562 return 0; 1516 return 0;
1563 1517
1564 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); 1518 seq_printf(m, "%ps\n", (void *)rec->ip);
1565
1566 seq_printf(m, "%s\n", str);
1567 1519
1568 return 0; 1520 return 0;
1569} 1521}
@@ -1602,17 +1554,6 @@ ftrace_avail_open(struct inode *inode, struct file *file)
1602 return ret; 1554 return ret;
1603} 1555}
1604 1556
1605int ftrace_avail_release(struct inode *inode, struct file *file)
1606{
1607 struct seq_file *m = (struct seq_file *)file->private_data;
1608 struct ftrace_iterator *iter = m->private;
1609
1610 seq_release(inode, file);
1611 kfree(iter);
1612
1613 return 0;
1614}
1615
1616static int 1557static int
1617ftrace_failures_open(struct inode *inode, struct file *file) 1558ftrace_failures_open(struct inode *inode, struct file *file)
1618{ 1559{
@@ -1661,9 +1602,14 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
1661 if (!iter) 1602 if (!iter)
1662 return -ENOMEM; 1603 return -ENOMEM;
1663 1604
1605 if (trace_parser_get_init(&iter->parser, FTRACE_BUFF_MAX)) {
1606 kfree(iter);
1607 return -ENOMEM;
1608 }
1609
1664 mutex_lock(&ftrace_regex_lock); 1610 mutex_lock(&ftrace_regex_lock);
1665 if ((file->f_mode & FMODE_WRITE) && 1611 if ((file->f_mode & FMODE_WRITE) &&
1666 !(file->f_flags & O_APPEND)) 1612 (file->f_flags & O_TRUNC))
1667 ftrace_filter_reset(enable); 1613 ftrace_filter_reset(enable);
1668 1614
1669 if (file->f_mode & FMODE_READ) { 1615 if (file->f_mode & FMODE_READ) {
@@ -2116,9 +2062,9 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
2116 int i, len = 0; 2062 int i, len = 0;
2117 char *search; 2063 char *search;
2118 2064
2119 if (glob && (strcmp(glob, "*") || !strlen(glob))) 2065 if (glob && (strcmp(glob, "*") == 0 || !strlen(glob)))
2120 glob = NULL; 2066 glob = NULL;
2121 else { 2067 else if (glob) {
2122 int not; 2068 int not;
2123 2069
2124 type = ftrace_setup_glob(glob, strlen(glob), &search, &not); 2070 type = ftrace_setup_glob(glob, strlen(glob), &search, &not);
@@ -2253,9 +2199,8 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2253 size_t cnt, loff_t *ppos, int enable) 2199 size_t cnt, loff_t *ppos, int enable)
2254{ 2200{
2255 struct ftrace_iterator *iter; 2201 struct ftrace_iterator *iter;
2256 char ch; 2202 struct trace_parser *parser;
2257 size_t read = 0; 2203 ssize_t ret, read;
2258 ssize_t ret;
2259 2204
2260 if (!cnt || cnt < 0) 2205 if (!cnt || cnt < 0)
2261 return 0; 2206 return 0;
@@ -2268,68 +2213,23 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2268 } else 2213 } else
2269 iter = file->private_data; 2214 iter = file->private_data;
2270 2215
2271 if (!*ppos) { 2216 parser = &iter->parser;
2272 iter->flags &= ~FTRACE_ITER_CONT; 2217 read = trace_get_user(parser, ubuf, cnt, ppos);
2273 iter->buffer_idx = 0;
2274 }
2275
2276 ret = get_user(ch, ubuf++);
2277 if (ret)
2278 goto out;
2279 read++;
2280 cnt--;
2281
2282 if (!(iter->flags & ~FTRACE_ITER_CONT)) {
2283 /* skip white space */
2284 while (cnt && isspace(ch)) {
2285 ret = get_user(ch, ubuf++);
2286 if (ret)
2287 goto out;
2288 read++;
2289 cnt--;
2290 }
2291
2292 if (isspace(ch)) {
2293 file->f_pos += read;
2294 ret = read;
2295 goto out;
2296 }
2297 2218
2298 iter->buffer_idx = 0; 2219 if (trace_parser_loaded(parser) &&
2299 } 2220 !trace_parser_cont(parser)) {
2300 2221 ret = ftrace_process_regex(parser->buffer,
2301 while (cnt && !isspace(ch)) { 2222 parser->idx, enable);
2302 if (iter->buffer_idx < FTRACE_BUFF_MAX)
2303 iter->buffer[iter->buffer_idx++] = ch;
2304 else {
2305 ret = -EINVAL;
2306 goto out;
2307 }
2308 ret = get_user(ch, ubuf++);
2309 if (ret) 2223 if (ret)
2310 goto out; 2224 goto out;
2311 read++;
2312 cnt--;
2313 }
2314 2225
2315 if (isspace(ch)) { 2226 trace_parser_clear(parser);
2316 iter->filtered++; 2227 }
2317 iter->buffer[iter->buffer_idx] = 0;
2318 ret = ftrace_process_regex(iter->buffer,
2319 iter->buffer_idx, enable);
2320 if (ret)
2321 goto out;
2322 iter->buffer_idx = 0;
2323 } else
2324 iter->flags |= FTRACE_ITER_CONT;
2325
2326
2327 file->f_pos += read;
2328 2228
2329 ret = read; 2229 ret = read;
2330 out:
2331 mutex_unlock(&ftrace_regex_lock);
2332 2230
2231 mutex_unlock(&ftrace_regex_lock);
2232out:
2333 return ret; 2233 return ret;
2334} 2234}
2335 2235
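ftrace_regex_write() now delegates the byte-by-byte copying to the trace_parser helpers: trace_get_user() accumulates one whitespace-delimited token across writes, and the token is processed only once it is loaded and no longer continuing. The sketch below reproduces that flow with a toy parser; it is not the kernel's trace_parser implementation.

#include <ctype.h>
#include <stdio.h>

/* Toy stand-in for the trace_parser flow: gather one whitespace-delimited
 * token per stream of writes and report when it is complete. */
struct toy_parser {
        char buf[64];
        int  idx;
        int  cont;      /* token continues into the next write */
};

static int toy_feed(struct toy_parser *p, const char *data, int len)
{
        int i = 0;

        if (!p->cont)
                p->idx = 0;
        while (i < len && isspace((unsigned char)data[i]))
                i++;                                    /* skip leading space */
        while (i < len && !isspace((unsigned char)data[i]) &&
               p->idx < (int)sizeof(p->buf) - 1)
                p->buf[p->idx++] = data[i++];
        p->cont = (i == len);                           /* no delimiter yet */
        p->buf[p->idx] = '\0';
        return i;                                       /* bytes consumed */
}

int main(void)
{
        struct toy_parser p = { .cont = 0 };

        toy_feed(&p, "sched_swi", 9);          /* partial write: keep going */
        toy_feed(&p, "tch ", 4);               /* trailing space ends the token */
        if (p.idx && !p.cont)                  /* "loaded" and not continuing */
                printf("process regex: %s\n", p.buf);
        return 0;
}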
@@ -2434,6 +2334,7 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
2434{ 2334{
2435 struct seq_file *m = (struct seq_file *)file->private_data; 2335 struct seq_file *m = (struct seq_file *)file->private_data;
2436 struct ftrace_iterator *iter; 2336 struct ftrace_iterator *iter;
2337 struct trace_parser *parser;
2437 2338
2438 mutex_lock(&ftrace_regex_lock); 2339 mutex_lock(&ftrace_regex_lock);
2439 if (file->f_mode & FMODE_READ) { 2340 if (file->f_mode & FMODE_READ) {
@@ -2443,10 +2344,10 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
2443 } else 2344 } else
2444 iter = file->private_data; 2345 iter = file->private_data;
2445 2346
2446 if (iter->buffer_idx) { 2347 parser = &iter->parser;
2447 iter->filtered++; 2348 if (trace_parser_loaded(parser)) {
2448 iter->buffer[iter->buffer_idx] = 0; 2349 parser->buffer[parser->idx] = 0;
2449 ftrace_match_records(iter->buffer, iter->buffer_idx, enable); 2350 ftrace_match_records(parser->buffer, parser->idx, enable);
2450 } 2351 }
2451 2352
2452 mutex_lock(&ftrace_lock); 2353 mutex_lock(&ftrace_lock);
@@ -2454,7 +2355,9 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
2454 ftrace_run_update_code(FTRACE_ENABLE_CALLS); 2355 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
2455 mutex_unlock(&ftrace_lock); 2356 mutex_unlock(&ftrace_lock);
2456 2357
2358 trace_parser_put(parser);
2457 kfree(iter); 2359 kfree(iter);
2360
2458 mutex_unlock(&ftrace_regex_lock); 2361 mutex_unlock(&ftrace_regex_lock);
2459 return 0; 2362 return 0;
2460} 2363}
@@ -2475,14 +2378,14 @@ static const struct file_operations ftrace_avail_fops = {
2475 .open = ftrace_avail_open, 2378 .open = ftrace_avail_open,
2476 .read = seq_read, 2379 .read = seq_read,
2477 .llseek = seq_lseek, 2380 .llseek = seq_lseek,
2478 .release = ftrace_avail_release, 2381 .release = seq_release_private,
2479}; 2382};
2480 2383
2481static const struct file_operations ftrace_failures_fops = { 2384static const struct file_operations ftrace_failures_fops = {
2482 .open = ftrace_failures_open, 2385 .open = ftrace_failures_open,
2483 .read = seq_read, 2386 .read = seq_read,
2484 .llseek = seq_lseek, 2387 .llseek = seq_lseek,
2485 .release = ftrace_avail_release, 2388 .release = seq_release_private,
2486}; 2389};
2487 2390
2488static const struct file_operations ftrace_filter_fops = { 2391static const struct file_operations ftrace_filter_fops = {
@@ -2511,11 +2414,9 @@ unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
2511static void * 2414static void *
2512__g_next(struct seq_file *m, loff_t *pos) 2415__g_next(struct seq_file *m, loff_t *pos)
2513{ 2416{
2514 unsigned long *array = m->private;
2515
2516 if (*pos >= ftrace_graph_count) 2417 if (*pos >= ftrace_graph_count)
2517 return NULL; 2418 return NULL;
2518 return &array[*pos]; 2419 return &ftrace_graph_funcs[*pos];
2519} 2420}
2520 2421
2521static void * 2422static void *
@@ -2544,7 +2445,6 @@ static void g_stop(struct seq_file *m, void *p)
2544static int g_show(struct seq_file *m, void *v) 2445static int g_show(struct seq_file *m, void *v)
2545{ 2446{
2546 unsigned long *ptr = v; 2447 unsigned long *ptr = v;
2547 char str[KSYM_SYMBOL_LEN];
2548 2448
2549 if (!ptr) 2449 if (!ptr)
2550 return 0; 2450 return 0;
@@ -2554,9 +2454,7 @@ static int g_show(struct seq_file *m, void *v)
2554 return 0; 2454 return 0;
2555 } 2455 }
2556 2456
2557 kallsyms_lookup(*ptr, NULL, NULL, NULL, str); 2457 seq_printf(m, "%ps\n", (void *)*ptr);
2558
2559 seq_printf(m, "%s\n", str);
2560 2458
2561 return 0; 2459 return 0;
2562} 2460}
@@ -2578,25 +2476,27 @@ ftrace_graph_open(struct inode *inode, struct file *file)
2578 2476
2579 mutex_lock(&graph_lock); 2477 mutex_lock(&graph_lock);
2580 if ((file->f_mode & FMODE_WRITE) && 2478 if ((file->f_mode & FMODE_WRITE) &&
2581 !(file->f_flags & O_APPEND)) { 2479 (file->f_flags & O_TRUNC)) {
2582 ftrace_graph_count = 0; 2480 ftrace_graph_count = 0;
2583 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); 2481 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
2584 } 2482 }
2483 mutex_unlock(&graph_lock);
2585 2484
2586 if (file->f_mode & FMODE_READ) { 2485 if (file->f_mode & FMODE_READ)
2587 ret = seq_open(file, &ftrace_graph_seq_ops); 2486 ret = seq_open(file, &ftrace_graph_seq_ops);
2588 if (!ret) {
2589 struct seq_file *m = file->private_data;
2590 m->private = ftrace_graph_funcs;
2591 }
2592 } else
2593 file->private_data = ftrace_graph_funcs;
2594 mutex_unlock(&graph_lock);
2595 2487
2596 return ret; 2488 return ret;
2597} 2489}
2598 2490
2599static int 2491static int
2492ftrace_graph_release(struct inode *inode, struct file *file)
2493{
2494 if (file->f_mode & FMODE_READ)
2495 seq_release(inode, file);
2496 return 0;
2497}
2498
2499static int
2600ftrace_set_func(unsigned long *array, int *idx, char *buffer) 2500ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2601{ 2501{
2602 struct dyn_ftrace *rec; 2502 struct dyn_ftrace *rec;
@@ -2651,12 +2551,9 @@ static ssize_t
2651ftrace_graph_write(struct file *file, const char __user *ubuf, 2551ftrace_graph_write(struct file *file, const char __user *ubuf,
2652 size_t cnt, loff_t *ppos) 2552 size_t cnt, loff_t *ppos)
2653{ 2553{
2654 unsigned char buffer[FTRACE_BUFF_MAX+1]; 2554 struct trace_parser parser;
2655 unsigned long *array;
2656 size_t read = 0; 2555 size_t read = 0;
2657 ssize_t ret; 2556 ssize_t ret;
2658 int index = 0;
2659 char ch;
2660 2557
2661 if (!cnt || cnt < 0) 2558 if (!cnt || cnt < 0)
2662 return 0; 2559 return 0;
@@ -2668,66 +2565,36 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
2668 goto out; 2565 goto out;
2669 } 2566 }
2670 2567
2671 if (file->f_mode & FMODE_READ) { 2568 if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) {
2672 struct seq_file *m = file->private_data; 2569 ret = -ENOMEM;
2673 array = m->private;
2674 } else
2675 array = file->private_data;
2676
2677 ret = get_user(ch, ubuf++);
2678 if (ret)
2679 goto out; 2570 goto out;
2680 read++;
2681 cnt--;
2682
2683 /* skip white space */
2684 while (cnt && isspace(ch)) {
2685 ret = get_user(ch, ubuf++);
2686 if (ret)
2687 goto out;
2688 read++;
2689 cnt--;
2690 } 2571 }
2691 2572
2692 if (isspace(ch)) { 2573 read = trace_get_user(&parser, ubuf, cnt, ppos);
2693 *ppos += read;
2694 ret = read;
2695 goto out;
2696 }
2697 2574
2698 while (cnt && !isspace(ch)) { 2575 if (trace_parser_loaded((&parser))) {
2699 if (index < FTRACE_BUFF_MAX) 2576 parser.buffer[parser.idx] = 0;
2700 buffer[index++] = ch; 2577
2701 else { 2578 /* we allow only one expression at a time */
2702 ret = -EINVAL; 2579 ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count,
2703 goto out; 2580 parser.buffer);
2704 }
2705 ret = get_user(ch, ubuf++);
2706 if (ret) 2581 if (ret)
2707 goto out; 2582 goto out;
2708 read++;
2709 cnt--;
2710 } 2583 }
2711 buffer[index] = 0;
2712
2713 /* we allow only one expression at a time */
2714 ret = ftrace_set_func(array, &ftrace_graph_count, buffer);
2715 if (ret)
2716 goto out;
2717
2718 file->f_pos += read;
2719 2584
2720 ret = read; 2585 ret = read;
2721 out: 2586 out:
2587 trace_parser_put(&parser);
2722 mutex_unlock(&graph_lock); 2588 mutex_unlock(&graph_lock);
2723 2589
2724 return ret; 2590 return ret;
2725} 2591}
2726 2592
2727static const struct file_operations ftrace_graph_fops = { 2593static const struct file_operations ftrace_graph_fops = {
2728 .open = ftrace_graph_open, 2594 .open = ftrace_graph_open,
2729 .read = seq_read, 2595 .read = seq_read,
2730 .write = ftrace_graph_write, 2596 .write = ftrace_graph_write,
2597 .release = ftrace_graph_release,
2731}; 2598};
2732#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 2599#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
2733 2600
@@ -3160,10 +3027,10 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
3160 3027
3161 ret = proc_dointvec(table, write, file, buffer, lenp, ppos); 3028 ret = proc_dointvec(table, write, file, buffer, lenp, ppos);
3162 3029
3163 if (ret || !write || (last_ftrace_enabled == ftrace_enabled)) 3030 if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled))
3164 goto out; 3031 goto out;
3165 3032
3166 last_ftrace_enabled = ftrace_enabled; 3033 last_ftrace_enabled = !!ftrace_enabled;
3167 3034
3168 if (ftrace_enabled) { 3035 if (ftrace_enabled) {
3169 3036
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index 1edaa9516e81..81b1645c8549 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -183,11 +183,9 @@ static void kmemtrace_stop_probes(void)
183 183
184static int kmem_trace_init(struct trace_array *tr) 184static int kmem_trace_init(struct trace_array *tr)
185{ 185{
186 int cpu;
187 kmemtrace_array = tr; 186 kmemtrace_array = tr;
188 187
189 for_each_cpu(cpu, cpu_possible_mask) 188 tracing_reset_online_cpus(tr);
190 tracing_reset(tr, cpu);
191 189
192 kmemtrace_start_probes(); 190 kmemtrace_start_probes();
193 191
@@ -239,12 +237,52 @@ struct kmemtrace_user_event_alloc {
239}; 237};
240 238
241static enum print_line_t 239static enum print_line_t
242kmemtrace_print_alloc_user(struct trace_iterator *iter, 240kmemtrace_print_alloc(struct trace_iterator *iter, int flags)
243 struct kmemtrace_alloc_entry *entry)
244{ 241{
245 struct kmemtrace_user_event_alloc *ev_alloc;
246 struct trace_seq *s = &iter->seq; 242 struct trace_seq *s = &iter->seq;
243 struct kmemtrace_alloc_entry *entry;
244 int ret;
245
246 trace_assign_type(entry, iter->ent);
247
248 ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu "
249 "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n",
250 entry->type_id, (void *)entry->call_site, (unsigned long)entry->ptr,
251 (unsigned long)entry->bytes_req, (unsigned long)entry->bytes_alloc,
252 (unsigned long)entry->gfp_flags, entry->node);
253
254 if (!ret)
255 return TRACE_TYPE_PARTIAL_LINE;
256 return TRACE_TYPE_HANDLED;
257}
258
259static enum print_line_t
260kmemtrace_print_free(struct trace_iterator *iter, int flags)
261{
262 struct trace_seq *s = &iter->seq;
263 struct kmemtrace_free_entry *entry;
264 int ret;
265
266 trace_assign_type(entry, iter->ent);
267
268 ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu\n",
269 entry->type_id, (void *)entry->call_site,
270 (unsigned long)entry->ptr);
271
272 if (!ret)
273 return TRACE_TYPE_PARTIAL_LINE;
274 return TRACE_TYPE_HANDLED;
275}
276
277static enum print_line_t
278kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags)
279{
280 struct trace_seq *s = &iter->seq;
281 struct kmemtrace_alloc_entry *entry;
247 struct kmemtrace_user_event *ev; 282 struct kmemtrace_user_event *ev;
283 struct kmemtrace_user_event_alloc *ev_alloc;
284
285 trace_assign_type(entry, iter->ent);
248 286
249 ev = trace_seq_reserve(s, sizeof(*ev)); 287 ev = trace_seq_reserve(s, sizeof(*ev));
250 if (!ev) 288 if (!ev)
@@ -271,12 +309,14 @@ kmemtrace_print_alloc_user(struct trace_iterator *iter,
271} 309}
272 310
273static enum print_line_t 311static enum print_line_t
274kmemtrace_print_free_user(struct trace_iterator *iter, 312kmemtrace_print_free_user(struct trace_iterator *iter, int flags)
275 struct kmemtrace_free_entry *entry)
276{ 313{
277 struct trace_seq *s = &iter->seq; 314 struct trace_seq *s = &iter->seq;
315 struct kmemtrace_free_entry *entry;
278 struct kmemtrace_user_event *ev; 316 struct kmemtrace_user_event *ev;
279 317
318 trace_assign_type(entry, iter->ent);
319
280 ev = trace_seq_reserve(s, sizeof(*ev)); 320 ev = trace_seq_reserve(s, sizeof(*ev));
281 if (!ev) 321 if (!ev)
282 return TRACE_TYPE_PARTIAL_LINE; 322 return TRACE_TYPE_PARTIAL_LINE;
@@ -294,12 +334,14 @@ kmemtrace_print_free_user(struct trace_iterator *iter,
294 334
295/* The two other following provide a more minimalistic output */ 335/* The two other following provide a more minimalistic output */
296static enum print_line_t 336static enum print_line_t
297kmemtrace_print_alloc_compress(struct trace_iterator *iter, 337kmemtrace_print_alloc_compress(struct trace_iterator *iter)
298 struct kmemtrace_alloc_entry *entry)
299{ 338{
339 struct kmemtrace_alloc_entry *entry;
300 struct trace_seq *s = &iter->seq; 340 struct trace_seq *s = &iter->seq;
301 int ret; 341 int ret;
302 342
343 trace_assign_type(entry, iter->ent);
344
303 /* Alloc entry */ 345 /* Alloc entry */
304 ret = trace_seq_printf(s, " + "); 346 ret = trace_seq_printf(s, " + ");
305 if (!ret) 347 if (!ret)
@@ -345,29 +387,24 @@ kmemtrace_print_alloc_compress(struct trace_iterator *iter,
345 if (!ret) 387 if (!ret)
346 return TRACE_TYPE_PARTIAL_LINE; 388 return TRACE_TYPE_PARTIAL_LINE;
347 389
 348 /* Node */ 390 /* Node and call site */
349 ret = trace_seq_printf(s, "%4d ", entry->node); 391 ret = trace_seq_printf(s, "%4d %pf\n", entry->node,
350 if (!ret) 392 (void *)entry->call_site);
351 return TRACE_TYPE_PARTIAL_LINE;
352
353 /* Call site */
354 ret = seq_print_ip_sym(s, entry->call_site, 0);
355 if (!ret) 393 if (!ret)
356 return TRACE_TYPE_PARTIAL_LINE; 394 return TRACE_TYPE_PARTIAL_LINE;
357 395
358 if (!trace_seq_printf(s, "\n"))
359 return TRACE_TYPE_PARTIAL_LINE;
360
361 return TRACE_TYPE_HANDLED; 396 return TRACE_TYPE_HANDLED;
362} 397}
363 398
364static enum print_line_t 399static enum print_line_t
365kmemtrace_print_free_compress(struct trace_iterator *iter, 400kmemtrace_print_free_compress(struct trace_iterator *iter)
366 struct kmemtrace_free_entry *entry)
367{ 401{
402 struct kmemtrace_free_entry *entry;
368 struct trace_seq *s = &iter->seq; 403 struct trace_seq *s = &iter->seq;
369 int ret; 404 int ret;
370 405
406 trace_assign_type(entry, iter->ent);
407
371 /* Free entry */ 408 /* Free entry */
372 ret = trace_seq_printf(s, " - "); 409 ret = trace_seq_printf(s, " - ");
373 if (!ret) 410 if (!ret)
@@ -401,19 +438,11 @@ kmemtrace_print_free_compress(struct trace_iterator *iter,
401 if (!ret) 438 if (!ret)
402 return TRACE_TYPE_PARTIAL_LINE; 439 return TRACE_TYPE_PARTIAL_LINE;
403 440
 404 /* Skip node */ 441 /* Skip node and print call site */
405 ret = trace_seq_printf(s, " "); 442 ret = trace_seq_printf(s, " %pf\n", (void *)entry->call_site);
406 if (!ret) 443 if (!ret)
407 return TRACE_TYPE_PARTIAL_LINE; 444 return TRACE_TYPE_PARTIAL_LINE;
408 445
409 /* Call site */
410 ret = seq_print_ip_sym(s, entry->call_site, 0);
411 if (!ret)
412 return TRACE_TYPE_PARTIAL_LINE;
413
414 if (!trace_seq_printf(s, "\n"))
415 return TRACE_TYPE_PARTIAL_LINE;
416
417 return TRACE_TYPE_HANDLED; 446 return TRACE_TYPE_HANDLED;
418} 447}
419 448
@@ -421,32 +450,31 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
421{ 450{
422 struct trace_entry *entry = iter->ent; 451 struct trace_entry *entry = iter->ent;
423 452
424 switch (entry->type) { 453 if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL))
425 case TRACE_KMEM_ALLOC: { 454 return TRACE_TYPE_UNHANDLED;
426 struct kmemtrace_alloc_entry *field;
427
428 trace_assign_type(field, entry);
429 if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
430 return kmemtrace_print_alloc_compress(iter, field);
431 else
432 return kmemtrace_print_alloc_user(iter, field);
433 }
434
435 case TRACE_KMEM_FREE: {
436 struct kmemtrace_free_entry *field;
437
438 trace_assign_type(field, entry);
439 if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
440 return kmemtrace_print_free_compress(iter, field);
441 else
442 return kmemtrace_print_free_user(iter, field);
443 }
444 455
456 switch (entry->type) {
457 case TRACE_KMEM_ALLOC:
458 return kmemtrace_print_alloc_compress(iter);
459 case TRACE_KMEM_FREE:
460 return kmemtrace_print_free_compress(iter);
445 default: 461 default:
446 return TRACE_TYPE_UNHANDLED; 462 return TRACE_TYPE_UNHANDLED;
447 } 463 }
448} 464}
449 465
466static struct trace_event kmem_trace_alloc = {
467 .type = TRACE_KMEM_ALLOC,
468 .trace = kmemtrace_print_alloc,
469 .binary = kmemtrace_print_alloc_user,
470};
471
472static struct trace_event kmem_trace_free = {
473 .type = TRACE_KMEM_FREE,
474 .trace = kmemtrace_print_free,
475 .binary = kmemtrace_print_free_user,
476};
477
450static struct tracer kmem_tracer __read_mostly = { 478static struct tracer kmem_tracer __read_mostly = {
451 .name = "kmemtrace", 479 .name = "kmemtrace",
452 .init = kmem_trace_init, 480 .init = kmem_trace_init,
@@ -463,6 +491,21 @@ void kmemtrace_init(void)
463 491
464static int __init init_kmem_tracer(void) 492static int __init init_kmem_tracer(void)
465{ 493{
466 return register_tracer(&kmem_tracer); 494 if (!register_ftrace_event(&kmem_trace_alloc)) {
495 pr_warning("Warning: could not register kmem events\n");
496 return 1;
497 }
498
499 if (!register_ftrace_event(&kmem_trace_free)) {
500 pr_warning("Warning: could not register kmem events\n");
501 return 1;
502 }
503
504 if (!register_tracer(&kmem_tracer)) {
505 pr_warning("Warning: could not register the kmem tracer\n");
506 return 1;
507 }
508
509 return 0;
467} 510}
468device_initcall(init_kmem_tracer); 511device_initcall(init_kmem_tracer);
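init_kmem_tracer() now registers one struct trace_event per entry type, each carrying its own text (.trace) and binary (.binary) printer, instead of switching on the entry type inside a single print routine. Below is a rough userspace model of that dispatch-table pattern; the types and registration function are made up for illustration and are not the ftrace API.

#include <stdio.h>

/* Sketch of per-event-type dispatch: each event registers its own
 * text and binary printers. */
enum { EV_ALLOC, EV_FREE, EV_MAX };

struct toy_event {
        int type;
        int (*text)(void);
        int (*binary)(void);
};

static int alloc_text(void) { return printf("alloc (text)\n"); }
static int alloc_bin(void)  { return printf("alloc (binary)\n"); }
static int free_text(void)  { return printf("free (text)\n"); }
static int free_bin(void)   { return printf("free (binary)\n"); }

static struct toy_event *events[EV_MAX];

static int register_event(struct toy_event *ev)
{
        if (ev->type < 0 || ev->type >= EV_MAX || events[ev->type])
                return 0;               /* mirror the "0 means failure" style */
        events[ev->type] = ev;
        return ev->type + 1;
}

int main(void)
{
        static struct toy_event ev_alloc = { EV_ALLOC, alloc_text, alloc_bin };
        static struct toy_event ev_free  = { EV_FREE,  free_text,  free_bin  };

        if (!register_event(&ev_alloc) || !register_event(&ev_free))
                fprintf(stderr, "could not register kmem events\n");

        events[EV_ALLOC]->text();       /* what the tracer calls per entry */
        events[EV_FREE]->binary();
        return 0;
}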
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
new file mode 100644
index 000000000000..e06c6e3d56a3
--- /dev/null
+++ b/kernel/trace/power-traces.c
@@ -0,0 +1,20 @@
1/*
2 * Power trace points
3 *
4 * Copyright (C) 2009 Arjan van de Ven <arjan@linux.intel.com>
5 */
6
7#include <linux/string.h>
8#include <linux/types.h>
9#include <linux/workqueue.h>
10#include <linux/sched.h>
11#include <linux/module.h>
12#include <linux/slab.h>
13
14#define CREATE_TRACE_POINTS
15#include <trace/events/power.h>
16
17EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
18EXPORT_TRACEPOINT_SYMBOL_GPL(power_end);
19EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency);
20
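power-traces.c is the single translation unit that instantiates the power tracepoints (CREATE_TRACE_POINTS before including trace/events/power.h) and exports them so cpufreq/cpuidle code can fire them. As a very loose userspace analogy, a tracepoint is a named hook that stays a cheap no-op until a probe is attached; the sketch below is only that analogy, not the TRACE_EVENT machinery.

#include <stdio.h>

/* Rough analogy for a tracepoint: a named hook that does nothing unless
 * a probe has been registered. Names here are invented for the example. */
typedef void (*probe_fn)(unsigned int type, unsigned long state);

static probe_fn power_probe;                    /* NULL = tracing disabled */

static void trace_power_start(unsigned int type, unsigned long state)
{
        if (power_probe)                        /* cheap check on the fast path */
                power_probe(type, state);
}

static void print_probe(unsigned int type, unsigned long state)
{
        printf("power_start: type=%u state=%lu\n", type, state);
}

int main(void)
{
        trace_power_start(1, 3);                /* no probe attached: silent */
        power_probe = print_probe;              /* "register" a tracer */
        trace_power_start(1, 3);
        return 0;
}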
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index bf27bb7a63e2..d4ff01970547 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -201,8 +201,6 @@ int tracing_is_on(void)
201} 201}
202EXPORT_SYMBOL_GPL(tracing_is_on); 202EXPORT_SYMBOL_GPL(tracing_is_on);
203 203
204#include "trace.h"
205
206#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) 204#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
207#define RB_ALIGNMENT 4U 205#define RB_ALIGNMENT 4U
208#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) 206#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
@@ -218,17 +216,12 @@ enum {
218 216
219static inline int rb_null_event(struct ring_buffer_event *event) 217static inline int rb_null_event(struct ring_buffer_event *event)
220{ 218{
221 return event->type_len == RINGBUF_TYPE_PADDING 219 return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
222 && event->time_delta == 0;
223}
224
225static inline int rb_discarded_event(struct ring_buffer_event *event)
226{
227 return event->type_len == RINGBUF_TYPE_PADDING && event->time_delta;
228} 220}
229 221
230static void rb_event_set_padding(struct ring_buffer_event *event) 222static void rb_event_set_padding(struct ring_buffer_event *event)
231{ 223{
224 /* padding has a NULL time_delta */
232 event->type_len = RINGBUF_TYPE_PADDING; 225 event->type_len = RINGBUF_TYPE_PADDING;
233 event->time_delta = 0; 226 event->time_delta = 0;
234} 227}
@@ -322,6 +315,14 @@ struct buffer_data_page {
322 unsigned char data[]; /* data of buffer page */ 315 unsigned char data[]; /* data of buffer page */
323}; 316};
324 317
318/*
319 * Note, the buffer_page list must be first. The buffer pages
320 * are allocated in cache lines, which means that each buffer
321 * page will be at the beginning of a cache line, and thus
322 * the least significant bits will be zero. We use this to
323 * add flags in the list struct pointers, to make the ring buffer
324 * lockless.
325 */
325struct buffer_page { 326struct buffer_page {
326 struct list_head list; /* list of buffer pages */ 327 struct list_head list; /* list of buffer pages */
327 local_t write; /* index for next write */ 328 local_t write; /* index for next write */
@@ -330,6 +331,21 @@ struct buffer_page {
330 struct buffer_data_page *page; /* Actual data page */ 331 struct buffer_data_page *page; /* Actual data page */
331}; 332};
332 333
334/*
335 * The buffer page counters, write and entries, must be reset
336 * atomically when crossing page boundaries. To synchronize this
337 * update, two counters are inserted into the number. One is
338 * the actual counter for the write position or count on the page.
339 *
340 * The other is a counter of updaters. Before an update happens
341 * the update partition of the counter is incremented. This will
342 * allow the updater to update the counter atomically.
343 *
344 * The counter is 20 bits, and the state data is 12.
345 */
346#define RB_WRITE_MASK 0xfffff
347#define RB_WRITE_INTCNT (1 << 20)
348
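The comment above describes packing two values into one word: the low 20 bits (RB_WRITE_MASK) hold the write index, while everything from RB_WRITE_INTCNT upward counts nested updaters so concurrent writers can detect each other. A toy illustration of the arithmetic, standalone and not the ring buffer code itself:

#include <stdio.h>

/* Low 20 bits: write position. Bits 20 and up: number of nested updaters. */
#define WRITE_MASK   0xfffffUL
#define WRITE_INTCNT (1UL << 20)

int main(void)
{
        unsigned long val = 123;                 /* current write index */

        val += WRITE_INTCNT;                     /* an updater announces itself */
        val = (val & ~WRITE_MASK) | 456;         /* it then moves the index */

        printf("index    = %lu\n", val & WRITE_MASK);   /* 456 */
        printf("updaters = %lu\n", val >> 20);          /* 1 */
        return 0;
}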
333static void rb_init_page(struct buffer_data_page *bpage) 349static void rb_init_page(struct buffer_data_page *bpage)
334{ 350{
335 local_set(&bpage->commit, 0); 351 local_set(&bpage->commit, 0);
@@ -403,21 +419,20 @@ int ring_buffer_print_page_header(struct trace_seq *s)
403struct ring_buffer_per_cpu { 419struct ring_buffer_per_cpu {
404 int cpu; 420 int cpu;
405 struct ring_buffer *buffer; 421 struct ring_buffer *buffer;
406 spinlock_t reader_lock; /* serialize readers */ 422 spinlock_t reader_lock; /* serialize readers */
407 raw_spinlock_t lock; 423 raw_spinlock_t lock;
408 struct lock_class_key lock_key; 424 struct lock_class_key lock_key;
409 struct list_head pages; 425 struct list_head *pages;
410 struct buffer_page *head_page; /* read from head */ 426 struct buffer_page *head_page; /* read from head */
411 struct buffer_page *tail_page; /* write to tail */ 427 struct buffer_page *tail_page; /* write to tail */
412 struct buffer_page *commit_page; /* committed pages */ 428 struct buffer_page *commit_page; /* committed pages */
413 struct buffer_page *reader_page; 429 struct buffer_page *reader_page;
414 unsigned long nmi_dropped; 430 local_t commit_overrun;
415 unsigned long commit_overrun; 431 local_t overrun;
416 unsigned long overrun;
417 unsigned long read;
418 local_t entries; 432 local_t entries;
419 local_t committing; 433 local_t committing;
420 local_t commits; 434 local_t commits;
435 unsigned long read;
421 u64 write_stamp; 436 u64 write_stamp;
422 u64 read_stamp; 437 u64 read_stamp;
423 atomic_t record_disabled; 438 atomic_t record_disabled;
@@ -450,14 +465,19 @@ struct ring_buffer_iter {
450}; 465};
451 466
452/* buffer may be either ring_buffer or ring_buffer_per_cpu */ 467/* buffer may be either ring_buffer or ring_buffer_per_cpu */
453#define RB_WARN_ON(buffer, cond) \ 468#define RB_WARN_ON(b, cond) \
454 ({ \ 469 ({ \
455 int _____ret = unlikely(cond); \ 470 int _____ret = unlikely(cond); \
456 if (_____ret) { \ 471 if (_____ret) { \
457 atomic_inc(&buffer->record_disabled); \ 472 if (__same_type(*(b), struct ring_buffer_per_cpu)) { \
458 WARN_ON(1); \ 473 struct ring_buffer_per_cpu *__b = \
459 } \ 474 (void *)b; \
460 _____ret; \ 475 atomic_inc(&__b->buffer->record_disabled); \
476 } else \
477 atomic_inc(&b->record_disabled); \
478 WARN_ON(1); \
479 } \
480 _____ret; \
461 }) 481 })
462 482
463/* Up this if you want to test the TIME_EXTENTS and normalization */ 483/* Up this if you want to test the TIME_EXTENTS and normalization */
@@ -489,6 +509,390 @@ void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer,
489} 509}
490EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); 510EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
491 511
512/*
513 * Making the ring buffer lockless makes things tricky.
 514 * Writes only happen on the CPU that they are on, so they
 515 * only need to worry about interrupts. Reads can
516 * happen on any CPU.
517 *
518 * The reader page is always off the ring buffer, but when the
519 * reader finishes with a page, it needs to swap its page with
520 * a new one from the buffer. The reader needs to take from
521 * the head (writes go to the tail). But if a writer is in overwrite
522 * mode and wraps, it must push the head page forward.
523 *
524 * Here lies the problem.
525 *
526 * The reader must be careful to replace only the head page, and
527 * not another one. As described at the top of the file in the
528 * ASCII art, the reader sets its old page to point to the next
529 * page after head. It then sets the page after head to point to
530 * the old reader page. But if the writer moves the head page
531 * during this operation, the reader could end up with the tail.
532 *
533 * We use cmpxchg to help prevent this race. We also do something
534 * special with the page before head. We set the LSB to 1.
535 *
536 * When the writer must push the page forward, it will clear the
537 * bit that points to the head page, move the head, and then set
538 * the bit that points to the new head page.
539 *
540 * We also don't want an interrupt coming in and moving the head
541 * page on another writer. Thus we use the second LSB to catch
542 * that too. Thus:
543 *
 544 * head->list->prev->next      bit 1    bit 0
 545 *                            -------  -------
 546 * Normal page                   0        0
 547 * Points to head page           0        1
 548 * New head page                 1        0
549 *
550 * Note we can not trust the prev pointer of the head page, because:
551 *
 552 *    +----+       +-----+        +-----+
 553 *    |    |------>|  T  |---X--->|  N  |
 554 *    |    |<------|     |        |     |
 555 *    +----+       +-----+        +-----+
 556 *      ^                           ^ |
 557 *      |          +-----+          | |
 558 *      +----------|  R  |----------+ |
 559 *                 |     |<-----------+
 560 *                 +-----+
561 *
562 * Key: ---X--> HEAD flag set in pointer
563 * T Tail page
564 * R Reader page
565 * N Next page
566 *
567 * (see __rb_reserve_next() to see where this happens)
568 *
569 * What the above shows is that the reader just swapped out
570 * the reader page with a page in the buffer, but before it
571 * could make the new header point back to the new page added
572 * it was preempted by a writer. The writer moved forward onto
573 * the new page added by the reader and is about to move forward
574 * again.
575 *
576 * You can see, it is legitimate for the previous pointer of
577 * the head (or any page) not to point back to itself. But only
 578 * temporarily.
579 */
580
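The table above boils down to two flag bits packed into the low bits of an aligned list pointer. A minimal userspace sketch of that encoding (masks mirror the #defines that follow; pointer alignment is assumed to leave the low two bits free, as it does for the kernel's list_head):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PG_NORMAL 0UL
#define PG_HEAD   1UL
#define PG_UPDATE 2UL
#define PG_MASK   3UL

struct node { struct node *next; };

/* strip the flag bits, as rb_list_head() does */
static struct node *strip(struct node *p)
{
        return (struct node *)((uintptr_t)p & ~PG_MASK);
}

static unsigned long flags(struct node *p)
{
        return (uintptr_t)p & PG_MASK;
}

int main(void)
{
        struct node head, prev;

        /* prev->next points at head and is tagged as the head page */
        prev.next = (struct node *)((uintptr_t)&head | PG_HEAD);

        assert(strip(prev.next) == &head);
        assert(flags(prev.next) == PG_HEAD);
        printf("flags=%lu\n", flags(prev.next));        /* 1 */
        return 0;
}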
581#define RB_PAGE_NORMAL 0UL
582#define RB_PAGE_HEAD 1UL
583#define RB_PAGE_UPDATE 2UL
584
585
586#define RB_FLAG_MASK 3UL
587
588/* PAGE_MOVED is not part of the mask */
589#define RB_PAGE_MOVED 4UL
590
591/*
592 * rb_list_head - remove any bit
593 */
594static struct list_head *rb_list_head(struct list_head *list)
595{
596 unsigned long val = (unsigned long)list;
597
598 return (struct list_head *)(val & ~RB_FLAG_MASK);
599}
600
601/*
 603 * rb_is_head_page - test if the given page is the head page
603 *
604 * Because the reader may move the head_page pointer, we can
605 * not trust what the head page is (it may be pointing to
606 * the reader page). But if the next page is a header page,
607 * its flags will be non zero.
608 */
 609static inline int
610rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer,
611 struct buffer_page *page, struct list_head *list)
612{
613 unsigned long val;
614
615 val = (unsigned long)list->next;
616
617 if ((val & ~RB_FLAG_MASK) != (unsigned long)&page->list)
618 return RB_PAGE_MOVED;
619
620 return val & RB_FLAG_MASK;
621}
622
623/*
624 * rb_is_reader_page
625 *
 626 * The unique thing about the reader page is that, if the
627 * writer is ever on it, the previous pointer never points
628 * back to the reader page.
629 */
630static int rb_is_reader_page(struct buffer_page *page)
631{
632 struct list_head *list = page->list.prev;
633
634 return rb_list_head(list->next) != &page->list;
635}
636
637/*
638 * rb_set_list_to_head - set a list_head to be pointing to head.
639 */
640static void rb_set_list_to_head(struct ring_buffer_per_cpu *cpu_buffer,
641 struct list_head *list)
642{
643 unsigned long *ptr;
644
645 ptr = (unsigned long *)&list->next;
646 *ptr |= RB_PAGE_HEAD;
647 *ptr &= ~RB_PAGE_UPDATE;
648}
649
650/*
651 * rb_head_page_activate - sets up head page
652 */
653static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer)
654{
655 struct buffer_page *head;
656
657 head = cpu_buffer->head_page;
658 if (!head)
659 return;
660
661 /*
662 * Set the previous list pointer to have the HEAD flag.
663 */
664 rb_set_list_to_head(cpu_buffer, head->list.prev);
665}
666
667static void rb_list_head_clear(struct list_head *list)
668{
669 unsigned long *ptr = (unsigned long *)&list->next;
670
671 *ptr &= ~RB_FLAG_MASK;
672}
673
674/*
 675 * rb_head_page_deactivate - clears head page ptr (for free list)
676 */
677static void
678rb_head_page_deactivate(struct ring_buffer_per_cpu *cpu_buffer)
679{
680 struct list_head *hd;
681
682 /* Go through the whole list and clear any pointers found. */
683 rb_list_head_clear(cpu_buffer->pages);
684
685 list_for_each(hd, cpu_buffer->pages)
686 rb_list_head_clear(hd);
687}
688
689static int rb_head_page_set(struct ring_buffer_per_cpu *cpu_buffer,
690 struct buffer_page *head,
691 struct buffer_page *prev,
692 int old_flag, int new_flag)
693{
694 struct list_head *list;
695 unsigned long val = (unsigned long)&head->list;
696 unsigned long ret;
697
698 list = &prev->list;
699
700 val &= ~RB_FLAG_MASK;
701
702 ret = cmpxchg((unsigned long *)&list->next,
703 val | old_flag, val | new_flag);
704
705 /* check if the reader took the page */
706 if ((ret & ~RB_FLAG_MASK) != val)
707 return RB_PAGE_MOVED;
708
709 return ret & RB_FLAG_MASK;
710}
711
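rb_head_page_set() above is the core primitive: one compare-and-swap on the tagged pointer, which succeeds only if it still targets the expected page with the expected flag. A standalone sketch with C11 atomics standing in for the kernel's cmpxchg() (names and return convention are illustrative):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define PG_HEAD   1UL
#define PG_UPDATE 2UL
#define PG_MASK   3UL
#define PG_MOVED  4UL

static unsigned long
set_flag(_Atomic uintptr_t *slot, uintptr_t head,
         unsigned long old_flag, unsigned long new_flag)
{
        uintptr_t expected = (head & ~PG_MASK) | old_flag;
        uintptr_t desired  = (head & ~PG_MASK) | new_flag;

        if (atomic_compare_exchange_strong(slot, &expected, desired))
                return old_flag;        /* we performed the transition */

        /* 'expected' now holds the current value: someone beat us to it */
        if ((expected & ~PG_MASK) != (head & ~PG_MASK))
                return PG_MOVED;        /* a reader swapped the page out */
        return expected & PG_MASK;      /* flag someone else set */
}

int main(void)
{
        long dummy_page;                /* stands in for the head page */
        _Atomic uintptr_t next = ((uintptr_t)&dummy_page & ~PG_MASK) | PG_HEAD;

        /* HEAD -> UPDATE, as the writer does before pushing the head */
        printf("%lu\n", set_flag(&next, (uintptr_t)&dummy_page,
                                 PG_HEAD, PG_UPDATE));  /* 1 == PG_HEAD */
        return 0;
}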
712static int rb_head_page_set_update(struct ring_buffer_per_cpu *cpu_buffer,
713 struct buffer_page *head,
714 struct buffer_page *prev,
715 int old_flag)
716{
717 return rb_head_page_set(cpu_buffer, head, prev,
718 old_flag, RB_PAGE_UPDATE);
719}
720
721static int rb_head_page_set_head(struct ring_buffer_per_cpu *cpu_buffer,
722 struct buffer_page *head,
723 struct buffer_page *prev,
724 int old_flag)
725{
726 return rb_head_page_set(cpu_buffer, head, prev,
727 old_flag, RB_PAGE_HEAD);
728}
729
730static int rb_head_page_set_normal(struct ring_buffer_per_cpu *cpu_buffer,
731 struct buffer_page *head,
732 struct buffer_page *prev,
733 int old_flag)
734{
735 return rb_head_page_set(cpu_buffer, head, prev,
736 old_flag, RB_PAGE_NORMAL);
737}
738
739static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
740 struct buffer_page **bpage)
741{
742 struct list_head *p = rb_list_head((*bpage)->list.next);
743
744 *bpage = list_entry(p, struct buffer_page, list);
745}
746
747static struct buffer_page *
748rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer)
749{
750 struct buffer_page *head;
751 struct buffer_page *page;
752 struct list_head *list;
753 int i;
754
755 if (RB_WARN_ON(cpu_buffer, !cpu_buffer->head_page))
756 return NULL;
757
758 /* sanity check */
759 list = cpu_buffer->pages;
760 if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev->next) != list))
761 return NULL;
762
763 page = head = cpu_buffer->head_page;
764 /*
765 * It is possible that the writer moves the header behind
 766 * where we started, and we miss it in one loop.
767 * A second loop should grab the header, but we'll do
768 * three loops just because I'm paranoid.
769 */
770 for (i = 0; i < 3; i++) {
771 do {
772 if (rb_is_head_page(cpu_buffer, page, page->list.prev)) {
773 cpu_buffer->head_page = page;
774 return page;
775 }
776 rb_inc_page(cpu_buffer, &page);
777 } while (page != head);
778 }
779
780 RB_WARN_ON(cpu_buffer, 1);
781
782 return NULL;
783}
784
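Because a writer can push the head while rb_set_head_page() is scanning, the function simply re-walks the ring a bounded number of times. A standalone model of that scan over a three-node tagged ring (illustrative types):

#include <stdint.h>
#include <stdio.h>

#define PG_HEAD 1UL
#define PG_MASK 3UL

struct bpage { struct bpage *next; struct bpage *prev; int id; };

static struct bpage *find_head(struct bpage *start)
{
        struct bpage *p = start;
        int pass;

        for (pass = 0; pass < 3; pass++) {
                do {
                        uintptr_t link = (uintptr_t)p->prev->next;

                        /* the predecessor points here with the HEAD tag set */
                        if ((link & PG_MASK) == PG_HEAD &&
                            (link & ~PG_MASK) == (uintptr_t)p)
                                return p;
                        p = (struct bpage *)((uintptr_t)p->next & ~PG_MASK);
                } while (p != start);
        }
        return NULL;    /* corrupted list: the caller warns and bails */
}

int main(void)
{
        struct bpage a = { .id = 0 }, b = { .id = 1 }, c = { .id = 2 };

        a.next = &b; b.prev = &a;
        b.next = &c; c.prev = &b;
        c.next = &a; a.prev = &c;
        /* tag b as the head page via its predecessor's next pointer */
        a.next = (struct bpage *)((uintptr_t)&b | PG_HEAD);

        printf("head is page %d\n", find_head(&a)->id); /* 1 */
        return 0;
}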
785static int rb_head_page_replace(struct buffer_page *old,
786 struct buffer_page *new)
787{
788 unsigned long *ptr = (unsigned long *)&old->list.prev->next;
789 unsigned long val;
790 unsigned long ret;
791
792 val = *ptr & ~RB_FLAG_MASK;
793 val |= RB_PAGE_HEAD;
794
795 ret = cmpxchg(ptr, val, (unsigned long)&new->list);
796
797 return ret == val;
798}
799
800/*
801 * rb_tail_page_update - move the tail page forward
802 *
803 * Returns 1 if moved tail page, 0 if someone else did.
804 */
805static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
806 struct buffer_page *tail_page,
807 struct buffer_page *next_page)
808{
809 struct buffer_page *old_tail;
810 unsigned long old_entries;
811 unsigned long old_write;
812 int ret = 0;
813
814 /*
815 * The tail page now needs to be moved forward.
816 *
817 * We need to reset the tail page, but without messing
818 * with possible erasing of data brought in by interrupts
819 * that have moved the tail page and are currently on it.
820 *
821 * We add a counter to the write field to denote this.
822 */
823 old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write);
824 old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries);
825
826 /*
827 * Just make sure we have seen our old_write and synchronize
828 * with any interrupts that come in.
829 */
830 barrier();
831
832 /*
833 * If the tail page is still the same as what we think
834 * it is, then it is up to us to update the tail
835 * pointer.
836 */
837 if (tail_page == cpu_buffer->tail_page) {
838 /* Zero the write counter */
839 unsigned long val = old_write & ~RB_WRITE_MASK;
840 unsigned long eval = old_entries & ~RB_WRITE_MASK;
841
842 /*
843 * This will only succeed if an interrupt did
844 * not come in and change it. In which case, we
845 * do not want to modify it.
846 *
847 * We add (void) to let the compiler know that we do not care
848 * about the return value of these functions. We use the
849 * cmpxchg to only update if an interrupt did not already
850 * do it for us. If the cmpxchg fails, we don't care.
851 */
852 (void)local_cmpxchg(&next_page->write, old_write, val);
853 (void)local_cmpxchg(&next_page->entries, old_entries, eval);
854
855 /*
 856 * No need to worry about races with clearing out the commit;
 857 * it can only increment when a commit takes place. But that
 858 * only happens in the outermost nested commit.
859 */
860 local_set(&next_page->page->commit, 0);
861
862 old_tail = cmpxchg(&cpu_buffer->tail_page,
863 tail_page, next_page);
864
865 if (old_tail == tail_page)
866 ret = 1;
867 }
868
869 return ret;
870}
871
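The local_add_return(RB_WRITE_INTCNT, ...) calls above fold two values into one word: the low bits hold the write index and the upper bits count how many times the page was claimed as tail, so a nested writer is detected without locks. RB_WRITE_MASK and RB_WRITE_INTCNT are defined earlier in the file, outside this hunk; the 20-bit split below is an assumption for illustration:

#include <stdio.h>

#define WRITE_MASK   0xfffffUL          /* low 20 bits: write index    */
#define WRITE_INTCNT (1UL << 20)        /* upper bits: tail-move count */

int main(void)
{
        unsigned long write = 0x123;    /* current index on the next page */

        /* outer writer claims the move: bump the counter, remember it */
        unsigned long old_write = write + WRITE_INTCNT;
        write = old_write;

        /* any nested writer still extracts a sane index */
        printf("index = 0x%lx\n", write & WRITE_MASK);          /* 0x123 */

        /* zero only the index, keep the counter; the patch uses
         * local_cmpxchg(&next_page->write, old_write, val) here, so the
         * reset is skipped if an interrupt already reused the page */
        if (write == old_write)
                write = old_write & ~WRITE_MASK;

        printf("count = %lu, index = 0x%lx\n",
               write >> 20, write & WRITE_MASK);                /* 1, 0x0 */
        return 0;
}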
872static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
873 struct buffer_page *bpage)
874{
875 unsigned long val = (unsigned long)bpage;
876
877 if (RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK))
878 return 1;
879
880 return 0;
881}
882
883/**
884 * rb_check_list - make sure a pointer to a list has the last bits zero
885 */
886static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer,
887 struct list_head *list)
888{
889 if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev) != list->prev))
890 return 1;
891 if (RB_WARN_ON(cpu_buffer, rb_list_head(list->next) != list->next))
892 return 1;
893 return 0;
894}
895
492/** 896/**
493 * check_pages - integrity check of buffer pages 897 * check_pages - integrity check of buffer pages
494 * @cpu_buffer: CPU buffer with pages to test 898 * @cpu_buffer: CPU buffer with pages to test
@@ -498,14 +902,19 @@ EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
498 */ 902 */
499static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) 903static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
500{ 904{
501 struct list_head *head = &cpu_buffer->pages; 905 struct list_head *head = cpu_buffer->pages;
502 struct buffer_page *bpage, *tmp; 906 struct buffer_page *bpage, *tmp;
503 907
908 rb_head_page_deactivate(cpu_buffer);
909
504 if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) 910 if (RB_WARN_ON(cpu_buffer, head->next->prev != head))
505 return -1; 911 return -1;
506 if (RB_WARN_ON(cpu_buffer, head->prev->next != head)) 912 if (RB_WARN_ON(cpu_buffer, head->prev->next != head))
507 return -1; 913 return -1;
508 914
915 if (rb_check_list(cpu_buffer, head))
916 return -1;
917
509 list_for_each_entry_safe(bpage, tmp, head, list) { 918 list_for_each_entry_safe(bpage, tmp, head, list) {
510 if (RB_WARN_ON(cpu_buffer, 919 if (RB_WARN_ON(cpu_buffer,
511 bpage->list.next->prev != &bpage->list)) 920 bpage->list.next->prev != &bpage->list))
@@ -513,25 +922,33 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
513 if (RB_WARN_ON(cpu_buffer, 922 if (RB_WARN_ON(cpu_buffer,
514 bpage->list.prev->next != &bpage->list)) 923 bpage->list.prev->next != &bpage->list))
515 return -1; 924 return -1;
925 if (rb_check_list(cpu_buffer, &bpage->list))
926 return -1;
516 } 927 }
517 928
929 rb_head_page_activate(cpu_buffer);
930
518 return 0; 931 return 0;
519} 932}
520 933
521static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, 934static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
522 unsigned nr_pages) 935 unsigned nr_pages)
523{ 936{
524 struct list_head *head = &cpu_buffer->pages;
525 struct buffer_page *bpage, *tmp; 937 struct buffer_page *bpage, *tmp;
526 unsigned long addr; 938 unsigned long addr;
527 LIST_HEAD(pages); 939 LIST_HEAD(pages);
528 unsigned i; 940 unsigned i;
529 941
942 WARN_ON(!nr_pages);
943
530 for (i = 0; i < nr_pages; i++) { 944 for (i = 0; i < nr_pages; i++) {
531 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 945 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
532 GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); 946 GFP_KERNEL, cpu_to_node(cpu_buffer->cpu));
533 if (!bpage) 947 if (!bpage)
534 goto free_pages; 948 goto free_pages;
949
950 rb_check_bpage(cpu_buffer, bpage);
951
535 list_add(&bpage->list, &pages); 952 list_add(&bpage->list, &pages);
536 953
537 addr = __get_free_page(GFP_KERNEL); 954 addr = __get_free_page(GFP_KERNEL);
@@ -541,7 +958,13 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
541 rb_init_page(bpage->page); 958 rb_init_page(bpage->page);
542 } 959 }
543 960
544 list_splice(&pages, head); 961 /*
962 * The ring buffer page list is a circular list that does not
963 * start and end with a list head. All page list items point to
964 * other pages.
965 */
966 cpu_buffer->pages = pages.next;
967 list_del(&pages);
545 968
546 rb_check_pages(cpu_buffer); 969 rb_check_pages(cpu_buffer);
547 970
@@ -573,13 +996,14 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
573 spin_lock_init(&cpu_buffer->reader_lock); 996 spin_lock_init(&cpu_buffer->reader_lock);
574 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); 997 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
575 cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 998 cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
576 INIT_LIST_HEAD(&cpu_buffer->pages);
577 999
578 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 1000 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
579 GFP_KERNEL, cpu_to_node(cpu)); 1001 GFP_KERNEL, cpu_to_node(cpu));
580 if (!bpage) 1002 if (!bpage)
581 goto fail_free_buffer; 1003 goto fail_free_buffer;
582 1004
1005 rb_check_bpage(cpu_buffer, bpage);
1006
583 cpu_buffer->reader_page = bpage; 1007 cpu_buffer->reader_page = bpage;
584 addr = __get_free_page(GFP_KERNEL); 1008 addr = __get_free_page(GFP_KERNEL);
585 if (!addr) 1009 if (!addr)
@@ -594,9 +1018,11 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
594 goto fail_free_reader; 1018 goto fail_free_reader;
595 1019
596 cpu_buffer->head_page 1020 cpu_buffer->head_page
597 = list_entry(cpu_buffer->pages.next, struct buffer_page, list); 1021 = list_entry(cpu_buffer->pages, struct buffer_page, list);
598 cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; 1022 cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;
599 1023
1024 rb_head_page_activate(cpu_buffer);
1025
600 return cpu_buffer; 1026 return cpu_buffer;
601 1027
602 fail_free_reader: 1028 fail_free_reader:
@@ -609,15 +1035,22 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
609 1035
610static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) 1036static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
611{ 1037{
612 struct list_head *head = &cpu_buffer->pages; 1038 struct list_head *head = cpu_buffer->pages;
613 struct buffer_page *bpage, *tmp; 1039 struct buffer_page *bpage, *tmp;
614 1040
615 free_buffer_page(cpu_buffer->reader_page); 1041 free_buffer_page(cpu_buffer->reader_page);
616 1042
617 list_for_each_entry_safe(bpage, tmp, head, list) { 1043 rb_head_page_deactivate(cpu_buffer);
618 list_del_init(&bpage->list); 1044
1045 if (head) {
1046 list_for_each_entry_safe(bpage, tmp, head, list) {
1047 list_del_init(&bpage->list);
1048 free_buffer_page(bpage);
1049 }
1050 bpage = list_entry(head, struct buffer_page, list);
619 free_buffer_page(bpage); 1051 free_buffer_page(bpage);
620 } 1052 }
1053
621 kfree(cpu_buffer); 1054 kfree(cpu_buffer);
622} 1055}
623 1056
@@ -735,6 +1168,7 @@ ring_buffer_free(struct ring_buffer *buffer)
735 1168
736 put_online_cpus(); 1169 put_online_cpus();
737 1170
1171 kfree(buffer->buffers);
738 free_cpumask_var(buffer->cpumask); 1172 free_cpumask_var(buffer->cpumask);
739 1173
740 kfree(buffer); 1174 kfree(buffer);
@@ -759,15 +1193,17 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
759 atomic_inc(&cpu_buffer->record_disabled); 1193 atomic_inc(&cpu_buffer->record_disabled);
760 synchronize_sched(); 1194 synchronize_sched();
761 1195
1196 rb_head_page_deactivate(cpu_buffer);
1197
762 for (i = 0; i < nr_pages; i++) { 1198 for (i = 0; i < nr_pages; i++) {
763 if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages))) 1199 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
764 return; 1200 return;
765 p = cpu_buffer->pages.next; 1201 p = cpu_buffer->pages->next;
766 bpage = list_entry(p, struct buffer_page, list); 1202 bpage = list_entry(p, struct buffer_page, list);
767 list_del_init(&bpage->list); 1203 list_del_init(&bpage->list);
768 free_buffer_page(bpage); 1204 free_buffer_page(bpage);
769 } 1205 }
770 if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages))) 1206 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
771 return; 1207 return;
772 1208
773 rb_reset_cpu(cpu_buffer); 1209 rb_reset_cpu(cpu_buffer);
@@ -789,15 +1225,19 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
789 atomic_inc(&cpu_buffer->record_disabled); 1225 atomic_inc(&cpu_buffer->record_disabled);
790 synchronize_sched(); 1226 synchronize_sched();
791 1227
1228 spin_lock_irq(&cpu_buffer->reader_lock);
1229 rb_head_page_deactivate(cpu_buffer);
1230
792 for (i = 0; i < nr_pages; i++) { 1231 for (i = 0; i < nr_pages; i++) {
793 if (RB_WARN_ON(cpu_buffer, list_empty(pages))) 1232 if (RB_WARN_ON(cpu_buffer, list_empty(pages)))
794 return; 1233 return;
795 p = pages->next; 1234 p = pages->next;
796 bpage = list_entry(p, struct buffer_page, list); 1235 bpage = list_entry(p, struct buffer_page, list);
797 list_del_init(&bpage->list); 1236 list_del_init(&bpage->list);
798 list_add_tail(&bpage->list, &cpu_buffer->pages); 1237 list_add_tail(&bpage->list, cpu_buffer->pages);
799 } 1238 }
800 rb_reset_cpu(cpu_buffer); 1239 rb_reset_cpu(cpu_buffer);
1240 spin_unlock_irq(&cpu_buffer->reader_lock);
801 1241
802 rb_check_pages(cpu_buffer); 1242 rb_check_pages(cpu_buffer);
803 1243
@@ -948,21 +1388,14 @@ rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer)
948} 1388}
949 1389
950static inline struct ring_buffer_event * 1390static inline struct ring_buffer_event *
951rb_head_event(struct ring_buffer_per_cpu *cpu_buffer)
952{
953 return __rb_page_index(cpu_buffer->head_page,
954 cpu_buffer->head_page->read);
955}
956
957static inline struct ring_buffer_event *
958rb_iter_head_event(struct ring_buffer_iter *iter) 1391rb_iter_head_event(struct ring_buffer_iter *iter)
959{ 1392{
960 return __rb_page_index(iter->head_page, iter->head); 1393 return __rb_page_index(iter->head_page, iter->head);
961} 1394}
962 1395
963static inline unsigned rb_page_write(struct buffer_page *bpage) 1396static inline unsigned long rb_page_write(struct buffer_page *bpage)
964{ 1397{
965 return local_read(&bpage->write); 1398 return local_read(&bpage->write) & RB_WRITE_MASK;
966} 1399}
967 1400
968static inline unsigned rb_page_commit(struct buffer_page *bpage) 1401static inline unsigned rb_page_commit(struct buffer_page *bpage)
@@ -970,6 +1403,11 @@ static inline unsigned rb_page_commit(struct buffer_page *bpage)
970 return local_read(&bpage->page->commit); 1403 return local_read(&bpage->page->commit);
971} 1404}
972 1405
1406static inline unsigned long rb_page_entries(struct buffer_page *bpage)
1407{
1408 return local_read(&bpage->entries) & RB_WRITE_MASK;
1409}
1410
973/* Size is determined by what has been committed */ 1411
974static inline unsigned rb_page_size(struct buffer_page *bpage) 1412static inline unsigned rb_page_size(struct buffer_page *bpage)
975{ 1413{
@@ -982,22 +1420,6 @@ rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer)
982 return rb_page_commit(cpu_buffer->commit_page); 1420 return rb_page_commit(cpu_buffer->commit_page);
983} 1421}
984 1422
985static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer)
986{
987 return rb_page_commit(cpu_buffer->head_page);
988}
989
990static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
991 struct buffer_page **bpage)
992{
993 struct list_head *p = (*bpage)->list.next;
994
995 if (p == &cpu_buffer->pages)
996 p = p->next;
997
998 *bpage = list_entry(p, struct buffer_page, list);
999}
1000
1001static inline unsigned 1423static inline unsigned
1002rb_event_index(struct ring_buffer_event *event) 1424rb_event_index(struct ring_buffer_event *event)
1003{ 1425{
@@ -1023,6 +1445,8 @@ rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
1023static void 1445static void
1024rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) 1446rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
1025{ 1447{
1448 unsigned long max_count;
1449
1026 /* 1450 /*
1027 * We only race with interrupts and NMIs on this CPU. 1451 * We only race with interrupts and NMIs on this CPU.
1028 * If we own the commit event, then we can commit 1452 * If we own the commit event, then we can commit
@@ -1032,9 +1456,16 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
1032 * assign the commit to the tail. 1456 * assign the commit to the tail.
1033 */ 1457 */
1034 again: 1458 again:
1459 max_count = cpu_buffer->buffer->pages * 100;
1460
1035 while (cpu_buffer->commit_page != cpu_buffer->tail_page) { 1461 while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
1036 cpu_buffer->commit_page->page->commit = 1462 if (RB_WARN_ON(cpu_buffer, !(--max_count)))
1037 cpu_buffer->commit_page->write; 1463 return;
1464 if (RB_WARN_ON(cpu_buffer,
1465 rb_is_reader_page(cpu_buffer->tail_page)))
1466 return;
1467 local_set(&cpu_buffer->commit_page->page->commit,
1468 rb_page_write(cpu_buffer->commit_page));
1038 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); 1469 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
1039 cpu_buffer->write_stamp = 1470 cpu_buffer->write_stamp =
1040 cpu_buffer->commit_page->page->time_stamp; 1471 cpu_buffer->commit_page->page->time_stamp;
@@ -1043,8 +1474,12 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
1043 } 1474 }
1044 while (rb_commit_index(cpu_buffer) != 1475 while (rb_commit_index(cpu_buffer) !=
1045 rb_page_write(cpu_buffer->commit_page)) { 1476 rb_page_write(cpu_buffer->commit_page)) {
1046 cpu_buffer->commit_page->page->commit = 1477
1047 cpu_buffer->commit_page->write; 1478 local_set(&cpu_buffer->commit_page->page->commit,
1479 rb_page_write(cpu_buffer->commit_page));
1480 RB_WARN_ON(cpu_buffer,
1481 local_read(&cpu_buffer->commit_page->page->commit) &
1482 ~RB_WRITE_MASK);
1048 barrier(); 1483 barrier();
1049 } 1484 }
1050 1485
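A simplified standalone model of the bounded catch-up loop added above: the commit index chases the write index page by page and gives up, with a warning, after pages * 100 steps rather than spinning forever on a corrupted list. Plain arrays stand in for the page structures:

#include <stdio.h>

#define NPAGES 4

int main(void)
{
        unsigned long write[NPAGES]  = { 100, 100, 60, 0 };     /* bytes written   */
        unsigned long commit[NPAGES] = { 100, 40, 0, 0 };       /* bytes committed */
        int commit_page = 1, tail_page = 2;
        int max_count = NPAGES * 100;

        while (commit_page != tail_page) {
                if (!--max_count) {
                        printf("WARN: commit chase ran away\n");
                        return 1;
                }
                commit[commit_page] = write[commit_page];       /* close this page */
                commit_page = (commit_page + 1) % NPAGES;       /* rb_inc_page()   */
        }
        commit[commit_page] = write[commit_page];               /* catch up on tail */

        printf("commit page=%d committed=%lu\n",
               commit_page, commit[commit_page]);               /* 2, 60 */
        return 0;
}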
@@ -1077,7 +1512,7 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
1077 * to the head page instead of next. 1512 * to the head page instead of next.
1078 */ 1513 */
1079 if (iter->head_page == cpu_buffer->reader_page) 1514 if (iter->head_page == cpu_buffer->reader_page)
1080 iter->head_page = cpu_buffer->head_page; 1515 iter->head_page = rb_set_head_page(cpu_buffer);
1081 else 1516 else
1082 rb_inc_page(cpu_buffer, &iter->head_page); 1517 rb_inc_page(cpu_buffer, &iter->head_page);
1083 1518
@@ -1121,6 +1556,163 @@ rb_update_event(struct ring_buffer_event *event,
1121 } 1556 }
1122} 1557}
1123 1558
1559/*
1560 * rb_handle_head_page - writer hit the head page
1561 *
1562 * Returns: +1 to retry page
1563 * 0 to continue
1564 * -1 on error
1565 */
1566static int
1567rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
1568 struct buffer_page *tail_page,
1569 struct buffer_page *next_page)
1570{
1571 struct buffer_page *new_head;
1572 int entries;
1573 int type;
1574 int ret;
1575
1576 entries = rb_page_entries(next_page);
1577
1578 /*
1579 * The hard part is here. We need to move the head
1580 * forward, and protect against both readers on
1581 * other CPUs and writers coming in via interrupts.
1582 */
1583 type = rb_head_page_set_update(cpu_buffer, next_page, tail_page,
1584 RB_PAGE_HEAD);
1585
1586 /*
1587 * type can be one of four:
1588 * NORMAL - an interrupt already moved it for us
1589 * HEAD - we are the first to get here.
1590 * UPDATE - we are the interrupt interrupting
1591 * a current move.
1592 * MOVED - a reader on another CPU moved the next
1593 * pointer to its reader page. Give up
1594 * and try again.
1595 */
1596
1597 switch (type) {
1598 case RB_PAGE_HEAD:
1599 /*
1600 * We changed the head to UPDATE, thus
1601 * it is our responsibility to update
1602 * the counters.
1603 */
1604 local_add(entries, &cpu_buffer->overrun);
1605
1606 /*
1607 * The entries will be zeroed out when we move the
1608 * tail page.
1609 */
1610
1611 /* still more to do */
1612 break;
1613
1614 case RB_PAGE_UPDATE:
1615 /*
 1616 * This is an interrupt that interrupted the
1617 * previous update. Still more to do.
1618 */
1619 break;
1620 case RB_PAGE_NORMAL:
1621 /*
1622 * An interrupt came in before the update
1623 * and processed this for us.
1624 * Nothing left to do.
1625 */
1626 return 1;
1627 case RB_PAGE_MOVED:
1628 /*
1629 * The reader is on another CPU and just did
1630 * a swap with our next_page.
1631 * Try again.
1632 */
1633 return 1;
1634 default:
1635 RB_WARN_ON(cpu_buffer, 1); /* WTF??? */
1636 return -1;
1637 }
1638
1639 /*
1640 * Now that we are here, the old head pointer is
1641 * set to UPDATE. This will keep the reader from
1642 * swapping the head page with the reader page.
1643 * The reader (on another CPU) will spin till
1644 * we are finished.
1645 *
1646 * We just need to protect against interrupts
1647 * doing the job. We will set the next pointer
1648 * to HEAD. After that, we set the old pointer
1649 * to NORMAL, but only if it was HEAD before.
1650 * otherwise we are an interrupt, and only
1651 * want the outer most commit to reset it.
1652 */
1653 new_head = next_page;
1654 rb_inc_page(cpu_buffer, &new_head);
1655
1656 ret = rb_head_page_set_head(cpu_buffer, new_head, next_page,
1657 RB_PAGE_NORMAL);
1658
1659 /*
1660 * Valid returns are:
1661 * HEAD - an interrupt came in and already set it.
1662 * NORMAL - One of two things:
1663 * 1) We really set it.
1664 * 2) A bunch of interrupts came in and moved
1665 * the page forward again.
1666 */
1667 switch (ret) {
1668 case RB_PAGE_HEAD:
1669 case RB_PAGE_NORMAL:
1670 /* OK */
1671 break;
1672 default:
1673 RB_WARN_ON(cpu_buffer, 1);
1674 return -1;
1675 }
1676
1677 /*
1678 * It is possible that an interrupt came in,
1679 * set the head up, then more interrupts came in
1680 * and moved it again. When we get back here,
1681 * the page would have been set to NORMAL but we
1682 * just set it back to HEAD.
1683 *
1684 * How do you detect this? Well, if that happened
1685 * the tail page would have moved.
1686 */
1687 if (ret == RB_PAGE_NORMAL) {
1688 /*
 1689 * If the tail had moved past next, then we need
1690 * to reset the pointer.
1691 */
1692 if (cpu_buffer->tail_page != tail_page &&
1693 cpu_buffer->tail_page != next_page)
1694 rb_head_page_set_normal(cpu_buffer, new_head,
1695 next_page,
1696 RB_PAGE_HEAD);
1697 }
1698
1699 /*
 1700 * If this was the outermost commit (the one that
1701 * changed the original pointer from HEAD to UPDATE),
1702 * then it is up to us to reset it to NORMAL.
1703 */
1704 if (type == RB_PAGE_HEAD) {
1705 ret = rb_head_page_set_normal(cpu_buffer, next_page,
1706 tail_page,
1707 RB_PAGE_UPDATE);
1708 if (RB_WARN_ON(cpu_buffer,
1709 ret != RB_PAGE_UPDATE))
1710 return -1;
1711 }
1712
1713 return 0;
1714}
1715
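A tiny standalone model of how the +1/0/-1 contract above is consumed on the writer side (the real caller is rb_move_tail() later in this patch); handle_head_page() here is only a stub, not the kernel function:

#include <stdio.h>

/* Stub: first call reports that an interrupt or reader already moved the
 * head (+1), second call reports that we completed the move ourselves (0). */
static int handle_head_page(void)
{
        static int calls;
        return calls++ == 0 ? 1 : 0;
}

static int push_head(void)
{
        for (;;) {
                int ret = handle_head_page();

                if (ret < 0)
                        return -1;      /* inconsistent list, give up */
                if (ret == 0)
                        return 0;       /* we own the move, keep writing */
                /* ret > 0: someone else moved it, re-evaluate and retry */
        }
}

int main(void)
{
        printf("%d\n", push_head());    /* 0 */
        return 0;
}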
1124static unsigned rb_calculate_event_length(unsigned length) 1716static unsigned rb_calculate_event_length(unsigned length)
1125{ 1717{
1126 struct ring_buffer_event event; /* Used only for sizeof array */ 1718 struct ring_buffer_event event; /* Used only for sizeof array */
@@ -1184,9 +1776,6 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1184 event->type_len = RINGBUF_TYPE_PADDING; 1776 event->type_len = RINGBUF_TYPE_PADDING;
1185 /* time delta must be non zero */ 1777 /* time delta must be non zero */
1186 event->time_delta = 1; 1778 event->time_delta = 1;
1187 /* Account for this as an entry */
1188 local_inc(&tail_page->entries);
1189 local_inc(&cpu_buffer->entries);
1190 1779
1191 /* Set write to end of buffer */ 1780 /* Set write to end of buffer */
1192 length = (tail + length) - BUF_PAGE_SIZE; 1781 length = (tail + length) - BUF_PAGE_SIZE;
@@ -1199,96 +1788,93 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1199 struct buffer_page *commit_page, 1788 struct buffer_page *commit_page,
1200 struct buffer_page *tail_page, u64 *ts) 1789 struct buffer_page *tail_page, u64 *ts)
1201{ 1790{
1202 struct buffer_page *next_page, *head_page, *reader_page;
1203 struct ring_buffer *buffer = cpu_buffer->buffer; 1791 struct ring_buffer *buffer = cpu_buffer->buffer;
1204 bool lock_taken = false; 1792 struct buffer_page *next_page;
1205 unsigned long flags; 1793 int ret;
1206 1794
1207 next_page = tail_page; 1795 next_page = tail_page;
1208 1796
1209 local_irq_save(flags);
1210 /*
1211 * Since the write to the buffer is still not
1212 * fully lockless, we must be careful with NMIs.
1213 * The locks in the writers are taken when a write
1214 * crosses to a new page. The locks protect against
1215 * races with the readers (this will soon be fixed
1216 * with a lockless solution).
1217 *
1218 * Because we can not protect against NMIs, and we
1219 * want to keep traces reentrant, we need to manage
1220 * what happens when we are in an NMI.
1221 *
1222 * NMIs can happen after we take the lock.
1223 * If we are in an NMI, only take the lock
1224 * if it is not already taken. Otherwise
1225 * simply fail.
1226 */
1227 if (unlikely(in_nmi())) {
1228 if (!__raw_spin_trylock(&cpu_buffer->lock)) {
1229 cpu_buffer->nmi_dropped++;
1230 goto out_reset;
1231 }
1232 } else
1233 __raw_spin_lock(&cpu_buffer->lock);
1234
1235 lock_taken = true;
1236
1237 rb_inc_page(cpu_buffer, &next_page); 1797 rb_inc_page(cpu_buffer, &next_page);
1238 1798
1239 head_page = cpu_buffer->head_page;
1240 reader_page = cpu_buffer->reader_page;
1241
1242 /* we grabbed the lock before incrementing */
1243 if (RB_WARN_ON(cpu_buffer, next_page == reader_page))
1244 goto out_reset;
1245
1246 /* 1799 /*
1247 * If for some reason, we had an interrupt storm that made 1800 * If for some reason, we had an interrupt storm that made
1248 * it all the way around the buffer, bail, and warn 1801 * it all the way around the buffer, bail, and warn
1249 * about it. 1802 * about it.
1250 */ 1803 */
1251 if (unlikely(next_page == commit_page)) { 1804 if (unlikely(next_page == commit_page)) {
1252 cpu_buffer->commit_overrun++; 1805 local_inc(&cpu_buffer->commit_overrun);
1253 goto out_reset; 1806 goto out_reset;
1254 } 1807 }
1255 1808
1256 if (next_page == head_page) { 1809 /*
1257 if (!(buffer->flags & RB_FL_OVERWRITE)) 1810 * This is where the fun begins!
1258 goto out_reset; 1811 *
1259 1812 * We are fighting against races between a reader that
1260 /* tail_page has not moved yet? */ 1813 * could be on another CPU trying to swap its reader
1261 if (tail_page == cpu_buffer->tail_page) { 1814 * page with the buffer head.
1262 /* count overflows */ 1815 *
1263 cpu_buffer->overrun += 1816 * We are also fighting against interrupts coming in and
1264 local_read(&head_page->entries); 1817 * moving the head or tail on us as well.
1818 *
1819 * If the next page is the head page then we have filled
1820 * the buffer, unless the commit page is still on the
1821 * reader page.
1822 */
1823 if (rb_is_head_page(cpu_buffer, next_page, &tail_page->list)) {
1265 1824
1266 rb_inc_page(cpu_buffer, &head_page); 1825 /*
1267 cpu_buffer->head_page = head_page; 1826 * If the commit is not on the reader page, then
1268 cpu_buffer->head_page->read = 0; 1827 * move the header page.
1828 */
1829 if (!rb_is_reader_page(cpu_buffer->commit_page)) {
1830 /*
1831 * If we are not in overwrite mode,
1832 * this is easy, just stop here.
1833 */
1834 if (!(buffer->flags & RB_FL_OVERWRITE))
1835 goto out_reset;
1836
1837 ret = rb_handle_head_page(cpu_buffer,
1838 tail_page,
1839 next_page);
1840 if (ret < 0)
1841 goto out_reset;
1842 if (ret)
1843 goto out_again;
1844 } else {
1845 /*
1846 * We need to be careful here too. The
1847 * commit page could still be on the reader
1848 * page. We could have a small buffer, and
1849 * have filled up the buffer with events
1850 * from interrupts and such, and wrapped.
1851 *
 1852 * Note, if the tail page is also on the
1853 * reader_page, we let it move out.
1854 */
1855 if (unlikely((cpu_buffer->commit_page !=
1856 cpu_buffer->tail_page) &&
1857 (cpu_buffer->commit_page ==
1858 cpu_buffer->reader_page))) {
1859 local_inc(&cpu_buffer->commit_overrun);
1860 goto out_reset;
1861 }
1269 } 1862 }
1270 } 1863 }
1271 1864
1272 /* 1865 ret = rb_tail_page_update(cpu_buffer, tail_page, next_page);
1273 * If the tail page is still the same as what we think 1866 if (ret) {
1274 * it is, then it is up to us to update the tail 1867 /*
1275 * pointer. 1868 * Nested commits always have zero deltas, so
1276 */ 1869 * just reread the time stamp
1277 if (tail_page == cpu_buffer->tail_page) { 1870 */
1278 local_set(&next_page->write, 0);
1279 local_set(&next_page->entries, 0);
1280 local_set(&next_page->page->commit, 0);
1281 cpu_buffer->tail_page = next_page;
1282
1283 /* reread the time stamp */
1284 *ts = rb_time_stamp(buffer, cpu_buffer->cpu); 1871 *ts = rb_time_stamp(buffer, cpu_buffer->cpu);
1285 cpu_buffer->tail_page->page->time_stamp = *ts; 1872 next_page->page->time_stamp = *ts;
1286 } 1873 }
1287 1874
1288 rb_reset_tail(cpu_buffer, tail_page, tail, length); 1875 out_again:
1289 1876
1290 __raw_spin_unlock(&cpu_buffer->lock); 1877 rb_reset_tail(cpu_buffer, tail_page, tail, length);
1291 local_irq_restore(flags);
1292 1878
1293 /* fail and let the caller try again */ 1879 /* fail and let the caller try again */
1294 return ERR_PTR(-EAGAIN); 1880 return ERR_PTR(-EAGAIN);
@@ -1297,9 +1883,6 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1297 /* reset write */ 1883 /* reset write */
1298 rb_reset_tail(cpu_buffer, tail_page, tail, length); 1884 rb_reset_tail(cpu_buffer, tail_page, tail, length);
1299 1885
1300 if (likely(lock_taken))
1301 __raw_spin_unlock(&cpu_buffer->lock);
1302 local_irq_restore(flags);
1303 return NULL; 1886 return NULL;
1304} 1887}
1305 1888
@@ -1316,6 +1899,9 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1316 barrier(); 1899 barrier();
1317 tail_page = cpu_buffer->tail_page; 1900 tail_page = cpu_buffer->tail_page;
1318 write = local_add_return(length, &tail_page->write); 1901 write = local_add_return(length, &tail_page->write);
1902
1903 /* set write to only the index of the write */
1904 write &= RB_WRITE_MASK;
1319 tail = write - length; 1905 tail = write - length;
1320 1906
1321 /* See if we shot past the end of this buffer page */ 1907
@@ -1360,12 +1946,16 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
1360 bpage = cpu_buffer->tail_page; 1946 bpage = cpu_buffer->tail_page;
1361 1947
1362 if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { 1948 if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
1949 unsigned long write_mask =
1950 local_read(&bpage->write) & ~RB_WRITE_MASK;
1363 /* 1951 /*
1364 * This is on the tail page. It is possible that 1952 * This is on the tail page. It is possible that
1365 * a write could come in and move the tail page 1953 * a write could come in and move the tail page
1366 * and write to the next page. That is fine 1954 * and write to the next page. That is fine
1367 * because we just shorten what is on this page. 1955 * because we just shorten what is on this page.
1368 */ 1956 */
1957 old_index += write_mask;
1958 new_index += write_mask;
1369 index = local_cmpxchg(&bpage->write, old_index, new_index); 1959 index = local_cmpxchg(&bpage->write, old_index, new_index);
1370 if (index == old_index) 1960 if (index == old_index)
1371 return 1; 1961 return 1;
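A standalone sketch of the index fix-up above: because the cmpxchg operates on the whole write word, ORing the current counter bits into old_index/new_index means the discard can only succeed if neither the index nor the page-move counter changed since the event was reserved (the 20-bit mask is again an assumption):

#include <stdio.h>

#define WRITE_MASK 0xfffffUL

int main(void)
{
        unsigned long write = (3UL << 20) | 0x80;       /* count=3, index=0x80 */
        unsigned long old_index = 0x80, new_index = 0x60;

        unsigned long hi = write & ~WRITE_MASK;         /* counter bits */
        old_index += hi;
        new_index += hi;

        /* local_cmpxchg(&bpage->write, old_index, new_index) equivalent */
        if (write == old_index) {
                write = new_index;
                printf("discarded, write=0x%lx\n", write);
        } else {
                printf("raced, leave the event as padding\n");
        }
        return 0;
}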
@@ -1481,7 +2071,8 @@ static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
1481} 2071}
1482 2072
1483static struct ring_buffer_event * 2073static struct ring_buffer_event *
1484rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, 2074rb_reserve_next_event(struct ring_buffer *buffer,
2075 struct ring_buffer_per_cpu *cpu_buffer,
1485 unsigned long length) 2076 unsigned long length)
1486{ 2077{
1487 struct ring_buffer_event *event; 2078 struct ring_buffer_event *event;
@@ -1491,6 +2082,21 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1491 2082
1492 rb_start_commit(cpu_buffer); 2083 rb_start_commit(cpu_buffer);
1493 2084
2085#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
2086 /*
2087 * Due to the ability to swap a cpu buffer from a buffer
2088 * it is possible it was swapped before we committed.
2089 * (committing stops a swap). We check for it here and
2090 * if it happened, we have to fail the write.
2091 */
2092 barrier();
2093 if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) {
2094 local_dec(&cpu_buffer->committing);
2095 local_dec(&cpu_buffer->commits);
2096 return NULL;
2097 }
2098#endif
2099
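A userspace model of the swap check above, using C11 atomics and illustrative names: once committing is raised no new swap can start, so re-reading the back-pointer afterwards reveals whether a swap slipped in before the commit began, in which case the reservation is undone:

#include <stdatomic.h>
#include <stdio.h>

struct buf { int id; };
struct per_cpu {
        _Atomic(struct buf *) owner;
        atomic_int committing;
};

static int reserve(struct per_cpu *cb, struct buf *expected)
{
        atomic_fetch_add(&cb->committing, 1);           /* rb_start_commit() */
        atomic_thread_fence(memory_order_seq_cst);      /* barrier() */

        if (atomic_load(&cb->owner) != expected) {      /* swapped before us */
                atomic_fetch_sub(&cb->committing, 1);
                return -1;
        }
        return 0;                                       /* safe to write */
}

int main(void)
{
        struct buf a = { 1 }, b = { 2 };
        struct per_cpu cb = { .owner = &a };

        printf("%d\n", reserve(&cb, &a));       /*  0: still ours  */
        atomic_store(&cb.owner, &b);            /* simulate a swap */
        printf("%d\n", reserve(&cb, &a));       /* -1: must give up */
        return 0;
}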
1494 length = rb_calculate_event_length(length); 2100 length = rb_calculate_event_length(length);
1495 again: 2101 again:
1496 /* 2102 /*
@@ -1651,7 +2257,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
1651 if (length > BUF_MAX_DATA_SIZE) 2257 if (length > BUF_MAX_DATA_SIZE)
1652 goto out; 2258 goto out;
1653 2259
1654 event = rb_reserve_next_event(cpu_buffer, length); 2260 event = rb_reserve_next_event(buffer, cpu_buffer, length);
1655 if (!event) 2261 if (!event)
1656 goto out; 2262 goto out;
1657 2263
@@ -1674,18 +2280,23 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
1674} 2280}
1675EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); 2281EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
1676 2282
1677static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, 2283static void
2284rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1678 struct ring_buffer_event *event) 2285 struct ring_buffer_event *event)
1679{ 2286{
1680 local_inc(&cpu_buffer->entries);
1681
1682 /* 2287 /*
1683 * The event first in the commit queue updates the 2288 * The event first in the commit queue updates the
1684 * time stamp. 2289 * time stamp.
1685 */ 2290 */
1686 if (rb_event_is_commit(cpu_buffer, event)) 2291 if (rb_event_is_commit(cpu_buffer, event))
1687 cpu_buffer->write_stamp += event->time_delta; 2292 cpu_buffer->write_stamp += event->time_delta;
2293}
1688 2294
2295static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
2296 struct ring_buffer_event *event)
2297{
2298 local_inc(&cpu_buffer->entries);
2299 rb_update_write_stamp(cpu_buffer, event);
1689 rb_end_commit(cpu_buffer); 2300 rb_end_commit(cpu_buffer);
1690} 2301}
1691 2302
@@ -1732,32 +2343,57 @@ static inline void rb_event_discard(struct ring_buffer_event *event)
1732 event->time_delta = 1; 2343 event->time_delta = 1;
1733} 2344}
1734 2345
1735/** 2346/*
1736 * ring_buffer_event_discard - discard any event in the ring buffer 2347 * Decrement the entries to the page that an event is on.
1737 * @event: the event to discard 2348 * The event does not even need to exist, only the pointer
1738 * 2349 * to the page it is on. This may only be called before the commit
1739 * Sometimes a event that is in the ring buffer needs to be ignored. 2350 * takes place.
1740 * This function lets the user discard an event in the ring buffer
1741 * and then that event will not be read later.
1742 *
1743 * Note, it is up to the user to be careful with this, and protect
1744 * against races. If the user discards an event that has been consumed
1745 * it is possible that it could corrupt the ring buffer.
1746 */ 2351 */
1747void ring_buffer_event_discard(struct ring_buffer_event *event) 2352static inline void
2353rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer,
2354 struct ring_buffer_event *event)
1748{ 2355{
1749 rb_event_discard(event); 2356 unsigned long addr = (unsigned long)event;
2357 struct buffer_page *bpage = cpu_buffer->commit_page;
2358 struct buffer_page *start;
2359
2360 addr &= PAGE_MASK;
2361
2362 /* Do the likely case first */
2363 if (likely(bpage->page == (void *)addr)) {
2364 local_dec(&bpage->entries);
2365 return;
2366 }
2367
2368 /*
2369 * Because the commit page may be on the reader page we
2370 * start with the next page and check the end loop there.
2371 */
2372 rb_inc_page(cpu_buffer, &bpage);
2373 start = bpage;
2374 do {
2375 if (bpage->page == (void *)addr) {
2376 local_dec(&bpage->entries);
2377 return;
2378 }
2379 rb_inc_page(cpu_buffer, &bpage);
2380 } while (bpage != start);
2381
2382 /* commit not part of this buffer?? */
2383 RB_WARN_ON(cpu_buffer, 1);
1750} 2384}
1751EXPORT_SYMBOL_GPL(ring_buffer_event_discard);
1752 2385
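rb_decrement_entry() above has to find which buffer page an event sits on knowing only the event's address. A minimal userspace model of that circular scan (tiny fake page size, illustrative names; the kernel masks with PAGE_MASK instead):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SZ 64UL    /* tiny "page" size, just for the demo */

struct bpage { struct bpage *next; char data[PAGE_SZ]; };

static struct bpage *page_of(struct bpage *commit, const void *addr)
{
        uintptr_t x = (uintptr_t)addr;
        struct bpage *p, *start;

        /* likely case: the event is on the commit page itself */
        if (x >= (uintptr_t)commit->data && x < (uintptr_t)commit->data + PAGE_SZ)
                return commit;

        /* otherwise walk the ring once, starting after the commit page */
        p = start = commit->next;
        do {
                if (x >= (uintptr_t)p->data && x < (uintptr_t)p->data + PAGE_SZ)
                        return p;
                p = p->next;
        } while (p != start);

        return NULL;    /* event not on this buffer: the kernel warns here */
}

int main(void)
{
        struct bpage a, b, c;

        a.next = &b; b.next = &c; c.next = &a;  /* circular page list */

        printf("%s\n", page_of(&a, b.data + 10) == &b ? "found" : "missing");
        return 0;
}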
1753/** 2386/**
1754 * ring_buffer_commit_discard - discard an event that has not been committed 2387 * ring_buffer_commit_discard - discard an event that has not been committed
1755 * @buffer: the ring buffer 2388 * @buffer: the ring buffer
1756 * @event: non committed event to discard 2389 * @event: non committed event to discard
1757 * 2390 *
1758 * This is similar to ring_buffer_event_discard but must only be 2391 * Sometimes an event that is in the ring buffer needs to be ignored.
1759 * performed on an event that has not been committed yet. The difference 2392 * This function lets the user discard an event in the ring buffer
1760 * is that this will also try to free the event from the ring buffer 2393 * and then that event will not be read later.
2394 *
 2395 * This function only works if it is called before the item has been
2396 * committed. It will try to free the event from the ring buffer
1761 * if another event has not been added behind it. 2397 * if another event has not been added behind it.
1762 * 2398 *
1763 * If another event has been added behind it, it will set the event 2399 * If another event has been added behind it, it will set the event
@@ -1785,14 +2421,15 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
1785 */ 2421 */
1786 RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing)); 2422 RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing));
1787 2423
1788 if (!rb_try_to_discard(cpu_buffer, event)) 2424 rb_decrement_entry(cpu_buffer, event);
2425 if (rb_try_to_discard(cpu_buffer, event))
1789 goto out; 2426 goto out;
1790 2427
1791 /* 2428 /*
1792 * The commit is still visible by the reader, so we 2429 * The commit is still visible by the reader, so we
1793 * must increment entries. 2430 * must still update the timestamp.
1794 */ 2431 */
1795 local_inc(&cpu_buffer->entries); 2432 rb_update_write_stamp(cpu_buffer, event);
1796 out: 2433 out:
1797 rb_end_commit(cpu_buffer); 2434 rb_end_commit(cpu_buffer);
1798 2435
@@ -1853,7 +2490,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
1853 if (length > BUF_MAX_DATA_SIZE) 2490 if (length > BUF_MAX_DATA_SIZE)
1854 goto out; 2491 goto out;
1855 2492
1856 event = rb_reserve_next_event(cpu_buffer, length); 2493 event = rb_reserve_next_event(buffer, cpu_buffer, length);
1857 if (!event) 2494 if (!event)
1858 goto out; 2495 goto out;
1859 2496
@@ -1874,9 +2511,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_write);
1874static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) 2511static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
1875{ 2512{
1876 struct buffer_page *reader = cpu_buffer->reader_page; 2513 struct buffer_page *reader = cpu_buffer->reader_page;
1877 struct buffer_page *head = cpu_buffer->head_page; 2514 struct buffer_page *head = rb_set_head_page(cpu_buffer);
1878 struct buffer_page *commit = cpu_buffer->commit_page; 2515 struct buffer_page *commit = cpu_buffer->commit_page;
1879 2516
2517 /* In case of error, head will be NULL */
2518 if (unlikely(!head))
2519 return 1;
2520
1880 return reader->read == rb_page_commit(reader) && 2521 return reader->read == rb_page_commit(reader) &&
1881 (commit == reader || 2522 (commit == reader ||
1882 (commit == head && 2523 (commit == head &&
@@ -1967,7 +2608,7 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
1967 return 0; 2608 return 0;
1968 2609
1969 cpu_buffer = buffer->buffers[cpu]; 2610 cpu_buffer = buffer->buffers[cpu];
1970 ret = (local_read(&cpu_buffer->entries) - cpu_buffer->overrun) 2611 ret = (local_read(&cpu_buffer->entries) - local_read(&cpu_buffer->overrun))
1971 - cpu_buffer->read; 2612 - cpu_buffer->read;
1972 2613
1973 return ret; 2614 return ret;
@@ -1988,33 +2629,13 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
1988 return 0; 2629 return 0;
1989 2630
1990 cpu_buffer = buffer->buffers[cpu]; 2631 cpu_buffer = buffer->buffers[cpu];
1991 ret = cpu_buffer->overrun; 2632 ret = local_read(&cpu_buffer->overrun);
1992 2633
1993 return ret; 2634 return ret;
1994} 2635}
1995EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); 2636EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
1996 2637
1997/** 2638/**
1998 * ring_buffer_nmi_dropped_cpu - get the number of nmis that were dropped
1999 * @buffer: The ring buffer
2000 * @cpu: The per CPU buffer to get the number of overruns from
2001 */
2002unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu)
2003{
2004 struct ring_buffer_per_cpu *cpu_buffer;
2005 unsigned long ret;
2006
2007 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2008 return 0;
2009
2010 cpu_buffer = buffer->buffers[cpu];
2011 ret = cpu_buffer->nmi_dropped;
2012
2013 return ret;
2014}
2015EXPORT_SYMBOL_GPL(ring_buffer_nmi_dropped_cpu);
2016
2017/**
2018 * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits 2639 * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits
2019 * @buffer: The ring buffer 2640 * @buffer: The ring buffer
2020 * @cpu: The per CPU buffer to get the number of overruns from 2641 * @cpu: The per CPU buffer to get the number of overruns from
@@ -2029,7 +2650,7 @@ ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu)
2029 return 0; 2650 return 0;
2030 2651
2031 cpu_buffer = buffer->buffers[cpu]; 2652 cpu_buffer = buffer->buffers[cpu];
2032 ret = cpu_buffer->commit_overrun; 2653 ret = local_read(&cpu_buffer->commit_overrun);
2033 2654
2034 return ret; 2655 return ret;
2035} 2656}
@@ -2052,7 +2673,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
2052 for_each_buffer_cpu(buffer, cpu) { 2673 for_each_buffer_cpu(buffer, cpu) {
2053 cpu_buffer = buffer->buffers[cpu]; 2674 cpu_buffer = buffer->buffers[cpu];
2054 entries += (local_read(&cpu_buffer->entries) - 2675 entries += (local_read(&cpu_buffer->entries) -
2055 cpu_buffer->overrun) - cpu_buffer->read; 2676 local_read(&cpu_buffer->overrun)) - cpu_buffer->read;
2056 } 2677 }
2057 2678
2058 return entries; 2679 return entries;
@@ -2075,7 +2696,7 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
2075 /* if you care about this being correct, lock the buffer */ 2696 /* if you care about this being correct, lock the buffer */
2076 for_each_buffer_cpu(buffer, cpu) { 2697 for_each_buffer_cpu(buffer, cpu) {
2077 cpu_buffer = buffer->buffers[cpu]; 2698 cpu_buffer = buffer->buffers[cpu];
2078 overruns += cpu_buffer->overrun; 2699 overruns += local_read(&cpu_buffer->overrun);
2079 } 2700 }
2080 2701
2081 return overruns; 2702 return overruns;
@@ -2088,8 +2709,10 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
2088 2709
2089 /* Iterator usage is expected to have record disabled */ 2710 /* Iterator usage is expected to have record disabled */
2090 if (list_empty(&cpu_buffer->reader_page->list)) { 2711 if (list_empty(&cpu_buffer->reader_page->list)) {
2091 iter->head_page = cpu_buffer->head_page; 2712 iter->head_page = rb_set_head_page(cpu_buffer);
2092 iter->head = cpu_buffer->head_page->read; 2713 if (unlikely(!iter->head_page))
2714 return;
2715 iter->head = iter->head_page->read;
2093 } else { 2716 } else {
2094 iter->head_page = cpu_buffer->reader_page; 2717 iter->head_page = cpu_buffer->reader_page;
2095 iter->head = cpu_buffer->reader_page->read; 2718 iter->head = cpu_buffer->reader_page->read;
@@ -2206,6 +2829,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2206 struct buffer_page *reader = NULL; 2829 struct buffer_page *reader = NULL;
2207 unsigned long flags; 2830 unsigned long flags;
2208 int nr_loops = 0; 2831 int nr_loops = 0;
2832 int ret;
2209 2833
2210 local_irq_save(flags); 2834 local_irq_save(flags);
2211 __raw_spin_lock(&cpu_buffer->lock); 2835 __raw_spin_lock(&cpu_buffer->lock);
@@ -2239,30 +2863,56 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2239 goto out; 2863 goto out;
2240 2864
2241 /* 2865 /*
2242 * Splice the empty reader page into the list around the head.
2243 * Reset the reader page to size zero. 2866 * Reset the reader page to size zero.
2244 */ 2867 */
2868 local_set(&cpu_buffer->reader_page->write, 0);
2869 local_set(&cpu_buffer->reader_page->entries, 0);
2870 local_set(&cpu_buffer->reader_page->page->commit, 0);
2245 2871
2246 reader = cpu_buffer->head_page; 2872 spin:
2873 /*
2874 * Splice the empty reader page into the list around the head.
2875 */
2876 reader = rb_set_head_page(cpu_buffer);
2247 cpu_buffer->reader_page->list.next = reader->list.next; 2877 cpu_buffer->reader_page->list.next = reader->list.next;
2248 cpu_buffer->reader_page->list.prev = reader->list.prev; 2878 cpu_buffer->reader_page->list.prev = reader->list.prev;
2249 2879
2250 local_set(&cpu_buffer->reader_page->write, 0); 2880 /*
2251 local_set(&cpu_buffer->reader_page->entries, 0); 2881 * cpu_buffer->pages just needs to point to the buffer; it
2252 local_set(&cpu_buffer->reader_page->page->commit, 0); 2882 * has no specific buffer page to point to. Let's move it out
 2883 * of our way so we don't accidentally swap it.
2884 */
2885 cpu_buffer->pages = reader->list.prev;
2253 2886
2254 /* Make the reader page now replace the head */ 2887 /* The reader page will be pointing to the new head */
2255 reader->list.prev->next = &cpu_buffer->reader_page->list; 2888 rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list);
2256 reader->list.next->prev = &cpu_buffer->reader_page->list;
2257 2889
2258 /* 2890 /*
2259 * If the tail is on the reader, then we must set the head 2891 * Here's the tricky part.
2260 * to the inserted page, otherwise we set it one before. 2892 *
2893 * We need to move the pointer past the header page.
2894 * But we can only do that if a writer is not currently
2895 * moving it. The page before the header page has the
 2896 * flag bit '1' set if it is pointing to the page we want,
 2897 * but if the writer is in the process of moving it
 2898 * then it will be '2' or already moved '0'.
2261 */ 2899 */
2262 cpu_buffer->head_page = cpu_buffer->reader_page;
2263 2900
2264 if (cpu_buffer->commit_page != reader) 2901 ret = rb_head_page_replace(reader, cpu_buffer->reader_page);
2265 rb_inc_page(cpu_buffer, &cpu_buffer->head_page); 2902
2903 /*
2904 * If we did not convert it, then we must try again.
2905 */
2906 if (!ret)
2907 goto spin;
2908
2909 /*
2910 * Yeah! We succeeded in replacing the page.
2911 *
2912 * Now make the new head point back to the reader page.
2913 */
2914 reader->list.next->prev = &cpu_buffer->reader_page->list;
2915 rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
2266 2916
2267 /* Finally update the reader page to the new head */ 2917 /* Finally update the reader page to the new head */
2268 cpu_buffer->reader_page = reader; 2918 cpu_buffer->reader_page = reader;
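A single-threaded model of the 'spin:' retry above: the reader pre-wires its spare page around the current head, then tries to steal the head with one compare-and-swap on the tagged prev->next pointer; if a writer moved the head in the meantime the CAS fails and the whole splice is retried. C11 atomics stand in for cmpxchg; names are illustrative:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define PG_HEAD 1UL

struct node { _Atomic uintptr_t next; struct node *prev; int id; };

int main(void)
{
        struct node a = { .id = 0 }, b = { .id = 1 }, c = { .id = 2 };
        struct node reader = { .id = 9 };
        struct node *head = &b;
        uintptr_t expected;

        /* circular list a -> b -> c -> a, with b tagged as the head page */
        atomic_store(&a.next, (uintptr_t)&b | PG_HEAD); b.prev = &a;
        atomic_store(&b.next, (uintptr_t)&c);           c.prev = &b;
        atomic_store(&c.next, (uintptr_t)&a);           a.prev = &c;

        /* pre-wire the spare reader page around the head; its next pointer
         * is tagged so the page after the old head becomes the new head */
        atomic_store(&reader.next, atomic_load(&head->next) | PG_HEAD);
        reader.prev = head->prev;

        /* one CAS steals the head: it fails if a writer moved it meanwhile,
         * and the caller then re-finds the head and retries (the 'spin:') */
        expected = (uintptr_t)head | PG_HEAD;
        if (atomic_compare_exchange_strong(&head->prev->next, &expected,
                                           (uintptr_t)&reader))
                printf("got page %d, reader page spliced in\n", head->id);
        else
                printf("writer moved the head, retry the swap\n");
        return 0;
}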
@@ -2291,8 +2941,7 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
2291 2941
2292 event = rb_reader_event(cpu_buffer); 2942 event = rb_reader_event(cpu_buffer);
2293 2943
2294 if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX 2944 if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
2295 || rb_discarded_event(event))
2296 cpu_buffer->read++; 2945 cpu_buffer->read++;
2297 2946
2298 rb_update_read_stamp(cpu_buffer, event); 2947 rb_update_read_stamp(cpu_buffer, event);
@@ -2346,15 +2995,12 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
2346} 2995}
2347 2996
2348static struct ring_buffer_event * 2997static struct ring_buffer_event *
2349rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) 2998rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts)
2350{ 2999{
2351 struct ring_buffer_per_cpu *cpu_buffer;
2352 struct ring_buffer_event *event; 3000 struct ring_buffer_event *event;
2353 struct buffer_page *reader; 3001 struct buffer_page *reader;
2354 int nr_loops = 0; 3002 int nr_loops = 0;
2355 3003
2356 cpu_buffer = buffer->buffers[cpu];
2357
2358 again: 3004 again:
2359 /* 3005 /*
2360 * We repeat when a timestamp is encountered. It is possible 3006 * We repeat when a timestamp is encountered. It is possible
@@ -2383,7 +3029,6 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
2383 * the box. Return the padding, and we will release 3029 * the box. Return the padding, and we will release
2384 * the current locks, and try again. 3030 * the current locks, and try again.
2385 */ 3031 */
2386 rb_advance_reader(cpu_buffer);
2387 return event; 3032 return event;
2388 3033
2389 case RINGBUF_TYPE_TIME_EXTEND: 3034 case RINGBUF_TYPE_TIME_EXTEND:
@@ -2399,7 +3044,7 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
2399 case RINGBUF_TYPE_DATA: 3044 case RINGBUF_TYPE_DATA:
2400 if (ts) { 3045 if (ts) {
2401 *ts = cpu_buffer->read_stamp + event->time_delta; 3046 *ts = cpu_buffer->read_stamp + event->time_delta;
2402 ring_buffer_normalize_time_stamp(buffer, 3047 ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
2403 cpu_buffer->cpu, ts); 3048 cpu_buffer->cpu, ts);
2404 } 3049 }
2405 return event; 3050 return event;
@@ -2486,7 +3131,7 @@ static inline int rb_ok_to_lock(void)
2486 * buffer too. A one time deal is all you get from reading 3131 * buffer too. A one time deal is all you get from reading
2487 * the ring buffer from an NMI. 3132 * the ring buffer from an NMI.
2488 */ 3133 */
2489 if (likely(!in_nmi() && !oops_in_progress)) 3134 if (likely(!in_nmi()))
2490 return 1; 3135 return 1;
2491 3136
2492 tracing_off_permanent(); 3137 tracing_off_permanent();
@@ -2518,15 +3163,15 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
2518 local_irq_save(flags); 3163 local_irq_save(flags);
2519 if (dolock) 3164 if (dolock)
2520 spin_lock(&cpu_buffer->reader_lock); 3165 spin_lock(&cpu_buffer->reader_lock);
2521 event = rb_buffer_peek(buffer, cpu, ts); 3166 event = rb_buffer_peek(cpu_buffer, ts);
3167 if (event && event->type_len == RINGBUF_TYPE_PADDING)
3168 rb_advance_reader(cpu_buffer);
2522 if (dolock) 3169 if (dolock)
2523 spin_unlock(&cpu_buffer->reader_lock); 3170 spin_unlock(&cpu_buffer->reader_lock);
2524 local_irq_restore(flags); 3171 local_irq_restore(flags);
2525 3172
2526 if (event && event->type_len == RINGBUF_TYPE_PADDING) { 3173 if (event && event->type_len == RINGBUF_TYPE_PADDING)
2527 cpu_relax();
2528 goto again; 3174 goto again;
2529 }
2530 3175
2531 return event; 3176 return event;
2532} 3177}
@@ -2551,10 +3196,8 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
2551 event = rb_iter_peek(iter, ts); 3196 event = rb_iter_peek(iter, ts);
2552 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3197 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2553 3198
2554 if (event && event->type_len == RINGBUF_TYPE_PADDING) { 3199 if (event && event->type_len == RINGBUF_TYPE_PADDING)
2555 cpu_relax();
2556 goto again; 3200 goto again;
2557 }
2558 3201
2559 return event; 3202 return event;
2560} 3203}
@@ -2589,13 +3232,10 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
2589 if (dolock) 3232 if (dolock)
2590 spin_lock(&cpu_buffer->reader_lock); 3233 spin_lock(&cpu_buffer->reader_lock);
2591 3234
2592 event = rb_buffer_peek(buffer, cpu, ts); 3235 event = rb_buffer_peek(cpu_buffer, ts);
2593 if (!event) 3236 if (event)
2594 goto out_unlock; 3237 rb_advance_reader(cpu_buffer);
2595
2596 rb_advance_reader(cpu_buffer);
2597 3238
2598 out_unlock:
2599 if (dolock) 3239 if (dolock)
2600 spin_unlock(&cpu_buffer->reader_lock); 3240 spin_unlock(&cpu_buffer->reader_lock);
2601 local_irq_restore(flags); 3241 local_irq_restore(flags);
@@ -2603,10 +3243,8 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
2603 out: 3243 out:
2604 preempt_enable(); 3244 preempt_enable();
2605 3245
2606 if (event && event->type_len == RINGBUF_TYPE_PADDING) { 3246 if (event && event->type_len == RINGBUF_TYPE_PADDING)
2607 cpu_relax();
2608 goto again; 3247 goto again;
2609 }
2610 3248
2611 return event; 3249 return event;
2612} 3250}
@@ -2686,21 +3324,19 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
2686 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; 3324 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
2687 unsigned long flags; 3325 unsigned long flags;
2688 3326
2689 again:
2690 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3327 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3328 again:
2691 event = rb_iter_peek(iter, ts); 3329 event = rb_iter_peek(iter, ts);
2692 if (!event) 3330 if (!event)
2693 goto out; 3331 goto out;
2694 3332
3333 if (event->type_len == RINGBUF_TYPE_PADDING)
3334 goto again;
3335
2695 rb_advance_iter(iter); 3336 rb_advance_iter(iter);
2696 out: 3337 out:
2697 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3338 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2698 3339
2699 if (event && event->type_len == RINGBUF_TYPE_PADDING) {
2700 cpu_relax();
2701 goto again;
2702 }
2703
2704 return event; 3340 return event;
2705} 3341}
2706EXPORT_SYMBOL_GPL(ring_buffer_read); 3342EXPORT_SYMBOL_GPL(ring_buffer_read);
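ring_buffer_read() now keeps the reader_lock held across the padding retry (the again: label moved inside the locked region). A hedged sketch of a non-consuming walk with the iterator API; ring_buffer_read_start()/read_finish() appear elsewhere in this merge, the walker itself is illustrative:

#include <linux/ring_buffer.h>

/* Sketch: iterate one CPU's buffer without consuming the events. */
static void walk_cpu(struct ring_buffer *buffer, int cpu)
{
	struct ring_buffer_iter *iter;
	struct ring_buffer_event *event;
	u64 ts;

	iter = ring_buffer_read_start(buffer, cpu);
	if (!iter)
		return;

	while ((event = ring_buffer_read(iter, &ts)) != NULL)
		;	/* inspect ring_buffer_event_data(event) here */

	ring_buffer_read_finish(iter);
}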
@@ -2718,8 +3354,10 @@ EXPORT_SYMBOL_GPL(ring_buffer_size);
2718static void 3354static void
2719rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) 3355rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
2720{ 3356{
3357 rb_head_page_deactivate(cpu_buffer);
3358
2721 cpu_buffer->head_page 3359 cpu_buffer->head_page
2722 = list_entry(cpu_buffer->pages.next, struct buffer_page, list); 3360 = list_entry(cpu_buffer->pages, struct buffer_page, list);
2723 local_set(&cpu_buffer->head_page->write, 0); 3361 local_set(&cpu_buffer->head_page->write, 0);
2724 local_set(&cpu_buffer->head_page->entries, 0); 3362 local_set(&cpu_buffer->head_page->entries, 0);
2725 local_set(&cpu_buffer->head_page->page->commit, 0); 3363 local_set(&cpu_buffer->head_page->page->commit, 0);
@@ -2735,16 +3373,17 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
2735 local_set(&cpu_buffer->reader_page->page->commit, 0); 3373 local_set(&cpu_buffer->reader_page->page->commit, 0);
2736 cpu_buffer->reader_page->read = 0; 3374 cpu_buffer->reader_page->read = 0;
2737 3375
2738 cpu_buffer->nmi_dropped = 0; 3376 local_set(&cpu_buffer->commit_overrun, 0);
2739 cpu_buffer->commit_overrun = 0; 3377 local_set(&cpu_buffer->overrun, 0);
2740 cpu_buffer->overrun = 0;
2741 cpu_buffer->read = 0;
2742 local_set(&cpu_buffer->entries, 0); 3378 local_set(&cpu_buffer->entries, 0);
2743 local_set(&cpu_buffer->committing, 0); 3379 local_set(&cpu_buffer->committing, 0);
2744 local_set(&cpu_buffer->commits, 0); 3380 local_set(&cpu_buffer->commits, 0);
3381 cpu_buffer->read = 0;
2745 3382
2746 cpu_buffer->write_stamp = 0; 3383 cpu_buffer->write_stamp = 0;
2747 cpu_buffer->read_stamp = 0; 3384 cpu_buffer->read_stamp = 0;
3385
3386 rb_head_page_activate(cpu_buffer);
2748} 3387}
2749 3388
2750/** 3389/**
@@ -2764,12 +3403,16 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
2764 3403
2765 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3404 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2766 3405
3406 if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
3407 goto out;
3408
2767 __raw_spin_lock(&cpu_buffer->lock); 3409 __raw_spin_lock(&cpu_buffer->lock);
2768 3410
2769 rb_reset_cpu(cpu_buffer); 3411 rb_reset_cpu(cpu_buffer);
2770 3412
2771 __raw_spin_unlock(&cpu_buffer->lock); 3413 __raw_spin_unlock(&cpu_buffer->lock);
2772 3414
3415 out:
2773 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3416 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2774 3417
2775 atomic_dec(&cpu_buffer->record_disabled); 3418 atomic_dec(&cpu_buffer->record_disabled);
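ring_buffer_reset_cpu() now refuses, with a one-time warning, to reset a buffer that still has a commit in flight, so callers are expected to quiesce writers first. A sketch of that calling pattern, mirroring what tracing_reset() does later in this same series; the helper name is illustrative:

#include <linux/rcupdate.h>
#include <linux/ring_buffer.h>

/* Sketch: stop writers, let pending commits finish, then reset. */
static void safe_reset_cpu(struct ring_buffer *buffer, int cpu)
{
	ring_buffer_record_disable(buffer);
	synchronize_sched();		/* wait for in-flight commits */
	ring_buffer_reset_cpu(buffer, cpu);
	ring_buffer_record_enable(buffer);
}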
@@ -2852,6 +3495,7 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
2852} 3495}
2853EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu); 3496EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu);
2854 3497
3498#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
2855/** 3499/**
2856 * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers 3500 * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers
2857 * @buffer_a: One buffer to swap with 3501 * @buffer_a: One buffer to swap with
@@ -2906,20 +3550,28 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
2906 atomic_inc(&cpu_buffer_a->record_disabled); 3550 atomic_inc(&cpu_buffer_a->record_disabled);
2907 atomic_inc(&cpu_buffer_b->record_disabled); 3551 atomic_inc(&cpu_buffer_b->record_disabled);
2908 3552
3553 ret = -EBUSY;
3554 if (local_read(&cpu_buffer_a->committing))
3555 goto out_dec;
3556 if (local_read(&cpu_buffer_b->committing))
3557 goto out_dec;
3558
2909 buffer_a->buffers[cpu] = cpu_buffer_b; 3559 buffer_a->buffers[cpu] = cpu_buffer_b;
2910 buffer_b->buffers[cpu] = cpu_buffer_a; 3560 buffer_b->buffers[cpu] = cpu_buffer_a;
2911 3561
2912 cpu_buffer_b->buffer = buffer_a; 3562 cpu_buffer_b->buffer = buffer_a;
2913 cpu_buffer_a->buffer = buffer_b; 3563 cpu_buffer_a->buffer = buffer_b;
2914 3564
3565 ret = 0;
3566
3567out_dec:
2915 atomic_dec(&cpu_buffer_a->record_disabled); 3568 atomic_dec(&cpu_buffer_a->record_disabled);
2916 atomic_dec(&cpu_buffer_b->record_disabled); 3569 atomic_dec(&cpu_buffer_b->record_disabled);
2917
2918 ret = 0;
2919out: 3570out:
2920 return ret; 3571 return ret;
2921} 3572}
2922EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); 3573EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
3574#endif /* CONFIG_RING_BUFFER_ALLOW_SWAP */
2923 3575
2924/** 3576/**
2925 * ring_buffer_alloc_read_page - allocate a page to read from buffer 3577 * ring_buffer_alloc_read_page - allocate a page to read from buffer
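The swap hunk above makes ring_buffer_swap_cpu() bail out with -EBUSY when either per-CPU buffer has a commit in progress, and the whole function is now built only under CONFIG_RING_BUFFER_ALLOW_SWAP. A hedged caller sketch; the wrapper and message are assumptions, the error value comes from this hunk:

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/ring_buffer.h>

/* Sketch: attempt a per-CPU buffer swap and tolerate the new -EBUSY. */
static int try_swap(struct ring_buffer *max_buf, struct ring_buffer *buf,
		    int cpu)
{
	int ret = ring_buffer_swap_cpu(max_buf, buf, cpu);

	if (ret == -EBUSY)
		pr_info("cpu %d: swap skipped, commit in progress\n", cpu);

	return ret;	/* other nonzero values were already possible */
}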
@@ -3092,7 +3744,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3092 read = 0; 3744 read = 0;
3093 } else { 3745 } else {
3094 /* update the entry counter */ 3746 /* update the entry counter */
3095 cpu_buffer->read += local_read(&reader->entries); 3747 cpu_buffer->read += rb_page_entries(reader);
3096 3748
3097 /* swap the pages */ 3749 /* swap the pages */
3098 rb_init_page(bpage); 3750 rb_init_page(bpage);
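ring_buffer_read_page() now derives the consumed-entry count from rb_page_entries() so reads stay consistent with the new head-page accounting. For context, a sketch of how the reader-page interface is used; the alloc/free helper signatures are assumed from this kernel series, only ring_buffer_read_page() itself is named in the hunk:

#include <linux/mm.h>
#include <linux/ring_buffer.h>

/* Sketch: pull one page worth of events out of @cpu's buffer. */
static void read_one_page(struct ring_buffer *buffer, int cpu)
{
	void *page = ring_buffer_alloc_read_page(buffer);
	int ret;

	if (!page)
		return;

	/* full = 0: accept a partially filled page */
	ret = ring_buffer_read_page(buffer, &page, PAGE_SIZE, cpu, 0);
	if (ret >= 0)
		;	/* events were copied (or swapped) into @page */

	ring_buffer_free_read_page(buffer, page);
}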
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 3aa0a0dfdfa8..a35925d222ba 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -17,6 +17,7 @@
17#include <linux/writeback.h> 17#include <linux/writeback.h>
18#include <linux/kallsyms.h> 18#include <linux/kallsyms.h>
19#include <linux/seq_file.h> 19#include <linux/seq_file.h>
20#include <linux/smp_lock.h>
20#include <linux/notifier.h> 21#include <linux/notifier.h>
21#include <linux/irqflags.h> 22#include <linux/irqflags.h>
22#include <linux/debugfs.h> 23#include <linux/debugfs.h>
@@ -42,14 +43,11 @@
42 43
43#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE) 44#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE)
44 45
45unsigned long __read_mostly tracing_max_latency;
46unsigned long __read_mostly tracing_thresh;
47
48/* 46/*
49 * On boot up, the ring buffer is set to the minimum size, so that 47 * On boot up, the ring buffer is set to the minimum size, so that
50 * we do not waste memory on systems that are not using tracing. 48 * we do not waste memory on systems that are not using tracing.
51 */ 49 */
52static int ring_buffer_expanded; 50int ring_buffer_expanded;
53 51
54/* 52/*
55 * We need to change this state when a selftest is running. 53 * We need to change this state when a selftest is running.
@@ -63,7 +61,7 @@ static bool __read_mostly tracing_selftest_running;
63/* 61/*
64 * If a tracer is running, we do not want to run SELFTEST. 62 * If a tracer is running, we do not want to run SELFTEST.
65 */ 63 */
66static bool __read_mostly tracing_selftest_disabled; 64bool __read_mostly tracing_selftest_disabled;
67 65
68/* For tracers that don't implement custom flags */ 66/* For tracers that don't implement custom flags */
69static struct tracer_opt dummy_tracer_opt[] = { 67static struct tracer_opt dummy_tracer_opt[] = {
@@ -88,7 +86,7 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set)
88 */ 86 */
89static int tracing_disabled = 1; 87static int tracing_disabled = 1;
90 88
91static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled); 89DEFINE_PER_CPU(local_t, ftrace_cpu_disabled);
92 90
93static inline void ftrace_disable_cpu(void) 91static inline void ftrace_disable_cpu(void)
94{ 92{
@@ -127,13 +125,13 @@ int ftrace_dump_on_oops;
127 125
128static int tracing_set_tracer(const char *buf); 126static int tracing_set_tracer(const char *buf);
129 127
130#define BOOTUP_TRACER_SIZE 100 128#define MAX_TRACER_SIZE 100
131static char bootup_tracer_buf[BOOTUP_TRACER_SIZE] __initdata; 129static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
132static char *default_bootup_tracer; 130static char *default_bootup_tracer;
133 131
134static int __init set_ftrace(char *str) 132static int __init set_ftrace(char *str)
135{ 133{
136 strncpy(bootup_tracer_buf, str, BOOTUP_TRACER_SIZE); 134 strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
137 default_bootup_tracer = bootup_tracer_buf; 135 default_bootup_tracer = bootup_tracer_buf;
138 /* We are using ftrace early, expand it */ 136 /* We are using ftrace early, expand it */
139 ring_buffer_expanded = 1; 137 ring_buffer_expanded = 1;
@@ -171,10 +169,11 @@ static struct trace_array global_trace;
171 169
172static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); 170static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu);
173 171
174int filter_current_check_discard(struct ftrace_event_call *call, void *rec, 172int filter_current_check_discard(struct ring_buffer *buffer,
173 struct ftrace_event_call *call, void *rec,
175 struct ring_buffer_event *event) 174 struct ring_buffer_event *event)
176{ 175{
177 return filter_check_discard(call, rec, global_trace.buffer, event); 176 return filter_check_discard(call, rec, buffer, event);
178} 177}
179EXPORT_SYMBOL_GPL(filter_current_check_discard); 178EXPORT_SYMBOL_GPL(filter_current_check_discard);
180 179
@@ -243,13 +242,6 @@ static struct tracer *trace_types __read_mostly;
243static struct tracer *current_trace __read_mostly; 242static struct tracer *current_trace __read_mostly;
244 243
245/* 244/*
246 * max_tracer_type_len is used to simplify the allocating of
247 * buffers to read userspace tracer names. We keep track of
248 * the longest tracer name registered.
249 */
250static int max_tracer_type_len;
251
252/*
253 * trace_types_lock is used to protect the trace_types list. 245 * trace_types_lock is used to protect the trace_types list.
254 * This lock is also used to keep user access serialized. 246 * This lock is also used to keep user access serialized.
255 * Accesses from userspace will grab this lock while userspace 247 * Accesses from userspace will grab this lock while userspace
@@ -265,6 +257,9 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
265 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | 257 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
266 TRACE_ITER_GRAPH_TIME; 258 TRACE_ITER_GRAPH_TIME;
267 259
260static int trace_stop_count;
261static DEFINE_SPINLOCK(tracing_start_lock);
262
268/** 263/**
269 * trace_wake_up - wake up tasks waiting for trace input 264 * trace_wake_up - wake up tasks waiting for trace input
270 * 265 *
@@ -273,12 +268,18 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
273 */ 268 */
274void trace_wake_up(void) 269void trace_wake_up(void)
275{ 270{
271 int cpu;
272
273 if (trace_flags & TRACE_ITER_BLOCK)
274 return;
276 /* 275 /*
277 * The runqueue_is_locked() can fail, but this is the best we 276 * The runqueue_is_locked() can fail, but this is the best we
278 * have for now: 277 * have for now:
279 */ 278 */
280 if (!(trace_flags & TRACE_ITER_BLOCK) && !runqueue_is_locked()) 279 cpu = get_cpu();
280 if (!runqueue_is_locked(cpu))
281 wake_up(&trace_wait); 281 wake_up(&trace_wait);
282 put_cpu();
282} 283}
283 284
284static int __init set_buf_size(char *str) 285static int __init set_buf_size(char *str)
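trace_wake_up() now pins itself with get_cpu()/put_cpu() and hands the CPU number to runqueue_is_locked(), whose cpu-taking form this call relies on. A standalone sketch of the same pattern; the wait queue and function name are illustrative:

#include <linux/sched.h>
#include <linux/smp.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_wait);

/* Sketch: query per-CPU state without being migrated in between. */
static void poke_waiters(void)
{
	int cpu = get_cpu();		/* disables preemption */

	if (!runqueue_is_locked(cpu))
		wake_up(&demo_wait);
	put_cpu();			/* re-enables preemption */
}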
@@ -322,49 +323,125 @@ static const char *trace_options[] = {
322 "printk-msg-only", 323 "printk-msg-only",
323 "context-info", 324 "context-info",
324 "latency-format", 325 "latency-format",
325 "global-clock",
326 "sleep-time", 326 "sleep-time",
327 "graph-time", 327 "graph-time",
328 NULL 328 NULL
329}; 329};
330 330
331static struct {
332 u64 (*func)(void);
333 const char *name;
334} trace_clocks[] = {
335 { trace_clock_local, "local" },
336 { trace_clock_global, "global" },
337};
338
339int trace_clock_id;
340
331/* 341/*
332 * ftrace_max_lock is used to protect the swapping of buffers 342 * trace_parser_get_init - gets the buffer for trace parser
333 * when taking a max snapshot. The buffers themselves are
334 * protected by per_cpu spinlocks. But the action of the swap
335 * needs its own lock.
336 *
337 * This is defined as a raw_spinlock_t in order to help
338 * with performance when lockdep debugging is enabled.
339 */ 343 */
340static raw_spinlock_t ftrace_max_lock = 344int trace_parser_get_init(struct trace_parser *parser, int size)
341 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 345{
346 memset(parser, 0, sizeof(*parser));
347
348 parser->buffer = kmalloc(size, GFP_KERNEL);
349 if (!parser->buffer)
350 return 1;
351
352 parser->size = size;
353 return 0;
354}
342 355
343/* 356/*
344 * Copy the new maximum trace into the separate maximum-trace 357 * trace_parser_put - frees the buffer for trace parser
345 * structure. (this way the maximum trace is permanently saved,
346 * for later retrieval via /sys/kernel/debug/tracing/latency_trace)
347 */ 358 */
348static void 359void trace_parser_put(struct trace_parser *parser)
349__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
350{ 360{
351 struct trace_array_cpu *data = tr->data[cpu]; 361 kfree(parser->buffer);
362}
352 363
353 max_tr.cpu = cpu; 364/*
354 max_tr.time_start = data->preempt_timestamp; 365 * trace_get_user - reads the user input string separated by space
366 * (matched by isspace(ch))
367 *
368 * For each string found the 'struct trace_parser' is updated,
369 * and the function returns.
370 *
371 * Returns number of bytes read.
372 *
373 * See kernel/trace/trace.h for 'struct trace_parser' details.
374 */
375int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
376 size_t cnt, loff_t *ppos)
377{
378 char ch;
379 size_t read = 0;
380 ssize_t ret;
355 381
356 data = max_tr.data[cpu]; 382 if (!*ppos)
357 data->saved_latency = tracing_max_latency; 383 trace_parser_clear(parser);
358 384
359 memcpy(data->comm, tsk->comm, TASK_COMM_LEN); 385 ret = get_user(ch, ubuf++);
360 data->pid = tsk->pid; 386 if (ret)
361 data->uid = task_uid(tsk); 387 goto out;
362 data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
363 data->policy = tsk->policy;
364 data->rt_priority = tsk->rt_priority;
365 388
366 /* record this tasks comm */ 389 read++;
367 tracing_record_cmdline(tsk); 390 cnt--;
391
392 /*
393 * The parser is not finished with the last write,
394 * continue reading the user input without skipping spaces.
395 */
396 if (!parser->cont) {
397 /* skip white space */
398 while (cnt && isspace(ch)) {
399 ret = get_user(ch, ubuf++);
400 if (ret)
401 goto out;
402 read++;
403 cnt--;
404 }
405
406 /* only spaces were written */
407 if (isspace(ch)) {
408 *ppos += read;
409 ret = read;
410 goto out;
411 }
412
413 parser->idx = 0;
414 }
415
416 /* read the non-space input */
417 while (cnt && !isspace(ch)) {
418 if (parser->idx < parser->size)
419 parser->buffer[parser->idx++] = ch;
420 else {
421 ret = -EINVAL;
422 goto out;
423 }
424 ret = get_user(ch, ubuf++);
425 if (ret)
426 goto out;
427 read++;
428 cnt--;
429 }
430
431 /* We either got finished input or we have to wait for another call. */
432 if (isspace(ch)) {
433 parser->buffer[parser->idx] = 0;
434 parser->cont = false;
435 } else {
436 parser->cont = true;
437 parser->buffer[parser->idx++] = ch;
438 }
439
440 *ppos += read;
441 ret = read;
442
443out:
444 return ret;
368} 445}
369 446
370ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) 447ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
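trace_get_user() hands back one whitespace-delimited token per call, setting parser->cont when a word is split across writes so the next call can continue it. A hedged sketch of a debugfs ->write() handler consuming it; the handler and buffer size are assumptions, the parser fields and helpers are the ones added above (struct trace_parser itself lives in kernel/trace/trace.h, as the comment notes):

#include <linux/fs.h>
#include <linux/kernel.h>

/* Sketch: accept one space-separated word from userspace per write(). */
static ssize_t demo_write(struct file *filp, const char __user *ubuf,
			  size_t cnt, loff_t *ppos)
{
	struct trace_parser parser;
	ssize_t read;

	if (trace_parser_get_init(&parser, 64))
		return -ENOMEM;

	read = trace_get_user(&parser, ubuf, cnt, ppos);
	if (read > 0 && parser.idx && !parser.cont)
		pr_info("got token: %s\n", parser.buffer);

	trace_parser_put(&parser);
	return read;
}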
@@ -410,6 +487,56 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
410 return cnt; 487 return cnt;
411} 488}
412 489
490/*
491 * ftrace_max_lock is used to protect the swapping of buffers
492 * when taking a max snapshot. The buffers themselves are
493 * protected by per_cpu spinlocks. But the action of the swap
494 * needs its own lock.
495 *
496 * This is defined as a raw_spinlock_t in order to help
497 * with performance when lockdep debugging is enabled.
498 *
499 * It is also used in other places outside the update_max_tr
500 * so it needs to be defined outside of the
501 * CONFIG_TRACER_MAX_TRACE.
502 */
503static raw_spinlock_t ftrace_max_lock =
504 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
505
506#ifdef CONFIG_TRACER_MAX_TRACE
507unsigned long __read_mostly tracing_max_latency;
508unsigned long __read_mostly tracing_thresh;
509
510/*
511 * Copy the new maximum trace into the separate maximum-trace
512 * structure. (this way the maximum trace is permanently saved,
513 * for later retrieval via /sys/kernel/debug/tracing/latency_trace)
514 */
515static void
516__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
517{
518 struct trace_array_cpu *data = tr->data[cpu];
519 struct trace_array_cpu *max_data = tr->data[cpu];
520
521 max_tr.cpu = cpu;
522 max_tr.time_start = data->preempt_timestamp;
523
524 max_data = max_tr.data[cpu];
525 max_data->saved_latency = tracing_max_latency;
526 max_data->critical_start = data->critical_start;
527 max_data->critical_end = data->critical_end;
528
529 memcpy(data->comm, tsk->comm, TASK_COMM_LEN);
530 max_data->pid = tsk->pid;
531 max_data->uid = task_uid(tsk);
532 max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
533 max_data->policy = tsk->policy;
534 max_data->rt_priority = tsk->rt_priority;
535
536 /* record this tasks comm */
537 tracing_record_cmdline(tsk);
538}
539
413/** 540/**
414 * update_max_tr - snapshot all trace buffers from global_trace to max_tr 541 * update_max_tr - snapshot all trace buffers from global_trace to max_tr
415 * @tr: tracer 542 * @tr: tracer
@@ -424,16 +551,15 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
424{ 551{
425 struct ring_buffer *buf = tr->buffer; 552 struct ring_buffer *buf = tr->buffer;
426 553
554 if (trace_stop_count)
555 return;
556
427 WARN_ON_ONCE(!irqs_disabled()); 557 WARN_ON_ONCE(!irqs_disabled());
428 __raw_spin_lock(&ftrace_max_lock); 558 __raw_spin_lock(&ftrace_max_lock);
429 559
430 tr->buffer = max_tr.buffer; 560 tr->buffer = max_tr.buffer;
431 max_tr.buffer = buf; 561 max_tr.buffer = buf;
432 562
433 ftrace_disable_cpu();
434 ring_buffer_reset(tr->buffer);
435 ftrace_enable_cpu();
436
437 __update_max_tr(tr, tsk, cpu); 563 __update_max_tr(tr, tsk, cpu);
438 __raw_spin_unlock(&ftrace_max_lock); 564 __raw_spin_unlock(&ftrace_max_lock);
439} 565}
@@ -451,21 +577,35 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
451{ 577{
452 int ret; 578 int ret;
453 579
580 if (trace_stop_count)
581 return;
582
454 WARN_ON_ONCE(!irqs_disabled()); 583 WARN_ON_ONCE(!irqs_disabled());
455 __raw_spin_lock(&ftrace_max_lock); 584 __raw_spin_lock(&ftrace_max_lock);
456 585
457 ftrace_disable_cpu(); 586 ftrace_disable_cpu();
458 587
459 ring_buffer_reset(max_tr.buffer);
460 ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); 588 ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu);
461 589
590 if (ret == -EBUSY) {
591 /*
592 * We failed to swap the buffer due to a commit taking
593 * place on this CPU. We fail to record, but we reset
594 * the max trace buffer (no one writes directly to it)
595 * and flag that it failed.
596 */
597 trace_array_printk(&max_tr, _THIS_IP_,
598 "Failed to swap buffers due to commit in progress\n");
599 }
600
462 ftrace_enable_cpu(); 601 ftrace_enable_cpu();
463 602
464 WARN_ON_ONCE(ret && ret != -EAGAIN); 603 WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);
465 604
466 __update_max_tr(tr, tsk, cpu); 605 __update_max_tr(tr, tsk, cpu);
467 __raw_spin_unlock(&ftrace_max_lock); 606 __raw_spin_unlock(&ftrace_max_lock);
468} 607}
608#endif /* CONFIG_TRACER_MAX_TRACE */
469 609
470/** 610/**
471 * register_tracer - register a tracer with the ftrace system. 611 * register_tracer - register a tracer with the ftrace system.
@@ -478,7 +618,6 @@ __releases(kernel_lock)
478__acquires(kernel_lock) 618__acquires(kernel_lock)
479{ 619{
480 struct tracer *t; 620 struct tracer *t;
481 int len;
482 int ret = 0; 621 int ret = 0;
483 622
484 if (!type->name) { 623 if (!type->name) {
@@ -486,6 +625,11 @@ __acquires(kernel_lock)
486 return -1; 625 return -1;
487 } 626 }
488 627
628 if (strlen(type->name) > MAX_TRACER_SIZE) {
629 pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE);
630 return -1;
631 }
632
489 /* 633 /*
490 * When this gets called we hold the BKL which means that 634 * When this gets called we hold the BKL which means that
491 * preemption is disabled. Various trace selftests however 635 * preemption is disabled. Various trace selftests however
@@ -500,7 +644,7 @@ __acquires(kernel_lock)
500 for (t = trace_types; t; t = t->next) { 644 for (t = trace_types; t; t = t->next) {
501 if (strcmp(type->name, t->name) == 0) { 645 if (strcmp(type->name, t->name) == 0) {
502 /* already found */ 646 /* already found */
503 pr_info("Trace %s already registered\n", 647 pr_info("Tracer %s already registered\n",
504 type->name); 648 type->name);
505 ret = -1; 649 ret = -1;
506 goto out; 650 goto out;
@@ -522,7 +666,6 @@ __acquires(kernel_lock)
522 if (type->selftest && !tracing_selftest_disabled) { 666 if (type->selftest && !tracing_selftest_disabled) {
523 struct tracer *saved_tracer = current_trace; 667 struct tracer *saved_tracer = current_trace;
524 struct trace_array *tr = &global_trace; 668 struct trace_array *tr = &global_trace;
525 int i;
526 669
527 /* 670 /*
528 * Run a selftest on this tracer. 671 * Run a selftest on this tracer.
@@ -531,8 +674,7 @@ __acquires(kernel_lock)
531 * internal tracing to verify that everything is in order. 674 * internal tracing to verify that everything is in order.
532 * If we fail, we do not register this tracer. 675 * If we fail, we do not register this tracer.
533 */ 676 */
534 for_each_tracing_cpu(i) 677 tracing_reset_online_cpus(tr);
535 tracing_reset(tr, i);
536 678
537 current_trace = type; 679 current_trace = type;
538 /* the test is responsible for initializing and enabling */ 680 /* the test is responsible for initializing and enabling */
@@ -545,8 +687,7 @@ __acquires(kernel_lock)
545 goto out; 687 goto out;
546 } 688 }
547 /* Only reset on passing, to avoid touching corrupted buffers */ 689 /* Only reset on passing, to avoid touching corrupted buffers */
548 for_each_tracing_cpu(i) 690 tracing_reset_online_cpus(tr);
549 tracing_reset(tr, i);
550 691
551 printk(KERN_CONT "PASSED\n"); 692 printk(KERN_CONT "PASSED\n");
552 } 693 }
@@ -554,9 +695,6 @@ __acquires(kernel_lock)
554 695
555 type->next = trace_types; 696 type->next = trace_types;
556 trace_types = type; 697 trace_types = type;
557 len = strlen(type->name);
558 if (len > max_tracer_type_len)
559 max_tracer_type_len = len;
560 698
561 out: 699 out:
562 tracing_selftest_running = false; 700 tracing_selftest_running = false;
@@ -565,7 +703,7 @@ __acquires(kernel_lock)
565 if (ret || !default_bootup_tracer) 703 if (ret || !default_bootup_tracer)
566 goto out_unlock; 704 goto out_unlock;
567 705
568 if (strncmp(default_bootup_tracer, type->name, BOOTUP_TRACER_SIZE)) 706 if (strncmp(default_bootup_tracer, type->name, MAX_TRACER_SIZE))
569 goto out_unlock; 707 goto out_unlock;
570 708
571 printk(KERN_INFO "Starting tracer '%s'\n", type->name); 709 printk(KERN_INFO "Starting tracer '%s'\n", type->name);
@@ -587,14 +725,13 @@ __acquires(kernel_lock)
587void unregister_tracer(struct tracer *type) 725void unregister_tracer(struct tracer *type)
588{ 726{
589 struct tracer **t; 727 struct tracer **t;
590 int len;
591 728
592 mutex_lock(&trace_types_lock); 729 mutex_lock(&trace_types_lock);
593 for (t = &trace_types; *t; t = &(*t)->next) { 730 for (t = &trace_types; *t; t = &(*t)->next) {
594 if (*t == type) 731 if (*t == type)
595 goto found; 732 goto found;
596 } 733 }
597 pr_info("Trace %s not registered\n", type->name); 734 pr_info("Tracer %s not registered\n", type->name);
598 goto out; 735 goto out;
599 736
600 found: 737 found:
@@ -607,35 +744,46 @@ void unregister_tracer(struct tracer *type)
607 current_trace->stop(&global_trace); 744 current_trace->stop(&global_trace);
608 current_trace = &nop_trace; 745 current_trace = &nop_trace;
609 } 746 }
610 747out:
611 if (strlen(type->name) != max_tracer_type_len)
612 goto out;
613
614 max_tracer_type_len = 0;
615 for (t = &trace_types; *t; t = &(*t)->next) {
616 len = strlen((*t)->name);
617 if (len > max_tracer_type_len)
618 max_tracer_type_len = len;
619 }
620 out:
621 mutex_unlock(&trace_types_lock); 748 mutex_unlock(&trace_types_lock);
622} 749}
623 750
624void tracing_reset(struct trace_array *tr, int cpu) 751static void __tracing_reset(struct trace_array *tr, int cpu)
625{ 752{
626 ftrace_disable_cpu(); 753 ftrace_disable_cpu();
627 ring_buffer_reset_cpu(tr->buffer, cpu); 754 ring_buffer_reset_cpu(tr->buffer, cpu);
628 ftrace_enable_cpu(); 755 ftrace_enable_cpu();
629} 756}
630 757
758void tracing_reset(struct trace_array *tr, int cpu)
759{
760 struct ring_buffer *buffer = tr->buffer;
761
762 ring_buffer_record_disable(buffer);
763
764 /* Make sure all commits have finished */
765 synchronize_sched();
766 __tracing_reset(tr, cpu);
767
768 ring_buffer_record_enable(buffer);
769}
770
631void tracing_reset_online_cpus(struct trace_array *tr) 771void tracing_reset_online_cpus(struct trace_array *tr)
632{ 772{
773 struct ring_buffer *buffer = tr->buffer;
633 int cpu; 774 int cpu;
634 775
776 ring_buffer_record_disable(buffer);
777
778 /* Make sure all commits have finished */
779 synchronize_sched();
780
635 tr->time_start = ftrace_now(tr->cpu); 781 tr->time_start = ftrace_now(tr->cpu);
636 782
637 for_each_online_cpu(cpu) 783 for_each_online_cpu(cpu)
638 tracing_reset(tr, cpu); 784 __tracing_reset(tr, cpu);
785
786 ring_buffer_record_enable(buffer);
639} 787}
640 788
641void tracing_reset_current(int cpu) 789void tracing_reset_current(int cpu)
@@ -666,8 +814,10 @@ static void trace_init_cmdlines(void)
666 cmdline_idx = 0; 814 cmdline_idx = 0;
667} 815}
668 816
669static int trace_stop_count; 817int is_tracing_stopped(void)
670static DEFINE_SPINLOCK(tracing_start_lock); 818{
819 return trace_stop_count;
820}
671 821
672/** 822/**
673 * ftrace_off_permanent - disable all ftrace code permanently 823 * ftrace_off_permanent - disable all ftrace code permanently
@@ -836,7 +986,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
836 986
837 entry->preempt_count = pc & 0xff; 987 entry->preempt_count = pc & 0xff;
838 entry->pid = (tsk) ? tsk->pid : 0; 988 entry->pid = (tsk) ? tsk->pid : 0;
839 entry->tgid = (tsk) ? tsk->tgid : 0; 989 entry->lock_depth = (tsk) ? tsk->lock_depth : 0;
840 entry->flags = 990 entry->flags =
841#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT 991#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
842 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | 992 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -847,15 +997,17 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
847 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | 997 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
848 (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); 998 (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);
849} 999}
1000EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
850 1001
851struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, 1002struct ring_buffer_event *
852 int type, 1003trace_buffer_lock_reserve(struct ring_buffer *buffer,
853 unsigned long len, 1004 int type,
854 unsigned long flags, int pc) 1005 unsigned long len,
1006 unsigned long flags, int pc)
855{ 1007{
856 struct ring_buffer_event *event; 1008 struct ring_buffer_event *event;
857 1009
858 event = ring_buffer_lock_reserve(tr->buffer, len); 1010 event = ring_buffer_lock_reserve(buffer, len);
859 if (event != NULL) { 1011 if (event != NULL) {
860 struct trace_entry *ent = ring_buffer_event_data(event); 1012 struct trace_entry *ent = ring_buffer_event_data(event);
861 1013
@@ -865,58 +1017,60 @@ struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
865 1017
866 return event; 1018 return event;
867} 1019}
868static void ftrace_trace_stack(struct trace_array *tr,
869 unsigned long flags, int skip, int pc);
870static void ftrace_trace_userstack(struct trace_array *tr,
871 unsigned long flags, int pc);
872 1020
873static inline void __trace_buffer_unlock_commit(struct trace_array *tr, 1021static inline void
874 struct ring_buffer_event *event, 1022__trace_buffer_unlock_commit(struct ring_buffer *buffer,
875 unsigned long flags, int pc, 1023 struct ring_buffer_event *event,
876 int wake) 1024 unsigned long flags, int pc,
1025 int wake)
877{ 1026{
878 ring_buffer_unlock_commit(tr->buffer, event); 1027 ring_buffer_unlock_commit(buffer, event);
879 1028
880 ftrace_trace_stack(tr, flags, 6, pc); 1029 ftrace_trace_stack(buffer, flags, 6, pc);
881 ftrace_trace_userstack(tr, flags, pc); 1030 ftrace_trace_userstack(buffer, flags, pc);
882 1031
883 if (wake) 1032 if (wake)
884 trace_wake_up(); 1033 trace_wake_up();
885} 1034}
886 1035
887void trace_buffer_unlock_commit(struct trace_array *tr, 1036void trace_buffer_unlock_commit(struct ring_buffer *buffer,
888 struct ring_buffer_event *event, 1037 struct ring_buffer_event *event,
889 unsigned long flags, int pc) 1038 unsigned long flags, int pc)
890{ 1039{
891 __trace_buffer_unlock_commit(tr, event, flags, pc, 1); 1040 __trace_buffer_unlock_commit(buffer, event, flags, pc, 1);
892} 1041}
893 1042
894struct ring_buffer_event * 1043struct ring_buffer_event *
895trace_current_buffer_lock_reserve(int type, unsigned long len, 1044trace_current_buffer_lock_reserve(struct ring_buffer **current_rb,
1045 int type, unsigned long len,
896 unsigned long flags, int pc) 1046 unsigned long flags, int pc)
897{ 1047{
898 return trace_buffer_lock_reserve(&global_trace, 1048 *current_rb = global_trace.buffer;
1049 return trace_buffer_lock_reserve(*current_rb,
899 type, len, flags, pc); 1050 type, len, flags, pc);
900} 1051}
901EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve); 1052EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve);
902 1053
903void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, 1054void trace_current_buffer_unlock_commit(struct ring_buffer *buffer,
1055 struct ring_buffer_event *event,
904 unsigned long flags, int pc) 1056 unsigned long flags, int pc)
905{ 1057{
906 __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1); 1058 __trace_buffer_unlock_commit(buffer, event, flags, pc, 1);
907} 1059}
908EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit); 1060EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit);
909 1061
910void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, 1062void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer,
911 unsigned long flags, int pc) 1063 struct ring_buffer_event *event,
1064 unsigned long flags, int pc)
912{ 1065{
913 __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0); 1066 __trace_buffer_unlock_commit(buffer, event, flags, pc, 0);
914} 1067}
915EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); 1068EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit);
916 1069
917void trace_current_buffer_discard_commit(struct ring_buffer_event *event) 1070void trace_current_buffer_discard_commit(struct ring_buffer *buffer,
1071 struct ring_buffer_event *event)
918{ 1072{
919 ring_buffer_discard_commit(global_trace.buffer, event); 1073 ring_buffer_discard_commit(buffer, event);
920} 1074}
921EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit); 1075EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit);
922 1076
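The commit-side helpers above now carry the ring buffer explicitly: the reserve call returns the buffer through *current_rb and that same pointer has to be passed back to the commit or discard. A hedged producer sketch using that shape; TRACE_PRINT and struct print_entry (with its ip and buf[] members) are assumed from kernel/trace/trace.h, the payload is illustrative:

#include <linux/kernel.h>
#include <linux/string.h>

/* Sketch: reserve, fill and commit one event with the buffer-based API. */
static void emit_demo_event(unsigned long flags, int pc)
{
	struct ring_buffer *buffer;
	struct ring_buffer_event *event;
	struct print_entry *entry;

	event = trace_current_buffer_lock_reserve(&buffer, TRACE_PRINT,
						  sizeof(*entry) + 8,
						  flags, pc);
	if (!event)
		return;

	entry = ring_buffer_event_data(event);
	entry->ip = _THIS_IP_;
	strcpy(entry->buf, "demo\n");	/* 6 bytes, within the 8 reserved */

	trace_current_buffer_unlock_commit(buffer, event, flags, pc);
}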
@@ -926,6 +1080,7 @@ trace_function(struct trace_array *tr,
926 int pc) 1080 int pc)
927{ 1081{
928 struct ftrace_event_call *call = &event_function; 1082 struct ftrace_event_call *call = &event_function;
1083 struct ring_buffer *buffer = tr->buffer;
929 struct ring_buffer_event *event; 1084 struct ring_buffer_event *event;
930 struct ftrace_entry *entry; 1085 struct ftrace_entry *entry;
931 1086
@@ -933,7 +1088,7 @@ trace_function(struct trace_array *tr,
933 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) 1088 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
934 return; 1089 return;
935 1090
936 event = trace_buffer_lock_reserve(tr, TRACE_FN, sizeof(*entry), 1091 event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
937 flags, pc); 1092 flags, pc);
938 if (!event) 1093 if (!event)
939 return; 1094 return;
@@ -941,58 +1096,10 @@ trace_function(struct trace_array *tr,
941 entry->ip = ip; 1096 entry->ip = ip;
942 entry->parent_ip = parent_ip; 1097 entry->parent_ip = parent_ip;
943 1098
944 if (!filter_check_discard(call, entry, tr->buffer, event)) 1099 if (!filter_check_discard(call, entry, buffer, event))
945 ring_buffer_unlock_commit(tr->buffer, event); 1100 ring_buffer_unlock_commit(buffer, event);
946} 1101}
947 1102
948#ifdef CONFIG_FUNCTION_GRAPH_TRACER
949static int __trace_graph_entry(struct trace_array *tr,
950 struct ftrace_graph_ent *trace,
951 unsigned long flags,
952 int pc)
953{
954 struct ftrace_event_call *call = &event_funcgraph_entry;
955 struct ring_buffer_event *event;
956 struct ftrace_graph_ent_entry *entry;
957
958 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
959 return 0;
960
961 event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_ENT,
962 sizeof(*entry), flags, pc);
963 if (!event)
964 return 0;
965 entry = ring_buffer_event_data(event);
966 entry->graph_ent = *trace;
967 if (!filter_current_check_discard(call, entry, event))
968 ring_buffer_unlock_commit(global_trace.buffer, event);
969
970 return 1;
971}
972
973static void __trace_graph_return(struct trace_array *tr,
974 struct ftrace_graph_ret *trace,
975 unsigned long flags,
976 int pc)
977{
978 struct ftrace_event_call *call = &event_funcgraph_exit;
979 struct ring_buffer_event *event;
980 struct ftrace_graph_ret_entry *entry;
981
982 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
983 return;
984
985 event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_RET,
986 sizeof(*entry), flags, pc);
987 if (!event)
988 return;
989 entry = ring_buffer_event_data(event);
990 entry->ret = *trace;
991 if (!filter_current_check_discard(call, entry, event))
992 ring_buffer_unlock_commit(global_trace.buffer, event);
993}
994#endif
995
996void 1103void
997ftrace(struct trace_array *tr, struct trace_array_cpu *data, 1104ftrace(struct trace_array *tr, struct trace_array_cpu *data,
998 unsigned long ip, unsigned long parent_ip, unsigned long flags, 1105 unsigned long ip, unsigned long parent_ip, unsigned long flags,
@@ -1002,17 +1109,17 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data,
1002 trace_function(tr, ip, parent_ip, flags, pc); 1109 trace_function(tr, ip, parent_ip, flags, pc);
1003} 1110}
1004 1111
1005static void __ftrace_trace_stack(struct trace_array *tr, 1112#ifdef CONFIG_STACKTRACE
1113static void __ftrace_trace_stack(struct ring_buffer *buffer,
1006 unsigned long flags, 1114 unsigned long flags,
1007 int skip, int pc) 1115 int skip, int pc)
1008{ 1116{
1009#ifdef CONFIG_STACKTRACE
1010 struct ftrace_event_call *call = &event_kernel_stack; 1117 struct ftrace_event_call *call = &event_kernel_stack;
1011 struct ring_buffer_event *event; 1118 struct ring_buffer_event *event;
1012 struct stack_entry *entry; 1119 struct stack_entry *entry;
1013 struct stack_trace trace; 1120 struct stack_trace trace;
1014 1121
1015 event = trace_buffer_lock_reserve(tr, TRACE_STACK, 1122 event = trace_buffer_lock_reserve(buffer, TRACE_STACK,
1016 sizeof(*entry), flags, pc); 1123 sizeof(*entry), flags, pc);
1017 if (!event) 1124 if (!event)
1018 return; 1125 return;
@@ -1025,32 +1132,28 @@ static void __ftrace_trace_stack(struct trace_array *tr,
1025 trace.entries = entry->caller; 1132 trace.entries = entry->caller;
1026 1133
1027 save_stack_trace(&trace); 1134 save_stack_trace(&trace);
1028 if (!filter_check_discard(call, entry, tr->buffer, event)) 1135 if (!filter_check_discard(call, entry, buffer, event))
1029 ring_buffer_unlock_commit(tr->buffer, event); 1136 ring_buffer_unlock_commit(buffer, event);
1030#endif
1031} 1137}
1032 1138
1033static void ftrace_trace_stack(struct trace_array *tr, 1139void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
1034 unsigned long flags, 1140 int skip, int pc)
1035 int skip, int pc)
1036{ 1141{
1037 if (!(trace_flags & TRACE_ITER_STACKTRACE)) 1142 if (!(trace_flags & TRACE_ITER_STACKTRACE))
1038 return; 1143 return;
1039 1144
1040 __ftrace_trace_stack(tr, flags, skip, pc); 1145 __ftrace_trace_stack(buffer, flags, skip, pc);
1041} 1146}
1042 1147
1043void __trace_stack(struct trace_array *tr, 1148void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
1044 unsigned long flags, 1149 int pc)
1045 int skip, int pc)
1046{ 1150{
1047 __ftrace_trace_stack(tr, flags, skip, pc); 1151 __ftrace_trace_stack(tr->buffer, flags, skip, pc);
1048} 1152}
1049 1153
1050static void ftrace_trace_userstack(struct trace_array *tr, 1154void
1051 unsigned long flags, int pc) 1155ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1052{ 1156{
1053#ifdef CONFIG_STACKTRACE
1054 struct ftrace_event_call *call = &event_user_stack; 1157 struct ftrace_event_call *call = &event_user_stack;
1055 struct ring_buffer_event *event; 1158 struct ring_buffer_event *event;
1056 struct userstack_entry *entry; 1159 struct userstack_entry *entry;
@@ -1059,12 +1162,13 @@ static void ftrace_trace_userstack(struct trace_array *tr,
1059 if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) 1162 if (!(trace_flags & TRACE_ITER_USERSTACKTRACE))
1060 return; 1163 return;
1061 1164
1062 event = trace_buffer_lock_reserve(tr, TRACE_USER_STACK, 1165 event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
1063 sizeof(*entry), flags, pc); 1166 sizeof(*entry), flags, pc);
1064 if (!event) 1167 if (!event)
1065 return; 1168 return;
1066 entry = ring_buffer_event_data(event); 1169 entry = ring_buffer_event_data(event);
1067 1170
1171 entry->tgid = current->tgid;
1068 memset(&entry->caller, 0, sizeof(entry->caller)); 1172 memset(&entry->caller, 0, sizeof(entry->caller));
1069 1173
1070 trace.nr_entries = 0; 1174 trace.nr_entries = 0;
@@ -1073,9 +1177,8 @@ static void ftrace_trace_userstack(struct trace_array *tr,
1073 trace.entries = entry->caller; 1177 trace.entries = entry->caller;
1074 1178
1075 save_stack_trace_user(&trace); 1179 save_stack_trace_user(&trace);
1076 if (!filter_check_discard(call, entry, tr->buffer, event)) 1180 if (!filter_check_discard(call, entry, buffer, event))
1077 ring_buffer_unlock_commit(tr->buffer, event); 1181 ring_buffer_unlock_commit(buffer, event);
1078#endif
1079} 1182}
1080 1183
1081#ifdef UNUSED 1184#ifdef UNUSED
@@ -1085,16 +1188,20 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags)
1085} 1188}
1086#endif /* UNUSED */ 1189#endif /* UNUSED */
1087 1190
1191#endif /* CONFIG_STACKTRACE */
1192
1088static void 1193static void
1089ftrace_trace_special(void *__tr, 1194ftrace_trace_special(void *__tr,
1090 unsigned long arg1, unsigned long arg2, unsigned long arg3, 1195 unsigned long arg1, unsigned long arg2, unsigned long arg3,
1091 int pc) 1196 int pc)
1092{ 1197{
1198 struct ftrace_event_call *call = &event_special;
1093 struct ring_buffer_event *event; 1199 struct ring_buffer_event *event;
1094 struct trace_array *tr = __tr; 1200 struct trace_array *tr = __tr;
1201 struct ring_buffer *buffer = tr->buffer;
1095 struct special_entry *entry; 1202 struct special_entry *entry;
1096 1203
1097 event = trace_buffer_lock_reserve(tr, TRACE_SPECIAL, 1204 event = trace_buffer_lock_reserve(buffer, TRACE_SPECIAL,
1098 sizeof(*entry), 0, pc); 1205 sizeof(*entry), 0, pc);
1099 if (!event) 1206 if (!event)
1100 return; 1207 return;
@@ -1102,7 +1209,9 @@ ftrace_trace_special(void *__tr,
1102 entry->arg1 = arg1; 1209 entry->arg1 = arg1;
1103 entry->arg2 = arg2; 1210 entry->arg2 = arg2;
1104 entry->arg3 = arg3; 1211 entry->arg3 = arg3;
1105 trace_buffer_unlock_commit(tr, event, 0, pc); 1212
1213 if (!filter_check_discard(call, entry, buffer, event))
1214 trace_buffer_unlock_commit(buffer, event, 0, pc);
1106} 1215}
1107 1216
1108void 1217void
@@ -1113,62 +1222,6 @@ __trace_special(void *__tr, void *__data,
1113} 1222}
1114 1223
1115void 1224void
1116tracing_sched_switch_trace(struct trace_array *tr,
1117 struct task_struct *prev,
1118 struct task_struct *next,
1119 unsigned long flags, int pc)
1120{
1121 struct ftrace_event_call *call = &event_context_switch;
1122 struct ring_buffer_event *event;
1123 struct ctx_switch_entry *entry;
1124
1125 event = trace_buffer_lock_reserve(tr, TRACE_CTX,
1126 sizeof(*entry), flags, pc);
1127 if (!event)
1128 return;
1129 entry = ring_buffer_event_data(event);
1130 entry->prev_pid = prev->pid;
1131 entry->prev_prio = prev->prio;
1132 entry->prev_state = prev->state;
1133 entry->next_pid = next->pid;
1134 entry->next_prio = next->prio;
1135 entry->next_state = next->state;
1136 entry->next_cpu = task_cpu(next);
1137
1138 if (!filter_check_discard(call, entry, tr->buffer, event))
1139 trace_buffer_unlock_commit(tr, event, flags, pc);
1140}
1141
1142void
1143tracing_sched_wakeup_trace(struct trace_array *tr,
1144 struct task_struct *wakee,
1145 struct task_struct *curr,
1146 unsigned long flags, int pc)
1147{
1148 struct ftrace_event_call *call = &event_wakeup;
1149 struct ring_buffer_event *event;
1150 struct ctx_switch_entry *entry;
1151
1152 event = trace_buffer_lock_reserve(tr, TRACE_WAKE,
1153 sizeof(*entry), flags, pc);
1154 if (!event)
1155 return;
1156 entry = ring_buffer_event_data(event);
1157 entry->prev_pid = curr->pid;
1158 entry->prev_prio = curr->prio;
1159 entry->prev_state = curr->state;
1160 entry->next_pid = wakee->pid;
1161 entry->next_prio = wakee->prio;
1162 entry->next_state = wakee->state;
1163 entry->next_cpu = task_cpu(wakee);
1164
1165 if (!filter_check_discard(call, entry, tr->buffer, event))
1166 ring_buffer_unlock_commit(tr->buffer, event);
1167 ftrace_trace_stack(tr, flags, 6, pc);
1168 ftrace_trace_userstack(tr, flags, pc);
1169}
1170
1171void
1172ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) 1225ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
1173{ 1226{
1174 struct trace_array *tr = &global_trace; 1227 struct trace_array *tr = &global_trace;
@@ -1192,68 +1245,6 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
1192 local_irq_restore(flags); 1245 local_irq_restore(flags);
1193} 1246}
1194 1247
1195#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1196int trace_graph_entry(struct ftrace_graph_ent *trace)
1197{
1198 struct trace_array *tr = &global_trace;
1199 struct trace_array_cpu *data;
1200 unsigned long flags;
1201 long disabled;
1202 int ret;
1203 int cpu;
1204 int pc;
1205
1206 if (!ftrace_trace_task(current))
1207 return 0;
1208
1209 if (!ftrace_graph_addr(trace->func))
1210 return 0;
1211
1212 local_irq_save(flags);
1213 cpu = raw_smp_processor_id();
1214 data = tr->data[cpu];
1215 disabled = atomic_inc_return(&data->disabled);
1216 if (likely(disabled == 1)) {
1217 pc = preempt_count();
1218 ret = __trace_graph_entry(tr, trace, flags, pc);
1219 } else {
1220 ret = 0;
1221 }
1222 /* Only do the atomic if it is not already set */
1223 if (!test_tsk_trace_graph(current))
1224 set_tsk_trace_graph(current);
1225
1226 atomic_dec(&data->disabled);
1227 local_irq_restore(flags);
1228
1229 return ret;
1230}
1231
1232void trace_graph_return(struct ftrace_graph_ret *trace)
1233{
1234 struct trace_array *tr = &global_trace;
1235 struct trace_array_cpu *data;
1236 unsigned long flags;
1237 long disabled;
1238 int cpu;
1239 int pc;
1240
1241 local_irq_save(flags);
1242 cpu = raw_smp_processor_id();
1243 data = tr->data[cpu];
1244 disabled = atomic_inc_return(&data->disabled);
1245 if (likely(disabled == 1)) {
1246 pc = preempt_count();
1247 __trace_graph_return(tr, trace, flags, pc);
1248 }
1249 if (!trace->depth)
1250 clear_tsk_trace_graph(current);
1251 atomic_dec(&data->disabled);
1252 local_irq_restore(flags);
1253}
1254#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
1255
1256
1257/** 1248/**
1258 * trace_vbprintk - write binary msg to tracing buffer 1249 * trace_vbprintk - write binary msg to tracing buffer
1259 * 1250 *
@@ -1266,6 +1257,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1266 1257
1267 struct ftrace_event_call *call = &event_bprint; 1258 struct ftrace_event_call *call = &event_bprint;
1268 struct ring_buffer_event *event; 1259 struct ring_buffer_event *event;
1260 struct ring_buffer *buffer;
1269 struct trace_array *tr = &global_trace; 1261 struct trace_array *tr = &global_trace;
1270 struct trace_array_cpu *data; 1262 struct trace_array_cpu *data;
1271 struct bprint_entry *entry; 1263 struct bprint_entry *entry;
@@ -1298,7 +1290,9 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1298 goto out_unlock; 1290 goto out_unlock;
1299 1291
1300 size = sizeof(*entry) + sizeof(u32) * len; 1292 size = sizeof(*entry) + sizeof(u32) * len;
1301 event = trace_buffer_lock_reserve(tr, TRACE_BPRINT, size, flags, pc); 1293 buffer = tr->buffer;
1294 event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
1295 flags, pc);
1302 if (!event) 1296 if (!event)
1303 goto out_unlock; 1297 goto out_unlock;
1304 entry = ring_buffer_event_data(event); 1298 entry = ring_buffer_event_data(event);
@@ -1306,8 +1300,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1306 entry->fmt = fmt; 1300 entry->fmt = fmt;
1307 1301
1308 memcpy(entry->buf, trace_buf, sizeof(u32) * len); 1302 memcpy(entry->buf, trace_buf, sizeof(u32) * len);
1309 if (!filter_check_discard(call, entry, tr->buffer, event)) 1303 if (!filter_check_discard(call, entry, buffer, event))
1310 ring_buffer_unlock_commit(tr->buffer, event); 1304 ring_buffer_unlock_commit(buffer, event);
1311 1305
1312out_unlock: 1306out_unlock:
1313 __raw_spin_unlock(&trace_buf_lock); 1307 __raw_spin_unlock(&trace_buf_lock);
@@ -1322,14 +1316,30 @@ out:
1322} 1316}
1323EXPORT_SYMBOL_GPL(trace_vbprintk); 1317EXPORT_SYMBOL_GPL(trace_vbprintk);
1324 1318
1325int trace_vprintk(unsigned long ip, const char *fmt, va_list args) 1319int trace_array_printk(struct trace_array *tr,
1320 unsigned long ip, const char *fmt, ...)
1321{
1322 int ret;
1323 va_list ap;
1324
1325 if (!(trace_flags & TRACE_ITER_PRINTK))
1326 return 0;
1327
1328 va_start(ap, fmt);
1329 ret = trace_array_vprintk(tr, ip, fmt, ap);
1330 va_end(ap);
1331 return ret;
1332}
1333
1334int trace_array_vprintk(struct trace_array *tr,
1335 unsigned long ip, const char *fmt, va_list args)
1326{ 1336{
1327 static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; 1337 static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED;
1328 static char trace_buf[TRACE_BUF_SIZE]; 1338 static char trace_buf[TRACE_BUF_SIZE];
1329 1339
1330 struct ftrace_event_call *call = &event_print; 1340 struct ftrace_event_call *call = &event_print;
1331 struct ring_buffer_event *event; 1341 struct ring_buffer_event *event;
1332 struct trace_array *tr = &global_trace; 1342 struct ring_buffer *buffer;
1333 struct trace_array_cpu *data; 1343 struct trace_array_cpu *data;
1334 int cpu, len = 0, size, pc; 1344 int cpu, len = 0, size, pc;
1335 struct print_entry *entry; 1345 struct print_entry *entry;
@@ -1357,7 +1367,9 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1357 trace_buf[len] = 0; 1367 trace_buf[len] = 0;
1358 1368
1359 size = sizeof(*entry) + len + 1; 1369 size = sizeof(*entry) + len + 1;
1360 event = trace_buffer_lock_reserve(tr, TRACE_PRINT, size, irq_flags, pc); 1370 buffer = tr->buffer;
1371 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
1372 irq_flags, pc);
1361 if (!event) 1373 if (!event)
1362 goto out_unlock; 1374 goto out_unlock;
1363 entry = ring_buffer_event_data(event); 1375 entry = ring_buffer_event_data(event);
@@ -1365,8 +1377,8 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1365 1377
1366 memcpy(&entry->buf, trace_buf, len); 1378 memcpy(&entry->buf, trace_buf, len);
1367 entry->buf[len] = 0; 1379 entry->buf[len] = 0;
1368 if (!filter_check_discard(call, entry, tr->buffer, event)) 1380 if (!filter_check_discard(call, entry, buffer, event))
1369 ring_buffer_unlock_commit(tr->buffer, event); 1381 ring_buffer_unlock_commit(buffer, event);
1370 1382
1371 out_unlock: 1383 out_unlock:
1372 __raw_spin_unlock(&trace_buf_lock); 1384 __raw_spin_unlock(&trace_buf_lock);
@@ -1378,6 +1390,11 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1378 1390
1379 return len; 1391 return len;
1380} 1392}
1393
1394int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1395{
1396 return trace_array_vprintk(&global_trace, ip, fmt, args);
1397}
1381EXPORT_SYMBOL_GPL(trace_vprintk); 1398EXPORT_SYMBOL_GPL(trace_vprintk);
1382 1399
1383enum trace_file_type { 1400enum trace_file_type {
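trace_array_printk()/trace_array_vprintk() make it possible to print into a specific trace_array instead of the global one, which is what update_max_tr_single() uses above to note a snapshot swap that hit -EBUSY. A minimal caller sketch; the function name and message are illustrative:

/* Sketch: drop a note into one particular trace_array's buffer. */
static void note_skipped_snapshot(struct trace_array *tr, int cpu)
{
	trace_array_printk(tr, _THIS_IP_,
			   "cpu %d: snapshot skipped\n", cpu);
}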
@@ -1517,6 +1534,37 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
1517 return ent; 1534 return ent;
1518} 1535}
1519 1536
1537static void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1538{
1539 struct trace_array *tr = iter->tr;
1540 struct ring_buffer_event *event;
1541 struct ring_buffer_iter *buf_iter;
1542 unsigned long entries = 0;
1543 u64 ts;
1544
1545 tr->data[cpu]->skipped_entries = 0;
1546
1547 if (!iter->buffer_iter[cpu])
1548 return;
1549
1550 buf_iter = iter->buffer_iter[cpu];
1551 ring_buffer_iter_reset(buf_iter);
1552
1553 /*
1554 * We could have the case with the max latency tracers
1555 * that a reset never took place on a cpu. This is evident
1556 * by the timestamp being before the start of the buffer.
1557 */
1558 while ((event = ring_buffer_iter_peek(buf_iter, &ts))) {
1559 if (ts >= iter->tr->time_start)
1560 break;
1561 entries++;
1562 ring_buffer_read(buf_iter, NULL);
1563 }
1564
1565 tr->data[cpu]->skipped_entries = entries;
1566}
1567
1520/* 1568/*
1521 * No necessary locking here. The worst thing which can 1569 * No necessary locking here. The worst thing which can
1522 * happen is loosing events consumed at the same time 1570 * happen is loosing events consumed at the same time
@@ -1555,10 +1603,9 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1555 1603
1556 if (cpu_file == TRACE_PIPE_ALL_CPU) { 1604 if (cpu_file == TRACE_PIPE_ALL_CPU) {
1557 for_each_tracing_cpu(cpu) 1605 for_each_tracing_cpu(cpu)
1558 ring_buffer_iter_reset(iter->buffer_iter[cpu]); 1606 tracing_iter_reset(iter, cpu);
1559 } else 1607 } else
1560 ring_buffer_iter_reset(iter->buffer_iter[cpu_file]); 1608 tracing_iter_reset(iter, cpu_file);
1561
1562 1609
1563 ftrace_enable_cpu(); 1610 ftrace_enable_cpu();
1564 1611
@@ -1587,10 +1634,10 @@ static void print_lat_help_header(struct seq_file *m)
1587 seq_puts(m, "# | / _----=> need-resched \n"); 1634 seq_puts(m, "# | / _----=> need-resched \n");
1588 seq_puts(m, "# || / _---=> hardirq/softirq \n"); 1635 seq_puts(m, "# || / _---=> hardirq/softirq \n");
1589 seq_puts(m, "# ||| / _--=> preempt-depth \n"); 1636 seq_puts(m, "# ||| / _--=> preempt-depth \n");
1590 seq_puts(m, "# |||| / \n"); 1637 seq_puts(m, "# |||| /_--=> lock-depth \n");
1591 seq_puts(m, "# ||||| delay \n"); 1638 seq_puts(m, "# |||||/ delay \n");
1592 seq_puts(m, "# cmd pid ||||| time | caller \n"); 1639 seq_puts(m, "# cmd pid |||||| time | caller \n");
1593 seq_puts(m, "# \\ / ||||| \\ | / \n"); 1640 seq_puts(m, "# \\ / |||||| \\ | / \n");
1594} 1641}
1595 1642
1596static void print_func_help_header(struct seq_file *m) 1643static void print_func_help_header(struct seq_file *m)
@@ -1607,16 +1654,32 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
1607 struct trace_array *tr = iter->tr; 1654 struct trace_array *tr = iter->tr;
1608 struct trace_array_cpu *data = tr->data[tr->cpu]; 1655 struct trace_array_cpu *data = tr->data[tr->cpu];
1609 struct tracer *type = current_trace; 1656 struct tracer *type = current_trace;
1610 unsigned long total; 1657 unsigned long entries = 0;
1611 unsigned long entries; 1658 unsigned long total = 0;
1659 unsigned long count;
1612 const char *name = "preemption"; 1660 const char *name = "preemption";
1661 int cpu;
1613 1662
1614 if (type) 1663 if (type)
1615 name = type->name; 1664 name = type->name;
1616 1665
1617 entries = ring_buffer_entries(iter->tr->buffer); 1666
1618 total = entries + 1667 for_each_tracing_cpu(cpu) {
1619 ring_buffer_overruns(iter->tr->buffer); 1668 count = ring_buffer_entries_cpu(tr->buffer, cpu);
1669 /*
1670 * If this buffer has skipped entries, then we hold all
1671 * entries for the trace and we need to ignore the
1672 * ones before the time stamp.
1673 */
1674 if (tr->data[cpu]->skipped_entries) {
1675 count -= tr->data[cpu]->skipped_entries;
1676 /* total is the same as the entries */
1677 total += count;
1678 } else
1679 total += count +
1680 ring_buffer_overrun_cpu(tr->buffer, cpu);
1681 entries += count;
1682 }
1620 1683
1621 seq_printf(m, "# %s latency trace v1.1.5 on %s\n", 1684 seq_printf(m, "# %s latency trace v1.1.5 on %s\n",
1622 name, UTS_RELEASE); 1685 name, UTS_RELEASE);
@@ -1658,7 +1721,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
1658 seq_puts(m, "\n# => ended at: "); 1721 seq_puts(m, "\n# => ended at: ");
1659 seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags); 1722 seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags);
1660 trace_print_seq(m, &iter->seq); 1723 trace_print_seq(m, &iter->seq);
1661 seq_puts(m, "#\n"); 1724 seq_puts(m, "\n#\n");
1662 } 1725 }
1663 1726
1664 seq_puts(m, "#\n"); 1727 seq_puts(m, "#\n");
@@ -1677,6 +1740,9 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
1677 if (cpumask_test_cpu(iter->cpu, iter->started)) 1740 if (cpumask_test_cpu(iter->cpu, iter->started))
1678 return; 1741 return;
1679 1742
1743 if (iter->tr->data[iter->cpu]->skipped_entries)
1744 return;
1745
1680 cpumask_set_cpu(iter->cpu, iter->started); 1746 cpumask_set_cpu(iter->cpu, iter->started);
1681 1747
1682 /* Don't print started cpu buffer for the first entry of the trace */ 1748 /* Don't print started cpu buffer for the first entry of the trace */
@@ -1939,19 +2005,23 @@ __tracing_open(struct inode *inode, struct file *file)
1939 if (ring_buffer_overruns(iter->tr->buffer)) 2005 if (ring_buffer_overruns(iter->tr->buffer))
1940 iter->iter_flags |= TRACE_FILE_ANNOTATE; 2006 iter->iter_flags |= TRACE_FILE_ANNOTATE;
1941 2007
2008 /* stop the trace while dumping */
2009 tracing_stop();
2010
1942 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { 2011 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) {
1943 for_each_tracing_cpu(cpu) { 2012 for_each_tracing_cpu(cpu) {
1944 2013
1945 iter->buffer_iter[cpu] = 2014 iter->buffer_iter[cpu] =
1946 ring_buffer_read_start(iter->tr->buffer, cpu); 2015 ring_buffer_read_start(iter->tr->buffer, cpu);
2016 tracing_iter_reset(iter, cpu);
1947 } 2017 }
1948 } else { 2018 } else {
1949 cpu = iter->cpu_file; 2019 cpu = iter->cpu_file;
1950 iter->buffer_iter[cpu] = 2020 iter->buffer_iter[cpu] =
1951 ring_buffer_read_start(iter->tr->buffer, cpu); 2021 ring_buffer_read_start(iter->tr->buffer, cpu);
2022 tracing_iter_reset(iter, cpu);
1952 } 2023 }
1953 2024
1954 /* TODO stop tracer */
1955 ret = seq_open(file, &tracer_seq_ops); 2025 ret = seq_open(file, &tracer_seq_ops);
1956 if (ret < 0) { 2026 if (ret < 0) {
1957 fail_ret = ERR_PTR(ret); 2027 fail_ret = ERR_PTR(ret);
@@ -1961,9 +2031,6 @@ __tracing_open(struct inode *inode, struct file *file)
1961 m = file->private_data; 2031 m = file->private_data;
1962 m->private = iter; 2032 m->private = iter;
1963 2033
1964 /* stop the trace while dumping */
1965 tracing_stop();
1966
1967 mutex_unlock(&trace_types_lock); 2034 mutex_unlock(&trace_types_lock);
1968 2035
1969 return iter; 2036 return iter;
@@ -1974,6 +2041,7 @@ __tracing_open(struct inode *inode, struct file *file)
1974 ring_buffer_read_finish(iter->buffer_iter[cpu]); 2041 ring_buffer_read_finish(iter->buffer_iter[cpu]);
1975 } 2042 }
1976 free_cpumask_var(iter->started); 2043 free_cpumask_var(iter->started);
2044 tracing_start();
1977 fail: 2045 fail:
1978 mutex_unlock(&trace_types_lock); 2046 mutex_unlock(&trace_types_lock);
1979 kfree(iter->trace); 2047 kfree(iter->trace);
@@ -2030,7 +2098,7 @@ static int tracing_open(struct inode *inode, struct file *file)
2030 2098
2031 /* If this file was open for write, then erase contents */ 2099 /* If this file was open for write, then erase contents */
2032 if ((file->f_mode & FMODE_WRITE) && 2100 if ((file->f_mode & FMODE_WRITE) &&
2033 !(file->f_flags & O_APPEND)) { 2101 (file->f_flags & O_TRUNC)) {
2034 long cpu = (long) inode->i_private; 2102 long cpu = (long) inode->i_private;
2035 2103
2036 if (cpu == TRACE_PIPE_ALL_CPU) 2104 if (cpu == TRACE_PIPE_ALL_CPU)
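This check changes which writers clear the buffer: only an open that passes O_TRUNC now erases the trace, where previously any open for write without O_APPEND did. A minimal user-space illustration, assuming debugfs is mounted at /sys/kernel/debug:

#include <fcntl.h>

/* Illustrative sketch only. */
int main(void)
{
	/* Erases the buffer on open, which is what "echo > trace" does: */
	int fd = open("/sys/kernel/debug/tracing/trace",
		      O_WRONLY | O_TRUNC);

	/* Opened for write without O_TRUNC (e.g. ">>" append): the
	 * buffer is now left intact. */
	int fd2 = open("/sys/kernel/debug/tracing/trace",
		       O_WRONLY | O_APPEND);

	return fd < 0 || fd2 < 0;
}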
@@ -2255,8 +2323,8 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf,
2255 len += 3; /* "no" and newline */ 2323 len += 3; /* "no" and newline */
2256 } 2324 }
2257 2325
2258 /* +2 for \n and \0 */ 2326 /* +1 for \0 */
2259 buf = kmalloc(len + 2, GFP_KERNEL); 2327 buf = kmalloc(len + 1, GFP_KERNEL);
2260 if (!buf) { 2328 if (!buf) {
2261 mutex_unlock(&trace_types_lock); 2329 mutex_unlock(&trace_types_lock);
2262 return -ENOMEM; 2330 return -ENOMEM;
@@ -2279,7 +2347,7 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf,
2279 } 2347 }
2280 mutex_unlock(&trace_types_lock); 2348 mutex_unlock(&trace_types_lock);
2281 2349
2282 WARN_ON(r >= len + 2); 2350 WARN_ON(r >= len + 1);
2283 2351
2284 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 2352 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2285 2353
@@ -2290,23 +2358,23 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf,
2290/* Try to assign a tracer specific option */ 2358/* Try to assign a tracer specific option */
2291static int set_tracer_option(struct tracer *trace, char *cmp, int neg) 2359static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
2292{ 2360{
2293 struct tracer_flags *trace_flags = trace->flags; 2361 struct tracer_flags *tracer_flags = trace->flags;
2294 struct tracer_opt *opts = NULL; 2362 struct tracer_opt *opts = NULL;
2295 int ret = 0, i = 0; 2363 int ret = 0, i = 0;
2296 int len; 2364 int len;
2297 2365
2298 for (i = 0; trace_flags->opts[i].name; i++) { 2366 for (i = 0; tracer_flags->opts[i].name; i++) {
2299 opts = &trace_flags->opts[i]; 2367 opts = &tracer_flags->opts[i];
2300 len = strlen(opts->name); 2368 len = strlen(opts->name);
2301 2369
2302 if (strncmp(cmp, opts->name, len) == 0) { 2370 if (strncmp(cmp, opts->name, len) == 0) {
2303 ret = trace->set_flag(trace_flags->val, 2371 ret = trace->set_flag(tracer_flags->val,
2304 opts->bit, !neg); 2372 opts->bit, !neg);
2305 break; 2373 break;
2306 } 2374 }
2307 } 2375 }
2308 /* Not found */ 2376 /* Not found */
2309 if (!trace_flags->opts[i].name) 2377 if (!tracer_flags->opts[i].name)
2310 return -EINVAL; 2378 return -EINVAL;
2311 2379
2312 /* Refused to handle */ 2380 /* Refused to handle */
@@ -2314,9 +2382,9 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
2314 return ret; 2382 return ret;
2315 2383
2316 if (neg) 2384 if (neg)
2317 trace_flags->val &= ~opts->bit; 2385 tracer_flags->val &= ~opts->bit;
2318 else 2386 else
2319 trace_flags->val |= opts->bit; 2387 tracer_flags->val |= opts->bit;
2320 2388
2321 return 0; 2389 return 0;
2322} 2390}
@@ -2331,22 +2399,6 @@ static void set_tracer_flags(unsigned int mask, int enabled)
2331 trace_flags |= mask; 2399 trace_flags |= mask;
2332 else 2400 else
2333 trace_flags &= ~mask; 2401 trace_flags &= ~mask;
2334
2335 if (mask == TRACE_ITER_GLOBAL_CLK) {
2336 u64 (*func)(void);
2337
2338 if (enabled)
2339 func = trace_clock_global;
2340 else
2341 func = trace_clock_local;
2342
2343 mutex_lock(&trace_types_lock);
2344 ring_buffer_set_clock(global_trace.buffer, func);
2345
2346 if (max_tr.buffer)
2347 ring_buffer_set_clock(max_tr.buffer, func);
2348 mutex_unlock(&trace_types_lock);
2349 }
2350} 2402}
2351 2403
2352static ssize_t 2404static ssize_t
@@ -2541,7 +2593,7 @@ static ssize_t
2541tracing_set_trace_read(struct file *filp, char __user *ubuf, 2593tracing_set_trace_read(struct file *filp, char __user *ubuf,
2542 size_t cnt, loff_t *ppos) 2594 size_t cnt, loff_t *ppos)
2543{ 2595{
2544 char buf[max_tracer_type_len+2]; 2596 char buf[MAX_TRACER_SIZE+2];
2545 int r; 2597 int r;
2546 2598
2547 mutex_lock(&trace_types_lock); 2599 mutex_lock(&trace_types_lock);
@@ -2691,15 +2743,15 @@ static ssize_t
2691tracing_set_trace_write(struct file *filp, const char __user *ubuf, 2743tracing_set_trace_write(struct file *filp, const char __user *ubuf,
2692 size_t cnt, loff_t *ppos) 2744 size_t cnt, loff_t *ppos)
2693{ 2745{
2694 char buf[max_tracer_type_len+1]; 2746 char buf[MAX_TRACER_SIZE+1];
2695 int i; 2747 int i;
2696 size_t ret; 2748 size_t ret;
2697 int err; 2749 int err;
2698 2750
2699 ret = cnt; 2751 ret = cnt;
2700 2752
2701 if (cnt > max_tracer_type_len) 2753 if (cnt > MAX_TRACER_SIZE)
2702 cnt = max_tracer_type_len; 2754 cnt = MAX_TRACER_SIZE;
2703 2755
2704 if (copy_from_user(&buf, ubuf, cnt)) 2756 if (copy_from_user(&buf, ubuf, cnt))
2705 return -EFAULT; 2757 return -EFAULT;
@@ -3084,7 +3136,8 @@ tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter)
3084 break; 3136 break;
3085 } 3137 }
3086 3138
3087 trace_consume(iter); 3139 if (ret != TRACE_TYPE_NO_CONSUME)
3140 trace_consume(iter);
3088 rem -= count; 3141 rem -= count;
3089 if (!find_next_entry_inc(iter)) { 3142 if (!find_next_entry_inc(iter)) {
3090 rem = 0; 3143 rem = 0;
@@ -3313,6 +3366,62 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3313 return cnt; 3366 return cnt;
3314} 3367}
3315 3368
3369static ssize_t tracing_clock_read(struct file *filp, char __user *ubuf,
3370 size_t cnt, loff_t *ppos)
3371{
3372 char buf[64];
3373 int bufiter = 0;
3374 int i;
3375
3376 for (i = 0; i < ARRAY_SIZE(trace_clocks); i++)
3377 bufiter += snprintf(buf + bufiter, sizeof(buf) - bufiter,
3378 "%s%s%s%s", i ? " " : "",
3379 i == trace_clock_id ? "[" : "", trace_clocks[i].name,
3380 i == trace_clock_id ? "]" : "");
3381 bufiter += snprintf(buf + bufiter, sizeof(buf) - bufiter, "\n");
3382
3383 return simple_read_from_buffer(ubuf, cnt, ppos, buf, bufiter);
3384}
3385
3386static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
3387 size_t cnt, loff_t *fpos)
3388{
3389 char buf[64];
3390 const char *clockstr;
3391 int i;
3392
3393 if (cnt >= sizeof(buf))
3394 return -EINVAL;
3395
3396 if (copy_from_user(&buf, ubuf, cnt))
3397 return -EFAULT;
3398
3399 buf[cnt] = 0;
3400
3401 clockstr = strstrip(buf);
3402
3403 for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) {
3404 if (strcmp(trace_clocks[i].name, clockstr) == 0)
3405 break;
3406 }
3407 if (i == ARRAY_SIZE(trace_clocks))
3408 return -EINVAL;
3409
3410 trace_clock_id = i;
3411
3412 mutex_lock(&trace_types_lock);
3413
3414 ring_buffer_set_clock(global_trace.buffer, trace_clocks[i].func);
3415 if (max_tr.buffer)
3416 ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func);
3417
3418 mutex_unlock(&trace_types_lock);
3419
3420 *fpos += cnt;
3421
3422 return cnt;
3423}
3424
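These two handlers back the trace_clock file created in tracer_init_debugfs() below. A rough user-space sketch of driving it, assuming debugfs is mounted at /sys/kernel/debug and that "global" is one of the entries in the trace_clocks[] table:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Illustrative sketch only. */
int main(void)
{
	char buf[64];
	ssize_t n;
	int fd = open("/sys/kernel/debug/tracing/trace_clock", O_RDWR);

	if (fd < 0)
		return 1;

	/* tracing_clock_write() runs strstrip(), so a trailing newline is fine */
	if (write(fd, "global\n", 7) != 7)
		perror("write");

	lseek(fd, 0, SEEK_SET);
	n = read(fd, buf, sizeof(buf) - 1);	/* e.g. "local [global]\n" */
	if (n > 0) {
		buf[n] = '\0';
		fputs(buf, stdout);
	}

	close(fd);
	return 0;
}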
3316static const struct file_operations tracing_max_lat_fops = { 3425static const struct file_operations tracing_max_lat_fops = {
3317 .open = tracing_open_generic, 3426 .open = tracing_open_generic,
3318 .read = tracing_max_lat_read, 3427 .read = tracing_max_lat_read,
@@ -3350,6 +3459,12 @@ static const struct file_operations tracing_mark_fops = {
3350 .write = tracing_mark_write, 3459 .write = tracing_mark_write,
3351}; 3460};
3352 3461
3462static const struct file_operations trace_clock_fops = {
3463 .open = tracing_open_generic,
3464 .read = tracing_clock_read,
3465 .write = tracing_clock_write,
3466};
3467
3353struct ftrace_buffer_info { 3468struct ftrace_buffer_info {
3354 struct trace_array *tr; 3469 struct trace_array *tr;
3355 void *spare; 3470 void *spare;
@@ -3630,9 +3745,6 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
3630 cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); 3745 cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu);
3631 trace_seq_printf(s, "commit overrun: %ld\n", cnt); 3746 trace_seq_printf(s, "commit overrun: %ld\n", cnt);
3632 3747
3633 cnt = ring_buffer_nmi_dropped_cpu(tr->buffer, cpu);
3634 trace_seq_printf(s, "nmi dropped: %ld\n", cnt);
3635
3636 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); 3748 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
3637 3749
3638 kfree(s); 3750 kfree(s);
@@ -3893,17 +4005,9 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
3893 if (ret < 0) 4005 if (ret < 0)
3894 return ret; 4006 return ret;
3895 4007
3896 switch (val) { 4008 if (val != 0 && val != 1)
3897 case 0:
3898 trace_flags &= ~(1 << index);
3899 break;
3900 case 1:
3901 trace_flags |= 1 << index;
3902 break;
3903
3904 default:
3905 return -EINVAL; 4009 return -EINVAL;
3906 } 4010 set_tracer_flags(1 << index, val);
3907 4011
3908 *ppos += cnt; 4012 *ppos += cnt;
3909 4013
@@ -4071,11 +4175,13 @@ static __init int tracer_init_debugfs(void)
4071 trace_create_file("current_tracer", 0644, d_tracer, 4175 trace_create_file("current_tracer", 0644, d_tracer,
4072 &global_trace, &set_tracer_fops); 4176 &global_trace, &set_tracer_fops);
4073 4177
4178#ifdef CONFIG_TRACER_MAX_TRACE
4074 trace_create_file("tracing_max_latency", 0644, d_tracer, 4179 trace_create_file("tracing_max_latency", 0644, d_tracer,
4075 &tracing_max_latency, &tracing_max_lat_fops); 4180 &tracing_max_latency, &tracing_max_lat_fops);
4076 4181
4077 trace_create_file("tracing_thresh", 0644, d_tracer, 4182 trace_create_file("tracing_thresh", 0644, d_tracer,
4078 &tracing_thresh, &tracing_max_lat_fops); 4183 &tracing_thresh, &tracing_max_lat_fops);
4184#endif
4079 4185
4080 trace_create_file("README", 0444, d_tracer, 4186 trace_create_file("README", 0444, d_tracer,
4081 NULL, &tracing_readme_fops); 4187 NULL, &tracing_readme_fops);
@@ -4092,6 +4198,9 @@ static __init int tracer_init_debugfs(void)
4092 trace_create_file("saved_cmdlines", 0444, d_tracer, 4198 trace_create_file("saved_cmdlines", 0444, d_tracer,
4093 NULL, &tracing_saved_cmdlines_fops); 4199 NULL, &tracing_saved_cmdlines_fops);
4094 4200
4201 trace_create_file("trace_clock", 0644, d_tracer, NULL,
4202 &trace_clock_fops);
4203
4095#ifdef CONFIG_DYNAMIC_FTRACE 4204#ifdef CONFIG_DYNAMIC_FTRACE
4096 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, 4205 trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
4097 &ftrace_update_tot_cnt, &tracing_dyn_info_fops); 4206 &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
@@ -4232,8 +4341,11 @@ static void __ftrace_dump(bool disable_tracing)
4232 iter.pos = -1; 4341 iter.pos = -1;
4233 4342
4234 if (find_next_entry_inc(&iter) != NULL) { 4343 if (find_next_entry_inc(&iter) != NULL) {
4235 print_trace_line(&iter); 4344 int ret;
4236 trace_consume(&iter); 4345
4346 ret = print_trace_line(&iter);
4347 if (ret != TRACE_TYPE_NO_CONSUME)
4348 trace_consume(&iter);
4237 } 4349 }
4238 4350
4239 trace_printk_seq(&iter.seq); 4351 trace_printk_seq(&iter.seq);
@@ -4267,7 +4379,6 @@ void ftrace_dump(void)
4267 4379
4268__init static int tracer_alloc_buffers(void) 4380__init static int tracer_alloc_buffers(void)
4269{ 4381{
4270 struct trace_array_cpu *data;
4271 int ring_buf_size; 4382 int ring_buf_size;
4272 int i; 4383 int i;
4273 int ret = -ENOMEM; 4384 int ret = -ENOMEM;
@@ -4317,7 +4428,7 @@ __init static int tracer_alloc_buffers(void)
4317 4428
4318 /* Allocate the first page for all buffers */ 4429 /* Allocate the first page for all buffers */
4319 for_each_tracing_cpu(i) { 4430 for_each_tracing_cpu(i) {
4320 data = global_trace.data[i] = &per_cpu(global_trace_cpu, i); 4431 global_trace.data[i] = &per_cpu(global_trace_cpu, i);
4321 max_tr.data[i] = &per_cpu(max_data, i); 4432 max_tr.data[i] = &per_cpu(max_data, i);
4322 } 4433 }
4323 4434
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 3548ae5cc780..405cb850b75d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -7,10 +7,10 @@
7#include <linux/clocksource.h> 7#include <linux/clocksource.h>
8#include <linux/ring_buffer.h> 8#include <linux/ring_buffer.h>
9#include <linux/mmiotrace.h> 9#include <linux/mmiotrace.h>
10#include <linux/tracepoint.h>
10#include <linux/ftrace.h> 11#include <linux/ftrace.h>
11#include <trace/boot.h> 12#include <trace/boot.h>
12#include <linux/kmemtrace.h> 13#include <linux/kmemtrace.h>
13#include <trace/power.h>
14 14
15#include <linux/trace_seq.h> 15#include <linux/trace_seq.h>
16#include <linux/ftrace_event.h> 16#include <linux/ftrace_event.h>
@@ -34,167 +34,61 @@ enum trace_type {
34 TRACE_GRAPH_ENT, 34 TRACE_GRAPH_ENT,
35 TRACE_USER_STACK, 35 TRACE_USER_STACK,
36 TRACE_HW_BRANCHES, 36 TRACE_HW_BRANCHES,
37 TRACE_SYSCALL_ENTER,
38 TRACE_SYSCALL_EXIT,
39 TRACE_KMEM_ALLOC, 37 TRACE_KMEM_ALLOC,
40 TRACE_KMEM_FREE, 38 TRACE_KMEM_FREE,
41 TRACE_POWER,
42 TRACE_BLK, 39 TRACE_BLK,
43 40
44 __TRACE_LAST_TYPE, 41 __TRACE_LAST_TYPE,
45}; 42};
46 43
47/* 44enum kmemtrace_type_id {
48 * Function trace entry - function address and parent function addres: 45 KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */
49 */ 46 KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */
50struct ftrace_entry { 47 KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */
51 struct trace_entry ent;
52 unsigned long ip;
53 unsigned long parent_ip;
54};
55
56/* Function call entry */
57struct ftrace_graph_ent_entry {
58 struct trace_entry ent;
59 struct ftrace_graph_ent graph_ent;
60}; 48};
61 49
62/* Function return entry */
63struct ftrace_graph_ret_entry {
64 struct trace_entry ent;
65 struct ftrace_graph_ret ret;
66};
67extern struct tracer boot_tracer; 50extern struct tracer boot_tracer;
68 51
69/* 52#undef __field
70 * Context switch trace entry - which task (and prio) we switched from/to: 53#define __field(type, item) type item;
71 */
72struct ctx_switch_entry {
73 struct trace_entry ent;
74 unsigned int prev_pid;
75 unsigned char prev_prio;
76 unsigned char prev_state;
77 unsigned int next_pid;
78 unsigned char next_prio;
79 unsigned char next_state;
80 unsigned int next_cpu;
81};
82 54
83/* 55#undef __field_struct
84 * Special (free-form) trace entry: 56#define __field_struct(type, item) __field(type, item)
85 */
86struct special_entry {
87 struct trace_entry ent;
88 unsigned long arg1;
89 unsigned long arg2;
90 unsigned long arg3;
91};
92 57
93/* 58#undef __field_desc
94 * Stack-trace entry: 59#define __field_desc(type, container, item)
95 */
96 60
97#define FTRACE_STACK_ENTRIES 8 61#undef __array
62#define __array(type, item, size) type item[size];
98 63
99struct stack_entry { 64#undef __array_desc
100 struct trace_entry ent; 65#define __array_desc(type, container, item, size)
101 unsigned long caller[FTRACE_STACK_ENTRIES];
102};
103 66
104struct userstack_entry { 67#undef __dynamic_array
105 struct trace_entry ent; 68#define __dynamic_array(type, item) type item[];
106 unsigned long caller[FTRACE_STACK_ENTRIES];
107};
108 69
109/* 70#undef F_STRUCT
110 * trace_printk entry: 71#define F_STRUCT(args...) args
111 */
112struct bprint_entry {
113 struct trace_entry ent;
114 unsigned long ip;
115 const char *fmt;
116 u32 buf[];
117};
118
119struct print_entry {
120 struct trace_entry ent;
121 unsigned long ip;
122 char buf[];
123};
124
125#define TRACE_OLD_SIZE 88
126
127struct trace_field_cont {
128 unsigned char type;
129 /* Temporary till we get rid of this completely */
130 char buf[TRACE_OLD_SIZE - 1];
131};
132
133struct trace_mmiotrace_rw {
134 struct trace_entry ent;
135 struct mmiotrace_rw rw;
136};
137
138struct trace_mmiotrace_map {
139 struct trace_entry ent;
140 struct mmiotrace_map map;
141};
142
143struct trace_boot_call {
144 struct trace_entry ent;
145 struct boot_trace_call boot_call;
146};
147
148struct trace_boot_ret {
149 struct trace_entry ent;
150 struct boot_trace_ret boot_ret;
151};
152
153#define TRACE_FUNC_SIZE 30
154#define TRACE_FILE_SIZE 20
155struct trace_branch {
156 struct trace_entry ent;
157 unsigned line;
158 char func[TRACE_FUNC_SIZE+1];
159 char file[TRACE_FILE_SIZE+1];
160 char correct;
161};
162
163struct hw_branch_entry {
164 struct trace_entry ent;
165 u64 from;
166 u64 to;
167};
168 72
169struct trace_power { 73#undef FTRACE_ENTRY
170 struct trace_entry ent; 74#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
171 struct power_trace state_data; 75 struct struct_name { \
172}; 76 struct trace_entry ent; \
77 tstruct \
78 }
173 79
174enum kmemtrace_type_id { 80#undef TP_ARGS
175 KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ 81#define TP_ARGS(args...) args
176 KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */
177 KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */
178};
179 82
180struct kmemtrace_alloc_entry { 83#undef FTRACE_ENTRY_DUP
181 struct trace_entry ent; 84#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk)
182 enum kmemtrace_type_id type_id;
183 unsigned long call_site;
184 const void *ptr;
185 size_t bytes_req;
186 size_t bytes_alloc;
187 gfp_t gfp_flags;
188 int node;
189};
190 85
191struct kmemtrace_free_entry { 86#include "trace_entries.h"
192 struct trace_entry ent;
193 enum kmemtrace_type_id type_id;
194 unsigned long call_site;
195 const void *ptr;
196};
197 87
88/*
 89 * syscalls are special and need special handling; this is why
90 * they are not included in trace_entries.h
91 */
198struct syscall_trace_enter { 92struct syscall_trace_enter {
199 struct trace_entry ent; 93 struct trace_entry ent;
200 int nr; 94 int nr;
@@ -207,13 +101,12 @@ struct syscall_trace_exit {
207 unsigned long ret; 101 unsigned long ret;
208}; 102};
209 103
210
211/* 104/*
212 * trace_flag_type is an enumeration that holds different 105 * trace_flag_type is an enumeration that holds different
213 * states when a trace occurs. These are: 106 * states when a trace occurs. These are:
214 * IRQS_OFF - interrupts were disabled 107 * IRQS_OFF - interrupts were disabled
215 * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags 108 * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags
216 * NEED_RESCED - reschedule is requested 109 * NEED_RESCHED - reschedule is requested
217 * HARDIRQ - inside an interrupt handler 110 * HARDIRQ - inside an interrupt handler
218 * SOFTIRQ - inside a softirq handler 111 * SOFTIRQ - inside a softirq handler
219 */ 112 */
@@ -236,9 +129,6 @@ struct trace_array_cpu {
236 atomic_t disabled; 129 atomic_t disabled;
237 void *buffer_page; /* ring buffer spare */ 130 void *buffer_page; /* ring buffer spare */
238 131
239 /* these fields get copied into max-trace: */
240 unsigned long trace_idx;
241 unsigned long overrun;
242 unsigned long saved_latency; 132 unsigned long saved_latency;
243 unsigned long critical_start; 133 unsigned long critical_start;
244 unsigned long critical_end; 134 unsigned long critical_end;
@@ -246,6 +136,7 @@ struct trace_array_cpu {
246 unsigned long nice; 136 unsigned long nice;
247 unsigned long policy; 137 unsigned long policy;
248 unsigned long rt_priority; 138 unsigned long rt_priority;
139 unsigned long skipped_entries;
249 cycle_t preempt_timestamp; 140 cycle_t preempt_timestamp;
250 pid_t pid; 141 pid_t pid;
251 uid_t uid; 142 uid_t uid;
@@ -314,15 +205,10 @@ extern void __ftrace_bad_type(void);
314 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ 205 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \
315 TRACE_GRAPH_RET); \ 206 TRACE_GRAPH_RET); \
316 IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\ 207 IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\
317 IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \
318 IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \ 208 IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \
319 TRACE_KMEM_ALLOC); \ 209 TRACE_KMEM_ALLOC); \
320 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ 210 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
321 TRACE_KMEM_FREE); \ 211 TRACE_KMEM_FREE); \
322 IF_ASSIGN(var, ent, struct syscall_trace_enter, \
323 TRACE_SYSCALL_ENTER); \
324 IF_ASSIGN(var, ent, struct syscall_trace_exit, \
325 TRACE_SYSCALL_EXIT); \
326 __ftrace_bad_type(); \ 212 __ftrace_bad_type(); \
327 } while (0) 213 } while (0)
328 214
@@ -398,7 +284,6 @@ struct tracer {
398 struct tracer *next; 284 struct tracer *next;
399 int print_max; 285 int print_max;
400 struct tracer_flags *flags; 286 struct tracer_flags *flags;
401 struct tracer_stat *stats;
402}; 287};
403 288
404 289
@@ -423,12 +308,13 @@ void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
423 308
424struct ring_buffer_event; 309struct ring_buffer_event;
425 310
426struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, 311struct ring_buffer_event *
427 int type, 312trace_buffer_lock_reserve(struct ring_buffer *buffer,
428 unsigned long len, 313 int type,
429 unsigned long flags, 314 unsigned long len,
430 int pc); 315 unsigned long flags,
431void trace_buffer_unlock_commit(struct trace_array *tr, 316 int pc);
317void trace_buffer_unlock_commit(struct ring_buffer *buffer,
432 struct ring_buffer_event *event, 318 struct ring_buffer_event *event,
433 unsigned long flags, int pc); 319 unsigned long flags, int pc);
434 320
@@ -438,10 +324,6 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
438struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, 324struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
439 int *ent_cpu, u64 *ent_ts); 325 int *ent_cpu, u64 *ent_ts);
440 326
441void tracing_generic_entry_update(struct trace_entry *entry,
442 unsigned long flags,
443 int pc);
444
445void default_wait_pipe(struct trace_iterator *iter); 327void default_wait_pipe(struct trace_iterator *iter);
446void poll_wait_pipe(struct trace_iterator *iter); 328void poll_wait_pipe(struct trace_iterator *iter);
447 329
@@ -471,6 +353,7 @@ void trace_function(struct trace_array *tr,
471 353
472void trace_graph_return(struct ftrace_graph_ret *trace); 354void trace_graph_return(struct ftrace_graph_ret *trace);
473int trace_graph_entry(struct ftrace_graph_ent *trace); 355int trace_graph_entry(struct ftrace_graph_ent *trace);
356void set_graph_array(struct trace_array *tr);
474 357
475void tracing_start_cmdline_record(void); 358void tracing_start_cmdline_record(void);
476void tracing_stop_cmdline_record(void); 359void tracing_stop_cmdline_record(void);
@@ -479,35 +362,46 @@ void tracing_stop_sched_switch_record(void);
479void tracing_start_sched_switch_record(void); 362void tracing_start_sched_switch_record(void);
480int register_tracer(struct tracer *type); 363int register_tracer(struct tracer *type);
481void unregister_tracer(struct tracer *type); 364void unregister_tracer(struct tracer *type);
365int is_tracing_stopped(void);
482 366
483extern unsigned long nsecs_to_usecs(unsigned long nsecs); 367extern unsigned long nsecs_to_usecs(unsigned long nsecs);
484 368
369#ifdef CONFIG_TRACER_MAX_TRACE
485extern unsigned long tracing_max_latency; 370extern unsigned long tracing_max_latency;
486extern unsigned long tracing_thresh; 371extern unsigned long tracing_thresh;
487 372
488void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); 373void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
489void update_max_tr_single(struct trace_array *tr, 374void update_max_tr_single(struct trace_array *tr,
490 struct task_struct *tsk, int cpu); 375 struct task_struct *tsk, int cpu);
376#endif /* CONFIG_TRACER_MAX_TRACE */
491 377
492void __trace_stack(struct trace_array *tr, 378#ifdef CONFIG_STACKTRACE
493 unsigned long flags, 379void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
494 int skip, int pc); 380 int skip, int pc);
495 381
496extern cycle_t ftrace_now(int cpu); 382void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags,
383 int pc);
497 384
498#ifdef CONFIG_CONTEXT_SWITCH_TRACER 385void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
499typedef void 386 int pc);
500(*tracer_switch_func_t)(void *private, 387#else
501 void *__rq, 388static inline void ftrace_trace_stack(struct trace_array *tr,
502 struct task_struct *prev, 389 unsigned long flags, int skip, int pc)
503 struct task_struct *next); 390{
504 391}
505struct tracer_switch_ops { 392
506 tracer_switch_func_t func; 393static inline void ftrace_trace_userstack(struct trace_array *tr,
507 void *private; 394 unsigned long flags, int pc)
508 struct tracer_switch_ops *next; 395{
509}; 396}
510#endif /* CONFIG_CONTEXT_SWITCH_TRACER */ 397
398static inline void __trace_stack(struct trace_array *tr, unsigned long flags,
399 int skip, int pc)
400{
401}
402#endif /* CONFIG_STACKTRACE */
403
404extern cycle_t ftrace_now(int cpu);
511 405
512extern void trace_find_cmdline(int pid, char comm[]); 406extern void trace_find_cmdline(int pid, char comm[]);
513 407
@@ -517,6 +411,10 @@ extern unsigned long ftrace_update_tot_cnt;
517extern int DYN_FTRACE_TEST_NAME(void); 411extern int DYN_FTRACE_TEST_NAME(void);
518#endif 412#endif
519 413
414extern int ring_buffer_expanded;
415extern bool tracing_selftest_disabled;
416DECLARE_PER_CPU(local_t, ftrace_cpu_disabled);
417
520#ifdef CONFIG_FTRACE_STARTUP_TEST 418#ifdef CONFIG_FTRACE_STARTUP_TEST
521extern int trace_selftest_startup_function(struct tracer *trace, 419extern int trace_selftest_startup_function(struct tracer *trace,
522 struct trace_array *tr); 420 struct trace_array *tr);
@@ -548,9 +446,16 @@ extern int
548trace_vbprintk(unsigned long ip, const char *fmt, va_list args); 446trace_vbprintk(unsigned long ip, const char *fmt, va_list args);
549extern int 447extern int
550trace_vprintk(unsigned long ip, const char *fmt, va_list args); 448trace_vprintk(unsigned long ip, const char *fmt, va_list args);
449extern int
450trace_array_vprintk(struct trace_array *tr,
451 unsigned long ip, const char *fmt, va_list args);
452int trace_array_printk(struct trace_array *tr,
453 unsigned long ip, const char *fmt, ...);
551 454
552extern unsigned long trace_flags; 455extern unsigned long trace_flags;
553 456
457extern int trace_clock_id;
458
554/* Standard output formatting function used for function return traces */ 459/* Standard output formatting function used for function return traces */
555#ifdef CONFIG_FUNCTION_GRAPH_TRACER 460#ifdef CONFIG_FUNCTION_GRAPH_TRACER
556extern enum print_line_t print_graph_function(struct trace_iterator *iter); 461extern enum print_line_t print_graph_function(struct trace_iterator *iter);
@@ -613,6 +518,41 @@ static inline int ftrace_trace_task(struct task_struct *task)
613#endif 518#endif
614 519
615/* 520/*
 521 * struct trace_parser - serves for reading the user input separated by spaces
 522 * @cont: set if the input is not complete - no final space char was found
 523 * @buffer: holds the parsed user input
 524 * @idx: user input length
525 * @size: buffer size
526 */
527struct trace_parser {
528 bool cont;
529 char *buffer;
530 unsigned idx;
531 unsigned size;
532};
533
534static inline bool trace_parser_loaded(struct trace_parser *parser)
535{
536 return (parser->idx != 0);
537}
538
539static inline bool trace_parser_cont(struct trace_parser *parser)
540{
541 return parser->cont;
542}
543
544static inline void trace_parser_clear(struct trace_parser *parser)
545{
546 parser->cont = false;
547 parser->idx = 0;
548}
549
550extern int trace_parser_get_init(struct trace_parser *parser, int size);
551extern void trace_parser_put(struct trace_parser *parser);
552extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
553 size_t cnt, loff_t *ppos);
554
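The trace_parser helpers above are meant to replace the open-coded word splitting in write handlers such as ftrace_event_write() (rewritten later in this series). A rough sketch of the intended calling pattern, with process_word() standing in for whatever a real caller does with each token:

#include <linux/fs.h>

/* Hypothetical consumer of a parsed word - illustration only. */
static int process_word(const char *word)
{
	(void)word;
	return 0;
}

static ssize_t example_write(struct file *file, const char __user *ubuf,
			     size_t cnt, loff_t *ppos)
{
	struct trace_parser parser;
	ssize_t ret = 0;
	int read;

	if (trace_parser_get_init(&parser, 256))
		return -ENOMEM;

	read = trace_get_user(&parser, ubuf, cnt, ppos);

	if (read >= 0 && trace_parser_loaded(&parser)) {
		parser.buffer[parser.idx] = 0;	/* NUL-terminate the token */
		ret = process_word(parser.buffer);
	}

	trace_parser_put(&parser);

	return ret ? ret : read;
}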
555/*
616 * trace_iterator_flags is an enumeration that defines bit 556 * trace_iterator_flags is an enumeration that defines bit
617 * positions into trace_flags that controls the output. 557 * positions into trace_flags that controls the output.
618 * 558 *
@@ -639,9 +579,8 @@ enum trace_iterator_flags {
639 TRACE_ITER_PRINTK_MSGONLY = 0x10000, 579 TRACE_ITER_PRINTK_MSGONLY = 0x10000,
640 TRACE_ITER_CONTEXT_INFO = 0x20000, /* Print pid/cpu/time */ 580 TRACE_ITER_CONTEXT_INFO = 0x20000, /* Print pid/cpu/time */
641 TRACE_ITER_LATENCY_FMT = 0x40000, 581 TRACE_ITER_LATENCY_FMT = 0x40000,
642 TRACE_ITER_GLOBAL_CLK = 0x80000, 582 TRACE_ITER_SLEEP_TIME = 0x80000,
643 TRACE_ITER_SLEEP_TIME = 0x100000, 583 TRACE_ITER_GRAPH_TIME = 0x100000,
644 TRACE_ITER_GRAPH_TIME = 0x200000,
645}; 584};
646 585
647/* 586/*
@@ -738,6 +677,7 @@ struct ftrace_event_field {
738 struct list_head link; 677 struct list_head link;
739 char *name; 678 char *name;
740 char *type; 679 char *type;
680 int filter_type;
741 int offset; 681 int offset;
742 int size; 682 int size;
743 int is_signed; 683 int is_signed;
@@ -747,13 +687,15 @@ struct event_filter {
747 int n_preds; 687 int n_preds;
748 struct filter_pred **preds; 688 struct filter_pred **preds;
749 char *filter_string; 689 char *filter_string;
690 bool no_reset;
750}; 691};
751 692
752struct event_subsystem { 693struct event_subsystem {
753 struct list_head list; 694 struct list_head list;
754 const char *name; 695 const char *name;
755 struct dentry *entry; 696 struct dentry *entry;
756 void *filter; 697 struct event_filter *filter;
698 int nr_events;
757}; 699};
758 700
759struct filter_pred; 701struct filter_pred;
@@ -781,6 +723,7 @@ extern int apply_subsystem_event_filter(struct event_subsystem *system,
781 char *filter_string); 723 char *filter_string);
782extern void print_subsystem_event_filter(struct event_subsystem *system, 724extern void print_subsystem_event_filter(struct event_subsystem *system,
783 struct trace_seq *s); 725 struct trace_seq *s);
726extern int filter_assign_type(const char *type);
784 727
785static inline int 728static inline int
786filter_check_discard(struct ftrace_event_call *call, void *rec, 729filter_check_discard(struct ftrace_event_call *call, void *rec,
@@ -795,58 +738,18 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
795 return 0; 738 return 0;
796} 739}
797 740
798#define DEFINE_COMPARISON_PRED(type) \
799static int filter_pred_##type(struct filter_pred *pred, void *event, \
800 int val1, int val2) \
801{ \
802 type *addr = (type *)(event + pred->offset); \
803 type val = (type)pred->val; \
804 int match = 0; \
805 \
806 switch (pred->op) { \
807 case OP_LT: \
808 match = (*addr < val); \
809 break; \
810 case OP_LE: \
811 match = (*addr <= val); \
812 break; \
813 case OP_GT: \
814 match = (*addr > val); \
815 break; \
816 case OP_GE: \
817 match = (*addr >= val); \
818 break; \
819 default: \
820 break; \
821 } \
822 \
823 return match; \
824}
825
826#define DEFINE_EQUALITY_PRED(size) \
827static int filter_pred_##size(struct filter_pred *pred, void *event, \
828 int val1, int val2) \
829{ \
830 u##size *addr = (u##size *)(event + pred->offset); \
831 u##size val = (u##size)pred->val; \
832 int match; \
833 \
834 match = (val == *addr) ^ pred->not; \
835 \
836 return match; \
837}
838
839extern struct mutex event_mutex; 741extern struct mutex event_mutex;
840extern struct list_head ftrace_events; 742extern struct list_head ftrace_events;
841 743
842extern const char *__start___trace_bprintk_fmt[]; 744extern const char *__start___trace_bprintk_fmt[];
843extern const char *__stop___trace_bprintk_fmt[]; 745extern const char *__stop___trace_bprintk_fmt[];
844 746
845#undef TRACE_EVENT_FORMAT 747#undef FTRACE_ENTRY
846#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ 748#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \
847 extern struct ftrace_event_call event_##call; 749 extern struct ftrace_event_call event_##call;
848#undef TRACE_EVENT_FORMAT_NOFILTER 750#undef FTRACE_ENTRY_DUP
849#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, tpfmt) 751#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \
850#include "trace_event_types.h" 752 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
753#include "trace_entries.h"
851 754
852#endif /* _LINUX_KERNEL_TRACE_H */ 755#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index a29ef23ffb47..c21d5f3956ad 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -41,14 +41,12 @@ void disable_boot_trace(void)
41 41
42static int boot_trace_init(struct trace_array *tr) 42static int boot_trace_init(struct trace_array *tr)
43{ 43{
44 int cpu;
45 boot_trace = tr; 44 boot_trace = tr;
46 45
47 if (!tr) 46 if (!tr)
48 return 0; 47 return 0;
49 48
50 for_each_cpu(cpu, cpu_possible_mask) 49 tracing_reset_online_cpus(tr);
51 tracing_reset(tr, cpu);
52 50
53 tracing_sched_switch_assign_trace(tr); 51 tracing_sched_switch_assign_trace(tr);
54 return 0; 52 return 0;
@@ -131,7 +129,9 @@ struct tracer boot_tracer __read_mostly =
131 129
132void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) 130void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
133{ 131{
132 struct ftrace_event_call *call = &event_boot_call;
134 struct ring_buffer_event *event; 133 struct ring_buffer_event *event;
134 struct ring_buffer *buffer;
135 struct trace_boot_call *entry; 135 struct trace_boot_call *entry;
136 struct trace_array *tr = boot_trace; 136 struct trace_array *tr = boot_trace;
137 137
@@ -144,20 +144,24 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
144 sprint_symbol(bt->func, (unsigned long)fn); 144 sprint_symbol(bt->func, (unsigned long)fn);
145 preempt_disable(); 145 preempt_disable();
146 146
147 event = trace_buffer_lock_reserve(tr, TRACE_BOOT_CALL, 147 buffer = tr->buffer;
148 event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_CALL,
148 sizeof(*entry), 0, 0); 149 sizeof(*entry), 0, 0);
149 if (!event) 150 if (!event)
150 goto out; 151 goto out;
151 entry = ring_buffer_event_data(event); 152 entry = ring_buffer_event_data(event);
152 entry->boot_call = *bt; 153 entry->boot_call = *bt;
153 trace_buffer_unlock_commit(tr, event, 0, 0); 154 if (!filter_check_discard(call, entry, buffer, event))
155 trace_buffer_unlock_commit(buffer, event, 0, 0);
154 out: 156 out:
155 preempt_enable(); 157 preempt_enable();
156} 158}
157 159
158void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) 160void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
159{ 161{
162 struct ftrace_event_call *call = &event_boot_ret;
160 struct ring_buffer_event *event; 163 struct ring_buffer_event *event;
164 struct ring_buffer *buffer;
161 struct trace_boot_ret *entry; 165 struct trace_boot_ret *entry;
162 struct trace_array *tr = boot_trace; 166 struct trace_array *tr = boot_trace;
163 167
@@ -167,13 +171,15 @@ void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
167 sprint_symbol(bt->func, (unsigned long)fn); 171 sprint_symbol(bt->func, (unsigned long)fn);
168 preempt_disable(); 172 preempt_disable();
169 173
170 event = trace_buffer_lock_reserve(tr, TRACE_BOOT_RET, 174 buffer = tr->buffer;
175 event = trace_buffer_lock_reserve(buffer, TRACE_BOOT_RET,
171 sizeof(*entry), 0, 0); 176 sizeof(*entry), 0, 0);
172 if (!event) 177 if (!event)
173 goto out; 178 goto out;
174 entry = ring_buffer_event_data(event); 179 entry = ring_buffer_event_data(event);
175 entry->boot_ret = *bt; 180 entry->boot_ret = *bt;
176 trace_buffer_unlock_commit(tr, event, 0, 0); 181 if (!filter_check_discard(call, entry, buffer, event))
182 trace_buffer_unlock_commit(buffer, event, 0, 0);
177 out: 183 out:
178 preempt_enable(); 184 preempt_enable();
179} 185}
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index b588fd81f7f9..20c5f92e28a8 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -66,10 +66,14 @@ u64 notrace trace_clock(void)
66 * Used by plugins that need globally coherent timestamps. 66 * Used by plugins that need globally coherent timestamps.
67 */ 67 */
68 68
69static u64 prev_trace_clock_time; 69/* keep prev_time and lock in the same cacheline. */
70 70static struct {
71static raw_spinlock_t trace_clock_lock ____cacheline_aligned_in_smp = 71 u64 prev_time;
72 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 72 raw_spinlock_t lock;
73} trace_clock_struct ____cacheline_aligned_in_smp =
74 {
75 .lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED,
76 };
73 77
74u64 notrace trace_clock_global(void) 78u64 notrace trace_clock_global(void)
75{ 79{
@@ -88,19 +92,19 @@ u64 notrace trace_clock_global(void)
88 if (unlikely(in_nmi())) 92 if (unlikely(in_nmi()))
89 goto out; 93 goto out;
90 94
91 __raw_spin_lock(&trace_clock_lock); 95 __raw_spin_lock(&trace_clock_struct.lock);
92 96
93 /* 97 /*
94 * TODO: if this happens often then maybe we should reset 98 * TODO: if this happens often then maybe we should reset
95 * my_scd->clock to prev_trace_clock_time+1, to make sure 99 * my_scd->clock to prev_time+1, to make sure
96 * we start ticking with the local clock from now on? 100 * we start ticking with the local clock from now on?
97 */ 101 */
98 if ((s64)(now - prev_trace_clock_time) < 0) 102 if ((s64)(now - trace_clock_struct.prev_time) < 0)
99 now = prev_trace_clock_time + 1; 103 now = trace_clock_struct.prev_time + 1;
100 104
101 prev_trace_clock_time = now; 105 trace_clock_struct.prev_time = now;
102 106
103 __raw_spin_unlock(&trace_clock_lock); 107 __raw_spin_unlock(&trace_clock_struct.lock);
104 108
105 out: 109 out:
106 raw_local_irq_restore(flags); 110 raw_local_irq_restore(flags);
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
new file mode 100644
index 000000000000..ead3d724599d
--- /dev/null
+++ b/kernel/trace/trace_entries.h
@@ -0,0 +1,366 @@
1/*
2 * This file defines the trace event structures that go into the ring
3 * buffer directly. They are created via macros so that changes for them
4 * appear in the format file. Using macros will automate this process.
5 *
6 * The macro used to create a ftrace data structure is:
7 *
8 * FTRACE_ENTRY( name, struct_name, id, structure, print )
9 *
 10 * @name: the name used as the event name, as well as the name of
11 * the directory that holds the format file.
12 *
13 * @struct_name: the name of the structure that is created.
14 *
15 * @id: The event identifier that is used to detect what event
16 * this is from the ring buffer.
17 *
18 * @structure: the structure layout
19 *
20 * - __field( type, item )
21 * This is equivalent to declaring
22 * type item;
23 * in the structure.
24 * - __array( type, item, size )
25 * This is equivalent to declaring
26 * type item[size];
27 * in the structure.
28 *
29 * * for structures within structures, the format of the internal
 30 * structure is laid out. This allows the internal structure
 31 * to be deciphered for the format file. Although these macros
 32 * may become out of sync with the internal structure, they
 33 * will create a compile error if it happens. Since the
 34 * internal structures are just tracing helpers, this is not
35 * an issue.
36 *
37 * When an internal structure is used, it should use:
38 *
39 * __field_struct( type, item )
40 *
41 * instead of __field. This will prevent it from being shown in
 42 * the output file. The fields in the structure should use:
43 *
44 * __field_desc( type, container, item )
45 * __array_desc( type, container, item, len )
46 *
47 * type, item and len are the same as __field and __array, but
48 * container is added. This is the name of the item in
49 * __field_struct that this is describing.
50 *
51 *
52 * @print: the print format shown to users in the format file.
53 */
54
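To make the macro layer concrete, here is approximately what the first FTRACE_ENTRY() invocation below turns into once trace.h (earlier in this patch) has defined __field(type, item) as a plain member and FTRACE_ENTRY() as a struct wrapper; __field_desc()/__array_desc() expand to nothing there, and the F_printk() argument is only consumed by the format-file side:

/*
 * Approximate expansion of FTRACE_ENTRY(function, ftrace_entry, TRACE_FN,
 * F_STRUCT(__field(unsigned long, ip) __field(unsigned long, parent_ip)),
 * F_printk(...)) when included from trace.h - paraphrased, not literal
 * preprocessor output:
 */
struct ftrace_entry {
	struct trace_entry	ent;
	unsigned long		ip;
	unsigned long		parent_ip;
};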
55/*
 56 * Function trace entry - function address and parent function address:
57 */
58FTRACE_ENTRY(function, ftrace_entry,
59
60 TRACE_FN,
61
62 F_STRUCT(
63 __field( unsigned long, ip )
64 __field( unsigned long, parent_ip )
65 ),
66
67 F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip)
68);
69
70/* Function call entry */
71FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry,
72
73 TRACE_GRAPH_ENT,
74
75 F_STRUCT(
76 __field_struct( struct ftrace_graph_ent, graph_ent )
77 __field_desc( unsigned long, graph_ent, func )
78 __field_desc( int, graph_ent, depth )
79 ),
80
81 F_printk("--> %lx (%d)", __entry->func, __entry->depth)
82);
83
84/* Function return entry */
85FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry,
86
87 TRACE_GRAPH_RET,
88
89 F_STRUCT(
90 __field_struct( struct ftrace_graph_ret, ret )
91 __field_desc( unsigned long, ret, func )
92 __field_desc( unsigned long long, ret, calltime)
93 __field_desc( unsigned long long, ret, rettime )
94 __field_desc( unsigned long, ret, overrun )
95 __field_desc( int, ret, depth )
96 ),
97
98 F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d",
99 __entry->func, __entry->depth,
100 __entry->calltime, __entry->rettime,
101 __entry->depth)
102);
103
104/*
105 * Context switch trace entry - which task (and prio) we switched from/to:
106 *
107 * This is used for both wakeup and context switches. We only want
108 * to create one structure, but we need two outputs for it.
109 */
110#define FTRACE_CTX_FIELDS \
111 __field( unsigned int, prev_pid ) \
112 __field( unsigned char, prev_prio ) \
113 __field( unsigned char, prev_state ) \
114 __field( unsigned int, next_pid ) \
115 __field( unsigned char, next_prio ) \
116 __field( unsigned char, next_state ) \
117 __field( unsigned int, next_cpu )
118
119FTRACE_ENTRY(context_switch, ctx_switch_entry,
120
121 TRACE_CTX,
122
123 F_STRUCT(
124 FTRACE_CTX_FIELDS
125 ),
126
127 F_printk("%u:%u:%u ==> %u:%u:%u [%03u]",
128 __entry->prev_pid, __entry->prev_prio, __entry->prev_state,
129 __entry->next_pid, __entry->next_prio, __entry->next_state,
130 __entry->next_cpu
131 )
132);
133
134/*
135 * FTRACE_ENTRY_DUP only creates the format file, it will not
136 * create another structure.
137 */
138FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry,
139
140 TRACE_WAKE,
141
142 F_STRUCT(
143 FTRACE_CTX_FIELDS
144 ),
145
146 F_printk("%u:%u:%u ==+ %u:%u:%u [%03u]",
147 __entry->prev_pid, __entry->prev_prio, __entry->prev_state,
148 __entry->next_pid, __entry->next_prio, __entry->next_state,
149 __entry->next_cpu
150 )
151);
152
153/*
154 * Special (free-form) trace entry:
155 */
156FTRACE_ENTRY(special, special_entry,
157
158 TRACE_SPECIAL,
159
160 F_STRUCT(
161 __field( unsigned long, arg1 )
162 __field( unsigned long, arg2 )
163 __field( unsigned long, arg3 )
164 ),
165
166 F_printk("(%08lx) (%08lx) (%08lx)",
167 __entry->arg1, __entry->arg2, __entry->arg3)
168);
169
170/*
171 * Stack-trace entry:
172 */
173
174#define FTRACE_STACK_ENTRIES 8
175
176FTRACE_ENTRY(kernel_stack, stack_entry,
177
178 TRACE_STACK,
179
180 F_STRUCT(
181 __array( unsigned long, caller, FTRACE_STACK_ENTRIES )
182 ),
183
184 F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
185 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n",
186 __entry->caller[0], __entry->caller[1], __entry->caller[2],
187 __entry->caller[3], __entry->caller[4], __entry->caller[5],
188 __entry->caller[6], __entry->caller[7])
189);
190
191FTRACE_ENTRY(user_stack, userstack_entry,
192
193 TRACE_USER_STACK,
194
195 F_STRUCT(
196 __field( unsigned int, tgid )
197 __array( unsigned long, caller, FTRACE_STACK_ENTRIES )
198 ),
199
200 F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
201 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n",
202 __entry->caller[0], __entry->caller[1], __entry->caller[2],
203 __entry->caller[3], __entry->caller[4], __entry->caller[5],
204 __entry->caller[6], __entry->caller[7])
205);
206
207/*
208 * trace_printk entry:
209 */
210FTRACE_ENTRY(bprint, bprint_entry,
211
212 TRACE_BPRINT,
213
214 F_STRUCT(
215 __field( unsigned long, ip )
216 __field( const char *, fmt )
217 __dynamic_array( u32, buf )
218 ),
219
220 F_printk("%08lx fmt:%p",
221 __entry->ip, __entry->fmt)
222);
223
224FTRACE_ENTRY(print, print_entry,
225
226 TRACE_PRINT,
227
228 F_STRUCT(
229 __field( unsigned long, ip )
230 __dynamic_array( char, buf )
231 ),
232
233 F_printk("%08lx %s",
234 __entry->ip, __entry->buf)
235);
236
237FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw,
238
239 TRACE_MMIO_RW,
240
241 F_STRUCT(
242 __field_struct( struct mmiotrace_rw, rw )
243 __field_desc( resource_size_t, rw, phys )
244 __field_desc( unsigned long, rw, value )
245 __field_desc( unsigned long, rw, pc )
246 __field_desc( int, rw, map_id )
247 __field_desc( unsigned char, rw, opcode )
248 __field_desc( unsigned char, rw, width )
249 ),
250
251 F_printk("%lx %lx %lx %d %x %x",
252 (unsigned long)__entry->phys, __entry->value, __entry->pc,
253 __entry->map_id, __entry->opcode, __entry->width)
254);
255
256FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map,
257
258 TRACE_MMIO_MAP,
259
260 F_STRUCT(
261 __field_struct( struct mmiotrace_map, map )
262 __field_desc( resource_size_t, map, phys )
263 __field_desc( unsigned long, map, virt )
264 __field_desc( unsigned long, map, len )
265 __field_desc( int, map, map_id )
266 __field_desc( unsigned char, map, opcode )
267 ),
268
269 F_printk("%lx %lx %lx %d %x",
270 (unsigned long)__entry->phys, __entry->virt, __entry->len,
271 __entry->map_id, __entry->opcode)
272);
273
274FTRACE_ENTRY(boot_call, trace_boot_call,
275
276 TRACE_BOOT_CALL,
277
278 F_STRUCT(
279 __field_struct( struct boot_trace_call, boot_call )
280 __field_desc( pid_t, boot_call, caller )
281 __array_desc( char, boot_call, func, KSYM_SYMBOL_LEN)
282 ),
283
284 F_printk("%d %s", __entry->caller, __entry->func)
285);
286
287FTRACE_ENTRY(boot_ret, trace_boot_ret,
288
289 TRACE_BOOT_RET,
290
291 F_STRUCT(
292 __field_struct( struct boot_trace_ret, boot_ret )
293 __array_desc( char, boot_ret, func, KSYM_SYMBOL_LEN)
294 __field_desc( int, boot_ret, result )
295 __field_desc( unsigned long, boot_ret, duration )
296 ),
297
298 F_printk("%s %d %lx",
299 __entry->func, __entry->result, __entry->duration)
300);
301
302#define TRACE_FUNC_SIZE 30
303#define TRACE_FILE_SIZE 20
304
305FTRACE_ENTRY(branch, trace_branch,
306
307 TRACE_BRANCH,
308
309 F_STRUCT(
310 __field( unsigned int, line )
311 __array( char, func, TRACE_FUNC_SIZE+1 )
312 __array( char, file, TRACE_FILE_SIZE+1 )
313 __field( char, correct )
314 ),
315
316 F_printk("%u:%s:%s (%u)",
317 __entry->line,
318 __entry->func, __entry->file, __entry->correct)
319);
320
321FTRACE_ENTRY(hw_branch, hw_branch_entry,
322
323 TRACE_HW_BRANCHES,
324
325 F_STRUCT(
326 __field( u64, from )
327 __field( u64, to )
328 ),
329
330 F_printk("from: %llx to: %llx", __entry->from, __entry->to)
331);
332
333FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry,
334
335 TRACE_KMEM_ALLOC,
336
337 F_STRUCT(
338 __field( enum kmemtrace_type_id, type_id )
339 __field( unsigned long, call_site )
340 __field( const void *, ptr )
341 __field( size_t, bytes_req )
342 __field( size_t, bytes_alloc )
343 __field( gfp_t, gfp_flags )
344 __field( int, node )
345 ),
346
347 F_printk("type:%u call_site:%lx ptr:%p req:%zi alloc:%zi"
348 " flags:%x node:%d",
349 __entry->type_id, __entry->call_site, __entry->ptr,
350 __entry->bytes_req, __entry->bytes_alloc,
351 __entry->gfp_flags, __entry->node)
352);
353
354FTRACE_ENTRY(kmem_free, kmemtrace_free_entry,
355
356 TRACE_KMEM_FREE,
357
358 F_STRUCT(
359 __field( enum kmemtrace_type_id, type_id )
360 __field( unsigned long, call_site )
361 __field( const void *, ptr )
362 ),
363
364 F_printk("type:%u call_site:%lx ptr:%p",
365 __entry->type_id, __entry->call_site, __entry->ptr)
366);
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 5b5895afecfe..dd44b8768867 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -5,8 +5,60 @@
5 * 5 *
6 */ 6 */
7 7
8#include <linux/module.h>
8#include "trace.h" 9#include "trace.h"
9 10
11/*
 12 * We can't pass a size to alloc_percpu(), only a type,
 13 * so create a dummy type that matches the desired size.
14 */
15typedef struct {char buf[FTRACE_MAX_PROFILE_SIZE];} profile_buf_t;
16
17char *trace_profile_buf;
18EXPORT_SYMBOL_GPL(trace_profile_buf);
19
20char *trace_profile_buf_nmi;
21EXPORT_SYMBOL_GPL(trace_profile_buf_nmi);
22
23/* Count the events in use (per event id, not per instance) */
24static int total_profile_count;
25
26static int ftrace_profile_enable_event(struct ftrace_event_call *event)
27{
28 char *buf;
29 int ret = -ENOMEM;
30
31 if (atomic_inc_return(&event->profile_count))
32 return 0;
33
34 if (!total_profile_count++) {
35 buf = (char *)alloc_percpu(profile_buf_t);
36 if (!buf)
37 goto fail_buf;
38
39 rcu_assign_pointer(trace_profile_buf, buf);
40
41 buf = (char *)alloc_percpu(profile_buf_t);
42 if (!buf)
43 goto fail_buf_nmi;
44
45 rcu_assign_pointer(trace_profile_buf_nmi, buf);
46 }
47
48 ret = event->profile_enable();
49 if (!ret)
50 return 0;
51
52 kfree(trace_profile_buf_nmi);
53fail_buf_nmi:
54 kfree(trace_profile_buf);
55fail_buf:
56 total_profile_count--;
57 atomic_dec(&event->profile_count);
58
59 return ret;
60}
61
10int ftrace_profile_enable(int event_id) 62int ftrace_profile_enable(int event_id)
11{ 63{
12 struct ftrace_event_call *event; 64 struct ftrace_event_call *event;
@@ -14,8 +66,9 @@ int ftrace_profile_enable(int event_id)
14 66
15 mutex_lock(&event_mutex); 67 mutex_lock(&event_mutex);
16 list_for_each_entry(event, &ftrace_events, list) { 68 list_for_each_entry(event, &ftrace_events, list) {
17 if (event->id == event_id) { 69 if (event->id == event_id && event->profile_enable &&
18 ret = event->profile_enable(event); 70 try_module_get(event->mod)) {
71 ret = ftrace_profile_enable_event(event);
19 break; 72 break;
20 } 73 }
21 } 74 }
@@ -24,6 +77,33 @@ int ftrace_profile_enable(int event_id)
24 return ret; 77 return ret;
25} 78}
26 79
80static void ftrace_profile_disable_event(struct ftrace_event_call *event)
81{
82 char *buf, *nmi_buf;
83
84 if (!atomic_add_negative(-1, &event->profile_count))
85 return;
86
87 event->profile_disable();
88
89 if (!--total_profile_count) {
90 buf = trace_profile_buf;
91 rcu_assign_pointer(trace_profile_buf, NULL);
92
93 nmi_buf = trace_profile_buf_nmi;
94 rcu_assign_pointer(trace_profile_buf_nmi, NULL);
95
96 /*
 97 * Ensure all events currently profiling have finished before
 98 * releasing the buffers
99 */
100 synchronize_sched();
101
102 free_percpu(buf);
103 free_percpu(nmi_buf);
104 }
105}
106
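The rcu_assign_pointer()/synchronize_sched() pairing above implies a matching pattern on the consumer side. The real consumers are the per-event profile handlers (not part of this file); a hypothetical reader would look roughly like this:

/* Hypothetical reader side - illustration only. */
static void example_profile_sample(void)
{
	char *buf, *raw;

	/*
	 * Profile handlers run with preemption disabled, so the
	 * synchronize_sched() in the disable path guarantees buf
	 * is not freed while it is being used here.
	 */
	buf = rcu_dereference(trace_profile_buf);
	if (!buf)
		return;		/* profiling was torn down concurrently */

	raw = per_cpu_ptr(buf, smp_processor_id());
	memset(raw, 0, FTRACE_MAX_PROFILE_SIZE);
	/* ... fill raw with the sampled event payload ... */
}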
27void ftrace_profile_disable(int event_id) 107void ftrace_profile_disable(int event_id)
28{ 108{
29 struct ftrace_event_call *event; 109 struct ftrace_event_call *event;
@@ -31,7 +111,8 @@ void ftrace_profile_disable(int event_id)
31 mutex_lock(&event_mutex); 111 mutex_lock(&event_mutex);
32 list_for_each_entry(event, &ftrace_events, list) { 112 list_for_each_entry(event, &ftrace_events, list) {
33 if (event->id == event_id) { 113 if (event->id == event_id) {
34 event->profile_disable(event); 114 ftrace_profile_disable_event(event);
115 module_put(event->mod);
35 break; 116 break;
36 } 117 }
37 } 118 }
diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
deleted file mode 100644
index 5e32e375134d..000000000000
--- a/kernel/trace/trace_event_types.h
+++ /dev/null
@@ -1,175 +0,0 @@
1#undef TRACE_SYSTEM
2#define TRACE_SYSTEM ftrace
3
4/*
5 * We cheat and use the proto type field as the ID
6 * and args as the entry type (minus 'struct')
7 */
8TRACE_EVENT_FORMAT(function, TRACE_FN, ftrace_entry, ignore,
9 TRACE_STRUCT(
10 TRACE_FIELD(unsigned long, ip, ip)
11 TRACE_FIELD(unsigned long, parent_ip, parent_ip)
12 ),
13 TP_RAW_FMT(" %lx <-- %lx")
14);
15
16TRACE_EVENT_FORMAT(funcgraph_entry, TRACE_GRAPH_ENT,
17 ftrace_graph_ent_entry, ignore,
18 TRACE_STRUCT(
19 TRACE_FIELD(unsigned long, graph_ent.func, func)
20 TRACE_FIELD(int, graph_ent.depth, depth)
21 ),
22 TP_RAW_FMT("--> %lx (%d)")
23);
24
25TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET,
26 ftrace_graph_ret_entry, ignore,
27 TRACE_STRUCT(
28 TRACE_FIELD(unsigned long, ret.func, func)
29 TRACE_FIELD(int, ret.depth, depth)
30 ),
31 TP_RAW_FMT("<-- %lx (%d)")
32);
33
34TRACE_EVENT_FORMAT(wakeup, TRACE_WAKE, ctx_switch_entry, ignore,
35 TRACE_STRUCT(
36 TRACE_FIELD(unsigned int, prev_pid, prev_pid)
37 TRACE_FIELD(unsigned char, prev_prio, prev_prio)
38 TRACE_FIELD(unsigned char, prev_state, prev_state)
39 TRACE_FIELD(unsigned int, next_pid, next_pid)
40 TRACE_FIELD(unsigned char, next_prio, next_prio)
41 TRACE_FIELD(unsigned char, next_state, next_state)
42 TRACE_FIELD(unsigned int, next_cpu, next_cpu)
43 ),
44 TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]")
45);
46
47TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore,
48 TRACE_STRUCT(
49 TRACE_FIELD(unsigned int, prev_pid, prev_pid)
50 TRACE_FIELD(unsigned char, prev_prio, prev_prio)
51 TRACE_FIELD(unsigned char, prev_state, prev_state)
52 TRACE_FIELD(unsigned int, next_pid, next_pid)
53 TRACE_FIELD(unsigned char, next_prio, next_prio)
54 TRACE_FIELD(unsigned char, next_state, next_state)
55 TRACE_FIELD(unsigned int, next_cpu, next_cpu)
56 ),
57 TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]")
58);
59
60TRACE_EVENT_FORMAT_NOFILTER(special, TRACE_SPECIAL, special_entry, ignore,
61 TRACE_STRUCT(
62 TRACE_FIELD(unsigned long, arg1, arg1)
63 TRACE_FIELD(unsigned long, arg2, arg2)
64 TRACE_FIELD(unsigned long, arg3, arg3)
65 ),
66 TP_RAW_FMT("(%08lx) (%08lx) (%08lx)")
67);
68
69/*
70 * Stack-trace entry:
71 */
72
73/* #define FTRACE_STACK_ENTRIES 8 */
74
75TRACE_EVENT_FORMAT(kernel_stack, TRACE_STACK, stack_entry, ignore,
76 TRACE_STRUCT(
77 TRACE_FIELD(unsigned long, caller[0], stack0)
78 TRACE_FIELD(unsigned long, caller[1], stack1)
79 TRACE_FIELD(unsigned long, caller[2], stack2)
80 TRACE_FIELD(unsigned long, caller[3], stack3)
81 TRACE_FIELD(unsigned long, caller[4], stack4)
82 TRACE_FIELD(unsigned long, caller[5], stack5)
83 TRACE_FIELD(unsigned long, caller[6], stack6)
84 TRACE_FIELD(unsigned long, caller[7], stack7)
85 ),
86 TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
87 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n")
88);
89
90TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore,
91 TRACE_STRUCT(
92 TRACE_FIELD(unsigned long, caller[0], stack0)
93 TRACE_FIELD(unsigned long, caller[1], stack1)
94 TRACE_FIELD(unsigned long, caller[2], stack2)
95 TRACE_FIELD(unsigned long, caller[3], stack3)
96 TRACE_FIELD(unsigned long, caller[4], stack4)
97 TRACE_FIELD(unsigned long, caller[5], stack5)
98 TRACE_FIELD(unsigned long, caller[6], stack6)
99 TRACE_FIELD(unsigned long, caller[7], stack7)
100 ),
101 TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
102 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n")
103);
104
105TRACE_EVENT_FORMAT(bprint, TRACE_BPRINT, bprint_entry, ignore,
106 TRACE_STRUCT(
107 TRACE_FIELD(unsigned long, ip, ip)
108 TRACE_FIELD(char *, fmt, fmt)
109 TRACE_FIELD_ZERO_CHAR(buf)
110 ),
111 TP_RAW_FMT("%08lx (%d) fmt:%p %s")
112);
113
114TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore,
115 TRACE_STRUCT(
116 TRACE_FIELD(unsigned long, ip, ip)
117 TRACE_FIELD_ZERO_CHAR(buf)
118 ),
119 TP_RAW_FMT("%08lx (%d) fmt:%p %s")
120);
121
122TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore,
123 TRACE_STRUCT(
124 TRACE_FIELD(unsigned int, line, line)
125 TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func,
126 TRACE_FUNC_SIZE+1, func)
127 TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file,
128 TRACE_FUNC_SIZE+1, file)
129 TRACE_FIELD(char, correct, correct)
130 ),
131 TP_RAW_FMT("%u:%s:%s (%u)")
132);
133
134TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore,
135 TRACE_STRUCT(
136 TRACE_FIELD(u64, from, from)
137 TRACE_FIELD(u64, to, to)
138 ),
139 TP_RAW_FMT("from: %llx to: %llx")
140);
141
142TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore,
143 TRACE_STRUCT(
144 TRACE_FIELD_SIGN(ktime_t, state_data.stamp, stamp, 1)
145 TRACE_FIELD_SIGN(ktime_t, state_data.end, end, 1)
146 TRACE_FIELD(int, state_data.type, type)
147 TRACE_FIELD(int, state_data.state, state)
148 ),
149 TP_RAW_FMT("%llx->%llx type:%u state:%u")
150);
151
152TRACE_EVENT_FORMAT(kmem_alloc, TRACE_KMEM_ALLOC, kmemtrace_alloc_entry, ignore,
153 TRACE_STRUCT(
154 TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id)
155 TRACE_FIELD(unsigned long, call_site, call_site)
156 TRACE_FIELD(const void *, ptr, ptr)
157 TRACE_FIELD(size_t, bytes_req, bytes_req)
158 TRACE_FIELD(size_t, bytes_alloc, bytes_alloc)
159 TRACE_FIELD(gfp_t, gfp_flags, gfp_flags)
160 TRACE_FIELD(int, node, node)
161 ),
162 TP_RAW_FMT("type:%u call_site:%lx ptr:%p req:%lu alloc:%lu"
163 " flags:%x node:%d")
164);
165
166TRACE_EVENT_FORMAT(kmem_free, TRACE_KMEM_FREE, kmemtrace_free_entry, ignore,
167 TRACE_STRUCT(
168 TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id)
169 TRACE_FIELD(unsigned long, call_site, call_site)
170 TRACE_FIELD(const void *, ptr, ptr)
171 ),
172 TP_RAW_FMT("type:%u call_site:%lx ptr:%p")
173);
174
175#undef TRACE_SYSTEM
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 53c8fd376a88..6f03c8a1105e 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -17,16 +17,20 @@
17#include <linux/ctype.h> 17#include <linux/ctype.h>
18#include <linux/delay.h> 18#include <linux/delay.h>
19 19
20#include <asm/setup.h>
21
20#include "trace_output.h" 22#include "trace_output.h"
21 23
24#undef TRACE_SYSTEM
22#define TRACE_SYSTEM "TRACE_SYSTEM" 25#define TRACE_SYSTEM "TRACE_SYSTEM"
23 26
24DEFINE_MUTEX(event_mutex); 27DEFINE_MUTEX(event_mutex);
25 28
26LIST_HEAD(ftrace_events); 29LIST_HEAD(ftrace_events);
27 30
28int trace_define_field(struct ftrace_event_call *call, char *type, 31int trace_define_field(struct ftrace_event_call *call, const char *type,
29 char *name, int offset, int size, int is_signed) 32 const char *name, int offset, int size, int is_signed,
33 int filter_type)
30{ 34{
31 struct ftrace_event_field *field; 35 struct ftrace_event_field *field;
32 36
@@ -42,9 +46,15 @@ int trace_define_field(struct ftrace_event_call *call, char *type,
42 if (!field->type) 46 if (!field->type)
43 goto err; 47 goto err;
44 48
49 if (filter_type == FILTER_OTHER)
50 field->filter_type = filter_assign_type(type);
51 else
52 field->filter_type = filter_type;
53
45 field->offset = offset; 54 field->offset = offset;
46 field->size = size; 55 field->size = size;
47 field->is_signed = is_signed; 56 field->is_signed = is_signed;
57
48 list_add(&field->link, &call->fields); 58 list_add(&field->link, &call->fields);
49 59
50 return 0; 60 return 0;
@@ -60,6 +70,29 @@ err:
60} 70}
61EXPORT_SYMBOL_GPL(trace_define_field); 71EXPORT_SYMBOL_GPL(trace_define_field);
62 72
73#define __common_field(type, item) \
74 ret = trace_define_field(call, #type, "common_" #item, \
75 offsetof(typeof(ent), item), \
76 sizeof(ent.item), \
77 is_signed_type(type), FILTER_OTHER); \
78 if (ret) \
79 return ret;
80
81int trace_define_common_fields(struct ftrace_event_call *call)
82{
83 int ret;
84 struct trace_entry ent;
85
86 __common_field(unsigned short, type);
87 __common_field(unsigned char, flags);
88 __common_field(unsigned char, preempt_count);
89 __common_field(int, pid);
90 __common_field(int, lock_depth);
91
92 return ret;
93}
94EXPORT_SYMBOL_GPL(trace_define_common_fields);
95
63#ifdef CONFIG_MODULES 96#ifdef CONFIG_MODULES
64 97
65static void trace_destroy_fields(struct ftrace_event_call *call) 98static void trace_destroy_fields(struct ftrace_event_call *call)
@@ -84,14 +117,14 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call,
84 if (call->enabled) { 117 if (call->enabled) {
85 call->enabled = 0; 118 call->enabled = 0;
86 tracing_stop_cmdline_record(); 119 tracing_stop_cmdline_record();
87 call->unregfunc(); 120 call->unregfunc(call->data);
88 } 121 }
89 break; 122 break;
90 case 1: 123 case 1:
91 if (!call->enabled) { 124 if (!call->enabled) {
92 call->enabled = 1; 125 call->enabled = 1;
93 tracing_start_cmdline_record(); 126 tracing_start_cmdline_record();
94 call->regfunc(); 127 call->regfunc(call->data);
95 } 128 }
96 break; 129 break;
97 } 130 }
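
As an aside, the __common_field() helper introduced in this hunk is just stringification plus offsetof()/sizeof()/signedness detection over a local struct. A standalone userspace sketch of the same pattern follows; struct sample_entry, register_field() and COMMON_FIELD() are invented for the example, and is_signed_type() mirrors the kernel's helper of the same name:

#include <stddef.h>
#include <stdio.h>

#define is_signed_type(type)	(((type)(-1)) < (type)1)

struct sample_entry {
	unsigned short	type;
	unsigned char	flags;
	int		pid;
};

static int register_field(const char *type, const char *name,
			  size_t offset, size_t size, int is_signed)
{
	/* A real implementation would append this to a field list. */
	printf("field:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%d;\n",
	       type, name, offset, size, is_signed);
	return 0;
}

#define COMMON_FIELD(type, item)					\
	ret = register_field(#type, "common_" #item,			\
			     offsetof(struct sample_entry, item),	\
			     sizeof(((struct sample_entry *)0)->item),	\
			     is_signed_type(type));			\
	if (ret)							\
		return ret;

int main(void)
{
	int ret;

	COMMON_FIELD(unsigned short, type);
	COMMON_FIELD(unsigned char, flags);
	COMMON_FIELD(int, pid);

	return ret;
}

Running it prints one line per field, roughly the shape of the per-event format files exposed under debugfs.
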
@@ -198,11 +231,9 @@ static ssize_t
198ftrace_event_write(struct file *file, const char __user *ubuf, 231ftrace_event_write(struct file *file, const char __user *ubuf,
199 size_t cnt, loff_t *ppos) 232 size_t cnt, loff_t *ppos)
200{ 233{
234 struct trace_parser parser;
201 size_t read = 0; 235 size_t read = 0;
202 int i, set = 1;
203 ssize_t ret; 236 ssize_t ret;
204 char *buf;
205 char ch;
206 237
207 if (!cnt || cnt < 0) 238 if (!cnt || cnt < 0)
208 return 0; 239 return 0;
@@ -211,60 +242,28 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
211 if (ret < 0) 242 if (ret < 0)
212 return ret; 243 return ret;
213 244
214 ret = get_user(ch, ubuf++); 245 if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1))
215 if (ret)
216 return ret;
217 read++;
218 cnt--;
219
220 /* skip white space */
221 while (cnt && isspace(ch)) {
222 ret = get_user(ch, ubuf++);
223 if (ret)
224 return ret;
225 read++;
226 cnt--;
227 }
228
229 /* Only white space found? */
230 if (isspace(ch)) {
231 file->f_pos += read;
232 ret = read;
233 return ret;
234 }
235
236 buf = kmalloc(EVENT_BUF_SIZE+1, GFP_KERNEL);
237 if (!buf)
238 return -ENOMEM; 246 return -ENOMEM;
239 247
240 if (cnt > EVENT_BUF_SIZE) 248 read = trace_get_user(&parser, ubuf, cnt, ppos);
241 cnt = EVENT_BUF_SIZE; 249
250 if (trace_parser_loaded((&parser))) {
251 int set = 1;
242 252
243 i = 0; 253 if (*parser.buffer == '!')
244 while (cnt && !isspace(ch)) {
245 if (!i && ch == '!')
246 set = 0; 254 set = 0;
247 else
248 buf[i++] = ch;
249 255
250 ret = get_user(ch, ubuf++); 256 parser.buffer[parser.idx] = 0;
257
258 ret = ftrace_set_clr_event(parser.buffer + !set, set);
251 if (ret) 259 if (ret)
252 goto out_free; 260 goto out_put;
253 read++;
254 cnt--;
255 } 261 }
256 buf[i] = 0;
257
258 file->f_pos += read;
259
260 ret = ftrace_set_clr_event(buf, set);
261 if (ret)
262 goto out_free;
263 262
264 ret = read; 263 ret = read;
265 264
266 out_free: 265 out_put:
267 kfree(buf); 266 trace_parser_put(&parser);
268 267
269 return ret; 268 return ret;
270} 269}
@@ -272,42 +271,32 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
272static void * 271static void *
273t_next(struct seq_file *m, void *v, loff_t *pos) 272t_next(struct seq_file *m, void *v, loff_t *pos)
274{ 273{
275 struct list_head *list = m->private; 274 struct ftrace_event_call *call = v;
276 struct ftrace_event_call *call;
277 275
278 (*pos)++; 276 (*pos)++;
279 277
280 for (;;) { 278 list_for_each_entry_continue(call, &ftrace_events, list) {
281 if (list == &ftrace_events)
282 return NULL;
283
284 call = list_entry(list, struct ftrace_event_call, list);
285
286 /* 279 /*
287 * The ftrace subsystem is for showing formats only. 280 * The ftrace subsystem is for showing formats only.
288 * They can not be enabled or disabled via the event files. 281 * They can not be enabled or disabled via the event files.
289 */ 282 */
290 if (call->regfunc) 283 if (call->regfunc)
291 break; 284 return call;
292
293 list = list->next;
294 } 285 }
295 286
296 m->private = list->next; 287 return NULL;
297
298 return call;
299} 288}
300 289
301static void *t_start(struct seq_file *m, loff_t *pos) 290static void *t_start(struct seq_file *m, loff_t *pos)
302{ 291{
303 struct ftrace_event_call *call = NULL; 292 struct ftrace_event_call *call;
304 loff_t l; 293 loff_t l;
305 294
306 mutex_lock(&event_mutex); 295 mutex_lock(&event_mutex);
307 296
308 m->private = ftrace_events.next; 297 call = list_entry(&ftrace_events, struct ftrace_event_call, list);
309 for (l = 0; l <= *pos; ) { 298 for (l = 0; l <= *pos; ) {
310 call = t_next(m, NULL, &l); 299 call = t_next(m, call, &l);
311 if (!call) 300 if (!call)
312 break; 301 break;
313 } 302 }
@@ -317,37 +306,28 @@ static void *t_start(struct seq_file *m, loff_t *pos)
317static void * 306static void *
318s_next(struct seq_file *m, void *v, loff_t *pos) 307s_next(struct seq_file *m, void *v, loff_t *pos)
319{ 308{
320 struct list_head *list = m->private; 309 struct ftrace_event_call *call = v;
321 struct ftrace_event_call *call;
322 310
323 (*pos)++; 311 (*pos)++;
324 312
325 retry: 313 list_for_each_entry_continue(call, &ftrace_events, list) {
326 if (list == &ftrace_events) 314 if (call->enabled)
327 return NULL; 315 return call;
328
329 call = list_entry(list, struct ftrace_event_call, list);
330
331 if (!call->enabled) {
332 list = list->next;
333 goto retry;
334 } 316 }
335 317
336 m->private = list->next; 318 return NULL;
337
338 return call;
339} 319}
340 320
341static void *s_start(struct seq_file *m, loff_t *pos) 321static void *s_start(struct seq_file *m, loff_t *pos)
342{ 322{
343 struct ftrace_event_call *call = NULL; 323 struct ftrace_event_call *call;
344 loff_t l; 324 loff_t l;
345 325
346 mutex_lock(&event_mutex); 326 mutex_lock(&event_mutex);
347 327
348 m->private = ftrace_events.next; 328 call = list_entry(&ftrace_events, struct ftrace_event_call, list);
349 for (l = 0; l <= *pos; ) { 329 for (l = 0; l <= *pos; ) {
350 call = s_next(m, NULL, &l); 330 call = s_next(m, call, &l);
351 if (!call) 331 if (!call)
352 break; 332 break;
353 } 333 }
@@ -376,7 +356,7 @@ ftrace_event_seq_open(struct inode *inode, struct file *file)
376 const struct seq_operations *seq_ops; 356 const struct seq_operations *seq_ops;
377 357
378 if ((file->f_mode & FMODE_WRITE) && 358 if ((file->f_mode & FMODE_WRITE) &&
379 !(file->f_flags & O_APPEND)) 359 (file->f_flags & O_TRUNC))
380 ftrace_clear_events(); 360 ftrace_clear_events();
381 361
382 seq_ops = inode->i_private; 362 seq_ops = inode->i_private;
@@ -546,7 +526,7 @@ static int trace_write_header(struct trace_seq *s)
546 FIELD(unsigned char, flags), 526 FIELD(unsigned char, flags),
547 FIELD(unsigned char, preempt_count), 527 FIELD(unsigned char, preempt_count),
548 FIELD(int, pid), 528 FIELD(int, pid),
549 FIELD(int, tgid)); 529 FIELD(int, lock_depth));
550} 530}
551 531
552static ssize_t 532static ssize_t
@@ -574,7 +554,7 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
574 trace_seq_printf(s, "format:\n"); 554 trace_seq_printf(s, "format:\n");
575 trace_write_header(s); 555 trace_write_header(s);
576 556
577 r = call->show_format(s); 557 r = call->show_format(call, s);
578 if (!r) { 558 if (!r) {
579 /* 559 /*
580 * ug! The format output is bigger than a PAGE!! 560 * ug! The format output is bigger than a PAGE!!
@@ -849,8 +829,10 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
849 829
850 /* First see if we did not already create this dir */ 830 /* First see if we did not already create this dir */
851 list_for_each_entry(system, &event_subsystems, list) { 831 list_for_each_entry(system, &event_subsystems, list) {
852 if (strcmp(system->name, name) == 0) 832 if (strcmp(system->name, name) == 0) {
833 system->nr_events++;
853 return system->entry; 834 return system->entry;
835 }
854 } 836 }
855 837
856 /* need to create new entry */ 838 /* need to create new entry */
@@ -869,6 +851,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
869 return d_events; 851 return d_events;
870 } 852 }
871 853
854 system->nr_events = 1;
872 system->name = kstrdup(name, GFP_KERNEL); 855 system->name = kstrdup(name, GFP_KERNEL);
873 if (!system->name) { 856 if (!system->name) {
874 debugfs_remove(system->entry); 857 debugfs_remove(system->entry);
@@ -920,15 +903,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
920 if (strcmp(call->system, TRACE_SYSTEM) != 0) 903 if (strcmp(call->system, TRACE_SYSTEM) != 0)
921 d_events = event_subsystem_dir(call->system, d_events); 904 d_events = event_subsystem_dir(call->system, d_events);
922 905
923 if (call->raw_init) {
924 ret = call->raw_init();
925 if (ret < 0) {
926 pr_warning("Could not initialize trace point"
927 " events/%s\n", call->name);
928 return ret;
929 }
930 }
931
932 call->dir = debugfs_create_dir(call->name, d_events); 906 call->dir = debugfs_create_dir(call->name, d_events);
933 if (!call->dir) { 907 if (!call->dir) {
934 pr_warning("Could not create debugfs " 908 pr_warning("Could not create debugfs "
@@ -940,12 +914,12 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
940 entry = trace_create_file("enable", 0644, call->dir, call, 914 entry = trace_create_file("enable", 0644, call->dir, call,
941 enable); 915 enable);
942 916
943 if (call->id) 917 if (call->id && call->profile_enable)
944 entry = trace_create_file("id", 0444, call->dir, call, 918 entry = trace_create_file("id", 0444, call->dir, call,
945 id); 919 id);
946 920
947 if (call->define_fields) { 921 if (call->define_fields) {
948 ret = call->define_fields(); 922 ret = call->define_fields(call);
949 if (ret < 0) { 923 if (ret < 0) {
950 pr_warning("Could not initialize trace point" 924 pr_warning("Could not initialize trace point"
951 " events/%s\n", call->name); 925 " events/%s\n", call->name);
@@ -987,6 +961,32 @@ struct ftrace_module_file_ops {
987 struct file_operations filter; 961 struct file_operations filter;
988}; 962};
989 963
964static void remove_subsystem_dir(const char *name)
965{
966 struct event_subsystem *system;
967
968 if (strcmp(name, TRACE_SYSTEM) == 0)
969 return;
970
971 list_for_each_entry(system, &event_subsystems, list) {
972 if (strcmp(system->name, name) == 0) {
973 if (!--system->nr_events) {
974 struct event_filter *filter = system->filter;
975
976 debugfs_remove_recursive(system->entry);
977 list_del(&system->list);
978 if (filter) {
979 kfree(filter->filter_string);
980 kfree(filter);
981 }
982 kfree(system->name);
983 kfree(system);
984 }
985 break;
986 }
987 }
988}
989
990static struct ftrace_module_file_ops * 990static struct ftrace_module_file_ops *
991trace_create_file_ops(struct module *mod) 991trace_create_file_ops(struct module *mod)
992{ 992{
@@ -1027,6 +1027,7 @@ static void trace_module_add_events(struct module *mod)
1027 struct ftrace_module_file_ops *file_ops = NULL; 1027 struct ftrace_module_file_ops *file_ops = NULL;
1028 struct ftrace_event_call *call, *start, *end; 1028 struct ftrace_event_call *call, *start, *end;
1029 struct dentry *d_events; 1029 struct dentry *d_events;
1030 int ret;
1030 1031
1031 start = mod->trace_events; 1032 start = mod->trace_events;
1032 end = mod->trace_events + mod->num_trace_events; 1033 end = mod->trace_events + mod->num_trace_events;
@@ -1042,7 +1043,15 @@ static void trace_module_add_events(struct module *mod)
1042 /* The linker may leave blanks */ 1043 /* The linker may leave blanks */
1043 if (!call->name) 1044 if (!call->name)
1044 continue; 1045 continue;
1045 1046 if (call->raw_init) {
1047 ret = call->raw_init();
1048 if (ret < 0) {
1049 if (ret != -ENOSYS)
1050 pr_warning("Could not initialize trace "
1051 "point events/%s\n", call->name);
1052 continue;
1053 }
1054 }
1046 /* 1055 /*
1047 * This module has events, create file ops for this module 1056 * This module has events, create file ops for this module
1048 * if not already done. 1057 * if not already done.
@@ -1077,6 +1086,7 @@ static void trace_module_remove_events(struct module *mod)
1077 list_del(&call->list); 1086 list_del(&call->list);
1078 trace_destroy_fields(call); 1087 trace_destroy_fields(call);
1079 destroy_preds(call); 1088 destroy_preds(call);
1089 remove_subsystem_dir(call->system);
1080 } 1090 }
1081 } 1091 }
1082 1092
@@ -1125,7 +1135,7 @@ static int trace_module_notify(struct notifier_block *self,
1125} 1135}
1126#endif /* CONFIG_MODULES */ 1136#endif /* CONFIG_MODULES */
1127 1137
1128struct notifier_block trace_module_nb = { 1138static struct notifier_block trace_module_nb = {
1129 .notifier_call = trace_module_notify, 1139 .notifier_call = trace_module_notify,
1130 .priority = 0, 1140 .priority = 0,
1131}; 1141};
@@ -1133,6 +1143,18 @@ struct notifier_block trace_module_nb = {
1133extern struct ftrace_event_call __start_ftrace_events[]; 1143extern struct ftrace_event_call __start_ftrace_events[];
1134extern struct ftrace_event_call __stop_ftrace_events[]; 1144extern struct ftrace_event_call __stop_ftrace_events[];
1135 1145
1146static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;
1147
1148static __init int setup_trace_event(char *str)
1149{
1150 strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE);
1151 ring_buffer_expanded = 1;
1152 tracing_selftest_disabled = 1;
1153
1154 return 1;
1155}
1156__setup("trace_event=", setup_trace_event);
1157
1136static __init int event_trace_init(void) 1158static __init int event_trace_init(void)
1137{ 1159{
1138 struct ftrace_event_call *call; 1160 struct ftrace_event_call *call;
@@ -1140,6 +1162,8 @@ static __init int event_trace_init(void)
1140 struct dentry *entry; 1162 struct dentry *entry;
1141 struct dentry *d_events; 1163 struct dentry *d_events;
1142 int ret; 1164 int ret;
1165 char *buf = bootup_event_buf;
1166 char *token;
1143 1167
1144 d_tracer = tracing_init_dentry(); 1168 d_tracer = tracing_init_dentry();
1145 if (!d_tracer) 1169 if (!d_tracer)
@@ -1179,12 +1203,34 @@ static __init int event_trace_init(void)
1179 /* The linker may leave blanks */ 1203 /* The linker may leave blanks */
1180 if (!call->name) 1204 if (!call->name)
1181 continue; 1205 continue;
1206 if (call->raw_init) {
1207 ret = call->raw_init();
1208 if (ret < 0) {
1209 if (ret != -ENOSYS)
1210 pr_warning("Could not initialize trace "
1211 "point events/%s\n", call->name);
1212 continue;
1213 }
1214 }
1182 list_add(&call->list, &ftrace_events); 1215 list_add(&call->list, &ftrace_events);
1183 event_create_dir(call, d_events, &ftrace_event_id_fops, 1216 event_create_dir(call, d_events, &ftrace_event_id_fops,
1184 &ftrace_enable_fops, &ftrace_event_filter_fops, 1217 &ftrace_enable_fops, &ftrace_event_filter_fops,
1185 &ftrace_event_format_fops); 1218 &ftrace_event_format_fops);
1186 } 1219 }
1187 1220
1221 while (true) {
1222 token = strsep(&buf, ",");
1223
1224 if (!token)
1225 break;
1226 if (!*token)
1227 continue;
1228
1229 ret = ftrace_set_clr_event(token, 1);
1230 if (ret)
1231 pr_warning("Failed to enable trace event: %s\n", token);
1232 }
1233
1188 ret = register_module_notifier(&trace_module_nb); 1234 ret = register_module_notifier(&trace_module_nb);
1189 if (ret) 1235 if (ret)
1190 pr_warning("Failed to register trace events module notifier\n"); 1236 pr_warning("Failed to register trace events module notifier\n");
@@ -1261,6 +1307,18 @@ static __init void event_trace_self_tests(void)
1261 if (!call->regfunc) 1307 if (!call->regfunc)
1262 continue; 1308 continue;
1263 1309
1310/*
1311 * Testing syscall events here is pretty useless, but
 1312 * we still do it if configured, even though it is time consuming.
1313 * What we really need is a user thread to perform the
1314 * syscalls as we test.
1315 */
1316#ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS
1317 if (call->system &&
1318 strcmp(call->system, "syscalls") == 0)
1319 continue;
1320#endif
1321
1264 pr_info("Testing event %s: ", call->name); 1322 pr_info("Testing event %s: ", call->name);
1265 1323
1266 /* 1324 /*
@@ -1334,12 +1392,13 @@ static __init void event_trace_self_tests(void)
1334 1392
1335#ifdef CONFIG_FUNCTION_TRACER 1393#ifdef CONFIG_FUNCTION_TRACER
1336 1394
1337static DEFINE_PER_CPU(atomic_t, test_event_disable); 1395static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable);
1338 1396
1339static void 1397static void
1340function_test_events_call(unsigned long ip, unsigned long parent_ip) 1398function_test_events_call(unsigned long ip, unsigned long parent_ip)
1341{ 1399{
1342 struct ring_buffer_event *event; 1400 struct ring_buffer_event *event;
1401 struct ring_buffer *buffer;
1343 struct ftrace_entry *entry; 1402 struct ftrace_entry *entry;
1344 unsigned long flags; 1403 unsigned long flags;
1345 long disabled; 1404 long disabled;
@@ -1350,14 +1409,15 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
1350 pc = preempt_count(); 1409 pc = preempt_count();
1351 resched = ftrace_preempt_disable(); 1410 resched = ftrace_preempt_disable();
1352 cpu = raw_smp_processor_id(); 1411 cpu = raw_smp_processor_id();
1353 disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu)); 1412 disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu));
1354 1413
1355 if (disabled != 1) 1414 if (disabled != 1)
1356 goto out; 1415 goto out;
1357 1416
1358 local_save_flags(flags); 1417 local_save_flags(flags);
1359 1418
1360 event = trace_current_buffer_lock_reserve(TRACE_FN, sizeof(*entry), 1419 event = trace_current_buffer_lock_reserve(&buffer,
1420 TRACE_FN, sizeof(*entry),
1361 flags, pc); 1421 flags, pc);
1362 if (!event) 1422 if (!event)
1363 goto out; 1423 goto out;
@@ -1365,10 +1425,10 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
1365 entry->ip = ip; 1425 entry->ip = ip;
1366 entry->parent_ip = parent_ip; 1426 entry->parent_ip = parent_ip;
1367 1427
1368 trace_nowake_buffer_unlock_commit(event, flags, pc); 1428 trace_nowake_buffer_unlock_commit(buffer, event, flags, pc);
1369 1429
1370 out: 1430 out:
1371 atomic_dec(&per_cpu(test_event_disable, cpu)); 1431 atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
1372 ftrace_preempt_enable(resched); 1432 ftrace_preempt_enable(resched);
1373} 1433}
1374 1434
@@ -1392,10 +1452,10 @@ static __init void event_trace_self_test_with_function(void)
1392 1452
1393static __init int event_trace_self_tests_init(void) 1453static __init int event_trace_self_tests_init(void)
1394{ 1454{
1395 1455 if (!tracing_selftest_disabled) {
1396 event_trace_self_tests(); 1456 event_trace_self_tests();
1397 1457 event_trace_self_test_with_function();
1398 event_trace_self_test_with_function(); 1458 }
1399 1459
1400 return 0; 1460 return 0;
1401} 1461}
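
The boot-time "trace_event=" handling added in this file reduces to a strsep() loop over a comma-separated list that skips empty tokens. A self-contained userspace sketch of that loop, where enable_event() merely stands in for ftrace_set_clr_event():

/* Sketch only: split a comma-separated event list the same way. */
#define _DEFAULT_SOURCE		/* for strsep() on glibc */
#include <stdio.h>
#include <string.h>

static int enable_event(const char *name)
{
	printf("enabling %s\n", name);
	return 0;
}

int main(void)
{
	char bootup_event_buf[] = "sched:sched_switch,,irq:irq_handler_entry";
	char *buf = bootup_event_buf;
	char *token;

	while (1) {
		token = strsep(&buf, ",");
		if (!token)		/* list exhausted */
			break;
		if (!*token)		/* skip empty entries such as ",," */
			continue;
		if (enable_event(token))
			fprintf(stderr, "failed to enable %s\n", token);
	}
	return 0;
}
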
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 936c621bbf46..23245785927f 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -121,6 +121,47 @@ struct filter_parse_state {
121 } operand; 121 } operand;
122}; 122};
123 123
124#define DEFINE_COMPARISON_PRED(type) \
125static int filter_pred_##type(struct filter_pred *pred, void *event, \
126 int val1, int val2) \
127{ \
128 type *addr = (type *)(event + pred->offset); \
129 type val = (type)pred->val; \
130 int match = 0; \
131 \
132 switch (pred->op) { \
133 case OP_LT: \
134 match = (*addr < val); \
135 break; \
136 case OP_LE: \
137 match = (*addr <= val); \
138 break; \
139 case OP_GT: \
140 match = (*addr > val); \
141 break; \
142 case OP_GE: \
143 match = (*addr >= val); \
144 break; \
145 default: \
146 break; \
147 } \
148 \
149 return match; \
150}
151
152#define DEFINE_EQUALITY_PRED(size) \
153static int filter_pred_##size(struct filter_pred *pred, void *event, \
154 int val1, int val2) \
155{ \
156 u##size *addr = (u##size *)(event + pred->offset); \
157 u##size val = (u##size)pred->val; \
158 int match; \
159 \
160 match = (val == *addr) ^ pred->not; \
161 \
162 return match; \
163}
164
124DEFINE_COMPARISON_PRED(s64); 165DEFINE_COMPARISON_PRED(s64);
125DEFINE_COMPARISON_PRED(u64); 166DEFINE_COMPARISON_PRED(u64);
126DEFINE_COMPARISON_PRED(s32); 167DEFINE_COMPARISON_PRED(s32);
@@ -163,6 +204,20 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
163 return match; 204 return match;
164} 205}
165 206
207/* Filter predicate for char * pointers */
208static int filter_pred_pchar(struct filter_pred *pred, void *event,
209 int val1, int val2)
210{
211 char **addr = (char **)(event + pred->offset);
212 int cmp, match;
213
214 cmp = strncmp(*addr, pred->str_val, pred->str_len);
215
216 match = (!cmp) ^ pred->not;
217
218 return match;
219}
220
166/* 221/*
167 * Filter predicate for dynamic sized arrays of characters. 222 * Filter predicate for dynamic sized arrays of characters.
168 * These are implemented through a list of strings at the end 223 * These are implemented through a list of strings at the end
@@ -176,11 +231,13 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
176static int filter_pred_strloc(struct filter_pred *pred, void *event, 231static int filter_pred_strloc(struct filter_pred *pred, void *event,
177 int val1, int val2) 232 int val1, int val2)
178{ 233{
179 unsigned short str_loc = *(unsigned short *)(event + pred->offset); 234 u32 str_item = *(u32 *)(event + pred->offset);
235 int str_loc = str_item & 0xffff;
236 int str_len = str_item >> 16;
180 char *addr = (char *)(event + str_loc); 237 char *addr = (char *)(event + str_loc);
181 int cmp, match; 238 int cmp, match;
182 239
183 cmp = strncmp(addr, pred->str_val, pred->str_len); 240 cmp = strncmp(addr, pred->str_val, str_len);
184 241
185 match = (!cmp) ^ pred->not; 242 match = (!cmp) ^ pred->not;
186 243
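
The filter_pred_strloc() change above unpacks a __data_loc word: the low 16 bits of the u32 carry the offset of the string within the record, the high 16 bits carry its length. In isolation the encoding looks like this (pack_data_loc() is invented for the example):

#include <stdint.h>
#include <stdio.h>

static uint32_t pack_data_loc(uint16_t offset, uint16_t len)
{
	/* length in the high half, offset in the low half */
	return (uint32_t)len << 16 | offset;
}

int main(void)
{
	uint32_t item = pack_data_loc(24, 7);

	printf("offset=%u len=%u\n",
	       (unsigned)(item & 0xffff), (unsigned)(item >> 16));
	return 0;
}
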
@@ -293,7 +350,7 @@ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
293 struct event_filter *filter = call->filter; 350 struct event_filter *filter = call->filter;
294 351
295 mutex_lock(&event_mutex); 352 mutex_lock(&event_mutex);
296 if (filter->filter_string) 353 if (filter && filter->filter_string)
297 trace_seq_printf(s, "%s\n", filter->filter_string); 354 trace_seq_printf(s, "%s\n", filter->filter_string);
298 else 355 else
299 trace_seq_printf(s, "none\n"); 356 trace_seq_printf(s, "none\n");
@@ -306,7 +363,7 @@ void print_subsystem_event_filter(struct event_subsystem *system,
306 struct event_filter *filter = system->filter; 363 struct event_filter *filter = system->filter;
307 364
308 mutex_lock(&event_mutex); 365 mutex_lock(&event_mutex);
309 if (filter->filter_string) 366 if (filter && filter->filter_string)
310 trace_seq_printf(s, "%s\n", filter->filter_string); 367 trace_seq_printf(s, "%s\n", filter->filter_string);
311 else 368 else
312 trace_seq_printf(s, "none\n"); 369 trace_seq_printf(s, "none\n");
@@ -374,6 +431,9 @@ void destroy_preds(struct ftrace_event_call *call)
374 struct event_filter *filter = call->filter; 431 struct event_filter *filter = call->filter;
375 int i; 432 int i;
376 433
434 if (!filter)
435 return;
436
377 for (i = 0; i < MAX_FILTER_PRED; i++) { 437 for (i = 0; i < MAX_FILTER_PRED; i++) {
378 if (filter->preds[i]) 438 if (filter->preds[i])
379 filter_free_pred(filter->preds[i]); 439 filter_free_pred(filter->preds[i]);
@@ -384,17 +444,19 @@ void destroy_preds(struct ftrace_event_call *call)
384 call->filter = NULL; 444 call->filter = NULL;
385} 445}
386 446
387int init_preds(struct ftrace_event_call *call) 447static int init_preds(struct ftrace_event_call *call)
388{ 448{
389 struct event_filter *filter; 449 struct event_filter *filter;
390 struct filter_pred *pred; 450 struct filter_pred *pred;
391 int i; 451 int i;
392 452
453 if (call->filter)
454 return 0;
455
393 filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL); 456 filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL);
394 if (!call->filter) 457 if (!call->filter)
395 return -ENOMEM; 458 return -ENOMEM;
396 459
397 call->filter_active = 0;
398 filter->n_preds = 0; 460 filter->n_preds = 0;
399 461
400 filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); 462 filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL);
@@ -416,30 +478,55 @@ oom:
416 478
417 return -ENOMEM; 479 return -ENOMEM;
418} 480}
419EXPORT_SYMBOL_GPL(init_preds);
420 481
421static void filter_free_subsystem_preds(struct event_subsystem *system) 482static int init_subsystem_preds(struct event_subsystem *system)
422{ 483{
423 struct event_filter *filter = system->filter;
424 struct ftrace_event_call *call; 484 struct ftrace_event_call *call;
425 int i; 485 int err;
426 486
427 if (filter->n_preds) { 487 list_for_each_entry(call, &ftrace_events, list) {
428 for (i = 0; i < filter->n_preds; i++) 488 if (!call->define_fields)
429 filter_free_pred(filter->preds[i]); 489 continue;
430 kfree(filter->preds); 490
431 filter->preds = NULL; 491 if (strcmp(call->system, system->name) != 0)
432 filter->n_preds = 0; 492 continue;
493
494 err = init_preds(call);
495 if (err)
496 return err;
433 } 497 }
434 498
499 return 0;
500}
501
502enum {
503 FILTER_DISABLE_ALL,
504 FILTER_INIT_NO_RESET,
505 FILTER_SKIP_NO_RESET,
506};
507
508static void filter_free_subsystem_preds(struct event_subsystem *system,
509 int flag)
510{
511 struct ftrace_event_call *call;
512
435 list_for_each_entry(call, &ftrace_events, list) { 513 list_for_each_entry(call, &ftrace_events, list) {
436 if (!call->define_fields) 514 if (!call->define_fields)
437 continue; 515 continue;
438 516
439 if (!strcmp(call->system, system->name)) { 517 if (strcmp(call->system, system->name) != 0)
440 filter_disable_preds(call); 518 continue;
441 remove_filter_string(call->filter); 519
520 if (flag == FILTER_INIT_NO_RESET) {
521 call->filter->no_reset = false;
522 continue;
442 } 523 }
524
525 if (flag == FILTER_SKIP_NO_RESET && call->filter->no_reset)
526 continue;
527
528 filter_disable_preds(call);
529 remove_filter_string(call->filter);
443 } 530 }
444} 531}
445 532
@@ -468,12 +555,7 @@ static int filter_add_pred_fn(struct filter_parse_state *ps,
468 return 0; 555 return 0;
469} 556}
470 557
471enum { 558int filter_assign_type(const char *type)
472 FILTER_STATIC_STRING = 1,
473 FILTER_DYN_STRING
474};
475
476static int is_string_field(const char *type)
477{ 559{
478 if (strstr(type, "__data_loc") && strstr(type, "char")) 560 if (strstr(type, "__data_loc") && strstr(type, "char"))
479 return FILTER_DYN_STRING; 561 return FILTER_DYN_STRING;
@@ -481,12 +563,19 @@ static int is_string_field(const char *type)
481 if (strchr(type, '[') && strstr(type, "char")) 563 if (strchr(type, '[') && strstr(type, "char"))
482 return FILTER_STATIC_STRING; 564 return FILTER_STATIC_STRING;
483 565
484 return 0; 566 return FILTER_OTHER;
567}
568
569static bool is_string_field(struct ftrace_event_field *field)
570{
571 return field->filter_type == FILTER_DYN_STRING ||
572 field->filter_type == FILTER_STATIC_STRING ||
573 field->filter_type == FILTER_PTR_STRING;
485} 574}
486 575
487static int is_legal_op(struct ftrace_event_field *field, int op) 576static int is_legal_op(struct ftrace_event_field *field, int op)
488{ 577{
489 if (is_string_field(field->type) && (op != OP_EQ && op != OP_NE)) 578 if (is_string_field(field) && (op != OP_EQ && op != OP_NE))
490 return 0; 579 return 0;
491 580
492 return 1; 581 return 1;
@@ -537,22 +626,24 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size,
537 626
538static int filter_add_pred(struct filter_parse_state *ps, 627static int filter_add_pred(struct filter_parse_state *ps,
539 struct ftrace_event_call *call, 628 struct ftrace_event_call *call,
540 struct filter_pred *pred) 629 struct filter_pred *pred,
630 bool dry_run)
541{ 631{
542 struct ftrace_event_field *field; 632 struct ftrace_event_field *field;
543 filter_pred_fn_t fn; 633 filter_pred_fn_t fn;
544 unsigned long long val; 634 unsigned long long val;
545 int string_type;
546 int ret; 635 int ret;
547 636
548 pred->fn = filter_pred_none; 637 pred->fn = filter_pred_none;
549 638
550 if (pred->op == OP_AND) { 639 if (pred->op == OP_AND) {
551 pred->pop_n = 2; 640 pred->pop_n = 2;
552 return filter_add_pred_fn(ps, call, pred, filter_pred_and); 641 fn = filter_pred_and;
642 goto add_pred_fn;
553 } else if (pred->op == OP_OR) { 643 } else if (pred->op == OP_OR) {
554 pred->pop_n = 2; 644 pred->pop_n = 2;
555 return filter_add_pred_fn(ps, call, pred, filter_pred_or); 645 fn = filter_pred_or;
646 goto add_pred_fn;
556 } 647 }
557 648
558 field = find_event_field(call, pred->field_name); 649 field = find_event_field(call, pred->field_name);
@@ -568,16 +659,17 @@ static int filter_add_pred(struct filter_parse_state *ps,
568 return -EINVAL; 659 return -EINVAL;
569 } 660 }
570 661
571 string_type = is_string_field(field->type); 662 if (is_string_field(field)) {
572 if (string_type) { 663 pred->str_len = field->size;
573 if (string_type == FILTER_STATIC_STRING) 664
665 if (field->filter_type == FILTER_STATIC_STRING)
574 fn = filter_pred_string; 666 fn = filter_pred_string;
575 else 667 else if (field->filter_type == FILTER_DYN_STRING)
576 fn = filter_pred_strloc; 668 fn = filter_pred_strloc;
577 pred->str_len = field->size; 669 else {
578 if (pred->op == OP_NE) 670 fn = filter_pred_pchar;
579 pred->not = 1; 671 pred->str_len = strlen(pred->str_val);
580 return filter_add_pred_fn(ps, call, pred, fn); 672 }
581 } else { 673 } else {
582 if (field->is_signed) 674 if (field->is_signed)
583 ret = strict_strtoll(pred->str_val, 0, &val); 675 ret = strict_strtoll(pred->str_val, 0, &val);
@@ -588,44 +680,33 @@ static int filter_add_pred(struct filter_parse_state *ps,
588 return -EINVAL; 680 return -EINVAL;
589 } 681 }
590 pred->val = val; 682 pred->val = val;
591 }
592 683
593 fn = select_comparison_fn(pred->op, field->size, field->is_signed); 684 fn = select_comparison_fn(pred->op, field->size,
594 if (!fn) { 685 field->is_signed);
595 parse_error(ps, FILT_ERR_INVALID_OP, 0); 686 if (!fn) {
596 return -EINVAL; 687 parse_error(ps, FILT_ERR_INVALID_OP, 0);
688 return -EINVAL;
689 }
597 } 690 }
598 691
599 if (pred->op == OP_NE) 692 if (pred->op == OP_NE)
600 pred->not = 1; 693 pred->not = 1;
601 694
602 return filter_add_pred_fn(ps, call, pred, fn); 695add_pred_fn:
696 if (!dry_run)
697 return filter_add_pred_fn(ps, call, pred, fn);
698 return 0;
603} 699}
604 700
605static int filter_add_subsystem_pred(struct filter_parse_state *ps, 701static int filter_add_subsystem_pred(struct filter_parse_state *ps,
606 struct event_subsystem *system, 702 struct event_subsystem *system,
607 struct filter_pred *pred, 703 struct filter_pred *pred,
608 char *filter_string) 704 char *filter_string,
705 bool dry_run)
609{ 706{
610 struct event_filter *filter = system->filter;
611 struct ftrace_event_call *call; 707 struct ftrace_event_call *call;
612 int err = 0; 708 int err = 0;
613 709 bool fail = true;
614 if (!filter->preds) {
615 filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
616 GFP_KERNEL);
617
618 if (!filter->preds)
619 return -ENOMEM;
620 }
621
622 if (filter->n_preds == MAX_FILTER_PRED) {
623 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
624 return -ENOSPC;
625 }
626
627 filter->preds[filter->n_preds] = pred;
628 filter->n_preds++;
629 710
630 list_for_each_entry(call, &ftrace_events, list) { 711 list_for_each_entry(call, &ftrace_events, list) {
631 712
@@ -635,16 +716,24 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps,
635 if (strcmp(call->system, system->name)) 716 if (strcmp(call->system, system->name))
636 continue; 717 continue;
637 718
638 err = filter_add_pred(ps, call, pred); 719 if (call->filter->no_reset)
639 if (err) { 720 continue;
640 filter_free_subsystem_preds(system); 721
641 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); 722 err = filter_add_pred(ps, call, pred, dry_run);
642 goto out; 723 if (err)
643 } 724 call->filter->no_reset = true;
644 replace_filter_string(call->filter, filter_string); 725 else
726 fail = false;
727
728 if (!dry_run)
729 replace_filter_string(call->filter, filter_string);
645 } 730 }
646out: 731
647 return err; 732 if (fail) {
733 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
734 return err;
735 }
736 return 0;
648} 737}
649 738
650static void parse_init(struct filter_parse_state *ps, 739static void parse_init(struct filter_parse_state *ps,
@@ -1003,12 +1092,14 @@ static int check_preds(struct filter_parse_state *ps)
1003static int replace_preds(struct event_subsystem *system, 1092static int replace_preds(struct event_subsystem *system,
1004 struct ftrace_event_call *call, 1093 struct ftrace_event_call *call,
1005 struct filter_parse_state *ps, 1094 struct filter_parse_state *ps,
1006 char *filter_string) 1095 char *filter_string,
1096 bool dry_run)
1007{ 1097{
1008 char *operand1 = NULL, *operand2 = NULL; 1098 char *operand1 = NULL, *operand2 = NULL;
1009 struct filter_pred *pred; 1099 struct filter_pred *pred;
1010 struct postfix_elt *elt; 1100 struct postfix_elt *elt;
1011 int err; 1101 int err;
1102 int n_preds = 0;
1012 1103
1013 err = check_preds(ps); 1104 err = check_preds(ps);
1014 if (err) 1105 if (err)
@@ -1027,19 +1118,14 @@ static int replace_preds(struct event_subsystem *system,
1027 continue; 1118 continue;
1028 } 1119 }
1029 1120
1121 if (n_preds++ == MAX_FILTER_PRED) {
1122 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
1123 return -ENOSPC;
1124 }
1125
1030 if (elt->op == OP_AND || elt->op == OP_OR) { 1126 if (elt->op == OP_AND || elt->op == OP_OR) {
1031 pred = create_logical_pred(elt->op); 1127 pred = create_logical_pred(elt->op);
1032 if (call) { 1128 goto add_pred;
1033 err = filter_add_pred(ps, call, pred);
1034 filter_free_pred(pred);
1035 } else
1036 err = filter_add_subsystem_pred(ps, system,
1037 pred, filter_string);
1038 if (err)
1039 return err;
1040
1041 operand1 = operand2 = NULL;
1042 continue;
1043 } 1129 }
1044 1130
1045 if (!operand1 || !operand2) { 1131 if (!operand1 || !operand2) {
@@ -1048,12 +1134,15 @@ static int replace_preds(struct event_subsystem *system,
1048 } 1134 }
1049 1135
1050 pred = create_pred(elt->op, operand1, operand2); 1136 pred = create_pred(elt->op, operand1, operand2);
1051 if (call) { 1137add_pred:
1052 err = filter_add_pred(ps, call, pred); 1138 if (!pred)
1053 filter_free_pred(pred); 1139 return -ENOMEM;
1054 } else 1140 if (call)
1141 err = filter_add_pred(ps, call, pred, false);
1142 else
1055 err = filter_add_subsystem_pred(ps, system, pred, 1143 err = filter_add_subsystem_pred(ps, system, pred,
1056 filter_string); 1144 filter_string, dry_run);
1145 filter_free_pred(pred);
1057 if (err) 1146 if (err)
1058 return err; 1147 return err;
1059 1148
@@ -1071,6 +1160,10 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1071 1160
1072 mutex_lock(&event_mutex); 1161 mutex_lock(&event_mutex);
1073 1162
1163 err = init_preds(call);
1164 if (err)
1165 goto out_unlock;
1166
1074 if (!strcmp(strstrip(filter_string), "0")) { 1167 if (!strcmp(strstrip(filter_string), "0")) {
1075 filter_disable_preds(call); 1168 filter_disable_preds(call);
1076 remove_filter_string(call->filter); 1169 remove_filter_string(call->filter);
@@ -1093,7 +1186,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1093 goto out; 1186 goto out;
1094 } 1187 }
1095 1188
1096 err = replace_preds(NULL, call, ps, filter_string); 1189 err = replace_preds(NULL, call, ps, filter_string, false);
1097 if (err) 1190 if (err)
1098 append_filter_err(ps, call->filter); 1191 append_filter_err(ps, call->filter);
1099 1192
@@ -1116,8 +1209,12 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1116 1209
1117 mutex_lock(&event_mutex); 1210 mutex_lock(&event_mutex);
1118 1211
1212 err = init_subsystem_preds(system);
1213 if (err)
1214 goto out_unlock;
1215
1119 if (!strcmp(strstrip(filter_string), "0")) { 1216 if (!strcmp(strstrip(filter_string), "0")) {
1120 filter_free_subsystem_preds(system); 1217 filter_free_subsystem_preds(system, FILTER_DISABLE_ALL);
1121 remove_filter_string(system->filter); 1218 remove_filter_string(system->filter);
1122 mutex_unlock(&event_mutex); 1219 mutex_unlock(&event_mutex);
1123 return 0; 1220 return 0;
@@ -1128,7 +1225,6 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1128 if (!ps) 1225 if (!ps)
1129 goto out_unlock; 1226 goto out_unlock;
1130 1227
1131 filter_free_subsystem_preds(system);
1132 replace_filter_string(system->filter, filter_string); 1228 replace_filter_string(system->filter, filter_string);
1133 1229
1134 parse_init(ps, filter_ops, filter_string); 1230 parse_init(ps, filter_ops, filter_string);
@@ -1138,9 +1234,23 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1138 goto out; 1234 goto out;
1139 } 1235 }
1140 1236
1141 err = replace_preds(system, NULL, ps, filter_string); 1237 filter_free_subsystem_preds(system, FILTER_INIT_NO_RESET);
1142 if (err) 1238
 1239	/* try to see which events the filter can be applied to */
1240 err = replace_preds(system, NULL, ps, filter_string, true);
1241 if (err) {
1143 append_filter_err(ps, system->filter); 1242 append_filter_err(ps, system->filter);
1243 goto out;
1244 }
1245
1246 filter_free_subsystem_preds(system, FILTER_SKIP_NO_RESET);
1247
1248 /* really apply the filter to the events */
1249 err = replace_preds(system, NULL, ps, filter_string, false);
1250 if (err) {
1251 append_filter_err(ps, system->filter);
1252 filter_free_subsystem_preds(system, 2);
1253 }
1144 1254
1145out: 1255out:
1146 filter_opstack_clear(ps); 1256 filter_opstack_clear(ps);
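
For reference, the DEFINE_COMPARISON_PRED() block added near the top of this file is a plain token-pasting generator: one expansion per integer type emits a predicate that switches on the comparison operator. A trimmed-down userspace sketch is below; the op enum and the pred_##type() signature are simplified, and the kernel version additionally reads the operand out of the event record via pred->offset:

#include <stdint.h>
#include <stdio.h>

enum { OP_LT, OP_LE, OP_GT, OP_GE };

#define DEFINE_COMPARISON_PRED(type)			\
static int pred_##type(type lhs, type rhs, int op)	\
{							\
	switch (op) {					\
	case OP_LT: return lhs <  rhs;			\
	case OP_LE: return lhs <= rhs;			\
	case OP_GT: return lhs >  rhs;			\
	case OP_GE: return lhs >= rhs;			\
	default:    return 0;				\
	}						\
}

DEFINE_COMPARISON_PRED(int64_t)
DEFINE_COMPARISON_PRED(uint64_t)

int main(void)
{
	printf("%d\n", pred_int64_t(-5, 3, OP_LT));	/* 1: -5 < 3 */
	printf("%d\n", pred_uint64_t(7, 3, OP_GE));	/* 1: 7 >= 3 */
	return 0;
}
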
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index d06cf898dc86..9753fcc61bc5 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -15,116 +15,209 @@
15 15
16#include "trace_output.h" 16#include "trace_output.h"
17 17
18#undef TRACE_SYSTEM
19#define TRACE_SYSTEM ftrace
18 20
19#undef TRACE_STRUCT 21/* not needed for this file */
20#define TRACE_STRUCT(args...) args 22#undef __field_struct
23#define __field_struct(type, item)
21 24
22extern void __bad_type_size(void); 25#undef __field
26#define __field(type, item) type item;
23 27
24#undef TRACE_FIELD 28#undef __field_desc
25#define TRACE_FIELD(type, item, assign) \ 29#define __field_desc(type, container, item) type item;
26 if (sizeof(type) != sizeof(field.item)) \ 30
27 __bad_type_size(); \ 31#undef __array
32#define __array(type, item, size) type item[size];
33
34#undef __array_desc
35#define __array_desc(type, container, item, size) type item[size];
36
37#undef __dynamic_array
38#define __dynamic_array(type, item) type item[];
39
40#undef F_STRUCT
41#define F_STRUCT(args...) args
42
43#undef F_printk
44#define F_printk(fmt, args...) fmt, args
45
46#undef FTRACE_ENTRY
47#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
48struct ____ftrace_##name { \
49 tstruct \
50}; \
51static void __used ____ftrace_check_##name(void) \
52{ \
53 struct ____ftrace_##name *__entry = NULL; \
54 \
 55 	/* force compile-time check on F_printk() */ \
56 printk(print); \
57}
58
59#undef FTRACE_ENTRY_DUP
60#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print) \
61 FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print))
62
63#include "trace_entries.h"
64
65
66#undef __field
67#define __field(type, item) \
28 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ 68 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
29 "offset:%u;\tsize:%u;\n", \ 69 "offset:%zu;\tsize:%zu;\n", \
30 (unsigned int)offsetof(typeof(field), item), \ 70 offsetof(typeof(field), item), \
31 (unsigned int)sizeof(field.item)); \ 71 sizeof(field.item)); \
32 if (!ret) \ 72 if (!ret) \
33 return 0; 73 return 0;
34 74
75#undef __field_desc
76#define __field_desc(type, container, item) \
77 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
78 "offset:%zu;\tsize:%zu;\n", \
79 offsetof(typeof(field), container.item), \
80 sizeof(field.container.item)); \
81 if (!ret) \
82 return 0;
83
84#undef __array
85#define __array(type, item, len) \
86 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
87 "offset:%zu;\tsize:%zu;\n", \
88 offsetof(typeof(field), item), \
89 sizeof(field.item)); \
90 if (!ret) \
91 return 0;
35 92
36#undef TRACE_FIELD_SPECIAL 93#undef __array_desc
37#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \ 94#define __array_desc(type, container, item, len) \
38 ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \ 95 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
39 "offset:%u;\tsize:%u;\n", \ 96 "offset:%zu;\tsize:%zu;\n", \
40 (unsigned int)offsetof(typeof(field), item), \ 97 offsetof(typeof(field), container.item), \
41 (unsigned int)sizeof(field.item)); \ 98 sizeof(field.container.item)); \
42 if (!ret) \ 99 if (!ret) \
43 return 0; 100 return 0;
44 101
45#undef TRACE_FIELD_ZERO_CHAR 102#undef __dynamic_array
46#define TRACE_FIELD_ZERO_CHAR(item) \ 103#define __dynamic_array(type, item) \
47 ret = trace_seq_printf(s, "\tfield:char " #item ";\t" \ 104 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
48 "offset:%u;\tsize:0;\n", \ 105 "offset:%zu;\tsize:0;\n", \
49 (unsigned int)offsetof(typeof(field), item)); \ 106 offsetof(typeof(field), item)); \
50 if (!ret) \ 107 if (!ret) \
51 return 0; 108 return 0;
52 109
53#undef TRACE_FIELD_SIGN 110#undef F_printk
54#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ 111#define F_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
55 TRACE_FIELD(type, item, assign)
56 112
57#undef TP_RAW_FMT 113#undef __entry
58#define TP_RAW_FMT(args...) args 114#define __entry REC
59 115
60#undef TRACE_EVENT_FORMAT 116#undef FTRACE_ENTRY
61#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ 117#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
62static int \ 118static int \
63ftrace_format_##call(struct trace_seq *s) \ 119ftrace_format_##name(struct ftrace_event_call *unused, \
120 struct trace_seq *s) \
64{ \ 121{ \
65 struct args field; \ 122 struct struct_name field __attribute__((unused)); \
66 int ret; \ 123 int ret = 0; \
67 \ 124 \
68 tstruct; \ 125 tstruct; \
69 \ 126 \
70 trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \ 127 trace_seq_printf(s, "\nprint fmt: " print); \
71 \ 128 \
72 return ret; \ 129 return ret; \
73} 130}
74 131
75#undef TRACE_EVENT_FORMAT_NOFILTER 132#include "trace_entries.h"
76#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ 133
77 tpfmt) \ 134
78static int \ 135#undef __field
79ftrace_format_##call(struct trace_seq *s) \ 136#define __field(type, item) \
137 ret = trace_define_field(event_call, #type, #item, \
138 offsetof(typeof(field), item), \
139 sizeof(field.item), \
140 is_signed_type(type), FILTER_OTHER); \
141 if (ret) \
142 return ret;
143
144#undef __field_desc
145#define __field_desc(type, container, item) \
146 ret = trace_define_field(event_call, #type, #item, \
147 offsetof(typeof(field), \
148 container.item), \
149 sizeof(field.container.item), \
150 is_signed_type(type), FILTER_OTHER); \
151 if (ret) \
152 return ret;
153
154#undef __array
155#define __array(type, item, len) \
156 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
157 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
158 offsetof(typeof(field), item), \
159 sizeof(field.item), 0, FILTER_OTHER); \
160 if (ret) \
161 return ret;
162
163#undef __array_desc
164#define __array_desc(type, container, item, len) \
165 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
166 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
167 offsetof(typeof(field), \
168 container.item), \
169 sizeof(field.container.item), 0, \
170 FILTER_OTHER); \
171 if (ret) \
172 return ret;
173
174#undef __dynamic_array
175#define __dynamic_array(type, item)
176
177#undef FTRACE_ENTRY
178#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
179int \
180ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
80{ \ 181{ \
81 struct args field; \ 182 struct struct_name field; \
82 int ret; \ 183 int ret; \
83 \ 184 \
84 tstruct; \ 185 ret = trace_define_common_fields(event_call); \
186 if (ret) \
187 return ret; \
85 \ 188 \
86 trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \ 189 tstruct; \
87 \ 190 \
88 return ret; \ 191 return ret; \
89} 192}
90 193
91#include "trace_event_types.h" 194#include "trace_entries.h"
92
93#undef TRACE_ZERO_CHAR
94#define TRACE_ZERO_CHAR(arg)
95 195
96#undef TRACE_FIELD
97#define TRACE_FIELD(type, item, assign)\
98 entry->item = assign;
99 196
100#undef TRACE_FIELD 197#undef __field
101#define TRACE_FIELD(type, item, assign)\ 198#define __field(type, item)
102 entry->item = assign;
103 199
104#undef TRACE_FIELD_SIGN 200#undef __field_desc
105#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ 201#define __field_desc(type, container, item)
106 TRACE_FIELD(type, item, assign)
107 202
108#undef TP_CMD 203#undef __array
109#define TP_CMD(cmd...) cmd 204#define __array(type, item, len)
110 205
111#undef TRACE_ENTRY 206#undef __array_desc
112#define TRACE_ENTRY entry 207#define __array_desc(type, container, item, len)
113 208
114#undef TRACE_FIELD_SPECIAL 209#undef __dynamic_array
115#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \ 210#define __dynamic_array(type, item)
116 cmd;
117 211
118#undef TRACE_EVENT_FORMAT 212#undef FTRACE_ENTRY
119#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ 213#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \
120int ftrace_define_fields_##call(void); \
121static int ftrace_raw_init_event_##call(void); \ 214static int ftrace_raw_init_event_##call(void); \
122 \ 215 \
123struct ftrace_event_call __used \ 216struct ftrace_event_call __used \
124__attribute__((__aligned__(4))) \ 217__attribute__((__aligned__(4))) \
125__attribute__((section("_ftrace_events"))) event_##call = { \ 218__attribute__((section("_ftrace_events"))) event_##call = { \
126 .name = #call, \ 219 .name = #call, \
127 .id = proto, \ 220 .id = type, \
128 .system = __stringify(TRACE_SYSTEM), \ 221 .system = __stringify(TRACE_SYSTEM), \
129 .raw_init = ftrace_raw_init_event_##call, \ 222 .raw_init = ftrace_raw_init_event_##call, \
130 .show_format = ftrace_format_##call, \ 223 .show_format = ftrace_format_##call, \
@@ -133,74 +226,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
133static int ftrace_raw_init_event_##call(void) \ 226static int ftrace_raw_init_event_##call(void) \
134{ \ 227{ \
135 INIT_LIST_HEAD(&event_##call.fields); \ 228 INIT_LIST_HEAD(&event_##call.fields); \
136 init_preds(&event_##call); \
137 return 0; \ 229 return 0; \
138} \ 230} \
139 231
140#undef TRACE_EVENT_FORMAT_NOFILTER 232#include "trace_entries.h"
141#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \
142 tpfmt) \
143 \
144struct ftrace_event_call __used \
145__attribute__((__aligned__(4))) \
146__attribute__((section("_ftrace_events"))) event_##call = { \
147 .name = #call, \
148 .id = proto, \
149 .system = __stringify(TRACE_SYSTEM), \
150 .show_format = ftrace_format_##call, \
151};
152
153#include "trace_event_types.h"
154
155#undef TRACE_FIELD
156#define TRACE_FIELD(type, item, assign) \
157 ret = trace_define_field(event_call, #type, #item, \
158 offsetof(typeof(field), item), \
159 sizeof(field.item), is_signed_type(type)); \
160 if (ret) \
161 return ret;
162
163#undef TRACE_FIELD_SPECIAL
164#define TRACE_FIELD_SPECIAL(type, item, len, cmd) \
165 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
166 offsetof(typeof(field), item), \
167 sizeof(field.item), 0); \
168 if (ret) \
169 return ret;
170
171#undef TRACE_FIELD_SIGN
172#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \
173 ret = trace_define_field(event_call, #type, #item, \
174 offsetof(typeof(field), item), \
175 sizeof(field.item), is_signed); \
176 if (ret) \
177 return ret;
178
179#undef TRACE_FIELD_ZERO_CHAR
180#define TRACE_FIELD_ZERO_CHAR(item)
181
182#undef TRACE_EVENT_FORMAT
183#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
184int \
185ftrace_define_fields_##call(void) \
186{ \
187 struct ftrace_event_call *event_call = &event_##call; \
188 struct args field; \
189 int ret; \
190 \
191 __common_field(unsigned char, type, 0); \
192 __common_field(unsigned char, flags, 0); \
193 __common_field(unsigned char, preempt_count, 0); \
194 __common_field(int, pid, 1); \
195 __common_field(int, tgid, 1); \
196 \
197 tstruct; \
198 \
199 return ret; \
200}
201
202#undef TRACE_EVENT_FORMAT_NOFILTER
203#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \
204 tpfmt)
205
206#include "trace_event_types.h"
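
The rework of trace_export.c above leans on the multiple-inclusion (X-macro) technique: trace_entries.h is included several times, each time with FTRACE_ENTRY() and the __field*() macros redefined, so a single entry list yields structures, format callbacks and field definitions. A self-contained sketch of the idea, with the entry list inlined instead of kept in a separate header; ENTRY_LIST(), describe_##name() and the sample entries are invented:

#include <stdio.h>

/* The "header": one X() invocation per entry, like FTRACE_ENTRY(). */
#define ENTRY_LIST(X)				\
	X(function, "ip=%lx parent=%lx")	\
	X(bputs,    "ip=%lx str=%s")

/* First pass: generate one numeric id per entry. */
#define X_AS_ID(name, fmt)	id_##name,
enum { ENTRY_LIST(X_AS_ID) nr_entries };
#undef X_AS_ID

/* Second pass: generate a per-entry describe function. */
#define X_AS_FUNC(name, fmt)				\
static void describe_##name(void)			\
{							\
	printf("%s (id %d): print fmt: \"%s\"\n",	\
	       #name, id_##name, fmt);			\
}
ENTRY_LIST(X_AS_FUNC)
#undef X_AS_FUNC

int main(void)
{
	describe_function();
	describe_bputs();
	return 0;
}

Each "pass" over the list plays the role of one include of trace_entries.h under a different set of macro definitions.
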
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 7402144bff21..b3f3776b0cd6 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -288,11 +288,9 @@ static int
288ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, 288ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
289 struct ftrace_probe_ops *ops, void *data) 289 struct ftrace_probe_ops *ops, void *data)
290{ 290{
291 char str[KSYM_SYMBOL_LEN];
292 long count = (long)data; 291 long count = (long)data;
293 292
294 kallsyms_lookup(ip, NULL, NULL, NULL, str); 293 seq_printf(m, "%ps:", (void *)ip);
295 seq_printf(m, "%s:", str);
296 294
297 if (ops == &traceon_probe_ops) 295 if (ops == &traceon_probe_ops)
298 seq_printf(m, "traceon"); 296 seq_printf(m, "traceon");
@@ -363,7 +361,7 @@ ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable)
363 out_reg: 361 out_reg:
364 ret = register_ftrace_function_probe(glob, ops, count); 362 ret = register_ftrace_function_probe(glob, ops, count);
365 363
366 return ret; 364 return ret < 0 ? ret : 0;
367} 365}
368 366
369static struct ftrace_func_command ftrace_traceon_cmd = { 367static struct ftrace_func_command ftrace_traceon_cmd = {
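
The one-line change to ftrace_trace_onoff_callback() above appears to account for register_ftrace_function_probe() reporting the number of functions it attached to on success, while the command callback is expected to return 0. A minimal sketch of that normalization, with register_probe() and traceon_command() invented for the example:

#include <errno.h>
#include <stdio.h>

/* Pretend probe registration: returns number of matches or -errno. */
static int register_probe(const char *glob)
{
	if (!glob || !*glob)
		return -EINVAL;
	return 3;	/* e.g. three functions matched the glob */
}

/* Command-style callback: 0 on success, negative errno on failure. */
static int traceon_command(const char *glob)
{
	int ret = register_probe(glob);

	return ret < 0 ? ret : 0;
}

int main(void)
{
	printf("ok: %d\n", traceon_command("sched_*"));
	printf("err: %d\n", traceon_command(""));
	return 0;
}
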
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index d2249abafb53..45e6c01b2e4d 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -52,7 +52,7 @@ static struct tracer_flags tracer_flags = {
52 .opts = trace_opts 52 .opts = trace_opts
53}; 53};
54 54
55/* pid on the last trace processed */ 55static struct trace_array *graph_array;
56 56
57 57
58/* Add a function return address to the trace stack on thread info.*/ 58/* Add a function return address to the trace stack on thread info.*/
@@ -124,7 +124,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
124 if (unlikely(current->ret_stack[index].fp != frame_pointer)) { 124 if (unlikely(current->ret_stack[index].fp != frame_pointer)) {
125 ftrace_graph_stop(); 125 ftrace_graph_stop();
126 WARN(1, "Bad frame pointer: expected %lx, received %lx\n" 126 WARN(1, "Bad frame pointer: expected %lx, received %lx\n"
127 " from func %pF return to %lx\n", 127 " from func %ps return to %lx\n",
128 current->ret_stack[index].fp, 128 current->ret_stack[index].fp,
129 frame_pointer, 129 frame_pointer,
130 (void *)current->ret_stack[index].func, 130 (void *)current->ret_stack[index].func,
@@ -166,10 +166,123 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
166 return ret; 166 return ret;
167} 167}
168 168
169static int __trace_graph_entry(struct trace_array *tr,
170 struct ftrace_graph_ent *trace,
171 unsigned long flags,
172 int pc)
173{
174 struct ftrace_event_call *call = &event_funcgraph_entry;
175 struct ring_buffer_event *event;
176 struct ring_buffer *buffer = tr->buffer;
177 struct ftrace_graph_ent_entry *entry;
178
179 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
180 return 0;
181
182 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT,
183 sizeof(*entry), flags, pc);
184 if (!event)
185 return 0;
186 entry = ring_buffer_event_data(event);
187 entry->graph_ent = *trace;
188 if (!filter_current_check_discard(buffer, call, entry, event))
189 ring_buffer_unlock_commit(buffer, event);
190
191 return 1;
192}
193
194int trace_graph_entry(struct ftrace_graph_ent *trace)
195{
196 struct trace_array *tr = graph_array;
197 struct trace_array_cpu *data;
198 unsigned long flags;
199 long disabled;
200 int ret;
201 int cpu;
202 int pc;
203
204 if (unlikely(!tr))
205 return 0;
206
207 if (!ftrace_trace_task(current))
208 return 0;
209
210 if (!ftrace_graph_addr(trace->func))
211 return 0;
212
213 local_irq_save(flags);
214 cpu = raw_smp_processor_id();
215 data = tr->data[cpu];
216 disabled = atomic_inc_return(&data->disabled);
217 if (likely(disabled == 1)) {
218 pc = preempt_count();
219 ret = __trace_graph_entry(tr, trace, flags, pc);
220 } else {
221 ret = 0;
222 }
223 /* Only do the atomic if it is not already set */
224 if (!test_tsk_trace_graph(current))
225 set_tsk_trace_graph(current);
226
227 atomic_dec(&data->disabled);
228 local_irq_restore(flags);
229
230 return ret;
231}
232
233static void __trace_graph_return(struct trace_array *tr,
234 struct ftrace_graph_ret *trace,
235 unsigned long flags,
236 int pc)
237{
238 struct ftrace_event_call *call = &event_funcgraph_exit;
239 struct ring_buffer_event *event;
240 struct ring_buffer *buffer = tr->buffer;
241 struct ftrace_graph_ret_entry *entry;
242
243 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
244 return;
245
246 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET,
247 sizeof(*entry), flags, pc);
248 if (!event)
249 return;
250 entry = ring_buffer_event_data(event);
251 entry->ret = *trace;
252 if (!filter_current_check_discard(buffer, call, entry, event))
253 ring_buffer_unlock_commit(buffer, event);
254}
255
256void trace_graph_return(struct ftrace_graph_ret *trace)
257{
258 struct trace_array *tr = graph_array;
259 struct trace_array_cpu *data;
260 unsigned long flags;
261 long disabled;
262 int cpu;
263 int pc;
264
265 local_irq_save(flags);
266 cpu = raw_smp_processor_id();
267 data = tr->data[cpu];
268 disabled = atomic_inc_return(&data->disabled);
269 if (likely(disabled == 1)) {
270 pc = preempt_count();
271 __trace_graph_return(tr, trace, flags, pc);
272 }
273 if (!trace->depth)
274 clear_tsk_trace_graph(current);
275 atomic_dec(&data->disabled);
276 local_irq_restore(flags);
277}
278
169static int graph_trace_init(struct trace_array *tr) 279static int graph_trace_init(struct trace_array *tr)
170{ 280{
171 int ret = register_ftrace_graph(&trace_graph_return, 281 int ret;
172 &trace_graph_entry); 282
283 graph_array = tr;
284 ret = register_ftrace_graph(&trace_graph_return,
285 &trace_graph_entry);
173 if (ret) 286 if (ret)
174 return ret; 287 return ret;
175 tracing_start_cmdline_record(); 288 tracing_start_cmdline_record();
@@ -177,49 +290,30 @@ static int graph_trace_init(struct trace_array *tr)
177 return 0; 290 return 0;
178} 291}
179 292
293void set_graph_array(struct trace_array *tr)
294{
295 graph_array = tr;
296}
297
180static void graph_trace_reset(struct trace_array *tr) 298static void graph_trace_reset(struct trace_array *tr)
181{ 299{
182 tracing_stop_cmdline_record(); 300 tracing_stop_cmdline_record();
183 unregister_ftrace_graph(); 301 unregister_ftrace_graph();
184} 302}
185 303
186static inline int log10_cpu(int nb) 304static int max_bytes_for_cpu;
187{
188 if (nb / 100)
189 return 3;
190 if (nb / 10)
191 return 2;
192 return 1;
193}
194 305
195static enum print_line_t 306static enum print_line_t
196print_graph_cpu(struct trace_seq *s, int cpu) 307print_graph_cpu(struct trace_seq *s, int cpu)
197{ 308{
198 int i;
199 int ret; 309 int ret;
200 int log10_this = log10_cpu(cpu);
201 int log10_all = log10_cpu(cpumask_weight(cpu_online_mask));
202
203 310
204 /* 311 /*
205 * Start with a space character - to make it stand out 312 * Start with a space character - to make it stand out
206 * to the right a bit when trace output is pasted into 313 * to the right a bit when trace output is pasted into
207 * email: 314 * email:
208 */ 315 */
209 ret = trace_seq_printf(s, " "); 316 ret = trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu);
210
211 /*
212 * Tricky - we space the CPU field according to the max
213 * number of online CPUs. On a 2-cpu system it would take
214 * a maximum of 1 digit - on a 128 cpu system it would
215 * take up to 3 digits:
216 */
217 for (i = 0; i < log10_all - log10_this; i++) {
218 ret = trace_seq_printf(s, " ");
219 if (!ret)
220 return TRACE_TYPE_PARTIAL_LINE;
221 }
222 ret = trace_seq_printf(s, "%d) ", cpu);
223 if (!ret) 317 if (!ret)
224 return TRACE_TYPE_PARTIAL_LINE; 318 return TRACE_TYPE_PARTIAL_LINE;
225 319
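The rewritten print_graph_cpu() drops the hand-rolled log10_cpu() padding loop in favour of a single %*d conversion, which right-aligns the CPU number in a field whose width is supplied as an argument (max_bytes_for_cpu, computed at init time further down). A standalone illustration of the dynamic-width conversion, with an arbitrary width:

#include <stdio.h>

int main(void)
{
    int max_bytes_for_cpu = 3;       /* e.g. CPU ids of up to three digits */

    /* '*' pulls the field width from the argument list, so every CPU
     * number lands in a column of the same width. */
    printf(" %*d) \n", max_bytes_for_cpu, 1);     /* "   1) " */
    printf(" %*d) \n", max_bytes_for_cpu, 12);    /* "  12) " */
    printf(" %*d) \n", max_bytes_for_cpu, 123);   /* " 123) " */
    return 0;
}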
@@ -270,6 +364,15 @@ print_graph_proc(struct trace_seq *s, pid_t pid)
270} 364}
271 365
272 366
367static enum print_line_t
368print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
369{
370 if (!trace_seq_putc(s, ' '))
371 return 0;
372
373 return trace_print_lat_fmt(s, entry);
374}
375
273/* If the pid changed since the last trace, output this event */ 376/* If the pid changed since the last trace, output this event */
274static enum print_line_t 377static enum print_line_t
275verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) 378verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
@@ -427,6 +530,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
427 if (ret == TRACE_TYPE_PARTIAL_LINE) 530 if (ret == TRACE_TYPE_PARTIAL_LINE)
428 return TRACE_TYPE_PARTIAL_LINE; 531 return TRACE_TYPE_PARTIAL_LINE;
429 } 532 }
533
430 /* Proc */ 534 /* Proc */
431 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { 535 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
432 ret = print_graph_proc(s, pid); 536 ret = print_graph_proc(s, pid);
@@ -565,11 +669,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
565 return TRACE_TYPE_PARTIAL_LINE; 669 return TRACE_TYPE_PARTIAL_LINE;
566 } 670 }
567 671
568 ret = seq_print_ip_sym(s, call->func, 0); 672 ret = trace_seq_printf(s, "%ps();\n", (void *)call->func);
569 if (!ret)
570 return TRACE_TYPE_PARTIAL_LINE;
571
572 ret = trace_seq_printf(s, "();\n");
573 if (!ret) 673 if (!ret)
574 return TRACE_TYPE_PARTIAL_LINE; 674 return TRACE_TYPE_PARTIAL_LINE;
575 675
@@ -612,11 +712,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
612 return TRACE_TYPE_PARTIAL_LINE; 712 return TRACE_TYPE_PARTIAL_LINE;
613 } 713 }
614 714
615 ret = seq_print_ip_sym(s, call->func, 0); 715 ret = trace_seq_printf(s, "%ps() {\n", (void *)call->func);
616 if (!ret)
617 return TRACE_TYPE_PARTIAL_LINE;
618
619 ret = trace_seq_printf(s, "() {\n");
620 if (!ret) 716 if (!ret)
621 return TRACE_TYPE_PARTIAL_LINE; 717 return TRACE_TYPE_PARTIAL_LINE;
622 718
@@ -672,6 +768,13 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
672 return TRACE_TYPE_PARTIAL_LINE; 768 return TRACE_TYPE_PARTIAL_LINE;
673 } 769 }
674 770
771 /* Latency format */
772 if (trace_flags & TRACE_ITER_LATENCY_FMT) {
773 ret = print_graph_lat_fmt(s, ent);
774 if (ret == TRACE_TYPE_PARTIAL_LINE)
775 return TRACE_TYPE_PARTIAL_LINE;
776 }
777
675 return 0; 778 return 0;
676} 779}
677 780
@@ -843,9 +946,16 @@ print_graph_function(struct trace_iterator *iter)
843 946
844 switch (entry->type) { 947 switch (entry->type) {
845 case TRACE_GRAPH_ENT: { 948 case TRACE_GRAPH_ENT: {
846 struct ftrace_graph_ent_entry *field; 949 /*
950 * print_graph_entry() may consume the current event,
951 * thus @field may become invalid, so we need to save it.
952 * sizeof(struct ftrace_graph_ent_entry) is very small,
953 * so it can safely be saved on the stack.
954 */
955 struct ftrace_graph_ent_entry *field, saved;
847 trace_assign_type(field, entry); 956 trace_assign_type(field, entry);
848 return print_graph_entry(field, s, iter); 957 saved = *field;
958 return print_graph_entry(&saved, s, iter);
849 } 959 }
850 case TRACE_GRAPH_RET: { 960 case TRACE_GRAPH_RET: {
851 struct ftrace_graph_ret_entry *field; 961 struct ftrace_graph_ret_entry *field;
@@ -859,28 +969,59 @@ print_graph_function(struct trace_iterator *iter)
859 return TRACE_TYPE_HANDLED; 969 return TRACE_TYPE_HANDLED;
860} 970}
861 971
972static void print_lat_header(struct seq_file *s)
973{
974 static const char spaces[] = "                " /* 16 spaces */
975 "    " /* 4 spaces */
976 "                 "; /* 17 spaces */
977 int size = 0;
978
979 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
980 size += 16;
981 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
982 size += 4;
983 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
984 size += 17;
985
986 seq_printf(s, "#%.*s _-----=> irqs-off \n", size, spaces);
987 seq_printf(s, "#%.*s / _----=> need-resched \n", size, spaces);
988 seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces);
989 seq_printf(s, "#%.*s|| / _--=> preempt-depth \n", size, spaces);
990 seq_printf(s, "#%.*s||| / _-=> lock-depth \n", size, spaces);
991 seq_printf(s, "#%.*s|||| / \n", size, spaces);
992}
993
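print_lat_header() indents every header line with %.*s, printing at most `size` characters of the padded `spaces` string, so the latency columns line up under whichever of the absolute-time, CPU and proc columns are enabled. A standalone illustration of the precision argument (the widths here are made up):

#include <stdio.h>

int main(void)
{
    static const char spaces[] = "                ";  /* padding source, 16 spaces */
    int size = 0;

    size += 4;                        /* pretend only the CPU column is enabled */

    /* ".*" takes the maximum number of characters to print from the
     * argument list, so only the first 'size' bytes of 'spaces' appear. */
    printf("#%.*s _-----=> irqs-off\n", size, spaces);
    printf("#%.*s / _----=> need-resched\n", size, spaces);
    return 0;
}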
862static void print_graph_headers(struct seq_file *s) 994static void print_graph_headers(struct seq_file *s)
863{ 995{
996 int lat = trace_flags & TRACE_ITER_LATENCY_FMT;
997
998 if (lat)
999 print_lat_header(s);
1000
864 /* 1st line */ 1001 /* 1st line */
865 seq_printf(s, "# "); 1002 seq_printf(s, "#");
866 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) 1003 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
867 seq_printf(s, " TIME "); 1004 seq_printf(s, " TIME ");
868 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 1005 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
869 seq_printf(s, "CPU"); 1006 seq_printf(s, " CPU");
870 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 1007 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
871 seq_printf(s, " TASK/PID "); 1008 seq_printf(s, " TASK/PID ");
1009 if (lat)
1010 seq_printf(s, "|||||");
872 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) 1011 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
873 seq_printf(s, " DURATION "); 1012 seq_printf(s, " DURATION ");
874 seq_printf(s, " FUNCTION CALLS\n"); 1013 seq_printf(s, " FUNCTION CALLS\n");
875 1014
876 /* 2nd line */ 1015 /* 2nd line */
877 seq_printf(s, "# "); 1016 seq_printf(s, "#");
878 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) 1017 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
879 seq_printf(s, " | "); 1018 seq_printf(s, " | ");
880 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 1019 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
881 seq_printf(s, "| "); 1020 seq_printf(s, " | ");
882 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 1021 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
883 seq_printf(s, " | | "); 1022 seq_printf(s, " | | ");
1023 if (lat)
1024 seq_printf(s, "|||||");
884 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) 1025 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
885 seq_printf(s, " | | "); 1026 seq_printf(s, " | | ");
886 seq_printf(s, " | | | |\n"); 1027 seq_printf(s, " | | | |\n");
@@ -927,6 +1068,8 @@ static struct tracer graph_trace __read_mostly = {
927 1068
928static __init int init_graph_trace(void) 1069static __init int init_graph_trace(void)
929{ 1070{
1071 max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
1072
930 return register_tracer(&graph_trace); 1073 return register_tracer(&graph_trace);
931} 1074}
932 1075
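init_graph_trace() sizes the CPU column with snprintf(NULL, 0, "%d", nr_cpu_ids - 1): with a null buffer and zero length, C99 snprintf writes nothing but still returns how many characters the formatted output would need, i.e. the digit count of the largest possible CPU id. A quick standalone check of that behaviour (the CPU count is a stand-in for nr_cpu_ids):

#include <stdio.h>

int main(void)
{
    int nr_cpu_ids = 128;                           /* stand-in value */
    int max_bytes_for_cpu;

    /* A NULL buffer with size 0 only measures the would-be output. */
    max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);

    printf("width needed for CPU ids up to %d: %d\n",
           nr_cpu_ids - 1, max_bytes_for_cpu);      /* prints 3 */
    return 0;
}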
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index ca7d7c4d0c2a..23b63859130e 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -155,7 +155,7 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
155 seq_print_ip_sym(seq, it->from, symflags) && 155 seq_print_ip_sym(seq, it->from, symflags) &&
156 trace_seq_printf(seq, "\n")) 156 trace_seq_printf(seq, "\n"))
157 return TRACE_TYPE_HANDLED; 157 return TRACE_TYPE_HANDLED;
158 return TRACE_TYPE_PARTIAL_LINE;; 158 return TRACE_TYPE_PARTIAL_LINE;
159 } 159 }
160 return TRACE_TYPE_UNHANDLED; 160 return TRACE_TYPE_UNHANDLED;
161} 161}
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index b923d13e2fad..3aa7eaa2114c 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -129,15 +129,10 @@ check_critical_timing(struct trace_array *tr,
129 unsigned long parent_ip, 129 unsigned long parent_ip,
130 int cpu) 130 int cpu)
131{ 131{
132 unsigned long latency, t0, t1;
133 cycle_t T0, T1, delta; 132 cycle_t T0, T1, delta;
134 unsigned long flags; 133 unsigned long flags;
135 int pc; 134 int pc;
136 135
137 /*
138 * usecs conversion is slow so we try to delay the conversion
139 * as long as possible:
140 */
141 T0 = data->preempt_timestamp; 136 T0 = data->preempt_timestamp;
142 T1 = ftrace_now(cpu); 137 T1 = ftrace_now(cpu);
143 delta = T1-T0; 138 delta = T1-T0;
@@ -157,18 +152,15 @@ check_critical_timing(struct trace_array *tr,
157 152
158 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); 153 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
159 154
160 latency = nsecs_to_usecs(delta);
161
162 if (data->critical_sequence != max_sequence) 155 if (data->critical_sequence != max_sequence)
163 goto out_unlock; 156 goto out_unlock;
164 157
165 tracing_max_latency = delta;
166 t0 = nsecs_to_usecs(T0);
167 t1 = nsecs_to_usecs(T1);
168
169 data->critical_end = parent_ip; 158 data->critical_end = parent_ip;
170 159
171 update_max_tr_single(tr, current, cpu); 160 if (likely(!is_tracing_stopped())) {
161 tracing_max_latency = delta;
162 update_max_tr_single(tr, current, cpu);
163 }
172 164
173 max_sequence++; 165 max_sequence++;
174 166
@@ -178,7 +170,6 @@ out_unlock:
178out: 170out:
179 data->critical_sequence = max_sequence; 171 data->critical_sequence = max_sequence;
180 data->preempt_timestamp = ftrace_now(cpu); 172 data->preempt_timestamp = ftrace_now(cpu);
181 tracing_reset(tr, cpu);
182 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); 173 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
183} 174}
184 175
@@ -208,7 +199,6 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
208 data->critical_sequence = max_sequence; 199 data->critical_sequence = max_sequence;
209 data->preempt_timestamp = ftrace_now(cpu); 200 data->preempt_timestamp = ftrace_now(cpu);
210 data->critical_start = parent_ip ? : ip; 201 data->critical_start = parent_ip ? : ip;
211 tracing_reset(tr, cpu);
212 202
213 local_save_flags(flags); 203 local_save_flags(flags);
214 204
@@ -379,6 +369,7 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
379 irqsoff_trace = tr; 369 irqsoff_trace = tr;
380 /* make sure that the tracer is visible */ 370 /* make sure that the tracer is visible */
381 smp_wmb(); 371 smp_wmb();
372 tracing_reset_online_cpus(tr);
382 start_irqsoff_tracer(tr); 373 start_irqsoff_tracer(tr);
383} 374}
384 375
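The retained `data->critical_start = parent_ip ? : ip;` in start_critical_timing() uses GCC's conditional operator with the middle operand omitted: `a ?: b` yields `a` when it is non-zero and `b` otherwise, evaluating `a` only once. A two-line illustration (plain values instead of instruction pointers; compiles with gcc or clang):

#include <stdio.h>

int main(void)
{
    unsigned long parent_ip = 0, ip = 0xc0de;

    /* GNU extension: "a ?: b" is "a ? a : b" with a evaluated once. */
    unsigned long start = parent_ip ?: ip;

    printf("critical section starts at %#lx\n", start);   /* 0xc0de */
    return 0;
}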
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index d53b45ed0806..0acd834659ed 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -307,11 +307,13 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
307 struct trace_array_cpu *data, 307 struct trace_array_cpu *data,
308 struct mmiotrace_rw *rw) 308 struct mmiotrace_rw *rw)
309{ 309{
310 struct ftrace_event_call *call = &event_mmiotrace_rw;
311 struct ring_buffer *buffer = tr->buffer;
310 struct ring_buffer_event *event; 312 struct ring_buffer_event *event;
311 struct trace_mmiotrace_rw *entry; 313 struct trace_mmiotrace_rw *entry;
312 int pc = preempt_count(); 314 int pc = preempt_count();
313 315
314 event = trace_buffer_lock_reserve(tr, TRACE_MMIO_RW, 316 event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_RW,
315 sizeof(*entry), 0, pc); 317 sizeof(*entry), 0, pc);
316 if (!event) { 318 if (!event) {
317 atomic_inc(&dropped_count); 319 atomic_inc(&dropped_count);
@@ -319,7 +321,9 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
319 } 321 }
320 entry = ring_buffer_event_data(event); 322 entry = ring_buffer_event_data(event);
321 entry->rw = *rw; 323 entry->rw = *rw;
322 trace_buffer_unlock_commit(tr, event, 0, pc); 324
325 if (!filter_check_discard(call, entry, buffer, event))
326 trace_buffer_unlock_commit(buffer, event, 0, pc);
323} 327}
324 328
325void mmio_trace_rw(struct mmiotrace_rw *rw) 329void mmio_trace_rw(struct mmiotrace_rw *rw)
@@ -333,11 +337,13 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
333 struct trace_array_cpu *data, 337 struct trace_array_cpu *data,
334 struct mmiotrace_map *map) 338 struct mmiotrace_map *map)
335{ 339{
340 struct ftrace_event_call *call = &event_mmiotrace_map;
341 struct ring_buffer *buffer = tr->buffer;
336 struct ring_buffer_event *event; 342 struct ring_buffer_event *event;
337 struct trace_mmiotrace_map *entry; 343 struct trace_mmiotrace_map *entry;
338 int pc = preempt_count(); 344 int pc = preempt_count();
339 345
340 event = trace_buffer_lock_reserve(tr, TRACE_MMIO_MAP, 346 event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_MAP,
341 sizeof(*entry), 0, pc); 347 sizeof(*entry), 0, pc);
342 if (!event) { 348 if (!event) {
343 atomic_inc(&dropped_count); 349 atomic_inc(&dropped_count);
@@ -345,7 +351,9 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
345 } 351 }
346 entry = ring_buffer_event_data(event); 352 entry = ring_buffer_event_data(event);
347 entry->map = *map; 353 entry->map = *map;
348 trace_buffer_unlock_commit(tr, event, 0, pc); 354
355 if (!filter_check_discard(call, entry, buffer, event))
356 trace_buffer_unlock_commit(buffer, event, 0, pc);
349} 357}
350 358
351void mmio_trace_mapping(struct mmiotrace_map *map) 359void mmio_trace_mapping(struct mmiotrace_map *map)
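Both mmiotrace hunks converge on the sequence used throughout this series: reserve an event on the ring buffer, fill it in, then commit it only if the event filter does not discard it. A deliberately toy userspace analog of that control flow (every name below is made up; the kernel versions operate on per-CPU ring buffers and the event-filter core):

#include <stdbool.h>
#include <stdio.h>

struct event { int cpu; const char *msg; };

static struct event slot;                     /* one-slot "ring buffer" */
static bool committed;

static struct event *buffer_reserve(void)    { return &slot; }
static void buffer_commit(struct event *e)   { (void)e; committed = true; }
static bool filter_discard(struct event *e)  { return e->cpu != 0; }  /* keep CPU 0 only */

static void record(int cpu, const char *msg)
{
    struct event *e = buffer_reserve();       /* reserve */
    if (!e)
        return;                               /* buffer full: drop */
    e->cpu = cpu;                             /* fill */
    e->msg = msg;
    if (!filter_discard(e))                   /* filter, then commit */
        buffer_commit(e);
}

int main(void)
{
    record(1, "discarded by the filter");
    record(0, "kept");
    printf("committed=%d msg=%s\n", committed, slot.msg);
    return 0;
}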
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 7938f3ae93e3..f572f44c6e1e 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -27,8 +27,7 @@ void trace_print_seq(struct seq_file *m, struct trace_seq *s)
27{ 27{
28 int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; 28 int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
29 29
30 s->buffer[len] = 0; 30 seq_write(m, s->buffer, len);
31 seq_puts(m, s->buffer);
32 31
33 trace_seq_init(s); 32 trace_seq_init(s);
34} 33}
@@ -408,7 +407,7 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
408 * since individual threads might have already quit! 407 * since individual threads might have already quit!
409 */ 408 */
410 rcu_read_lock(); 409 rcu_read_lock();
411 task = find_task_by_vpid(entry->ent.tgid); 410 task = find_task_by_vpid(entry->tgid);
412 if (task) 411 if (task)
413 mm = get_task_mm(task); 412 mm = get_task_mm(task);
414 rcu_read_unlock(); 413 rcu_read_unlock();
@@ -461,18 +460,23 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
461 return ret; 460 return ret;
462} 461}
463 462
464static int 463/**
465lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) 464 * trace_print_lat_fmt - print the irq, preempt and lockdep fields
465 * @s: trace seq struct to write to
466 * @entry: The trace entry field from the ring buffer
467 *
468 * Prints the generic fields of irqs off, in hard or softirq, preempt
469 * count and lock depth.
470 */
471int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
466{ 472{
467 int hardirq, softirq; 473 int hardirq, softirq;
468 char comm[TASK_COMM_LEN]; 474 int ret;
469 475
470 trace_find_cmdline(entry->pid, comm);
471 hardirq = entry->flags & TRACE_FLAG_HARDIRQ; 476 hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
472 softirq = entry->flags & TRACE_FLAG_SOFTIRQ; 477 softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
473 478
474 if (!trace_seq_printf(s, "%8.8s-%-5d %3d%c%c%c", 479 if (!trace_seq_printf(s, "%c%c%c",
475 comm, entry->pid, cpu,
476 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : 480 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
477 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 481 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ?
478 'X' : '.', 482 'X' : '.',
@@ -482,9 +486,30 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
482 hardirq ? 'h' : softirq ? 's' : '.')) 486 hardirq ? 'h' : softirq ? 's' : '.'))
483 return 0; 487 return 0;
484 488
489 if (entry->lock_depth < 0)
490 ret = trace_seq_putc(s, '.');
491 else
492 ret = trace_seq_printf(s, "%d", entry->lock_depth);
493 if (!ret)
494 return 0;
495
485 if (entry->preempt_count) 496 if (entry->preempt_count)
486 return trace_seq_printf(s, "%x", entry->preempt_count); 497 return trace_seq_printf(s, "%x", entry->preempt_count);
487 return trace_seq_puts(s, "."); 498 return trace_seq_putc(s, '.');
499}
500
501static int
502lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
503{
504 char comm[TASK_COMM_LEN];
505
506 trace_find_cmdline(entry->pid, comm);
507
508 if (!trace_seq_printf(s, "%8.8s-%-5d %3d",
509 comm, entry->pid, cpu))
510 return 0;
511
512 return trace_print_lat_fmt(s, entry);
488} 513}
489 514
490static unsigned long preempt_mark_thresh = 100; 515static unsigned long preempt_mark_thresh = 100;
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index d38bec4a9c30..9d91c72ba38b 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -26,6 +26,8 @@ extern struct trace_event *ftrace_find_event(int type);
26 26
27extern enum print_line_t trace_nop_print(struct trace_iterator *iter, 27extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
28 int flags); 28 int flags);
29extern int
30trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
29 31
30/* used by module unregistering */ 32/* used by module unregistering */
31extern int __unregister_ftrace_event(struct trace_event *event); 33extern int __unregister_ftrace_event(struct trace_event *event);
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
deleted file mode 100644
index 8a30d9874cd4..000000000000
--- a/kernel/trace/trace_power.c
+++ /dev/null
@@ -1,214 +0,0 @@
1/*
2 * ring buffer based C-state tracer
3 *
4 * Arjan van de Ven <arjan@linux.intel.com>
5 * Copyright (C) 2008 Intel Corporation
6 *
7 * Much is borrowed from trace_boot.c which is
8 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
9 *
10 */
11
12#include <linux/init.h>
13#include <linux/debugfs.h>
14#include <trace/power.h>
15#include <linux/kallsyms.h>
16#include <linux/module.h>
17
18#include "trace.h"
19#include "trace_output.h"
20
21static struct trace_array *power_trace;
22static int __read_mostly trace_power_enabled;
23
24static void probe_power_start(struct power_trace *it, unsigned int type,
25 unsigned int level)
26{
27 if (!trace_power_enabled)
28 return;
29
30 memset(it, 0, sizeof(struct power_trace));
31 it->state = level;
32 it->type = type;
33 it->stamp = ktime_get();
34}
35
36
37static void probe_power_end(struct power_trace *it)
38{
39 struct ftrace_event_call *call = &event_power;
40 struct ring_buffer_event *event;
41 struct trace_power *entry;
42 struct trace_array_cpu *data;
43 struct trace_array *tr = power_trace;
44
45 if (!trace_power_enabled)
46 return;
47
48 preempt_disable();
49 it->end = ktime_get();
50 data = tr->data[smp_processor_id()];
51
52 event = trace_buffer_lock_reserve(tr, TRACE_POWER,
53 sizeof(*entry), 0, 0);
54 if (!event)
55 goto out;
56 entry = ring_buffer_event_data(event);
57 entry->state_data = *it;
58 if (!filter_check_discard(call, entry, tr->buffer, event))
59 trace_buffer_unlock_commit(tr, event, 0, 0);
60 out:
61 preempt_enable();
62}
63
64static void probe_power_mark(struct power_trace *it, unsigned int type,
65 unsigned int level)
66{
67 struct ftrace_event_call *call = &event_power;
68 struct ring_buffer_event *event;
69 struct trace_power *entry;
70 struct trace_array_cpu *data;
71 struct trace_array *tr = power_trace;
72
73 if (!trace_power_enabled)
74 return;
75
76 memset(it, 0, sizeof(struct power_trace));
77 it->state = level;
78 it->type = type;
79 it->stamp = ktime_get();
80 preempt_disable();
81 it->end = it->stamp;
82 data = tr->data[smp_processor_id()];
83
84 event = trace_buffer_lock_reserve(tr, TRACE_POWER,
85 sizeof(*entry), 0, 0);
86 if (!event)
87 goto out;
88 entry = ring_buffer_event_data(event);
89 entry->state_data = *it;
90 if (!filter_check_discard(call, entry, tr->buffer, event))
91 trace_buffer_unlock_commit(tr, event, 0, 0);
92 out:
93 preempt_enable();
94}
95
96static int tracing_power_register(void)
97{
98 int ret;
99
100 ret = register_trace_power_start(probe_power_start);
101 if (ret) {
102 pr_info("power trace: Couldn't activate tracepoint"
103 " probe to trace_power_start\n");
104 return ret;
105 }
106 ret = register_trace_power_end(probe_power_end);
107 if (ret) {
108 pr_info("power trace: Couldn't activate tracepoint"
109 " probe to trace_power_end\n");
110 goto fail_start;
111 }
112 ret = register_trace_power_mark(probe_power_mark);
113 if (ret) {
114 pr_info("power trace: Couldn't activate tracepoint"
115 " probe to trace_power_mark\n");
116 goto fail_end;
117 }
118 return ret;
119fail_end:
120 unregister_trace_power_end(probe_power_end);
121fail_start:
122 unregister_trace_power_start(probe_power_start);
123 return ret;
124}
125
126static void start_power_trace(struct trace_array *tr)
127{
128 trace_power_enabled = 1;
129}
130
131static void stop_power_trace(struct trace_array *tr)
132{
133 trace_power_enabled = 0;
134}
135
136static void power_trace_reset(struct trace_array *tr)
137{
138 trace_power_enabled = 0;
139 unregister_trace_power_start(probe_power_start);
140 unregister_trace_power_end(probe_power_end);
141 unregister_trace_power_mark(probe_power_mark);
142}
143
144
145static int power_trace_init(struct trace_array *tr)
146{
147 int cpu;
148 power_trace = tr;
149
150 trace_power_enabled = 1;
151 tracing_power_register();
152
153 for_each_cpu(cpu, cpu_possible_mask)
154 tracing_reset(tr, cpu);
155 return 0;
156}
157
158static enum print_line_t power_print_line(struct trace_iterator *iter)
159{
160 int ret = 0;
161 struct trace_entry *entry = iter->ent;
162 struct trace_power *field ;
163 struct power_trace *it;
164 struct trace_seq *s = &iter->seq;
165 struct timespec stamp;
166 struct timespec duration;
167
168 trace_assign_type(field, entry);
169 it = &field->state_data;
170 stamp = ktime_to_timespec(it->stamp);
171 duration = ktime_to_timespec(ktime_sub(it->end, it->stamp));
172
173 if (entry->type == TRACE_POWER) {
174 if (it->type == POWER_CSTATE)
175 ret = trace_seq_printf(s, "[%5ld.%09ld] CSTATE: Going to C%i on cpu %i for %ld.%09ld\n",
176 stamp.tv_sec,
177 stamp.tv_nsec,
178 it->state, iter->cpu,
179 duration.tv_sec,
180 duration.tv_nsec);
181 if (it->type == POWER_PSTATE)
182 ret = trace_seq_printf(s, "[%5ld.%09ld] PSTATE: Going to P%i on cpu %i\n",
183 stamp.tv_sec,
184 stamp.tv_nsec,
185 it->state, iter->cpu);
186 if (!ret)
187 return TRACE_TYPE_PARTIAL_LINE;
188 return TRACE_TYPE_HANDLED;
189 }
190 return TRACE_TYPE_UNHANDLED;
191}
192
193static void power_print_header(struct seq_file *s)
194{
195 seq_puts(s, "# TIMESTAMP STATE EVENT\n");
196 seq_puts(s, "# | | |\n");
197}
198
199static struct tracer power_tracer __read_mostly =
200{
201 .name = "power",
202 .init = power_trace_init,
203 .start = start_power_trace,
204 .stop = stop_power_trace,
205 .reset = power_trace_reset,
206 .print_line = power_print_line,
207 .print_header = power_print_header,
208};
209
210static int init_power_trace(void)
211{
212 return register_tracer(&power_tracer);
213}
214device_initcall(init_power_trace);
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 7b6278110827..2547d8813cf0 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -11,7 +11,6 @@
11#include <linux/ftrace.h> 11#include <linux/ftrace.h>
12#include <linux/string.h> 12#include <linux/string.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/marker.h>
15#include <linux/mutex.h> 14#include <linux/mutex.h>
16#include <linux/ctype.h> 15#include <linux/ctype.h>
17#include <linux/list.h> 16#include <linux/list.h>
@@ -176,7 +175,7 @@ static int t_show(struct seq_file *m, void *v)
176 const char *str = *fmt; 175 const char *str = *fmt;
177 int i; 176 int i;
178 177
179 seq_printf(m, "0x%lx : \"", (unsigned long)fmt); 178 seq_printf(m, "0x%lx : \"", *(unsigned long *)fmt);
180 179
181 /* 180 /*
182 * Tabs and new lines need to be converted. 181 * Tabs and new lines need to be converted.
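The one-character t_show() fix above matters because `fmt` walks an array of format-string pointers: `(unsigned long)fmt` printed the address of the array slot, while `*(unsigned long *)fmt` prints the address of the string that slot points to, the pointer that actually identifies the format, rather than the address of the slot itself. A small illustration of the difference (ordinary strings stand in for the trace_printk format section; pointers and unsigned long are assumed to be the same size, as in the kernel):

#include <stdio.h>

int main(void)
{
    static const char *formats[] = { "fmt one %d\n", "fmt two %s\n" };
    const char **fmt = &formats[1];

    /* Address of the slot in the array vs. address of the string it points to. */
    printf("slot address:   0x%lx\n", (unsigned long)fmt);
    printf("string address: 0x%lx : \"%s\"", *(unsigned long *)fmt, *fmt);
    return 0;
}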
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index a98106dd979c..5fca0f51fde4 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -20,6 +20,35 @@ static int sched_ref;
20static DEFINE_MUTEX(sched_register_mutex); 20static DEFINE_MUTEX(sched_register_mutex);
21static int sched_stopped; 21static int sched_stopped;
22 22
23
24void
25tracing_sched_switch_trace(struct trace_array *tr,
26 struct task_struct *prev,
27 struct task_struct *next,
28 unsigned long flags, int pc)
29{
30 struct ftrace_event_call *call = &event_context_switch;
31 struct ring_buffer *buffer = tr->buffer;
32 struct ring_buffer_event *event;
33 struct ctx_switch_entry *entry;
34
35 event = trace_buffer_lock_reserve(buffer, TRACE_CTX,
36 sizeof(*entry), flags, pc);
37 if (!event)
38 return;
39 entry = ring_buffer_event_data(event);
40 entry->prev_pid = prev->pid;
41 entry->prev_prio = prev->prio;
42 entry->prev_state = prev->state;
43 entry->next_pid = next->pid;
44 entry->next_prio = next->prio;
45 entry->next_state = next->state;
46 entry->next_cpu = task_cpu(next);
47
48 if (!filter_check_discard(call, entry, buffer, event))
49 trace_buffer_unlock_commit(buffer, event, flags, pc);
50}
51
23static void 52static void
24probe_sched_switch(struct rq *__rq, struct task_struct *prev, 53probe_sched_switch(struct rq *__rq, struct task_struct *prev,
25 struct task_struct *next) 54 struct task_struct *next)
@@ -49,6 +78,36 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,
49 local_irq_restore(flags); 78 local_irq_restore(flags);
50} 79}
51 80
81void
82tracing_sched_wakeup_trace(struct trace_array *tr,
83 struct task_struct *wakee,
84 struct task_struct *curr,
85 unsigned long flags, int pc)
86{
87 struct ftrace_event_call *call = &event_wakeup;
88 struct ring_buffer_event *event;
89 struct ctx_switch_entry *entry;
90 struct ring_buffer *buffer = tr->buffer;
91
92 event = trace_buffer_lock_reserve(buffer, TRACE_WAKE,
93 sizeof(*entry), flags, pc);
94 if (!event)
95 return;
96 entry = ring_buffer_event_data(event);
97 entry->prev_pid = curr->pid;
98 entry->prev_prio = curr->prio;
99 entry->prev_state = curr->state;
100 entry->next_pid = wakee->pid;
101 entry->next_prio = wakee->prio;
102 entry->next_state = wakee->state;
103 entry->next_cpu = task_cpu(wakee);
104
105 if (!filter_check_discard(call, entry, buffer, event))
106 ring_buffer_unlock_commit(buffer, event);
107 ftrace_trace_stack(tr->buffer, flags, 6, pc);
108 ftrace_trace_userstack(tr->buffer, flags, pc);
109}
110
52static void 111static void
53probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success) 112probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success)
54{ 113{
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index eacb27225173..26185d727676 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -24,6 +24,7 @@ static int __read_mostly tracer_enabled;
24 24
25static struct task_struct *wakeup_task; 25static struct task_struct *wakeup_task;
26static int wakeup_cpu; 26static int wakeup_cpu;
27static int wakeup_current_cpu;
27static unsigned wakeup_prio = -1; 28static unsigned wakeup_prio = -1;
28static int wakeup_rt; 29static int wakeup_rt;
29 30
@@ -56,33 +57,23 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
56 resched = ftrace_preempt_disable(); 57 resched = ftrace_preempt_disable();
57 58
58 cpu = raw_smp_processor_id(); 59 cpu = raw_smp_processor_id();
60 if (cpu != wakeup_current_cpu)
61 goto out_enable;
62
59 data = tr->data[cpu]; 63 data = tr->data[cpu];
60 disabled = atomic_inc_return(&data->disabled); 64 disabled = atomic_inc_return(&data->disabled);
61 if (unlikely(disabled != 1)) 65 if (unlikely(disabled != 1))
62 goto out; 66 goto out;
63 67
64 local_irq_save(flags); 68 local_irq_save(flags);
65 __raw_spin_lock(&wakeup_lock);
66
67 if (unlikely(!wakeup_task))
68 goto unlock;
69
70 /*
71 * The task can't disappear because it needs to
72 * wake up first, and we have the wakeup_lock.
73 */
74 if (task_cpu(wakeup_task) != cpu)
75 goto unlock;
76 69
77 trace_function(tr, ip, parent_ip, flags, pc); 70 trace_function(tr, ip, parent_ip, flags, pc);
78 71
79 unlock:
80 __raw_spin_unlock(&wakeup_lock);
81 local_irq_restore(flags); 72 local_irq_restore(flags);
82 73
83 out: 74 out:
84 atomic_dec(&data->disabled); 75 atomic_dec(&data->disabled);
85 76 out_enable:
86 ftrace_preempt_enable(resched); 77 ftrace_preempt_enable(resched);
87} 78}
88 79
@@ -107,11 +98,18 @@ static int report_latency(cycle_t delta)
107 return 1; 98 return 1;
108} 99}
109 100
101static void probe_wakeup_migrate_task(struct task_struct *task, int cpu)
102{
103 if (task != wakeup_task)
104 return;
105
106 wakeup_current_cpu = cpu;
107}
108
110static void notrace 109static void notrace
111probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, 110probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
112 struct task_struct *next) 111 struct task_struct *next)
113{ 112{
114 unsigned long latency = 0, t0 = 0, t1 = 0;
115 struct trace_array_cpu *data; 113 struct trace_array_cpu *data;
116 cycle_t T0, T1, delta; 114 cycle_t T0, T1, delta;
117 unsigned long flags; 115 unsigned long flags;
@@ -157,10 +155,6 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
157 trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); 155 trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
158 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); 156 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
159 157
160 /*
161 * usecs conversion is slow so we try to delay the conversion
162 * as long as possible:
163 */
164 T0 = data->preempt_timestamp; 158 T0 = data->preempt_timestamp;
165 T1 = ftrace_now(cpu); 159 T1 = ftrace_now(cpu);
166 delta = T1-T0; 160 delta = T1-T0;
@@ -168,13 +162,10 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
168 if (!report_latency(delta)) 162 if (!report_latency(delta))
169 goto out_unlock; 163 goto out_unlock;
170 164
171 latency = nsecs_to_usecs(delta); 165 if (likely(!is_tracing_stopped())) {
172 166 tracing_max_latency = delta;
173 tracing_max_latency = delta; 167 update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu);
174 t0 = nsecs_to_usecs(T0); 168 }
175 t1 = nsecs_to_usecs(T1);
176
177 update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu);
178 169
179out_unlock: 170out_unlock:
180 __wakeup_reset(wakeup_trace); 171 __wakeup_reset(wakeup_trace);
@@ -186,11 +177,6 @@ out:
186 177
187static void __wakeup_reset(struct trace_array *tr) 178static void __wakeup_reset(struct trace_array *tr)
188{ 179{
189 int cpu;
190
191 for_each_possible_cpu(cpu)
192 tracing_reset(tr, cpu);
193
194 wakeup_cpu = -1; 180 wakeup_cpu = -1;
195 wakeup_prio = -1; 181 wakeup_prio = -1;
196 182
@@ -204,6 +190,8 @@ static void wakeup_reset(struct trace_array *tr)
204{ 190{
205 unsigned long flags; 191 unsigned long flags;
206 192
193 tracing_reset_online_cpus(tr);
194
207 local_irq_save(flags); 195 local_irq_save(flags);
208 __raw_spin_lock(&wakeup_lock); 196 __raw_spin_lock(&wakeup_lock);
209 __wakeup_reset(tr); 197 __wakeup_reset(tr);
@@ -247,6 +235,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
247 __wakeup_reset(wakeup_trace); 235 __wakeup_reset(wakeup_trace);
248 236
249 wakeup_cpu = task_cpu(p); 237 wakeup_cpu = task_cpu(p);
238 wakeup_current_cpu = wakeup_cpu;
250 wakeup_prio = p->prio; 239 wakeup_prio = p->prio;
251 240
252 wakeup_task = p; 241 wakeup_task = p;
@@ -296,6 +285,13 @@ static void start_wakeup_tracer(struct trace_array *tr)
296 goto fail_deprobe_wake_new; 285 goto fail_deprobe_wake_new;
297 } 286 }
298 287
288 ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task);
289 if (ret) {
290 pr_info("wakeup trace: Couldn't activate tracepoint"
291 " probe to kernel_sched_migrate_task\n");
292 return;
293 }
294
299 wakeup_reset(tr); 295 wakeup_reset(tr);
300 296
301 /* 297 /*
@@ -328,6 +324,7 @@ static void stop_wakeup_tracer(struct trace_array *tr)
328 unregister_trace_sched_switch(probe_wakeup_sched_switch); 324 unregister_trace_sched_switch(probe_wakeup_sched_switch);
329 unregister_trace_sched_wakeup_new(probe_wakeup); 325 unregister_trace_sched_wakeup_new(probe_wakeup);
330 unregister_trace_sched_wakeup(probe_wakeup); 326 unregister_trace_sched_wakeup(probe_wakeup);
327 unregister_trace_sched_migrate_task(probe_wakeup_migrate_task);
331} 328}
332 329
333static int __wakeup_tracer_init(struct trace_array *tr) 330static int __wakeup_tracer_init(struct trace_array *tr)
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 00dd6485bdd7..d2cdbabb4ead 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -288,6 +288,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
288 * to detect and recover from possible hangs 288 * to detect and recover from possible hangs
289 */ 289 */
290 tracing_reset_online_cpus(tr); 290 tracing_reset_online_cpus(tr);
291 set_graph_array(tr);
291 ret = register_ftrace_graph(&trace_graph_return, 292 ret = register_ftrace_graph(&trace_graph_return,
292 &trace_graph_entry_watchdog); 293 &trace_graph_entry_watchdog);
293 if (ret) { 294 if (ret) {
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 2d7aebd71dbd..0f6facb050a1 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -186,43 +186,33 @@ static const struct file_operations stack_max_size_fops = {
186}; 186};
187 187
188static void * 188static void *
189t_next(struct seq_file *m, void *v, loff_t *pos) 189__next(struct seq_file *m, loff_t *pos)
190{ 190{
191 long i; 191 long n = *pos - 1;
192
193 (*pos)++;
194
195 if (v == SEQ_START_TOKEN)
196 i = 0;
197 else {
198 i = *(long *)v;
199 i++;
200 }
201 192
202 if (i >= max_stack_trace.nr_entries || 193 if (n >= max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX)
203 stack_dump_trace[i] == ULONG_MAX)
204 return NULL; 194 return NULL;
205 195
206 m->private = (void *)i; 196 m->private = (void *)n;
207
208 return &m->private; 197 return &m->private;
209} 198}
210 199
211static void *t_start(struct seq_file *m, loff_t *pos) 200static void *
201t_next(struct seq_file *m, void *v, loff_t *pos)
212{ 202{
213 void *t = SEQ_START_TOKEN; 203 (*pos)++;
214 loff_t l = 0; 204 return __next(m, pos);
205}
215 206
207static void *t_start(struct seq_file *m, loff_t *pos)
208{
216 local_irq_disable(); 209 local_irq_disable();
217 __raw_spin_lock(&max_stack_lock); 210 __raw_spin_lock(&max_stack_lock);
218 211
219 if (*pos == 0) 212 if (*pos == 0)
220 return SEQ_START_TOKEN; 213 return SEQ_START_TOKEN;
221 214
222 for (; t && l < *pos; t = t_next(m, t, &l)) 215 return __next(m, pos);
223 ;
224
225 return t;
226} 216}
227 217
228static void t_stop(struct seq_file *m, void *p) 218static void t_stop(struct seq_file *m, void *p)
@@ -234,15 +224,8 @@ static void t_stop(struct seq_file *m, void *p)
234static int trace_lookup_stack(struct seq_file *m, long i) 224static int trace_lookup_stack(struct seq_file *m, long i)
235{ 225{
236 unsigned long addr = stack_dump_trace[i]; 226 unsigned long addr = stack_dump_trace[i];
237#ifdef CONFIG_KALLSYMS
238 char str[KSYM_SYMBOL_LEN];
239 227
240 sprint_symbol(str, addr); 228 return seq_printf(m, "%pF\n", (void *)addr);
241
242 return seq_printf(m, "%s\n", str);
243#else
244 return seq_printf(m, "%p\n", (void*)addr);
245#endif
246} 229}
247 230
248static void print_disabled(struct seq_file *m) 231static void print_disabled(struct seq_file *m)
@@ -301,17 +284,14 @@ static const struct seq_operations stack_trace_seq_ops = {
301 284
302static int stack_trace_open(struct inode *inode, struct file *file) 285static int stack_trace_open(struct inode *inode, struct file *file)
303{ 286{
304 int ret; 287 return seq_open(file, &stack_trace_seq_ops);
305
306 ret = seq_open(file, &stack_trace_seq_ops);
307
308 return ret;
309} 288}
310 289
311static const struct file_operations stack_trace_fops = { 290static const struct file_operations stack_trace_fops = {
312 .open = stack_trace_open, 291 .open = stack_trace_open,
313 .read = seq_read, 292 .read = seq_read,
314 .llseek = seq_lseek, 293 .llseek = seq_lseek,
294 .release = seq_release,
315}; 295};
316 296
317int 297int
@@ -326,10 +306,10 @@ stack_trace_sysctl(struct ctl_table *table, int write,
326 ret = proc_dointvec(table, write, file, buffer, lenp, ppos); 306 ret = proc_dointvec(table, write, file, buffer, lenp, ppos);
327 307
328 if (ret || !write || 308 if (ret || !write ||
329 (last_stack_tracer_enabled == stack_tracer_enabled)) 309 (last_stack_tracer_enabled == !!stack_tracer_enabled))
330 goto out; 310 goto out;
331 311
332 last_stack_tracer_enabled = stack_tracer_enabled; 312 last_stack_tracer_enabled = !!stack_tracer_enabled;
333 313
334 if (stack_tracer_enabled) 314 if (stack_tracer_enabled)
335 register_ftrace_function(&trace_ops); 315 register_ftrace_function(&trace_ops);
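The stack_trace_sysctl() change stores and compares `!!stack_tracer_enabled` rather than the raw integer: double negation collapses any non-zero value written through the sysctl to exactly 1, so `last_stack_tracer_enabled` stays a clean 0/1 and writing, say, 2 after 1 is not treated as a state change. The idiom in isolation:

#include <stdio.h>

int main(void)
{
    int stack_tracer_enabled = 2;    /* whatever value a write to the sysctl left */

    /* "!!" maps 0 to 0 and every non-zero value to 1. */
    printf("raw=%d normalized=%d\n",
           stack_tracer_enabled, !!stack_tracer_enabled);
    return 0;
}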
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index e66f5e493342..a4bb239eb987 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -49,7 +49,8 @@ static struct dentry *stat_dir;
49 * but it will at least advance closer to the next one 49 * but it will at least advance closer to the next one
50 * to be released. 50 * to be released.
51 */ 51 */
52static struct rb_node *release_next(struct rb_node *node) 52static struct rb_node *release_next(struct tracer_stat *ts,
53 struct rb_node *node)
53{ 54{
54 struct stat_node *snode; 55 struct stat_node *snode;
55 struct rb_node *parent = rb_parent(node); 56 struct rb_node *parent = rb_parent(node);
@@ -67,26 +68,35 @@ static struct rb_node *release_next(struct rb_node *node)
67 parent->rb_right = NULL; 68 parent->rb_right = NULL;
68 69
69 snode = container_of(node, struct stat_node, node); 70 snode = container_of(node, struct stat_node, node);
71 if (ts->stat_release)
72 ts->stat_release(snode->stat);
70 kfree(snode); 73 kfree(snode);
71 74
72 return parent; 75 return parent;
73 } 76 }
74} 77}
75 78
76static void reset_stat_session(struct stat_session *session) 79static void __reset_stat_session(struct stat_session *session)
77{ 80{
78 struct rb_node *node = session->stat_root.rb_node; 81 struct rb_node *node = session->stat_root.rb_node;
79 82
80 while (node) 83 while (node)
81 node = release_next(node); 84 node = release_next(session->ts, node);
82 85
83 session->stat_root = RB_ROOT; 86 session->stat_root = RB_ROOT;
84} 87}
85 88
89static void reset_stat_session(struct stat_session *session)
90{
91 mutex_lock(&session->stat_mutex);
92 __reset_stat_session(session);
93 mutex_unlock(&session->stat_mutex);
94}
95
86static void destroy_session(struct stat_session *session) 96static void destroy_session(struct stat_session *session)
87{ 97{
88 debugfs_remove(session->file); 98 debugfs_remove(session->file);
89 reset_stat_session(session); 99 __reset_stat_session(session);
90 mutex_destroy(&session->stat_mutex); 100 mutex_destroy(&session->stat_mutex);
91 kfree(session); 101 kfree(session);
92} 102}
@@ -150,7 +160,7 @@ static int stat_seq_init(struct stat_session *session)
150 int i; 160 int i;
151 161
152 mutex_lock(&session->stat_mutex); 162 mutex_lock(&session->stat_mutex);
153 reset_stat_session(session); 163 __reset_stat_session(session);
154 164
155 if (!ts->stat_cmp) 165 if (!ts->stat_cmp)
156 ts->stat_cmp = dummy_cmp; 166 ts->stat_cmp = dummy_cmp;
@@ -183,7 +193,7 @@ exit:
183 return ret; 193 return ret;
184 194
185exit_free_rbtree: 195exit_free_rbtree:
186 reset_stat_session(session); 196 __reset_stat_session(session);
187 mutex_unlock(&session->stat_mutex); 197 mutex_unlock(&session->stat_mutex);
188 return ret; 198 return ret;
189} 199}
@@ -193,17 +203,21 @@ static void *stat_seq_start(struct seq_file *s, loff_t *pos)
193{ 203{
194 struct stat_session *session = s->private; 204 struct stat_session *session = s->private;
195 struct rb_node *node; 205 struct rb_node *node;
206 int n = *pos;
196 int i; 207 int i;
197 208
198 /* Prevent from tracer switch or rbtree modification */ 209 /* Prevent from tracer switch or rbtree modification */
199 mutex_lock(&session->stat_mutex); 210 mutex_lock(&session->stat_mutex);
200 211
201 /* If we are in the beginning of the file, print the headers */ 212 /* If we are in the beginning of the file, print the headers */
202 if (!*pos && session->ts->stat_headers) 213 if (session->ts->stat_headers) {
203 return SEQ_START_TOKEN; 214 if (n == 0)
215 return SEQ_START_TOKEN;
216 n--;
217 }
204 218
205 node = rb_first(&session->stat_root); 219 node = rb_first(&session->stat_root);
206 for (i = 0; node && i < *pos; i++) 220 for (i = 0; node && i < n; i++)
207 node = rb_next(node); 221 node = rb_next(node);
208 222
209 return node; 223 return node;
@@ -250,16 +264,21 @@ static const struct seq_operations trace_stat_seq_ops = {
250static int tracing_stat_open(struct inode *inode, struct file *file) 264static int tracing_stat_open(struct inode *inode, struct file *file)
251{ 265{
252 int ret; 266 int ret;
253 267 struct seq_file *m;
254 struct stat_session *session = inode->i_private; 268 struct stat_session *session = inode->i_private;
255 269
270 ret = stat_seq_init(session);
271 if (ret)
272 return ret;
273
256 ret = seq_open(file, &trace_stat_seq_ops); 274 ret = seq_open(file, &trace_stat_seq_ops);
257 if (!ret) { 275 if (ret) {
258 struct seq_file *m = file->private_data; 276 reset_stat_session(session);
259 m->private = session; 277 return ret;
260 ret = stat_seq_init(session);
261 } 278 }
262 279
280 m = file->private_data;
281 m->private = session;
263 return ret; 282 return ret;
264} 283}
265 284
@@ -270,11 +289,9 @@ static int tracing_stat_release(struct inode *i, struct file *f)
270{ 289{
271 struct stat_session *session = i->i_private; 290 struct stat_session *session = i->i_private;
272 291
273 mutex_lock(&session->stat_mutex);
274 reset_stat_session(session); 292 reset_stat_session(session);
275 mutex_unlock(&session->stat_mutex);
276 293
277 return 0; 294 return seq_release(i, f);
278} 295}
279 296
280static const struct file_operations tracing_stat_fops = { 297static const struct file_operations tracing_stat_fops = {
diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h
index f3546a2cd826..8f03914b9a6a 100644
--- a/kernel/trace/trace_stat.h
+++ b/kernel/trace/trace_stat.h
@@ -18,6 +18,8 @@ struct tracer_stat {
18 int (*stat_cmp)(void *p1, void *p2); 18 int (*stat_cmp)(void *p1, void *p2);
19 /* Print a stat entry */ 19 /* Print a stat entry */
20 int (*stat_show)(struct seq_file *s, void *p); 20 int (*stat_show)(struct seq_file *s, void *p);
21 /* Release an entry */
22 void (*stat_release)(void *stat);
21 /* Print the headers of your stat entries */ 23 /* Print the headers of your stat entries */
22 int (*stat_headers)(struct seq_file *s); 24 int (*stat_headers)(struct seq_file *s);
23}; 25};
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 5e579645ac86..9fbce6c9d2e1 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -1,30 +1,18 @@
1#include <trace/syscall.h> 1#include <trace/syscall.h>
2#include <trace/events/syscalls.h>
2#include <linux/kernel.h> 3#include <linux/kernel.h>
4#include <linux/ftrace.h>
5#include <linux/perf_event.h>
3#include <asm/syscall.h> 6#include <asm/syscall.h>
4 7
5#include "trace_output.h" 8#include "trace_output.h"
6#include "trace.h" 9#include "trace.h"
7 10
8/* Keep a counter of the syscall tracing users */
9static int refcount;
10
11/* Prevent from races on thread flags toggling */
12static DEFINE_MUTEX(syscall_trace_lock); 11static DEFINE_MUTEX(syscall_trace_lock);
13 12static int sys_refcount_enter;
14/* Option to display the parameters types */ 13static int sys_refcount_exit;
15enum { 14static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
16 TRACE_SYSCALLS_OPT_TYPES = 0x1, 15static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
17};
18
19static struct tracer_opt syscalls_opts[] = {
20 { TRACER_OPT(syscall_arg_type, TRACE_SYSCALLS_OPT_TYPES) },
21 { }
22};
23
24static struct tracer_flags syscalls_flags = {
25 .val = 0, /* By default: no parameters types */
26 .opts = syscalls_opts
27};
28 16
29enum print_line_t 17enum print_line_t
30print_syscall_enter(struct trace_iterator *iter, int flags) 18print_syscall_enter(struct trace_iterator *iter, int flags)
@@ -35,35 +23,46 @@ print_syscall_enter(struct trace_iterator *iter, int flags)
35 struct syscall_metadata *entry; 23 struct syscall_metadata *entry;
36 int i, ret, syscall; 24 int i, ret, syscall;
37 25
38 trace_assign_type(trace, ent); 26 trace = (typeof(trace))ent;
39
40 syscall = trace->nr; 27 syscall = trace->nr;
41
42 entry = syscall_nr_to_meta(syscall); 28 entry = syscall_nr_to_meta(syscall);
29
43 if (!entry) 30 if (!entry)
44 goto end; 31 goto end;
45 32
33 if (entry->enter_id != ent->type) {
34 WARN_ON_ONCE(1);
35 goto end;
36 }
37
46 ret = trace_seq_printf(s, "%s(", entry->name); 38 ret = trace_seq_printf(s, "%s(", entry->name);
47 if (!ret) 39 if (!ret)
48 return TRACE_TYPE_PARTIAL_LINE; 40 return TRACE_TYPE_PARTIAL_LINE;
49 41
50 for (i = 0; i < entry->nb_args; i++) { 42 for (i = 0; i < entry->nb_args; i++) {
51 /* parameter types */ 43 /* parameter types */
52 if (syscalls_flags.val & TRACE_SYSCALLS_OPT_TYPES) { 44 if (trace_flags & TRACE_ITER_VERBOSE) {
53 ret = trace_seq_printf(s, "%s ", entry->types[i]); 45 ret = trace_seq_printf(s, "%s ", entry->types[i]);
54 if (!ret) 46 if (!ret)
55 return TRACE_TYPE_PARTIAL_LINE; 47 return TRACE_TYPE_PARTIAL_LINE;
56 } 48 }
57 /* parameter values */ 49 /* parameter values */
58 ret = trace_seq_printf(s, "%s: %lx%s ", entry->args[i], 50 ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i],
59 trace->args[i], 51 trace->args[i],
60 i == entry->nb_args - 1 ? ")" : ","); 52 i == entry->nb_args - 1 ? "" : ", ");
61 if (!ret) 53 if (!ret)
62 return TRACE_TYPE_PARTIAL_LINE; 54 return TRACE_TYPE_PARTIAL_LINE;
63 } 55 }
64 56
57 ret = trace_seq_putc(s, ')');
58 if (!ret)
59 return TRACE_TYPE_PARTIAL_LINE;
60
65end: 61end:
66 trace_seq_printf(s, "\n"); 62 ret = trace_seq_putc(s, '\n');
63 if (!ret)
64 return TRACE_TYPE_PARTIAL_LINE;
65
67 return TRACE_TYPE_HANDLED; 66 return TRACE_TYPE_HANDLED;
68} 67}
69 68
@@ -77,16 +76,20 @@ print_syscall_exit(struct trace_iterator *iter, int flags)
77 struct syscall_metadata *entry; 76 struct syscall_metadata *entry;
78 int ret; 77 int ret;
79 78
80 trace_assign_type(trace, ent); 79 trace = (typeof(trace))ent;
81
82 syscall = trace->nr; 80 syscall = trace->nr;
83
84 entry = syscall_nr_to_meta(syscall); 81 entry = syscall_nr_to_meta(syscall);
82
85 if (!entry) { 83 if (!entry) {
86 trace_seq_printf(s, "\n"); 84 trace_seq_printf(s, "\n");
87 return TRACE_TYPE_HANDLED; 85 return TRACE_TYPE_HANDLED;
88 } 86 }
89 87
88 if (entry->exit_id != ent->type) {
89 WARN_ON_ONCE(1);
90 return TRACE_TYPE_UNHANDLED;
91 }
92
90 ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name, 93 ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
91 trace->ret); 94 trace->ret);
92 if (!ret) 95 if (!ret)
@@ -95,62 +98,140 @@ print_syscall_exit(struct trace_iterator *iter, int flags)
95 return TRACE_TYPE_HANDLED; 98 return TRACE_TYPE_HANDLED;
96} 99}
97 100
98void start_ftrace_syscalls(void) 101extern char *__bad_type_size(void);
102
103#define SYSCALL_FIELD(type, name) \
104 sizeof(type) != sizeof(trace.name) ? \
105 __bad_type_size() : \
106 #type, #name, offsetof(typeof(trace), name), sizeof(trace.name)
107
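SYSCALL_FIELD pairs each field's type/name strings with its offset and size, and leans on the declared-but-never-defined __bad_type_size() as a build-time check: if sizeof(type) matches the traced field, the constant conditional folds away and only the string/offset/size arguments remain; if it does not, the surviving reference to the undefined function makes the link fail instead of silently emitting a wrong format. The same trick in a self-contained form (the struct, macro and undefined checker below are illustrative, not the kernel's; build with optimization so the dead branch is discarded, as the kernel does):

#include <stdio.h>
#include <stddef.h>

struct rec { int nr; unsigned long ret; };

/* Declared only: any surviving reference breaks the link on purpose. */
extern int field_size_mismatch(void);

#define FIELD_SIZE(type, name) \
    (sizeof(type) != sizeof(((struct rec *)0)->name) ? \
        (size_t)field_size_mismatch() : sizeof(((struct rec *)0)->name))

int main(void)
{
    /* Matching type: the mismatch branch folds away and this links fine. */
    printf("ret: size %zu at offset %zu\n",
           FIELD_SIZE(unsigned long, ret), offsetof(struct rec, ret));

    /* FIELD_SIZE(int, ret) would leave an unresolved reference to
     * field_size_mismatch() and fail at link time. */
    return 0;
}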
108int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s)
99{ 109{
100 unsigned long flags; 110 int i;
101 struct task_struct *g, *t; 111 int nr;
112 int ret;
113 struct syscall_metadata *entry;
114 struct syscall_trace_enter trace;
115 int offset = offsetof(struct syscall_trace_enter, args);
102 116
103 mutex_lock(&syscall_trace_lock); 117 nr = syscall_name_to_nr(call->data);
118 entry = syscall_nr_to_meta(nr);
104 119
105 /* Don't enable the flag on the tasks twice */ 120 if (!entry)
106 if (++refcount != 1) 121 return 0;
107 goto unlock;
108 122
109 arch_init_ftrace_syscalls(); 123 ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n",
110 read_lock_irqsave(&tasklist_lock, flags); 124 SYSCALL_FIELD(int, nr));
125 if (!ret)
126 return 0;
111 127
112 do_each_thread(g, t) { 128 for (i = 0; i < entry->nb_args; i++) {
113 set_tsk_thread_flag(t, TIF_SYSCALL_FTRACE); 129 ret = trace_seq_printf(s, "\tfield:%s %s;", entry->types[i],
114 } while_each_thread(g, t); 130 entry->args[i]);
131 if (!ret)
132 return 0;
133 ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;\n", offset,
134 sizeof(unsigned long));
135 if (!ret)
136 return 0;
137 offset += sizeof(unsigned long);
138 }
115 139
116 read_unlock_irqrestore(&tasklist_lock, flags); 140 trace_seq_puts(s, "\nprint fmt: \"");
141 for (i = 0; i < entry->nb_args; i++) {
142 ret = trace_seq_printf(s, "%s: 0x%%0%zulx%s", entry->args[i],
143 sizeof(unsigned long),
144 i == entry->nb_args - 1 ? "" : ", ");
145 if (!ret)
146 return 0;
147 }
148 trace_seq_putc(s, '"');
117 149
118unlock: 150 for (i = 0; i < entry->nb_args; i++) {
119 mutex_unlock(&syscall_trace_lock); 151 ret = trace_seq_printf(s, ", ((unsigned long)(REC->%s))",
152 entry->args[i]);
153 if (!ret)
154 return 0;
155 }
156
157 return trace_seq_putc(s, '\n');
120} 158}
121 159
122void stop_ftrace_syscalls(void) 160int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s)
123{ 161{
124 unsigned long flags; 162 int ret;
125 struct task_struct *g, *t; 163 struct syscall_trace_exit trace;
126 164
127 mutex_lock(&syscall_trace_lock); 165 ret = trace_seq_printf(s,
166 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
167 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n",
168 SYSCALL_FIELD(int, nr),
169 SYSCALL_FIELD(unsigned long, ret));
170 if (!ret)
171 return 0;
128 172
129 /* There are perhaps still some users */ 173 return trace_seq_printf(s, "\nprint fmt: \"0x%%lx\", REC->ret\n");
130 if (--refcount) 174}
131 goto unlock;
132 175
133 read_lock_irqsave(&tasklist_lock, flags); 176int syscall_enter_define_fields(struct ftrace_event_call *call)
177{
178 struct syscall_trace_enter trace;
179 struct syscall_metadata *meta;
180 int ret;
181 int nr;
182 int i;
183 int offset = offsetof(typeof(trace), args);
184
185 nr = syscall_name_to_nr(call->data);
186 meta = syscall_nr_to_meta(nr);
187
188 if (!meta)
189 return 0;
190
191 ret = trace_define_common_fields(call);
192 if (ret)
193 return ret;
194
195 for (i = 0; i < meta->nb_args; i++) {
196 ret = trace_define_field(call, meta->types[i],
197 meta->args[i], offset,
198 sizeof(unsigned long), 0,
199 FILTER_OTHER);
200 offset += sizeof(unsigned long);
201 }
134 202
135 do_each_thread(g, t) { 203 return ret;
136 clear_tsk_thread_flag(t, TIF_SYSCALL_FTRACE); 204}
137 } while_each_thread(g, t);
138 205
139 read_unlock_irqrestore(&tasklist_lock, flags); 206int syscall_exit_define_fields(struct ftrace_event_call *call)
207{
208 struct syscall_trace_exit trace;
209 int ret;
140 210
141unlock: 211 ret = trace_define_common_fields(call);
142 mutex_unlock(&syscall_trace_lock); 212 if (ret)
213 return ret;
214
215 ret = trace_define_field(call, SYSCALL_FIELD(unsigned long, ret), 0,
216 FILTER_OTHER);
217
218 return ret;
143} 219}
144 220
145void ftrace_syscall_enter(struct pt_regs *regs) 221void ftrace_syscall_enter(struct pt_regs *regs, long id)
146{ 222{
147 struct syscall_trace_enter *entry; 223 struct syscall_trace_enter *entry;
148 struct syscall_metadata *sys_data; 224 struct syscall_metadata *sys_data;
149 struct ring_buffer_event *event; 225 struct ring_buffer_event *event;
226 struct ring_buffer *buffer;
150 int size; 227 int size;
151 int syscall_nr; 228 int syscall_nr;
152 229
153 syscall_nr = syscall_get_nr(current, regs); 230 syscall_nr = syscall_get_nr(current, regs);
231 if (syscall_nr < 0)
232 return;
233 if (!test_bit(syscall_nr, enabled_enter_syscalls))
234 return;
154 235
155 sys_data = syscall_nr_to_meta(syscall_nr); 236 sys_data = syscall_nr_to_meta(syscall_nr);
156 if (!sys_data) 237 if (!sys_data)
@@ -158,8 +239,8 @@ void ftrace_syscall_enter(struct pt_regs *regs)
158 239
159 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; 240 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
160 241
161 event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_ENTER, size, 242 event = trace_current_buffer_lock_reserve(&buffer, sys_data->enter_id,
162 0, 0); 243 size, 0, 0);
163 if (!event) 244 if (!event)
164 return; 245 return;
165 246
@@ -167,24 +248,30 @@ void ftrace_syscall_enter(struct pt_regs *regs)
167 entry->nr = syscall_nr; 248 entry->nr = syscall_nr;
168 syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); 249 syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);
169 250
170 trace_current_buffer_unlock_commit(event, 0, 0); 251 if (!filter_current_check_discard(buffer, sys_data->enter_event,
171 trace_wake_up(); 252 entry, event))
253 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
172} 254}
173 255
174void ftrace_syscall_exit(struct pt_regs *regs) 256void ftrace_syscall_exit(struct pt_regs *regs, long ret)
175{ 257{
176 struct syscall_trace_exit *entry; 258 struct syscall_trace_exit *entry;
177 struct syscall_metadata *sys_data; 259 struct syscall_metadata *sys_data;
178 struct ring_buffer_event *event; 260 struct ring_buffer_event *event;
261 struct ring_buffer *buffer;
179 int syscall_nr; 262 int syscall_nr;
180 263
181 syscall_nr = syscall_get_nr(current, regs); 264 syscall_nr = syscall_get_nr(current, regs);
265 if (syscall_nr < 0)
266 return;
267 if (!test_bit(syscall_nr, enabled_exit_syscalls))
268 return;
182 269
183 sys_data = syscall_nr_to_meta(syscall_nr); 270 sys_data = syscall_nr_to_meta(syscall_nr);
184 if (!sys_data) 271 if (!sys_data)
185 return; 272 return;
186 273
187 event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_EXIT, 274 event = trace_current_buffer_lock_reserve(&buffer, sys_data->exit_id,
188 sizeof(*entry), 0, 0); 275 sizeof(*entry), 0, 0);
189 if (!event) 276 if (!event)
190 return; 277 return;
@@ -193,58 +280,303 @@ void ftrace_syscall_exit(struct pt_regs *regs)
193 entry->nr = syscall_nr; 280 entry->nr = syscall_nr;
194 entry->ret = syscall_get_return_value(current, regs); 281 entry->ret = syscall_get_return_value(current, regs);
195 282
196 trace_current_buffer_unlock_commit(event, 0, 0); 283 if (!filter_current_check_discard(buffer, sys_data->exit_event,
197 trace_wake_up(); 284 entry, event))
285 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
198} 286}
199 287
200static int init_syscall_tracer(struct trace_array *tr) 288int reg_event_syscall_enter(void *ptr)
201{ 289{
202 start_ftrace_syscalls(); 290 int ret = 0;
291 int num;
292 char *name;
293
294 name = (char *)ptr;
295 num = syscall_name_to_nr(name);
296 if (num < 0 || num >= NR_syscalls)
297 return -ENOSYS;
298 mutex_lock(&syscall_trace_lock);
299 if (!sys_refcount_enter)
300 ret = register_trace_sys_enter(ftrace_syscall_enter);
301 if (ret) {
302 pr_info("event trace: Could not activate"
303 "syscall entry trace point");
304 } else {
305 set_bit(num, enabled_enter_syscalls);
306 sys_refcount_enter++;
307 }
308 mutex_unlock(&syscall_trace_lock);
309 return ret;
310}
311
312void unreg_event_syscall_enter(void *ptr)
313{
314 int num;
315 char *name;
203 316
204 return 0; 317 name = (char *)ptr;
318 num = syscall_name_to_nr(name);
319 if (num < 0 || num >= NR_syscalls)
320 return;
321 mutex_lock(&syscall_trace_lock);
322 sys_refcount_enter--;
323 clear_bit(num, enabled_enter_syscalls);
324 if (!sys_refcount_enter)
325 unregister_trace_sys_enter(ftrace_syscall_enter);
326 mutex_unlock(&syscall_trace_lock);
205} 327}
206 328
207static void reset_syscall_tracer(struct trace_array *tr) 329int reg_event_syscall_exit(void *ptr)
208{ 330{
209 stop_ftrace_syscalls(); 331 int ret = 0;
210 tracing_reset_online_cpus(tr); 332 int num;
333 char *name;
334
335 name = (char *)ptr;
336 num = syscall_name_to_nr(name);
337 if (num < 0 || num >= NR_syscalls)
338 return -ENOSYS;
339 mutex_lock(&syscall_trace_lock);
340 if (!sys_refcount_exit)
341 ret = register_trace_sys_exit(ftrace_syscall_exit);
342 if (ret) {
343 pr_info("event trace: Could not activate"
344 "syscall exit trace point");
345 } else {
346 set_bit(num, enabled_exit_syscalls);
347 sys_refcount_exit++;
348 }
349 mutex_unlock(&syscall_trace_lock);
350 return ret;
211} 351}
212 352
213static struct trace_event syscall_enter_event = { 353void unreg_event_syscall_exit(void *ptr)
214 .type = TRACE_SYSCALL_ENTER, 354{
215 .trace = print_syscall_enter, 355 int num;
216}; 356 char *name;
357
358 name = (char *)ptr;
359 num = syscall_name_to_nr(name);
360 if (num < 0 || num >= NR_syscalls)
361 return;
362 mutex_lock(&syscall_trace_lock);
363 sys_refcount_exit--;
364 clear_bit(num, enabled_exit_syscalls);
365 if (!sys_refcount_exit)
366 unregister_trace_sys_exit(ftrace_syscall_exit);
367 mutex_unlock(&syscall_trace_lock);
368}
217 369
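The four registration helpers above share one probe per direction: the per-syscall bitmaps record which syscalls are enabled, while sys_refcount_enter/sys_refcount_exit register the sys_enter/sys_exit tracepoint on the first enable and unregister it on the last disable. A hedged usage sketch (return values ignored for brevity; the function name is invented and the syscall names are illustrative, they must match the sys_* names in the syscall metadata):

	static void __maybe_unused syscall_refcount_demo(void)
	{
		reg_event_syscall_enter("sys_read");    /* refcount 0 -> 1: probe registered    */
		reg_event_syscall_enter("sys_write");   /* refcount 1 -> 2: only the bit is set */
		unreg_event_syscall_enter("sys_read");  /* refcount 2 -> 1: probe stays         */
		unreg_event_syscall_enter("sys_write"); /* refcount 1 -> 0: probe unregistered  */
	}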
218static struct trace_event syscall_exit_event = { 370struct trace_event event_syscall_enter = {
219 .type = TRACE_SYSCALL_EXIT, 371 .trace = print_syscall_enter,
220 .trace = print_syscall_exit,
221}; 372};
222 373
223static struct tracer syscall_tracer __read_mostly = { 374struct trace_event event_syscall_exit = {
224 .name = "syscall", 375 .trace = print_syscall_exit,
225 .init = init_syscall_tracer,
226 .reset = reset_syscall_tracer,
227 .flags = &syscalls_flags,
228}; 376};
229 377
230__init int register_ftrace_syscalls(void) 378#ifdef CONFIG_EVENT_PROFILE
379
380static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls);
381static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls);
382static int sys_prof_refcount_enter;
383static int sys_prof_refcount_exit;
384
385static void prof_syscall_enter(struct pt_regs *regs, long id)
231{ 386{
232 int ret; 387 struct syscall_metadata *sys_data;
388 struct syscall_trace_enter *rec;
389 unsigned long flags;
390 char *raw_data;
391 int syscall_nr;
392 int size;
393 int cpu;
233 394
234 ret = register_ftrace_event(&syscall_enter_event); 395 syscall_nr = syscall_get_nr(current, regs);
235 if (!ret) { 396 if (!test_bit(syscall_nr, enabled_prof_enter_syscalls))
236 printk(KERN_WARNING "event %d failed to register\n", 397 return;
237 syscall_enter_event.type); 398
238 WARN_ON_ONCE(1); 399 sys_data = syscall_nr_to_meta(syscall_nr);
400 if (!sys_data)
401 return;
402
403 /* get the size after alignment with the u32 buffer size field */
404 size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
405 size = ALIGN(size + sizeof(u32), sizeof(u64));
406 size -= sizeof(u32);
407
408 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
409 "profile buffer not large enough"))
410 return;
411
412 /* Protect the per cpu buffer, begin the rcu read side */
413 local_irq_save(flags);
414
415 cpu = smp_processor_id();
416
417 if (in_nmi())
418 raw_data = rcu_dereference(trace_profile_buf_nmi);
419 else
420 raw_data = rcu_dereference(trace_profile_buf);
421
422 if (!raw_data)
423 goto end;
424
425 raw_data = per_cpu_ptr(raw_data, cpu);
426
427 /* zero the dead bytes from align to not leak stack to user */
428 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
429
430 rec = (struct syscall_trace_enter *) raw_data;
431 tracing_generic_entry_update(&rec->ent, 0, 0);
432 rec->ent.type = sys_data->enter_id;
433 rec->nr = syscall_nr;
434 syscall_get_arguments(current, regs, 0, sys_data->nb_args,
435 (unsigned long *)&rec->args);
436 perf_tp_event(sys_data->enter_id, 0, 1, rec, size);
437
438end:
439 local_irq_restore(flags);
440}
441
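The size arithmetic in prof_syscall_enter() pads the raw record so that, together with the u32 size header perf prepends, the data ends on a u64 boundary, then reports the size without that header again; the u64 store above zeroes the padding so no stale buffer contents leak to userspace. A hedged helper showing the same computation in isolation (the function name is invented; ALIGN() is the kernel macro used above):

	static int prof_payload_size(int raw_size)
	{
		/* pad so that the u32 header plus the record end on a u64 boundary */
		int size = ALIGN(raw_size + sizeof(u32), sizeof(u64));

		return size - sizeof(u32);	/* e.g. 28 stays 28, 30 becomes 36 */
	}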
442int reg_prof_syscall_enter(char *name)
443{
444 int ret = 0;
445 int num;
446
447 num = syscall_name_to_nr(name);
448 if (num < 0 || num >= NR_syscalls)
449 return -ENOSYS;
450
451 mutex_lock(&syscall_trace_lock);
452 if (!sys_prof_refcount_enter)
453 ret = register_trace_sys_enter(prof_syscall_enter);
454 if (ret) {
455 pr_info("event trace: Could not activate"
456 "syscall entry trace point");
457 } else {
458 set_bit(num, enabled_prof_enter_syscalls);
459 sys_prof_refcount_enter++;
239 } 460 }
461 mutex_unlock(&syscall_trace_lock);
462 return ret;
463}
240 464
241 ret = register_ftrace_event(&syscall_exit_event); 465void unreg_prof_syscall_enter(char *name)
242 if (!ret) { 466{
243 printk(KERN_WARNING "event %d failed to register\n", 467 int num;
244 syscall_exit_event.type); 468
245 WARN_ON_ONCE(1); 469 num = syscall_name_to_nr(name);
470 if (num < 0 || num >= NR_syscalls)
471 return;
472
473 mutex_lock(&syscall_trace_lock);
474 sys_prof_refcount_enter--;
475 clear_bit(num, enabled_prof_enter_syscalls);
476 if (!sys_prof_refcount_enter)
477 unregister_trace_sys_enter(prof_syscall_enter);
478 mutex_unlock(&syscall_trace_lock);
479}
480
481static void prof_syscall_exit(struct pt_regs *regs, long ret)
482{
483 struct syscall_metadata *sys_data;
484 struct syscall_trace_exit *rec;
485 unsigned long flags;
486 int syscall_nr;
487 char *raw_data;
488 int size;
489 int cpu;
490
491 syscall_nr = syscall_get_nr(current, regs);
492 if (!test_bit(syscall_nr, enabled_prof_exit_syscalls))
493 return;
494
495 sys_data = syscall_nr_to_meta(syscall_nr);
496 if (!sys_data)
497 return;
498
499 /* We can probably do that at build time */
500 size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
501 size -= sizeof(u32);
502
503 /*
504 * Impossible, but be paranoid with the future
505 * How to put this check outside runtime?
506 */
507 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
508 "exit event has grown above profile buffer size"))
509 return;
510
511 /* Protect the per cpu buffer, begin the rcu read side */
512 local_irq_save(flags);
513 cpu = smp_processor_id();
514
515 if (in_nmi())
516 raw_data = rcu_dereference(trace_profile_buf_nmi);
517 else
518 raw_data = rcu_dereference(trace_profile_buf);
519
520 if (!raw_data)
521 goto end;
522
523 raw_data = per_cpu_ptr(raw_data, cpu);
524
525 /* zero the dead bytes from align to not leak stack to user */
526 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
527
528 rec = (struct syscall_trace_exit *)raw_data;
529
530 tracing_generic_entry_update(&rec->ent, 0, 0);
531 rec->ent.type = sys_data->exit_id;
532 rec->nr = syscall_nr;
533 rec->ret = syscall_get_return_value(current, regs);
534
535 perf_tp_event(sys_data->exit_id, 0, 1, rec, size);
536
537end:
538 local_irq_restore(flags);
539}
540
541int reg_prof_syscall_exit(char *name)
542{
543 int ret = 0;
544 int num;
545
546 num = syscall_name_to_nr(name);
547 if (num < 0 || num >= NR_syscalls)
548 return -ENOSYS;
549
550 mutex_lock(&syscall_trace_lock);
551 if (!sys_prof_refcount_exit)
552 ret = register_trace_sys_exit(prof_syscall_exit);
553 if (ret) {
554 pr_info("event trace: Could not activate"
555 "syscall entry trace point");
556 } else {
557 set_bit(num, enabled_prof_exit_syscalls);
558 sys_prof_refcount_exit++;
246 } 559 }
560 mutex_unlock(&syscall_trace_lock);
561 return ret;
562}
563
564void unreg_prof_syscall_exit(char *name)
565{
566 int num;
567
568 num = syscall_name_to_nr(name);
569 if (num < 0 || num >= NR_syscalls)
570 return;
247 571
248 return register_tracer(&syscall_tracer); 572 mutex_lock(&syscall_trace_lock);
573 sys_prof_refcount_exit--;
574 clear_bit(num, enabled_prof_exit_syscalls);
575 if (!sys_prof_refcount_exit)
576 unregister_trace_sys_exit(prof_syscall_exit);
577 mutex_unlock(&syscall_trace_lock);
249} 578}
250device_initcall(register_ftrace_syscalls); 579
580#endif
581
582
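Both prof_ handlers above repeat the same buffer-selection dance: with interrupts disabled, pick the NMI or the normal per-cpu scratch buffer under rcu_dereference() and index it by the current CPU. A hedged sketch of that step pulled into one helper (the helper name is invented; trace_profile_buf/trace_profile_buf_nmi and the irq/RCU rules are the ones used above, and the caller must already have interrupts disabled):

	static char *get_prof_scratch(void)
	{
		char *buf;

		/* NMIs get their own buffer so they cannot corrupt the
		 * record of a writer they interrupted */
		if (in_nmi())
			buf = rcu_dereference(trace_profile_buf_nmi);
		else
			buf = rcu_dereference(trace_profile_buf);

		if (!buf)
			return NULL;

		return per_cpu_ptr(buf, smp_processor_id());
	}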
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index 97fcea4acce1..40cafb07dffd 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -9,6 +9,7 @@
9#include <trace/events/workqueue.h> 9#include <trace/events/workqueue.h>
10#include <linux/list.h> 10#include <linux/list.h>
11#include <linux/percpu.h> 11#include <linux/percpu.h>
12#include <linux/kref.h>
12#include "trace_stat.h" 13#include "trace_stat.h"
13#include "trace.h" 14#include "trace.h"
14 15
@@ -16,6 +17,7 @@
16/* A cpu workqueue thread */ 17/* A cpu workqueue thread */
17struct cpu_workqueue_stats { 18struct cpu_workqueue_stats {
18 struct list_head list; 19 struct list_head list;
20 struct kref kref;
19 int cpu; 21 int cpu;
20 pid_t pid; 22 pid_t pid;
21/* Can be inserted from interrupt or user context, need to be atomic */ 23/* Can be inserted from interrupt or user context, need to be atomic */
@@ -39,6 +41,11 @@ struct workqueue_global_stats {
39static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat); 41static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat);
40#define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu)) 42#define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu))
41 43
44static void cpu_workqueue_stat_free(struct kref *kref)
45{
46 kfree(container_of(kref, struct cpu_workqueue_stats, kref));
47}
48
42/* Insertion of a work */ 49/* Insertion of a work */
43static void 50static void
44probe_workqueue_insertion(struct task_struct *wq_thread, 51probe_workqueue_insertion(struct task_struct *wq_thread,
@@ -96,8 +103,8 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
96 return; 103 return;
97 } 104 }
98 INIT_LIST_HEAD(&cws->list); 105 INIT_LIST_HEAD(&cws->list);
106 kref_init(&cws->kref);
99 cws->cpu = cpu; 107 cws->cpu = cpu;
100
101 cws->pid = wq_thread->pid; 108 cws->pid = wq_thread->pid;
102 109
103 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); 110 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
@@ -118,7 +125,7 @@ static void probe_workqueue_destruction(struct task_struct *wq_thread)
118 list) { 125 list) {
119 if (node->pid == wq_thread->pid) { 126 if (node->pid == wq_thread->pid) {
120 list_del(&node->list); 127 list_del(&node->list);
121 kfree(node); 128 kref_put(&node->kref, cpu_workqueue_stat_free);
122 goto found; 129 goto found;
123 } 130 }
124 } 131 }
@@ -137,9 +144,11 @@ static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu)
137 144
138 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); 145 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
139 146
140 if (!list_empty(&workqueue_cpu_stat(cpu)->list)) 147 if (!list_empty(&workqueue_cpu_stat(cpu)->list)) {
141 ret = list_entry(workqueue_cpu_stat(cpu)->list.next, 148 ret = list_entry(workqueue_cpu_stat(cpu)->list.next,
142 struct cpu_workqueue_stats, list); 149 struct cpu_workqueue_stats, list);
150 kref_get(&ret->kref);
151 }
143 152
144 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); 153 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
145 154
@@ -162,9 +171,9 @@ static void *workqueue_stat_start(struct tracer_stat *trace)
162static void *workqueue_stat_next(void *prev, int idx) 171static void *workqueue_stat_next(void *prev, int idx)
163{ 172{
164 struct cpu_workqueue_stats *prev_cws = prev; 173 struct cpu_workqueue_stats *prev_cws = prev;
174 struct cpu_workqueue_stats *ret;
165 int cpu = prev_cws->cpu; 175 int cpu = prev_cws->cpu;
166 unsigned long flags; 176 unsigned long flags;
167 void *ret = NULL;
168 177
169 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); 178 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
170 if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) { 179 if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) {
@@ -175,11 +184,14 @@ static void *workqueue_stat_next(void *prev, int idx)
175 return NULL; 184 return NULL;
176 } while (!(ret = workqueue_stat_start_cpu(cpu))); 185 } while (!(ret = workqueue_stat_start_cpu(cpu)));
177 return ret; 186 return ret;
187 } else {
188 ret = list_entry(prev_cws->list.next,
189 struct cpu_workqueue_stats, list);
190 kref_get(&ret->kref);
178 } 191 }
179 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); 192 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
180 193
181 return list_entry(prev_cws->list.next, struct cpu_workqueue_stats, 194 return ret;
182 list);
183} 195}
184 196
185static int workqueue_stat_show(struct seq_file *s, void *p) 197static int workqueue_stat_show(struct seq_file *s, void *p)
@@ -203,6 +215,13 @@ static int workqueue_stat_show(struct seq_file *s, void *p)
203 return 0; 215 return 0;
204} 216}
205 217
218static void workqueue_stat_release(void *stat)
219{
220 struct cpu_workqueue_stats *node = stat;
221
222 kref_put(&node->kref, cpu_workqueue_stat_free);
223}
224
206static int workqueue_stat_headers(struct seq_file *s) 225static int workqueue_stat_headers(struct seq_file *s)
207{ 226{
208 seq_printf(s, "# CPU INSERTED EXECUTED NAME\n"); 227 seq_printf(s, "# CPU INSERTED EXECUTED NAME\n");
@@ -215,6 +234,7 @@ struct tracer_stat workqueue_stats __read_mostly = {
215 .stat_start = workqueue_stat_start, 234 .stat_start = workqueue_stat_start,
216 .stat_next = workqueue_stat_next, 235 .stat_next = workqueue_stat_next,
217 .stat_show = workqueue_stat_show, 236 .stat_show = workqueue_stat_show,
237 .stat_release = workqueue_stat_release,
218 .stat_headers = workqueue_stat_headers 238 .stat_headers = workqueue_stat_headers
219}; 239};
220 240
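The kref added to cpu_workqueue_stats exists because a stat iterator can still be holding a node after probe_workqueue_destruction() has unlinked it from the list; the node may only be freed once both the list and the reader have dropped their references. A hedged, stand-alone sketch of that lifetime rule (everything except the kref_*, container_of and allocation calls is invented):

	#include <linux/kref.h>
	#include <linux/slab.h>

	struct my_node {
		struct kref	kref;
		int		payload;
	};

	static void my_node_free(struct kref *kref)
	{
		kfree(container_of(kref, struct my_node, kref));
	}

	static struct my_node *my_node_create(void)
	{
		struct my_node *n = kzalloc(sizeof(*n), GFP_KERNEL);

		if (n)
			kref_init(&n->kref);		/* reference owned by the list */
		return n;
	}

	static void my_node_read(struct my_node *n)
	{
		kref_get(&n->kref);			/* pin while the reader uses it */
		/* ... read n->payload ... */
		kref_put(&n->kref, my_node_free);	/* frees it if already unlinked */
	}

	static void my_node_unlink(struct my_node *n)
	{
		/* list_del(&n->list) would go here in the real code */
		kref_put(&n->kref, my_node_free);	/* drop the list's reference */
	}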
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 1ef5d3a601c7..cc89be5bc0f8 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -24,6 +24,7 @@
24#include <linux/tracepoint.h> 24#include <linux/tracepoint.h>
25#include <linux/err.h> 25#include <linux/err.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/sched.h>
27 28
28extern struct tracepoint __start___tracepoints[]; 29extern struct tracepoint __start___tracepoints[];
29extern struct tracepoint __stop___tracepoints[]; 30extern struct tracepoint __stop___tracepoints[];
@@ -47,7 +48,7 @@ static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE];
47 48
48/* 49/*
49 * Note about RCU : 50 * Note about RCU :
50 * It is used to to delay the free of multiple probes array until a quiescent 51 * It is used to delay the free of multiple probes array until a quiescent
51 * state is reached. 52 * state is reached.
52 * Tracepoint entries modifications are protected by the tracepoints_mutex. 53 * Tracepoint entries modifications are protected by the tracepoints_mutex.
53 */ 54 */
@@ -242,6 +243,11 @@ static void set_tracepoint(struct tracepoint_entry **entry,
242{ 243{
243 WARN_ON(strcmp((*entry)->name, elem->name) != 0); 244 WARN_ON(strcmp((*entry)->name, elem->name) != 0);
244 245
246 if (elem->regfunc && !elem->state && active)
247 elem->regfunc();
248 else if (elem->unregfunc && elem->state && !active)
249 elem->unregfunc();
250
245 /* 251 /*
246 * rcu_assign_pointer has a smp_wmb() which makes sure that the new 252 * rcu_assign_pointer has a smp_wmb() which makes sure that the new
247 * probe callbacks array is consistent before setting a pointer to it. 253 * probe callbacks array is consistent before setting a pointer to it.
@@ -261,6 +267,9 @@ static void set_tracepoint(struct tracepoint_entry **entry,
261 */ 267 */
262static void disable_tracepoint(struct tracepoint *elem) 268static void disable_tracepoint(struct tracepoint *elem)
263{ 269{
270 if (elem->unregfunc && elem->state)
271 elem->unregfunc();
272
264 elem->state = 0; 273 elem->state = 0;
265 rcu_assign_pointer(elem->funcs, NULL); 274 rcu_assign_pointer(elem->funcs, NULL);
266} 275}
@@ -554,9 +563,6 @@ int tracepoint_module_notify(struct notifier_block *self,
554 563
555 switch (val) { 564 switch (val) {
556 case MODULE_STATE_COMING: 565 case MODULE_STATE_COMING:
557 tracepoint_update_probe_range(mod->tracepoints,
558 mod->tracepoints + mod->num_tracepoints);
559 break;
560 case MODULE_STATE_GOING: 566 case MODULE_STATE_GOING:
561 tracepoint_update_probe_range(mod->tracepoints, 567 tracepoint_update_probe_range(mod->tracepoints,
562 mod->tracepoints + mod->num_tracepoints); 568 mod->tracepoints + mod->num_tracepoints);
@@ -577,3 +583,41 @@ static int init_tracepoints(void)
577__initcall(init_tracepoints); 583__initcall(init_tracepoints);
578 584
579#endif /* CONFIG_MODULES */ 585#endif /* CONFIG_MODULES */
586
587#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
588
589/* NB: reg/unreg are called while guarded with the tracepoints_mutex */
590static int sys_tracepoint_refcount;
591
592void syscall_regfunc(void)
593{
594 unsigned long flags;
595 struct task_struct *g, *t;
596
597 if (!sys_tracepoint_refcount) {
598 read_lock_irqsave(&tasklist_lock, flags);
599 do_each_thread(g, t) {
600 /* Skip kernel threads. */
601 if (t->mm)
602 set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
603 } while_each_thread(g, t);
604 read_unlock_irqrestore(&tasklist_lock, flags);
605 }
606 sys_tracepoint_refcount++;
607}
608
609void syscall_unregfunc(void)
610{
611 unsigned long flags;
612 struct task_struct *g, *t;
613
614 sys_tracepoint_refcount--;
615 if (!sys_tracepoint_refcount) {
616 read_lock_irqsave(&tasklist_lock, flags);
617 do_each_thread(g, t) {
618 clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
619 } while_each_thread(g, t);
620 read_unlock_irqrestore(&tasklist_lock, flags);
621 }
622}
623#endif
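syscall_regfunc()/syscall_unregfunc() only set and clear TIF_SYSCALL_TRACEPOINT on user tasks; it is the architecture's syscall slow path that checks the flag and fires the tracepoints. A hedged sketch of such a consumer (the hook names and the way the syscall id and return value reach them are invented; test_thread_flag() and the sys_enter/sys_exit tracepoints are the ones guarded above):

	static void syscall_trace_enter_hook(struct pt_regs *regs, long syscall_id)
	{
		if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
			trace_sys_enter(regs, syscall_id);
	}

	static void syscall_trace_exit_hook(struct pt_regs *regs, long return_value)
	{
		if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
			trace_sys_exit(regs, return_value);
	}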
diff --git a/kernel/wait.c b/kernel/wait.c
index ea7c3b4275cf..c4bd3d825f35 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -10,13 +10,14 @@
10#include <linux/wait.h> 10#include <linux/wait.h>
11#include <linux/hash.h> 11#include <linux/hash.h>
12 12
13void init_waitqueue_head(wait_queue_head_t *q) 13void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *key)
14{ 14{
15 spin_lock_init(&q->lock); 15 spin_lock_init(&q->lock);
16 lockdep_set_class(&q->lock, key);
16 INIT_LIST_HEAD(&q->task_list); 17 INIT_LIST_HEAD(&q->task_list);
17} 18}
18 19
19EXPORT_SYMBOL(init_waitqueue_head); 20EXPORT_SYMBOL(__init_waitqueue_head);
20 21
21void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) 22void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
22{ 23{
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0668795d8818..addfe2df93b1 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -317,8 +317,6 @@ static int worker_thread(void *__cwq)
317 if (cwq->wq->freezeable) 317 if (cwq->wq->freezeable)
318 set_freezable(); 318 set_freezable();
319 319
320 set_user_nice(current, -5);
321
322 for (;;) { 320 for (;;) {
323 prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); 321 prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE);
324 if (!freezing(current) && 322 if (!freezing(current) &&
@@ -600,7 +598,12 @@ static struct workqueue_struct *keventd_wq __read_mostly;
600 * schedule_work - put work task in global workqueue 598 * schedule_work - put work task in global workqueue
601 * @work: job to be done 599 * @work: job to be done
602 * 600 *
603 * This puts a job in the kernel-global workqueue. 601 * Returns zero if @work was already on the kernel-global workqueue and
602 * non-zero otherwise.
603 *
604 * This puts a job in the kernel-global workqueue if it was not already
605 * queued and leaves it in the same position on the kernel-global
606 * workqueue otherwise.
604 */ 607 */
605int schedule_work(struct work_struct *work) 608int schedule_work(struct work_struct *work)
606{ 609{
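The new kerneldoc spells out the return convention: schedule_work() returns non-zero when it actually queued the work and zero when the item was already pending, in which case its position in the queue is left untouched. A hedged usage sketch (my_work, my_work_fn and kick_my_work are invented names):

	static void my_work_fn(struct work_struct *work)
	{
		/* ... deferred processing ... */
	}

	static DECLARE_WORK(my_work, my_work_fn);

	static void kick_my_work(void)
	{
		if (!schedule_work(&my_work))
			pr_debug("my_work was already pending, not requeued\n");
	}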