aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorThomas Gleixner <tglx@linutronix.de>2009-05-20 03:02:28 -0400
committerThomas Gleixner <tglx@linutronix.de>2009-05-20 03:02:28 -0400
commit521c180874dae86f675d23c4eade4dba8b1f2cc8 (patch)
tree7509303da3a9a1b40a26f6811f321c89cd31737b /kernel
parentf1a11e0576c7a73d759d05d776692b2b2d37172b (diff)
parent64d1304a64477629cb16b75491a77bafe6f86963 (diff)
Merge branch 'core/urgent' into core/futexes
Merge reason: this branch was on an pre -rc1 base, merge it up to -rc6+ to get the latest upstream fixes. Conflicts: kernel/futex.c Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/audit.c9
-rw-r--r--kernel/audit_tree.c5
-rw-r--r--kernel/auditfilter.c20
-rw-r--r--kernel/auditsc.c33
-rw-r--r--kernel/cgroup.c3
-rw-r--r--kernel/exit.c5
-rw-r--r--kernel/extable.c25
-rw-r--r--kernel/fork.c21
-rw-r--r--kernel/futex.c34
-rw-r--r--kernel/hrtimer.c55
-rw-r--r--kernel/hung_task.c217
-rw-r--r--kernel/irq/devres.c16
-rw-r--r--kernel/irq/handle.c58
-rw-r--r--kernel/irq/manage.c194
-rw-r--r--kernel/irq/numa_migrate.c1
-rw-r--r--kernel/kallsyms.c19
-rw-r--r--kernel/kgdb.c4
-rw-r--r--kernel/kmod.c10
-rw-r--r--kernel/kprobes.c313
-rw-r--r--kernel/kthread.c26
-rw-r--r--kernel/lockdep.c44
-rw-r--r--kernel/lockdep_internals.h4
-rw-r--r--kernel/module.c274
-rw-r--r--kernel/mutex.c3
-rw-r--r--kernel/panic.c52
-rw-r--r--kernel/params.c26
-rw-r--r--kernel/posix-cpu-timers.c17
-rw-r--r--kernel/power/disk.c49
-rw-r--r--kernel/power/main.c24
-rw-r--r--kernel/power/swap.c2
-rw-r--r--kernel/power/user.c9
-rw-r--r--kernel/printk.c7
-rw-r--r--kernel/ptrace.c27
-rw-r--r--kernel/rcuclassic.c23
-rw-r--r--kernel/rcupdate.c18
-rw-r--r--kernel/rcupreempt.c48
-rw-r--r--kernel/rcutree.c39
-rw-r--r--kernel/rcutree.h10
-rw-r--r--kernel/rcutree_trace.c16
-rw-r--r--kernel/relay.c4
-rw-r--r--kernel/resource.c46
-rw-r--r--kernel/sched.c229
-rw-r--r--kernel/sched_clock.c15
-rw-r--r--kernel/sched_cpupri.c5
-rw-r--r--kernel/sched_rt.c15
-rw-r--r--kernel/slow-work.c4
-rw-r--r--kernel/softirq.c34
-rw-r--r--kernel/softlockup.c100
-rw-r--r--kernel/sys.c24
-rw-r--r--kernel/sysctl.c44
-rw-r--r--kernel/time/clocksource.c8
-rw-r--r--kernel/time/jiffies.c2
-rw-r--r--kernel/time/tick-common.c12
-rw-r--r--kernel/time/timekeeping.c12
-rw-r--r--kernel/timer.c7
-rw-r--r--kernel/trace/Kconfig125
-rw-r--r--kernel/trace/Makefile13
-rw-r--r--kernel/trace/blktrace.c1550
-rw-r--r--kernel/trace/events.c14
-rw-r--r--kernel/trace/ftrace.c1131
-rw-r--r--kernel/trace/kmemtrace.c464
-rw-r--r--kernel/trace/ring_buffer.c693
-rw-r--r--kernel/trace/trace.c3062
-rw-r--r--kernel/trace/trace.h321
-rw-r--r--kernel/trace/trace_boot.c36
-rw-r--r--kernel/trace/trace_branch.c286
-rw-r--r--kernel/trace/trace_clock.c109
-rw-r--r--kernel/trace/trace_event_profile.c31
-rw-r--r--kernel/trace/trace_event_types.h173
-rw-r--r--kernel/trace/trace_events.c828
-rw-r--r--kernel/trace/trace_events_filter.c433
-rw-r--r--kernel/trace/trace_events_stage_1.h39
-rw-r--r--kernel/trace/trace_events_stage_2.h176
-rw-r--r--kernel/trace/trace_events_stage_3.h281
-rw-r--r--kernel/trace/trace_export.c102
-rw-r--r--kernel/trace/trace_functions.c369
-rw-r--r--kernel/trace/trace_functions_graph.c570
-rw-r--r--kernel/trace/trace_hw_branches.c185
-rw-r--r--kernel/trace/trace_irqsoff.c54
-rw-r--r--kernel/trace/trace_mmiotrace.c45
-rw-r--r--kernel/trace/trace_nop.c6
-rw-r--r--kernel/trace/trace_output.c1017
-rw-r--r--kernel/trace/trace_output.h71
-rw-r--r--kernel/trace/trace_power.c201
-rw-r--r--kernel/trace/trace_printk.c270
-rw-r--r--kernel/trace/trace_sched_switch.c27
-rw-r--r--kernel/trace/trace_sched_wakeup.c102
-rw-r--r--kernel/trace/trace_selftest.c169
-rw-r--r--kernel/trace/trace_stack.c19
-rw-r--r--kernel/trace/trace_stat.c326
-rw-r--r--kernel/trace/trace_stat.h31
-rw-r--r--kernel/trace/trace_syscalls.c250
-rw-r--r--kernel/trace/trace_sysprof.c23
-rw-r--r--kernel/trace/trace_workqueue.c288
-rw-r--r--kernel/tracepoint.c7
-rw-r--r--kernel/workqueue.c52
97 files changed, 13061 insertions, 3210 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index bab1dffe37e9..42423665660a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -74,6 +74,7 @@ obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
74obj-$(CONFIG_KPROBES) += kprobes.o 74obj-$(CONFIG_KPROBES) += kprobes.o
75obj-$(CONFIG_KGDB) += kgdb.o 75obj-$(CONFIG_KGDB) += kgdb.o
76obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o 76obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
77obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
77obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ 78obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
78obj-$(CONFIG_SECCOMP) += seccomp.o 79obj-$(CONFIG_SECCOMP) += seccomp.o
79obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 80obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
diff --git a/kernel/audit.c b/kernel/audit.c
index ce6d8ea3131e..9442c3533ba9 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -766,6 +766,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
766 766
767 audit_log_format(ab, " msg="); 767 audit_log_format(ab, " msg=");
768 size = nlmsg_len(nlh); 768 size = nlmsg_len(nlh);
769 if (size > 0 &&
770 ((unsigned char *)data)[size - 1] == '\0')
771 size--;
769 audit_log_n_untrustedstring(ab, data, size); 772 audit_log_n_untrustedstring(ab, data, size);
770 } 773 }
771 audit_set_pid(ab, pid); 774 audit_set_pid(ab, pid);
@@ -1382,7 +1385,7 @@ void audit_log_n_string(struct audit_buffer *ab, const char *string,
1382int audit_string_contains_control(const char *string, size_t len) 1385int audit_string_contains_control(const char *string, size_t len)
1383{ 1386{
1384 const unsigned char *p; 1387 const unsigned char *p;
1385 for (p = string; p < (const unsigned char *)string + len && *p; p++) { 1388 for (p = string; p < (const unsigned char *)string + len; p++) {
1386 if (*p == '"' || *p < 0x21 || *p > 0x7e) 1389 if (*p == '"' || *p < 0x21 || *p > 0x7e)
1387 return 1; 1390 return 1;
1388 } 1391 }
@@ -1437,13 +1440,13 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
1437 /* We will allow 11 spaces for ' (deleted)' to be appended */ 1440 /* We will allow 11 spaces for ' (deleted)' to be appended */
1438 pathname = kmalloc(PATH_MAX+11, ab->gfp_mask); 1441 pathname = kmalloc(PATH_MAX+11, ab->gfp_mask);
1439 if (!pathname) { 1442 if (!pathname) {
1440 audit_log_format(ab, "<no memory>"); 1443 audit_log_string(ab, "<no_memory>");
1441 return; 1444 return;
1442 } 1445 }
1443 p = d_path(path, pathname, PATH_MAX+11); 1446 p = d_path(path, pathname, PATH_MAX+11);
1444 if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */ 1447 if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */
1445 /* FIXME: can we save some information here? */ 1448 /* FIXME: can we save some information here? */
1446 audit_log_format(ab, "<too long>"); 1449 audit_log_string(ab, "<too_long>");
1447 } else 1450 } else
1448 audit_log_untrustedstring(ab, p); 1451 audit_log_untrustedstring(ab, p);
1449 kfree(pathname); 1452 kfree(pathname);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 8ad9545b8db9..6e7351739a82 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -385,6 +385,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
385 mutex_lock(&inode->inotify_mutex); 385 mutex_lock(&inode->inotify_mutex);
386 if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) { 386 if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) {
387 mutex_unlock(&inode->inotify_mutex); 387 mutex_unlock(&inode->inotify_mutex);
388 put_inotify_watch(&old->watch);
388 free_chunk(chunk); 389 free_chunk(chunk);
389 return -ENOSPC; 390 return -ENOSPC;
390 } 391 }
@@ -394,6 +395,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
394 chunk->dead = 1; 395 chunk->dead = 1;
395 inotify_evict_watch(&chunk->watch); 396 inotify_evict_watch(&chunk->watch);
396 mutex_unlock(&inode->inotify_mutex); 397 mutex_unlock(&inode->inotify_mutex);
398 put_inotify_watch(&old->watch);
397 put_inotify_watch(&chunk->watch); 399 put_inotify_watch(&chunk->watch);
398 return 0; 400 return 0;
399 } 401 }
@@ -732,9 +734,6 @@ int audit_tag_tree(char *old, char *new)
732 dentry = dget(path.dentry); 734 dentry = dget(path.dentry);
733 path_put(&path); 735 path_put(&path);
734 736
735 if (dentry == tagged->mnt_root && dentry == mnt->mnt_root)
736 follow_up(&mnt, &dentry);
737
738 list_add_tail(&list, &tagged->mnt_list); 737 list_add_tail(&list, &tagged->mnt_list);
739 738
740 mutex_lock(&audit_filter_mutex); 739 mutex_lock(&audit_filter_mutex);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index fbf24d121d97..713098ee5a02 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -135,18 +135,18 @@ static void audit_remove_watch(struct audit_watch *watch)
135static inline void audit_free_rule(struct audit_entry *e) 135static inline void audit_free_rule(struct audit_entry *e)
136{ 136{
137 int i; 137 int i;
138 138 struct audit_krule *erule = &e->rule;
139 /* some rules don't have associated watches */ 139 /* some rules don't have associated watches */
140 if (e->rule.watch) 140 if (erule->watch)
141 audit_put_watch(e->rule.watch); 141 audit_put_watch(erule->watch);
142 if (e->rule.fields) 142 if (erule->fields)
143 for (i = 0; i < e->rule.field_count; i++) { 143 for (i = 0; i < erule->field_count; i++) {
144 struct audit_field *f = &e->rule.fields[i]; 144 struct audit_field *f = &erule->fields[i];
145 kfree(f->lsm_str); 145 kfree(f->lsm_str);
146 security_audit_rule_free(f->lsm_rule); 146 security_audit_rule_free(f->lsm_rule);
147 } 147 }
148 kfree(e->rule.fields); 148 kfree(erule->fields);
149 kfree(e->rule.filterkey); 149 kfree(erule->filterkey);
150 kfree(e); 150 kfree(e);
151} 151}
152 152
@@ -1028,7 +1028,7 @@ static void audit_update_watch(struct audit_parent *parent,
1028 1028
1029 if (audit_enabled) { 1029 if (audit_enabled) {
1030 struct audit_buffer *ab; 1030 struct audit_buffer *ab;
1031 ab = audit_log_start(NULL, GFP_KERNEL, 1031 ab = audit_log_start(NULL, GFP_NOFS,
1032 AUDIT_CONFIG_CHANGE); 1032 AUDIT_CONFIG_CHANGE);
1033 audit_log_format(ab, "auid=%u ses=%u", 1033 audit_log_format(ab, "auid=%u ses=%u",
1034 audit_get_loginuid(current), 1034 audit_get_loginuid(current),
@@ -1067,7 +1067,7 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
1067 e = container_of(r, struct audit_entry, rule); 1067 e = container_of(r, struct audit_entry, rule);
1068 if (audit_enabled) { 1068 if (audit_enabled) {
1069 struct audit_buffer *ab; 1069 struct audit_buffer *ab;
1070 ab = audit_log_start(NULL, GFP_KERNEL, 1070 ab = audit_log_start(NULL, GFP_NOFS,
1071 AUDIT_CONFIG_CHANGE); 1071 AUDIT_CONFIG_CHANGE);
1072 audit_log_format(ab, "auid=%u ses=%u", 1072 audit_log_format(ab, "auid=%u ses=%u",
1073 audit_get_loginuid(current), 1073 audit_get_loginuid(current),
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 2bfc64786765..7d6ac7c1f414 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -329,6 +329,14 @@ static int audit_match_filetype(struct audit_context *ctx, int which)
329 */ 329 */
330 330
331#ifdef CONFIG_AUDIT_TREE 331#ifdef CONFIG_AUDIT_TREE
332static void audit_set_auditable(struct audit_context *ctx)
333{
334 if (!ctx->prio) {
335 ctx->prio = 1;
336 ctx->current_state = AUDIT_RECORD_CONTEXT;
337 }
338}
339
332static int put_tree_ref(struct audit_context *ctx, struct audit_chunk *chunk) 340static int put_tree_ref(struct audit_context *ctx, struct audit_chunk *chunk)
333{ 341{
334 struct audit_tree_refs *p = ctx->trees; 342 struct audit_tree_refs *p = ctx->trees;
@@ -742,17 +750,9 @@ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
742 rcu_read_unlock(); 750 rcu_read_unlock();
743} 751}
744 752
745static void audit_set_auditable(struct audit_context *ctx)
746{
747 if (!ctx->prio) {
748 ctx->prio = 1;
749 ctx->current_state = AUDIT_RECORD_CONTEXT;
750 }
751}
752
753static inline struct audit_context *audit_get_context(struct task_struct *tsk, 753static inline struct audit_context *audit_get_context(struct task_struct *tsk,
754 int return_valid, 754 int return_valid,
755 int return_code) 755 long return_code)
756{ 756{
757 struct audit_context *context = tsk->audit_context; 757 struct audit_context *context = tsk->audit_context;
758 758
@@ -1024,7 +1024,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
1024{ 1024{
1025 char arg_num_len_buf[12]; 1025 char arg_num_len_buf[12];
1026 const char __user *tmp_p = p; 1026 const char __user *tmp_p = p;
1027 /* how many digits are in arg_num? 3 is the length of a=\n */ 1027 /* how many digits are in arg_num? 3 is the length of " a=" */
1028 size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 3; 1028 size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 3;
1029 size_t len, len_left, to_send; 1029 size_t len, len_left, to_send;
1030 size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN; 1030 size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN;
@@ -1110,7 +1110,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
1110 * so we can be sure nothing was lost. 1110 * so we can be sure nothing was lost.
1111 */ 1111 */
1112 if ((i == 0) && (too_long)) 1112 if ((i == 0) && (too_long))
1113 audit_log_format(*ab, "a%d_len=%zu ", arg_num, 1113 audit_log_format(*ab, " a%d_len=%zu", arg_num,
1114 has_cntl ? 2*len : len); 1114 has_cntl ? 2*len : len);
1115 1115
1116 /* 1116 /*
@@ -1130,7 +1130,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
1130 buf[to_send] = '\0'; 1130 buf[to_send] = '\0';
1131 1131
1132 /* actually log it */ 1132 /* actually log it */
1133 audit_log_format(*ab, "a%d", arg_num); 1133 audit_log_format(*ab, " a%d", arg_num);
1134 if (too_long) 1134 if (too_long)
1135 audit_log_format(*ab, "[%d]", i); 1135 audit_log_format(*ab, "[%d]", i);
1136 audit_log_format(*ab, "="); 1136 audit_log_format(*ab, "=");
@@ -1138,7 +1138,6 @@ static int audit_log_single_execve_arg(struct audit_context *context,
1138 audit_log_n_hex(*ab, buf, to_send); 1138 audit_log_n_hex(*ab, buf, to_send);
1139 else 1139 else
1140 audit_log_format(*ab, "\"%s\"", buf); 1140 audit_log_format(*ab, "\"%s\"", buf);
1141 audit_log_format(*ab, "\n");
1142 1141
1143 p += to_send; 1142 p += to_send;
1144 len_left -= to_send; 1143 len_left -= to_send;
@@ -1166,7 +1165,7 @@ static void audit_log_execve_info(struct audit_context *context,
1166 1165
1167 p = (const char __user *)axi->mm->arg_start; 1166 p = (const char __user *)axi->mm->arg_start;
1168 1167
1169 audit_log_format(*ab, "argc=%d ", axi->argc); 1168 audit_log_format(*ab, "argc=%d", axi->argc);
1170 1169
1171 /* 1170 /*
1172 * we need some kernel buffer to hold the userspace args. Just 1171 * we need some kernel buffer to hold the userspace args. Just
@@ -1479,7 +1478,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1479 case 0: 1478 case 0:
1480 /* name was specified as a relative path and the 1479 /* name was specified as a relative path and the
1481 * directory component is the cwd */ 1480 * directory component is the cwd */
1482 audit_log_d_path(ab, " name=", &context->pwd); 1481 audit_log_d_path(ab, "name=", &context->pwd);
1483 break; 1482 break;
1484 default: 1483 default:
1485 /* log the name's directory component */ 1484 /* log the name's directory component */
@@ -2150,7 +2149,7 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
2150 * __audit_mq_open - record audit data for a POSIX MQ open 2149 * __audit_mq_open - record audit data for a POSIX MQ open
2151 * @oflag: open flag 2150 * @oflag: open flag
2152 * @mode: mode bits 2151 * @mode: mode bits
2153 * @u_attr: queue attributes 2152 * @attr: queue attributes
2154 * 2153 *
2155 */ 2154 */
2156void __audit_mq_open(int oflag, mode_t mode, struct mq_attr *attr) 2155void __audit_mq_open(int oflag, mode_t mode, struct mq_attr *attr)
@@ -2197,7 +2196,7 @@ void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio,
2197/** 2196/**
2198 * __audit_mq_notify - record audit data for a POSIX MQ notify 2197 * __audit_mq_notify - record audit data for a POSIX MQ notify
2199 * @mqdes: MQ descriptor 2198 * @mqdes: MQ descriptor
2200 * @u_notification: Notification event 2199 * @notification: Notification event
2201 * 2200 *
2202 */ 2201 */
2203 2202
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 382109b5baeb..a7267bfd3765 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1133,8 +1133,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
1133 free_cg_links: 1133 free_cg_links:
1134 free_cg_links(&tmp_cg_links); 1134 free_cg_links(&tmp_cg_links);
1135 drop_new_super: 1135 drop_new_super:
1136 up_write(&sb->s_umount); 1136 deactivate_locked_super(sb);
1137 deactivate_super(sb);
1138 return ret; 1137 return ret;
1139} 1138}
1140 1139
diff --git a/kernel/exit.c b/kernel/exit.c
index 6686ed1e4aa3..abf9cf3b95c6 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -837,8 +837,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
837 */ 837 */
838 if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) && 838 if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) &&
839 (tsk->parent_exec_id != tsk->real_parent->self_exec_id || 839 (tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
840 tsk->self_exec_id != tsk->parent_exec_id) && 840 tsk->self_exec_id != tsk->parent_exec_id))
841 !capable(CAP_KILL))
842 tsk->exit_signal = SIGCHLD; 841 tsk->exit_signal = SIGCHLD;
843 842
844 signal = tracehook_notify_death(tsk, &cookie, group_dead); 843 signal = tracehook_notify_death(tsk, &cookie, group_dead);
@@ -924,6 +923,8 @@ NORET_TYPE void do_exit(long code)
924 schedule(); 923 schedule();
925 } 924 }
926 925
926 exit_irq_thread();
927
927 exit_signals(tsk); /* sets PF_EXITING */ 928 exit_signals(tsk); /* sets PF_EXITING */
928 /* 929 /*
929 * tsk->flags are checked in the futex code to protect against 930 * tsk->flags are checked in the futex code to protect against
diff --git a/kernel/extable.c b/kernel/extable.c
index c46da6a47036..7f8f263f8524 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -15,11 +15,22 @@
15 along with this program; if not, write to the Free Software 15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/ 17*/
18#include <linux/ftrace.h>
19#include <linux/memory.h>
18#include <linux/module.h> 20#include <linux/module.h>
21#include <linux/mutex.h>
19#include <linux/init.h> 22#include <linux/init.h>
20#include <linux/ftrace.h> 23
21#include <asm/uaccess.h>
22#include <asm/sections.h> 24#include <asm/sections.h>
25#include <asm/uaccess.h>
26
27/*
28 * mutex protecting text section modification (dynamic code patching).
29 * some users need to sleep (allocating memory...) while they hold this lock.
30 *
31 * NOT exported to modules - patching kernel text is a really delicate matter.
32 */
33DEFINE_MUTEX(text_mutex);
23 34
24extern struct exception_table_entry __start___ex_table[]; 35extern struct exception_table_entry __start___ex_table[];
25extern struct exception_table_entry __stop___ex_table[]; 36extern struct exception_table_entry __stop___ex_table[];
@@ -49,7 +60,7 @@ static inline int init_kernel_text(unsigned long addr)
49 return 0; 60 return 0;
50} 61}
51 62
52__notrace_funcgraph int core_kernel_text(unsigned long addr) 63int core_kernel_text(unsigned long addr)
53{ 64{
54 if (addr >= (unsigned long)_stext && 65 if (addr >= (unsigned long)_stext &&
55 addr <= (unsigned long)_etext) 66 addr <= (unsigned long)_etext)
@@ -61,11 +72,11 @@ __notrace_funcgraph int core_kernel_text(unsigned long addr)
61 return 0; 72 return 0;
62} 73}
63 74
64__notrace_funcgraph int __kernel_text_address(unsigned long addr) 75int __kernel_text_address(unsigned long addr)
65{ 76{
66 if (core_kernel_text(addr)) 77 if (core_kernel_text(addr))
67 return 1; 78 return 1;
68 if (__module_text_address(addr)) 79 if (is_module_text_address(addr))
69 return 1; 80 return 1;
70 /* 81 /*
71 * There might be init symbols in saved stacktraces. 82 * There might be init symbols in saved stacktraces.
@@ -84,7 +95,7 @@ int kernel_text_address(unsigned long addr)
84{ 95{
85 if (core_kernel_text(addr)) 96 if (core_kernel_text(addr))
86 return 1; 97 return 1;
87 return module_text_address(addr) != NULL; 98 return is_module_text_address(addr);
88} 99}
89 100
90/* 101/*
@@ -100,5 +111,5 @@ int func_ptr_is_kernel_text(void *ptr)
100 addr = (unsigned long) dereference_function_descriptor(ptr); 111 addr = (unsigned long) dereference_function_descriptor(ptr);
101 if (core_kernel_text(addr)) 112 if (core_kernel_text(addr))
102 return 1; 113 return 1;
103 return module_text_address(addr) != NULL; 114 return is_module_text_address(addr);
104} 115}
diff --git a/kernel/fork.c b/kernel/fork.c
index 660c2b8765bc..b9e2edd00726 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -645,6 +645,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
645 645
646 tsk->min_flt = tsk->maj_flt = 0; 646 tsk->min_flt = tsk->maj_flt = 0;
647 tsk->nvcsw = tsk->nivcsw = 0; 647 tsk->nvcsw = tsk->nivcsw = 0;
648#ifdef CONFIG_DETECT_HUNG_TASK
649 tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
650#endif
648 651
649 tsk->mm = NULL; 652 tsk->mm = NULL;
650 tsk->active_mm = NULL; 653 tsk->active_mm = NULL;
@@ -797,6 +800,12 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
797 sig->cputime_expires.virt_exp = cputime_zero; 800 sig->cputime_expires.virt_exp = cputime_zero;
798 sig->cputime_expires.sched_exp = 0; 801 sig->cputime_expires.sched_exp = 0;
799 802
803 if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
804 sig->cputime_expires.prof_exp =
805 secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
806 sig->cputimer.running = 1;
807 }
808
800 /* The timer lists. */ 809 /* The timer lists. */
801 INIT_LIST_HEAD(&sig->cpu_timers[0]); 810 INIT_LIST_HEAD(&sig->cpu_timers[0]);
802 INIT_LIST_HEAD(&sig->cpu_timers[1]); 811 INIT_LIST_HEAD(&sig->cpu_timers[1]);
@@ -812,11 +821,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
812 atomic_inc(&current->signal->live); 821 atomic_inc(&current->signal->live);
813 return 0; 822 return 0;
814 } 823 }
815 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
816
817 if (sig)
818 posix_cpu_timers_init_group(sig);
819 824
825 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
820 tsk->signal = sig; 826 tsk->signal = sig;
821 if (!sig) 827 if (!sig)
822 return -ENOMEM; 828 return -ENOMEM;
@@ -856,6 +862,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
856 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); 862 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
857 task_unlock(current->group_leader); 863 task_unlock(current->group_leader);
858 864
865 posix_cpu_timers_init_group(sig);
866
859 acct_init_pacct(&sig->pacct); 867 acct_init_pacct(&sig->pacct);
860 868
861 tty_audit_fork(sig); 869 tty_audit_fork(sig);
@@ -1032,11 +1040,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1032 1040
1033 p->default_timer_slack_ns = current->timer_slack_ns; 1041 p->default_timer_slack_ns = current->timer_slack_ns;
1034 1042
1035#ifdef CONFIG_DETECT_SOFTLOCKUP
1036 p->last_switch_count = 0;
1037 p->last_switch_timestamp = 0;
1038#endif
1039
1040 task_io_accounting_init(&p->ioac); 1043 task_io_accounting_init(&p->ioac);
1041 acct_clear_integrals(p); 1044 acct_clear_integrals(p);
1042 1045
diff --git a/kernel/futex.c b/kernel/futex.c
index 157bfcd725b8..476603afd147 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -199,6 +199,7 @@ static void drop_futex_key_refs(union futex_key *key)
199 * @uaddr: virtual address of the futex 199 * @uaddr: virtual address of the futex
200 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED 200 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
201 * @key: address where result is stored. 201 * @key: address where result is stored.
202 * @rw: mapping needs to be read/write (values: VERIFY_READ, VERIFY_WRITE)
202 * 203 *
203 * Returns a negative error code or 0 204 * Returns a negative error code or 0
204 * The key words are stored in *key on success. 205 * The key words are stored in *key on success.
@@ -209,7 +210,8 @@ static void drop_futex_key_refs(union futex_key *key)
209 * 210 *
210 * lock_page() might sleep, the caller should not hold a spinlock. 211 * lock_page() might sleep, the caller should not hold a spinlock.
211 */ 212 */
212static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) 213static int
214get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
213{ 215{
214 unsigned long address = (unsigned long)uaddr; 216 unsigned long address = (unsigned long)uaddr;
215 struct mm_struct *mm = current->mm; 217 struct mm_struct *mm = current->mm;
@@ -232,7 +234,7 @@ static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
232 * but access_ok() should be faster than find_vma() 234 * but access_ok() should be faster than find_vma()
233 */ 235 */
234 if (!fshared) { 236 if (!fshared) {
235 if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))) 237 if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
236 return -EFAULT; 238 return -EFAULT;
237 key->private.mm = mm; 239 key->private.mm = mm;
238 key->private.address = address; 240 key->private.address = address;
@@ -241,7 +243,7 @@ static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
241 } 243 }
242 244
243again: 245again:
244 err = get_user_pages_fast(address, 1, 0, &page); 246 err = get_user_pages_fast(address, 1, rw == VERIFY_WRITE, &page);
245 if (err < 0) 247 if (err < 0)
246 return err; 248 return err;
247 249
@@ -834,7 +836,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
834 if (!bitset) 836 if (!bitset)
835 return -EINVAL; 837 return -EINVAL;
836 838
837 ret = get_futex_key(uaddr, fshared, &key); 839 ret = get_futex_key(uaddr, fshared, &key, VERIFY_READ);
838 if (unlikely(ret != 0)) 840 if (unlikely(ret != 0))
839 goto out; 841 goto out;
840 842
@@ -880,10 +882,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
880 int ret, op_ret; 882 int ret, op_ret;
881 883
882retry: 884retry:
883 ret = get_futex_key(uaddr1, fshared, &key1); 885 ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);
884 if (unlikely(ret != 0)) 886 if (unlikely(ret != 0))
885 goto out; 887 goto out;
886 ret = get_futex_key(uaddr2, fshared, &key2); 888 ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
887 if (unlikely(ret != 0)) 889 if (unlikely(ret != 0))
888 goto out_put_key1; 890 goto out_put_key1;
889 891
@@ -1131,10 +1133,11 @@ retry:
1131 pi_state = NULL; 1133 pi_state = NULL;
1132 } 1134 }
1133 1135
1134 ret = get_futex_key(uaddr1, fshared, &key1); 1136 ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);
1135 if (unlikely(ret != 0)) 1137 if (unlikely(ret != 0))
1136 goto out; 1138 goto out;
1137 ret = get_futex_key(uaddr2, fshared, &key2); 1139 ret = get_futex_key(uaddr2, fshared, &key2,
1140 requeue_pi ? VERIFY_WRITE : VERIFY_READ);
1138 if (unlikely(ret != 0)) 1141 if (unlikely(ret != 0))
1139 goto out_put_key1; 1142 goto out_put_key1;
1140 1143
@@ -1267,7 +1270,12 @@ retry_private:
1267out_unlock: 1270out_unlock:
1268 double_unlock_hb(hb1, hb2); 1271 double_unlock_hb(hb1, hb2);
1269 1272
1270 /* drop_futex_key_refs() must be called outside the spinlocks. */ 1273 /*
1274 * drop_futex_key_refs() must be called outside the spinlocks. During
1275 * the requeue we moved futex_q's from the hash bucket at key1 to the
1276 * one at key2 and updated their key pointer. We no longer need to
1277 * hold the references to key1.
1278 */
1271 while (--drop_count >= 0) 1279 while (--drop_count >= 0)
1272 drop_futex_key_refs(&key1); 1280 drop_futex_key_refs(&key1);
1273 1281
@@ -1660,7 +1668,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
1660 */ 1668 */
1661retry: 1669retry:
1662 q->key = FUTEX_KEY_INIT; 1670 q->key = FUTEX_KEY_INIT;
1663 ret = get_futex_key(uaddr, fshared, &q->key); 1671 ret = get_futex_key(uaddr, fshared, &q->key, VERIFY_READ);
1664 if (unlikely(ret != 0)) 1672 if (unlikely(ret != 0))
1665 return ret; 1673 return ret;
1666 1674
@@ -1819,7 +1827,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1819 q.rt_waiter = NULL; 1827 q.rt_waiter = NULL;
1820retry: 1828retry:
1821 q.key = FUTEX_KEY_INIT; 1829 q.key = FUTEX_KEY_INIT;
1822 ret = get_futex_key(uaddr, fshared, &q.key); 1830 ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE);
1823 if (unlikely(ret != 0)) 1831 if (unlikely(ret != 0))
1824 goto out; 1832 goto out;
1825 1833
@@ -1960,7 +1968,7 @@ retry:
1960 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) 1968 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
1961 return -EPERM; 1969 return -EPERM;
1962 1970
1963 ret = get_futex_key(uaddr, fshared, &key); 1971 ret = get_futex_key(uaddr, fshared, &key, VERIFY_WRITE);
1964 if (unlikely(ret != 0)) 1972 if (unlikely(ret != 0))
1965 goto out; 1973 goto out;
1966 1974
@@ -2171,7 +2179,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2171 q.rt_waiter = &rt_waiter; 2179 q.rt_waiter = &rt_waiter;
2172 2180
2173 key2 = FUTEX_KEY_INIT; 2181 key2 = FUTEX_KEY_INIT;
2174 ret = get_futex_key(uaddr2, fshared, &key2); 2182 ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
2175 if (unlikely(ret != 0)) 2183 if (unlikely(ret != 0))
2176 goto out; 2184 goto out;
2177 2185
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index f394d2a42ca3..cb8a15c19583 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -651,14 +651,20 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
651 * and expiry check is done in the hrtimer_interrupt or in the softirq. 651 * and expiry check is done in the hrtimer_interrupt or in the softirq.
652 */ 652 */
653static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, 653static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
654 struct hrtimer_clock_base *base) 654 struct hrtimer_clock_base *base,
655 int wakeup)
655{ 656{
656 if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { 657 if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) {
657 spin_unlock(&base->cpu_base->lock); 658 if (wakeup) {
658 raise_softirq_irqoff(HRTIMER_SOFTIRQ); 659 spin_unlock(&base->cpu_base->lock);
659 spin_lock(&base->cpu_base->lock); 660 raise_softirq_irqoff(HRTIMER_SOFTIRQ);
661 spin_lock(&base->cpu_base->lock);
662 } else
663 __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
664
660 return 1; 665 return 1;
661 } 666 }
667
662 return 0; 668 return 0;
663} 669}
664 670
@@ -703,7 +709,8 @@ static inline int hrtimer_is_hres_enabled(void) { return 0; }
703static inline int hrtimer_switch_to_hres(void) { return 0; } 709static inline int hrtimer_switch_to_hres(void) { return 0; }
704static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { } 710static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { }
705static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, 711static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
706 struct hrtimer_clock_base *base) 712 struct hrtimer_clock_base *base,
713 int wakeup)
707{ 714{
708 return 0; 715 return 0;
709} 716}
@@ -886,20 +893,9 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
886 return 0; 893 return 0;
887} 894}
888 895
889/** 896int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
890 * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU 897 unsigned long delta_ns, const enum hrtimer_mode mode,
891 * @timer: the timer to be added 898 int wakeup)
892 * @tim: expiry time
893 * @delta_ns: "slack" range for the timer
894 * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
895 *
896 * Returns:
897 * 0 on success
898 * 1 when the timer was active
899 */
900int
901hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_ns,
902 const enum hrtimer_mode mode)
903{ 899{
904 struct hrtimer_clock_base *base, *new_base; 900 struct hrtimer_clock_base *base, *new_base;
905 unsigned long flags; 901 unsigned long flags;
@@ -940,12 +936,29 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n
940 * XXX send_remote_softirq() ? 936 * XXX send_remote_softirq() ?
941 */ 937 */
942 if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases)) 938 if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases))
943 hrtimer_enqueue_reprogram(timer, new_base); 939 hrtimer_enqueue_reprogram(timer, new_base, wakeup);
944 940
945 unlock_hrtimer_base(timer, &flags); 941 unlock_hrtimer_base(timer, &flags);
946 942
947 return ret; 943 return ret;
948} 944}
945
946/**
947 * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
948 * @timer: the timer to be added
949 * @tim: expiry time
950 * @delta_ns: "slack" range for the timer
951 * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
952 *
953 * Returns:
954 * 0 on success
955 * 1 when the timer was active
956 */
957int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
958 unsigned long delta_ns, const enum hrtimer_mode mode)
959{
960 return __hrtimer_start_range_ns(timer, tim, delta_ns, mode, 1);
961}
949EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); 962EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
950 963
951/** 964/**
@@ -961,7 +974,7 @@ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
961int 974int
962hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) 975hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
963{ 976{
964 return hrtimer_start_range_ns(timer, tim, 0, mode); 977 return __hrtimer_start_range_ns(timer, tim, 0, mode, 1);
965} 978}
966EXPORT_SYMBOL_GPL(hrtimer_start); 979EXPORT_SYMBOL_GPL(hrtimer_start);
967 980
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
new file mode 100644
index 000000000000..022a4927b785
--- /dev/null
+++ b/kernel/hung_task.c
@@ -0,0 +1,217 @@
1/*
2 * Detect Hung Task
3 *
4 * kernel/hung_task.c - kernel thread for detecting tasks stuck in D state
5 *
6 */
7
8#include <linux/mm.h>
9#include <linux/cpu.h>
10#include <linux/nmi.h>
11#include <linux/init.h>
12#include <linux/delay.h>
13#include <linux/freezer.h>
14#include <linux/kthread.h>
15#include <linux/lockdep.h>
16#include <linux/module.h>
17#include <linux/sysctl.h>
18
19/*
20 * The number of tasks checked:
21 */
22unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
23
24/*
25 * Limit number of tasks checked in a batch.
26 *
27 * This value controls the preemptibility of khungtaskd since preemption
28 * is disabled during the critical section. It also controls the size of
29 * the RCU grace period. So it needs to be upper-bound.
30 */
31#define HUNG_TASK_BATCHING 1024
32
33/*
34 * Zero means infinite timeout - no checking done:
35 */
36unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120;
37
38unsigned long __read_mostly sysctl_hung_task_warnings = 10;
39
40static int __read_mostly did_panic;
41
42static struct task_struct *watchdog_task;
43
44/*
45 * Should we panic (and reboot, if panic_timeout= is set) when a
46 * hung task is detected:
47 */
48unsigned int __read_mostly sysctl_hung_task_panic =
49 CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE;
50
51static int __init hung_task_panic_setup(char *str)
52{
53 sysctl_hung_task_panic = simple_strtoul(str, NULL, 0);
54
55 return 1;
56}
57__setup("hung_task_panic=", hung_task_panic_setup);
58
59static int
60hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr)
61{
62 did_panic = 1;
63
64 return NOTIFY_DONE;
65}
66
67static struct notifier_block panic_block = {
68 .notifier_call = hung_task_panic,
69};
70
71static void check_hung_task(struct task_struct *t, unsigned long timeout)
72{
73 unsigned long switch_count = t->nvcsw + t->nivcsw;
74
75 /*
76 * Ensure the task is not frozen.
77 * Also, when a freshly created task is scheduled once, changes
78 * its state to TASK_UNINTERRUPTIBLE without having ever been
79 * switched out once, it musn't be checked.
80 */
81 if (unlikely(t->flags & PF_FROZEN || !switch_count))
82 return;
83
84 if (switch_count != t->last_switch_count) {
85 t->last_switch_count = switch_count;
86 return;
87 }
88 if (!sysctl_hung_task_warnings)
89 return;
90 sysctl_hung_task_warnings--;
91
92 /*
93 * Ok, the task did not get scheduled for more than 2 minutes,
94 * complain:
95 */
96 printk(KERN_ERR "INFO: task %s:%d blocked for more than "
97 "%ld seconds.\n", t->comm, t->pid, timeout);
98 printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
99 " disables this message.\n");
100 sched_show_task(t);
101 __debug_show_held_locks(t);
102
103 touch_nmi_watchdog();
104
105 if (sysctl_hung_task_panic)
106 panic("hung_task: blocked tasks");
107}
108
109/*
110 * To avoid extending the RCU grace period for an unbounded amount of time,
111 * periodically exit the critical section and enter a new one.
112 *
113 * For preemptible RCU it is sufficient to call rcu_read_unlock in order
114 * exit the grace period. For classic RCU, a reschedule is required.
115 */
116static void rcu_lock_break(struct task_struct *g, struct task_struct *t)
117{
118 get_task_struct(g);
119 get_task_struct(t);
120 rcu_read_unlock();
121 cond_resched();
122 rcu_read_lock();
123 put_task_struct(t);
124 put_task_struct(g);
125}
126
127/*
128 * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
129 * a really long time (120 seconds). If that happens, print out
130 * a warning.
131 */
132static void check_hung_uninterruptible_tasks(unsigned long timeout)
133{
134 int max_count = sysctl_hung_task_check_count;
135 int batch_count = HUNG_TASK_BATCHING;
136 struct task_struct *g, *t;
137
138 /*
139 * If the system crashed already then all bets are off,
140 * do not report extra hung tasks:
141 */
142 if (test_taint(TAINT_DIE) || did_panic)
143 return;
144
145 rcu_read_lock();
146 do_each_thread(g, t) {
147 if (!--max_count)
148 goto unlock;
149 if (!--batch_count) {
150 batch_count = HUNG_TASK_BATCHING;
151 rcu_lock_break(g, t);
152 /* Exit if t or g was unhashed during refresh. */
153 if (t->state == TASK_DEAD || g->state == TASK_DEAD)
154 goto unlock;
155 }
156 /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
157 if (t->state == TASK_UNINTERRUPTIBLE)
158 check_hung_task(t, timeout);
159 } while_each_thread(g, t);
160 unlock:
161 rcu_read_unlock();
162}
163
164static unsigned long timeout_jiffies(unsigned long timeout)
165{
166 /* timeout of 0 will disable the watchdog */
167 return timeout ? timeout * HZ : MAX_SCHEDULE_TIMEOUT;
168}
169
170/*
171 * Process updating of timeout sysctl
172 */
173int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
174 struct file *filp, void __user *buffer,
175 size_t *lenp, loff_t *ppos)
176{
177 int ret;
178
179 ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
180
181 if (ret || !write)
182 goto out;
183
184 wake_up_process(watchdog_task);
185
186 out:
187 return ret;
188}
189
190/*
191 * kthread which checks for tasks stuck in D state
192 */
193static int watchdog(void *dummy)
194{
195 set_user_nice(current, 0);
196
197 for ( ; ; ) {
198 unsigned long timeout = sysctl_hung_task_timeout_secs;
199
200 while (schedule_timeout_interruptible(timeout_jiffies(timeout)))
201 timeout = sysctl_hung_task_timeout_secs;
202
203 check_hung_uninterruptible_tasks(timeout);
204 }
205
206 return 0;
207}
208
209static int __init hung_task_init(void)
210{
211 atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
212 watchdog_task = kthread_run(watchdog, NULL, "khungtaskd");
213
214 return 0;
215}
216
217module_init(hung_task_init);
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index 38a25b8d8bff..d06df9c41cba 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -26,10 +26,12 @@ static int devm_irq_match(struct device *dev, void *res, void *data)
26} 26}
27 27
28/** 28/**
29 * devm_request_irq - allocate an interrupt line for a managed device 29 * devm_request_threaded_irq - allocate an interrupt line for a managed device
30 * @dev: device to request interrupt for 30 * @dev: device to request interrupt for
31 * @irq: Interrupt line to allocate 31 * @irq: Interrupt line to allocate
32 * @handler: Function to be called when the IRQ occurs 32 * @handler: Function to be called when the IRQ occurs
33 * @thread_fn: function to be called in a threaded interrupt context. NULL
34 * for devices which handle everything in @handler
33 * @irqflags: Interrupt type flags 35 * @irqflags: Interrupt type flags
34 * @devname: An ascii name for the claiming device 36 * @devname: An ascii name for the claiming device
35 * @dev_id: A cookie passed back to the handler function 37 * @dev_id: A cookie passed back to the handler function
@@ -42,9 +44,10 @@ static int devm_irq_match(struct device *dev, void *res, void *data)
42 * If an IRQ allocated with this function needs to be freed 44 * If an IRQ allocated with this function needs to be freed
43 * separately, dev_free_irq() must be used. 45 * separately, dev_free_irq() must be used.
44 */ 46 */
45int devm_request_irq(struct device *dev, unsigned int irq, 47int devm_request_threaded_irq(struct device *dev, unsigned int irq,
46 irq_handler_t handler, unsigned long irqflags, 48 irq_handler_t handler, irq_handler_t thread_fn,
47 const char *devname, void *dev_id) 49 unsigned long irqflags, const char *devname,
50 void *dev_id)
48{ 51{
49 struct irq_devres *dr; 52 struct irq_devres *dr;
50 int rc; 53 int rc;
@@ -54,7 +57,8 @@ int devm_request_irq(struct device *dev, unsigned int irq,
54 if (!dr) 57 if (!dr)
55 return -ENOMEM; 58 return -ENOMEM;
56 59
57 rc = request_irq(irq, handler, irqflags, devname, dev_id); 60 rc = request_threaded_irq(irq, handler, thread_fn, irqflags, devname,
61 dev_id);
58 if (rc) { 62 if (rc) {
59 devres_free(dr); 63 devres_free(dr);
60 return rc; 64 return rc;
@@ -66,7 +70,7 @@ int devm_request_irq(struct device *dev, unsigned int irq,
66 70
67 return 0; 71 return 0;
68} 72}
69EXPORT_SYMBOL(devm_request_irq); 73EXPORT_SYMBOL(devm_request_threaded_irq);
70 74
71/** 75/**
72 * devm_free_irq - free an interrupt 76 * devm_free_irq - free an interrupt
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 9ebf77968871..26e08754744f 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -17,6 +17,7 @@
17#include <linux/kernel_stat.h> 17#include <linux/kernel_stat.h>
18#include <linux/rculist.h> 18#include <linux/rculist.h>
19#include <linux/hash.h> 19#include <linux/hash.h>
20#include <trace/irq.h>
20#include <linux/bootmem.h> 21#include <linux/bootmem.h>
21 22
22#include "internals.h" 23#include "internals.h"
@@ -338,6 +339,18 @@ irqreturn_t no_action(int cpl, void *dev_id)
338 return IRQ_NONE; 339 return IRQ_NONE;
339} 340}
340 341
342static void warn_no_thread(unsigned int irq, struct irqaction *action)
343{
344 if (test_and_set_bit(IRQTF_WARNED, &action->thread_flags))
345 return;
346
347 printk(KERN_WARNING "IRQ %d device %s returned IRQ_WAKE_THREAD "
348 "but no thread function available.", irq, action->name);
349}
350
351DEFINE_TRACE(irq_handler_entry);
352DEFINE_TRACE(irq_handler_exit);
353
341/** 354/**
342 * handle_IRQ_event - irq action chain handler 355 * handle_IRQ_event - irq action chain handler
343 * @irq: the interrupt number 356 * @irq: the interrupt number
@@ -350,15 +363,54 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
350 irqreturn_t ret, retval = IRQ_NONE; 363 irqreturn_t ret, retval = IRQ_NONE;
351 unsigned int status = 0; 364 unsigned int status = 0;
352 365
353 WARN_ONCE(!in_irq(), "BUG: IRQ handler called from non-hardirq context!");
354
355 if (!(action->flags & IRQF_DISABLED)) 366 if (!(action->flags & IRQF_DISABLED))
356 local_irq_enable_in_hardirq(); 367 local_irq_enable_in_hardirq();
357 368
358 do { 369 do {
370 trace_irq_handler_entry(irq, action);
359 ret = action->handler(irq, action->dev_id); 371 ret = action->handler(irq, action->dev_id);
360 if (ret == IRQ_HANDLED) 372 trace_irq_handler_exit(irq, action, ret);
373
374 switch (ret) {
375 case IRQ_WAKE_THREAD:
376 /*
377 * Set result to handled so the spurious check
378 * does not trigger.
379 */
380 ret = IRQ_HANDLED;
381
382 /*
383 * Catch drivers which return WAKE_THREAD but
384 * did not set up a thread function
385 */
386 if (unlikely(!action->thread_fn)) {
387 warn_no_thread(irq, action);
388 break;
389 }
390
391 /*
392 * Wake up the handler thread for this
393 * action. In case the thread crashed and was
394 * killed we just pretend that we handled the
395 * interrupt. The hardirq handler above has
396 * disabled the device interrupt, so no irq
397 * storm is lurking.
398 */
399 if (likely(!test_bit(IRQTF_DIED,
400 &action->thread_flags))) {
401 set_bit(IRQTF_RUNTHREAD, &action->thread_flags);
402 wake_up_process(action->thread);
403 }
404
405 /* Fall through to add to randomness */
406 case IRQ_HANDLED:
361 status |= action->flags; 407 status |= action->flags;
408 break;
409
410 default:
411 break;
412 }
413
362 retval |= ret; 414 retval |= ret;
363 action = action->next; 415 action = action->next;
364 } while (action); 416 } while (action);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 1516ab77355c..2734eca59243 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -8,16 +8,15 @@
8 */ 8 */
9 9
10#include <linux/irq.h> 10#include <linux/irq.h>
11#include <linux/kthread.h>
11#include <linux/module.h> 12#include <linux/module.h>
12#include <linux/random.h> 13#include <linux/random.h>
13#include <linux/interrupt.h> 14#include <linux/interrupt.h>
14#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/sched.h>
15 17
16#include "internals.h" 18#include "internals.h"
17 19
18#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
19cpumask_var_t irq_default_affinity;
20
21/** 20/**
22 * synchronize_irq - wait for pending IRQ handlers (on other CPUs) 21 * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
23 * @irq: interrupt number to wait for 22 * @irq: interrupt number to wait for
@@ -53,9 +52,18 @@ void synchronize_irq(unsigned int irq)
53 52
54 /* Oops, that failed? */ 53 /* Oops, that failed? */
55 } while (status & IRQ_INPROGRESS); 54 } while (status & IRQ_INPROGRESS);
55
56 /*
57 * We made sure that no hardirq handler is running. Now verify
58 * that no threaded handlers are active.
59 */
60 wait_event(desc->wait_for_threads, !atomic_read(&desc->threads_active));
56} 61}
57EXPORT_SYMBOL(synchronize_irq); 62EXPORT_SYMBOL(synchronize_irq);
58 63
64#ifdef CONFIG_SMP
65cpumask_var_t irq_default_affinity;
66
59/** 67/**
60 * irq_can_set_affinity - Check if the affinity of a given irq can be set 68 * irq_can_set_affinity - Check if the affinity of a given irq can be set
61 * @irq: Interrupt to check 69 * @irq: Interrupt to check
@@ -72,6 +80,18 @@ int irq_can_set_affinity(unsigned int irq)
72 return 1; 80 return 1;
73} 81}
74 82
83static void
84irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask)
85{
86 struct irqaction *action = desc->action;
87
88 while (action) {
89 if (action->thread)
90 set_cpus_allowed_ptr(action->thread, cpumask);
91 action = action->next;
92 }
93}
94
75/** 95/**
76 * irq_set_affinity - Set the irq affinity of a given irq 96 * irq_set_affinity - Set the irq affinity of a given irq
77 * @irq: Interrupt to set affinity 97 * @irq: Interrupt to set affinity
@@ -89,10 +109,9 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
89 spin_lock_irqsave(&desc->lock, flags); 109 spin_lock_irqsave(&desc->lock, flags);
90 110
91#ifdef CONFIG_GENERIC_PENDING_IRQ 111#ifdef CONFIG_GENERIC_PENDING_IRQ
92 if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) { 112 if (desc->status & IRQ_MOVE_PCNTXT)
93 cpumask_copy(desc->affinity, cpumask);
94 desc->chip->set_affinity(irq, cpumask); 113 desc->chip->set_affinity(irq, cpumask);
95 } else { 114 else {
96 desc->status |= IRQ_MOVE_PENDING; 115 desc->status |= IRQ_MOVE_PENDING;
97 cpumask_copy(desc->pending_mask, cpumask); 116 cpumask_copy(desc->pending_mask, cpumask);
98 } 117 }
@@ -100,6 +119,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
100 cpumask_copy(desc->affinity, cpumask); 119 cpumask_copy(desc->affinity, cpumask);
101 desc->chip->set_affinity(irq, cpumask); 120 desc->chip->set_affinity(irq, cpumask);
102#endif 121#endif
122 irq_set_thread_affinity(desc, cpumask);
103 desc->status |= IRQ_AFFINITY_SET; 123 desc->status |= IRQ_AFFINITY_SET;
104 spin_unlock_irqrestore(&desc->lock, flags); 124 spin_unlock_irqrestore(&desc->lock, flags);
105 return 0; 125 return 0;
@@ -150,6 +170,8 @@ int irq_select_affinity_usr(unsigned int irq)
150 170
151 spin_lock_irqsave(&desc->lock, flags); 171 spin_lock_irqsave(&desc->lock, flags);
152 ret = setup_affinity(irq, desc); 172 ret = setup_affinity(irq, desc);
173 if (!ret)
174 irq_set_thread_affinity(desc, desc->affinity);
153 spin_unlock_irqrestore(&desc->lock, flags); 175 spin_unlock_irqrestore(&desc->lock, flags);
154 176
155 return ret; 177 return ret;
@@ -401,6 +423,90 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
401 return ret; 423 return ret;
402} 424}
403 425
426static int irq_wait_for_interrupt(struct irqaction *action)
427{
428 while (!kthread_should_stop()) {
429 set_current_state(TASK_INTERRUPTIBLE);
430
431 if (test_and_clear_bit(IRQTF_RUNTHREAD,
432 &action->thread_flags)) {
433 __set_current_state(TASK_RUNNING);
434 return 0;
435 }
436 schedule();
437 }
438 return -1;
439}
440
441/*
442 * Interrupt handler thread
443 */
444static int irq_thread(void *data)
445{
446 struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, };
447 struct irqaction *action = data;
448 struct irq_desc *desc = irq_to_desc(action->irq);
449 int wake;
450
451 sched_setscheduler(current, SCHED_FIFO, &param);
452 current->irqaction = action;
453
454 while (!irq_wait_for_interrupt(action)) {
455
456 atomic_inc(&desc->threads_active);
457
458 spin_lock_irq(&desc->lock);
459 if (unlikely(desc->status & IRQ_DISABLED)) {
460 /*
461 * CHECKME: We might need a dedicated
462 * IRQ_THREAD_PENDING flag here, which
463 * retriggers the thread in check_irq_resend()
464 * but AFAICT IRQ_PENDING should be fine as it
465 * retriggers the interrupt itself --- tglx
466 */
467 desc->status |= IRQ_PENDING;
468 spin_unlock_irq(&desc->lock);
469 } else {
470 spin_unlock_irq(&desc->lock);
471
472 action->thread_fn(action->irq, action->dev_id);
473 }
474
475 wake = atomic_dec_and_test(&desc->threads_active);
476
477 if (wake && waitqueue_active(&desc->wait_for_threads))
478 wake_up(&desc->wait_for_threads);
479 }
480
481 /*
482 * Clear irqaction. Otherwise exit_irq_thread() would make
483 * fuzz about an active irq thread going into nirvana.
484 */
485 current->irqaction = NULL;
486 return 0;
487}
488
489/*
490 * Called from do_exit()
491 */
492void exit_irq_thread(void)
493{
494 struct task_struct *tsk = current;
495
496 if (!tsk->irqaction)
497 return;
498
499 printk(KERN_ERR
500 "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
501 tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq);
502
503 /*
504 * Set the THREAD DIED flag to prevent further wakeups of the
505 * soon to be gone threaded handler.
506 */
507 set_bit(IRQTF_DIED, &tsk->irqaction->flags);
508}
509
404/* 510/*
405 * Internal function to register an irqaction - typically used to 511 * Internal function to register an irqaction - typically used to
406 * allocate special interrupts that are part of the architecture. 512 * allocate special interrupts that are part of the architecture.
@@ -437,6 +543,26 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
437 } 543 }
438 544
439 /* 545 /*
546 * Threaded handler ?
547 */
548 if (new->thread_fn) {
549 struct task_struct *t;
550
551 t = kthread_create(irq_thread, new, "irq/%d-%s", irq,
552 new->name);
553 if (IS_ERR(t))
554 return PTR_ERR(t);
555 /*
556 * We keep the reference to the task struct even if
557 * the thread dies to avoid that the interrupt code
558 * references an already freed task_struct.
559 */
560 get_task_struct(t);
561 new->thread = t;
562 wake_up_process(t);
563 }
564
565 /*
440 * The following block of code has to be executed atomically 566 * The following block of code has to be executed atomically
441 */ 567 */
442 spin_lock_irqsave(&desc->lock, flags); 568 spin_lock_irqsave(&desc->lock, flags);
@@ -473,15 +599,15 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
473 if (!shared) { 599 if (!shared) {
474 irq_chip_set_defaults(desc->chip); 600 irq_chip_set_defaults(desc->chip);
475 601
602 init_waitqueue_head(&desc->wait_for_threads);
603
476 /* Setup the type (level, edge polarity) if configured: */ 604 /* Setup the type (level, edge polarity) if configured: */
477 if (new->flags & IRQF_TRIGGER_MASK) { 605 if (new->flags & IRQF_TRIGGER_MASK) {
478 ret = __irq_set_trigger(desc, irq, 606 ret = __irq_set_trigger(desc, irq,
479 new->flags & IRQF_TRIGGER_MASK); 607 new->flags & IRQF_TRIGGER_MASK);
480 608
481 if (ret) { 609 if (ret)
482 spin_unlock_irqrestore(&desc->lock, flags); 610 goto out_thread;
483 return ret;
484 }
485 } else 611 } else
486 compat_irq_chip_set_default_handler(desc); 612 compat_irq_chip_set_default_handler(desc);
487#if defined(CONFIG_IRQ_PER_CPU) 613#if defined(CONFIG_IRQ_PER_CPU)
@@ -549,8 +675,19 @@ mismatch:
549 dump_stack(); 675 dump_stack();
550 } 676 }
551#endif 677#endif
678 ret = -EBUSY;
679
680out_thread:
552 spin_unlock_irqrestore(&desc->lock, flags); 681 spin_unlock_irqrestore(&desc->lock, flags);
553 return -EBUSY; 682 if (new->thread) {
683 struct task_struct *t = new->thread;
684
685 new->thread = NULL;
686 if (likely(!test_bit(IRQTF_DIED, &new->thread_flags)))
687 kthread_stop(t);
688 put_task_struct(t);
689 }
690 return ret;
554} 691}
555 692
556/** 693/**
@@ -576,6 +713,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
576{ 713{
577 struct irq_desc *desc = irq_to_desc(irq); 714 struct irq_desc *desc = irq_to_desc(irq);
578 struct irqaction *action, **action_ptr; 715 struct irqaction *action, **action_ptr;
716 struct task_struct *irqthread;
579 unsigned long flags; 717 unsigned long flags;
580 718
581 WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); 719 WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);
@@ -622,6 +760,10 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
622 else 760 else
623 desc->chip->disable(irq); 761 desc->chip->disable(irq);
624 } 762 }
763
764 irqthread = action->thread;
765 action->thread = NULL;
766
625 spin_unlock_irqrestore(&desc->lock, flags); 767 spin_unlock_irqrestore(&desc->lock, flags);
626 768
627 unregister_handler_proc(irq, action); 769 unregister_handler_proc(irq, action);
@@ -629,6 +771,12 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
629 /* Make sure it's not being used on another CPU: */ 771 /* Make sure it's not being used on another CPU: */
630 synchronize_irq(irq); 772 synchronize_irq(irq);
631 773
774 if (irqthread) {
775 if (!test_bit(IRQTF_DIED, &action->thread_flags))
776 kthread_stop(irqthread);
777 put_task_struct(irqthread);
778 }
779
632#ifdef CONFIG_DEBUG_SHIRQ 780#ifdef CONFIG_DEBUG_SHIRQ
633 /* 781 /*
634 * It's a shared IRQ -- the driver ought to be prepared for an IRQ 782 * It's a shared IRQ -- the driver ought to be prepared for an IRQ
@@ -681,9 +829,12 @@ void free_irq(unsigned int irq, void *dev_id)
681EXPORT_SYMBOL(free_irq); 829EXPORT_SYMBOL(free_irq);
682 830
683/** 831/**
684 * request_irq - allocate an interrupt line 832 * request_threaded_irq - allocate an interrupt line
685 * @irq: Interrupt line to allocate 833 * @irq: Interrupt line to allocate
686 * @handler: Function to be called when the IRQ occurs 834 * @handler: Function to be called when the IRQ occurs.
835 * Primary handler for threaded interrupts
836 * @thread_fn: Function called from the irq handler thread
837 * If NULL, no irq thread is created
687 * @irqflags: Interrupt type flags 838 * @irqflags: Interrupt type flags
688 * @devname: An ascii name for the claiming device 839 * @devname: An ascii name for the claiming device
689 * @dev_id: A cookie passed back to the handler function 840 * @dev_id: A cookie passed back to the handler function
@@ -695,6 +846,15 @@ EXPORT_SYMBOL(free_irq);
695 * raises, you must take care both to initialise your hardware 846 * raises, you must take care both to initialise your hardware
696 * and to set up the interrupt handler in the right order. 847 * and to set up the interrupt handler in the right order.
697 * 848 *
849 * If you want to set up a threaded irq handler for your device
850 * then you need to supply @handler and @thread_fn. @handler ist
851 * still called in hard interrupt context and has to check
852 * whether the interrupt originates from the device. If yes it
853 * needs to disable the interrupt on the device and return
854 * IRQ_THREAD_WAKE which will wake up the handler thread and run
855 * @thread_fn. This split handler design is necessary to support
856 * shared interrupts.
857 *
698 * Dev_id must be globally unique. Normally the address of the 858 * Dev_id must be globally unique. Normally the address of the
699 * device data structure is used as the cookie. Since the handler 859 * device data structure is used as the cookie. Since the handler
700 * receives this value it makes sense to use it. 860 * receives this value it makes sense to use it.
@@ -710,8 +870,9 @@ EXPORT_SYMBOL(free_irq);
710 * IRQF_TRIGGER_* Specify active edge(s) or level 870 * IRQF_TRIGGER_* Specify active edge(s) or level
711 * 871 *
712 */ 872 */
713int request_irq(unsigned int irq, irq_handler_t handler, 873int request_threaded_irq(unsigned int irq, irq_handler_t handler,
714 unsigned long irqflags, const char *devname, void *dev_id) 874 irq_handler_t thread_fn, unsigned long irqflags,
875 const char *devname, void *dev_id)
715{ 876{
716 struct irqaction *action; 877 struct irqaction *action;
717 struct irq_desc *desc; 878 struct irq_desc *desc;
@@ -759,6 +920,7 @@ int request_irq(unsigned int irq, irq_handler_t handler,
759 return -ENOMEM; 920 return -ENOMEM;
760 921
761 action->handler = handler; 922 action->handler = handler;
923 action->thread_fn = thread_fn;
762 action->flags = irqflags; 924 action->flags = irqflags;
763 action->name = devname; 925 action->name = devname;
764 action->dev_id = dev_id; 926 action->dev_id = dev_id;
@@ -788,4 +950,4 @@ int request_irq(unsigned int irq, irq_handler_t handler,
788#endif 950#endif
789 return retval; 951 return retval;
790} 952}
791EXPORT_SYMBOL(request_irq); 953EXPORT_SYMBOL(request_threaded_irq);
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 243d6121e50e..44bbdcbaf8d2 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -54,6 +54,7 @@ static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
54static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc) 54static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
55{ 55{
56 free_kstat_irqs(old_desc, desc); 56 free_kstat_irqs(old_desc, desc);
57 free_desc_masks(old_desc, desc);
57 arch_free_chip_data(old_desc, desc); 58 arch_free_chip_data(old_desc, desc);
58} 59}
59 60
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 7b8b0f21a5b1..374faf9bfdc7 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -161,6 +161,25 @@ unsigned long kallsyms_lookup_name(const char *name)
161 return module_kallsyms_lookup_name(name); 161 return module_kallsyms_lookup_name(name);
162} 162}
163 163
164int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
165 unsigned long),
166 void *data)
167{
168 char namebuf[KSYM_NAME_LEN];
169 unsigned long i;
170 unsigned int off;
171 int ret;
172
173 for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
174 off = kallsyms_expand_symbol(off, namebuf);
175 ret = fn(data, namebuf, NULL, kallsyms_addresses[i]);
176 if (ret != 0)
177 return ret;
178 }
179 return module_kallsyms_on_each_symbol(fn, data);
180}
181EXPORT_SYMBOL_GPL(kallsyms_on_each_symbol);
182
164static unsigned long get_symbol_pos(unsigned long addr, 183static unsigned long get_symbol_pos(unsigned long addr,
165 unsigned long *symbolsize, 184 unsigned long *symbolsize,
166 unsigned long *offset) 185 unsigned long *offset)
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index e4dcfb2272a4..9147a3190c9d 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -1583,8 +1583,8 @@ static void sysrq_handle_gdb(int key, struct tty_struct *tty)
1583 1583
1584static struct sysrq_key_op sysrq_gdb_op = { 1584static struct sysrq_key_op sysrq_gdb_op = {
1585 .handler = sysrq_handle_gdb, 1585 .handler = sysrq_handle_gdb,
1586 .help_msg = "Gdb", 1586 .help_msg = "debug(G)",
1587 .action_msg = "GDB", 1587 .action_msg = "DEBUG",
1588}; 1588};
1589#endif 1589#endif
1590 1590
diff --git a/kernel/kmod.c b/kernel/kmod.c
index f0c8f545180d..b750675251e5 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -50,7 +50,8 @@ static struct workqueue_struct *khelper_wq;
50char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; 50char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
51 51
52/** 52/**
53 * request_module - try to load a kernel module 53 * __request_module - try to load a kernel module
54 * @wait: wait (or not) for the operation to complete
54 * @fmt: printf style format string for the name of the module 55 * @fmt: printf style format string for the name of the module
55 * @...: arguments as specified in the format string 56 * @...: arguments as specified in the format string
56 * 57 *
@@ -63,7 +64,7 @@ char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
63 * If module auto-loading support is disabled then this function 64 * If module auto-loading support is disabled then this function
64 * becomes a no-operation. 65 * becomes a no-operation.
65 */ 66 */
66int request_module(const char *fmt, ...) 67int __request_module(bool wait, const char *fmt, ...)
67{ 68{
68 va_list args; 69 va_list args;
69 char module_name[MODULE_NAME_LEN]; 70 char module_name[MODULE_NAME_LEN];
@@ -108,11 +109,12 @@ int request_module(const char *fmt, ...)
108 return -ENOMEM; 109 return -ENOMEM;
109 } 110 }
110 111
111 ret = call_usermodehelper(modprobe_path, argv, envp, 1); 112 ret = call_usermodehelper(modprobe_path, argv, envp,
113 wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
112 atomic_dec(&kmod_concurrent); 114 atomic_dec(&kmod_concurrent);
113 return ret; 115 return ret;
114} 116}
115EXPORT_SYMBOL(request_module); 117EXPORT_SYMBOL(__request_module);
116#endif /* CONFIG_MODULES */ 118#endif /* CONFIG_MODULES */
117 119
118struct subprocess_info { 120struct subprocess_info {
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 7ba8cd9845cb..c0fa54b276d9 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -43,6 +43,7 @@
43#include <linux/seq_file.h> 43#include <linux/seq_file.h>
44#include <linux/debugfs.h> 44#include <linux/debugfs.h>
45#include <linux/kdebug.h> 45#include <linux/kdebug.h>
46#include <linux/memory.h>
46 47
47#include <asm-generic/sections.h> 48#include <asm-generic/sections.h>
48#include <asm/cacheflush.h> 49#include <asm/cacheflush.h>
@@ -67,7 +68,7 @@ static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
67static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; 68static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
68 69
69/* NOTE: change this value only with kprobe_mutex held */ 70/* NOTE: change this value only with kprobe_mutex held */
70static bool kprobe_enabled; 71static bool kprobes_all_disarmed;
71 72
72static DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ 73static DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
73static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 74static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
@@ -318,6 +319,22 @@ struct kprobe __kprobes *get_kprobe(void *addr)
318 return NULL; 319 return NULL;
319} 320}
320 321
322/* Arm a kprobe with text_mutex */
323static void __kprobes arm_kprobe(struct kprobe *kp)
324{
325 mutex_lock(&text_mutex);
326 arch_arm_kprobe(kp);
327 mutex_unlock(&text_mutex);
328}
329
330/* Disarm a kprobe with text_mutex */
331static void __kprobes disarm_kprobe(struct kprobe *kp)
332{
333 mutex_lock(&text_mutex);
334 arch_disarm_kprobe(kp);
335 mutex_unlock(&text_mutex);
336}
337
321/* 338/*
322 * Aggregate handlers for multiple kprobes support - these handlers 339 * Aggregate handlers for multiple kprobes support - these handlers
323 * take care of invoking the individual kprobe handlers on p->list 340 * take care of invoking the individual kprobe handlers on p->list
@@ -327,7 +344,7 @@ static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
327 struct kprobe *kp; 344 struct kprobe *kp;
328 345
329 list_for_each_entry_rcu(kp, &p->list, list) { 346 list_for_each_entry_rcu(kp, &p->list, list) {
330 if (kp->pre_handler && !kprobe_gone(kp)) { 347 if (kp->pre_handler && likely(!kprobe_disabled(kp))) {
331 set_kprobe_instance(kp); 348 set_kprobe_instance(kp);
332 if (kp->pre_handler(kp, regs)) 349 if (kp->pre_handler(kp, regs))
333 return 1; 350 return 1;
@@ -343,7 +360,7 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
343 struct kprobe *kp; 360 struct kprobe *kp;
344 361
345 list_for_each_entry_rcu(kp, &p->list, list) { 362 list_for_each_entry_rcu(kp, &p->list, list) {
346 if (kp->post_handler && !kprobe_gone(kp)) { 363 if (kp->post_handler && likely(!kprobe_disabled(kp))) {
347 set_kprobe_instance(kp); 364 set_kprobe_instance(kp);
348 kp->post_handler(kp, regs, flags); 365 kp->post_handler(kp, regs, flags);
349 reset_kprobe_instance(); 366 reset_kprobe_instance();
@@ -517,20 +534,28 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
517} 534}
518 535
519/* 536/*
520* Add the new probe to old_p->list. Fail if this is the 537* Add the new probe to ap->list. Fail if this is the
521* second jprobe at the address - two jprobes can't coexist 538* second jprobe at the address - two jprobes can't coexist
522*/ 539*/
523static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p) 540static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
524{ 541{
542 BUG_ON(kprobe_gone(ap) || kprobe_gone(p));
525 if (p->break_handler) { 543 if (p->break_handler) {
526 if (old_p->break_handler) 544 if (ap->break_handler)
527 return -EEXIST; 545 return -EEXIST;
528 list_add_tail_rcu(&p->list, &old_p->list); 546 list_add_tail_rcu(&p->list, &ap->list);
529 old_p->break_handler = aggr_break_handler; 547 ap->break_handler = aggr_break_handler;
530 } else 548 } else
531 list_add_rcu(&p->list, &old_p->list); 549 list_add_rcu(&p->list, &ap->list);
532 if (p->post_handler && !old_p->post_handler) 550 if (p->post_handler && !ap->post_handler)
533 old_p->post_handler = aggr_post_handler; 551 ap->post_handler = aggr_post_handler;
552
553 if (kprobe_disabled(ap) && !kprobe_disabled(p)) {
554 ap->flags &= ~KPROBE_FLAG_DISABLED;
555 if (!kprobes_all_disarmed)
556 /* Arm the breakpoint again. */
557 arm_kprobe(ap);
558 }
534 return 0; 559 return 0;
535} 560}
536 561
@@ -543,6 +568,7 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
543 copy_kprobe(p, ap); 568 copy_kprobe(p, ap);
544 flush_insn_slot(ap); 569 flush_insn_slot(ap);
545 ap->addr = p->addr; 570 ap->addr = p->addr;
571 ap->flags = p->flags;
546 ap->pre_handler = aggr_pre_handler; 572 ap->pre_handler = aggr_pre_handler;
547 ap->fault_handler = aggr_fault_handler; 573 ap->fault_handler = aggr_fault_handler;
548 /* We don't care the kprobe which has gone. */ 574 /* We don't care the kprobe which has gone. */
@@ -565,44 +591,59 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
565 struct kprobe *p) 591 struct kprobe *p)
566{ 592{
567 int ret = 0; 593 int ret = 0;
568 struct kprobe *ap; 594 struct kprobe *ap = old_p;
595
596 if (old_p->pre_handler != aggr_pre_handler) {
597 /* If old_p is not an aggr_probe, create new aggr_kprobe. */
598 ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL);
599 if (!ap)
600 return -ENOMEM;
601 add_aggr_kprobe(ap, old_p);
602 }
569 603
570 if (kprobe_gone(old_p)) { 604 if (kprobe_gone(ap)) {
571 /* 605 /*
572 * Attempting to insert new probe at the same location that 606 * Attempting to insert new probe at the same location that
573 * had a probe in the module vaddr area which already 607 * had a probe in the module vaddr area which already
574 * freed. So, the instruction slot has already been 608 * freed. So, the instruction slot has already been
575 * released. We need a new slot for the new probe. 609 * released. We need a new slot for the new probe.
576 */ 610 */
577 ret = arch_prepare_kprobe(old_p); 611 ret = arch_prepare_kprobe(ap);
578 if (ret) 612 if (ret)
613 /*
614 * Even if fail to allocate new slot, don't need to
615 * free aggr_probe. It will be used next time, or
616 * freed by unregister_kprobe.
617 */
579 return ret; 618 return ret;
580 } 619
581 if (old_p->pre_handler == aggr_pre_handler) {
582 copy_kprobe(old_p, p);
583 ret = add_new_kprobe(old_p, p);
584 ap = old_p;
585 } else {
586 ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL);
587 if (!ap) {
588 if (kprobe_gone(old_p))
589 arch_remove_kprobe(old_p);
590 return -ENOMEM;
591 }
592 add_aggr_kprobe(ap, old_p);
593 copy_kprobe(ap, p);
594 ret = add_new_kprobe(ap, p);
595 }
596 if (kprobe_gone(old_p)) {
597 /* 620 /*
598 * If the old_p has gone, its breakpoint has been disarmed. 621 * Clear gone flag to prevent allocating new slot again, and
599 * We have to arm it again after preparing real kprobes. 622 * set disabled flag because it is not armed yet.
600 */ 623 */
601 ap->flags &= ~KPROBE_FLAG_GONE; 624 ap->flags = (ap->flags & ~KPROBE_FLAG_GONE)
602 if (kprobe_enabled) 625 | KPROBE_FLAG_DISABLED;
603 arch_arm_kprobe(ap);
604 } 626 }
605 return ret; 627
628 copy_kprobe(ap, p);
629 return add_new_kprobe(ap, p);
630}
631
632/* Try to disable aggr_kprobe, and return 1 if succeeded.*/
633static int __kprobes try_to_disable_aggr_kprobe(struct kprobe *p)
634{
635 struct kprobe *kp;
636
637 list_for_each_entry_rcu(kp, &p->list, list) {
638 if (!kprobe_disabled(kp))
639 /*
640 * There is an active probe on the list.
641 * We can't disable aggr_kprobe.
642 */
643 return 0;
644 }
645 p->flags |= KPROBE_FLAG_DISABLED;
646 return 1;
606} 647}
607 648
608static int __kprobes in_kprobes_functions(unsigned long addr) 649static int __kprobes in_kprobes_functions(unsigned long addr)
@@ -663,7 +704,9 @@ int __kprobes register_kprobe(struct kprobe *p)
663 return -EINVAL; 704 return -EINVAL;
664 } 705 }
665 706
666 p->flags = 0; 707 /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */
708 p->flags &= KPROBE_FLAG_DISABLED;
709
667 /* 710 /*
668 * Check if are we probing a module. 711 * Check if are we probing a module.
669 */ 712 */
@@ -699,17 +742,20 @@ int __kprobes register_kprobe(struct kprobe *p)
699 goto out; 742 goto out;
700 } 743 }
701 744
745 mutex_lock(&text_mutex);
702 ret = arch_prepare_kprobe(p); 746 ret = arch_prepare_kprobe(p);
703 if (ret) 747 if (ret)
704 goto out; 748 goto out_unlock_text;
705 749
706 INIT_HLIST_NODE(&p->hlist); 750 INIT_HLIST_NODE(&p->hlist);
707 hlist_add_head_rcu(&p->hlist, 751 hlist_add_head_rcu(&p->hlist,
708 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); 752 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
709 753
710 if (kprobe_enabled) 754 if (!kprobes_all_disarmed && !kprobe_disabled(p))
711 arch_arm_kprobe(p); 755 arch_arm_kprobe(p);
712 756
757out_unlock_text:
758 mutex_unlock(&text_mutex);
713out: 759out:
714 mutex_unlock(&kprobe_mutex); 760 mutex_unlock(&kprobe_mutex);
715 761
@@ -718,26 +764,39 @@ out:
718 764
719 return ret; 765 return ret;
720} 766}
767EXPORT_SYMBOL_GPL(register_kprobe);
721 768
722/* 769/* Check passed kprobe is valid and return kprobe in kprobe_table. */
723 * Unregister a kprobe without a scheduler synchronization. 770static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
724 */
725static int __kprobes __unregister_kprobe_top(struct kprobe *p)
726{ 771{
727 struct kprobe *old_p, *list_p; 772 struct kprobe *old_p, *list_p;
728 773
729 old_p = get_kprobe(p->addr); 774 old_p = get_kprobe(p->addr);
730 if (unlikely(!old_p)) 775 if (unlikely(!old_p))
731 return -EINVAL; 776 return NULL;
732 777
733 if (p != old_p) { 778 if (p != old_p) {
734 list_for_each_entry_rcu(list_p, &old_p->list, list) 779 list_for_each_entry_rcu(list_p, &old_p->list, list)
735 if (list_p == p) 780 if (list_p == p)
736 /* kprobe p is a valid probe */ 781 /* kprobe p is a valid probe */
737 goto valid_p; 782 goto valid;
738 return -EINVAL; 783 return NULL;
739 } 784 }
740valid_p: 785valid:
786 return old_p;
787}
788
789/*
790 * Unregister a kprobe without a scheduler synchronization.
791 */
792static int __kprobes __unregister_kprobe_top(struct kprobe *p)
793{
794 struct kprobe *old_p, *list_p;
795
796 old_p = __get_valid_kprobe(p);
797 if (old_p == NULL)
798 return -EINVAL;
799
741 if (old_p == p || 800 if (old_p == p ||
742 (old_p->pre_handler == aggr_pre_handler && 801 (old_p->pre_handler == aggr_pre_handler &&
743 list_is_singular(&old_p->list))) { 802 list_is_singular(&old_p->list))) {
@@ -746,8 +805,8 @@ valid_p:
746 * enabled and not gone - otherwise, the breakpoint would 805 * enabled and not gone - otherwise, the breakpoint would
747 * already have been removed. We save on flushing icache. 806 * already have been removed. We save on flushing icache.
748 */ 807 */
749 if (kprobe_enabled && !kprobe_gone(old_p)) 808 if (!kprobes_all_disarmed && !kprobe_disabled(old_p))
750 arch_disarm_kprobe(p); 809 disarm_kprobe(p);
751 hlist_del_rcu(&old_p->hlist); 810 hlist_del_rcu(&old_p->hlist);
752 } else { 811 } else {
753 if (p->break_handler && !kprobe_gone(p)) 812 if (p->break_handler && !kprobe_gone(p))
@@ -761,6 +820,11 @@ valid_p:
761 } 820 }
762noclean: 821noclean:
763 list_del_rcu(&p->list); 822 list_del_rcu(&p->list);
823 if (!kprobe_disabled(old_p)) {
824 try_to_disable_aggr_kprobe(old_p);
825 if (!kprobes_all_disarmed && kprobe_disabled(old_p))
826 disarm_kprobe(old_p);
827 }
764 } 828 }
765 return 0; 829 return 0;
766} 830}
@@ -796,11 +860,13 @@ int __kprobes register_kprobes(struct kprobe **kps, int num)
796 } 860 }
797 return ret; 861 return ret;
798} 862}
863EXPORT_SYMBOL_GPL(register_kprobes);
799 864
800void __kprobes unregister_kprobe(struct kprobe *p) 865void __kprobes unregister_kprobe(struct kprobe *p)
801{ 866{
802 unregister_kprobes(&p, 1); 867 unregister_kprobes(&p, 1);
803} 868}
869EXPORT_SYMBOL_GPL(unregister_kprobe);
804 870
805void __kprobes unregister_kprobes(struct kprobe **kps, int num) 871void __kprobes unregister_kprobes(struct kprobe **kps, int num)
806{ 872{
@@ -819,6 +885,7 @@ void __kprobes unregister_kprobes(struct kprobe **kps, int num)
819 if (kps[i]->addr) 885 if (kps[i]->addr)
820 __unregister_kprobe_bottom(kps[i]); 886 __unregister_kprobe_bottom(kps[i]);
821} 887}
888EXPORT_SYMBOL_GPL(unregister_kprobes);
822 889
823static struct notifier_block kprobe_exceptions_nb = { 890static struct notifier_block kprobe_exceptions_nb = {
824 .notifier_call = kprobe_exceptions_notify, 891 .notifier_call = kprobe_exceptions_notify,
@@ -858,16 +925,19 @@ int __kprobes register_jprobes(struct jprobe **jps, int num)
858 } 925 }
859 return ret; 926 return ret;
860} 927}
928EXPORT_SYMBOL_GPL(register_jprobes);
861 929
862int __kprobes register_jprobe(struct jprobe *jp) 930int __kprobes register_jprobe(struct jprobe *jp)
863{ 931{
864 return register_jprobes(&jp, 1); 932 return register_jprobes(&jp, 1);
865} 933}
934EXPORT_SYMBOL_GPL(register_jprobe);
866 935
867void __kprobes unregister_jprobe(struct jprobe *jp) 936void __kprobes unregister_jprobe(struct jprobe *jp)
868{ 937{
869 unregister_jprobes(&jp, 1); 938 unregister_jprobes(&jp, 1);
870} 939}
940EXPORT_SYMBOL_GPL(unregister_jprobe);
871 941
872void __kprobes unregister_jprobes(struct jprobe **jps, int num) 942void __kprobes unregister_jprobes(struct jprobe **jps, int num)
873{ 943{
@@ -887,6 +957,7 @@ void __kprobes unregister_jprobes(struct jprobe **jps, int num)
887 __unregister_kprobe_bottom(&jps[i]->kp); 957 __unregister_kprobe_bottom(&jps[i]->kp);
888 } 958 }
889} 959}
960EXPORT_SYMBOL_GPL(unregister_jprobes);
890 961
891#ifdef CONFIG_KRETPROBES 962#ifdef CONFIG_KRETPROBES
892/* 963/*
@@ -912,10 +983,8 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
912 ri->rp = rp; 983 ri->rp = rp;
913 ri->task = current; 984 ri->task = current;
914 985
915 if (rp->entry_handler && rp->entry_handler(ri, regs)) { 986 if (rp->entry_handler && rp->entry_handler(ri, regs))
916 spin_unlock_irqrestore(&rp->lock, flags);
917 return 0; 987 return 0;
918 }
919 988
920 arch_prepare_kretprobe(ri, regs); 989 arch_prepare_kretprobe(ri, regs);
921 990
@@ -982,6 +1051,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
982 free_rp_inst(rp); 1051 free_rp_inst(rp);
983 return ret; 1052 return ret;
984} 1053}
1054EXPORT_SYMBOL_GPL(register_kretprobe);
985 1055
986int __kprobes register_kretprobes(struct kretprobe **rps, int num) 1056int __kprobes register_kretprobes(struct kretprobe **rps, int num)
987{ 1057{
@@ -999,11 +1069,13 @@ int __kprobes register_kretprobes(struct kretprobe **rps, int num)
999 } 1069 }
1000 return ret; 1070 return ret;
1001} 1071}
1072EXPORT_SYMBOL_GPL(register_kretprobes);
1002 1073
1003void __kprobes unregister_kretprobe(struct kretprobe *rp) 1074void __kprobes unregister_kretprobe(struct kretprobe *rp)
1004{ 1075{
1005 unregister_kretprobes(&rp, 1); 1076 unregister_kretprobes(&rp, 1);
1006} 1077}
1078EXPORT_SYMBOL_GPL(unregister_kretprobe);
1007 1079
1008void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) 1080void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
1009{ 1081{
@@ -1025,24 +1097,30 @@ void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
1025 } 1097 }
1026 } 1098 }
1027} 1099}
1100EXPORT_SYMBOL_GPL(unregister_kretprobes);
1028 1101
1029#else /* CONFIG_KRETPROBES */ 1102#else /* CONFIG_KRETPROBES */
1030int __kprobes register_kretprobe(struct kretprobe *rp) 1103int __kprobes register_kretprobe(struct kretprobe *rp)
1031{ 1104{
1032 return -ENOSYS; 1105 return -ENOSYS;
1033} 1106}
1107EXPORT_SYMBOL_GPL(register_kretprobe);
1034 1108
1035int __kprobes register_kretprobes(struct kretprobe **rps, int num) 1109int __kprobes register_kretprobes(struct kretprobe **rps, int num)
1036{ 1110{
1037 return -ENOSYS; 1111 return -ENOSYS;
1038} 1112}
1113EXPORT_SYMBOL_GPL(register_kretprobes);
1114
1039void __kprobes unregister_kretprobe(struct kretprobe *rp) 1115void __kprobes unregister_kretprobe(struct kretprobe *rp)
1040{ 1116{
1041} 1117}
1118EXPORT_SYMBOL_GPL(unregister_kretprobe);
1042 1119
1043void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) 1120void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
1044{ 1121{
1045} 1122}
1123EXPORT_SYMBOL_GPL(unregister_kretprobes);
1046 1124
1047static int __kprobes pre_handler_kretprobe(struct kprobe *p, 1125static int __kprobes pre_handler_kretprobe(struct kprobe *p,
1048 struct pt_regs *regs) 1126 struct pt_regs *regs)
@@ -1056,6 +1134,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
1056static void __kprobes kill_kprobe(struct kprobe *p) 1134static void __kprobes kill_kprobe(struct kprobe *p)
1057{ 1135{
1058 struct kprobe *kp; 1136 struct kprobe *kp;
1137
1059 p->flags |= KPROBE_FLAG_GONE; 1138 p->flags |= KPROBE_FLAG_GONE;
1060 if (p->pre_handler == aggr_pre_handler) { 1139 if (p->pre_handler == aggr_pre_handler) {
1061 /* 1140 /*
@@ -1168,8 +1247,8 @@ static int __init init_kprobes(void)
1168 } 1247 }
1169 } 1248 }
1170 1249
1171 /* By default, kprobes are enabled */ 1250 /* By default, kprobes are armed */
1172 kprobe_enabled = true; 1251 kprobes_all_disarmed = false;
1173 1252
1174 err = arch_init_kprobes(); 1253 err = arch_init_kprobes();
1175 if (!err) 1254 if (!err)
@@ -1197,12 +1276,18 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
1197 else 1276 else
1198 kprobe_type = "k"; 1277 kprobe_type = "k";
1199 if (sym) 1278 if (sym)
1200 seq_printf(pi, "%p %s %s+0x%x %s %s\n", p->addr, kprobe_type, 1279 seq_printf(pi, "%p %s %s+0x%x %s %s%s\n",
1201 sym, offset, (modname ? modname : " "), 1280 p->addr, kprobe_type, sym, offset,
1202 (kprobe_gone(p) ? "[GONE]" : "")); 1281 (modname ? modname : " "),
1282 (kprobe_gone(p) ? "[GONE]" : ""),
1283 ((kprobe_disabled(p) && !kprobe_gone(p)) ?
1284 "[DISABLED]" : ""));
1203 else 1285 else
1204 seq_printf(pi, "%p %s %p %s\n", p->addr, kprobe_type, p->addr, 1286 seq_printf(pi, "%p %s %p %s%s\n",
1205 (kprobe_gone(p) ? "[GONE]" : "")); 1287 p->addr, kprobe_type, p->addr,
1288 (kprobe_gone(p) ? "[GONE]" : ""),
1289 ((kprobe_disabled(p) && !kprobe_gone(p)) ?
1290 "[DISABLED]" : ""));
1206} 1291}
1207 1292
1208static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) 1293static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos)
@@ -1267,7 +1352,72 @@ static struct file_operations debugfs_kprobes_operations = {
1267 .release = seq_release, 1352 .release = seq_release,
1268}; 1353};
1269 1354
1270static void __kprobes enable_all_kprobes(void) 1355/* Disable one kprobe */
1356int __kprobes disable_kprobe(struct kprobe *kp)
1357{
1358 int ret = 0;
1359 struct kprobe *p;
1360
1361 mutex_lock(&kprobe_mutex);
1362
1363 /* Check whether specified probe is valid. */
1364 p = __get_valid_kprobe(kp);
1365 if (unlikely(p == NULL)) {
1366 ret = -EINVAL;
1367 goto out;
1368 }
1369
1370 /* If the probe is already disabled (or gone), just return */
1371 if (kprobe_disabled(kp))
1372 goto out;
1373
1374 kp->flags |= KPROBE_FLAG_DISABLED;
1375 if (p != kp)
1376 /* When kp != p, p is always enabled. */
1377 try_to_disable_aggr_kprobe(p);
1378
1379 if (!kprobes_all_disarmed && kprobe_disabled(p))
1380 disarm_kprobe(p);
1381out:
1382 mutex_unlock(&kprobe_mutex);
1383 return ret;
1384}
1385EXPORT_SYMBOL_GPL(disable_kprobe);
1386
1387/* Enable one kprobe */
1388int __kprobes enable_kprobe(struct kprobe *kp)
1389{
1390 int ret = 0;
1391 struct kprobe *p;
1392
1393 mutex_lock(&kprobe_mutex);
1394
1395 /* Check whether specified probe is valid. */
1396 p = __get_valid_kprobe(kp);
1397 if (unlikely(p == NULL)) {
1398 ret = -EINVAL;
1399 goto out;
1400 }
1401
1402 if (kprobe_gone(kp)) {
1403 /* This kprobe has gone, we couldn't enable it. */
1404 ret = -EINVAL;
1405 goto out;
1406 }
1407
1408 if (!kprobes_all_disarmed && kprobe_disabled(p))
1409 arm_kprobe(p);
1410
1411 p->flags &= ~KPROBE_FLAG_DISABLED;
1412 if (p != kp)
1413 kp->flags &= ~KPROBE_FLAG_DISABLED;
1414out:
1415 mutex_unlock(&kprobe_mutex);
1416 return ret;
1417}
1418EXPORT_SYMBOL_GPL(enable_kprobe);
1419
1420static void __kprobes arm_all_kprobes(void)
1271{ 1421{
1272 struct hlist_head *head; 1422 struct hlist_head *head;
1273 struct hlist_node *node; 1423 struct hlist_node *node;
@@ -1276,18 +1426,20 @@ static void __kprobes enable_all_kprobes(void)
1276 1426
1277 mutex_lock(&kprobe_mutex); 1427 mutex_lock(&kprobe_mutex);
1278 1428
1279 /* If kprobes are already enabled, just return */ 1429 /* If kprobes are armed, just return */
1280 if (kprobe_enabled) 1430 if (!kprobes_all_disarmed)
1281 goto already_enabled; 1431 goto already_enabled;
1282 1432
1433 mutex_lock(&text_mutex);
1283 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1434 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1284 head = &kprobe_table[i]; 1435 head = &kprobe_table[i];
1285 hlist_for_each_entry_rcu(p, node, head, hlist) 1436 hlist_for_each_entry_rcu(p, node, head, hlist)
1286 if (!kprobe_gone(p)) 1437 if (!kprobe_disabled(p))
1287 arch_arm_kprobe(p); 1438 arch_arm_kprobe(p);
1288 } 1439 }
1440 mutex_unlock(&text_mutex);
1289 1441
1290 kprobe_enabled = true; 1442 kprobes_all_disarmed = false;
1291 printk(KERN_INFO "Kprobes globally enabled\n"); 1443 printk(KERN_INFO "Kprobes globally enabled\n");
1292 1444
1293already_enabled: 1445already_enabled:
@@ -1295,7 +1447,7 @@ already_enabled:
1295 return; 1447 return;
1296} 1448}
1297 1449
1298static void __kprobes disable_all_kprobes(void) 1450static void __kprobes disarm_all_kprobes(void)
1299{ 1451{
1300 struct hlist_head *head; 1452 struct hlist_head *head;
1301 struct hlist_node *node; 1453 struct hlist_node *node;
@@ -1304,20 +1456,22 @@ static void __kprobes disable_all_kprobes(void)
1304 1456
1305 mutex_lock(&kprobe_mutex); 1457 mutex_lock(&kprobe_mutex);
1306 1458
1307 /* If kprobes are already disabled, just return */ 1459 /* If kprobes are already disarmed, just return */
1308 if (!kprobe_enabled) 1460 if (kprobes_all_disarmed)
1309 goto already_disabled; 1461 goto already_disabled;
1310 1462
1311 kprobe_enabled = false; 1463 kprobes_all_disarmed = true;
1312 printk(KERN_INFO "Kprobes globally disabled\n"); 1464 printk(KERN_INFO "Kprobes globally disabled\n");
1465 mutex_lock(&text_mutex);
1313 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1466 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1314 head = &kprobe_table[i]; 1467 head = &kprobe_table[i];
1315 hlist_for_each_entry_rcu(p, node, head, hlist) { 1468 hlist_for_each_entry_rcu(p, node, head, hlist) {
1316 if (!arch_trampoline_kprobe(p) && !kprobe_gone(p)) 1469 if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
1317 arch_disarm_kprobe(p); 1470 arch_disarm_kprobe(p);
1318 } 1471 }
1319 } 1472 }
1320 1473
1474 mutex_unlock(&text_mutex);
1321 mutex_unlock(&kprobe_mutex); 1475 mutex_unlock(&kprobe_mutex);
1322 /* Allow all currently running kprobes to complete */ 1476 /* Allow all currently running kprobes to complete */
1323 synchronize_sched(); 1477 synchronize_sched();
@@ -1338,7 +1492,7 @@ static ssize_t read_enabled_file_bool(struct file *file,
1338{ 1492{
1339 char buf[3]; 1493 char buf[3];
1340 1494
1341 if (kprobe_enabled) 1495 if (!kprobes_all_disarmed)
1342 buf[0] = '1'; 1496 buf[0] = '1';
1343 else 1497 else
1344 buf[0] = '0'; 1498 buf[0] = '0';
@@ -1361,12 +1515,12 @@ static ssize_t write_enabled_file_bool(struct file *file,
1361 case 'y': 1515 case 'y':
1362 case 'Y': 1516 case 'Y':
1363 case '1': 1517 case '1':
1364 enable_all_kprobes(); 1518 arm_all_kprobes();
1365 break; 1519 break;
1366 case 'n': 1520 case 'n':
1367 case 'N': 1521 case 'N':
1368 case '0': 1522 case '0':
1369 disable_all_kprobes(); 1523 disarm_all_kprobes();
1370 break; 1524 break;
1371 } 1525 }
1372 1526
@@ -1409,16 +1563,5 @@ late_initcall(debugfs_kprobe_init);
1409 1563
1410module_init(init_kprobes); 1564module_init(init_kprobes);
1411 1565
1412EXPORT_SYMBOL_GPL(register_kprobe); 1566/* defined in arch/.../kernel/kprobes.c */
1413EXPORT_SYMBOL_GPL(unregister_kprobe);
1414EXPORT_SYMBOL_GPL(register_kprobes);
1415EXPORT_SYMBOL_GPL(unregister_kprobes);
1416EXPORT_SYMBOL_GPL(register_jprobe);
1417EXPORT_SYMBOL_GPL(unregister_jprobe);
1418EXPORT_SYMBOL_GPL(register_jprobes);
1419EXPORT_SYMBOL_GPL(unregister_jprobes);
1420EXPORT_SYMBOL_GPL(jprobe_return); 1567EXPORT_SYMBOL_GPL(jprobe_return);
1421EXPORT_SYMBOL_GPL(register_kretprobe);
1422EXPORT_SYMBOL_GPL(unregister_kretprobe);
1423EXPORT_SYMBOL_GPL(register_kretprobes);
1424EXPORT_SYMBOL_GPL(unregister_kretprobes);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 84bbadd4d021..4ebaf8519abf 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -76,6 +76,7 @@ static int kthread(void *_create)
76 76
77 /* OK, tell user we're spawned, wait for stop or wakeup */ 77 /* OK, tell user we're spawned, wait for stop or wakeup */
78 __set_current_state(TASK_UNINTERRUPTIBLE); 78 __set_current_state(TASK_UNINTERRUPTIBLE);
79 create->result = current;
79 complete(&create->started); 80 complete(&create->started);
80 schedule(); 81 schedule();
81 82
@@ -96,22 +97,10 @@ static void create_kthread(struct kthread_create_info *create)
96 97
97 /* We want our own signal handler (we take no signals by default). */ 98 /* We want our own signal handler (we take no signals by default). */
98 pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD); 99 pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
99 if (pid < 0) { 100 if (pid < 0)
100 create->result = ERR_PTR(pid); 101 create->result = ERR_PTR(pid);
101 } else { 102 else
102 struct sched_param param = { .sched_priority = 0 };
103 wait_for_completion(&create->started); 103 wait_for_completion(&create->started);
104 read_lock(&tasklist_lock);
105 create->result = find_task_by_pid_ns(pid, &init_pid_ns);
106 read_unlock(&tasklist_lock);
107 /*
108 * root may have changed our (kthreadd's) priority or CPU mask.
109 * The kernel thread should not inherit these properties.
110 */
111 sched_setscheduler(create->result, SCHED_NORMAL, &param);
112 set_user_nice(create->result, KTHREAD_NICE_LEVEL);
113 set_cpus_allowed_ptr(create->result, cpu_all_mask);
114 }
115 complete(&create->done); 104 complete(&create->done);
116} 105}
117 106
@@ -154,11 +143,20 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
154 wait_for_completion(&create.done); 143 wait_for_completion(&create.done);
155 144
156 if (!IS_ERR(create.result)) { 145 if (!IS_ERR(create.result)) {
146 struct sched_param param = { .sched_priority = 0 };
157 va_list args; 147 va_list args;
148
158 va_start(args, namefmt); 149 va_start(args, namefmt);
159 vsnprintf(create.result->comm, sizeof(create.result->comm), 150 vsnprintf(create.result->comm, sizeof(create.result->comm),
160 namefmt, args); 151 namefmt, args);
161 va_end(args); 152 va_end(args);
153 /*
154 * root may have changed our (kthreadd's) priority or CPU mask.
155 * The kernel thread should not inherit these properties.
156 */
157 sched_setscheduler_nocheck(create.result, SCHED_NORMAL, &param);
158 set_user_nice(create.result, KTHREAD_NICE_LEVEL);
159 set_cpus_allowed_ptr(create.result, cpu_all_mask);
162 } 160 }
163 return create.result; 161 return create.result;
164} 162}
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 981cd4854281..accb40cdb12a 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -42,6 +42,7 @@
42#include <linux/hash.h> 42#include <linux/hash.h>
43#include <linux/ftrace.h> 43#include <linux/ftrace.h>
44#include <linux/stringify.h> 44#include <linux/stringify.h>
45#include <trace/lockdep.h>
45 46
46#include <asm/sections.h> 47#include <asm/sections.h>
47 48
@@ -792,6 +793,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
792 793
793 printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); 794 printk("BUG: MAX_LOCKDEP_KEYS too low!\n");
794 printk("turning off the locking correctness validator.\n"); 795 printk("turning off the locking correctness validator.\n");
796 dump_stack();
795 return NULL; 797 return NULL;
796 } 798 }
797 class = lock_classes + nr_lock_classes++; 799 class = lock_classes + nr_lock_classes++;
@@ -855,6 +857,7 @@ static struct lock_list *alloc_list_entry(void)
855 857
856 printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n"); 858 printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n");
857 printk("turning off the locking correctness validator.\n"); 859 printk("turning off the locking correctness validator.\n");
860 dump_stack();
858 return NULL; 861 return NULL;
859 } 862 }
860 return list_entries + nr_list_entries++; 863 return list_entries + nr_list_entries++;
@@ -1681,6 +1684,7 @@ cache_hit:
1681 1684
1682 printk("BUG: MAX_LOCKDEP_CHAINS too low!\n"); 1685 printk("BUG: MAX_LOCKDEP_CHAINS too low!\n");
1683 printk("turning off the locking correctness validator.\n"); 1686 printk("turning off the locking correctness validator.\n");
1687 dump_stack();
1684 return 0; 1688 return 0;
1685 } 1689 }
1686 chain = lock_chains + nr_lock_chains++; 1690 chain = lock_chains + nr_lock_chains++;
@@ -2486,13 +2490,20 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
2486void lockdep_init_map(struct lockdep_map *lock, const char *name, 2490void lockdep_init_map(struct lockdep_map *lock, const char *name,
2487 struct lock_class_key *key, int subclass) 2491 struct lock_class_key *key, int subclass)
2488{ 2492{
2489 if (unlikely(!debug_locks)) 2493 lock->class_cache = NULL;
2494#ifdef CONFIG_LOCK_STAT
2495 lock->cpu = raw_smp_processor_id();
2496#endif
2497
2498 if (DEBUG_LOCKS_WARN_ON(!name)) {
2499 lock->name = "NULL";
2490 return; 2500 return;
2501 }
2502
2503 lock->name = name;
2491 2504
2492 if (DEBUG_LOCKS_WARN_ON(!key)) 2505 if (DEBUG_LOCKS_WARN_ON(!key))
2493 return; 2506 return;
2494 if (DEBUG_LOCKS_WARN_ON(!name))
2495 return;
2496 /* 2507 /*
2497 * Sanity check, the lock-class key must be persistent: 2508 * Sanity check, the lock-class key must be persistent:
2498 */ 2509 */
@@ -2501,12 +2512,11 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
2501 DEBUG_LOCKS_WARN_ON(1); 2512 DEBUG_LOCKS_WARN_ON(1);
2502 return; 2513 return;
2503 } 2514 }
2504 lock->name = name;
2505 lock->key = key; 2515 lock->key = key;
2506 lock->class_cache = NULL; 2516
2507#ifdef CONFIG_LOCK_STAT 2517 if (unlikely(!debug_locks))
2508 lock->cpu = raw_smp_processor_id(); 2518 return;
2509#endif 2519
2510 if (subclass) 2520 if (subclass)
2511 register_lock_class(lock, subclass, 1); 2521 register_lock_class(lock, subclass, 1);
2512} 2522}
@@ -2540,6 +2550,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2540 debug_locks_off(); 2550 debug_locks_off();
2541 printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n"); 2551 printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n");
2542 printk("turning off the locking correctness validator.\n"); 2552 printk("turning off the locking correctness validator.\n");
2553 dump_stack();
2543 return 0; 2554 return 0;
2544 } 2555 }
2545 2556
@@ -2636,6 +2647,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2636 debug_locks_off(); 2647 debug_locks_off();
2637 printk("BUG: MAX_LOCK_DEPTH too low!\n"); 2648 printk("BUG: MAX_LOCK_DEPTH too low!\n");
2638 printk("turning off the locking correctness validator.\n"); 2649 printk("turning off the locking correctness validator.\n");
2650 dump_stack();
2639 return 0; 2651 return 0;
2640 } 2652 }
2641 2653
@@ -2923,6 +2935,8 @@ void lock_set_class(struct lockdep_map *lock, const char *name,
2923} 2935}
2924EXPORT_SYMBOL_GPL(lock_set_class); 2936EXPORT_SYMBOL_GPL(lock_set_class);
2925 2937
2938DEFINE_TRACE(lock_acquire);
2939
2926/* 2940/*
2927 * We are not always called with irqs disabled - do that here, 2941 * We are not always called with irqs disabled - do that here,
2928 * and also avoid lockdep recursion: 2942 * and also avoid lockdep recursion:
@@ -2933,6 +2947,8 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2933{ 2947{
2934 unsigned long flags; 2948 unsigned long flags;
2935 2949
2950 trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
2951
2936 if (unlikely(current->lockdep_recursion)) 2952 if (unlikely(current->lockdep_recursion))
2937 return; 2953 return;
2938 2954
@@ -2947,11 +2963,15 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2947} 2963}
2948EXPORT_SYMBOL_GPL(lock_acquire); 2964EXPORT_SYMBOL_GPL(lock_acquire);
2949 2965
2966DEFINE_TRACE(lock_release);
2967
2950void lock_release(struct lockdep_map *lock, int nested, 2968void lock_release(struct lockdep_map *lock, int nested,
2951 unsigned long ip) 2969 unsigned long ip)
2952{ 2970{
2953 unsigned long flags; 2971 unsigned long flags;
2954 2972
2973 trace_lock_release(lock, nested, ip);
2974
2955 if (unlikely(current->lockdep_recursion)) 2975 if (unlikely(current->lockdep_recursion))
2956 return; 2976 return;
2957 2977
@@ -3100,10 +3120,14 @@ found_it:
3100 lock->ip = ip; 3120 lock->ip = ip;
3101} 3121}
3102 3122
3123DEFINE_TRACE(lock_contended);
3124
3103void lock_contended(struct lockdep_map *lock, unsigned long ip) 3125void lock_contended(struct lockdep_map *lock, unsigned long ip)
3104{ 3126{
3105 unsigned long flags; 3127 unsigned long flags;
3106 3128
3129 trace_lock_contended(lock, ip);
3130
3107 if (unlikely(!lock_stat)) 3131 if (unlikely(!lock_stat))
3108 return; 3132 return;
3109 3133
@@ -3119,10 +3143,14 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
3119} 3143}
3120EXPORT_SYMBOL_GPL(lock_contended); 3144EXPORT_SYMBOL_GPL(lock_contended);
3121 3145
3146DEFINE_TRACE(lock_acquired);
3147
3122void lock_acquired(struct lockdep_map *lock, unsigned long ip) 3148void lock_acquired(struct lockdep_map *lock, unsigned long ip)
3123{ 3149{
3124 unsigned long flags; 3150 unsigned long flags;
3125 3151
3152 trace_lock_acquired(lock, ip);
3153
3126 if (unlikely(!lock_stat)) 3154 if (unlikely(!lock_stat))
3127 return; 3155 return;
3128 3156
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index a2cc7e9a6e84..699a2ac3a0d7 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -54,9 +54,9 @@ enum {
54 * table (if it's not there yet), and we check it for lock order 54 * table (if it's not there yet), and we check it for lock order
55 * conflicts and deadlocks. 55 * conflicts and deadlocks.
56 */ 56 */
57#define MAX_LOCKDEP_ENTRIES 8192UL 57#define MAX_LOCKDEP_ENTRIES 16384UL
58 58
59#define MAX_LOCKDEP_CHAINS_BITS 14 59#define MAX_LOCKDEP_CHAINS_BITS 15
60#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) 60#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS)
61 61
62#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) 62#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5)
diff --git a/kernel/module.c b/kernel/module.c
index f77ac320d0b5..e797812a4d95 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -68,7 +68,8 @@
68 68
69/* List of modules, protected by module_mutex or preempt_disable 69/* List of modules, protected by module_mutex or preempt_disable
70 * (delete uses stop_machine/add uses RCU list operations). */ 70 * (delete uses stop_machine/add uses RCU list operations). */
71static DEFINE_MUTEX(module_mutex); 71DEFINE_MUTEX(module_mutex);
72EXPORT_SYMBOL_GPL(module_mutex);
72static LIST_HEAD(modules); 73static LIST_HEAD(modules);
73 74
74/* Waiting for a module to finish initializing? */ 75/* Waiting for a module to finish initializing? */
@@ -76,7 +77,7 @@ static DECLARE_WAIT_QUEUE_HEAD(module_wq);
76 77
77static BLOCKING_NOTIFIER_HEAD(module_notify_list); 78static BLOCKING_NOTIFIER_HEAD(module_notify_list);
78 79
79/* Bounds of module allocation, for speeding __module_text_address */ 80/* Bounds of module allocation, for speeding __module_address */
80static unsigned long module_addr_min = -1UL, module_addr_max = 0; 81static unsigned long module_addr_min = -1UL, module_addr_max = 0;
81 82
82int register_module_notifier(struct notifier_block * nb) 83int register_module_notifier(struct notifier_block * nb)
@@ -186,17 +187,6 @@ extern const unsigned long __start___kcrctab_unused_gpl[];
186#define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL) 187#define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL)
187#endif 188#endif
188 189
189struct symsearch {
190 const struct kernel_symbol *start, *stop;
191 const unsigned long *crcs;
192 enum {
193 NOT_GPL_ONLY,
194 GPL_ONLY,
195 WILL_BE_GPL_ONLY,
196 } licence;
197 bool unused;
198};
199
200static bool each_symbol_in_section(const struct symsearch *arr, 190static bool each_symbol_in_section(const struct symsearch *arr,
201 unsigned int arrsize, 191 unsigned int arrsize,
202 struct module *owner, 192 struct module *owner,
@@ -217,10 +207,8 @@ static bool each_symbol_in_section(const struct symsearch *arr,
217} 207}
218 208
219/* Returns true as soon as fn returns true, otherwise false. */ 209/* Returns true as soon as fn returns true, otherwise false. */
220static bool each_symbol(bool (*fn)(const struct symsearch *arr, 210bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner,
221 struct module *owner, 211 unsigned int symnum, void *data), void *data)
222 unsigned int symnum, void *data),
223 void *data)
224{ 212{
225 struct module *mod; 213 struct module *mod;
226 const struct symsearch arr[] = { 214 const struct symsearch arr[] = {
@@ -273,6 +261,7 @@ static bool each_symbol(bool (*fn)(const struct symsearch *arr,
273 } 261 }
274 return false; 262 return false;
275} 263}
264EXPORT_SYMBOL_GPL(each_symbol);
276 265
277struct find_symbol_arg { 266struct find_symbol_arg {
278 /* Input */ 267 /* Input */
@@ -283,7 +272,7 @@ struct find_symbol_arg {
283 /* Output */ 272 /* Output */
284 struct module *owner; 273 struct module *owner;
285 const unsigned long *crc; 274 const unsigned long *crc;
286 unsigned long value; 275 const struct kernel_symbol *sym;
287}; 276};
288 277
289static bool find_symbol_in_section(const struct symsearch *syms, 278static bool find_symbol_in_section(const struct symsearch *syms,
@@ -324,17 +313,17 @@ static bool find_symbol_in_section(const struct symsearch *syms,
324 313
325 fsa->owner = owner; 314 fsa->owner = owner;
326 fsa->crc = symversion(syms->crcs, symnum); 315 fsa->crc = symversion(syms->crcs, symnum);
327 fsa->value = syms->start[symnum].value; 316 fsa->sym = &syms->start[symnum];
328 return true; 317 return true;
329} 318}
330 319
331/* Find a symbol, return value, (optional) crc and (optional) module 320/* Find a symbol and return it, along with, (optional) crc and
332 * which owns it */ 321 * (optional) module which owns it */
333static unsigned long find_symbol(const char *name, 322const struct kernel_symbol *find_symbol(const char *name,
334 struct module **owner, 323 struct module **owner,
335 const unsigned long **crc, 324 const unsigned long **crc,
336 bool gplok, 325 bool gplok,
337 bool warn) 326 bool warn)
338{ 327{
339 struct find_symbol_arg fsa; 328 struct find_symbol_arg fsa;
340 329
@@ -347,15 +336,16 @@ static unsigned long find_symbol(const char *name,
347 *owner = fsa.owner; 336 *owner = fsa.owner;
348 if (crc) 337 if (crc)
349 *crc = fsa.crc; 338 *crc = fsa.crc;
350 return fsa.value; 339 return fsa.sym;
351 } 340 }
352 341
353 DEBUGP("Failed to find symbol %s\n", name); 342 DEBUGP("Failed to find symbol %s\n", name);
354 return -ENOENT; 343 return NULL;
355} 344}
345EXPORT_SYMBOL_GPL(find_symbol);
356 346
357/* Search for module by name: must hold module_mutex. */ 347/* Search for module by name: must hold module_mutex. */
358static struct module *find_module(const char *name) 348struct module *find_module(const char *name)
359{ 349{
360 struct module *mod; 350 struct module *mod;
361 351
@@ -365,6 +355,7 @@ static struct module *find_module(const char *name)
365 } 355 }
366 return NULL; 356 return NULL;
367} 357}
358EXPORT_SYMBOL_GPL(find_module);
368 359
369#ifdef CONFIG_SMP 360#ifdef CONFIG_SMP
370 361
@@ -641,7 +632,7 @@ static int already_uses(struct module *a, struct module *b)
641} 632}
642 633
643/* Module a uses b */ 634/* Module a uses b */
644static int use_module(struct module *a, struct module *b) 635int use_module(struct module *a, struct module *b)
645{ 636{
646 struct module_use *use; 637 struct module_use *use;
647 int no_warn, err; 638 int no_warn, err;
@@ -674,6 +665,7 @@ static int use_module(struct module *a, struct module *b)
674 no_warn = sysfs_create_link(b->holders_dir, &a->mkobj.kobj, a->name); 665 no_warn = sysfs_create_link(b->holders_dir, &a->mkobj.kobj, a->name);
675 return 1; 666 return 1;
676} 667}
668EXPORT_SYMBOL_GPL(use_module);
677 669
678/* Clear the unload stuff of the module. */ 670/* Clear the unload stuff of the module. */
679static void module_unload_free(struct module *mod) 671static void module_unload_free(struct module *mod)
@@ -894,7 +886,7 @@ void __symbol_put(const char *symbol)
894 struct module *owner; 886 struct module *owner;
895 887
896 preempt_disable(); 888 preempt_disable();
897 if (IS_ERR_VALUE(find_symbol(symbol, &owner, NULL, true, false))) 889 if (!find_symbol(symbol, &owner, NULL, true, false))
898 BUG(); 890 BUG();
899 module_put(owner); 891 module_put(owner);
900 preempt_enable(); 892 preempt_enable();
@@ -908,8 +900,10 @@ void symbol_put_addr(void *addr)
908 if (core_kernel_text((unsigned long)addr)) 900 if (core_kernel_text((unsigned long)addr))
909 return; 901 return;
910 902
911 if (!(modaddr = module_text_address((unsigned long)addr))) 903 /* module_text_address is safe here: we're supposed to have reference
912 BUG(); 904 * to module from symbol_get, so it can't go away. */
905 modaddr = __module_text_address((unsigned long)addr);
906 BUG_ON(!modaddr);
913 module_put(modaddr); 907 module_put(modaddr);
914} 908}
915EXPORT_SYMBOL_GPL(symbol_put_addr); 909EXPORT_SYMBOL_GPL(symbol_put_addr);
@@ -949,10 +943,11 @@ static inline void module_unload_free(struct module *mod)
949{ 943{
950} 944}
951 945
952static inline int use_module(struct module *a, struct module *b) 946int use_module(struct module *a, struct module *b)
953{ 947{
954 return strong_try_module_get(b) == 0; 948 return strong_try_module_get(b) == 0;
955} 949}
950EXPORT_SYMBOL_GPL(use_module);
956 951
957static inline void module_unload_init(struct module *mod) 952static inline void module_unload_init(struct module *mod)
958{ 953{
@@ -995,12 +990,12 @@ static struct module_attribute *modinfo_attrs[] = {
995 990
996static const char vermagic[] = VERMAGIC_STRING; 991static const char vermagic[] = VERMAGIC_STRING;
997 992
998static int try_to_force_load(struct module *mod, const char *symname) 993static int try_to_force_load(struct module *mod, const char *reason)
999{ 994{
1000#ifdef CONFIG_MODULE_FORCE_LOAD 995#ifdef CONFIG_MODULE_FORCE_LOAD
1001 if (!test_taint(TAINT_FORCED_MODULE)) 996 if (!test_taint(TAINT_FORCED_MODULE))
1002 printk("%s: no version for \"%s\" found: kernel tainted.\n", 997 printk(KERN_WARNING "%s: %s: kernel tainted.\n",
1003 mod->name, symname); 998 mod->name, reason);
1004 add_taint_module(mod, TAINT_FORCED_MODULE); 999 add_taint_module(mod, TAINT_FORCED_MODULE);
1005 return 0; 1000 return 0;
1006#else 1001#else
@@ -1057,9 +1052,9 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
1057{ 1052{
1058 const unsigned long *crc; 1053 const unsigned long *crc;
1059 1054
1060 if (IS_ERR_VALUE(find_symbol("struct_module", NULL, &crc, true, false))) 1055 if (!find_symbol("module_layout", NULL, &crc, true, false))
1061 BUG(); 1056 BUG();
1062 return check_version(sechdrs, versindex, "struct_module", mod, crc); 1057 return check_version(sechdrs, versindex, "module_layout", mod, crc);
1063} 1058}
1064 1059
1065/* First part is kernel version, which we ignore if module has crcs. */ 1060/* First part is kernel version, which we ignore if module has crcs. */
@@ -1098,25 +1093,25 @@ static inline int same_magic(const char *amagic, const char *bmagic,
1098 1093
1099/* Resolve a symbol for this module. I.e. if we find one, record usage. 1094/* Resolve a symbol for this module. I.e. if we find one, record usage.
1100 Must be holding module_mutex. */ 1095 Must be holding module_mutex. */
1101static unsigned long resolve_symbol(Elf_Shdr *sechdrs, 1096static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs,
1102 unsigned int versindex, 1097 unsigned int versindex,
1103 const char *name, 1098 const char *name,
1104 struct module *mod) 1099 struct module *mod)
1105{ 1100{
1106 struct module *owner; 1101 struct module *owner;
1107 unsigned long ret; 1102 const struct kernel_symbol *sym;
1108 const unsigned long *crc; 1103 const unsigned long *crc;
1109 1104
1110 ret = find_symbol(name, &owner, &crc, 1105 sym = find_symbol(name, &owner, &crc,
1111 !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true); 1106 !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true);
1112 if (!IS_ERR_VALUE(ret)) { 1107 /* use_module can fail due to OOM,
1113 /* use_module can fail due to OOM, 1108 or module initialization or unloading */
1114 or module initialization or unloading */ 1109 if (sym) {
1115 if (!check_version(sechdrs, versindex, name, mod, crc) || 1110 if (!check_version(sechdrs, versindex, name, mod, crc) ||
1116 !use_module(mod, owner)) 1111 !use_module(mod, owner))
1117 ret = -EINVAL; 1112 sym = NULL;
1118 } 1113 }
1119 return ret; 1114 return sym;
1120} 1115}
1121 1116
1122/* 1117/*
@@ -1491,6 +1486,9 @@ static void free_module(struct module *mod)
1491 /* Module unload stuff */ 1486 /* Module unload stuff */
1492 module_unload_free(mod); 1487 module_unload_free(mod);
1493 1488
1489 /* Free any allocated parameters. */
1490 destroy_params(mod->kp, mod->num_kp);
1491
1494 /* release any pointers to mcount in this module */ 1492 /* release any pointers to mcount in this module */
1495 ftrace_release(mod->module_core, mod->core_size); 1493 ftrace_release(mod->module_core, mod->core_size);
1496 1494
@@ -1513,17 +1511,15 @@ static void free_module(struct module *mod)
1513void *__symbol_get(const char *symbol) 1511void *__symbol_get(const char *symbol)
1514{ 1512{
1515 struct module *owner; 1513 struct module *owner;
1516 unsigned long value; 1514 const struct kernel_symbol *sym;
1517 1515
1518 preempt_disable(); 1516 preempt_disable();
1519 value = find_symbol(symbol, &owner, NULL, true, true); 1517 sym = find_symbol(symbol, &owner, NULL, true, true);
1520 if (IS_ERR_VALUE(value)) 1518 if (sym && strong_try_module_get(owner))
1521 value = 0; 1519 sym = NULL;
1522 else if (strong_try_module_get(owner))
1523 value = 0;
1524 preempt_enable(); 1520 preempt_enable();
1525 1521
1526 return (void *)value; 1522 return sym ? (void *)sym->value : NULL;
1527} 1523}
1528EXPORT_SYMBOL_GPL(__symbol_get); 1524EXPORT_SYMBOL_GPL(__symbol_get);
1529 1525
@@ -1551,8 +1547,7 @@ static int verify_export_symbols(struct module *mod)
1551 1547
1552 for (i = 0; i < ARRAY_SIZE(arr); i++) { 1548 for (i = 0; i < ARRAY_SIZE(arr); i++) {
1553 for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) { 1549 for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) {
1554 if (!IS_ERR_VALUE(find_symbol(s->name, &owner, 1550 if (find_symbol(s->name, &owner, NULL, true, false)) {
1555 NULL, true, false))) {
1556 printk(KERN_ERR 1551 printk(KERN_ERR
1557 "%s: exports duplicate symbol %s" 1552 "%s: exports duplicate symbol %s"
1558 " (owned by %s)\n", 1553 " (owned by %s)\n",
@@ -1576,6 +1571,7 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
1576 unsigned long secbase; 1571 unsigned long secbase;
1577 unsigned int i, n = sechdrs[symindex].sh_size / sizeof(Elf_Sym); 1572 unsigned int i, n = sechdrs[symindex].sh_size / sizeof(Elf_Sym);
1578 int ret = 0; 1573 int ret = 0;
1574 const struct kernel_symbol *ksym;
1579 1575
1580 for (i = 1; i < n; i++) { 1576 for (i = 1; i < n; i++) {
1581 switch (sym[i].st_shndx) { 1577 switch (sym[i].st_shndx) {
@@ -1595,13 +1591,14 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
1595 break; 1591 break;
1596 1592
1597 case SHN_UNDEF: 1593 case SHN_UNDEF:
1598 sym[i].st_value 1594 ksym = resolve_symbol(sechdrs, versindex,
1599 = resolve_symbol(sechdrs, versindex, 1595 strtab + sym[i].st_name, mod);
1600 strtab + sym[i].st_name, mod);
1601
1602 /* Ok if resolved. */ 1596 /* Ok if resolved. */
1603 if (!IS_ERR_VALUE(sym[i].st_value)) 1597 if (ksym) {
1598 sym[i].st_value = ksym->value;
1604 break; 1599 break;
1600 }
1601
1605 /* Ok if weak. */ 1602 /* Ok if weak. */
1606 if (ELF_ST_BIND(sym[i].st_info) == STB_WEAK) 1603 if (ELF_ST_BIND(sym[i].st_info) == STB_WEAK)
1607 break; 1604 break;
@@ -1676,8 +1673,7 @@ static void layout_sections(struct module *mod,
1676 if ((s->sh_flags & masks[m][0]) != masks[m][0] 1673 if ((s->sh_flags & masks[m][0]) != masks[m][0]
1677 || (s->sh_flags & masks[m][1]) 1674 || (s->sh_flags & masks[m][1])
1678 || s->sh_entsize != ~0UL 1675 || s->sh_entsize != ~0UL
1679 || strncmp(secstrings + s->sh_name, 1676 || strstarts(secstrings + s->sh_name, ".init"))
1680 ".init", 5) == 0)
1681 continue; 1677 continue;
1682 s->sh_entsize = get_offset(mod, &mod->core_size, s, i); 1678 s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
1683 DEBUGP("\t%s\n", secstrings + s->sh_name); 1679 DEBUGP("\t%s\n", secstrings + s->sh_name);
@@ -1694,8 +1690,7 @@ static void layout_sections(struct module *mod,
1694 if ((s->sh_flags & masks[m][0]) != masks[m][0] 1690 if ((s->sh_flags & masks[m][0]) != masks[m][0]
1695 || (s->sh_flags & masks[m][1]) 1691 || (s->sh_flags & masks[m][1])
1696 || s->sh_entsize != ~0UL 1692 || s->sh_entsize != ~0UL
1697 || strncmp(secstrings + s->sh_name, 1693 || !strstarts(secstrings + s->sh_name, ".init"))
1698 ".init", 5) != 0)
1699 continue; 1694 continue;
1700 s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) 1695 s->sh_entsize = (get_offset(mod, &mod->init_size, s, i)
1701 | INIT_OFFSET_MASK); 1696 | INIT_OFFSET_MASK);
@@ -1828,8 +1823,7 @@ static char elf_type(const Elf_Sym *sym,
1828 else 1823 else
1829 return 'b'; 1824 return 'b';
1830 } 1825 }
1831 if (strncmp(secstrings + sechdrs[sym->st_shndx].sh_name, 1826 if (strstarts(secstrings + sechdrs[sym->st_shndx].sh_name, ".debug"))
1832 ".debug", strlen(".debug")) == 0)
1833 return 'n'; 1827 return 'n';
1834 return '?'; 1828 return '?';
1835} 1829}
@@ -1898,8 +1892,7 @@ static noinline struct module *load_module(void __user *umod,
1898 unsigned int symindex = 0; 1892 unsigned int symindex = 0;
1899 unsigned int strindex = 0; 1893 unsigned int strindex = 0;
1900 unsigned int modindex, versindex, infoindex, pcpuindex; 1894 unsigned int modindex, versindex, infoindex, pcpuindex;
1901 unsigned int num_kp, num_mcount; 1895 unsigned int num_mcount;
1902 struct kernel_param *kp;
1903 struct module *mod; 1896 struct module *mod;
1904 long err = 0; 1897 long err = 0;
1905 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ 1898 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
@@ -1916,12 +1909,6 @@ static noinline struct module *load_module(void __user *umod,
1916 if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL) 1909 if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL)
1917 return ERR_PTR(-ENOMEM); 1910 return ERR_PTR(-ENOMEM);
1918 1911
1919 /* Create stop_machine threads since the error path relies on
1920 * a non-failing stop_machine call. */
1921 err = stop_machine_create();
1922 if (err)
1923 goto free_hdr;
1924
1925 if (copy_from_user(hdr, umod, len) != 0) { 1912 if (copy_from_user(hdr, umod, len) != 0) {
1926 err = -EFAULT; 1913 err = -EFAULT;
1927 goto free_hdr; 1914 goto free_hdr;
@@ -1962,7 +1949,7 @@ static noinline struct module *load_module(void __user *umod,
1962 } 1949 }
1963#ifndef CONFIG_MODULE_UNLOAD 1950#ifndef CONFIG_MODULE_UNLOAD
1964 /* Don't load .exit sections */ 1951 /* Don't load .exit sections */
1965 if (strncmp(secstrings+sechdrs[i].sh_name, ".exit", 5) == 0) 1952 if (strstarts(secstrings+sechdrs[i].sh_name, ".exit"))
1966 sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC; 1953 sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC;
1967#endif 1954#endif
1968 } 1955 }
@@ -2006,7 +1993,7 @@ static noinline struct module *load_module(void __user *umod,
2006 modmagic = get_modinfo(sechdrs, infoindex, "vermagic"); 1993 modmagic = get_modinfo(sechdrs, infoindex, "vermagic");
2007 /* This is allowed: modprobe --force will invalidate it. */ 1994 /* This is allowed: modprobe --force will invalidate it. */
2008 if (!modmagic) { 1995 if (!modmagic) {
2009 err = try_to_force_load(mod, "magic"); 1996 err = try_to_force_load(mod, "bad vermagic");
2010 if (err) 1997 if (err)
2011 goto free_hdr; 1998 goto free_hdr;
2012 } else if (!same_magic(modmagic, vermagic, versindex)) { 1999 } else if (!same_magic(modmagic, vermagic, versindex)) {
@@ -2144,8 +2131,8 @@ static noinline struct module *load_module(void __user *umod,
2144 2131
2145 /* Now we've got everything in the final locations, we can 2132 /* Now we've got everything in the final locations, we can
2146 * find optional sections. */ 2133 * find optional sections. */
2147 kp = section_objs(hdr, sechdrs, secstrings, "__param", sizeof(*kp), 2134 mod->kp = section_objs(hdr, sechdrs, secstrings, "__param",
2148 &num_kp); 2135 sizeof(*mod->kp), &mod->num_kp);
2149 mod->syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab", 2136 mod->syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab",
2150 sizeof(*mod->syms), &mod->num_syms); 2137 sizeof(*mod->syms), &mod->num_syms);
2151 mod->crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab"); 2138 mod->crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab");
@@ -2195,8 +2182,8 @@ static noinline struct module *load_module(void __user *umod,
2195 || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs) 2182 || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs)
2196#endif 2183#endif
2197 ) { 2184 ) {
2198 printk(KERN_WARNING "%s: No versions for exported symbols.\n", mod->name); 2185 err = try_to_force_load(mod,
2199 err = try_to_force_load(mod, "nocrc"); 2186 "no versions for exported symbols");
2200 if (err) 2187 if (err)
2201 goto cleanup; 2188 goto cleanup;
2202 } 2189 }
@@ -2291,11 +2278,11 @@ static noinline struct module *load_module(void __user *umod,
2291 */ 2278 */
2292 list_add_rcu(&mod->list, &modules); 2279 list_add_rcu(&mod->list, &modules);
2293 2280
2294 err = parse_args(mod->name, mod->args, kp, num_kp, NULL); 2281 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL);
2295 if (err < 0) 2282 if (err < 0)
2296 goto unlink; 2283 goto unlink;
2297 2284
2298 err = mod_sysfs_setup(mod, kp, num_kp); 2285 err = mod_sysfs_setup(mod, mod->kp, mod->num_kp);
2299 if (err < 0) 2286 if (err < 0)
2300 goto unlink; 2287 goto unlink;
2301 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2288 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
@@ -2304,12 +2291,13 @@ static noinline struct module *load_module(void __user *umod,
2304 /* Get rid of temporary copy */ 2291 /* Get rid of temporary copy */
2305 vfree(hdr); 2292 vfree(hdr);
2306 2293
2307 stop_machine_destroy();
2308 /* Done! */ 2294 /* Done! */
2309 return mod; 2295 return mod;
2310 2296
2311 unlink: 2297 unlink:
2312 stop_machine(__unlink_module, mod, NULL); 2298 /* Unlink carefully: kallsyms could be walking list. */
2299 list_del_rcu(&mod->list);
2300 synchronize_sched();
2313 module_arch_cleanup(mod); 2301 module_arch_cleanup(mod);
2314 cleanup: 2302 cleanup:
2315 kobject_del(&mod->mkobj.kobj); 2303 kobject_del(&mod->mkobj.kobj);
@@ -2317,8 +2305,8 @@ static noinline struct module *load_module(void __user *umod,
2317 ftrace_release(mod->module_core, mod->core_size); 2305 ftrace_release(mod->module_core, mod->core_size);
2318 free_unload: 2306 free_unload:
2319 module_unload_free(mod); 2307 module_unload_free(mod);
2320 free_init:
2321#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) 2308#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
2309 free_init:
2322 percpu_modfree(mod->refptr); 2310 percpu_modfree(mod->refptr);
2323#endif 2311#endif
2324 module_free(mod, mod->module_init); 2312 module_free(mod, mod->module_init);
@@ -2332,7 +2320,6 @@ static noinline struct module *load_module(void __user *umod,
2332 kfree(args); 2320 kfree(args);
2333 free_hdr: 2321 free_hdr:
2334 vfree(hdr); 2322 vfree(hdr);
2335 stop_machine_destroy();
2336 return ERR_PTR(err); 2323 return ERR_PTR(err);
2337 2324
2338 truncated: 2325 truncated:
@@ -2401,6 +2388,9 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2401 blocking_notifier_call_chain(&module_notify_list, 2388 blocking_notifier_call_chain(&module_notify_list,
2402 MODULE_STATE_LIVE, mod); 2389 MODULE_STATE_LIVE, mod);
2403 2390
2391 /* We need to finish all async code before the module init sequence is done */
2392 async_synchronize_full();
2393
2404 mutex_lock(&module_mutex); 2394 mutex_lock(&module_mutex);
2405 /* Drop initial reference. */ 2395 /* Drop initial reference. */
2406 module_put(mod); 2396 module_put(mod);
@@ -2609,6 +2599,25 @@ unsigned long module_kallsyms_lookup_name(const char *name)
2609 preempt_enable(); 2599 preempt_enable();
2610 return ret; 2600 return ret;
2611} 2601}
2602
2603int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
2604 struct module *, unsigned long),
2605 void *data)
2606{
2607 struct module *mod;
2608 unsigned int i;
2609 int ret;
2610
2611 list_for_each_entry(mod, &modules, list) {
2612 for (i = 0; i < mod->num_symtab; i++) {
2613 ret = fn(data, mod->strtab + mod->symtab[i].st_name,
2614 mod, mod->symtab[i].st_value);
2615 if (ret != 0)
2616 return ret;
2617 }
2618 }
2619 return 0;
2620}
2612#endif /* CONFIG_KALLSYMS */ 2621#endif /* CONFIG_KALLSYMS */
2613 2622
2614static char *module_flags(struct module *mod, char *buf) 2623static char *module_flags(struct module *mod, char *buf)
@@ -2744,29 +2753,31 @@ const struct exception_table_entry *search_module_extables(unsigned long addr)
2744} 2753}
2745 2754
2746/* 2755/*
2747 * Is this a valid module address? 2756 * is_module_address - is this address inside a module?
2757 * @addr: the address to check.
2758 *
2759 * See is_module_text_address() if you simply want to see if the address
2760 * is code (not data).
2748 */ 2761 */
2749int is_module_address(unsigned long addr) 2762bool is_module_address(unsigned long addr)
2750{ 2763{
2751 struct module *mod; 2764 bool ret;
2752 2765
2753 preempt_disable(); 2766 preempt_disable();
2754 2767 ret = __module_address(addr) != NULL;
2755 list_for_each_entry_rcu(mod, &modules, list) {
2756 if (within_module_core(addr, mod)) {
2757 preempt_enable();
2758 return 1;
2759 }
2760 }
2761
2762 preempt_enable(); 2768 preempt_enable();
2763 2769
2764 return 0; 2770 return ret;
2765} 2771}
2766 2772
2767 2773/*
2768/* Is this a valid kernel address? */ 2774 * __module_address - get the module which contains an address.
2769__notrace_funcgraph struct module *__module_text_address(unsigned long addr) 2775 * @addr: the address.
2776 *
2777 * Must be called with preempt disabled or module mutex held so that
2778 * module doesn't get freed during this.
2779 */
2780struct module *__module_address(unsigned long addr)
2770{ 2781{
2771 struct module *mod; 2782 struct module *mod;
2772 2783
@@ -2774,22 +2785,51 @@ __notrace_funcgraph struct module *__module_text_address(unsigned long addr)
2774 return NULL; 2785 return NULL;
2775 2786
2776 list_for_each_entry_rcu(mod, &modules, list) 2787 list_for_each_entry_rcu(mod, &modules, list)
2777 if (within(addr, mod->module_init, mod->init_text_size) 2788 if (within_module_core(addr, mod)
2778 || within(addr, mod->module_core, mod->core_text_size)) 2789 || within_module_init(addr, mod))
2779 return mod; 2790 return mod;
2780 return NULL; 2791 return NULL;
2781} 2792}
2793EXPORT_SYMBOL_GPL(__module_address);
2782 2794
2783struct module *module_text_address(unsigned long addr) 2795/*
2796 * is_module_text_address - is this address inside module code?
2797 * @addr: the address to check.
2798 *
2799 * See is_module_address() if you simply want to see if the address is
2800 * anywhere in a module. See kernel_text_address() for testing if an
2801 * address corresponds to kernel or module code.
2802 */
2803bool is_module_text_address(unsigned long addr)
2784{ 2804{
2785 struct module *mod; 2805 bool ret;
2786 2806
2787 preempt_disable(); 2807 preempt_disable();
2788 mod = __module_text_address(addr); 2808 ret = __module_text_address(addr) != NULL;
2789 preempt_enable(); 2809 preempt_enable();
2790 2810
2811 return ret;
2812}
2813
2814/*
2815 * __module_text_address - get the module whose code contains an address.
2816 * @addr: the address.
2817 *
2818 * Must be called with preempt disabled or module mutex held so that
2819 * module doesn't get freed during this.
2820 */
2821struct module *__module_text_address(unsigned long addr)
2822{
2823 struct module *mod = __module_address(addr);
2824 if (mod) {
2825 /* Make sure it's within the text section. */
2826 if (!within(addr, mod->module_init, mod->init_text_size)
2827 && !within(addr, mod->module_core, mod->core_text_size))
2828 mod = NULL;
2829 }
2791 return mod; 2830 return mod;
2792} 2831}
2832EXPORT_SYMBOL_GPL(__module_text_address);
2793 2833
2794/* Don't grab lock, we're oopsing. */ 2834/* Don't grab lock, we're oopsing. */
2795void print_modules(void) 2835void print_modules(void)
@@ -2809,9 +2849,17 @@ void print_modules(void)
2809} 2849}
2810 2850
2811#ifdef CONFIG_MODVERSIONS 2851#ifdef CONFIG_MODVERSIONS
2812/* Generate the signature for struct module here, too, for modversions. */ 2852/* Generate the signature for all relevant module structures here.
2813void struct_module(struct module *mod) { return; } 2853 * If these change, we don't want to try to parse the module. */
2814EXPORT_SYMBOL(struct_module); 2854void module_layout(struct module *mod,
2855 struct modversion_info *ver,
2856 struct kernel_param *kp,
2857 struct kernel_symbol *ks,
2858 struct marker *marker,
2859 struct tracepoint *tp)
2860{
2861}
2862EXPORT_SYMBOL(module_layout);
2815#endif 2863#endif
2816 2864
2817#ifdef CONFIG_MARKERS 2865#ifdef CONFIG_MARKERS
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 5d79781394a3..507cf2b5e9f1 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -148,7 +148,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
148 148
149 preempt_disable(); 149 preempt_disable();
150 mutex_acquire(&lock->dep_map, subclass, 0, ip); 150 mutex_acquire(&lock->dep_map, subclass, 0, ip);
151#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES) 151#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES) && \
152 !defined(CONFIG_HAVE_DEFAULT_NO_SPIN_MUTEXES)
152 /* 153 /*
153 * Optimistic spinning. 154 * Optimistic spinning.
154 * 155 *
diff --git a/kernel/panic.c b/kernel/panic.c
index 3fd8c5bf8b39..984b3ecbd72c 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -213,8 +213,16 @@ unsigned long get_taint(void)
213 213
214void add_taint(unsigned flag) 214void add_taint(unsigned flag)
215{ 215{
216 /* can't trust the integrity of the kernel anymore: */ 216 /*
217 debug_locks = 0; 217 * Can't trust the integrity of the kernel anymore.
218 * We don't call directly debug_locks_off() because the issue
219 * is not necessarily serious enough to set oops_in_progress to 1
220 * Also we want to keep up lockdep for staging development and
221 * post-warning case.
222 */
223 if (flag != TAINT_CRAP && flag != TAINT_WARN && __debug_locks_off())
224 printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n");
225
218 set_bit(flag, &tainted_mask); 226 set_bit(flag, &tainted_mask);
219} 227}
220EXPORT_SYMBOL(add_taint); 228EXPORT_SYMBOL(add_taint);
@@ -332,34 +340,46 @@ void oops_exit(void)
332} 340}
333 341
334#ifdef WANT_WARN_ON_SLOWPATH 342#ifdef WANT_WARN_ON_SLOWPATH
335void warn_slowpath(const char *file, int line, const char *fmt, ...) 343struct slowpath_args {
336{ 344 const char *fmt;
337 va_list args; 345 va_list args;
338 char function[KSYM_SYMBOL_LEN]; 346};
339 unsigned long caller = (unsigned long)__builtin_return_address(0);
340 const char *board;
341 347
342 sprint_symbol(function, caller); 348static void warn_slowpath_common(const char *file, int line, void *caller, struct slowpath_args *args)
349{
350 const char *board;
343 351
344 printk(KERN_WARNING "------------[ cut here ]------------\n"); 352 printk(KERN_WARNING "------------[ cut here ]------------\n");
345 printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file, 353 printk(KERN_WARNING "WARNING: at %s:%d %pS()\n", file, line, caller);
346 line, function);
347 board = dmi_get_system_info(DMI_PRODUCT_NAME); 354 board = dmi_get_system_info(DMI_PRODUCT_NAME);
348 if (board) 355 if (board)
349 printk(KERN_WARNING "Hardware name: %s\n", board); 356 printk(KERN_WARNING "Hardware name: %s\n", board);
350 357
351 if (fmt) { 358 if (args)
352 va_start(args, fmt); 359 vprintk(args->fmt, args->args);
353 vprintk(fmt, args);
354 va_end(args);
355 }
356 360
357 print_modules(); 361 print_modules();
358 dump_stack(); 362 dump_stack();
359 print_oops_end_marker(); 363 print_oops_end_marker();
360 add_taint(TAINT_WARN); 364 add_taint(TAINT_WARN);
361} 365}
362EXPORT_SYMBOL(warn_slowpath); 366
367void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...)
368{
369 struct slowpath_args args;
370
371 args.fmt = fmt;
372 va_start(args.args, fmt);
373 warn_slowpath_common(file, line, __builtin_return_address(0), &args);
374 va_end(args.args);
375}
376EXPORT_SYMBOL(warn_slowpath_fmt);
377
378void warn_slowpath_null(const char *file, int line)
379{
380 warn_slowpath_common(file, line, __builtin_return_address(0), NULL);
381}
382EXPORT_SYMBOL(warn_slowpath_null);
363#endif 383#endif
364 384
365#ifdef CONFIG_CC_STACKPROTECTOR 385#ifdef CONFIG_CC_STACKPROTECTOR
diff --git a/kernel/params.c b/kernel/params.c
index a1e3025b19a9..de273ec85bd2 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -24,6 +24,9 @@
24#include <linux/err.h> 24#include <linux/err.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26 26
27/* We abuse the high bits of "perm" to record whether we kmalloc'ed. */
28#define KPARAM_KMALLOCED 0x80000000
29
27#if 0 30#if 0
28#define DEBUGP printk 31#define DEBUGP printk
29#else 32#else
@@ -217,7 +220,19 @@ int param_set_charp(const char *val, struct kernel_param *kp)
217 return -ENOSPC; 220 return -ENOSPC;
218 } 221 }
219 222
220 *(char **)kp->arg = (char *)val; 223 if (kp->perm & KPARAM_KMALLOCED)
224 kfree(*(char **)kp->arg);
225
226 /* This is a hack. We can't need to strdup in early boot, and we
227 * don't need to; this mangled commandline is preserved. */
228 if (slab_is_available()) {
229 kp->perm |= KPARAM_KMALLOCED;
230 *(char **)kp->arg = kstrdup(val, GFP_KERNEL);
231 if (!kp->arg)
232 return -ENOMEM;
233 } else
234 *(const char **)kp->arg = val;
235
221 return 0; 236 return 0;
222} 237}
223 238
@@ -571,6 +586,15 @@ void module_param_sysfs_remove(struct module *mod)
571} 586}
572#endif 587#endif
573 588
589void destroy_params(const struct kernel_param *params, unsigned num)
590{
591 unsigned int i;
592
593 for (i = 0; i < num; i++)
594 if (params[i].perm & KPARAM_KMALLOCED)
595 kfree(*(char **)params[i].arg);
596}
597
574static void __init kernel_add_sysfs_param(const char *name, 598static void __init kernel_add_sysfs_param(const char *name,
575 struct kernel_param *kparam, 599 struct kernel_param *kparam,
576 unsigned int name_skip) 600 unsigned int name_skip)
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 8e5d9a68b022..bece7c0b67b2 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -18,7 +18,7 @@ void update_rlimit_cpu(unsigned long rlim_new)
18 18
19 cputime = secs_to_cputime(rlim_new); 19 cputime = secs_to_cputime(rlim_new);
20 if (cputime_eq(current->signal->it_prof_expires, cputime_zero) || 20 if (cputime_eq(current->signal->it_prof_expires, cputime_zero) ||
21 cputime_lt(current->signal->it_prof_expires, cputime)) { 21 cputime_gt(current->signal->it_prof_expires, cputime)) {
22 spin_lock_irq(&current->sighand->siglock); 22 spin_lock_irq(&current->sighand->siglock);
23 set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); 23 set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
24 spin_unlock_irq(&current->sighand->siglock); 24 spin_unlock_irq(&current->sighand->siglock);
@@ -224,7 +224,7 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
224 cpu->cpu = virt_ticks(p); 224 cpu->cpu = virt_ticks(p);
225 break; 225 break;
226 case CPUCLOCK_SCHED: 226 case CPUCLOCK_SCHED:
227 cpu->sched = p->se.sum_exec_runtime + task_delta_exec(p); 227 cpu->sched = task_sched_runtime(p);
228 break; 228 break;
229 } 229 }
230 return 0; 230 return 0;
@@ -305,18 +305,19 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
305{ 305{
306 struct task_cputime cputime; 306 struct task_cputime cputime;
307 307
308 thread_group_cputime(p, &cputime);
309 switch (CPUCLOCK_WHICH(which_clock)) { 308 switch (CPUCLOCK_WHICH(which_clock)) {
310 default: 309 default:
311 return -EINVAL; 310 return -EINVAL;
312 case CPUCLOCK_PROF: 311 case CPUCLOCK_PROF:
312 thread_group_cputime(p, &cputime);
313 cpu->cpu = cputime_add(cputime.utime, cputime.stime); 313 cpu->cpu = cputime_add(cputime.utime, cputime.stime);
314 break; 314 break;
315 case CPUCLOCK_VIRT: 315 case CPUCLOCK_VIRT:
316 thread_group_cputime(p, &cputime);
316 cpu->cpu = cputime.utime; 317 cpu->cpu = cputime.utime;
317 break; 318 break;
318 case CPUCLOCK_SCHED: 319 case CPUCLOCK_SCHED:
319 cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p); 320 cpu->sched = thread_group_sched_runtime(p);
320 break; 321 break;
321 } 322 }
322 return 0; 323 return 0;
@@ -1419,19 +1420,19 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1419 * timer call will interfere. 1420 * timer call will interfere.
1420 */ 1421 */
1421 list_for_each_entry_safe(timer, next, &firing, it.cpu.entry) { 1422 list_for_each_entry_safe(timer, next, &firing, it.cpu.entry) {
1422 int firing; 1423 int cpu_firing;
1424
1423 spin_lock(&timer->it_lock); 1425 spin_lock(&timer->it_lock);
1424 list_del_init(&timer->it.cpu.entry); 1426 list_del_init(&timer->it.cpu.entry);
1425 firing = timer->it.cpu.firing; 1427 cpu_firing = timer->it.cpu.firing;
1426 timer->it.cpu.firing = 0; 1428 timer->it.cpu.firing = 0;
1427 /* 1429 /*
1428 * The firing flag is -1 if we collided with a reset 1430 * The firing flag is -1 if we collided with a reset
1429 * of the timer, which already reported this 1431 * of the timer, which already reported this
1430 * almost-firing as an overrun. So don't generate an event. 1432 * almost-firing as an overrun. So don't generate an event.
1431 */ 1433 */
1432 if (likely(firing >= 0)) { 1434 if (likely(cpu_firing >= 0))
1433 cpu_timer_fire(timer); 1435 cpu_timer_fire(timer);
1434 }
1435 spin_unlock(&timer->it_lock); 1436 spin_unlock(&timer->it_lock);
1436 } 1437 }
1437} 1438}
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 5f21ab2bbcdf..b0dc9e7a0d17 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -22,6 +22,7 @@
22#include <linux/console.h> 22#include <linux/console.h>
23#include <linux/cpu.h> 23#include <linux/cpu.h>
24#include <linux/freezer.h> 24#include <linux/freezer.h>
25#include <scsi/scsi_scan.h>
25#include <asm/suspend.h> 26#include <asm/suspend.h>
26 27
27#include "power.h" 28#include "power.h"
@@ -240,9 +241,9 @@ static int create_image(int platform_mode)
240 241
241 local_irq_disable(); 242 local_irq_disable();
242 243
243 sysdev_suspend(PMSG_FREEZE); 244 error = sysdev_suspend(PMSG_FREEZE);
244 if (error) { 245 if (error) {
245 printk(KERN_ERR "PM: Some devices failed to power down, " 246 printk(KERN_ERR "PM: Some system devices failed to power down, "
246 "aborting hibernation\n"); 247 "aborting hibernation\n");
247 goto Enable_irqs; 248 goto Enable_irqs;
248 } 249 }
@@ -655,32 +656,42 @@ static int software_resume(void)
655 * here to avoid lockdep complaining. 656 * here to avoid lockdep complaining.
656 */ 657 */
657 mutex_lock_nested(&pm_mutex, SINGLE_DEPTH_NESTING); 658 mutex_lock_nested(&pm_mutex, SINGLE_DEPTH_NESTING);
659
660 if (swsusp_resume_device)
661 goto Check_image;
662
663 if (!strlen(resume_file)) {
664 error = -ENOENT;
665 goto Unlock;
666 }
667
668 pr_debug("PM: Checking image partition %s\n", resume_file);
669
670 /* Check if the device is there */
671 swsusp_resume_device = name_to_dev_t(resume_file);
658 if (!swsusp_resume_device) { 672 if (!swsusp_resume_device) {
659 if (!strlen(resume_file)) {
660 mutex_unlock(&pm_mutex);
661 return -ENOENT;
662 }
663 /* 673 /*
664 * Some device discovery might still be in progress; we need 674 * Some device discovery might still be in progress; we need
665 * to wait for this to finish. 675 * to wait for this to finish.
666 */ 676 */
667 wait_for_device_probe(); 677 wait_for_device_probe();
678 /*
679 * We can't depend on SCSI devices being available after loading
680 * one of their modules until scsi_complete_async_scans() is
681 * called and the resume device usually is a SCSI one.
682 */
683 scsi_complete_async_scans();
684
668 swsusp_resume_device = name_to_dev_t(resume_file); 685 swsusp_resume_device = name_to_dev_t(resume_file);
669 pr_debug("PM: Resume from partition %s\n", resume_file); 686 if (!swsusp_resume_device) {
670 } else { 687 error = -ENODEV;
671 pr_debug("PM: Resume from partition %d:%d\n", 688 goto Unlock;
672 MAJOR(swsusp_resume_device), 689 }
673 MINOR(swsusp_resume_device));
674 } 690 }
675 691
676 if (noresume) { 692 Check_image:
677 /** 693 pr_debug("PM: Resume from partition %d:%d\n",
678 * FIXME: If noresume is specified, we need to find the 694 MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device));
679 * partition and reset it back to normal swap space.
680 */
681 mutex_unlock(&pm_mutex);
682 return 0;
683 }
684 695
685 pr_debug("PM: Checking hibernation image.\n"); 696 pr_debug("PM: Checking hibernation image.\n");
686 error = swsusp_check(); 697 error = swsusp_check();
diff --git a/kernel/power/main.c b/kernel/power/main.c
index f172f41858bb..f99ed6a75eac 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -291,20 +291,26 @@ static int suspend_enter(suspend_state_t state)
291 291
292 device_pm_lock(); 292 device_pm_lock();
293 293
294 if (suspend_ops->prepare) {
295 error = suspend_ops->prepare();
296 if (error)
297 goto Done;
298 }
299
294 error = device_power_down(PMSG_SUSPEND); 300 error = device_power_down(PMSG_SUSPEND);
295 if (error) { 301 if (error) {
296 printk(KERN_ERR "PM: Some devices failed to power down\n"); 302 printk(KERN_ERR "PM: Some devices failed to power down\n");
297 goto Done; 303 goto Platfrom_finish;
298 } 304 }
299 305
300 if (suspend_ops->prepare) { 306 if (suspend_ops->prepare_late) {
301 error = suspend_ops->prepare(); 307 error = suspend_ops->prepare_late();
302 if (error) 308 if (error)
303 goto Power_up_devices; 309 goto Power_up_devices;
304 } 310 }
305 311
306 if (suspend_test(TEST_PLATFORM)) 312 if (suspend_test(TEST_PLATFORM))
307 goto Platfrom_finish; 313 goto Platform_wake;
308 314
309 error = disable_nonboot_cpus(); 315 error = disable_nonboot_cpus();
310 if (error || suspend_test(TEST_CPUS)) 316 if (error || suspend_test(TEST_CPUS))
@@ -326,13 +332,17 @@ static int suspend_enter(suspend_state_t state)
326 Enable_cpus: 332 Enable_cpus:
327 enable_nonboot_cpus(); 333 enable_nonboot_cpus();
328 334
329 Platfrom_finish: 335 Platform_wake:
330 if (suspend_ops->finish) 336 if (suspend_ops->wake)
331 suspend_ops->finish(); 337 suspend_ops->wake();
332 338
333 Power_up_devices: 339 Power_up_devices:
334 device_power_up(PMSG_RESUME); 340 device_power_up(PMSG_RESUME);
335 341
342 Platfrom_finish:
343 if (suspend_ops->finish)
344 suspend_ops->finish();
345
336 Done: 346 Done:
337 device_pm_unlock(); 347 device_pm_unlock();
338 348
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 505f319e489c..8ba052c86d48 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -64,8 +64,6 @@ static int submit(int rw, pgoff_t page_off, struct page *page,
64 struct bio *bio; 64 struct bio *bio;
65 65
66 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); 66 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
67 if (!bio)
68 return -ENOMEM;
69 bio->bi_sector = page_off * (PAGE_SIZE >> 9); 67 bio->bi_sector = page_off * (PAGE_SIZE >> 9);
70 bio->bi_bdev = resume_bdev; 68 bio->bi_bdev = resume_bdev;
71 bio->bi_end_io = end_swap_bio_read; 69 bio->bi_end_io = end_swap_bio_read;
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 6c85359364f2..ed97375daae9 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -24,6 +24,7 @@
24#include <linux/cpu.h> 24#include <linux/cpu.h>
25#include <linux/freezer.h> 25#include <linux/freezer.h>
26#include <linux/smp_lock.h> 26#include <linux/smp_lock.h>
27#include <scsi/scsi_scan.h>
27 28
28#include <asm/uaccess.h> 29#include <asm/uaccess.h>
29 30
@@ -92,6 +93,7 @@ static int snapshot_open(struct inode *inode, struct file *filp)
92 filp->private_data = data; 93 filp->private_data = data;
93 memset(&data->handle, 0, sizeof(struct snapshot_handle)); 94 memset(&data->handle, 0, sizeof(struct snapshot_handle));
94 if ((filp->f_flags & O_ACCMODE) == O_RDONLY) { 95 if ((filp->f_flags & O_ACCMODE) == O_RDONLY) {
96 /* Hibernating. The image device should be accessible. */
95 data->swap = swsusp_resume_device ? 97 data->swap = swsusp_resume_device ?
96 swap_type_of(swsusp_resume_device, 0, NULL) : -1; 98 swap_type_of(swsusp_resume_device, 0, NULL) : -1;
97 data->mode = O_RDONLY; 99 data->mode = O_RDONLY;
@@ -99,6 +101,13 @@ static int snapshot_open(struct inode *inode, struct file *filp)
99 if (error) 101 if (error)
100 pm_notifier_call_chain(PM_POST_HIBERNATION); 102 pm_notifier_call_chain(PM_POST_HIBERNATION);
101 } else { 103 } else {
104 /*
105 * Resuming. We may need to wait for the image device to
106 * appear.
107 */
108 wait_for_device_probe();
109 scsi_complete_async_scans();
110
102 data->swap = -1; 111 data->swap = -1;
103 data->mode = O_WRONLY; 112 data->mode = O_WRONLY;
104 error = pm_notifier_call_chain(PM_RESTORE_PREPARE); 113 error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
diff --git a/kernel/printk.c b/kernel/printk.c
index a5f61a9acedb..5052b5497c67 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1311,8 +1311,11 @@ EXPORT_SYMBOL(printk_ratelimit);
1311bool printk_timed_ratelimit(unsigned long *caller_jiffies, 1311bool printk_timed_ratelimit(unsigned long *caller_jiffies,
1312 unsigned int interval_msecs) 1312 unsigned int interval_msecs)
1313{ 1313{
1314 if (*caller_jiffies == 0 || time_after(jiffies, *caller_jiffies)) { 1314 if (*caller_jiffies == 0
1315 *caller_jiffies = jiffies + msecs_to_jiffies(interval_msecs); 1315 || !time_in_range(jiffies, *caller_jiffies,
1316 *caller_jiffies
1317 + msecs_to_jiffies(interval_msecs))) {
1318 *caller_jiffies = jiffies;
1316 return true; 1319 return true;
1317 } 1320 }
1318 return false; 1321 return false;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index aaad0ec34194..0692ab5a0d67 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -21,9 +21,7 @@
21#include <linux/audit.h> 21#include <linux/audit.h>
22#include <linux/pid_namespace.h> 22#include <linux/pid_namespace.h>
23#include <linux/syscalls.h> 23#include <linux/syscalls.h>
24 24#include <linux/uaccess.h>
25#include <asm/pgtable.h>
26#include <asm/uaccess.h>
27 25
28 26
29/* 27/*
@@ -48,7 +46,7 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
48 list_add(&child->ptrace_entry, &new_parent->ptraced); 46 list_add(&child->ptrace_entry, &new_parent->ptraced);
49 child->parent = new_parent; 47 child->parent = new_parent;
50} 48}
51 49
52/* 50/*
53 * Turn a tracing stop into a normal stop now, since with no tracer there 51 * Turn a tracing stop into a normal stop now, since with no tracer there
54 * would be no way to wake it up with SIGCONT or SIGKILL. If there was a 52 * would be no way to wake it up with SIGCONT or SIGKILL. If there was a
@@ -173,7 +171,7 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
173 task_lock(task); 171 task_lock(task);
174 err = __ptrace_may_access(task, mode); 172 err = __ptrace_may_access(task, mode);
175 task_unlock(task); 173 task_unlock(task);
176 return (!err ? true : false); 174 return !err;
177} 175}
178 176
179int ptrace_attach(struct task_struct *task) 177int ptrace_attach(struct task_struct *task)
@@ -190,7 +188,7 @@ int ptrace_attach(struct task_struct *task)
190 /* Protect exec's credential calculations against our interference; 188 /* Protect exec's credential calculations against our interference;
191 * SUID, SGID and LSM creds get determined differently under ptrace. 189 * SUID, SGID and LSM creds get determined differently under ptrace.
192 */ 190 */
193 retval = mutex_lock_interruptible(&current->cred_exec_mutex); 191 retval = mutex_lock_interruptible(&task->cred_exec_mutex);
194 if (retval < 0) 192 if (retval < 0)
195 goto out; 193 goto out;
196 194
@@ -234,7 +232,7 @@ repeat:
234bad: 232bad:
235 write_unlock_irqrestore(&tasklist_lock, flags); 233 write_unlock_irqrestore(&tasklist_lock, flags);
236 task_unlock(task); 234 task_unlock(task);
237 mutex_unlock(&current->cred_exec_mutex); 235 mutex_unlock(&task->cred_exec_mutex);
238out: 236out:
239 return retval; 237 return retval;
240} 238}
@@ -358,7 +356,7 @@ int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst
358 copied += retval; 356 copied += retval;
359 src += retval; 357 src += retval;
360 dst += retval; 358 dst += retval;
361 len -= retval; 359 len -= retval;
362 } 360 }
363 return copied; 361 return copied;
364} 362}
@@ -383,7 +381,7 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
383 copied += retval; 381 copied += retval;
384 src += retval; 382 src += retval;
385 dst += retval; 383 dst += retval;
386 len -= retval; 384 len -= retval;
387 } 385 }
388 return copied; 386 return copied;
389} 387}
@@ -496,9 +494,9 @@ static int ptrace_resume(struct task_struct *child, long request, long data)
496 if (unlikely(!arch_has_single_step())) 494 if (unlikely(!arch_has_single_step()))
497 return -EIO; 495 return -EIO;
498 user_enable_single_step(child); 496 user_enable_single_step(child);
499 } 497 } else {
500 else
501 user_disable_single_step(child); 498 user_disable_single_step(child);
499 }
502 500
503 child->exit_code = data; 501 child->exit_code = data;
504 wake_up_process(child); 502 wake_up_process(child);
@@ -606,10 +604,11 @@ repeat:
606 ret = security_ptrace_traceme(current->parent); 604 ret = security_ptrace_traceme(current->parent);
607 605
608 /* 606 /*
609 * Set the ptrace bit in the process ptrace flags. 607 * Check PF_EXITING to ensure ->real_parent has not passed
610 * Then link us on our parent's ptraced list. 608 * exit_ptrace(). Otherwise we don't report the error but
609 * pretend ->real_parent untraces us right after return.
611 */ 610 */
612 if (!ret) { 611 if (!ret && !(current->real_parent->flags & PF_EXITING)) {
613 current->ptrace |= PT_PTRACED; 612 current->ptrace |= PT_PTRACED;
614 __ptrace_link(current, current->real_parent); 613 __ptrace_link(current, current->real_parent);
615 } 614 }
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index 654c640a6b9c..0f2b0b311304 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -65,6 +65,7 @@ static struct rcu_ctrlblk rcu_ctrlblk = {
65 .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), 65 .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
66 .cpumask = CPU_BITS_NONE, 66 .cpumask = CPU_BITS_NONE,
67}; 67};
68
68static struct rcu_ctrlblk rcu_bh_ctrlblk = { 69static struct rcu_ctrlblk rcu_bh_ctrlblk = {
69 .cur = -300, 70 .cur = -300,
70 .completed = -300, 71 .completed = -300,
@@ -73,8 +74,26 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = {
73 .cpumask = CPU_BITS_NONE, 74 .cpumask = CPU_BITS_NONE,
74}; 75};
75 76
76DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; 77static DEFINE_PER_CPU(struct rcu_data, rcu_data);
77DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; 78static DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
79
80/*
81 * Increment the quiescent state counter.
82 * The counter is a bit degenerated: We do not need to know
83 * how many quiescent states passed, just if there was at least
84 * one since the start of the grace period. Thus just a flag.
85 */
86void rcu_qsctr_inc(int cpu)
87{
88 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
89 rdp->passed_quiesc = 1;
90}
91
92void rcu_bh_qsctr_inc(int cpu)
93{
94 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
95 rdp->passed_quiesc = 1;
96}
78 97
79static int blimit = 10; 98static int blimit = 10;
80static int qhimark = 10000; 99static int qhimark = 10000;
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 2c7b8457d0d2..a967c9feb90a 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -58,6 +58,10 @@ static DEFINE_MUTEX(rcu_barrier_mutex);
58static struct completion rcu_barrier_completion; 58static struct completion rcu_barrier_completion;
59int rcu_scheduler_active __read_mostly; 59int rcu_scheduler_active __read_mostly;
60 60
61static atomic_t rcu_migrate_type_count = ATOMIC_INIT(0);
62static struct rcu_head rcu_migrate_head[3];
63static DECLARE_WAIT_QUEUE_HEAD(rcu_migrate_wq);
64
61/* 65/*
62 * Awaken the corresponding synchronize_rcu() instance now that a 66 * Awaken the corresponding synchronize_rcu() instance now that a
63 * grace period has elapsed. 67 * grace period has elapsed.
@@ -122,7 +126,10 @@ static void rcu_barrier_func(void *type)
122 } 126 }
123} 127}
124 128
125static inline void wait_migrated_callbacks(void); 129static inline void wait_migrated_callbacks(void)
130{
131 wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count));
132}
126 133
127/* 134/*
128 * Orchestrate the specified type of RCU barrier, waiting for all 135 * Orchestrate the specified type of RCU barrier, waiting for all
@@ -179,21 +186,12 @@ void rcu_barrier_sched(void)
179} 186}
180EXPORT_SYMBOL_GPL(rcu_barrier_sched); 187EXPORT_SYMBOL_GPL(rcu_barrier_sched);
181 188
182static atomic_t rcu_migrate_type_count = ATOMIC_INIT(0);
183static struct rcu_head rcu_migrate_head[3];
184static DECLARE_WAIT_QUEUE_HEAD(rcu_migrate_wq);
185
186static void rcu_migrate_callback(struct rcu_head *notused) 189static void rcu_migrate_callback(struct rcu_head *notused)
187{ 190{
188 if (atomic_dec_and_test(&rcu_migrate_type_count)) 191 if (atomic_dec_and_test(&rcu_migrate_type_count))
189 wake_up(&rcu_migrate_wq); 192 wake_up(&rcu_migrate_wq);
190} 193}
191 194
192static inline void wait_migrated_callbacks(void)
193{
194 wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count));
195}
196
197static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self, 195static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
198 unsigned long action, void *hcpu) 196 unsigned long action, void *hcpu)
199{ 197{
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 5d59e850fb71..ce97a4df64d3 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -147,7 +147,51 @@ struct rcu_ctrlblk {
147 wait_queue_head_t sched_wq; /* Place for rcu_sched to sleep. */ 147 wait_queue_head_t sched_wq; /* Place for rcu_sched to sleep. */
148}; 148};
149 149
150struct rcu_dyntick_sched {
151 int dynticks;
152 int dynticks_snap;
153 int sched_qs;
154 int sched_qs_snap;
155 int sched_dynticks_snap;
156};
157
158static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = {
159 .dynticks = 1,
160};
161
162void rcu_qsctr_inc(int cpu)
163{
164 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
165
166 rdssp->sched_qs++;
167}
168
169#ifdef CONFIG_NO_HZ
170
171void rcu_enter_nohz(void)
172{
173 static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
174
175 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
176 __get_cpu_var(rcu_dyntick_sched).dynticks++;
177 WARN_ON_RATELIMIT(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1, &rs);
178}
179
180void rcu_exit_nohz(void)
181{
182 static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
183
184 __get_cpu_var(rcu_dyntick_sched).dynticks++;
185 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
186 WARN_ON_RATELIMIT(!(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1),
187 &rs);
188}
189
190#endif /* CONFIG_NO_HZ */
191
192
150static DEFINE_PER_CPU(struct rcu_data, rcu_data); 193static DEFINE_PER_CPU(struct rcu_data, rcu_data);
194
151static struct rcu_ctrlblk rcu_ctrlblk = { 195static struct rcu_ctrlblk rcu_ctrlblk = {
152 .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock), 196 .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock),
153 .completed = 0, 197 .completed = 0,
@@ -427,10 +471,6 @@ static void __rcu_advance_callbacks(struct rcu_data *rdp)
427 } 471 }
428} 472}
429 473
430DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = {
431 .dynticks = 1,
432};
433
434#ifdef CONFIG_NO_HZ 474#ifdef CONFIG_NO_HZ
435static DEFINE_PER_CPU(int, rcu_update_flag); 475static DEFINE_PER_CPU(int, rcu_update_flag);
436 476
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 97ce31579ec0..d2a372fb0b9b 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -78,6 +78,26 @@ DEFINE_PER_CPU(struct rcu_data, rcu_data);
78struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 78struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
79DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 79DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
80 80
81/*
82 * Increment the quiescent state counter.
83 * The counter is a bit degenerated: We do not need to know
84 * how many quiescent states passed, just if there was at least
85 * one since the start of the grace period. Thus just a flag.
86 */
87void rcu_qsctr_inc(int cpu)
88{
89 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
90 rdp->passed_quiesc = 1;
91 rdp->passed_quiesc_completed = rdp->completed;
92}
93
94void rcu_bh_qsctr_inc(int cpu)
95{
96 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
97 rdp->passed_quiesc = 1;
98 rdp->passed_quiesc_completed = rdp->completed;
99}
100
81#ifdef CONFIG_NO_HZ 101#ifdef CONFIG_NO_HZ
82DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 102DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
83 .dynticks_nesting = 1, 103 .dynticks_nesting = 1,
@@ -510,8 +530,6 @@ static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
510 rdp->qs_pending = 1; 530 rdp->qs_pending = 1;
511 rdp->passed_quiesc = 0; 531 rdp->passed_quiesc = 0;
512 rdp->gpnum = rsp->gpnum; 532 rdp->gpnum = rsp->gpnum;
513 rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending +
514 RCU_JIFFIES_TILL_FORCE_QS;
515} 533}
516 534
517/* 535/*
@@ -558,8 +576,6 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
558 rsp->gpnum++; 576 rsp->gpnum++;
559 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ 577 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */
560 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 578 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
561 rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending +
562 RCU_JIFFIES_TILL_FORCE_QS;
563 record_gp_stall_check_time(rsp); 579 record_gp_stall_check_time(rsp);
564 dyntick_record_completed(rsp, rsp->completed - 1); 580 dyntick_record_completed(rsp, rsp->completed - 1);
565 note_new_gpnum(rsp, rdp); 581 note_new_gpnum(rsp, rdp);
@@ -1035,7 +1051,6 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1035{ 1051{
1036 unsigned long flags; 1052 unsigned long flags;
1037 long lastcomp; 1053 long lastcomp;
1038 struct rcu_data *rdp = rsp->rda[smp_processor_id()];
1039 struct rcu_node *rnp = rcu_get_root(rsp); 1054 struct rcu_node *rnp = rcu_get_root(rsp);
1040 u8 signaled; 1055 u8 signaled;
1041 1056
@@ -1046,16 +1061,13 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1046 return; /* Someone else is already on the job. */ 1061 return; /* Someone else is already on the job. */
1047 } 1062 }
1048 if (relaxed && 1063 if (relaxed &&
1049 (long)(rsp->jiffies_force_qs - jiffies) >= 0 && 1064 (long)(rsp->jiffies_force_qs - jiffies) >= 0)
1050 (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) >= 0)
1051 goto unlock_ret; /* no emergency and done recently. */ 1065 goto unlock_ret; /* no emergency and done recently. */
1052 rsp->n_force_qs++; 1066 rsp->n_force_qs++;
1053 spin_lock(&rnp->lock); 1067 spin_lock(&rnp->lock);
1054 lastcomp = rsp->completed; 1068 lastcomp = rsp->completed;
1055 signaled = rsp->signaled; 1069 signaled = rsp->signaled;
1056 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 1070 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
1057 rdp->n_rcu_pending_force_qs = rdp->n_rcu_pending +
1058 RCU_JIFFIES_TILL_FORCE_QS;
1059 if (lastcomp == rsp->gpnum) { 1071 if (lastcomp == rsp->gpnum) {
1060 rsp->n_force_qs_ngp++; 1072 rsp->n_force_qs_ngp++;
1061 spin_unlock(&rnp->lock); 1073 spin_unlock(&rnp->lock);
@@ -1124,8 +1136,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1124 * If an RCU GP has gone long enough, go check for dyntick 1136 * If an RCU GP has gone long enough, go check for dyntick
1125 * idle CPUs and, if needed, send resched IPIs. 1137 * idle CPUs and, if needed, send resched IPIs.
1126 */ 1138 */
1127 if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 || 1139 if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)
1128 (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0)
1129 force_quiescent_state(rsp, 1); 1140 force_quiescent_state(rsp, 1);
1130 1141
1131 /* 1142 /*
@@ -1210,8 +1221,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1210 if (unlikely(++rdp->qlen > qhimark)) { 1221 if (unlikely(++rdp->qlen > qhimark)) {
1211 rdp->blimit = LONG_MAX; 1222 rdp->blimit = LONG_MAX;
1212 force_quiescent_state(rsp, 0); 1223 force_quiescent_state(rsp, 0);
1213 } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 || 1224 } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)
1214 (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0)
1215 force_quiescent_state(rsp, 1); 1225 force_quiescent_state(rsp, 1);
1216 local_irq_restore(flags); 1226 local_irq_restore(flags);
1217} 1227}
@@ -1270,8 +1280,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1270 1280
1271 /* Has an RCU GP gone long enough to send resched IPIs &c? */ 1281 /* Has an RCU GP gone long enough to send resched IPIs &c? */
1272 if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) && 1282 if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) &&
1273 ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0 || 1283 ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0))
1274 (rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending) < 0))
1275 return 1; 1284 return 1;
1276 1285
1277 /* nothing to do */ 1286 /* nothing to do */
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
new file mode 100644
index 000000000000..5e872bbf07f5
--- /dev/null
+++ b/kernel/rcutree.h
@@ -0,0 +1,10 @@
1
2/*
3 * RCU implementation internal declarations:
4 */
5extern struct rcu_state rcu_state;
6DECLARE_PER_CPU(struct rcu_data, rcu_data);
7
8extern struct rcu_state rcu_bh_state;
9DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
10
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index d6db3e837826..4b1875ba9404 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -43,18 +43,18 @@
43#include <linux/debugfs.h> 43#include <linux/debugfs.h>
44#include <linux/seq_file.h> 44#include <linux/seq_file.h>
45 45
46#include "rcutree.h"
47
46static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) 48static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
47{ 49{
48 if (!rdp->beenonline) 50 if (!rdp->beenonline)
49 return; 51 return;
50 seq_printf(m, "%3d%cc=%ld g=%ld pq=%d pqc=%ld qp=%d rpfq=%ld rp=%x", 52 seq_printf(m, "%3d%cc=%ld g=%ld pq=%d pqc=%ld qp=%d",
51 rdp->cpu, 53 rdp->cpu,
52 cpu_is_offline(rdp->cpu) ? '!' : ' ', 54 cpu_is_offline(rdp->cpu) ? '!' : ' ',
53 rdp->completed, rdp->gpnum, 55 rdp->completed, rdp->gpnum,
54 rdp->passed_quiesc, rdp->passed_quiesc_completed, 56 rdp->passed_quiesc, rdp->passed_quiesc_completed,
55 rdp->qs_pending, 57 rdp->qs_pending);
56 rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending,
57 (int)(rdp->n_rcu_pending & 0xffff));
58#ifdef CONFIG_NO_HZ 58#ifdef CONFIG_NO_HZ
59 seq_printf(m, " dt=%d/%d dn=%d df=%lu", 59 seq_printf(m, " dt=%d/%d dn=%d df=%lu",
60 rdp->dynticks->dynticks, 60 rdp->dynticks->dynticks,
@@ -100,14 +100,12 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
100{ 100{
101 if (!rdp->beenonline) 101 if (!rdp->beenonline)
102 return; 102 return;
103 seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d,%ld,%ld", 103 seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d",
104 rdp->cpu, 104 rdp->cpu,
105 cpu_is_offline(rdp->cpu) ? "\"Y\"" : "\"N\"", 105 cpu_is_offline(rdp->cpu) ? "\"Y\"" : "\"N\"",
106 rdp->completed, rdp->gpnum, 106 rdp->completed, rdp->gpnum,
107 rdp->passed_quiesc, rdp->passed_quiesc_completed, 107 rdp->passed_quiesc, rdp->passed_quiesc_completed,
108 rdp->qs_pending, 108 rdp->qs_pending);
109 rdp->n_rcu_pending_force_qs - rdp->n_rcu_pending,
110 rdp->n_rcu_pending);
111#ifdef CONFIG_NO_HZ 109#ifdef CONFIG_NO_HZ
112 seq_printf(m, ",%d,%d,%d,%lu", 110 seq_printf(m, ",%d,%d,%d,%lu",
113 rdp->dynticks->dynticks, 111 rdp->dynticks->dynticks,
@@ -121,7 +119,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
121 119
122static int show_rcudata_csv(struct seq_file *m, void *unused) 120static int show_rcudata_csv(struct seq_file *m, void *unused)
123{ 121{
124 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\",\"rpfq\",\"rp\","); 122 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\",");
125#ifdef CONFIG_NO_HZ 123#ifdef CONFIG_NO_HZ
126 seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); 124 seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\",");
127#endif /* #ifdef CONFIG_NO_HZ */ 125#endif /* #ifdef CONFIG_NO_HZ */
diff --git a/kernel/relay.c b/kernel/relay.c
index e92db8c06acf..bc188549788f 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -677,9 +677,7 @@ int relay_late_setup_files(struct rchan *chan,
677 */ 677 */
678 for_each_online_cpu(i) { 678 for_each_online_cpu(i) {
679 if (unlikely(!chan->buf[i])) { 679 if (unlikely(!chan->buf[i])) {
680 printk(KERN_ERR "relay_late_setup_files: CPU %u " 680 WARN_ONCE(1, KERN_ERR "CPU has no buffer!\n");
681 "has no buffer, it must have!\n", i);
682 BUG();
683 err = -EINVAL; 681 err = -EINVAL;
684 break; 682 break;
685 } 683 }
diff --git a/kernel/resource.c b/kernel/resource.c
index fd5d7d574bb9..ac5f3a36923f 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -533,43 +533,21 @@ static void __init __reserve_region_with_split(struct resource *root,
533 res->end = end; 533 res->end = end;
534 res->flags = IORESOURCE_BUSY; 534 res->flags = IORESOURCE_BUSY;
535 535
536 for (;;) { 536 conflict = __request_resource(parent, res);
537 conflict = __request_resource(parent, res); 537 if (!conflict)
538 if (!conflict) 538 return;
539 break;
540 if (conflict != parent) {
541 parent = conflict;
542 if (!(conflict->flags & IORESOURCE_BUSY))
543 continue;
544 }
545
546 /* Uhhuh, that didn't work out.. */
547 kfree(res);
548 res = NULL;
549 break;
550 }
551
552 if (!res) {
553 /* failed, split and try again */
554
555 /* conflict covered whole area */
556 if (conflict->start <= start && conflict->end >= end)
557 return;
558 539
559 if (conflict->start > start) 540 /* failed, split and try again */
560 __reserve_region_with_split(root, start, conflict->start-1, name); 541 kfree(res);
561 if (!(conflict->flags & IORESOURCE_BUSY)) {
562 resource_size_t common_start, common_end;
563 542
564 common_start = max(conflict->start, start); 543 /* conflict covered whole area */
565 common_end = min(conflict->end, end); 544 if (conflict->start <= start && conflict->end >= end)
566 if (common_start < common_end) 545 return;
567 __reserve_region_with_split(root, common_start, common_end, name);
568 }
569 if (conflict->end < end)
570 __reserve_region_with_split(root, conflict->end+1, end, name);
571 }
572 546
547 if (conflict->start > start)
548 __reserve_region_with_split(root, start, conflict->start-1, name);
549 if (conflict->end < end)
550 __reserve_region_with_split(root, conflict->end+1, end, name);
573} 551}
574 552
575void __init reserve_region_with_split(struct resource *root, 553void __init reserve_region_with_split(struct resource *root,
diff --git a/kernel/sched.c b/kernel/sched.c
index 2325db2be31b..26efa475bdc1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -231,13 +231,20 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
231 231
232 spin_lock(&rt_b->rt_runtime_lock); 232 spin_lock(&rt_b->rt_runtime_lock);
233 for (;;) { 233 for (;;) {
234 unsigned long delta;
235 ktime_t soft, hard;
236
234 if (hrtimer_active(&rt_b->rt_period_timer)) 237 if (hrtimer_active(&rt_b->rt_period_timer))
235 break; 238 break;
236 239
237 now = hrtimer_cb_get_time(&rt_b->rt_period_timer); 240 now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
238 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); 241 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
239 hrtimer_start_expires(&rt_b->rt_period_timer, 242
240 HRTIMER_MODE_ABS); 243 soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
244 hard = hrtimer_get_expires(&rt_b->rt_period_timer);
245 delta = ktime_to_ns(ktime_sub(hard, soft));
246 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
247 HRTIMER_MODE_ABS, 0);
241 } 248 }
242 spin_unlock(&rt_b->rt_runtime_lock); 249 spin_unlock(&rt_b->rt_runtime_lock);
243} 250}
@@ -1146,7 +1153,8 @@ static __init void init_hrtick(void)
1146 */ 1153 */
1147static void hrtick_start(struct rq *rq, u64 delay) 1154static void hrtick_start(struct rq *rq, u64 delay)
1148{ 1155{
1149 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL); 1156 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
1157 HRTIMER_MODE_REL, 0);
1150} 1158}
1151 1159
1152static inline void init_hrtick(void) 1160static inline void init_hrtick(void)
@@ -1410,10 +1418,22 @@ iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1410 struct rq_iterator *iterator); 1418 struct rq_iterator *iterator);
1411#endif 1419#endif
1412 1420
1421/* Time spent by the tasks of the cpu accounting group executing in ... */
1422enum cpuacct_stat_index {
1423 CPUACCT_STAT_USER, /* ... user mode */
1424 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
1425
1426 CPUACCT_STAT_NSTATS,
1427};
1428
1413#ifdef CONFIG_CGROUP_CPUACCT 1429#ifdef CONFIG_CGROUP_CPUACCT
1414static void cpuacct_charge(struct task_struct *tsk, u64 cputime); 1430static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1431static void cpuacct_update_stats(struct task_struct *tsk,
1432 enum cpuacct_stat_index idx, cputime_t val);
1415#else 1433#else
1416static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} 1434static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1435static inline void cpuacct_update_stats(struct task_struct *tsk,
1436 enum cpuacct_stat_index idx, cputime_t val) {}
1417#endif 1437#endif
1418 1438
1419static inline void inc_cpu_load(struct rq *rq, unsigned long load) 1439static inline void inc_cpu_load(struct rq *rq, unsigned long load)
@@ -3818,19 +3838,23 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3818 */ 3838 */
3819#define MAX_PINNED_INTERVAL 512 3839#define MAX_PINNED_INTERVAL 512
3820 3840
3841/* Working cpumask for load_balance and load_balance_newidle. */
3842static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
3843
3821/* 3844/*
3822 * Check this_cpu to ensure it is balanced within domain. Attempt to move 3845 * Check this_cpu to ensure it is balanced within domain. Attempt to move
3823 * tasks if there is an imbalance. 3846 * tasks if there is an imbalance.
3824 */ 3847 */
3825static int load_balance(int this_cpu, struct rq *this_rq, 3848static int load_balance(int this_cpu, struct rq *this_rq,
3826 struct sched_domain *sd, enum cpu_idle_type idle, 3849 struct sched_domain *sd, enum cpu_idle_type idle,
3827 int *balance, struct cpumask *cpus) 3850 int *balance)
3828{ 3851{
3829 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 3852 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
3830 struct sched_group *group; 3853 struct sched_group *group;
3831 unsigned long imbalance; 3854 unsigned long imbalance;
3832 struct rq *busiest; 3855 struct rq *busiest;
3833 unsigned long flags; 3856 unsigned long flags;
3857 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
3834 3858
3835 cpumask_setall(cpus); 3859 cpumask_setall(cpus);
3836 3860
@@ -3985,8 +4009,7 @@ out:
3985 * this_rq is locked. 4009 * this_rq is locked.
3986 */ 4010 */
3987static int 4011static int
3988load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, 4012load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
3989 struct cpumask *cpus)
3990{ 4013{
3991 struct sched_group *group; 4014 struct sched_group *group;
3992 struct rq *busiest = NULL; 4015 struct rq *busiest = NULL;
@@ -3994,6 +4017,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
3994 int ld_moved = 0; 4017 int ld_moved = 0;
3995 int sd_idle = 0; 4018 int sd_idle = 0;
3996 int all_pinned = 0; 4019 int all_pinned = 0;
4020 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
3997 4021
3998 cpumask_setall(cpus); 4022 cpumask_setall(cpus);
3999 4023
@@ -4134,10 +4158,6 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
4134 struct sched_domain *sd; 4158 struct sched_domain *sd;
4135 int pulled_task = 0; 4159 int pulled_task = 0;
4136 unsigned long next_balance = jiffies + HZ; 4160 unsigned long next_balance = jiffies + HZ;
4137 cpumask_var_t tmpmask;
4138
4139 if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC))
4140 return;
4141 4161
4142 for_each_domain(this_cpu, sd) { 4162 for_each_domain(this_cpu, sd) {
4143 unsigned long interval; 4163 unsigned long interval;
@@ -4148,7 +4168,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
4148 if (sd->flags & SD_BALANCE_NEWIDLE) 4168 if (sd->flags & SD_BALANCE_NEWIDLE)
4149 /* If we've pulled tasks over stop searching: */ 4169 /* If we've pulled tasks over stop searching: */
4150 pulled_task = load_balance_newidle(this_cpu, this_rq, 4170 pulled_task = load_balance_newidle(this_cpu, this_rq,
4151 sd, tmpmask); 4171 sd);
4152 4172
4153 interval = msecs_to_jiffies(sd->balance_interval); 4173 interval = msecs_to_jiffies(sd->balance_interval);
4154 if (time_after(next_balance, sd->last_balance + interval)) 4174 if (time_after(next_balance, sd->last_balance + interval))
@@ -4163,7 +4183,6 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
4163 */ 4183 */
4164 this_rq->next_balance = next_balance; 4184 this_rq->next_balance = next_balance;
4165 } 4185 }
4166 free_cpumask_var(tmpmask);
4167} 4186}
4168 4187
4169/* 4188/*
@@ -4313,11 +4332,6 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4313 unsigned long next_balance = jiffies + 60*HZ; 4332 unsigned long next_balance = jiffies + 60*HZ;
4314 int update_next_balance = 0; 4333 int update_next_balance = 0;
4315 int need_serialize; 4334 int need_serialize;
4316 cpumask_var_t tmp;
4317
4318 /* Fails alloc? Rebalancing probably not a priority right now. */
4319 if (!alloc_cpumask_var(&tmp, GFP_ATOMIC))
4320 return;
4321 4335
4322 for_each_domain(cpu, sd) { 4336 for_each_domain(cpu, sd) {
4323 if (!(sd->flags & SD_LOAD_BALANCE)) 4337 if (!(sd->flags & SD_LOAD_BALANCE))
@@ -4342,7 +4356,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4342 } 4356 }
4343 4357
4344 if (time_after_eq(jiffies, sd->last_balance + interval)) { 4358 if (time_after_eq(jiffies, sd->last_balance + interval)) {
4345 if (load_balance(cpu, rq, sd, idle, &balance, tmp)) { 4359 if (load_balance(cpu, rq, sd, idle, &balance)) {
4346 /* 4360 /*
4347 * We've pulled tasks over so either we're no 4361 * We've pulled tasks over so either we're no
4348 * longer idle, or one of our SMT siblings is 4362 * longer idle, or one of our SMT siblings is
@@ -4376,8 +4390,6 @@ out:
4376 */ 4390 */
4377 if (likely(update_next_balance)) 4391 if (likely(update_next_balance))
4378 rq->next_balance = next_balance; 4392 rq->next_balance = next_balance;
4379
4380 free_cpumask_var(tmp);
4381} 4393}
4382 4394
4383/* 4395/*
@@ -4511,9 +4523,25 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
4511EXPORT_PER_CPU_SYMBOL(kstat); 4523EXPORT_PER_CPU_SYMBOL(kstat);
4512 4524
4513/* 4525/*
4514 * Return any ns on the sched_clock that have not yet been banked in 4526 * Return any ns on the sched_clock that have not yet been accounted in
4515 * @p in case that task is currently running. 4527 * @p in case that task is currently running.
4528 *
4529 * Called with task_rq_lock() held on @rq.
4516 */ 4530 */
4531static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
4532{
4533 u64 ns = 0;
4534
4535 if (task_current(rq, p)) {
4536 update_rq_clock(rq);
4537 ns = rq->clock - p->se.exec_start;
4538 if ((s64)ns < 0)
4539 ns = 0;
4540 }
4541
4542 return ns;
4543}
4544
4517unsigned long long task_delta_exec(struct task_struct *p) 4545unsigned long long task_delta_exec(struct task_struct *p)
4518{ 4546{
4519 unsigned long flags; 4547 unsigned long flags;
@@ -4521,16 +4549,49 @@ unsigned long long task_delta_exec(struct task_struct *p)
4521 u64 ns = 0; 4549 u64 ns = 0;
4522 4550
4523 rq = task_rq_lock(p, &flags); 4551 rq = task_rq_lock(p, &flags);
4552 ns = do_task_delta_exec(p, rq);
4553 task_rq_unlock(rq, &flags);
4524 4554
4525 if (task_current(rq, p)) { 4555 return ns;
4526 u64 delta_exec; 4556}
4527 4557
4528 update_rq_clock(rq); 4558/*
4529 delta_exec = rq->clock - p->se.exec_start; 4559 * Return accounted runtime for the task.
4530 if ((s64)delta_exec > 0) 4560 * In case the task is currently running, return the runtime plus current's
4531 ns = delta_exec; 4561 * pending runtime that have not been accounted yet.
4532 } 4562 */
4563unsigned long long task_sched_runtime(struct task_struct *p)
4564{
4565 unsigned long flags;
4566 struct rq *rq;
4567 u64 ns = 0;
4568
4569 rq = task_rq_lock(p, &flags);
4570 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
4571 task_rq_unlock(rq, &flags);
4572
4573 return ns;
4574}
4533 4575
4576/*
4577 * Return sum_exec_runtime for the thread group.
4578 * In case the task is currently running, return the sum plus current's
4579 * pending runtime that have not been accounted yet.
4580 *
4581 * Note that the thread group might have other running tasks as well,
4582 * so the return value not includes other pending runtime that other
4583 * running tasks might have.
4584 */
4585unsigned long long thread_group_sched_runtime(struct task_struct *p)
4586{
4587 struct task_cputime totals;
4588 unsigned long flags;
4589 struct rq *rq;
4590 u64 ns;
4591
4592 rq = task_rq_lock(p, &flags);
4593 thread_group_cputime(p, &totals);
4594 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
4534 task_rq_unlock(rq, &flags); 4595 task_rq_unlock(rq, &flags);
4535 4596
4536 return ns; 4597 return ns;
@@ -4559,6 +4620,8 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
4559 cpustat->nice = cputime64_add(cpustat->nice, tmp); 4620 cpustat->nice = cputime64_add(cpustat->nice, tmp);
4560 else 4621 else
4561 cpustat->user = cputime64_add(cpustat->user, tmp); 4622 cpustat->user = cputime64_add(cpustat->user, tmp);
4623
4624 cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
4562 /* Account for user time used */ 4625 /* Account for user time used */
4563 acct_update_integrals(p); 4626 acct_update_integrals(p);
4564} 4627}
@@ -4620,6 +4683,8 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
4620 else 4683 else
4621 cpustat->system = cputime64_add(cpustat->system, tmp); 4684 cpustat->system = cputime64_add(cpustat->system, tmp);
4622 4685
4686 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
4687
4623 /* Account for system time used */ 4688 /* Account for system time used */
4624 acct_update_integrals(p); 4689 acct_update_integrals(p);
4625} 4690}
@@ -4667,7 +4732,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
4667 4732
4668 if (user_tick) 4733 if (user_tick)
4669 account_user_time(p, one_jiffy, one_jiffy_scaled); 4734 account_user_time(p, one_jiffy, one_jiffy_scaled);
4670 else if (p != rq->idle) 4735 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
4671 account_system_time(p, HARDIRQ_OFFSET, one_jiffy, 4736 account_system_time(p, HARDIRQ_OFFSET, one_jiffy,
4672 one_jiffy_scaled); 4737 one_jiffy_scaled);
4673 else 4738 else
@@ -4781,10 +4846,7 @@ void scheduler_tick(void)
4781#endif 4846#endif
4782} 4847}
4783 4848
4784#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 4849notrace unsigned long get_parent_ip(unsigned long addr)
4785 defined(CONFIG_PREEMPT_TRACER))
4786
4787static inline unsigned long get_parent_ip(unsigned long addr)
4788{ 4850{
4789 if (in_lock_functions(addr)) { 4851 if (in_lock_functions(addr)) {
4790 addr = CALLER_ADDR2; 4852 addr = CALLER_ADDR2;
@@ -4794,6 +4856,9 @@ static inline unsigned long get_parent_ip(unsigned long addr)
4794 return addr; 4856 return addr;
4795} 4857}
4796 4858
4859#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
4860 defined(CONFIG_PREEMPT_TRACER))
4861
4797void __kprobes add_preempt_count(int val) 4862void __kprobes add_preempt_count(int val)
4798{ 4863{
4799#ifdef CONFIG_DEBUG_PREEMPT 4864#ifdef CONFIG_DEBUG_PREEMPT
@@ -7302,7 +7367,12 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
7302 cpumask_or(groupmask, groupmask, sched_group_cpus(group)); 7367 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
7303 7368
7304 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 7369 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
7370
7305 printk(KERN_CONT " %s", str); 7371 printk(KERN_CONT " %s", str);
7372 if (group->__cpu_power != SCHED_LOAD_SCALE) {
7373 printk(KERN_CONT " (__cpu_power = %d)",
7374 group->__cpu_power);
7375 }
7306 7376
7307 group = group->next; 7377 group = group->next;
7308 } while (group != sd->groups); 7378 } while (group != sd->groups);
@@ -7728,7 +7798,7 @@ cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
7728{ 7798{
7729 int group; 7799 int group;
7730 7800
7731 cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); 7801 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
7732 group = cpumask_first(mask); 7802 group = cpumask_first(mask);
7733 if (sg) 7803 if (sg)
7734 *sg = &per_cpu(sched_group_core, group).sg; 7804 *sg = &per_cpu(sched_group_core, group).sg;
@@ -7757,7 +7827,7 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
7757 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); 7827 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
7758 group = cpumask_first(mask); 7828 group = cpumask_first(mask);
7759#elif defined(CONFIG_SCHED_SMT) 7829#elif defined(CONFIG_SCHED_SMT)
7760 cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); 7830 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
7761 group = cpumask_first(mask); 7831 group = cpumask_first(mask);
7762#else 7832#else
7763 group = cpu; 7833 group = cpu;
@@ -8100,7 +8170,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
8100 SD_INIT(sd, SIBLING); 8170 SD_INIT(sd, SIBLING);
8101 set_domain_attribute(sd, attr); 8171 set_domain_attribute(sd, attr);
8102 cpumask_and(sched_domain_span(sd), 8172 cpumask_and(sched_domain_span(sd),
8103 &per_cpu(cpu_sibling_map, i), cpu_map); 8173 topology_thread_cpumask(i), cpu_map);
8104 sd->parent = p; 8174 sd->parent = p;
8105 p->child = sd; 8175 p->child = sd;
8106 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); 8176 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
@@ -8111,7 +8181,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
8111 /* Set up CPU (sibling) groups */ 8181 /* Set up CPU (sibling) groups */
8112 for_each_cpu(i, cpu_map) { 8182 for_each_cpu(i, cpu_map) {
8113 cpumask_and(this_sibling_map, 8183 cpumask_and(this_sibling_map,
8114 &per_cpu(cpu_sibling_map, i), cpu_map); 8184 topology_thread_cpumask(i), cpu_map);
8115 if (i != cpumask_first(this_sibling_map)) 8185 if (i != cpumask_first(this_sibling_map))
8116 continue; 8186 continue;
8117 8187
@@ -8787,6 +8857,9 @@ void __init sched_init(void)
8787#ifdef CONFIG_USER_SCHED 8857#ifdef CONFIG_USER_SCHED
8788 alloc_size *= 2; 8858 alloc_size *= 2;
8789#endif 8859#endif
8860#ifdef CONFIG_CPUMASK_OFFSTACK
8861 alloc_size += num_possible_cpus() * cpumask_size();
8862#endif
8790 /* 8863 /*
8791 * As sched_init() is called before page_alloc is setup, 8864 * As sched_init() is called before page_alloc is setup,
8792 * we use alloc_bootmem(). 8865 * we use alloc_bootmem().
@@ -8824,6 +8897,12 @@ void __init sched_init(void)
8824 ptr += nr_cpu_ids * sizeof(void **); 8897 ptr += nr_cpu_ids * sizeof(void **);
8825#endif /* CONFIG_USER_SCHED */ 8898#endif /* CONFIG_USER_SCHED */
8826#endif /* CONFIG_RT_GROUP_SCHED */ 8899#endif /* CONFIG_RT_GROUP_SCHED */
8900#ifdef CONFIG_CPUMASK_OFFSTACK
8901 for_each_possible_cpu(i) {
8902 per_cpu(load_balance_tmpmask, i) = (void *)ptr;
8903 ptr += cpumask_size();
8904 }
8905#endif /* CONFIG_CPUMASK_OFFSTACK */
8827 } 8906 }
8828 8907
8829#ifdef CONFIG_SMP 8908#ifdef CONFIG_SMP
@@ -9916,6 +9995,7 @@ struct cpuacct {
9916 struct cgroup_subsys_state css; 9995 struct cgroup_subsys_state css;
9917 /* cpuusage holds pointer to a u64-type object on every cpu */ 9996 /* cpuusage holds pointer to a u64-type object on every cpu */
9918 u64 *cpuusage; 9997 u64 *cpuusage;
9998 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
9919 struct cpuacct *parent; 9999 struct cpuacct *parent;
9920}; 10000};
9921 10001
@@ -9940,20 +10020,32 @@ static struct cgroup_subsys_state *cpuacct_create(
9940 struct cgroup_subsys *ss, struct cgroup *cgrp) 10020 struct cgroup_subsys *ss, struct cgroup *cgrp)
9941{ 10021{
9942 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); 10022 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
10023 int i;
9943 10024
9944 if (!ca) 10025 if (!ca)
9945 return ERR_PTR(-ENOMEM); 10026 goto out;
9946 10027
9947 ca->cpuusage = alloc_percpu(u64); 10028 ca->cpuusage = alloc_percpu(u64);
9948 if (!ca->cpuusage) { 10029 if (!ca->cpuusage)
9949 kfree(ca); 10030 goto out_free_ca;
9950 return ERR_PTR(-ENOMEM); 10031
9951 } 10032 for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
10033 if (percpu_counter_init(&ca->cpustat[i], 0))
10034 goto out_free_counters;
9952 10035
9953 if (cgrp->parent) 10036 if (cgrp->parent)
9954 ca->parent = cgroup_ca(cgrp->parent); 10037 ca->parent = cgroup_ca(cgrp->parent);
9955 10038
9956 return &ca->css; 10039 return &ca->css;
10040
10041out_free_counters:
10042 while (--i >= 0)
10043 percpu_counter_destroy(&ca->cpustat[i]);
10044 free_percpu(ca->cpuusage);
10045out_free_ca:
10046 kfree(ca);
10047out:
10048 return ERR_PTR(-ENOMEM);
9957} 10049}
9958 10050
9959/* destroy an existing cpu accounting group */ 10051/* destroy an existing cpu accounting group */
@@ -9961,7 +10053,10 @@ static void
9961cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) 10053cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
9962{ 10054{
9963 struct cpuacct *ca = cgroup_ca(cgrp); 10055 struct cpuacct *ca = cgroup_ca(cgrp);
10056 int i;
9964 10057
10058 for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
10059 percpu_counter_destroy(&ca->cpustat[i]);
9965 free_percpu(ca->cpuusage); 10060 free_percpu(ca->cpuusage);
9966 kfree(ca); 10061 kfree(ca);
9967} 10062}
@@ -10048,6 +10143,25 @@ static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
10048 return 0; 10143 return 0;
10049} 10144}
10050 10145
10146static const char *cpuacct_stat_desc[] = {
10147 [CPUACCT_STAT_USER] = "user",
10148 [CPUACCT_STAT_SYSTEM] = "system",
10149};
10150
10151static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
10152 struct cgroup_map_cb *cb)
10153{
10154 struct cpuacct *ca = cgroup_ca(cgrp);
10155 int i;
10156
10157 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
10158 s64 val = percpu_counter_read(&ca->cpustat[i]);
10159 val = cputime64_to_clock_t(val);
10160 cb->fill(cb, cpuacct_stat_desc[i], val);
10161 }
10162 return 0;
10163}
10164
10051static struct cftype files[] = { 10165static struct cftype files[] = {
10052 { 10166 {
10053 .name = "usage", 10167 .name = "usage",
@@ -10058,7 +10172,10 @@ static struct cftype files[] = {
10058 .name = "usage_percpu", 10172 .name = "usage_percpu",
10059 .read_seq_string = cpuacct_percpu_seq_read, 10173 .read_seq_string = cpuacct_percpu_seq_read,
10060 }, 10174 },
10061 10175 {
10176 .name = "stat",
10177 .read_map = cpuacct_stats_show,
10178 },
10062}; 10179};
10063 10180
10064static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) 10181static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
@@ -10080,12 +10197,38 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
10080 return; 10197 return;
10081 10198
10082 cpu = task_cpu(tsk); 10199 cpu = task_cpu(tsk);
10200
10201 rcu_read_lock();
10202
10083 ca = task_ca(tsk); 10203 ca = task_ca(tsk);
10084 10204
10085 for (; ca; ca = ca->parent) { 10205 for (; ca; ca = ca->parent) {
10086 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 10206 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
10087 *cpuusage += cputime; 10207 *cpuusage += cputime;
10088 } 10208 }
10209
10210 rcu_read_unlock();
10211}
10212
10213/*
10214 * Charge the system/user time to the task's accounting group.
10215 */
10216static void cpuacct_update_stats(struct task_struct *tsk,
10217 enum cpuacct_stat_index idx, cputime_t val)
10218{
10219 struct cpuacct *ca;
10220
10221 if (unlikely(!cpuacct_subsys.active))
10222 return;
10223
10224 rcu_read_lock();
10225 ca = task_ca(tsk);
10226
10227 do {
10228 percpu_counter_add(&ca->cpustat[idx], val);
10229 ca = ca->parent;
10230 } while (ca);
10231 rcu_read_unlock();
10089} 10232}
10090 10233
10091struct cgroup_subsys cpuacct_subsys = { 10234struct cgroup_subsys cpuacct_subsys = {
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 390f33234bd0..e1d16c9a7680 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -25,6 +25,7 @@
25 * consistent between cpus (never more than 2 jiffies difference). 25 * consistent between cpus (never more than 2 jiffies difference).
26 */ 26 */
27#include <linux/spinlock.h> 27#include <linux/spinlock.h>
28#include <linux/hardirq.h>
28#include <linux/module.h> 29#include <linux/module.h>
29#include <linux/percpu.h> 30#include <linux/percpu.h>
30#include <linux/ktime.h> 31#include <linux/ktime.h>
@@ -37,7 +38,8 @@
37 */ 38 */
38unsigned long long __attribute__((weak)) sched_clock(void) 39unsigned long long __attribute__((weak)) sched_clock(void)
39{ 40{
40 return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); 41 return (unsigned long long)(jiffies - INITIAL_JIFFIES)
42 * (NSEC_PER_SEC / HZ);
41} 43}
42 44
43static __read_mostly int sched_clock_running; 45static __read_mostly int sched_clock_running;
@@ -154,6 +156,17 @@ u64 sched_clock_cpu(int cpu)
154 return sched_clock(); 156 return sched_clock();
155 157
156 scd = cpu_sdc(cpu); 158 scd = cpu_sdc(cpu);
159
160 /*
161 * Normally this is not called in NMI context - but if it is,
162 * trying to do any locking here is totally lethal.
163 */
164 if (unlikely(in_nmi()))
165 return scd->clock;
166
167 if (unlikely(!sched_clock_running))
168 return 0ull;
169
157 WARN_ON_ONCE(!irqs_disabled()); 170 WARN_ON_ONCE(!irqs_disabled());
158 now = sched_clock(); 171 now = sched_clock();
159 172
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 1e00bfacf9b8..cdd3c89574cd 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -55,7 +55,7 @@ static int convert_prio(int prio)
55 * cpupri_find - find the best (lowest-pri) CPU in the system 55 * cpupri_find - find the best (lowest-pri) CPU in the system
56 * @cp: The cpupri context 56 * @cp: The cpupri context
57 * @p: The task 57 * @p: The task
58 * @lowest_mask: A mask to fill in with selected CPUs 58 * @lowest_mask: A mask to fill in with selected CPUs (or NULL)
59 * 59 *
60 * Note: This function returns the recommended CPUs as calculated during the 60 * Note: This function returns the recommended CPUs as calculated during the
61 * current invokation. By the time the call returns, the CPUs may have in 61 * current invokation. By the time the call returns, the CPUs may have in
@@ -81,7 +81,8 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
81 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) 81 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
82 continue; 82 continue;
83 83
84 cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); 84 if (lowest_mask)
85 cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
85 return 1; 86 return 1;
86 } 87 }
87 88
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 299d012b4394..f2c66f8f9712 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -948,20 +948,15 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
948 948
949static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 949static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
950{ 950{
951 cpumask_var_t mask;
952
953 if (rq->curr->rt.nr_cpus_allowed == 1) 951 if (rq->curr->rt.nr_cpus_allowed == 1)
954 return; 952 return;
955 953
956 if (!alloc_cpumask_var(&mask, GFP_ATOMIC))
957 return;
958
959 if (p->rt.nr_cpus_allowed != 1 954 if (p->rt.nr_cpus_allowed != 1
960 && cpupri_find(&rq->rd->cpupri, p, mask)) 955 && cpupri_find(&rq->rd->cpupri, p, NULL))
961 goto free; 956 return;
962 957
963 if (!cpupri_find(&rq->rd->cpupri, rq->curr, mask)) 958 if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
964 goto free; 959 return;
965 960
966 /* 961 /*
967 * There appears to be other cpus that can accept 962 * There appears to be other cpus that can accept
@@ -970,8 +965,6 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
970 */ 965 */
971 requeue_task_rt(rq, p, 1); 966 requeue_task_rt(rq, p, 1);
972 resched_task(rq->curr); 967 resched_task(rq->curr);
973free:
974 free_cpumask_var(mask);
975} 968}
976 969
977#endif /* CONFIG_SMP */ 970#endif /* CONFIG_SMP */
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index cf2bc01186ef..b28d19135f43 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -609,14 +609,14 @@ void slow_work_unregister_user(void)
609 if (slow_work_user_count == 0) { 609 if (slow_work_user_count == 0) {
610 printk(KERN_NOTICE "Slow work thread pool: Shutting down\n"); 610 printk(KERN_NOTICE "Slow work thread pool: Shutting down\n");
611 slow_work_threads_should_exit = true; 611 slow_work_threads_should_exit = true;
612 del_timer_sync(&slow_work_cull_timer);
613 del_timer_sync(&slow_work_oom_timer);
612 wake_up_all(&slow_work_thread_wq); 614 wake_up_all(&slow_work_thread_wq);
613 wait_for_completion(&slow_work_last_thread_exited); 615 wait_for_completion(&slow_work_last_thread_exited);
614 printk(KERN_NOTICE "Slow work thread pool:" 616 printk(KERN_NOTICE "Slow work thread pool:"
615 " Shut down complete\n"); 617 " Shut down complete\n");
616 } 618 }
617 619
618 del_timer_sync(&slow_work_cull_timer);
619
620 mutex_unlock(&slow_work_user_lock); 620 mutex_unlock(&slow_work_user_lock);
621} 621}
622EXPORT_SYMBOL(slow_work_unregister_user); 622EXPORT_SYMBOL(slow_work_unregister_user);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index ea23ec087ee9..b525dd348511 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -21,8 +21,10 @@
21#include <linux/freezer.h> 21#include <linux/freezer.h>
22#include <linux/kthread.h> 22#include <linux/kthread.h>
23#include <linux/rcupdate.h> 23#include <linux/rcupdate.h>
24#include <linux/ftrace.h>
24#include <linux/smp.h> 25#include <linux/smp.h>
25#include <linux/tick.h> 26#include <linux/tick.h>
27#include <trace/irq.h>
26 28
27#include <asm/irq.h> 29#include <asm/irq.h>
28/* 30/*
@@ -52,13 +54,18 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp
52 54
53static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); 55static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
54 56
57char *softirq_to_name[NR_SOFTIRQS] = {
58 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK",
59 "TASKLET", "SCHED", "HRTIMER", "RCU"
60};
61
55/* 62/*
56 * we cannot loop indefinitely here to avoid userspace starvation, 63 * we cannot loop indefinitely here to avoid userspace starvation,
57 * but we also don't want to introduce a worst case 1/HZ latency 64 * but we also don't want to introduce a worst case 1/HZ latency
58 * to the pending events, so lets the scheduler to balance 65 * to the pending events, so lets the scheduler to balance
59 * the softirq load for us. 66 * the softirq load for us.
60 */ 67 */
61static inline void wakeup_softirqd(void) 68void wakeup_softirqd(void)
62{ 69{
63 /* Interrupts are disabled: no need to stop preemption */ 70 /* Interrupts are disabled: no need to stop preemption */
64 struct task_struct *tsk = __get_cpu_var(ksoftirqd); 71 struct task_struct *tsk = __get_cpu_var(ksoftirqd);
@@ -79,13 +86,23 @@ static void __local_bh_disable(unsigned long ip)
79 WARN_ON_ONCE(in_irq()); 86 WARN_ON_ONCE(in_irq());
80 87
81 raw_local_irq_save(flags); 88 raw_local_irq_save(flags);
82 add_preempt_count(SOFTIRQ_OFFSET); 89 /*
90 * The preempt tracer hooks into add_preempt_count and will break
91 * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET
92 * is set and before current->softirq_enabled is cleared.
93 * We must manually increment preempt_count here and manually
94 * call the trace_preempt_off later.
95 */
96 preempt_count() += SOFTIRQ_OFFSET;
83 /* 97 /*
84 * Were softirqs turned off above: 98 * Were softirqs turned off above:
85 */ 99 */
86 if (softirq_count() == SOFTIRQ_OFFSET) 100 if (softirq_count() == SOFTIRQ_OFFSET)
87 trace_softirqs_off(ip); 101 trace_softirqs_off(ip);
88 raw_local_irq_restore(flags); 102 raw_local_irq_restore(flags);
103
104 if (preempt_count() == SOFTIRQ_OFFSET)
105 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
89} 106}
90#else /* !CONFIG_TRACE_IRQFLAGS */ 107#else /* !CONFIG_TRACE_IRQFLAGS */
91static inline void __local_bh_disable(unsigned long ip) 108static inline void __local_bh_disable(unsigned long ip)
@@ -169,6 +186,9 @@ EXPORT_SYMBOL(local_bh_enable_ip);
169 */ 186 */
170#define MAX_SOFTIRQ_RESTART 10 187#define MAX_SOFTIRQ_RESTART 10
171 188
189DEFINE_TRACE(softirq_entry);
190DEFINE_TRACE(softirq_exit);
191
172asmlinkage void __do_softirq(void) 192asmlinkage void __do_softirq(void)
173{ 193{
174 struct softirq_action *h; 194 struct softirq_action *h;
@@ -195,12 +215,14 @@ restart:
195 if (pending & 1) { 215 if (pending & 1) {
196 int prev_count = preempt_count(); 216 int prev_count = preempt_count();
197 217
218 trace_softirq_entry(h, softirq_vec);
198 h->action(h); 219 h->action(h);
199 220 trace_softirq_exit(h, softirq_vec);
200 if (unlikely(prev_count != preempt_count())) { 221 if (unlikely(prev_count != preempt_count())) {
201 printk(KERN_ERR "huh, entered softirq %td %p" 222 printk(KERN_ERR "huh, entered softirq %td %s %p"
202 "with preempt_count %08x," 223 "with preempt_count %08x,"
203 " exited with %08x?\n", h - softirq_vec, 224 " exited with %08x?\n", h - softirq_vec,
225 softirq_to_name[h - softirq_vec],
204 h->action, prev_count, preempt_count()); 226 h->action, prev_count, preempt_count());
205 preempt_count() = prev_count; 227 preempt_count() = prev_count;
206 } 228 }
@@ -450,9 +472,9 @@ void tasklet_kill(struct tasklet_struct *t)
450 printk("Attempt to kill tasklet from interrupt\n"); 472 printk("Attempt to kill tasklet from interrupt\n");
451 473
452 while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) { 474 while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
453 do 475 do {
454 yield(); 476 yield();
455 while (test_bit(TASKLET_STATE_SCHED, &t->state)); 477 } while (test_bit(TASKLET_STATE_SCHED, &t->state));
456 } 478 }
457 tasklet_unlock_wait(t); 479 tasklet_unlock_wait(t);
458 clear_bit(TASKLET_STATE_SCHED, &t->state); 480 clear_bit(TASKLET_STATE_SCHED, &t->state);
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 85d5a2455103..88796c330838 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -166,97 +166,11 @@ void softlockup_tick(void)
166} 166}
167 167
168/* 168/*
169 * Have a reasonable limit on the number of tasks checked:
170 */
171unsigned long __read_mostly sysctl_hung_task_check_count = 1024;
172
173/*
174 * Zero means infinite timeout - no checking done:
175 */
176unsigned long __read_mostly sysctl_hung_task_timeout_secs = 480;
177
178unsigned long __read_mostly sysctl_hung_task_warnings = 10;
179
180/*
181 * Only do the hung-tasks check on one CPU:
182 */
183static int check_cpu __read_mostly = -1;
184
185static void check_hung_task(struct task_struct *t, unsigned long now)
186{
187 unsigned long switch_count = t->nvcsw + t->nivcsw;
188
189 if (t->flags & PF_FROZEN)
190 return;
191
192 if (switch_count != t->last_switch_count || !t->last_switch_timestamp) {
193 t->last_switch_count = switch_count;
194 t->last_switch_timestamp = now;
195 return;
196 }
197 if ((long)(now - t->last_switch_timestamp) <
198 sysctl_hung_task_timeout_secs)
199 return;
200 if (!sysctl_hung_task_warnings)
201 return;
202 sysctl_hung_task_warnings--;
203
204 /*
205 * Ok, the task did not get scheduled for more than 2 minutes,
206 * complain:
207 */
208 printk(KERN_ERR "INFO: task %s:%d blocked for more than "
209 "%ld seconds.\n", t->comm, t->pid,
210 sysctl_hung_task_timeout_secs);
211 printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
212 " disables this message.\n");
213 sched_show_task(t);
214 __debug_show_held_locks(t);
215
216 t->last_switch_timestamp = now;
217 touch_nmi_watchdog();
218
219 if (softlockup_panic)
220 panic("softlockup: blocked tasks");
221}
222
223/*
224 * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
225 * a really long time (120 seconds). If that happens, print out
226 * a warning.
227 */
228static void check_hung_uninterruptible_tasks(int this_cpu)
229{
230 int max_count = sysctl_hung_task_check_count;
231 unsigned long now = get_timestamp(this_cpu);
232 struct task_struct *g, *t;
233
234 /*
235 * If the system crashed already then all bets are off,
236 * do not report extra hung tasks:
237 */
238 if (test_taint(TAINT_DIE) || did_panic)
239 return;
240
241 read_lock(&tasklist_lock);
242 do_each_thread(g, t) {
243 if (!--max_count)
244 goto unlock;
245 /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
246 if (t->state == TASK_UNINTERRUPTIBLE)
247 check_hung_task(t, now);
248 } while_each_thread(g, t);
249 unlock:
250 read_unlock(&tasklist_lock);
251}
252
253/*
254 * The watchdog thread - runs every second and touches the timestamp. 169 * The watchdog thread - runs every second and touches the timestamp.
255 */ 170 */
256static int watchdog(void *__bind_cpu) 171static int watchdog(void *__bind_cpu)
257{ 172{
258 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 173 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
259 int this_cpu = (long)__bind_cpu;
260 174
261 sched_setscheduler(current, SCHED_FIFO, &param); 175 sched_setscheduler(current, SCHED_FIFO, &param);
262 176
@@ -276,11 +190,6 @@ static int watchdog(void *__bind_cpu)
276 if (kthread_should_stop()) 190 if (kthread_should_stop())
277 break; 191 break;
278 192
279 if (this_cpu == check_cpu) {
280 if (sysctl_hung_task_timeout_secs)
281 check_hung_uninterruptible_tasks(this_cpu);
282 }
283
284 set_current_state(TASK_INTERRUPTIBLE); 193 set_current_state(TASK_INTERRUPTIBLE);
285 } 194 }
286 __set_current_state(TASK_RUNNING); 195 __set_current_state(TASK_RUNNING);
@@ -312,18 +221,9 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
312 break; 221 break;
313 case CPU_ONLINE: 222 case CPU_ONLINE:
314 case CPU_ONLINE_FROZEN: 223 case CPU_ONLINE_FROZEN:
315 check_cpu = cpumask_any(cpu_online_mask);
316 wake_up_process(per_cpu(watchdog_task, hotcpu)); 224 wake_up_process(per_cpu(watchdog_task, hotcpu));
317 break; 225 break;
318#ifdef CONFIG_HOTPLUG_CPU 226#ifdef CONFIG_HOTPLUG_CPU
319 case CPU_DOWN_PREPARE:
320 case CPU_DOWN_PREPARE_FROZEN:
321 if (hotcpu == check_cpu) {
322 /* Pick any other online cpu. */
323 check_cpu = cpumask_any_but(cpu_online_mask, hotcpu);
324 }
325 break;
326
327 case CPU_UP_CANCELED: 227 case CPU_UP_CANCELED:
328 case CPU_UP_CANCELED_FROZEN: 228 case CPU_UP_CANCELED_FROZEN:
329 if (!per_cpu(watchdog_task, hotcpu)) 229 if (!per_cpu(watchdog_task, hotcpu))
diff --git a/kernel/sys.c b/kernel/sys.c
index 51dbb55604e8..e7998cf31498 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -360,6 +360,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
360 void __user *, arg) 360 void __user *, arg)
361{ 361{
362 char buffer[256]; 362 char buffer[256];
363 int ret = 0;
363 364
364 /* We only trust the superuser with rebooting the system. */ 365 /* We only trust the superuser with rebooting the system. */
365 if (!capable(CAP_SYS_BOOT)) 366 if (!capable(CAP_SYS_BOOT))
@@ -397,7 +398,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
397 kernel_halt(); 398 kernel_halt();
398 unlock_kernel(); 399 unlock_kernel();
399 do_exit(0); 400 do_exit(0);
400 break; 401 panic("cannot halt");
401 402
402 case LINUX_REBOOT_CMD_POWER_OFF: 403 case LINUX_REBOOT_CMD_POWER_OFF:
403 kernel_power_off(); 404 kernel_power_off();
@@ -417,29 +418,22 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
417 418
418#ifdef CONFIG_KEXEC 419#ifdef CONFIG_KEXEC
419 case LINUX_REBOOT_CMD_KEXEC: 420 case LINUX_REBOOT_CMD_KEXEC:
420 { 421 ret = kernel_kexec();
421 int ret; 422 break;
422 ret = kernel_kexec();
423 unlock_kernel();
424 return ret;
425 }
426#endif 423#endif
427 424
428#ifdef CONFIG_HIBERNATION 425#ifdef CONFIG_HIBERNATION
429 case LINUX_REBOOT_CMD_SW_SUSPEND: 426 case LINUX_REBOOT_CMD_SW_SUSPEND:
430 { 427 ret = hibernate();
431 int ret = hibernate(); 428 break;
432 unlock_kernel();
433 return ret;
434 }
435#endif 429#endif
436 430
437 default: 431 default:
438 unlock_kernel(); 432 ret = -EINVAL;
439 return -EINVAL; 433 break;
440 } 434 }
441 unlock_kernel(); 435 unlock_kernel();
442 return 0; 436 return ret;
443} 437}
444 438
445static void deferred_cad(struct work_struct *dummy) 439static void deferred_cad(struct work_struct *dummy)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 82350f8f04f6..b2970d56fb76 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -97,11 +97,14 @@ static int neg_one = -1;
97#endif 97#endif
98 98
99static int zero; 99static int zero;
100static int one = 1; 100static int __maybe_unused one = 1;
101static int two = 2; 101static int __maybe_unused two = 2;
102static unsigned long one_ul = 1; 102static unsigned long one_ul = 1;
103static int one_hundred = 100; 103static int one_hundred = 100;
104 104
105/* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */
106static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
107
105/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ 108/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
106static int maxolduid = 65535; 109static int maxolduid = 65535;
107static int minolduid; 110static int minolduid;
@@ -813,6 +816,19 @@ static struct ctl_table kern_table[] = {
813 .extra1 = &neg_one, 816 .extra1 = &neg_one,
814 .extra2 = &sixty, 817 .extra2 = &sixty,
815 }, 818 },
819#endif
820#ifdef CONFIG_DETECT_HUNG_TASK
821 {
822 .ctl_name = CTL_UNNUMBERED,
823 .procname = "hung_task_panic",
824 .data = &sysctl_hung_task_panic,
825 .maxlen = sizeof(int),
826 .mode = 0644,
827 .proc_handler = &proc_dointvec_minmax,
828 .strategy = &sysctl_intvec,
829 .extra1 = &zero,
830 .extra2 = &one,
831 },
816 { 832 {
817 .ctl_name = CTL_UNNUMBERED, 833 .ctl_name = CTL_UNNUMBERED,
818 .procname = "hung_task_check_count", 834 .procname = "hung_task_check_count",
@@ -828,7 +844,7 @@ static struct ctl_table kern_table[] = {
828 .data = &sysctl_hung_task_timeout_secs, 844 .data = &sysctl_hung_task_timeout_secs,
829 .maxlen = sizeof(unsigned long), 845 .maxlen = sizeof(unsigned long),
830 .mode = 0644, 846 .mode = 0644,
831 .proc_handler = &proc_doulongvec_minmax, 847 .proc_handler = &proc_dohung_task_timeout_secs,
832 .strategy = &sysctl_intvec, 848 .strategy = &sysctl_intvec,
833 }, 849 },
834 { 850 {
@@ -888,16 +904,6 @@ static struct ctl_table kern_table[] = {
888 .proc_handler = &proc_dointvec, 904 .proc_handler = &proc_dointvec,
889 }, 905 },
890#endif 906#endif
891#ifdef CONFIG_UNEVICTABLE_LRU
892 {
893 .ctl_name = CTL_UNNUMBERED,
894 .procname = "scan_unevictable_pages",
895 .data = &scan_unevictable_pages,
896 .maxlen = sizeof(scan_unevictable_pages),
897 .mode = 0644,
898 .proc_handler = &scan_unevictable_handler,
899 },
900#endif
901#ifdef CONFIG_SLOW_WORK 907#ifdef CONFIG_SLOW_WORK
902 { 908 {
903 .ctl_name = CTL_UNNUMBERED, 909 .ctl_name = CTL_UNNUMBERED,
@@ -1002,7 +1008,7 @@ static struct ctl_table vm_table[] = {
1002 .mode = 0644, 1008 .mode = 0644,
1003 .proc_handler = &dirty_bytes_handler, 1009 .proc_handler = &dirty_bytes_handler,
1004 .strategy = &sysctl_intvec, 1010 .strategy = &sysctl_intvec,
1005 .extra1 = &one_ul, 1011 .extra1 = &dirty_bytes_min,
1006 }, 1012 },
1007 { 1013 {
1008 .procname = "dirty_writeback_centisecs", 1014 .procname = "dirty_writeback_centisecs",
@@ -1266,6 +1272,16 @@ static struct ctl_table vm_table[] = {
1266 .extra2 = &one, 1272 .extra2 = &one,
1267 }, 1273 },
1268#endif 1274#endif
1275#ifdef CONFIG_UNEVICTABLE_LRU
1276 {
1277 .ctl_name = CTL_UNNUMBERED,
1278 .procname = "scan_unevictable_pages",
1279 .data = &scan_unevictable_pages,
1280 .maxlen = sizeof(scan_unevictable_pages),
1281 .mode = 0644,
1282 .proc_handler = &scan_unevictable_handler,
1283 },
1284#endif
1269/* 1285/*
1270 * NOTE: do not add new entries to this table unless you have read 1286 * NOTE: do not add new entries to this table unless you have read
1271 * Documentation/sysctl/ctl_unnumbered.txt 1287 * Documentation/sysctl/ctl_unnumbered.txt
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index c46c931a7fe7..ecfd7b5187e0 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -181,12 +181,12 @@ static void clocksource_watchdog(unsigned long data)
181 181
182 resumed = test_and_clear_bit(0, &watchdog_resumed); 182 resumed = test_and_clear_bit(0, &watchdog_resumed);
183 183
184 wdnow = watchdog->read(); 184 wdnow = watchdog->read(watchdog);
185 wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask); 185 wd_nsec = cyc2ns(watchdog, (wdnow - watchdog_last) & watchdog->mask);
186 watchdog_last = wdnow; 186 watchdog_last = wdnow;
187 187
188 list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { 188 list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
189 csnow = cs->read(); 189 csnow = cs->read(cs);
190 190
191 if (unlikely(resumed)) { 191 if (unlikely(resumed)) {
192 cs->wd_last = csnow; 192 cs->wd_last = csnow;
@@ -247,7 +247,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
247 247
248 list_add(&cs->wd_list, &watchdog_list); 248 list_add(&cs->wd_list, &watchdog_list);
249 if (!started && watchdog) { 249 if (!started && watchdog) {
250 watchdog_last = watchdog->read(); 250 watchdog_last = watchdog->read(watchdog);
251 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; 251 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
252 add_timer_on(&watchdog_timer, 252 add_timer_on(&watchdog_timer,
253 cpumask_first(cpu_online_mask)); 253 cpumask_first(cpu_online_mask));
@@ -268,7 +268,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
268 cse->flags &= ~CLOCK_SOURCE_WATCHDOG; 268 cse->flags &= ~CLOCK_SOURCE_WATCHDOG;
269 /* Start if list is not empty */ 269 /* Start if list is not empty */
270 if (!list_empty(&watchdog_list)) { 270 if (!list_empty(&watchdog_list)) {
271 watchdog_last = watchdog->read(); 271 watchdog_last = watchdog->read(watchdog);
272 watchdog_timer.expires = 272 watchdog_timer.expires =
273 jiffies + WATCHDOG_INTERVAL; 273 jiffies + WATCHDOG_INTERVAL;
274 add_timer_on(&watchdog_timer, 274 add_timer_on(&watchdog_timer,
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 06f197560f3b..c3f6c30816e3 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -50,7 +50,7 @@
50 */ 50 */
51#define JIFFIES_SHIFT 8 51#define JIFFIES_SHIFT 8
52 52
53static cycle_t jiffies_read(void) 53static cycle_t jiffies_read(struct clocksource *cs)
54{ 54{
55 return (cycle_t) jiffies; 55 return (cycle_t) jiffies;
56} 56}
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 21a5ca849514..83c4417b6a3c 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -93,7 +93,17 @@ void tick_handle_periodic(struct clock_event_device *dev)
93 for (;;) { 93 for (;;) {
94 if (!clockevents_program_event(dev, next, ktime_get())) 94 if (!clockevents_program_event(dev, next, ktime_get()))
95 return; 95 return;
96 tick_periodic(cpu); 96 /*
97 * Have to be careful here. If we're in oneshot mode,
98 * before we call tick_periodic() in a loop, we need
99 * to be sure we're using a real hardware clocksource.
100 * Otherwise we could get trapped in an infinite
101 * loop, as the tick_periodic() increments jiffies,
102 * when then will increment time, posibly causing
103 * the loop to trigger again and again.
104 */
105 if (timekeeping_valid_for_hres())
106 tick_periodic(cpu);
97 next = ktime_add(next, tick_period); 107 next = ktime_add(next, tick_period);
98 } 108 }
99} 109}
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 900f1b6598d1..687dff49f6e7 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -182,7 +182,7 @@ EXPORT_SYMBOL(do_settimeofday);
182 */ 182 */
183static void change_clocksource(void) 183static void change_clocksource(void)
184{ 184{
185 struct clocksource *new; 185 struct clocksource *new, *old;
186 186
187 new = clocksource_get_next(); 187 new = clocksource_get_next();
188 188
@@ -191,11 +191,16 @@ static void change_clocksource(void)
191 191
192 clocksource_forward_now(); 192 clocksource_forward_now();
193 193
194 new->raw_time = clock->raw_time; 194 if (clocksource_enable(new))
195 return;
195 196
197 new->raw_time = clock->raw_time;
198 old = clock;
196 clock = new; 199 clock = new;
200 clocksource_disable(old);
201
197 clock->cycle_last = 0; 202 clock->cycle_last = 0;
198 clock->cycle_last = clocksource_read(new); 203 clock->cycle_last = clocksource_read(clock);
199 clock->error = 0; 204 clock->error = 0;
200 clock->xtime_nsec = 0; 205 clock->xtime_nsec = 0;
201 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); 206 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
@@ -292,6 +297,7 @@ void __init timekeeping_init(void)
292 ntp_init(); 297 ntp_init();
293 298
294 clock = clocksource_get_next(); 299 clock = clocksource_get_next();
300 clocksource_enable(clock);
295 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); 301 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
296 clock->cycle_last = clocksource_read(clock); 302 clock->cycle_last = clocksource_read(clock);
297 303
diff --git a/kernel/timer.c b/kernel/timer.c
index b4555568b4e4..cffffad01c31 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -531,10 +531,13 @@ static void __init_timer(struct timer_list *timer,
531} 531}
532 532
533/** 533/**
534 * init_timer - initialize a timer. 534 * init_timer_key - initialize a timer
535 * @timer: the timer to be initialized 535 * @timer: the timer to be initialized
536 * @name: name of the timer
537 * @key: lockdep class key of the fake lock used for tracking timer
538 * sync lock dependencies
536 * 539 *
537 * init_timer() must be done to a timer prior calling *any* of the 540 * init_timer_key() must be done to a timer prior calling *any* of the
538 * other timer functions. 541 * other timer functions.
539 */ 542 */
540void init_timer_key(struct timer_list *timer, 543void init_timer_key(struct timer_list *timer,
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 504086ab4443..417d1985e299 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -9,6 +9,9 @@ config USER_STACKTRACE_SUPPORT
9config NOP_TRACER 9config NOP_TRACER
10 bool 10 bool
11 11
12config HAVE_FTRACE_NMI_ENTER
13 bool
14
12config HAVE_FUNCTION_TRACER 15config HAVE_FUNCTION_TRACER
13 bool 16 bool
14 17
@@ -31,12 +34,20 @@ config HAVE_FTRACE_MCOUNT_RECORD
31config HAVE_HW_BRANCH_TRACER 34config HAVE_HW_BRANCH_TRACER
32 bool 35 bool
33 36
37config HAVE_FTRACE_SYSCALLS
38 bool
39
34config TRACER_MAX_TRACE 40config TRACER_MAX_TRACE
35 bool 41 bool
36 42
37config RING_BUFFER 43config RING_BUFFER
38 bool 44 bool
39 45
46config FTRACE_NMI_ENTER
47 bool
48 depends on HAVE_FTRACE_NMI_ENTER
49 default y
50
40config TRACING 51config TRACING
41 bool 52 bool
42 select DEBUG_FS 53 select DEBUG_FS
@@ -44,13 +55,29 @@ config TRACING
44 select STACKTRACE if STACKTRACE_SUPPORT 55 select STACKTRACE if STACKTRACE_SUPPORT
45 select TRACEPOINTS 56 select TRACEPOINTS
46 select NOP_TRACER 57 select NOP_TRACER
58 select BINARY_PRINTF
59
60#
61# Minimum requirements an architecture has to meet for us to
62# be able to offer generic tracing facilities:
63#
64config TRACING_SUPPORT
65 bool
66 # PPC32 has no irqflags tracing support, but it can use most of the
67 # tracers anyway, they were tested to build and work. Note that new
68 # exceptions to this list aren't welcomed, better implement the
69 # irqflags tracing for your architecture.
70 depends on TRACE_IRQFLAGS_SUPPORT || PPC32
71 depends on STACKTRACE_SUPPORT
72 default y
73
74if TRACING_SUPPORT
47 75
48menu "Tracers" 76menu "Tracers"
49 77
50config FUNCTION_TRACER 78config FUNCTION_TRACER
51 bool "Kernel Function Tracer" 79 bool "Kernel Function Tracer"
52 depends on HAVE_FUNCTION_TRACER 80 depends on HAVE_FUNCTION_TRACER
53 depends on DEBUG_KERNEL
54 select FRAME_POINTER 81 select FRAME_POINTER
55 select KALLSYMS 82 select KALLSYMS
56 select TRACING 83 select TRACING
@@ -82,7 +109,6 @@ config IRQSOFF_TRACER
82 default n 109 default n
83 depends on TRACE_IRQFLAGS_SUPPORT 110 depends on TRACE_IRQFLAGS_SUPPORT
84 depends on GENERIC_TIME 111 depends on GENERIC_TIME
85 depends on DEBUG_KERNEL
86 select TRACE_IRQFLAGS 112 select TRACE_IRQFLAGS
87 select TRACING 113 select TRACING
88 select TRACER_MAX_TRACE 114 select TRACER_MAX_TRACE
@@ -105,7 +131,6 @@ config PREEMPT_TRACER
105 default n 131 default n
106 depends on GENERIC_TIME 132 depends on GENERIC_TIME
107 depends on PREEMPT 133 depends on PREEMPT
108 depends on DEBUG_KERNEL
109 select TRACING 134 select TRACING
110 select TRACER_MAX_TRACE 135 select TRACER_MAX_TRACE
111 help 136 help
@@ -126,13 +151,13 @@ config SYSPROF_TRACER
126 bool "Sysprof Tracer" 151 bool "Sysprof Tracer"
127 depends on X86 152 depends on X86
128 select TRACING 153 select TRACING
154 select CONTEXT_SWITCH_TRACER
129 help 155 help
130 This tracer provides the trace needed by the 'Sysprof' userspace 156 This tracer provides the trace needed by the 'Sysprof' userspace
131 tool. 157 tool.
132 158
133config SCHED_TRACER 159config SCHED_TRACER
134 bool "Scheduling Latency Tracer" 160 bool "Scheduling Latency Tracer"
135 depends on DEBUG_KERNEL
136 select TRACING 161 select TRACING
137 select CONTEXT_SWITCH_TRACER 162 select CONTEXT_SWITCH_TRACER
138 select TRACER_MAX_TRACE 163 select TRACER_MAX_TRACE
@@ -142,16 +167,30 @@ config SCHED_TRACER
142 167
143config CONTEXT_SWITCH_TRACER 168config CONTEXT_SWITCH_TRACER
144 bool "Trace process context switches" 169 bool "Trace process context switches"
145 depends on DEBUG_KERNEL
146 select TRACING 170 select TRACING
147 select MARKERS 171 select MARKERS
148 help 172 help
149 This tracer gets called from the context switch and records 173 This tracer gets called from the context switch and records
150 all switching of tasks. 174 all switching of tasks.
151 175
176config EVENT_TRACER
177 bool "Trace various events in the kernel"
178 select TRACING
179 help
180 This tracer hooks to various trace points in the kernel
181 allowing the user to pick and choose which trace point they
182 want to trace.
183
184config FTRACE_SYSCALLS
185 bool "Trace syscalls"
186 depends on HAVE_FTRACE_SYSCALLS
187 select TRACING
188 select KALLSYMS
189 help
190 Basic tracer to catch the syscall entry and exit events.
191
152config BOOT_TRACER 192config BOOT_TRACER
153 bool "Trace boot initcalls" 193 bool "Trace boot initcalls"
154 depends on DEBUG_KERNEL
155 select TRACING 194 select TRACING
156 select CONTEXT_SWITCH_TRACER 195 select CONTEXT_SWITCH_TRACER
157 help 196 help
@@ -164,13 +203,11 @@ config BOOT_TRACER
164 representation of the delays during initcalls - but the raw 203 representation of the delays during initcalls - but the raw
165 /debug/tracing/trace text output is readable too. 204 /debug/tracing/trace text output is readable too.
166 205
167 ( Note that tracing self tests can't be enabled if this tracer is 206 You must pass in ftrace=initcall to the kernel command line
168 selected, because the self-tests are an initcall as well and that 207 to enable this on bootup.
169 would invalidate the boot trace. )
170 208
171config TRACE_BRANCH_PROFILING 209config TRACE_BRANCH_PROFILING
172 bool "Trace likely/unlikely profiler" 210 bool "Trace likely/unlikely profiler"
173 depends on DEBUG_KERNEL
174 select TRACING 211 select TRACING
175 help 212 help
176 This tracer profiles all the the likely and unlikely macros 213 This tracer profiles all the the likely and unlikely macros
@@ -223,7 +260,6 @@ config BRANCH_TRACER
223 260
224config POWER_TRACER 261config POWER_TRACER
225 bool "Trace power consumption behavior" 262 bool "Trace power consumption behavior"
226 depends on DEBUG_KERNEL
227 depends on X86 263 depends on X86
228 select TRACING 264 select TRACING
229 help 265 help
@@ -235,7 +271,6 @@ config POWER_TRACER
235config STACK_TRACER 271config STACK_TRACER
236 bool "Trace max stack" 272 bool "Trace max stack"
237 depends on HAVE_FUNCTION_TRACER 273 depends on HAVE_FUNCTION_TRACER
238 depends on DEBUG_KERNEL
239 select FUNCTION_TRACER 274 select FUNCTION_TRACER
240 select STACKTRACE 275 select STACKTRACE
241 select KALLSYMS 276 select KALLSYMS
@@ -265,11 +300,66 @@ config HW_BRANCH_TRACER
265 This tracer records all branches on the system in a circular 300 This tracer records all branches on the system in a circular
266 buffer giving access to the last N branches for each cpu. 301 buffer giving access to the last N branches for each cpu.
267 302
303config KMEMTRACE
304 bool "Trace SLAB allocations"
305 select TRACING
306 help
307 kmemtrace provides tracing for slab allocator functions, such as
308 kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected
309 data is then fed to the userspace application in order to analyse
310 allocation hotspots, internal fragmentation and so on, making it
311 possible to see how well an allocator performs, as well as debug
312 and profile kernel code.
313
314 This requires an userspace application to use. See
315 Documentation/trace/kmemtrace.txt for more information.
316
317 Saying Y will make the kernel somewhat larger and slower. However,
318 if you disable kmemtrace at run-time or boot-time, the performance
319 impact is minimal (depending on the arch the kernel is built for).
320
321 If unsure, say N.
322
323config WORKQUEUE_TRACER
324 bool "Trace workqueues"
325 select TRACING
326 help
327 The workqueue tracer provides some statistical informations
328 about each cpu workqueue thread such as the number of the
329 works inserted and executed since their creation. It can help
330 to evaluate the amount of work each of them have to perform.
331 For example it can help a developer to decide whether he should
332 choose a per cpu workqueue instead of a singlethreaded one.
333
334config BLK_DEV_IO_TRACE
335 bool "Support for tracing block io actions"
336 depends on SYSFS
337 depends on BLOCK
338 select RELAY
339 select DEBUG_FS
340 select TRACEPOINTS
341 select TRACING
342 select STACKTRACE
343 help
344 Say Y here if you want to be able to trace the block layer actions
345 on a given queue. Tracing allows you to see any traffic happening
346 on a block device queue. For more information (and the userspace
347 support tools needed), fetch the blktrace tools from:
348
349 git://git.kernel.dk/blktrace.git
350
351 Tracing also is possible using the ftrace interface, e.g.:
352
353 echo 1 > /sys/block/sda/sda1/trace/enable
354 echo blk > /sys/kernel/debug/tracing/current_tracer
355 cat /sys/kernel/debug/tracing/trace_pipe
356
357 If unsure, say N.
358
268config DYNAMIC_FTRACE 359config DYNAMIC_FTRACE
269 bool "enable/disable ftrace tracepoints dynamically" 360 bool "enable/disable ftrace tracepoints dynamically"
270 depends on FUNCTION_TRACER 361 depends on FUNCTION_TRACER
271 depends on HAVE_DYNAMIC_FTRACE 362 depends on HAVE_DYNAMIC_FTRACE
272 depends on DEBUG_KERNEL
273 default y 363 default y
274 help 364 help
275 This option will modify all the calls to ftrace dynamically 365 This option will modify all the calls to ftrace dynamically
@@ -295,7 +385,7 @@ config FTRACE_SELFTEST
295 385
296config FTRACE_STARTUP_TEST 386config FTRACE_STARTUP_TEST
297 bool "Perform a startup test on ftrace" 387 bool "Perform a startup test on ftrace"
298 depends on TRACING && DEBUG_KERNEL && !BOOT_TRACER 388 depends on TRACING
299 select FTRACE_SELFTEST 389 select FTRACE_SELFTEST
300 help 390 help
301 This option performs a series of startup tests on ftrace. On bootup 391 This option performs a series of startup tests on ftrace. On bootup
@@ -305,7 +395,7 @@ config FTRACE_STARTUP_TEST
305 395
306config MMIOTRACE 396config MMIOTRACE
307 bool "Memory mapped IO tracing" 397 bool "Memory mapped IO tracing"
308 depends on HAVE_MMIOTRACE_SUPPORT && DEBUG_KERNEL && PCI 398 depends on HAVE_MMIOTRACE_SUPPORT && PCI
309 select TRACING 399 select TRACING
310 help 400 help
311 Mmiotrace traces Memory Mapped I/O access and is meant for 401 Mmiotrace traces Memory Mapped I/O access and is meant for
@@ -313,7 +403,7 @@ config MMIOTRACE
313 implementation and works via page faults. Tracing is disabled by 403 implementation and works via page faults. Tracing is disabled by
314 default and can be enabled at run-time. 404 default and can be enabled at run-time.
315 405
316 See Documentation/tracers/mmiotrace.txt. 406 See Documentation/trace/mmiotrace.txt.
317 If you are not helping to develop drivers, say N. 407 If you are not helping to develop drivers, say N.
318 408
319config MMIOTRACE_TEST 409config MMIOTRACE_TEST
@@ -327,3 +417,6 @@ config MMIOTRACE_TEST
327 Say N, unless you absolutely know what you are doing. 417 Say N, unless you absolutely know what you are doing.
328 418
329endmenu 419endmenu
420
421endif # TRACING_SUPPORT
422
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 349d5a93653f..2630f5121ec1 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -19,6 +19,10 @@ obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
19obj-$(CONFIG_RING_BUFFER) += ring_buffer.o 19obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
20 20
21obj-$(CONFIG_TRACING) += trace.o 21obj-$(CONFIG_TRACING) += trace.o
22obj-$(CONFIG_TRACING) += trace_clock.o
23obj-$(CONFIG_TRACING) += trace_output.o
24obj-$(CONFIG_TRACING) += trace_stat.o
25obj-$(CONFIG_TRACING) += trace_printk.o
22obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o 26obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
23obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o 27obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o
24obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o 28obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
@@ -33,5 +37,14 @@ obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
33obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o 37obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
34obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o 38obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
35obj-$(CONFIG_POWER_TRACER) += trace_power.o 39obj-$(CONFIG_POWER_TRACER) += trace_power.o
40obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
41obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
42obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
43obj-$(CONFIG_EVENT_TRACER) += trace_events.o
44obj-$(CONFIG_EVENT_TRACER) += events.o
45obj-$(CONFIG_EVENT_TRACER) += trace_export.o
46obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
47obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
48obj-$(CONFIG_EVENT_TRACER) += trace_events_filter.o
36 49
37libftrace-y := ftrace.o 50libftrace-y := ftrace.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
new file mode 100644
index 000000000000..921ef5d1f0ba
--- /dev/null
+++ b/kernel/trace/blktrace.c
@@ -0,0 +1,1550 @@
1/*
2 * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
16 *
17 */
18#include <linux/kernel.h>
19#include <linux/blkdev.h>
20#include <linux/blktrace_api.h>
21#include <linux/percpu.h>
22#include <linux/init.h>
23#include <linux/mutex.h>
24#include <linux/debugfs.h>
25#include <linux/time.h>
26#include <trace/block.h>
27#include <linux/uaccess.h>
28#include "trace_output.h"
29
30static unsigned int blktrace_seq __read_mostly = 1;
31
32static struct trace_array *blk_tr;
33static bool blk_tracer_enabled __read_mostly;
34
35/* Select an alternative, minimalistic output than the original one */
36#define TRACE_BLK_OPT_CLASSIC 0x1
37
38static struct tracer_opt blk_tracer_opts[] = {
39 /* Default disable the minimalistic output */
40 { TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) },
41 { }
42};
43
44static struct tracer_flags blk_tracer_flags = {
45 .val = 0,
46 .opts = blk_tracer_opts,
47};
48
49/* Global reference count of probes */
50static atomic_t blk_probes_ref = ATOMIC_INIT(0);
51
52static void blk_register_tracepoints(void);
53static void blk_unregister_tracepoints(void);
54
55/*
56 * Send out a notify message.
57 */
58static void trace_note(struct blk_trace *bt, pid_t pid, int action,
59 const void *data, size_t len)
60{
61 struct blk_io_trace *t;
62 struct ring_buffer_event *event = NULL;
63 int pc = 0;
64 int cpu = smp_processor_id();
65 bool blk_tracer = blk_tracer_enabled;
66
67 if (blk_tracer) {
68 pc = preempt_count();
69 event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK,
70 sizeof(*t) + len,
71 0, pc);
72 if (!event)
73 return;
74 t = ring_buffer_event_data(event);
75 goto record_it;
76 }
77
78 if (!bt->rchan)
79 return;
80
81 t = relay_reserve(bt->rchan, sizeof(*t) + len);
82 if (t) {
83 t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
84 t->time = ktime_to_ns(ktime_get());
85record_it:
86 t->device = bt->dev;
87 t->action = action;
88 t->pid = pid;
89 t->cpu = cpu;
90 t->pdu_len = len;
91 memcpy((void *) t + sizeof(*t), data, len);
92
93 if (blk_tracer)
94 trace_buffer_unlock_commit(blk_tr, event, 0, pc);
95 }
96}
97
98/*
99 * Send out a notify for this process, if we haven't done so since a trace
100 * started
101 */
102static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk)
103{
104 tsk->btrace_seq = blktrace_seq;
105 trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm));
106}
107
108static void trace_note_time(struct blk_trace *bt)
109{
110 struct timespec now;
111 unsigned long flags;
112 u32 words[2];
113
114 getnstimeofday(&now);
115 words[0] = now.tv_sec;
116 words[1] = now.tv_nsec;
117
118 local_irq_save(flags);
119 trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words));
120 local_irq_restore(flags);
121}
122
123void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
124{
125 int n;
126 va_list args;
127 unsigned long flags;
128 char *buf;
129
130 if (unlikely(bt->trace_state != Blktrace_running &&
131 !blk_tracer_enabled))
132 return;
133
134 local_irq_save(flags);
135 buf = per_cpu_ptr(bt->msg_data, smp_processor_id());
136 va_start(args, fmt);
137 n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);
138 va_end(args);
139
140 trace_note(bt, 0, BLK_TN_MESSAGE, buf, n);
141 local_irq_restore(flags);
142}
143EXPORT_SYMBOL_GPL(__trace_note_message);
144
145static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
146 pid_t pid)
147{
148 if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0)
149 return 1;
150 if (sector < bt->start_lba || sector > bt->end_lba)
151 return 1;
152 if (bt->pid && pid != bt->pid)
153 return 1;
154
155 return 0;
156}
157
158/*
159 * Data direction bit lookup
160 */
161static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
162 BLK_TC_ACT(BLK_TC_WRITE) };
163
164/* The ilog2() calls fall out because they're constant */
165#define MASK_TC_BIT(rw, __name) ((rw & (1 << BIO_RW_ ## __name)) << \
166 (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - BIO_RW_ ## __name))
167
168/*
169 * The worker for the various blk_add_trace*() types. Fills out a
170 * blk_io_trace structure and places it in a per-cpu subbuffer.
171 */
172static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
173 int rw, u32 what, int error, int pdu_len, void *pdu_data)
174{
175 struct task_struct *tsk = current;
176 struct ring_buffer_event *event = NULL;
177 struct blk_io_trace *t;
178 unsigned long flags = 0;
179 unsigned long *sequence;
180 pid_t pid;
181 int cpu, pc = 0;
182 bool blk_tracer = blk_tracer_enabled;
183
184 if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer))
185 return;
186
187 what |= ddir_act[rw & WRITE];
188 what |= MASK_TC_BIT(rw, BARRIER);
189 what |= MASK_TC_BIT(rw, SYNCIO);
190 what |= MASK_TC_BIT(rw, AHEAD);
191 what |= MASK_TC_BIT(rw, META);
192 what |= MASK_TC_BIT(rw, DISCARD);
193
194 pid = tsk->pid;
195 if (unlikely(act_log_check(bt, what, sector, pid)))
196 return;
197 cpu = raw_smp_processor_id();
198
199 if (blk_tracer) {
200 tracing_record_cmdline(current);
201
202 pc = preempt_count();
203 event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK,
204 sizeof(*t) + pdu_len,
205 0, pc);
206 if (!event)
207 return;
208 t = ring_buffer_event_data(event);
209 goto record_it;
210 }
211
212 /*
213 * A word about the locking here - we disable interrupts to reserve
214 * some space in the relay per-cpu buffer, to prevent an irq
215 * from coming in and stepping on our toes.
216 */
217 local_irq_save(flags);
218
219 if (unlikely(tsk->btrace_seq != blktrace_seq))
220 trace_note_tsk(bt, tsk);
221
222 t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
223 if (t) {
224 sequence = per_cpu_ptr(bt->sequence, cpu);
225
226 t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
227 t->sequence = ++(*sequence);
228 t->time = ktime_to_ns(ktime_get());
229record_it:
230 /*
231 * These two are not needed in ftrace as they are in the
232 * generic trace_entry, filled by tracing_generic_entry_update,
233 * but for the trace_event->bin() synthesizer benefit we do it
234 * here too.
235 */
236 t->cpu = cpu;
237 t->pid = pid;
238
239 t->sector = sector;
240 t->bytes = bytes;
241 t->action = what;
242 t->device = bt->dev;
243 t->error = error;
244 t->pdu_len = pdu_len;
245
246 if (pdu_len)
247 memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
248
249 if (blk_tracer) {
250 trace_buffer_unlock_commit(blk_tr, event, 0, pc);
251 return;
252 }
253 }
254
255 local_irq_restore(flags);
256}
257
258static struct dentry *blk_tree_root;
259static DEFINE_MUTEX(blk_tree_mutex);
260
261static void blk_trace_free(struct blk_trace *bt)
262{
263 debugfs_remove(bt->msg_file);
264 debugfs_remove(bt->dropped_file);
265 relay_close(bt->rchan);
266 free_percpu(bt->sequence);
267 free_percpu(bt->msg_data);
268 kfree(bt);
269}
270
271static void blk_trace_cleanup(struct blk_trace *bt)
272{
273 blk_trace_free(bt);
274 if (atomic_dec_and_test(&blk_probes_ref))
275 blk_unregister_tracepoints();
276}
277
278int blk_trace_remove(struct request_queue *q)
279{
280 struct blk_trace *bt;
281
282 bt = xchg(&q->blk_trace, NULL);
283 if (!bt)
284 return -EINVAL;
285
286 if (bt->trace_state != Blktrace_running)
287 blk_trace_cleanup(bt);
288
289 return 0;
290}
291EXPORT_SYMBOL_GPL(blk_trace_remove);
292
293static int blk_dropped_open(struct inode *inode, struct file *filp)
294{
295 filp->private_data = inode->i_private;
296
297 return 0;
298}
299
300static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
301 size_t count, loff_t *ppos)
302{
303 struct blk_trace *bt = filp->private_data;
304 char buf[16];
305
306 snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped));
307
308 return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf));
309}
310
311static const struct file_operations blk_dropped_fops = {
312 .owner = THIS_MODULE,
313 .open = blk_dropped_open,
314 .read = blk_dropped_read,
315};
316
317static int blk_msg_open(struct inode *inode, struct file *filp)
318{
319 filp->private_data = inode->i_private;
320
321 return 0;
322}
323
324static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
325 size_t count, loff_t *ppos)
326{
327 char *msg;
328 struct blk_trace *bt;
329
330 if (count >= BLK_TN_MAX_MSG)
331 return -EINVAL;
332
333 msg = kmalloc(count + 1, GFP_KERNEL);
334 if (msg == NULL)
335 return -ENOMEM;
336
337 if (copy_from_user(msg, buffer, count)) {
338 kfree(msg);
339 return -EFAULT;
340 }
341
342 msg[count] = '\0';
343 bt = filp->private_data;
344 __trace_note_message(bt, "%s", msg);
345 kfree(msg);
346
347 return count;
348}
349
350static const struct file_operations blk_msg_fops = {
351 .owner = THIS_MODULE,
352 .open = blk_msg_open,
353 .write = blk_msg_write,
354};
355
356/*
357 * Keep track of how many times we encountered a full subbuffer, to aid
358 * the user space app in telling how many lost events there were.
359 */
360static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
361 void *prev_subbuf, size_t prev_padding)
362{
363 struct blk_trace *bt;
364
365 if (!relay_buf_full(buf))
366 return 1;
367
368 bt = buf->chan->private_data;
369 atomic_inc(&bt->dropped);
370 return 0;
371}
372
373static int blk_remove_buf_file_callback(struct dentry *dentry)
374{
375 struct dentry *parent = dentry->d_parent;
376 debugfs_remove(dentry);
377
378 /*
379 * this will fail for all but the last file, but that is ok. what we
380 * care about is the top level buts->name directory going away, when
381 * the last trace file is gone. Then we don't have to rmdir() that
382 * manually on trace stop, so it nicely solves the issue with
383 * force killing of running traces.
384 */
385
386 debugfs_remove(parent);
387 return 0;
388}
389
390static struct dentry *blk_create_buf_file_callback(const char *filename,
391 struct dentry *parent,
392 int mode,
393 struct rchan_buf *buf,
394 int *is_global)
395{
396 return debugfs_create_file(filename, mode, parent, buf,
397 &relay_file_operations);
398}
399
400static struct rchan_callbacks blk_relay_callbacks = {
401 .subbuf_start = blk_subbuf_start_callback,
402 .create_buf_file = blk_create_buf_file_callback,
403 .remove_buf_file = blk_remove_buf_file_callback,
404};
405
406/*
407 * Setup everything required to start tracing
408 */
409int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
410 struct blk_user_trace_setup *buts)
411{
412 struct blk_trace *old_bt, *bt = NULL;
413 struct dentry *dir = NULL;
414 int ret, i;
415
416 if (!buts->buf_size || !buts->buf_nr)
417 return -EINVAL;
418
419 strncpy(buts->name, name, BLKTRACE_BDEV_SIZE);
420 buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0';
421
422 /*
423 * some device names have larger paths - convert the slashes
424 * to underscores for this to work as expected
425 */
426 for (i = 0; i < strlen(buts->name); i++)
427 if (buts->name[i] == '/')
428 buts->name[i] = '_';
429
430 bt = kzalloc(sizeof(*bt), GFP_KERNEL);
431 if (!bt)
432 return -ENOMEM;
433
434 ret = -ENOMEM;
435 bt->sequence = alloc_percpu(unsigned long);
436 if (!bt->sequence)
437 goto err;
438
439 bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char));
440 if (!bt->msg_data)
441 goto err;
442
443 ret = -ENOENT;
444
445 mutex_lock(&blk_tree_mutex);
446 if (!blk_tree_root) {
447 blk_tree_root = debugfs_create_dir("block", NULL);
448 if (!blk_tree_root) {
449 mutex_unlock(&blk_tree_mutex);
450 goto err;
451 }
452 }
453 mutex_unlock(&blk_tree_mutex);
454
455 dir = debugfs_create_dir(buts->name, blk_tree_root);
456
457 if (!dir)
458 goto err;
459
460 bt->dir = dir;
461 bt->dev = dev;
462 atomic_set(&bt->dropped, 0);
463
464 ret = -EIO;
465 bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt,
466 &blk_dropped_fops);
467 if (!bt->dropped_file)
468 goto err;
469
470 bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops);
471 if (!bt->msg_file)
472 goto err;
473
474 bt->rchan = relay_open("trace", dir, buts->buf_size,
475 buts->buf_nr, &blk_relay_callbacks, bt);
476 if (!bt->rchan)
477 goto err;
478
479 bt->act_mask = buts->act_mask;
480 if (!bt->act_mask)
481 bt->act_mask = (u16) -1;
482
483 bt->start_lba = buts->start_lba;
484 bt->end_lba = buts->end_lba;
485 if (!bt->end_lba)
486 bt->end_lba = -1ULL;
487
488 bt->pid = buts->pid;
489 bt->trace_state = Blktrace_setup;
490
491 ret = -EBUSY;
492 old_bt = xchg(&q->blk_trace, bt);
493 if (old_bt) {
494 (void) xchg(&q->blk_trace, old_bt);
495 goto err;
496 }
497
498 if (atomic_inc_return(&blk_probes_ref) == 1)
499 blk_register_tracepoints();
500
501 return 0;
502err:
503 blk_trace_free(bt);
504 return ret;
505}
506
507int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
508 char __user *arg)
509{
510 struct blk_user_trace_setup buts;
511 int ret;
512
513 ret = copy_from_user(&buts, arg, sizeof(buts));
514 if (ret)
515 return -EFAULT;
516
517 ret = do_blk_trace_setup(q, name, dev, &buts);
518 if (ret)
519 return ret;
520
521 if (copy_to_user(arg, &buts, sizeof(buts)))
522 return -EFAULT;
523
524 return 0;
525}
526EXPORT_SYMBOL_GPL(blk_trace_setup);
527
528int blk_trace_startstop(struct request_queue *q, int start)
529{
530 int ret;
531 struct blk_trace *bt = q->blk_trace;
532
533 if (bt == NULL)
534 return -EINVAL;
535
536 /*
537 * For starting a trace, we can transition from a setup or stopped
538 * trace. For stopping a trace, the state must be running
539 */
540 ret = -EINVAL;
541 if (start) {
542 if (bt->trace_state == Blktrace_setup ||
543 bt->trace_state == Blktrace_stopped) {
544 blktrace_seq++;
545 smp_mb();
546 bt->trace_state = Blktrace_running;
547
548 trace_note_time(bt);
549 ret = 0;
550 }
551 } else {
552 if (bt->trace_state == Blktrace_running) {
553 bt->trace_state = Blktrace_stopped;
554 relay_flush(bt->rchan);
555 ret = 0;
556 }
557 }
558
559 return ret;
560}
561EXPORT_SYMBOL_GPL(blk_trace_startstop);
562
563/**
564 * blk_trace_ioctl: - handle the ioctls associated with tracing
565 * @bdev: the block device
566 * @cmd: the ioctl cmd
567 * @arg: the argument data, if any
568 *
569 **/
570int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
571{
572 struct request_queue *q;
573 int ret, start = 0;
574 char b[BDEVNAME_SIZE];
575
576 q = bdev_get_queue(bdev);
577 if (!q)
578 return -ENXIO;
579
580 mutex_lock(&bdev->bd_mutex);
581
582 switch (cmd) {
583 case BLKTRACESETUP:
584 bdevname(bdev, b);
585 ret = blk_trace_setup(q, b, bdev->bd_dev, arg);
586 break;
587 case BLKTRACESTART:
588 start = 1;
589 case BLKTRACESTOP:
590 ret = blk_trace_startstop(q, start);
591 break;
592 case BLKTRACETEARDOWN:
593 ret = blk_trace_remove(q);
594 break;
595 default:
596 ret = -ENOTTY;
597 break;
598 }
599
600 mutex_unlock(&bdev->bd_mutex);
601 return ret;
602}
603
604/**
605 * blk_trace_shutdown: - stop and cleanup trace structures
606 * @q: the request queue associated with the device
607 *
608 **/
609void blk_trace_shutdown(struct request_queue *q)
610{
611 if (q->blk_trace) {
612 blk_trace_startstop(q, 0);
613 blk_trace_remove(q);
614 }
615}
616
617/*
618 * blktrace probes
619 */
620
621/**
622 * blk_add_trace_rq - Add a trace for a request oriented action
623 * @q: queue the io is for
624 * @rq: the source request
625 * @what: the action
626 *
627 * Description:
628 * Records an action against a request. Will log the bio offset + size.
629 *
630 **/
631static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
632 u32 what)
633{
634 struct blk_trace *bt = q->blk_trace;
635 int rw = rq->cmd_flags & 0x03;
636
637 if (likely(!bt))
638 return;
639
640 if (blk_discard_rq(rq))
641 rw |= (1 << BIO_RW_DISCARD);
642
643 if (blk_pc_request(rq)) {
644 what |= BLK_TC_ACT(BLK_TC_PC);
645 __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors,
646 rq->cmd_len, rq->cmd);
647 } else {
648 what |= BLK_TC_ACT(BLK_TC_FS);
649 __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
650 rw, what, rq->errors, 0, NULL);
651 }
652}
653
654static void blk_add_trace_rq_abort(struct request_queue *q, struct request *rq)
655{
656 blk_add_trace_rq(q, rq, BLK_TA_ABORT);
657}
658
659static void blk_add_trace_rq_insert(struct request_queue *q, struct request *rq)
660{
661 blk_add_trace_rq(q, rq, BLK_TA_INSERT);
662}
663
664static void blk_add_trace_rq_issue(struct request_queue *q, struct request *rq)
665{
666 blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
667}
668
669static void blk_add_trace_rq_requeue(struct request_queue *q,
670 struct request *rq)
671{
672 blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
673}
674
675static void blk_add_trace_rq_complete(struct request_queue *q,
676 struct request *rq)
677{
678 blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);
679}
680
681/**
682 * blk_add_trace_bio - Add a trace for a bio oriented action
683 * @q: queue the io is for
684 * @bio: the source bio
685 * @what: the action
686 *
687 * Description:
688 * Records an action against a bio. Will log the bio offset + size.
689 *
690 **/
691static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
692 u32 what)
693{
694 struct blk_trace *bt = q->blk_trace;
695
696 if (likely(!bt))
697 return;
698
699 __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what,
700 !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
701}
702
703static void blk_add_trace_bio_bounce(struct request_queue *q, struct bio *bio)
704{
705 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE);
706}
707
708static void blk_add_trace_bio_complete(struct request_queue *q, struct bio *bio)
709{
710 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE);
711}
712
713static void blk_add_trace_bio_backmerge(struct request_queue *q,
714 struct bio *bio)
715{
716 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
717}
718
719static void blk_add_trace_bio_frontmerge(struct request_queue *q,
720 struct bio *bio)
721{
722 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
723}
724
725static void blk_add_trace_bio_queue(struct request_queue *q, struct bio *bio)
726{
727 blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
728}
729
730static void blk_add_trace_getrq(struct request_queue *q,
731 struct bio *bio, int rw)
732{
733 if (bio)
734 blk_add_trace_bio(q, bio, BLK_TA_GETRQ);
735 else {
736 struct blk_trace *bt = q->blk_trace;
737
738 if (bt)
739 __blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL);
740 }
741}
742
743
744static void blk_add_trace_sleeprq(struct request_queue *q,
745 struct bio *bio, int rw)
746{
747 if (bio)
748 blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ);
749 else {
750 struct blk_trace *bt = q->blk_trace;
751
752 if (bt)
753 __blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ,
754 0, 0, NULL);
755 }
756}
757
758static void blk_add_trace_plug(struct request_queue *q)
759{
760 struct blk_trace *bt = q->blk_trace;
761
762 if (bt)
763 __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
764}
765
766static void blk_add_trace_unplug_io(struct request_queue *q)
767{
768 struct blk_trace *bt = q->blk_trace;
769
770 if (bt) {
771 unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE];
772 __be64 rpdu = cpu_to_be64(pdu);
773
774 __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0,
775 sizeof(rpdu), &rpdu);
776 }
777}
778
779static void blk_add_trace_unplug_timer(struct request_queue *q)
780{
781 struct blk_trace *bt = q->blk_trace;
782
783 if (bt) {
784 unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE];
785 __be64 rpdu = cpu_to_be64(pdu);
786
787 __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_TIMER, 0,
788 sizeof(rpdu), &rpdu);
789 }
790}
791
792static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
793 unsigned int pdu)
794{
795 struct blk_trace *bt = q->blk_trace;
796
797 if (bt) {
798 __be64 rpdu = cpu_to_be64(pdu);
799
800 __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw,
801 BLK_TA_SPLIT, !bio_flagged(bio, BIO_UPTODATE),
802 sizeof(rpdu), &rpdu);
803 }
804}
805
806/**
807 * blk_add_trace_remap - Add a trace for a remap operation
808 * @q: queue the io is for
809 * @bio: the source bio
810 * @dev: target device
811 * @from: source sector
812 * @to: target sector
813 *
814 * Description:
815 * Device mapper or raid target sometimes need to split a bio because
816 * it spans a stripe (or similar). Add a trace for that action.
817 *
818 **/
819static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
820 dev_t dev, sector_t from, sector_t to)
821{
822 struct blk_trace *bt = q->blk_trace;
823 struct blk_io_trace_remap r;
824
825 if (likely(!bt))
826 return;
827
828 r.device = cpu_to_be32(dev);
829 r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev);
830 r.sector = cpu_to_be64(to);
831
832 __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP,
833 !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
834}
835
836/**
837 * blk_add_driver_data - Add binary message with driver-specific data
838 * @q: queue the io is for
839 * @rq: io request
840 * @data: driver-specific data
841 * @len: length of driver-specific data
842 *
843 * Description:
844 * Some drivers might want to write driver-specific data per request.
845 *
846 **/
847void blk_add_driver_data(struct request_queue *q,
848 struct request *rq,
849 void *data, size_t len)
850{
851 struct blk_trace *bt = q->blk_trace;
852
853 if (likely(!bt))
854 return;
855
856 if (blk_pc_request(rq))
857 __blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA,
858 rq->errors, len, data);
859 else
860 __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
861 0, BLK_TA_DRV_DATA, rq->errors, len, data);
862}
863EXPORT_SYMBOL_GPL(blk_add_driver_data);
864
865static void blk_register_tracepoints(void)
866{
867 int ret;
868
869 ret = register_trace_block_rq_abort(blk_add_trace_rq_abort);
870 WARN_ON(ret);
871 ret = register_trace_block_rq_insert(blk_add_trace_rq_insert);
872 WARN_ON(ret);
873 ret = register_trace_block_rq_issue(blk_add_trace_rq_issue);
874 WARN_ON(ret);
875 ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue);
876 WARN_ON(ret);
877 ret = register_trace_block_rq_complete(blk_add_trace_rq_complete);
878 WARN_ON(ret);
879 ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce);
880 WARN_ON(ret);
881 ret = register_trace_block_bio_complete(blk_add_trace_bio_complete);
882 WARN_ON(ret);
883 ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge);
884 WARN_ON(ret);
885 ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge);
886 WARN_ON(ret);
887 ret = register_trace_block_bio_queue(blk_add_trace_bio_queue);
888 WARN_ON(ret);
889 ret = register_trace_block_getrq(blk_add_trace_getrq);
890 WARN_ON(ret);
891 ret = register_trace_block_sleeprq(blk_add_trace_sleeprq);
892 WARN_ON(ret);
893 ret = register_trace_block_plug(blk_add_trace_plug);
894 WARN_ON(ret);
895 ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer);
896 WARN_ON(ret);
897 ret = register_trace_block_unplug_io(blk_add_trace_unplug_io);
898 WARN_ON(ret);
899 ret = register_trace_block_split(blk_add_trace_split);
900 WARN_ON(ret);
901 ret = register_trace_block_remap(blk_add_trace_remap);
902 WARN_ON(ret);
903}
904
905static void blk_unregister_tracepoints(void)
906{
907 unregister_trace_block_remap(blk_add_trace_remap);
908 unregister_trace_block_split(blk_add_trace_split);
909 unregister_trace_block_unplug_io(blk_add_trace_unplug_io);
910 unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer);
911 unregister_trace_block_plug(blk_add_trace_plug);
912 unregister_trace_block_sleeprq(blk_add_trace_sleeprq);
913 unregister_trace_block_getrq(blk_add_trace_getrq);
914 unregister_trace_block_bio_queue(blk_add_trace_bio_queue);
915 unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge);
916 unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge);
917 unregister_trace_block_bio_complete(blk_add_trace_bio_complete);
918 unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce);
919 unregister_trace_block_rq_complete(blk_add_trace_rq_complete);
920 unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue);
921 unregister_trace_block_rq_issue(blk_add_trace_rq_issue);
922 unregister_trace_block_rq_insert(blk_add_trace_rq_insert);
923 unregister_trace_block_rq_abort(blk_add_trace_rq_abort);
924
925 tracepoint_synchronize_unregister();
926}
927
928/*
929 * struct blk_io_tracer formatting routines
930 */
931
932static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
933{
934 int i = 0;
935 int tc = t->action >> BLK_TC_SHIFT;
936
937 if (t->action == BLK_TN_MESSAGE) {
938 rwbs[i++] = 'N';
939 goto out;
940 }
941
942 if (tc & BLK_TC_DISCARD)
943 rwbs[i++] = 'D';
944 else if (tc & BLK_TC_WRITE)
945 rwbs[i++] = 'W';
946 else if (t->bytes)
947 rwbs[i++] = 'R';
948 else
949 rwbs[i++] = 'N';
950
951 if (tc & BLK_TC_AHEAD)
952 rwbs[i++] = 'A';
953 if (tc & BLK_TC_BARRIER)
954 rwbs[i++] = 'B';
955 if (tc & BLK_TC_SYNC)
956 rwbs[i++] = 'S';
957 if (tc & BLK_TC_META)
958 rwbs[i++] = 'M';
959out:
960 rwbs[i] = '\0';
961}
962
963static inline
964const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent)
965{
966 return (const struct blk_io_trace *)ent;
967}
968
969static inline const void *pdu_start(const struct trace_entry *ent)
970{
971 return te_blk_io_trace(ent) + 1;
972}
973
974static inline u32 t_sec(const struct trace_entry *ent)
975{
976 return te_blk_io_trace(ent)->bytes >> 9;
977}
978
979static inline unsigned long long t_sector(const struct trace_entry *ent)
980{
981 return te_blk_io_trace(ent)->sector;
982}
983
984static inline __u16 t_error(const struct trace_entry *ent)
985{
986 return te_blk_io_trace(ent)->error;
987}
988
989static __u64 get_pdu_int(const struct trace_entry *ent)
990{
991 const __u64 *val = pdu_start(ent);
992 return be64_to_cpu(*val);
993}
994
995static void get_pdu_remap(const struct trace_entry *ent,
996 struct blk_io_trace_remap *r)
997{
998 const struct blk_io_trace_remap *__r = pdu_start(ent);
999 __u64 sector = __r->sector;
1000
1001 r->device = be32_to_cpu(__r->device);
1002 r->device_from = be32_to_cpu(__r->device_from);
1003 r->sector = be64_to_cpu(sector);
1004}
1005
1006typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act);
1007
1008static int blk_log_action_classic(struct trace_iterator *iter, const char *act)
1009{
1010 char rwbs[6];
1011 unsigned long long ts = iter->ts;
1012 unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC);
1013 unsigned secs = (unsigned long)ts;
1014 const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
1015
1016 fill_rwbs(rwbs, t);
1017
1018 return trace_seq_printf(&iter->seq,
1019 "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ",
1020 MAJOR(t->device), MINOR(t->device), iter->cpu,
1021 secs, nsec_rem, iter->ent->pid, act, rwbs);
1022}
1023
1024static int blk_log_action(struct trace_iterator *iter, const char *act)
1025{
1026 char rwbs[6];
1027 const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
1028
1029 fill_rwbs(rwbs, t);
1030 return trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ",
1031 MAJOR(t->device), MINOR(t->device), act, rwbs);
1032}
1033
1034static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
1035{
1036 char cmd[TASK_COMM_LEN];
1037
1038 trace_find_cmdline(ent->pid, cmd);
1039
1040 if (t_sec(ent))
1041 return trace_seq_printf(s, "%llu + %u [%s]\n",
1042 t_sector(ent), t_sec(ent), cmd);
1043 return trace_seq_printf(s, "[%s]\n", cmd);
1044}
1045
1046static int blk_log_with_error(struct trace_seq *s,
1047 const struct trace_entry *ent)
1048{
1049 if (t_sec(ent))
1050 return trace_seq_printf(s, "%llu + %u [%d]\n", t_sector(ent),
1051 t_sec(ent), t_error(ent));
1052 return trace_seq_printf(s, "%llu [%d]\n", t_sector(ent), t_error(ent));
1053}
1054
1055static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
1056{
1057 struct blk_io_trace_remap r = { .device = 0, };
1058
1059 get_pdu_remap(ent, &r);
1060 return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
1061 t_sector(ent),
1062 t_sec(ent), MAJOR(r.device), MINOR(r.device),
1063 (unsigned long long)r.sector);
1064}
1065
1066static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
1067{
1068 char cmd[TASK_COMM_LEN];
1069
1070 trace_find_cmdline(ent->pid, cmd);
1071
1072 return trace_seq_printf(s, "[%s]\n", cmd);
1073}
1074
1075static int blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent)
1076{
1077 char cmd[TASK_COMM_LEN];
1078
1079 trace_find_cmdline(ent->pid, cmd);
1080
1081 return trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent));
1082}
1083
1084static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent)
1085{
1086 char cmd[TASK_COMM_LEN];
1087
1088 trace_find_cmdline(ent->pid, cmd);
1089
1090 return trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent),
1091 get_pdu_int(ent), cmd);
1092}
1093
1094static int blk_log_msg(struct trace_seq *s, const struct trace_entry *ent)
1095{
1096 int ret;
1097 const struct blk_io_trace *t = te_blk_io_trace(ent);
1098
1099 ret = trace_seq_putmem(s, t + 1, t->pdu_len);
1100 if (ret)
1101 return trace_seq_putc(s, '\n');
1102 return ret;
1103}
1104
1105/*
1106 * struct tracer operations
1107 */
1108
1109static void blk_tracer_print_header(struct seq_file *m)
1110{
1111 if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC))
1112 return;
1113 seq_puts(m, "# DEV CPU TIMESTAMP PID ACT FLG\n"
1114 "# | | | | | |\n");
1115}
1116
1117static void blk_tracer_start(struct trace_array *tr)
1118{
1119 blk_tracer_enabled = true;
1120 trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
1121}
1122
1123static int blk_tracer_init(struct trace_array *tr)
1124{
1125 blk_tr = tr;
1126 blk_tracer_start(tr);
1127 return 0;
1128}
1129
1130static void blk_tracer_stop(struct trace_array *tr)
1131{
1132 blk_tracer_enabled = false;
1133 trace_flags |= TRACE_ITER_CONTEXT_INFO;
1134}
1135
1136static void blk_tracer_reset(struct trace_array *tr)
1137{
1138 blk_tracer_stop(tr);
1139}
1140
1141static const struct {
1142 const char *act[2];
1143 int (*print)(struct trace_seq *s, const struct trace_entry *ent);
1144} what2act[] = {
1145 [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic },
1146 [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic },
1147 [__BLK_TA_FRONTMERGE] = {{ "F", "frontmerge" }, blk_log_generic },
1148 [__BLK_TA_GETRQ] = {{ "G", "getrq" }, blk_log_generic },
1149 [__BLK_TA_SLEEPRQ] = {{ "S", "sleeprq" }, blk_log_generic },
1150 [__BLK_TA_REQUEUE] = {{ "R", "requeue" }, blk_log_with_error },
1151 [__BLK_TA_ISSUE] = {{ "D", "issue" }, blk_log_generic },
1152 [__BLK_TA_COMPLETE] = {{ "C", "complete" }, blk_log_with_error },
1153 [__BLK_TA_PLUG] = {{ "P", "plug" }, blk_log_plug },
1154 [__BLK_TA_UNPLUG_IO] = {{ "U", "unplug_io" }, blk_log_unplug },
1155 [__BLK_TA_UNPLUG_TIMER] = {{ "UT", "unplug_timer" }, blk_log_unplug },
1156 [__BLK_TA_INSERT] = {{ "I", "insert" }, blk_log_generic },
1157 [__BLK_TA_SPLIT] = {{ "X", "split" }, blk_log_split },
1158 [__BLK_TA_BOUNCE] = {{ "B", "bounce" }, blk_log_generic },
1159 [__BLK_TA_REMAP] = {{ "A", "remap" }, blk_log_remap },
1160};
1161
1162static enum print_line_t print_one_line(struct trace_iterator *iter,
1163 bool classic)
1164{
1165 struct trace_seq *s = &iter->seq;
1166 const struct blk_io_trace *t;
1167 u16 what;
1168 int ret;
1169 bool long_act;
1170 blk_log_action_t *log_action;
1171
1172 t = te_blk_io_trace(iter->ent);
1173 what = t->action & ((1 << BLK_TC_SHIFT) - 1);
1174 long_act = !!(trace_flags & TRACE_ITER_VERBOSE);
1175 log_action = classic ? &blk_log_action_classic : &blk_log_action;
1176
1177 if (t->action == BLK_TN_MESSAGE) {
1178 ret = log_action(iter, long_act ? "message" : "m");
1179 if (ret)
1180 ret = blk_log_msg(s, iter->ent);
1181 goto out;
1182 }
1183
1184 if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
1185 ret = trace_seq_printf(s, "Bad pc action %x\n", what);
1186 else {
1187 ret = log_action(iter, what2act[what].act[long_act]);
1188 if (ret)
1189 ret = what2act[what].print(s, iter->ent);
1190 }
1191out:
1192 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
1193}
1194
1195static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
1196 int flags)
1197{
1198 if (!trace_print_context(iter))
1199 return TRACE_TYPE_PARTIAL_LINE;
1200
1201 return print_one_line(iter, false);
1202}
1203
1204static int blk_trace_synthesize_old_trace(struct trace_iterator *iter)
1205{
1206 struct trace_seq *s = &iter->seq;
1207 struct blk_io_trace *t = (struct blk_io_trace *)iter->ent;
1208 const int offset = offsetof(struct blk_io_trace, sector);
1209 struct blk_io_trace old = {
1210 .magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION,
1211 .time = iter->ts,
1212 };
1213
1214 if (!trace_seq_putmem(s, &old, offset))
1215 return 0;
1216 return trace_seq_putmem(s, &t->sector,
1217 sizeof(old) - offset + t->pdu_len);
1218}
1219
1220static enum print_line_t
1221blk_trace_event_print_binary(struct trace_iterator *iter, int flags)
1222{
1223 return blk_trace_synthesize_old_trace(iter) ?
1224 TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
1225}
1226
1227static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
1228{
1229 if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC))
1230 return TRACE_TYPE_UNHANDLED;
1231
1232 return print_one_line(iter, true);
1233}
1234
1235static struct tracer blk_tracer __read_mostly = {
1236 .name = "blk",
1237 .init = blk_tracer_init,
1238 .reset = blk_tracer_reset,
1239 .start = blk_tracer_start,
1240 .stop = blk_tracer_stop,
1241 .print_header = blk_tracer_print_header,
1242 .print_line = blk_tracer_print_line,
1243 .flags = &blk_tracer_flags,
1244};
1245
1246static struct trace_event trace_blk_event = {
1247 .type = TRACE_BLK,
1248 .trace = blk_trace_event_print,
1249 .binary = blk_trace_event_print_binary,
1250};
1251
1252static int __init init_blk_tracer(void)
1253{
1254 if (!register_ftrace_event(&trace_blk_event)) {
1255 pr_warning("Warning: could not register block events\n");
1256 return 1;
1257 }
1258
1259 if (register_tracer(&blk_tracer) != 0) {
1260 pr_warning("Warning: could not register the block tracer\n");
1261 unregister_ftrace_event(&trace_blk_event);
1262 return 1;
1263 }
1264
1265 return 0;
1266}
1267
1268device_initcall(init_blk_tracer);
1269
1270static int blk_trace_remove_queue(struct request_queue *q)
1271{
1272 struct blk_trace *bt;
1273
1274 bt = xchg(&q->blk_trace, NULL);
1275 if (bt == NULL)
1276 return -EINVAL;
1277
1278 if (atomic_dec_and_test(&blk_probes_ref))
1279 blk_unregister_tracepoints();
1280
1281 blk_trace_free(bt);
1282 return 0;
1283}
1284
1285/*
1286 * Setup everything required to start tracing
1287 */
1288static int blk_trace_setup_queue(struct request_queue *q, dev_t dev)
1289{
1290 struct blk_trace *old_bt, *bt = NULL;
1291 int ret = -ENOMEM;
1292
1293 bt = kzalloc(sizeof(*bt), GFP_KERNEL);
1294 if (!bt)
1295 return -ENOMEM;
1296
1297 bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char));
1298 if (!bt->msg_data)
1299 goto free_bt;
1300
1301 bt->dev = dev;
1302 bt->act_mask = (u16)-1;
1303 bt->end_lba = -1ULL;
1304
1305 old_bt = xchg(&q->blk_trace, bt);
1306 if (old_bt != NULL) {
1307 (void)xchg(&q->blk_trace, old_bt);
1308 ret = -EBUSY;
1309 goto free_bt;
1310 }
1311
1312 if (atomic_inc_return(&blk_probes_ref) == 1)
1313 blk_register_tracepoints();
1314 return 0;
1315
1316free_bt:
1317 blk_trace_free(bt);
1318 return ret;
1319}
1320
1321/*
1322 * sysfs interface to enable and configure tracing
1323 */
1324
1325static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
1326 struct device_attribute *attr,
1327 char *buf);
1328static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
1329 struct device_attribute *attr,
1330 const char *buf, size_t count);
1331#define BLK_TRACE_DEVICE_ATTR(_name) \
1332 DEVICE_ATTR(_name, S_IRUGO | S_IWUSR, \
1333 sysfs_blk_trace_attr_show, \
1334 sysfs_blk_trace_attr_store)
1335
1336static BLK_TRACE_DEVICE_ATTR(enable);
1337static BLK_TRACE_DEVICE_ATTR(act_mask);
1338static BLK_TRACE_DEVICE_ATTR(pid);
1339static BLK_TRACE_DEVICE_ATTR(start_lba);
1340static BLK_TRACE_DEVICE_ATTR(end_lba);
1341
1342static struct attribute *blk_trace_attrs[] = {
1343 &dev_attr_enable.attr,
1344 &dev_attr_act_mask.attr,
1345 &dev_attr_pid.attr,
1346 &dev_attr_start_lba.attr,
1347 &dev_attr_end_lba.attr,
1348 NULL
1349};
1350
1351struct attribute_group blk_trace_attr_group = {
1352 .name = "trace",
1353 .attrs = blk_trace_attrs,
1354};
1355
1356static const struct {
1357 int mask;
1358 const char *str;
1359} mask_maps[] = {
1360 { BLK_TC_READ, "read" },
1361 { BLK_TC_WRITE, "write" },
1362 { BLK_TC_BARRIER, "barrier" },
1363 { BLK_TC_SYNC, "sync" },
1364 { BLK_TC_QUEUE, "queue" },
1365 { BLK_TC_REQUEUE, "requeue" },
1366 { BLK_TC_ISSUE, "issue" },
1367 { BLK_TC_COMPLETE, "complete" },
1368 { BLK_TC_FS, "fs" },
1369 { BLK_TC_PC, "pc" },
1370 { BLK_TC_AHEAD, "ahead" },
1371 { BLK_TC_META, "meta" },
1372 { BLK_TC_DISCARD, "discard" },
1373 { BLK_TC_DRV_DATA, "drv_data" },
1374};
1375
1376static int blk_trace_str2mask(const char *str)
1377{
1378 int i;
1379 int mask = 0;
1380 char *buf, *s, *token;
1381
1382 buf = kstrdup(str, GFP_KERNEL);
1383 if (buf == NULL)
1384 return -ENOMEM;
1385 s = strstrip(buf);
1386
1387 while (1) {
1388 token = strsep(&s, ",");
1389 if (token == NULL)
1390 break;
1391
1392 if (*token == '\0')
1393 continue;
1394
1395 for (i = 0; i < ARRAY_SIZE(mask_maps); i++) {
1396 if (strcasecmp(token, mask_maps[i].str) == 0) {
1397 mask |= mask_maps[i].mask;
1398 break;
1399 }
1400 }
1401 if (i == ARRAY_SIZE(mask_maps)) {
1402 mask = -EINVAL;
1403 break;
1404 }
1405 }
1406 kfree(buf);
1407
1408 return mask;
1409}
1410
1411static ssize_t blk_trace_mask2str(char *buf, int mask)
1412{
1413 int i;
1414 char *p = buf;
1415
1416 for (i = 0; i < ARRAY_SIZE(mask_maps); i++) {
1417 if (mask & mask_maps[i].mask) {
1418 p += sprintf(p, "%s%s",
1419 (p == buf) ? "" : ",", mask_maps[i].str);
1420 }
1421 }
1422 *p++ = '\n';
1423
1424 return p - buf;
1425}
1426
1427static struct request_queue *blk_trace_get_queue(struct block_device *bdev)
1428{
1429 if (bdev->bd_disk == NULL)
1430 return NULL;
1431
1432 return bdev_get_queue(bdev);
1433}
1434
1435static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
1436 struct device_attribute *attr,
1437 char *buf)
1438{
1439 struct hd_struct *p = dev_to_part(dev);
1440 struct request_queue *q;
1441 struct block_device *bdev;
1442 ssize_t ret = -ENXIO;
1443
1444 lock_kernel();
1445 bdev = bdget(part_devt(p));
1446 if (bdev == NULL)
1447 goto out_unlock_kernel;
1448
1449 q = blk_trace_get_queue(bdev);
1450 if (q == NULL)
1451 goto out_bdput;
1452
1453 mutex_lock(&bdev->bd_mutex);
1454
1455 if (attr == &dev_attr_enable) {
1456 ret = sprintf(buf, "%u\n", !!q->blk_trace);
1457 goto out_unlock_bdev;
1458 }
1459
1460 if (q->blk_trace == NULL)
1461 ret = sprintf(buf, "disabled\n");
1462 else if (attr == &dev_attr_act_mask)
1463 ret = blk_trace_mask2str(buf, q->blk_trace->act_mask);
1464 else if (attr == &dev_attr_pid)
1465 ret = sprintf(buf, "%u\n", q->blk_trace->pid);
1466 else if (attr == &dev_attr_start_lba)
1467 ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba);
1468 else if (attr == &dev_attr_end_lba)
1469 ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba);
1470
1471out_unlock_bdev:
1472 mutex_unlock(&bdev->bd_mutex);
1473out_bdput:
1474 bdput(bdev);
1475out_unlock_kernel:
1476 unlock_kernel();
1477 return ret;
1478}
1479
1480static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
1481 struct device_attribute *attr,
1482 const char *buf, size_t count)
1483{
1484 struct block_device *bdev;
1485 struct request_queue *q;
1486 struct hd_struct *p;
1487 u64 value;
1488 ssize_t ret = -EINVAL;
1489
1490 if (count == 0)
1491 goto out;
1492
1493 if (attr == &dev_attr_act_mask) {
1494 if (sscanf(buf, "%llx", &value) != 1) {
1495 /* Assume it is a list of trace category names */
1496 ret = blk_trace_str2mask(buf);
1497 if (ret < 0)
1498 goto out;
1499 value = ret;
1500 }
1501 } else if (sscanf(buf, "%llu", &value) != 1)
1502 goto out;
1503
1504 ret = -ENXIO;
1505
1506 lock_kernel();
1507 p = dev_to_part(dev);
1508 bdev = bdget(part_devt(p));
1509 if (bdev == NULL)
1510 goto out_unlock_kernel;
1511
1512 q = blk_trace_get_queue(bdev);
1513 if (q == NULL)
1514 goto out_bdput;
1515
1516 mutex_lock(&bdev->bd_mutex);
1517
1518 if (attr == &dev_attr_enable) {
1519 if (value)
1520 ret = blk_trace_setup_queue(q, bdev->bd_dev);
1521 else
1522 ret = blk_trace_remove_queue(q);
1523 goto out_unlock_bdev;
1524 }
1525
1526 ret = 0;
1527 if (q->blk_trace == NULL)
1528 ret = blk_trace_setup_queue(q, bdev->bd_dev);
1529
1530 if (ret == 0) {
1531 if (attr == &dev_attr_act_mask)
1532 q->blk_trace->act_mask = value;
1533 else if (attr == &dev_attr_pid)
1534 q->blk_trace->pid = value;
1535 else if (attr == &dev_attr_start_lba)
1536 q->blk_trace->start_lba = value;
1537 else if (attr == &dev_attr_end_lba)
1538 q->blk_trace->end_lba = value;
1539 }
1540
1541out_unlock_bdev:
1542 mutex_unlock(&bdev->bd_mutex);
1543out_bdput:
1544 bdput(bdev);
1545out_unlock_kernel:
1546 unlock_kernel();
1547out:
1548 return ret ? ret : count;
1549}
1550
diff --git a/kernel/trace/events.c b/kernel/trace/events.c
new file mode 100644
index 000000000000..246f2aa6dc46
--- /dev/null
+++ b/kernel/trace/events.c
@@ -0,0 +1,14 @@
1/*
2 * This is the place to register all trace points as events.
3 */
4
5#include <linux/stringify.h>
6
7#include <trace/trace_events.h>
8
9#include "trace_output.h"
10
11#include "trace_events_stage_1.h"
12#include "trace_events_stage_2.h"
13#include "trace_events_stage_3.h"
14
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 53e8c8bc0c98..f1ed080406c3 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -27,6 +27,9 @@
27#include <linux/sysctl.h> 27#include <linux/sysctl.h>
28#include <linux/ctype.h> 28#include <linux/ctype.h>
29#include <linux/list.h> 29#include <linux/list.h>
30#include <linux/hash.h>
31
32#include <trace/sched.h>
30 33
31#include <asm/ftrace.h> 34#include <asm/ftrace.h>
32 35
@@ -44,14 +47,14 @@
44 ftrace_kill(); \ 47 ftrace_kill(); \
45 } while (0) 48 } while (0)
46 49
50/* hash bits for specific function selection */
51#define FTRACE_HASH_BITS 7
52#define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS)
53
47/* ftrace_enabled is a method to turn ftrace on or off */ 54/* ftrace_enabled is a method to turn ftrace on or off */
48int ftrace_enabled __read_mostly; 55int ftrace_enabled __read_mostly;
49static int last_ftrace_enabled; 56static int last_ftrace_enabled;
50 57
51/* set when tracing only a pid */
52struct pid *ftrace_pid_trace;
53static struct pid * const ftrace_swapper_pid = &init_struct_pid;
54
55/* Quick disabling of function tracer. */ 58/* Quick disabling of function tracer. */
56int function_trace_stop; 59int function_trace_stop;
57 60
@@ -61,9 +64,7 @@ int function_trace_stop;
61 */ 64 */
62static int ftrace_disabled __read_mostly; 65static int ftrace_disabled __read_mostly;
63 66
64static DEFINE_SPINLOCK(ftrace_lock); 67static DEFINE_MUTEX(ftrace_lock);
65static DEFINE_MUTEX(ftrace_sysctl_lock);
66static DEFINE_MUTEX(ftrace_start_lock);
67 68
68static struct ftrace_ops ftrace_list_end __read_mostly = 69static struct ftrace_ops ftrace_list_end __read_mostly =
69{ 70{
@@ -134,9 +135,6 @@ static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip)
134 135
135static int __register_ftrace_function(struct ftrace_ops *ops) 136static int __register_ftrace_function(struct ftrace_ops *ops)
136{ 137{
137 /* should not be called from interrupt context */
138 spin_lock(&ftrace_lock);
139
140 ops->next = ftrace_list; 138 ops->next = ftrace_list;
141 /* 139 /*
142 * We are entering ops into the ftrace_list but another 140 * We are entering ops into the ftrace_list but another
@@ -172,18 +170,12 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
172#endif 170#endif
173 } 171 }
174 172
175 spin_unlock(&ftrace_lock);
176
177 return 0; 173 return 0;
178} 174}
179 175
180static int __unregister_ftrace_function(struct ftrace_ops *ops) 176static int __unregister_ftrace_function(struct ftrace_ops *ops)
181{ 177{
182 struct ftrace_ops **p; 178 struct ftrace_ops **p;
183 int ret = 0;
184
185 /* should not be called from interrupt context */
186 spin_lock(&ftrace_lock);
187 179
188 /* 180 /*
189 * If we are removing the last function, then simply point 181 * If we are removing the last function, then simply point
@@ -192,17 +184,15 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
192 if (ftrace_list == ops && ops->next == &ftrace_list_end) { 184 if (ftrace_list == ops && ops->next == &ftrace_list_end) {
193 ftrace_trace_function = ftrace_stub; 185 ftrace_trace_function = ftrace_stub;
194 ftrace_list = &ftrace_list_end; 186 ftrace_list = &ftrace_list_end;
195 goto out; 187 return 0;
196 } 188 }
197 189
198 for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next) 190 for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next)
199 if (*p == ops) 191 if (*p == ops)
200 break; 192 break;
201 193
202 if (*p != ops) { 194 if (*p != ops)
203 ret = -1; 195 return -1;
204 goto out;
205 }
206 196
207 *p = (*p)->next; 197 *p = (*p)->next;
208 198
@@ -223,21 +213,15 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
223 } 213 }
224 } 214 }
225 215
226 out: 216 return 0;
227 spin_unlock(&ftrace_lock);
228
229 return ret;
230} 217}
231 218
232static void ftrace_update_pid_func(void) 219static void ftrace_update_pid_func(void)
233{ 220{
234 ftrace_func_t func; 221 ftrace_func_t func;
235 222
236 /* should not be called from interrupt context */
237 spin_lock(&ftrace_lock);
238
239 if (ftrace_trace_function == ftrace_stub) 223 if (ftrace_trace_function == ftrace_stub)
240 goto out; 224 return;
241 225
242 func = ftrace_trace_function; 226 func = ftrace_trace_function;
243 227
@@ -254,23 +238,29 @@ static void ftrace_update_pid_func(void)
254#else 238#else
255 __ftrace_trace_function = func; 239 __ftrace_trace_function = func;
256#endif 240#endif
257
258 out:
259 spin_unlock(&ftrace_lock);
260} 241}
261 242
243/* set when tracing only a pid */
244struct pid *ftrace_pid_trace;
245static struct pid * const ftrace_swapper_pid = &init_struct_pid;
246
262#ifdef CONFIG_DYNAMIC_FTRACE 247#ifdef CONFIG_DYNAMIC_FTRACE
248
263#ifndef CONFIG_FTRACE_MCOUNT_RECORD 249#ifndef CONFIG_FTRACE_MCOUNT_RECORD
264# error Dynamic ftrace depends on MCOUNT_RECORD 250# error Dynamic ftrace depends on MCOUNT_RECORD
265#endif 251#endif
266 252
267/* 253static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly;
268 * Since MCOUNT_ADDR may point to mcount itself, we do not want 254
269 * to get it confused by reading a reference in the code as we 255struct ftrace_func_probe {
270 * are parsing on objcopy output of text. Use a variable for 256 struct hlist_node node;
271 * it instead. 257 struct ftrace_probe_ops *ops;
272 */ 258 unsigned long flags;
273static unsigned long mcount_addr = MCOUNT_ADDR; 259 unsigned long ip;
260 void *data;
261 struct rcu_head rcu;
262};
263
274 264
275enum { 265enum {
276 FTRACE_ENABLE_CALLS = (1 << 0), 266 FTRACE_ENABLE_CALLS = (1 << 0),
@@ -284,13 +274,13 @@ enum {
284 274
285static int ftrace_filtered; 275static int ftrace_filtered;
286 276
287static LIST_HEAD(ftrace_new_addrs); 277static struct dyn_ftrace *ftrace_new_addrs;
288 278
289static DEFINE_MUTEX(ftrace_regex_lock); 279static DEFINE_MUTEX(ftrace_regex_lock);
290 280
291struct ftrace_page { 281struct ftrace_page {
292 struct ftrace_page *next; 282 struct ftrace_page *next;
293 unsigned long index; 283 int index;
294 struct dyn_ftrace records[]; 284 struct dyn_ftrace records[];
295}; 285};
296 286
@@ -305,6 +295,19 @@ static struct ftrace_page *ftrace_pages;
305 295
306static struct dyn_ftrace *ftrace_free_records; 296static struct dyn_ftrace *ftrace_free_records;
307 297
298/*
299 * This is a double for. Do not use 'break' to break out of the loop,
300 * you must use a goto.
301 */
302#define do_for_each_ftrace_rec(pg, rec) \
303 for (pg = ftrace_pages_start; pg; pg = pg->next) { \
304 int _____i; \
305 for (_____i = 0; _____i < pg->index; _____i++) { \
306 rec = &pg->records[_____i];
307
308#define while_for_each_ftrace_rec() \
309 } \
310 }
308 311
309#ifdef CONFIG_KPROBES 312#ifdef CONFIG_KPROBES
310 313
@@ -338,7 +341,7 @@ static inline int record_frozen(struct dyn_ftrace *rec)
338 341
339static void ftrace_free_rec(struct dyn_ftrace *rec) 342static void ftrace_free_rec(struct dyn_ftrace *rec)
340{ 343{
341 rec->ip = (unsigned long)ftrace_free_records; 344 rec->freelist = ftrace_free_records;
342 ftrace_free_records = rec; 345 ftrace_free_records = rec;
343 rec->flags |= FTRACE_FL_FREE; 346 rec->flags |= FTRACE_FL_FREE;
344} 347}
@@ -349,23 +352,22 @@ void ftrace_release(void *start, unsigned long size)
349 struct ftrace_page *pg; 352 struct ftrace_page *pg;
350 unsigned long s = (unsigned long)start; 353 unsigned long s = (unsigned long)start;
351 unsigned long e = s + size; 354 unsigned long e = s + size;
352 int i;
353 355
354 if (ftrace_disabled || !start) 356 if (ftrace_disabled || !start)
355 return; 357 return;
356 358
357 /* should not be called from interrupt context */ 359 mutex_lock(&ftrace_lock);
358 spin_lock(&ftrace_lock); 360 do_for_each_ftrace_rec(pg, rec) {
359 361 if ((rec->ip >= s) && (rec->ip < e)) {
360 for (pg = ftrace_pages_start; pg; pg = pg->next) { 362 /*
361 for (i = 0; i < pg->index; i++) { 363 * rec->ip is changed in ftrace_free_rec()
362 rec = &pg->records[i]; 364 * It should not between s and e if record was freed.
363 365 */
364 if ((rec->ip >= s) && (rec->ip < e)) 366 FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE);
365 ftrace_free_rec(rec); 367 ftrace_free_rec(rec);
366 } 368 }
367 } 369 } while_for_each_ftrace_rec();
368 spin_unlock(&ftrace_lock); 370 mutex_unlock(&ftrace_lock);
369} 371}
370 372
371static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) 373static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
@@ -382,7 +384,7 @@ static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
382 return NULL; 384 return NULL;
383 } 385 }
384 386
385 ftrace_free_records = (void *)rec->ip; 387 ftrace_free_records = rec->freelist;
386 memset(rec, 0, sizeof(*rec)); 388 memset(rec, 0, sizeof(*rec));
387 return rec; 389 return rec;
388 } 390 }
@@ -414,8 +416,8 @@ ftrace_record_ip(unsigned long ip)
414 return NULL; 416 return NULL;
415 417
416 rec->ip = ip; 418 rec->ip = ip;
417 419 rec->newlist = ftrace_new_addrs;
418 list_add(&rec->list, &ftrace_new_addrs); 420 ftrace_new_addrs = rec;
419 421
420 return rec; 422 return rec;
421} 423}
@@ -461,10 +463,10 @@ static void ftrace_bug(int failed, unsigned long ip)
461static int 463static int
462__ftrace_replace_code(struct dyn_ftrace *rec, int enable) 464__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
463{ 465{
464 unsigned long ip, fl;
465 unsigned long ftrace_addr; 466 unsigned long ftrace_addr;
467 unsigned long ip, fl;
466 468
467 ftrace_addr = (unsigned long)ftrace_caller; 469 ftrace_addr = (unsigned long)FTRACE_ADDR;
468 470
469 ip = rec->ip; 471 ip = rec->ip;
470 472
@@ -473,7 +475,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
473 * it is not enabled then do nothing. 475 * it is not enabled then do nothing.
474 * 476 *
475 * If this record is not to be traced and 477 * If this record is not to be traced and
476 * it is enabled then disabled it. 478 * it is enabled then disable it.
477 * 479 *
478 */ 480 */
479 if (rec->flags & FTRACE_FL_NOTRACE) { 481 if (rec->flags & FTRACE_FL_NOTRACE) {
@@ -493,7 +495,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
493 if (fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) 495 if (fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED))
494 return 0; 496 return 0;
495 497
496 /* Record is not filtered and is not enabled do nothing */ 498 /* Record is not filtered or enabled, do nothing */
497 if (!fl) 499 if (!fl)
498 return 0; 500 return 0;
499 501
@@ -515,7 +517,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
515 517
516 } else { 518 } else {
517 519
518 /* if record is not enabled do nothing */ 520 /* if record is not enabled, do nothing */
519 if (!(rec->flags & FTRACE_FL_ENABLED)) 521 if (!(rec->flags & FTRACE_FL_ENABLED))
520 return 0; 522 return 0;
521 523
@@ -531,41 +533,41 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
531 533
532static void ftrace_replace_code(int enable) 534static void ftrace_replace_code(int enable)
533{ 535{
534 int i, failed;
535 struct dyn_ftrace *rec; 536 struct dyn_ftrace *rec;
536 struct ftrace_page *pg; 537 struct ftrace_page *pg;
538 int failed;
537 539
538 for (pg = ftrace_pages_start; pg; pg = pg->next) { 540 do_for_each_ftrace_rec(pg, rec) {
539 for (i = 0; i < pg->index; i++) { 541 /*
540 rec = &pg->records[i]; 542 * Skip over free records, records that have
541 543 * failed and not converted.
542 /* 544 */
543 * Skip over free records and records that have 545 if (rec->flags & FTRACE_FL_FREE ||
544 * failed. 546 rec->flags & FTRACE_FL_FAILED ||
545 */ 547 !(rec->flags & FTRACE_FL_CONVERTED))
546 if (rec->flags & FTRACE_FL_FREE || 548 continue;
547 rec->flags & FTRACE_FL_FAILED)
548 continue;
549 549
550 /* ignore updates to this record's mcount site */ 550 /* ignore updates to this record's mcount site */
551 if (get_kprobe((void *)rec->ip)) { 551 if (get_kprobe((void *)rec->ip)) {
552 freeze_record(rec); 552 freeze_record(rec);
553 continue; 553 continue;
554 } else { 554 } else {
555 unfreeze_record(rec); 555 unfreeze_record(rec);
556 } 556 }
557 557
558 failed = __ftrace_replace_code(rec, enable); 558 failed = __ftrace_replace_code(rec, enable);
559 if (failed && (rec->flags & FTRACE_FL_CONVERTED)) { 559 if (failed) {
560 rec->flags |= FTRACE_FL_FAILED; 560 rec->flags |= FTRACE_FL_FAILED;
561 if ((system_state == SYSTEM_BOOTING) || 561 if ((system_state == SYSTEM_BOOTING) ||
562 !core_kernel_text(rec->ip)) { 562 !core_kernel_text(rec->ip)) {
563 ftrace_free_rec(rec); 563 ftrace_free_rec(rec);
564 } else 564 } else {
565 ftrace_bug(failed, rec->ip); 565 ftrace_bug(failed, rec->ip);
566 } 566 /* Stop processing */
567 return;
568 }
567 } 569 }
568 } 570 } while_for_each_ftrace_rec();
569} 571}
570 572
571static int 573static int
@@ -576,7 +578,7 @@ ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
576 578
577 ip = rec->ip; 579 ip = rec->ip;
578 580
579 ret = ftrace_make_nop(mod, rec, mcount_addr); 581 ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR);
580 if (ret) { 582 if (ret) {
581 ftrace_bug(ret, ip); 583 ftrace_bug(ret, ip);
582 rec->flags |= FTRACE_FL_FAILED; 584 rec->flags |= FTRACE_FL_FAILED;
@@ -585,6 +587,24 @@ ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
585 return 1; 587 return 1;
586} 588}
587 589
590/*
591 * archs can override this function if they must do something
592 * before the modifying code is performed.
593 */
594int __weak ftrace_arch_code_modify_prepare(void)
595{
596 return 0;
597}
598
599/*
600 * archs can override this function if they must do something
601 * after the modifying code is performed.
602 */
603int __weak ftrace_arch_code_modify_post_process(void)
604{
605 return 0;
606}
607
588static int __ftrace_modify_code(void *data) 608static int __ftrace_modify_code(void *data)
589{ 609{
590 int *command = data; 610 int *command = data;
@@ -607,7 +627,17 @@ static int __ftrace_modify_code(void *data)
607 627
608static void ftrace_run_update_code(int command) 628static void ftrace_run_update_code(int command)
609{ 629{
630 int ret;
631
632 ret = ftrace_arch_code_modify_prepare();
633 FTRACE_WARN_ON(ret);
634 if (ret)
635 return;
636
610 stop_machine(__ftrace_modify_code, &command, NULL); 637 stop_machine(__ftrace_modify_code, &command, NULL);
638
639 ret = ftrace_arch_code_modify_post_process();
640 FTRACE_WARN_ON(ret);
611} 641}
612 642
613static ftrace_func_t saved_ftrace_func; 643static ftrace_func_t saved_ftrace_func;
@@ -631,13 +661,10 @@ static void ftrace_startup(int command)
631 if (unlikely(ftrace_disabled)) 661 if (unlikely(ftrace_disabled))
632 return; 662 return;
633 663
634 mutex_lock(&ftrace_start_lock);
635 ftrace_start_up++; 664 ftrace_start_up++;
636 command |= FTRACE_ENABLE_CALLS; 665 command |= FTRACE_ENABLE_CALLS;
637 666
638 ftrace_startup_enable(command); 667 ftrace_startup_enable(command);
639
640 mutex_unlock(&ftrace_start_lock);
641} 668}
642 669
643static void ftrace_shutdown(int command) 670static void ftrace_shutdown(int command)
@@ -645,7 +672,6 @@ static void ftrace_shutdown(int command)
645 if (unlikely(ftrace_disabled)) 672 if (unlikely(ftrace_disabled))
646 return; 673 return;
647 674
648 mutex_lock(&ftrace_start_lock);
649 ftrace_start_up--; 675 ftrace_start_up--;
650 if (!ftrace_start_up) 676 if (!ftrace_start_up)
651 command |= FTRACE_DISABLE_CALLS; 677 command |= FTRACE_DISABLE_CALLS;
@@ -656,11 +682,9 @@ static void ftrace_shutdown(int command)
656 } 682 }
657 683
658 if (!command || !ftrace_enabled) 684 if (!command || !ftrace_enabled)
659 goto out; 685 return;
660 686
661 ftrace_run_update_code(command); 687 ftrace_run_update_code(command);
662 out:
663 mutex_unlock(&ftrace_start_lock);
664} 688}
665 689
666static void ftrace_startup_sysctl(void) 690static void ftrace_startup_sysctl(void)
@@ -670,7 +694,6 @@ static void ftrace_startup_sysctl(void)
670 if (unlikely(ftrace_disabled)) 694 if (unlikely(ftrace_disabled))
671 return; 695 return;
672 696
673 mutex_lock(&ftrace_start_lock);
674 /* Force update next time */ 697 /* Force update next time */
675 saved_ftrace_func = NULL; 698 saved_ftrace_func = NULL;
676 /* ftrace_start_up is true if we want ftrace running */ 699 /* ftrace_start_up is true if we want ftrace running */
@@ -678,7 +701,6 @@ static void ftrace_startup_sysctl(void)
678 command |= FTRACE_ENABLE_CALLS; 701 command |= FTRACE_ENABLE_CALLS;
679 702
680 ftrace_run_update_code(command); 703 ftrace_run_update_code(command);
681 mutex_unlock(&ftrace_start_lock);
682} 704}
683 705
684static void ftrace_shutdown_sysctl(void) 706static void ftrace_shutdown_sysctl(void)
@@ -688,13 +710,11 @@ static void ftrace_shutdown_sysctl(void)
688 if (unlikely(ftrace_disabled)) 710 if (unlikely(ftrace_disabled))
689 return; 711 return;
690 712
691 mutex_lock(&ftrace_start_lock);
692 /* ftrace_start_up is true if ftrace is running */ 713 /* ftrace_start_up is true if ftrace is running */
693 if (ftrace_start_up) 714 if (ftrace_start_up)
694 command |= FTRACE_DISABLE_CALLS; 715 command |= FTRACE_DISABLE_CALLS;
695 716
696 ftrace_run_update_code(command); 717 ftrace_run_update_code(command);
697 mutex_unlock(&ftrace_start_lock);
698} 718}
699 719
700static cycle_t ftrace_update_time; 720static cycle_t ftrace_update_time;
@@ -703,19 +723,21 @@ unsigned long ftrace_update_tot_cnt;
703 723
704static int ftrace_update_code(struct module *mod) 724static int ftrace_update_code(struct module *mod)
705{ 725{
706 struct dyn_ftrace *p, *t; 726 struct dyn_ftrace *p;
707 cycle_t start, stop; 727 cycle_t start, stop;
708 728
709 start = ftrace_now(raw_smp_processor_id()); 729 start = ftrace_now(raw_smp_processor_id());
710 ftrace_update_cnt = 0; 730 ftrace_update_cnt = 0;
711 731
712 list_for_each_entry_safe(p, t, &ftrace_new_addrs, list) { 732 while (ftrace_new_addrs) {
713 733
714 /* If something went wrong, bail without enabling anything */ 734 /* If something went wrong, bail without enabling anything */
715 if (unlikely(ftrace_disabled)) 735 if (unlikely(ftrace_disabled))
716 return -1; 736 return -1;
717 737
718 list_del_init(&p->list); 738 p = ftrace_new_addrs;
739 ftrace_new_addrs = p->newlist;
740 p->flags = 0L;
719 741
720 /* convert record (i.e, patch mcount-call with NOP) */ 742 /* convert record (i.e, patch mcount-call with NOP) */
721 if (ftrace_code_disable(mod, p)) { 743 if (ftrace_code_disable(mod, p)) {
@@ -781,13 +803,16 @@ enum {
781 FTRACE_ITER_CONT = (1 << 1), 803 FTRACE_ITER_CONT = (1 << 1),
782 FTRACE_ITER_NOTRACE = (1 << 2), 804 FTRACE_ITER_NOTRACE = (1 << 2),
783 FTRACE_ITER_FAILURES = (1 << 3), 805 FTRACE_ITER_FAILURES = (1 << 3),
806 FTRACE_ITER_PRINTALL = (1 << 4),
807 FTRACE_ITER_HASH = (1 << 5),
784}; 808};
785 809
786#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ 810#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
787 811
788struct ftrace_iterator { 812struct ftrace_iterator {
789 struct ftrace_page *pg; 813 struct ftrace_page *pg;
790 unsigned idx; 814 int hidx;
815 int idx;
791 unsigned flags; 816 unsigned flags;
792 unsigned char buffer[FTRACE_BUFF_MAX+1]; 817 unsigned char buffer[FTRACE_BUFF_MAX+1];
793 unsigned buffer_idx; 818 unsigned buffer_idx;
@@ -795,15 +820,89 @@ struct ftrace_iterator {
795}; 820};
796 821
797static void * 822static void *
823t_hash_next(struct seq_file *m, void *v, loff_t *pos)
824{
825 struct ftrace_iterator *iter = m->private;
826 struct hlist_node *hnd = v;
827 struct hlist_head *hhd;
828
829 WARN_ON(!(iter->flags & FTRACE_ITER_HASH));
830
831 (*pos)++;
832
833 retry:
834 if (iter->hidx >= FTRACE_FUNC_HASHSIZE)
835 return NULL;
836
837 hhd = &ftrace_func_hash[iter->hidx];
838
839 if (hlist_empty(hhd)) {
840 iter->hidx++;
841 hnd = NULL;
842 goto retry;
843 }
844
845 if (!hnd)
846 hnd = hhd->first;
847 else {
848 hnd = hnd->next;
849 if (!hnd) {
850 iter->hidx++;
851 goto retry;
852 }
853 }
854
855 return hnd;
856}
857
858static void *t_hash_start(struct seq_file *m, loff_t *pos)
859{
860 struct ftrace_iterator *iter = m->private;
861 void *p = NULL;
862
863 iter->flags |= FTRACE_ITER_HASH;
864
865 return t_hash_next(m, p, pos);
866}
867
868static int t_hash_show(struct seq_file *m, void *v)
869{
870 struct ftrace_func_probe *rec;
871 struct hlist_node *hnd = v;
872 char str[KSYM_SYMBOL_LEN];
873
874 rec = hlist_entry(hnd, struct ftrace_func_probe, node);
875
876 if (rec->ops->print)
877 return rec->ops->print(m, rec->ip, rec->ops, rec->data);
878
879 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
880 seq_printf(m, "%s:", str);
881
882 kallsyms_lookup((unsigned long)rec->ops->func, NULL, NULL, NULL, str);
883 seq_printf(m, "%s", str);
884
885 if (rec->data)
886 seq_printf(m, ":%p", rec->data);
887 seq_putc(m, '\n');
888
889 return 0;
890}
891
892static void *
798t_next(struct seq_file *m, void *v, loff_t *pos) 893t_next(struct seq_file *m, void *v, loff_t *pos)
799{ 894{
800 struct ftrace_iterator *iter = m->private; 895 struct ftrace_iterator *iter = m->private;
801 struct dyn_ftrace *rec = NULL; 896 struct dyn_ftrace *rec = NULL;
802 897
898 if (iter->flags & FTRACE_ITER_HASH)
899 return t_hash_next(m, v, pos);
900
803 (*pos)++; 901 (*pos)++;
804 902
805 /* should not be called from interrupt context */ 903 if (iter->flags & FTRACE_ITER_PRINTALL)
806 spin_lock(&ftrace_lock); 904 return NULL;
905
807 retry: 906 retry:
808 if (iter->idx >= iter->pg->index) { 907 if (iter->idx >= iter->pg->index) {
809 if (iter->pg->next) { 908 if (iter->pg->next) {
@@ -832,7 +931,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
832 goto retry; 931 goto retry;
833 } 932 }
834 } 933 }
835 spin_unlock(&ftrace_lock);
836 934
837 return rec; 935 return rec;
838} 936}
@@ -842,6 +940,23 @@ static void *t_start(struct seq_file *m, loff_t *pos)
842 struct ftrace_iterator *iter = m->private; 940 struct ftrace_iterator *iter = m->private;
843 void *p = NULL; 941 void *p = NULL;
844 942
943 mutex_lock(&ftrace_lock);
944 /*
945 * For set_ftrace_filter reading, if we have the filter
946 * off, we can short cut and just print out that all
947 * functions are enabled.
948 */
949 if (iter->flags & FTRACE_ITER_FILTER && !ftrace_filtered) {
950 if (*pos > 0)
951 return t_hash_start(m, pos);
952 iter->flags |= FTRACE_ITER_PRINTALL;
953 (*pos)++;
954 return iter;
955 }
956
957 if (iter->flags & FTRACE_ITER_HASH)
958 return t_hash_start(m, pos);
959
845 if (*pos > 0) { 960 if (*pos > 0) {
846 if (iter->idx < 0) 961 if (iter->idx < 0)
847 return p; 962 return p;
@@ -851,18 +966,31 @@ static void *t_start(struct seq_file *m, loff_t *pos)
851 966
852 p = t_next(m, p, pos); 967 p = t_next(m, p, pos);
853 968
969 if (!p)
970 return t_hash_start(m, pos);
971
854 return p; 972 return p;
855} 973}
856 974
857static void t_stop(struct seq_file *m, void *p) 975static void t_stop(struct seq_file *m, void *p)
858{ 976{
977 mutex_unlock(&ftrace_lock);
859} 978}
860 979
861static int t_show(struct seq_file *m, void *v) 980static int t_show(struct seq_file *m, void *v)
862{ 981{
982 struct ftrace_iterator *iter = m->private;
863 struct dyn_ftrace *rec = v; 983 struct dyn_ftrace *rec = v;
864 char str[KSYM_SYMBOL_LEN]; 984 char str[KSYM_SYMBOL_LEN];
865 985
986 if (iter->flags & FTRACE_ITER_HASH)
987 return t_hash_show(m, v);
988
989 if (iter->flags & FTRACE_ITER_PRINTALL) {
990 seq_printf(m, "#### all functions enabled ####\n");
991 return 0;
992 }
993
866 if (!rec) 994 if (!rec)
867 return 0; 995 return 0;
868 996
@@ -941,23 +1069,16 @@ static void ftrace_filter_reset(int enable)
941 struct ftrace_page *pg; 1069 struct ftrace_page *pg;
942 struct dyn_ftrace *rec; 1070 struct dyn_ftrace *rec;
943 unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; 1071 unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
944 unsigned i;
945 1072
946 /* should not be called from interrupt context */ 1073 mutex_lock(&ftrace_lock);
947 spin_lock(&ftrace_lock);
948 if (enable) 1074 if (enable)
949 ftrace_filtered = 0; 1075 ftrace_filtered = 0;
950 pg = ftrace_pages_start; 1076 do_for_each_ftrace_rec(pg, rec) {
951 while (pg) { 1077 if (rec->flags & FTRACE_FL_FAILED)
952 for (i = 0; i < pg->index; i++) { 1078 continue;
953 rec = &pg->records[i]; 1079 rec->flags &= ~type;
954 if (rec->flags & FTRACE_FL_FAILED) 1080 } while_for_each_ftrace_rec();
955 continue; 1081 mutex_unlock(&ftrace_lock);
956 rec->flags &= ~type;
957 }
958 pg = pg->next;
959 }
960 spin_unlock(&ftrace_lock);
961} 1082}
962 1083
963static int 1084static int
@@ -1008,16 +1129,6 @@ ftrace_notrace_open(struct inode *inode, struct file *file)
1008 return ftrace_regex_open(inode, file, 0); 1129 return ftrace_regex_open(inode, file, 0);
1009} 1130}
1010 1131
1011static ssize_t
1012ftrace_regex_read(struct file *file, char __user *ubuf,
1013 size_t cnt, loff_t *ppos)
1014{
1015 if (file->f_mode & FMODE_READ)
1016 return seq_read(file, ubuf, cnt, ppos);
1017 else
1018 return -EPERM;
1019}
1020
1021static loff_t 1132static loff_t
1022ftrace_regex_lseek(struct file *file, loff_t offset, int origin) 1133ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
1023{ 1134{
@@ -1038,86 +1149,536 @@ enum {
1038 MATCH_END_ONLY, 1149 MATCH_END_ONLY,
1039}; 1150};
1040 1151
1041static void 1152/*
1042ftrace_match(unsigned char *buff, int len, int enable) 1153 * (static function - no need for kernel doc)
1154 *
1155 * Pass in a buffer containing a glob and this function will
1156 * set search to point to the search part of the buffer and
1157 * return the type of search it is (see enum above).
1158 * This does modify buff.
1159 *
1160 * Returns enum type.
1161 * search returns the pointer to use for comparison.
1162 * not returns 1 if buff started with a '!'
1163 * 0 otherwise.
1164 */
1165static int
1166ftrace_setup_glob(char *buff, int len, char **search, int *not)
1043{ 1167{
1044 char str[KSYM_SYMBOL_LEN];
1045 char *search = NULL;
1046 struct ftrace_page *pg;
1047 struct dyn_ftrace *rec;
1048 int type = MATCH_FULL; 1168 int type = MATCH_FULL;
1049 unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; 1169 int i;
1050 unsigned i, match = 0, search_len = 0;
1051 int not = 0;
1052 1170
1053 if (buff[0] == '!') { 1171 if (buff[0] == '!') {
1054 not = 1; 1172 *not = 1;
1055 buff++; 1173 buff++;
1056 len--; 1174 len--;
1057 } 1175 } else
1176 *not = 0;
1177
1178 *search = buff;
1058 1179
1059 for (i = 0; i < len; i++) { 1180 for (i = 0; i < len; i++) {
1060 if (buff[i] == '*') { 1181 if (buff[i] == '*') {
1061 if (!i) { 1182 if (!i) {
1062 search = buff + i + 1; 1183 *search = buff + 1;
1063 type = MATCH_END_ONLY; 1184 type = MATCH_END_ONLY;
1064 search_len = len - (i + 1);
1065 } else { 1185 } else {
1066 if (type == MATCH_END_ONLY) { 1186 if (type == MATCH_END_ONLY)
1067 type = MATCH_MIDDLE_ONLY; 1187 type = MATCH_MIDDLE_ONLY;
1068 } else { 1188 else
1069 match = i;
1070 type = MATCH_FRONT_ONLY; 1189 type = MATCH_FRONT_ONLY;
1071 }
1072 buff[i] = 0; 1190 buff[i] = 0;
1073 break; 1191 break;
1074 } 1192 }
1075 } 1193 }
1076 } 1194 }
1077 1195
1078 /* should not be called from interrupt context */ 1196 return type;
1079 spin_lock(&ftrace_lock); 1197}
1080 if (enable) 1198
1081 ftrace_filtered = 1; 1199static int ftrace_match(char *str, char *regex, int len, int type)
1082 pg = ftrace_pages_start; 1200{
1083 while (pg) { 1201 int matched = 0;
1084 for (i = 0; i < pg->index; i++) { 1202 char *ptr;
1085 int matched = 0; 1203
1086 char *ptr; 1204 switch (type) {
1087 1205 case MATCH_FULL:
1088 rec = &pg->records[i]; 1206 if (strcmp(str, regex) == 0)
1089 if (rec->flags & FTRACE_FL_FAILED) 1207 matched = 1;
1208 break;
1209 case MATCH_FRONT_ONLY:
1210 if (strncmp(str, regex, len) == 0)
1211 matched = 1;
1212 break;
1213 case MATCH_MIDDLE_ONLY:
1214 if (strstr(str, regex))
1215 matched = 1;
1216 break;
1217 case MATCH_END_ONLY:
1218 ptr = strstr(str, regex);
1219 if (ptr && (ptr[len] == 0))
1220 matched = 1;
1221 break;
1222 }
1223
1224 return matched;
1225}
1226
1227static int
1228ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type)
1229{
1230 char str[KSYM_SYMBOL_LEN];
1231
1232 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
1233 return ftrace_match(str, regex, len, type);
1234}
1235
1236static void ftrace_match_records(char *buff, int len, int enable)
1237{
1238 unsigned int search_len;
1239 struct ftrace_page *pg;
1240 struct dyn_ftrace *rec;
1241 unsigned long flag;
1242 char *search;
1243 int type;
1244 int not;
1245
1246 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
1247 type = ftrace_setup_glob(buff, len, &search, &not);
1248
1249 search_len = strlen(search);
1250
1251 mutex_lock(&ftrace_lock);
1252 do_for_each_ftrace_rec(pg, rec) {
1253
1254 if (rec->flags & FTRACE_FL_FAILED)
1255 continue;
1256
1257 if (ftrace_match_record(rec, search, search_len, type)) {
1258 if (not)
1259 rec->flags &= ~flag;
1260 else
1261 rec->flags |= flag;
1262 }
1263 /*
1264 * Only enable filtering if we have a function that
1265 * is filtered on.
1266 */
1267 if (enable && (rec->flags & FTRACE_FL_FILTER))
1268 ftrace_filtered = 1;
1269 } while_for_each_ftrace_rec();
1270 mutex_unlock(&ftrace_lock);
1271}
1272
1273static int
1274ftrace_match_module_record(struct dyn_ftrace *rec, char *mod,
1275 char *regex, int len, int type)
1276{
1277 char str[KSYM_SYMBOL_LEN];
1278 char *modname;
1279
1280 kallsyms_lookup(rec->ip, NULL, NULL, &modname, str);
1281
1282 if (!modname || strcmp(modname, mod))
1283 return 0;
1284
1285 /* blank search means to match all funcs in the mod */
1286 if (len)
1287 return ftrace_match(str, regex, len, type);
1288 else
1289 return 1;
1290}
1291
1292static void ftrace_match_module_records(char *buff, char *mod, int enable)
1293{
1294 unsigned search_len = 0;
1295 struct ftrace_page *pg;
1296 struct dyn_ftrace *rec;
1297 int type = MATCH_FULL;
1298 char *search = buff;
1299 unsigned long flag;
1300 int not = 0;
1301
1302 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
1303
1304 /* blank or '*' mean the same */
1305 if (strcmp(buff, "*") == 0)
1306 buff[0] = 0;
1307
1308 /* handle the case of 'dont filter this module' */
1309 if (strcmp(buff, "!") == 0 || strcmp(buff, "!*") == 0) {
1310 buff[0] = 0;
1311 not = 1;
1312 }
1313
1314 if (strlen(buff)) {
1315 type = ftrace_setup_glob(buff, strlen(buff), &search, &not);
1316 search_len = strlen(search);
1317 }
1318
1319 mutex_lock(&ftrace_lock);
1320 do_for_each_ftrace_rec(pg, rec) {
1321
1322 if (rec->flags & FTRACE_FL_FAILED)
1323 continue;
1324
1325 if (ftrace_match_module_record(rec, mod,
1326 search, search_len, type)) {
1327 if (not)
1328 rec->flags &= ~flag;
1329 else
1330 rec->flags |= flag;
1331 }
1332 if (enable && (rec->flags & FTRACE_FL_FILTER))
1333 ftrace_filtered = 1;
1334
1335 } while_for_each_ftrace_rec();
1336 mutex_unlock(&ftrace_lock);
1337}
1338
1339/*
1340 * We register the module command as a template to show others how
1341 * to register the a command as well.
1342 */
1343
1344static int
1345ftrace_mod_callback(char *func, char *cmd, char *param, int enable)
1346{
1347 char *mod;
1348
1349 /*
1350 * cmd == 'mod' because we only registered this func
1351 * for the 'mod' ftrace_func_command.
1352 * But if you register one func with multiple commands,
1353 * you can tell which command was used by the cmd
1354 * parameter.
1355 */
1356
1357 /* we must have a module name */
1358 if (!param)
1359 return -EINVAL;
1360
1361 mod = strsep(&param, ":");
1362 if (!strlen(mod))
1363 return -EINVAL;
1364
1365 ftrace_match_module_records(func, mod, enable);
1366 return 0;
1367}
1368
1369static struct ftrace_func_command ftrace_mod_cmd = {
1370 .name = "mod",
1371 .func = ftrace_mod_callback,
1372};
1373
1374static int __init ftrace_mod_cmd_init(void)
1375{
1376 return register_ftrace_command(&ftrace_mod_cmd);
1377}
1378device_initcall(ftrace_mod_cmd_init);
1379
1380static void
1381function_trace_probe_call(unsigned long ip, unsigned long parent_ip)
1382{
1383 struct ftrace_func_probe *entry;
1384 struct hlist_head *hhd;
1385 struct hlist_node *n;
1386 unsigned long key;
1387 int resched;
1388
1389 key = hash_long(ip, FTRACE_HASH_BITS);
1390
1391 hhd = &ftrace_func_hash[key];
1392
1393 if (hlist_empty(hhd))
1394 return;
1395
1396 /*
1397 * Disable preemption for these calls to prevent a RCU grace
1398 * period. This syncs the hash iteration and freeing of items
1399 * on the hash. rcu_read_lock is too dangerous here.
1400 */
1401 resched = ftrace_preempt_disable();
1402 hlist_for_each_entry_rcu(entry, n, hhd, node) {
1403 if (entry->ip == ip)
1404 entry->ops->func(ip, parent_ip, &entry->data);
1405 }
1406 ftrace_preempt_enable(resched);
1407}
1408
1409static struct ftrace_ops trace_probe_ops __read_mostly =
1410{
1411 .func = function_trace_probe_call,
1412};
1413
1414static int ftrace_probe_registered;
1415
1416static void __enable_ftrace_function_probe(void)
1417{
1418 int i;
1419
1420 if (ftrace_probe_registered)
1421 return;
1422
1423 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
1424 struct hlist_head *hhd = &ftrace_func_hash[i];
1425 if (hhd->first)
1426 break;
1427 }
1428 /* Nothing registered? */
1429 if (i == FTRACE_FUNC_HASHSIZE)
1430 return;
1431
1432 __register_ftrace_function(&trace_probe_ops);
1433 ftrace_startup(0);
1434 ftrace_probe_registered = 1;
1435}
1436
1437static void __disable_ftrace_function_probe(void)
1438{
1439 int i;
1440
1441 if (!ftrace_probe_registered)
1442 return;
1443
1444 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
1445 struct hlist_head *hhd = &ftrace_func_hash[i];
1446 if (hhd->first)
1447 return;
1448 }
1449
1450 /* no more funcs left */
1451 __unregister_ftrace_function(&trace_probe_ops);
1452 ftrace_shutdown(0);
1453 ftrace_probe_registered = 0;
1454}
1455
1456
1457static void ftrace_free_entry_rcu(struct rcu_head *rhp)
1458{
1459 struct ftrace_func_probe *entry =
1460 container_of(rhp, struct ftrace_func_probe, rcu);
1461
1462 if (entry->ops->free)
1463 entry->ops->free(&entry->data);
1464 kfree(entry);
1465}
1466
1467
1468int
1469register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
1470 void *data)
1471{
1472 struct ftrace_func_probe *entry;
1473 struct ftrace_page *pg;
1474 struct dyn_ftrace *rec;
1475 int type, len, not;
1476 unsigned long key;
1477 int count = 0;
1478 char *search;
1479
1480 type = ftrace_setup_glob(glob, strlen(glob), &search, &not);
1481 len = strlen(search);
1482
1483 /* we do not support '!' for function probes */
1484 if (WARN_ON(not))
1485 return -EINVAL;
1486
1487 mutex_lock(&ftrace_lock);
1488 do_for_each_ftrace_rec(pg, rec) {
1489
1490 if (rec->flags & FTRACE_FL_FAILED)
1491 continue;
1492
1493 if (!ftrace_match_record(rec, search, len, type))
1494 continue;
1495
1496 entry = kmalloc(sizeof(*entry), GFP_KERNEL);
1497 if (!entry) {
1498 /* If we did not process any, then return error */
1499 if (!count)
1500 count = -ENOMEM;
1501 goto out_unlock;
1502 }
1503
1504 count++;
1505
1506 entry->data = data;
1507
1508 /*
1509 * The caller might want to do something special
1510 * for each function we find. We call the callback
1511 * to give the caller an opportunity to do so.
1512 */
1513 if (ops->callback) {
1514 if (ops->callback(rec->ip, &entry->data) < 0) {
1515 /* caller does not like this func */
1516 kfree(entry);
1090 continue; 1517 continue;
1091 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
1092 switch (type) {
1093 case MATCH_FULL:
1094 if (strcmp(str, buff) == 0)
1095 matched = 1;
1096 break;
1097 case MATCH_FRONT_ONLY:
1098 if (memcmp(str, buff, match) == 0)
1099 matched = 1;
1100 break;
1101 case MATCH_MIDDLE_ONLY:
1102 if (strstr(str, search))
1103 matched = 1;
1104 break;
1105 case MATCH_END_ONLY:
1106 ptr = strstr(str, search);
1107 if (ptr && (ptr[search_len] == 0))
1108 matched = 1;
1109 break;
1110 } 1518 }
1111 if (matched) { 1519 }
1112 if (not) 1520
1113 rec->flags &= ~flag; 1521 entry->ops = ops;
1114 else 1522 entry->ip = rec->ip;
1115 rec->flags |= flag; 1523
1524 key = hash_long(entry->ip, FTRACE_HASH_BITS);
1525 hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]);
1526
1527 } while_for_each_ftrace_rec();
1528 __enable_ftrace_function_probe();
1529
1530 out_unlock:
1531 mutex_unlock(&ftrace_lock);
1532
1533 return count;
1534}
1535
1536enum {
1537 PROBE_TEST_FUNC = 1,
1538 PROBE_TEST_DATA = 2
1539};
1540
1541static void
1542__unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
1543 void *data, int flags)
1544{
1545 struct ftrace_func_probe *entry;
1546 struct hlist_node *n, *tmp;
1547 char str[KSYM_SYMBOL_LEN];
1548 int type = MATCH_FULL;
1549 int i, len = 0;
1550 char *search;
1551
1552 if (glob && (strcmp(glob, "*") || !strlen(glob)))
1553 glob = NULL;
1554 else {
1555 int not;
1556
1557 type = ftrace_setup_glob(glob, strlen(glob), &search, &not);
1558 len = strlen(search);
1559
1560 /* we do not support '!' for function probes */
1561 if (WARN_ON(not))
1562 return;
1563 }
1564
1565 mutex_lock(&ftrace_lock);
1566 for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
1567 struct hlist_head *hhd = &ftrace_func_hash[i];
1568
1569 hlist_for_each_entry_safe(entry, n, tmp, hhd, node) {
1570
1571 /* break up if statements for readability */
1572 if ((flags & PROBE_TEST_FUNC) && entry->ops != ops)
1573 continue;
1574
1575 if ((flags & PROBE_TEST_DATA) && entry->data != data)
1576 continue;
1577
1578 /* do this last, since it is the most expensive */
1579 if (glob) {
1580 kallsyms_lookup(entry->ip, NULL, NULL,
1581 NULL, str);
1582 if (!ftrace_match(str, glob, len, type))
1583 continue;
1116 } 1584 }
1585
1586 hlist_del(&entry->node);
1587 call_rcu(&entry->rcu, ftrace_free_entry_rcu);
1117 } 1588 }
1118 pg = pg->next;
1119 } 1589 }
1120 spin_unlock(&ftrace_lock); 1590 __disable_ftrace_function_probe();
1591 mutex_unlock(&ftrace_lock);
1592}
1593
1594void
1595unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
1596 void *data)
1597{
1598 __unregister_ftrace_function_probe(glob, ops, data,
1599 PROBE_TEST_FUNC | PROBE_TEST_DATA);
1600}
1601
1602void
1603unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops)
1604{
1605 __unregister_ftrace_function_probe(glob, ops, NULL, PROBE_TEST_FUNC);
1606}
1607
1608void unregister_ftrace_function_probe_all(char *glob)
1609{
1610 __unregister_ftrace_function_probe(glob, NULL, NULL, 0);
1611}
1612
1613static LIST_HEAD(ftrace_commands);
1614static DEFINE_MUTEX(ftrace_cmd_mutex);
1615
1616int register_ftrace_command(struct ftrace_func_command *cmd)
1617{
1618 struct ftrace_func_command *p;
1619 int ret = 0;
1620
1621 mutex_lock(&ftrace_cmd_mutex);
1622 list_for_each_entry(p, &ftrace_commands, list) {
1623 if (strcmp(cmd->name, p->name) == 0) {
1624 ret = -EBUSY;
1625 goto out_unlock;
1626 }
1627 }
1628 list_add(&cmd->list, &ftrace_commands);
1629 out_unlock:
1630 mutex_unlock(&ftrace_cmd_mutex);
1631
1632 return ret;
1633}
1634
1635int unregister_ftrace_command(struct ftrace_func_command *cmd)
1636{
1637 struct ftrace_func_command *p, *n;
1638 int ret = -ENODEV;
1639
1640 mutex_lock(&ftrace_cmd_mutex);
1641 list_for_each_entry_safe(p, n, &ftrace_commands, list) {
1642 if (strcmp(cmd->name, p->name) == 0) {
1643 ret = 0;
1644 list_del_init(&p->list);
1645 goto out_unlock;
1646 }
1647 }
1648 out_unlock:
1649 mutex_unlock(&ftrace_cmd_mutex);
1650
1651 return ret;
1652}
1653
1654static int ftrace_process_regex(char *buff, int len, int enable)
1655{
1656 char *func, *command, *next = buff;
1657 struct ftrace_func_command *p;
1658 int ret = -EINVAL;
1659
1660 func = strsep(&next, ":");
1661
1662 if (!next) {
1663 ftrace_match_records(func, len, enable);
1664 return 0;
1665 }
1666
1667 /* command found */
1668
1669 command = strsep(&next, ":");
1670
1671 mutex_lock(&ftrace_cmd_mutex);
1672 list_for_each_entry(p, &ftrace_commands, list) {
1673 if (strcmp(p->name, command) == 0) {
1674 ret = p->func(func, command, next, enable);
1675 goto out_unlock;
1676 }
1677 }
1678 out_unlock:
1679 mutex_unlock(&ftrace_cmd_mutex);
1680
1681 return ret;
1121} 1682}
1122 1683
1123static ssize_t 1684static ssize_t
@@ -1187,7 +1748,10 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
1187 if (isspace(ch)) { 1748 if (isspace(ch)) {
1188 iter->filtered++; 1749 iter->filtered++;
1189 iter->buffer[iter->buffer_idx] = 0; 1750 iter->buffer[iter->buffer_idx] = 0;
1190 ftrace_match(iter->buffer, iter->buffer_idx, enable); 1751 ret = ftrace_process_regex(iter->buffer,
1752 iter->buffer_idx, enable);
1753 if (ret)
1754 goto out;
1191 iter->buffer_idx = 0; 1755 iter->buffer_idx = 0;
1192 } else 1756 } else
1193 iter->flags |= FTRACE_ITER_CONT; 1757 iter->flags |= FTRACE_ITER_CONT;
@@ -1226,7 +1790,7 @@ ftrace_set_regex(unsigned char *buf, int len, int reset, int enable)
1226 if (reset) 1790 if (reset)
1227 ftrace_filter_reset(enable); 1791 ftrace_filter_reset(enable);
1228 if (buf) 1792 if (buf)
1229 ftrace_match(buf, len, enable); 1793 ftrace_match_records(buf, len, enable);
1230 mutex_unlock(&ftrace_regex_lock); 1794 mutex_unlock(&ftrace_regex_lock);
1231} 1795}
1232 1796
@@ -1276,15 +1840,13 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
1276 if (iter->buffer_idx) { 1840 if (iter->buffer_idx) {
1277 iter->filtered++; 1841 iter->filtered++;
1278 iter->buffer[iter->buffer_idx] = 0; 1842 iter->buffer[iter->buffer_idx] = 0;
1279 ftrace_match(iter->buffer, iter->buffer_idx, enable); 1843 ftrace_match_records(iter->buffer, iter->buffer_idx, enable);
1280 } 1844 }
1281 1845
1282 mutex_lock(&ftrace_sysctl_lock); 1846 mutex_lock(&ftrace_lock);
1283 mutex_lock(&ftrace_start_lock);
1284 if (ftrace_start_up && ftrace_enabled) 1847 if (ftrace_start_up && ftrace_enabled)
1285 ftrace_run_update_code(FTRACE_ENABLE_CALLS); 1848 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
1286 mutex_unlock(&ftrace_start_lock); 1849 mutex_unlock(&ftrace_lock);
1287 mutex_unlock(&ftrace_sysctl_lock);
1288 1850
1289 kfree(iter); 1851 kfree(iter);
1290 mutex_unlock(&ftrace_regex_lock); 1852 mutex_unlock(&ftrace_regex_lock);
@@ -1303,31 +1865,31 @@ ftrace_notrace_release(struct inode *inode, struct file *file)
1303 return ftrace_regex_release(inode, file, 0); 1865 return ftrace_regex_release(inode, file, 0);
1304} 1866}
1305 1867
1306static struct file_operations ftrace_avail_fops = { 1868static const struct file_operations ftrace_avail_fops = {
1307 .open = ftrace_avail_open, 1869 .open = ftrace_avail_open,
1308 .read = seq_read, 1870 .read = seq_read,
1309 .llseek = seq_lseek, 1871 .llseek = seq_lseek,
1310 .release = ftrace_avail_release, 1872 .release = ftrace_avail_release,
1311}; 1873};
1312 1874
1313static struct file_operations ftrace_failures_fops = { 1875static const struct file_operations ftrace_failures_fops = {
1314 .open = ftrace_failures_open, 1876 .open = ftrace_failures_open,
1315 .read = seq_read, 1877 .read = seq_read,
1316 .llseek = seq_lseek, 1878 .llseek = seq_lseek,
1317 .release = ftrace_avail_release, 1879 .release = ftrace_avail_release,
1318}; 1880};
1319 1881
1320static struct file_operations ftrace_filter_fops = { 1882static const struct file_operations ftrace_filter_fops = {
1321 .open = ftrace_filter_open, 1883 .open = ftrace_filter_open,
1322 .read = ftrace_regex_read, 1884 .read = seq_read,
1323 .write = ftrace_filter_write, 1885 .write = ftrace_filter_write,
1324 .llseek = ftrace_regex_lseek, 1886 .llseek = ftrace_regex_lseek,
1325 .release = ftrace_filter_release, 1887 .release = ftrace_filter_release,
1326}; 1888};
1327 1889
1328static struct file_operations ftrace_notrace_fops = { 1890static const struct file_operations ftrace_notrace_fops = {
1329 .open = ftrace_notrace_open, 1891 .open = ftrace_notrace_open,
1330 .read = ftrace_regex_read, 1892 .read = seq_read,
1331 .write = ftrace_notrace_write, 1893 .write = ftrace_notrace_write,
1332 .llseek = ftrace_regex_lseek, 1894 .llseek = ftrace_regex_lseek,
1333 .release = ftrace_notrace_release, 1895 .release = ftrace_notrace_release,
@@ -1360,6 +1922,10 @@ static void *g_start(struct seq_file *m, loff_t *pos)
1360 1922
1361 mutex_lock(&graph_lock); 1923 mutex_lock(&graph_lock);
1362 1924
1925 /* Nothing, tell g_show to print all functions are enabled */
1926 if (!ftrace_graph_count && !*pos)
1927 return (void *)1;
1928
1363 p = g_next(m, p, pos); 1929 p = g_next(m, p, pos);
1364 1930
1365 return p; 1931 return p;
@@ -1378,6 +1944,11 @@ static int g_show(struct seq_file *m, void *v)
1378 if (!ptr) 1944 if (!ptr)
1379 return 0; 1945 return 0;
1380 1946
1947 if (ptr == (unsigned long *)1) {
1948 seq_printf(m, "#### all functions enabled ####\n");
1949 return 0;
1950 }
1951
1381 kallsyms_lookup(*ptr, NULL, NULL, NULL, str); 1952 kallsyms_lookup(*ptr, NULL, NULL, NULL, str);
1382 1953
1383 seq_printf(m, "%s\n", str); 1954 seq_printf(m, "%s\n", str);
@@ -1420,53 +1991,53 @@ ftrace_graph_open(struct inode *inode, struct file *file)
1420 return ret; 1991 return ret;
1421} 1992}
1422 1993
1423static ssize_t
1424ftrace_graph_read(struct file *file, char __user *ubuf,
1425 size_t cnt, loff_t *ppos)
1426{
1427 if (file->f_mode & FMODE_READ)
1428 return seq_read(file, ubuf, cnt, ppos);
1429 else
1430 return -EPERM;
1431}
1432
1433static int 1994static int
1434ftrace_set_func(unsigned long *array, int idx, char *buffer) 1995ftrace_set_func(unsigned long *array, int *idx, char *buffer)
1435{ 1996{
1436 char str[KSYM_SYMBOL_LEN];
1437 struct dyn_ftrace *rec; 1997 struct dyn_ftrace *rec;
1438 struct ftrace_page *pg; 1998 struct ftrace_page *pg;
1999 int search_len;
1439 int found = 0; 2000 int found = 0;
1440 int i, j; 2001 int type, not;
2002 char *search;
2003 bool exists;
2004 int i;
1441 2005
1442 if (ftrace_disabled) 2006 if (ftrace_disabled)
1443 return -ENODEV; 2007 return -ENODEV;
1444 2008
1445 /* should not be called from interrupt context */ 2009 /* decode regex */
1446 spin_lock(&ftrace_lock); 2010 type = ftrace_setup_glob(buffer, strlen(buffer), &search, &not);
2011 if (not)
2012 return -EINVAL;
2013
2014 search_len = strlen(search);
1447 2015
1448 for (pg = ftrace_pages_start; pg; pg = pg->next) { 2016 mutex_lock(&ftrace_lock);
1449 for (i = 0; i < pg->index; i++) { 2017 do_for_each_ftrace_rec(pg, rec) {
1450 rec = &pg->records[i];
1451 2018
1452 if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) 2019 if (*idx >= FTRACE_GRAPH_MAX_FUNCS)
1453 continue; 2020 break;
2021
2022 if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE))
2023 continue;
1454 2024
1455 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); 2025 if (ftrace_match_record(rec, search, search_len, type)) {
1456 if (strcmp(str, buffer) == 0) { 2026 /* ensure it is not already in the array */
2027 exists = false;
2028 for (i = 0; i < *idx; i++)
2029 if (array[i] == rec->ip) {
2030 exists = true;
2031 break;
2032 }
2033 if (!exists) {
2034 array[(*idx)++] = rec->ip;
1457 found = 1; 2035 found = 1;
1458 for (j = 0; j < idx; j++)
1459 if (array[j] == rec->ip) {
1460 found = 0;
1461 break;
1462 }
1463 if (found)
1464 array[idx] = rec->ip;
1465 break;
1466 } 2036 }
1467 } 2037 }
1468 } 2038 } while_for_each_ftrace_rec();
1469 spin_unlock(&ftrace_lock); 2039
2040 mutex_unlock(&ftrace_lock);
1470 2041
1471 return found ? 0 : -EINVAL; 2042 return found ? 0 : -EINVAL;
1472} 2043}
@@ -1534,13 +2105,11 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
1534 } 2105 }
1535 buffer[index] = 0; 2106 buffer[index] = 0;
1536 2107
1537 /* we allow only one at a time */ 2108 /* we allow only one expression at a time */
1538 ret = ftrace_set_func(array, ftrace_graph_count, buffer); 2109 ret = ftrace_set_func(array, &ftrace_graph_count, buffer);
1539 if (ret) 2110 if (ret)
1540 goto out; 2111 goto out;
1541 2112
1542 ftrace_graph_count++;
1543
1544 file->f_pos += read; 2113 file->f_pos += read;
1545 2114
1546 ret = read; 2115 ret = read;
@@ -1552,7 +2121,7 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
1552 2121
1553static const struct file_operations ftrace_graph_fops = { 2122static const struct file_operations ftrace_graph_fops = {
1554 .open = ftrace_graph_open, 2123 .open = ftrace_graph_open,
1555 .read = ftrace_graph_read, 2124 .read = seq_read,
1556 .write = ftrace_graph_write, 2125 .write = ftrace_graph_write,
1557}; 2126};
1558#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 2127#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
@@ -1604,7 +2173,7 @@ static int ftrace_convert_nops(struct module *mod,
1604 unsigned long addr; 2173 unsigned long addr;
1605 unsigned long flags; 2174 unsigned long flags;
1606 2175
1607 mutex_lock(&ftrace_start_lock); 2176 mutex_lock(&ftrace_lock);
1608 p = start; 2177 p = start;
1609 while (p < end) { 2178 while (p < end) {
1610 addr = ftrace_call_adjust(*p++); 2179 addr = ftrace_call_adjust(*p++);
@@ -1623,7 +2192,7 @@ static int ftrace_convert_nops(struct module *mod,
1623 local_irq_save(flags); 2192 local_irq_save(flags);
1624 ftrace_update_code(mod); 2193 ftrace_update_code(mod);
1625 local_irq_restore(flags); 2194 local_irq_restore(flags);
1626 mutex_unlock(&ftrace_start_lock); 2195 mutex_unlock(&ftrace_lock);
1627 2196
1628 return 0; 2197 return 0;
1629} 2198}
@@ -1700,7 +2269,7 @@ ftrace_pid_read(struct file *file, char __user *ubuf,
1700 if (ftrace_pid_trace == ftrace_swapper_pid) 2269 if (ftrace_pid_trace == ftrace_swapper_pid)
1701 r = sprintf(buf, "swapper tasks\n"); 2270 r = sprintf(buf, "swapper tasks\n");
1702 else if (ftrace_pid_trace) 2271 else if (ftrace_pid_trace)
1703 r = sprintf(buf, "%u\n", pid_nr(ftrace_pid_trace)); 2272 r = sprintf(buf, "%u\n", pid_vnr(ftrace_pid_trace));
1704 else 2273 else
1705 r = sprintf(buf, "no pid\n"); 2274 r = sprintf(buf, "no pid\n");
1706 2275
@@ -1796,7 +2365,7 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf,
1796 if (ret < 0) 2365 if (ret < 0)
1797 return ret; 2366 return ret;
1798 2367
1799 mutex_lock(&ftrace_start_lock); 2368 mutex_lock(&ftrace_lock);
1800 if (val < 0) { 2369 if (val < 0) {
1801 /* disable pid tracing */ 2370 /* disable pid tracing */
1802 if (!ftrace_pid_trace) 2371 if (!ftrace_pid_trace)
@@ -1835,12 +2404,12 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf,
1835 ftrace_startup_enable(0); 2404 ftrace_startup_enable(0);
1836 2405
1837 out: 2406 out:
1838 mutex_unlock(&ftrace_start_lock); 2407 mutex_unlock(&ftrace_lock);
1839 2408
1840 return cnt; 2409 return cnt;
1841} 2410}
1842 2411
1843static struct file_operations ftrace_pid_fops = { 2412static const struct file_operations ftrace_pid_fops = {
1844 .read = ftrace_pid_read, 2413 .read = ftrace_pid_read,
1845 .write = ftrace_pid_write, 2414 .write = ftrace_pid_write,
1846}; 2415};
@@ -1863,7 +2432,6 @@ static __init int ftrace_init_debugfs(void)
1863 "'set_ftrace_pid' entry\n"); 2432 "'set_ftrace_pid' entry\n");
1864 return 0; 2433 return 0;
1865} 2434}
1866
1867fs_initcall(ftrace_init_debugfs); 2435fs_initcall(ftrace_init_debugfs);
1868 2436
1869/** 2437/**
@@ -1898,12 +2466,12 @@ int register_ftrace_function(struct ftrace_ops *ops)
1898 if (unlikely(ftrace_disabled)) 2466 if (unlikely(ftrace_disabled))
1899 return -1; 2467 return -1;
1900 2468
1901 mutex_lock(&ftrace_sysctl_lock); 2469 mutex_lock(&ftrace_lock);
1902 2470
1903 ret = __register_ftrace_function(ops); 2471 ret = __register_ftrace_function(ops);
1904 ftrace_startup(0); 2472 ftrace_startup(0);
1905 2473
1906 mutex_unlock(&ftrace_sysctl_lock); 2474 mutex_unlock(&ftrace_lock);
1907 return ret; 2475 return ret;
1908} 2476}
1909 2477
@@ -1917,10 +2485,10 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
1917{ 2485{
1918 int ret; 2486 int ret;
1919 2487
1920 mutex_lock(&ftrace_sysctl_lock); 2488 mutex_lock(&ftrace_lock);
1921 ret = __unregister_ftrace_function(ops); 2489 ret = __unregister_ftrace_function(ops);
1922 ftrace_shutdown(0); 2490 ftrace_shutdown(0);
1923 mutex_unlock(&ftrace_sysctl_lock); 2491 mutex_unlock(&ftrace_lock);
1924 2492
1925 return ret; 2493 return ret;
1926} 2494}
@@ -1935,7 +2503,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
1935 if (unlikely(ftrace_disabled)) 2503 if (unlikely(ftrace_disabled))
1936 return -ENODEV; 2504 return -ENODEV;
1937 2505
1938 mutex_lock(&ftrace_sysctl_lock); 2506 mutex_lock(&ftrace_lock);
1939 2507
1940 ret = proc_dointvec(table, write, file, buffer, lenp, ppos); 2508 ret = proc_dointvec(table, write, file, buffer, lenp, ppos);
1941 2509
@@ -1964,7 +2532,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
1964 } 2532 }
1965 2533
1966 out: 2534 out:
1967 mutex_unlock(&ftrace_sysctl_lock); 2535 mutex_unlock(&ftrace_lock);
1968 return ret; 2536 return ret;
1969} 2537}
1970 2538
@@ -2029,6 +2597,38 @@ free:
2029 return ret; 2597 return ret;
2030} 2598}
2031 2599
2600static void
2601ftrace_graph_probe_sched_switch(struct rq *__rq, struct task_struct *prev,
2602 struct task_struct *next)
2603{
2604 unsigned long long timestamp;
2605 int index;
2606
2607 /*
2608 * Does the user want to count the time a function was asleep.
2609 * If so, do not update the time stamps.
2610 */
2611 if (trace_flags & TRACE_ITER_SLEEP_TIME)
2612 return;
2613
2614 timestamp = trace_clock_local();
2615
2616 prev->ftrace_timestamp = timestamp;
2617
2618 /* only process tasks that we timestamped */
2619 if (!next->ftrace_timestamp)
2620 return;
2621
2622 /*
2623 * Update all the counters in next to make up for the
2624 * time next was sleeping.
2625 */
2626 timestamp -= next->ftrace_timestamp;
2627
2628 for (index = next->curr_ret_stack; index >= 0; index--)
2629 next->ret_stack[index].calltime += timestamp;
2630}
2631
2032/* Allocate a return stack for each task */ 2632/* Allocate a return stack for each task */
2033static int start_graph_tracing(void) 2633static int start_graph_tracing(void)
2034{ 2634{
@@ -2050,6 +2650,13 @@ static int start_graph_tracing(void)
2050 ret = alloc_retstack_tasklist(ret_stack_list); 2650 ret = alloc_retstack_tasklist(ret_stack_list);
2051 } while (ret == -EAGAIN); 2651 } while (ret == -EAGAIN);
2052 2652
2653 if (!ret) {
2654 ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch);
2655 if (ret)
2656 pr_info("ftrace_graph: Couldn't activate tracepoint"
2657 " probe to kernel_sched_switch\n");
2658 }
2659
2053 kfree(ret_stack_list); 2660 kfree(ret_stack_list);
2054 return ret; 2661 return ret;
2055} 2662}
@@ -2080,7 +2687,13 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
2080{ 2687{
2081 int ret = 0; 2688 int ret = 0;
2082 2689
2083 mutex_lock(&ftrace_sysctl_lock); 2690 mutex_lock(&ftrace_lock);
2691
2692 /* we currently allow only one tracer registered at a time */
2693 if (atomic_read(&ftrace_graph_active)) {
2694 ret = -EBUSY;
2695 goto out;
2696 }
2084 2697
2085 ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call; 2698 ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call;
2086 register_pm_notifier(&ftrace_suspend_notifier); 2699 register_pm_notifier(&ftrace_suspend_notifier);
@@ -2098,21 +2711,26 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
2098 ftrace_startup(FTRACE_START_FUNC_RET); 2711 ftrace_startup(FTRACE_START_FUNC_RET);
2099 2712
2100out: 2713out:
2101 mutex_unlock(&ftrace_sysctl_lock); 2714 mutex_unlock(&ftrace_lock);
2102 return ret; 2715 return ret;
2103} 2716}
2104 2717
2105void unregister_ftrace_graph(void) 2718void unregister_ftrace_graph(void)
2106{ 2719{
2107 mutex_lock(&ftrace_sysctl_lock); 2720 mutex_lock(&ftrace_lock);
2721
2722 if (!unlikely(atomic_read(&ftrace_graph_active)))
2723 goto out;
2108 2724
2109 atomic_dec(&ftrace_graph_active); 2725 atomic_dec(&ftrace_graph_active);
2726 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch);
2110 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; 2727 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
2111 ftrace_graph_entry = ftrace_graph_entry_stub; 2728 ftrace_graph_entry = ftrace_graph_entry_stub;
2112 ftrace_shutdown(FTRACE_STOP_FUNC_RET); 2729 ftrace_shutdown(FTRACE_STOP_FUNC_RET);
2113 unregister_pm_notifier(&ftrace_suspend_notifier); 2730 unregister_pm_notifier(&ftrace_suspend_notifier);
2114 2731
2115 mutex_unlock(&ftrace_sysctl_lock); 2732 out:
2733 mutex_unlock(&ftrace_lock);
2116} 2734}
2117 2735
2118/* Allocate a return stack for newly created task */ 2736/* Allocate a return stack for newly created task */
@@ -2127,6 +2745,7 @@ void ftrace_graph_init_task(struct task_struct *t)
2127 t->curr_ret_stack = -1; 2745 t->curr_ret_stack = -1;
2128 atomic_set(&t->tracing_graph_pause, 0); 2746 atomic_set(&t->tracing_graph_pause, 0);
2129 atomic_set(&t->trace_overrun, 0); 2747 atomic_set(&t->trace_overrun, 0);
2748 t->ftrace_timestamp = 0;
2130 } else 2749 } else
2131 t->ret_stack = NULL; 2750 t->ret_stack = NULL;
2132} 2751}
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
new file mode 100644
index 000000000000..5011f4d91e37
--- /dev/null
+++ b/kernel/trace/kmemtrace.c
@@ -0,0 +1,464 @@
1/*
2 * Memory allocator tracing
3 *
4 * Copyright (C) 2008 Eduard - Gabriel Munteanu
5 * Copyright (C) 2008 Pekka Enberg <penberg@cs.helsinki.fi>
6 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
7 */
8
9#include <linux/tracepoint.h>
10#include <linux/seq_file.h>
11#include <linux/debugfs.h>
12#include <linux/dcache.h>
13#include <linux/fs.h>
14
15#include <trace/kmemtrace.h>
16
17#include "trace_output.h"
18#include "trace.h"
19
20/* Select an alternative, minimalistic output than the original one */
21#define TRACE_KMEM_OPT_MINIMAL 0x1
22
23static struct tracer_opt kmem_opts[] = {
24 /* Default disable the minimalistic output */
25 { TRACER_OPT(kmem_minimalistic, TRACE_KMEM_OPT_MINIMAL) },
26 { }
27};
28
29static struct tracer_flags kmem_tracer_flags = {
30 .val = 0,
31 .opts = kmem_opts
32};
33
34static struct trace_array *kmemtrace_array;
35
36/* Trace allocations */
37static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
38 unsigned long call_site,
39 const void *ptr,
40 size_t bytes_req,
41 size_t bytes_alloc,
42 gfp_t gfp_flags,
43 int node)
44{
45 struct trace_array *tr = kmemtrace_array;
46 struct kmemtrace_alloc_entry *entry;
47 struct ring_buffer_event *event;
48
49 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
50 if (!event)
51 return;
52
53 entry = ring_buffer_event_data(event);
54 tracing_generic_entry_update(&entry->ent, 0, 0);
55
56 entry->ent.type = TRACE_KMEM_ALLOC;
57 entry->type_id = type_id;
58 entry->call_site = call_site;
59 entry->ptr = ptr;
60 entry->bytes_req = bytes_req;
61 entry->bytes_alloc = bytes_alloc;
62 entry->gfp_flags = gfp_flags;
63 entry->node = node;
64
65 ring_buffer_unlock_commit(tr->buffer, event);
66
67 trace_wake_up();
68}
69
70static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
71 unsigned long call_site,
72 const void *ptr)
73{
74 struct trace_array *tr = kmemtrace_array;
75 struct kmemtrace_free_entry *entry;
76 struct ring_buffer_event *event;
77
78 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
79 if (!event)
80 return;
81 entry = ring_buffer_event_data(event);
82 tracing_generic_entry_update(&entry->ent, 0, 0);
83
84 entry->ent.type = TRACE_KMEM_FREE;
85 entry->type_id = type_id;
86 entry->call_site = call_site;
87 entry->ptr = ptr;
88
89 ring_buffer_unlock_commit(tr->buffer, event);
90
91 trace_wake_up();
92}
93
94static void kmemtrace_kmalloc(unsigned long call_site,
95 const void *ptr,
96 size_t bytes_req,
97 size_t bytes_alloc,
98 gfp_t gfp_flags)
99{
100 kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr,
101 bytes_req, bytes_alloc, gfp_flags, -1);
102}
103
104static void kmemtrace_kmem_cache_alloc(unsigned long call_site,
105 const void *ptr,
106 size_t bytes_req,
107 size_t bytes_alloc,
108 gfp_t gfp_flags)
109{
110 kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr,
111 bytes_req, bytes_alloc, gfp_flags, -1);
112}
113
114static void kmemtrace_kmalloc_node(unsigned long call_site,
115 const void *ptr,
116 size_t bytes_req,
117 size_t bytes_alloc,
118 gfp_t gfp_flags,
119 int node)
120{
121 kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr,
122 bytes_req, bytes_alloc, gfp_flags, node);
123}
124
125static void kmemtrace_kmem_cache_alloc_node(unsigned long call_site,
126 const void *ptr,
127 size_t bytes_req,
128 size_t bytes_alloc,
129 gfp_t gfp_flags,
130 int node)
131{
132 kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr,
133 bytes_req, bytes_alloc, gfp_flags, node);
134}
135
136static void kmemtrace_kfree(unsigned long call_site, const void *ptr)
137{
138 kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr);
139}
140
141static void kmemtrace_kmem_cache_free(unsigned long call_site, const void *ptr)
142{
143 kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr);
144}
145
146static int kmemtrace_start_probes(void)
147{
148 int err;
149
150 err = register_trace_kmalloc(kmemtrace_kmalloc);
151 if (err)
152 return err;
153 err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc);
154 if (err)
155 return err;
156 err = register_trace_kmalloc_node(kmemtrace_kmalloc_node);
157 if (err)
158 return err;
159 err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node);
160 if (err)
161 return err;
162 err = register_trace_kfree(kmemtrace_kfree);
163 if (err)
164 return err;
165 err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free);
166
167 return err;
168}
169
170static void kmemtrace_stop_probes(void)
171{
172 unregister_trace_kmalloc(kmemtrace_kmalloc);
173 unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc);
174 unregister_trace_kmalloc_node(kmemtrace_kmalloc_node);
175 unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node);
176 unregister_trace_kfree(kmemtrace_kfree);
177 unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free);
178}
179
180static int kmem_trace_init(struct trace_array *tr)
181{
182 int cpu;
183 kmemtrace_array = tr;
184
185 for_each_cpu_mask(cpu, cpu_possible_map)
186 tracing_reset(tr, cpu);
187
188 kmemtrace_start_probes();
189
190 return 0;
191}
192
193static void kmem_trace_reset(struct trace_array *tr)
194{
195 kmemtrace_stop_probes();
196}
197
198static void kmemtrace_headers(struct seq_file *s)
199{
200 /* Don't need headers for the original kmemtrace output */
201 if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL))
202 return;
203
204 seq_printf(s, "#\n");
205 seq_printf(s, "# ALLOC TYPE REQ GIVEN FLAGS "
206 " POINTER NODE CALLER\n");
207 seq_printf(s, "# FREE | | | | "
208 " | | | |\n");
209 seq_printf(s, "# |\n\n");
210}
211
212/*
213 * The following functions give the original output from kmemtrace,
214 * plus the origin CPU, since reordering occurs in-kernel now.
215 */
216
217#define KMEMTRACE_USER_ALLOC 0
218#define KMEMTRACE_USER_FREE 1
219
220struct kmemtrace_user_event {
221 u8 event_id;
222 u8 type_id;
223 u16 event_size;
224 u32 cpu;
225 u64 timestamp;
226 unsigned long call_site;
227 unsigned long ptr;
228};
229
230struct kmemtrace_user_event_alloc {
231 size_t bytes_req;
232 size_t bytes_alloc;
233 unsigned gfp_flags;
234 int node;
235};
236
237static enum print_line_t
238kmemtrace_print_alloc_user(struct trace_iterator *iter,
239 struct kmemtrace_alloc_entry *entry)
240{
241 struct kmemtrace_user_event_alloc *ev_alloc;
242 struct trace_seq *s = &iter->seq;
243 struct kmemtrace_user_event *ev;
244
245 ev = trace_seq_reserve(s, sizeof(*ev));
246 if (!ev)
247 return TRACE_TYPE_PARTIAL_LINE;
248
249 ev->event_id = KMEMTRACE_USER_ALLOC;
250 ev->type_id = entry->type_id;
251 ev->event_size = sizeof(*ev) + sizeof(*ev_alloc);
252 ev->cpu = iter->cpu;
253 ev->timestamp = iter->ts;
254 ev->call_site = entry->call_site;
255 ev->ptr = (unsigned long)entry->ptr;
256
257 ev_alloc = trace_seq_reserve(s, sizeof(*ev_alloc));
258 if (!ev_alloc)
259 return TRACE_TYPE_PARTIAL_LINE;
260
261 ev_alloc->bytes_req = entry->bytes_req;
262 ev_alloc->bytes_alloc = entry->bytes_alloc;
263 ev_alloc->gfp_flags = entry->gfp_flags;
264 ev_alloc->node = entry->node;
265
266 return TRACE_TYPE_HANDLED;
267}
268
269static enum print_line_t
270kmemtrace_print_free_user(struct trace_iterator *iter,
271 struct kmemtrace_free_entry *entry)
272{
273 struct trace_seq *s = &iter->seq;
274 struct kmemtrace_user_event *ev;
275
276 ev = trace_seq_reserve(s, sizeof(*ev));
277 if (!ev)
278 return TRACE_TYPE_PARTIAL_LINE;
279
280 ev->event_id = KMEMTRACE_USER_FREE;
281 ev->type_id = entry->type_id;
282 ev->event_size = sizeof(*ev);
283 ev->cpu = iter->cpu;
284 ev->timestamp = iter->ts;
285 ev->call_site = entry->call_site;
286 ev->ptr = (unsigned long)entry->ptr;
287
288 return TRACE_TYPE_HANDLED;
289}
290
291/* The two other following provide a more minimalistic output */
292static enum print_line_t
293kmemtrace_print_alloc_compress(struct trace_iterator *iter,
294 struct kmemtrace_alloc_entry *entry)
295{
296 struct trace_seq *s = &iter->seq;
297 int ret;
298
299 /* Alloc entry */
300 ret = trace_seq_printf(s, " + ");
301 if (!ret)
302 return TRACE_TYPE_PARTIAL_LINE;
303
304 /* Type */
305 switch (entry->type_id) {
306 case KMEMTRACE_TYPE_KMALLOC:
307 ret = trace_seq_printf(s, "K ");
308 break;
309 case KMEMTRACE_TYPE_CACHE:
310 ret = trace_seq_printf(s, "C ");
311 break;
312 case KMEMTRACE_TYPE_PAGES:
313 ret = trace_seq_printf(s, "P ");
314 break;
315 default:
316 ret = trace_seq_printf(s, "? ");
317 }
318
319 if (!ret)
320 return TRACE_TYPE_PARTIAL_LINE;
321
322 /* Requested */
323 ret = trace_seq_printf(s, "%4zu ", entry->bytes_req);
324 if (!ret)
325 return TRACE_TYPE_PARTIAL_LINE;
326
327 /* Allocated */
328 ret = trace_seq_printf(s, "%4zu ", entry->bytes_alloc);
329 if (!ret)
330 return TRACE_TYPE_PARTIAL_LINE;
331
332 /* Flags
333 * TODO: would be better to see the name of the GFP flag names
334 */
335 ret = trace_seq_printf(s, "%08x ", entry->gfp_flags);
336 if (!ret)
337 return TRACE_TYPE_PARTIAL_LINE;
338
339 /* Pointer to allocated */
340 ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr);
341 if (!ret)
342 return TRACE_TYPE_PARTIAL_LINE;
343
344 /* Node */
345 ret = trace_seq_printf(s, "%4d ", entry->node);
346 if (!ret)
347 return TRACE_TYPE_PARTIAL_LINE;
348
349 /* Call site */
350 ret = seq_print_ip_sym(s, entry->call_site, 0);
351 if (!ret)
352 return TRACE_TYPE_PARTIAL_LINE;
353
354 if (!trace_seq_printf(s, "\n"))
355 return TRACE_TYPE_PARTIAL_LINE;
356
357 return TRACE_TYPE_HANDLED;
358}
359
360static enum print_line_t
361kmemtrace_print_free_compress(struct trace_iterator *iter,
362 struct kmemtrace_free_entry *entry)
363{
364 struct trace_seq *s = &iter->seq;
365 int ret;
366
367 /* Free entry */
368 ret = trace_seq_printf(s, " - ");
369 if (!ret)
370 return TRACE_TYPE_PARTIAL_LINE;
371
372 /* Type */
373 switch (entry->type_id) {
374 case KMEMTRACE_TYPE_KMALLOC:
375 ret = trace_seq_printf(s, "K ");
376 break;
377 case KMEMTRACE_TYPE_CACHE:
378 ret = trace_seq_printf(s, "C ");
379 break;
380 case KMEMTRACE_TYPE_PAGES:
381 ret = trace_seq_printf(s, "P ");
382 break;
383 default:
384 ret = trace_seq_printf(s, "? ");
385 }
386
387 if (!ret)
388 return TRACE_TYPE_PARTIAL_LINE;
389
390 /* Skip requested/allocated/flags */
391 ret = trace_seq_printf(s, " ");
392 if (!ret)
393 return TRACE_TYPE_PARTIAL_LINE;
394
395 /* Pointer to allocated */
396 ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr);
397 if (!ret)
398 return TRACE_TYPE_PARTIAL_LINE;
399
400 /* Skip node */
401 ret = trace_seq_printf(s, " ");
402 if (!ret)
403 return TRACE_TYPE_PARTIAL_LINE;
404
405 /* Call site */
406 ret = seq_print_ip_sym(s, entry->call_site, 0);
407 if (!ret)
408 return TRACE_TYPE_PARTIAL_LINE;
409
410 if (!trace_seq_printf(s, "\n"))
411 return TRACE_TYPE_PARTIAL_LINE;
412
413 return TRACE_TYPE_HANDLED;
414}
415
416static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
417{
418 struct trace_entry *entry = iter->ent;
419
420 switch (entry->type) {
421 case TRACE_KMEM_ALLOC: {
422 struct kmemtrace_alloc_entry *field;
423
424 trace_assign_type(field, entry);
425 if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
426 return kmemtrace_print_alloc_compress(iter, field);
427 else
428 return kmemtrace_print_alloc_user(iter, field);
429 }
430
431 case TRACE_KMEM_FREE: {
432 struct kmemtrace_free_entry *field;
433
434 trace_assign_type(field, entry);
435 if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
436 return kmemtrace_print_free_compress(iter, field);
437 else
438 return kmemtrace_print_free_user(iter, field);
439 }
440
441 default:
442 return TRACE_TYPE_UNHANDLED;
443 }
444}
445
446static struct tracer kmem_tracer __read_mostly = {
447 .name = "kmemtrace",
448 .init = kmem_trace_init,
449 .reset = kmem_trace_reset,
450 .print_line = kmemtrace_print_line,
451 .print_header = kmemtrace_headers,
452 .flags = &kmem_tracer_flags
453};
454
455void kmemtrace_init(void)
456{
457 /* earliest opportunity to start kmem tracing */
458}
459
460static int __init init_kmem_tracer(void)
461{
462 return register_tracer(&kmem_tracer);
463}
464device_initcall(init_kmem_tracer);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index bd38c5cfd8ad..960cbf44c844 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -4,21 +4,92 @@
4 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> 4 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
5 */ 5 */
6#include <linux/ring_buffer.h> 6#include <linux/ring_buffer.h>
7#include <linux/trace_clock.h>
8#include <linux/ftrace_irq.h>
7#include <linux/spinlock.h> 9#include <linux/spinlock.h>
8#include <linux/debugfs.h> 10#include <linux/debugfs.h>
9#include <linux/uaccess.h> 11#include <linux/uaccess.h>
12#include <linux/hardirq.h>
10#include <linux/module.h> 13#include <linux/module.h>
11#include <linux/percpu.h> 14#include <linux/percpu.h>
12#include <linux/mutex.h> 15#include <linux/mutex.h>
13#include <linux/sched.h> /* used for sched_clock() (for now) */
14#include <linux/init.h> 16#include <linux/init.h>
15#include <linux/hash.h> 17#include <linux/hash.h>
16#include <linux/list.h> 18#include <linux/list.h>
19#include <linux/cpu.h>
17#include <linux/fs.h> 20#include <linux/fs.h>
18 21
19#include "trace.h" 22#include "trace.h"
20 23
21/* 24/*
25 * The ring buffer is made up of a list of pages. A separate list of pages is
26 * allocated for each CPU. A writer may only write to a buffer that is
27 * associated with the CPU it is currently executing on. A reader may read
28 * from any per cpu buffer.
29 *
30 * The reader is special. For each per cpu buffer, the reader has its own
31 * reader page. When a reader has read the entire reader page, this reader
32 * page is swapped with another page in the ring buffer.
33 *
34 * Now, as long as the writer is off the reader page, the reader can do what
35 * ever it wants with that page. The writer will never write to that page
36 * again (as long as it is out of the ring buffer).
37 *
38 * Here's some silly ASCII art.
39 *
40 * +------+
41 * |reader| RING BUFFER
42 * |page |
43 * +------+ +---+ +---+ +---+
44 * | |-->| |-->| |
45 * +---+ +---+ +---+
46 * ^ |
47 * | |
48 * +---------------+
49 *
50 *
51 * +------+
52 * |reader| RING BUFFER
53 * |page |------------------v
54 * +------+ +---+ +---+ +---+
55 * | |-->| |-->| |
56 * +---+ +---+ +---+
57 * ^ |
58 * | |
59 * +---------------+
60 *
61 *
62 * +------+
63 * |reader| RING BUFFER
64 * |page |------------------v
65 * +------+ +---+ +---+ +---+
66 * ^ | |-->| |-->| |
67 * | +---+ +---+ +---+
68 * | |
69 * | |
70 * +------------------------------+
71 *
72 *
73 * +------+
74 * |buffer| RING BUFFER
75 * |page |------------------v
76 * +------+ +---+ +---+ +---+
77 * ^ | | | |-->| |
78 * | New +---+ +---+ +---+
79 * | Reader------^ |
80 * | page |
81 * +------------------------------+
82 *
83 *
84 * After we make this swap, the reader can hand this page off to the splice
85 * code and be done with it. It can even allocate a new page if it needs to
86 * and swap that into the ring buffer.
87 *
88 * We will be using cmpxchg soon to make all this lockless.
89 *
90 */
91
92/*
22 * A fast way to enable or disable all ring buffers is to 93 * A fast way to enable or disable all ring buffers is to
23 * call tracing_on or tracing_off. Turning off the ring buffers 94 * call tracing_on or tracing_off. Turning off the ring buffers
24 * prevents all ring buffers from being recorded to. 95 * prevents all ring buffers from being recorded to.
@@ -57,7 +128,9 @@ enum {
57 RB_BUFFERS_DISABLED = 1 << RB_BUFFERS_DISABLED_BIT, 128 RB_BUFFERS_DISABLED = 1 << RB_BUFFERS_DISABLED_BIT,
58}; 129};
59 130
60static long ring_buffer_flags __read_mostly = RB_BUFFERS_ON; 131static unsigned long ring_buffer_flags __read_mostly = RB_BUFFERS_ON;
132
133#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data)
61 134
62/** 135/**
63 * tracing_on - enable all tracing buffers 136 * tracing_on - enable all tracing buffers
@@ -89,59 +162,92 @@ EXPORT_SYMBOL_GPL(tracing_off);
89 * tracing_off_permanent - permanently disable ring buffers 162 * tracing_off_permanent - permanently disable ring buffers
90 * 163 *
91 * This function, once called, will disable all ring buffers 164 * This function, once called, will disable all ring buffers
92 * permanenty. 165 * permanently.
93 */ 166 */
94void tracing_off_permanent(void) 167void tracing_off_permanent(void)
95{ 168{
96 set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags); 169 set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags);
97} 170}
98 171
172/**
173 * tracing_is_on - show state of ring buffers enabled
174 */
175int tracing_is_on(void)
176{
177 return ring_buffer_flags == RB_BUFFERS_ON;
178}
179EXPORT_SYMBOL_GPL(tracing_is_on);
180
99#include "trace.h" 181#include "trace.h"
100 182
101/* Up this if you want to test the TIME_EXTENTS and normalization */ 183#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
102#define DEBUG_SHIFT 0 184#define RB_ALIGNMENT 4U
185#define RB_MAX_SMALL_DATA 28
186
187enum {
188 RB_LEN_TIME_EXTEND = 8,
189 RB_LEN_TIME_STAMP = 16,
190};
103 191
104/* FIXME!!! */ 192static inline int rb_null_event(struct ring_buffer_event *event)
105u64 ring_buffer_time_stamp(int cpu)
106{ 193{
107 u64 time; 194 return event->type == RINGBUF_TYPE_PADDING && event->time_delta == 0;
195}
108 196
109 preempt_disable_notrace(); 197static inline int rb_discarded_event(struct ring_buffer_event *event)
110 /* shift to debug/test normalization and TIME_EXTENTS */ 198{
111 time = sched_clock() << DEBUG_SHIFT; 199 return event->type == RINGBUF_TYPE_PADDING && event->time_delta;
112 preempt_enable_no_resched_notrace(); 200}
113 201
114 return time; 202static void rb_event_set_padding(struct ring_buffer_event *event)
203{
204 event->type = RINGBUF_TYPE_PADDING;
205 event->time_delta = 0;
115} 206}
116EXPORT_SYMBOL_GPL(ring_buffer_time_stamp);
117 207
118void ring_buffer_normalize_time_stamp(int cpu, u64 *ts) 208/**
209 * ring_buffer_event_discard - discard an event in the ring buffer
210 * @buffer: the ring buffer
211 * @event: the event to discard
212 *
213 * Sometimes a event that is in the ring buffer needs to be ignored.
214 * This function lets the user discard an event in the ring buffer
215 * and then that event will not be read later.
216 *
217 * Note, it is up to the user to be careful with this, and protect
218 * against races. If the user discards an event that has been consumed
219 * it is possible that it could corrupt the ring buffer.
220 */
221void ring_buffer_event_discard(struct ring_buffer_event *event)
119{ 222{
120 /* Just stupid testing the normalize function and deltas */ 223 event->type = RINGBUF_TYPE_PADDING;
121 *ts >>= DEBUG_SHIFT; 224 /* time delta must be non zero */
225 if (!event->time_delta)
226 event->time_delta = 1;
122} 227}
123EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
124 228
125#define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event)) 229static unsigned
126#define RB_ALIGNMENT_SHIFT 2 230rb_event_data_length(struct ring_buffer_event *event)
127#define RB_ALIGNMENT (1 << RB_ALIGNMENT_SHIFT) 231{
128#define RB_MAX_SMALL_DATA 28 232 unsigned length;
129 233
130enum { 234 if (event->len)
131 RB_LEN_TIME_EXTEND = 8, 235 length = event->len * RB_ALIGNMENT;
132 RB_LEN_TIME_STAMP = 16, 236 else
133}; 237 length = event->array[0];
238 return length + RB_EVNT_HDR_SIZE;
239}
134 240
135/* inline for ring buffer fast paths */ 241/* inline for ring buffer fast paths */
136static inline unsigned 242static unsigned
137rb_event_length(struct ring_buffer_event *event) 243rb_event_length(struct ring_buffer_event *event)
138{ 244{
139 unsigned length;
140
141 switch (event->type) { 245 switch (event->type) {
142 case RINGBUF_TYPE_PADDING: 246 case RINGBUF_TYPE_PADDING:
143 /* undefined */ 247 if (rb_null_event(event))
144 return -1; 248 /* undefined */
249 return -1;
250 return rb_event_data_length(event);
145 251
146 case RINGBUF_TYPE_TIME_EXTEND: 252 case RINGBUF_TYPE_TIME_EXTEND:
147 return RB_LEN_TIME_EXTEND; 253 return RB_LEN_TIME_EXTEND;
@@ -150,11 +256,7 @@ rb_event_length(struct ring_buffer_event *event)
150 return RB_LEN_TIME_STAMP; 256 return RB_LEN_TIME_STAMP;
151 257
152 case RINGBUF_TYPE_DATA: 258 case RINGBUF_TYPE_DATA:
153 if (event->len) 259 return rb_event_data_length(event);
154 length = event->len << RB_ALIGNMENT_SHIFT;
155 else
156 length = event->array[0];
157 return length + RB_EVNT_HDR_SIZE;
158 default: 260 default:
159 BUG(); 261 BUG();
160 } 262 }
@@ -179,7 +281,7 @@ unsigned ring_buffer_event_length(struct ring_buffer_event *event)
179EXPORT_SYMBOL_GPL(ring_buffer_event_length); 281EXPORT_SYMBOL_GPL(ring_buffer_event_length);
180 282
181/* inline for ring buffer fast paths */ 283/* inline for ring buffer fast paths */
182static inline void * 284static void *
183rb_event_data(struct ring_buffer_event *event) 285rb_event_data(struct ring_buffer_event *event)
184{ 286{
185 BUG_ON(event->type != RINGBUF_TYPE_DATA); 287 BUG_ON(event->type != RINGBUF_TYPE_DATA);
@@ -209,7 +311,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
209 311
210struct buffer_data_page { 312struct buffer_data_page {
211 u64 time_stamp; /* page time stamp */ 313 u64 time_stamp; /* page time stamp */
212 local_t commit; /* write commited index */ 314 local_t commit; /* write committed index */
213 unsigned char data[]; /* data of buffer page */ 315 unsigned char data[]; /* data of buffer page */
214}; 316};
215 317
@@ -225,14 +327,25 @@ static void rb_init_page(struct buffer_data_page *bpage)
225 local_set(&bpage->commit, 0); 327 local_set(&bpage->commit, 0);
226} 328}
227 329
330/**
331 * ring_buffer_page_len - the size of data on the page.
332 * @page: The page to read
333 *
334 * Returns the amount of data on the page, including buffer page header.
335 */
336size_t ring_buffer_page_len(void *page)
337{
338 return local_read(&((struct buffer_data_page *)page)->commit)
339 + BUF_PAGE_HDR_SIZE;
340}
341
228/* 342/*
229 * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing 343 * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing
230 * this issue out. 344 * this issue out.
231 */ 345 */
232static inline void free_buffer_page(struct buffer_page *bpage) 346static void free_buffer_page(struct buffer_page *bpage)
233{ 347{
234 if (bpage->page) 348 free_page((unsigned long)bpage->page);
235 free_page((unsigned long)bpage->page);
236 kfree(bpage); 349 kfree(bpage);
237} 350}
238 351
@@ -246,7 +359,7 @@ static inline int test_time_stamp(u64 delta)
246 return 0; 359 return 0;
247} 360}
248 361
249#define BUF_PAGE_SIZE (PAGE_SIZE - offsetof(struct buffer_data_page, data)) 362#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE)
250 363
251/* 364/*
252 * head_page == tail_page && head == tail then buffer is empty. 365 * head_page == tail_page && head == tail then buffer is empty.
@@ -260,7 +373,7 @@ struct ring_buffer_per_cpu {
260 struct list_head pages; 373 struct list_head pages;
261 struct buffer_page *head_page; /* read from head */ 374 struct buffer_page *head_page; /* read from head */
262 struct buffer_page *tail_page; /* write to tail */ 375 struct buffer_page *tail_page; /* write to tail */
263 struct buffer_page *commit_page; /* commited pages */ 376 struct buffer_page *commit_page; /* committed pages */
264 struct buffer_page *reader_page; 377 struct buffer_page *reader_page;
265 unsigned long overrun; 378 unsigned long overrun;
266 unsigned long entries; 379 unsigned long entries;
@@ -273,12 +386,17 @@ struct ring_buffer {
273 unsigned pages; 386 unsigned pages;
274 unsigned flags; 387 unsigned flags;
275 int cpus; 388 int cpus;
276 cpumask_var_t cpumask;
277 atomic_t record_disabled; 389 atomic_t record_disabled;
390 cpumask_var_t cpumask;
278 391
279 struct mutex mutex; 392 struct mutex mutex;
280 393
281 struct ring_buffer_per_cpu **buffers; 394 struct ring_buffer_per_cpu **buffers;
395
396#ifdef CONFIG_HOTPLUG_CPU
397 struct notifier_block cpu_notify;
398#endif
399 u64 (*clock)(void);
282}; 400};
283 401
284struct ring_buffer_iter { 402struct ring_buffer_iter {
@@ -299,11 +417,35 @@ struct ring_buffer_iter {
299 _____ret; \ 417 _____ret; \
300 }) 418 })
301 419
420/* Up this if you want to test the TIME_EXTENTS and normalization */
421#define DEBUG_SHIFT 0
422
423u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu)
424{
425 u64 time;
426
427 preempt_disable_notrace();
428 /* shift to debug/test normalization and TIME_EXTENTS */
429 time = buffer->clock() << DEBUG_SHIFT;
430 preempt_enable_no_resched_notrace();
431
432 return time;
433}
434EXPORT_SYMBOL_GPL(ring_buffer_time_stamp);
435
436void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer,
437 int cpu, u64 *ts)
438{
439 /* Just stupid testing the normalize function and deltas */
440 *ts >>= DEBUG_SHIFT;
441}
442EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
443
302/** 444/**
303 * check_pages - integrity check of buffer pages 445 * check_pages - integrity check of buffer pages
304 * @cpu_buffer: CPU buffer with pages to test 446 * @cpu_buffer: CPU buffer with pages to test
305 * 447 *
306 * As a safty measure we check to make sure the data pages have not 448 * As a safety measure we check to make sure the data pages have not
307 * been corrupted. 449 * been corrupted.
308 */ 450 */
309static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) 451static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
@@ -421,7 +563,6 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
421 struct list_head *head = &cpu_buffer->pages; 563 struct list_head *head = &cpu_buffer->pages;
422 struct buffer_page *bpage, *tmp; 564 struct buffer_page *bpage, *tmp;
423 565
424 list_del_init(&cpu_buffer->reader_page->list);
425 free_buffer_page(cpu_buffer->reader_page); 566 free_buffer_page(cpu_buffer->reader_page);
426 567
427 list_for_each_entry_safe(bpage, tmp, head, list) { 568 list_for_each_entry_safe(bpage, tmp, head, list) {
@@ -437,6 +578,11 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
437 */ 578 */
438extern int ring_buffer_page_too_big(void); 579extern int ring_buffer_page_too_big(void);
439 580
581#ifdef CONFIG_HOTPLUG_CPU
582static int rb_cpu_notify(struct notifier_block *self,
583 unsigned long action, void *hcpu);
584#endif
585
440/** 586/**
441 * ring_buffer_alloc - allocate a new ring_buffer 587 * ring_buffer_alloc - allocate a new ring_buffer
442 * @size: the size in bytes per cpu that is needed. 588 * @size: the size in bytes per cpu that is needed.
@@ -469,12 +615,23 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
469 615
470 buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); 616 buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
471 buffer->flags = flags; 617 buffer->flags = flags;
618 buffer->clock = trace_clock_local;
472 619
473 /* need at least two pages */ 620 /* need at least two pages */
474 if (buffer->pages == 1) 621 if (buffer->pages == 1)
475 buffer->pages++; 622 buffer->pages++;
476 623
624 /*
625 * In case of non-hotplug cpu, if the ring-buffer is allocated
626 * in early initcall, it will not be notified of secondary cpus.
627 * In that off case, we need to allocate for all possible cpus.
628 */
629#ifdef CONFIG_HOTPLUG_CPU
630 get_online_cpus();
631 cpumask_copy(buffer->cpumask, cpu_online_mask);
632#else
477 cpumask_copy(buffer->cpumask, cpu_possible_mask); 633 cpumask_copy(buffer->cpumask, cpu_possible_mask);
634#endif
478 buffer->cpus = nr_cpu_ids; 635 buffer->cpus = nr_cpu_ids;
479 636
480 bsize = sizeof(void *) * nr_cpu_ids; 637 bsize = sizeof(void *) * nr_cpu_ids;
@@ -490,6 +647,13 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
490 goto fail_free_buffers; 647 goto fail_free_buffers;
491 } 648 }
492 649
650#ifdef CONFIG_HOTPLUG_CPU
651 buffer->cpu_notify.notifier_call = rb_cpu_notify;
652 buffer->cpu_notify.priority = 0;
653 register_cpu_notifier(&buffer->cpu_notify);
654#endif
655
656 put_online_cpus();
493 mutex_init(&buffer->mutex); 657 mutex_init(&buffer->mutex);
494 658
495 return buffer; 659 return buffer;
@@ -503,6 +667,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
503 667
504 fail_free_cpumask: 668 fail_free_cpumask:
505 free_cpumask_var(buffer->cpumask); 669 free_cpumask_var(buffer->cpumask);
670 put_online_cpus();
506 671
507 fail_free_buffer: 672 fail_free_buffer:
508 kfree(buffer); 673 kfree(buffer);
@@ -519,15 +684,29 @@ ring_buffer_free(struct ring_buffer *buffer)
519{ 684{
520 int cpu; 685 int cpu;
521 686
687 get_online_cpus();
688
689#ifdef CONFIG_HOTPLUG_CPU
690 unregister_cpu_notifier(&buffer->cpu_notify);
691#endif
692
522 for_each_buffer_cpu(buffer, cpu) 693 for_each_buffer_cpu(buffer, cpu)
523 rb_free_cpu_buffer(buffer->buffers[cpu]); 694 rb_free_cpu_buffer(buffer->buffers[cpu]);
524 695
696 put_online_cpus();
697
525 free_cpumask_var(buffer->cpumask); 698 free_cpumask_var(buffer->cpumask);
526 699
527 kfree(buffer); 700 kfree(buffer);
528} 701}
529EXPORT_SYMBOL_GPL(ring_buffer_free); 702EXPORT_SYMBOL_GPL(ring_buffer_free);
530 703
704void ring_buffer_set_clock(struct ring_buffer *buffer,
705 u64 (*clock)(void))
706{
707 buffer->clock = clock;
708}
709
531static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); 710static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
532 711
533static void 712static void
@@ -627,16 +806,15 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
627 return size; 806 return size;
628 807
629 mutex_lock(&buffer->mutex); 808 mutex_lock(&buffer->mutex);
809 get_online_cpus();
630 810
631 nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); 811 nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
632 812
633 if (size < buffer_size) { 813 if (size < buffer_size) {
634 814
635 /* easy case, just free pages */ 815 /* easy case, just free pages */
636 if (RB_WARN_ON(buffer, nr_pages >= buffer->pages)) { 816 if (RB_WARN_ON(buffer, nr_pages >= buffer->pages))
637 mutex_unlock(&buffer->mutex); 817 goto out_fail;
638 return -1;
639 }
640 818
641 rm_pages = buffer->pages - nr_pages; 819 rm_pages = buffer->pages - nr_pages;
642 820
@@ -655,10 +833,8 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
655 * add these pages to the cpu_buffers. Otherwise we just free 833 * add these pages to the cpu_buffers. Otherwise we just free
656 * them all and return -ENOMEM; 834 * them all and return -ENOMEM;
657 */ 835 */
658 if (RB_WARN_ON(buffer, nr_pages <= buffer->pages)) { 836 if (RB_WARN_ON(buffer, nr_pages <= buffer->pages))
659 mutex_unlock(&buffer->mutex); 837 goto out_fail;
660 return -1;
661 }
662 838
663 new_pages = nr_pages - buffer->pages; 839 new_pages = nr_pages - buffer->pages;
664 840
@@ -683,13 +859,12 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
683 rb_insert_pages(cpu_buffer, &pages, new_pages); 859 rb_insert_pages(cpu_buffer, &pages, new_pages);
684 } 860 }
685 861
686 if (RB_WARN_ON(buffer, !list_empty(&pages))) { 862 if (RB_WARN_ON(buffer, !list_empty(&pages)))
687 mutex_unlock(&buffer->mutex); 863 goto out_fail;
688 return -1;
689 }
690 864
691 out: 865 out:
692 buffer->pages = nr_pages; 866 buffer->pages = nr_pages;
867 put_online_cpus();
693 mutex_unlock(&buffer->mutex); 868 mutex_unlock(&buffer->mutex);
694 869
695 return size; 870 return size;
@@ -699,15 +874,20 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
699 list_del_init(&bpage->list); 874 list_del_init(&bpage->list);
700 free_buffer_page(bpage); 875 free_buffer_page(bpage);
701 } 876 }
877 put_online_cpus();
702 mutex_unlock(&buffer->mutex); 878 mutex_unlock(&buffer->mutex);
703 return -ENOMEM; 879 return -ENOMEM;
704}
705EXPORT_SYMBOL_GPL(ring_buffer_resize);
706 880
707static inline int rb_null_event(struct ring_buffer_event *event) 881 /*
708{ 882 * Something went totally wrong, and we are too paranoid
709 return event->type == RINGBUF_TYPE_PADDING; 883 * to even clean up the mess.
884 */
885 out_fail:
886 put_online_cpus();
887 mutex_unlock(&buffer->mutex);
888 return -1;
710} 889}
890EXPORT_SYMBOL_GPL(ring_buffer_resize);
711 891
712static inline void * 892static inline void *
713__rb_data_page_index(struct buffer_data_page *bpage, unsigned index) 893__rb_data_page_index(struct buffer_data_page *bpage, unsigned index)
@@ -811,7 +991,7 @@ rb_event_index(struct ring_buffer_event *event)
811 return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE); 991 return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE);
812} 992}
813 993
814static inline int 994static int
815rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer, 995rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
816 struct ring_buffer_event *event) 996 struct ring_buffer_event *event)
817{ 997{
@@ -825,7 +1005,7 @@ rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
825 rb_commit_index(cpu_buffer) == index; 1005 rb_commit_index(cpu_buffer) == index;
826} 1006}
827 1007
828static inline void 1008static void
829rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer, 1009rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer,
830 struct ring_buffer_event *event) 1010 struct ring_buffer_event *event)
831{ 1011{
@@ -850,7 +1030,7 @@ rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer,
850 local_set(&cpu_buffer->commit_page->page->commit, index); 1030 local_set(&cpu_buffer->commit_page->page->commit, index);
851} 1031}
852 1032
853static inline void 1033static void
854rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) 1034rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
855{ 1035{
856 /* 1036 /*
@@ -896,7 +1076,7 @@ static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
896 cpu_buffer->reader_page->read = 0; 1076 cpu_buffer->reader_page->read = 0;
897} 1077}
898 1078
899static inline void rb_inc_iter(struct ring_buffer_iter *iter) 1079static void rb_inc_iter(struct ring_buffer_iter *iter)
900{ 1080{
901 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; 1081 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
902 1082
@@ -926,7 +1106,7 @@ static inline void rb_inc_iter(struct ring_buffer_iter *iter)
926 * and with this, we can determine what to place into the 1106 * and with this, we can determine what to place into the
927 * data field. 1107 * data field.
928 */ 1108 */
929static inline void 1109static void
930rb_update_event(struct ring_buffer_event *event, 1110rb_update_event(struct ring_buffer_event *event,
931 unsigned type, unsigned length) 1111 unsigned type, unsigned length)
932{ 1112{
@@ -938,15 +1118,11 @@ rb_update_event(struct ring_buffer_event *event,
938 break; 1118 break;
939 1119
940 case RINGBUF_TYPE_TIME_EXTEND: 1120 case RINGBUF_TYPE_TIME_EXTEND:
941 event->len = 1121 event->len = DIV_ROUND_UP(RB_LEN_TIME_EXTEND, RB_ALIGNMENT);
942 (RB_LEN_TIME_EXTEND + (RB_ALIGNMENT-1))
943 >> RB_ALIGNMENT_SHIFT;
944 break; 1122 break;
945 1123
946 case RINGBUF_TYPE_TIME_STAMP: 1124 case RINGBUF_TYPE_TIME_STAMP:
947 event->len = 1125 event->len = DIV_ROUND_UP(RB_LEN_TIME_STAMP, RB_ALIGNMENT);
948 (RB_LEN_TIME_STAMP + (RB_ALIGNMENT-1))
949 >> RB_ALIGNMENT_SHIFT;
950 break; 1126 break;
951 1127
952 case RINGBUF_TYPE_DATA: 1128 case RINGBUF_TYPE_DATA:
@@ -955,16 +1131,14 @@ rb_update_event(struct ring_buffer_event *event,
955 event->len = 0; 1131 event->len = 0;
956 event->array[0] = length; 1132 event->array[0] = length;
957 } else 1133 } else
958 event->len = 1134 event->len = DIV_ROUND_UP(length, RB_ALIGNMENT);
959 (length + (RB_ALIGNMENT-1))
960 >> RB_ALIGNMENT_SHIFT;
961 break; 1135 break;
962 default: 1136 default:
963 BUG(); 1137 BUG();
964 } 1138 }
965} 1139}
966 1140
967static inline unsigned rb_calculate_event_length(unsigned length) 1141static unsigned rb_calculate_event_length(unsigned length)
968{ 1142{
969 struct ring_buffer_event event; /* Used only for sizeof array */ 1143 struct ring_buffer_event event; /* Used only for sizeof array */
970 1144
@@ -990,6 +1164,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
990 struct ring_buffer *buffer = cpu_buffer->buffer; 1164 struct ring_buffer *buffer = cpu_buffer->buffer;
991 struct ring_buffer_event *event; 1165 struct ring_buffer_event *event;
992 unsigned long flags; 1166 unsigned long flags;
1167 bool lock_taken = false;
993 1168
994 commit_page = cpu_buffer->commit_page; 1169 commit_page = cpu_buffer->commit_page;
995 /* we just need to protect against interrupts */ 1170 /* we just need to protect against interrupts */
@@ -1003,7 +1178,30 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1003 struct buffer_page *next_page = tail_page; 1178 struct buffer_page *next_page = tail_page;
1004 1179
1005 local_irq_save(flags); 1180 local_irq_save(flags);
1006 __raw_spin_lock(&cpu_buffer->lock); 1181 /*
1182 * Since the write to the buffer is still not
1183 * fully lockless, we must be careful with NMIs.
1184 * The locks in the writers are taken when a write
1185 * crosses to a new page. The locks protect against
1186 * races with the readers (this will soon be fixed
1187 * with a lockless solution).
1188 *
1189 * Because we can not protect against NMIs, and we
1190 * want to keep traces reentrant, we need to manage
1191 * what happens when we are in an NMI.
1192 *
1193 * NMIs can happen after we take the lock.
1194 * If we are in an NMI, only take the lock
1195 * if it is not already taken. Otherwise
1196 * simply fail.
1197 */
1198 if (unlikely(in_nmi())) {
1199 if (!__raw_spin_trylock(&cpu_buffer->lock))
1200 goto out_reset;
1201 } else
1202 __raw_spin_lock(&cpu_buffer->lock);
1203
1204 lock_taken = true;
1007 1205
1008 rb_inc_page(cpu_buffer, &next_page); 1206 rb_inc_page(cpu_buffer, &next_page);
1009 1207
@@ -1012,7 +1210,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1012 1210
1013 /* we grabbed the lock before incrementing */ 1211 /* we grabbed the lock before incrementing */
1014 if (RB_WARN_ON(cpu_buffer, next_page == reader_page)) 1212 if (RB_WARN_ON(cpu_buffer, next_page == reader_page))
1015 goto out_unlock; 1213 goto out_reset;
1016 1214
1017 /* 1215 /*
1018 * If for some reason, we had an interrupt storm that made 1216 * If for some reason, we had an interrupt storm that made
@@ -1021,12 +1219,12 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1021 */ 1219 */
1022 if (unlikely(next_page == commit_page)) { 1220 if (unlikely(next_page == commit_page)) {
1023 WARN_ON_ONCE(1); 1221 WARN_ON_ONCE(1);
1024 goto out_unlock; 1222 goto out_reset;
1025 } 1223 }
1026 1224
1027 if (next_page == head_page) { 1225 if (next_page == head_page) {
1028 if (!(buffer->flags & RB_FL_OVERWRITE)) 1226 if (!(buffer->flags & RB_FL_OVERWRITE))
1029 goto out_unlock; 1227 goto out_reset;
1030 1228
1031 /* tail_page has not moved yet? */ 1229 /* tail_page has not moved yet? */
1032 if (tail_page == cpu_buffer->tail_page) { 1230 if (tail_page == cpu_buffer->tail_page) {
@@ -1050,7 +1248,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1050 cpu_buffer->tail_page = next_page; 1248 cpu_buffer->tail_page = next_page;
1051 1249
1052 /* reread the time stamp */ 1250 /* reread the time stamp */
1053 *ts = ring_buffer_time_stamp(cpu_buffer->cpu); 1251 *ts = ring_buffer_time_stamp(buffer, cpu_buffer->cpu);
1054 cpu_buffer->tail_page->page->time_stamp = *ts; 1252 cpu_buffer->tail_page->page->time_stamp = *ts;
1055 } 1253 }
1056 1254
@@ -1060,7 +1258,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1060 if (tail < BUF_PAGE_SIZE) { 1258 if (tail < BUF_PAGE_SIZE) {
1061 /* Mark the rest of the page with padding */ 1259 /* Mark the rest of the page with padding */
1062 event = __rb_page_index(tail_page, tail); 1260 event = __rb_page_index(tail_page, tail);
1063 event->type = RINGBUF_TYPE_PADDING; 1261 rb_event_set_padding(event);
1064 } 1262 }
1065 1263
1066 if (tail <= BUF_PAGE_SIZE) 1264 if (tail <= BUF_PAGE_SIZE)
@@ -1100,12 +1298,13 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1100 1298
1101 return event; 1299 return event;
1102 1300
1103 out_unlock: 1301 out_reset:
1104 /* reset write */ 1302 /* reset write */
1105 if (tail <= BUF_PAGE_SIZE) 1303 if (tail <= BUF_PAGE_SIZE)
1106 local_set(&tail_page->write, tail); 1304 local_set(&tail_page->write, tail);
1107 1305
1108 __raw_spin_unlock(&cpu_buffer->lock); 1306 if (likely(lock_taken))
1307 __raw_spin_unlock(&cpu_buffer->lock);
1109 local_irq_restore(flags); 1308 local_irq_restore(flags);
1110 return NULL; 1309 return NULL;
1111} 1310}
@@ -1192,7 +1391,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1192 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) 1391 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
1193 return NULL; 1392 return NULL;
1194 1393
1195 ts = ring_buffer_time_stamp(cpu_buffer->cpu); 1394 ts = ring_buffer_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu);
1196 1395
1197 /* 1396 /*
1198 * Only the first commit can update the timestamp. 1397 * Only the first commit can update the timestamp.
@@ -1265,7 +1464,6 @@ static DEFINE_PER_CPU(int, rb_need_resched);
1265 * ring_buffer_lock_reserve - reserve a part of the buffer 1464 * ring_buffer_lock_reserve - reserve a part of the buffer
1266 * @buffer: the ring buffer to reserve from 1465 * @buffer: the ring buffer to reserve from
1267 * @length: the length of the data to reserve (excluding event header) 1466 * @length: the length of the data to reserve (excluding event header)
1268 * @flags: a pointer to save the interrupt flags
1269 * 1467 *
1270 * Returns a reseverd event on the ring buffer to copy directly to. 1468 * Returns a reseverd event on the ring buffer to copy directly to.
1271 * The user of this interface will need to get the body to write into 1469 * The user of this interface will need to get the body to write into
@@ -1278,9 +1476,7 @@ static DEFINE_PER_CPU(int, rb_need_resched);
1278 * If NULL is returned, then nothing has been allocated or locked. 1476 * If NULL is returned, then nothing has been allocated or locked.
1279 */ 1477 */
1280struct ring_buffer_event * 1478struct ring_buffer_event *
1281ring_buffer_lock_reserve(struct ring_buffer *buffer, 1479ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
1282 unsigned long length,
1283 unsigned long *flags)
1284{ 1480{
1285 struct ring_buffer_per_cpu *cpu_buffer; 1481 struct ring_buffer_per_cpu *cpu_buffer;
1286 struct ring_buffer_event *event; 1482 struct ring_buffer_event *event;
@@ -1347,15 +1543,13 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
1347 * ring_buffer_unlock_commit - commit a reserved 1543 * ring_buffer_unlock_commit - commit a reserved
1348 * @buffer: The buffer to commit to 1544 * @buffer: The buffer to commit to
1349 * @event: The event pointer to commit. 1545 * @event: The event pointer to commit.
1350 * @flags: the interrupt flags received from ring_buffer_lock_reserve.
1351 * 1546 *
1352 * This commits the data to the ring buffer, and releases any locks held. 1547 * This commits the data to the ring buffer, and releases any locks held.
1353 * 1548 *
1354 * Must be paired with ring_buffer_lock_reserve. 1549 * Must be paired with ring_buffer_lock_reserve.
1355 */ 1550 */
1356int ring_buffer_unlock_commit(struct ring_buffer *buffer, 1551int ring_buffer_unlock_commit(struct ring_buffer *buffer,
1357 struct ring_buffer_event *event, 1552 struct ring_buffer_event *event)
1358 unsigned long flags)
1359{ 1553{
1360 struct ring_buffer_per_cpu *cpu_buffer; 1554 struct ring_buffer_per_cpu *cpu_buffer;
1361 int cpu = raw_smp_processor_id(); 1555 int cpu = raw_smp_processor_id();
@@ -1438,7 +1632,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
1438} 1632}
1439EXPORT_SYMBOL_GPL(ring_buffer_write); 1633EXPORT_SYMBOL_GPL(ring_buffer_write);
1440 1634
1441static inline int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) 1635static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
1442{ 1636{
1443 struct buffer_page *reader = cpu_buffer->reader_page; 1637 struct buffer_page *reader = cpu_buffer->reader_page;
1444 struct buffer_page *head = cpu_buffer->head_page; 1638 struct buffer_page *head = cpu_buffer->head_page;
@@ -1528,12 +1722,15 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
1528unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) 1722unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
1529{ 1723{
1530 struct ring_buffer_per_cpu *cpu_buffer; 1724 struct ring_buffer_per_cpu *cpu_buffer;
1725 unsigned long ret;
1531 1726
1532 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 1727 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1533 return 0; 1728 return 0;
1534 1729
1535 cpu_buffer = buffer->buffers[cpu]; 1730 cpu_buffer = buffer->buffers[cpu];
1536 return cpu_buffer->entries; 1731 ret = cpu_buffer->entries;
1732
1733 return ret;
1537} 1734}
1538EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); 1735EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
1539 1736
@@ -1545,12 +1742,15 @@ EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
1545unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu) 1742unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
1546{ 1743{
1547 struct ring_buffer_per_cpu *cpu_buffer; 1744 struct ring_buffer_per_cpu *cpu_buffer;
1745 unsigned long ret;
1548 1746
1549 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 1747 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1550 return 0; 1748 return 0;
1551 1749
1552 cpu_buffer = buffer->buffers[cpu]; 1750 cpu_buffer = buffer->buffers[cpu];
1553 return cpu_buffer->overrun; 1751 ret = cpu_buffer->overrun;
1752
1753 return ret;
1554} 1754}
1555EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); 1755EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
1556 1756
@@ -1627,9 +1827,14 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
1627 */ 1827 */
1628void ring_buffer_iter_reset(struct ring_buffer_iter *iter) 1828void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
1629{ 1829{
1630 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; 1830 struct ring_buffer_per_cpu *cpu_buffer;
1631 unsigned long flags; 1831 unsigned long flags;
1632 1832
1833 if (!iter)
1834 return;
1835
1836 cpu_buffer = iter->cpu_buffer;
1837
1633 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 1838 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
1634 rb_iter_reset(iter); 1839 rb_iter_reset(iter);
1635 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 1840 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
@@ -1803,7 +2008,7 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
1803 2008
1804 event = rb_reader_event(cpu_buffer); 2009 event = rb_reader_event(cpu_buffer);
1805 2010
1806 if (event->type == RINGBUF_TYPE_DATA) 2011 if (event->type == RINGBUF_TYPE_DATA || rb_discarded_event(event))
1807 cpu_buffer->entries--; 2012 cpu_buffer->entries--;
1808 2013
1809 rb_update_read_stamp(cpu_buffer, event); 2014 rb_update_read_stamp(cpu_buffer, event);
@@ -1864,9 +2069,6 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1864 struct buffer_page *reader; 2069 struct buffer_page *reader;
1865 int nr_loops = 0; 2070 int nr_loops = 0;
1866 2071
1867 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1868 return NULL;
1869
1870 cpu_buffer = buffer->buffers[cpu]; 2072 cpu_buffer = buffer->buffers[cpu];
1871 2073
1872 again: 2074 again:
@@ -1889,9 +2091,18 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1889 2091
1890 switch (event->type) { 2092 switch (event->type) {
1891 case RINGBUF_TYPE_PADDING: 2093 case RINGBUF_TYPE_PADDING:
1892 RB_WARN_ON(cpu_buffer, 1); 2094 if (rb_null_event(event))
2095 RB_WARN_ON(cpu_buffer, 1);
2096 /*
2097 * Because the writer could be discarding every
2098 * event it creates (which would probably be bad)
2099 * if we were to go back to "again" then we may never
2100 * catch up, and will trigger the warn on, or lock
2101 * the box. Return the padding, and we will release
2102 * the current locks, and try again.
2103 */
1893 rb_advance_reader(cpu_buffer); 2104 rb_advance_reader(cpu_buffer);
1894 return NULL; 2105 return event;
1895 2106
1896 case RINGBUF_TYPE_TIME_EXTEND: 2107 case RINGBUF_TYPE_TIME_EXTEND:
1897 /* Internal data, OK to advance */ 2108 /* Internal data, OK to advance */
@@ -1906,7 +2117,8 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1906 case RINGBUF_TYPE_DATA: 2117 case RINGBUF_TYPE_DATA:
1907 if (ts) { 2118 if (ts) {
1908 *ts = cpu_buffer->read_stamp + event->time_delta; 2119 *ts = cpu_buffer->read_stamp + event->time_delta;
1909 ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts); 2120 ring_buffer_normalize_time_stamp(buffer,
2121 cpu_buffer->cpu, ts);
1910 } 2122 }
1911 return event; 2123 return event;
1912 2124
@@ -1951,8 +2163,12 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
1951 2163
1952 switch (event->type) { 2164 switch (event->type) {
1953 case RINGBUF_TYPE_PADDING: 2165 case RINGBUF_TYPE_PADDING:
1954 rb_inc_iter(iter); 2166 if (rb_null_event(event)) {
1955 goto again; 2167 rb_inc_iter(iter);
2168 goto again;
2169 }
2170 rb_advance_iter(iter);
2171 return event;
1956 2172
1957 case RINGBUF_TYPE_TIME_EXTEND: 2173 case RINGBUF_TYPE_TIME_EXTEND:
1958 /* Internal data, OK to advance */ 2174 /* Internal data, OK to advance */
@@ -1967,7 +2183,8 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
1967 case RINGBUF_TYPE_DATA: 2183 case RINGBUF_TYPE_DATA:
1968 if (ts) { 2184 if (ts) {
1969 *ts = iter->read_stamp + event->time_delta; 2185 *ts = iter->read_stamp + event->time_delta;
1970 ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts); 2186 ring_buffer_normalize_time_stamp(buffer,
2187 cpu_buffer->cpu, ts);
1971 } 2188 }
1972 return event; 2189 return event;
1973 2190
@@ -1995,10 +2212,19 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1995 struct ring_buffer_event *event; 2212 struct ring_buffer_event *event;
1996 unsigned long flags; 2213 unsigned long flags;
1997 2214
2215 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2216 return NULL;
2217
2218 again:
1998 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 2219 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
1999 event = rb_buffer_peek(buffer, cpu, ts); 2220 event = rb_buffer_peek(buffer, cpu, ts);
2000 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2221 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2001 2222
2223 if (event && event->type == RINGBUF_TYPE_PADDING) {
2224 cpu_relax();
2225 goto again;
2226 }
2227
2002 return event; 2228 return event;
2003} 2229}
2004 2230
@@ -2017,10 +2243,16 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
2017 struct ring_buffer_event *event; 2243 struct ring_buffer_event *event;
2018 unsigned long flags; 2244 unsigned long flags;
2019 2245
2246 again:
2020 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 2247 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2021 event = rb_iter_peek(iter, ts); 2248 event = rb_iter_peek(iter, ts);
2022 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2249 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2023 2250
2251 if (event && event->type == RINGBUF_TYPE_PADDING) {
2252 cpu_relax();
2253 goto again;
2254 }
2255
2024 return event; 2256 return event;
2025} 2257}
2026 2258
@@ -2035,24 +2267,37 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
2035struct ring_buffer_event * 2267struct ring_buffer_event *
2036ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) 2268ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
2037{ 2269{
2038 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; 2270 struct ring_buffer_per_cpu *cpu_buffer;
2039 struct ring_buffer_event *event; 2271 struct ring_buffer_event *event = NULL;
2040 unsigned long flags; 2272 unsigned long flags;
2041 2273
2274 again:
2275 /* might be called in atomic */
2276 preempt_disable();
2277
2042 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 2278 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2043 return NULL; 2279 goto out;
2044 2280
2281 cpu_buffer = buffer->buffers[cpu];
2045 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 2282 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2046 2283
2047 event = rb_buffer_peek(buffer, cpu, ts); 2284 event = rb_buffer_peek(buffer, cpu, ts);
2048 if (!event) 2285 if (!event)
2049 goto out; 2286 goto out_unlock;
2050 2287
2051 rb_advance_reader(cpu_buffer); 2288 rb_advance_reader(cpu_buffer);
2052 2289
2053 out: 2290 out_unlock:
2054 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2291 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2055 2292
2293 out:
2294 preempt_enable();
2295
2296 if (event && event->type == RINGBUF_TYPE_PADDING) {
2297 cpu_relax();
2298 goto again;
2299 }
2300
2056 return event; 2301 return event;
2057} 2302}
2058EXPORT_SYMBOL_GPL(ring_buffer_consume); 2303EXPORT_SYMBOL_GPL(ring_buffer_consume);
@@ -2131,6 +2376,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
2131 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; 2376 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
2132 unsigned long flags; 2377 unsigned long flags;
2133 2378
2379 again:
2134 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 2380 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2135 event = rb_iter_peek(iter, ts); 2381 event = rb_iter_peek(iter, ts);
2136 if (!event) 2382 if (!event)
@@ -2140,6 +2386,11 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
2140 out: 2386 out:
2141 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2387 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2142 2388
2389 if (event && event->type == RINGBUF_TYPE_PADDING) {
2390 cpu_relax();
2391 goto again;
2392 }
2393
2143 return event; 2394 return event;
2144} 2395}
2145EXPORT_SYMBOL_GPL(ring_buffer_read); 2396EXPORT_SYMBOL_GPL(ring_buffer_read);
@@ -2232,6 +2483,7 @@ int ring_buffer_empty(struct ring_buffer *buffer)
2232 if (!rb_per_cpu_empty(cpu_buffer)) 2483 if (!rb_per_cpu_empty(cpu_buffer))
2233 return 0; 2484 return 0;
2234 } 2485 }
2486
2235 return 1; 2487 return 1;
2236} 2488}
2237EXPORT_SYMBOL_GPL(ring_buffer_empty); 2489EXPORT_SYMBOL_GPL(ring_buffer_empty);
@@ -2244,12 +2496,16 @@ EXPORT_SYMBOL_GPL(ring_buffer_empty);
2244int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) 2496int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
2245{ 2497{
2246 struct ring_buffer_per_cpu *cpu_buffer; 2498 struct ring_buffer_per_cpu *cpu_buffer;
2499 int ret;
2247 2500
2248 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 2501 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2249 return 1; 2502 return 1;
2250 2503
2251 cpu_buffer = buffer->buffers[cpu]; 2504 cpu_buffer = buffer->buffers[cpu];
2252 return rb_per_cpu_empty(cpu_buffer); 2505 ret = rb_per_cpu_empty(cpu_buffer);
2506
2507
2508 return ret;
2253} 2509}
2254EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu); 2510EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu);
2255 2511
@@ -2268,18 +2524,36 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
2268{ 2524{
2269 struct ring_buffer_per_cpu *cpu_buffer_a; 2525 struct ring_buffer_per_cpu *cpu_buffer_a;
2270 struct ring_buffer_per_cpu *cpu_buffer_b; 2526 struct ring_buffer_per_cpu *cpu_buffer_b;
2527 int ret = -EINVAL;
2271 2528
2272 if (!cpumask_test_cpu(cpu, buffer_a->cpumask) || 2529 if (!cpumask_test_cpu(cpu, buffer_a->cpumask) ||
2273 !cpumask_test_cpu(cpu, buffer_b->cpumask)) 2530 !cpumask_test_cpu(cpu, buffer_b->cpumask))
2274 return -EINVAL; 2531 goto out;
2275 2532
2276 /* At least make sure the two buffers are somewhat the same */ 2533 /* At least make sure the two buffers are somewhat the same */
2277 if (buffer_a->pages != buffer_b->pages) 2534 if (buffer_a->pages != buffer_b->pages)
2278 return -EINVAL; 2535 goto out;
2536
2537 ret = -EAGAIN;
2538
2539 if (ring_buffer_flags != RB_BUFFERS_ON)
2540 goto out;
2541
2542 if (atomic_read(&buffer_a->record_disabled))
2543 goto out;
2544
2545 if (atomic_read(&buffer_b->record_disabled))
2546 goto out;
2279 2547
2280 cpu_buffer_a = buffer_a->buffers[cpu]; 2548 cpu_buffer_a = buffer_a->buffers[cpu];
2281 cpu_buffer_b = buffer_b->buffers[cpu]; 2549 cpu_buffer_b = buffer_b->buffers[cpu];
2282 2550
2551 if (atomic_read(&cpu_buffer_a->record_disabled))
2552 goto out;
2553
2554 if (atomic_read(&cpu_buffer_b->record_disabled))
2555 goto out;
2556
2283 /* 2557 /*
2284 * We can't do a synchronize_sched here because this 2558 * We can't do a synchronize_sched here because this
2285 * function can be called in atomic context. 2559 * function can be called in atomic context.
@@ -2298,18 +2572,21 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
2298 atomic_dec(&cpu_buffer_a->record_disabled); 2572 atomic_dec(&cpu_buffer_a->record_disabled);
2299 atomic_dec(&cpu_buffer_b->record_disabled); 2573 atomic_dec(&cpu_buffer_b->record_disabled);
2300 2574
2301 return 0; 2575 ret = 0;
2576out:
2577 return ret;
2302} 2578}
2303EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); 2579EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
2304 2580
2305static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer, 2581static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer,
2306 struct buffer_data_page *bpage) 2582 struct buffer_data_page *bpage,
2583 unsigned int offset)
2307{ 2584{
2308 struct ring_buffer_event *event; 2585 struct ring_buffer_event *event;
2309 unsigned long head; 2586 unsigned long head;
2310 2587
2311 __raw_spin_lock(&cpu_buffer->lock); 2588 __raw_spin_lock(&cpu_buffer->lock);
2312 for (head = 0; head < local_read(&bpage->commit); 2589 for (head = offset; head < local_read(&bpage->commit);
2313 head += rb_event_length(event)) { 2590 head += rb_event_length(event)) {
2314 2591
2315 event = __rb_data_page_index(bpage, head); 2592 event = __rb_data_page_index(bpage, head);
@@ -2340,8 +2617,8 @@ static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer,
2340 */ 2617 */
2341void *ring_buffer_alloc_read_page(struct ring_buffer *buffer) 2618void *ring_buffer_alloc_read_page(struct ring_buffer *buffer)
2342{ 2619{
2343 unsigned long addr;
2344 struct buffer_data_page *bpage; 2620 struct buffer_data_page *bpage;
2621 unsigned long addr;
2345 2622
2346 addr = __get_free_page(GFP_KERNEL); 2623 addr = __get_free_page(GFP_KERNEL);
2347 if (!addr) 2624 if (!addr)
@@ -2349,6 +2626,8 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer)
2349 2626
2350 bpage = (void *)addr; 2627 bpage = (void *)addr;
2351 2628
2629 rb_init_page(bpage);
2630
2352 return bpage; 2631 return bpage;
2353} 2632}
2354 2633
@@ -2368,6 +2647,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
2368 * ring_buffer_read_page - extract a page from the ring buffer 2647 * ring_buffer_read_page - extract a page from the ring buffer
2369 * @buffer: buffer to extract from 2648 * @buffer: buffer to extract from
2370 * @data_page: the page to use allocated from ring_buffer_alloc_read_page 2649 * @data_page: the page to use allocated from ring_buffer_alloc_read_page
2650 * @len: amount to extract
2371 * @cpu: the cpu of the buffer to extract 2651 * @cpu: the cpu of the buffer to extract
2372 * @full: should the extraction only happen when the page is full. 2652 * @full: should the extraction only happen when the page is full.
2373 * 2653 *
@@ -2377,12 +2657,12 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
2377 * to swap with a page in the ring buffer. 2657 * to swap with a page in the ring buffer.
2378 * 2658 *
2379 * for example: 2659 * for example:
2380 * rpage = ring_buffer_alloc_page(buffer); 2660 * rpage = ring_buffer_alloc_read_page(buffer);
2381 * if (!rpage) 2661 * if (!rpage)
2382 * return error; 2662 * return error;
2383 * ret = ring_buffer_read_page(buffer, &rpage, cpu, 0); 2663 * ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0);
2384 * if (ret) 2664 * if (ret >= 0)
2385 * process_page(rpage); 2665 * process_page(rpage, ret);
2386 * 2666 *
2387 * When @full is set, the function will not return true unless 2667 * When @full is set, the function will not return true unless
2388 * the writer is off the reader page. 2668 * the writer is off the reader page.
@@ -2393,72 +2673,118 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
2393 * responsible for that. 2673 * responsible for that.
2394 * 2674 *
2395 * Returns: 2675 * Returns:
2396 * 1 if data has been transferred 2676 * >=0 if data has been transferred, returns the offset of consumed data.
2397 * 0 if no data has been transferred. 2677 * <0 if no data has been transferred.
2398 */ 2678 */
2399int ring_buffer_read_page(struct ring_buffer *buffer, 2679int ring_buffer_read_page(struct ring_buffer *buffer,
2400 void **data_page, int cpu, int full) 2680 void **data_page, size_t len, int cpu, int full)
2401{ 2681{
2402 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; 2682 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
2403 struct ring_buffer_event *event; 2683 struct ring_buffer_event *event;
2404 struct buffer_data_page *bpage; 2684 struct buffer_data_page *bpage;
2685 struct buffer_page *reader;
2405 unsigned long flags; 2686 unsigned long flags;
2406 int ret = 0; 2687 unsigned int commit;
2688 unsigned int read;
2689 u64 save_timestamp;
2690 int ret = -1;
2691
2692 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2693 goto out;
2694
2695 /*
2696 * If len is not big enough to hold the page header, then
2697 * we can not copy anything.
2698 */
2699 if (len <= BUF_PAGE_HDR_SIZE)
2700 goto out;
2701
2702 len -= BUF_PAGE_HDR_SIZE;
2407 2703
2408 if (!data_page) 2704 if (!data_page)
2409 return 0; 2705 goto out;
2410 2706
2411 bpage = *data_page; 2707 bpage = *data_page;
2412 if (!bpage) 2708 if (!bpage)
2413 return 0; 2709 goto out;
2414 2710
2415 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 2711 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2416 2712
2417 /* 2713 reader = rb_get_reader_page(cpu_buffer);
2418 * rb_buffer_peek will get the next ring buffer if 2714 if (!reader)
2419 * the current reader page is empty. 2715 goto out_unlock;
2420 */ 2716
2421 event = rb_buffer_peek(buffer, cpu, NULL); 2717 event = rb_reader_event(cpu_buffer);
2422 if (!event) 2718
2423 goto out; 2719 read = reader->read;
2720 commit = rb_page_commit(reader);
2424 2721
2425 /* check for data */
2426 if (!local_read(&cpu_buffer->reader_page->page->commit))
2427 goto out;
2428 /* 2722 /*
2429 * If the writer is already off of the read page, then simply 2723 * If this page has been partially read or
2430 * switch the read page with the given page. Otherwise 2724 * if len is not big enough to read the rest of the page or
2431 * we need to copy the data from the reader to the writer. 2725 * a writer is still on the page, then
2726 * we must copy the data from the page to the buffer.
2727 * Otherwise, we can simply swap the page with the one passed in.
2432 */ 2728 */
2433 if (cpu_buffer->reader_page == cpu_buffer->commit_page) { 2729 if (read || (len < (commit - read)) ||
2434 unsigned int read = cpu_buffer->reader_page->read; 2730 cpu_buffer->reader_page == cpu_buffer->commit_page) {
2731 struct buffer_data_page *rpage = cpu_buffer->reader_page->page;
2732 unsigned int rpos = read;
2733 unsigned int pos = 0;
2734 unsigned int size;
2435 2735
2436 if (full) 2736 if (full)
2437 goto out; 2737 goto out_unlock;
2438 /* The writer is still on the reader page, we must copy */ 2738
2439 bpage = cpu_buffer->reader_page->page; 2739 if (len > (commit - read))
2440 memcpy(bpage->data, 2740 len = (commit - read);
2441 cpu_buffer->reader_page->page->data + read, 2741
2442 local_read(&bpage->commit) - read); 2742 size = rb_event_length(event);
2743
2744 if (len < size)
2745 goto out_unlock;
2443 2746
2444 /* consume what was read */ 2747 /* save the current timestamp, since the user will need it */
2445 cpu_buffer->reader_page += read; 2748 save_timestamp = cpu_buffer->read_stamp;
2446 2749
2750 /* Need to copy one event at a time */
2751 do {
2752 memcpy(bpage->data + pos, rpage->data + rpos, size);
2753
2754 len -= size;
2755
2756 rb_advance_reader(cpu_buffer);
2757 rpos = reader->read;
2758 pos += size;
2759
2760 event = rb_reader_event(cpu_buffer);
2761 size = rb_event_length(event);
2762 } while (len > size);
2763
2764 /* update bpage */
2765 local_set(&bpage->commit, pos);
2766 bpage->time_stamp = save_timestamp;
2767
2768 /* we copied everything to the beginning */
2769 read = 0;
2447 } else { 2770 } else {
2448 /* swap the pages */ 2771 /* swap the pages */
2449 rb_init_page(bpage); 2772 rb_init_page(bpage);
2450 bpage = cpu_buffer->reader_page->page; 2773 bpage = reader->page;
2451 cpu_buffer->reader_page->page = *data_page; 2774 reader->page = *data_page;
2452 cpu_buffer->reader_page->read = 0; 2775 local_set(&reader->write, 0);
2776 reader->read = 0;
2453 *data_page = bpage; 2777 *data_page = bpage;
2778
2779 /* update the entry counter */
2780 rb_remove_entries(cpu_buffer, bpage, read);
2454 } 2781 }
2455 ret = 1; 2782 ret = read;
2456 2783
2457 /* update the entry counter */ 2784 out_unlock:
2458 rb_remove_entries(cpu_buffer, bpage);
2459 out:
2460 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2785 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2461 2786
2787 out:
2462 return ret; 2788 return ret;
2463} 2789}
2464 2790
@@ -2466,7 +2792,7 @@ static ssize_t
2466rb_simple_read(struct file *filp, char __user *ubuf, 2792rb_simple_read(struct file *filp, char __user *ubuf,
2467 size_t cnt, loff_t *ppos) 2793 size_t cnt, loff_t *ppos)
2468{ 2794{
2469 long *p = filp->private_data; 2795 unsigned long *p = filp->private_data;
2470 char buf[64]; 2796 char buf[64];
2471 int r; 2797 int r;
2472 2798
@@ -2482,9 +2808,9 @@ static ssize_t
2482rb_simple_write(struct file *filp, const char __user *ubuf, 2808rb_simple_write(struct file *filp, const char __user *ubuf,
2483 size_t cnt, loff_t *ppos) 2809 size_t cnt, loff_t *ppos)
2484{ 2810{
2485 long *p = filp->private_data; 2811 unsigned long *p = filp->private_data;
2486 char buf[64]; 2812 char buf[64];
2487 long val; 2813 unsigned long val;
2488 int ret; 2814 int ret;
2489 2815
2490 if (cnt >= sizeof(buf)) 2816 if (cnt >= sizeof(buf))
@@ -2509,7 +2835,7 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
2509 return cnt; 2835 return cnt;
2510} 2836}
2511 2837
2512static struct file_operations rb_simple_fops = { 2838static const struct file_operations rb_simple_fops = {
2513 .open = tracing_open_generic, 2839 .open = tracing_open_generic,
2514 .read = rb_simple_read, 2840 .read = rb_simple_read,
2515 .write = rb_simple_write, 2841 .write = rb_simple_write,
@@ -2532,3 +2858,42 @@ static __init int rb_init_debugfs(void)
2532} 2858}
2533 2859
2534fs_initcall(rb_init_debugfs); 2860fs_initcall(rb_init_debugfs);
2861
2862#ifdef CONFIG_HOTPLUG_CPU
2863static int rb_cpu_notify(struct notifier_block *self,
2864 unsigned long action, void *hcpu)
2865{
2866 struct ring_buffer *buffer =
2867 container_of(self, struct ring_buffer, cpu_notify);
2868 long cpu = (long)hcpu;
2869
2870 switch (action) {
2871 case CPU_UP_PREPARE:
2872 case CPU_UP_PREPARE_FROZEN:
2873 if (cpu_isset(cpu, *buffer->cpumask))
2874 return NOTIFY_OK;
2875
2876 buffer->buffers[cpu] =
2877 rb_allocate_cpu_buffer(buffer, cpu);
2878 if (!buffer->buffers[cpu]) {
2879 WARN(1, "failed to allocate ring buffer on CPU %ld\n",
2880 cpu);
2881 return NOTIFY_OK;
2882 }
2883 smp_wmb();
2884 cpu_set(cpu, *buffer->cpumask);
2885 break;
2886 case CPU_DOWN_PREPARE:
2887 case CPU_DOWN_PREPARE_FROZEN:
2888 /*
2889 * Do nothing.
2890 * If we were to free the buffer, then the user would
2891 * lose any trace that was in the buffer.
2892 */
2893 break;
2894 default:
2895 break;
2896 }
2897 return NOTIFY_OK;
2898}
2899#endif
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 17bb88d86ac2..cda81ec58d9f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -11,32 +11,34 @@
11 * Copyright (C) 2004-2006 Ingo Molnar 11 * Copyright (C) 2004-2006 Ingo Molnar
12 * Copyright (C) 2004 William Lee Irwin III 12 * Copyright (C) 2004 William Lee Irwin III
13 */ 13 */
14#include <linux/ring_buffer.h>
14#include <linux/utsrelease.h> 15#include <linux/utsrelease.h>
16#include <linux/stacktrace.h>
17#include <linux/writeback.h>
15#include <linux/kallsyms.h> 18#include <linux/kallsyms.h>
16#include <linux/seq_file.h> 19#include <linux/seq_file.h>
17#include <linux/notifier.h> 20#include <linux/notifier.h>
21#include <linux/irqflags.h>
18#include <linux/debugfs.h> 22#include <linux/debugfs.h>
19#include <linux/pagemap.h> 23#include <linux/pagemap.h>
20#include <linux/hardirq.h> 24#include <linux/hardirq.h>
21#include <linux/linkage.h> 25#include <linux/linkage.h>
22#include <linux/uaccess.h> 26#include <linux/uaccess.h>
27#include <linux/kprobes.h>
23#include <linux/ftrace.h> 28#include <linux/ftrace.h>
24#include <linux/module.h> 29#include <linux/module.h>
25#include <linux/percpu.h> 30#include <linux/percpu.h>
31#include <linux/splice.h>
26#include <linux/kdebug.h> 32#include <linux/kdebug.h>
33#include <linux/string.h>
27#include <linux/ctype.h> 34#include <linux/ctype.h>
28#include <linux/init.h> 35#include <linux/init.h>
29#include <linux/poll.h> 36#include <linux/poll.h>
30#include <linux/gfp.h> 37#include <linux/gfp.h>
31#include <linux/fs.h> 38#include <linux/fs.h>
32#include <linux/kprobes.h>
33#include <linux/writeback.h>
34
35#include <linux/stacktrace.h>
36#include <linux/ring_buffer.h>
37#include <linux/irqflags.h>
38 39
39#include "trace.h" 40#include "trace.h"
41#include "trace_output.h"
40 42
41#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE) 43#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE)
42 44
@@ -44,14 +46,25 @@ unsigned long __read_mostly tracing_max_latency;
44unsigned long __read_mostly tracing_thresh; 46unsigned long __read_mostly tracing_thresh;
45 47
46/* 48/*
49 * On boot up, the ring buffer is set to the minimum size, so that
50 * we do not waste memory on systems that are not using tracing.
51 */
52static int ring_buffer_expanded;
53
54/*
47 * We need to change this state when a selftest is running. 55 * We need to change this state when a selftest is running.
48 * A selftest will lurk into the ring-buffer to count the 56 * A selftest will lurk into the ring-buffer to count the
49 * entries inserted during the selftest although some concurrent 57 * entries inserted during the selftest although some concurrent
50 * insertions into the ring-buffer such as ftrace_printk could occurred 58 * insertions into the ring-buffer such as trace_printk could occurred
51 * at the same time, giving false positive or negative results. 59 * at the same time, giving false positive or negative results.
52 */ 60 */
53static bool __read_mostly tracing_selftest_running; 61static bool __read_mostly tracing_selftest_running;
54 62
63/*
64 * If a tracer is running, we do not want to run SELFTEST.
65 */
66static bool __read_mostly tracing_selftest_disabled;
67
55/* For tracers that don't implement custom flags */ 68/* For tracers that don't implement custom flags */
56static struct tracer_opt dummy_tracer_opt[] = { 69static struct tracer_opt dummy_tracer_opt[] = {
57 { } 70 { }
@@ -73,7 +86,7 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set)
73 * of the tracer is successful. But that is the only place that sets 86 * of the tracer is successful. But that is the only place that sets
74 * this back to zero. 87 * this back to zero.
75 */ 88 */
76int tracing_disabled = 1; 89static int tracing_disabled = 1;
77 90
78static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled); 91static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled);
79 92
@@ -91,6 +104,9 @@ static inline void ftrace_enable_cpu(void)
91 104
92static cpumask_var_t __read_mostly tracing_buffer_mask; 105static cpumask_var_t __read_mostly tracing_buffer_mask;
93 106
107/* Define which cpu buffers are currently read in trace_pipe */
108static cpumask_var_t tracing_reader_cpumask;
109
94#define for_each_tracing_cpu(cpu) \ 110#define for_each_tracing_cpu(cpu) \
95 for_each_cpu(cpu, tracing_buffer_mask) 111 for_each_cpu(cpu, tracing_buffer_mask)
96 112
@@ -109,14 +125,21 @@ static cpumask_var_t __read_mostly tracing_buffer_mask;
109 */ 125 */
110int ftrace_dump_on_oops; 126int ftrace_dump_on_oops;
111 127
112static int tracing_set_tracer(char *buf); 128static int tracing_set_tracer(const char *buf);
129
130#define BOOTUP_TRACER_SIZE 100
131static char bootup_tracer_buf[BOOTUP_TRACER_SIZE] __initdata;
132static char *default_bootup_tracer;
113 133
114static int __init set_ftrace(char *str) 134static int __init set_ftrace(char *str)
115{ 135{
116 tracing_set_tracer(str); 136 strncpy(bootup_tracer_buf, str, BOOTUP_TRACER_SIZE);
137 default_bootup_tracer = bootup_tracer_buf;
138 /* We are using ftrace early, expand it */
139 ring_buffer_expanded = 1;
117 return 1; 140 return 1;
118} 141}
119__setup("ftrace", set_ftrace); 142__setup("ftrace=", set_ftrace);
120 143
121static int __init set_ftrace_dump_on_oops(char *str) 144static int __init set_ftrace_dump_on_oops(char *str)
122{ 145{
@@ -125,21 +148,13 @@ static int __init set_ftrace_dump_on_oops(char *str)
125} 148}
126__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); 149__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
127 150
128long 151unsigned long long ns2usecs(cycle_t nsec)
129ns2usecs(cycle_t nsec)
130{ 152{
131 nsec += 500; 153 nsec += 500;
132 do_div(nsec, 1000); 154 do_div(nsec, 1000);
133 return nsec; 155 return nsec;
134} 156}
135 157
136cycle_t ftrace_now(int cpu)
137{
138 u64 ts = ring_buffer_time_stamp(cpu);
139 ring_buffer_normalize_time_stamp(cpu, &ts);
140 return ts;
141}
142
143/* 158/*
144 * The global_trace is the descriptor that holds the tracing 159 * The global_trace is the descriptor that holds the tracing
145 * buffers for the live tracing. For each CPU, it contains 160 * buffers for the live tracing. For each CPU, it contains
@@ -156,6 +171,20 @@ static struct trace_array global_trace;
156 171
157static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); 172static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu);
158 173
174cycle_t ftrace_now(int cpu)
175{
176 u64 ts;
177
178 /* Early boot up does not have a buffer yet */
179 if (!global_trace.buffer)
180 return trace_clock_local();
181
182 ts = ring_buffer_time_stamp(global_trace.buffer, cpu);
183 ring_buffer_normalize_time_stamp(global_trace.buffer, cpu, &ts);
184
185 return ts;
186}
187
159/* 188/*
160 * The max_tr is used to snapshot the global_trace when a maximum 189 * The max_tr is used to snapshot the global_trace when a maximum
161 * latency is reached. Some tracers will use this to store a maximum 190 * latency is reached. Some tracers will use this to store a maximum
@@ -186,9 +215,6 @@ int tracing_is_enabled(void)
186 return tracer_enabled; 215 return tracer_enabled;
187} 216}
188 217
189/* function tracing enabled */
190int ftrace_function_enabled;
191
192/* 218/*
193 * trace_buf_size is the size in bytes that is allocated 219 * trace_buf_size is the size in bytes that is allocated
194 * for a buffer. Note, the number of bytes is always rounded 220 * for a buffer. Note, the number of bytes is always rounded
@@ -229,7 +255,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
229 255
230/* trace_flags holds trace_options default values */ 256/* trace_flags holds trace_options default values */
231unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | 257unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
232 TRACE_ITER_ANNOTATE; 258 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME;
233 259
234/** 260/**
235 * trace_wake_up - wake up tasks waiting for trace input 261 * trace_wake_up - wake up tasks waiting for trace input
@@ -280,13 +306,17 @@ static const char *trace_options[] = {
280 "block", 306 "block",
281 "stacktrace", 307 "stacktrace",
282 "sched-tree", 308 "sched-tree",
283 "ftrace_printk", 309 "trace_printk",
284 "ftrace_preempt", 310 "ftrace_preempt",
285 "branch", 311 "branch",
286 "annotate", 312 "annotate",
287 "userstacktrace", 313 "userstacktrace",
288 "sym-userobj", 314 "sym-userobj",
289 "printk-msg-only", 315 "printk-msg-only",
316 "context-info",
317 "latency-format",
318 "global-clock",
319 "sleep-time",
290 NULL 320 NULL
291}; 321};
292 322
@@ -326,146 +356,37 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
326 data->rt_priority = tsk->rt_priority; 356 data->rt_priority = tsk->rt_priority;
327 357
328 /* record this tasks comm */ 358 /* record this tasks comm */
329 tracing_record_cmdline(current); 359 tracing_record_cmdline(tsk);
330} 360}
331 361
332/** 362ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
333 * trace_seq_printf - sequence printing of trace information
334 * @s: trace sequence descriptor
335 * @fmt: printf format string
336 *
337 * The tracer may use either sequence operations or its own
338 * copy to user routines. To simplify formating of a trace
339 * trace_seq_printf is used to store strings into a special
340 * buffer (@s). Then the output may be either used by
341 * the sequencer or pulled into another buffer.
342 */
343int
344trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
345{ 363{
346 int len = (PAGE_SIZE - 1) - s->len; 364 int len;
347 va_list ap;
348 int ret; 365 int ret;
349 366
350 if (!len) 367 if (!cnt)
351 return 0;
352
353 va_start(ap, fmt);
354 ret = vsnprintf(s->buffer + s->len, len, fmt, ap);
355 va_end(ap);
356
357 /* If we can't write it all, don't bother writing anything */
358 if (ret >= len)
359 return 0;
360
361 s->len += ret;
362
363 return len;
364}
365
366/**
367 * trace_seq_puts - trace sequence printing of simple string
368 * @s: trace sequence descriptor
369 * @str: simple string to record
370 *
371 * The tracer may use either the sequence operations or its own
372 * copy to user routines. This function records a simple string
373 * into a special buffer (@s) for later retrieval by a sequencer
374 * or other mechanism.
375 */
376static int
377trace_seq_puts(struct trace_seq *s, const char *str)
378{
379 int len = strlen(str);
380
381 if (len > ((PAGE_SIZE - 1) - s->len))
382 return 0;
383
384 memcpy(s->buffer + s->len, str, len);
385 s->len += len;
386
387 return len;
388}
389
390static int
391trace_seq_putc(struct trace_seq *s, unsigned char c)
392{
393 if (s->len >= (PAGE_SIZE - 1))
394 return 0;
395
396 s->buffer[s->len++] = c;
397
398 return 1;
399}
400
401static int
402trace_seq_putmem(struct trace_seq *s, void *mem, size_t len)
403{
404 if (len > ((PAGE_SIZE - 1) - s->len))
405 return 0; 368 return 0;
406 369
407 memcpy(s->buffer + s->len, mem, len); 370 if (s->len <= s->readpos)
408 s->len += len; 371 return -EBUSY;
409
410 return len;
411}
412
413#define MAX_MEMHEX_BYTES 8
414#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
415
416static int
417trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len)
418{
419 unsigned char hex[HEX_CHARS];
420 unsigned char *data = mem;
421 int i, j;
422
423#ifdef __BIG_ENDIAN
424 for (i = 0, j = 0; i < len; i++) {
425#else
426 for (i = len-1, j = 0; i >= 0; i--) {
427#endif
428 hex[j++] = hex_asc_hi(data[i]);
429 hex[j++] = hex_asc_lo(data[i]);
430 }
431 hex[j++] = ' ';
432
433 return trace_seq_putmem(s, hex, j);
434}
435
436static int
437trace_seq_path(struct trace_seq *s, struct path *path)
438{
439 unsigned char *p;
440 372
441 if (s->len >= (PAGE_SIZE - 1)) 373 len = s->len - s->readpos;
442 return 0; 374 if (cnt > len)
443 p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); 375 cnt = len;
444 if (!IS_ERR(p)) { 376 ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt);
445 p = mangle_path(s->buffer + s->len, p, "\n"); 377 if (ret == cnt)
446 if (p) { 378 return -EFAULT;
447 s->len = p - s->buffer;
448 return 1;
449 }
450 } else {
451 s->buffer[s->len++] = '?';
452 return 1;
453 }
454 379
455 return 0; 380 cnt -= ret;
456}
457 381
458static void 382 s->readpos += cnt;
459trace_seq_reset(struct trace_seq *s) 383 return cnt;
460{
461 s->len = 0;
462 s->readpos = 0;
463} 384}
464 385
465ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt) 386static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
466{ 387{
467 int len; 388 int len;
468 int ret; 389 void *ret;
469 390
470 if (s->len <= s->readpos) 391 if (s->len <= s->readpos)
471 return -EBUSY; 392 return -EBUSY;
@@ -473,11 +394,11 @@ ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
473 len = s->len - s->readpos; 394 len = s->len - s->readpos;
474 if (cnt > len) 395 if (cnt > len)
475 cnt = len; 396 cnt = len;
476 ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt); 397 ret = memcpy(buf, s->buffer + s->readpos, cnt);
477 if (ret) 398 if (!ret)
478 return -EFAULT; 399 return -EFAULT;
479 400
480 s->readpos += len; 401 s->readpos += cnt;
481 return cnt; 402 return cnt;
482} 403}
483 404
@@ -489,7 +410,7 @@ trace_print_seq(struct seq_file *m, struct trace_seq *s)
489 s->buffer[len] = 0; 410 s->buffer[len] = 0;
490 seq_puts(m, s->buffer); 411 seq_puts(m, s->buffer);
491 412
492 trace_seq_reset(s); 413 trace_seq_init(s);
493} 414}
494 415
495/** 416/**
@@ -543,7 +464,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
543 464
544 ftrace_enable_cpu(); 465 ftrace_enable_cpu();
545 466
546 WARN_ON_ONCE(ret); 467 WARN_ON_ONCE(ret && ret != -EAGAIN);
547 468
548 __update_max_tr(tr, tsk, cpu); 469 __update_max_tr(tr, tsk, cpu);
549 __raw_spin_unlock(&ftrace_max_lock); 470 __raw_spin_unlock(&ftrace_max_lock);
@@ -556,6 +477,8 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
556 * Register a new plugin tracer. 477 * Register a new plugin tracer.
557 */ 478 */
558int register_tracer(struct tracer *type) 479int register_tracer(struct tracer *type)
480__releases(kernel_lock)
481__acquires(kernel_lock)
559{ 482{
560 struct tracer *t; 483 struct tracer *t;
561 int len; 484 int len;
@@ -594,9 +517,12 @@ int register_tracer(struct tracer *type)
594 else 517 else
595 if (!type->flags->opts) 518 if (!type->flags->opts)
596 type->flags->opts = dummy_tracer_opt; 519 type->flags->opts = dummy_tracer_opt;
520 if (!type->wait_pipe)
521 type->wait_pipe = default_wait_pipe;
522
597 523
598#ifdef CONFIG_FTRACE_STARTUP_TEST 524#ifdef CONFIG_FTRACE_STARTUP_TEST
599 if (type->selftest) { 525 if (type->selftest && !tracing_selftest_disabled) {
600 struct tracer *saved_tracer = current_trace; 526 struct tracer *saved_tracer = current_trace;
601 struct trace_array *tr = &global_trace; 527 struct trace_array *tr = &global_trace;
602 int i; 528 int i;
@@ -638,8 +564,26 @@ int register_tracer(struct tracer *type)
638 out: 564 out:
639 tracing_selftest_running = false; 565 tracing_selftest_running = false;
640 mutex_unlock(&trace_types_lock); 566 mutex_unlock(&trace_types_lock);
641 lock_kernel();
642 567
568 if (ret || !default_bootup_tracer)
569 goto out_unlock;
570
571 if (strncmp(default_bootup_tracer, type->name, BOOTUP_TRACER_SIZE))
572 goto out_unlock;
573
574 printk(KERN_INFO "Starting tracer '%s'\n", type->name);
575 /* Do we want this tracer to start on bootup? */
576 tracing_set_tracer(type->name);
577 default_bootup_tracer = NULL;
578 /* disable other selftests, since this will break it. */
579 tracing_selftest_disabled = 1;
580#ifdef CONFIG_FTRACE_STARTUP_TEST
581 printk(KERN_INFO "Disabling FTRACE selftests due to running tracer '%s'\n",
582 type->name);
583#endif
584
585 out_unlock:
586 lock_kernel();
643 return ret; 587 return ret;
644} 588}
645 589
@@ -658,6 +602,15 @@ void unregister_tracer(struct tracer *type)
658 602
659 found: 603 found:
660 *t = (*t)->next; 604 *t = (*t)->next;
605
606 if (type == current_trace && tracer_enabled) {
607 tracer_enabled = 0;
608 tracing_stop();
609 if (current_trace->stop)
610 current_trace->stop(&global_trace);
611 current_trace = &nop_trace;
612 }
613
661 if (strlen(type->name) != max_tracer_type_len) 614 if (strlen(type->name) != max_tracer_type_len)
662 goto out; 615 goto out;
663 616
@@ -689,19 +642,20 @@ void tracing_reset_online_cpus(struct trace_array *tr)
689} 642}
690 643
691#define SAVED_CMDLINES 128 644#define SAVED_CMDLINES 128
645#define NO_CMDLINE_MAP UINT_MAX
692static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; 646static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
693static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; 647static unsigned map_cmdline_to_pid[SAVED_CMDLINES];
694static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN]; 648static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN];
695static int cmdline_idx; 649static int cmdline_idx;
696static DEFINE_SPINLOCK(trace_cmdline_lock); 650static raw_spinlock_t trace_cmdline_lock = __RAW_SPIN_LOCK_UNLOCKED;
697 651
698/* temporary disable recording */ 652/* temporary disable recording */
699atomic_t trace_record_cmdline_disabled __read_mostly; 653static atomic_t trace_record_cmdline_disabled __read_mostly;
700 654
701static void trace_init_cmdlines(void) 655static void trace_init_cmdlines(void)
702{ 656{
703 memset(&map_pid_to_cmdline, -1, sizeof(map_pid_to_cmdline)); 657 memset(&map_pid_to_cmdline, NO_CMDLINE_MAP, sizeof(map_pid_to_cmdline));
704 memset(&map_cmdline_to_pid, -1, sizeof(map_cmdline_to_pid)); 658 memset(&map_cmdline_to_pid, NO_CMDLINE_MAP, sizeof(map_cmdline_to_pid));
705 cmdline_idx = 0; 659 cmdline_idx = 0;
706} 660}
707 661
@@ -738,13 +692,12 @@ void tracing_start(void)
738 return; 692 return;
739 693
740 spin_lock_irqsave(&tracing_start_lock, flags); 694 spin_lock_irqsave(&tracing_start_lock, flags);
741 if (--trace_stop_count) 695 if (--trace_stop_count) {
742 goto out; 696 if (trace_stop_count < 0) {
743 697 /* Someone screwed up their debugging */
744 if (trace_stop_count < 0) { 698 WARN_ON_ONCE(1);
745 /* Someone screwed up their debugging */ 699 trace_stop_count = 0;
746 WARN_ON_ONCE(1); 700 }
747 trace_stop_count = 0;
748 goto out; 701 goto out;
749 } 702 }
750 703
@@ -794,8 +747,7 @@ void trace_stop_cmdline_recording(void);
794 747
795static void trace_save_cmdline(struct task_struct *tsk) 748static void trace_save_cmdline(struct task_struct *tsk)
796{ 749{
797 unsigned map; 750 unsigned pid, idx;
798 unsigned idx;
799 751
800 if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT)) 752 if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT))
801 return; 753 return;
@@ -806,17 +758,24 @@ static void trace_save_cmdline(struct task_struct *tsk)
806 * nor do we want to disable interrupts, 758 * nor do we want to disable interrupts,
807 * so if we miss here, then better luck next time. 759 * so if we miss here, then better luck next time.
808 */ 760 */
809 if (!spin_trylock(&trace_cmdline_lock)) 761 if (!__raw_spin_trylock(&trace_cmdline_lock))
810 return; 762 return;
811 763
812 idx = map_pid_to_cmdline[tsk->pid]; 764 idx = map_pid_to_cmdline[tsk->pid];
813 if (idx >= SAVED_CMDLINES) { 765 if (idx == NO_CMDLINE_MAP) {
814 idx = (cmdline_idx + 1) % SAVED_CMDLINES; 766 idx = (cmdline_idx + 1) % SAVED_CMDLINES;
815 767
816 map = map_cmdline_to_pid[idx]; 768 /*
817 if (map <= PID_MAX_DEFAULT) 769 * Check whether the cmdline buffer at idx has a pid
818 map_pid_to_cmdline[map] = (unsigned)-1; 770 * mapped. We are going to overwrite that entry so we
771 * need to clear the map_pid_to_cmdline. Otherwise we
772 * would read the new comm for the old pid.
773 */
774 pid = map_cmdline_to_pid[idx];
775 if (pid != NO_CMDLINE_MAP)
776 map_pid_to_cmdline[pid] = NO_CMDLINE_MAP;
819 777
778 map_cmdline_to_pid[idx] = tsk->pid;
820 map_pid_to_cmdline[tsk->pid] = idx; 779 map_pid_to_cmdline[tsk->pid] = idx;
821 780
822 cmdline_idx = idx; 781 cmdline_idx = idx;
@@ -824,33 +783,37 @@ static void trace_save_cmdline(struct task_struct *tsk)
824 783
825 memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN); 784 memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN);
826 785
827 spin_unlock(&trace_cmdline_lock); 786 __raw_spin_unlock(&trace_cmdline_lock);
828} 787}
829 788
830char *trace_find_cmdline(int pid) 789void trace_find_cmdline(int pid, char comm[])
831{ 790{
832 char *cmdline = "<...>";
833 unsigned map; 791 unsigned map;
834 792
835 if (!pid) 793 if (!pid) {
836 return "<idle>"; 794 strcpy(comm, "<idle>");
795 return;
796 }
837 797
838 if (pid > PID_MAX_DEFAULT) 798 if (pid > PID_MAX_DEFAULT) {
839 goto out; 799 strcpy(comm, "<...>");
800 return;
801 }
840 802
803 __raw_spin_lock(&trace_cmdline_lock);
841 map = map_pid_to_cmdline[pid]; 804 map = map_pid_to_cmdline[pid];
842 if (map >= SAVED_CMDLINES) 805 if (map != NO_CMDLINE_MAP)
843 goto out; 806 strcpy(comm, saved_cmdlines[map]);
844 807 else
845 cmdline = saved_cmdlines[map]; 808 strcpy(comm, "<...>");
846 809
847 out: 810 __raw_spin_unlock(&trace_cmdline_lock);
848 return cmdline;
849} 811}
850 812
851void tracing_record_cmdline(struct task_struct *tsk) 813void tracing_record_cmdline(struct task_struct *tsk)
852{ 814{
853 if (atomic_read(&trace_record_cmdline_disabled)) 815 if (atomic_read(&trace_record_cmdline_disabled) || !tracer_enabled ||
816 !tracing_is_on())
854 return; 817 return;
855 818
856 trace_save_cmdline(tsk); 819 trace_save_cmdline(tsk);
@@ -864,7 +827,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
864 827
865 entry->preempt_count = pc & 0xff; 828 entry->preempt_count = pc & 0xff;
866 entry->pid = (tsk) ? tsk->pid : 0; 829 entry->pid = (tsk) ? tsk->pid : 0;
867 entry->tgid = (tsk) ? tsk->tgid : 0; 830 entry->tgid = (tsk) ? tsk->tgid : 0;
868 entry->flags = 831 entry->flags =
869#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT 832#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
870 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | 833 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -876,78 +839,132 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
876 (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); 839 (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);
877} 840}
878 841
842struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
843 unsigned char type,
844 unsigned long len,
845 unsigned long flags, int pc)
846{
847 struct ring_buffer_event *event;
848
849 event = ring_buffer_lock_reserve(tr->buffer, len);
850 if (event != NULL) {
851 struct trace_entry *ent = ring_buffer_event_data(event);
852
853 tracing_generic_entry_update(ent, flags, pc);
854 ent->type = type;
855 }
856
857 return event;
858}
859static void ftrace_trace_stack(struct trace_array *tr,
860 unsigned long flags, int skip, int pc);
861static void ftrace_trace_userstack(struct trace_array *tr,
862 unsigned long flags, int pc);
863
864static inline void __trace_buffer_unlock_commit(struct trace_array *tr,
865 struct ring_buffer_event *event,
866 unsigned long flags, int pc,
867 int wake)
868{
869 ring_buffer_unlock_commit(tr->buffer, event);
870
871 ftrace_trace_stack(tr, flags, 6, pc);
872 ftrace_trace_userstack(tr, flags, pc);
873
874 if (wake)
875 trace_wake_up();
876}
877
878void trace_buffer_unlock_commit(struct trace_array *tr,
879 struct ring_buffer_event *event,
880 unsigned long flags, int pc)
881{
882 __trace_buffer_unlock_commit(tr, event, flags, pc, 1);
883}
884
885struct ring_buffer_event *
886trace_current_buffer_lock_reserve(unsigned char type, unsigned long len,
887 unsigned long flags, int pc)
888{
889 return trace_buffer_lock_reserve(&global_trace,
890 type, len, flags, pc);
891}
892
893void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
894 unsigned long flags, int pc)
895{
896 return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1);
897}
898
899void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
900 unsigned long flags, int pc)
901{
902 return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0);
903}
904
879void 905void
880trace_function(struct trace_array *tr, struct trace_array_cpu *data, 906trace_function(struct trace_array *tr,
881 unsigned long ip, unsigned long parent_ip, unsigned long flags, 907 unsigned long ip, unsigned long parent_ip, unsigned long flags,
882 int pc) 908 int pc)
883{ 909{
884 struct ring_buffer_event *event; 910 struct ring_buffer_event *event;
885 struct ftrace_entry *entry; 911 struct ftrace_entry *entry;
886 unsigned long irq_flags;
887 912
888 /* If we are reading the ring buffer, don't trace */ 913 /* If we are reading the ring buffer, don't trace */
889 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) 914 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
890 return; 915 return;
891 916
892 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), 917 event = trace_buffer_lock_reserve(tr, TRACE_FN, sizeof(*entry),
893 &irq_flags); 918 flags, pc);
894 if (!event) 919 if (!event)
895 return; 920 return;
896 entry = ring_buffer_event_data(event); 921 entry = ring_buffer_event_data(event);
897 tracing_generic_entry_update(&entry->ent, flags, pc);
898 entry->ent.type = TRACE_FN;
899 entry->ip = ip; 922 entry->ip = ip;
900 entry->parent_ip = parent_ip; 923 entry->parent_ip = parent_ip;
901 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 924 ring_buffer_unlock_commit(tr->buffer, event);
902} 925}
903 926
904#ifdef CONFIG_FUNCTION_GRAPH_TRACER 927#ifdef CONFIG_FUNCTION_GRAPH_TRACER
905static void __trace_graph_entry(struct trace_array *tr, 928static int __trace_graph_entry(struct trace_array *tr,
906 struct trace_array_cpu *data,
907 struct ftrace_graph_ent *trace, 929 struct ftrace_graph_ent *trace,
908 unsigned long flags, 930 unsigned long flags,
909 int pc) 931 int pc)
910{ 932{
911 struct ring_buffer_event *event; 933 struct ring_buffer_event *event;
912 struct ftrace_graph_ent_entry *entry; 934 struct ftrace_graph_ent_entry *entry;
913 unsigned long irq_flags;
914 935
915 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) 936 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
916 return; 937 return 0;
917 938
918 event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry), 939 event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_ENT,
919 &irq_flags); 940 sizeof(*entry), flags, pc);
920 if (!event) 941 if (!event)
921 return; 942 return 0;
922 entry = ring_buffer_event_data(event); 943 entry = ring_buffer_event_data(event);
923 tracing_generic_entry_update(&entry->ent, flags, pc);
924 entry->ent.type = TRACE_GRAPH_ENT;
925 entry->graph_ent = *trace; 944 entry->graph_ent = *trace;
926 ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags); 945 ring_buffer_unlock_commit(global_trace.buffer, event);
946
947 return 1;
927} 948}
928 949
929static void __trace_graph_return(struct trace_array *tr, 950static void __trace_graph_return(struct trace_array *tr,
930 struct trace_array_cpu *data,
931 struct ftrace_graph_ret *trace, 951 struct ftrace_graph_ret *trace,
932 unsigned long flags, 952 unsigned long flags,
933 int pc) 953 int pc)
934{ 954{
935 struct ring_buffer_event *event; 955 struct ring_buffer_event *event;
936 struct ftrace_graph_ret_entry *entry; 956 struct ftrace_graph_ret_entry *entry;
937 unsigned long irq_flags;
938 957
939 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) 958 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
940 return; 959 return;
941 960
942 event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry), 961 event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_RET,
943 &irq_flags); 962 sizeof(*entry), flags, pc);
944 if (!event) 963 if (!event)
945 return; 964 return;
946 entry = ring_buffer_event_data(event); 965 entry = ring_buffer_event_data(event);
947 tracing_generic_entry_update(&entry->ent, flags, pc);
948 entry->ent.type = TRACE_GRAPH_RET;
949 entry->ret = *trace; 966 entry->ret = *trace;
950 ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags); 967 ring_buffer_unlock_commit(global_trace.buffer, event);
951} 968}
952#endif 969#endif
953 970
@@ -957,31 +974,23 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data,
957 int pc) 974 int pc)
958{ 975{
959 if (likely(!atomic_read(&data->disabled))) 976 if (likely(!atomic_read(&data->disabled)))
960 trace_function(tr, data, ip, parent_ip, flags, pc); 977 trace_function(tr, ip, parent_ip, flags, pc);
961} 978}
962 979
963static void ftrace_trace_stack(struct trace_array *tr, 980static void __ftrace_trace_stack(struct trace_array *tr,
964 struct trace_array_cpu *data, 981 unsigned long flags,
965 unsigned long flags, 982 int skip, int pc)
966 int skip, int pc)
967{ 983{
968#ifdef CONFIG_STACKTRACE 984#ifdef CONFIG_STACKTRACE
969 struct ring_buffer_event *event; 985 struct ring_buffer_event *event;
970 struct stack_entry *entry; 986 struct stack_entry *entry;
971 struct stack_trace trace; 987 struct stack_trace trace;
972 unsigned long irq_flags;
973 988
974 if (!(trace_flags & TRACE_ITER_STACKTRACE)) 989 event = trace_buffer_lock_reserve(tr, TRACE_STACK,
975 return; 990 sizeof(*entry), flags, pc);
976
977 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
978 &irq_flags);
979 if (!event) 991 if (!event)
980 return; 992 return;
981 entry = ring_buffer_event_data(event); 993 entry = ring_buffer_event_data(event);
982 tracing_generic_entry_update(&entry->ent, flags, pc);
983 entry->ent.type = TRACE_STACK;
984
985 memset(&entry->caller, 0, sizeof(entry->caller)); 994 memset(&entry->caller, 0, sizeof(entry->caller));
986 995
987 trace.nr_entries = 0; 996 trace.nr_entries = 0;
@@ -990,38 +999,43 @@ static void ftrace_trace_stack(struct trace_array *tr,
990 trace.entries = entry->caller; 999 trace.entries = entry->caller;
991 1000
992 save_stack_trace(&trace); 1001 save_stack_trace(&trace);
993 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 1002 ring_buffer_unlock_commit(tr->buffer, event);
994#endif 1003#endif
995} 1004}
996 1005
1006static void ftrace_trace_stack(struct trace_array *tr,
1007 unsigned long flags,
1008 int skip, int pc)
1009{
1010 if (!(trace_flags & TRACE_ITER_STACKTRACE))
1011 return;
1012
1013 __ftrace_trace_stack(tr, flags, skip, pc);
1014}
1015
997void __trace_stack(struct trace_array *tr, 1016void __trace_stack(struct trace_array *tr,
998 struct trace_array_cpu *data,
999 unsigned long flags, 1017 unsigned long flags,
1000 int skip) 1018 int skip, int pc)
1001{ 1019{
1002 ftrace_trace_stack(tr, data, flags, skip, preempt_count()); 1020 __ftrace_trace_stack(tr, flags, skip, pc);
1003} 1021}
1004 1022
1005static void ftrace_trace_userstack(struct trace_array *tr, 1023static void ftrace_trace_userstack(struct trace_array *tr,
1006 struct trace_array_cpu *data, 1024 unsigned long flags, int pc)
1007 unsigned long flags, int pc)
1008{ 1025{
1009#ifdef CONFIG_STACKTRACE 1026#ifdef CONFIG_STACKTRACE
1010 struct ring_buffer_event *event; 1027 struct ring_buffer_event *event;
1011 struct userstack_entry *entry; 1028 struct userstack_entry *entry;
1012 struct stack_trace trace; 1029 struct stack_trace trace;
1013 unsigned long irq_flags;
1014 1030
1015 if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) 1031 if (!(trace_flags & TRACE_ITER_USERSTACKTRACE))
1016 return; 1032 return;
1017 1033
1018 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), 1034 event = trace_buffer_lock_reserve(tr, TRACE_USER_STACK,
1019 &irq_flags); 1035 sizeof(*entry), flags, pc);
1020 if (!event) 1036 if (!event)
1021 return; 1037 return;
1022 entry = ring_buffer_event_data(event); 1038 entry = ring_buffer_event_data(event);
1023 tracing_generic_entry_update(&entry->ent, flags, pc);
1024 entry->ent.type = TRACE_USER_STACK;
1025 1039
1026 memset(&entry->caller, 0, sizeof(entry->caller)); 1040 memset(&entry->caller, 0, sizeof(entry->caller));
1027 1041
@@ -1031,70 +1045,58 @@ static void ftrace_trace_userstack(struct trace_array *tr,
1031 trace.entries = entry->caller; 1045 trace.entries = entry->caller;
1032 1046
1033 save_stack_trace_user(&trace); 1047 save_stack_trace_user(&trace);
1034 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 1048 ring_buffer_unlock_commit(tr->buffer, event);
1035#endif 1049#endif
1036} 1050}
1037 1051
1038void __trace_userstack(struct trace_array *tr, 1052#ifdef UNUSED
1039 struct trace_array_cpu *data, 1053static void __trace_userstack(struct trace_array *tr, unsigned long flags)
1040 unsigned long flags)
1041{ 1054{
1042 ftrace_trace_userstack(tr, data, flags, preempt_count()); 1055 ftrace_trace_userstack(tr, flags, preempt_count());
1043} 1056}
1057#endif /* UNUSED */
1044 1058
1045static void 1059static void
1046ftrace_trace_special(void *__tr, void *__data, 1060ftrace_trace_special(void *__tr,
1047 unsigned long arg1, unsigned long arg2, unsigned long arg3, 1061 unsigned long arg1, unsigned long arg2, unsigned long arg3,
1048 int pc) 1062 int pc)
1049{ 1063{
1050 struct ring_buffer_event *event; 1064 struct ring_buffer_event *event;
1051 struct trace_array_cpu *data = __data;
1052 struct trace_array *tr = __tr; 1065 struct trace_array *tr = __tr;
1053 struct special_entry *entry; 1066 struct special_entry *entry;
1054 unsigned long irq_flags;
1055 1067
1056 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), 1068 event = trace_buffer_lock_reserve(tr, TRACE_SPECIAL,
1057 &irq_flags); 1069 sizeof(*entry), 0, pc);
1058 if (!event) 1070 if (!event)
1059 return; 1071 return;
1060 entry = ring_buffer_event_data(event); 1072 entry = ring_buffer_event_data(event);
1061 tracing_generic_entry_update(&entry->ent, 0, pc);
1062 entry->ent.type = TRACE_SPECIAL;
1063 entry->arg1 = arg1; 1073 entry->arg1 = arg1;
1064 entry->arg2 = arg2; 1074 entry->arg2 = arg2;
1065 entry->arg3 = arg3; 1075 entry->arg3 = arg3;
1066 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 1076 trace_buffer_unlock_commit(tr, event, 0, pc);
1067 ftrace_trace_stack(tr, data, irq_flags, 4, pc);
1068 ftrace_trace_userstack(tr, data, irq_flags, pc);
1069
1070 trace_wake_up();
1071} 1077}
1072 1078
1073void 1079void
1074__trace_special(void *__tr, void *__data, 1080__trace_special(void *__tr, void *__data,
1075 unsigned long arg1, unsigned long arg2, unsigned long arg3) 1081 unsigned long arg1, unsigned long arg2, unsigned long arg3)
1076{ 1082{
1077 ftrace_trace_special(__tr, __data, arg1, arg2, arg3, preempt_count()); 1083 ftrace_trace_special(__tr, arg1, arg2, arg3, preempt_count());
1078} 1084}
1079 1085
1080void 1086void
1081tracing_sched_switch_trace(struct trace_array *tr, 1087tracing_sched_switch_trace(struct trace_array *tr,
1082 struct trace_array_cpu *data,
1083 struct task_struct *prev, 1088 struct task_struct *prev,
1084 struct task_struct *next, 1089 struct task_struct *next,
1085 unsigned long flags, int pc) 1090 unsigned long flags, int pc)
1086{ 1091{
1087 struct ring_buffer_event *event; 1092 struct ring_buffer_event *event;
1088 struct ctx_switch_entry *entry; 1093 struct ctx_switch_entry *entry;
1089 unsigned long irq_flags;
1090 1094
1091 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), 1095 event = trace_buffer_lock_reserve(tr, TRACE_CTX,
1092 &irq_flags); 1096 sizeof(*entry), flags, pc);
1093 if (!event) 1097 if (!event)
1094 return; 1098 return;
1095 entry = ring_buffer_event_data(event); 1099 entry = ring_buffer_event_data(event);
1096 tracing_generic_entry_update(&entry->ent, flags, pc);
1097 entry->ent.type = TRACE_CTX;
1098 entry->prev_pid = prev->pid; 1100 entry->prev_pid = prev->pid;
1099 entry->prev_prio = prev->prio; 1101 entry->prev_prio = prev->prio;
1100 entry->prev_state = prev->state; 1102 entry->prev_state = prev->state;
@@ -1102,29 +1104,23 @@ tracing_sched_switch_trace(struct trace_array *tr,
1102 entry->next_prio = next->prio; 1104 entry->next_prio = next->prio;
1103 entry->next_state = next->state; 1105 entry->next_state = next->state;
1104 entry->next_cpu = task_cpu(next); 1106 entry->next_cpu = task_cpu(next);
1105 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 1107 trace_buffer_unlock_commit(tr, event, flags, pc);
1106 ftrace_trace_stack(tr, data, flags, 5, pc);
1107 ftrace_trace_userstack(tr, data, flags, pc);
1108} 1108}
1109 1109
1110void 1110void
1111tracing_sched_wakeup_trace(struct trace_array *tr, 1111tracing_sched_wakeup_trace(struct trace_array *tr,
1112 struct trace_array_cpu *data,
1113 struct task_struct *wakee, 1112 struct task_struct *wakee,
1114 struct task_struct *curr, 1113 struct task_struct *curr,
1115 unsigned long flags, int pc) 1114 unsigned long flags, int pc)
1116{ 1115{
1117 struct ring_buffer_event *event; 1116 struct ring_buffer_event *event;
1118 struct ctx_switch_entry *entry; 1117 struct ctx_switch_entry *entry;
1119 unsigned long irq_flags;
1120 1118
1121 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), 1119 event = trace_buffer_lock_reserve(tr, TRACE_WAKE,
1122 &irq_flags); 1120 sizeof(*entry), flags, pc);
1123 if (!event) 1121 if (!event)
1124 return; 1122 return;
1125 entry = ring_buffer_event_data(event); 1123 entry = ring_buffer_event_data(event);
1126 tracing_generic_entry_update(&entry->ent, flags, pc);
1127 entry->ent.type = TRACE_WAKE;
1128 entry->prev_pid = curr->pid; 1124 entry->prev_pid = curr->pid;
1129 entry->prev_prio = curr->prio; 1125 entry->prev_prio = curr->prio;
1130 entry->prev_state = curr->state; 1126 entry->prev_state = curr->state;
@@ -1132,11 +1128,10 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
1132 entry->next_prio = wakee->prio; 1128 entry->next_prio = wakee->prio;
1133 entry->next_state = wakee->state; 1129 entry->next_state = wakee->state;
1134 entry->next_cpu = task_cpu(wakee); 1130 entry->next_cpu = task_cpu(wakee);
1135 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
1136 ftrace_trace_stack(tr, data, flags, 6, pc);
1137 ftrace_trace_userstack(tr, data, flags, pc);
1138 1131
1139 trace_wake_up(); 1132 ring_buffer_unlock_commit(tr->buffer, event);
1133 ftrace_trace_stack(tr, flags, 6, pc);
1134 ftrace_trace_userstack(tr, flags, pc);
1140} 1135}
1141 1136
1142void 1137void
@@ -1157,66 +1152,7 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
1157 data = tr->data[cpu]; 1152 data = tr->data[cpu];
1158 1153
1159 if (likely(atomic_inc_return(&data->disabled) == 1)) 1154 if (likely(atomic_inc_return(&data->disabled) == 1))
1160 ftrace_trace_special(tr, data, arg1, arg2, arg3, pc); 1155 ftrace_trace_special(tr, arg1, arg2, arg3, pc);
1161
1162 atomic_dec(&data->disabled);
1163 local_irq_restore(flags);
1164}
1165
1166#ifdef CONFIG_FUNCTION_TRACER
1167static void
1168function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
1169{
1170 struct trace_array *tr = &global_trace;
1171 struct trace_array_cpu *data;
1172 unsigned long flags;
1173 long disabled;
1174 int cpu, resched;
1175 int pc;
1176
1177 if (unlikely(!ftrace_function_enabled))
1178 return;
1179
1180 pc = preempt_count();
1181 resched = ftrace_preempt_disable();
1182 local_save_flags(flags);
1183 cpu = raw_smp_processor_id();
1184 data = tr->data[cpu];
1185 disabled = atomic_inc_return(&data->disabled);
1186
1187 if (likely(disabled == 1))
1188 trace_function(tr, data, ip, parent_ip, flags, pc);
1189
1190 atomic_dec(&data->disabled);
1191 ftrace_preempt_enable(resched);
1192}
1193
1194static void
1195function_trace_call(unsigned long ip, unsigned long parent_ip)
1196{
1197 struct trace_array *tr = &global_trace;
1198 struct trace_array_cpu *data;
1199 unsigned long flags;
1200 long disabled;
1201 int cpu;
1202 int pc;
1203
1204 if (unlikely(!ftrace_function_enabled))
1205 return;
1206
1207 /*
1208 * Need to use raw, since this must be called before the
1209 * recursive protection is performed.
1210 */
1211 local_irq_save(flags);
1212 cpu = raw_smp_processor_id();
1213 data = tr->data[cpu];
1214 disabled = atomic_inc_return(&data->disabled);
1215
1216 if (likely(disabled == 1)) {
1217 pc = preempt_count();
1218 trace_function(tr, data, ip, parent_ip, flags, pc);
1219 }
1220 1156
1221 atomic_dec(&data->disabled); 1157 atomic_dec(&data->disabled);
1222 local_irq_restore(flags); 1158 local_irq_restore(flags);
@@ -1229,6 +1165,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
1229 struct trace_array_cpu *data; 1165 struct trace_array_cpu *data;
1230 unsigned long flags; 1166 unsigned long flags;
1231 long disabled; 1167 long disabled;
1168 int ret;
1232 int cpu; 1169 int cpu;
1233 int pc; 1170 int pc;
1234 1171
@@ -1244,15 +1181,18 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
1244 disabled = atomic_inc_return(&data->disabled); 1181 disabled = atomic_inc_return(&data->disabled);
1245 if (likely(disabled == 1)) { 1182 if (likely(disabled == 1)) {
1246 pc = preempt_count(); 1183 pc = preempt_count();
1247 __trace_graph_entry(tr, data, trace, flags, pc); 1184 ret = __trace_graph_entry(tr, trace, flags, pc);
1185 } else {
1186 ret = 0;
1248 } 1187 }
1249 /* Only do the atomic if it is not already set */ 1188 /* Only do the atomic if it is not already set */
1250 if (!test_tsk_trace_graph(current)) 1189 if (!test_tsk_trace_graph(current))
1251 set_tsk_trace_graph(current); 1190 set_tsk_trace_graph(current);
1191
1252 atomic_dec(&data->disabled); 1192 atomic_dec(&data->disabled);
1253 local_irq_restore(flags); 1193 local_irq_restore(flags);
1254 1194
1255 return 1; 1195 return ret;
1256} 1196}
1257 1197
1258void trace_graph_return(struct ftrace_graph_ret *trace) 1198void trace_graph_return(struct ftrace_graph_ret *trace)
@@ -1270,7 +1210,7 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
1270 disabled = atomic_inc_return(&data->disabled); 1210 disabled = atomic_inc_return(&data->disabled);
1271 if (likely(disabled == 1)) { 1211 if (likely(disabled == 1)) {
1272 pc = preempt_count(); 1212 pc = preempt_count();
1273 __trace_graph_return(tr, data, trace, flags, pc); 1213 __trace_graph_return(tr, trace, flags, pc);
1274 } 1214 }
1275 if (!trace->depth) 1215 if (!trace->depth)
1276 clear_tsk_trace_graph(current); 1216 clear_tsk_trace_graph(current);
@@ -1279,30 +1219,122 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
1279} 1219}
1280#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 1220#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
1281 1221
1282static struct ftrace_ops trace_ops __read_mostly =
1283{
1284 .func = function_trace_call,
1285};
1286 1222
1287void tracing_start_function_trace(void) 1223/**
1224 * trace_vbprintk - write binary msg to tracing buffer
1225 *
1226 */
1227int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1288{ 1228{
1289 ftrace_function_enabled = 0; 1229 static raw_spinlock_t trace_buf_lock =
1230 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
1231 static u32 trace_buf[TRACE_BUF_SIZE];
1290 1232
1291 if (trace_flags & TRACE_ITER_PREEMPTONLY) 1233 struct ring_buffer_event *event;
1292 trace_ops.func = function_trace_call_preempt_only; 1234 struct trace_array *tr = &global_trace;
1293 else 1235 struct trace_array_cpu *data;
1294 trace_ops.func = function_trace_call; 1236 struct bprint_entry *entry;
1237 unsigned long flags;
1238 int resched;
1239 int cpu, len = 0, size, pc;
1240
1241 if (unlikely(tracing_selftest_running || tracing_disabled))
1242 return 0;
1243
1244 /* Don't pollute graph traces with trace_vprintk internals */
1245 pause_graph_tracing();
1246
1247 pc = preempt_count();
1248 resched = ftrace_preempt_disable();
1249 cpu = raw_smp_processor_id();
1250 data = tr->data[cpu];
1251
1252 if (unlikely(atomic_read(&data->disabled)))
1253 goto out;
1254
1255 /* Lockdep uses trace_printk for lock tracing */
1256 local_irq_save(flags);
1257 __raw_spin_lock(&trace_buf_lock);
1258 len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args);
1259
1260 if (len > TRACE_BUF_SIZE || len < 0)
1261 goto out_unlock;
1262
1263 size = sizeof(*entry) + sizeof(u32) * len;
1264 event = trace_buffer_lock_reserve(tr, TRACE_BPRINT, size, flags, pc);
1265 if (!event)
1266 goto out_unlock;
1267 entry = ring_buffer_event_data(event);
1268 entry->ip = ip;
1269 entry->fmt = fmt;
1270
1271 memcpy(entry->buf, trace_buf, sizeof(u32) * len);
1272 ring_buffer_unlock_commit(tr->buffer, event);
1273
1274out_unlock:
1275 __raw_spin_unlock(&trace_buf_lock);
1276 local_irq_restore(flags);
1277
1278out:
1279 ftrace_preempt_enable(resched);
1280 unpause_graph_tracing();
1295 1281
1296 register_ftrace_function(&trace_ops); 1282 return len;
1297 ftrace_function_enabled = 1;
1298} 1283}
1284EXPORT_SYMBOL_GPL(trace_vbprintk);
1299 1285
1300void tracing_stop_function_trace(void) 1286int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
1301{ 1287{
1302 ftrace_function_enabled = 0; 1288 static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED;
1303 unregister_ftrace_function(&trace_ops); 1289 static char trace_buf[TRACE_BUF_SIZE];
1290
1291 struct ring_buffer_event *event;
1292 struct trace_array *tr = &global_trace;
1293 struct trace_array_cpu *data;
1294 int cpu, len = 0, size, pc;
1295 struct print_entry *entry;
1296 unsigned long irq_flags;
1297
1298 if (tracing_disabled || tracing_selftest_running)
1299 return 0;
1300
1301 pc = preempt_count();
1302 preempt_disable_notrace();
1303 cpu = raw_smp_processor_id();
1304 data = tr->data[cpu];
1305
1306 if (unlikely(atomic_read(&data->disabled)))
1307 goto out;
1308
1309 pause_graph_tracing();
1310 raw_local_irq_save(irq_flags);
1311 __raw_spin_lock(&trace_buf_lock);
1312 len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
1313
1314 len = min(len, TRACE_BUF_SIZE-1);
1315 trace_buf[len] = 0;
1316
1317 size = sizeof(*entry) + len + 1;
1318 event = trace_buffer_lock_reserve(tr, TRACE_PRINT, size, irq_flags, pc);
1319 if (!event)
1320 goto out_unlock;
1321 entry = ring_buffer_event_data(event);
1322 entry->ip = ip;
1323
1324 memcpy(&entry->buf, trace_buf, len);
1325 entry->buf[len] = 0;
1326 ring_buffer_unlock_commit(tr->buffer, event);
1327
1328 out_unlock:
1329 __raw_spin_unlock(&trace_buf_lock);
1330 raw_local_irq_restore(irq_flags);
1331 unpause_graph_tracing();
1332 out:
1333 preempt_enable_notrace();
1334
1335 return len;
1304} 1336}
1305#endif 1337EXPORT_SYMBOL_GPL(trace_vprintk);
1306 1338
1307enum trace_file_type { 1339enum trace_file_type {
1308 TRACE_FILE_LAT_FMT = 1, 1340 TRACE_FILE_LAT_FMT = 1,
@@ -1345,10 +1377,25 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1345{ 1377{
1346 struct ring_buffer *buffer = iter->tr->buffer; 1378 struct ring_buffer *buffer = iter->tr->buffer;
1347 struct trace_entry *ent, *next = NULL; 1379 struct trace_entry *ent, *next = NULL;
1380 int cpu_file = iter->cpu_file;
1348 u64 next_ts = 0, ts; 1381 u64 next_ts = 0, ts;
1349 int next_cpu = -1; 1382 int next_cpu = -1;
1350 int cpu; 1383 int cpu;
1351 1384
1385 /*
1386 * If we are in a per_cpu trace file, don't bother by iterating over
1387 * all cpu and peek directly.
1388 */
1389 if (cpu_file > TRACE_PIPE_ALL_CPU) {
1390 if (ring_buffer_empty_cpu(buffer, cpu_file))
1391 return NULL;
1392 ent = peek_next_entry(iter, cpu_file, ent_ts);
1393 if (ent_cpu)
1394 *ent_cpu = cpu_file;
1395
1396 return ent;
1397 }
1398
1352 for_each_tracing_cpu(cpu) { 1399 for_each_tracing_cpu(cpu) {
1353 1400
1354 if (ring_buffer_empty_cpu(buffer, cpu)) 1401 if (ring_buffer_empty_cpu(buffer, cpu))
@@ -1376,8 +1423,8 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1376} 1423}
1377 1424
1378/* Find the next real entry, without updating the iterator itself */ 1425/* Find the next real entry, without updating the iterator itself */
1379static struct trace_entry * 1426struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
1380find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) 1427 int *ent_cpu, u64 *ent_ts)
1381{ 1428{
1382 return __find_next_entry(iter, ent_cpu, ent_ts); 1429 return __find_next_entry(iter, ent_cpu, ent_ts);
1383} 1430}
@@ -1426,19 +1473,32 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
1426 return ent; 1473 return ent;
1427} 1474}
1428 1475
1476/*
1477 * No necessary locking here. The worst thing which can
1478 * happen is loosing events consumed at the same time
1479 * by a trace_pipe reader.
1480 * Other than that, we don't risk to crash the ring buffer
1481 * because it serializes the readers.
1482 *
1483 * The current tracer is copied to avoid a global locking
1484 * all around.
1485 */
1429static void *s_start(struct seq_file *m, loff_t *pos) 1486static void *s_start(struct seq_file *m, loff_t *pos)
1430{ 1487{
1431 struct trace_iterator *iter = m->private; 1488 struct trace_iterator *iter = m->private;
1489 static struct tracer *old_tracer;
1490 int cpu_file = iter->cpu_file;
1432 void *p = NULL; 1491 void *p = NULL;
1433 loff_t l = 0; 1492 loff_t l = 0;
1434 int cpu; 1493 int cpu;
1435 1494
1495 /* copy the tracer to avoid using a global lock all around */
1436 mutex_lock(&trace_types_lock); 1496 mutex_lock(&trace_types_lock);
1437 1497 if (unlikely(old_tracer != current_trace && current_trace)) {
1438 if (!current_trace || current_trace != iter->trace) { 1498 old_tracer = current_trace;
1439 mutex_unlock(&trace_types_lock); 1499 *iter->trace = *current_trace;
1440 return NULL;
1441 } 1500 }
1501 mutex_unlock(&trace_types_lock);
1442 1502
1443 atomic_inc(&trace_record_cmdline_disabled); 1503 atomic_inc(&trace_record_cmdline_disabled);
1444 1504
@@ -1449,9 +1509,12 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1449 1509
1450 ftrace_disable_cpu(); 1510 ftrace_disable_cpu();
1451 1511
1452 for_each_tracing_cpu(cpu) { 1512 if (cpu_file == TRACE_PIPE_ALL_CPU) {
1453 ring_buffer_iter_reset(iter->buffer_iter[cpu]); 1513 for_each_tracing_cpu(cpu)
1454 } 1514 ring_buffer_iter_reset(iter->buffer_iter[cpu]);
1515 } else
1516 ring_buffer_iter_reset(iter->buffer_iter[cpu_file]);
1517
1455 1518
1456 ftrace_enable_cpu(); 1519 ftrace_enable_cpu();
1457 1520
@@ -1469,155 +1532,6 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1469static void s_stop(struct seq_file *m, void *p) 1532static void s_stop(struct seq_file *m, void *p)
1470{ 1533{
1471 atomic_dec(&trace_record_cmdline_disabled); 1534 atomic_dec(&trace_record_cmdline_disabled);
1472 mutex_unlock(&trace_types_lock);
1473}
1474
1475#ifdef CONFIG_KRETPROBES
1476static inline const char *kretprobed(const char *name)
1477{
1478 static const char tramp_name[] = "kretprobe_trampoline";
1479 int size = sizeof(tramp_name);
1480
1481 if (strncmp(tramp_name, name, size) == 0)
1482 return "[unknown/kretprobe'd]";
1483 return name;
1484}
1485#else
1486static inline const char *kretprobed(const char *name)
1487{
1488 return name;
1489}
1490#endif /* CONFIG_KRETPROBES */
1491
1492static int
1493seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
1494{
1495#ifdef CONFIG_KALLSYMS
1496 char str[KSYM_SYMBOL_LEN];
1497 const char *name;
1498
1499 kallsyms_lookup(address, NULL, NULL, NULL, str);
1500
1501 name = kretprobed(str);
1502
1503 return trace_seq_printf(s, fmt, name);
1504#endif
1505 return 1;
1506}
1507
1508static int
1509seq_print_sym_offset(struct trace_seq *s, const char *fmt,
1510 unsigned long address)
1511{
1512#ifdef CONFIG_KALLSYMS
1513 char str[KSYM_SYMBOL_LEN];
1514 const char *name;
1515
1516 sprint_symbol(str, address);
1517 name = kretprobed(str);
1518
1519 return trace_seq_printf(s, fmt, name);
1520#endif
1521 return 1;
1522}
1523
1524#ifndef CONFIG_64BIT
1525# define IP_FMT "%08lx"
1526#else
1527# define IP_FMT "%016lx"
1528#endif
1529
1530int
1531seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
1532{
1533 int ret;
1534
1535 if (!ip)
1536 return trace_seq_printf(s, "0");
1537
1538 if (sym_flags & TRACE_ITER_SYM_OFFSET)
1539 ret = seq_print_sym_offset(s, "%s", ip);
1540 else
1541 ret = seq_print_sym_short(s, "%s", ip);
1542
1543 if (!ret)
1544 return 0;
1545
1546 if (sym_flags & TRACE_ITER_SYM_ADDR)
1547 ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
1548 return ret;
1549}
1550
1551static inline int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
1552 unsigned long ip, unsigned long sym_flags)
1553{
1554 struct file *file = NULL;
1555 unsigned long vmstart = 0;
1556 int ret = 1;
1557
1558 if (mm) {
1559 const struct vm_area_struct *vma;
1560
1561 down_read(&mm->mmap_sem);
1562 vma = find_vma(mm, ip);
1563 if (vma) {
1564 file = vma->vm_file;
1565 vmstart = vma->vm_start;
1566 }
1567 if (file) {
1568 ret = trace_seq_path(s, &file->f_path);
1569 if (ret)
1570 ret = trace_seq_printf(s, "[+0x%lx]", ip - vmstart);
1571 }
1572 up_read(&mm->mmap_sem);
1573 }
1574 if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file))
1575 ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
1576 return ret;
1577}
1578
1579static int
1580seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
1581 unsigned long sym_flags)
1582{
1583 struct mm_struct *mm = NULL;
1584 int ret = 1;
1585 unsigned int i;
1586
1587 if (trace_flags & TRACE_ITER_SYM_USEROBJ) {
1588 struct task_struct *task;
1589 /*
1590 * we do the lookup on the thread group leader,
1591 * since individual threads might have already quit!
1592 */
1593 rcu_read_lock();
1594 task = find_task_by_vpid(entry->ent.tgid);
1595 if (task)
1596 mm = get_task_mm(task);
1597 rcu_read_unlock();
1598 }
1599
1600 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
1601 unsigned long ip = entry->caller[i];
1602
1603 if (ip == ULONG_MAX || !ret)
1604 break;
1605 if (i && ret)
1606 ret = trace_seq_puts(s, " <- ");
1607 if (!ip) {
1608 if (ret)
1609 ret = trace_seq_puts(s, "??");
1610 continue;
1611 }
1612 if (!ret)
1613 break;
1614 if (ret)
1615 ret = seq_print_user_ip(s, mm, ip, sym_flags);
1616 }
1617
1618 if (mm)
1619 mmput(mm);
1620 return ret;
1621} 1535}
1622 1536
1623static void print_lat_help_header(struct seq_file *m) 1537static void print_lat_help_header(struct seq_file *m)
@@ -1658,11 +1572,11 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
1658 total = entries + 1572 total = entries +
1659 ring_buffer_overruns(iter->tr->buffer); 1573 ring_buffer_overruns(iter->tr->buffer);
1660 1574
1661 seq_printf(m, "%s latency trace v1.1.5 on %s\n", 1575 seq_printf(m, "# %s latency trace v1.1.5 on %s\n",
1662 name, UTS_RELEASE); 1576 name, UTS_RELEASE);
1663 seq_puts(m, "-----------------------------------" 1577 seq_puts(m, "# -----------------------------------"
1664 "---------------------------------\n"); 1578 "---------------------------------\n");
1665 seq_printf(m, " latency: %lu us, #%lu/%lu, CPU#%d |" 1579 seq_printf(m, "# latency: %lu us, #%lu/%lu, CPU#%d |"
1666 " (M:%s VP:%d, KP:%d, SP:%d HP:%d", 1580 " (M:%s VP:%d, KP:%d, SP:%d HP:%d",
1667 nsecs_to_usecs(data->saved_latency), 1581 nsecs_to_usecs(data->saved_latency),
1668 entries, 1582 entries,
@@ -1684,121 +1598,24 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
1684#else 1598#else
1685 seq_puts(m, ")\n"); 1599 seq_puts(m, ")\n");
1686#endif 1600#endif
1687 seq_puts(m, " -----------------\n"); 1601 seq_puts(m, "# -----------------\n");
1688 seq_printf(m, " | task: %.16s-%d " 1602 seq_printf(m, "# | task: %.16s-%d "
1689 "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n", 1603 "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n",
1690 data->comm, data->pid, data->uid, data->nice, 1604 data->comm, data->pid, data->uid, data->nice,
1691 data->policy, data->rt_priority); 1605 data->policy, data->rt_priority);
1692 seq_puts(m, " -----------------\n"); 1606 seq_puts(m, "# -----------------\n");
1693 1607
1694 if (data->critical_start) { 1608 if (data->critical_start) {
1695 seq_puts(m, " => started at: "); 1609 seq_puts(m, "# => started at: ");
1696 seq_print_ip_sym(&iter->seq, data->critical_start, sym_flags); 1610 seq_print_ip_sym(&iter->seq, data->critical_start, sym_flags);
1697 trace_print_seq(m, &iter->seq); 1611 trace_print_seq(m, &iter->seq);
1698 seq_puts(m, "\n => ended at: "); 1612 seq_puts(m, "\n# => ended at: ");
1699 seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags); 1613 seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags);
1700 trace_print_seq(m, &iter->seq); 1614 trace_print_seq(m, &iter->seq);
1701 seq_puts(m, "\n"); 1615 seq_puts(m, "#\n");
1702 }
1703
1704 seq_puts(m, "\n");
1705}
1706
1707static void
1708lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
1709{
1710 int hardirq, softirq;
1711 char *comm;
1712
1713 comm = trace_find_cmdline(entry->pid);
1714
1715 trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid);
1716 trace_seq_printf(s, "%3d", cpu);
1717 trace_seq_printf(s, "%c%c",
1718 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
1719 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : '.',
1720 ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'));
1721
1722 hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
1723 softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
1724 if (hardirq && softirq) {
1725 trace_seq_putc(s, 'H');
1726 } else {
1727 if (hardirq) {
1728 trace_seq_putc(s, 'h');
1729 } else {
1730 if (softirq)
1731 trace_seq_putc(s, 's');
1732 else
1733 trace_seq_putc(s, '.');
1734 }
1735 }
1736
1737 if (entry->preempt_count)
1738 trace_seq_printf(s, "%x", entry->preempt_count);
1739 else
1740 trace_seq_puts(s, ".");
1741}
1742
1743unsigned long preempt_mark_thresh = 100;
1744
1745static void
1746lat_print_timestamp(struct trace_seq *s, u64 abs_usecs,
1747 unsigned long rel_usecs)
1748{
1749 trace_seq_printf(s, " %4lldus", abs_usecs);
1750 if (rel_usecs > preempt_mark_thresh)
1751 trace_seq_puts(s, "!: ");
1752 else if (rel_usecs > 1)
1753 trace_seq_puts(s, "+: ");
1754 else
1755 trace_seq_puts(s, " : ");
1756}
1757
1758static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
1759
1760static int task_state_char(unsigned long state)
1761{
1762 int bit = state ? __ffs(state) + 1 : 0;
1763
1764 return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?';
1765}
1766
1767/*
1768 * The message is supposed to contain an ending newline.
1769 * If the printing stops prematurely, try to add a newline of our own.
1770 */
1771void trace_seq_print_cont(struct trace_seq *s, struct trace_iterator *iter)
1772{
1773 struct trace_entry *ent;
1774 struct trace_field_cont *cont;
1775 bool ok = true;
1776
1777 ent = peek_next_entry(iter, iter->cpu, NULL);
1778 if (!ent || ent->type != TRACE_CONT) {
1779 trace_seq_putc(s, '\n');
1780 return;
1781 } 1616 }
1782 1617
1783 do { 1618 seq_puts(m, "#\n");
1784 cont = (struct trace_field_cont *)ent;
1785 if (ok)
1786 ok = (trace_seq_printf(s, "%s", cont->buf) > 0);
1787
1788 ftrace_disable_cpu();
1789
1790 if (iter->buffer_iter[iter->cpu])
1791 ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
1792 else
1793 ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL);
1794
1795 ftrace_enable_cpu();
1796
1797 ent = peek_next_entry(iter, iter->cpu, NULL);
1798 } while (ent && ent->type == TRACE_CONT);
1799
1800 if (!ok)
1801 trace_seq_putc(s, '\n');
1802} 1619}
1803 1620
1804static void test_cpu_buff_start(struct trace_iterator *iter) 1621static void test_cpu_buff_start(struct trace_iterator *iter)
@@ -1815,141 +1632,11 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
1815 return; 1632 return;
1816 1633
1817 cpumask_set_cpu(iter->cpu, iter->started); 1634 cpumask_set_cpu(iter->cpu, iter->started);
1818 trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu);
1819}
1820
1821static enum print_line_t
1822print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
1823{
1824 struct trace_seq *s = &iter->seq;
1825 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
1826 struct trace_entry *next_entry;
1827 unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE);
1828 struct trace_entry *entry = iter->ent;
1829 unsigned long abs_usecs;
1830 unsigned long rel_usecs;
1831 u64 next_ts;
1832 char *comm;
1833 int S, T;
1834 int i;
1835
1836 if (entry->type == TRACE_CONT)
1837 return TRACE_TYPE_HANDLED;
1838
1839 test_cpu_buff_start(iter);
1840
1841 next_entry = find_next_entry(iter, NULL, &next_ts);
1842 if (!next_entry)
1843 next_ts = iter->ts;
1844 rel_usecs = ns2usecs(next_ts - iter->ts);
1845 abs_usecs = ns2usecs(iter->ts - iter->tr->time_start);
1846
1847 if (verbose) {
1848 comm = trace_find_cmdline(entry->pid);
1849 trace_seq_printf(s, "%16s %5d %3d %d %08x %08x [%08lx]"
1850 " %ld.%03ldms (+%ld.%03ldms): ",
1851 comm,
1852 entry->pid, cpu, entry->flags,
1853 entry->preempt_count, trace_idx,
1854 ns2usecs(iter->ts),
1855 abs_usecs/1000,
1856 abs_usecs % 1000, rel_usecs/1000,
1857 rel_usecs % 1000);
1858 } else {
1859 lat_print_generic(s, entry, cpu);
1860 lat_print_timestamp(s, abs_usecs, rel_usecs);
1861 }
1862 switch (entry->type) {
1863 case TRACE_FN: {
1864 struct ftrace_entry *field;
1865
1866 trace_assign_type(field, entry);
1867
1868 seq_print_ip_sym(s, field->ip, sym_flags);
1869 trace_seq_puts(s, " (");
1870 seq_print_ip_sym(s, field->parent_ip, sym_flags);
1871 trace_seq_puts(s, ")\n");
1872 break;
1873 }
1874 case TRACE_CTX:
1875 case TRACE_WAKE: {
1876 struct ctx_switch_entry *field;
1877
1878 trace_assign_type(field, entry);
1879
1880 T = task_state_char(field->next_state);
1881 S = task_state_char(field->prev_state);
1882 comm = trace_find_cmdline(field->next_pid);
1883 trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
1884 field->prev_pid,
1885 field->prev_prio,
1886 S, entry->type == TRACE_CTX ? "==>" : " +",
1887 field->next_cpu,
1888 field->next_pid,
1889 field->next_prio,
1890 T, comm);
1891 break;
1892 }
1893 case TRACE_SPECIAL: {
1894 struct special_entry *field;
1895
1896 trace_assign_type(field, entry);
1897
1898 trace_seq_printf(s, "# %ld %ld %ld\n",
1899 field->arg1,
1900 field->arg2,
1901 field->arg3);
1902 break;
1903 }
1904 case TRACE_STACK: {
1905 struct stack_entry *field;
1906
1907 trace_assign_type(field, entry);
1908
1909 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
1910 if (i)
1911 trace_seq_puts(s, " <= ");
1912 seq_print_ip_sym(s, field->caller[i], sym_flags);
1913 }
1914 trace_seq_puts(s, "\n");
1915 break;
1916 }
1917 case TRACE_PRINT: {
1918 struct print_entry *field;
1919
1920 trace_assign_type(field, entry);
1921
1922 seq_print_ip_sym(s, field->ip, sym_flags);
1923 trace_seq_printf(s, ": %s", field->buf);
1924 if (entry->flags & TRACE_FLAG_CONT)
1925 trace_seq_print_cont(s, iter);
1926 break;
1927 }
1928 case TRACE_BRANCH: {
1929 struct trace_branch *field;
1930
1931 trace_assign_type(field, entry);
1932
1933 trace_seq_printf(s, "[%s] %s:%s:%d\n",
1934 field->correct ? " ok " : " MISS ",
1935 field->func,
1936 field->file,
1937 field->line);
1938 break;
1939 }
1940 case TRACE_USER_STACK: {
1941 struct userstack_entry *field;
1942
1943 trace_assign_type(field, entry);
1944 1635
1945 seq_print_userip_objs(field, s, sym_flags); 1636 /* Don't print started cpu buffer for the first entry of the trace */
1946 trace_seq_putc(s, '\n'); 1637 if (iter->idx > 1)
1947 break; 1638 trace_seq_printf(s, "##### CPU %u buffer started ####\n",
1948 } 1639 iter->cpu);
1949 default:
1950 trace_seq_printf(s, "Unknown type %d\n", entry->type);
1951 }
1952 return TRACE_TYPE_HANDLED;
1953} 1640}
1954 1641
1955static enum print_line_t print_trace_fmt(struct trace_iterator *iter) 1642static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
@@ -1957,333 +1644,84 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
1957 struct trace_seq *s = &iter->seq; 1644 struct trace_seq *s = &iter->seq;
1958 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); 1645 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
1959 struct trace_entry *entry; 1646 struct trace_entry *entry;
1960 unsigned long usec_rem; 1647 struct trace_event *event;
1961 unsigned long long t;
1962 unsigned long secs;
1963 char *comm;
1964 int ret;
1965 int S, T;
1966 int i;
1967 1648
1968 entry = iter->ent; 1649 entry = iter->ent;
1969 1650
1970 if (entry->type == TRACE_CONT)
1971 return TRACE_TYPE_HANDLED;
1972
1973 test_cpu_buff_start(iter); 1651 test_cpu_buff_start(iter);
1974 1652
1975 comm = trace_find_cmdline(iter->ent->pid); 1653 event = ftrace_find_event(entry->type);
1976
1977 t = ns2usecs(iter->ts);
1978 usec_rem = do_div(t, 1000000ULL);
1979 secs = (unsigned long)t;
1980
1981 ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid);
1982 if (!ret)
1983 return TRACE_TYPE_PARTIAL_LINE;
1984 ret = trace_seq_printf(s, "[%03d] ", iter->cpu);
1985 if (!ret)
1986 return TRACE_TYPE_PARTIAL_LINE;
1987 ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem);
1988 if (!ret)
1989 return TRACE_TYPE_PARTIAL_LINE;
1990
1991 switch (entry->type) {
1992 case TRACE_FN: {
1993 struct ftrace_entry *field;
1994
1995 trace_assign_type(field, entry);
1996
1997 ret = seq_print_ip_sym(s, field->ip, sym_flags);
1998 if (!ret)
1999 return TRACE_TYPE_PARTIAL_LINE;
2000 if ((sym_flags & TRACE_ITER_PRINT_PARENT) &&
2001 field->parent_ip) {
2002 ret = trace_seq_printf(s, " <-");
2003 if (!ret)
2004 return TRACE_TYPE_PARTIAL_LINE;
2005 ret = seq_print_ip_sym(s,
2006 field->parent_ip,
2007 sym_flags);
2008 if (!ret)
2009 return TRACE_TYPE_PARTIAL_LINE;
2010 }
2011 ret = trace_seq_printf(s, "\n");
2012 if (!ret)
2013 return TRACE_TYPE_PARTIAL_LINE;
2014 break;
2015 }
2016 case TRACE_CTX:
2017 case TRACE_WAKE: {
2018 struct ctx_switch_entry *field;
2019
2020 trace_assign_type(field, entry);
2021
2022 T = task_state_char(field->next_state);
2023 S = task_state_char(field->prev_state);
2024 ret = trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c\n",
2025 field->prev_pid,
2026 field->prev_prio,
2027 S,
2028 entry->type == TRACE_CTX ? "==>" : " +",
2029 field->next_cpu,
2030 field->next_pid,
2031 field->next_prio,
2032 T);
2033 if (!ret)
2034 return TRACE_TYPE_PARTIAL_LINE;
2035 break;
2036 }
2037 case TRACE_SPECIAL: {
2038 struct special_entry *field;
2039 1654
2040 trace_assign_type(field, entry); 1655 if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
2041 1656 if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
2042 ret = trace_seq_printf(s, "# %ld %ld %ld\n", 1657 if (!trace_print_lat_context(iter))
2043 field->arg1, 1658 goto partial;
2044 field->arg2, 1659 } else {
2045 field->arg3); 1660 if (!trace_print_context(iter))
2046 if (!ret) 1661 goto partial;
2047 return TRACE_TYPE_PARTIAL_LINE;
2048 break;
2049 }
2050 case TRACE_STACK: {
2051 struct stack_entry *field;
2052
2053 trace_assign_type(field, entry);
2054
2055 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
2056 if (i) {
2057 ret = trace_seq_puts(s, " <= ");
2058 if (!ret)
2059 return TRACE_TYPE_PARTIAL_LINE;
2060 }
2061 ret = seq_print_ip_sym(s, field->caller[i],
2062 sym_flags);
2063 if (!ret)
2064 return TRACE_TYPE_PARTIAL_LINE;
2065 } 1662 }
2066 ret = trace_seq_puts(s, "\n");
2067 if (!ret)
2068 return TRACE_TYPE_PARTIAL_LINE;
2069 break;
2070 }
2071 case TRACE_PRINT: {
2072 struct print_entry *field;
2073
2074 trace_assign_type(field, entry);
2075
2076 seq_print_ip_sym(s, field->ip, sym_flags);
2077 trace_seq_printf(s, ": %s", field->buf);
2078 if (entry->flags & TRACE_FLAG_CONT)
2079 trace_seq_print_cont(s, iter);
2080 break;
2081 }
2082 case TRACE_GRAPH_RET: {
2083 return print_graph_function(iter);
2084 } 1663 }
2085 case TRACE_GRAPH_ENT: {
2086 return print_graph_function(iter);
2087 }
2088 case TRACE_BRANCH: {
2089 struct trace_branch *field;
2090
2091 trace_assign_type(field, entry);
2092 1664
2093 trace_seq_printf(s, "[%s] %s:%s:%d\n", 1665 if (event)
2094 field->correct ? " ok " : " MISS ", 1666 return event->trace(iter, sym_flags);
2095 field->func,
2096 field->file,
2097 field->line);
2098 break;
2099 }
2100 case TRACE_USER_STACK: {
2101 struct userstack_entry *field;
2102 1667
2103 trace_assign_type(field, entry); 1668 if (!trace_seq_printf(s, "Unknown type %d\n", entry->type))
1669 goto partial;
2104 1670
2105 ret = seq_print_userip_objs(field, s, sym_flags);
2106 if (!ret)
2107 return TRACE_TYPE_PARTIAL_LINE;
2108 ret = trace_seq_putc(s, '\n');
2109 if (!ret)
2110 return TRACE_TYPE_PARTIAL_LINE;
2111 break;
2112 }
2113 }
2114 return TRACE_TYPE_HANDLED; 1671 return TRACE_TYPE_HANDLED;
1672partial:
1673 return TRACE_TYPE_PARTIAL_LINE;
2115} 1674}
2116 1675
2117static enum print_line_t print_raw_fmt(struct trace_iterator *iter) 1676static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
2118{ 1677{
2119 struct trace_seq *s = &iter->seq; 1678 struct trace_seq *s = &iter->seq;
2120 struct trace_entry *entry; 1679 struct trace_entry *entry;
2121 int ret; 1680 struct trace_event *event;
2122 int S, T;
2123 1681
2124 entry = iter->ent; 1682 entry = iter->ent;
2125 1683
2126 if (entry->type == TRACE_CONT) 1684 if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
2127 return TRACE_TYPE_HANDLED; 1685 if (!trace_seq_printf(s, "%d %d %llu ",
2128 1686 entry->pid, iter->cpu, iter->ts))
2129 ret = trace_seq_printf(s, "%d %d %llu ", 1687 goto partial;
2130 entry->pid, iter->cpu, iter->ts);
2131 if (!ret)
2132 return TRACE_TYPE_PARTIAL_LINE;
2133
2134 switch (entry->type) {
2135 case TRACE_FN: {
2136 struct ftrace_entry *field;
2137
2138 trace_assign_type(field, entry);
2139
2140 ret = trace_seq_printf(s, "%x %x\n",
2141 field->ip,
2142 field->parent_ip);
2143 if (!ret)
2144 return TRACE_TYPE_PARTIAL_LINE;
2145 break;
2146 } 1688 }
2147 case TRACE_CTX:
2148 case TRACE_WAKE: {
2149 struct ctx_switch_entry *field;
2150
2151 trace_assign_type(field, entry);
2152
2153 T = task_state_char(field->next_state);
2154 S = entry->type == TRACE_WAKE ? '+' :
2155 task_state_char(field->prev_state);
2156 ret = trace_seq_printf(s, "%d %d %c %d %d %d %c\n",
2157 field->prev_pid,
2158 field->prev_prio,
2159 S,
2160 field->next_cpu,
2161 field->next_pid,
2162 field->next_prio,
2163 T);
2164 if (!ret)
2165 return TRACE_TYPE_PARTIAL_LINE;
2166 break;
2167 }
2168 case TRACE_SPECIAL:
2169 case TRACE_USER_STACK:
2170 case TRACE_STACK: {
2171 struct special_entry *field;
2172
2173 trace_assign_type(field, entry);
2174 1689
2175 ret = trace_seq_printf(s, "# %ld %ld %ld\n", 1690 event = ftrace_find_event(entry->type);
2176 field->arg1, 1691 if (event)
2177 field->arg2, 1692 return event->raw(iter, 0);
2178 field->arg3);
2179 if (!ret)
2180 return TRACE_TYPE_PARTIAL_LINE;
2181 break;
2182 }
2183 case TRACE_PRINT: {
2184 struct print_entry *field;
2185 1693
2186 trace_assign_type(field, entry); 1694 if (!trace_seq_printf(s, "%d ?\n", entry->type))
1695 goto partial;
2187 1696
2188 trace_seq_printf(s, "# %lx %s", field->ip, field->buf);
2189 if (entry->flags & TRACE_FLAG_CONT)
2190 trace_seq_print_cont(s, iter);
2191 break;
2192 }
2193 }
2194 return TRACE_TYPE_HANDLED; 1697 return TRACE_TYPE_HANDLED;
1698partial:
1699 return TRACE_TYPE_PARTIAL_LINE;
2195} 1700}
2196 1701
2197#define SEQ_PUT_FIELD_RET(s, x) \
2198do { \
2199 if (!trace_seq_putmem(s, &(x), sizeof(x))) \
2200 return 0; \
2201} while (0)
2202
2203#define SEQ_PUT_HEX_FIELD_RET(s, x) \
2204do { \
2205 BUILD_BUG_ON(sizeof(x) > MAX_MEMHEX_BYTES); \
2206 if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \
2207 return 0; \
2208} while (0)
2209
2210static enum print_line_t print_hex_fmt(struct trace_iterator *iter) 1702static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
2211{ 1703{
2212 struct trace_seq *s = &iter->seq; 1704 struct trace_seq *s = &iter->seq;
2213 unsigned char newline = '\n'; 1705 unsigned char newline = '\n';
2214 struct trace_entry *entry; 1706 struct trace_entry *entry;
2215 int S, T; 1707 struct trace_event *event;
2216 1708
2217 entry = iter->ent; 1709 entry = iter->ent;
2218 1710
2219 if (entry->type == TRACE_CONT) 1711 if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
2220 return TRACE_TYPE_HANDLED; 1712 SEQ_PUT_HEX_FIELD_RET(s, entry->pid);
2221 1713 SEQ_PUT_HEX_FIELD_RET(s, iter->cpu);
2222 SEQ_PUT_HEX_FIELD_RET(s, entry->pid); 1714 SEQ_PUT_HEX_FIELD_RET(s, iter->ts);
2223 SEQ_PUT_HEX_FIELD_RET(s, iter->cpu);
2224 SEQ_PUT_HEX_FIELD_RET(s, iter->ts);
2225
2226 switch (entry->type) {
2227 case TRACE_FN: {
2228 struct ftrace_entry *field;
2229
2230 trace_assign_type(field, entry);
2231
2232 SEQ_PUT_HEX_FIELD_RET(s, field->ip);
2233 SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip);
2234 break;
2235 } 1715 }
2236 case TRACE_CTX:
2237 case TRACE_WAKE: {
2238 struct ctx_switch_entry *field;
2239
2240 trace_assign_type(field, entry);
2241
2242 T = task_state_char(field->next_state);
2243 S = entry->type == TRACE_WAKE ? '+' :
2244 task_state_char(field->prev_state);
2245 SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid);
2246 SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio);
2247 SEQ_PUT_HEX_FIELD_RET(s, S);
2248 SEQ_PUT_HEX_FIELD_RET(s, field->next_cpu);
2249 SEQ_PUT_HEX_FIELD_RET(s, field->next_pid);
2250 SEQ_PUT_HEX_FIELD_RET(s, field->next_prio);
2251 SEQ_PUT_HEX_FIELD_RET(s, T);
2252 break;
2253 }
2254 case TRACE_SPECIAL:
2255 case TRACE_USER_STACK:
2256 case TRACE_STACK: {
2257 struct special_entry *field;
2258
2259 trace_assign_type(field, entry);
2260 1716
2261 SEQ_PUT_HEX_FIELD_RET(s, field->arg1); 1717 event = ftrace_find_event(entry->type);
2262 SEQ_PUT_HEX_FIELD_RET(s, field->arg2); 1718 if (event) {
2263 SEQ_PUT_HEX_FIELD_RET(s, field->arg3); 1719 enum print_line_t ret = event->hex(iter, 0);
2264 break; 1720 if (ret != TRACE_TYPE_HANDLED)
2265 } 1721 return ret;
2266 } 1722 }
2267 SEQ_PUT_FIELD_RET(s, newline);
2268
2269 return TRACE_TYPE_HANDLED;
2270}
2271
2272static enum print_line_t print_printk_msg_only(struct trace_iterator *iter)
2273{
2274 struct trace_seq *s = &iter->seq;
2275 struct trace_entry *entry = iter->ent;
2276 struct print_entry *field;
2277 int ret;
2278
2279 trace_assign_type(field, entry);
2280 1723
2281 ret = trace_seq_printf(s, field->buf); 1724 SEQ_PUT_FIELD_RET(s, newline);
2282 if (!ret)
2283 return TRACE_TYPE_PARTIAL_LINE;
2284
2285 if (entry->flags & TRACE_FLAG_CONT)
2286 trace_seq_print_cont(s, iter);
2287 1725
2288 return TRACE_TYPE_HANDLED; 1726 return TRACE_TYPE_HANDLED;
2289} 1727}
@@ -2292,59 +1730,37 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
2292{ 1730{
2293 struct trace_seq *s = &iter->seq; 1731 struct trace_seq *s = &iter->seq;
2294 struct trace_entry *entry; 1732 struct trace_entry *entry;
1733 struct trace_event *event;
2295 1734
2296 entry = iter->ent; 1735 entry = iter->ent;
2297 1736
2298 if (entry->type == TRACE_CONT) 1737 if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
2299 return TRACE_TYPE_HANDLED; 1738 SEQ_PUT_FIELD_RET(s, entry->pid);
2300 1739 SEQ_PUT_FIELD_RET(s, iter->cpu);
2301 SEQ_PUT_FIELD_RET(s, entry->pid); 1740 SEQ_PUT_FIELD_RET(s, iter->ts);
2302 SEQ_PUT_FIELD_RET(s, entry->cpu);
2303 SEQ_PUT_FIELD_RET(s, iter->ts);
2304
2305 switch (entry->type) {
2306 case TRACE_FN: {
2307 struct ftrace_entry *field;
2308
2309 trace_assign_type(field, entry);
2310
2311 SEQ_PUT_FIELD_RET(s, field->ip);
2312 SEQ_PUT_FIELD_RET(s, field->parent_ip);
2313 break;
2314 }
2315 case TRACE_CTX: {
2316 struct ctx_switch_entry *field;
2317
2318 trace_assign_type(field, entry);
2319
2320 SEQ_PUT_FIELD_RET(s, field->prev_pid);
2321 SEQ_PUT_FIELD_RET(s, field->prev_prio);
2322 SEQ_PUT_FIELD_RET(s, field->prev_state);
2323 SEQ_PUT_FIELD_RET(s, field->next_pid);
2324 SEQ_PUT_FIELD_RET(s, field->next_prio);
2325 SEQ_PUT_FIELD_RET(s, field->next_state);
2326 break;
2327 } 1741 }
2328 case TRACE_SPECIAL:
2329 case TRACE_USER_STACK:
2330 case TRACE_STACK: {
2331 struct special_entry *field;
2332
2333 trace_assign_type(field, entry);
2334 1742
2335 SEQ_PUT_FIELD_RET(s, field->arg1); 1743 event = ftrace_find_event(entry->type);
2336 SEQ_PUT_FIELD_RET(s, field->arg2); 1744 return event ? event->binary(iter, 0) : TRACE_TYPE_HANDLED;
2337 SEQ_PUT_FIELD_RET(s, field->arg3);
2338 break;
2339 }
2340 }
2341 return 1;
2342} 1745}
2343 1746
2344static int trace_empty(struct trace_iterator *iter) 1747static int trace_empty(struct trace_iterator *iter)
2345{ 1748{
2346 int cpu; 1749 int cpu;
2347 1750
1751 /* If we are looking at one CPU buffer, only check that one */
1752 if (iter->cpu_file != TRACE_PIPE_ALL_CPU) {
1753 cpu = iter->cpu_file;
1754 if (iter->buffer_iter[cpu]) {
1755 if (!ring_buffer_iter_empty(iter->buffer_iter[cpu]))
1756 return 0;
1757 } else {
1758 if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu))
1759 return 0;
1760 }
1761 return 1;
1762 }
1763
2348 for_each_tracing_cpu(cpu) { 1764 for_each_tracing_cpu(cpu) {
2349 if (iter->buffer_iter[cpu]) { 1765 if (iter->buffer_iter[cpu]) {
2350 if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) 1766 if (!ring_buffer_iter_empty(iter->buffer_iter[cpu]))
@@ -2368,10 +1784,15 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
2368 return ret; 1784 return ret;
2369 } 1785 }
2370 1786
1787 if (iter->ent->type == TRACE_BPRINT &&
1788 trace_flags & TRACE_ITER_PRINTK &&
1789 trace_flags & TRACE_ITER_PRINTK_MSGONLY)
1790 return trace_print_bprintk_msg_only(iter);
1791
2371 if (iter->ent->type == TRACE_PRINT && 1792 if (iter->ent->type == TRACE_PRINT &&
2372 trace_flags & TRACE_ITER_PRINTK && 1793 trace_flags & TRACE_ITER_PRINTK &&
2373 trace_flags & TRACE_ITER_PRINTK_MSGONLY) 1794 trace_flags & TRACE_ITER_PRINTK_MSGONLY)
2374 return print_printk_msg_only(iter); 1795 return trace_print_printk_msg_only(iter);
2375 1796
2376 if (trace_flags & TRACE_ITER_BIN) 1797 if (trace_flags & TRACE_ITER_BIN)
2377 return print_bin_fmt(iter); 1798 return print_bin_fmt(iter);
@@ -2382,9 +1803,6 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
2382 if (trace_flags & TRACE_ITER_RAW) 1803 if (trace_flags & TRACE_ITER_RAW)
2383 return print_raw_fmt(iter); 1804 return print_raw_fmt(iter);
2384 1805
2385 if (iter->iter_flags & TRACE_FILE_LAT_FMT)
2386 return print_lat_fmt(iter, iter->idx, iter->cpu);
2387
2388 return print_trace_fmt(iter); 1806 return print_trace_fmt(iter);
2389} 1807}
2390 1808
@@ -2426,30 +1844,45 @@ static struct seq_operations tracer_seq_ops = {
2426}; 1844};
2427 1845
2428static struct trace_iterator * 1846static struct trace_iterator *
2429__tracing_open(struct inode *inode, struct file *file, int *ret) 1847__tracing_open(struct inode *inode, struct file *file)
2430{ 1848{
1849 long cpu_file = (long) inode->i_private;
1850 void *fail_ret = ERR_PTR(-ENOMEM);
2431 struct trace_iterator *iter; 1851 struct trace_iterator *iter;
2432 struct seq_file *m; 1852 struct seq_file *m;
2433 int cpu; 1853 int cpu, ret;
2434 1854
2435 if (tracing_disabled) { 1855 if (tracing_disabled)
2436 *ret = -ENODEV; 1856 return ERR_PTR(-ENODEV);
2437 return NULL;
2438 }
2439 1857
2440 iter = kzalloc(sizeof(*iter), GFP_KERNEL); 1858 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
2441 if (!iter) { 1859 if (!iter)
2442 *ret = -ENOMEM; 1860 return ERR_PTR(-ENOMEM);
2443 goto out;
2444 }
2445 1861
1862 /*
1863 * We make a copy of the current tracer to avoid concurrent
1864 * changes on it while we are reading.
1865 */
2446 mutex_lock(&trace_types_lock); 1866 mutex_lock(&trace_types_lock);
1867 iter->trace = kzalloc(sizeof(*iter->trace), GFP_KERNEL);
1868 if (!iter->trace)
1869 goto fail;
1870
1871 if (current_trace)
1872 *iter->trace = *current_trace;
1873
1874 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL))
1875 goto fail;
1876
1877 cpumask_clear(iter->started);
1878
2447 if (current_trace && current_trace->print_max) 1879 if (current_trace && current_trace->print_max)
2448 iter->tr = &max_tr; 1880 iter->tr = &max_tr;
2449 else 1881 else
2450 iter->tr = inode->i_private; 1882 iter->tr = &global_trace;
2451 iter->trace = current_trace;
2452 iter->pos = -1; 1883 iter->pos = -1;
1884 mutex_init(&iter->mutex);
1885 iter->cpu_file = cpu_file;
2453 1886
2454 /* Notify the tracer early; before we stop tracing. */ 1887 /* Notify the tracer early; before we stop tracing. */
2455 if (iter->trace && iter->trace->open) 1888 if (iter->trace && iter->trace->open)
@@ -2459,20 +1892,24 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
2459 if (ring_buffer_overruns(iter->tr->buffer)) 1892 if (ring_buffer_overruns(iter->tr->buffer))
2460 iter->iter_flags |= TRACE_FILE_ANNOTATE; 1893 iter->iter_flags |= TRACE_FILE_ANNOTATE;
2461 1894
1895 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) {
1896 for_each_tracing_cpu(cpu) {
2462 1897
2463 for_each_tracing_cpu(cpu) { 1898 iter->buffer_iter[cpu] =
2464 1899 ring_buffer_read_start(iter->tr->buffer, cpu);
1900 }
1901 } else {
1902 cpu = iter->cpu_file;
2465 iter->buffer_iter[cpu] = 1903 iter->buffer_iter[cpu] =
2466 ring_buffer_read_start(iter->tr->buffer, cpu); 1904 ring_buffer_read_start(iter->tr->buffer, cpu);
2467
2468 if (!iter->buffer_iter[cpu])
2469 goto fail_buffer;
2470 } 1905 }
2471 1906
2472 /* TODO stop tracer */ 1907 /* TODO stop tracer */
2473 *ret = seq_open(file, &tracer_seq_ops); 1908 ret = seq_open(file, &tracer_seq_ops);
2474 if (*ret) 1909 if (ret < 0) {
1910 fail_ret = ERR_PTR(ret);
2475 goto fail_buffer; 1911 goto fail_buffer;
1912 }
2476 1913
2477 m = file->private_data; 1914 m = file->private_data;
2478 m->private = iter; 1915 m->private = iter;
@@ -2482,7 +1919,6 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
2482 1919
2483 mutex_unlock(&trace_types_lock); 1920 mutex_unlock(&trace_types_lock);
2484 1921
2485 out:
2486 return iter; 1922 return iter;
2487 1923
2488 fail_buffer: 1924 fail_buffer:
@@ -2490,10 +1926,13 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
2490 if (iter->buffer_iter[cpu]) 1926 if (iter->buffer_iter[cpu])
2491 ring_buffer_read_finish(iter->buffer_iter[cpu]); 1927 ring_buffer_read_finish(iter->buffer_iter[cpu]);
2492 } 1928 }
1929 free_cpumask_var(iter->started);
1930 fail:
2493 mutex_unlock(&trace_types_lock); 1931 mutex_unlock(&trace_types_lock);
1932 kfree(iter->trace);
2494 kfree(iter); 1933 kfree(iter);
2495 1934
2496 return ERR_PTR(-ENOMEM); 1935 return fail_ret;
2497} 1936}
2498 1937
2499int tracing_open_generic(struct inode *inode, struct file *filp) 1938int tracing_open_generic(struct inode *inode, struct file *filp)
@@ -2505,12 +1944,17 @@ int tracing_open_generic(struct inode *inode, struct file *filp)
2505 return 0; 1944 return 0;
2506} 1945}
2507 1946
2508int tracing_release(struct inode *inode, struct file *file) 1947static int tracing_release(struct inode *inode, struct file *file)
2509{ 1948{
2510 struct seq_file *m = (struct seq_file *)file->private_data; 1949 struct seq_file *m = (struct seq_file *)file->private_data;
2511 struct trace_iterator *iter = m->private; 1950 struct trace_iterator *iter;
2512 int cpu; 1951 int cpu;
2513 1952
1953 if (!(file->f_mode & FMODE_READ))
1954 return 0;
1955
1956 iter = m->private;
1957
2514 mutex_lock(&trace_types_lock); 1958 mutex_lock(&trace_types_lock);
2515 for_each_tracing_cpu(cpu) { 1959 for_each_tracing_cpu(cpu) {
2516 if (iter->buffer_iter[cpu]) 1960 if (iter->buffer_iter[cpu])
@@ -2525,33 +1969,39 @@ int tracing_release(struct inode *inode, struct file *file)
2525 mutex_unlock(&trace_types_lock); 1969 mutex_unlock(&trace_types_lock);
2526 1970
2527 seq_release(inode, file); 1971 seq_release(inode, file);
1972 mutex_destroy(&iter->mutex);
1973 free_cpumask_var(iter->started);
1974 kfree(iter->trace);
2528 kfree(iter); 1975 kfree(iter);
2529 return 0; 1976 return 0;
2530} 1977}
2531 1978
2532static int tracing_open(struct inode *inode, struct file *file) 1979static int tracing_open(struct inode *inode, struct file *file)
2533{ 1980{
2534 int ret;
2535
2536 __tracing_open(inode, file, &ret);
2537
2538 return ret;
2539}
2540
2541static int tracing_lt_open(struct inode *inode, struct file *file)
2542{
2543 struct trace_iterator *iter; 1981 struct trace_iterator *iter;
2544 int ret; 1982 int ret = 0;
2545 1983
2546 iter = __tracing_open(inode, file, &ret); 1984 /* If this file was open for write, then erase contents */
1985 if ((file->f_mode & FMODE_WRITE) &&
1986 !(file->f_flags & O_APPEND)) {
1987 long cpu = (long) inode->i_private;
2547 1988
2548 if (!ret) 1989 if (cpu == TRACE_PIPE_ALL_CPU)
2549 iter->iter_flags |= TRACE_FILE_LAT_FMT; 1990 tracing_reset_online_cpus(&global_trace);
1991 else
1992 tracing_reset(&global_trace, cpu);
1993 }
2550 1994
1995 if (file->f_mode & FMODE_READ) {
1996 iter = __tracing_open(inode, file);
1997 if (IS_ERR(iter))
1998 ret = PTR_ERR(iter);
1999 else if (trace_flags & TRACE_ITER_LATENCY_FMT)
2000 iter->iter_flags |= TRACE_FILE_LAT_FMT;
2001 }
2551 return ret; 2002 return ret;
2552} 2003}
2553 2004
2554
2555static void * 2005static void *
2556t_next(struct seq_file *m, void *v, loff_t *pos) 2006t_next(struct seq_file *m, void *v, loff_t *pos)
2557{ 2007{
@@ -2623,21 +2073,22 @@ static int show_traces_open(struct inode *inode, struct file *file)
2623 return ret; 2073 return ret;
2624} 2074}
2625 2075
2626static struct file_operations tracing_fops = { 2076static ssize_t
2627 .open = tracing_open, 2077tracing_write_stub(struct file *filp, const char __user *ubuf,
2628 .read = seq_read, 2078 size_t count, loff_t *ppos)
2629 .llseek = seq_lseek, 2079{
2630 .release = tracing_release, 2080 return count;
2631}; 2081}
2632 2082
2633static struct file_operations tracing_lt_fops = { 2083static const struct file_operations tracing_fops = {
2634 .open = tracing_lt_open, 2084 .open = tracing_open,
2635 .read = seq_read, 2085 .read = seq_read,
2086 .write = tracing_write_stub,
2636 .llseek = seq_lseek, 2087 .llseek = seq_lseek,
2637 .release = tracing_release, 2088 .release = tracing_release,
2638}; 2089};
2639 2090
2640static struct file_operations show_traces_fops = { 2091static const struct file_operations show_traces_fops = {
2641 .open = show_traces_open, 2092 .open = show_traces_open,
2642 .read = seq_read, 2093 .read = seq_read,
2643 .release = seq_release, 2094 .release = seq_release,
@@ -2730,7 +2181,7 @@ err_unlock:
2730 return err; 2181 return err;
2731} 2182}
2732 2183
2733static struct file_operations tracing_cpumask_fops = { 2184static const struct file_operations tracing_cpumask_fops = {
2734 .open = tracing_open_generic, 2185 .open = tracing_open_generic,
2735 .read = tracing_cpumask_read, 2186 .read = tracing_cpumask_read,
2736 .write = tracing_cpumask_write, 2187 .write = tracing_cpumask_write,
@@ -2740,57 +2191,62 @@ static ssize_t
2740tracing_trace_options_read(struct file *filp, char __user *ubuf, 2191tracing_trace_options_read(struct file *filp, char __user *ubuf,
2741 size_t cnt, loff_t *ppos) 2192 size_t cnt, loff_t *ppos)
2742{ 2193{
2743 int i; 2194 struct tracer_opt *trace_opts;
2195 u32 tracer_flags;
2196 int len = 0;
2744 char *buf; 2197 char *buf;
2745 int r = 0; 2198 int r = 0;
2746 int len = 0; 2199 int i;
2747 u32 tracer_flags = current_trace->flags->val;
2748 struct tracer_opt *trace_opts = current_trace->flags->opts;
2749 2200
2750 2201
2751 /* calulate max size */ 2202 /* calculate max size */
2752 for (i = 0; trace_options[i]; i++) { 2203 for (i = 0; trace_options[i]; i++) {
2753 len += strlen(trace_options[i]); 2204 len += strlen(trace_options[i]);
2754 len += 3; /* "no" and space */ 2205 len += 3; /* "no" and newline */
2755 } 2206 }
2756 2207
2208 mutex_lock(&trace_types_lock);
2209 tracer_flags = current_trace->flags->val;
2210 trace_opts = current_trace->flags->opts;
2211
2757 /* 2212 /*
2758 * Increase the size with names of options specific 2213 * Increase the size with names of options specific
2759 * of the current tracer. 2214 * of the current tracer.
2760 */ 2215 */
2761 for (i = 0; trace_opts[i].name; i++) { 2216 for (i = 0; trace_opts[i].name; i++) {
2762 len += strlen(trace_opts[i].name); 2217 len += strlen(trace_opts[i].name);
2763 len += 3; /* "no" and space */ 2218 len += 3; /* "no" and newline */
2764 } 2219 }
2765 2220
2766 /* +2 for \n and \0 */ 2221 /* +2 for \n and \0 */
2767 buf = kmalloc(len + 2, GFP_KERNEL); 2222 buf = kmalloc(len + 2, GFP_KERNEL);
2768 if (!buf) 2223 if (!buf) {
2224 mutex_unlock(&trace_types_lock);
2769 return -ENOMEM; 2225 return -ENOMEM;
2226 }
2770 2227
2771 for (i = 0; trace_options[i]; i++) { 2228 for (i = 0; trace_options[i]; i++) {
2772 if (trace_flags & (1 << i)) 2229 if (trace_flags & (1 << i))
2773 r += sprintf(buf + r, "%s ", trace_options[i]); 2230 r += sprintf(buf + r, "%s\n", trace_options[i]);
2774 else 2231 else
2775 r += sprintf(buf + r, "no%s ", trace_options[i]); 2232 r += sprintf(buf + r, "no%s\n", trace_options[i]);
2776 } 2233 }
2777 2234
2778 for (i = 0; trace_opts[i].name; i++) { 2235 for (i = 0; trace_opts[i].name; i++) {
2779 if (tracer_flags & trace_opts[i].bit) 2236 if (tracer_flags & trace_opts[i].bit)
2780 r += sprintf(buf + r, "%s ", 2237 r += sprintf(buf + r, "%s\n",
2781 trace_opts[i].name); 2238 trace_opts[i].name);
2782 else 2239 else
2783 r += sprintf(buf + r, "no%s ", 2240 r += sprintf(buf + r, "no%s\n",
2784 trace_opts[i].name); 2241 trace_opts[i].name);
2785 } 2242 }
2243 mutex_unlock(&trace_types_lock);
2786 2244
2787 r += sprintf(buf + r, "\n");
2788 WARN_ON(r >= len + 2); 2245 WARN_ON(r >= len + 2);
2789 2246
2790 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 2247 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2791 2248
2792 kfree(buf); 2249 kfree(buf);
2793
2794 return r; 2250 return r;
2795} 2251}
2796 2252
@@ -2828,6 +2284,34 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
2828 return 0; 2284 return 0;
2829} 2285}
2830 2286
2287static void set_tracer_flags(unsigned int mask, int enabled)
2288{
2289 /* do nothing if flag is already set */
2290 if (!!(trace_flags & mask) == !!enabled)
2291 return;
2292
2293 if (enabled)
2294 trace_flags |= mask;
2295 else
2296 trace_flags &= ~mask;
2297
2298 if (mask == TRACE_ITER_GLOBAL_CLK) {
2299 u64 (*func)(void);
2300
2301 if (enabled)
2302 func = trace_clock_global;
2303 else
2304 func = trace_clock_local;
2305
2306 mutex_lock(&trace_types_lock);
2307 ring_buffer_set_clock(global_trace.buffer, func);
2308
2309 if (max_tr.buffer)
2310 ring_buffer_set_clock(max_tr.buffer, func);
2311 mutex_unlock(&trace_types_lock);
2312 }
2313}
2314
2831static ssize_t 2315static ssize_t
2832tracing_trace_options_write(struct file *filp, const char __user *ubuf, 2316tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2833 size_t cnt, loff_t *ppos) 2317 size_t cnt, loff_t *ppos)
@@ -2855,17 +2339,16 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2855 int len = strlen(trace_options[i]); 2339 int len = strlen(trace_options[i]);
2856 2340
2857 if (strncmp(cmp, trace_options[i], len) == 0) { 2341 if (strncmp(cmp, trace_options[i], len) == 0) {
2858 if (neg) 2342 set_tracer_flags(1 << i, !neg);
2859 trace_flags &= ~(1 << i);
2860 else
2861 trace_flags |= (1 << i);
2862 break; 2343 break;
2863 } 2344 }
2864 } 2345 }
2865 2346
2866 /* If no option could be set, test the specific tracer options */ 2347 /* If no option could be set, test the specific tracer options */
2867 if (!trace_options[i]) { 2348 if (!trace_options[i]) {
2349 mutex_lock(&trace_types_lock);
2868 ret = set_tracer_option(current_trace, cmp, neg); 2350 ret = set_tracer_option(current_trace, cmp, neg);
2351 mutex_unlock(&trace_types_lock);
2869 if (ret) 2352 if (ret)
2870 return ret; 2353 return ret;
2871 } 2354 }
@@ -2875,7 +2358,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2875 return cnt; 2358 return cnt;
2876} 2359}
2877 2360
2878static struct file_operations tracing_iter_fops = { 2361static const struct file_operations tracing_iter_fops = {
2879 .open = tracing_open_generic, 2362 .open = tracing_open_generic,
2880 .read = tracing_trace_options_read, 2363 .read = tracing_trace_options_read,
2881 .write = tracing_trace_options_write, 2364 .write = tracing_trace_options_write,
@@ -2886,9 +2369,9 @@ static const char readme_msg[] =
2886 "# mkdir /debug\n" 2369 "# mkdir /debug\n"
2887 "# mount -t debugfs nodev /debug\n\n" 2370 "# mount -t debugfs nodev /debug\n\n"
2888 "# cat /debug/tracing/available_tracers\n" 2371 "# cat /debug/tracing/available_tracers\n"
2889 "wakeup preemptirqsoff preemptoff irqsoff ftrace sched_switch none\n\n" 2372 "wakeup preemptirqsoff preemptoff irqsoff function sched_switch nop\n\n"
2890 "# cat /debug/tracing/current_tracer\n" 2373 "# cat /debug/tracing/current_tracer\n"
2891 "none\n" 2374 "nop\n"
2892 "# echo sched_switch > /debug/tracing/current_tracer\n" 2375 "# echo sched_switch > /debug/tracing/current_tracer\n"
2893 "# cat /debug/tracing/current_tracer\n" 2376 "# cat /debug/tracing/current_tracer\n"
2894 "sched_switch\n" 2377 "sched_switch\n"
@@ -2897,7 +2380,7 @@ static const char readme_msg[] =
2897 "# echo print-parent > /debug/tracing/trace_options\n" 2380 "# echo print-parent > /debug/tracing/trace_options\n"
2898 "# echo 1 > /debug/tracing/tracing_enabled\n" 2381 "# echo 1 > /debug/tracing/tracing_enabled\n"
2899 "# cat /debug/tracing/trace > /tmp/trace.txt\n" 2382 "# cat /debug/tracing/trace > /tmp/trace.txt\n"
2900 "echo 0 > /debug/tracing/tracing_enabled\n" 2383 "# echo 0 > /debug/tracing/tracing_enabled\n"
2901; 2384;
2902 2385
2903static ssize_t 2386static ssize_t
@@ -2908,7 +2391,7 @@ tracing_readme_read(struct file *filp, char __user *ubuf,
2908 readme_msg, strlen(readme_msg)); 2391 readme_msg, strlen(readme_msg));
2909} 2392}
2910 2393
2911static struct file_operations tracing_readme_fops = { 2394static const struct file_operations tracing_readme_fops = {
2912 .open = tracing_open_generic, 2395 .open = tracing_open_generic,
2913 .read = tracing_readme_read, 2396 .read = tracing_readme_read,
2914}; 2397};
@@ -2930,7 +2413,7 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
2930{ 2413{
2931 struct trace_array *tr = filp->private_data; 2414 struct trace_array *tr = filp->private_data;
2932 char buf[64]; 2415 char buf[64];
2933 long val; 2416 unsigned long val;
2934 int ret; 2417 int ret;
2935 2418
2936 if (cnt >= sizeof(buf)) 2419 if (cnt >= sizeof(buf))
@@ -2985,13 +2468,105 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf,
2985 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 2468 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2986} 2469}
2987 2470
2988static int tracing_set_tracer(char *buf) 2471int tracer_init(struct tracer *t, struct trace_array *tr)
2472{
2473 tracing_reset_online_cpus(tr);
2474 return t->init(tr);
2475}
2476
2477static int tracing_resize_ring_buffer(unsigned long size)
2478{
2479 int ret;
2480
2481 /*
2482 * If kernel or user changes the size of the ring buffer
2483 * we use the size that was given, and we can forget about
2484 * expanding it later.
2485 */
2486 ring_buffer_expanded = 1;
2487
2488 ret = ring_buffer_resize(global_trace.buffer, size);
2489 if (ret < 0)
2490 return ret;
2491
2492 ret = ring_buffer_resize(max_tr.buffer, size);
2493 if (ret < 0) {
2494 int r;
2495
2496 r = ring_buffer_resize(global_trace.buffer,
2497 global_trace.entries);
2498 if (r < 0) {
2499 /*
2500 * AARGH! We are left with different
2501 * size max buffer!!!!
2502 * The max buffer is our "snapshot" buffer.
2503 * When a tracer needs a snapshot (one of the
2504 * latency tracers), it swaps the max buffer
2505 * with the saved snap shot. We succeeded to
2506 * update the size of the main buffer, but failed to
2507 * update the size of the max buffer. But when we tried
2508 * to reset the main buffer to the original size, we
2509 * failed there too. This is very unlikely to
2510 * happen, but if it does, warn and kill all
2511 * tracing.
2512 */
2513 WARN_ON(1);
2514 tracing_disabled = 1;
2515 }
2516 return ret;
2517 }
2518
2519 global_trace.entries = size;
2520
2521 return ret;
2522}
2523
2524/**
2525 * tracing_update_buffers - used by tracing facility to expand ring buffers
2526 *
2527 * To save on memory when the tracing is never used on a system with it
2528 * configured in. The ring buffers are set to a minimum size. But once
2529 * a user starts to use the tracing facility, then they need to grow
2530 * to their default size.
2531 *
2532 * This function is to be called when a tracer is about to be used.
2533 */
2534int tracing_update_buffers(void)
2535{
2536 int ret = 0;
2537
2538 mutex_lock(&trace_types_lock);
2539 if (!ring_buffer_expanded)
2540 ret = tracing_resize_ring_buffer(trace_buf_size);
2541 mutex_unlock(&trace_types_lock);
2542
2543 return ret;
2544}
2545
2546struct trace_option_dentry;
2547
2548static struct trace_option_dentry *
2549create_trace_option_files(struct tracer *tracer);
2550
2551static void
2552destroy_trace_option_files(struct trace_option_dentry *topts);
2553
2554static int tracing_set_tracer(const char *buf)
2989{ 2555{
2556 static struct trace_option_dentry *topts;
2990 struct trace_array *tr = &global_trace; 2557 struct trace_array *tr = &global_trace;
2991 struct tracer *t; 2558 struct tracer *t;
2992 int ret = 0; 2559 int ret = 0;
2993 2560
2994 mutex_lock(&trace_types_lock); 2561 mutex_lock(&trace_types_lock);
2562
2563 if (!ring_buffer_expanded) {
2564 ret = tracing_resize_ring_buffer(trace_buf_size);
2565 if (ret < 0)
2566 goto out;
2567 ret = 0;
2568 }
2569
2995 for (t = trace_types; t; t = t->next) { 2570 for (t = trace_types; t; t = t->next) {
2996 if (strcmp(t->name, buf) == 0) 2571 if (strcmp(t->name, buf) == 0)
2997 break; 2572 break;
@@ -3007,9 +2582,14 @@ static int tracing_set_tracer(char *buf)
3007 if (current_trace && current_trace->reset) 2582 if (current_trace && current_trace->reset)
3008 current_trace->reset(tr); 2583 current_trace->reset(tr);
3009 2584
2585 destroy_trace_option_files(topts);
2586
3010 current_trace = t; 2587 current_trace = t;
2588
2589 topts = create_trace_option_files(current_trace);
2590
3011 if (t->init) { 2591 if (t->init) {
3012 ret = t->init(tr); 2592 ret = tracer_init(t, tr);
3013 if (ret) 2593 if (ret)
3014 goto out; 2594 goto out;
3015 } 2595 }
@@ -3072,9 +2652,9 @@ static ssize_t
3072tracing_max_lat_write(struct file *filp, const char __user *ubuf, 2652tracing_max_lat_write(struct file *filp, const char __user *ubuf,
3073 size_t cnt, loff_t *ppos) 2653 size_t cnt, loff_t *ppos)
3074{ 2654{
3075 long *ptr = filp->private_data; 2655 unsigned long *ptr = filp->private_data;
3076 char buf[64]; 2656 char buf[64];
3077 long val; 2657 unsigned long val;
3078 int ret; 2658 int ret;
3079 2659
3080 if (cnt >= sizeof(buf)) 2660 if (cnt >= sizeof(buf))
@@ -3094,54 +2674,96 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
3094 return cnt; 2674 return cnt;
3095} 2675}
3096 2676
3097static atomic_t tracing_reader;
3098
3099static int tracing_open_pipe(struct inode *inode, struct file *filp) 2677static int tracing_open_pipe(struct inode *inode, struct file *filp)
3100{ 2678{
2679 long cpu_file = (long) inode->i_private;
3101 struct trace_iterator *iter; 2680 struct trace_iterator *iter;
2681 int ret = 0;
3102 2682
3103 if (tracing_disabled) 2683 if (tracing_disabled)
3104 return -ENODEV; 2684 return -ENODEV;
3105 2685
3106 /* We only allow for reader of the pipe */ 2686 mutex_lock(&trace_types_lock);
3107 if (atomic_inc_return(&tracing_reader) != 1) { 2687
3108 atomic_dec(&tracing_reader); 2688 /* We only allow one reader per cpu */
3109 return -EBUSY; 2689 if (cpu_file == TRACE_PIPE_ALL_CPU) {
2690 if (!cpumask_empty(tracing_reader_cpumask)) {
2691 ret = -EBUSY;
2692 goto out;
2693 }
2694 cpumask_setall(tracing_reader_cpumask);
2695 } else {
2696 if (!cpumask_test_cpu(cpu_file, tracing_reader_cpumask))
2697 cpumask_set_cpu(cpu_file, tracing_reader_cpumask);
2698 else {
2699 ret = -EBUSY;
2700 goto out;
2701 }
3110 } 2702 }
3111 2703
3112 /* create a buffer to store the information to pass to userspace */ 2704 /* create a buffer to store the information to pass to userspace */
3113 iter = kzalloc(sizeof(*iter), GFP_KERNEL); 2705 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
3114 if (!iter) 2706 if (!iter) {
3115 return -ENOMEM; 2707 ret = -ENOMEM;
2708 goto out;
2709 }
3116 2710
3117 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { 2711 /*
3118 kfree(iter); 2712 * We make a copy of the current tracer to avoid concurrent
3119 return -ENOMEM; 2713 * changes on it while we are reading.
2714 */
2715 iter->trace = kmalloc(sizeof(*iter->trace), GFP_KERNEL);
2716 if (!iter->trace) {
2717 ret = -ENOMEM;
2718 goto fail;
3120 } 2719 }
2720 if (current_trace)
2721 *iter->trace = *current_trace;
3121 2722
3122 mutex_lock(&trace_types_lock); 2723 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) {
2724 ret = -ENOMEM;
2725 goto fail;
2726 }
3123 2727
3124 /* trace pipe does not show start of buffer */ 2728 /* trace pipe does not show start of buffer */
3125 cpumask_setall(iter->started); 2729 cpumask_setall(iter->started);
3126 2730
2731 iter->cpu_file = cpu_file;
3127 iter->tr = &global_trace; 2732 iter->tr = &global_trace;
3128 iter->trace = current_trace; 2733 mutex_init(&iter->mutex);
3129 filp->private_data = iter; 2734 filp->private_data = iter;
3130 2735
3131 if (iter->trace->pipe_open) 2736 if (iter->trace->pipe_open)
3132 iter->trace->pipe_open(iter); 2737 iter->trace->pipe_open(iter);
2738
2739out:
3133 mutex_unlock(&trace_types_lock); 2740 mutex_unlock(&trace_types_lock);
2741 return ret;
3134 2742
3135 return 0; 2743fail:
2744 kfree(iter->trace);
2745 kfree(iter);
2746 mutex_unlock(&trace_types_lock);
2747 return ret;
3136} 2748}
3137 2749
3138static int tracing_release_pipe(struct inode *inode, struct file *file) 2750static int tracing_release_pipe(struct inode *inode, struct file *file)
3139{ 2751{
3140 struct trace_iterator *iter = file->private_data; 2752 struct trace_iterator *iter = file->private_data;
3141 2753
2754 mutex_lock(&trace_types_lock);
2755
2756 if (iter->cpu_file == TRACE_PIPE_ALL_CPU)
2757 cpumask_clear(tracing_reader_cpumask);
2758 else
2759 cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask);
2760
2761 mutex_unlock(&trace_types_lock);
2762
3142 free_cpumask_var(iter->started); 2763 free_cpumask_var(iter->started);
2764 mutex_destroy(&iter->mutex);
2765 kfree(iter->trace);
3143 kfree(iter); 2766 kfree(iter);
3144 atomic_dec(&tracing_reader);
3145 2767
3146 return 0; 2768 return 0;
3147} 2769}
@@ -3167,67 +2789,57 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table)
3167 } 2789 }
3168} 2790}
3169 2791
3170/* 2792
3171 * Consumer reader. 2793void default_wait_pipe(struct trace_iterator *iter)
3172 */
3173static ssize_t
3174tracing_read_pipe(struct file *filp, char __user *ubuf,
3175 size_t cnt, loff_t *ppos)
3176{ 2794{
3177 struct trace_iterator *iter = filp->private_data; 2795 DEFINE_WAIT(wait);
3178 ssize_t sret;
3179 2796
3180 /* return any leftover data */ 2797 prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE);
3181 sret = trace_seq_to_user(&iter->seq, ubuf, cnt); 2798
3182 if (sret != -EBUSY) 2799 if (trace_empty(iter))
3183 return sret; 2800 schedule();
3184 2801
3185 trace_seq_reset(&iter->seq); 2802 finish_wait(&trace_wait, &wait);
2803}
3186 2804
3187 mutex_lock(&trace_types_lock); 2805/*
3188 if (iter->trace->read) { 2806 * This is a make-shift waitqueue.
3189 sret = iter->trace->read(iter, filp, ubuf, cnt, ppos); 2807 * A tracer might use this callback on some rare cases:
3190 if (sret) 2808 *
3191 goto out; 2809 * 1) the current tracer might hold the runqueue lock when it wakes up
3192 } 2810 * a reader, hence a deadlock (sched, function, and function graph tracers)
2811 * 2) the function tracers, trace all functions, we don't want
2812 * the overhead of calling wake_up and friends
2813 * (and tracing them too)
2814 *
2815 * Anyway, this is really very primitive wakeup.
2816 */
2817void poll_wait_pipe(struct trace_iterator *iter)
2818{
2819 set_current_state(TASK_INTERRUPTIBLE);
2820 /* sleep for 100 msecs, and try again. */
2821 schedule_timeout(HZ / 10);
2822}
2823
2824/* Must be called with trace_types_lock mutex held. */
2825static int tracing_wait_pipe(struct file *filp)
2826{
2827 struct trace_iterator *iter = filp->private_data;
3193 2828
3194waitagain:
3195 sret = 0;
3196 while (trace_empty(iter)) { 2829 while (trace_empty(iter)) {
3197 2830
3198 if ((filp->f_flags & O_NONBLOCK)) { 2831 if ((filp->f_flags & O_NONBLOCK)) {
3199 sret = -EAGAIN; 2832 return -EAGAIN;
3200 goto out;
3201 } 2833 }
3202 2834
3203 /* 2835 mutex_unlock(&iter->mutex);
3204 * This is a make-shift waitqueue. The reason we don't use
3205 * an actual wait queue is because:
3206 * 1) we only ever have one waiter
3207 * 2) the tracing, traces all functions, we don't want
3208 * the overhead of calling wake_up and friends
3209 * (and tracing them too)
3210 * Anyway, this is really very primitive wakeup.
3211 */
3212 set_current_state(TASK_INTERRUPTIBLE);
3213 iter->tr->waiter = current;
3214
3215 mutex_unlock(&trace_types_lock);
3216
3217 /* sleep for 100 msecs, and try again. */
3218 schedule_timeout(HZ/10);
3219
3220 mutex_lock(&trace_types_lock);
3221 2836
3222 iter->tr->waiter = NULL; 2837 iter->trace->wait_pipe(iter);
3223 2838
3224 if (signal_pending(current)) { 2839 mutex_lock(&iter->mutex);
3225 sret = -EINTR;
3226 goto out;
3227 }
3228 2840
3229 if (iter->trace != current_trace) 2841 if (signal_pending(current))
3230 goto out; 2842 return -EINTR;
3231 2843
3232 /* 2844 /*
3233 * We block until we read something and tracing is disabled. 2845 * We block until we read something and tracing is disabled.
@@ -3240,13 +2852,59 @@ waitagain:
3240 */ 2852 */
3241 if (!tracer_enabled && iter->pos) 2853 if (!tracer_enabled && iter->pos)
3242 break; 2854 break;
2855 }
2856
2857 return 1;
2858}
2859
2860/*
2861 * Consumer reader.
2862 */
2863static ssize_t
2864tracing_read_pipe(struct file *filp, char __user *ubuf,
2865 size_t cnt, loff_t *ppos)
2866{
2867 struct trace_iterator *iter = filp->private_data;
2868 static struct tracer *old_tracer;
2869 ssize_t sret;
2870
2871 /* return any leftover data */
2872 sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
2873 if (sret != -EBUSY)
2874 return sret;
2875
2876 trace_seq_init(&iter->seq);
2877
2878 /* copy the tracer to avoid using a global lock all around */
2879 mutex_lock(&trace_types_lock);
2880 if (unlikely(old_tracer != current_trace && current_trace)) {
2881 old_tracer = current_trace;
2882 *iter->trace = *current_trace;
2883 }
2884 mutex_unlock(&trace_types_lock);
3243 2885
3244 continue; 2886 /*
2887 * Avoid more than one consumer on a single file descriptor
2888 * This is just a matter of traces coherency, the ring buffer itself
2889 * is protected.
2890 */
2891 mutex_lock(&iter->mutex);
2892 if (iter->trace->read) {
2893 sret = iter->trace->read(iter, filp, ubuf, cnt, ppos);
2894 if (sret)
2895 goto out;
3245 } 2896 }
3246 2897
2898waitagain:
2899 sret = tracing_wait_pipe(filp);
2900 if (sret <= 0)
2901 goto out;
2902
3247 /* stop when tracing is finished */ 2903 /* stop when tracing is finished */
3248 if (trace_empty(iter)) 2904 if (trace_empty(iter)) {
2905 sret = 0;
3249 goto out; 2906 goto out;
2907 }
3250 2908
3251 if (cnt >= PAGE_SIZE) 2909 if (cnt >= PAGE_SIZE)
3252 cnt = PAGE_SIZE - 1; 2910 cnt = PAGE_SIZE - 1;
@@ -3267,8 +2925,8 @@ waitagain:
3267 iter->seq.len = len; 2925 iter->seq.len = len;
3268 break; 2926 break;
3269 } 2927 }
3270 2928 if (ret != TRACE_TYPE_NO_CONSUME)
3271 trace_consume(iter); 2929 trace_consume(iter);
3272 2930
3273 if (iter->seq.len >= cnt) 2931 if (iter->seq.len >= cnt)
3274 break; 2932 break;
@@ -3277,7 +2935,7 @@ waitagain:
3277 /* Now copy what we have to the user */ 2935 /* Now copy what we have to the user */
3278 sret = trace_seq_to_user(&iter->seq, ubuf, cnt); 2936 sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
3279 if (iter->seq.readpos >= iter->seq.len) 2937 if (iter->seq.readpos >= iter->seq.len)
3280 trace_seq_reset(&iter->seq); 2938 trace_seq_init(&iter->seq);
3281 2939
3282 /* 2940 /*
3283 * If there was nothing to send to user, inspite of consuming trace 2941 * If there was nothing to send to user, inspite of consuming trace
@@ -3287,20 +2945,165 @@ waitagain:
3287 goto waitagain; 2945 goto waitagain;
3288 2946
3289out: 2947out:
3290 mutex_unlock(&trace_types_lock); 2948 mutex_unlock(&iter->mutex);
3291 2949
3292 return sret; 2950 return sret;
3293} 2951}
3294 2952
2953static void tracing_pipe_buf_release(struct pipe_inode_info *pipe,
2954 struct pipe_buffer *buf)
2955{
2956 __free_page(buf->page);
2957}
2958
2959static void tracing_spd_release_pipe(struct splice_pipe_desc *spd,
2960 unsigned int idx)
2961{
2962 __free_page(spd->pages[idx]);
2963}
2964
2965static struct pipe_buf_operations tracing_pipe_buf_ops = {
2966 .can_merge = 0,
2967 .map = generic_pipe_buf_map,
2968 .unmap = generic_pipe_buf_unmap,
2969 .confirm = generic_pipe_buf_confirm,
2970 .release = tracing_pipe_buf_release,
2971 .steal = generic_pipe_buf_steal,
2972 .get = generic_pipe_buf_get,
2973};
2974
2975static size_t
2976tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter)
2977{
2978 size_t count;
2979 int ret;
2980
2981 /* Seq buffer is page-sized, exactly what we need. */
2982 for (;;) {
2983 count = iter->seq.len;
2984 ret = print_trace_line(iter);
2985 count = iter->seq.len - count;
2986 if (rem < count) {
2987 rem = 0;
2988 iter->seq.len -= count;
2989 break;
2990 }
2991 if (ret == TRACE_TYPE_PARTIAL_LINE) {
2992 iter->seq.len -= count;
2993 break;
2994 }
2995
2996 trace_consume(iter);
2997 rem -= count;
2998 if (!find_next_entry_inc(iter)) {
2999 rem = 0;
3000 iter->ent = NULL;
3001 break;
3002 }
3003 }
3004
3005 return rem;
3006}
3007
3008static ssize_t tracing_splice_read_pipe(struct file *filp,
3009 loff_t *ppos,
3010 struct pipe_inode_info *pipe,
3011 size_t len,
3012 unsigned int flags)
3013{
3014 struct page *pages[PIPE_BUFFERS];
3015 struct partial_page partial[PIPE_BUFFERS];
3016 struct trace_iterator *iter = filp->private_data;
3017 struct splice_pipe_desc spd = {
3018 .pages = pages,
3019 .partial = partial,
3020 .nr_pages = 0, /* This gets updated below. */
3021 .flags = flags,
3022 .ops = &tracing_pipe_buf_ops,
3023 .spd_release = tracing_spd_release_pipe,
3024 };
3025 static struct tracer *old_tracer;
3026 ssize_t ret;
3027 size_t rem;
3028 unsigned int i;
3029
3030 /* copy the tracer to avoid using a global lock all around */
3031 mutex_lock(&trace_types_lock);
3032 if (unlikely(old_tracer != current_trace && current_trace)) {
3033 old_tracer = current_trace;
3034 *iter->trace = *current_trace;
3035 }
3036 mutex_unlock(&trace_types_lock);
3037
3038 mutex_lock(&iter->mutex);
3039
3040 if (iter->trace->splice_read) {
3041 ret = iter->trace->splice_read(iter, filp,
3042 ppos, pipe, len, flags);
3043 if (ret)
3044 goto out_err;
3045 }
3046
3047 ret = tracing_wait_pipe(filp);
3048 if (ret <= 0)
3049 goto out_err;
3050
3051 if (!iter->ent && !find_next_entry_inc(iter)) {
3052 ret = -EFAULT;
3053 goto out_err;
3054 }
3055
3056 /* Fill as many pages as possible. */
3057 for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) {
3058 pages[i] = alloc_page(GFP_KERNEL);
3059 if (!pages[i])
3060 break;
3061
3062 rem = tracing_fill_pipe_page(rem, iter);
3063
3064 /* Copy the data into the page, so we can start over. */
3065 ret = trace_seq_to_buffer(&iter->seq,
3066 page_address(pages[i]),
3067 iter->seq.len);
3068 if (ret < 0) {
3069 __free_page(pages[i]);
3070 break;
3071 }
3072 partial[i].offset = 0;
3073 partial[i].len = iter->seq.len;
3074
3075 trace_seq_init(&iter->seq);
3076 }
3077
3078 mutex_unlock(&iter->mutex);
3079
3080 spd.nr_pages = i;
3081
3082 return splice_to_pipe(pipe, &spd);
3083
3084out_err:
3085 mutex_unlock(&iter->mutex);
3086
3087 return ret;
3088}
3089
3295static ssize_t 3090static ssize_t
3296tracing_entries_read(struct file *filp, char __user *ubuf, 3091tracing_entries_read(struct file *filp, char __user *ubuf,
3297 size_t cnt, loff_t *ppos) 3092 size_t cnt, loff_t *ppos)
3298{ 3093{
3299 struct trace_array *tr = filp->private_data; 3094 struct trace_array *tr = filp->private_data;
3300 char buf[64]; 3095 char buf[96];
3301 int r; 3096 int r;
3302 3097
3303 r = sprintf(buf, "%lu\n", tr->entries >> 10); 3098 mutex_lock(&trace_types_lock);
3099 if (!ring_buffer_expanded)
3100 r = sprintf(buf, "%lu (expanded: %lu)\n",
3101 tr->entries >> 10,
3102 trace_buf_size >> 10);
3103 else
3104 r = sprintf(buf, "%lu\n", tr->entries >> 10);
3105 mutex_unlock(&trace_types_lock);
3106
3304 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 3107 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
3305} 3108}
3306 3109
@@ -3344,28 +3147,11 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3344 val <<= 10; 3147 val <<= 10;
3345 3148
3346 if (val != global_trace.entries) { 3149 if (val != global_trace.entries) {
3347 ret = ring_buffer_resize(global_trace.buffer, val); 3150 ret = tracing_resize_ring_buffer(val);
3348 if (ret < 0) {
3349 cnt = ret;
3350 goto out;
3351 }
3352
3353 ret = ring_buffer_resize(max_tr.buffer, val);
3354 if (ret < 0) { 3151 if (ret < 0) {
3355 int r;
3356 cnt = ret; 3152 cnt = ret;
3357 r = ring_buffer_resize(global_trace.buffer,
3358 global_trace.entries);
3359 if (r < 0) {
3360 /* AARGH! We are left with different
3361 * size max buffer!!!! */
3362 WARN_ON(1);
3363 tracing_disabled = 1;
3364 }
3365 goto out; 3153 goto out;
3366 } 3154 }
3367
3368 global_trace.entries = val;
3369 } 3155 }
3370 3156
3371 filp->f_pos += cnt; 3157 filp->f_pos += cnt;
@@ -3393,7 +3179,7 @@ static int mark_printk(const char *fmt, ...)
3393 int ret; 3179 int ret;
3394 va_list args; 3180 va_list args;
3395 va_start(args, fmt); 3181 va_start(args, fmt);
3396 ret = trace_vprintk(0, -1, fmt, args); 3182 ret = trace_vprintk(0, fmt, args);
3397 va_end(args); 3183 va_end(args);
3398 return ret; 3184 return ret;
3399} 3185}
@@ -3433,42 +3219,295 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3433 return cnt; 3219 return cnt;
3434} 3220}
3435 3221
3436static struct file_operations tracing_max_lat_fops = { 3222static const struct file_operations tracing_max_lat_fops = {
3437 .open = tracing_open_generic, 3223 .open = tracing_open_generic,
3438 .read = tracing_max_lat_read, 3224 .read = tracing_max_lat_read,
3439 .write = tracing_max_lat_write, 3225 .write = tracing_max_lat_write,
3440}; 3226};
3441 3227
3442static struct file_operations tracing_ctrl_fops = { 3228static const struct file_operations tracing_ctrl_fops = {
3443 .open = tracing_open_generic, 3229 .open = tracing_open_generic,
3444 .read = tracing_ctrl_read, 3230 .read = tracing_ctrl_read,
3445 .write = tracing_ctrl_write, 3231 .write = tracing_ctrl_write,
3446}; 3232};
3447 3233
3448static struct file_operations set_tracer_fops = { 3234static const struct file_operations set_tracer_fops = {
3449 .open = tracing_open_generic, 3235 .open = tracing_open_generic,
3450 .read = tracing_set_trace_read, 3236 .read = tracing_set_trace_read,
3451 .write = tracing_set_trace_write, 3237 .write = tracing_set_trace_write,
3452}; 3238};
3453 3239
3454static struct file_operations tracing_pipe_fops = { 3240static const struct file_operations tracing_pipe_fops = {
3455 .open = tracing_open_pipe, 3241 .open = tracing_open_pipe,
3456 .poll = tracing_poll_pipe, 3242 .poll = tracing_poll_pipe,
3457 .read = tracing_read_pipe, 3243 .read = tracing_read_pipe,
3244 .splice_read = tracing_splice_read_pipe,
3458 .release = tracing_release_pipe, 3245 .release = tracing_release_pipe,
3459}; 3246};
3460 3247
3461static struct file_operations tracing_entries_fops = { 3248static const struct file_operations tracing_entries_fops = {
3462 .open = tracing_open_generic, 3249 .open = tracing_open_generic,
3463 .read = tracing_entries_read, 3250 .read = tracing_entries_read,
3464 .write = tracing_entries_write, 3251 .write = tracing_entries_write,
3465}; 3252};
3466 3253
3467static struct file_operations tracing_mark_fops = { 3254static const struct file_operations tracing_mark_fops = {
3468 .open = tracing_open_generic, 3255 .open = tracing_open_generic,
3469 .write = tracing_mark_write, 3256 .write = tracing_mark_write,
3470}; 3257};
3471 3258
3259struct ftrace_buffer_info {
3260 struct trace_array *tr;
3261 void *spare;
3262 int cpu;
3263 unsigned int read;
3264};
3265
3266static int tracing_buffers_open(struct inode *inode, struct file *filp)
3267{
3268 int cpu = (int)(long)inode->i_private;
3269 struct ftrace_buffer_info *info;
3270
3271 if (tracing_disabled)
3272 return -ENODEV;
3273
3274 info = kzalloc(sizeof(*info), GFP_KERNEL);
3275 if (!info)
3276 return -ENOMEM;
3277
3278 info->tr = &global_trace;
3279 info->cpu = cpu;
3280 info->spare = NULL;
3281 /* Force reading ring buffer for first read */
3282 info->read = (unsigned int)-1;
3283
3284 filp->private_data = info;
3285
3286 return nonseekable_open(inode, filp);
3287}
3288
3289static ssize_t
3290tracing_buffers_read(struct file *filp, char __user *ubuf,
3291 size_t count, loff_t *ppos)
3292{
3293 struct ftrace_buffer_info *info = filp->private_data;
3294 unsigned int pos;
3295 ssize_t ret;
3296 size_t size;
3297
3298 if (!count)
3299 return 0;
3300
3301 if (!info->spare)
3302 info->spare = ring_buffer_alloc_read_page(info->tr->buffer);
3303 if (!info->spare)
3304 return -ENOMEM;
3305
3306 /* Do we have previous read data to read? */
3307 if (info->read < PAGE_SIZE)
3308 goto read;
3309
3310 info->read = 0;
3311
3312 ret = ring_buffer_read_page(info->tr->buffer,
3313 &info->spare,
3314 count,
3315 info->cpu, 0);
3316 if (ret < 0)
3317 return 0;
3318
3319 pos = ring_buffer_page_len(info->spare);
3320
3321 if (pos < PAGE_SIZE)
3322 memset(info->spare + pos, 0, PAGE_SIZE - pos);
3323
3324read:
3325 size = PAGE_SIZE - info->read;
3326 if (size > count)
3327 size = count;
3328
3329 ret = copy_to_user(ubuf, info->spare + info->read, size);
3330 if (ret == size)
3331 return -EFAULT;
3332 size -= ret;
3333
3334 *ppos += size;
3335 info->read += size;
3336
3337 return size;
3338}
3339
3340static int tracing_buffers_release(struct inode *inode, struct file *file)
3341{
3342 struct ftrace_buffer_info *info = file->private_data;
3343
3344 if (info->spare)
3345 ring_buffer_free_read_page(info->tr->buffer, info->spare);
3346 kfree(info);
3347
3348 return 0;
3349}
3350
3351struct buffer_ref {
3352 struct ring_buffer *buffer;
3353 void *page;
3354 int ref;
3355};
3356
3357static void buffer_pipe_buf_release(struct pipe_inode_info *pipe,
3358 struct pipe_buffer *buf)
3359{
3360 struct buffer_ref *ref = (struct buffer_ref *)buf->private;
3361
3362 if (--ref->ref)
3363 return;
3364
3365 ring_buffer_free_read_page(ref->buffer, ref->page);
3366 kfree(ref);
3367 buf->private = 0;
3368}
3369
3370static int buffer_pipe_buf_steal(struct pipe_inode_info *pipe,
3371 struct pipe_buffer *buf)
3372{
3373 return 1;
3374}
3375
3376static void buffer_pipe_buf_get(struct pipe_inode_info *pipe,
3377 struct pipe_buffer *buf)
3378{
3379 struct buffer_ref *ref = (struct buffer_ref *)buf->private;
3380
3381 ref->ref++;
3382}
3383
3384/* Pipe buffer operations for a buffer. */
3385static struct pipe_buf_operations buffer_pipe_buf_ops = {
3386 .can_merge = 0,
3387 .map = generic_pipe_buf_map,
3388 .unmap = generic_pipe_buf_unmap,
3389 .confirm = generic_pipe_buf_confirm,
3390 .release = buffer_pipe_buf_release,
3391 .steal = buffer_pipe_buf_steal,
3392 .get = buffer_pipe_buf_get,
3393};
3394
3395/*
3396 * Callback from splice_to_pipe(), if we need to release some pages
3397 * at the end of the spd in case we error'ed out in filling the pipe.
3398 */
3399static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i)
3400{
3401 struct buffer_ref *ref =
3402 (struct buffer_ref *)spd->partial[i].private;
3403
3404 if (--ref->ref)
3405 return;
3406
3407 ring_buffer_free_read_page(ref->buffer, ref->page);
3408 kfree(ref);
3409 spd->partial[i].private = 0;
3410}
3411
3412static ssize_t
3413tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3414 struct pipe_inode_info *pipe, size_t len,
3415 unsigned int flags)
3416{
3417 struct ftrace_buffer_info *info = file->private_data;
3418 struct partial_page partial[PIPE_BUFFERS];
3419 struct page *pages[PIPE_BUFFERS];
3420 struct splice_pipe_desc spd = {
3421 .pages = pages,
3422 .partial = partial,
3423 .flags = flags,
3424 .ops = &buffer_pipe_buf_ops,
3425 .spd_release = buffer_spd_release,
3426 };
3427 struct buffer_ref *ref;
3428 int size, i;
3429 size_t ret;
3430
3431 if (*ppos & (PAGE_SIZE - 1)) {
3432 WARN_ONCE(1, "Ftrace: previous read must page-align\n");
3433 return -EINVAL;
3434 }
3435
3436 if (len & (PAGE_SIZE - 1)) {
3437 WARN_ONCE(1, "Ftrace: splice_read should page-align\n");
3438 if (len < PAGE_SIZE)
3439 return -EINVAL;
3440 len &= PAGE_MASK;
3441 }
3442
3443 for (i = 0; i < PIPE_BUFFERS && len; i++, len -= PAGE_SIZE) {
3444 struct page *page;
3445 int r;
3446
3447 ref = kzalloc(sizeof(*ref), GFP_KERNEL);
3448 if (!ref)
3449 break;
3450
3451 ref->ref = 1;
3452 ref->buffer = info->tr->buffer;
3453 ref->page = ring_buffer_alloc_read_page(ref->buffer);
3454 if (!ref->page) {
3455 kfree(ref);
3456 break;
3457 }
3458
3459 r = ring_buffer_read_page(ref->buffer, &ref->page,
3460 len, info->cpu, 0);
3461 if (r < 0) {
3462 ring_buffer_free_read_page(ref->buffer,
3463 ref->page);
3464 kfree(ref);
3465 break;
3466 }
3467
3468 /*
3469 * zero out any left over data, this is going to
3470 * user land.
3471 */
3472 size = ring_buffer_page_len(ref->page);
3473 if (size < PAGE_SIZE)
3474 memset(ref->page + size, 0, PAGE_SIZE - size);
3475
3476 page = virt_to_page(ref->page);
3477
3478 spd.pages[i] = page;
3479 spd.partial[i].len = PAGE_SIZE;
3480 spd.partial[i].offset = 0;
3481 spd.partial[i].private = (unsigned long)ref;
3482 spd.nr_pages++;
3483 *ppos += PAGE_SIZE;
3484 }
3485
3486 spd.nr_pages = i;
3487
3488 /* did we read anything? */
3489 if (!spd.nr_pages) {
3490 if (flags & SPLICE_F_NONBLOCK)
3491 ret = -EAGAIN;
3492 else
3493 ret = 0;
3494 /* TODO: block */
3495 return ret;
3496 }
3497
3498 ret = splice_to_pipe(pipe, &spd);
3499
3500 return ret;
3501}
3502
3503static const struct file_operations tracing_buffers_fops = {
3504 .open = tracing_buffers_open,
3505 .read = tracing_buffers_read,
3506 .release = tracing_buffers_release,
3507 .splice_read = tracing_buffers_splice_read,
3508 .llseek = no_llseek,
3509};
3510
3472#ifdef CONFIG_DYNAMIC_FTRACE 3511#ifdef CONFIG_DYNAMIC_FTRACE
3473 3512
3474int __weak ftrace_arch_read_dyn_info(char *buf, int size) 3513int __weak ftrace_arch_read_dyn_info(char *buf, int size)
@@ -3500,7 +3539,7 @@ tracing_read_dyn_info(struct file *filp, char __user *ubuf,
3500 return r; 3539 return r;
3501} 3540}
3502 3541
3503static struct file_operations tracing_dyn_info_fops = { 3542static const struct file_operations tracing_dyn_info_fops = {
3504 .open = tracing_open_generic, 3543 .open = tracing_open_generic,
3505 .read = tracing_read_dyn_info, 3544 .read = tracing_read_dyn_info,
3506}; 3545};
@@ -3515,6 +3554,9 @@ struct dentry *tracing_init_dentry(void)
3515 if (d_tracer) 3554 if (d_tracer)
3516 return d_tracer; 3555 return d_tracer;
3517 3556
3557 if (!debugfs_initialized())
3558 return NULL;
3559
3518 d_tracer = debugfs_create_dir("tracing", NULL); 3560 d_tracer = debugfs_create_dir("tracing", NULL);
3519 3561
3520 if (!d_tracer && !once) { 3562 if (!d_tracer && !once) {
@@ -3526,15 +3568,350 @@ struct dentry *tracing_init_dentry(void)
3526 return d_tracer; 3568 return d_tracer;
3527} 3569}
3528 3570
3571static struct dentry *d_percpu;
3572
3573struct dentry *tracing_dentry_percpu(void)
3574{
3575 static int once;
3576 struct dentry *d_tracer;
3577
3578 if (d_percpu)
3579 return d_percpu;
3580
3581 d_tracer = tracing_init_dentry();
3582
3583 if (!d_tracer)
3584 return NULL;
3585
3586 d_percpu = debugfs_create_dir("per_cpu", d_tracer);
3587
3588 if (!d_percpu && !once) {
3589 once = 1;
3590 pr_warning("Could not create debugfs directory 'per_cpu'\n");
3591 return NULL;
3592 }
3593
3594 return d_percpu;
3595}
3596
3597static void tracing_init_debugfs_percpu(long cpu)
3598{
3599 struct dentry *d_percpu = tracing_dentry_percpu();
3600 struct dentry *entry, *d_cpu;
3601 /* strlen(cpu) + MAX(log10(cpu)) + '\0' */
3602 char cpu_dir[7];
3603
3604 if (cpu > 999 || cpu < 0)
3605 return;
3606
3607 sprintf(cpu_dir, "cpu%ld", cpu);
3608 d_cpu = debugfs_create_dir(cpu_dir, d_percpu);
3609 if (!d_cpu) {
3610 pr_warning("Could not create debugfs '%s' entry\n", cpu_dir);
3611 return;
3612 }
3613
3614 /* per cpu trace_pipe */
3615 entry = debugfs_create_file("trace_pipe", 0444, d_cpu,
3616 (void *) cpu, &tracing_pipe_fops);
3617 if (!entry)
3618 pr_warning("Could not create debugfs 'trace_pipe' entry\n");
3619
3620 /* per cpu trace */
3621 entry = debugfs_create_file("trace", 0644, d_cpu,
3622 (void *) cpu, &tracing_fops);
3623 if (!entry)
3624 pr_warning("Could not create debugfs 'trace' entry\n");
3625
3626 entry = debugfs_create_file("trace_pipe_raw", 0444, d_cpu,
3627 (void *) cpu, &tracing_buffers_fops);
3628 if (!entry)
3629 pr_warning("Could not create debugfs 'trace_pipe_raw' entry\n");
3630}
3631
3529#ifdef CONFIG_FTRACE_SELFTEST 3632#ifdef CONFIG_FTRACE_SELFTEST
3530/* Let selftest have access to static functions in this file */ 3633/* Let selftest have access to static functions in this file */
3531#include "trace_selftest.c" 3634#include "trace_selftest.c"
3532#endif 3635#endif
3533 3636
3637struct trace_option_dentry {
3638 struct tracer_opt *opt;
3639 struct tracer_flags *flags;
3640 struct dentry *entry;
3641};
3642
3643static ssize_t
3644trace_options_read(struct file *filp, char __user *ubuf, size_t cnt,
3645 loff_t *ppos)
3646{
3647 struct trace_option_dentry *topt = filp->private_data;
3648 char *buf;
3649
3650 if (topt->flags->val & topt->opt->bit)
3651 buf = "1\n";
3652 else
3653 buf = "0\n";
3654
3655 return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2);
3656}
3657
3658static ssize_t
3659trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,
3660 loff_t *ppos)
3661{
3662 struct trace_option_dentry *topt = filp->private_data;
3663 unsigned long val;
3664 char buf[64];
3665 int ret;
3666
3667 if (cnt >= sizeof(buf))
3668 return -EINVAL;
3669
3670 if (copy_from_user(&buf, ubuf, cnt))
3671 return -EFAULT;
3672
3673 buf[cnt] = 0;
3674
3675 ret = strict_strtoul(buf, 10, &val);
3676 if (ret < 0)
3677 return ret;
3678
3679 ret = 0;
3680 switch (val) {
3681 case 0:
3682 /* do nothing if already cleared */
3683 if (!(topt->flags->val & topt->opt->bit))
3684 break;
3685
3686 mutex_lock(&trace_types_lock);
3687 if (current_trace->set_flag)
3688 ret = current_trace->set_flag(topt->flags->val,
3689 topt->opt->bit, 0);
3690 mutex_unlock(&trace_types_lock);
3691 if (ret)
3692 return ret;
3693 topt->flags->val &= ~topt->opt->bit;
3694 break;
3695 case 1:
3696 /* do nothing if already set */
3697 if (topt->flags->val & topt->opt->bit)
3698 break;
3699
3700 mutex_lock(&trace_types_lock);
3701 if (current_trace->set_flag)
3702 ret = current_trace->set_flag(topt->flags->val,
3703 topt->opt->bit, 1);
3704 mutex_unlock(&trace_types_lock);
3705 if (ret)
3706 return ret;
3707 topt->flags->val |= topt->opt->bit;
3708 break;
3709
3710 default:
3711 return -EINVAL;
3712 }
3713
3714 *ppos += cnt;
3715
3716 return cnt;
3717}
3718
3719
3720static const struct file_operations trace_options_fops = {
3721 .open = tracing_open_generic,
3722 .read = trace_options_read,
3723 .write = trace_options_write,
3724};
3725
3726static ssize_t
3727trace_options_core_read(struct file *filp, char __user *ubuf, size_t cnt,
3728 loff_t *ppos)
3729{
3730 long index = (long)filp->private_data;
3731 char *buf;
3732
3733 if (trace_flags & (1 << index))
3734 buf = "1\n";
3735 else
3736 buf = "0\n";
3737
3738 return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2);
3739}
3740
3741static ssize_t
3742trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
3743 loff_t *ppos)
3744{
3745 long index = (long)filp->private_data;
3746 char buf[64];
3747 unsigned long val;
3748 int ret;
3749
3750 if (cnt >= sizeof(buf))
3751 return -EINVAL;
3752
3753 if (copy_from_user(&buf, ubuf, cnt))
3754 return -EFAULT;
3755
3756 buf[cnt] = 0;
3757
3758 ret = strict_strtoul(buf, 10, &val);
3759 if (ret < 0)
3760 return ret;
3761
3762 switch (val) {
3763 case 0:
3764 trace_flags &= ~(1 << index);
3765 break;
3766 case 1:
3767 trace_flags |= 1 << index;
3768 break;
3769
3770 default:
3771 return -EINVAL;
3772 }
3773
3774 *ppos += cnt;
3775
3776 return cnt;
3777}
3778
3779static const struct file_operations trace_options_core_fops = {
3780 .open = tracing_open_generic,
3781 .read = trace_options_core_read,
3782 .write = trace_options_core_write,
3783};
3784
3785static struct dentry *trace_options_init_dentry(void)
3786{
3787 struct dentry *d_tracer;
3788 static struct dentry *t_options;
3789
3790 if (t_options)
3791 return t_options;
3792
3793 d_tracer = tracing_init_dentry();
3794 if (!d_tracer)
3795 return NULL;
3796
3797 t_options = debugfs_create_dir("options", d_tracer);
3798 if (!t_options) {
3799 pr_warning("Could not create debugfs directory 'options'\n");
3800 return NULL;
3801 }
3802
3803 return t_options;
3804}
3805
3806static void
3807create_trace_option_file(struct trace_option_dentry *topt,
3808 struct tracer_flags *flags,
3809 struct tracer_opt *opt)
3810{
3811 struct dentry *t_options;
3812 struct dentry *entry;
3813
3814 t_options = trace_options_init_dentry();
3815 if (!t_options)
3816 return;
3817
3818 topt->flags = flags;
3819 topt->opt = opt;
3820
3821 entry = debugfs_create_file(opt->name, 0644, t_options, topt,
3822 &trace_options_fops);
3823
3824 topt->entry = entry;
3825
3826}
3827
3828static struct trace_option_dentry *
3829create_trace_option_files(struct tracer *tracer)
3830{
3831 struct trace_option_dentry *topts;
3832 struct tracer_flags *flags;
3833 struct tracer_opt *opts;
3834 int cnt;
3835
3836 if (!tracer)
3837 return NULL;
3838
3839 flags = tracer->flags;
3840
3841 if (!flags || !flags->opts)
3842 return NULL;
3843
3844 opts = flags->opts;
3845
3846 for (cnt = 0; opts[cnt].name; cnt++)
3847 ;
3848
3849 topts = kcalloc(cnt + 1, sizeof(*topts), GFP_KERNEL);
3850 if (!topts)
3851 return NULL;
3852
3853 for (cnt = 0; opts[cnt].name; cnt++)
3854 create_trace_option_file(&topts[cnt], flags,
3855 &opts[cnt]);
3856
3857 return topts;
3858}
3859
3860static void
3861destroy_trace_option_files(struct trace_option_dentry *topts)
3862{
3863 int cnt;
3864
3865 if (!topts)
3866 return;
3867
3868 for (cnt = 0; topts[cnt].opt; cnt++) {
3869 if (topts[cnt].entry)
3870 debugfs_remove(topts[cnt].entry);
3871 }
3872
3873 kfree(topts);
3874}
3875
3876static struct dentry *
3877create_trace_option_core_file(const char *option, long index)
3878{
3879 struct dentry *t_options;
3880 struct dentry *entry;
3881
3882 t_options = trace_options_init_dentry();
3883 if (!t_options)
3884 return NULL;
3885
3886 entry = debugfs_create_file(option, 0644, t_options, (void *)index,
3887 &trace_options_core_fops);
3888
3889 return entry;
3890}
3891
3892static __init void create_trace_options_dir(void)
3893{
3894 struct dentry *t_options;
3895 struct dentry *entry;
3896 int i;
3897
3898 t_options = trace_options_init_dentry();
3899 if (!t_options)
3900 return;
3901
3902 for (i = 0; trace_options[i]; i++) {
3903 entry = create_trace_option_core_file(trace_options[i], i);
3904 if (!entry)
3905 pr_warning("Could not create debugfs %s entry\n",
3906 trace_options[i]);
3907 }
3908}
3909
3534static __init int tracer_init_debugfs(void) 3910static __init int tracer_init_debugfs(void)
3535{ 3911{
3536 struct dentry *d_tracer; 3912 struct dentry *d_tracer;
3537 struct dentry *entry; 3913 struct dentry *entry;
3914 int cpu;
3538 3915
3539 d_tracer = tracing_init_dentry(); 3916 d_tracer = tracing_init_dentry();
3540 3917
@@ -3548,18 +3925,15 @@ static __init int tracer_init_debugfs(void)
3548 if (!entry) 3925 if (!entry)
3549 pr_warning("Could not create debugfs 'trace_options' entry\n"); 3926 pr_warning("Could not create debugfs 'trace_options' entry\n");
3550 3927
3928 create_trace_options_dir();
3929
3551 entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer, 3930 entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer,
3552 NULL, &tracing_cpumask_fops); 3931 NULL, &tracing_cpumask_fops);
3553 if (!entry) 3932 if (!entry)
3554 pr_warning("Could not create debugfs 'tracing_cpumask' entry\n"); 3933 pr_warning("Could not create debugfs 'tracing_cpumask' entry\n");
3555 3934
3556 entry = debugfs_create_file("latency_trace", 0444, d_tracer, 3935 entry = debugfs_create_file("trace", 0644, d_tracer,
3557 &global_trace, &tracing_lt_fops); 3936 (void *) TRACE_PIPE_ALL_CPU, &tracing_fops);
3558 if (!entry)
3559 pr_warning("Could not create debugfs 'latency_trace' entry\n");
3560
3561 entry = debugfs_create_file("trace", 0444, d_tracer,
3562 &global_trace, &tracing_fops);
3563 if (!entry) 3937 if (!entry)
3564 pr_warning("Could not create debugfs 'trace' entry\n"); 3938 pr_warning("Could not create debugfs 'trace' entry\n");
3565 3939
@@ -3590,8 +3964,8 @@ static __init int tracer_init_debugfs(void)
3590 if (!entry) 3964 if (!entry)
3591 pr_warning("Could not create debugfs 'README' entry\n"); 3965 pr_warning("Could not create debugfs 'README' entry\n");
3592 3966
3593 entry = debugfs_create_file("trace_pipe", 0644, d_tracer, 3967 entry = debugfs_create_file("trace_pipe", 0444, d_tracer,
3594 NULL, &tracing_pipe_fops); 3968 (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops);
3595 if (!entry) 3969 if (!entry)
3596 pr_warning("Could not create debugfs " 3970 pr_warning("Could not create debugfs "
3597 "'trace_pipe' entry\n"); 3971 "'trace_pipe' entry\n");
@@ -3619,77 +3993,12 @@ static __init int tracer_init_debugfs(void)
3619#ifdef CONFIG_SYSPROF_TRACER 3993#ifdef CONFIG_SYSPROF_TRACER
3620 init_tracer_sysprof_debugfs(d_tracer); 3994 init_tracer_sysprof_debugfs(d_tracer);
3621#endif 3995#endif
3622 return 0;
3623}
3624
3625int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
3626{
3627 static DEFINE_SPINLOCK(trace_buf_lock);
3628 static char trace_buf[TRACE_BUF_SIZE];
3629
3630 struct ring_buffer_event *event;
3631 struct trace_array *tr = &global_trace;
3632 struct trace_array_cpu *data;
3633 int cpu, len = 0, size, pc;
3634 struct print_entry *entry;
3635 unsigned long irq_flags;
3636
3637 if (tracing_disabled || tracing_selftest_running)
3638 return 0;
3639
3640 pc = preempt_count();
3641 preempt_disable_notrace();
3642 cpu = raw_smp_processor_id();
3643 data = tr->data[cpu];
3644
3645 if (unlikely(atomic_read(&data->disabled)))
3646 goto out;
3647
3648 pause_graph_tracing();
3649 spin_lock_irqsave(&trace_buf_lock, irq_flags);
3650 len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
3651
3652 len = min(len, TRACE_BUF_SIZE-1);
3653 trace_buf[len] = 0;
3654
3655 size = sizeof(*entry) + len + 1;
3656 event = ring_buffer_lock_reserve(tr->buffer, size, &irq_flags);
3657 if (!event)
3658 goto out_unlock;
3659 entry = ring_buffer_event_data(event);
3660 tracing_generic_entry_update(&entry->ent, irq_flags, pc);
3661 entry->ent.type = TRACE_PRINT;
3662 entry->ip = ip;
3663 entry->depth = depth;
3664
3665 memcpy(&entry->buf, trace_buf, len);
3666 entry->buf[len] = 0;
3667 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
3668
3669 out_unlock:
3670 spin_unlock_irqrestore(&trace_buf_lock, irq_flags);
3671 unpause_graph_tracing();
3672 out:
3673 preempt_enable_notrace();
3674
3675 return len;
3676}
3677EXPORT_SYMBOL_GPL(trace_vprintk);
3678 3996
3679int __ftrace_printk(unsigned long ip, const char *fmt, ...) 3997 for_each_tracing_cpu(cpu)
3680{ 3998 tracing_init_debugfs_percpu(cpu);
3681 int ret;
3682 va_list ap;
3683
3684 if (!(trace_flags & TRACE_ITER_PRINTK))
3685 return 0;
3686 3999
3687 va_start(ap, fmt); 4000 return 0;
3688 ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap);
3689 va_end(ap);
3690 return ret;
3691} 4001}
3692EXPORT_SYMBOL_GPL(__ftrace_printk);
3693 4002
3694static int trace_panic_handler(struct notifier_block *this, 4003static int trace_panic_handler(struct notifier_block *this,
3695 unsigned long event, void *unused) 4004 unsigned long event, void *unused)
@@ -3750,14 +4059,15 @@ trace_printk_seq(struct trace_seq *s)
3750 4059
3751 printk(KERN_TRACE "%s", s->buffer); 4060 printk(KERN_TRACE "%s", s->buffer);
3752 4061
3753 trace_seq_reset(s); 4062 trace_seq_init(s);
3754} 4063}
3755 4064
3756void ftrace_dump(void) 4065static void __ftrace_dump(bool disable_tracing)
3757{ 4066{
3758 static DEFINE_SPINLOCK(ftrace_dump_lock); 4067 static DEFINE_SPINLOCK(ftrace_dump_lock);
3759 /* use static because iter can be a bit big for the stack */ 4068 /* use static because iter can be a bit big for the stack */
3760 static struct trace_iterator iter; 4069 static struct trace_iterator iter;
4070 unsigned int old_userobj;
3761 static int dump_ran; 4071 static int dump_ran;
3762 unsigned long flags; 4072 unsigned long flags;
3763 int cnt = 0, cpu; 4073 int cnt = 0, cpu;
@@ -3769,21 +4079,26 @@ void ftrace_dump(void)
3769 4079
3770 dump_ran = 1; 4080 dump_ran = 1;
3771 4081
3772 /* No turning back! */
3773 tracing_off(); 4082 tracing_off();
3774 ftrace_kill(); 4083
4084 if (disable_tracing)
4085 ftrace_kill();
3775 4086
3776 for_each_tracing_cpu(cpu) { 4087 for_each_tracing_cpu(cpu) {
3777 atomic_inc(&global_trace.data[cpu]->disabled); 4088 atomic_inc(&global_trace.data[cpu]->disabled);
3778 } 4089 }
3779 4090
4091 old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ;
4092
3780 /* don't look at user memory in panic mode */ 4093 /* don't look at user memory in panic mode */
3781 trace_flags &= ~TRACE_ITER_SYM_USEROBJ; 4094 trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
3782 4095
3783 printk(KERN_TRACE "Dumping ftrace buffer:\n"); 4096 printk(KERN_TRACE "Dumping ftrace buffer:\n");
3784 4097
4098 /* Simulate the iterator */
3785 iter.tr = &global_trace; 4099 iter.tr = &global_trace;
3786 iter.trace = current_trace; 4100 iter.trace = current_trace;
4101 iter.cpu_file = TRACE_PIPE_ALL_CPU;
3787 4102
3788 /* 4103 /*
3789 * We need to stop all tracing on all CPUS to read the 4104 * We need to stop all tracing on all CPUS to read the
@@ -3819,13 +4134,30 @@ void ftrace_dump(void)
3819 else 4134 else
3820 printk(KERN_TRACE "---------------------------------\n"); 4135 printk(KERN_TRACE "---------------------------------\n");
3821 4136
4137 /* Re-enable tracing if requested */
4138 if (!disable_tracing) {
4139 trace_flags |= old_userobj;
4140
4141 for_each_tracing_cpu(cpu) {
4142 atomic_dec(&global_trace.data[cpu]->disabled);
4143 }
4144 tracing_on();
4145 }
4146
3822 out: 4147 out:
3823 spin_unlock_irqrestore(&ftrace_dump_lock, flags); 4148 spin_unlock_irqrestore(&ftrace_dump_lock, flags);
3824} 4149}
3825 4150
4151/* By default: disable tracing after the dump */
4152void ftrace_dump(void)
4153{
4154 __ftrace_dump(true);
4155}
4156
3826__init static int tracer_alloc_buffers(void) 4157__init static int tracer_alloc_buffers(void)
3827{ 4158{
3828 struct trace_array_cpu *data; 4159 struct trace_array_cpu *data;
4160 int ring_buf_size;
3829 int i; 4161 int i;
3830 int ret = -ENOMEM; 4162 int ret = -ENOMEM;
3831 4163
@@ -3835,11 +4167,21 @@ __init static int tracer_alloc_buffers(void)
3835 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) 4167 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL))
3836 goto out_free_buffer_mask; 4168 goto out_free_buffer_mask;
3837 4169
4170 if (!alloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL))
4171 goto out_free_tracing_cpumask;
4172
4173 /* To save memory, keep the ring buffer size to its minimum */
4174 if (ring_buffer_expanded)
4175 ring_buf_size = trace_buf_size;
4176 else
4177 ring_buf_size = 1;
4178
3838 cpumask_copy(tracing_buffer_mask, cpu_possible_mask); 4179 cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
3839 cpumask_copy(tracing_cpumask, cpu_all_mask); 4180 cpumask_copy(tracing_cpumask, cpu_all_mask);
4181 cpumask_clear(tracing_reader_cpumask);
3840 4182
3841 /* TODO: make the number of buffers hot pluggable with CPUS */ 4183 /* TODO: make the number of buffers hot pluggable with CPUS */
3842 global_trace.buffer = ring_buffer_alloc(trace_buf_size, 4184 global_trace.buffer = ring_buffer_alloc(ring_buf_size,
3843 TRACE_BUFFER_FLAGS); 4185 TRACE_BUFFER_FLAGS);
3844 if (!global_trace.buffer) { 4186 if (!global_trace.buffer) {
3845 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); 4187 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
@@ -3850,7 +4192,7 @@ __init static int tracer_alloc_buffers(void)
3850 4192
3851 4193
3852#ifdef CONFIG_TRACER_MAX_TRACE 4194#ifdef CONFIG_TRACER_MAX_TRACE
3853 max_tr.buffer = ring_buffer_alloc(trace_buf_size, 4195 max_tr.buffer = ring_buffer_alloc(ring_buf_size,
3854 TRACE_BUFFER_FLAGS); 4196 TRACE_BUFFER_FLAGS);
3855 if (!max_tr.buffer) { 4197 if (!max_tr.buffer) {
3856 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); 4198 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
@@ -3871,14 +4213,10 @@ __init static int tracer_alloc_buffers(void)
3871 trace_init_cmdlines(); 4213 trace_init_cmdlines();
3872 4214
3873 register_tracer(&nop_trace); 4215 register_tracer(&nop_trace);
4216 current_trace = &nop_trace;
3874#ifdef CONFIG_BOOT_TRACER 4217#ifdef CONFIG_BOOT_TRACER
3875 register_tracer(&boot_tracer); 4218 register_tracer(&boot_tracer);
3876 current_trace = &boot_tracer;
3877 current_trace->init(&global_trace);
3878#else
3879 current_trace = &nop_trace;
3880#endif 4219#endif
3881
3882 /* All seems OK, enable tracing */ 4220 /* All seems OK, enable tracing */
3883 tracing_disabled = 0; 4221 tracing_disabled = 0;
3884 4222
@@ -3886,14 +4224,38 @@ __init static int tracer_alloc_buffers(void)
3886 &trace_panic_notifier); 4224 &trace_panic_notifier);
3887 4225
3888 register_die_notifier(&trace_die_notifier); 4226 register_die_notifier(&trace_die_notifier);
3889 ret = 0; 4227
4228 return 0;
3890 4229
3891out_free_cpumask: 4230out_free_cpumask:
4231 free_cpumask_var(tracing_reader_cpumask);
4232out_free_tracing_cpumask:
3892 free_cpumask_var(tracing_cpumask); 4233 free_cpumask_var(tracing_cpumask);
3893out_free_buffer_mask: 4234out_free_buffer_mask:
3894 free_cpumask_var(tracing_buffer_mask); 4235 free_cpumask_var(tracing_buffer_mask);
3895out: 4236out:
3896 return ret; 4237 return ret;
3897} 4238}
4239
4240__init static int clear_boot_tracer(void)
4241{
4242 /*
4243 * The default tracer at boot buffer is an init section.
4244 * This function is called in lateinit. If we did not
4245 * find the boot tracer, then clear it out, to prevent
4246 * later registration from accessing the buffer that is
4247 * about to be freed.
4248 */
4249 if (!default_bootup_tracer)
4250 return 0;
4251
4252 printk(KERN_INFO "ftrace bootup tracer '%s' not registered.\n",
4253 default_bootup_tracer);
4254 default_bootup_tracer = NULL;
4255
4256 return 0;
4257}
4258
3898early_initcall(tracer_alloc_buffers); 4259early_initcall(tracer_alloc_buffers);
3899fs_initcall(tracer_init_debugfs); 4260fs_initcall(tracer_init_debugfs);
4261late_initcall(clear_boot_tracer);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 4d3d381bfd95..e685ac2b2ba1 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -9,6 +9,8 @@
9#include <linux/mmiotrace.h> 9#include <linux/mmiotrace.h>
10#include <linux/ftrace.h> 10#include <linux/ftrace.h>
11#include <trace/boot.h> 11#include <trace/boot.h>
12#include <trace/kmemtrace.h>
13#include <trace/power.h>
12 14
13enum trace_type { 15enum trace_type {
14 __TRACE_FIRST_TYPE = 0, 16 __TRACE_FIRST_TYPE = 0,
@@ -16,9 +18,9 @@ enum trace_type {
16 TRACE_FN, 18 TRACE_FN,
17 TRACE_CTX, 19 TRACE_CTX,
18 TRACE_WAKE, 20 TRACE_WAKE,
19 TRACE_CONT,
20 TRACE_STACK, 21 TRACE_STACK,
21 TRACE_PRINT, 22 TRACE_PRINT,
23 TRACE_BPRINT,
22 TRACE_SPECIAL, 24 TRACE_SPECIAL,
23 TRACE_MMIO_RW, 25 TRACE_MMIO_RW,
24 TRACE_MMIO_MAP, 26 TRACE_MMIO_MAP,
@@ -29,9 +31,14 @@ enum trace_type {
29 TRACE_GRAPH_ENT, 31 TRACE_GRAPH_ENT,
30 TRACE_USER_STACK, 32 TRACE_USER_STACK,
31 TRACE_HW_BRANCHES, 33 TRACE_HW_BRANCHES,
34 TRACE_SYSCALL_ENTER,
35 TRACE_SYSCALL_EXIT,
36 TRACE_KMEM_ALLOC,
37 TRACE_KMEM_FREE,
32 TRACE_POWER, 38 TRACE_POWER,
39 TRACE_BLK,
33 40
34 __TRACE_LAST_TYPE 41 __TRACE_LAST_TYPE,
35}; 42};
36 43
37/* 44/*
@@ -42,7 +49,6 @@ enum trace_type {
42 */ 49 */
43struct trace_entry { 50struct trace_entry {
44 unsigned char type; 51 unsigned char type;
45 unsigned char cpu;
46 unsigned char flags; 52 unsigned char flags;
47 unsigned char preempt_count; 53 unsigned char preempt_count;
48 int pid; 54 int pid;
@@ -60,13 +66,13 @@ struct ftrace_entry {
60 66
61/* Function call entry */ 67/* Function call entry */
62struct ftrace_graph_ent_entry { 68struct ftrace_graph_ent_entry {
63 struct trace_entry ent; 69 struct trace_entry ent;
64 struct ftrace_graph_ent graph_ent; 70 struct ftrace_graph_ent graph_ent;
65}; 71};
66 72
67/* Function return entry */ 73/* Function return entry */
68struct ftrace_graph_ret_entry { 74struct ftrace_graph_ret_entry {
69 struct trace_entry ent; 75 struct trace_entry ent;
70 struct ftrace_graph_ret ret; 76 struct ftrace_graph_ret ret;
71}; 77};
72extern struct tracer boot_tracer; 78extern struct tracer boot_tracer;
@@ -112,12 +118,18 @@ struct userstack_entry {
112}; 118};
113 119
114/* 120/*
115 * ftrace_printk entry: 121 * trace_printk entry:
116 */ 122 */
123struct bprint_entry {
124 struct trace_entry ent;
125 unsigned long ip;
126 const char *fmt;
127 u32 buf[];
128};
129
117struct print_entry { 130struct print_entry {
118 struct trace_entry ent; 131 struct trace_entry ent;
119 unsigned long ip; 132 unsigned long ip;
120 int depth;
121 char buf[]; 133 char buf[];
122}; 134};
123 135
@@ -170,15 +182,51 @@ struct trace_power {
170 struct power_trace state_data; 182 struct power_trace state_data;
171}; 183};
172 184
185enum kmemtrace_type_id {
186 KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */
187 KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */
188 KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */
189};
190
191struct kmemtrace_alloc_entry {
192 struct trace_entry ent;
193 enum kmemtrace_type_id type_id;
194 unsigned long call_site;
195 const void *ptr;
196 size_t bytes_req;
197 size_t bytes_alloc;
198 gfp_t gfp_flags;
199 int node;
200};
201
202struct kmemtrace_free_entry {
203 struct trace_entry ent;
204 enum kmemtrace_type_id type_id;
205 unsigned long call_site;
206 const void *ptr;
207};
208
209struct syscall_trace_enter {
210 struct trace_entry ent;
211 int nr;
212 unsigned long args[];
213};
214
215struct syscall_trace_exit {
216 struct trace_entry ent;
217 int nr;
218 unsigned long ret;
219};
220
221
173/* 222/*
174 * trace_flag_type is an enumeration that holds different 223 * trace_flag_type is an enumeration that holds different
175 * states when a trace occurs. These are: 224 * states when a trace occurs. These are:
176 * IRQS_OFF - interrupts were disabled 225 * IRQS_OFF - interrupts were disabled
177 * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags 226 * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags
178 * NEED_RESCED - reschedule is requested 227 * NEED_RESCED - reschedule is requested
179 * HARDIRQ - inside an interrupt handler 228 * HARDIRQ - inside an interrupt handler
180 * SOFTIRQ - inside a softirq handler 229 * SOFTIRQ - inside a softirq handler
181 * CONT - multiple entries hold the trace item
182 */ 230 */
183enum trace_flag_type { 231enum trace_flag_type {
184 TRACE_FLAG_IRQS_OFF = 0x01, 232 TRACE_FLAG_IRQS_OFF = 0x01,
@@ -186,7 +234,6 @@ enum trace_flag_type {
186 TRACE_FLAG_NEED_RESCHED = 0x04, 234 TRACE_FLAG_NEED_RESCHED = 0x04,
187 TRACE_FLAG_HARDIRQ = 0x08, 235 TRACE_FLAG_HARDIRQ = 0x08,
188 TRACE_FLAG_SOFTIRQ = 0x10, 236 TRACE_FLAG_SOFTIRQ = 0x10,
189 TRACE_FLAG_CONT = 0x20,
190}; 237};
191 238
192#define TRACE_BUF_SIZE 1024 239#define TRACE_BUF_SIZE 1024
@@ -198,6 +245,7 @@ enum trace_flag_type {
198 */ 245 */
199struct trace_array_cpu { 246struct trace_array_cpu {
200 atomic_t disabled; 247 atomic_t disabled;
248 void *buffer_page; /* ring buffer spare */
201 249
202 /* these fields get copied into max-trace: */ 250 /* these fields get copied into max-trace: */
203 unsigned long trace_idx; 251 unsigned long trace_idx;
@@ -262,10 +310,10 @@ extern void __ftrace_bad_type(void);
262 do { \ 310 do { \
263 IF_ASSIGN(var, ent, struct ftrace_entry, TRACE_FN); \ 311 IF_ASSIGN(var, ent, struct ftrace_entry, TRACE_FN); \
264 IF_ASSIGN(var, ent, struct ctx_switch_entry, 0); \ 312 IF_ASSIGN(var, ent, struct ctx_switch_entry, 0); \
265 IF_ASSIGN(var, ent, struct trace_field_cont, TRACE_CONT); \
266 IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \ 313 IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \
267 IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ 314 IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
268 IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ 315 IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \
316 IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \
269 IF_ASSIGN(var, ent, struct special_entry, 0); \ 317 IF_ASSIGN(var, ent, struct special_entry, 0); \
270 IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ 318 IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \
271 TRACE_MMIO_RW); \ 319 TRACE_MMIO_RW); \
@@ -279,7 +327,15 @@ extern void __ftrace_bad_type(void);
279 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ 327 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \
280 TRACE_GRAPH_RET); \ 328 TRACE_GRAPH_RET); \
281 IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\ 329 IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\
282 IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \ 330 IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \
331 IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \
332 TRACE_KMEM_ALLOC); \
333 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
334 TRACE_KMEM_FREE); \
335 IF_ASSIGN(var, ent, struct syscall_trace_enter, \
336 TRACE_SYSCALL_ENTER); \
337 IF_ASSIGN(var, ent, struct syscall_trace_exit, \
338 TRACE_SYSCALL_EXIT); \
283 __ftrace_bad_type(); \ 339 __ftrace_bad_type(); \
284 } while (0) 340 } while (0)
285 341
@@ -287,7 +343,8 @@ extern void __ftrace_bad_type(void);
287enum print_line_t { 343enum print_line_t {
288 TRACE_TYPE_PARTIAL_LINE = 0, /* Retry after flushing the seq */ 344 TRACE_TYPE_PARTIAL_LINE = 0, /* Retry after flushing the seq */
289 TRACE_TYPE_HANDLED = 1, 345 TRACE_TYPE_HANDLED = 1,
290 TRACE_TYPE_UNHANDLED = 2 /* Relay to other output functions */ 346 TRACE_TYPE_UNHANDLED = 2, /* Relay to other output functions */
347 TRACE_TYPE_NO_CONSUME = 3 /* Handled but ask to not consume */
291}; 348};
292 349
293 350
@@ -297,8 +354,8 @@ enum print_line_t {
297 * flags value in struct tracer_flags. 354 * flags value in struct tracer_flags.
298 */ 355 */
299struct tracer_opt { 356struct tracer_opt {
300 const char *name; /* Will appear on the trace_options file */ 357 const char *name; /* Will appear on the trace_options file */
301 u32 bit; /* Mask assigned in val field in tracer_flags */ 358 u32 bit; /* Mask assigned in val field in tracer_flags */
302}; 359};
303 360
304/* 361/*
@@ -307,28 +364,51 @@ struct tracer_opt {
307 */ 364 */
308struct tracer_flags { 365struct tracer_flags {
309 u32 val; 366 u32 val;
310 struct tracer_opt *opts; 367 struct tracer_opt *opts;
311}; 368};
312 369
313/* Makes more easy to define a tracer opt */ 370/* Makes more easy to define a tracer opt */
314#define TRACER_OPT(s, b) .name = #s, .bit = b 371#define TRACER_OPT(s, b) .name = #s, .bit = b
315 372
316/* 373
317 * A specific tracer, represented by methods that operate on a trace array: 374/**
375 * struct tracer - a specific tracer and its callbacks to interact with debugfs
376 * @name: the name chosen to select it on the available_tracers file
377 * @init: called when one switches to this tracer (echo name > current_tracer)
378 * @reset: called when one switches to another tracer
379 * @start: called when tracing is unpaused (echo 1 > tracing_enabled)
380 * @stop: called when tracing is paused (echo 0 > tracing_enabled)
381 * @open: called when the trace file is opened
382 * @pipe_open: called when the trace_pipe file is opened
383 * @wait_pipe: override how the user waits for traces on trace_pipe
384 * @close: called when the trace file is released
385 * @read: override the default read callback on trace_pipe
386 * @splice_read: override the default splice_read callback on trace_pipe
387 * @selftest: selftest to run on boot (see trace_selftest.c)
388 * @print_headers: override the first lines that describe your columns
389 * @print_line: callback that prints a trace
390 * @set_flag: signals one of your private flags changed (trace_options file)
391 * @flags: your private flags
318 */ 392 */
319struct tracer { 393struct tracer {
320 const char *name; 394 const char *name;
321 /* Your tracer should raise a warning if init fails */
322 int (*init)(struct trace_array *tr); 395 int (*init)(struct trace_array *tr);
323 void (*reset)(struct trace_array *tr); 396 void (*reset)(struct trace_array *tr);
324 void (*start)(struct trace_array *tr); 397 void (*start)(struct trace_array *tr);
325 void (*stop)(struct trace_array *tr); 398 void (*stop)(struct trace_array *tr);
326 void (*open)(struct trace_iterator *iter); 399 void (*open)(struct trace_iterator *iter);
327 void (*pipe_open)(struct trace_iterator *iter); 400 void (*pipe_open)(struct trace_iterator *iter);
401 void (*wait_pipe)(struct trace_iterator *iter);
328 void (*close)(struct trace_iterator *iter); 402 void (*close)(struct trace_iterator *iter);
329 ssize_t (*read)(struct trace_iterator *iter, 403 ssize_t (*read)(struct trace_iterator *iter,
330 struct file *filp, char __user *ubuf, 404 struct file *filp, char __user *ubuf,
331 size_t cnt, loff_t *ppos); 405 size_t cnt, loff_t *ppos);
406 ssize_t (*splice_read)(struct trace_iterator *iter,
407 struct file *filp,
408 loff_t *ppos,
409 struct pipe_inode_info *pipe,
410 size_t len,
411 unsigned int flags);
332#ifdef CONFIG_FTRACE_STARTUP_TEST 412#ifdef CONFIG_FTRACE_STARTUP_TEST
333 int (*selftest)(struct tracer *trace, 413 int (*selftest)(struct tracer *trace,
334 struct trace_array *tr); 414 struct trace_array *tr);
@@ -339,7 +419,8 @@ struct tracer {
339 int (*set_flag)(u32 old_flags, u32 bit, int set); 419 int (*set_flag)(u32 old_flags, u32 bit, int set);
340 struct tracer *next; 420 struct tracer *next;
341 int print_max; 421 int print_max;
342 struct tracer_flags *flags; 422 struct tracer_flags *flags;
423 struct tracer_stat *stats;
343}; 424};
344 425
345struct trace_seq { 426struct trace_seq {
@@ -348,6 +429,16 @@ struct trace_seq {
348 unsigned int readpos; 429 unsigned int readpos;
349}; 430};
350 431
432static inline void
433trace_seq_init(struct trace_seq *s)
434{
435 s->len = 0;
436 s->readpos = 0;
437}
438
439
440#define TRACE_PIPE_ALL_CPU -1
441
351/* 442/*
352 * Trace iterator - used by printout routines who present trace 443 * Trace iterator - used by printout routines who present trace
353 * results to users and which routines might sleep, etc: 444 * results to users and which routines might sleep, etc:
@@ -356,6 +447,8 @@ struct trace_iterator {
356 struct trace_array *tr; 447 struct trace_array *tr;
357 struct tracer *trace; 448 struct tracer *trace;
358 void *private; 449 void *private;
450 int cpu_file;
451 struct mutex mutex;
359 struct ring_buffer_iter *buffer_iter[NR_CPUS]; 452 struct ring_buffer_iter *buffer_iter[NR_CPUS];
360 453
361 /* The below is zeroed out in pipe_read */ 454 /* The below is zeroed out in pipe_read */
@@ -371,6 +464,7 @@ struct trace_iterator {
371 cpumask_var_t started; 464 cpumask_var_t started;
372}; 465};
373 466
467int tracer_init(struct tracer *t, struct trace_array *tr);
374int tracing_is_enabled(void); 468int tracing_is_enabled(void);
375void trace_wake_up(void); 469void trace_wake_up(void);
376void tracing_reset(struct trace_array *tr, int cpu); 470void tracing_reset(struct trace_array *tr, int cpu);
@@ -379,26 +473,50 @@ int tracing_open_generic(struct inode *inode, struct file *filp);
379struct dentry *tracing_init_dentry(void); 473struct dentry *tracing_init_dentry(void);
380void init_tracer_sysprof_debugfs(struct dentry *d_tracer); 474void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
381 475
476struct ring_buffer_event;
477
478struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
479 unsigned char type,
480 unsigned long len,
481 unsigned long flags,
482 int pc);
483void trace_buffer_unlock_commit(struct trace_array *tr,
484 struct ring_buffer_event *event,
485 unsigned long flags, int pc);
486
487struct ring_buffer_event *
488trace_current_buffer_lock_reserve(unsigned char type, unsigned long len,
489 unsigned long flags, int pc);
490void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
491 unsigned long flags, int pc);
492void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
493 unsigned long flags, int pc);
494
382struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, 495struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
383 struct trace_array_cpu *data); 496 struct trace_array_cpu *data);
497
498struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
499 int *ent_cpu, u64 *ent_ts);
500
384void tracing_generic_entry_update(struct trace_entry *entry, 501void tracing_generic_entry_update(struct trace_entry *entry,
385 unsigned long flags, 502 unsigned long flags,
386 int pc); 503 int pc);
387 504
505void default_wait_pipe(struct trace_iterator *iter);
506void poll_wait_pipe(struct trace_iterator *iter);
507
388void ftrace(struct trace_array *tr, 508void ftrace(struct trace_array *tr,
389 struct trace_array_cpu *data, 509 struct trace_array_cpu *data,
390 unsigned long ip, 510 unsigned long ip,
391 unsigned long parent_ip, 511 unsigned long parent_ip,
392 unsigned long flags, int pc); 512 unsigned long flags, int pc);
393void tracing_sched_switch_trace(struct trace_array *tr, 513void tracing_sched_switch_trace(struct trace_array *tr,
394 struct trace_array_cpu *data,
395 struct task_struct *prev, 514 struct task_struct *prev,
396 struct task_struct *next, 515 struct task_struct *next,
397 unsigned long flags, int pc); 516 unsigned long flags, int pc);
398void tracing_record_cmdline(struct task_struct *tsk); 517void tracing_record_cmdline(struct task_struct *tsk);
399 518
400void tracing_sched_wakeup_trace(struct trace_array *tr, 519void tracing_sched_wakeup_trace(struct trace_array *tr,
401 struct trace_array_cpu *data,
402 struct task_struct *wakee, 520 struct task_struct *wakee,
403 struct task_struct *cur, 521 struct task_struct *cur,
404 unsigned long flags, int pc); 522 unsigned long flags, int pc);
@@ -408,14 +526,12 @@ void trace_special(struct trace_array *tr,
408 unsigned long arg2, 526 unsigned long arg2,
409 unsigned long arg3, int pc); 527 unsigned long arg3, int pc);
410void trace_function(struct trace_array *tr, 528void trace_function(struct trace_array *tr,
411 struct trace_array_cpu *data,
412 unsigned long ip, 529 unsigned long ip,
413 unsigned long parent_ip, 530 unsigned long parent_ip,
414 unsigned long flags, int pc); 531 unsigned long flags, int pc);
415 532
416void trace_graph_return(struct ftrace_graph_ret *trace); 533void trace_graph_return(struct ftrace_graph_ret *trace);
417int trace_graph_entry(struct ftrace_graph_ent *trace); 534int trace_graph_entry(struct ftrace_graph_ent *trace);
418void trace_hw_branch(struct trace_array *tr, u64 from, u64 to);
419 535
420void tracing_start_cmdline_record(void); 536void tracing_start_cmdline_record(void);
421void tracing_stop_cmdline_record(void); 537void tracing_stop_cmdline_record(void);
@@ -434,15 +550,11 @@ void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
434void update_max_tr_single(struct trace_array *tr, 550void update_max_tr_single(struct trace_array *tr,
435 struct task_struct *tsk, int cpu); 551 struct task_struct *tsk, int cpu);
436 552
437extern cycle_t ftrace_now(int cpu); 553void __trace_stack(struct trace_array *tr,
554 unsigned long flags,
555 int skip, int pc);
438 556
439#ifdef CONFIG_FUNCTION_TRACER 557extern cycle_t ftrace_now(int cpu);
440void tracing_start_function_trace(void);
441void tracing_stop_function_trace(void);
442#else
443# define tracing_start_function_trace() do { } while (0)
444# define tracing_stop_function_trace() do { } while (0)
445#endif
446 558
447#ifdef CONFIG_CONTEXT_SWITCH_TRACER 559#ifdef CONFIG_CONTEXT_SWITCH_TRACER
448typedef void 560typedef void
@@ -456,10 +568,10 @@ struct tracer_switch_ops {
456 void *private; 568 void *private;
457 struct tracer_switch_ops *next; 569 struct tracer_switch_ops *next;
458}; 570};
459
460char *trace_find_cmdline(int pid);
461#endif /* CONFIG_CONTEXT_SWITCH_TRACER */ 571#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
462 572
573extern void trace_find_cmdline(int pid, char comm[]);
574
463#ifdef CONFIG_DYNAMIC_FTRACE 575#ifdef CONFIG_DYNAMIC_FTRACE
464extern unsigned long ftrace_update_tot_cnt; 576extern unsigned long ftrace_update_tot_cnt;
465#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func 577#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func
@@ -469,6 +581,8 @@ extern int DYN_FTRACE_TEST_NAME(void);
469#ifdef CONFIG_FTRACE_STARTUP_TEST 581#ifdef CONFIG_FTRACE_STARTUP_TEST
470extern int trace_selftest_startup_function(struct tracer *trace, 582extern int trace_selftest_startup_function(struct tracer *trace,
471 struct trace_array *tr); 583 struct trace_array *tr);
584extern int trace_selftest_startup_function_graph(struct tracer *trace,
585 struct trace_array *tr);
472extern int trace_selftest_startup_irqsoff(struct tracer *trace, 586extern int trace_selftest_startup_irqsoff(struct tracer *trace,
473 struct trace_array *tr); 587 struct trace_array *tr);
474extern int trace_selftest_startup_preemptoff(struct tracer *trace, 588extern int trace_selftest_startup_preemptoff(struct tracer *trace,
@@ -488,18 +602,11 @@ extern int trace_selftest_startup_branch(struct tracer *trace,
488#endif /* CONFIG_FTRACE_STARTUP_TEST */ 602#endif /* CONFIG_FTRACE_STARTUP_TEST */
489 603
490extern void *head_page(struct trace_array_cpu *data); 604extern void *head_page(struct trace_array_cpu *data);
491extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...); 605extern unsigned long long ns2usecs(cycle_t nsec);
492extern void trace_seq_print_cont(struct trace_seq *s,
493 struct trace_iterator *iter);
494
495extern int 606extern int
496seq_print_ip_sym(struct trace_seq *s, unsigned long ip, 607trace_vbprintk(unsigned long ip, const char *fmt, va_list args);
497 unsigned long sym_flags);
498extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
499 size_t cnt);
500extern long ns2usecs(cycle_t nsec);
501extern int 608extern int
502trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args); 609trace_vprintk(unsigned long ip, const char *fmt, va_list args);
503 610
504extern unsigned long trace_flags; 611extern unsigned long trace_flags;
505 612
@@ -580,7 +687,11 @@ enum trace_iterator_flags {
580 TRACE_ITER_ANNOTATE = 0x2000, 687 TRACE_ITER_ANNOTATE = 0x2000,
581 TRACE_ITER_USERSTACKTRACE = 0x4000, 688 TRACE_ITER_USERSTACKTRACE = 0x4000,
582 TRACE_ITER_SYM_USEROBJ = 0x8000, 689 TRACE_ITER_SYM_USEROBJ = 0x8000,
583 TRACE_ITER_PRINTK_MSGONLY = 0x10000 690 TRACE_ITER_PRINTK_MSGONLY = 0x10000,
691 TRACE_ITER_CONTEXT_INFO = 0x20000, /* Print pid/cpu/time */
692 TRACE_ITER_LATENCY_FMT = 0x40000,
693 TRACE_ITER_GLOBAL_CLK = 0x80000,
694 TRACE_ITER_SLEEP_TIME = 0x100000,
584}; 695};
585 696
586/* 697/*
@@ -601,12 +712,12 @@ extern struct tracer nop_trace;
601 * preempt_enable (after a disable), a schedule might take place 712 * preempt_enable (after a disable), a schedule might take place
602 * causing an infinite recursion. 713 * causing an infinite recursion.
603 * 714 *
604 * To prevent this, we read the need_recshed flag before 715 * To prevent this, we read the need_resched flag before
605 * disabling preemption. When we want to enable preemption we 716 * disabling preemption. When we want to enable preemption we
606 * check the flag, if it is set, then we call preempt_enable_no_resched. 717 * check the flag, if it is set, then we call preempt_enable_no_resched.
607 * Otherwise, we call preempt_enable. 718 * Otherwise, we call preempt_enable.
608 * 719 *
609 * The rational for doing the above is that if need resched is set 720 * The rational for doing the above is that if need_resched is set
610 * and we have yet to reschedule, we are either in an atomic location 721 * and we have yet to reschedule, we are either in an atomic location
611 * (where we do not need to check for scheduling) or we are inside 722 * (where we do not need to check for scheduling) or we are inside
612 * the scheduler and do not want to resched. 723 * the scheduler and do not want to resched.
@@ -627,7 +738,7 @@ static inline int ftrace_preempt_disable(void)
627 * 738 *
628 * This is a scheduler safe way to enable preemption and not miss 739 * This is a scheduler safe way to enable preemption and not miss
629 * any preemption checks. The disabled saved the state of preemption. 740 * any preemption checks. The disabled saved the state of preemption.
630 * If resched is set, then we were either inside an atomic or 741 * If resched is set, then we are either inside an atomic or
631 * are inside the scheduler (we would have already scheduled 742 * are inside the scheduler (we would have already scheduled
632 * otherwise). In this case, we do not want to call normal 743 * otherwise). In this case, we do not want to call normal
633 * preempt_enable, but preempt_enable_no_resched instead. 744 * preempt_enable, but preempt_enable_no_resched instead.
@@ -664,4 +775,118 @@ static inline void trace_branch_disable(void)
664} 775}
665#endif /* CONFIG_BRANCH_TRACER */ 776#endif /* CONFIG_BRANCH_TRACER */
666 777
778/* set ring buffers to default size if not already done so */
779int tracing_update_buffers(void);
780
781/* trace event type bit fields, not numeric */
782enum {
783 TRACE_EVENT_TYPE_PRINTF = 1,
784 TRACE_EVENT_TYPE_RAW = 2,
785};
786
787struct ftrace_event_field {
788 struct list_head link;
789 char *name;
790 char *type;
791 int offset;
792 int size;
793};
794
795struct ftrace_event_call {
796 char *name;
797 char *system;
798 struct dentry *dir;
799 int enabled;
800 int (*regfunc)(void);
801 void (*unregfunc)(void);
802 int id;
803 int (*raw_init)(void);
804 int (*show_format)(struct trace_seq *s);
805 int (*define_fields)(void);
806 struct list_head fields;
807 struct filter_pred **preds;
808
809#ifdef CONFIG_EVENT_PROFILE
810 atomic_t profile_count;
811 int (*profile_enable)(struct ftrace_event_call *);
812 void (*profile_disable)(struct ftrace_event_call *);
813#endif
814};
815
816struct event_subsystem {
817 struct list_head list;
818 const char *name;
819 struct dentry *entry;
820 struct filter_pred **preds;
821};
822
823#define events_for_each(event) \
824 for (event = __start_ftrace_events; \
825 (unsigned long)event < (unsigned long)__stop_ftrace_events; \
826 event++)
827
828#define MAX_FILTER_PRED 8
829
830struct filter_pred;
831
832typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event);
833
834struct filter_pred {
835 filter_pred_fn_t fn;
836 u64 val;
837 char *str_val;
838 int str_len;
839 char *field_name;
840 int offset;
841 int not;
842 int or;
843 int compound;
844 int clear;
845};
846
847int trace_define_field(struct ftrace_event_call *call, char *type,
848 char *name, int offset, int size);
849extern void filter_free_pred(struct filter_pred *pred);
850extern void filter_print_preds(struct filter_pred **preds,
851 struct trace_seq *s);
852extern int filter_parse(char **pbuf, struct filter_pred *pred);
853extern int filter_add_pred(struct ftrace_event_call *call,
854 struct filter_pred *pred);
855extern void filter_free_preds(struct ftrace_event_call *call);
856extern int filter_match_preds(struct ftrace_event_call *call, void *rec);
857extern void filter_free_subsystem_preds(struct event_subsystem *system);
858extern int filter_add_subsystem_pred(struct event_subsystem *system,
859 struct filter_pred *pred);
860
861void event_trace_printk(unsigned long ip, const char *fmt, ...);
862extern struct ftrace_event_call __start_ftrace_events[];
863extern struct ftrace_event_call __stop_ftrace_events[];
864
865#define for_each_event(event) \
866 for (event = __start_ftrace_events; \
867 (unsigned long)event < (unsigned long)__stop_ftrace_events; \
868 event++)
869
870extern const char *__start___trace_bprintk_fmt[];
871extern const char *__stop___trace_bprintk_fmt[];
872
873/*
874 * The double __builtin_constant_p is because gcc will give us an error
875 * if we try to allocate the static variable to fmt if it is not a
876 * constant. Even with the outer if statement optimizing out.
877 */
878#define event_trace_printk(ip, fmt, args...) \
879do { \
880 __trace_printk_check_format(fmt, ##args); \
881 tracing_record_cmdline(current); \
882 if (__builtin_constant_p(fmt)) { \
883 static const char *trace_printk_fmt \
884 __attribute__((section("__trace_printk_fmt"))) = \
885 __builtin_constant_p(fmt) ? fmt : NULL; \
886 \
887 __trace_bprintk(ip, trace_printk_fmt, ##args); \
888 } else \
889 __trace_printk(ip, fmt, ##args); \
890} while (0)
891
667#endif /* _LINUX_KERNEL_TRACE_H */ 892#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 366c8c333e13..7a30fc4c3642 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -11,6 +11,7 @@
11#include <linux/kallsyms.h> 11#include <linux/kallsyms.h>
12 12
13#include "trace.h" 13#include "trace.h"
14#include "trace_output.h"
14 15
15static struct trace_array *boot_trace; 16static struct trace_array *boot_trace;
16static bool pre_initcalls_finished; 17static bool pre_initcalls_finished;
@@ -27,13 +28,13 @@ void start_boot_trace(void)
27 28
28void enable_boot_trace(void) 29void enable_boot_trace(void)
29{ 30{
30 if (pre_initcalls_finished) 31 if (boot_trace && pre_initcalls_finished)
31 tracing_start_sched_switch_record(); 32 tracing_start_sched_switch_record();
32} 33}
33 34
34void disable_boot_trace(void) 35void disable_boot_trace(void)
35{ 36{
36 if (pre_initcalls_finished) 37 if (boot_trace && pre_initcalls_finished)
37 tracing_stop_sched_switch_record(); 38 tracing_stop_sched_switch_record();
38} 39}
39 40
@@ -42,6 +43,9 @@ static int boot_trace_init(struct trace_array *tr)
42 int cpu; 43 int cpu;
43 boot_trace = tr; 44 boot_trace = tr;
44 45
46 if (!tr)
47 return 0;
48
45 for_each_cpu(cpu, cpu_possible_mask) 49 for_each_cpu(cpu, cpu_possible_mask)
46 tracing_reset(tr, cpu); 50 tracing_reset(tr, cpu);
47 51
@@ -128,10 +132,9 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
128{ 132{
129 struct ring_buffer_event *event; 133 struct ring_buffer_event *event;
130 struct trace_boot_call *entry; 134 struct trace_boot_call *entry;
131 unsigned long irq_flags;
132 struct trace_array *tr = boot_trace; 135 struct trace_array *tr = boot_trace;
133 136
134 if (!pre_initcalls_finished) 137 if (!tr || !pre_initcalls_finished)
135 return; 138 return;
136 139
137 /* Get its name now since this function could 140 /* Get its name now since this function could
@@ -140,18 +143,13 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
140 sprint_symbol(bt->func, (unsigned long)fn); 143 sprint_symbol(bt->func, (unsigned long)fn);
141 preempt_disable(); 144 preempt_disable();
142 145
143 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), 146 event = trace_buffer_lock_reserve(tr, TRACE_BOOT_CALL,
144 &irq_flags); 147 sizeof(*entry), 0, 0);
145 if (!event) 148 if (!event)
146 goto out; 149 goto out;
147 entry = ring_buffer_event_data(event); 150 entry = ring_buffer_event_data(event);
148 tracing_generic_entry_update(&entry->ent, 0, 0);
149 entry->ent.type = TRACE_BOOT_CALL;
150 entry->boot_call = *bt; 151 entry->boot_call = *bt;
151 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 152 trace_buffer_unlock_commit(tr, event, 0, 0);
152
153 trace_wake_up();
154
155 out: 153 out:
156 preempt_enable(); 154 preempt_enable();
157} 155}
@@ -160,27 +158,21 @@ void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
160{ 158{
161 struct ring_buffer_event *event; 159 struct ring_buffer_event *event;
162 struct trace_boot_ret *entry; 160 struct trace_boot_ret *entry;
163 unsigned long irq_flags;
164 struct trace_array *tr = boot_trace; 161 struct trace_array *tr = boot_trace;
165 162
166 if (!pre_initcalls_finished) 163 if (!tr || !pre_initcalls_finished)
167 return; 164 return;
168 165
169 sprint_symbol(bt->func, (unsigned long)fn); 166 sprint_symbol(bt->func, (unsigned long)fn);
170 preempt_disable(); 167 preempt_disable();
171 168
172 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), 169 event = trace_buffer_lock_reserve(tr, TRACE_BOOT_RET,
173 &irq_flags); 170 sizeof(*entry), 0, 0);
174 if (!event) 171 if (!event)
175 goto out; 172 goto out;
176 entry = ring_buffer_event_data(event); 173 entry = ring_buffer_event_data(event);
177 tracing_generic_entry_update(&entry->ent, 0, 0);
178 entry->ent.type = TRACE_BOOT_RET;
179 entry->boot_ret = *bt; 174 entry->boot_ret = *bt;
180 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 175 trace_buffer_unlock_commit(tr, event, 0, 0);
181
182 trace_wake_up();
183
184 out: 176 out:
185 preempt_enable(); 177 preempt_enable();
186} 178}
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 6c00feb3bac7..8333715e4066 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -14,12 +14,17 @@
14#include <linux/hash.h> 14#include <linux/hash.h>
15#include <linux/fs.h> 15#include <linux/fs.h>
16#include <asm/local.h> 16#include <asm/local.h>
17
17#include "trace.h" 18#include "trace.h"
19#include "trace_stat.h"
20#include "trace_output.h"
18 21
19#ifdef CONFIG_BRANCH_TRACER 22#ifdef CONFIG_BRANCH_TRACER
20 23
24static struct tracer branch_trace;
21static int branch_tracing_enabled __read_mostly; 25static int branch_tracing_enabled __read_mostly;
22static DEFINE_MUTEX(branch_tracing_mutex); 26static DEFINE_MUTEX(branch_tracing_mutex);
27
23static struct trace_array *branch_tracer; 28static struct trace_array *branch_tracer;
24 29
25static void 30static void
@@ -28,7 +33,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
28 struct trace_array *tr = branch_tracer; 33 struct trace_array *tr = branch_tracer;
29 struct ring_buffer_event *event; 34 struct ring_buffer_event *event;
30 struct trace_branch *entry; 35 struct trace_branch *entry;
31 unsigned long flags, irq_flags; 36 unsigned long flags;
32 int cpu, pc; 37 int cpu, pc;
33 const char *p; 38 const char *p;
34 39
@@ -47,15 +52,13 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
47 if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) 52 if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
48 goto out; 53 goto out;
49 54
50 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), 55 pc = preempt_count();
51 &irq_flags); 56 event = trace_buffer_lock_reserve(tr, TRACE_BRANCH,
57 sizeof(*entry), flags, pc);
52 if (!event) 58 if (!event)
53 goto out; 59 goto out;
54 60
55 pc = preempt_count();
56 entry = ring_buffer_event_data(event); 61 entry = ring_buffer_event_data(event);
57 tracing_generic_entry_update(&entry->ent, flags, pc);
58 entry->ent.type = TRACE_BRANCH;
59 62
60 /* Strip off the path, only save the file */ 63 /* Strip off the path, only save the file */
61 p = f->file + strlen(f->file); 64 p = f->file + strlen(f->file);
@@ -70,7 +73,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
70 entry->line = f->line; 73 entry->line = f->line;
71 entry->correct = val == expect; 74 entry->correct = val == expect;
72 75
73 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 76 ring_buffer_unlock_commit(tr->buffer, event);
74 77
75 out: 78 out:
76 atomic_dec(&tr->data[cpu]->disabled); 79 atomic_dec(&tr->data[cpu]->disabled);
@@ -88,8 +91,6 @@ void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
88 91
89int enable_branch_tracing(struct trace_array *tr) 92int enable_branch_tracing(struct trace_array *tr)
90{ 93{
91 int ret = 0;
92
93 mutex_lock(&branch_tracing_mutex); 94 mutex_lock(&branch_tracing_mutex);
94 branch_tracer = tr; 95 branch_tracer = tr;
95 /* 96 /*
@@ -100,7 +101,7 @@ int enable_branch_tracing(struct trace_array *tr)
100 branch_tracing_enabled++; 101 branch_tracing_enabled++;
101 mutex_unlock(&branch_tracing_mutex); 102 mutex_unlock(&branch_tracing_mutex);
102 103
103 return ret; 104 return 0;
104} 105}
105 106
106void disable_branch_tracing(void) 107void disable_branch_tracing(void)
@@ -128,11 +129,6 @@ static void stop_branch_trace(struct trace_array *tr)
128 129
129static int branch_trace_init(struct trace_array *tr) 130static int branch_trace_init(struct trace_array *tr)
130{ 131{
131 int cpu;
132
133 for_each_online_cpu(cpu)
134 tracing_reset(tr, cpu);
135
136 start_branch_trace(tr); 132 start_branch_trace(tr);
137 return 0; 133 return 0;
138} 134}
@@ -142,22 +138,61 @@ static void branch_trace_reset(struct trace_array *tr)
142 stop_branch_trace(tr); 138 stop_branch_trace(tr);
143} 139}
144 140
145struct tracer branch_trace __read_mostly = 141static enum print_line_t trace_branch_print(struct trace_iterator *iter,
142 int flags)
143{
144 struct trace_branch *field;
145
146 trace_assign_type(field, iter->ent);
147
148 if (trace_seq_printf(&iter->seq, "[%s] %s:%s:%d\n",
149 field->correct ? " ok " : " MISS ",
150 field->func,
151 field->file,
152 field->line))
153 return TRACE_TYPE_PARTIAL_LINE;
154
155 return TRACE_TYPE_HANDLED;
156}
157
158static void branch_print_header(struct seq_file *s)
159{
160 seq_puts(s, "# TASK-PID CPU# TIMESTAMP CORRECT"
161 " FUNC:FILE:LINE\n");
162 seq_puts(s, "# | | | | | "
163 " |\n");
164}
165
166static struct trace_event trace_branch_event = {
167 .type = TRACE_BRANCH,
168 .trace = trace_branch_print,
169};
170
171static struct tracer branch_trace __read_mostly =
146{ 172{
147 .name = "branch", 173 .name = "branch",
148 .init = branch_trace_init, 174 .init = branch_trace_init,
149 .reset = branch_trace_reset, 175 .reset = branch_trace_reset,
150#ifdef CONFIG_FTRACE_SELFTEST 176#ifdef CONFIG_FTRACE_SELFTEST
151 .selftest = trace_selftest_startup_branch, 177 .selftest = trace_selftest_startup_branch,
152#endif 178#endif /* CONFIG_FTRACE_SELFTEST */
179 .print_header = branch_print_header,
153}; 180};
154 181
155__init static int init_branch_trace(void) 182__init static int init_branch_tracer(void)
156{ 183{
184 int ret;
185
186 ret = register_ftrace_event(&trace_branch_event);
187 if (!ret) {
188 printk(KERN_WARNING "Warning: could not register "
189 "branch events\n");
190 return 1;
191 }
157 return register_tracer(&branch_trace); 192 return register_tracer(&branch_trace);
158} 193}
194device_initcall(init_branch_tracer);
159 195
160device_initcall(init_branch_trace);
161#else 196#else
162static inline 197static inline
163void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) 198void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
@@ -183,66 +218,39 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect)
183} 218}
184EXPORT_SYMBOL(ftrace_likely_update); 219EXPORT_SYMBOL(ftrace_likely_update);
185 220
186struct ftrace_pointer { 221extern unsigned long __start_annotated_branch_profile[];
187 void *start; 222extern unsigned long __stop_annotated_branch_profile[];
188 void *stop;
189 int hit;
190};
191 223
192static void * 224static int annotated_branch_stat_headers(struct seq_file *m)
193t_next(struct seq_file *m, void *v, loff_t *pos)
194{ 225{
195 const struct ftrace_pointer *f = m->private; 226 seq_printf(m, " correct incorrect %% ");
196 struct ftrace_branch_data *p = v; 227 seq_printf(m, " Function "
197 228 " File Line\n"
198 (*pos)++; 229 " ------- --------- - "
199 230 " -------- "
200 if (v == (void *)1) 231 " ---- ----\n");
201 return f->start; 232 return 0;
202
203 ++p;
204
205 if ((void *)p >= (void *)f->stop)
206 return NULL;
207
208 return p;
209} 233}
210 234
211static void *t_start(struct seq_file *m, loff_t *pos) 235static inline long get_incorrect_percent(struct ftrace_branch_data *p)
212{ 236{
213 void *t = (void *)1; 237 long percent;
214 loff_t l = 0;
215
216 for (; t && l < *pos; t = t_next(m, t, &l))
217 ;
218 238
219 return t; 239 if (p->correct) {
220} 240 percent = p->incorrect * 100;
241 percent /= p->correct + p->incorrect;
242 } else
243 percent = p->incorrect ? 100 : -1;
221 244
222static void t_stop(struct seq_file *m, void *p) 245 return percent;
223{
224} 246}
225 247
226static int t_show(struct seq_file *m, void *v) 248static int branch_stat_show(struct seq_file *m, void *v)
227{ 249{
228 const struct ftrace_pointer *fp = m->private;
229 struct ftrace_branch_data *p = v; 250 struct ftrace_branch_data *p = v;
230 const char *f; 251 const char *f;
231 long percent; 252 long percent;
232 253
233 if (v == (void *)1) {
234 if (fp->hit)
235 seq_printf(m, " miss hit %% ");
236 else
237 seq_printf(m, " correct incorrect %% ");
238 seq_printf(m, " Function "
239 " File Line\n"
240 " ------- --------- - "
241 " -------- "
242 " ---- ----\n");
243 return 0;
244 }
245
246 /* Only print the file, not the path */ 254 /* Only print the file, not the path */
247 f = p->file + strlen(p->file); 255 f = p->file + strlen(p->file);
248 while (f >= p->file && *f != '/') 256 while (f >= p->file && *f != '/')
@@ -252,11 +260,7 @@ static int t_show(struct seq_file *m, void *v)
252 /* 260 /*
253 * The miss is overlayed on correct, and hit on incorrect. 261 * The miss is overlayed on correct, and hit on incorrect.
254 */ 262 */
255 if (p->correct) { 263 percent = get_incorrect_percent(p);
256 percent = p->incorrect * 100;
257 percent /= p->correct + p->incorrect;
258 } else
259 percent = p->incorrect ? 100 : -1;
260 264
261 seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect); 265 seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect);
262 if (percent < 0) 266 if (percent < 0)
@@ -267,76 +271,118 @@ static int t_show(struct seq_file *m, void *v)
267 return 0; 271 return 0;
268} 272}
269 273
270static struct seq_operations tracing_likely_seq_ops = { 274static void *annotated_branch_stat_start(void)
271 .start = t_start, 275{
272 .next = t_next, 276 return __start_annotated_branch_profile;
273 .stop = t_stop, 277}
274 .show = t_show, 278
279static void *
280annotated_branch_stat_next(void *v, int idx)
281{
282 struct ftrace_branch_data *p = v;
283
284 ++p;
285
286 if ((void *)p >= (void *)__stop_annotated_branch_profile)
287 return NULL;
288
289 return p;
290}
291
292static int annotated_branch_stat_cmp(void *p1, void *p2)
293{
294 struct ftrace_branch_data *a = p1;
295 struct ftrace_branch_data *b = p2;
296
297 long percent_a, percent_b;
298
299 percent_a = get_incorrect_percent(a);
300 percent_b = get_incorrect_percent(b);
301
302 if (percent_a < percent_b)
303 return -1;
304 if (percent_a > percent_b)
305 return 1;
306 else
307 return 0;
308}
309
310static struct tracer_stat annotated_branch_stats = {
311 .name = "branch_annotated",
312 .stat_start = annotated_branch_stat_start,
313 .stat_next = annotated_branch_stat_next,
314 .stat_cmp = annotated_branch_stat_cmp,
315 .stat_headers = annotated_branch_stat_headers,
316 .stat_show = branch_stat_show
275}; 317};
276 318
277static int tracing_branch_open(struct inode *inode, struct file *file) 319__init static int init_annotated_branch_stats(void)
278{ 320{
279 int ret; 321 int ret;
280 322
281 ret = seq_open(file, &tracing_likely_seq_ops); 323 ret = register_stat_tracer(&annotated_branch_stats);
282 if (!ret) { 324 if (!ret) {
283 struct seq_file *m = file->private_data; 325 printk(KERN_WARNING "Warning: could not register "
284 m->private = (void *)inode->i_private; 326 "annotated branches stats\n");
327 return 1;
285 } 328 }
286 329 return 0;
287 return ret;
288} 330}
289 331fs_initcall(init_annotated_branch_stats);
290static const struct file_operations tracing_branch_fops = {
291 .open = tracing_branch_open,
292 .read = seq_read,
293 .llseek = seq_lseek,
294};
295 332
296#ifdef CONFIG_PROFILE_ALL_BRANCHES 333#ifdef CONFIG_PROFILE_ALL_BRANCHES
334
297extern unsigned long __start_branch_profile[]; 335extern unsigned long __start_branch_profile[];
298extern unsigned long __stop_branch_profile[]; 336extern unsigned long __stop_branch_profile[];
299 337
300static const struct ftrace_pointer ftrace_branch_pos = { 338static int all_branch_stat_headers(struct seq_file *m)
301 .start = __start_branch_profile, 339{
302 .stop = __stop_branch_profile, 340 seq_printf(m, " miss hit %% ");
303 .hit = 1, 341 seq_printf(m, " Function "
304}; 342 " File Line\n"
343 " ------- --------- - "
344 " -------- "
345 " ---- ----\n");
346 return 0;
347}
305 348
306#endif /* CONFIG_PROFILE_ALL_BRANCHES */ 349static void *all_branch_stat_start(void)
350{
351 return __start_branch_profile;
352}
307 353
308extern unsigned long __start_annotated_branch_profile[]; 354static void *
309extern unsigned long __stop_annotated_branch_profile[]; 355all_branch_stat_next(void *v, int idx)
356{
357 struct ftrace_branch_data *p = v;
310 358
311static const struct ftrace_pointer ftrace_annotated_branch_pos = { 359 ++p;
312 .start = __start_annotated_branch_profile,
313 .stop = __stop_annotated_branch_profile,
314};
315 360
316static __init int ftrace_branch_init(void) 361 if ((void *)p >= (void *)__stop_branch_profile)
317{ 362 return NULL;
318 struct dentry *d_tracer;
319 struct dentry *entry;
320 363
321 d_tracer = tracing_init_dentry(); 364 return p;
365}
322 366
323 entry = debugfs_create_file("profile_annotated_branch", 0444, d_tracer, 367static struct tracer_stat all_branch_stats = {
324 (void *)&ftrace_annotated_branch_pos, 368 .name = "branch_all",
325 &tracing_branch_fops); 369 .stat_start = all_branch_stat_start,
326 if (!entry) 370 .stat_next = all_branch_stat_next,
327 pr_warning("Could not create debugfs " 371 .stat_headers = all_branch_stat_headers,
328 "'profile_annotatet_branch' entry\n"); 372 .stat_show = branch_stat_show
373};
329 374
330#ifdef CONFIG_PROFILE_ALL_BRANCHES 375__init static int all_annotated_branch_stats(void)
331 entry = debugfs_create_file("profile_branch", 0444, d_tracer, 376{
332 (void *)&ftrace_branch_pos, 377 int ret;
333 &tracing_branch_fops);
334 if (!entry)
335 pr_warning("Could not create debugfs"
336 " 'profile_branch' entry\n");
337#endif
338 378
379 ret = register_stat_tracer(&all_branch_stats);
380 if (!ret) {
381 printk(KERN_WARNING "Warning: could not register "
382 "all branches stats\n");
383 return 1;
384 }
339 return 0; 385 return 0;
340} 386}
341 387fs_initcall(all_annotated_branch_stats);
342device_initcall(ftrace_branch_init); 388#endif /* CONFIG_PROFILE_ALL_BRANCHES */
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
new file mode 100644
index 000000000000..b588fd81f7f9
--- /dev/null
+++ b/kernel/trace/trace_clock.c
@@ -0,0 +1,109 @@
1/*
2 * tracing clocks
3 *
4 * Copyright (C) 2009 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
5 *
6 * Implements 3 trace clock variants, with differing scalability/precision
7 * tradeoffs:
8 *
9 * - local: CPU-local trace clock
10 * - medium: scalable global clock with some jitter
11 * - global: globally monotonic, serialized clock
12 *
13 * Tracer plugins will chose a default from these clocks.
14 */
15#include <linux/spinlock.h>
16#include <linux/hardirq.h>
17#include <linux/module.h>
18#include <linux/percpu.h>
19#include <linux/sched.h>
20#include <linux/ktime.h>
21#include <linux/trace_clock.h>
22
23/*
24 * trace_clock_local(): the simplest and least coherent tracing clock.
25 *
26 * Useful for tracing that does not cross to other CPUs nor
27 * does it go through idle events.
28 */
29u64 notrace trace_clock_local(void)
30{
31 unsigned long flags;
32 u64 clock;
33
34 /*
35 * sched_clock() is an architecture implemented, fast, scalable,
36 * lockless clock. It is not guaranteed to be coherent across
37 * CPUs, nor across CPU idle events.
38 */
39 raw_local_irq_save(flags);
40 clock = sched_clock();
41 raw_local_irq_restore(flags);
42
43 return clock;
44}
45
46/*
47 * trace_clock(): 'inbetween' trace clock. Not completely serialized,
48 * but not completely incorrect when crossing CPUs either.
49 *
50 * This is based on cpu_clock(), which will allow at most ~1 jiffy of
51 * jitter between CPUs. So it's a pretty scalable clock, but there
52 * can be offsets in the trace data.
53 */
54u64 notrace trace_clock(void)
55{
56 return cpu_clock(raw_smp_processor_id());
57}
58
59
60/*
61 * trace_clock_global(): special globally coherent trace clock
62 *
63 * It has higher overhead than the other trace clocks but is still
64 * an order of magnitude faster than GTOD derived hardware clocks.
65 *
66 * Used by plugins that need globally coherent timestamps.
67 */
68
69static u64 prev_trace_clock_time;
70
71static raw_spinlock_t trace_clock_lock ____cacheline_aligned_in_smp =
72 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
73
74u64 notrace trace_clock_global(void)
75{
76 unsigned long flags;
77 int this_cpu;
78 u64 now;
79
80 raw_local_irq_save(flags);
81
82 this_cpu = raw_smp_processor_id();
83 now = cpu_clock(this_cpu);
84 /*
85 * If in an NMI context then dont risk lockups and return the
86 * cpu_clock() time:
87 */
88 if (unlikely(in_nmi()))
89 goto out;
90
91 __raw_spin_lock(&trace_clock_lock);
92
93 /*
94 * TODO: if this happens often then maybe we should reset
95 * my_scd->clock to prev_trace_clock_time+1, to make sure
96 * we start ticking with the local clock from now on?
97 */
98 if ((s64)(now - prev_trace_clock_time) < 0)
99 now = prev_trace_clock_time + 1;
100
101 prev_trace_clock_time = now;
102
103 __raw_spin_unlock(&trace_clock_lock);
104
105 out:
106 raw_local_irq_restore(flags);
107
108 return now;
109}
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
new file mode 100644
index 000000000000..22cba9970776
--- /dev/null
+++ b/kernel/trace/trace_event_profile.c
@@ -0,0 +1,31 @@
1/*
2 * trace event based perf counter profiling
3 *
4 * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com>
5 *
6 */
7
8#include "trace.h"
9
10int ftrace_profile_enable(int event_id)
11{
12 struct ftrace_event_call *event;
13
14 for_each_event(event) {
15 if (event->id == event_id)
16 return event->profile_enable(event);
17 }
18
19 return -EINVAL;
20}
21
22void ftrace_profile_disable(int event_id)
23{
24 struct ftrace_event_call *event;
25
26 for_each_event(event) {
27 if (event->id == event_id)
28 return event->profile_disable(event);
29 }
30}
31
diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
new file mode 100644
index 000000000000..fd78bee71dd7
--- /dev/null
+++ b/kernel/trace/trace_event_types.h
@@ -0,0 +1,173 @@
1#undef TRACE_SYSTEM
2#define TRACE_SYSTEM ftrace
3
4/*
5 * We cheat and use the proto type field as the ID
6 * and args as the entry type (minus 'struct')
7 */
8TRACE_EVENT_FORMAT(function, TRACE_FN, ftrace_entry, ignore,
9 TRACE_STRUCT(
10 TRACE_FIELD(unsigned long, ip, ip)
11 TRACE_FIELD(unsigned long, parent_ip, parent_ip)
12 ),
13 TP_RAW_FMT(" %lx <-- %lx")
14);
15
16TRACE_EVENT_FORMAT(funcgraph_entry, TRACE_GRAPH_ENT,
17 ftrace_graph_ent_entry, ignore,
18 TRACE_STRUCT(
19 TRACE_FIELD(unsigned long, graph_ent.func, func)
20 TRACE_FIELD(int, graph_ent.depth, depth)
21 ),
22 TP_RAW_FMT("--> %lx (%d)")
23);
24
25TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET,
26 ftrace_graph_ret_entry, ignore,
27 TRACE_STRUCT(
28 TRACE_FIELD(unsigned long, ret.func, func)
29 TRACE_FIELD(int, ret.depth, depth)
30 ),
31 TP_RAW_FMT("<-- %lx (%d)")
32);
33
34TRACE_EVENT_FORMAT(wakeup, TRACE_WAKE, ctx_switch_entry, ignore,
35 TRACE_STRUCT(
36 TRACE_FIELD(unsigned int, prev_pid, prev_pid)
37 TRACE_FIELD(unsigned char, prev_prio, prev_prio)
38 TRACE_FIELD(unsigned char, prev_state, prev_state)
39 TRACE_FIELD(unsigned int, next_pid, next_pid)
40 TRACE_FIELD(unsigned char, next_prio, next_prio)
41 TRACE_FIELD(unsigned char, next_state, next_state)
42 TRACE_FIELD(unsigned int, next_cpu, next_cpu)
43 ),
44 TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]")
45);
46
47TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore,
48 TRACE_STRUCT(
49 TRACE_FIELD(unsigned int, prev_pid, prev_pid)
50 TRACE_FIELD(unsigned char, prev_prio, prev_prio)
51 TRACE_FIELD(unsigned char, prev_state, prev_state)
52 TRACE_FIELD(unsigned int, next_pid, next_pid)
53 TRACE_FIELD(unsigned char, next_prio, next_prio)
54 TRACE_FIELD(unsigned char, next_state, next_state)
55 TRACE_FIELD(unsigned int, next_cpu, next_cpu)
56 ),
57 TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]")
58);
59
60TRACE_EVENT_FORMAT(special, TRACE_SPECIAL, special_entry, ignore,
61 TRACE_STRUCT(
62 TRACE_FIELD(unsigned long, arg1, arg1)
63 TRACE_FIELD(unsigned long, arg2, arg2)
64 TRACE_FIELD(unsigned long, arg3, arg3)
65 ),
66 TP_RAW_FMT("(%08lx) (%08lx) (%08lx)")
67);
68
69/*
70 * Stack-trace entry:
71 */
72
73/* #define FTRACE_STACK_ENTRIES 8 */
74
75TRACE_EVENT_FORMAT(kernel_stack, TRACE_STACK, stack_entry, ignore,
76 TRACE_STRUCT(
77 TRACE_FIELD(unsigned long, caller[0], stack0)
78 TRACE_FIELD(unsigned long, caller[1], stack1)
79 TRACE_FIELD(unsigned long, caller[2], stack2)
80 TRACE_FIELD(unsigned long, caller[3], stack3)
81 TRACE_FIELD(unsigned long, caller[4], stack4)
82 TRACE_FIELD(unsigned long, caller[5], stack5)
83 TRACE_FIELD(unsigned long, caller[6], stack6)
84 TRACE_FIELD(unsigned long, caller[7], stack7)
85 ),
86 TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
87 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n")
88);
89
90TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore,
91 TRACE_STRUCT(
92 TRACE_FIELD(unsigned long, caller[0], stack0)
93 TRACE_FIELD(unsigned long, caller[1], stack1)
94 TRACE_FIELD(unsigned long, caller[2], stack2)
95 TRACE_FIELD(unsigned long, caller[3], stack3)
96 TRACE_FIELD(unsigned long, caller[4], stack4)
97 TRACE_FIELD(unsigned long, caller[5], stack5)
98 TRACE_FIELD(unsigned long, caller[6], stack6)
99 TRACE_FIELD(unsigned long, caller[7], stack7)
100 ),
101 TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
102 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n")
103);
104
105TRACE_EVENT_FORMAT(bprint, TRACE_BPRINT, bprint_entry, ignore,
106 TRACE_STRUCT(
107 TRACE_FIELD(unsigned long, ip, ip)
108 TRACE_FIELD(char *, fmt, fmt)
109 TRACE_FIELD_ZERO_CHAR(buf)
110 ),
111 TP_RAW_FMT("%08lx (%d) fmt:%p %s")
112);
113
114TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore,
115 TRACE_STRUCT(
116 TRACE_FIELD(unsigned long, ip, ip)
117 TRACE_FIELD_ZERO_CHAR(buf)
118 ),
119 TP_RAW_FMT("%08lx (%d) fmt:%p %s")
120);
121
122TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore,
123 TRACE_STRUCT(
124 TRACE_FIELD(unsigned int, line, line)
125 TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func, func)
126 TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, file)
127 TRACE_FIELD(char, correct, correct)
128 ),
129 TP_RAW_FMT("%u:%s:%s (%u)")
130);
131
132TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore,
133 TRACE_STRUCT(
134 TRACE_FIELD(u64, from, from)
135 TRACE_FIELD(u64, to, to)
136 ),
137 TP_RAW_FMT("from: %llx to: %llx")
138);
139
140TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore,
141 TRACE_STRUCT(
142 TRACE_FIELD(ktime_t, state_data.stamp, stamp)
143 TRACE_FIELD(ktime_t, state_data.end, end)
144 TRACE_FIELD(int, state_data.type, type)
145 TRACE_FIELD(int, state_data.state, state)
146 ),
147 TP_RAW_FMT("%llx->%llx type:%u state:%u")
148);
149
150TRACE_EVENT_FORMAT(kmem_alloc, TRACE_KMEM_ALLOC, kmemtrace_alloc_entry, ignore,
151 TRACE_STRUCT(
152 TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id)
153 TRACE_FIELD(unsigned long, call_site, call_site)
154 TRACE_FIELD(const void *, ptr, ptr)
155 TRACE_FIELD(size_t, bytes_req, bytes_req)
156 TRACE_FIELD(size_t, bytes_alloc, bytes_alloc)
157 TRACE_FIELD(gfp_t, gfp_flags, gfp_flags)
158 TRACE_FIELD(int, node, node)
159 ),
160 TP_RAW_FMT("type:%u call_site:%lx ptr:%p req:%lu alloc:%lu"
161 " flags:%x node:%d")
162);
163
164TRACE_EVENT_FORMAT(kmem_free, TRACE_KMEM_FREE, kmemtrace_free_entry, ignore,
165 TRACE_STRUCT(
166 TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id)
167 TRACE_FIELD(unsigned long, call_site, call_site)
168 TRACE_FIELD(const void *, ptr, ptr)
169 ),
170 TP_RAW_FMT("type:%u call_site:%lx ptr:%p")
171);
172
173#undef TRACE_SYSTEM
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
new file mode 100644
index 000000000000..576f4fa2af0d
--- /dev/null
+++ b/kernel/trace/trace_events.c
@@ -0,0 +1,828 @@
1/*
2 * event tracer
3 *
4 * Copyright (C) 2008 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
5 *
6 * - Added format output of fields of the trace point.
7 * This was based off of work by Tom Zanussi <tzanussi@gmail.com>.
8 *
9 */
10
11#include <linux/debugfs.h>
12#include <linux/uaccess.h>
13#include <linux/module.h>
14#include <linux/ctype.h>
15
16#include "trace_output.h"
17
18#define TRACE_SYSTEM "TRACE_SYSTEM"
19
20static DEFINE_MUTEX(event_mutex);
21
22int trace_define_field(struct ftrace_event_call *call, char *type,
23 char *name, int offset, int size)
24{
25 struct ftrace_event_field *field;
26
27 field = kzalloc(sizeof(*field), GFP_KERNEL);
28 if (!field)
29 goto err;
30
31 field->name = kstrdup(name, GFP_KERNEL);
32 if (!field->name)
33 goto err;
34
35 field->type = kstrdup(type, GFP_KERNEL);
36 if (!field->type)
37 goto err;
38
39 field->offset = offset;
40 field->size = size;
41 list_add(&field->link, &call->fields);
42
43 return 0;
44
45err:
46 if (field) {
47 kfree(field->name);
48 kfree(field->type);
49 }
50 kfree(field);
51
52 return -ENOMEM;
53}
54
55static void ftrace_clear_events(void)
56{
57 struct ftrace_event_call *call = (void *)__start_ftrace_events;
58
59
60 while ((unsigned long)call < (unsigned long)__stop_ftrace_events) {
61
62 if (call->enabled) {
63 call->enabled = 0;
64 call->unregfunc();
65 }
66 call++;
67 }
68}
69
70static void ftrace_event_enable_disable(struct ftrace_event_call *call,
71 int enable)
72{
73
74 switch (enable) {
75 case 0:
76 if (call->enabled) {
77 call->enabled = 0;
78 call->unregfunc();
79 }
80 break;
81 case 1:
82 if (!call->enabled) {
83 call->enabled = 1;
84 call->regfunc();
85 }
86 break;
87 }
88}
89
90static int ftrace_set_clr_event(char *buf, int set)
91{
92 struct ftrace_event_call *call = __start_ftrace_events;
93 char *event = NULL, *sub = NULL, *match;
94 int ret = -EINVAL;
95
96 /*
97 * The buf format can be <subsystem>:<event-name>
98 * *:<event-name> means any event by that name.
99 * :<event-name> is the same.
100 *
101 * <subsystem>:* means all events in that subsystem
102 * <subsystem>: means the same.
103 *
104 * <name> (no ':') means all events in a subsystem with
105 * the name <name> or any event that matches <name>
106 */
107
108 match = strsep(&buf, ":");
109 if (buf) {
110 sub = match;
111 event = buf;
112 match = NULL;
113
114 if (!strlen(sub) || strcmp(sub, "*") == 0)
115 sub = NULL;
116 if (!strlen(event) || strcmp(event, "*") == 0)
117 event = NULL;
118 }
119
120 mutex_lock(&event_mutex);
121 for_each_event(call) {
122
123 if (!call->name || !call->regfunc)
124 continue;
125
126 if (match &&
127 strcmp(match, call->name) != 0 &&
128 strcmp(match, call->system) != 0)
129 continue;
130
131 if (sub && strcmp(sub, call->system) != 0)
132 continue;
133
134 if (event && strcmp(event, call->name) != 0)
135 continue;
136
137 ftrace_event_enable_disable(call, set);
138
139 ret = 0;
140 }
141 mutex_unlock(&event_mutex);
142
143 return ret;
144}
145
146/* 128 should be much more than enough */
147#define EVENT_BUF_SIZE 127
148
149static ssize_t
150ftrace_event_write(struct file *file, const char __user *ubuf,
151 size_t cnt, loff_t *ppos)
152{
153 size_t read = 0;
154 int i, set = 1;
155 ssize_t ret;
156 char *buf;
157 char ch;
158
159 if (!cnt || cnt < 0)
160 return 0;
161
162 ret = tracing_update_buffers();
163 if (ret < 0)
164 return ret;
165
166 ret = get_user(ch, ubuf++);
167 if (ret)
168 return ret;
169 read++;
170 cnt--;
171
172 /* skip white space */
173 while (cnt && isspace(ch)) {
174 ret = get_user(ch, ubuf++);
175 if (ret)
176 return ret;
177 read++;
178 cnt--;
179 }
180
181 /* Only white space found? */
182 if (isspace(ch)) {
183 file->f_pos += read;
184 ret = read;
185 return ret;
186 }
187
188 buf = kmalloc(EVENT_BUF_SIZE+1, GFP_KERNEL);
189 if (!buf)
190 return -ENOMEM;
191
192 if (cnt > EVENT_BUF_SIZE)
193 cnt = EVENT_BUF_SIZE;
194
195 i = 0;
196 while (cnt && !isspace(ch)) {
197 if (!i && ch == '!')
198 set = 0;
199 else
200 buf[i++] = ch;
201
202 ret = get_user(ch, ubuf++);
203 if (ret)
204 goto out_free;
205 read++;
206 cnt--;
207 }
208 buf[i] = 0;
209
210 file->f_pos += read;
211
212 ret = ftrace_set_clr_event(buf, set);
213 if (ret)
214 goto out_free;
215
216 ret = read;
217
218 out_free:
219 kfree(buf);
220
221 return ret;
222}
223
224static void *
225t_next(struct seq_file *m, void *v, loff_t *pos)
226{
227 struct ftrace_event_call *call = m->private;
228 struct ftrace_event_call *next = call;
229
230 (*pos)++;
231
232 for (;;) {
233 if ((unsigned long)call >= (unsigned long)__stop_ftrace_events)
234 return NULL;
235
236 /*
237 * The ftrace subsystem is for showing formats only.
238 * They can not be enabled or disabled via the event files.
239 */
240 if (call->regfunc)
241 break;
242
243 call++;
244 next = call;
245 }
246
247 m->private = ++next;
248
249 return call;
250}
251
252static void *t_start(struct seq_file *m, loff_t *pos)
253{
254 return t_next(m, NULL, pos);
255}
256
257static void *
258s_next(struct seq_file *m, void *v, loff_t *pos)
259{
260 struct ftrace_event_call *call = m->private;
261 struct ftrace_event_call *next;
262
263 (*pos)++;
264
265 retry:
266 if ((unsigned long)call >= (unsigned long)__stop_ftrace_events)
267 return NULL;
268
269 if (!call->enabled) {
270 call++;
271 goto retry;
272 }
273
274 next = call;
275 m->private = ++next;
276
277 return call;
278}
279
280static void *s_start(struct seq_file *m, loff_t *pos)
281{
282 return s_next(m, NULL, pos);
283}
284
285static int t_show(struct seq_file *m, void *v)
286{
287 struct ftrace_event_call *call = v;
288
289 if (strcmp(call->system, TRACE_SYSTEM) != 0)
290 seq_printf(m, "%s:", call->system);
291 seq_printf(m, "%s\n", call->name);
292
293 return 0;
294}
295
296static void t_stop(struct seq_file *m, void *p)
297{
298}
299
300static int
301ftrace_event_seq_open(struct inode *inode, struct file *file)
302{
303 int ret;
304 const struct seq_operations *seq_ops;
305
306 if ((file->f_mode & FMODE_WRITE) &&
307 !(file->f_flags & O_APPEND))
308 ftrace_clear_events();
309
310 seq_ops = inode->i_private;
311 ret = seq_open(file, seq_ops);
312 if (!ret) {
313 struct seq_file *m = file->private_data;
314
315 m->private = __start_ftrace_events;
316 }
317 return ret;
318}
319
320static ssize_t
321event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
322 loff_t *ppos)
323{
324 struct ftrace_event_call *call = filp->private_data;
325 char *buf;
326
327 if (call->enabled)
328 buf = "1\n";
329 else
330 buf = "0\n";
331
332 return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2);
333}
334
335static ssize_t
336event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
337 loff_t *ppos)
338{
339 struct ftrace_event_call *call = filp->private_data;
340 char buf[64];
341 unsigned long val;
342 int ret;
343
344 if (cnt >= sizeof(buf))
345 return -EINVAL;
346
347 if (copy_from_user(&buf, ubuf, cnt))
348 return -EFAULT;
349
350 buf[cnt] = 0;
351
352 ret = strict_strtoul(buf, 10, &val);
353 if (ret < 0)
354 return ret;
355
356 ret = tracing_update_buffers();
357 if (ret < 0)
358 return ret;
359
360 switch (val) {
361 case 0:
362 case 1:
363 mutex_lock(&event_mutex);
364 ftrace_event_enable_disable(call, val);
365 mutex_unlock(&event_mutex);
366 break;
367
368 default:
369 return -EINVAL;
370 }
371
372 *ppos += cnt;
373
374 return cnt;
375}
376
377#undef FIELD
378#define FIELD(type, name) \
379 #type, "common_" #name, offsetof(typeof(field), name), \
380 sizeof(field.name)
381
382static int trace_write_header(struct trace_seq *s)
383{
384 struct trace_entry field;
385
386 /* struct trace_entry */
387 return trace_seq_printf(s,
388 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
389 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
390 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
391 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
392 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
393 "\n",
394 FIELD(unsigned char, type),
395 FIELD(unsigned char, flags),
396 FIELD(unsigned char, preempt_count),
397 FIELD(int, pid),
398 FIELD(int, tgid));
399}
400
401static ssize_t
402event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
403 loff_t *ppos)
404{
405 struct ftrace_event_call *call = filp->private_data;
406 struct trace_seq *s;
407 char *buf;
408 int r;
409
410 if (*ppos)
411 return 0;
412
413 s = kmalloc(sizeof(*s), GFP_KERNEL);
414 if (!s)
415 return -ENOMEM;
416
417 trace_seq_init(s);
418
419 /* If any of the first writes fail, so will the show_format. */
420
421 trace_seq_printf(s, "name: %s\n", call->name);
422 trace_seq_printf(s, "ID: %d\n", call->id);
423 trace_seq_printf(s, "format:\n");
424 trace_write_header(s);
425
426 r = call->show_format(s);
427 if (!r) {
428 /*
429 * ug! The format output is bigger than a PAGE!!
430 */
431 buf = "FORMAT TOO BIG\n";
432 r = simple_read_from_buffer(ubuf, cnt, ppos,
433 buf, strlen(buf));
434 goto out;
435 }
436
437 r = simple_read_from_buffer(ubuf, cnt, ppos,
438 s->buffer, s->len);
439 out:
440 kfree(s);
441 return r;
442}
443
444static ssize_t
445event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
446{
447 struct ftrace_event_call *call = filp->private_data;
448 struct trace_seq *s;
449 int r;
450
451 if (*ppos)
452 return 0;
453
454 s = kmalloc(sizeof(*s), GFP_KERNEL);
455 if (!s)
456 return -ENOMEM;
457
458 trace_seq_init(s);
459 trace_seq_printf(s, "%d\n", call->id);
460
461 r = simple_read_from_buffer(ubuf, cnt, ppos,
462 s->buffer, s->len);
463 kfree(s);
464 return r;
465}
466
467static ssize_t
468event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
469 loff_t *ppos)
470{
471 struct ftrace_event_call *call = filp->private_data;
472 struct trace_seq *s;
473 int r;
474
475 if (*ppos)
476 return 0;
477
478 s = kmalloc(sizeof(*s), GFP_KERNEL);
479 if (!s)
480 return -ENOMEM;
481
482 trace_seq_init(s);
483
484 filter_print_preds(call->preds, s);
485 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
486
487 kfree(s);
488
489 return r;
490}
491
492static ssize_t
493event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
494 loff_t *ppos)
495{
496 struct ftrace_event_call *call = filp->private_data;
497 char buf[64], *pbuf = buf;
498 struct filter_pred *pred;
499 int err;
500
501 if (cnt >= sizeof(buf))
502 return -EINVAL;
503
504 if (copy_from_user(&buf, ubuf, cnt))
505 return -EFAULT;
506 buf[cnt] = '\0';
507
508 pred = kzalloc(sizeof(*pred), GFP_KERNEL);
509 if (!pred)
510 return -ENOMEM;
511
512 err = filter_parse(&pbuf, pred);
513 if (err < 0) {
514 filter_free_pred(pred);
515 return err;
516 }
517
518 if (pred->clear) {
519 filter_free_preds(call);
520 filter_free_pred(pred);
521 return cnt;
522 }
523
524 err = filter_add_pred(call, pred);
525 if (err < 0) {
526 filter_free_pred(pred);
527 return err;
528 }
529
530 *ppos += cnt;
531
532 return cnt;
533}
534
535static ssize_t
536subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
537 loff_t *ppos)
538{
539 struct event_subsystem *system = filp->private_data;
540 struct trace_seq *s;
541 int r;
542
543 if (*ppos)
544 return 0;
545
546 s = kmalloc(sizeof(*s), GFP_KERNEL);
547 if (!s)
548 return -ENOMEM;
549
550 trace_seq_init(s);
551
552 filter_print_preds(system->preds, s);
553 r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
554
555 kfree(s);
556
557 return r;
558}
559
560static ssize_t
561subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
562 loff_t *ppos)
563{
564 struct event_subsystem *system = filp->private_data;
565 char buf[64], *pbuf = buf;
566 struct filter_pred *pred;
567 int err;
568
569 if (cnt >= sizeof(buf))
570 return -EINVAL;
571
572 if (copy_from_user(&buf, ubuf, cnt))
573 return -EFAULT;
574 buf[cnt] = '\0';
575
576 pred = kzalloc(sizeof(*pred), GFP_KERNEL);
577 if (!pred)
578 return -ENOMEM;
579
580 err = filter_parse(&pbuf, pred);
581 if (err < 0) {
582 filter_free_pred(pred);
583 return err;
584 }
585
586 if (pred->clear) {
587 filter_free_subsystem_preds(system);
588 filter_free_pred(pred);
589 return cnt;
590 }
591
592 err = filter_add_subsystem_pred(system, pred);
593 if (err < 0) {
594 filter_free_subsystem_preds(system);
595 filter_free_pred(pred);
596 return err;
597 }
598
599 *ppos += cnt;
600
601 return cnt;
602}
603
604static const struct seq_operations show_event_seq_ops = {
605 .start = t_start,
606 .next = t_next,
607 .show = t_show,
608 .stop = t_stop,
609};
610
611static const struct seq_operations show_set_event_seq_ops = {
612 .start = s_start,
613 .next = s_next,
614 .show = t_show,
615 .stop = t_stop,
616};
617
618static const struct file_operations ftrace_avail_fops = {
619 .open = ftrace_event_seq_open,
620 .read = seq_read,
621 .llseek = seq_lseek,
622 .release = seq_release,
623};
624
625static const struct file_operations ftrace_set_event_fops = {
626 .open = ftrace_event_seq_open,
627 .read = seq_read,
628 .write = ftrace_event_write,
629 .llseek = seq_lseek,
630 .release = seq_release,
631};
632
633static const struct file_operations ftrace_enable_fops = {
634 .open = tracing_open_generic,
635 .read = event_enable_read,
636 .write = event_enable_write,
637};
638
639static const struct file_operations ftrace_event_format_fops = {
640 .open = tracing_open_generic,
641 .read = event_format_read,
642};
643
644static const struct file_operations ftrace_event_id_fops = {
645 .open = tracing_open_generic,
646 .read = event_id_read,
647};
648
649static const struct file_operations ftrace_event_filter_fops = {
650 .open = tracing_open_generic,
651 .read = event_filter_read,
652 .write = event_filter_write,
653};
654
655static const struct file_operations ftrace_subsystem_filter_fops = {
656 .open = tracing_open_generic,
657 .read = subsystem_filter_read,
658 .write = subsystem_filter_write,
659};
660
661static struct dentry *event_trace_events_dir(void)
662{
663 static struct dentry *d_tracer;
664 static struct dentry *d_events;
665
666 if (d_events)
667 return d_events;
668
669 d_tracer = tracing_init_dentry();
670 if (!d_tracer)
671 return NULL;
672
673 d_events = debugfs_create_dir("events", d_tracer);
674 if (!d_events)
675 pr_warning("Could not create debugfs "
676 "'events' directory\n");
677
678 return d_events;
679}
680
681static LIST_HEAD(event_subsystems);
682
683static struct dentry *
684event_subsystem_dir(const char *name, struct dentry *d_events)
685{
686 struct event_subsystem *system;
687
688 /* First see if we did not already create this dir */
689 list_for_each_entry(system, &event_subsystems, list) {
690 if (strcmp(system->name, name) == 0)
691 return system->entry;
692 }
693
694 /* need to create new entry */
695 system = kmalloc(sizeof(*system), GFP_KERNEL);
696 if (!system) {
697 pr_warning("No memory to create event subsystem %s\n",
698 name);
699 return d_events;
700 }
701
702 system->entry = debugfs_create_dir(name, d_events);
703 if (!system->entry) {
704 pr_warning("Could not create event subsystem %s\n",
705 name);
706 kfree(system);
707 return d_events;
708 }
709
710 system->name = name;
711 list_add(&system->list, &event_subsystems);
712
713 system->preds = NULL;
714
715 return system->entry;
716}
717
718static int
719event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
720{
721 struct dentry *entry;
722 int ret;
723
724 /*
725 * If the trace point header did not define TRACE_SYSTEM
726 * then the system would be called "TRACE_SYSTEM".
727 */
728 if (strcmp(call->system, "TRACE_SYSTEM") != 0)
729 d_events = event_subsystem_dir(call->system, d_events);
730
731 if (call->raw_init) {
732 ret = call->raw_init();
733 if (ret < 0) {
734 pr_warning("Could not initialize trace point"
735 " events/%s\n", call->name);
736 return ret;
737 }
738 }
739
740 call->dir = debugfs_create_dir(call->name, d_events);
741 if (!call->dir) {
742 pr_warning("Could not create debugfs "
743 "'%s' directory\n", call->name);
744 return -1;
745 }
746
747 if (call->regfunc) {
748 entry = debugfs_create_file("enable", 0644, call->dir, call,
749 &ftrace_enable_fops);
750 if (!entry)
751 pr_warning("Could not create debugfs "
752 "'%s/enable' entry\n", call->name);
753 }
754
755 if (call->id) {
756 entry = debugfs_create_file("id", 0444, call->dir, call,
757 &ftrace_event_id_fops);
758 if (!entry)
759 pr_warning("Could not create debugfs '%s/id' entry\n",
760 call->name);
761 }
762
763 if (call->define_fields) {
764 ret = call->define_fields();
765 if (ret < 0) {
766 pr_warning("Could not initialize trace point"
767 " events/%s\n", call->name);
768 return ret;
769 }
770 entry = debugfs_create_file("filter", 0644, call->dir, call,
771 &ftrace_event_filter_fops);
772 if (!entry)
773 pr_warning("Could not create debugfs "
774 "'%s/filter' entry\n", call->name);
775 }
776
777 /* A trace may not want to export its format */
778 if (!call->show_format)
779 return 0;
780
781 entry = debugfs_create_file("format", 0444, call->dir, call,
782 &ftrace_event_format_fops);
783 if (!entry)
784 pr_warning("Could not create debugfs "
785 "'%s/format' entry\n", call->name);
786
787 return 0;
788}
789
790static __init int event_trace_init(void)
791{
792 struct ftrace_event_call *call = __start_ftrace_events;
793 struct dentry *d_tracer;
794 struct dentry *entry;
795 struct dentry *d_events;
796
797 d_tracer = tracing_init_dentry();
798 if (!d_tracer)
799 return 0;
800
801 entry = debugfs_create_file("available_events", 0444, d_tracer,
802 (void *)&show_event_seq_ops,
803 &ftrace_avail_fops);
804 if (!entry)
805 pr_warning("Could not create debugfs "
806 "'available_events' entry\n");
807
808 entry = debugfs_create_file("set_event", 0644, d_tracer,
809 (void *)&show_set_event_seq_ops,
810 &ftrace_set_event_fops);
811 if (!entry)
812 pr_warning("Could not create debugfs "
813 "'set_event' entry\n");
814
815 d_events = event_trace_events_dir();
816 if (!d_events)
817 return 0;
818
819 for_each_event(call) {
820 /* The linker may leave blanks */
821 if (!call->name)
822 continue;
823 event_create_dir(call, d_events);
824 }
825
826 return 0;
827}
828fs_initcall(event_trace_init);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
new file mode 100644
index 000000000000..e03cbf1e38f3
--- /dev/null
+++ b/kernel/trace/trace_events_filter.c
@@ -0,0 +1,433 @@
1/*
2 * trace_events_filter - generic event filtering
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com>
19 */
20
21#include <linux/debugfs.h>
22#include <linux/uaccess.h>
23#include <linux/module.h>
24#include <linux/ctype.h>
25
26#include "trace.h"
27#include "trace_output.h"
28
29static int filter_pred_64(struct filter_pred *pred, void *event)
30{
31 u64 *addr = (u64 *)(event + pred->offset);
32 u64 val = (u64)pred->val;
33 int match;
34
35 match = (val == *addr) ^ pred->not;
36
37 return match;
38}
39
40static int filter_pred_32(struct filter_pred *pred, void *event)
41{
42 u32 *addr = (u32 *)(event + pred->offset);
43 u32 val = (u32)pred->val;
44 int match;
45
46 match = (val == *addr) ^ pred->not;
47
48 return match;
49}
50
51static int filter_pred_16(struct filter_pred *pred, void *event)
52{
53 u16 *addr = (u16 *)(event + pred->offset);
54 u16 val = (u16)pred->val;
55 int match;
56
57 match = (val == *addr) ^ pred->not;
58
59 return match;
60}
61
62static int filter_pred_8(struct filter_pred *pred, void *event)
63{
64 u8 *addr = (u8 *)(event + pred->offset);
65 u8 val = (u8)pred->val;
66 int match;
67
68 match = (val == *addr) ^ pred->not;
69
70 return match;
71}
72
73static int filter_pred_string(struct filter_pred *pred, void *event)
74{
75 char *addr = (char *)(event + pred->offset);
76 int cmp, match;
77
78 cmp = strncmp(addr, pred->str_val, pred->str_len);
79
80 match = (!cmp) ^ pred->not;
81
82 return match;
83}
84
85/* return 1 if event matches, 0 otherwise (discard) */
86int filter_match_preds(struct ftrace_event_call *call, void *rec)
87{
88 int i, matched, and_failed = 0;
89 struct filter_pred *pred;
90
91 for (i = 0; i < MAX_FILTER_PRED; i++) {
92 if (call->preds[i]) {
93 pred = call->preds[i];
94 if (and_failed && !pred->or)
95 continue;
96 matched = pred->fn(pred, rec);
97 if (!matched && !pred->or) {
98 and_failed = 1;
99 continue;
100 } else if (matched && pred->or)
101 return 1;
102 } else
103 break;
104 }
105
106 if (and_failed)
107 return 0;
108
109 return 1;
110}
111
112void filter_print_preds(struct filter_pred **preds, struct trace_seq *s)
113{
114 char *field_name;
115 struct filter_pred *pred;
116 int i;
117
118 if (!preds) {
119 trace_seq_printf(s, "none\n");
120 return;
121 }
122
123 for (i = 0; i < MAX_FILTER_PRED; i++) {
124 if (preds[i]) {
125 pred = preds[i];
126 field_name = pred->field_name;
127 if (i)
128 trace_seq_printf(s, pred->or ? "|| " : "&& ");
129 trace_seq_printf(s, "%s ", field_name);
130 trace_seq_printf(s, pred->not ? "!= " : "== ");
131 if (pred->str_val)
132 trace_seq_printf(s, "%s\n", pred->str_val);
133 else
134 trace_seq_printf(s, "%llu\n", pred->val);
135 } else
136 break;
137 }
138}
139
140static struct ftrace_event_field *
141find_event_field(struct ftrace_event_call *call, char *name)
142{
143 struct ftrace_event_field *field;
144
145 list_for_each_entry(field, &call->fields, link) {
146 if (!strcmp(field->name, name))
147 return field;
148 }
149
150 return NULL;
151}
152
153void filter_free_pred(struct filter_pred *pred)
154{
155 if (!pred)
156 return;
157
158 kfree(pred->field_name);
159 kfree(pred->str_val);
160 kfree(pred);
161}
162
163void filter_free_preds(struct ftrace_event_call *call)
164{
165 int i;
166
167 if (call->preds) {
168 for (i = 0; i < MAX_FILTER_PRED; i++)
169 filter_free_pred(call->preds[i]);
170 kfree(call->preds);
171 call->preds = NULL;
172 }
173}
174
175void filter_free_subsystem_preds(struct event_subsystem *system)
176{
177 struct ftrace_event_call *call = __start_ftrace_events;
178 int i;
179
180 if (system->preds) {
181 for (i = 0; i < MAX_FILTER_PRED; i++)
182 filter_free_pred(system->preds[i]);
183 kfree(system->preds);
184 system->preds = NULL;
185 }
186
187 events_for_each(call) {
188 if (!call->name || !call->regfunc)
189 continue;
190
191 if (!strcmp(call->system, system->name))
192 filter_free_preds(call);
193 }
194}
195
196static int __filter_add_pred(struct ftrace_event_call *call,
197 struct filter_pred *pred)
198{
199 int i;
200
201 if (call->preds && !pred->compound)
202 filter_free_preds(call);
203
204 if (!call->preds) {
205 call->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
206 GFP_KERNEL);
207 if (!call->preds)
208 return -ENOMEM;
209 }
210
211 for (i = 0; i < MAX_FILTER_PRED; i++) {
212 if (!call->preds[i]) {
213 call->preds[i] = pred;
214 return 0;
215 }
216 }
217
218 return -ENOSPC;
219}
220
221static int is_string_field(const char *type)
222{
223 if (strchr(type, '[') && strstr(type, "char"))
224 return 1;
225
226 return 0;
227}
228
229int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred)
230{
231 struct ftrace_event_field *field;
232
233 field = find_event_field(call, pred->field_name);
234 if (!field)
235 return -EINVAL;
236
237 pred->offset = field->offset;
238
239 if (is_string_field(field->type)) {
240 if (!pred->str_val)
241 return -EINVAL;
242 pred->fn = filter_pred_string;
243 pred->str_len = field->size;
244 return __filter_add_pred(call, pred);
245 } else {
246 if (pred->str_val)
247 return -EINVAL;
248 }
249
250 switch (field->size) {
251 case 8:
252 pred->fn = filter_pred_64;
253 break;
254 case 4:
255 pred->fn = filter_pred_32;
256 break;
257 case 2:
258 pred->fn = filter_pred_16;
259 break;
260 case 1:
261 pred->fn = filter_pred_8;
262 break;
263 default:
264 return -EINVAL;
265 }
266
267 return __filter_add_pred(call, pred);
268}
269
270static struct filter_pred *copy_pred(struct filter_pred *pred)
271{
272 struct filter_pred *new_pred = kmalloc(sizeof(*pred), GFP_KERNEL);
273 if (!new_pred)
274 return NULL;
275
276 memcpy(new_pred, pred, sizeof(*pred));
277
278 if (pred->field_name) {
279 new_pred->field_name = kstrdup(pred->field_name, GFP_KERNEL);
280 if (!new_pred->field_name) {
281 kfree(new_pred);
282 return NULL;
283 }
284 }
285
286 if (pred->str_val) {
287 new_pred->str_val = kstrdup(pred->str_val, GFP_KERNEL);
288 if (!new_pred->str_val) {
289 filter_free_pred(new_pred);
290 return NULL;
291 }
292 }
293
294 return new_pred;
295}
296
297int filter_add_subsystem_pred(struct event_subsystem *system,
298 struct filter_pred *pred)
299{
300 struct ftrace_event_call *call = __start_ftrace_events;
301 struct filter_pred *event_pred;
302 int i;
303
304 if (system->preds && !pred->compound)
305 filter_free_subsystem_preds(system);
306
307 if (!system->preds) {
308 system->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
309 GFP_KERNEL);
310 if (!system->preds)
311 return -ENOMEM;
312 }
313
314 for (i = 0; i < MAX_FILTER_PRED; i++) {
315 if (!system->preds[i]) {
316 system->preds[i] = pred;
317 break;
318 }
319 }
320
321 if (i == MAX_FILTER_PRED)
322 return -ENOSPC;
323
324 events_for_each(call) {
325 int err;
326
327 if (!call->name || !call->regfunc)
328 continue;
329
330 if (strcmp(call->system, system->name))
331 continue;
332
333 if (!find_event_field(call, pred->field_name))
334 continue;
335
336 event_pred = copy_pred(pred);
337 if (!event_pred)
338 goto oom;
339
340 err = filter_add_pred(call, event_pred);
341 if (err)
342 filter_free_pred(event_pred);
343 if (err == -ENOMEM)
344 goto oom;
345 }
346
347 return 0;
348
349oom:
350 system->preds[i] = NULL;
351 return -ENOMEM;
352}
353
354int filter_parse(char **pbuf, struct filter_pred *pred)
355{
356 char *tmp, *tok, *val_str = NULL;
357 int tok_n = 0;
358
359 /* field ==/!= number, or/and field ==/!= number, number */
360 while ((tok = strsep(pbuf, " \n"))) {
361 if (tok_n == 0) {
362 if (!strcmp(tok, "0")) {
363 pred->clear = 1;
364 return 0;
365 } else if (!strcmp(tok, "&&")) {
366 pred->or = 0;
367 pred->compound = 1;
368 } else if (!strcmp(tok, "||")) {
369 pred->or = 1;
370 pred->compound = 1;
371 } else
372 pred->field_name = tok;
373 tok_n = 1;
374 continue;
375 }
376 if (tok_n == 1) {
377 if (!pred->field_name)
378 pred->field_name = tok;
379 else if (!strcmp(tok, "!="))
380 pred->not = 1;
381 else if (!strcmp(tok, "=="))
382 pred->not = 0;
383 else {
384 pred->field_name = NULL;
385 return -EINVAL;
386 }
387 tok_n = 2;
388 continue;
389 }
390 if (tok_n == 2) {
391 if (pred->compound) {
392 if (!strcmp(tok, "!="))
393 pred->not = 1;
394 else if (!strcmp(tok, "=="))
395 pred->not = 0;
396 else {
397 pred->field_name = NULL;
398 return -EINVAL;
399 }
400 } else {
401 val_str = tok;
402 break; /* done */
403 }
404 tok_n = 3;
405 continue;
406 }
407 if (tok_n == 3) {
408 val_str = tok;
409 break; /* done */
410 }
411 }
412
413 if (!val_str) {
414 pred->field_name = NULL;
415 return -EINVAL;
416 }
417
418 pred->field_name = kstrdup(pred->field_name, GFP_KERNEL);
419 if (!pred->field_name)
420 return -ENOMEM;
421
422 pred->val = simple_strtoull(val_str, &tmp, 0);
423 if (tmp == val_str) {
424 pred->str_val = kstrdup(val_str, GFP_KERNEL);
425 if (!pred->str_val)
426 return -ENOMEM;
427 } else if (*tmp != '\0')
428 return -EINVAL;
429
430 return 0;
431}
432
433
diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h
new file mode 100644
index 000000000000..38985f9b379c
--- /dev/null
+++ b/kernel/trace/trace_events_stage_1.h
@@ -0,0 +1,39 @@
1/*
2 * Stage 1 of the trace events.
3 *
4 * Override the macros in <trace/trace_event_types.h> to include the following:
5 *
6 * struct ftrace_raw_<call> {
7 * struct trace_entry ent;
8 * <type> <item>;
9 * <type2> <item2>[<len>];
10 * [...]
11 * };
12 *
13 * The <type> <item> is created by the __field(type, item) macro or
14 * the __array(type2, item2, len) macro.
15 * We simply do "type item;", and that will create the fields
16 * in the structure.
17 */
18
19#undef TRACE_FORMAT
20#define TRACE_FORMAT(call, proto, args, fmt)
21
22#undef __array
23#define __array(type, item, len) type item[len];
24
25#undef __field
26#define __field(type, item) type item;
27
28#undef TP_STRUCT__entry
29#define TP_STRUCT__entry(args...) args
30
31#undef TRACE_EVENT
32#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \
33 struct ftrace_raw_##name { \
34 struct trace_entry ent; \
35 tstruct \
36 }; \
37 static struct ftrace_event_call event_##name
38
39#include <trace/trace_event_types.h>
diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
new file mode 100644
index 000000000000..d363c6672c6c
--- /dev/null
+++ b/kernel/trace/trace_events_stage_2.h
@@ -0,0 +1,176 @@
1/*
2 * Stage 2 of the trace events.
3 *
4 * Override the macros in <trace/trace_event_types.h> to include the following:
5 *
6 * enum print_line_t
7 * ftrace_raw_output_<call>(struct trace_iterator *iter, int flags)
8 * {
9 * struct trace_seq *s = &iter->seq;
10 * struct ftrace_raw_<call> *field; <-- defined in stage 1
11 * struct trace_entry *entry;
12 * int ret;
13 *
14 * entry = iter->ent;
15 *
16 * if (entry->type != event_<call>.id) {
17 * WARN_ON_ONCE(1);
18 * return TRACE_TYPE_UNHANDLED;
19 * }
20 *
21 * field = (typeof(field))entry;
22 *
23 * ret = trace_seq_printf(s, <TP_printk> "\n");
24 * if (!ret)
25 * return TRACE_TYPE_PARTIAL_LINE;
26 *
27 * return TRACE_TYPE_HANDLED;
28 * }
29 *
30 * This is the method used to print the raw event to the trace
31 * output format. Note, this is not needed if the data is read
32 * in binary.
33 */
34
35#undef __entry
36#define __entry field
37
38#undef TP_printk
39#define TP_printk(fmt, args...) fmt "\n", args
40
41#undef TRACE_EVENT
42#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \
43enum print_line_t \
44ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \
45{ \
46 struct trace_seq *s = &iter->seq; \
47 struct ftrace_raw_##call *field; \
48 struct trace_entry *entry; \
49 int ret; \
50 \
51 entry = iter->ent; \
52 \
53 if (entry->type != event_##call.id) { \
54 WARN_ON_ONCE(1); \
55 return TRACE_TYPE_UNHANDLED; \
56 } \
57 \
58 field = (typeof(field))entry; \
59 \
60 ret = trace_seq_printf(s, #call ": " print); \
61 if (!ret) \
62 return TRACE_TYPE_PARTIAL_LINE; \
63 \
64 return TRACE_TYPE_HANDLED; \
65}
66
67#include <trace/trace_event_types.h>
68
69/*
70 * Setup the showing format of trace point.
71 *
72 * int
73 * ftrace_format_##call(struct trace_seq *s)
74 * {
75 * struct ftrace_raw_##call field;
76 * int ret;
77 *
78 * ret = trace_seq_printf(s, #type " " #item ";"
79 * " offset:%u; size:%u;\n",
80 * offsetof(struct ftrace_raw_##call, item),
81 * sizeof(field.type));
82 *
83 * }
84 */
85
86#undef TP_STRUCT__entry
87#define TP_STRUCT__entry(args...) args
88
89#undef __field
90#define __field(type, item) \
91 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
92 "offset:%u;\tsize:%u;\n", \
93 (unsigned int)offsetof(typeof(field), item), \
94 (unsigned int)sizeof(field.item)); \
95 if (!ret) \
96 return 0;
97
98#undef __array
99#define __array(type, item, len) \
100 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
101 "offset:%u;\tsize:%u;\n", \
102 (unsigned int)offsetof(typeof(field), item), \
103 (unsigned int)sizeof(field.item)); \
104 if (!ret) \
105 return 0;
106
107#undef __entry
108#define __entry REC
109
110#undef TP_printk
111#define TP_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
112
113#undef TP_fast_assign
114#define TP_fast_assign(args...) args
115
116#undef TRACE_EVENT
117#define TRACE_EVENT(call, proto, args, tstruct, func, print) \
118static int \
119ftrace_format_##call(struct trace_seq *s) \
120{ \
121 struct ftrace_raw_##call field; \
122 int ret; \
123 \
124 tstruct; \
125 \
126 trace_seq_printf(s, "\nprint fmt: " print); \
127 \
128 return ret; \
129}
130
131#include <trace/trace_event_types.h>
132
133#undef __field
134#define __field(type, item) \
135 ret = trace_define_field(event_call, #type, #item, \
136 offsetof(typeof(field), item), \
137 sizeof(field.item)); \
138 if (ret) \
139 return ret;
140
141#undef __array
142#define __array(type, item, len) \
143 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
144 offsetof(typeof(field), item), \
145 sizeof(field.item)); \
146 if (ret) \
147 return ret;
148
149#define __common_field(type, item) \
150 ret = trace_define_field(event_call, #type, "common_" #item, \
151 offsetof(typeof(field.ent), item), \
152 sizeof(field.ent.item)); \
153 if (ret) \
154 return ret;
155
156#undef TRACE_EVENT
157#define TRACE_EVENT(call, proto, args, tstruct, func, print) \
158int \
159ftrace_define_fields_##call(void) \
160{ \
161 struct ftrace_raw_##call field; \
162 struct ftrace_event_call *event_call = &event_##call; \
163 int ret; \
164 \
165 __common_field(unsigned char, type); \
166 __common_field(unsigned char, flags); \
167 __common_field(unsigned char, preempt_count); \
168 __common_field(int, pid); \
169 __common_field(int, tgid); \
170 \
171 tstruct; \
172 \
173 return ret; \
174}
175
176#include <trace/trace_event_types.h>
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
new file mode 100644
index 000000000000..9d2fa78cecca
--- /dev/null
+++ b/kernel/trace/trace_events_stage_3.h
@@ -0,0 +1,281 @@
1/*
2 * Stage 3 of the trace events.
3 *
4 * Override the macros in <trace/trace_event_types.h> to include the following:
5 *
6 * static void ftrace_event_<call>(proto)
7 * {
8 * event_trace_printk(_RET_IP_, "<call>: " <fmt>);
9 * }
10 *
11 * static int ftrace_reg_event_<call>(void)
12 * {
13 * int ret;
14 *
15 * ret = register_trace_<call>(ftrace_event_<call>);
16 * if (!ret)
17 * pr_info("event trace: Could not activate trace point "
18 * "probe to <call>");
19 * return ret;
20 * }
21 *
22 * static void ftrace_unreg_event_<call>(void)
23 * {
24 * unregister_trace_<call>(ftrace_event_<call>);
25 * }
26 *
27 * For those macros defined with TRACE_FORMAT:
28 *
29 * static struct ftrace_event_call __used
30 * __attribute__((__aligned__(4)))
31 * __attribute__((section("_ftrace_events"))) event_<call> = {
32 * .name = "<call>",
33 * .regfunc = ftrace_reg_event_<call>,
34 * .unregfunc = ftrace_unreg_event_<call>,
35 * }
36 *
37 *
38 * For those macros defined with TRACE_EVENT:
39 *
40 * static struct ftrace_event_call event_<call>;
41 *
42 * static void ftrace_raw_event_<call>(proto)
43 * {
44 * struct ring_buffer_event *event;
45 * struct ftrace_raw_<call> *entry; <-- defined in stage 1
46 * unsigned long irq_flags;
47 * int pc;
48 *
49 * local_save_flags(irq_flags);
50 * pc = preempt_count();
51 *
52 * event = trace_current_buffer_lock_reserve(event_<call>.id,
53 * sizeof(struct ftrace_raw_<call>),
54 * irq_flags, pc);
55 * if (!event)
56 * return;
57 * entry = ring_buffer_event_data(event);
58 *
59 * <assign>; <-- Here we assign the entries by the __field and
60 * __array macros.
61 *
62 * trace_current_buffer_unlock_commit(event, irq_flags, pc);
63 * }
64 *
65 * static int ftrace_raw_reg_event_<call>(void)
66 * {
67 * int ret;
68 *
69 * ret = register_trace_<call>(ftrace_raw_event_<call>);
70 * if (!ret)
71 * pr_info("event trace: Could not activate trace point "
72 * "probe to <call>");
73 * return ret;
74 * }
75 *
76 * static void ftrace_unreg_event_<call>(void)
77 * {
78 * unregister_trace_<call>(ftrace_raw_event_<call>);
79 * }
80 *
81 * static struct trace_event ftrace_event_type_<call> = {
82 * .trace = ftrace_raw_output_<call>, <-- stage 2
83 * };
84 *
85 * static int ftrace_raw_init_event_<call>(void)
86 * {
87 * int id;
88 *
89 * id = register_ftrace_event(&ftrace_event_type_<call>);
90 * if (!id)
91 * return -ENODEV;
92 * event_<call>.id = id;
93 * return 0;
94 * }
95 *
96 * static struct ftrace_event_call __used
97 * __attribute__((__aligned__(4)))
98 * __attribute__((section("_ftrace_events"))) event_<call> = {
99 * .name = "<call>",
100 * .system = "<system>",
101 * .raw_init = ftrace_raw_init_event_<call>,
102 * .regfunc = ftrace_reg_event_<call>,
103 * .unregfunc = ftrace_unreg_event_<call>,
104 * .show_format = ftrace_format_<call>,
105 * }
106 *
107 */
108
109#undef TP_FMT
110#define TP_FMT(fmt, args...) fmt "\n", ##args
111
112#ifdef CONFIG_EVENT_PROFILE
113#define _TRACE_PROFILE(call, proto, args) \
114static void ftrace_profile_##call(proto) \
115{ \
116 extern void perf_tpcounter_event(int); \
117 perf_tpcounter_event(event_##call.id); \
118} \
119 \
120static int ftrace_profile_enable_##call(struct ftrace_event_call *call) \
121{ \
122 int ret = 0; \
123 \
124 if (!atomic_inc_return(&call->profile_count)) \
125 ret = register_trace_##call(ftrace_profile_##call); \
126 \
127 return ret; \
128} \
129 \
130static void ftrace_profile_disable_##call(struct ftrace_event_call *call) \
131{ \
132 if (atomic_add_negative(-1, &call->profile_count)) \
133 unregister_trace_##call(ftrace_profile_##call); \
134}
135
136#define _TRACE_PROFILE_INIT(call) \
137 .profile_count = ATOMIC_INIT(-1), \
138 .profile_enable = ftrace_profile_enable_##call, \
139 .profile_disable = ftrace_profile_disable_##call,
140
141#else
142#define _TRACE_PROFILE(call, proto, args)
143#define _TRACE_PROFILE_INIT(call)
144#endif
145
146#define _TRACE_FORMAT(call, proto, args, fmt) \
147static void ftrace_event_##call(proto) \
148{ \
149 event_trace_printk(_RET_IP_, #call ": " fmt); \
150} \
151 \
152static int ftrace_reg_event_##call(void) \
153{ \
154 int ret; \
155 \
156 ret = register_trace_##call(ftrace_event_##call); \
157 if (ret) \
158 pr_info("event trace: Could not activate trace point " \
159 "probe to " #call "\n"); \
160 return ret; \
161} \
162 \
163static void ftrace_unreg_event_##call(void) \
164{ \
165 unregister_trace_##call(ftrace_event_##call); \
166} \
167 \
168static struct ftrace_event_call event_##call; \
169 \
170static int ftrace_init_event_##call(void) \
171{ \
172 int id; \
173 \
174 id = register_ftrace_event(NULL); \
175 if (!id) \
176 return -ENODEV; \
177 event_##call.id = id; \
178 return 0; \
179}
180
181#undef TRACE_FORMAT
182#define TRACE_FORMAT(call, proto, args, fmt) \
183_TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt)) \
184_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \
185static struct ftrace_event_call __used \
186__attribute__((__aligned__(4))) \
187__attribute__((section("_ftrace_events"))) event_##call = { \
188 .name = #call, \
189 .system = __stringify(TRACE_SYSTEM), \
190 .raw_init = ftrace_init_event_##call, \
191 .regfunc = ftrace_reg_event_##call, \
192 .unregfunc = ftrace_unreg_event_##call, \
193 _TRACE_PROFILE_INIT(call) \
194}
195
196#undef __entry
197#define __entry entry
198
199#undef TRACE_EVENT
200#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \
201_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \
202 \
203static struct ftrace_event_call event_##call; \
204 \
205static void ftrace_raw_event_##call(proto) \
206{ \
207 struct ftrace_event_call *call = &event_##call; \
208 struct ring_buffer_event *event; \
209 struct ftrace_raw_##call *entry; \
210 unsigned long irq_flags; \
211 int pc; \
212 \
213 local_save_flags(irq_flags); \
214 pc = preempt_count(); \
215 \
216 event = trace_current_buffer_lock_reserve(event_##call.id, \
217 sizeof(struct ftrace_raw_##call), \
218 irq_flags, pc); \
219 if (!event) \
220 return; \
221 entry = ring_buffer_event_data(event); \
222 \
223 assign; \
224 \
225 if (call->preds && !filter_match_preds(call, entry)) \
226 ring_buffer_event_discard(event); \
227 \
228 trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \
229 \
230} \
231 \
232static int ftrace_raw_reg_event_##call(void) \
233{ \
234 int ret; \
235 \
236 ret = register_trace_##call(ftrace_raw_event_##call); \
237 if (ret) \
238 pr_info("event trace: Could not activate trace point " \
239 "probe to " #call "\n"); \
240 return ret; \
241} \
242 \
243static void ftrace_raw_unreg_event_##call(void) \
244{ \
245 unregister_trace_##call(ftrace_raw_event_##call); \
246} \
247 \
248static struct trace_event ftrace_event_type_##call = { \
249 .trace = ftrace_raw_output_##call, \
250}; \
251 \
252static int ftrace_raw_init_event_##call(void) \
253{ \
254 int id; \
255 \
256 id = register_ftrace_event(&ftrace_event_type_##call); \
257 if (!id) \
258 return -ENODEV; \
259 event_##call.id = id; \
260 INIT_LIST_HEAD(&event_##call.fields); \
261 return 0; \
262} \
263 \
264static struct ftrace_event_call __used \
265__attribute__((__aligned__(4))) \
266__attribute__((section("_ftrace_events"))) event_##call = { \
267 .name = #call, \
268 .system = __stringify(TRACE_SYSTEM), \
269 .raw_init = ftrace_raw_init_event_##call, \
270 .regfunc = ftrace_raw_reg_event_##call, \
271 .unregfunc = ftrace_raw_unreg_event_##call, \
272 .show_format = ftrace_format_##call, \
273 .define_fields = ftrace_define_fields_##call, \
274 _TRACE_PROFILE_INIT(call) \
275}
276
277#include <trace/trace_event_types.h>
278
279#undef _TRACE_PROFILE
280#undef _TRACE_PROFILE_INIT
281
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
new file mode 100644
index 000000000000..07a22c33ebf3
--- /dev/null
+++ b/kernel/trace/trace_export.c
@@ -0,0 +1,102 @@
1/*
2 * trace_export.c - export basic ftrace utilities to user space
3 *
4 * Copyright (C) 2009 Steven Rostedt <srostedt@redhat.com>
5 */
6#include <linux/stringify.h>
7#include <linux/kallsyms.h>
8#include <linux/seq_file.h>
9#include <linux/debugfs.h>
10#include <linux/uaccess.h>
11#include <linux/ftrace.h>
12#include <linux/module.h>
13#include <linux/init.h>
14#include <linux/fs.h>
15
16#include "trace_output.h"
17
18
19#undef TRACE_STRUCT
20#define TRACE_STRUCT(args...) args
21
22#undef TRACE_FIELD
23#define TRACE_FIELD(type, item, assign) \
24 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
25 "offset:%u;\tsize:%u;\n", \
26 (unsigned int)offsetof(typeof(field), item), \
27 (unsigned int)sizeof(field.item)); \
28 if (!ret) \
29 return 0;
30
31
32#undef TRACE_FIELD_SPECIAL
33#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \
34 ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \
35 "offset:%u;\tsize:%u;\n", \
36 (unsigned int)offsetof(typeof(field), item), \
37 (unsigned int)sizeof(field.item)); \
38 if (!ret) \
39 return 0;
40
41#undef TRACE_FIELD_ZERO_CHAR
42#define TRACE_FIELD_ZERO_CHAR(item) \
43 ret = trace_seq_printf(s, "\tfield:char " #item ";\t" \
44 "offset:%u;\tsize:0;\n", \
45 (unsigned int)offsetof(typeof(field), item)); \
46 if (!ret) \
47 return 0;
48
49
50#undef TP_RAW_FMT
51#define TP_RAW_FMT(args...) args
52
53#undef TRACE_EVENT_FORMAT
54#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
55static int \
56ftrace_format_##call(struct trace_seq *s) \
57{ \
58 struct args field; \
59 int ret; \
60 \
61 tstruct; \
62 \
63 trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \
64 \
65 return ret; \
66}
67
68#include "trace_event_types.h"
69
70#undef TRACE_ZERO_CHAR
71#define TRACE_ZERO_CHAR(arg)
72
73#undef TRACE_FIELD
74#define TRACE_FIELD(type, item, assign)\
75 entry->item = assign;
76
77#undef TRACE_FIELD
78#define TRACE_FIELD(type, item, assign)\
79 entry->item = assign;
80
81#undef TP_CMD
82#define TP_CMD(cmd...) cmd
83
84#undef TRACE_ENTRY
85#define TRACE_ENTRY entry
86
87#undef TRACE_FIELD_SPECIAL
88#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \
89 cmd;
90
91#undef TRACE_EVENT_FORMAT
92#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
93 \
94static struct ftrace_event_call __used \
95__attribute__((__aligned__(4))) \
96__attribute__((section("_ftrace_events"))) event_##call = { \
97 .name = #call, \
98 .id = proto, \
99 .system = __stringify(TRACE_SYSTEM), \
100 .show_format = ftrace_format_##call, \
101}
102#include "trace_event_types.h"
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 9236d7e25a16..c9a0b7df44ff 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -9,6 +9,7 @@
9 * Copyright (C) 2004-2006 Ingo Molnar 9 * Copyright (C) 2004-2006 Ingo Molnar
10 * Copyright (C) 2004 William Lee Irwin III 10 * Copyright (C) 2004 William Lee Irwin III
11 */ 11 */
12#include <linux/ring_buffer.h>
12#include <linux/debugfs.h> 13#include <linux/debugfs.h>
13#include <linux/uaccess.h> 14#include <linux/uaccess.h>
14#include <linux/ftrace.h> 15#include <linux/ftrace.h>
@@ -16,52 +17,388 @@
16 17
17#include "trace.h" 18#include "trace.h"
18 19
19static void start_function_trace(struct trace_array *tr) 20/* function tracing enabled */
21static int ftrace_function_enabled;
22
23static struct trace_array *func_trace;
24
25static void tracing_start_function_trace(void);
26static void tracing_stop_function_trace(void);
27
28static int function_trace_init(struct trace_array *tr)
20{ 29{
30 func_trace = tr;
21 tr->cpu = get_cpu(); 31 tr->cpu = get_cpu();
22 tracing_reset_online_cpus(tr);
23 put_cpu(); 32 put_cpu();
24 33
25 tracing_start_cmdline_record(); 34 tracing_start_cmdline_record();
26 tracing_start_function_trace(); 35 tracing_start_function_trace();
36 return 0;
27} 37}
28 38
29static void stop_function_trace(struct trace_array *tr) 39static void function_trace_reset(struct trace_array *tr)
30{ 40{
31 tracing_stop_function_trace(); 41 tracing_stop_function_trace();
32 tracing_stop_cmdline_record(); 42 tracing_stop_cmdline_record();
33} 43}
34 44
35static int function_trace_init(struct trace_array *tr) 45static void function_trace_start(struct trace_array *tr)
36{ 46{
37 start_function_trace(tr); 47 tracing_reset_online_cpus(tr);
38 return 0;
39} 48}
40 49
41static void function_trace_reset(struct trace_array *tr) 50static void
51function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
52{
53 struct trace_array *tr = func_trace;
54 struct trace_array_cpu *data;
55 unsigned long flags;
56 long disabled;
57 int cpu, resched;
58 int pc;
59
60 if (unlikely(!ftrace_function_enabled))
61 return;
62
63 pc = preempt_count();
64 resched = ftrace_preempt_disable();
65 local_save_flags(flags);
66 cpu = raw_smp_processor_id();
67 data = tr->data[cpu];
68 disabled = atomic_inc_return(&data->disabled);
69
70 if (likely(disabled == 1))
71 trace_function(tr, ip, parent_ip, flags, pc);
72
73 atomic_dec(&data->disabled);
74 ftrace_preempt_enable(resched);
75}
76
77static void
78function_trace_call(unsigned long ip, unsigned long parent_ip)
42{ 79{
43 stop_function_trace(tr); 80 struct trace_array *tr = func_trace;
81 struct trace_array_cpu *data;
82 unsigned long flags;
83 long disabled;
84 int cpu;
85 int pc;
86
87 if (unlikely(!ftrace_function_enabled))
88 return;
89
90 /*
91 * Need to use raw, since this must be called before the
92 * recursive protection is performed.
93 */
94 local_irq_save(flags);
95 cpu = raw_smp_processor_id();
96 data = tr->data[cpu];
97 disabled = atomic_inc_return(&data->disabled);
98
99 if (likely(disabled == 1)) {
100 pc = preempt_count();
101 trace_function(tr, ip, parent_ip, flags, pc);
102 }
103
104 atomic_dec(&data->disabled);
105 local_irq_restore(flags);
44} 106}
45 107
46static void function_trace_start(struct trace_array *tr) 108static void
109function_stack_trace_call(unsigned long ip, unsigned long parent_ip)
47{ 110{
48 tracing_reset_online_cpus(tr); 111 struct trace_array *tr = func_trace;
112 struct trace_array_cpu *data;
113 unsigned long flags;
114 long disabled;
115 int cpu;
116 int pc;
117
118 if (unlikely(!ftrace_function_enabled))
119 return;
120
121 /*
122 * Need to use raw, since this must be called before the
123 * recursive protection is performed.
124 */
125 local_irq_save(flags);
126 cpu = raw_smp_processor_id();
127 data = tr->data[cpu];
128 disabled = atomic_inc_return(&data->disabled);
129
130 if (likely(disabled == 1)) {
131 pc = preempt_count();
132 trace_function(tr, ip, parent_ip, flags, pc);
133 /*
134 * skip over 5 funcs:
135 * __ftrace_trace_stack,
136 * __trace_stack,
137 * function_stack_trace_call
138 * ftrace_list_func
139 * ftrace_call
140 */
141 __trace_stack(tr, flags, 5, pc);
142 }
143
144 atomic_dec(&data->disabled);
145 local_irq_restore(flags);
146}
147
148
149static struct ftrace_ops trace_ops __read_mostly =
150{
151 .func = function_trace_call,
152};
153
154static struct ftrace_ops trace_stack_ops __read_mostly =
155{
156 .func = function_stack_trace_call,
157};
158
159/* Our two options */
160enum {
161 TRACE_FUNC_OPT_STACK = 0x1,
162};
163
164static struct tracer_opt func_opts[] = {
165#ifdef CONFIG_STACKTRACE
166 { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) },
167#endif
168 { } /* Always set a last empty entry */
169};
170
171static struct tracer_flags func_flags = {
172 .val = 0, /* By default: all flags disabled */
173 .opts = func_opts
174};
175
176static void tracing_start_function_trace(void)
177{
178 ftrace_function_enabled = 0;
179
180 if (trace_flags & TRACE_ITER_PREEMPTONLY)
181 trace_ops.func = function_trace_call_preempt_only;
182 else
183 trace_ops.func = function_trace_call;
184
185 if (func_flags.val & TRACE_FUNC_OPT_STACK)
186 register_ftrace_function(&trace_stack_ops);
187 else
188 register_ftrace_function(&trace_ops);
189
190 ftrace_function_enabled = 1;
191}
192
193static void tracing_stop_function_trace(void)
194{
195 ftrace_function_enabled = 0;
196 /* OK if they are not registered */
197 unregister_ftrace_function(&trace_stack_ops);
198 unregister_ftrace_function(&trace_ops);
199}
200
201static int func_set_flag(u32 old_flags, u32 bit, int set)
202{
203 if (bit == TRACE_FUNC_OPT_STACK) {
204 /* do nothing if already set */
205 if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK))
206 return 0;
207
208 if (set) {
209 unregister_ftrace_function(&trace_ops);
210 register_ftrace_function(&trace_stack_ops);
211 } else {
212 unregister_ftrace_function(&trace_stack_ops);
213 register_ftrace_function(&trace_ops);
214 }
215
216 return 0;
217 }
218
219 return -EINVAL;
49} 220}
50 221
51static struct tracer function_trace __read_mostly = 222static struct tracer function_trace __read_mostly =
52{ 223{
53 .name = "function", 224 .name = "function",
54 .init = function_trace_init, 225 .init = function_trace_init,
55 .reset = function_trace_reset, 226 .reset = function_trace_reset,
56 .start = function_trace_start, 227 .start = function_trace_start,
228 .wait_pipe = poll_wait_pipe,
229 .flags = &func_flags,
230 .set_flag = func_set_flag,
57#ifdef CONFIG_FTRACE_SELFTEST 231#ifdef CONFIG_FTRACE_SELFTEST
58 .selftest = trace_selftest_startup_function, 232 .selftest = trace_selftest_startup_function,
59#endif 233#endif
60}; 234};
61 235
236#ifdef CONFIG_DYNAMIC_FTRACE
237static void
238ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data)
239{
240 long *count = (long *)data;
241
242 if (tracing_is_on())
243 return;
244
245 if (!*count)
246 return;
247
248 if (*count != -1)
249 (*count)--;
250
251 tracing_on();
252}
253
254static void
255ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data)
256{
257 long *count = (long *)data;
258
259 if (!tracing_is_on())
260 return;
261
262 if (!*count)
263 return;
264
265 if (*count != -1)
266 (*count)--;
267
268 tracing_off();
269}
270
271static int
272ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
273 struct ftrace_probe_ops *ops, void *data);
274
275static struct ftrace_probe_ops traceon_probe_ops = {
276 .func = ftrace_traceon,
277 .print = ftrace_trace_onoff_print,
278};
279
280static struct ftrace_probe_ops traceoff_probe_ops = {
281 .func = ftrace_traceoff,
282 .print = ftrace_trace_onoff_print,
283};
284
285static int
286ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
287 struct ftrace_probe_ops *ops, void *data)
288{
289 char str[KSYM_SYMBOL_LEN];
290 long count = (long)data;
291
292 kallsyms_lookup(ip, NULL, NULL, NULL, str);
293 seq_printf(m, "%s:", str);
294
295 if (ops == &traceon_probe_ops)
296 seq_printf(m, "traceon");
297 else
298 seq_printf(m, "traceoff");
299
300 if (count == -1)
301 seq_printf(m, ":unlimited\n");
302 else
303 seq_printf(m, ":count=%ld", count);
304 seq_putc(m, '\n');
305
306 return 0;
307}
308
309static int
310ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param)
311{
312 struct ftrace_probe_ops *ops;
313
314 /* we register both traceon and traceoff to this callback */
315 if (strcmp(cmd, "traceon") == 0)
316 ops = &traceon_probe_ops;
317 else
318 ops = &traceoff_probe_ops;
319
320 unregister_ftrace_function_probe_func(glob, ops);
321
322 return 0;
323}
324
325static int
326ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable)
327{
328 struct ftrace_probe_ops *ops;
329 void *count = (void *)-1;
330 char *number;
331 int ret;
332
333 /* hash funcs only work with set_ftrace_filter */
334 if (!enable)
335 return -EINVAL;
336
337 if (glob[0] == '!')
338 return ftrace_trace_onoff_unreg(glob+1, cmd, param);
339
340 /* we register both traceon and traceoff to this callback */
341 if (strcmp(cmd, "traceon") == 0)
342 ops = &traceon_probe_ops;
343 else
344 ops = &traceoff_probe_ops;
345
346 if (!param)
347 goto out_reg;
348
349 number = strsep(&param, ":");
350
351 if (!strlen(number))
352 goto out_reg;
353
354 /*
355 * We use the callback data field (which is a pointer)
356 * as our counter.
357 */
358 ret = strict_strtoul(number, 0, (unsigned long *)&count);
359 if (ret)
360 return ret;
361
362 out_reg:
363 ret = register_ftrace_function_probe(glob, ops, count);
364
365 return ret;
366}
367
368static struct ftrace_func_command ftrace_traceon_cmd = {
369 .name = "traceon",
370 .func = ftrace_trace_onoff_callback,
371};
372
373static struct ftrace_func_command ftrace_traceoff_cmd = {
374 .name = "traceoff",
375 .func = ftrace_trace_onoff_callback,
376};
377
378static int __init init_func_cmd_traceon(void)
379{
380 int ret;
381
382 ret = register_ftrace_command(&ftrace_traceoff_cmd);
383 if (ret)
384 return ret;
385
386 ret = register_ftrace_command(&ftrace_traceon_cmd);
387 if (ret)
388 unregister_ftrace_command(&ftrace_traceoff_cmd);
389 return ret;
390}
391#else
392static inline int init_func_cmd_traceon(void)
393{
394 return 0;
395}
396#endif /* CONFIG_DYNAMIC_FTRACE */
397
62static __init int init_function_trace(void) 398static __init int init_function_trace(void)
63{ 399{
400 init_func_cmd_traceon();
64 return register_tracer(&function_trace); 401 return register_tracer(&function_trace);
65} 402}
66
67device_initcall(init_function_trace); 403device_initcall(init_function_trace);
404
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index dce71a5b51bc..d28687e7b3a7 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * 2 *
3 * Function graph tracer. 3 * Function graph tracer.
4 * Copyright (c) 2008 Frederic Weisbecker <fweisbec@gmail.com> 4 * Copyright (c) 2008-2009 Frederic Weisbecker <fweisbec@gmail.com>
5 * Mostly borrowed from function tracer which 5 * Mostly borrowed from function tracer which
6 * is Copyright (c) Steven Rostedt <srostedt@redhat.com> 6 * is Copyright (c) Steven Rostedt <srostedt@redhat.com>
7 * 7 *
@@ -12,6 +12,12 @@
12#include <linux/fs.h> 12#include <linux/fs.h>
13 13
14#include "trace.h" 14#include "trace.h"
15#include "trace_output.h"
16
17struct fgraph_data {
18 pid_t last_pid;
19 int depth;
20};
15 21
16#define TRACE_GRAPH_INDENT 2 22#define TRACE_GRAPH_INDENT 2
17 23
@@ -20,9 +26,11 @@
20#define TRACE_GRAPH_PRINT_CPU 0x2 26#define TRACE_GRAPH_PRINT_CPU 0x2
21#define TRACE_GRAPH_PRINT_OVERHEAD 0x4 27#define TRACE_GRAPH_PRINT_OVERHEAD 0x4
22#define TRACE_GRAPH_PRINT_PROC 0x8 28#define TRACE_GRAPH_PRINT_PROC 0x8
29#define TRACE_GRAPH_PRINT_DURATION 0x10
30#define TRACE_GRAPH_PRINT_ABS_TIME 0X20
23 31
24static struct tracer_opt trace_opts[] = { 32static struct tracer_opt trace_opts[] = {
25 /* Display overruns ? */ 33 /* Display overruns? (for self-debug purpose) */
26 { TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) }, 34 { TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) },
27 /* Display CPU ? */ 35 /* Display CPU ? */
28 { TRACER_OPT(funcgraph-cpu, TRACE_GRAPH_PRINT_CPU) }, 36 { TRACER_OPT(funcgraph-cpu, TRACE_GRAPH_PRINT_CPU) },
@@ -30,23 +38,28 @@ static struct tracer_opt trace_opts[] = {
30 { TRACER_OPT(funcgraph-overhead, TRACE_GRAPH_PRINT_OVERHEAD) }, 38 { TRACER_OPT(funcgraph-overhead, TRACE_GRAPH_PRINT_OVERHEAD) },
31 /* Display proc name/pid */ 39 /* Display proc name/pid */
32 { TRACER_OPT(funcgraph-proc, TRACE_GRAPH_PRINT_PROC) }, 40 { TRACER_OPT(funcgraph-proc, TRACE_GRAPH_PRINT_PROC) },
41 /* Display duration of execution */
42 { TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) },
43 /* Display absolute time of an entry */
44 { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) },
33 { } /* Empty entry */ 45 { } /* Empty entry */
34}; 46};
35 47
36static struct tracer_flags tracer_flags = { 48static struct tracer_flags tracer_flags = {
37 /* Don't display overruns and proc by default */ 49 /* Don't display overruns and proc by default */
38 .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD, 50 .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD |
51 TRACE_GRAPH_PRINT_DURATION,
39 .opts = trace_opts 52 .opts = trace_opts
40}; 53};
41 54
42/* pid on the last trace processed */ 55/* pid on the last trace processed */
43static pid_t last_pid[NR_CPUS] = { [0 ... NR_CPUS-1] = -1 }; 56
44 57
45/* Add a function return address to the trace stack on thread info.*/ 58/* Add a function return address to the trace stack on thread info.*/
46int 59int
47ftrace_push_return_trace(unsigned long ret, unsigned long long time, 60ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth)
48 unsigned long func, int *depth)
49{ 61{
62 unsigned long long calltime;
50 int index; 63 int index;
51 64
52 if (!current->ret_stack) 65 if (!current->ret_stack)
@@ -58,11 +71,13 @@ ftrace_push_return_trace(unsigned long ret, unsigned long long time,
58 return -EBUSY; 71 return -EBUSY;
59 } 72 }
60 73
74 calltime = trace_clock_local();
75
61 index = ++current->curr_ret_stack; 76 index = ++current->curr_ret_stack;
62 barrier(); 77 barrier();
63 current->ret_stack[index].ret = ret; 78 current->ret_stack[index].ret = ret;
64 current->ret_stack[index].func = func; 79 current->ret_stack[index].func = func;
65 current->ret_stack[index].calltime = time; 80 current->ret_stack[index].calltime = calltime;
66 *depth = index; 81 *depth = index;
67 82
68 return 0; 83 return 0;
@@ -104,7 +119,7 @@ unsigned long ftrace_return_to_handler(void)
104 unsigned long ret; 119 unsigned long ret;
105 120
106 ftrace_pop_return_trace(&trace, &ret); 121 ftrace_pop_return_trace(&trace, &ret);
107 trace.rettime = cpu_clock(raw_smp_processor_id()); 122 trace.rettime = trace_clock_local();
108 ftrace_graph_return(&trace); 123 ftrace_graph_return(&trace);
109 124
110 if (unlikely(!ret)) { 125 if (unlikely(!ret)) {
@@ -119,12 +134,7 @@ unsigned long ftrace_return_to_handler(void)
119 134
120static int graph_trace_init(struct trace_array *tr) 135static int graph_trace_init(struct trace_array *tr)
121{ 136{
122 int cpu, ret; 137 int ret = register_ftrace_graph(&trace_graph_return,
123
124 for_each_online_cpu(cpu)
125 tracing_reset(tr, cpu);
126
127 ret = register_ftrace_graph(&trace_graph_return,
128 &trace_graph_entry); 138 &trace_graph_entry);
129 if (ret) 139 if (ret)
130 return ret; 140 return ret;
@@ -187,15 +197,15 @@ print_graph_cpu(struct trace_seq *s, int cpu)
187static enum print_line_t 197static enum print_line_t
188print_graph_proc(struct trace_seq *s, pid_t pid) 198print_graph_proc(struct trace_seq *s, pid_t pid)
189{ 199{
190 int i; 200 char comm[TASK_COMM_LEN];
191 int ret;
192 int len;
193 char comm[8];
194 int spaces = 0;
195 /* sign + log10(MAX_INT) + '\0' */ 201 /* sign + log10(MAX_INT) + '\0' */
196 char pid_str[11]; 202 char pid_str[11];
203 int spaces = 0;
204 int ret;
205 int len;
206 int i;
197 207
198 strncpy(comm, trace_find_cmdline(pid), 7); 208 trace_find_cmdline(pid, comm);
199 comm[7] = '\0'; 209 comm[7] = '\0';
200 sprintf(pid_str, "%d", pid); 210 sprintf(pid_str, "%d", pid);
201 211
@@ -228,17 +238,25 @@ print_graph_proc(struct trace_seq *s, pid_t pid)
228 238
229/* If the pid changed since the last trace, output this event */ 239/* If the pid changed since the last trace, output this event */
230static enum print_line_t 240static enum print_line_t
231verif_pid(struct trace_seq *s, pid_t pid, int cpu) 241verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
232{ 242{
233 pid_t prev_pid; 243 pid_t prev_pid;
244 pid_t *last_pid;
234 int ret; 245 int ret;
235 246
236 if (last_pid[cpu] != -1 && last_pid[cpu] == pid) 247 if (!data)
248 return TRACE_TYPE_HANDLED;
249
250 last_pid = &(per_cpu_ptr(data, cpu)->last_pid);
251
252 if (*last_pid == pid)
237 return TRACE_TYPE_HANDLED; 253 return TRACE_TYPE_HANDLED;
238 254
239 prev_pid = last_pid[cpu]; 255 prev_pid = *last_pid;
240 last_pid[cpu] = pid; 256 *last_pid = pid;
241 257
258 if (prev_pid == -1)
259 return TRACE_TYPE_HANDLED;
242/* 260/*
243 * Context-switch trace line: 261 * Context-switch trace line:
244 262
@@ -250,34 +268,34 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu)
250 ret = trace_seq_printf(s, 268 ret = trace_seq_printf(s,
251 " ------------------------------------------\n"); 269 " ------------------------------------------\n");
252 if (!ret) 270 if (!ret)
253 TRACE_TYPE_PARTIAL_LINE; 271 return TRACE_TYPE_PARTIAL_LINE;
254 272
255 ret = print_graph_cpu(s, cpu); 273 ret = print_graph_cpu(s, cpu);
256 if (ret == TRACE_TYPE_PARTIAL_LINE) 274 if (ret == TRACE_TYPE_PARTIAL_LINE)
257 TRACE_TYPE_PARTIAL_LINE; 275 return TRACE_TYPE_PARTIAL_LINE;
258 276
259 ret = print_graph_proc(s, prev_pid); 277 ret = print_graph_proc(s, prev_pid);
260 if (ret == TRACE_TYPE_PARTIAL_LINE) 278 if (ret == TRACE_TYPE_PARTIAL_LINE)
261 TRACE_TYPE_PARTIAL_LINE; 279 return TRACE_TYPE_PARTIAL_LINE;
262 280
263 ret = trace_seq_printf(s, " => "); 281 ret = trace_seq_printf(s, " => ");
264 if (!ret) 282 if (!ret)
265 TRACE_TYPE_PARTIAL_LINE; 283 return TRACE_TYPE_PARTIAL_LINE;
266 284
267 ret = print_graph_proc(s, pid); 285 ret = print_graph_proc(s, pid);
268 if (ret == TRACE_TYPE_PARTIAL_LINE) 286 if (ret == TRACE_TYPE_PARTIAL_LINE)
269 TRACE_TYPE_PARTIAL_LINE; 287 return TRACE_TYPE_PARTIAL_LINE;
270 288
271 ret = trace_seq_printf(s, 289 ret = trace_seq_printf(s,
272 "\n ------------------------------------------\n\n"); 290 "\n ------------------------------------------\n\n");
273 if (!ret) 291 if (!ret)
274 TRACE_TYPE_PARTIAL_LINE; 292 return TRACE_TYPE_PARTIAL_LINE;
275 293
276 return ret; 294 return TRACE_TYPE_HANDLED;
277} 295}
278 296
279static bool 297static struct ftrace_graph_ret_entry *
280trace_branch_is_leaf(struct trace_iterator *iter, 298get_return_for_leaf(struct trace_iterator *iter,
281 struct ftrace_graph_ent_entry *curr) 299 struct ftrace_graph_ent_entry *curr)
282{ 300{
283 struct ring_buffer_iter *ring_iter; 301 struct ring_buffer_iter *ring_iter;
@@ -286,65 +304,123 @@ trace_branch_is_leaf(struct trace_iterator *iter,
286 304
287 ring_iter = iter->buffer_iter[iter->cpu]; 305 ring_iter = iter->buffer_iter[iter->cpu];
288 306
289 if (!ring_iter) 307 /* First peek to compare current entry and the next one */
290 return false; 308 if (ring_iter)
291 309 event = ring_buffer_iter_peek(ring_iter, NULL);
292 event = ring_buffer_iter_peek(ring_iter, NULL); 310 else {
311 /* We need to consume the current entry to see the next one */
312 ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL);
313 event = ring_buffer_peek(iter->tr->buffer, iter->cpu,
314 NULL);
315 }
293 316
294 if (!event) 317 if (!event)
295 return false; 318 return NULL;
296 319
297 next = ring_buffer_event_data(event); 320 next = ring_buffer_event_data(event);
298 321
299 if (next->ent.type != TRACE_GRAPH_RET) 322 if (next->ent.type != TRACE_GRAPH_RET)
300 return false; 323 return NULL;
301 324
302 if (curr->ent.pid != next->ent.pid || 325 if (curr->ent.pid != next->ent.pid ||
303 curr->graph_ent.func != next->ret.func) 326 curr->graph_ent.func != next->ret.func)
304 return false; 327 return NULL;
328
329 /* this is a leaf, now advance the iterator */
330 if (ring_iter)
331 ring_buffer_read(ring_iter, NULL);
332
333 return next;
334}
335
336/* Signal a overhead of time execution to the output */
337static int
338print_graph_overhead(unsigned long long duration, struct trace_seq *s)
339{
340 /* If duration disappear, we don't need anything */
341 if (!(tracer_flags.val & TRACE_GRAPH_PRINT_DURATION))
342 return 1;
343
344 /* Non nested entry or return */
345 if (duration == -1)
346 return trace_seq_printf(s, " ");
347
348 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
349 /* Duration exceeded 100 msecs */
350 if (duration > 100000ULL)
351 return trace_seq_printf(s, "! ");
352
353 /* Duration exceeded 10 msecs */
354 if (duration > 10000ULL)
355 return trace_seq_printf(s, "+ ");
356 }
357
358 return trace_seq_printf(s, " ");
359}
360
361static int print_graph_abs_time(u64 t, struct trace_seq *s)
362{
363 unsigned long usecs_rem;
364
365 usecs_rem = do_div(t, NSEC_PER_SEC);
366 usecs_rem /= 1000;
305 367
306 return true; 368 return trace_seq_printf(s, "%5lu.%06lu | ",
369 (unsigned long)t, usecs_rem);
307} 370}
308 371
309static enum print_line_t 372static enum print_line_t
310print_graph_irq(struct trace_seq *s, unsigned long addr, 373print_graph_irq(struct trace_iterator *iter, unsigned long addr,
311 enum trace_type type, int cpu, pid_t pid) 374 enum trace_type type, int cpu, pid_t pid)
312{ 375{
313 int ret; 376 int ret;
377 struct trace_seq *s = &iter->seq;
314 378
315 if (addr < (unsigned long)__irqentry_text_start || 379 if (addr < (unsigned long)__irqentry_text_start ||
316 addr >= (unsigned long)__irqentry_text_end) 380 addr >= (unsigned long)__irqentry_text_end)
317 return TRACE_TYPE_UNHANDLED; 381 return TRACE_TYPE_UNHANDLED;
318 382
319 if (type == TRACE_GRAPH_ENT) { 383 /* Absolute time */
320 ret = trace_seq_printf(s, "==========> | "); 384 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) {
321 } else { 385 ret = print_graph_abs_time(iter->ts, s);
322 /* Cpu */ 386 if (!ret)
323 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { 387 return TRACE_TYPE_PARTIAL_LINE;
324 ret = print_graph_cpu(s, cpu); 388 }
325 if (ret == TRACE_TYPE_PARTIAL_LINE)
326 return TRACE_TYPE_PARTIAL_LINE;
327 }
328 /* Proc */
329 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
330 ret = print_graph_proc(s, pid);
331 if (ret == TRACE_TYPE_PARTIAL_LINE)
332 return TRACE_TYPE_PARTIAL_LINE;
333 389
334 ret = trace_seq_printf(s, " | "); 390 /* Cpu */
335 if (!ret) 391 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
336 return TRACE_TYPE_PARTIAL_LINE; 392 ret = print_graph_cpu(s, cpu);
337 } 393 if (ret == TRACE_TYPE_PARTIAL_LINE)
394 return TRACE_TYPE_PARTIAL_LINE;
395 }
396 /* Proc */
397 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
398 ret = print_graph_proc(s, pid);
399 if (ret == TRACE_TYPE_PARTIAL_LINE)
400 return TRACE_TYPE_PARTIAL_LINE;
401 ret = trace_seq_printf(s, " | ");
402 if (!ret)
403 return TRACE_TYPE_PARTIAL_LINE;
404 }
338 405
339 /* No overhead */ 406 /* No overhead */
340 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { 407 ret = print_graph_overhead(-1, s);
341 ret = trace_seq_printf(s, " "); 408 if (!ret)
342 if (!ret) 409 return TRACE_TYPE_PARTIAL_LINE;
343 return TRACE_TYPE_PARTIAL_LINE; 410
344 } 411 if (type == TRACE_GRAPH_ENT)
412 ret = trace_seq_printf(s, "==========>");
413 else
414 ret = trace_seq_printf(s, "<==========");
415
416 if (!ret)
417 return TRACE_TYPE_PARTIAL_LINE;
418
419 /* Don't close the duration column if haven't one */
420 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
421 trace_seq_printf(s, " |");
422 ret = trace_seq_printf(s, "\n");
345 423
346 ret = trace_seq_printf(s, "<========== |\n");
347 }
348 if (!ret) 424 if (!ret)
349 return TRACE_TYPE_PARTIAL_LINE; 425 return TRACE_TYPE_PARTIAL_LINE;
350 return TRACE_TYPE_HANDLED; 426 return TRACE_TYPE_HANDLED;
@@ -363,7 +439,7 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s)
363 sprintf(msecs_str, "%lu", (unsigned long) duration); 439 sprintf(msecs_str, "%lu", (unsigned long) duration);
364 440
365 /* Print msecs */ 441 /* Print msecs */
366 ret = trace_seq_printf(s, msecs_str); 442 ret = trace_seq_printf(s, "%s", msecs_str);
367 if (!ret) 443 if (!ret)
368 return TRACE_TYPE_PARTIAL_LINE; 444 return TRACE_TYPE_PARTIAL_LINE;
369 445
@@ -396,52 +472,47 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s)
396 472
397} 473}
398 474
399/* Signal a overhead of time execution to the output */
400static int
401print_graph_overhead(unsigned long long duration, struct trace_seq *s)
402{
403 /* Duration exceeded 100 msecs */
404 if (duration > 100000ULL)
405 return trace_seq_printf(s, "! ");
406
407 /* Duration exceeded 10 msecs */
408 if (duration > 10000ULL)
409 return trace_seq_printf(s, "+ ");
410
411 return trace_seq_printf(s, " ");
412}
413
414/* Case of a leaf function on its call entry */ 475/* Case of a leaf function on its call entry */
415static enum print_line_t 476static enum print_line_t
416print_graph_entry_leaf(struct trace_iterator *iter, 477print_graph_entry_leaf(struct trace_iterator *iter,
417 struct ftrace_graph_ent_entry *entry, struct trace_seq *s) 478 struct ftrace_graph_ent_entry *entry,
479 struct ftrace_graph_ret_entry *ret_entry, struct trace_seq *s)
418{ 480{
419 struct ftrace_graph_ret_entry *ret_entry; 481 struct fgraph_data *data = iter->private;
420 struct ftrace_graph_ret *graph_ret; 482 struct ftrace_graph_ret *graph_ret;
421 struct ring_buffer_event *event;
422 struct ftrace_graph_ent *call; 483 struct ftrace_graph_ent *call;
423 unsigned long long duration; 484 unsigned long long duration;
424 int ret; 485 int ret;
425 int i; 486 int i;
426 487
427 event = ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
428 ret_entry = ring_buffer_event_data(event);
429 graph_ret = &ret_entry->ret; 488 graph_ret = &ret_entry->ret;
430 call = &entry->graph_ent; 489 call = &entry->graph_ent;
431 duration = graph_ret->rettime - graph_ret->calltime; 490 duration = graph_ret->rettime - graph_ret->calltime;
432 491
433 /* Overhead */ 492 if (data) {
434 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { 493 int cpu = iter->cpu;
435 ret = print_graph_overhead(duration, s); 494 int *depth = &(per_cpu_ptr(data, cpu)->depth);
436 if (!ret) 495
437 return TRACE_TYPE_PARTIAL_LINE; 496 /*
497 * Comments display at + 1 to depth. Since
498 * this is a leaf function, keep the comments
499 * equal to this depth.
500 */
501 *depth = call->depth - 1;
438 } 502 }
439 503
440 /* Duration */ 504 /* Overhead */
441 ret = print_graph_duration(duration, s); 505 ret = print_graph_overhead(duration, s);
442 if (ret == TRACE_TYPE_PARTIAL_LINE) 506 if (!ret)
443 return TRACE_TYPE_PARTIAL_LINE; 507 return TRACE_TYPE_PARTIAL_LINE;
444 508
509 /* Duration */
510 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) {
511 ret = print_graph_duration(duration, s);
512 if (ret == TRACE_TYPE_PARTIAL_LINE)
513 return TRACE_TYPE_PARTIAL_LINE;
514 }
515
445 /* Function */ 516 /* Function */
446 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { 517 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
447 ret = trace_seq_printf(s, " "); 518 ret = trace_seq_printf(s, " ");
@@ -461,33 +532,34 @@ print_graph_entry_leaf(struct trace_iterator *iter,
461} 532}
462 533
463static enum print_line_t 534static enum print_line_t
464print_graph_entry_nested(struct ftrace_graph_ent_entry *entry, 535print_graph_entry_nested(struct trace_iterator *iter,
465 struct trace_seq *s, pid_t pid, int cpu) 536 struct ftrace_graph_ent_entry *entry,
537 struct trace_seq *s, int cpu)
466{ 538{
467 int i;
468 int ret;
469 struct ftrace_graph_ent *call = &entry->graph_ent; 539 struct ftrace_graph_ent *call = &entry->graph_ent;
540 struct fgraph_data *data = iter->private;
541 int ret;
542 int i;
470 543
471 /* No overhead */ 544 if (data) {
472 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { 545 int cpu = iter->cpu;
473 ret = trace_seq_printf(s, " "); 546 int *depth = &(per_cpu_ptr(data, cpu)->depth);
474 if (!ret) 547
475 return TRACE_TYPE_PARTIAL_LINE; 548 *depth = call->depth;
476 } 549 }
477 550
478 /* Interrupt */ 551 /* No overhead */
479 ret = print_graph_irq(s, call->func, TRACE_GRAPH_ENT, cpu, pid); 552 ret = print_graph_overhead(-1, s);
480 if (ret == TRACE_TYPE_UNHANDLED) { 553 if (!ret)
481 /* No time */ 554 return TRACE_TYPE_PARTIAL_LINE;
555
556 /* No time */
557 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) {
482 ret = trace_seq_printf(s, " | "); 558 ret = trace_seq_printf(s, " | ");
483 if (!ret) 559 if (!ret)
484 return TRACE_TYPE_PARTIAL_LINE; 560 return TRACE_TYPE_PARTIAL_LINE;
485 } else {
486 if (ret == TRACE_TYPE_PARTIAL_LINE)
487 return TRACE_TYPE_PARTIAL_LINE;
488 } 561 }
489 562
490
491 /* Function */ 563 /* Function */
492 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { 564 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
493 ret = trace_seq_printf(s, " "); 565 ret = trace_seq_printf(s, " ");
@@ -503,20 +575,40 @@ print_graph_entry_nested(struct ftrace_graph_ent_entry *entry,
503 if (!ret) 575 if (!ret)
504 return TRACE_TYPE_PARTIAL_LINE; 576 return TRACE_TYPE_PARTIAL_LINE;
505 577
506 return TRACE_TYPE_HANDLED; 578 /*
579 * we already consumed the current entry to check the next one
580 * and see if this is a leaf.
581 */
582 return TRACE_TYPE_NO_CONSUME;
507} 583}
508 584
509static enum print_line_t 585static enum print_line_t
510print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, 586print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
511 struct trace_iterator *iter, int cpu) 587 int type, unsigned long addr)
512{ 588{
513 int ret; 589 struct fgraph_data *data = iter->private;
514 struct trace_entry *ent = iter->ent; 590 struct trace_entry *ent = iter->ent;
591 int cpu = iter->cpu;
592 int ret;
515 593
516 /* Pid */ 594 /* Pid */
517 if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE) 595 if (verif_pid(s, ent->pid, cpu, data) == TRACE_TYPE_PARTIAL_LINE)
518 return TRACE_TYPE_PARTIAL_LINE; 596 return TRACE_TYPE_PARTIAL_LINE;
519 597
598 if (type) {
599 /* Interrupt */
600 ret = print_graph_irq(iter, addr, type, cpu, ent->pid);
601 if (ret == TRACE_TYPE_PARTIAL_LINE)
602 return TRACE_TYPE_PARTIAL_LINE;
603 }
604
605 /* Absolute time */
606 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) {
607 ret = print_graph_abs_time(iter->ts, s);
608 if (!ret)
609 return TRACE_TYPE_PARTIAL_LINE;
610 }
611
520 /* Cpu */ 612 /* Cpu */
521 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { 613 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
522 ret = print_graph_cpu(s, cpu); 614 ret = print_graph_cpu(s, cpu);
@@ -535,54 +627,65 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
535 return TRACE_TYPE_PARTIAL_LINE; 627 return TRACE_TYPE_PARTIAL_LINE;
536 } 628 }
537 629
538 if (trace_branch_is_leaf(iter, field)) 630 return 0;
539 return print_graph_entry_leaf(iter, field, s); 631}
632
633static enum print_line_t
634print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
635 struct trace_iterator *iter)
636{
637 int cpu = iter->cpu;
638 struct ftrace_graph_ent *call = &field->graph_ent;
639 struct ftrace_graph_ret_entry *leaf_ret;
640
641 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func))
642 return TRACE_TYPE_PARTIAL_LINE;
643
644 leaf_ret = get_return_for_leaf(iter, field);
645 if (leaf_ret)
646 return print_graph_entry_leaf(iter, field, leaf_ret, s);
540 else 647 else
541 return print_graph_entry_nested(field, s, iter->ent->pid, cpu); 648 return print_graph_entry_nested(iter, field, s, cpu);
542 649
543} 650}
544 651
545static enum print_line_t 652static enum print_line_t
546print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, 653print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
547 struct trace_entry *ent, int cpu) 654 struct trace_entry *ent, struct trace_iterator *iter)
548{ 655{
549 int i;
550 int ret;
551 unsigned long long duration = trace->rettime - trace->calltime; 656 unsigned long long duration = trace->rettime - trace->calltime;
657 struct fgraph_data *data = iter->private;
658 pid_t pid = ent->pid;
659 int cpu = iter->cpu;
660 int ret;
661 int i;
552 662
553 /* Pid */ 663 if (data) {
554 if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE) 664 int cpu = iter->cpu;
555 return TRACE_TYPE_PARTIAL_LINE; 665 int *depth = &(per_cpu_ptr(data, cpu)->depth);
556 666
557 /* Cpu */ 667 /*
558 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { 668 * Comments display at + 1 to depth. This is the
559 ret = print_graph_cpu(s, cpu); 669 * return from a function, we now want the comments
560 if (ret == TRACE_TYPE_PARTIAL_LINE) 670 * to display at the same level of the bracket.
561 return TRACE_TYPE_PARTIAL_LINE; 671 */
672 *depth = trace->depth - 1;
562 } 673 }
563 674
564 /* Proc */ 675 if (print_graph_prologue(iter, s, 0, 0))
565 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { 676 return TRACE_TYPE_PARTIAL_LINE;
566 ret = print_graph_proc(s, ent->pid);
567 if (ret == TRACE_TYPE_PARTIAL_LINE)
568 return TRACE_TYPE_PARTIAL_LINE;
569
570 ret = trace_seq_printf(s, " | ");
571 if (!ret)
572 return TRACE_TYPE_PARTIAL_LINE;
573 }
574 677
575 /* Overhead */ 678 /* Overhead */
576 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { 679 ret = print_graph_overhead(duration, s);
577 ret = print_graph_overhead(duration, s); 680 if (!ret)
578 if (!ret) 681 return TRACE_TYPE_PARTIAL_LINE;
579 return TRACE_TYPE_PARTIAL_LINE;
580 }
581 682
582 /* Duration */ 683 /* Duration */
583 ret = print_graph_duration(duration, s); 684 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) {
584 if (ret == TRACE_TYPE_PARTIAL_LINE) 685 ret = print_graph_duration(duration, s);
585 return TRACE_TYPE_PARTIAL_LINE; 686 if (ret == TRACE_TYPE_PARTIAL_LINE)
687 return TRACE_TYPE_PARTIAL_LINE;
688 }
586 689
587 /* Closing brace */ 690 /* Closing brace */
588 for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { 691 for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) {
@@ -603,7 +706,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
603 return TRACE_TYPE_PARTIAL_LINE; 706 return TRACE_TYPE_PARTIAL_LINE;
604 } 707 }
605 708
606 ret = print_graph_irq(s, trace->func, TRACE_GRAPH_RET, cpu, ent->pid); 709 ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, cpu, pid);
607 if (ret == TRACE_TYPE_PARTIAL_LINE) 710 if (ret == TRACE_TYPE_PARTIAL_LINE)
608 return TRACE_TYPE_PARTIAL_LINE; 711 return TRACE_TYPE_PARTIAL_LINE;
609 712
@@ -611,61 +714,73 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
611} 714}
612 715
613static enum print_line_t 716static enum print_line_t
614print_graph_comment(struct print_entry *trace, struct trace_seq *s, 717print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
615 struct trace_entry *ent, struct trace_iterator *iter) 718 struct trace_iterator *iter)
616{ 719{
617 int i; 720 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
721 struct fgraph_data *data = iter->private;
722 struct trace_event *event;
723 int depth = 0;
618 int ret; 724 int ret;
725 int i;
619 726
620 /* Pid */ 727 if (data)
621 if (verif_pid(s, ent->pid, iter->cpu) == TRACE_TYPE_PARTIAL_LINE) 728 depth = per_cpu_ptr(data, iter->cpu)->depth;
622 return TRACE_TYPE_PARTIAL_LINE;
623
624 /* Cpu */
625 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
626 ret = print_graph_cpu(s, iter->cpu);
627 if (ret == TRACE_TYPE_PARTIAL_LINE)
628 return TRACE_TYPE_PARTIAL_LINE;
629 }
630
631 /* Proc */
632 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
633 ret = print_graph_proc(s, ent->pid);
634 if (ret == TRACE_TYPE_PARTIAL_LINE)
635 return TRACE_TYPE_PARTIAL_LINE;
636 729
637 ret = trace_seq_printf(s, " | "); 730 if (print_graph_prologue(iter, s, 0, 0))
638 if (!ret) 731 return TRACE_TYPE_PARTIAL_LINE;
639 return TRACE_TYPE_PARTIAL_LINE;
640 }
641 732
642 /* No overhead */ 733 /* No overhead */
643 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { 734 ret = print_graph_overhead(-1, s);
644 ret = trace_seq_printf(s, " "); 735 if (!ret)
736 return TRACE_TYPE_PARTIAL_LINE;
737
738 /* No time */
739 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) {
740 ret = trace_seq_printf(s, " | ");
645 if (!ret) 741 if (!ret)
646 return TRACE_TYPE_PARTIAL_LINE; 742 return TRACE_TYPE_PARTIAL_LINE;
647 } 743 }
648 744
649 /* No time */
650 ret = trace_seq_printf(s, " | ");
651 if (!ret)
652 return TRACE_TYPE_PARTIAL_LINE;
653
654 /* Indentation */ 745 /* Indentation */
655 if (trace->depth > 0) 746 if (depth > 0)
656 for (i = 0; i < (trace->depth + 1) * TRACE_GRAPH_INDENT; i++) { 747 for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) {
657 ret = trace_seq_printf(s, " "); 748 ret = trace_seq_printf(s, " ");
658 if (!ret) 749 if (!ret)
659 return TRACE_TYPE_PARTIAL_LINE; 750 return TRACE_TYPE_PARTIAL_LINE;
660 } 751 }
661 752
662 /* The comment */ 753 /* The comment */
663 ret = trace_seq_printf(s, "/* %s", trace->buf); 754 ret = trace_seq_printf(s, "/* ");
664 if (!ret) 755 if (!ret)
665 return TRACE_TYPE_PARTIAL_LINE; 756 return TRACE_TYPE_PARTIAL_LINE;
666 757
667 if (ent->flags & TRACE_FLAG_CONT) 758 switch (iter->ent->type) {
668 trace_seq_print_cont(s, iter); 759 case TRACE_BPRINT:
760 ret = trace_print_bprintk_msg_only(iter);
761 if (ret != TRACE_TYPE_HANDLED)
762 return ret;
763 break;
764 case TRACE_PRINT:
765 ret = trace_print_printk_msg_only(iter);
766 if (ret != TRACE_TYPE_HANDLED)
767 return ret;
768 break;
769 default:
770 event = ftrace_find_event(ent->type);
771 if (!event)
772 return TRACE_TYPE_UNHANDLED;
773
774 ret = event->trace(iter, sym_flags);
775 if (ret != TRACE_TYPE_HANDLED)
776 return ret;
777 }
778
779 /* Strip ending newline */
780 if (s->buffer[s->len - 1] == '\n') {
781 s->buffer[s->len - 1] = '\0';
782 s->len--;
783 }
669 784
670 ret = trace_seq_printf(s, " */\n"); 785 ret = trace_seq_printf(s, " */\n");
671 if (!ret) 786 if (!ret)
@@ -678,62 +793,91 @@ print_graph_comment(struct print_entry *trace, struct trace_seq *s,
678enum print_line_t 793enum print_line_t
679print_graph_function(struct trace_iterator *iter) 794print_graph_function(struct trace_iterator *iter)
680{ 795{
681 struct trace_seq *s = &iter->seq;
682 struct trace_entry *entry = iter->ent; 796 struct trace_entry *entry = iter->ent;
797 struct trace_seq *s = &iter->seq;
683 798
684 switch (entry->type) { 799 switch (entry->type) {
685 case TRACE_GRAPH_ENT: { 800 case TRACE_GRAPH_ENT: {
686 struct ftrace_graph_ent_entry *field; 801 struct ftrace_graph_ent_entry *field;
687 trace_assign_type(field, entry); 802 trace_assign_type(field, entry);
688 return print_graph_entry(field, s, iter, 803 return print_graph_entry(field, s, iter);
689 iter->cpu);
690 } 804 }
691 case TRACE_GRAPH_RET: { 805 case TRACE_GRAPH_RET: {
692 struct ftrace_graph_ret_entry *field; 806 struct ftrace_graph_ret_entry *field;
693 trace_assign_type(field, entry); 807 trace_assign_type(field, entry);
694 return print_graph_return(&field->ret, s, entry, iter->cpu); 808 return print_graph_return(&field->ret, s, entry, iter);
695 }
696 case TRACE_PRINT: {
697 struct print_entry *field;
698 trace_assign_type(field, entry);
699 return print_graph_comment(field, s, entry, iter);
700 } 809 }
701 default: 810 default:
702 return TRACE_TYPE_UNHANDLED; 811 return print_graph_comment(s, entry, iter);
703 } 812 }
813
814 return TRACE_TYPE_HANDLED;
704} 815}
705 816
706static void print_graph_headers(struct seq_file *s) 817static void print_graph_headers(struct seq_file *s)
707{ 818{
708 /* 1st line */ 819 /* 1st line */
709 seq_printf(s, "# "); 820 seq_printf(s, "# ");
821 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
822 seq_printf(s, " TIME ");
710 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 823 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
711 seq_printf(s, "CPU "); 824 seq_printf(s, "CPU");
712 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 825 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
713 seq_printf(s, "TASK/PID "); 826 seq_printf(s, " TASK/PID ");
714 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) 827 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
715 seq_printf(s, "OVERHEAD/"); 828 seq_printf(s, " DURATION ");
716 seq_printf(s, "DURATION FUNCTION CALLS\n"); 829 seq_printf(s, " FUNCTION CALLS\n");
717 830
718 /* 2nd line */ 831 /* 2nd line */
719 seq_printf(s, "# "); 832 seq_printf(s, "# ");
833 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
834 seq_printf(s, " | ");
720 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 835 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
721 seq_printf(s, "| "); 836 seq_printf(s, "| ");
722 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 837 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
723 seq_printf(s, "| | "); 838 seq_printf(s, " | | ");
724 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { 839 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
725 seq_printf(s, "| "); 840 seq_printf(s, " | | ");
726 seq_printf(s, "| | | | |\n"); 841 seq_printf(s, " | | | |\n");
727 } else 842}
728 seq_printf(s, " | | | | |\n"); 843
844static void graph_trace_open(struct trace_iterator *iter)
845{
846 /* pid and depth on the last trace processed */
847 struct fgraph_data *data = alloc_percpu(struct fgraph_data);
848 int cpu;
849
850 if (!data)
851 pr_warning("function graph tracer: not enough memory\n");
852 else
853 for_each_possible_cpu(cpu) {
854 pid_t *pid = &(per_cpu_ptr(data, cpu)->last_pid);
855 int *depth = &(per_cpu_ptr(data, cpu)->depth);
856 *pid = -1;
857 *depth = 0;
858 }
859
860 iter->private = data;
729} 861}
862
863static void graph_trace_close(struct trace_iterator *iter)
864{
865 free_percpu(iter->private);
866}
867
730static struct tracer graph_trace __read_mostly = { 868static struct tracer graph_trace __read_mostly = {
731 .name = "function_graph", 869 .name = "function_graph",
732 .init = graph_trace_init, 870 .open = graph_trace_open,
733 .reset = graph_trace_reset, 871 .close = graph_trace_close,
872 .wait_pipe = poll_wait_pipe,
873 .init = graph_trace_init,
874 .reset = graph_trace_reset,
734 .print_line = print_graph_function, 875 .print_line = print_graph_function,
735 .print_header = print_graph_headers, 876 .print_header = print_graph_headers,
736 .flags = &tracer_flags, 877 .flags = &tracer_flags,
878#ifdef CONFIG_FTRACE_SELFTEST
879 .selftest = trace_selftest_startup_function_graph,
880#endif
737}; 881};
738 882
739static __init int init_graph_trace(void) 883static __init int init_graph_trace(void)
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index 649df22d435f..7bfdf4c2347f 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -1,30 +1,53 @@
1/* 1/*
2 * h/w branch tracer for x86 based on bts 2 * h/w branch tracer for x86 based on bts
3 * 3 *
4 * Copyright (C) 2008 Markus Metzger <markus.t.metzger@gmail.com> 4 * Copyright (C) 2008-2009 Intel Corporation.
5 * 5 * Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009
6 */ 6 */
7 7#include <linux/spinlock.h>
8#include <linux/module.h> 8#include <linux/kallsyms.h>
9#include <linux/fs.h>
10#include <linux/debugfs.h> 9#include <linux/debugfs.h>
11#include <linux/ftrace.h> 10#include <linux/ftrace.h>
12#include <linux/kallsyms.h> 11#include <linux/module.h>
12#include <linux/cpu.h>
13#include <linux/smp.h>
14#include <linux/fs.h>
13 15
14#include <asm/ds.h> 16#include <asm/ds.h>
15 17
16#include "trace.h" 18#include "trace.h"
19#include "trace_output.h"
17 20
18 21
19#define SIZEOF_BTS (1 << 13) 22#define SIZEOF_BTS (1 << 13)
20 23
24/*
25 * The tracer lock protects the below per-cpu tracer array.
26 * It needs to be held to:
27 * - start tracing on all cpus
28 * - stop tracing on all cpus
29 * - start tracing on a single hotplug cpu
30 * - stop tracing on a single hotplug cpu
31 * - read the trace from all cpus
32 * - read the trace from a single cpu
33 */
34static DEFINE_SPINLOCK(bts_tracer_lock);
21static DEFINE_PER_CPU(struct bts_tracer *, tracer); 35static DEFINE_PER_CPU(struct bts_tracer *, tracer);
22static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer); 36static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer);
23 37
24#define this_tracer per_cpu(tracer, smp_processor_id()) 38#define this_tracer per_cpu(tracer, smp_processor_id())
25#define this_buffer per_cpu(buffer, smp_processor_id()) 39#define this_buffer per_cpu(buffer, smp_processor_id())
26 40
41static int __read_mostly trace_hw_branches_enabled;
42static struct trace_array *hw_branch_trace __read_mostly;
43
27 44
45/*
46 * Start tracing on the current cpu.
47 * The argument is ignored.
48 *
49 * pre: bts_tracer_lock must be locked.
50 */
28static void bts_trace_start_cpu(void *arg) 51static void bts_trace_start_cpu(void *arg)
29{ 52{
30 if (this_tracer) 53 if (this_tracer)
@@ -42,14 +65,20 @@ static void bts_trace_start_cpu(void *arg)
42 65
43static void bts_trace_start(struct trace_array *tr) 66static void bts_trace_start(struct trace_array *tr)
44{ 67{
45 int cpu; 68 spin_lock(&bts_tracer_lock);
46 69
47 tracing_reset_online_cpus(tr); 70 on_each_cpu(bts_trace_start_cpu, NULL, 1);
71 trace_hw_branches_enabled = 1;
48 72
49 for_each_cpu(cpu, cpu_possible_mask) 73 spin_unlock(&bts_tracer_lock);
50 smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1);
51} 74}
52 75
76/*
77 * Stop tracing on the current cpu.
78 * The argument is ignored.
79 *
80 * pre: bts_tracer_lock must be locked.
81 */
53static void bts_trace_stop_cpu(void *arg) 82static void bts_trace_stop_cpu(void *arg)
54{ 83{
55 if (this_tracer) { 84 if (this_tracer) {
@@ -60,26 +89,60 @@ static void bts_trace_stop_cpu(void *arg)
60 89
61static void bts_trace_stop(struct trace_array *tr) 90static void bts_trace_stop(struct trace_array *tr)
62{ 91{
63 int cpu; 92 spin_lock(&bts_tracer_lock);
93
94 trace_hw_branches_enabled = 0;
95 on_each_cpu(bts_trace_stop_cpu, NULL, 1);
96
97 spin_unlock(&bts_tracer_lock);
98}
99
100static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
101 unsigned long action, void *hcpu)
102{
103 unsigned int cpu = (unsigned long)hcpu;
64 104
65 for_each_cpu(cpu, cpu_possible_mask) 105 spin_lock(&bts_tracer_lock);
106
107 if (!trace_hw_branches_enabled)
108 goto out;
109
110 switch (action) {
111 case CPU_ONLINE:
112 case CPU_DOWN_FAILED:
113 smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1);
114 break;
115 case CPU_DOWN_PREPARE:
66 smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1); 116 smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1);
117 break;
118 }
119
120 out:
121 spin_unlock(&bts_tracer_lock);
122 return NOTIFY_DONE;
67} 123}
68 124
125static struct notifier_block bts_hotcpu_notifier __cpuinitdata = {
126 .notifier_call = bts_hotcpu_handler
127};
128
69static int bts_trace_init(struct trace_array *tr) 129static int bts_trace_init(struct trace_array *tr)
70{ 130{
71 tracing_reset_online_cpus(tr); 131 hw_branch_trace = tr;
132
72 bts_trace_start(tr); 133 bts_trace_start(tr);
73 134
74 return 0; 135 return 0;
75} 136}
76 137
138static void bts_trace_reset(struct trace_array *tr)
139{
140 bts_trace_stop(tr);
141}
142
77static void bts_trace_print_header(struct seq_file *m) 143static void bts_trace_print_header(struct seq_file *m)
78{ 144{
79 seq_puts(m, 145 seq_puts(m, "# CPU# TO <- FROM\n");
80 "# CPU# FROM TO FUNCTION\n");
81 seq_puts(m,
82 "# | | | |\n");
83} 146}
84 147
85static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) 148static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
@@ -87,15 +150,15 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
87 struct trace_entry *entry = iter->ent; 150 struct trace_entry *entry = iter->ent;
88 struct trace_seq *seq = &iter->seq; 151 struct trace_seq *seq = &iter->seq;
89 struct hw_branch_entry *it; 152 struct hw_branch_entry *it;
153 unsigned long symflags = TRACE_ITER_SYM_OFFSET;
90 154
91 trace_assign_type(it, entry); 155 trace_assign_type(it, entry);
92 156
93 if (entry->type == TRACE_HW_BRANCHES) { 157 if (entry->type == TRACE_HW_BRANCHES) {
94 if (trace_seq_printf(seq, "%4d ", entry->cpu) && 158 if (trace_seq_printf(seq, "%4d ", iter->cpu) &&
95 trace_seq_printf(seq, "0x%016llx -> 0x%016llx ", 159 seq_print_ip_sym(seq, it->to, symflags) &&
96 it->from, it->to) && 160 trace_seq_printf(seq, "\t <- ") &&
97 (!it->from || 161 seq_print_ip_sym(seq, it->from, symflags) &&
98 seq_print_ip_sym(seq, it->from, /* sym_flags = */ 0)) &&
99 trace_seq_printf(seq, "\n")) 162 trace_seq_printf(seq, "\n"))
100 return TRACE_TYPE_HANDLED; 163 return TRACE_TYPE_HANDLED;
101 return TRACE_TYPE_PARTIAL_LINE;; 164 return TRACE_TYPE_PARTIAL_LINE;;
@@ -103,26 +166,42 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
103 return TRACE_TYPE_UNHANDLED; 166 return TRACE_TYPE_UNHANDLED;
104} 167}
105 168
106void trace_hw_branch(struct trace_array *tr, u64 from, u64 to) 169void trace_hw_branch(u64 from, u64 to)
107{ 170{
171 struct trace_array *tr = hw_branch_trace;
108 struct ring_buffer_event *event; 172 struct ring_buffer_event *event;
109 struct hw_branch_entry *entry; 173 struct hw_branch_entry *entry;
110 unsigned long irq; 174 unsigned long irq1;
175 int cpu;
111 176
112 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq); 177 if (unlikely(!tr))
113 if (!event)
114 return; 178 return;
179
180 if (unlikely(!trace_hw_branches_enabled))
181 return;
182
183 local_irq_save(irq1);
184 cpu = raw_smp_processor_id();
185 if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
186 goto out;
187
188 event = trace_buffer_lock_reserve(tr, TRACE_HW_BRANCHES,
189 sizeof(*entry), 0, 0);
190 if (!event)
191 goto out;
115 entry = ring_buffer_event_data(event); 192 entry = ring_buffer_event_data(event);
116 tracing_generic_entry_update(&entry->ent, 0, from); 193 tracing_generic_entry_update(&entry->ent, 0, from);
117 entry->ent.type = TRACE_HW_BRANCHES; 194 entry->ent.type = TRACE_HW_BRANCHES;
118 entry->ent.cpu = smp_processor_id();
119 entry->from = from; 195 entry->from = from;
120 entry->to = to; 196 entry->to = to;
121 ring_buffer_unlock_commit(tr->buffer, event, irq); 197 trace_buffer_unlock_commit(tr, event, 0, 0);
198
199 out:
200 atomic_dec(&tr->data[cpu]->disabled);
201 local_irq_restore(irq1);
122} 202}
123 203
124static void trace_bts_at(struct trace_array *tr, 204static void trace_bts_at(const struct bts_trace *trace, void *at)
125 const struct bts_trace *trace, void *at)
126{ 205{
127 struct bts_struct bts; 206 struct bts_struct bts;
128 int err = 0; 207 int err = 0;
@@ -137,18 +216,29 @@ static void trace_bts_at(struct trace_array *tr,
137 216
138 switch (bts.qualifier) { 217 switch (bts.qualifier) {
139 case BTS_BRANCH: 218 case BTS_BRANCH:
140 trace_hw_branch(tr, bts.variant.lbr.from, bts.variant.lbr.to); 219 trace_hw_branch(bts.variant.lbr.from, bts.variant.lbr.to);
141 break; 220 break;
142 } 221 }
143} 222}
144 223
224/*
225 * Collect the trace on the current cpu and write it into the ftrace buffer.
226 *
227 * pre: bts_tracer_lock must be locked
228 */
145static void trace_bts_cpu(void *arg) 229static void trace_bts_cpu(void *arg)
146{ 230{
147 struct trace_array *tr = (struct trace_array *) arg; 231 struct trace_array *tr = (struct trace_array *) arg;
148 const struct bts_trace *trace; 232 const struct bts_trace *trace;
149 unsigned char *at; 233 unsigned char *at;
150 234
151 if (!this_tracer) 235 if (unlikely(!tr))
236 return;
237
238 if (unlikely(atomic_read(&tr->data[raw_smp_processor_id()]->disabled)))
239 return;
240
241 if (unlikely(!this_tracer))
152 return; 242 return;
153 243
154 ds_suspend_bts(this_tracer); 244 ds_suspend_bts(this_tracer);
@@ -158,11 +248,11 @@ static void trace_bts_cpu(void *arg)
158 248
159 for (at = trace->ds.top; (void *)at < trace->ds.end; 249 for (at = trace->ds.top; (void *)at < trace->ds.end;
160 at += trace->ds.size) 250 at += trace->ds.size)
161 trace_bts_at(tr, trace, at); 251 trace_bts_at(trace, at);
162 252
163 for (at = trace->ds.begin; (void *)at < trace->ds.top; 253 for (at = trace->ds.begin; (void *)at < trace->ds.top;
164 at += trace->ds.size) 254 at += trace->ds.size)
165 trace_bts_at(tr, trace, at); 255 trace_bts_at(trace, at);
166 256
167out: 257out:
168 ds_resume_bts(this_tracer); 258 ds_resume_bts(this_tracer);
@@ -170,26 +260,43 @@ out:
170 260
171static void trace_bts_prepare(struct trace_iterator *iter) 261static void trace_bts_prepare(struct trace_iterator *iter)
172{ 262{
173 int cpu; 263 spin_lock(&bts_tracer_lock);
264
265 on_each_cpu(trace_bts_cpu, iter->tr, 1);
266
267 spin_unlock(&bts_tracer_lock);
268}
269
270static void trace_bts_close(struct trace_iterator *iter)
271{
272 tracing_reset_online_cpus(iter->tr);
273}
274
275void trace_hw_branch_oops(void)
276{
277 spin_lock(&bts_tracer_lock);
278
279 trace_bts_cpu(hw_branch_trace);
174 280
175 for_each_cpu(cpu, cpu_possible_mask) 281 spin_unlock(&bts_tracer_lock);
176 smp_call_function_single(cpu, trace_bts_cpu, iter->tr, 1);
177} 282}
178 283
179struct tracer bts_tracer __read_mostly = 284struct tracer bts_tracer __read_mostly =
180{ 285{
181 .name = "hw-branch-tracer", 286 .name = "hw-branch-tracer",
182 .init = bts_trace_init, 287 .init = bts_trace_init,
183 .reset = bts_trace_stop, 288 .reset = bts_trace_reset,
184 .print_header = bts_trace_print_header, 289 .print_header = bts_trace_print_header,
185 .print_line = bts_trace_print_line, 290 .print_line = bts_trace_print_line,
186 .start = bts_trace_start, 291 .start = bts_trace_start,
187 .stop = bts_trace_stop, 292 .stop = bts_trace_stop,
188 .open = trace_bts_prepare 293 .open = trace_bts_prepare,
294 .close = trace_bts_close
189}; 295};
190 296
191__init static int init_bts_trace(void) 297__init static int init_bts_trace(void)
192{ 298{
299 register_hotcpu_notifier(&bts_hotcpu_notifier);
193 return register_tracer(&bts_tracer); 300 return register_tracer(&bts_tracer);
194} 301}
195device_initcall(init_bts_trace); 302device_initcall(init_bts_trace);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 62a78d943534..b923d13e2fad 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * trace irqs off criticall timings 2 * trace irqs off critical timings
3 * 3 *
4 * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> 4 * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
5 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com> 5 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
@@ -32,6 +32,8 @@ enum {
32 32
33static int trace_type __read_mostly; 33static int trace_type __read_mostly;
34 34
35static int save_lat_flag;
36
35#ifdef CONFIG_PREEMPT_TRACER 37#ifdef CONFIG_PREEMPT_TRACER
36static inline int 38static inline int
37preempt_trace(void) 39preempt_trace(void)
@@ -95,7 +97,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
95 disabled = atomic_inc_return(&data->disabled); 97 disabled = atomic_inc_return(&data->disabled);
96 98
97 if (likely(disabled == 1)) 99 if (likely(disabled == 1))
98 trace_function(tr, data, ip, parent_ip, flags, preempt_count()); 100 trace_function(tr, ip, parent_ip, flags, preempt_count());
99 101
100 atomic_dec(&data->disabled); 102 atomic_dec(&data->disabled);
101} 103}
@@ -153,7 +155,7 @@ check_critical_timing(struct trace_array *tr,
153 if (!report_latency(delta)) 155 if (!report_latency(delta))
154 goto out_unlock; 156 goto out_unlock;
155 157
156 trace_function(tr, data, CALLER_ADDR0, parent_ip, flags, pc); 158 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
157 159
158 latency = nsecs_to_usecs(delta); 160 latency = nsecs_to_usecs(delta);
159 161
@@ -177,7 +179,7 @@ out:
177 data->critical_sequence = max_sequence; 179 data->critical_sequence = max_sequence;
178 data->preempt_timestamp = ftrace_now(cpu); 180 data->preempt_timestamp = ftrace_now(cpu);
179 tracing_reset(tr, cpu); 181 tracing_reset(tr, cpu);
180 trace_function(tr, data, CALLER_ADDR0, parent_ip, flags, pc); 182 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
181} 183}
182 184
183static inline void 185static inline void
@@ -210,7 +212,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
210 212
211 local_save_flags(flags); 213 local_save_flags(flags);
212 214
213 trace_function(tr, data, ip, parent_ip, flags, preempt_count()); 215 trace_function(tr, ip, parent_ip, flags, preempt_count());
214 216
215 per_cpu(tracing_cpu, cpu) = 1; 217 per_cpu(tracing_cpu, cpu) = 1;
216 218
@@ -244,7 +246,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
244 atomic_inc(&data->disabled); 246 atomic_inc(&data->disabled);
245 247
246 local_save_flags(flags); 248 local_save_flags(flags);
247 trace_function(tr, data, ip, parent_ip, flags, preempt_count()); 249 trace_function(tr, ip, parent_ip, flags, preempt_count());
248 check_critical_timing(tr, data, parent_ip ? : ip, cpu); 250 check_critical_timing(tr, data, parent_ip ? : ip, cpu);
249 data->critical_start = 0; 251 data->critical_start = 0;
250 atomic_dec(&data->disabled); 252 atomic_dec(&data->disabled);
@@ -353,33 +355,26 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)
353} 355}
354#endif /* CONFIG_PREEMPT_TRACER */ 356#endif /* CONFIG_PREEMPT_TRACER */
355 357
356/*
357 * save_tracer_enabled is used to save the state of the tracer_enabled
358 * variable when we disable it when we open a trace output file.
359 */
360static int save_tracer_enabled;
361
362static void start_irqsoff_tracer(struct trace_array *tr) 358static void start_irqsoff_tracer(struct trace_array *tr)
363{ 359{
364 register_ftrace_function(&trace_ops); 360 register_ftrace_function(&trace_ops);
365 if (tracing_is_enabled()) { 361 if (tracing_is_enabled())
366 tracer_enabled = 1; 362 tracer_enabled = 1;
367 save_tracer_enabled = 1; 363 else
368 } else {
369 tracer_enabled = 0; 364 tracer_enabled = 0;
370 save_tracer_enabled = 0;
371 }
372} 365}
373 366
374static void stop_irqsoff_tracer(struct trace_array *tr) 367static void stop_irqsoff_tracer(struct trace_array *tr)
375{ 368{
376 tracer_enabled = 0; 369 tracer_enabled = 0;
377 save_tracer_enabled = 0;
378 unregister_ftrace_function(&trace_ops); 370 unregister_ftrace_function(&trace_ops);
379} 371}
380 372
381static void __irqsoff_tracer_init(struct trace_array *tr) 373static void __irqsoff_tracer_init(struct trace_array *tr)
382{ 374{
375 save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT;
376 trace_flags |= TRACE_ITER_LATENCY_FMT;
377
383 tracing_max_latency = 0; 378 tracing_max_latency = 0;
384 irqsoff_trace = tr; 379 irqsoff_trace = tr;
385 /* make sure that the tracer is visible */ 380 /* make sure that the tracer is visible */
@@ -390,30 +385,19 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
390static void irqsoff_tracer_reset(struct trace_array *tr) 385static void irqsoff_tracer_reset(struct trace_array *tr)
391{ 386{
392 stop_irqsoff_tracer(tr); 387 stop_irqsoff_tracer(tr);
388
389 if (!save_lat_flag)
390 trace_flags &= ~TRACE_ITER_LATENCY_FMT;
393} 391}
394 392
395static void irqsoff_tracer_start(struct trace_array *tr) 393static void irqsoff_tracer_start(struct trace_array *tr)
396{ 394{
397 tracer_enabled = 1; 395 tracer_enabled = 1;
398 save_tracer_enabled = 1;
399} 396}
400 397
401static void irqsoff_tracer_stop(struct trace_array *tr) 398static void irqsoff_tracer_stop(struct trace_array *tr)
402{ 399{
403 tracer_enabled = 0; 400 tracer_enabled = 0;
404 save_tracer_enabled = 0;
405}
406
407static void irqsoff_tracer_open(struct trace_iterator *iter)
408{
409 /* stop the trace while dumping */
410 tracer_enabled = 0;
411}
412
413static void irqsoff_tracer_close(struct trace_iterator *iter)
414{
415 /* restart tracing */
416 tracer_enabled = save_tracer_enabled;
417} 401}
418 402
419#ifdef CONFIG_IRQSOFF_TRACER 403#ifdef CONFIG_IRQSOFF_TRACER
@@ -431,8 +415,6 @@ static struct tracer irqsoff_tracer __read_mostly =
431 .reset = irqsoff_tracer_reset, 415 .reset = irqsoff_tracer_reset,
432 .start = irqsoff_tracer_start, 416 .start = irqsoff_tracer_start,
433 .stop = irqsoff_tracer_stop, 417 .stop = irqsoff_tracer_stop,
434 .open = irqsoff_tracer_open,
435 .close = irqsoff_tracer_close,
436 .print_max = 1, 418 .print_max = 1,
437#ifdef CONFIG_FTRACE_SELFTEST 419#ifdef CONFIG_FTRACE_SELFTEST
438 .selftest = trace_selftest_startup_irqsoff, 420 .selftest = trace_selftest_startup_irqsoff,
@@ -459,8 +441,6 @@ static struct tracer preemptoff_tracer __read_mostly =
459 .reset = irqsoff_tracer_reset, 441 .reset = irqsoff_tracer_reset,
460 .start = irqsoff_tracer_start, 442 .start = irqsoff_tracer_start,
461 .stop = irqsoff_tracer_stop, 443 .stop = irqsoff_tracer_stop,
462 .open = irqsoff_tracer_open,
463 .close = irqsoff_tracer_close,
464 .print_max = 1, 444 .print_max = 1,
465#ifdef CONFIG_FTRACE_SELFTEST 445#ifdef CONFIG_FTRACE_SELFTEST
466 .selftest = trace_selftest_startup_preemptoff, 446 .selftest = trace_selftest_startup_preemptoff,
@@ -489,8 +469,6 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
489 .reset = irqsoff_tracer_reset, 469 .reset = irqsoff_tracer_reset,
490 .start = irqsoff_tracer_start, 470 .start = irqsoff_tracer_start,
491 .stop = irqsoff_tracer_stop, 471 .stop = irqsoff_tracer_stop,
492 .open = irqsoff_tracer_open,
493 .close = irqsoff_tracer_close,
494 .print_max = 1, 472 .print_max = 1,
495#ifdef CONFIG_FTRACE_SELFTEST 473#ifdef CONFIG_FTRACE_SELFTEST
496 .selftest = trace_selftest_startup_preemptirqsoff, 474 .selftest = trace_selftest_startup_preemptirqsoff,
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 80e503ef6136..8e37fcddd8b4 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -12,6 +12,7 @@
12#include <asm/atomic.h> 12#include <asm/atomic.h>
13 13
14#include "trace.h" 14#include "trace.h"
15#include "trace_output.h"
15 16
16struct header_iter { 17struct header_iter {
17 struct pci_dev *dev; 18 struct pci_dev *dev;
@@ -183,21 +184,22 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
183 switch (rw->opcode) { 184 switch (rw->opcode) {
184 case MMIO_READ: 185 case MMIO_READ:
185 ret = trace_seq_printf(s, 186 ret = trace_seq_printf(s,
186 "R %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", 187 "R %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
187 rw->width, secs, usec_rem, rw->map_id, 188 rw->width, secs, usec_rem, rw->map_id,
188 (unsigned long long)rw->phys, 189 (unsigned long long)rw->phys,
189 rw->value, rw->pc, 0); 190 rw->value, rw->pc, 0);
190 break; 191 break;
191 case MMIO_WRITE: 192 case MMIO_WRITE:
192 ret = trace_seq_printf(s, 193 ret = trace_seq_printf(s,
193 "W %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", 194 "W %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
194 rw->width, secs, usec_rem, rw->map_id, 195 rw->width, secs, usec_rem, rw->map_id,
195 (unsigned long long)rw->phys, 196 (unsigned long long)rw->phys,
196 rw->value, rw->pc, 0); 197 rw->value, rw->pc, 0);
197 break; 198 break;
198 case MMIO_UNKNOWN_OP: 199 case MMIO_UNKNOWN_OP:
199 ret = trace_seq_printf(s, 200 ret = trace_seq_printf(s,
200 "UNKNOWN %lu.%06lu %d 0x%llx %02x,%02x,%02x 0x%lx %d\n", 201 "UNKNOWN %u.%06lu %d 0x%llx %02lx,%02lx,"
202 "%02lx 0x%lx %d\n",
201 secs, usec_rem, rw->map_id, 203 secs, usec_rem, rw->map_id,
202 (unsigned long long)rw->phys, 204 (unsigned long long)rw->phys,
203 (rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff, 205 (rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff,
@@ -229,14 +231,14 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter)
229 switch (m->opcode) { 231 switch (m->opcode) {
230 case MMIO_PROBE: 232 case MMIO_PROBE:
231 ret = trace_seq_printf(s, 233 ret = trace_seq_printf(s,
232 "MAP %lu.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n", 234 "MAP %u.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n",
233 secs, usec_rem, m->map_id, 235 secs, usec_rem, m->map_id,
234 (unsigned long long)m->phys, m->virt, m->len, 236 (unsigned long long)m->phys, m->virt, m->len,
235 0UL, 0); 237 0UL, 0);
236 break; 238 break;
237 case MMIO_UNPROBE: 239 case MMIO_UNPROBE:
238 ret = trace_seq_printf(s, 240 ret = trace_seq_printf(s,
239 "UNMAP %lu.%06lu %d 0x%lx %d\n", 241 "UNMAP %u.%06lu %d 0x%lx %d\n",
240 secs, usec_rem, m->map_id, 0UL, 0); 242 secs, usec_rem, m->map_id, 0UL, 0);
241 break; 243 break;
242 default: 244 default:
@@ -255,18 +257,15 @@ static enum print_line_t mmio_print_mark(struct trace_iterator *iter)
255 const char *msg = print->buf; 257 const char *msg = print->buf;
256 struct trace_seq *s = &iter->seq; 258 struct trace_seq *s = &iter->seq;
257 unsigned long long t = ns2usecs(iter->ts); 259 unsigned long long t = ns2usecs(iter->ts);
258 unsigned long usec_rem = do_div(t, 1000000ULL); 260 unsigned long usec_rem = do_div(t, USEC_PER_SEC);
259 unsigned secs = (unsigned long)t; 261 unsigned secs = (unsigned long)t;
260 int ret; 262 int ret;
261 263
262 /* The trailing newline must be in the message. */ 264 /* The trailing newline must be in the message. */
263 ret = trace_seq_printf(s, "MARK %lu.%06lu %s", secs, usec_rem, msg); 265 ret = trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg);
264 if (!ret) 266 if (!ret)
265 return TRACE_TYPE_PARTIAL_LINE; 267 return TRACE_TYPE_PARTIAL_LINE;
266 268
267 if (entry->flags & TRACE_FLAG_CONT)
268 trace_seq_print_cont(s, iter);
269
270 return TRACE_TYPE_HANDLED; 269 return TRACE_TYPE_HANDLED;
271} 270}
272 271
@@ -308,21 +307,17 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
308{ 307{
309 struct ring_buffer_event *event; 308 struct ring_buffer_event *event;
310 struct trace_mmiotrace_rw *entry; 309 struct trace_mmiotrace_rw *entry;
311 unsigned long irq_flags; 310 int pc = preempt_count();
312 311
313 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), 312 event = trace_buffer_lock_reserve(tr, TRACE_MMIO_RW,
314 &irq_flags); 313 sizeof(*entry), 0, pc);
315 if (!event) { 314 if (!event) {
316 atomic_inc(&dropped_count); 315 atomic_inc(&dropped_count);
317 return; 316 return;
318 } 317 }
319 entry = ring_buffer_event_data(event); 318 entry = ring_buffer_event_data(event);
320 tracing_generic_entry_update(&entry->ent, 0, preempt_count());
321 entry->ent.type = TRACE_MMIO_RW;
322 entry->rw = *rw; 319 entry->rw = *rw;
323 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 320 trace_buffer_unlock_commit(tr, event, 0, pc);
324
325 trace_wake_up();
326} 321}
327 322
328void mmio_trace_rw(struct mmiotrace_rw *rw) 323void mmio_trace_rw(struct mmiotrace_rw *rw)
@@ -338,21 +333,17 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
338{ 333{
339 struct ring_buffer_event *event; 334 struct ring_buffer_event *event;
340 struct trace_mmiotrace_map *entry; 335 struct trace_mmiotrace_map *entry;
341 unsigned long irq_flags; 336 int pc = preempt_count();
342 337
343 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), 338 event = trace_buffer_lock_reserve(tr, TRACE_MMIO_MAP,
344 &irq_flags); 339 sizeof(*entry), 0, pc);
345 if (!event) { 340 if (!event) {
346 atomic_inc(&dropped_count); 341 atomic_inc(&dropped_count);
347 return; 342 return;
348 } 343 }
349 entry = ring_buffer_event_data(event); 344 entry = ring_buffer_event_data(event);
350 tracing_generic_entry_update(&entry->ent, 0, preempt_count());
351 entry->ent.type = TRACE_MMIO_MAP;
352 entry->map = *map; 345 entry->map = *map;
353 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 346 trace_buffer_unlock_commit(tr, event, 0, pc);
354
355 trace_wake_up();
356} 347}
357 348
358void mmio_trace_mapping(struct mmiotrace_map *map) 349void mmio_trace_mapping(struct mmiotrace_map *map)
@@ -368,5 +359,5 @@ void mmio_trace_mapping(struct mmiotrace_map *map)
368 359
369int mmio_trace_printk(const char *fmt, va_list args) 360int mmio_trace_printk(const char *fmt, va_list args)
370{ 361{
371 return trace_vprintk(0, -1, fmt, args); 362 return trace_vprintk(0, fmt, args);
372} 363}
diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c
index b9767acd30ac..394f94417e2f 100644
--- a/kernel/trace/trace_nop.c
+++ b/kernel/trace/trace_nop.c
@@ -47,12 +47,7 @@ static void stop_nop_trace(struct trace_array *tr)
47 47
48static int nop_trace_init(struct trace_array *tr) 48static int nop_trace_init(struct trace_array *tr)
49{ 49{
50 int cpu;
51 ctx_trace = tr; 50 ctx_trace = tr;
52
53 for_each_online_cpu(cpu)
54 tracing_reset(tr, cpu);
55
56 start_nop_trace(tr); 51 start_nop_trace(tr);
57 return 0; 52 return 0;
58} 53}
@@ -96,6 +91,7 @@ struct tracer nop_trace __read_mostly =
96 .name = "nop", 91 .name = "nop",
97 .init = nop_trace_init, 92 .init = nop_trace_init,
98 .reset = nop_trace_reset, 93 .reset = nop_trace_reset,
94 .wait_pipe = poll_wait_pipe,
99#ifdef CONFIG_FTRACE_SELFTEST 95#ifdef CONFIG_FTRACE_SELFTEST
100 .selftest = trace_selftest_startup_nop, 96 .selftest = trace_selftest_startup_nop,
101#endif 97#endif
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
new file mode 100644
index 000000000000..64b54a59c55b
--- /dev/null
+++ b/kernel/trace/trace_output.c
@@ -0,0 +1,1017 @@
1/*
2 * trace_output.c
3 *
4 * Copyright (C) 2008 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
5 *
6 */
7
8#include <linux/module.h>
9#include <linux/mutex.h>
10#include <linux/ftrace.h>
11
12#include "trace_output.h"
13
14/* must be a power of 2 */
15#define EVENT_HASHSIZE 128
16
17static DEFINE_MUTEX(trace_event_mutex);
18static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
19
20static int next_event_type = __TRACE_LAST_TYPE + 1;
21
22enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
23{
24 struct trace_seq *s = &iter->seq;
25 struct trace_entry *entry = iter->ent;
26 struct bprint_entry *field;
27 int ret;
28
29 trace_assign_type(field, entry);
30
31 ret = trace_seq_bprintf(s, field->fmt, field->buf);
32 if (!ret)
33 return TRACE_TYPE_PARTIAL_LINE;
34
35 return TRACE_TYPE_HANDLED;
36}
37
38enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
39{
40 struct trace_seq *s = &iter->seq;
41 struct trace_entry *entry = iter->ent;
42 struct print_entry *field;
43 int ret;
44
45 trace_assign_type(field, entry);
46
47 ret = trace_seq_printf(s, "%s", field->buf);
48 if (!ret)
49 return TRACE_TYPE_PARTIAL_LINE;
50
51 return TRACE_TYPE_HANDLED;
52}
53
54/**
55 * trace_seq_printf - sequence printing of trace information
56 * @s: trace sequence descriptor
57 * @fmt: printf format string
58 *
59 * The tracer may use either sequence operations or its own
60 * copy to user routines. To simplify formating of a trace
61 * trace_seq_printf is used to store strings into a special
62 * buffer (@s). Then the output may be either used by
63 * the sequencer or pulled into another buffer.
64 */
65int
66trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
67{
68 int len = (PAGE_SIZE - 1) - s->len;
69 va_list ap;
70 int ret;
71
72 if (!len)
73 return 0;
74
75 va_start(ap, fmt);
76 ret = vsnprintf(s->buffer + s->len, len, fmt, ap);
77 va_end(ap);
78
79 /* If we can't write it all, don't bother writing anything */
80 if (ret >= len)
81 return 0;
82
83 s->len += ret;
84
85 return len;
86}
87
88int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
89{
90 int len = (PAGE_SIZE - 1) - s->len;
91 int ret;
92
93 if (!len)
94 return 0;
95
96 ret = bstr_printf(s->buffer + s->len, len, fmt, binary);
97
98 /* If we can't write it all, don't bother writing anything */
99 if (ret >= len)
100 return 0;
101
102 s->len += ret;
103
104 return len;
105}
106
107/**
108 * trace_seq_puts - trace sequence printing of simple string
109 * @s: trace sequence descriptor
110 * @str: simple string to record
111 *
112 * The tracer may use either the sequence operations or its own
113 * copy to user routines. This function records a simple string
114 * into a special buffer (@s) for later retrieval by a sequencer
115 * or other mechanism.
116 */
117int trace_seq_puts(struct trace_seq *s, const char *str)
118{
119 int len = strlen(str);
120
121 if (len > ((PAGE_SIZE - 1) - s->len))
122 return 0;
123
124 memcpy(s->buffer + s->len, str, len);
125 s->len += len;
126
127 return len;
128}
129
130int trace_seq_putc(struct trace_seq *s, unsigned char c)
131{
132 if (s->len >= (PAGE_SIZE - 1))
133 return 0;
134
135 s->buffer[s->len++] = c;
136
137 return 1;
138}
139
140int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len)
141{
142 if (len > ((PAGE_SIZE - 1) - s->len))
143 return 0;
144
145 memcpy(s->buffer + s->len, mem, len);
146 s->len += len;
147
148 return len;
149}
150
151int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, size_t len)
152{
153 unsigned char hex[HEX_CHARS];
154 const unsigned char *data = mem;
155 int i, j;
156
157#ifdef __BIG_ENDIAN
158 for (i = 0, j = 0; i < len; i++) {
159#else
160 for (i = len-1, j = 0; i >= 0; i--) {
161#endif
162 hex[j++] = hex_asc_hi(data[i]);
163 hex[j++] = hex_asc_lo(data[i]);
164 }
165 hex[j++] = ' ';
166
167 return trace_seq_putmem(s, hex, j);
168}
169
170void *trace_seq_reserve(struct trace_seq *s, size_t len)
171{
172 void *ret;
173
174 if (len > ((PAGE_SIZE - 1) - s->len))
175 return NULL;
176
177 ret = s->buffer + s->len;
178 s->len += len;
179
180 return ret;
181}
182
183int trace_seq_path(struct trace_seq *s, struct path *path)
184{
185 unsigned char *p;
186
187 if (s->len >= (PAGE_SIZE - 1))
188 return 0;
189 p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
190 if (!IS_ERR(p)) {
191 p = mangle_path(s->buffer + s->len, p, "\n");
192 if (p) {
193 s->len = p - s->buffer;
194 return 1;
195 }
196 } else {
197 s->buffer[s->len++] = '?';
198 return 1;
199 }
200
201 return 0;
202}
203
204#ifdef CONFIG_KRETPROBES
205static inline const char *kretprobed(const char *name)
206{
207 static const char tramp_name[] = "kretprobe_trampoline";
208 int size = sizeof(tramp_name);
209
210 if (strncmp(tramp_name, name, size) == 0)
211 return "[unknown/kretprobe'd]";
212 return name;
213}
214#else
215static inline const char *kretprobed(const char *name)
216{
217 return name;
218}
219#endif /* CONFIG_KRETPROBES */
220
221static int
222seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
223{
224#ifdef CONFIG_KALLSYMS
225 char str[KSYM_SYMBOL_LEN];
226 const char *name;
227
228 kallsyms_lookup(address, NULL, NULL, NULL, str);
229
230 name = kretprobed(str);
231
232 return trace_seq_printf(s, fmt, name);
233#endif
234 return 1;
235}
236
237static int
238seq_print_sym_offset(struct trace_seq *s, const char *fmt,
239 unsigned long address)
240{
241#ifdef CONFIG_KALLSYMS
242 char str[KSYM_SYMBOL_LEN];
243 const char *name;
244
245 sprint_symbol(str, address);
246 name = kretprobed(str);
247
248 return trace_seq_printf(s, fmt, name);
249#endif
250 return 1;
251}
252
253#ifndef CONFIG_64BIT
254# define IP_FMT "%08lx"
255#else
256# define IP_FMT "%016lx"
257#endif
258
259int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
260 unsigned long ip, unsigned long sym_flags)
261{
262 struct file *file = NULL;
263 unsigned long vmstart = 0;
264 int ret = 1;
265
266 if (mm) {
267 const struct vm_area_struct *vma;
268
269 down_read(&mm->mmap_sem);
270 vma = find_vma(mm, ip);
271 if (vma) {
272 file = vma->vm_file;
273 vmstart = vma->vm_start;
274 }
275 if (file) {
276 ret = trace_seq_path(s, &file->f_path);
277 if (ret)
278 ret = trace_seq_printf(s, "[+0x%lx]",
279 ip - vmstart);
280 }
281 up_read(&mm->mmap_sem);
282 }
283 if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file))
284 ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
285 return ret;
286}
287
288int
289seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
290 unsigned long sym_flags)
291{
292 struct mm_struct *mm = NULL;
293 int ret = 1;
294 unsigned int i;
295
296 if (trace_flags & TRACE_ITER_SYM_USEROBJ) {
297 struct task_struct *task;
298 /*
299 * we do the lookup on the thread group leader,
300 * since individual threads might have already quit!
301 */
302 rcu_read_lock();
303 task = find_task_by_vpid(entry->ent.tgid);
304 if (task)
305 mm = get_task_mm(task);
306 rcu_read_unlock();
307 }
308
309 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
310 unsigned long ip = entry->caller[i];
311
312 if (ip == ULONG_MAX || !ret)
313 break;
314 if (i && ret)
315 ret = trace_seq_puts(s, " <- ");
316 if (!ip) {
317 if (ret)
318 ret = trace_seq_puts(s, "??");
319 continue;
320 }
321 if (!ret)
322 break;
323 if (ret)
324 ret = seq_print_user_ip(s, mm, ip, sym_flags);
325 }
326
327 if (mm)
328 mmput(mm);
329 return ret;
330}
331
332int
333seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
334{
335 int ret;
336
337 if (!ip)
338 return trace_seq_printf(s, "0");
339
340 if (sym_flags & TRACE_ITER_SYM_OFFSET)
341 ret = seq_print_sym_offset(s, "%s", ip);
342 else
343 ret = seq_print_sym_short(s, "%s", ip);
344
345 if (!ret)
346 return 0;
347
348 if (sym_flags & TRACE_ITER_SYM_ADDR)
349 ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
350 return ret;
351}
352
353static int
354lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
355{
356 int hardirq, softirq;
357 char comm[TASK_COMM_LEN];
358
359 trace_find_cmdline(entry->pid, comm);
360 hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
361 softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
362
363 if (!trace_seq_printf(s, "%8.8s-%-5d %3d%c%c%c",
364 comm, entry->pid, cpu,
365 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
366 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ?
367 'X' : '.',
368 (entry->flags & TRACE_FLAG_NEED_RESCHED) ?
369 'N' : '.',
370 (hardirq && softirq) ? 'H' :
371 hardirq ? 'h' : softirq ? 's' : '.'))
372 return 0;
373
374 if (entry->preempt_count)
375 return trace_seq_printf(s, "%x", entry->preempt_count);
376 return trace_seq_puts(s, ".");
377}
378
379static unsigned long preempt_mark_thresh = 100;
380
381static int
382lat_print_timestamp(struct trace_seq *s, u64 abs_usecs,
383 unsigned long rel_usecs)
384{
385 return trace_seq_printf(s, " %4lldus%c: ", abs_usecs,
386 rel_usecs > preempt_mark_thresh ? '!' :
387 rel_usecs > 1 ? '+' : ' ');
388}
389
390int trace_print_context(struct trace_iterator *iter)
391{
392 struct trace_seq *s = &iter->seq;
393 struct trace_entry *entry = iter->ent;
394 unsigned long long t = ns2usecs(iter->ts);
395 unsigned long usec_rem = do_div(t, USEC_PER_SEC);
396 unsigned long secs = (unsigned long)t;
397 char comm[TASK_COMM_LEN];
398
399 trace_find_cmdline(entry->pid, comm);
400
401 return trace_seq_printf(s, "%16s-%-5d [%03d] %5lu.%06lu: ",
402 comm, entry->pid, iter->cpu, secs, usec_rem);
403}
404
405int trace_print_lat_context(struct trace_iterator *iter)
406{
407 u64 next_ts;
408 int ret;
409 struct trace_seq *s = &iter->seq;
410 struct trace_entry *entry = iter->ent,
411 *next_entry = trace_find_next_entry(iter, NULL,
412 &next_ts);
413 unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE);
414 unsigned long abs_usecs = ns2usecs(iter->ts - iter->tr->time_start);
415 unsigned long rel_usecs;
416
417 if (!next_entry)
418 next_ts = iter->ts;
419 rel_usecs = ns2usecs(next_ts - iter->ts);
420
421 if (verbose) {
422 char comm[TASK_COMM_LEN];
423
424 trace_find_cmdline(entry->pid, comm);
425
426 ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08llx]"
427 " %ld.%03ldms (+%ld.%03ldms): ", comm,
428 entry->pid, iter->cpu, entry->flags,
429 entry->preempt_count, iter->idx,
430 ns2usecs(iter->ts),
431 abs_usecs / USEC_PER_MSEC,
432 abs_usecs % USEC_PER_MSEC,
433 rel_usecs / USEC_PER_MSEC,
434 rel_usecs % USEC_PER_MSEC);
435 } else {
436 ret = lat_print_generic(s, entry, iter->cpu);
437 if (ret)
438 ret = lat_print_timestamp(s, abs_usecs, rel_usecs);
439 }
440
441 return ret;
442}
443
444static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
445
446static int task_state_char(unsigned long state)
447{
448 int bit = state ? __ffs(state) + 1 : 0;
449
450 return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?';
451}
452
453/**
454 * ftrace_find_event - find a registered event
455 * @type: the type of event to look for
456 *
457 * Returns an event of type @type otherwise NULL
458 */
459struct trace_event *ftrace_find_event(int type)
460{
461 struct trace_event *event;
462 struct hlist_node *n;
463 unsigned key;
464
465 key = type & (EVENT_HASHSIZE - 1);
466
467 hlist_for_each_entry_rcu(event, n, &event_hash[key], node) {
468 if (event->type == type)
469 return event;
470 }
471
472 return NULL;
473}
474
475/**
476 * register_ftrace_event - register output for an event type
477 * @event: the event type to register
478 *
479 * Event types are stored in a hash and this hash is used to
480 * find a way to print an event. If the @event->type is set
481 * then it will use that type, otherwise it will assign a
482 * type to use.
483 *
484 * If you assign your own type, please make sure it is added
485 * to the trace_type enum in trace.h, to avoid collisions
486 * with the dynamic types.
487 *
488 * Returns the event type number or zero on error.
489 */
490int register_ftrace_event(struct trace_event *event)
491{
492 unsigned key;
493 int ret = 0;
494
495 mutex_lock(&trace_event_mutex);
496
497 if (!event) {
498 ret = next_event_type++;
499 goto out;
500 }
501
502 if (!event->type)
503 event->type = next_event_type++;
504 else if (event->type > __TRACE_LAST_TYPE) {
505 printk(KERN_WARNING "Need to add type to trace.h\n");
506 WARN_ON(1);
507 }
508
509 if (ftrace_find_event(event->type))
510 goto out;
511
512 if (event->trace == NULL)
513 event->trace = trace_nop_print;
514 if (event->raw == NULL)
515 event->raw = trace_nop_print;
516 if (event->hex == NULL)
517 event->hex = trace_nop_print;
518 if (event->binary == NULL)
519 event->binary = trace_nop_print;
520
521 key = event->type & (EVENT_HASHSIZE - 1);
522
523 hlist_add_head_rcu(&event->node, &event_hash[key]);
524
525 ret = event->type;
526 out:
527 mutex_unlock(&trace_event_mutex);
528
529 return ret;
530}
531
532/**
533 * unregister_ftrace_event - remove a no longer used event
534 * @event: the event to remove
535 */
536int unregister_ftrace_event(struct trace_event *event)
537{
538 mutex_lock(&trace_event_mutex);
539 hlist_del(&event->node);
540 mutex_unlock(&trace_event_mutex);
541
542 return 0;
543}
544
545/*
546 * Standard events
547 */
548
549enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags)
550{
551 return TRACE_TYPE_HANDLED;
552}
553
554/* TRACE_FN */
555static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags)
556{
557 struct ftrace_entry *field;
558 struct trace_seq *s = &iter->seq;
559
560 trace_assign_type(field, iter->ent);
561
562 if (!seq_print_ip_sym(s, field->ip, flags))
563 goto partial;
564
565 if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) {
566 if (!trace_seq_printf(s, " <-"))
567 goto partial;
568 if (!seq_print_ip_sym(s,
569 field->parent_ip,
570 flags))
571 goto partial;
572 }
573 if (!trace_seq_printf(s, "\n"))
574 goto partial;
575
576 return TRACE_TYPE_HANDLED;
577
578 partial:
579 return TRACE_TYPE_PARTIAL_LINE;
580}
581
582static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags)
583{
584 struct ftrace_entry *field;
585
586 trace_assign_type(field, iter->ent);
587
588 if (!trace_seq_printf(&iter->seq, "%lx %lx\n",
589 field->ip,
590 field->parent_ip))
591 return TRACE_TYPE_PARTIAL_LINE;
592
593 return TRACE_TYPE_HANDLED;
594}
595
596static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags)
597{
598 struct ftrace_entry *field;
599 struct trace_seq *s = &iter->seq;
600
601 trace_assign_type(field, iter->ent);
602
603 SEQ_PUT_HEX_FIELD_RET(s, field->ip);
604 SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip);
605
606 return TRACE_TYPE_HANDLED;
607}
608
609static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags)
610{
611 struct ftrace_entry *field;
612 struct trace_seq *s = &iter->seq;
613
614 trace_assign_type(field, iter->ent);
615
616 SEQ_PUT_FIELD_RET(s, field->ip);
617 SEQ_PUT_FIELD_RET(s, field->parent_ip);
618
619 return TRACE_TYPE_HANDLED;
620}
621
622static struct trace_event trace_fn_event = {
623 .type = TRACE_FN,
624 .trace = trace_fn_trace,
625 .raw = trace_fn_raw,
626 .hex = trace_fn_hex,
627 .binary = trace_fn_bin,
628};
629
630/* TRACE_CTX an TRACE_WAKE */
631static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
632 char *delim)
633{
634 struct ctx_switch_entry *field;
635 char comm[TASK_COMM_LEN];
636 int S, T;
637
638
639 trace_assign_type(field, iter->ent);
640
641 T = task_state_char(field->next_state);
642 S = task_state_char(field->prev_state);
643 trace_find_cmdline(field->next_pid, comm);
644 if (!trace_seq_printf(&iter->seq,
645 " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
646 field->prev_pid,
647 field->prev_prio,
648 S, delim,
649 field->next_cpu,
650 field->next_pid,
651 field->next_prio,
652 T, comm))
653 return TRACE_TYPE_PARTIAL_LINE;
654
655 return TRACE_TYPE_HANDLED;
656}
657
658static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags)
659{
660 return trace_ctxwake_print(iter, "==>");
661}
662
663static enum print_line_t trace_wake_print(struct trace_iterator *iter,
664 int flags)
665{
666 return trace_ctxwake_print(iter, " +");
667}
668
669static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
670{
671 struct ctx_switch_entry *field;
672 int T;
673
674 trace_assign_type(field, iter->ent);
675
676 if (!S)
677 task_state_char(field->prev_state);
678 T = task_state_char(field->next_state);
679 if (!trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n",
680 field->prev_pid,
681 field->prev_prio,
682 S,
683 field->next_cpu,
684 field->next_pid,
685 field->next_prio,
686 T))
687 return TRACE_TYPE_PARTIAL_LINE;
688
689 return TRACE_TYPE_HANDLED;
690}
691
692static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags)
693{
694 return trace_ctxwake_raw(iter, 0);
695}
696
697static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags)
698{
699 return trace_ctxwake_raw(iter, '+');
700}
701
702
703static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
704{
705 struct ctx_switch_entry *field;
706 struct trace_seq *s = &iter->seq;
707 int T;
708
709 trace_assign_type(field, iter->ent);
710
711 if (!S)
712 task_state_char(field->prev_state);
713 T = task_state_char(field->next_state);
714
715 SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid);
716 SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio);
717 SEQ_PUT_HEX_FIELD_RET(s, S);
718 SEQ_PUT_HEX_FIELD_RET(s, field->next_cpu);
719 SEQ_PUT_HEX_FIELD_RET(s, field->next_pid);
720 SEQ_PUT_HEX_FIELD_RET(s, field->next_prio);
721 SEQ_PUT_HEX_FIELD_RET(s, T);
722
723 return TRACE_TYPE_HANDLED;
724}
725
726static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags)
727{
728 return trace_ctxwake_hex(iter, 0);
729}
730
731static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags)
732{
733 return trace_ctxwake_hex(iter, '+');
734}
735
736static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter,
737 int flags)
738{
739 struct ctx_switch_entry *field;
740 struct trace_seq *s = &iter->seq;
741
742 trace_assign_type(field, iter->ent);
743
744 SEQ_PUT_FIELD_RET(s, field->prev_pid);
745 SEQ_PUT_FIELD_RET(s, field->prev_prio);
746 SEQ_PUT_FIELD_RET(s, field->prev_state);
747 SEQ_PUT_FIELD_RET(s, field->next_pid);
748 SEQ_PUT_FIELD_RET(s, field->next_prio);
749 SEQ_PUT_FIELD_RET(s, field->next_state);
750
751 return TRACE_TYPE_HANDLED;
752}
753
754static struct trace_event trace_ctx_event = {
755 .type = TRACE_CTX,
756 .trace = trace_ctx_print,
757 .raw = trace_ctx_raw,
758 .hex = trace_ctx_hex,
759 .binary = trace_ctxwake_bin,
760};
761
762static struct trace_event trace_wake_event = {
763 .type = TRACE_WAKE,
764 .trace = trace_wake_print,
765 .raw = trace_wake_raw,
766 .hex = trace_wake_hex,
767 .binary = trace_ctxwake_bin,
768};
769
770/* TRACE_SPECIAL */
771static enum print_line_t trace_special_print(struct trace_iterator *iter,
772 int flags)
773{
774 struct special_entry *field;
775
776 trace_assign_type(field, iter->ent);
777
778 if (!trace_seq_printf(&iter->seq, "# %ld %ld %ld\n",
779 field->arg1,
780 field->arg2,
781 field->arg3))
782 return TRACE_TYPE_PARTIAL_LINE;
783
784 return TRACE_TYPE_HANDLED;
785}
786
787static enum print_line_t trace_special_hex(struct trace_iterator *iter,
788 int flags)
789{
790 struct special_entry *field;
791 struct trace_seq *s = &iter->seq;
792
793 trace_assign_type(field, iter->ent);
794
795 SEQ_PUT_HEX_FIELD_RET(s, field->arg1);
796 SEQ_PUT_HEX_FIELD_RET(s, field->arg2);
797 SEQ_PUT_HEX_FIELD_RET(s, field->arg3);
798
799 return TRACE_TYPE_HANDLED;
800}
801
802static enum print_line_t trace_special_bin(struct trace_iterator *iter,
803 int flags)
804{
805 struct special_entry *field;
806 struct trace_seq *s = &iter->seq;
807
808 trace_assign_type(field, iter->ent);
809
810 SEQ_PUT_FIELD_RET(s, field->arg1);
811 SEQ_PUT_FIELD_RET(s, field->arg2);
812 SEQ_PUT_FIELD_RET(s, field->arg3);
813
814 return TRACE_TYPE_HANDLED;
815}
816
817static struct trace_event trace_special_event = {
818 .type = TRACE_SPECIAL,
819 .trace = trace_special_print,
820 .raw = trace_special_print,
821 .hex = trace_special_hex,
822 .binary = trace_special_bin,
823};
824
825/* TRACE_STACK */
826
827static enum print_line_t trace_stack_print(struct trace_iterator *iter,
828 int flags)
829{
830 struct stack_entry *field;
831 struct trace_seq *s = &iter->seq;
832 int i;
833
834 trace_assign_type(field, iter->ent);
835
836 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
837 if (i) {
838 if (!trace_seq_puts(s, " <= "))
839 goto partial;
840
841 if (!seq_print_ip_sym(s, field->caller[i], flags))
842 goto partial;
843 }
844 if (!trace_seq_puts(s, "\n"))
845 goto partial;
846 }
847
848 return TRACE_TYPE_HANDLED;
849
850 partial:
851 return TRACE_TYPE_PARTIAL_LINE;
852}
853
854static struct trace_event trace_stack_event = {
855 .type = TRACE_STACK,
856 .trace = trace_stack_print,
857 .raw = trace_special_print,
858 .hex = trace_special_hex,
859 .binary = trace_special_bin,
860};
861
862/* TRACE_USER_STACK */
863static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
864 int flags)
865{
866 struct userstack_entry *field;
867 struct trace_seq *s = &iter->seq;
868
869 trace_assign_type(field, iter->ent);
870
871 if (!seq_print_userip_objs(field, s, flags))
872 goto partial;
873
874 if (!trace_seq_putc(s, '\n'))
875 goto partial;
876
877 return TRACE_TYPE_HANDLED;
878
879 partial:
880 return TRACE_TYPE_PARTIAL_LINE;
881}
882
883static struct trace_event trace_user_stack_event = {
884 .type = TRACE_USER_STACK,
885 .trace = trace_user_stack_print,
886 .raw = trace_special_print,
887 .hex = trace_special_hex,
888 .binary = trace_special_bin,
889};
890
891/* TRACE_BPRINT */
892static enum print_line_t
893trace_bprint_print(struct trace_iterator *iter, int flags)
894{
895 struct trace_entry *entry = iter->ent;
896 struct trace_seq *s = &iter->seq;
897 struct bprint_entry *field;
898
899 trace_assign_type(field, entry);
900
901 if (!seq_print_ip_sym(s, field->ip, flags))
902 goto partial;
903
904 if (!trace_seq_puts(s, ": "))
905 goto partial;
906
907 if (!trace_seq_bprintf(s, field->fmt, field->buf))
908 goto partial;
909
910 return TRACE_TYPE_HANDLED;
911
912 partial:
913 return TRACE_TYPE_PARTIAL_LINE;
914}
915
916
917static enum print_line_t
918trace_bprint_raw(struct trace_iterator *iter, int flags)
919{
920 struct bprint_entry *field;
921 struct trace_seq *s = &iter->seq;
922
923 trace_assign_type(field, iter->ent);
924
925 if (!trace_seq_printf(s, ": %lx : ", field->ip))
926 goto partial;
927
928 if (!trace_seq_bprintf(s, field->fmt, field->buf))
929 goto partial;
930
931 return TRACE_TYPE_HANDLED;
932
933 partial:
934 return TRACE_TYPE_PARTIAL_LINE;
935}
936
937
938static struct trace_event trace_bprint_event = {
939 .type = TRACE_BPRINT,
940 .trace = trace_bprint_print,
941 .raw = trace_bprint_raw,
942};
943
944/* TRACE_PRINT */
945static enum print_line_t trace_print_print(struct trace_iterator *iter,
946 int flags)
947{
948 struct print_entry *field;
949 struct trace_seq *s = &iter->seq;
950
951 trace_assign_type(field, iter->ent);
952
953 if (!seq_print_ip_sym(s, field->ip, flags))
954 goto partial;
955
956 if (!trace_seq_printf(s, ": %s", field->buf))
957 goto partial;
958
959 return TRACE_TYPE_HANDLED;
960
961 partial:
962 return TRACE_TYPE_PARTIAL_LINE;
963}
964
965static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags)
966{
967 struct print_entry *field;
968
969 trace_assign_type(field, iter->ent);
970
971 if (!trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf))
972 goto partial;
973
974 return TRACE_TYPE_HANDLED;
975
976 partial:
977 return TRACE_TYPE_PARTIAL_LINE;
978}
979
980static struct trace_event trace_print_event = {
981 .type = TRACE_PRINT,
982 .trace = trace_print_print,
983 .raw = trace_print_raw,
984};
985
986
987static struct trace_event *events[] __initdata = {
988 &trace_fn_event,
989 &trace_ctx_event,
990 &trace_wake_event,
991 &trace_special_event,
992 &trace_stack_event,
993 &trace_user_stack_event,
994 &trace_bprint_event,
995 &trace_print_event,
996 NULL
997};
998
999__init static int init_events(void)
1000{
1001 struct trace_event *event;
1002 int i, ret;
1003
1004 for (i = 0; events[i]; i++) {
1005 event = events[i];
1006
1007 ret = register_ftrace_event(event);
1008 if (!ret) {
1009 printk(KERN_WARNING "event %d failed to register\n",
1010 event->type);
1011 WARN_ON_ONCE(1);
1012 }
1013 }
1014
1015 return 0;
1016}
1017device_initcall(init_events);
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
new file mode 100644
index 000000000000..e0bde39c2dd9
--- /dev/null
+++ b/kernel/trace/trace_output.h
@@ -0,0 +1,71 @@
1#ifndef __TRACE_EVENTS_H
2#define __TRACE_EVENTS_H
3
4#include "trace.h"
5
6typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter,
7 int flags);
8
9struct trace_event {
10 struct hlist_node node;
11 int type;
12 trace_print_func trace;
13 trace_print_func raw;
14 trace_print_func hex;
15 trace_print_func binary;
16};
17
18extern enum print_line_t
19trace_print_bprintk_msg_only(struct trace_iterator *iter);
20extern enum print_line_t
21trace_print_printk_msg_only(struct trace_iterator *iter);
22
23extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
24 __attribute__ ((format (printf, 2, 3)));
25extern int
26trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary);
27extern int
28seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
29 unsigned long sym_flags);
30extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
31 size_t cnt);
32extern int trace_seq_puts(struct trace_seq *s, const char *str);
33extern int trace_seq_putc(struct trace_seq *s, unsigned char c);
34extern int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len);
35extern int trace_seq_putmem_hex(struct trace_seq *s, const void *mem,
36 size_t len);
37extern void *trace_seq_reserve(struct trace_seq *s, size_t len);
38extern int trace_seq_path(struct trace_seq *s, struct path *path);
39extern int seq_print_userip_objs(const struct userstack_entry *entry,
40 struct trace_seq *s, unsigned long sym_flags);
41extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
42 unsigned long ip, unsigned long sym_flags);
43
44extern int trace_print_context(struct trace_iterator *iter);
45extern int trace_print_lat_context(struct trace_iterator *iter);
46
47extern struct trace_event *ftrace_find_event(int type);
48extern int register_ftrace_event(struct trace_event *event);
49extern int unregister_ftrace_event(struct trace_event *event);
50
51extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
52 int flags);
53
54#define MAX_MEMHEX_BYTES 8
55#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
56
57#define SEQ_PUT_FIELD_RET(s, x) \
58do { \
59 if (!trace_seq_putmem(s, &(x), sizeof(x))) \
60 return TRACE_TYPE_PARTIAL_LINE; \
61} while (0)
62
63#define SEQ_PUT_HEX_FIELD_RET(s, x) \
64do { \
65 BUILD_BUG_ON(sizeof(x) > MAX_MEMHEX_BYTES); \
66 if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \
67 return TRACE_TYPE_PARTIAL_LINE; \
68} while (0)
69
70#endif
71
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
index 7bda248daf55..118439709fb7 100644
--- a/kernel/trace/trace_power.c
+++ b/kernel/trace/trace_power.c
@@ -11,15 +11,113 @@
11 11
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/debugfs.h> 13#include <linux/debugfs.h>
14#include <linux/ftrace.h> 14#include <trace/power.h>
15#include <linux/kallsyms.h> 15#include <linux/kallsyms.h>
16#include <linux/module.h> 16#include <linux/module.h>
17 17
18#include "trace.h" 18#include "trace.h"
19#include "trace_output.h"
19 20
20static struct trace_array *power_trace; 21static struct trace_array *power_trace;
21static int __read_mostly trace_power_enabled; 22static int __read_mostly trace_power_enabled;
22 23
24static void probe_power_start(struct power_trace *it, unsigned int type,
25 unsigned int level)
26{
27 if (!trace_power_enabled)
28 return;
29
30 memset(it, 0, sizeof(struct power_trace));
31 it->state = level;
32 it->type = type;
33 it->stamp = ktime_get();
34}
35
36
37static void probe_power_end(struct power_trace *it)
38{
39 struct ring_buffer_event *event;
40 struct trace_power *entry;
41 struct trace_array_cpu *data;
42 struct trace_array *tr = power_trace;
43
44 if (!trace_power_enabled)
45 return;
46
47 preempt_disable();
48 it->end = ktime_get();
49 data = tr->data[smp_processor_id()];
50
51 event = trace_buffer_lock_reserve(tr, TRACE_POWER,
52 sizeof(*entry), 0, 0);
53 if (!event)
54 goto out;
55 entry = ring_buffer_event_data(event);
56 entry->state_data = *it;
57 trace_buffer_unlock_commit(tr, event, 0, 0);
58 out:
59 preempt_enable();
60}
61
62static void probe_power_mark(struct power_trace *it, unsigned int type,
63 unsigned int level)
64{
65 struct ring_buffer_event *event;
66 struct trace_power *entry;
67 struct trace_array_cpu *data;
68 struct trace_array *tr = power_trace;
69
70 if (!trace_power_enabled)
71 return;
72
73 memset(it, 0, sizeof(struct power_trace));
74 it->state = level;
75 it->type = type;
76 it->stamp = ktime_get();
77 preempt_disable();
78 it->end = it->stamp;
79 data = tr->data[smp_processor_id()];
80
81 event = trace_buffer_lock_reserve(tr, TRACE_POWER,
82 sizeof(*entry), 0, 0);
83 if (!event)
84 goto out;
85 entry = ring_buffer_event_data(event);
86 entry->state_data = *it;
87 trace_buffer_unlock_commit(tr, event, 0, 0);
88 out:
89 preempt_enable();
90}
91
92static int tracing_power_register(void)
93{
94 int ret;
95
96 ret = register_trace_power_start(probe_power_start);
97 if (ret) {
98 pr_info("power trace: Couldn't activate tracepoint"
99 " probe to trace_power_start\n");
100 return ret;
101 }
102 ret = register_trace_power_end(probe_power_end);
103 if (ret) {
104 pr_info("power trace: Couldn't activate tracepoint"
105 " probe to trace_power_end\n");
106 goto fail_start;
107 }
108 ret = register_trace_power_mark(probe_power_mark);
109 if (ret) {
110 pr_info("power trace: Couldn't activate tracepoint"
111 " probe to trace_power_mark\n");
112 goto fail_end;
113 }
114 return ret;
115fail_end:
116 unregister_trace_power_end(probe_power_end);
117fail_start:
118 unregister_trace_power_start(probe_power_start);
119 return ret;
120}
23 121
24static void start_power_trace(struct trace_array *tr) 122static void start_power_trace(struct trace_array *tr)
25{ 123{
@@ -31,6 +129,14 @@ static void stop_power_trace(struct trace_array *tr)
31 trace_power_enabled = 0; 129 trace_power_enabled = 0;
32} 130}
33 131
132static void power_trace_reset(struct trace_array *tr)
133{
134 trace_power_enabled = 0;
135 unregister_trace_power_start(probe_power_start);
136 unregister_trace_power_end(probe_power_end);
137 unregister_trace_power_mark(probe_power_mark);
138}
139
34 140
35static int power_trace_init(struct trace_array *tr) 141static int power_trace_init(struct trace_array *tr)
36{ 142{
@@ -38,6 +144,7 @@ static int power_trace_init(struct trace_array *tr)
38 power_trace = tr; 144 power_trace = tr;
39 145
40 trace_power_enabled = 1; 146 trace_power_enabled = 1;
147 tracing_power_register();
41 148
42 for_each_cpu(cpu, cpu_possible_mask) 149 for_each_cpu(cpu, cpu_possible_mask)
43 tracing_reset(tr, cpu); 150 tracing_reset(tr, cpu);
@@ -79,14 +186,21 @@ static enum print_line_t power_print_line(struct trace_iterator *iter)
79 return TRACE_TYPE_UNHANDLED; 186 return TRACE_TYPE_UNHANDLED;
80} 187}
81 188
189static void power_print_header(struct seq_file *s)
190{
191 seq_puts(s, "# TIMESTAMP STATE EVENT\n");
192 seq_puts(s, "# | | |\n");
193}
194
82static struct tracer power_tracer __read_mostly = 195static struct tracer power_tracer __read_mostly =
83{ 196{
84 .name = "power", 197 .name = "power",
85 .init = power_trace_init, 198 .init = power_trace_init,
86 .start = start_power_trace, 199 .start = start_power_trace,
87 .stop = stop_power_trace, 200 .stop = stop_power_trace,
88 .reset = stop_power_trace, 201 .reset = power_trace_reset,
89 .print_line = power_print_line, 202 .print_line = power_print_line,
203 .print_header = power_print_header,
90}; 204};
91 205
92static int init_power_trace(void) 206static int init_power_trace(void)
@@ -94,86 +208,3 @@ static int init_power_trace(void)
94 return register_tracer(&power_tracer); 208 return register_tracer(&power_tracer);
95} 209}
96device_initcall(init_power_trace); 210device_initcall(init_power_trace);
97
98void trace_power_start(struct power_trace *it, unsigned int type,
99 unsigned int level)
100{
101 if (!trace_power_enabled)
102 return;
103
104 memset(it, 0, sizeof(struct power_trace));
105 it->state = level;
106 it->type = type;
107 it->stamp = ktime_get();
108}
109EXPORT_SYMBOL_GPL(trace_power_start);
110
111
112void trace_power_end(struct power_trace *it)
113{
114 struct ring_buffer_event *event;
115 struct trace_power *entry;
116 struct trace_array_cpu *data;
117 unsigned long irq_flags;
118 struct trace_array *tr = power_trace;
119
120 if (!trace_power_enabled)
121 return;
122
123 preempt_disable();
124 it->end = ktime_get();
125 data = tr->data[smp_processor_id()];
126
127 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
128 &irq_flags);
129 if (!event)
130 goto out;
131 entry = ring_buffer_event_data(event);
132 tracing_generic_entry_update(&entry->ent, 0, 0);
133 entry->ent.type = TRACE_POWER;
134 entry->state_data = *it;
135 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
136
137 trace_wake_up();
138
139 out:
140 preempt_enable();
141}
142EXPORT_SYMBOL_GPL(trace_power_end);
143
144void trace_power_mark(struct power_trace *it, unsigned int type,
145 unsigned int level)
146{
147 struct ring_buffer_event *event;
148 struct trace_power *entry;
149 struct trace_array_cpu *data;
150 unsigned long irq_flags;
151 struct trace_array *tr = power_trace;
152
153 if (!trace_power_enabled)
154 return;
155
156 memset(it, 0, sizeof(struct power_trace));
157 it->state = level;
158 it->type = type;
159 it->stamp = ktime_get();
160 preempt_disable();
161 it->end = it->stamp;
162 data = tr->data[smp_processor_id()];
163
164 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
165 &irq_flags);
166 if (!event)
167 goto out;
168 entry = ring_buffer_event_data(event);
169 tracing_generic_entry_update(&entry->ent, 0, 0);
170 entry->ent.type = TRACE_POWER;
171 entry->state_data = *it;
172 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
173
174 trace_wake_up();
175
176 out:
177 preempt_enable();
178}
179EXPORT_SYMBOL_GPL(trace_power_mark);
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
new file mode 100644
index 000000000000..eb81556107fe
--- /dev/null
+++ b/kernel/trace/trace_printk.c
@@ -0,0 +1,270 @@
1/*
2 * trace binary printk
3 *
4 * Copyright (C) 2008 Lai Jiangshan <laijs@cn.fujitsu.com>
5 *
6 */
7#include <linux/seq_file.h>
8#include <linux/debugfs.h>
9#include <linux/uaccess.h>
10#include <linux/kernel.h>
11#include <linux/ftrace.h>
12#include <linux/string.h>
13#include <linux/module.h>
14#include <linux/marker.h>
15#include <linux/mutex.h>
16#include <linux/ctype.h>
17#include <linux/list.h>
18#include <linux/slab.h>
19#include <linux/fs.h>
20
21#include "trace.h"
22
23#ifdef CONFIG_MODULES
24
25/*
26 * modules trace_printk()'s formats are autosaved in struct trace_bprintk_fmt
27 * which are queued on trace_bprintk_fmt_list.
28 */
29static LIST_HEAD(trace_bprintk_fmt_list);
30
31/* serialize accesses to trace_bprintk_fmt_list */
32static DEFINE_MUTEX(btrace_mutex);
33
34struct trace_bprintk_fmt {
35 struct list_head list;
36 char fmt[0];
37};
38
39static inline struct trace_bprintk_fmt *lookup_format(const char *fmt)
40{
41 struct trace_bprintk_fmt *pos;
42 list_for_each_entry(pos, &trace_bprintk_fmt_list, list) {
43 if (!strcmp(pos->fmt, fmt))
44 return pos;
45 }
46 return NULL;
47}
48
49static
50void hold_module_trace_bprintk_format(const char **start, const char **end)
51{
52 const char **iter;
53
54 mutex_lock(&btrace_mutex);
55 for (iter = start; iter < end; iter++) {
56 struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter);
57 if (tb_fmt) {
58 *iter = tb_fmt->fmt;
59 continue;
60 }
61
62 tb_fmt = kmalloc(offsetof(struct trace_bprintk_fmt, fmt)
63 + strlen(*iter) + 1, GFP_KERNEL);
64 if (tb_fmt) {
65 list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list);
66 strcpy(tb_fmt->fmt, *iter);
67 *iter = tb_fmt->fmt;
68 } else
69 *iter = NULL;
70 }
71 mutex_unlock(&btrace_mutex);
72}
73
74static int module_trace_bprintk_format_notify(struct notifier_block *self,
75 unsigned long val, void *data)
76{
77 struct module *mod = data;
78 if (mod->num_trace_bprintk_fmt) {
79 const char **start = mod->trace_bprintk_fmt_start;
80 const char **end = start + mod->num_trace_bprintk_fmt;
81
82 if (val == MODULE_STATE_COMING)
83 hold_module_trace_bprintk_format(start, end);
84 }
85 return 0;
86}
87
88#else /* !CONFIG_MODULES */
89__init static int
90module_trace_bprintk_format_notify(struct notifier_block *self,
91 unsigned long val, void *data)
92{
93 return 0;
94}
95#endif /* CONFIG_MODULES */
96
97
98__initdata_or_module static
99struct notifier_block module_trace_bprintk_format_nb = {
100 .notifier_call = module_trace_bprintk_format_notify,
101};
102
103int __trace_bprintk(unsigned long ip, const char *fmt, ...)
104 {
105 int ret;
106 va_list ap;
107
108 if (unlikely(!fmt))
109 return 0;
110
111 if (!(trace_flags & TRACE_ITER_PRINTK))
112 return 0;
113
114 va_start(ap, fmt);
115 ret = trace_vbprintk(ip, fmt, ap);
116 va_end(ap);
117 return ret;
118}
119EXPORT_SYMBOL_GPL(__trace_bprintk);
120
121int __ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap)
122 {
123 if (unlikely(!fmt))
124 return 0;
125
126 if (!(trace_flags & TRACE_ITER_PRINTK))
127 return 0;
128
129 return trace_vbprintk(ip, fmt, ap);
130}
131EXPORT_SYMBOL_GPL(__ftrace_vbprintk);
132
133int __trace_printk(unsigned long ip, const char *fmt, ...)
134{
135 int ret;
136 va_list ap;
137
138 if (!(trace_flags & TRACE_ITER_PRINTK))
139 return 0;
140
141 va_start(ap, fmt);
142 ret = trace_vprintk(ip, fmt, ap);
143 va_end(ap);
144 return ret;
145}
146EXPORT_SYMBOL_GPL(__trace_printk);
147
148int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
149{
150 if (!(trace_flags & TRACE_ITER_PRINTK))
151 return 0;
152
153 return trace_vprintk(ip, fmt, ap);
154}
155EXPORT_SYMBOL_GPL(__ftrace_vprintk);
156
157static void *
158t_next(struct seq_file *m, void *v, loff_t *pos)
159{
160 const char **fmt = m->private;
161 const char **next = fmt;
162
163 (*pos)++;
164
165 if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt)
166 return NULL;
167
168 next = fmt;
169 m->private = ++next;
170
171 return fmt;
172}
173
174static void *t_start(struct seq_file *m, loff_t *pos)
175{
176 return t_next(m, NULL, pos);
177}
178
179static int t_show(struct seq_file *m, void *v)
180{
181 const char **fmt = v;
182 const char *str = *fmt;
183 int i;
184
185 seq_printf(m, "0x%lx : \"", (unsigned long)fmt);
186
187 /*
188 * Tabs and new lines need to be converted.
189 */
190 for (i = 0; str[i]; i++) {
191 switch (str[i]) {
192 case '\n':
193 seq_puts(m, "\\n");
194 break;
195 case '\t':
196 seq_puts(m, "\\t");
197 break;
198 case '\\':
199 seq_puts(m, "\\");
200 break;
201 case '"':
202 seq_puts(m, "\\\"");
203 break;
204 default:
205 seq_putc(m, str[i]);
206 }
207 }
208 seq_puts(m, "\"\n");
209
210 return 0;
211}
212
213static void t_stop(struct seq_file *m, void *p)
214{
215}
216
217static const struct seq_operations show_format_seq_ops = {
218 .start = t_start,
219 .next = t_next,
220 .show = t_show,
221 .stop = t_stop,
222};
223
224static int
225ftrace_formats_open(struct inode *inode, struct file *file)
226{
227 int ret;
228
229 ret = seq_open(file, &show_format_seq_ops);
230 if (!ret) {
231 struct seq_file *m = file->private_data;
232
233 m->private = __start___trace_bprintk_fmt;
234 }
235 return ret;
236}
237
238static const struct file_operations ftrace_formats_fops = {
239 .open = ftrace_formats_open,
240 .read = seq_read,
241 .llseek = seq_lseek,
242 .release = seq_release,
243};
244
245static __init int init_trace_printk_function_export(void)
246{
247 struct dentry *d_tracer;
248 struct dentry *entry;
249
250 d_tracer = tracing_init_dentry();
251 if (!d_tracer)
252 return 0;
253
254 entry = debugfs_create_file("printk_formats", 0444, d_tracer,
255 NULL, &ftrace_formats_fops);
256 if (!entry)
257 pr_warning("Could not create debugfs "
258 "'printk_formats' entry\n");
259
260 return 0;
261}
262
263fs_initcall(init_trace_printk_function_export);
264
265static __init int init_trace_printk(void)
266{
267 return register_module_notifier(&module_trace_bprintk_format_nb);
268}
269
270early_initcall(init_trace_printk);
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index df175cb4564f..9117cea6f1ae 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -18,6 +18,7 @@ static struct trace_array *ctx_trace;
18static int __read_mostly tracer_enabled; 18static int __read_mostly tracer_enabled;
19static int sched_ref; 19static int sched_ref;
20static DEFINE_MUTEX(sched_register_mutex); 20static DEFINE_MUTEX(sched_register_mutex);
21static int sched_stopped;
21 22
22static void 23static void
23probe_sched_switch(struct rq *__rq, struct task_struct *prev, 24probe_sched_switch(struct rq *__rq, struct task_struct *prev,
@@ -28,7 +29,7 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,
28 int cpu; 29 int cpu;
29 int pc; 30 int pc;
30 31
31 if (!sched_ref) 32 if (!sched_ref || sched_stopped)
32 return; 33 return;
33 34
34 tracing_record_cmdline(prev); 35 tracing_record_cmdline(prev);
@@ -43,7 +44,7 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,
43 data = ctx_trace->data[cpu]; 44 data = ctx_trace->data[cpu];
44 45
45 if (likely(!atomic_read(&data->disabled))) 46 if (likely(!atomic_read(&data->disabled)))
46 tracing_sched_switch_trace(ctx_trace, data, prev, next, flags, pc); 47 tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc);
47 48
48 local_irq_restore(flags); 49 local_irq_restore(flags);
49} 50}
@@ -61,12 +62,15 @@ probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success)
61 pc = preempt_count(); 62 pc = preempt_count();
62 tracing_record_cmdline(current); 63 tracing_record_cmdline(current);
63 64
65 if (sched_stopped)
66 return;
67
64 local_irq_save(flags); 68 local_irq_save(flags);
65 cpu = raw_smp_processor_id(); 69 cpu = raw_smp_processor_id();
66 data = ctx_trace->data[cpu]; 70 data = ctx_trace->data[cpu];
67 71
68 if (likely(!atomic_read(&data->disabled))) 72 if (likely(!atomic_read(&data->disabled)))
69 tracing_sched_wakeup_trace(ctx_trace, data, wakee, current, 73 tracing_sched_wakeup_trace(ctx_trace, wakee, current,
70 flags, pc); 74 flags, pc);
71 75
72 local_irq_restore(flags); 76 local_irq_restore(flags);
@@ -93,7 +97,7 @@ static int tracing_sched_register(void)
93 ret = register_trace_sched_switch(probe_sched_switch); 97 ret = register_trace_sched_switch(probe_sched_switch);
94 if (ret) { 98 if (ret) {
95 pr_info("sched trace: Couldn't activate tracepoint" 99 pr_info("sched trace: Couldn't activate tracepoint"
96 " probe to kernel_sched_schedule\n"); 100 " probe to kernel_sched_switch\n");
97 goto fail_deprobe_wake_new; 101 goto fail_deprobe_wake_new;
98 } 102 }
99 103
@@ -185,12 +189,6 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr)
185 ctx_trace = tr; 189 ctx_trace = tr;
186} 190}
187 191
188static void start_sched_trace(struct trace_array *tr)
189{
190 tracing_reset_online_cpus(tr);
191 tracing_start_sched_switch_record();
192}
193
194static void stop_sched_trace(struct trace_array *tr) 192static void stop_sched_trace(struct trace_array *tr)
195{ 193{
196 tracing_stop_sched_switch_record(); 194 tracing_stop_sched_switch_record();
@@ -199,7 +197,8 @@ static void stop_sched_trace(struct trace_array *tr)
199static int sched_switch_trace_init(struct trace_array *tr) 197static int sched_switch_trace_init(struct trace_array *tr)
200{ 198{
201 ctx_trace = tr; 199 ctx_trace = tr;
202 start_sched_trace(tr); 200 tracing_reset_online_cpus(tr);
201 tracing_start_sched_switch_record();
203 return 0; 202 return 0;
204} 203}
205 204
@@ -211,13 +210,12 @@ static void sched_switch_trace_reset(struct trace_array *tr)
211 210
212static void sched_switch_trace_start(struct trace_array *tr) 211static void sched_switch_trace_start(struct trace_array *tr)
213{ 212{
214 tracing_reset_online_cpus(tr); 213 sched_stopped = 0;
215 tracing_start_sched_switch();
216} 214}
217 215
218static void sched_switch_trace_stop(struct trace_array *tr) 216static void sched_switch_trace_stop(struct trace_array *tr)
219{ 217{
220 tracing_stop_sched_switch(); 218 sched_stopped = 1;
221} 219}
222 220
223static struct tracer sched_switch_trace __read_mostly = 221static struct tracer sched_switch_trace __read_mostly =
@@ -227,6 +225,7 @@ static struct tracer sched_switch_trace __read_mostly =
227 .reset = sched_switch_trace_reset, 225 .reset = sched_switch_trace_reset,
228 .start = sched_switch_trace_start, 226 .start = sched_switch_trace_start,
229 .stop = sched_switch_trace_stop, 227 .stop = sched_switch_trace_stop,
228 .wait_pipe = poll_wait_pipe,
230#ifdef CONFIG_FTRACE_SELFTEST 229#ifdef CONFIG_FTRACE_SELFTEST
231 .selftest = trace_selftest_startup_sched_switch, 230 .selftest = trace_selftest_startup_sched_switch,
232#endif 231#endif
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 42ae1e77b6b3..5bc00e8f153e 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -25,12 +25,15 @@ static int __read_mostly tracer_enabled;
25static struct task_struct *wakeup_task; 25static struct task_struct *wakeup_task;
26static int wakeup_cpu; 26static int wakeup_cpu;
27static unsigned wakeup_prio = -1; 27static unsigned wakeup_prio = -1;
28static int wakeup_rt;
28 29
29static raw_spinlock_t wakeup_lock = 30static raw_spinlock_t wakeup_lock =
30 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 31 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
31 32
32static void __wakeup_reset(struct trace_array *tr); 33static void __wakeup_reset(struct trace_array *tr);
33 34
35static int save_lat_flag;
36
34#ifdef CONFIG_FUNCTION_TRACER 37#ifdef CONFIG_FUNCTION_TRACER
35/* 38/*
36 * irqsoff uses its own tracer function to keep the overhead down: 39 * irqsoff uses its own tracer function to keep the overhead down:
@@ -71,7 +74,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
71 if (task_cpu(wakeup_task) != cpu) 74 if (task_cpu(wakeup_task) != cpu)
72 goto unlock; 75 goto unlock;
73 76
74 trace_function(tr, data, ip, parent_ip, flags, pc); 77 trace_function(tr, ip, parent_ip, flags, pc);
75 78
76 unlock: 79 unlock:
77 __raw_spin_unlock(&wakeup_lock); 80 __raw_spin_unlock(&wakeup_lock);
@@ -151,7 +154,8 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
151 if (unlikely(!tracer_enabled || next != wakeup_task)) 154 if (unlikely(!tracer_enabled || next != wakeup_task))
152 goto out_unlock; 155 goto out_unlock;
153 156
154 trace_function(wakeup_trace, data, CALLER_ADDR1, CALLER_ADDR2, flags, pc); 157 trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
158 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
155 159
156 /* 160 /*
157 * usecs conversion is slow so we try to delay the conversion 161 * usecs conversion is slow so we try to delay the conversion
@@ -182,13 +186,10 @@ out:
182 186
183static void __wakeup_reset(struct trace_array *tr) 187static void __wakeup_reset(struct trace_array *tr)
184{ 188{
185 struct trace_array_cpu *data;
186 int cpu; 189 int cpu;
187 190
188 for_each_possible_cpu(cpu) { 191 for_each_possible_cpu(cpu)
189 data = tr->data[cpu];
190 tracing_reset(tr, cpu); 192 tracing_reset(tr, cpu);
191 }
192 193
193 wakeup_cpu = -1; 194 wakeup_cpu = -1;
194 wakeup_prio = -1; 195 wakeup_prio = -1;
@@ -213,6 +214,7 @@ static void wakeup_reset(struct trace_array *tr)
213static void 214static void
214probe_wakeup(struct rq *rq, struct task_struct *p, int success) 215probe_wakeup(struct rq *rq, struct task_struct *p, int success)
215{ 216{
217 struct trace_array_cpu *data;
216 int cpu = smp_processor_id(); 218 int cpu = smp_processor_id();
217 unsigned long flags; 219 unsigned long flags;
218 long disabled; 220 long disabled;
@@ -224,7 +226,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
224 tracing_record_cmdline(p); 226 tracing_record_cmdline(p);
225 tracing_record_cmdline(current); 227 tracing_record_cmdline(current);
226 228
227 if (likely(!rt_task(p)) || 229 if ((wakeup_rt && !rt_task(p)) ||
228 p->prio >= wakeup_prio || 230 p->prio >= wakeup_prio ||
229 p->prio >= current->prio) 231 p->prio >= current->prio)
230 return; 232 return;
@@ -252,9 +254,16 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
252 254
253 local_save_flags(flags); 255 local_save_flags(flags);
254 256
255 wakeup_trace->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu); 257 data = wakeup_trace->data[wakeup_cpu];
256 trace_function(wakeup_trace, wakeup_trace->data[wakeup_cpu], 258 data->preempt_timestamp = ftrace_now(cpu);
257 CALLER_ADDR1, CALLER_ADDR2, flags, pc); 259 tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc);
260
261 /*
262 * We must be careful in using CALLER_ADDR2. But since wake_up
263 * is not called by an assembly function (where as schedule is)
264 * it should be safe to use it here.
265 */
266 trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
258 267
259out_locked: 268out_locked:
260 __raw_spin_unlock(&wakeup_lock); 269 __raw_spin_unlock(&wakeup_lock);
@@ -262,12 +271,6 @@ out:
262 atomic_dec(&wakeup_trace->data[cpu]->disabled); 271 atomic_dec(&wakeup_trace->data[cpu]->disabled);
263} 272}
264 273
265/*
266 * save_tracer_enabled is used to save the state of the tracer_enabled
267 * variable when we disable it when we open a trace output file.
268 */
269static int save_tracer_enabled;
270
271static void start_wakeup_tracer(struct trace_array *tr) 274static void start_wakeup_tracer(struct trace_array *tr)
272{ 275{
273 int ret; 276 int ret;
@@ -289,7 +292,7 @@ static void start_wakeup_tracer(struct trace_array *tr)
289 ret = register_trace_sched_switch(probe_wakeup_sched_switch); 292 ret = register_trace_sched_switch(probe_wakeup_sched_switch);
290 if (ret) { 293 if (ret) {
291 pr_info("sched trace: Couldn't activate tracepoint" 294 pr_info("sched trace: Couldn't activate tracepoint"
292 " probe to kernel_sched_schedule\n"); 295 " probe to kernel_sched_switch\n");
293 goto fail_deprobe_wake_new; 296 goto fail_deprobe_wake_new;
294 } 297 }
295 298
@@ -306,13 +309,10 @@ static void start_wakeup_tracer(struct trace_array *tr)
306 309
307 register_ftrace_function(&trace_ops); 310 register_ftrace_function(&trace_ops);
308 311
309 if (tracing_is_enabled()) { 312 if (tracing_is_enabled())
310 tracer_enabled = 1; 313 tracer_enabled = 1;
311 save_tracer_enabled = 1; 314 else
312 } else {
313 tracer_enabled = 0; 315 tracer_enabled = 0;
314 save_tracer_enabled = 0;
315 }
316 316
317 return; 317 return;
318fail_deprobe_wake_new: 318fail_deprobe_wake_new:
@@ -324,54 +324,54 @@ fail_deprobe:
324static void stop_wakeup_tracer(struct trace_array *tr) 324static void stop_wakeup_tracer(struct trace_array *tr)
325{ 325{
326 tracer_enabled = 0; 326 tracer_enabled = 0;
327 save_tracer_enabled = 0;
328 unregister_ftrace_function(&trace_ops); 327 unregister_ftrace_function(&trace_ops);
329 unregister_trace_sched_switch(probe_wakeup_sched_switch); 328 unregister_trace_sched_switch(probe_wakeup_sched_switch);
330 unregister_trace_sched_wakeup_new(probe_wakeup); 329 unregister_trace_sched_wakeup_new(probe_wakeup);
331 unregister_trace_sched_wakeup(probe_wakeup); 330 unregister_trace_sched_wakeup(probe_wakeup);
332} 331}
333 332
334static int wakeup_tracer_init(struct trace_array *tr) 333static int __wakeup_tracer_init(struct trace_array *tr)
335{ 334{
335 save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT;
336 trace_flags |= TRACE_ITER_LATENCY_FMT;
337
336 tracing_max_latency = 0; 338 tracing_max_latency = 0;
337 wakeup_trace = tr; 339 wakeup_trace = tr;
338 start_wakeup_tracer(tr); 340 start_wakeup_tracer(tr);
339 return 0; 341 return 0;
340} 342}
341 343
344static int wakeup_tracer_init(struct trace_array *tr)
345{
346 wakeup_rt = 0;
347 return __wakeup_tracer_init(tr);
348}
349
350static int wakeup_rt_tracer_init(struct trace_array *tr)
351{
352 wakeup_rt = 1;
353 return __wakeup_tracer_init(tr);
354}
355
342static void wakeup_tracer_reset(struct trace_array *tr) 356static void wakeup_tracer_reset(struct trace_array *tr)
343{ 357{
344 stop_wakeup_tracer(tr); 358 stop_wakeup_tracer(tr);
345 /* make sure we put back any tasks we are tracing */ 359 /* make sure we put back any tasks we are tracing */
346 wakeup_reset(tr); 360 wakeup_reset(tr);
361
362 if (!save_lat_flag)
363 trace_flags &= ~TRACE_ITER_LATENCY_FMT;
347} 364}
348 365
349static void wakeup_tracer_start(struct trace_array *tr) 366static void wakeup_tracer_start(struct trace_array *tr)
350{ 367{
351 wakeup_reset(tr); 368 wakeup_reset(tr);
352 tracer_enabled = 1; 369 tracer_enabled = 1;
353 save_tracer_enabled = 1;
354} 370}
355 371
356static void wakeup_tracer_stop(struct trace_array *tr) 372static void wakeup_tracer_stop(struct trace_array *tr)
357{ 373{
358 tracer_enabled = 0; 374 tracer_enabled = 0;
359 save_tracer_enabled = 0;
360}
361
362static void wakeup_tracer_open(struct trace_iterator *iter)
363{
364 /* stop the trace while dumping */
365 tracer_enabled = 0;
366}
367
368static void wakeup_tracer_close(struct trace_iterator *iter)
369{
370 /* forget about any processes we were recording */
371 if (save_tracer_enabled) {
372 wakeup_reset(iter->tr);
373 tracer_enabled = 1;
374 }
375} 375}
376 376
377static struct tracer wakeup_tracer __read_mostly = 377static struct tracer wakeup_tracer __read_mostly =
@@ -381,8 +381,20 @@ static struct tracer wakeup_tracer __read_mostly =
381 .reset = wakeup_tracer_reset, 381 .reset = wakeup_tracer_reset,
382 .start = wakeup_tracer_start, 382 .start = wakeup_tracer_start,
383 .stop = wakeup_tracer_stop, 383 .stop = wakeup_tracer_stop,
384 .open = wakeup_tracer_open, 384 .print_max = 1,
385 .close = wakeup_tracer_close, 385#ifdef CONFIG_FTRACE_SELFTEST
386 .selftest = trace_selftest_startup_wakeup,
387#endif
388};
389
390static struct tracer wakeup_rt_tracer __read_mostly =
391{
392 .name = "wakeup_rt",
393 .init = wakeup_rt_tracer_init,
394 .reset = wakeup_tracer_reset,
395 .start = wakeup_tracer_start,
396 .stop = wakeup_tracer_stop,
397 .wait_pipe = poll_wait_pipe,
386 .print_max = 1, 398 .print_max = 1,
387#ifdef CONFIG_FTRACE_SELFTEST 399#ifdef CONFIG_FTRACE_SELFTEST
388 .selftest = trace_selftest_startup_wakeup, 400 .selftest = trace_selftest_startup_wakeup,
@@ -397,6 +409,10 @@ __init static int init_wakeup_tracer(void)
397 if (ret) 409 if (ret)
398 return ret; 410 return ret;
399 411
412 ret = register_tracer(&wakeup_rt_tracer);
413 if (ret)
414 return ret;
415
400 return 0; 416 return 0;
401} 417}
402device_initcall(init_wakeup_tracer); 418device_initcall(init_wakeup_tracer);
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index bc8e80a86bca..08f4eb2763d1 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -1,5 +1,6 @@
1/* Include in trace.c */ 1/* Include in trace.c */
2 2
3#include <linux/stringify.h>
3#include <linux/kthread.h> 4#include <linux/kthread.h>
4#include <linux/delay.h> 5#include <linux/delay.h>
5 6
@@ -9,11 +10,12 @@ static inline int trace_valid_entry(struct trace_entry *entry)
9 case TRACE_FN: 10 case TRACE_FN:
10 case TRACE_CTX: 11 case TRACE_CTX:
11 case TRACE_WAKE: 12 case TRACE_WAKE:
12 case TRACE_CONT:
13 case TRACE_STACK: 13 case TRACE_STACK:
14 case TRACE_PRINT: 14 case TRACE_PRINT:
15 case TRACE_SPECIAL: 15 case TRACE_SPECIAL:
16 case TRACE_BRANCH: 16 case TRACE_BRANCH:
17 case TRACE_GRAPH_ENT:
18 case TRACE_GRAPH_RET:
17 return 1; 19 return 1;
18 } 20 }
19 return 0; 21 return 0;
@@ -99,9 +101,6 @@ static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret)
99 101
100#ifdef CONFIG_DYNAMIC_FTRACE 102#ifdef CONFIG_DYNAMIC_FTRACE
101 103
102#define __STR(x) #x
103#define STR(x) __STR(x)
104
105/* Test dynamic code modification and ftrace filters */ 104/* Test dynamic code modification and ftrace filters */
106int trace_selftest_startup_dynamic_tracing(struct tracer *trace, 105int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
107 struct trace_array *tr, 106 struct trace_array *tr,
@@ -125,17 +124,17 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
125 func(); 124 func();
126 125
127 /* 126 /*
128 * Some archs *cough*PowerPC*cough* add charachters to the 127 * Some archs *cough*PowerPC*cough* add characters to the
129 * start of the function names. We simply put a '*' to 128 * start of the function names. We simply put a '*' to
130 * accomodate them. 129 * accommodate them.
131 */ 130 */
132 func_name = "*" STR(DYN_FTRACE_TEST_NAME); 131 func_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
133 132
134 /* filter only on our function */ 133 /* filter only on our function */
135 ftrace_set_filter(func_name, strlen(func_name), 1); 134 ftrace_set_filter(func_name, strlen(func_name), 1);
136 135
137 /* enable tracing */ 136 /* enable tracing */
138 ret = trace->init(tr); 137 ret = tracer_init(trace, tr);
139 if (ret) { 138 if (ret) {
140 warn_failed_init_tracer(trace, ret); 139 warn_failed_init_tracer(trace, ret);
141 goto out; 140 goto out;
@@ -209,7 +208,7 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
209 ftrace_enabled = 1; 208 ftrace_enabled = 1;
210 tracer_enabled = 1; 209 tracer_enabled = 1;
211 210
212 ret = trace->init(tr); 211 ret = tracer_init(trace, tr);
213 if (ret) { 212 if (ret) {
214 warn_failed_init_tracer(trace, ret); 213 warn_failed_init_tracer(trace, ret);
215 goto out; 214 goto out;
@@ -247,6 +246,90 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
247} 246}
248#endif /* CONFIG_FUNCTION_TRACER */ 247#endif /* CONFIG_FUNCTION_TRACER */
249 248
249
250#ifdef CONFIG_FUNCTION_GRAPH_TRACER
251
252/* Maximum number of functions to trace before diagnosing a hang */
253#define GRAPH_MAX_FUNC_TEST 100000000
254
255static void __ftrace_dump(bool disable_tracing);
256static unsigned int graph_hang_thresh;
257
258/* Wrap the real function entry probe to avoid possible hanging */
259static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
260{
261 /* This is harmlessly racy, we want to approximately detect a hang */
262 if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) {
263 ftrace_graph_stop();
264 printk(KERN_WARNING "BUG: Function graph tracer hang!\n");
265 if (ftrace_dump_on_oops)
266 __ftrace_dump(false);
267 return 0;
268 }
269
270 return trace_graph_entry(trace);
271}
272
273/*
274 * Pretty much the same than for the function tracer from which the selftest
275 * has been borrowed.
276 */
277int
278trace_selftest_startup_function_graph(struct tracer *trace,
279 struct trace_array *tr)
280{
281 int ret;
282 unsigned long count;
283
284 /*
285 * Simulate the init() callback but we attach a watchdog callback
286 * to detect and recover from possible hangs
287 */
288 tracing_reset_online_cpus(tr);
289 ret = register_ftrace_graph(&trace_graph_return,
290 &trace_graph_entry_watchdog);
291 if (ret) {
292 warn_failed_init_tracer(trace, ret);
293 goto out;
294 }
295 tracing_start_cmdline_record();
296
297 /* Sleep for a 1/10 of a second */
298 msleep(100);
299
300 /* Have we just recovered from a hang? */
301 if (graph_hang_thresh > GRAPH_MAX_FUNC_TEST) {
302 tracing_selftest_disabled = true;
303 ret = -1;
304 goto out;
305 }
306
307 tracing_stop();
308
309 /* check the trace buffer */
310 ret = trace_test_buffer(tr, &count);
311
312 trace->reset(tr);
313 tracing_start();
314
315 if (!ret && !count) {
316 printk(KERN_CONT ".. no entries found ..");
317 ret = -1;
318 goto out;
319 }
320
321 /* Don't test dynamic tracing, the function tracer already did */
322
323out:
324 /* Stop it if we failed */
325 if (ret)
326 ftrace_graph_stop();
327
328 return ret;
329}
330#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
331
332
250#ifdef CONFIG_IRQSOFF_TRACER 333#ifdef CONFIG_IRQSOFF_TRACER
251int 334int
252trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) 335trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
@@ -256,7 +339,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
256 int ret; 339 int ret;
257 340
258 /* start the tracing */ 341 /* start the tracing */
259 ret = trace->init(tr); 342 ret = tracer_init(trace, tr);
260 if (ret) { 343 if (ret) {
261 warn_failed_init_tracer(trace, ret); 344 warn_failed_init_tracer(trace, ret);
262 return ret; 345 return ret;
@@ -268,6 +351,14 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
268 local_irq_disable(); 351 local_irq_disable();
269 udelay(100); 352 udelay(100);
270 local_irq_enable(); 353 local_irq_enable();
354
355 /*
356 * Stop the tracer to avoid a warning subsequent
357 * to buffer flipping failure because tracing_stop()
358 * disables the tr and max buffers, making flipping impossible
359 * in case of parallels max irqs off latencies.
360 */
361 trace->stop(tr);
271 /* stop the tracing. */ 362 /* stop the tracing. */
272 tracing_stop(); 363 tracing_stop();
273 /* check both trace buffers */ 364 /* check both trace buffers */
@@ -310,7 +401,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
310 } 401 }
311 402
312 /* start the tracing */ 403 /* start the tracing */
313 ret = trace->init(tr); 404 ret = tracer_init(trace, tr);
314 if (ret) { 405 if (ret) {
315 warn_failed_init_tracer(trace, ret); 406 warn_failed_init_tracer(trace, ret);
316 return ret; 407 return ret;
@@ -322,6 +413,14 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
322 preempt_disable(); 413 preempt_disable();
323 udelay(100); 414 udelay(100);
324 preempt_enable(); 415 preempt_enable();
416
417 /*
418 * Stop the tracer to avoid a warning subsequent
419 * to buffer flipping failure because tracing_stop()
420 * disables the tr and max buffers, making flipping impossible
421 * in case of parallels max preempt off latencies.
422 */
423 trace->stop(tr);
325 /* stop the tracing. */ 424 /* stop the tracing. */
326 tracing_stop(); 425 tracing_stop();
327 /* check both trace buffers */ 426 /* check both trace buffers */
@@ -364,10 +463,10 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
364 } 463 }
365 464
366 /* start the tracing */ 465 /* start the tracing */
367 ret = trace->init(tr); 466 ret = tracer_init(trace, tr);
368 if (ret) { 467 if (ret) {
369 warn_failed_init_tracer(trace, ret); 468 warn_failed_init_tracer(trace, ret);
370 goto out; 469 goto out_no_start;
371 } 470 }
372 471
373 /* reset the max latency */ 472 /* reset the max latency */
@@ -381,31 +480,35 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
381 /* reverse the order of preempt vs irqs */ 480 /* reverse the order of preempt vs irqs */
382 local_irq_enable(); 481 local_irq_enable();
383 482
483 /*
484 * Stop the tracer to avoid a warning subsequent
485 * to buffer flipping failure because tracing_stop()
486 * disables the tr and max buffers, making flipping impossible
487 * in case of parallels max irqs/preempt off latencies.
488 */
489 trace->stop(tr);
384 /* stop the tracing. */ 490 /* stop the tracing. */
385 tracing_stop(); 491 tracing_stop();
386 /* check both trace buffers */ 492 /* check both trace buffers */
387 ret = trace_test_buffer(tr, NULL); 493 ret = trace_test_buffer(tr, NULL);
388 if (ret) { 494 if (ret)
389 tracing_start();
390 goto out; 495 goto out;
391 }
392 496
393 ret = trace_test_buffer(&max_tr, &count); 497 ret = trace_test_buffer(&max_tr, &count);
394 if (ret) { 498 if (ret)
395 tracing_start();
396 goto out; 499 goto out;
397 }
398 500
399 if (!ret && !count) { 501 if (!ret && !count) {
400 printk(KERN_CONT ".. no entries found .."); 502 printk(KERN_CONT ".. no entries found ..");
401 ret = -1; 503 ret = -1;
402 tracing_start();
403 goto out; 504 goto out;
404 } 505 }
405 506
406 /* do the test by disabling interrupts first this time */ 507 /* do the test by disabling interrupts first this time */
407 tracing_max_latency = 0; 508 tracing_max_latency = 0;
408 tracing_start(); 509 tracing_start();
510 trace->start(tr);
511
409 preempt_disable(); 512 preempt_disable();
410 local_irq_disable(); 513 local_irq_disable();
411 udelay(100); 514 udelay(100);
@@ -413,6 +516,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
413 /* reverse the order of preempt vs irqs */ 516 /* reverse the order of preempt vs irqs */
414 local_irq_enable(); 517 local_irq_enable();
415 518
519 trace->stop(tr);
416 /* stop the tracing. */ 520 /* stop the tracing. */
417 tracing_stop(); 521 tracing_stop();
418 /* check both trace buffers */ 522 /* check both trace buffers */
@@ -428,9 +532,10 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
428 goto out; 532 goto out;
429 } 533 }
430 534
431 out: 535out:
432 trace->reset(tr);
433 tracing_start(); 536 tracing_start();
537out_no_start:
538 trace->reset(tr);
434 tracing_max_latency = save_max; 539 tracing_max_latency = save_max;
435 540
436 return ret; 541 return ret;
@@ -496,7 +601,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
496 wait_for_completion(&isrt); 601 wait_for_completion(&isrt);
497 602
498 /* start the tracing */ 603 /* start the tracing */
499 ret = trace->init(tr); 604 ret = tracer_init(trace, tr);
500 if (ret) { 605 if (ret) {
501 warn_failed_init_tracer(trace, ret); 606 warn_failed_init_tracer(trace, ret);
502 return ret; 607 return ret;
@@ -557,7 +662,7 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr
557 int ret; 662 int ret;
558 663
559 /* start the tracing */ 664 /* start the tracing */
560 ret = trace->init(tr); 665 ret = tracer_init(trace, tr);
561 if (ret) { 666 if (ret) {
562 warn_failed_init_tracer(trace, ret); 667 warn_failed_init_tracer(trace, ret);
563 return ret; 668 return ret;
@@ -589,10 +694,10 @@ trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr)
589 int ret; 694 int ret;
590 695
591 /* start the tracing */ 696 /* start the tracing */
592 ret = trace->init(tr); 697 ret = tracer_init(trace, tr);
593 if (ret) { 698 if (ret) {
594 warn_failed_init_tracer(trace, ret); 699 warn_failed_init_tracer(trace, ret);
595 return 0; 700 return ret;
596 } 701 }
597 702
598 /* Sleep for a 1/10 of a second */ 703 /* Sleep for a 1/10 of a second */
@@ -604,6 +709,11 @@ trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr)
604 trace->reset(tr); 709 trace->reset(tr);
605 tracing_start(); 710 tracing_start();
606 711
712 if (!ret && !count) {
713 printk(KERN_CONT ".. no entries found ..");
714 ret = -1;
715 }
716
607 return ret; 717 return ret;
608} 718}
609#endif /* CONFIG_SYSPROF_TRACER */ 719#endif /* CONFIG_SYSPROF_TRACER */
@@ -616,7 +726,7 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
616 int ret; 726 int ret;
617 727
618 /* start the tracing */ 728 /* start the tracing */
619 ret = trace->init(tr); 729 ret = tracer_init(trace, tr);
620 if (ret) { 730 if (ret) {
621 warn_failed_init_tracer(trace, ret); 731 warn_failed_init_tracer(trace, ret);
622 return ret; 732 return ret;
@@ -631,6 +741,11 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
631 trace->reset(tr); 741 trace->reset(tr);
632 tracing_start(); 742 tracing_start();
633 743
744 if (!ret && !count) {
745 printk(KERN_CONT ".. no entries found ..");
746 ret = -1;
747 }
748
634 return ret; 749 return ret;
635} 750}
636#endif /* CONFIG_BRANCH_TRACER */ 751#endif /* CONFIG_BRANCH_TRACER */
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index d0871bc0aca5..c750f65f9661 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -245,16 +245,31 @@ static int trace_lookup_stack(struct seq_file *m, long i)
245#endif 245#endif
246} 246}
247 247
248static void print_disabled(struct seq_file *m)
249{
250 seq_puts(m, "#\n"
251 "# Stack tracer disabled\n"
252 "#\n"
253 "# To enable the stack tracer, either add 'stacktrace' to the\n"
254 "# kernel command line\n"
255 "# or 'echo 1 > /proc/sys/kernel/stack_tracer_enabled'\n"
256 "#\n");
257}
258
248static int t_show(struct seq_file *m, void *v) 259static int t_show(struct seq_file *m, void *v)
249{ 260{
250 long i; 261 long i;
251 int size; 262 int size;
252 263
253 if (v == SEQ_START_TOKEN) { 264 if (v == SEQ_START_TOKEN) {
254 seq_printf(m, " Depth Size Location" 265 seq_printf(m, " Depth Size Location"
255 " (%d entries)\n" 266 " (%d entries)\n"
256 " ----- ---- --------\n", 267 " ----- ---- --------\n",
257 max_stack_trace.nr_entries); 268 max_stack_trace.nr_entries);
269
270 if (!stack_tracer_enabled && !max_stack_size)
271 print_disabled(m);
272
258 return 0; 273 return 0;
259 } 274 }
260 275
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
new file mode 100644
index 000000000000..acdebd771a93
--- /dev/null
+++ b/kernel/trace/trace_stat.c
@@ -0,0 +1,326 @@
1/*
2 * Infrastructure for statistic tracing (histogram output).
3 *
4 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
5 *
6 * Based on the code from trace_branch.c which is
7 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
8 *
9 */
10
11
12#include <linux/list.h>
13#include <linux/debugfs.h>
14#include "trace_stat.h"
15#include "trace.h"
16
17
18/* List of stat entries from a tracer */
19struct trace_stat_list {
20 struct list_head list;
21 void *stat;
22};
23
24/* A stat session is the stats output in one file */
25struct tracer_stat_session {
26 struct list_head session_list;
27 struct tracer_stat *ts;
28 struct list_head stat_list;
29 struct mutex stat_mutex;
30 struct dentry *file;
31};
32
33/* All of the sessions currently in use. Each stat file embed one session */
34static LIST_HEAD(all_stat_sessions);
35static DEFINE_MUTEX(all_stat_sessions_mutex);
36
37/* The root directory for all stat files */
38static struct dentry *stat_dir;
39
40
41static void reset_stat_session(struct tracer_stat_session *session)
42{
43 struct trace_stat_list *node, *next;
44
45 list_for_each_entry_safe(node, next, &session->stat_list, list)
46 kfree(node);
47
48 INIT_LIST_HEAD(&session->stat_list);
49}
50
51static void destroy_session(struct tracer_stat_session *session)
52{
53 debugfs_remove(session->file);
54 reset_stat_session(session);
55 mutex_destroy(&session->stat_mutex);
56 kfree(session);
57}
58
59/*
60 * For tracers that don't provide a stat_cmp callback.
61 * This one will force an immediate insertion on tail of
62 * the list.
63 */
64static int dummy_cmp(void *p1, void *p2)
65{
66 return 1;
67}
68
69/*
70 * Initialize the stat list at each trace_stat file opening.
71 * All of these copies and sorting are required on all opening
72 * since the stats could have changed between two file sessions.
73 */
74static int stat_seq_init(struct tracer_stat_session *session)
75{
76 struct trace_stat_list *iter_entry, *new_entry;
77 struct tracer_stat *ts = session->ts;
78 void *stat;
79 int ret = 0;
80 int i;
81
82 mutex_lock(&session->stat_mutex);
83 reset_stat_session(session);
84
85 if (!ts->stat_cmp)
86 ts->stat_cmp = dummy_cmp;
87
88 stat = ts->stat_start();
89 if (!stat)
90 goto exit;
91
92 /*
93 * The first entry. Actually this is the second, but the first
94 * one (the stat_list head) is pointless.
95 */
96 new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL);
97 if (!new_entry) {
98 ret = -ENOMEM;
99 goto exit;
100 }
101
102 INIT_LIST_HEAD(&new_entry->list);
103
104 list_add(&new_entry->list, &session->stat_list);
105
106 new_entry->stat = stat;
107
108 /*
109 * Iterate over the tracer stat entries and store them in a sorted
110 * list.
111 */
112 for (i = 1; ; i++) {
113 stat = ts->stat_next(stat, i);
114
115 /* End of insertion */
116 if (!stat)
117 break;
118
119 new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL);
120 if (!new_entry) {
121 ret = -ENOMEM;
122 goto exit_free_list;
123 }
124
125 INIT_LIST_HEAD(&new_entry->list);
126 new_entry->stat = stat;
127
128 list_for_each_entry_reverse(iter_entry, &session->stat_list,
129 list) {
130
131 /* Insertion with a descendent sorting */
132 if (ts->stat_cmp(iter_entry->stat,
133 new_entry->stat) >= 0) {
134
135 list_add(&new_entry->list, &iter_entry->list);
136 break;
137 }
138 }
139
140 /* The current larger value */
141 if (list_empty(&new_entry->list))
142 list_add(&new_entry->list, &session->stat_list);
143 }
144exit:
145 mutex_unlock(&session->stat_mutex);
146 return ret;
147
148exit_free_list:
149 reset_stat_session(session);
150 mutex_unlock(&session->stat_mutex);
151 return ret;
152}
153
154
155static void *stat_seq_start(struct seq_file *s, loff_t *pos)
156{
157 struct tracer_stat_session *session = s->private;
158
159 /* Prevent from tracer switch or stat_list modification */
160 mutex_lock(&session->stat_mutex);
161
162 /* If we are in the beginning of the file, print the headers */
163 if (!*pos && session->ts->stat_headers)
164 return SEQ_START_TOKEN;
165
166 return seq_list_start(&session->stat_list, *pos);
167}
168
169static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos)
170{
171 struct tracer_stat_session *session = s->private;
172
173 if (p == SEQ_START_TOKEN)
174 return seq_list_start(&session->stat_list, *pos);
175
176 return seq_list_next(p, &session->stat_list, pos);
177}
178
179static void stat_seq_stop(struct seq_file *s, void *p)
180{
181 struct tracer_stat_session *session = s->private;
182 mutex_unlock(&session->stat_mutex);
183}
184
185static int stat_seq_show(struct seq_file *s, void *v)
186{
187 struct tracer_stat_session *session = s->private;
188 struct trace_stat_list *l = list_entry(v, struct trace_stat_list, list);
189
190 if (v == SEQ_START_TOKEN)
191 return session->ts->stat_headers(s);
192
193 return session->ts->stat_show(s, l->stat);
194}
195
196static const struct seq_operations trace_stat_seq_ops = {
197 .start = stat_seq_start,
198 .next = stat_seq_next,
199 .stop = stat_seq_stop,
200 .show = stat_seq_show
201};
202
203/* The session stat is refilled and resorted at each stat file opening */
204static int tracing_stat_open(struct inode *inode, struct file *file)
205{
206 int ret;
207
208 struct tracer_stat_session *session = inode->i_private;
209
210 ret = seq_open(file, &trace_stat_seq_ops);
211 if (!ret) {
212 struct seq_file *m = file->private_data;
213 m->private = session;
214 ret = stat_seq_init(session);
215 }
216
217 return ret;
218}
219
220/*
221 * Avoid consuming memory with our now useless list.
222 */
223static int tracing_stat_release(struct inode *i, struct file *f)
224{
225 struct tracer_stat_session *session = i->i_private;
226
227 mutex_lock(&session->stat_mutex);
228 reset_stat_session(session);
229 mutex_unlock(&session->stat_mutex);
230
231 return 0;
232}
233
234static const struct file_operations tracing_stat_fops = {
235 .open = tracing_stat_open,
236 .read = seq_read,
237 .llseek = seq_lseek,
238 .release = tracing_stat_release
239};
240
241static int tracing_stat_init(void)
242{
243 struct dentry *d_tracing;
244
245 d_tracing = tracing_init_dentry();
246
247 stat_dir = debugfs_create_dir("trace_stat", d_tracing);
248 if (!stat_dir)
249 pr_warning("Could not create debugfs "
250 "'trace_stat' entry\n");
251 return 0;
252}
253
254static int init_stat_file(struct tracer_stat_session *session)
255{
256 if (!stat_dir && tracing_stat_init())
257 return -ENODEV;
258
259 session->file = debugfs_create_file(session->ts->name, 0644,
260 stat_dir,
261 session, &tracing_stat_fops);
262 if (!session->file)
263 return -ENOMEM;
264 return 0;
265}
266
267int register_stat_tracer(struct tracer_stat *trace)
268{
269 struct tracer_stat_session *session, *node, *tmp;
270 int ret;
271
272 if (!trace)
273 return -EINVAL;
274
275 if (!trace->stat_start || !trace->stat_next || !trace->stat_show)
276 return -EINVAL;
277
278 /* Already registered? */
279 mutex_lock(&all_stat_sessions_mutex);
280 list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) {
281 if (node->ts == trace) {
282 mutex_unlock(&all_stat_sessions_mutex);
283 return -EINVAL;
284 }
285 }
286 mutex_unlock(&all_stat_sessions_mutex);
287
288 /* Init the session */
289 session = kmalloc(sizeof(struct tracer_stat_session), GFP_KERNEL);
290 if (!session)
291 return -ENOMEM;
292
293 session->ts = trace;
294 INIT_LIST_HEAD(&session->session_list);
295 INIT_LIST_HEAD(&session->stat_list);
296 mutex_init(&session->stat_mutex);
297 session->file = NULL;
298
299 ret = init_stat_file(session);
300 if (ret) {
301 destroy_session(session);
302 return ret;
303 }
304
305 /* Register */
306 mutex_lock(&all_stat_sessions_mutex);
307 list_add_tail(&session->session_list, &all_stat_sessions);
308 mutex_unlock(&all_stat_sessions_mutex);
309
310 return 0;
311}
312
313void unregister_stat_tracer(struct tracer_stat *trace)
314{
315 struct tracer_stat_session *node, *tmp;
316
317 mutex_lock(&all_stat_sessions_mutex);
318 list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) {
319 if (node->ts == trace) {
320 list_del(&node->session_list);
321 destroy_session(node);
322 break;
323 }
324 }
325 mutex_unlock(&all_stat_sessions_mutex);
326}
diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h
new file mode 100644
index 000000000000..202274cf7f3d
--- /dev/null
+++ b/kernel/trace/trace_stat.h
@@ -0,0 +1,31 @@
1#ifndef __TRACE_STAT_H
2#define __TRACE_STAT_H
3
4#include <linux/seq_file.h>
5
6/*
7 * If you want to provide a stat file (one-shot statistics), fill
8 * an iterator with stat_start/stat_next and a stat_show callbacks.
9 * The others callbacks are optional.
10 */
11struct tracer_stat {
12 /* The name of your stat file */
13 const char *name;
14 /* Iteration over statistic entries */
15 void *(*stat_start)(void);
16 void *(*stat_next)(void *prev, int idx);
17 /* Compare two entries for stats sorting */
18 int (*stat_cmp)(void *p1, void *p2);
19 /* Print a stat entry */
20 int (*stat_show)(struct seq_file *s, void *p);
21 /* Print the headers of your stat entries */
22 int (*stat_headers)(struct seq_file *s);
23};
24
25/*
26 * Destroy or create a stat file
27 */
28extern int register_stat_tracer(struct tracer_stat *trace);
29extern void unregister_stat_tracer(struct tracer_stat *trace);
30
31#endif /* __TRACE_STAT_H */
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
new file mode 100644
index 000000000000..5e579645ac86
--- /dev/null
+++ b/kernel/trace/trace_syscalls.c
@@ -0,0 +1,250 @@
1#include <trace/syscall.h>
2#include <linux/kernel.h>
3#include <asm/syscall.h>
4
5#include "trace_output.h"
6#include "trace.h"
7
8/* Keep a counter of the syscall tracing users */
9static int refcount;
10
11/* Prevent from races on thread flags toggling */
12static DEFINE_MUTEX(syscall_trace_lock);
13
14/* Option to display the parameters types */
15enum {
16 TRACE_SYSCALLS_OPT_TYPES = 0x1,
17};
18
19static struct tracer_opt syscalls_opts[] = {
20 { TRACER_OPT(syscall_arg_type, TRACE_SYSCALLS_OPT_TYPES) },
21 { }
22};
23
24static struct tracer_flags syscalls_flags = {
25 .val = 0, /* By default: no parameters types */
26 .opts = syscalls_opts
27};
28
29enum print_line_t
30print_syscall_enter(struct trace_iterator *iter, int flags)
31{
32 struct trace_seq *s = &iter->seq;
33 struct trace_entry *ent = iter->ent;
34 struct syscall_trace_enter *trace;
35 struct syscall_metadata *entry;
36 int i, ret, syscall;
37
38 trace_assign_type(trace, ent);
39
40 syscall = trace->nr;
41
42 entry = syscall_nr_to_meta(syscall);
43 if (!entry)
44 goto end;
45
46 ret = trace_seq_printf(s, "%s(", entry->name);
47 if (!ret)
48 return TRACE_TYPE_PARTIAL_LINE;
49
50 for (i = 0; i < entry->nb_args; i++) {
51 /* parameter types */
52 if (syscalls_flags.val & TRACE_SYSCALLS_OPT_TYPES) {
53 ret = trace_seq_printf(s, "%s ", entry->types[i]);
54 if (!ret)
55 return TRACE_TYPE_PARTIAL_LINE;
56 }
57 /* parameter values */
58 ret = trace_seq_printf(s, "%s: %lx%s ", entry->args[i],
59 trace->args[i],
60 i == entry->nb_args - 1 ? ")" : ",");
61 if (!ret)
62 return TRACE_TYPE_PARTIAL_LINE;
63 }
64
65end:
66 trace_seq_printf(s, "\n");
67 return TRACE_TYPE_HANDLED;
68}
69
70enum print_line_t
71print_syscall_exit(struct trace_iterator *iter, int flags)
72{
73 struct trace_seq *s = &iter->seq;
74 struct trace_entry *ent = iter->ent;
75 struct syscall_trace_exit *trace;
76 int syscall;
77 struct syscall_metadata *entry;
78 int ret;
79
80 trace_assign_type(trace, ent);
81
82 syscall = trace->nr;
83
84 entry = syscall_nr_to_meta(syscall);
85 if (!entry) {
86 trace_seq_printf(s, "\n");
87 return TRACE_TYPE_HANDLED;
88 }
89
90 ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
91 trace->ret);
92 if (!ret)
93 return TRACE_TYPE_PARTIAL_LINE;
94
95 return TRACE_TYPE_HANDLED;
96}
97
98void start_ftrace_syscalls(void)
99{
100 unsigned long flags;
101 struct task_struct *g, *t;
102
103 mutex_lock(&syscall_trace_lock);
104
105 /* Don't enable the flag on the tasks twice */
106 if (++refcount != 1)
107 goto unlock;
108
109 arch_init_ftrace_syscalls();
110 read_lock_irqsave(&tasklist_lock, flags);
111
112 do_each_thread(g, t) {
113 set_tsk_thread_flag(t, TIF_SYSCALL_FTRACE);
114 } while_each_thread(g, t);
115
116 read_unlock_irqrestore(&tasklist_lock, flags);
117
118unlock:
119 mutex_unlock(&syscall_trace_lock);
120}
121
122void stop_ftrace_syscalls(void)
123{
124 unsigned long flags;
125 struct task_struct *g, *t;
126
127 mutex_lock(&syscall_trace_lock);
128
129 /* There are perhaps still some users */
130 if (--refcount)
131 goto unlock;
132
133 read_lock_irqsave(&tasklist_lock, flags);
134
135 do_each_thread(g, t) {
136 clear_tsk_thread_flag(t, TIF_SYSCALL_FTRACE);
137 } while_each_thread(g, t);
138
139 read_unlock_irqrestore(&tasklist_lock, flags);
140
141unlock:
142 mutex_unlock(&syscall_trace_lock);
143}
144
145void ftrace_syscall_enter(struct pt_regs *regs)
146{
147 struct syscall_trace_enter *entry;
148 struct syscall_metadata *sys_data;
149 struct ring_buffer_event *event;
150 int size;
151 int syscall_nr;
152
153 syscall_nr = syscall_get_nr(current, regs);
154
155 sys_data = syscall_nr_to_meta(syscall_nr);
156 if (!sys_data)
157 return;
158
159 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
160
161 event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_ENTER, size,
162 0, 0);
163 if (!event)
164 return;
165
166 entry = ring_buffer_event_data(event);
167 entry->nr = syscall_nr;
168 syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);
169
170 trace_current_buffer_unlock_commit(event, 0, 0);
171 trace_wake_up();
172}
173
174void ftrace_syscall_exit(struct pt_regs *regs)
175{
176 struct syscall_trace_exit *entry;
177 struct syscall_metadata *sys_data;
178 struct ring_buffer_event *event;
179 int syscall_nr;
180
181 syscall_nr = syscall_get_nr(current, regs);
182
183 sys_data = syscall_nr_to_meta(syscall_nr);
184 if (!sys_data)
185 return;
186
187 event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_EXIT,
188 sizeof(*entry), 0, 0);
189 if (!event)
190 return;
191
192 entry = ring_buffer_event_data(event);
193 entry->nr = syscall_nr;
194 entry->ret = syscall_get_return_value(current, regs);
195
196 trace_current_buffer_unlock_commit(event, 0, 0);
197 trace_wake_up();
198}
199
200static int init_syscall_tracer(struct trace_array *tr)
201{
202 start_ftrace_syscalls();
203
204 return 0;
205}
206
207static void reset_syscall_tracer(struct trace_array *tr)
208{
209 stop_ftrace_syscalls();
210 tracing_reset_online_cpus(tr);
211}
212
213static struct trace_event syscall_enter_event = {
214 .type = TRACE_SYSCALL_ENTER,
215 .trace = print_syscall_enter,
216};
217
218static struct trace_event syscall_exit_event = {
219 .type = TRACE_SYSCALL_EXIT,
220 .trace = print_syscall_exit,
221};
222
223static struct tracer syscall_tracer __read_mostly = {
224 .name = "syscall",
225 .init = init_syscall_tracer,
226 .reset = reset_syscall_tracer,
227 .flags = &syscalls_flags,
228};
229
230__init int register_ftrace_syscalls(void)
231{
232 int ret;
233
234 ret = register_ftrace_event(&syscall_enter_event);
235 if (!ret) {
236 printk(KERN_WARNING "event %d failed to register\n",
237 syscall_enter_event.type);
238 WARN_ON_ONCE(1);
239 }
240
241 ret = register_ftrace_event(&syscall_exit_event);
242 if (!ret) {
243 printk(KERN_WARNING "event %d failed to register\n",
244 syscall_exit_event.type);
245 WARN_ON_ONCE(1);
246 }
247
248 return register_tracer(&syscall_tracer);
249}
250device_initcall(register_ftrace_syscalls);
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index eaca5ad803ff..91fd19c2149f 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -88,7 +88,7 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
88 } 88 }
89} 89}
90 90
91const static struct stacktrace_ops backtrace_ops = { 91static const struct stacktrace_ops backtrace_ops = {
92 .warning = backtrace_warning, 92 .warning = backtrace_warning,
93 .warning_symbol = backtrace_warning_symbol, 93 .warning_symbol = backtrace_warning_symbol,
94 .stack = backtrace_stack, 94 .stack = backtrace_stack,
@@ -226,15 +226,6 @@ static void stop_stack_timers(void)
226 stop_stack_timer(cpu); 226 stop_stack_timer(cpu);
227} 227}
228 228
229static void start_stack_trace(struct trace_array *tr)
230{
231 mutex_lock(&sample_timer_lock);
232 tracing_reset_online_cpus(tr);
233 start_stack_timers();
234 tracer_enabled = 1;
235 mutex_unlock(&sample_timer_lock);
236}
237
238static void stop_stack_trace(struct trace_array *tr) 229static void stop_stack_trace(struct trace_array *tr)
239{ 230{
240 mutex_lock(&sample_timer_lock); 231 mutex_lock(&sample_timer_lock);
@@ -247,12 +238,18 @@ static int stack_trace_init(struct trace_array *tr)
247{ 238{
248 sysprof_trace = tr; 239 sysprof_trace = tr;
249 240
250 start_stack_trace(tr); 241 tracing_start_cmdline_record();
242
243 mutex_lock(&sample_timer_lock);
244 start_stack_timers();
245 tracer_enabled = 1;
246 mutex_unlock(&sample_timer_lock);
251 return 0; 247 return 0;
252} 248}
253 249
254static void stack_trace_reset(struct trace_array *tr) 250static void stack_trace_reset(struct trace_array *tr)
255{ 251{
252 tracing_stop_cmdline_record();
256 stop_stack_trace(tr); 253 stop_stack_trace(tr);
257} 254}
258 255
@@ -317,7 +314,7 @@ sysprof_sample_write(struct file *filp, const char __user *ubuf,
317 return cnt; 314 return cnt;
318} 315}
319 316
320static struct file_operations sysprof_sample_fops = { 317static const struct file_operations sysprof_sample_fops = {
321 .read = sysprof_sample_read, 318 .read = sysprof_sample_read,
322 .write = sysprof_sample_write, 319 .write = sysprof_sample_write,
323}; 320};
@@ -330,5 +327,5 @@ void init_tracer_sysprof_debugfs(struct dentry *d_tracer)
330 d_tracer, NULL, &sysprof_sample_fops); 327 d_tracer, NULL, &sysprof_sample_fops);
331 if (entry) 328 if (entry)
332 return; 329 return;
333 pr_warning("Could not create debugfs 'dyn_ftrace_total_info' entry\n"); 330 pr_warning("Could not create debugfs 'sysprof_sample_period' entry\n");
334} 331}
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
new file mode 100644
index 000000000000..797201e4a137
--- /dev/null
+++ b/kernel/trace/trace_workqueue.c
@@ -0,0 +1,288 @@
1/*
2 * Workqueue statistical tracer.
3 *
4 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
5 *
6 */
7
8
9#include <trace/workqueue.h>
10#include <linux/list.h>
11#include <linux/percpu.h>
12#include "trace_stat.h"
13#include "trace.h"
14
15
16/* A cpu workqueue thread */
17struct cpu_workqueue_stats {
18 struct list_head list;
19/* Useful to know if we print the cpu headers */
20 bool first_entry;
21 int cpu;
22 pid_t pid;
23/* Can be inserted from interrupt or user context, need to be atomic */
24 atomic_t inserted;
25/*
26 * Don't need to be atomic, works are serialized in a single workqueue thread
27 * on a single CPU.
28 */
29 unsigned int executed;
30};
31
32/* List of workqueue threads on one cpu */
33struct workqueue_global_stats {
34 struct list_head list;
35 spinlock_t lock;
36};
37
38/* Don't need a global lock because allocated before the workqueues, and
39 * never freed.
40 */
41static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat);
42#define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu))
43
44/* Insertion of a work */
45static void
46probe_workqueue_insertion(struct task_struct *wq_thread,
47 struct work_struct *work)
48{
49 int cpu = cpumask_first(&wq_thread->cpus_allowed);
50 struct cpu_workqueue_stats *node, *next;
51 unsigned long flags;
52
53 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
54 list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list,
55 list) {
56 if (node->pid == wq_thread->pid) {
57 atomic_inc(&node->inserted);
58 goto found;
59 }
60 }
61 pr_debug("trace_workqueue: entry not found\n");
62found:
63 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
64}
65
66/* Execution of a work */
67static void
68probe_workqueue_execution(struct task_struct *wq_thread,
69 struct work_struct *work)
70{
71 int cpu = cpumask_first(&wq_thread->cpus_allowed);
72 struct cpu_workqueue_stats *node, *next;
73 unsigned long flags;
74
75 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
76 list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list,
77 list) {
78 if (node->pid == wq_thread->pid) {
79 node->executed++;
80 goto found;
81 }
82 }
83 pr_debug("trace_workqueue: entry not found\n");
84found:
85 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
86}
87
88/* Creation of a cpu workqueue thread */
89static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
90{
91 struct cpu_workqueue_stats *cws;
92 unsigned long flags;
93
94 WARN_ON(cpu < 0);
95
96 /* Workqueues are sometimes created in atomic context */
97 cws = kzalloc(sizeof(struct cpu_workqueue_stats), GFP_ATOMIC);
98 if (!cws) {
99 pr_warning("trace_workqueue: not enough memory\n");
100 return;
101 }
102 INIT_LIST_HEAD(&cws->list);
103 cws->cpu = cpu;
104
105 cws->pid = wq_thread->pid;
106
107 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
108 if (list_empty(&workqueue_cpu_stat(cpu)->list))
109 cws->first_entry = true;
110 list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list);
111 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
112}
113
114/* Destruction of a cpu workqueue thread */
115static void probe_workqueue_destruction(struct task_struct *wq_thread)
116{
117 /* Workqueue only execute on one cpu */
118 int cpu = cpumask_first(&wq_thread->cpus_allowed);
119 struct cpu_workqueue_stats *node, *next;
120 unsigned long flags;
121
122 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
123 list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list,
124 list) {
125 if (node->pid == wq_thread->pid) {
126 list_del(&node->list);
127 kfree(node);
128 goto found;
129 }
130 }
131
132 pr_debug("trace_workqueue: don't find workqueue to destroy\n");
133found:
134 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
135
136}
137
138static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu)
139{
140 unsigned long flags;
141 struct cpu_workqueue_stats *ret = NULL;
142
143
144 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
145
146 if (!list_empty(&workqueue_cpu_stat(cpu)->list))
147 ret = list_entry(workqueue_cpu_stat(cpu)->list.next,
148 struct cpu_workqueue_stats, list);
149
150 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
151
152 return ret;
153}
154
155static void *workqueue_stat_start(void)
156{
157 int cpu;
158 void *ret = NULL;
159
160 for_each_possible_cpu(cpu) {
161 ret = workqueue_stat_start_cpu(cpu);
162 if (ret)
163 return ret;
164 }
165 return NULL;
166}
167
168static void *workqueue_stat_next(void *prev, int idx)
169{
170 struct cpu_workqueue_stats *prev_cws = prev;
171 int cpu = prev_cws->cpu;
172 unsigned long flags;
173 void *ret = NULL;
174
175 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
176 if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) {
177 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
178 do {
179 cpu = cpumask_next(cpu, cpu_possible_mask);
180 if (cpu >= nr_cpu_ids)
181 return NULL;
182 } while (!(ret = workqueue_stat_start_cpu(cpu)));
183 return ret;
184 }
185 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
186
187 return list_entry(prev_cws->list.next, struct cpu_workqueue_stats,
188 list);
189}
190
191static int workqueue_stat_show(struct seq_file *s, void *p)
192{
193 struct cpu_workqueue_stats *cws = p;
194 unsigned long flags;
195 int cpu = cws->cpu;
196 struct pid *pid;
197 struct task_struct *tsk;
198
199 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
200 if (&cws->list == workqueue_cpu_stat(cpu)->list.next)
201 seq_printf(s, "\n");
202 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
203
204 pid = find_get_pid(cws->pid);
205 if (pid) {
206 tsk = get_pid_task(pid, PIDTYPE_PID);
207 if (tsk) {
208 seq_printf(s, "%3d %6d %6u %s\n", cws->cpu,
209 atomic_read(&cws->inserted), cws->executed,
210 tsk->comm);
211 put_task_struct(tsk);
212 }
213 put_pid(pid);
214 }
215
216 return 0;
217}
218
219static int workqueue_stat_headers(struct seq_file *s)
220{
221 seq_printf(s, "# CPU INSERTED EXECUTED NAME\n");
222 seq_printf(s, "# | | | |\n");
223 return 0;
224}
225
226struct tracer_stat workqueue_stats __read_mostly = {
227 .name = "workqueues",
228 .stat_start = workqueue_stat_start,
229 .stat_next = workqueue_stat_next,
230 .stat_show = workqueue_stat_show,
231 .stat_headers = workqueue_stat_headers
232};
233
234
235int __init stat_workqueue_init(void)
236{
237 if (register_stat_tracer(&workqueue_stats)) {
238 pr_warning("Unable to register workqueue stat tracer\n");
239 return 1;
240 }
241
242 return 0;
243}
244fs_initcall(stat_workqueue_init);
245
246/*
247 * Workqueues are created very early, just after pre-smp initcalls.
248 * So we must register our tracepoints at this stage.
249 */
250int __init trace_workqueue_early_init(void)
251{
252 int ret, cpu;
253
254 ret = register_trace_workqueue_insertion(probe_workqueue_insertion);
255 if (ret)
256 goto out;
257
258 ret = register_trace_workqueue_execution(probe_workqueue_execution);
259 if (ret)
260 goto no_insertion;
261
262 ret = register_trace_workqueue_creation(probe_workqueue_creation);
263 if (ret)
264 goto no_execution;
265
266 ret = register_trace_workqueue_destruction(probe_workqueue_destruction);
267 if (ret)
268 goto no_creation;
269
270 for_each_possible_cpu(cpu) {
271 spin_lock_init(&workqueue_cpu_stat(cpu)->lock);
272 INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list);
273 }
274
275 return 0;
276
277no_creation:
278 unregister_trace_workqueue_creation(probe_workqueue_creation);
279no_execution:
280 unregister_trace_workqueue_execution(probe_workqueue_execution);
281no_insertion:
282 unregister_trace_workqueue_insertion(probe_workqueue_insertion);
283out:
284 pr_warning("trace_workqueue: unable to trace workqueues\n");
285
286 return 1;
287}
288early_initcall(trace_workqueue_early_init);
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 79602740bbb5..1ef5d3a601c7 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -272,12 +272,15 @@ static void disable_tracepoint(struct tracepoint *elem)
272 * 272 *
273 * Updates the probe callback corresponding to a range of tracepoints. 273 * Updates the probe callback corresponding to a range of tracepoints.
274 */ 274 */
275void tracepoint_update_probe_range(struct tracepoint *begin, 275void
276 struct tracepoint *end) 276tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end)
277{ 277{
278 struct tracepoint *iter; 278 struct tracepoint *iter;
279 struct tracepoint_entry *mark_entry; 279 struct tracepoint_entry *mark_entry;
280 280
281 if (!begin)
282 return;
283
281 mutex_lock(&tracepoints_mutex); 284 mutex_lock(&tracepoints_mutex);
282 for (iter = begin; iter < end; iter++) { 285 for (iter = begin; iter < end; iter++) {
283 mark_entry = get_tracepoint(iter->name); 286 mark_entry = get_tracepoint(iter->name);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 32f8e0d2bf5a..f71fb2a08950 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -33,6 +33,7 @@
33#include <linux/kallsyms.h> 33#include <linux/kallsyms.h>
34#include <linux/debug_locks.h> 34#include <linux/debug_locks.h>
35#include <linux/lockdep.h> 35#include <linux/lockdep.h>
36#include <trace/workqueue.h>
36 37
37/* 38/*
38 * The per-CPU workqueue (if single thread, we always use the first 39 * The per-CPU workqueue (if single thread, we always use the first
@@ -123,9 +124,13 @@ struct cpu_workqueue_struct *get_wq_data(struct work_struct *work)
123 return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK); 124 return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK);
124} 125}
125 126
127DEFINE_TRACE(workqueue_insertion);
128
126static void insert_work(struct cpu_workqueue_struct *cwq, 129static void insert_work(struct cpu_workqueue_struct *cwq,
127 struct work_struct *work, struct list_head *head) 130 struct work_struct *work, struct list_head *head)
128{ 131{
132 trace_workqueue_insertion(cwq->thread, work);
133
129 set_wq_data(work, cwq); 134 set_wq_data(work, cwq);
130 /* 135 /*
131 * Ensure that we get the right work->data if we see the 136 * Ensure that we get the right work->data if we see the
@@ -257,6 +262,8 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
257} 262}
258EXPORT_SYMBOL_GPL(queue_delayed_work_on); 263EXPORT_SYMBOL_GPL(queue_delayed_work_on);
259 264
265DEFINE_TRACE(workqueue_execution);
266
260static void run_workqueue(struct cpu_workqueue_struct *cwq) 267static void run_workqueue(struct cpu_workqueue_struct *cwq)
261{ 268{
262 spin_lock_irq(&cwq->lock); 269 spin_lock_irq(&cwq->lock);
@@ -275,7 +282,7 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
275 */ 282 */
276 struct lockdep_map lockdep_map = work->lockdep_map; 283 struct lockdep_map lockdep_map = work->lockdep_map;
277#endif 284#endif
278 285 trace_workqueue_execution(cwq->thread, work);
279 cwq->current_work = work; 286 cwq->current_work = work;
280 list_del_init(cwq->worklist.next); 287 list_del_init(cwq->worklist.next);
281 spin_unlock_irq(&cwq->lock); 288 spin_unlock_irq(&cwq->lock);
@@ -746,6 +753,8 @@ init_cpu_workqueue(struct workqueue_struct *wq, int cpu)
746 return cwq; 753 return cwq;
747} 754}
748 755
756DEFINE_TRACE(workqueue_creation);
757
749static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) 758static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
750{ 759{
751 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 760 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
@@ -768,6 +777,8 @@ static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
768 sched_setscheduler_nocheck(p, SCHED_FIFO, &param); 777 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
769 cwq->thread = p; 778 cwq->thread = p;
770 779
780 trace_workqueue_creation(cwq->thread, cpu);
781
771 return 0; 782 return 0;
772} 783}
773 784
@@ -849,6 +860,8 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
849} 860}
850EXPORT_SYMBOL_GPL(__create_workqueue_key); 861EXPORT_SYMBOL_GPL(__create_workqueue_key);
851 862
863DEFINE_TRACE(workqueue_destruction);
864
852static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) 865static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
853{ 866{
854 /* 867 /*
@@ -872,6 +885,7 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
872 * checks list_empty(), and a "normal" queue_work() can't use 885 * checks list_empty(), and a "normal" queue_work() can't use
873 * a dead CPU. 886 * a dead CPU.
874 */ 887 */
888 trace_workqueue_destruction(cwq->thread);
875 kthread_stop(cwq->thread); 889 kthread_stop(cwq->thread);
876 cwq->thread = NULL; 890 cwq->thread = NULL;
877} 891}
@@ -952,20 +966,20 @@ undo:
952} 966}
953 967
954#ifdef CONFIG_SMP 968#ifdef CONFIG_SMP
955static struct workqueue_struct *work_on_cpu_wq __read_mostly;
956 969
957struct work_for_cpu { 970struct work_for_cpu {
958 struct work_struct work; 971 struct completion completion;
959 long (*fn)(void *); 972 long (*fn)(void *);
960 void *arg; 973 void *arg;
961 long ret; 974 long ret;
962}; 975};
963 976
964static void do_work_for_cpu(struct work_struct *w) 977static int do_work_for_cpu(void *_wfc)
965{ 978{
966 struct work_for_cpu *wfc = container_of(w, struct work_for_cpu, work); 979 struct work_for_cpu *wfc = _wfc;
967
968 wfc->ret = wfc->fn(wfc->arg); 980 wfc->ret = wfc->fn(wfc->arg);
981 complete(&wfc->completion);
982 return 0;
969} 983}
970 984
971/** 985/**
@@ -976,17 +990,23 @@ static void do_work_for_cpu(struct work_struct *w)
976 * 990 *
977 * This will return the value @fn returns. 991 * This will return the value @fn returns.
978 * It is up to the caller to ensure that the cpu doesn't go offline. 992 * It is up to the caller to ensure that the cpu doesn't go offline.
993 * The caller must not hold any locks which would prevent @fn from completing.
979 */ 994 */
980long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) 995long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
981{ 996{
982 struct work_for_cpu wfc; 997 struct task_struct *sub_thread;
983 998 struct work_for_cpu wfc = {
984 INIT_WORK(&wfc.work, do_work_for_cpu); 999 .completion = COMPLETION_INITIALIZER_ONSTACK(wfc.completion),
985 wfc.fn = fn; 1000 .fn = fn,
986 wfc.arg = arg; 1001 .arg = arg,
987 queue_work_on(cpu, work_on_cpu_wq, &wfc.work); 1002 };
988 flush_work(&wfc.work); 1003
989 1004 sub_thread = kthread_create(do_work_for_cpu, &wfc, "work_for_cpu");
1005 if (IS_ERR(sub_thread))
1006 return PTR_ERR(sub_thread);
1007 kthread_bind(sub_thread, cpu);
1008 wake_up_process(sub_thread);
1009 wait_for_completion(&wfc.completion);
990 return wfc.ret; 1010 return wfc.ret;
991} 1011}
992EXPORT_SYMBOL_GPL(work_on_cpu); 1012EXPORT_SYMBOL_GPL(work_on_cpu);
@@ -1002,8 +1022,4 @@ void __init init_workqueues(void)
1002 hotcpu_notifier(workqueue_cpu_callback, 0); 1022 hotcpu_notifier(workqueue_cpu_callback, 0);
1003 keventd_wq = create_workqueue("events"); 1023 keventd_wq = create_workqueue("events");
1004 BUG_ON(!keventd_wq); 1024 BUG_ON(!keventd_wq);
1005#ifdef CONFIG_SMP
1006 work_on_cpu_wq = create_workqueue("work_on_cpu");
1007 BUG_ON(!work_on_cpu_wq);
1008#endif
1009} 1025}