From 809826a389040e0ad9d646b587bccc0e34691afd Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Dec 2009 15:39:34 +0800 Subject: tracing: Have __dynamic_array() define a field This is part of a patch set that removes the show_format method in the ftrace event macros. This patch set requires that all fields are added to the ftrace_event_call->fields. This patch changes __dynamic_array() to call trace_define_field() to include fields that use __dynamic_array(). Signed-off-by: Lai Jiangshan LKML-Reference: <4B273D36.8090100@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_export.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index d4fa5dc1ee4..9978a4f4009 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -175,7 +175,12 @@ ftrace_format_##name(struct ftrace_event_call *unused, \ return ret; #undef __dynamic_array -#define __dynamic_array(type, item) +#define __dynamic_array(type, item) \ + ret = trace_define_field(event_call, #type, #item, \ + offsetof(typeof(field), item), \ + 0, is_signed_type(type), FILTER_OTHER);\ + if (ret) \ + return ret; #undef FTRACE_ENTRY #define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ -- cgit v1.2.2 From 509e760cd91c831983097ae174cb6c0b8c6c8e6b Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Dec 2009 15:39:42 +0800 Subject: tracing: Add print_fmt field This is part of a patch set that removes the show_format method in the ftrace event macros. The print_fmt field is added to hold the string that shows the print_fmt in the event format files. This patch only adds the field but it is currently not used. Later patches will use this field to enable us to remove the show_format field and function. Signed-off-by: Lai Jiangshan LKML-Reference: <4B273D3E.2000704@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 1 + include/trace/ftrace.h | 28 +++++++++++++++++++++++++++- kernel/trace/trace_export.c | 7 +++++++ 3 files changed, 35 insertions(+), 1 deletion(-) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 2233c98d80d..bd23d8e52f0 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -121,6 +121,7 @@ struct ftrace_event_call { int (*regfunc)(struct ftrace_event_call *); void (*unregfunc)(struct ftrace_event_call *); int id; + const char *print_fmt; int (*raw_init)(struct ftrace_event_call *); int (*show_format)(struct ftrace_event_call *, struct trace_seq *); diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index c6fe03e902c..3351d85c83a 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -722,8 +722,20 @@ static struct trace_event ftrace_event_type_##call = { \ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) +#undef __entry +#define __entry REC + +#undef __print_flags +#undef __print_symbolic +#undef __get_dynamic_array +#undef __get_str + +#undef TP_printk +#define TP_printk(fmt, args...) "\"" fmt "\", " __stringify(args) + #undef DECLARE_EVENT_CLASS -#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) +#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ +static const char print_fmt_##call[] = print; #undef DEFINE_EVENT #define DEFINE_EVENT(template, call, proto, args) \ @@ -737,6 +749,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ .raw_init = trace_event_raw_init, \ .regfunc = ftrace_raw_reg_event_##call, \ .unregfunc = ftrace_raw_unreg_event_##call, \ + .print_fmt = print_fmt_##template, \ .show_format = ftrace_format_##template, \ .define_fields = ftrace_define_fields_##template, \ _TRACE_PROFILE_INIT(call) \ @@ -745,6 +758,8 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #undef DEFINE_EVENT_PRINT #define DEFINE_EVENT_PRINT(template, call, proto, args, print) \ \ +static const char print_fmt_##call[] = print; \ + \ static struct ftrace_event_call __used \ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) event_##call = { \ @@ -754,6 +769,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ .raw_init = trace_event_raw_init, \ .regfunc = ftrace_raw_reg_event_##call, \ .unregfunc = ftrace_raw_unreg_event_##call, \ + .print_fmt = print_fmt_##call, \ .show_format = ftrace_format_##call, \ .define_fields = ftrace_define_fields_##template, \ _TRACE_PROFILE_INIT(call) \ @@ -837,6 +853,16 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #ifdef CONFIG_EVENT_PROFILE +#undef __entry +#define __entry entry + +#undef __get_dynamic_array +#define __get_dynamic_array(field) \ + ((void *)__entry + (__entry->__data_loc_##field & 0xffff)) + +#undef __get_str +#define __get_str(field) (char *)__get_dynamic_array(field) + #undef __perf_addr #define __perf_addr(a) __addr = (a) diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 9978a4f4009..95d14b640a6 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -203,6 +203,9 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call) return 0; } +#undef __entry +#define __entry REC + #undef __field #define __field(type, item) @@ -218,6 +221,9 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call) #undef __dynamic_array #define __dynamic_array(type, item) +#undef F_printk +#define F_printk(fmt, args...) #fmt ", " __stringify(args) + #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ \ @@ -228,6 +234,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ .id = type, \ .system = __stringify(TRACE_SYSTEM), \ .raw_init = ftrace_raw_init_event, \ + .print_fmt = print, \ .show_format = ftrace_format_##call, \ .define_fields = ftrace_define_fields_##call, \ }; \ -- cgit v1.2.2 From 50307a45f8515f6244e3b08e6b19824b9fbfe293 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Dec 2009 15:39:45 +0800 Subject: tracing/syscalls: Init print_fmt for syscall events This is part of a patch set that removes the show_format method in the ftrace event macros. Add the print_fmt initialization to the syscall events. The print_fmt is still not used, but will be in the follow up patches. Signed-off-by: Lai Jiangshan LKML-Reference: <4B273D41.609@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_syscalls.c | 68 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 67 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 75289f372dd..1352b0a36fa 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -191,6 +191,67 @@ int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) return trace_seq_putc(s, '\n'); } +static +int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) +{ + int i; + int pos = 0; + + /* When len=0, we just calculate the needed length */ +#define LEN_OR_ZERO (len ? len - pos : 0) + + pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); + for (i = 0; i < entry->nb_args; i++) { + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s", + entry->args[i], sizeof(unsigned long), + i == entry->nb_args - 1 ? "" : ", "); + } + pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); + + for (i = 0; i < entry->nb_args; i++) { + pos += snprintf(buf + pos, LEN_OR_ZERO, + ", ((unsigned long)(REC->%s))", entry->args[i]); + } + +#undef LEN_OR_ZERO + + /* return the length of print_fmt */ + return pos; +} + +static int set_syscall_print_fmt(struct ftrace_event_call *call) +{ + char *print_fmt; + int len; + struct syscall_metadata *entry = call->data; + + if (entry->enter_event != call) { + call->print_fmt = "\"0x%lx\", REC->ret"; + return 0; + } + + /* First: called with 0 length to calculate the needed length */ + len = __set_enter_print_fmt(entry, NULL, 0); + + print_fmt = kmalloc(len + 1, GFP_KERNEL); + if (!print_fmt) + return -ENOMEM; + + /* Second: actually write the @print_fmt */ + __set_enter_print_fmt(entry, print_fmt, len + 1); + call->print_fmt = print_fmt; + + return 0; +} + +static void free_syscall_print_fmt(struct ftrace_event_call *call) +{ + struct syscall_metadata *entry = call->data; + + if (entry->enter_event == call) + kfree(call->print_fmt); +} + int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s) { int ret; @@ -386,9 +447,14 @@ int init_syscall_trace(struct ftrace_event_call *call) { int id; + if (set_syscall_print_fmt(call) < 0) + return -ENOMEM; + id = register_ftrace_event(call->event); - if (!id) + if (!id) { + free_syscall_print_fmt(call); return -ENODEV; + } call->id = id; INIT_LIST_HEAD(&call->fields); return 0; -- cgit v1.2.2 From a342a0280b981c130e32dbb94dbd3a57959c4d04 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Dec 2009 15:39:49 +0800 Subject: tracing/kprobes: Init print_fmt for kprobe events This is part of a patch set that removes the show_format method in the ftrace event macros. Add the print_fmt initialization to the kprobe events. The print_fmt is still not used, but will be in the follow up patches. Signed-off-by: Lai Jiangshan LKML-Reference: <4B273D45.3080100@cn.fujitsu.com> Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 64 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 63 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 6ea90c0e2c9..147491dccea 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1250,6 +1250,62 @@ static int kretprobe_event_show_format(struct ftrace_event_call *call, ", REC->" FIELD_STRING_RETIP); } +static int __set_print_fmt(struct trace_probe *tp, char *buf, int len) +{ + int i; + int pos = 0; + + const char *fmt, *arg; + + if (!probe_is_return(tp)) { + fmt = "(%lx)"; + arg = "REC->" FIELD_STRING_IP; + } else { + fmt = "(%lx <- %lx)"; + arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; + } + + /* When len=0, we just calculate the needed length */ +#define LEN_OR_ZERO (len ? len - pos : 0) + + pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); + + for (i = 0; i < tp->nr_args; i++) { + pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%%lx", + tp->args[i].name); + } + + pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); + + for (i = 0; i < tp->nr_args; i++) { + pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", + tp->args[i].name); + } + +#undef LEN_OR_ZERO + + /* return the length of print_fmt */ + return pos; +} + +static int set_print_fmt(struct trace_probe *tp) +{ + int len; + char *print_fmt; + + /* First: called with 0 length to calculate the needed length */ + len = __set_print_fmt(tp, NULL, 0); + print_fmt = kmalloc(len + 1, GFP_KERNEL); + if (!print_fmt) + return -ENOMEM; + + /* Second: actually write the @print_fmt */ + __set_print_fmt(tp, print_fmt, len + 1); + tp->call.print_fmt = print_fmt; + + return 0; +} + #ifdef CONFIG_EVENT_PROFILE /* Kprobe profile handler */ @@ -1456,10 +1512,14 @@ static int register_probe_event(struct trace_probe *tp) call->show_format = kprobe_event_show_format; call->define_fields = kprobe_event_define_fields; } + if (set_print_fmt(tp) < 0) + return -ENOMEM; call->event = &tp->event; call->id = register_ftrace_event(&tp->event); - if (!call->id) + if (!call->id) { + kfree(call->print_fmt); return -ENODEV; + } call->enabled = 0; call->regfunc = probe_event_enable; call->unregfunc = probe_event_disable; @@ -1472,6 +1532,7 @@ static int register_probe_event(struct trace_probe *tp) ret = trace_add_event_call(call); if (ret) { pr_info("Failed to register kprobe event: %s\n", call->name); + kfree(call->print_fmt); unregister_ftrace_event(&tp->event); } return ret; @@ -1481,6 +1542,7 @@ static void unregister_probe_event(struct trace_probe *tp) { /* tp->event is unregistered in trace_remove_event_call() */ trace_remove_event_call(&tp->call); + kfree(tp->call.print_fmt); } /* Make a debugfs interface for controling probe points */ -- cgit v1.2.2 From c7ef3a9004201bca90626db246a19dadd2c29c9b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 28 Dec 2009 21:13:59 -0500 Subject: tracing: Have syscall tracing call its own init function In the clean up of having all events call one specific function, the syscall event init was changed to call this helper function. With the new print_fmt updates, the syscalls need to do special initializations. This patch converts the syscall events to call its own init function again. Cc: Lai Jiangshan Cc: Li Zefan Signed-off-by: Steven Rostedt --- include/linux/syscalls.h | 4 ++-- kernel/trace/trace_syscalls.c | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 207466a49f3..ed353d274a7 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -143,7 +143,7 @@ struct perf_event_attr; .name = "sys_enter"#sname, \ .system = "syscalls", \ .event = &enter_syscall_print_##sname, \ - .raw_init = trace_event_raw_init, \ + .raw_init = init_syscall_trace, \ .show_format = syscall_enter_format, \ .define_fields = syscall_enter_define_fields, \ .regfunc = reg_event_syscall_enter, \ @@ -165,7 +165,7 @@ struct perf_event_attr; .name = "sys_exit"#sname, \ .system = "syscalls", \ .event = &exit_syscall_print_##sname, \ - .raw_init = trace_event_raw_init, \ + .raw_init = init_syscall_trace, \ .show_format = syscall_exit_format, \ .define_fields = syscall_exit_define_fields, \ .regfunc = reg_event_syscall_exit, \ diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 1352b0a36fa..a78e86349ec 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -450,14 +450,14 @@ int init_syscall_trace(struct ftrace_event_call *call) if (set_syscall_print_fmt(call) < 0) return -ENOMEM; - id = register_ftrace_event(call->event); - if (!id) { + id = trace_event_raw_init(call); + + if (id < 0) { free_syscall_print_fmt(call); - return -ENODEV; + return id; } - call->id = id; - INIT_LIST_HEAD(&call->fields); - return 0; + + return id; } int __init init_ftrace_syscalls(void) -- cgit v1.2.2 From 5a65e956220efc2421e21ee56d6153fd5c533a95 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Dec 2009 15:39:53 +0800 Subject: tracing: Use defined fields and print_fmt to print formats The calls ftrace_format_##call() and ftrace_define_fields_##call() are almost duplicate in functionality. With the addition of the print_fmt in previous patches, these two functions can be merged into one. The trace_define_field() defines the fields and links them into the struct ftrace_event_call. The previous patches introduced the print_fmt field and this can now be used with the trace_define_field() to create the event format file fields and print_fmt field. The struct ftrace_event_call->fields are used to print the fields The struct ftrace_event_call->print_fmt is used to print the "print fmt: XXXXXXXXXXX" line. Signed-off-by: Lai Jiangshan LKML-Reference: <4B273D49.5000006@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 65 ++++++++++++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 24 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 189b09baf4f..250ec865d5f 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -528,33 +528,16 @@ extern char *__bad_type_size(void); #type, "common_" #name, offsetof(typeof(field), name), \ sizeof(field.name), is_signed_type(type) -static int trace_write_header(struct trace_seq *s) -{ - struct trace_entry field; - - /* struct trace_entry */ - return trace_seq_printf(s, - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" - "\n", - FIELD(unsigned short, type), - FIELD(unsigned char, flags), - FIELD(unsigned char, preempt_count), - FIELD(int, pid), - FIELD(int, lock_depth)); -} - static ssize_t event_format_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct ftrace_event_call *call = filp->private_data; + struct ftrace_event_field *field; struct trace_seq *s; + int common_field_count = 5; char *buf; - int r; + int r = 0; if (*ppos) return 0; @@ -565,14 +548,48 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); - /* If any of the first writes fail, so will the show_format. */ - trace_seq_printf(s, "name: %s\n", call->name); trace_seq_printf(s, "ID: %d\n", call->id); trace_seq_printf(s, "format:\n"); - trace_write_header(s); - r = call->show_format(call, s); + list_for_each_entry_reverse(field, &call->fields, link) { + /* + * Smartly shows the array type(except dynamic array). + * Normal: + * field:TYPE VAR + * If TYPE := TYPE[LEN], it is shown: + * field:TYPE VAR[LEN] + */ + const char *array_descriptor = strchr(field->type, '['); + + if (!strncmp(field->type, "__data_loc", 10)) + array_descriptor = NULL; + + if (!array_descriptor) { + r = trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;" + "\tsize:%u;\tsigned:%d;\n", + field->type, field->name, field->offset, + field->size, !!field->is_signed); + } else { + r = trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;" + "\tsize:%u;\tsigned:%d;\n", + (int)(array_descriptor - field->type), + field->type, field->name, + array_descriptor, field->offset, + field->size, !!field->is_signed); + } + + if (--common_field_count == 0) + r = trace_seq_printf(s, "\n"); + + if (!r) + break; + } + + if (r) + r = trace_seq_printf(s, "\nprint fmt: %s\n", + call->print_fmt); + if (!r) { /* * ug! The format output is bigger than a PAGE!! -- cgit v1.2.2 From 0fa0edaf32b9a78b9854f1da98d4511a501089b0 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Dec 2009 15:39:57 +0800 Subject: tracing: Remove show_format and related macros from TRACE_EVENT The previous patches added the use of print_fmt string and changes the trace_define_field() function to also create the fields and format output for the event format files. text data bss dec hex filename 5857201 1355780 9336808 16549789 fc879d vmlinux 5884589 1351684 9337896 16574169 fce6d9 vmlinux-orig The above shows the size of the vmlinux after this patch set compared to the vmlinux-orig which is before the patch set. This saves us 27k on text, 1k on bss and adds just 4k of data. The total savings of 24k in size. Signed-off-by: Lai Jiangshan LKML-Reference: <4B273D4D.40604@cn.fujitsu.com> Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 2 - include/linux/syscalls.h | 2 - include/trace/ftrace.h | 133 ++---------------------------------------- include/trace/syscall.h | 4 -- kernel/trace/trace_events.c | 12 ---- kernel/trace/trace_export.c | 73 ----------------------- kernel/trace/trace_kprobe.c | 78 ------------------------- kernel/trace/trace_syscalls.c | 66 --------------------- 8 files changed, 6 insertions(+), 364 deletions(-) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index bd23d8e52f0..84a5629adfd 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -123,8 +123,6 @@ struct ftrace_event_call { int id; const char *print_fmt; int (*raw_init)(struct ftrace_event_call *); - int (*show_format)(struct ftrace_event_call *, - struct trace_seq *); int (*define_fields)(struct ftrace_event_call *); struct list_head fields; int filter_active; diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index ed353d274a7..7b219696ad2 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -144,7 +144,6 @@ struct perf_event_attr; .system = "syscalls", \ .event = &enter_syscall_print_##sname, \ .raw_init = init_syscall_trace, \ - .show_format = syscall_enter_format, \ .define_fields = syscall_enter_define_fields, \ .regfunc = reg_event_syscall_enter, \ .unregfunc = unreg_event_syscall_enter, \ @@ -166,7 +165,6 @@ struct perf_event_attr; .system = "syscalls", \ .event = &exit_syscall_print_##sname, \ .raw_init = init_syscall_trace, \ - .show_format = syscall_exit_format, \ .define_fields = syscall_exit_define_fields, \ .regfunc = reg_event_syscall_exit, \ .unregfunc = unreg_event_syscall_exit, \ diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 3351d85c83a..df65b99880b 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -130,130 +130,6 @@ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) -/* - * Setup the showing format of trace point. - * - * int - * ftrace_format_##call(struct trace_seq *s) - * { - * struct ftrace_raw_##call field; - * int ret; - * - * ret = trace_seq_printf(s, #type " " #item ";" - * " offset:%u; size:%u;\n", - * offsetof(struct ftrace_raw_##call, item), - * sizeof(field.type)); - * - * } - */ - -#undef TP_STRUCT__entry -#define TP_STRUCT__entry(args...) args - -#undef __field -#define __field(type, item) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%u;\tsize:%u;\tsigned:%u;\n", \ - (unsigned int)offsetof(typeof(field), item), \ - (unsigned int)sizeof(field.item), \ - (unsigned int)is_signed_type(type)); \ - if (!ret) \ - return 0; - -#undef __field_ext -#define __field_ext(type, item, filter_type) __field(type, item) - -#undef __array -#define __array(type, item, len) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ - "offset:%u;\tsize:%u;\tsigned:%u;\n", \ - (unsigned int)offsetof(typeof(field), item), \ - (unsigned int)sizeof(field.item), \ - (unsigned int)is_signed_type(type)); \ - if (!ret) \ - return 0; - -#undef __dynamic_array -#define __dynamic_array(type, item, len) \ - ret = trace_seq_printf(s, "\tfield:__data_loc " #type "[] " #item ";\t"\ - "offset:%u;\tsize:%u;\tsigned:%u;\n", \ - (unsigned int)offsetof(typeof(field), \ - __data_loc_##item), \ - (unsigned int)sizeof(field.__data_loc_##item), \ - (unsigned int)is_signed_type(type)); \ - if (!ret) \ - return 0; - -#undef __string -#define __string(item, src) __dynamic_array(char, item, -1) - -#undef __entry -#define __entry REC - -#undef __print_symbolic -#undef __get_dynamic_array -#undef __get_str - -#undef TP_printk -#define TP_printk(fmt, args...) "\"%s\", %s\n", fmt, __stringify(args) - -#undef TP_fast_assign -#define TP_fast_assign(args...) args - -#undef TP_perf_assign -#define TP_perf_assign(args...) - -#undef DECLARE_EVENT_CLASS -#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, func, print) \ -static int \ -ftrace_format_setup_##call(struct ftrace_event_call *unused, \ - struct trace_seq *s) \ -{ \ - struct ftrace_raw_##call field __attribute__((unused)); \ - int ret = 0; \ - \ - tstruct; \ - \ - return ret; \ -} \ - \ -static int \ -ftrace_format_##call(struct ftrace_event_call *unused, \ - struct trace_seq *s) \ -{ \ - int ret = 0; \ - \ - ret = ftrace_format_setup_##call(unused, s); \ - if (!ret) \ - return ret; \ - \ - ret = trace_seq_printf(s, "\nprint fmt: " print); \ - \ - return ret; \ -} - -#undef DEFINE_EVENT -#define DEFINE_EVENT(template, name, proto, args) - -#undef DEFINE_EVENT_PRINT -#define DEFINE_EVENT_PRINT(template, name, proto, args, print) \ -static int \ -ftrace_format_##name(struct ftrace_event_call *unused, \ - struct trace_seq *s) \ -{ \ - int ret = 0; \ - \ - ret = ftrace_format_setup_##template(unused, s); \ - if (!ret) \ - return ret; \ - \ - trace_seq_printf(s, "\nprint fmt: " print); \ - \ - return ret; \ -} - -#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) - /* * Stage 3 of the trace events. * @@ -622,7 +498,6 @@ static void ftrace_profile_disable_##name(struct ftrace_event_call *unused)\ * .raw_init = trace_event_raw_init, * .regfunc = ftrace_reg_event_, * .unregfunc = ftrace_unreg_event_, - * .show_format = ftrace_format_, * } * */ @@ -657,6 +532,12 @@ static void ftrace_profile_disable_##name(struct ftrace_event_call *unused)\ #define __assign_str(dst, src) \ strcpy(__get_str(dst), src); +#undef TP_fast_assign +#define TP_fast_assign(args...) args + +#undef TP_perf_assign +#define TP_perf_assign(args...) + #undef DECLARE_EVENT_CLASS #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ \ @@ -750,7 +631,6 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ .regfunc = ftrace_raw_reg_event_##call, \ .unregfunc = ftrace_raw_unreg_event_##call, \ .print_fmt = print_fmt_##template, \ - .show_format = ftrace_format_##template, \ .define_fields = ftrace_define_fields_##template, \ _TRACE_PROFILE_INIT(call) \ } @@ -770,7 +650,6 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ .regfunc = ftrace_raw_reg_event_##call, \ .unregfunc = ftrace_raw_unreg_event_##call, \ .print_fmt = print_fmt_##call, \ - .show_format = ftrace_format_##call, \ .define_fields = ftrace_define_fields_##template, \ _TRACE_PROFILE_INIT(call) \ } diff --git a/include/trace/syscall.h b/include/trace/syscall.h index 961fda3556b..8cd41025445 100644 --- a/include/trace/syscall.h +++ b/include/trace/syscall.h @@ -34,10 +34,6 @@ struct syscall_metadata { extern unsigned long arch_syscall_addr(int nr); extern int init_syscall_trace(struct ftrace_event_call *call); -extern int syscall_enter_format(struct ftrace_event_call *call, - struct trace_seq *s); -extern int syscall_exit_format(struct ftrace_event_call *call, - struct trace_seq *s); extern int syscall_enter_define_fields(struct ftrace_event_call *call); extern int syscall_exit_define_fields(struct ftrace_event_call *call); extern int reg_event_syscall_enter(struct ftrace_event_call *call); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 250ec865d5f..c2a3077b735 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -520,14 +520,6 @@ out: return ret; } -extern char *__bad_type_size(void); - -#undef FIELD -#define FIELD(type, name) \ - sizeof(type) != sizeof(field.name) ? __bad_type_size() : \ - #type, "common_" #name, offsetof(typeof(field), name), \ - sizeof(field.name), is_signed_type(type) - static ssize_t event_format_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -965,10 +957,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, filter); } - /* A trace may not want to export its format */ - if (!call->show_format) - return 0; - trace_create_file("format", 0444, call->dir, call, format); diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 95d14b640a6..e091f64ba6c 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -62,78 +62,6 @@ static void __always_unused ____ftrace_check_##name(void) \ #include "trace_entries.h" - -#undef __field -#define __field(type, item) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ - offsetof(typeof(field), item), \ - sizeof(field.item), is_signed_type(type)); \ - if (!ret) \ - return 0; - -#undef __field_desc -#define __field_desc(type, container, item) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ - offsetof(typeof(field), container.item), \ - sizeof(field.container.item), \ - is_signed_type(type)); \ - if (!ret) \ - return 0; - -#undef __array -#define __array(type, item, len) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ - "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ - offsetof(typeof(field), item), \ - sizeof(field.item), is_signed_type(type)); \ - if (!ret) \ - return 0; - -#undef __array_desc -#define __array_desc(type, container, item, len) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ - "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ - offsetof(typeof(field), container.item), \ - sizeof(field.container.item), \ - is_signed_type(type)); \ - if (!ret) \ - return 0; - -#undef __dynamic_array -#define __dynamic_array(type, item) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%zu;\tsize:0;\tsigned:%u;\n", \ - offsetof(typeof(field), item), \ - is_signed_type(type)); \ - if (!ret) \ - return 0; - -#undef F_printk -#define F_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args) - -#undef __entry -#define __entry REC - -#undef FTRACE_ENTRY -#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ -static int \ -ftrace_format_##name(struct ftrace_event_call *unused, \ - struct trace_seq *s) \ -{ \ - struct struct_name field __attribute__((unused)); \ - int ret = 0; \ - \ - tstruct; \ - \ - trace_seq_printf(s, "\nprint fmt: " print); \ - \ - return ret; \ -} - -#include "trace_entries.h" - #undef __field #define __field(type, item) \ ret = trace_define_field(event_call, #type, #item, \ @@ -235,7 +163,6 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ .system = __stringify(TRACE_SYSTEM), \ .raw_init = ftrace_raw_init_event, \ .print_fmt = print, \ - .show_format = ftrace_format_##call, \ .define_fields = ftrace_define_fields_##call, \ }; \ diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 147491dccea..c99029916c7 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1174,82 +1174,6 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) return 0; } -static int __probe_event_show_format(struct trace_seq *s, - struct trace_probe *tp, const char *fmt, - const char *arg) -{ - int i; - - /* Show format */ - if (!trace_seq_printf(s, "\nprint fmt: \"%s", fmt)) - return 0; - - for (i = 0; i < tp->nr_args; i++) - if (!trace_seq_printf(s, " %s=%%lx", tp->args[i].name)) - return 0; - - if (!trace_seq_printf(s, "\", %s", arg)) - return 0; - - for (i = 0; i < tp->nr_args; i++) - if (!trace_seq_printf(s, ", REC->%s", tp->args[i].name)) - return 0; - - return trace_seq_puts(s, "\n"); -} - -#undef SHOW_FIELD -#define SHOW_FIELD(type, item, name) \ - do { \ - ret = trace_seq_printf(s, "\tfield:" #type " %s;\t" \ - "offset:%u;\tsize:%u;\tsigned:%d;\n", name,\ - (unsigned int)offsetof(typeof(field), item),\ - (unsigned int)sizeof(type), \ - is_signed_type(type)); \ - if (!ret) \ - return 0; \ - } while (0) - -static int kprobe_event_show_format(struct ftrace_event_call *call, - struct trace_seq *s) -{ - struct kprobe_trace_entry field __attribute__((unused)); - int ret, i; - struct trace_probe *tp = (struct trace_probe *)call->data; - - SHOW_FIELD(unsigned long, ip, FIELD_STRING_IP); - SHOW_FIELD(int, nargs, FIELD_STRING_NARGS); - - /* Show fields */ - for (i = 0; i < tp->nr_args; i++) - SHOW_FIELD(unsigned long, args[i], tp->args[i].name); - trace_seq_puts(s, "\n"); - - return __probe_event_show_format(s, tp, "(%lx)", - "REC->" FIELD_STRING_IP); -} - -static int kretprobe_event_show_format(struct ftrace_event_call *call, - struct trace_seq *s) -{ - struct kretprobe_trace_entry field __attribute__((unused)); - int ret, i; - struct trace_probe *tp = (struct trace_probe *)call->data; - - SHOW_FIELD(unsigned long, func, FIELD_STRING_FUNC); - SHOW_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP); - SHOW_FIELD(int, nargs, FIELD_STRING_NARGS); - - /* Show fields */ - for (i = 0; i < tp->nr_args; i++) - SHOW_FIELD(unsigned long, args[i], tp->args[i].name); - trace_seq_puts(s, "\n"); - - return __probe_event_show_format(s, tp, "(%lx <- %lx)", - "REC->" FIELD_STRING_FUNC - ", REC->" FIELD_STRING_RETIP); -} - static int __set_print_fmt(struct trace_probe *tp, char *buf, int len) { int i; @@ -1504,12 +1428,10 @@ static int register_probe_event(struct trace_probe *tp) if (probe_is_return(tp)) { tp->event.trace = print_kretprobe_event; call->raw_init = probe_event_raw_init; - call->show_format = kretprobe_event_show_format; call->define_fields = kretprobe_event_define_fields; } else { tp->event.trace = print_kprobe_event; call->raw_init = probe_event_raw_init; - call->show_format = kprobe_event_show_format; call->define_fields = kprobe_event_define_fields; } if (set_print_fmt(tp) < 0) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index a78e86349ec..49cea70fbf6 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -143,54 +143,6 @@ extern char *__bad_type_size(void); #type, #name, offsetof(typeof(trace), name), \ sizeof(trace.name), is_signed_type(type) -int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) -{ - int i; - int ret; - struct syscall_metadata *entry = call->data; - struct syscall_trace_enter trace; - int offset = offsetof(struct syscall_trace_enter, args); - - ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" - "\tsigned:%u;\n", - SYSCALL_FIELD(int, nr)); - if (!ret) - return 0; - - for (i = 0; i < entry->nb_args; i++) { - ret = trace_seq_printf(s, "\tfield:%s %s;", entry->types[i], - entry->args[i]); - if (!ret) - return 0; - ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;" - "\tsigned:%u;\n", offset, - sizeof(unsigned long), - is_signed_type(unsigned long)); - if (!ret) - return 0; - offset += sizeof(unsigned long); - } - - trace_seq_puts(s, "\nprint fmt: \""); - for (i = 0; i < entry->nb_args; i++) { - ret = trace_seq_printf(s, "%s: 0x%%0%zulx%s", entry->args[i], - sizeof(unsigned long), - i == entry->nb_args - 1 ? "" : ", "); - if (!ret) - return 0; - } - trace_seq_putc(s, '"'); - - for (i = 0; i < entry->nb_args; i++) { - ret = trace_seq_printf(s, ", ((unsigned long)(REC->%s))", - entry->args[i]); - if (!ret) - return 0; - } - - return trace_seq_putc(s, '\n'); -} - static int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) { @@ -252,24 +204,6 @@ static void free_syscall_print_fmt(struct ftrace_event_call *call) kfree(call->print_fmt); } -int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s) -{ - int ret; - struct syscall_trace_exit trace; - - ret = trace_seq_printf(s, - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" - "\tsigned:%u;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" - "\tsigned:%u;\n", - SYSCALL_FIELD(int, nr), - SYSCALL_FIELD(long, ret)); - if (!ret) - return 0; - - return trace_seq_printf(s, "\nprint fmt: \"0x%%lx\", REC->ret\n"); -} - int syscall_enter_define_fields(struct ftrace_event_call *call) { struct syscall_trace_enter trace; -- cgit v1.2.2 From 7e53bd42d14c75192b99674c40fcc359392da59d Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 6 Jan 2010 20:08:50 +0800 Subject: tracing: Consolidate protection of reader access to the ring buffer At the beginning, access to the ring buffer was fully serialized by trace_types_lock. Patch d7350c3f4569 gives more freedom to readers, and patch b04cc6b1f6 adds code to protect trace_pipe and cpu#/trace_pipe. But actually it is not enough, ring buffer readers are not always read-only, they may consume data. This patch makes accesses to trace, trace_pipe, trace_pipe_raw cpu#/trace, cpu#/trace_pipe and cpu#/trace_pipe_raw serialized. And removes tracing_reader_cpumask which is used to protect trace_pipe. Details: Ring buffer serializes readers, but it is low level protection. The validity of the events (which returns by ring_buffer_peek() ..etc) are not protected by ring buffer. The content of events may become garbage if we allow another process to consume these events concurrently: A) the page of the consumed events may become a normal page (not reader page) in ring buffer, and this page will be rewritten by the events producer. B) The page of the consumed events may become a page for splice_read, and this page will be returned to system. This patch adds trace_access_lock() and trace_access_unlock() primitives. These primitives allow multi process access to different cpu ring buffers concurrently. These primitives don't distinguish read-only and read-consume access. Multi read-only access is also serialized. And we don't use these primitives when we open files, we only use them when we read files. Signed-off-by: Lai Jiangshan LKML-Reference: <4B447D52.1050602@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 136 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 97 insertions(+), 39 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0df1b0f2cb9..abdd333a082 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -102,9 +103,6 @@ static inline void ftrace_enable_cpu(void) static cpumask_var_t __read_mostly tracing_buffer_mask; -/* Define which cpu buffers are currently read in trace_pipe */ -static cpumask_var_t tracing_reader_cpumask; - #define for_each_tracing_cpu(cpu) \ for_each_cpu(cpu, tracing_buffer_mask) @@ -243,12 +241,91 @@ static struct tracer *current_trace __read_mostly; /* * trace_types_lock is used to protect the trace_types list. - * This lock is also used to keep user access serialized. - * Accesses from userspace will grab this lock while userspace - * activities happen inside the kernel. */ static DEFINE_MUTEX(trace_types_lock); +/* + * serialize the access of the ring buffer + * + * ring buffer serializes readers, but it is low level protection. + * The validity of the events (which returns by ring_buffer_peek() ..etc) + * are not protected by ring buffer. + * + * The content of events may become garbage if we allow other process consumes + * these events concurrently: + * A) the page of the consumed events may become a normal page + * (not reader page) in ring buffer, and this page will be rewrited + * by events producer. + * B) The page of the consumed events may become a page for splice_read, + * and this page will be returned to system. + * + * These primitives allow multi process access to different cpu ring buffer + * concurrently. + * + * These primitives don't distinguish read-only and read-consume access. + * Multi read-only access are also serialized. + */ + +#ifdef CONFIG_SMP +static DECLARE_RWSEM(all_cpu_access_lock); +static DEFINE_PER_CPU(struct mutex, cpu_access_lock); + +static inline void trace_access_lock(int cpu) +{ + if (cpu == TRACE_PIPE_ALL_CPU) { + /* gain it for accessing the whole ring buffer. */ + down_write(&all_cpu_access_lock); + } else { + /* gain it for accessing a cpu ring buffer. */ + + /* Firstly block other trace_access_lock(TRACE_PIPE_ALL_CPU). */ + down_read(&all_cpu_access_lock); + + /* Secondly block other access to this @cpu ring buffer. */ + mutex_lock(&per_cpu(cpu_access_lock, cpu)); + } +} + +static inline void trace_access_unlock(int cpu) +{ + if (cpu == TRACE_PIPE_ALL_CPU) { + up_write(&all_cpu_access_lock); + } else { + mutex_unlock(&per_cpu(cpu_access_lock, cpu)); + up_read(&all_cpu_access_lock); + } +} + +static inline void trace_access_lock_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + mutex_init(&per_cpu(cpu_access_lock, cpu)); +} + +#else + +static DEFINE_MUTEX(access_lock); + +static inline void trace_access_lock(int cpu) +{ + (void)cpu; + mutex_lock(&access_lock); +} + +static inline void trace_access_unlock(int cpu) +{ + (void)cpu; + mutex_unlock(&access_lock); +} + +static inline void trace_access_lock_init(void) +{ +} + +#endif + /* trace_wait is a waitqueue for tasks blocked on trace_poll */ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); @@ -1580,12 +1657,6 @@ static void tracing_iter_reset(struct trace_iterator *iter, int cpu) } /* - * No necessary locking here. The worst thing which can - * happen is loosing events consumed at the same time - * by a trace_pipe reader. - * Other than that, we don't risk to crash the ring buffer - * because it serializes the readers. - * * The current tracer is copied to avoid a global locking * all around. */ @@ -1640,12 +1711,16 @@ static void *s_start(struct seq_file *m, loff_t *pos) } trace_event_read_lock(); + trace_access_lock(cpu_file); return p; } static void s_stop(struct seq_file *m, void *p) { + struct trace_iterator *iter = m->private; + atomic_dec(&trace_record_cmdline_disabled); + trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); } @@ -2836,22 +2911,6 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) mutex_lock(&trace_types_lock); - /* We only allow one reader per cpu */ - if (cpu_file == TRACE_PIPE_ALL_CPU) { - if (!cpumask_empty(tracing_reader_cpumask)) { - ret = -EBUSY; - goto out; - } - cpumask_setall(tracing_reader_cpumask); - } else { - if (!cpumask_test_cpu(cpu_file, tracing_reader_cpumask)) - cpumask_set_cpu(cpu_file, tracing_reader_cpumask); - else { - ret = -EBUSY; - goto out; - } - } - /* create a buffer to store the information to pass to userspace */ iter = kzalloc(sizeof(*iter), GFP_KERNEL); if (!iter) { @@ -2907,12 +2966,6 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) mutex_lock(&trace_types_lock); - if (iter->cpu_file == TRACE_PIPE_ALL_CPU) - cpumask_clear(tracing_reader_cpumask); - else - cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask); - - if (iter->trace->pipe_close) iter->trace->pipe_close(iter); @@ -3074,6 +3127,7 @@ waitagain: iter->pos = -1; trace_event_read_lock(); + trace_access_lock(iter->cpu_file); while (find_next_entry_inc(iter) != NULL) { enum print_line_t ret; int len = iter->seq.len; @@ -3090,6 +3144,7 @@ waitagain: if (iter->seq.len >= cnt) break; } + trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); /* Now copy what we have to the user */ @@ -3215,6 +3270,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, } trace_event_read_lock(); + trace_access_lock(iter->cpu_file); /* Fill as many pages as possible. */ for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) { @@ -3238,6 +3294,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, trace_seq_init(&iter->seq); } + trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); mutex_unlock(&iter->mutex); @@ -3539,10 +3596,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, info->read = 0; + trace_access_lock(info->cpu); ret = ring_buffer_read_page(info->tr->buffer, &info->spare, count, info->cpu, 0); + trace_access_unlock(info->cpu); if (ret < 0) return 0; @@ -3670,6 +3729,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, len &= PAGE_MASK; } + trace_access_lock(info->cpu); entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) { @@ -3717,6 +3777,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); } + trace_access_unlock(info->cpu); spd.nr_pages = i; /* did we read anything? */ @@ -4153,6 +4214,8 @@ static __init int tracer_init_debugfs(void) struct dentry *d_tracer; int cpu; + trace_access_lock_init(); + d_tracer = tracing_init_dentry(); trace_create_file("tracing_enabled", 0644, d_tracer, @@ -4387,9 +4450,6 @@ __init static int tracer_alloc_buffers(void) if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) goto out_free_buffer_mask; - if (!zalloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL)) - goto out_free_tracing_cpumask; - /* To save memory, keep the ring buffer size to its minimum */ if (ring_buffer_expanded) ring_buf_size = trace_buf_size; @@ -4447,8 +4507,6 @@ __init static int tracer_alloc_buffers(void) return 0; out_free_cpumask: - free_cpumask_var(tracing_reader_cpumask); -out_free_tracing_cpumask: free_cpumask_var(tracing_cpumask); out_free_buffer_mask: free_cpumask_var(tracing_buffer_mask); -- cgit v1.2.2 From dc4f8845ee2ca39fe054a2d911729ffd269b4b66 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 5 Jan 2010 19:27:51 +0100 Subject: tracing: optimize recordmcount.pl for offsets-handling - move check for open file in front of the writing loop - use perl-constructs to access the array Signed-off-by: Wolfram Sang LKML-Reference: <1262716072-14414-2-git-send-email-w.sang@pengutronix.de> Signed-off-by: Steven Rostedt --- scripts/recordmcount.pl | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index 92f09fe9639..5de12c7b366 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -432,14 +432,14 @@ sub update_funcs # Loop through all the mcount caller offsets and print a reference # to the caller based from the ref_func. - for (my $i=0; $i <= $#offsets; $i++) { - if (!$opened) { - open(FILE, ">$mcount_s") || die "can't create $mcount_s\n"; - $opened = 1; - print FILE "\t.section $mcount_section,\"a\",$section_type\n"; - print FILE "\t.align $alignment\n" if (defined($alignment)); - } - printf FILE "\t%s %s + %d\n", $type, $ref_func, $offsets[$i] - $offset; + if (!$opened) { + open(FILE, ">$mcount_s") || die "can't create $mcount_s\n"; + $opened = 1; + print FILE "\t.section $mcount_section,\"a\",$section_type\n"; + print FILE "\t.align $alignment\n" if (defined($alignment)); + } + foreach my $cur_offset (@offsets) { + printf FILE "\t%s %s + %d\n", $type, $ref_func, $cur_offset - $offset; } } @@ -514,7 +514,7 @@ while () { } # is this a call site to mcount? If so, record it to print later if ($text_found && /$mcount_regex/) { - $offsets[$#offsets + 1] = hex $1; + push(@offsets, hex $1); } } -- cgit v1.2.2 From dfaa9e2c5707b2c217c0121aac796e0fa3051482 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 5 Jan 2010 21:41:22 +0100 Subject: tracing: Use appropriate perl constructs in recordmcount.pl Modified recordmcount.pl to use perl constructs that are still understandable by C hackers that are not perl programmers. Signed-off-by: Wolfram Sang LKML-Reference: <1262724082-9517-1-git-send-email-w.sang@pengutronix.de> Signed-off-by: Steven Rostedt --- scripts/recordmcount.pl | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index 5de12c7b366..545fe7154ce 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -136,13 +136,14 @@ my %text_sections = ( ".text.unlikely" => 1, ); -$objdump = "objdump" if ((length $objdump) == 0); -$objcopy = "objcopy" if ((length $objcopy) == 0); -$cc = "gcc" if ((length $cc) == 0); -$ld = "ld" if ((length $ld) == 0); -$nm = "nm" if ((length $nm) == 0); -$rm = "rm" if ((length $rm) == 0); -$mv = "mv" if ((length $mv) == 0); +# Note: we are nice to C-programmers here, thus we skip the '||='-idiom. +$objdump = 'objdump' if (!$objdump); +$objcopy = 'objcopy' if (!$objcopy); +$cc = 'gcc' if (!$cc); +$ld = 'ld' if (!$ld); +$nm = 'nm' if (!$nm); +$rm = 'rm' if (!$rm); +$mv = 'mv' if (!$mv); #print STDERR "running: $P '$arch' '$objdump' '$objcopy' '$cc' '$ld' " . # "'$nm' '$rm' '$mv' '$inputfile'\n"; @@ -194,12 +195,8 @@ sub check_objcopy } } -if ($arch eq "x86") { - if ($bits == 64) { - $arch = "x86_64"; - } else { - $arch = "i386"; - } +if ($arch eq 'x86') { + $arch = ($bits == 64) ? 'x86_64' : 'i386'; } # @@ -476,11 +473,7 @@ while () { $read_headers = 0; # Only record text sections that we know are safe - if (defined($text_sections{$1})) { - $read_function = 1; - } else { - $read_function = 0; - } + $read_function = defined($text_sections{$1}); # print out any recorded offsets update_funcs(); -- cgit v1.2.2 From d931369b74b3d6f2044f595af6f3dd074f65d9cf Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 6 Jan 2010 17:27:11 -0500 Subject: tracing: Add stack dump to trace_printk if stacktrace option is set If the ftrace stacktrace option is set, then add the stack dumps to trace_printk. Requested-by: Peter Zijlstra Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index abdd333a082..5314c90bbc8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1392,8 +1392,10 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) entry->fmt = fmt; memcpy(entry->buf, trace_buf, sizeof(u32) * len); - if (!filter_check_discard(call, entry, buffer, event)) + if (!filter_check_discard(call, entry, buffer, event)) { ring_buffer_unlock_commit(buffer, event); + ftrace_trace_stack(buffer, flags, 6, pc); + } out_unlock: arch_spin_unlock(&trace_buf_lock); @@ -1466,8 +1468,10 @@ int trace_array_vprintk(struct trace_array *tr, memcpy(&entry->buf, trace_buf, len); entry->buf[len] = '\0'; - if (!filter_check_discard(call, entry, buffer, event)) + if (!filter_check_discard(call, entry, buffer, event)) { ring_buffer_unlock_commit(buffer, event); + ftrace_trace_stack(buffer, irq_flags, 6, pc); + } out_unlock: arch_spin_unlock(&trace_buf_lock); -- cgit v1.2.2 From 24a53652e361321b09df5040711e69387344ce09 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 14 Jan 2010 08:23:15 +0100 Subject: tracing: Drop the tr check from the graph tracing path Each time we save a function entry from the function graph tracer, we check if the trace array is set, which is wasteful because it is set anyway before we start the tracer. All we need is to ensure we have good read and write orderings. When we set the trace array, we just need to guarantee it to be visible before starting tracing. Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Cc: Lai Jiangshan Cc: Paul E. McKenney LKML-Reference: <1263453795-7496-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_functions_graph.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index b1342c5d37c..f2252296607 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -212,9 +212,6 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) int cpu; int pc; - if (unlikely(!tr)) - return 0; - if (!ftrace_trace_task(current)) return 0; @@ -287,11 +284,20 @@ void trace_graph_return(struct ftrace_graph_ret *trace) local_irq_restore(flags); } +void set_graph_array(struct trace_array *tr) +{ + graph_array = tr; + + /* Make graph_array visible before we start tracing */ + + smp_mb(); +} + static int graph_trace_init(struct trace_array *tr) { int ret; - graph_array = tr; + set_graph_array(tr); ret = register_ftrace_graph(&trace_graph_return, &trace_graph_entry); if (ret) @@ -301,11 +307,6 @@ static int graph_trace_init(struct trace_array *tr) return 0; } -void set_graph_array(struct trace_array *tr) -{ - graph_array = tr; -} - static void graph_trace_reset(struct trace_array *tr) { tracing_stop_cmdline_record(); -- cgit v1.2.2 From ea2c68a08fedb5053ba312d661e47df9f4d72411 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 13 Jan 2010 19:38:30 +0800 Subject: tracing: Simplify test for function_graph tracing start point In the function graph tracer, a calling function is to be traced only when it is enabled through the set_graph_function file, or when it is nested in an enabled function. Current code uses TSK_TRACE_FL_GRAPH to test whether it is nested or not. Looking at the code, we can get this: (trace->depth > 0) <==> (TSK_TRACE_FL_GRAPH is set) trace->depth is more explicit to tell that it is nested. So we use trace->depth directly and simplify the code. No functionality is changed. TSK_TRACE_FL_GRAPH is not removed yet, it is left for future usage. Signed-off-by: Lai Jiangshan Cc: Ingo Molnar Cc: Steven Rostedt LKML-Reference: <4B4DB0B6.7040607@cn.fujitsu.com> Signed-off-by: Frederic Weisbecker --- kernel/trace/trace.h | 2 +- kernel/trace/trace_functions_graph.c | 8 ++------ 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 4df6a77eb19..ce077fbbf55 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -504,7 +504,7 @@ static inline int ftrace_graph_addr(unsigned long addr) { int i; - if (!ftrace_graph_count || test_tsk_trace_graph(current)) + if (!ftrace_graph_count) return 1; for (i = 0; i < ftrace_graph_count; i++) { diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index f2252296607..616b135c9eb 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -215,7 +215,8 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) if (!ftrace_trace_task(current)) return 0; - if (!ftrace_graph_addr(trace->func)) + /* trace it when it is-nested-in or is a function enabled. */ + if (!(trace->depth || ftrace_graph_addr(trace->func))) return 0; local_irq_save(flags); @@ -228,9 +229,6 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) } else { ret = 0; } - /* Only do the atomic if it is not already set */ - if (!test_tsk_trace_graph(current)) - set_tsk_trace_graph(current); atomic_dec(&data->disabled); local_irq_restore(flags); @@ -278,8 +276,6 @@ void trace_graph_return(struct ftrace_graph_ret *trace) pc = preempt_count(); __trace_graph_return(tr, trace, flags, pc); } - if (!trace->depth) - clear_tsk_trace_graph(current); atomic_dec(&data->disabled); local_irq_restore(flags); } -- cgit v1.2.2 From ede55c9d78101fef0d8e620940a5163f14b02f29 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 27 Jan 2010 11:25:54 -0500 Subject: tracing: Add correct/incorrect to sort keys for branch annotation output The branch annotation is a bit difficult to see the worst offenders because it only sorts by percentage: correct incorrect % Function File Line ------- --------- - -------- ---- ---- 0 163 100 qdisc_restart sch_generic.c 179 0 163 100 pfifo_fast_dequeue sch_generic.c 447 0 4 100 pskb_trim_rcsum skbuff.h 1689 0 4 100 llc_rcv llc_input.c 170 0 18 100 psmouse_interrupt psmouse-base.c 304 0 3 100 atkbd_interrupt atkbd.c 389 0 5 100 usb_alloc_dev usb.c 437 0 11 100 vsscanf vsprintf.c 1897 0 2 100 IS_ERR err.h 34 0 23 100 __rmqueue_fallback page_alloc.c 865 0 4 100 probe_wakeup_sched_switch trace_sched_wakeup.c 142 0 3 100 move_masked_irq migration.c 11 Adding the incorrect and correct values as sort keys makes this file a bit more informative: correct incorrect % Function File Line ------- --------- - -------- ---- ---- 0 366541 100 audit_syscall_entry auditsc.c 1637 0 366538 100 audit_syscall_exit auditsc.c 1685 0 115839 100 sched_info_switch sched_stats.h 269 0 74567 100 sched_info_queued sched_stats.h 222 0 66578 100 sched_info_dequeued sched_stats.h 177 0 15113 100 trace_workqueue_insertion workqueue.h 38 0 15107 100 trace_workqueue_execution workqueue.h 45 0 3622 100 syscall_trace_leave ptrace.c 1772 0 2750 100 sched_move_task sched.c 10100 0 2750 100 sched_move_task sched.c 10110 0 1815 100 pre_schedule_rt sched_rt.c 1462 0 837 100 audit_alloc auditsc.c 879 0 814 100 tcp_mss_split_point tcp_output.c 1302 Signed-off-by: Steven Rostedt --- kernel/trace/trace_branch.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 4a194f08f88..b9bc4d47017 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -307,8 +307,23 @@ static int annotated_branch_stat_cmp(void *p1, void *p2) return -1; if (percent_a > percent_b) return 1; - else - return 0; + + if (a->incorrect < b->incorrect) + return -1; + if (a->incorrect > b->incorrect) + return 1; + + /* + * Since the above shows worse (incorrect) cases + * first, we continue that by showing best (correct) + * cases last. + */ + if (a->correct > b->correct) + return -1; + if (a->correct < b->correct) + return 1; + + return 0; } static struct tracer_stat annotated_branch_stats = { -- cgit v1.2.2 From c7c6b1fe9f942c1a30585ec2210a09dfff238506 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 10 Feb 2010 15:43:04 +0800 Subject: ftrace: Allow to remove a single function from function graph filter I don't see why we can only clear all functions from the filter. After patching: # echo sys_open > set_graph_function # echo sys_close >> set_graph_function # cat set_graph_function sys_open sys_close # echo '!sys_close' >> set_graph_function # cat set_graph_function sys_open Signed-off-by: Li Zefan LKML-Reference: <4B726388.2000408@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 51 +++++++++++++++++++++++++++++++-------------------- kernel/trace/trace.h | 3 ++- 2 files changed, 33 insertions(+), 21 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7968762c816..43bec993c86 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2426,6 +2426,7 @@ static const struct file_operations ftrace_notrace_fops = { static DEFINE_MUTEX(graph_lock); int ftrace_graph_count; +int ftrace_graph_filter_enabled; unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; static void * @@ -2448,7 +2449,7 @@ static void *g_start(struct seq_file *m, loff_t *pos) mutex_lock(&graph_lock); /* Nothing, tell g_show to print all functions are enabled */ - if (!ftrace_graph_count && !*pos) + if (!ftrace_graph_filter_enabled && !*pos) return (void *)1; return __g_next(m, pos); @@ -2494,6 +2495,7 @@ ftrace_graph_open(struct inode *inode, struct file *file) mutex_lock(&graph_lock); if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { + ftrace_graph_filter_enabled = 0; ftrace_graph_count = 0; memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); } @@ -2519,7 +2521,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) struct dyn_ftrace *rec; struct ftrace_page *pg; int search_len; - int found = 0; + int fail = 1; int type, not; char *search; bool exists; @@ -2530,37 +2532,51 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) /* decode regex */ type = filter_parse_regex(buffer, strlen(buffer), &search, ¬); - if (not) - return -EINVAL; + if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS) + return -EBUSY; search_len = strlen(search); mutex_lock(&ftrace_lock); do_for_each_ftrace_rec(pg, rec) { - if (*idx >= FTRACE_GRAPH_MAX_FUNCS) - break; - if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) continue; if (ftrace_match_record(rec, search, search_len, type)) { - /* ensure it is not already in the array */ + /* if it is in the array */ exists = false; - for (i = 0; i < *idx; i++) + for (i = 0; i < *idx; i++) { if (array[i] == rec->ip) { exists = true; break; } - if (!exists) - array[(*idx)++] = rec->ip; - found = 1; + } + + if (!not) { + fail = 0; + if (!exists) { + array[(*idx)++] = rec->ip; + if (*idx >= FTRACE_GRAPH_MAX_FUNCS) + goto out; + } + } else { + if (exists) { + array[i] = array[--(*idx)]; + array[*idx] = 0; + fail = 0; + } + } } } while_for_each_ftrace_rec(); - +out: mutex_unlock(&ftrace_lock); - return found ? 0 : -EINVAL; + if (fail) + return -EINVAL; + + ftrace_graph_filter_enabled = 1; + return 0; } static ssize_t @@ -2570,16 +2586,11 @@ ftrace_graph_write(struct file *file, const char __user *ubuf, struct trace_parser parser; ssize_t read, ret; - if (!cnt || cnt < 0) + if (!cnt) return 0; mutex_lock(&graph_lock); - if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) { - ret = -EBUSY; - goto out_unlock; - } - if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) { ret = -ENOMEM; goto out_unlock; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index ce077fbbf55..b477fce41ed 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -497,6 +497,7 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s); #ifdef CONFIG_DYNAMIC_FTRACE /* TODO: make this variable */ #define FTRACE_GRAPH_MAX_FUNCS 32 +extern int ftrace_graph_filter_enabled; extern int ftrace_graph_count; extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS]; @@ -504,7 +505,7 @@ static inline int ftrace_graph_addr(unsigned long addr) { int i; - if (!ftrace_graph_count) + if (!ftrace_graph_filter_enabled) return 1; for (i = 0; i < ftrace_graph_count; i++) { -- cgit v1.2.2 From 83f0d53993b2967e54186468b0fc4321447f68f1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 16 Feb 2010 10:38:47 -0500 Subject: tracing: Add notrace to TRACE_EVENT implementation functions The functions used to implement the TRACE_EVENT macro show up in function tracing. This is considered a distraction, and these should not be displayed. For example: -0 [000] 57.202149: task_of <-update_stats_wait_end -0 [000] 57.202149: ftrace_raw_event_sched_stat_wait <-update_stats_wait_end -0 [000] 57.202150: ftrace_raw_event_id_sched_stat_template <-ftrace_raw_event_sched_stat_wait -0 [000] 57.202150: sched_stat_wait: comm=sshd pid=2735 delay=19207 [ns] The "ftrace_raw_event_*" traces are just the utility functions used by TRACE_EVENT tracepoints. Cc: Thomas Gleixner Requested-by: Peter Zijlstra Signed-off-by: Steven Rostedt --- include/trace/ftrace.h | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index df65b99880b..09fd9afc085 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -199,7 +199,7 @@ #undef DECLARE_EVENT_CLASS #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ -static enum print_line_t \ +static notrace enum print_line_t \ ftrace_raw_output_id_##call(int event_id, const char *name, \ struct trace_iterator *iter, int flags) \ { \ @@ -232,7 +232,7 @@ ftrace_raw_output_id_##call(int event_id, const char *name, \ #undef DEFINE_EVENT #define DEFINE_EVENT(template, name, proto, args) \ -static enum print_line_t \ +static notrace enum print_line_t \ ftrace_raw_output_##name(struct trace_iterator *iter, int flags) \ { \ return ftrace_raw_output_id_##template(event_##name.id, \ @@ -241,7 +241,7 @@ ftrace_raw_output_##name(struct trace_iterator *iter, int flags) \ #undef DEFINE_EVENT_PRINT #define DEFINE_EVENT_PRINT(template, call, proto, args, print) \ -static enum print_line_t \ +static notrace enum print_line_t \ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ { \ struct trace_seq *s = &iter->seq; \ @@ -307,7 +307,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ #undef DECLARE_EVENT_CLASS #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, func, print) \ -static int \ +static int notrace \ ftrace_define_fields_##call(struct ftrace_event_call *event_call) \ { \ struct ftrace_raw_##call field; \ @@ -355,7 +355,7 @@ ftrace_define_fields_##call(struct ftrace_event_call *event_call) \ #undef DECLARE_EVENT_CLASS #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ -static inline int ftrace_get_offsets_##call( \ +static inline notrace int ftrace_get_offsets_##call( \ struct ftrace_data_offsets_##call *__data_offsets, proto) \ { \ int __data_size = 0; \ @@ -402,12 +402,14 @@ static inline int ftrace_get_offsets_##call( \ \ static void ftrace_profile_##name(proto); \ \ -static int ftrace_profile_enable_##name(struct ftrace_event_call *unused)\ +static notrace int \ +ftrace_profile_enable_##name(struct ftrace_event_call *unused) \ { \ return register_trace_##name(ftrace_profile_##name); \ } \ \ -static void ftrace_profile_disable_##name(struct ftrace_event_call *unused)\ +static notrace void \ +ftrace_profile_disable_##name(struct ftrace_event_call *unused) \ { \ unregister_trace_##name(ftrace_profile_##name); \ } @@ -541,7 +543,8 @@ static void ftrace_profile_disable_##name(struct ftrace_event_call *unused)\ #undef DECLARE_EVENT_CLASS #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ \ -static void ftrace_raw_event_id_##call(struct ftrace_event_call *event_call, \ +static notrace void \ +ftrace_raw_event_id_##call(struct ftrace_event_call *event_call, \ proto) \ { \ struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\ @@ -578,17 +581,19 @@ static void ftrace_raw_event_id_##call(struct ftrace_event_call *event_call, \ #undef DEFINE_EVENT #define DEFINE_EVENT(template, call, proto, args) \ \ -static void ftrace_raw_event_##call(proto) \ +static notrace void ftrace_raw_event_##call(proto) \ { \ ftrace_raw_event_id_##template(&event_##call, args); \ } \ \ -static int ftrace_raw_reg_event_##call(struct ftrace_event_call *unused)\ +static notrace int \ +ftrace_raw_reg_event_##call(struct ftrace_event_call *unused) \ { \ return register_trace_##call(ftrace_raw_event_##call); \ } \ \ -static void ftrace_raw_unreg_event_##call(struct ftrace_event_call *unused)\ +static notrace void \ +ftrace_raw_unreg_event_##call(struct ftrace_event_call *unused) \ { \ unregister_trace_##call(ftrace_raw_event_##call); \ } \ @@ -750,7 +755,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #undef DECLARE_EVENT_CLASS #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ -static void \ +static notrace void \ ftrace_profile_templ_##call(struct ftrace_event_call *event_call, \ proto) \ { \ @@ -820,7 +825,7 @@ end_recursion: \ #undef DEFINE_EVENT #define DEFINE_EVENT(template, call, proto, args) \ -static void ftrace_profile_##call(proto) \ +static notrace void ftrace_profile_##call(proto) \ { \ struct ftrace_event_call *event_call = &event_##call; \ \ -- cgit v1.2.2 From e7b8e675d9c71b868b66f62f725a948047514719 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Tue, 26 Jan 2010 04:40:03 -0500 Subject: tracing: Unify arch_syscall_addr() implementations Most implementations of arch_syscall_addr() are the same, so create a default version in common code and move the one piece that differs (the syscall table) to asm/syscall.h. New arch ports don't have to waste time copying & pasting this simple function. The s390/sparc versions need to be different, so document why. Signed-off-by: Mike Frysinger Acked-by: David S. Miller Acked-by: Paul Mundt Acked-by: Heiko Carstens Cc: Steven Rostedt LKML-Reference: <1264498803-17278-1-git-send-email-vapier@gentoo.org> Signed-off-by: Frederic Weisbecker --- Documentation/trace/ftrace-design.txt | 5 ++--- arch/s390/include/asm/syscall.h | 7 +++++++ arch/s390/kernel/ftrace.c | 10 ---------- arch/sh/include/asm/syscall.h | 2 ++ arch/sh/kernel/ftrace.c | 9 --------- arch/sparc/include/asm/syscall.h | 7 +++++++ arch/sparc/kernel/ftrace.c | 11 ----------- arch/x86/include/asm/syscall.h | 2 ++ arch/x86/kernel/ftrace.c | 10 ---------- include/linux/ftrace.h | 6 ++++++ kernel/trace/trace_syscalls.c | 5 +++++ 11 files changed, 31 insertions(+), 43 deletions(-) diff --git a/Documentation/trace/ftrace-design.txt b/Documentation/trace/ftrace-design.txt index 239f14b2b55..99df1101d2a 100644 --- a/Documentation/trace/ftrace-design.txt +++ b/Documentation/trace/ftrace-design.txt @@ -218,11 +218,10 @@ HAVE_SYSCALL_TRACEPOINTS You need very few things to get the syscalls tracing in an arch. +- Support HAVE_ARCH_TRACEHOOK (see arch/Kconfig). - Have a NR_syscalls variable in that provides the number of syscalls supported by the arch. -- Implement arch_syscall_addr() that resolves a syscall address from a - syscall number. -- Support the TIF_SYSCALL_TRACEPOINT thread flags +- Support the TIF_SYSCALL_TRACEPOINT thread flags. - Put the trace_sys_enter() and trace_sys_exit() tracepoints calls from ptrace in the ptrace syscalls tracing path. - Tag this arch as HAVE_SYSCALL_TRACEPOINTS. diff --git a/arch/s390/include/asm/syscall.h b/arch/s390/include/asm/syscall.h index e0a73d3eb83..8429686951f 100644 --- a/arch/s390/include/asm/syscall.h +++ b/arch/s390/include/asm/syscall.h @@ -15,6 +15,13 @@ #include #include +/* + * The syscall table always contains 32 bit pointers since we know that the + * address of the function to be called is (way) below 4GB. So the "int" + * type here is what we want [need] for both 32 bit and 64 bit systems. + */ +extern const unsigned int sys_call_table[]; + static inline long syscall_get_nr(struct task_struct *task, struct pt_regs *regs) { diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index 5a82bc68193..9e69449e77a 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -200,13 +200,3 @@ out: return parent; } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ - -#ifdef CONFIG_FTRACE_SYSCALLS - -extern unsigned int sys_call_table[]; - -unsigned long __init arch_syscall_addr(int nr) -{ - return (unsigned long)sys_call_table[nr]; -} -#endif diff --git a/arch/sh/include/asm/syscall.h b/arch/sh/include/asm/syscall.h index 6a381429ee9..aa7777bdc37 100644 --- a/arch/sh/include/asm/syscall.h +++ b/arch/sh/include/asm/syscall.h @@ -1,6 +1,8 @@ #ifndef __ASM_SH_SYSCALL_H #define __ASM_SH_SYSCALL_H +extern const unsigned long sys_call_table[]; + #ifdef CONFIG_SUPERH32 # include "syscall_32.h" #else diff --git a/arch/sh/kernel/ftrace.c b/arch/sh/kernel/ftrace.c index a48cdedc73b..30e13196d35 100644 --- a/arch/sh/kernel/ftrace.c +++ b/arch/sh/kernel/ftrace.c @@ -399,12 +399,3 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) } } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ - -#ifdef CONFIG_FTRACE_SYSCALLS -extern unsigned long *sys_call_table; - -unsigned long __init arch_syscall_addr(int nr) -{ - return (unsigned long)sys_call_table[nr]; -} -#endif /* CONFIG_FTRACE_SYSCALLS */ diff --git a/arch/sparc/include/asm/syscall.h b/arch/sparc/include/asm/syscall.h index 7486c605e23..025a02ad2e3 100644 --- a/arch/sparc/include/asm/syscall.h +++ b/arch/sparc/include/asm/syscall.h @@ -5,6 +5,13 @@ #include #include +/* + * The syscall table always contains 32 bit pointers since we know that the + * address of the function to be called is (way) below 4GB. So the "int" + * type here is what we want [need] for both 32 bit and 64 bit systems. + */ +extern const unsigned int sys_call_table[]; + /* The system call number is given by the user in %g1 */ static inline long syscall_get_nr(struct task_struct *task, struct pt_regs *regs) diff --git a/arch/sparc/kernel/ftrace.c b/arch/sparc/kernel/ftrace.c index 29973daa993..9103a56b39e 100644 --- a/arch/sparc/kernel/ftrace.c +++ b/arch/sparc/kernel/ftrace.c @@ -91,14 +91,3 @@ int __init ftrace_dyn_arch_init(void *data) return 0; } #endif - -#ifdef CONFIG_FTRACE_SYSCALLS - -extern unsigned int sys_call_table[]; - -unsigned long __init arch_syscall_addr(int nr) -{ - return (unsigned long)sys_call_table[nr]; -} - -#endif diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index 8d33bc5462d..c4a348f7bd4 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -16,6 +16,8 @@ #include #include +extern const unsigned long sys_call_table[]; + /* * Only the low 32 bits of orig_ax are meaningful, so we return int. * This importantly ignores the high bits on 64-bit, so comparisons diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 30968924543..0d93a941934 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -484,13 +484,3 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, } } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ - -#ifdef CONFIG_FTRACE_SYSCALLS - -extern unsigned long *sys_call_table; - -unsigned long __init arch_syscall_addr(int nr) -{ - return (unsigned long)(&sys_call_table)[nr]; -} -#endif diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 0b4f97d24d7..1cbb36f2759 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -511,4 +511,10 @@ static inline void trace_hw_branch_oops(void) {} #endif /* CONFIG_HW_BRANCH_TRACER */ +#ifdef CONFIG_FTRACE_SYSCALLS + +unsigned long arch_syscall_addr(int nr); + +#endif /* CONFIG_FTRACE_SYSCALLS */ + #endif /* _LINUX_FTRACE_H */ diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 49cea70fbf6..ecf00782b46 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -394,6 +394,11 @@ int init_syscall_trace(struct ftrace_event_call *call) return id; } +unsigned long __init arch_syscall_addr(int nr) +{ + return (unsigned long)sys_call_table[nr]; +} + int __init init_ftrace_syscalls(void) { struct syscall_metadata *meta; -- cgit v1.2.2 From f850c30c8b426ba1688cb63b1a3e534eed03a138 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 10 Feb 2010 17:25:17 +0100 Subject: tracing/kprobes: Make Kconfig dependencies generic KPROBES_EVENT actually depends on the regs and stack access API (b1cf540f) and not on x86. So introduce a new config option which architectures can select if they have the API implemented and switch x86. Signed-off-by: Heiko Carstens Acked-by: Masami Hiramatsu Cc: Ingo Molnar Cc: Martin Schwidefsky LKML-Reference: <20100210162517.GB6933@osiris.boeblingen.de.ibm.com> Signed-off-by: Frederic Weisbecker --- arch/Kconfig | 3 +++ arch/x86/Kconfig | 1 + kernel/trace/Kconfig | 2 +- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/Kconfig b/arch/Kconfig index 9d055b4f058..04e3aa77da2 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -121,6 +121,9 @@ config HAVE_DMA_ATTRS config USE_GENERIC_SMP_HELPERS bool +config HAVE_REGS_AND_STACK_ACCESS_API + bool + config HAVE_CLK bool help diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 55298e89157..07baa12929b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -45,6 +45,7 @@ config X86 select HAVE_GENERIC_DMA_COHERENT if X86_32 select HAVE_EFFICIENT_UNALIGNED_ACCESS select USER_STACKTRACE_SUPPORT + select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_DMA_API_DEBUG select HAVE_KERNEL_GZIP select HAVE_KERNEL_BZIP2 diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 6c22d8a2f28..40fef552f01 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -451,7 +451,7 @@ config BLK_DEV_IO_TRACE config KPROBE_EVENT depends on KPROBES - depends on X86 + depends on HAVE_REGS_AND_STACK_ACCESS_API bool "Enable kprobes-based dynamic events" select TRACING default y -- cgit v1.2.2 From 952974ac61f686896bd4134dae106a886a5589f1 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 12 Feb 2010 13:38:40 +0100 Subject: s390: Add pt_regs register and stack access API This API is needed for the kprobe-based event tracer. Signed-off-by: Heiko Carstens Reviewed-by: Masami Hiramatsu Cc: Martin Schwidefsky LKML-Reference: <20100212123840.GB27548@osiris.boeblingen.de.ibm.com> Signed-off-by: Frederic Weisbecker --- arch/s390/Kconfig | 1 + arch/s390/include/asm/ptrace.h | 13 +++++++++- arch/s390/kernel/ptrace.c | 58 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 71 insertions(+), 1 deletion(-) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index c80235206c0..2590ce20157 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -87,6 +87,7 @@ config S390 select HAVE_SYSCALL_TRACEPOINTS select HAVE_DYNAMIC_FTRACE select HAVE_FUNCTION_GRAPH_TRACER + select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_DEFAULT_NO_SPIN_MUTEXES select HAVE_OPROFILE select HAVE_KPROBES diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h index 95dcf183a28..dd2d913afca 100644 --- a/arch/s390/include/asm/ptrace.h +++ b/arch/s390/include/asm/ptrace.h @@ -492,13 +492,24 @@ struct user_regs_struct struct task_struct; extern void user_enable_single_step(struct task_struct *); extern void user_disable_single_step(struct task_struct *); +extern void show_regs(struct pt_regs * regs); #define user_mode(regs) (((regs)->psw.mask & PSW_MASK_PSTATE) != 0) #define instruction_pointer(regs) ((regs)->psw.addr & PSW_ADDR_INSN) #define user_stack_pointer(regs)((regs)->gprs[15]) #define regs_return_value(regs)((regs)->gprs[2]) #define profile_pc(regs) instruction_pointer(regs) -extern void show_regs(struct pt_regs * regs); + +int regs_query_register_offset(const char *name); +const char *regs_query_register_name(unsigned int offset); +unsigned long regs_get_register(struct pt_regs *regs, unsigned int offset); +unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n); + +static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) +{ + return regs->gprs[15] & PSW_ADDR_INSN; +} + #endif /* __KERNEL__ */ #endif /* __ASSEMBLY__ */ diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index 13815d39f7d..1720f380add 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -984,3 +984,61 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task) #endif return &user_s390_view; } + +static const char *gpr_names[NUM_GPRS] = { + "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", +}; + +unsigned long regs_get_register(struct pt_regs *regs, unsigned int offset) +{ + if (offset >= NUM_GPRS) + return 0; + return regs->gprs[offset]; +} + +int regs_query_register_offset(const char *name) +{ + unsigned long offset; + + if (!name || *name != 'r') + return -EINVAL; + if (strict_strtoul(name + 1, 10, &offset)) + return -EINVAL; + if (offset >= NUM_GPRS) + return -EINVAL; + return offset; +} + +const char *regs_query_register_name(unsigned int offset) +{ + if (offset >= NUM_GPRS) + return NULL; + return gpr_names[offset]; +} + +static int regs_within_kernel_stack(struct pt_regs *regs, unsigned long addr) +{ + unsigned long ksp = kernel_stack_pointer(regs); + + return (addr & ~(THREAD_SIZE - 1)) == (ksp & ~(THREAD_SIZE - 1)); +} + +/** + * regs_get_kernel_stack_nth() - get Nth entry of the stack + * @regs:pt_regs which contains kernel stack pointer. + * @n:stack entry number. + * + * regs_get_kernel_stack_nth() returns @n th entry of the kernel stack which + * is specifined by @regs. If the @n th entry is NOT in the kernel stack, + * this returns 0. + */ +unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n) +{ + unsigned long addr; + + addr = kernel_stack_pointer(regs) + n * sizeof(long); + if (!regs_within_kernel_stack(regs, addr)) + return 0; + return *(unsigned long *)addr; +} -- cgit v1.2.2 From e01292b1fd68ff2abe234d584b06e64344d2c1de Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 18 Feb 2010 14:25:21 +0100 Subject: tracing/kprobes: Add short documentation for HAVE_REGS_AND_STACK_ACCESS_API So that arch developers know how to implement it without the need to dig into changelogs. Reported-by: Mike Frysinger Signed-off-by: Heiko Carstens Acked-by: Masami Hiramatsu Cc: Martin Schwidefsky Cc: "David S . Miller" Cc: Paul Mundt Cc: Steven Rostedt LKML-Reference: <20100218132521.GB2406@osiris.boeblingen.de.ibm.com> [added reference to ptrace.h in the config help] Signed-off-by: Frederic Weisbecker --- arch/Kconfig | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/Kconfig b/arch/Kconfig index 04e3aa77da2..50877ef2584 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -123,6 +123,11 @@ config USE_GENERIC_SMP_HELPERS config HAVE_REGS_AND_STACK_ACCESS_API bool + help + This symbol should be selected by an architecure if it supports + the API needed to access registers and stack entries from pt_regs, + declared in asm/ptrace.h + For example the kprobes-based event tracer needs this API. config HAVE_CLK bool -- cgit v1.2.2 From 0c54dd341fb701928b8e5dca91ced1870c55b05b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 25 Feb 2010 08:42:06 -0500 Subject: ftrace: Remove memory barriers from NMI code when not needed The code in stop_machine that modifies the kernel text has a bit of logic to handle the case of NMIs. stop_machine does not prevent NMIs from executing, and if an NMI were to trigger on another CPU as the modifying CPU is changing the NMI text, a GPF could result. To prevent the GPF, the NMI calls ftrace_nmi_enter() which may modify the code first, then any other NMIs will just change the text to the same content which will do no harm. The code that stop_machine called must wait for NMIs to finish while it changes each location in the kernel. That code may also change the text to what the NMI changed it to. The key is that the text will never change content while another CPU is executing it. To make the above work, the call to ftrace_nmi_enter() must also do a smp_mb() as well as atomic_inc(). But for applications like perf that require a high number of NMIs for profiling, this can have a dramatic effect on the system. Not only is it doing a full memory barrier on both nmi_enter() as well as nmi_exit() it is also modifying a global variable with an atomic operation. This kills performance on large SMP machines. Since the memory barriers are only needed when ftrace is in the process of modifying the text (which is seldom), this patch adds a "modifying_code" variable that gets set before stop machine is executed and cleared afterwards. The NMIs will check this variable and store it in a per CPU "save_modifying_code" variable that it will use to check if it needs to do the memory barriers and atomic dec on NMI exit. Acked-by: Peter Zijlstra Signed-off-by: Steven Rostedt --- arch/x86/kernel/ftrace.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 30968924543..605ef196fdd 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -30,14 +30,32 @@ #ifdef CONFIG_DYNAMIC_FTRACE +/* + * modifying_code is set to notify NMIs that they need to use + * memory barriers when entering or exiting. But we don't want + * to burden NMIs with unnecessary memory barriers when code + * modification is not being done (which is most of the time). + * + * A mutex is already held when ftrace_arch_code_modify_prepare + * and post_process are called. No locks need to be taken here. + * + * Stop machine will make sure currently running NMIs are done + * and new NMIs will see the updated variable before we need + * to worry about NMIs doing memory barriers. + */ +static int modifying_code __read_mostly; +static DEFINE_PER_CPU(int, save_modifying_code); + int ftrace_arch_code_modify_prepare(void) { set_kernel_text_rw(); + modifying_code = 1; return 0; } int ftrace_arch_code_modify_post_process(void) { + modifying_code = 0; set_kernel_text_ro(); return 0; } @@ -149,6 +167,11 @@ static void ftrace_mod_code(void) void ftrace_nmi_enter(void) { + __get_cpu_var(save_modifying_code) = modifying_code; + + if (!__get_cpu_var(save_modifying_code)) + return; + if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) { smp_rmb(); ftrace_mod_code(); @@ -160,6 +183,9 @@ void ftrace_nmi_enter(void) void ftrace_nmi_exit(void) { + if (!__get_cpu_var(save_modifying_code)) + return; + /* Finish all executions before clearing nmi_running */ smp_mb(); atomic_dec(&nmi_running); -- cgit v1.2.2 From 86c38a31aa7f2dd6e74a262710bf8ebf7455acc5 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Wed, 24 Feb 2010 13:59:23 -0500 Subject: tracing: Fix ftrace_event_call alignment for use with gcc 4.5 GCC 4.5 introduces behavior that forces the alignment of structures to use the largest possible value. The default value is 32 bytes, so if some structures are defined with a 4-byte alignment and others aren't declared with an alignment constraint at all - it will align at 32-bytes. For things like the ftrace events, this results in a non-standard array. When initializing the ftrace subsystem, we traverse the _ftrace_events section and call the initialization callback for each event. When the structures are misaligned, we could be treating another part of the structure (or the zeroed out space between them) as a function pointer. This patch forces the alignment for all the ftrace_event_call structures to 4 bytes. Without this patch, the kernel fails to boot very early when built with gcc 4.5. It's trivial to check the alignment of the members of the array, so it might be worthwhile to add something to the build system to do that automatically. Unfortunately, that only covers this case. I've asked one of the gcc developers about adding a warning when this condition is seen. Cc: stable@kernel.org Signed-off-by: Jeff Mahoney LKML-Reference: <4B85770B.6010901@suse.com> Signed-off-by: Steven Rostedt --- include/linux/syscalls.h | 6 ++++-- include/trace/ftrace.h | 3 ++- kernel/trace/trace.h | 3 ++- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 7b219696ad2..91bd7d78a07 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -132,7 +132,8 @@ struct perf_event_attr; #define SYSCALL_TRACE_ENTER_EVENT(sname) \ static const struct syscall_metadata __syscall_meta_##sname; \ - static struct ftrace_event_call event_enter_##sname; \ + static struct ftrace_event_call \ + __attribute__((__aligned__(4))) event_enter_##sname; \ static struct trace_event enter_syscall_print_##sname = { \ .trace = print_syscall_enter, \ }; \ @@ -153,7 +154,8 @@ struct perf_event_attr; #define SYSCALL_TRACE_EXIT_EVENT(sname) \ static const struct syscall_metadata __syscall_meta_##sname; \ - static struct ftrace_event_call event_exit_##sname; \ + static struct ftrace_event_call \ + __attribute__((__aligned__(4))) event_exit_##sname; \ static struct trace_event exit_syscall_print_##sname = { \ .trace = print_syscall_exit, \ }; \ diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 09fd9afc085..f23a0ca6910 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -65,7 +65,8 @@ }; #undef DEFINE_EVENT #define DEFINE_EVENT(template, name, proto, args) \ - static struct ftrace_event_call event_##name + static struct ftrace_event_call \ + __attribute__((__aligned__(4))) event_##name #undef DEFINE_EVENT_PRINT #define DEFINE_EVENT_PRINT(template, name, proto, args, print) \ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b477fce41ed..fd05bcaf91b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -792,7 +792,8 @@ extern const char *__stop___trace_bprintk_fmt[]; #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ - extern struct ftrace_event_call event_##call; + extern struct ftrace_event_call \ + __attribute__((__aligned__(4))) event_##call; #undef FTRACE_ENTRY_DUP #define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \ FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) -- cgit v1.2.2 From 1ab83a89411556c4d5914dcf0d5da190178ae7db Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 24 Feb 2010 15:28:14 +0800 Subject: tracing: Remove CONFIG_TRACE_POWER from kernel config The power tracer has been converted to power trace events. Acked-by: Frederic Weisbecker Signed-off-by: Li Zefan LKML-Reference: <4B84D50E.4070806@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 9 --------- 1 file changed, 9 deletions(-) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 6c22d8a2f28..ca2d3a8778b 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -330,15 +330,6 @@ config BRANCH_TRACER Say N if unsure. -config POWER_TRACER - bool "Trace power consumption behavior" - depends on X86 - select GENERIC_TRACER - help - This tracer helps developers to analyze and optimize the kernel's - power management decisions, specifically the C-state and P-state - behavior. - config KSYM_TRACER bool "Trace read and write access on kernel memory locations" depends on HAVE_HW_BREAKPOINT -- cgit v1.2.2 From 6574658b3bc7c408581629de5efb809f125cce8c Mon Sep 17 00:00:00 2001 From: Wenji Huang Date: Wed, 24 Feb 2010 15:40:22 +0800 Subject: tracing: Fix typo in prof_sysexit_enable() Signed-off-by: Wenji Huang LKML-Reference: <1266997226-6833-1-git-send-email-wenji.huang@oracle.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_syscalls.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 49cea70fbf6..8cdda95da81 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -603,7 +603,7 @@ int prof_sysexit_enable(struct ftrace_event_call *call) ret = register_trace_sys_exit(prof_syscall_exit); if (ret) { pr_info("event trace: Could not activate" - "syscall entry trace point"); + "syscall exit trace point"); } else { set_bit(num, enabled_prof_exit_syscalls); sys_prof_refcount_exit++; -- cgit v1.2.2 From a5efd925115cbc1f90195dca9a25f7b8daa10c37 Mon Sep 17 00:00:00 2001 From: Wenji Huang Date: Wed, 24 Feb 2010 15:40:23 +0800 Subject: tracing: Fix typo of info text in trace_kprobe.c Signed-off-by: Wenji Huang LKML-Reference: <1266997226-6833-2-git-send-email-wenji.huang@oracle.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index c99029916c7..8d4bd16d6f7 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -651,12 +651,12 @@ static int create_trace_probe(int argc, char **argv) event = strchr(group, '/') + 1; event[-1] = '\0'; if (strlen(group) == 0) { - pr_info("Group name is not specifiled\n"); + pr_info("Group name is not specified\n"); return -EINVAL; } } if (strlen(event) == 0) { - pr_info("Event name is not specifiled\n"); + pr_info("Event name is not specified\n"); return -EINVAL; } } -- cgit v1.2.2 From c85f3a91f84d5a85f179c2504bb7a39370c82b41 Mon Sep 17 00:00:00 2001 From: Wenji Huang Date: Wed, 24 Feb 2010 15:40:24 +0800 Subject: tracing: Remove unnecessary variable in print_graph_return The "cpu" variable is declared at the start of the function and also within a branch, with the exact same initialization. Remove the local variable of the same name in the branch. Signed-off-by: Wenji Huang LKML-Reference: <1266997226-6833-3-git-send-email-wenji.huang@oracle.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 616b135c9eb..112561df2a0 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -855,7 +855,6 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, int i; if (data) { - int cpu = iter->cpu; int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); /* -- cgit v1.2.2 From 7b60997f73865b019e595720185c85285ca3df9a Mon Sep 17 00:00:00 2001 From: Wenji Huang Date: Wed, 24 Feb 2010 15:40:26 +0800 Subject: tracing: Simplify memory recycle of trace_define_field Discard freeing field->type since it is not necessary. Reviewed-by: Li Zefan Signed-off-by: Wenji Huang LKML-Reference: <1266997226-6833-5-git-send-email-wenji.huang@oracle.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index c2a3077b735..3f972ad98d0 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -60,10 +60,8 @@ int trace_define_field(struct ftrace_event_call *call, const char *type, return 0; err: - if (field) { + if (field) kfree(field->name); - kfree(field->type); - } kfree(field); return -ENOMEM; -- cgit v1.2.2 From f1c7f517a5dc23bce07efa5ed55e2c074ed9d4ba Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 26 Feb 2010 17:08:16 -0500 Subject: ftrace: Add function names to dangling } in function graph tracer The function graph tracer is currently the most invasive tracer in the ftrace family. It can easily overflow the buffer even with 10megs per CPU. This means that events can often be lost. On start up, or after events are lost, if the function return is recorded but the function enter was lost, all we get to see is the exiting '}'. Here is how a typical trace output starts: [tracing] cat trace # tracer: function_graph # # CPU DURATION FUNCTION CALLS # | | | | | | | 0) + 91.897 us | } 0) ! 567.961 us | } 0) <========== | 0) ! 579.083 us | _raw_spin_lock_irqsave(); 0) 4.694 us | _raw_spin_unlock_irqrestore(); 0) ! 594.862 us | } 0) ! 603.361 us | } 0) ! 613.574 us | } 0) ! 623.554 us | } 0) 3.653 us | fget_light(); 0) | sock_poll() { There are a series of '}' with no matching "func() {". There's no information to what functions these ending brackets belong to. This patch adds a stack on the per cpu structure used in outputting the function graph tracer to keep track of what function was outputted. Then on a function exit event, it checks the depth to see if the function exit has a matching entry event. If it does, then it only prints the '}', otherwise it adds the function name after the '}'. This allows function exit events to show what function they belong to at trace output startup, when the entry was lost due to ring buffer overflow, or even after a new task is scheduled in. Here is what the above trace will look like after this patch: [tracing] cat trace # tracer: function_graph # # CPU DURATION FUNCTION CALLS # | | | | | | | 0) + 91.897 us | } (irq_exit) 0) ! 567.961 us | } (smp_apic_timer_interrupt) 0) <========== | 0) ! 579.083 us | _raw_spin_lock_irqsave(); 0) 4.694 us | _raw_spin_unlock_irqrestore(); 0) ! 594.862 us | } (add_wait_queue) 0) ! 603.361 us | } (__pollwait) 0) ! 613.574 us | } (tcp_poll) 0) ! 623.554 us | } (sock_poll) 0) 3.653 us | fget_light(); 0) | sock_poll() { Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 52 +++++++++++++++++++++++++++++------- 1 file changed, 43 insertions(+), 9 deletions(-) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 112561df2a0..e998a824e9d 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -18,6 +18,7 @@ struct fgraph_cpu_data { pid_t last_pid; int depth; int ignore; + unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH]; }; struct fgraph_data { @@ -670,15 +671,21 @@ print_graph_entry_leaf(struct trace_iterator *iter, duration = graph_ret->rettime - graph_ret->calltime; if (data) { + struct fgraph_cpu_data *cpu_data; int cpu = iter->cpu; - int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); + + cpu_data = per_cpu_ptr(data->cpu_data, cpu); /* * Comments display at + 1 to depth. Since * this is a leaf function, keep the comments * equal to this depth. */ - *depth = call->depth - 1; + cpu_data->depth = call->depth - 1; + + /* No need to keep this function around for this depth */ + if (call->depth < FTRACE_RETFUNC_DEPTH) + cpu_data->enter_funcs[call->depth] = 0; } /* Overhead */ @@ -718,10 +725,15 @@ print_graph_entry_nested(struct trace_iterator *iter, int i; if (data) { + struct fgraph_cpu_data *cpu_data; int cpu = iter->cpu; - int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); - *depth = call->depth; + cpu_data = per_cpu_ptr(data->cpu_data, cpu); + cpu_data->depth = call->depth; + + /* Save this function pointer to see if the exit matches */ + if (call->depth < FTRACE_RETFUNC_DEPTH) + cpu_data->enter_funcs[call->depth] = call->func; } /* No overhead */ @@ -851,18 +863,28 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, struct fgraph_data *data = iter->private; pid_t pid = ent->pid; int cpu = iter->cpu; + int func_match = 1; int ret; int i; if (data) { - int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); + struct fgraph_cpu_data *cpu_data; + int cpu = iter->cpu; + + cpu_data = per_cpu_ptr(data->cpu_data, cpu); /* * Comments display at + 1 to depth. This is the * return from a function, we now want the comments * to display at the same level of the bracket. */ - *depth = trace->depth - 1; + cpu_data->depth = trace->depth - 1; + + if (trace->depth < FTRACE_RETFUNC_DEPTH) { + if (cpu_data->enter_funcs[trace->depth] != trace->func) + func_match = 0; + cpu_data->enter_funcs[trace->depth] = 0; + } } if (print_graph_prologue(iter, s, 0, 0)) @@ -887,9 +909,21 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, return TRACE_TYPE_PARTIAL_LINE; } - ret = trace_seq_printf(s, "}\n"); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + /* + * If the return function does not have a matching entry, + * then the entry was lost. Instead of just printing + * the '}' and letting the user guess what function this + * belongs to, write out the function name. + */ + if (func_match) { + ret = trace_seq_printf(s, "}\n"); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } else { + ret = trace_seq_printf(s, "} (%ps)\n", (void *)trace->func); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } /* Overrun */ if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) { -- cgit v1.2.2