From bc44fb5f7d3e764ed7698c835a1a0f35aba2eb3d Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 13 Mar 2009 10:42:18 +0100 Subject: x86, bts: detect size of DS fields Impact: more robust DS feature enumeration Detect the size of the pointer-type fields in the DS area configuration via the DTES64 features rather than based on the cpuid. Rename a variable to denote that size to reflect that it only covers the pointer-type fields. Add more boot-time diagnostics giving the detected size and the sizes of BTS and PEBS records. Use the size of the BTS/PEBS record to indicate that the respective feature is not available (if the record size is zero). Signed-off-by: Markus Metzger LKML-Reference: <20090313104218.A30096@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 84 +++++++++++++++++++++++++++------------------------- 1 file changed, 44 insertions(+), 40 deletions(-) diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 87b67e3a765a..6e5ec679a0cd 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -39,7 +39,7 @@ struct ds_configuration { /* the size of one pointer-typed field in the DS structure and in the BTS and PEBS buffers in bytes; this covers the first 8 DS fields related to buffer management. */ - unsigned char sizeof_field; + unsigned char sizeof_ptr_field; /* the size of a BTS/PEBS record in bytes */ unsigned char sizeof_rec[2]; /* a series of bit-masks to control various features indexed @@ -142,14 +142,14 @@ enum ds_qualifier { static inline unsigned long ds_get(const unsigned char *base, enum ds_qualifier qual, enum ds_field field) { - base += (ds_cfg.sizeof_field * (field + (4 * qual))); + base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual))); return *(unsigned long *)base; } static inline void ds_set(unsigned char *base, enum ds_qualifier qual, enum ds_field field, unsigned long value) { - base += (ds_cfg.sizeof_field * (field + (4 * qual))); + base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual))); (*(unsigned long *)base) = value; } @@ -410,7 +410,7 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual, * Later architectures use 64bit pointers throughout, whereas earlier * architectures use 32bit pointers in 32bit mode. 
* - * We compute the base address for the first 8 fields based on: + * We compute the base address for the fields based on: * - the field size stored in the DS configuration * - the relative field position * @@ -441,13 +441,13 @@ enum bts_field { static inline unsigned long bts_get(const char *base, enum bts_field field) { - base += (ds_cfg.sizeof_field * field); + base += (ds_cfg.sizeof_ptr_field * field); return *(unsigned long *)base; } static inline void bts_set(char *base, enum bts_field field, unsigned long val) { - base += (ds_cfg.sizeof_field * field);; + base += (ds_cfg.sizeof_ptr_field * field);; (*(unsigned long *)base) = val; } @@ -593,6 +593,10 @@ static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace, struct ds_context *context; int error; + error = -EOPNOTSUPP; + if (!ds_cfg.sizeof_rec[qual]) + goto out; + error = -EINVAL; if (!base) goto out; @@ -635,10 +639,6 @@ struct bts_tracer *ds_request_bts(struct task_struct *task, unsigned long irq; int error; - error = -EOPNOTSUPP; - if (!ds_cfg.ctl[dsf_bts]) - goto out; - /* buffer overflow notification is not yet implemented */ error = -EOPNOTSUPP; if (ovfl) @@ -848,7 +848,8 @@ const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer) ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs); tracer->trace.reset_value = - *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)); + *(u64 *)(tracer->ds.context->ds + + (ds_cfg.sizeof_ptr_field * 8)); return &tracer->trace; } @@ -884,7 +885,8 @@ int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value) if (!tracer) return -EINVAL; - *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)) = value; + *(u64 *)(tracer->ds.context->ds + + (ds_cfg.sizeof_ptr_field * 8)) = value; return 0; } @@ -894,52 +896,54 @@ static const struct ds_configuration ds_cfg_netburst = { .ctl[dsf_bts] = (1 << 2) | (1 << 3), .ctl[dsf_bts_kernel] = (1 << 5), .ctl[dsf_bts_user] = (1 << 6), - - .sizeof_field = sizeof(long), - .sizeof_rec[ds_bts] = sizeof(long) * 3, -#ifdef __i386__ - .sizeof_rec[ds_pebs] = sizeof(long) * 10, -#else - .sizeof_rec[ds_pebs] = sizeof(long) * 18, -#endif }; static const struct ds_configuration ds_cfg_pentium_m = { .name = "Pentium M", .ctl[dsf_bts] = (1 << 6) | (1 << 7), - - .sizeof_field = sizeof(long), - .sizeof_rec[ds_bts] = sizeof(long) * 3, -#ifdef __i386__ - .sizeof_rec[ds_pebs] = sizeof(long) * 10, -#else - .sizeof_rec[ds_pebs] = sizeof(long) * 18, -#endif }; static const struct ds_configuration ds_cfg_core2_atom = { .name = "Core 2/Atom", .ctl[dsf_bts] = (1 << 6) | (1 << 7), .ctl[dsf_bts_kernel] = (1 << 9), .ctl[dsf_bts_user] = (1 << 10), - - .sizeof_field = 8, - .sizeof_rec[ds_bts] = 8 * 3, - .sizeof_rec[ds_pebs] = 8 * 18, }; static void -ds_configure(const struct ds_configuration *cfg) +ds_configure(const struct ds_configuration *cfg, + struct cpuinfo_x86 *cpu) { + unsigned long nr_pebs_fields = 0; + + printk(KERN_INFO "[ds] using %s configuration\n", cfg->name); + +#ifdef __i386__ + nr_pebs_fields = 10; +#else + nr_pebs_fields = 18; +#endif + memset(&ds_cfg, 0, sizeof(ds_cfg)); ds_cfg = *cfg; - printk(KERN_INFO "[ds] using %s configuration\n", ds_cfg.name); + ds_cfg.sizeof_ptr_field = + (cpu_has(cpu, X86_FEATURE_DTES64) ? 
8 : 4); - if (!cpu_has_bts) { - ds_cfg.ctl[dsf_bts] = 0; + ds_cfg.sizeof_rec[ds_bts] = ds_cfg.sizeof_ptr_field * 3; + ds_cfg.sizeof_rec[ds_pebs] = ds_cfg.sizeof_ptr_field * nr_pebs_fields; + + if (!cpu_has(cpu, X86_FEATURE_BTS)) { + ds_cfg.sizeof_rec[ds_bts] = 0; printk(KERN_INFO "[ds] bts not available\n"); } - if (!cpu_has_pebs) + if (!cpu_has(cpu, X86_FEATURE_PEBS)) { + ds_cfg.sizeof_rec[ds_pebs] = 0; printk(KERN_INFO "[ds] pebs not available\n"); + } + + printk(KERN_INFO "[ds] sizes: address: %u bit, ", + 8 * ds_cfg.sizeof_ptr_field); + printk("bts/pebs record: %u/%u bytes\n", + ds_cfg.sizeof_rec[ds_bts], ds_cfg.sizeof_rec[ds_pebs]); WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_field)); } @@ -951,12 +955,12 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) switch (c->x86_model) { case 0x9: case 0xd: /* Pentium M */ - ds_configure(&ds_cfg_pentium_m); + ds_configure(&ds_cfg_pentium_m, c); break; case 0xf: case 0x17: /* Core2 */ case 0x1c: /* Atom */ - ds_configure(&ds_cfg_core2_atom); + ds_configure(&ds_cfg_core2_atom, c); break; case 0x1a: /* i7 */ default: @@ -969,7 +973,7 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) case 0x0: case 0x1: case 0x2: /* Netburst */ - ds_configure(&ds_cfg_netburst); + ds_configure(&ds_cfg_netburst, c); break; default: /* sorry, don't know about them */ -- cgit v1.2.2 From 8a327f6d1b05f5ce16572b4413a5df1d0e872283 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 13 Mar 2009 10:45:07 +0100 Subject: x86, bts: add selftest for BTS Perform a selftest of branch trace store when a cpu is initialized. WARN and disable branch trace store support if the selftest fails. Signed-off-by: Markus Metzger LKML-Reference: <20090313104507.A30125@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 9 ++ arch/x86/kernel/Makefile | 1 + arch/x86/kernel/ds.c | 21 ++++ arch/x86/kernel/ds_selftest.c | 241 ++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/ds_selftest.h | 15 +++ 5 files changed, 287 insertions(+) create mode 100644 arch/x86/kernel/ds_selftest.c create mode 100644 arch/x86/kernel/ds_selftest.h diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index fdb45df608b6..dfd74abc03f8 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -175,6 +175,15 @@ config IOMMU_LEAK Add a simple leak tracer to the IOMMU code. This is useful when you are debugging a buggy device driver that leaks IOMMU mappings. +config X86_DS_SELFTEST + bool "DS selftest" + default y + depends on DEBUG_KERNEL + depends on X86_DS + ---help--- + Perform Debug Store selftests at boot time. + If in doubt, say "N". + config HAVE_MMIOTRACE_SUPPORT def_bool y diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 339ce35648e6..a0c9e138b008 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -44,6 +44,7 @@ obj-y += process.o obj-y += i387.o xsave.o obj-y += ptrace.o obj-$(CONFIG_X86_DS) += ds.o +obj-$(CONFIG_X86_DS_SELFTEST) += ds_selftest.o obj-$(CONFIG_X86_32) += tls.o obj-$(CONFIG_IA32_EMULATION) += tls.o obj-y += step.o diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 6e5ec679a0cd..51c936c1a390 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -29,6 +29,7 @@ #include #include +#include "ds_selftest.h" /* * The configuration for a particular DS hardware implementation. 
@@ -940,6 +941,26 @@ ds_configure(const struct ds_configuration *cfg, printk(KERN_INFO "[ds] pebs not available\n"); } + if (ds_cfg.sizeof_rec[ds_bts]) { + int error; + + error = ds_selftest_bts(); + if (error) { + WARN(1, "[ds] selftest failed. disabling bts.\n"); + ds_cfg.sizeof_rec[ds_bts] = 0; + } + } + + if (ds_cfg.sizeof_rec[ds_pebs]) { + int error; + + error = ds_selftest_pebs(); + if (error) { + WARN(1, "[ds] selftest failed. disabling pebs.\n"); + ds_cfg.sizeof_rec[ds_pebs] = 0; + } + } + printk(KERN_INFO "[ds] sizes: address: %u bit, ", 8 * ds_cfg.sizeof_ptr_field); printk("bts/pebs record: %u/%u bytes\n", diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c new file mode 100644 index 000000000000..8c46fbf38c46 --- /dev/null +++ b/arch/x86/kernel/ds_selftest.c @@ -0,0 +1,241 @@ +/* + * Debug Store support - selftest + * + * + * Copyright (C) 2009 Intel Corporation. + * Markus Metzger , 2009 + */ + +#include "ds_selftest.h" + +#include +#include + +#include + + +#define DS_SELFTEST_BUFFER_SIZE 1021 /* Intentionally chose an odd size. */ + + +static int ds_selftest_bts_consistency(const struct bts_trace *trace) +{ + int error = 0; + + if (!trace) { + printk(KERN_CONT "failed to access trace..."); + /* Bail out. Other tests are pointless. */ + return -1; + } + + if (!trace->read) { + printk(KERN_CONT "bts read not available..."); + error = -1; + } + + /* Do some sanity checks on the trace configuration. */ + if (!trace->ds.n) { + printk(KERN_CONT "empty bts buffer..."); + error = -1; + } + if (!trace->ds.size) { + printk(KERN_CONT "bad bts trace setup..."); + error = -1; + } + if (trace->ds.end != + (char *)trace->ds.begin + (trace->ds.n * trace->ds.size)) { + printk(KERN_CONT "bad bts buffer setup..."); + error = -1; + } + if ((trace->ds.top < trace->ds.begin) || + (trace->ds.end <= trace->ds.top)) { + printk(KERN_CONT "bts top out of bounds..."); + error = -1; + } + + return error; +} + +static int ds_selftest_bts_read(struct bts_tracer *tracer, + const struct bts_trace *trace, + const void *from, const void *to) +{ + const unsigned char *at; + + /* + * Check a few things which do not belong to this test. + * They should be covered by other tests. + */ + if (!trace) + return -1; + + if (!trace->read) + return -1; + + if (to < from) + return -1; + + if (from < trace->ds.begin) + return -1; + + if (trace->ds.end < to) + return -1; + + if (!trace->ds.size) + return -1; + + /* Now to the test itself. 
*/ + for (at = from; (void *)at < to; at += trace->ds.size) { + struct bts_struct bts; + size_t index; + int error; + + if (((void *)at - trace->ds.begin) % trace->ds.size) { + printk(KERN_CONT + "read from non-integer index..."); + return -1; + } + index = ((void *)at - trace->ds.begin) / trace->ds.size; + + memset(&bts, 0, sizeof(bts)); + error = trace->read(tracer, at, &bts); + if (error < 0) { + printk(KERN_CONT + "error reading bts trace at [%lu] (0x%p)...", + index, at); + return error; + } + + switch (bts.qualifier) { + case BTS_BRANCH: + break; + default: + printk(KERN_CONT + "unexpected bts entry %llu at [%lu] (0x%p)...", + bts.qualifier, index, at); + return -1; + } + } + + return 0; +} + +int ds_selftest_bts(void) +{ + const struct bts_trace *trace; + struct bts_tracer *tracer; + int error = 0; + void *top; + unsigned char buffer[DS_SELFTEST_BUFFER_SIZE]; + + printk(KERN_INFO "[ds] bts selftest..."); + + tracer = ds_request_bts(NULL, buffer, DS_SELFTEST_BUFFER_SIZE, + NULL, (size_t)-1, BTS_KERNEL); + if (IS_ERR(tracer)) { + error = PTR_ERR(tracer); + tracer = NULL; + + printk(KERN_CONT + "initialization failed (err: %d)...", error); + goto out; + } + + /* The return should already give us enough trace. */ + ds_suspend_bts(tracer); + + /* Let's see if we can access the trace. */ + trace = ds_read_bts(tracer); + + error = ds_selftest_bts_consistency(trace); + if (error < 0) + goto out; + + /* If everything went well, we should have a few trace entries. */ + if (trace->ds.top == trace->ds.begin) { + /* + * It is possible but highly unlikely that we got a + * buffer overflow and end up at exactly the same + * position we started from. + * Let's issue a warning, but continue. + */ + printk(KERN_CONT "no trace/overflow..."); + } + + /* Let's try to read the trace we collected. */ + error = ds_selftest_bts_read(tracer, trace, + trace->ds.begin, trace->ds.top); + if (error < 0) + goto out; + + /* + * Let's read the trace again. + * Since we suspended tracing, we should get the same result. + */ + top = trace->ds.top; + + trace = ds_read_bts(tracer); + error = ds_selftest_bts_consistency(trace); + if (error < 0) + goto out; + + if (top != trace->ds.top) { + printk(KERN_CONT "suspend not working..."); + error = -1; + goto out; + } + + /* Let's collect some more trace - see if resume is working. */ + ds_resume_bts(tracer); + ds_suspend_bts(tracer); + + trace = ds_read_bts(tracer); + + error = ds_selftest_bts_consistency(trace); + if (error < 0) + goto out; + + if (trace->ds.top == top) { + /* + * It is possible but highly unlikely that we got a + * buffer overflow and end up at exactly the same + * position we started from. + * Let's issue a warning and check the full trace. + */ + printk(KERN_CONT + "no resume progress/overflow..."); + + error = ds_selftest_bts_read(tracer, trace, + trace->ds.begin, trace->ds.end); + } else if (trace->ds.top < top) { + /* + * We had a buffer overflow - the entire buffer should + * contain trace records. + */ + error = ds_selftest_bts_read(tracer, trace, + trace->ds.begin, trace->ds.end); + } else { + /* + * It is quite likely that the buffer did not overflow. + * Let's just check the delta trace. + */ + error = ds_selftest_bts_read(tracer, trace, + top, trace->ds.top); + } + if (error < 0) + goto out; + + error = 0; + + /* The final test: release the tracer while tracing is suspended. */ + out: + ds_release_bts(tracer); + + printk(KERN_CONT "%s.\n", (error ? 
"failed" : "passed")); + + return error; +} + +int ds_selftest_pebs(void) +{ + return 0; +} diff --git a/arch/x86/kernel/ds_selftest.h b/arch/x86/kernel/ds_selftest.h new file mode 100644 index 000000000000..0e6e19d4c7d2 --- /dev/null +++ b/arch/x86/kernel/ds_selftest.h @@ -0,0 +1,15 @@ +/* + * Debug Store support - selftest + * + * + * Copyright (C) 2009 Intel Corporation. + * Markus Metzger , 2009 + */ + +#ifdef CONFIG_X86_DS_SELFTEST +extern int ds_selftest_bts(void); +extern int ds_selftest_pebs(void); +#else +static inline int ds_selftest_bts(void) { return 0; } +static inline int ds_selftest_pebs(void) { return 0; } +#endif /* CONFIG_X86_DS_SELFTEST */ -- cgit v1.2.2 From b8e47195451c5d3f62620b2b1b5928669afd56eb Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 13 Mar 2009 10:46:42 +0100 Subject: x86, bts: correct comment style in ds.c Correct the comment style in ds.c. Signed-off-by: Markus Metzger LKML-Reference: <20090313104642.A30149@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 79 ++++++++++++++++++++++++++-------------------------- 1 file changed, 39 insertions(+), 40 deletions(-) diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 51c936c1a390..d9cab7168058 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -35,25 +35,22 @@ * The configuration for a particular DS hardware implementation. */ struct ds_configuration { - /* the name of the configuration */ + /* The name of the configuration. */ const char *name; - /* the size of one pointer-typed field in the DS structure and - in the BTS and PEBS buffers in bytes; - this covers the first 8 DS fields related to buffer management. */ + /* The size of pointer-typed fields in DS, BTS, and PEBS. */ unsigned char sizeof_ptr_field; - /* the size of a BTS/PEBS record in bytes */ + /* The size of a BTS/PEBS record in bytes. */ unsigned char sizeof_rec[2]; - /* a series of bit-masks to control various features indexed - * by enum ds_feature */ + /* Control bit-masks indexed by enum ds_feature. */ unsigned long ctl[dsf_ctl_max]; }; static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array); #define ds_cfg per_cpu(ds_cfg_array, smp_processor_id()) -#define MAX_SIZEOF_DS (12 * 8) /* maximal size of a DS configuration */ -#define MAX_SIZEOF_BTS (3 * 8) /* maximal size of a BTS record */ -#define DS_ALIGNMENT (1 << 3) /* BTS and PEBS buffer alignment */ +#define MAX_SIZEOF_DS (12 * 8) /* Maximal size of a DS configuration. */ +#define MAX_SIZEOF_BTS (3 * 8) /* Maximal size of a BTS record. */ +#define DS_ALIGNMENT (1 << 3) /* BTS and PEBS buffer alignment. */ #define BTS_CONTROL \ (ds_cfg.ctl[dsf_bts] | ds_cfg.ctl[dsf_bts_kernel] | ds_cfg.ctl[dsf_bts_user] |\ @@ -67,28 +64,28 @@ static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array); * to identify tracers. */ struct ds_tracer { - /* the DS context (partially) owned by this tracer */ + /* The DS context (partially) owned by this tracer. */ struct ds_context *context; - /* the buffer provided on ds_request() and its size in bytes */ + /* The buffer provided on ds_request() and its size in bytes. */ void *buffer; size_t size; }; struct bts_tracer { - /* the common DS part */ + /* The common DS part. */ struct ds_tracer ds; - /* the trace including the DS configuration */ + /* The trace including the DS configuration. */ struct bts_trace trace; - /* buffer overflow notification function */ + /* Buffer overflow notification function. 
*/ bts_ovfl_callback_t ovfl; }; struct pebs_tracer { - /* the common DS part */ + /* The common DS part. */ struct ds_tracer ds; - /* the trace including the DS configuration */ + /* The trace including the DS configuration. */ struct pebs_trace trace; - /* buffer overflow notification function */ + /* Buffer overflow notification function. */ pebs_ovfl_callback_t ovfl; }; @@ -214,18 +211,16 @@ static inline int check_tracer(struct task_struct *task) * deallocated when the last user puts the context. */ struct ds_context { - /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */ + /* The DS configuration; goes into MSR_IA32_DS_AREA. */ unsigned char ds[MAX_SIZEOF_DS]; - /* the owner of the BTS and PEBS configuration, respectively */ + /* The owner of the BTS and PEBS configuration, respectively. */ struct bts_tracer *bts_master; struct pebs_tracer *pebs_master; - /* use count */ + /* Use count. */ unsigned long count; - /* a pointer to the context location inside the thread_struct - * or the per_cpu context array */ + /* Pointer to the context pointer field. */ struct ds_context **this; - /* a pointer to the task owning this context, or NULL, if the - * context is owned by a cpu */ + /* The traced task; NULL for current cpu. */ struct task_struct *task; }; @@ -350,14 +345,14 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual, unsigned long write_size, adj_write_size; /* - * write as much as possible without producing an + * Write as much as possible without producing an * overflow interrupt. * - * interrupt_threshold must either be + * Interrupt_threshold must either be * - bigger than absolute_maximum or * - point to a record between buffer_base and absolute_maximum * - * index points to a valid record. + * Index points to a valid record. */ base = ds_get(context->ds, qual, ds_buffer_base); index = ds_get(context->ds, qual, ds_index); @@ -366,8 +361,10 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual, write_end = min(end, int_th); - /* if we are already beyond the interrupt threshold, - * we fill the entire buffer */ + /* + * If we are already beyond the interrupt threshold, + * we fill the entire buffer. + */ if (write_end <= index) write_end = end; @@ -384,7 +381,7 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual, adj_write_size = write_size / ds_cfg.sizeof_rec[qual]; adj_write_size *= ds_cfg.sizeof_rec[qual]; - /* zero out trailing bytes */ + /* Zero out trailing bytes. */ memset((char *)index + write_size, 0, adj_write_size - write_size); index += adj_write_size; @@ -556,7 +553,8 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual, unsigned int flags) { unsigned long buffer, adj; - /* adjust the buffer address and size to meet alignment + /* + * Adjust the buffer address and size to meet alignment * constraints: * - buffer is double-word aligned * - size is multiple of record size @@ -578,7 +576,8 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual, trace->begin = (void *)buffer; trace->top = trace->begin; trace->end = (void *)(buffer + size); - /* The value for 'no threshold' is -1, which will set the + /* + * The value for 'no threshold' is -1, which will set the * threshold outside of the buffer, just like we want it. 
*/ trace->ith = (void *)(buffer + size - ith); @@ -602,7 +601,7 @@ static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace, if (!base) goto out; - /* we require some space to do alignment adjustments below */ + /* We require some space to do alignment adjustments below. */ error = -EINVAL; if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual])) goto out; @@ -640,7 +639,7 @@ struct bts_tracer *ds_request_bts(struct task_struct *task, unsigned long irq; int error; - /* buffer overflow notification is not yet implemented */ + /* Buffer overflow notification is not yet implemented. */ error = -EOPNOTSUPP; if (ovfl) goto out; @@ -700,7 +699,7 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, unsigned long irq; int error; - /* buffer overflow notification is not yet implemented */ + /* Buffer overflow notification is not yet implemented. */ error = -EOPNOTSUPP; if (ovfl) goto out; @@ -983,9 +982,9 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) case 0x1c: /* Atom */ ds_configure(&ds_cfg_core2_atom, c); break; - case 0x1a: /* i7 */ + case 0x1a: /* Core i7 */ default: - /* sorry, don't know about them */ + /* Sorry, don't know about them. */ break; } break; @@ -997,12 +996,12 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) ds_configure(&ds_cfg_netburst, c); break; default: - /* sorry, don't know about them */ + /* Sorry, don't know about them. */ break; } break; default: - /* sorry, don't know about them */ + /* Sorry, don't know about them. */ break; } } -- cgit v1.2.2 From ba9372a8f306c4e53a5f61dcbcd6c1e4a8c2e9ac Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 13 Mar 2009 10:48:52 +0100 Subject: x86, hw-branch-tracer: keep resources on stop Distinguish init/reset and start/stop: init/reset will allocate and release bts tracing resources stop/start will suspend and resume bts tracing Return an error on init() if no cpu can be traced. Signed-off-by: Markus Metzger LKML-Reference: <20090313104852.A30168@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_hw_branches.c | 119 ++++++++++++++++++++++++++++----------- 1 file changed, 85 insertions(+), 34 deletions(-) diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index 7bfdf4c2347f..a99a04c5e9cd 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -19,7 +19,7 @@ #include "trace_output.h" -#define SIZEOF_BTS (1 << 13) +#define BTS_BUFFER_SIZE (1 << 13) /* * The tracer lock protects the below per-cpu tracer array. @@ -33,53 +33,68 @@ */ static DEFINE_SPINLOCK(bts_tracer_lock); static DEFINE_PER_CPU(struct bts_tracer *, tracer); -static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer); +static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], buffer); #define this_tracer per_cpu(tracer, smp_processor_id()) #define this_buffer per_cpu(buffer, smp_processor_id()) -static int __read_mostly trace_hw_branches_enabled; +static int trace_hw_branches_enabled __read_mostly; +static int trace_hw_branches_suspended __read_mostly; static struct trace_array *hw_branch_trace __read_mostly; /* - * Start tracing on the current cpu. + * Initialize the tracer for the current cpu. * The argument is ignored. * * pre: bts_tracer_lock must be locked. 
*/ -static void bts_trace_start_cpu(void *arg) +static void bts_trace_init_cpu(void *arg) { if (this_tracer) ds_release_bts(this_tracer); - this_tracer = - ds_request_bts(/* task = */ NULL, this_buffer, SIZEOF_BTS, - /* ovfl = */ NULL, /* th = */ (size_t)-1, - BTS_KERNEL); + this_tracer = ds_request_bts(NULL, this_buffer, BTS_BUFFER_SIZE, + NULL, (size_t)-1, BTS_KERNEL); if (IS_ERR(this_tracer)) { this_tracer = NULL; return; } } -static void bts_trace_start(struct trace_array *tr) +static int bts_trace_init(struct trace_array *tr) { + int cpu, avail; + spin_lock(&bts_tracer_lock); - on_each_cpu(bts_trace_start_cpu, NULL, 1); - trace_hw_branches_enabled = 1; + hw_branch_trace = tr; + + on_each_cpu(bts_trace_init_cpu, NULL, 1); + + /* Check on how many cpus we could enable tracing */ + avail = 0; + for_each_online_cpu(cpu) + if (per_cpu(tracer, cpu)) + avail++; + + trace_hw_branches_enabled = (avail ? 1 : 0); + trace_hw_branches_suspended = 0; spin_unlock(&bts_tracer_lock); + + + /* If we could not enable tracing on a single cpu, we fail. */ + return avail ? 0 : -EOPNOTSUPP; } /* - * Stop tracing on the current cpu. + * Release the tracer for the current cpu. * The argument is ignored. * * pre: bts_tracer_lock must be locked. */ -static void bts_trace_stop_cpu(void *arg) +static void bts_trace_release_cpu(void *arg) { if (this_tracer) { ds_release_bts(this_tracer); @@ -87,12 +102,57 @@ static void bts_trace_stop_cpu(void *arg) } } -static void bts_trace_stop(struct trace_array *tr) +static void bts_trace_reset(struct trace_array *tr) { spin_lock(&bts_tracer_lock); + on_each_cpu(bts_trace_release_cpu, NULL, 1); trace_hw_branches_enabled = 0; - on_each_cpu(bts_trace_stop_cpu, NULL, 1); + trace_hw_branches_suspended = 0; + + spin_unlock(&bts_tracer_lock); +} + +/* + * Resume tracing on the current cpu. + * The argument is ignored. + * + * pre: bts_tracer_lock must be locked. + */ +static void bts_trace_resume_cpu(void *arg) +{ + if (this_tracer) + ds_resume_bts(this_tracer); +} + +static void bts_trace_start(struct trace_array *tr) +{ + spin_lock(&bts_tracer_lock); + + on_each_cpu(bts_trace_resume_cpu, NULL, 1); + trace_hw_branches_suspended = 0; + + spin_unlock(&bts_tracer_lock); +} + +/* + * Suspend tracing on the current cpu. + * The argument is ignored. + * + * pre: bts_tracer_lock must be locked. 
+ */ +static void bts_trace_suspend_cpu(void *arg) +{ + if (this_tracer) + ds_suspend_bts(this_tracer); +} + +static void bts_trace_stop(struct trace_array *tr) +{ + spin_lock(&bts_tracer_lock); + + on_each_cpu(bts_trace_suspend_cpu, NULL, 1); + trace_hw_branches_suspended = 1; spin_unlock(&bts_tracer_lock); } @@ -110,10 +170,14 @@ static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb, switch (action) { case CPU_ONLINE: case CPU_DOWN_FAILED: - smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1); + smp_call_function_single(cpu, bts_trace_init_cpu, NULL, 1); + + if (trace_hw_branches_suspended) + smp_call_function_single(cpu, bts_trace_suspend_cpu, + NULL, 1); break; case CPU_DOWN_PREPARE: - smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1); + smp_call_function_single(cpu, bts_trace_release_cpu, NULL, 1); break; } @@ -126,20 +190,6 @@ static struct notifier_block bts_hotcpu_notifier __cpuinitdata = { .notifier_call = bts_hotcpu_handler }; -static int bts_trace_init(struct trace_array *tr) -{ - hw_branch_trace = tr; - - bts_trace_start(tr); - - return 0; -} - -static void bts_trace_reset(struct trace_array *tr) -{ - bts_trace_stop(tr); -} - static void bts_trace_print_header(struct seq_file *m) { seq_puts(m, "# CPU# TO <- FROM\n"); @@ -228,7 +278,7 @@ static void trace_bts_at(const struct bts_trace *trace, void *at) */ static void trace_bts_cpu(void *arg) { - struct trace_array *tr = (struct trace_array *) arg; + struct trace_array *tr = (struct trace_array *)arg; const struct bts_trace *trace; unsigned char *at; @@ -276,7 +326,8 @@ void trace_hw_branch_oops(void) { spin_lock(&bts_tracer_lock); - trace_bts_cpu(hw_branch_trace); + if (trace_hw_branches_enabled) + trace_bts_cpu(hw_branch_trace); spin_unlock(&bts_tracer_lock); } -- cgit v1.2.2 From 321bb5e1ac461c04b6a93f795010d6eb01d8c5ca Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 13 Mar 2009 10:50:27 +0100 Subject: x86, hw-branch-tracer: add selftest Add a selftest for the hw-branch-tracer. 
Signed-off-by: Markus Metzger LKML-Reference: <20090313105027.A30183@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 2 ++ kernel/trace/trace_hw_branches.c | 5 +++- kernel/trace/trace_selftest.c | 53 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 59 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 56ce34d90b03..e7fbc826f1e9 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -576,6 +576,8 @@ extern int trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr); +extern int trace_selftest_startup_hw_branches(struct tracer *trace, + struct trace_array *tr); #endif /* CONFIG_FTRACE_STARTUP_TEST */ extern void *head_page(struct trace_array_cpu *data); diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index a99a04c5e9cd..4ca82700c04e 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -342,7 +342,10 @@ struct tracer bts_tracer __read_mostly = .start = bts_trace_start, .stop = bts_trace_stop, .open = trace_bts_prepare, - .close = trace_bts_close + .close = trace_bts_close, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_hw_branches, +#endif /* CONFIG_FTRACE_SELFTEST */ }; __init static int init_bts_trace(void) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index f907a2b29028..3c7b797d0d28 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -16,6 +16,7 @@ static inline int trace_valid_entry(struct trace_entry *entry) case TRACE_BRANCH: case TRACE_GRAPH_ENT: case TRACE_GRAPH_RET: + case TRACE_HW_BRANCHES: return 1; } return 0; @@ -691,3 +692,55 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) return ret; } #endif /* CONFIG_BRANCH_TRACER */ + +#ifdef CONFIG_HW_BRANCH_TRACER +int +trace_selftest_startup_hw_branches(struct tracer *trace, + struct trace_array *tr) +{ + unsigned long count; + int ret; + struct trace_iterator iter; + struct tracer tracer; + + if (!trace->open) { + printk(KERN_CONT "missing open function..."); + return -1; + } + + ret = tracer_init(trace, tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; + } + + /* + * The hw-branch tracer needs to collect the trace from the various + * cpu trace buffers - before tracing is stopped. 
+ */ + memset(&iter, 0, sizeof(iter)); + memcpy(&tracer, trace, sizeof(tracer)); + + iter.trace = &tracer; + iter.tr = tr; + iter.pos = -1; + mutex_init(&iter.mutex); + + trace->open(&iter); + + mutex_destroy(&iter.mutex); + + tracing_stop(); + + ret = trace_test_buffer(tr, &count); + trace->reset(tr); + tracing_start(); + + if (!ret && !count) { + printk(KERN_CONT "no entries found.."); + ret = -1; + } + + return ret; +} +#endif /* CONFIG_HW_BRANCH_TRACER */ -- cgit v1.2.2 From e9a22d1fb94050b7d600019c32e6b672d539054b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 13 Mar 2009 11:54:40 +0100 Subject: x86, bts: cleanups Impact: cleanup, no code changed Cc: Markus Metzger LKML-Reference: <20090313104218.A30096@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 142 ++++++++++++++++++++++----------------- arch/x86/kernel/ds_selftest.h | 2 +- kernel/trace/trace_hw_branches.c | 6 +- kernel/trace/trace_selftest.c | 5 +- 4 files changed, 87 insertions(+), 68 deletions(-) diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index d9cab7168058..7363e01ba082 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -19,43 +19,52 @@ * Markus Metzger , 2007-2009 */ - -#include - -#include +#include #include -#include +#include #include +#include #include -#include + +#include #include "ds_selftest.h" /* - * The configuration for a particular DS hardware implementation. + * The configuration for a particular DS hardware implementation: */ struct ds_configuration { - /* The name of the configuration. */ - const char *name; - /* The size of pointer-typed fields in DS, BTS, and PEBS. */ - unsigned char sizeof_ptr_field; - /* The size of a BTS/PEBS record in bytes. */ - unsigned char sizeof_rec[2]; - /* Control bit-masks indexed by enum ds_feature. */ - unsigned long ctl[dsf_ctl_max]; + /* The name of the configuration: */ + const char *name; + + /* The size of pointer-typed fields in DS, BTS, and PEBS: */ + unsigned char sizeof_ptr_field; + + /* The size of a BTS/PEBS record in bytes: */ + unsigned char sizeof_rec[2]; + + /* Control bit-masks indexed by enum ds_feature: */ + unsigned long ctl[dsf_ctl_max]; }; static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array); #define ds_cfg per_cpu(ds_cfg_array, smp_processor_id()) -#define MAX_SIZEOF_DS (12 * 8) /* Maximal size of a DS configuration. */ -#define MAX_SIZEOF_BTS (3 * 8) /* Maximal size of a BTS record. */ -#define DS_ALIGNMENT (1 << 3) /* BTS and PEBS buffer alignment. */ +/* Maximal size of a DS configuration: */ +#define MAX_SIZEOF_DS (12 * 8) -#define BTS_CONTROL \ - (ds_cfg.ctl[dsf_bts] | ds_cfg.ctl[dsf_bts_kernel] | ds_cfg.ctl[dsf_bts_user] |\ - ds_cfg.ctl[dsf_bts_overflow]) +/* Maximal size of a BTS record: */ +#define MAX_SIZEOF_BTS (3 * 8) +/* BTS and PEBS buffer alignment: */ +#define DS_ALIGNMENT (1 << 3) + +/* Mask of control bits in the DS MSR register: */ +#define BTS_CONTROL \ + ( ds_cfg.ctl[dsf_bts] | \ + ds_cfg.ctl[dsf_bts_kernel] | \ + ds_cfg.ctl[dsf_bts_user] | \ + ds_cfg.ctl[dsf_bts_overflow] ) /* * A BTS or PEBS tracer. @@ -65,28 +74,32 @@ static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array); */ struct ds_tracer { /* The DS context (partially) owned by this tracer. */ - struct ds_context *context; + struct ds_context *context; /* The buffer provided on ds_request() and its size in bytes. */ - void *buffer; - size_t size; + void *buffer; + size_t size; }; struct bts_tracer { - /* The common DS part. */ - struct ds_tracer ds; - /* The trace including the DS configuration. 
*/ - struct bts_trace trace; - /* Buffer overflow notification function. */ - bts_ovfl_callback_t ovfl; + /* The common DS part: */ + struct ds_tracer ds; + + /* The trace including the DS configuration: */ + struct bts_trace trace; + + /* Buffer overflow notification function: */ + bts_ovfl_callback_t ovfl; }; struct pebs_tracer { - /* The common DS part. */ - struct ds_tracer ds; - /* The trace including the DS configuration. */ - struct pebs_trace trace; - /* Buffer overflow notification function. */ - pebs_ovfl_callback_t ovfl; + /* The common DS part: */ + struct ds_tracer ds; + + /* The trace including the DS configuration: */ + struct pebs_trace trace; + + /* Buffer overflow notification function: */ + pebs_ovfl_callback_t ovfl; }; /* @@ -95,6 +108,7 @@ struct pebs_tracer { * * The DS configuration consists of the following fields; different * architetures vary in the size of those fields. + * * - double-word aligned base linear address of the BTS buffer * - write pointer into the BTS buffer * - end linear address of the BTS buffer (one byte beyond the end of @@ -133,19 +147,20 @@ enum ds_field { }; enum ds_qualifier { - ds_bts = 0, + ds_bts = 0, ds_pebs }; -static inline unsigned long ds_get(const unsigned char *base, - enum ds_qualifier qual, enum ds_field field) +static inline unsigned long +ds_get(const unsigned char *base, enum ds_qualifier qual, enum ds_field field) { base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual))); return *(unsigned long *)base; } -static inline void ds_set(unsigned char *base, enum ds_qualifier qual, - enum ds_field field, unsigned long value) +static inline void +ds_set(unsigned char *base, enum ds_qualifier qual, enum ds_field field, + unsigned long value) { base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual))); (*(unsigned long *)base) = value; @@ -157,7 +172,6 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual, */ static DEFINE_SPINLOCK(ds_lock); - /* * We either support (system-wide) per-cpu or per-thread allocation. * We distinguish the two based on the task_struct pointer, where a @@ -211,17 +225,21 @@ static inline int check_tracer(struct task_struct *task) * deallocated when the last user puts the context. */ struct ds_context { - /* The DS configuration; goes into MSR_IA32_DS_AREA. */ - unsigned char ds[MAX_SIZEOF_DS]; - /* The owner of the BTS and PEBS configuration, respectively. */ - struct bts_tracer *bts_master; - struct pebs_tracer *pebs_master; - /* Use count. */ + /* The DS configuration; goes into MSR_IA32_DS_AREA: */ + unsigned char ds[MAX_SIZEOF_DS]; + + /* The owner of the BTS and PEBS configuration, respectively: */ + struct bts_tracer *bts_master; + struct pebs_tracer *pebs_master; + + /* Use count: */ unsigned long count; - /* Pointer to the context pointer field. */ - struct ds_context **this; - /* The traced task; NULL for current cpu. */ - struct task_struct *task; + + /* Pointer to the context pointer field: */ + struct ds_context **this; + + /* The traced task; NULL for current cpu: */ + struct task_struct *task; }; static DEFINE_PER_CPU(struct ds_context *, system_context_array); @@ -328,9 +346,9 @@ static void ds_overflow(struct ds_context *context, enum ds_qualifier qual) * The remainder of any partially written record is zeroed out. 
* * context: the DS context - * qual: the buffer type - * record: the data to write - * size: the size of the data + * qual: the buffer type + * record: the data to write + * size: the size of the data */ static int ds_write(struct ds_context *context, enum ds_qualifier qual, const void *record, size_t size) @@ -429,12 +447,12 @@ enum bts_field { bts_to, bts_flags, - bts_qual = bts_from, - bts_jiffies = bts_to, - bts_pid = bts_flags, + bts_qual = bts_from, + bts_jiffies = bts_to, + bts_pid = bts_flags, - bts_qual_mask = (bts_qual_max - 1), - bts_escape = ((unsigned long)-1 & ~bts_qual_mask) + bts_qual_mask = (bts_qual_max - 1), + bts_escape = ((unsigned long)-1 & ~bts_qual_mask) }; static inline unsigned long bts_get(const char *base, enum bts_field field) @@ -461,8 +479,8 @@ static inline void bts_set(char *base, enum bts_field field, unsigned long val) * * return: bytes read/written on success; -Eerrno, otherwise */ -static int bts_read(struct bts_tracer *tracer, const void *at, - struct bts_struct *out) +static int +bts_read(struct bts_tracer *tracer, const void *at, struct bts_struct *out) { if (!tracer) return -EINVAL; diff --git a/arch/x86/kernel/ds_selftest.h b/arch/x86/kernel/ds_selftest.h index 0e6e19d4c7d2..2ba8745c6663 100644 --- a/arch/x86/kernel/ds_selftest.h +++ b/arch/x86/kernel/ds_selftest.h @@ -12,4 +12,4 @@ extern int ds_selftest_pebs(void); #else static inline int ds_selftest_bts(void) { return 0; } static inline int ds_selftest_pebs(void) { return 0; } -#endif /* CONFIG_X86_DS_SELFTEST */ +#endif diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index 4ca82700c04e..8b2109a6c61c 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -1,5 +1,5 @@ /* - * h/w branch tracer for x86 based on bts + * h/w branch tracer for x86 based on BTS * * Copyright (C) 2008-2009 Intel Corporation. * Markus Metzger , 2008-2009 @@ -15,8 +15,8 @@ #include -#include "trace.h" #include "trace_output.h" +#include "trace.h" #define BTS_BUFFER_SIZE (1 << 13) @@ -197,10 +197,10 @@ static void bts_trace_print_header(struct seq_file *m) static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) { + unsigned long symflags = TRACE_ITER_SYM_OFFSET; struct trace_entry *entry = iter->ent; struct trace_seq *seq = &iter->seq; struct hw_branch_entry *it; - unsigned long symflags = TRACE_ITER_SYM_OFFSET; trace_assign_type(it, entry); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 3c7b797d0d28..b91091267067 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -189,6 +189,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, #else # define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; }) #endif /* CONFIG_DYNAMIC_FTRACE */ + /* * Simple verification test of ftrace function tracer. * Enable ftrace, sleep 1/10 second, and then read the trace @@ -698,10 +699,10 @@ int trace_selftest_startup_hw_branches(struct tracer *trace, struct trace_array *tr) { - unsigned long count; - int ret; struct trace_iterator iter; struct tracer tracer; + unsigned long count; + int ret; if (!trace->open) { printk(KERN_CONT "missing open function..."); -- cgit v1.2.2 From 79258a354e0c69be94ae2871809a195bf4a647b1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 13 Mar 2009 12:02:08 +0100 Subject: x86, bts: detect size of DS fields, fix Impact: build fix One usage site was missed in the sizeof_field -> sizeof_ptr_field rename. 
Cc: Markus Metzger LKML-Reference: <20090313104218.A30096@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 7363e01ba082..5fd53333c1df 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -983,7 +983,7 @@ ds_configure(const struct ds_configuration *cfg, printk("bts/pebs record: %u/%u bytes\n", ds_cfg.sizeof_rec[ds_bts], ds_cfg.sizeof_rec[ds_pebs]); - WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_field)); + WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_ptr_field)); } void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) -- cgit v1.2.2 From c78a3956b982418186e40978a51636a2b43221bc Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Wed, 18 Mar 2009 19:27:00 +0100 Subject: x86, bts: use atomic memory allocation Ds_request_bts() needs to allocate memory. It uses GFP_KERNEL. Hw-branch-tracer calls ds_request_bts() within on_each_cpu(). Use atomic memory allocation to allow it to be used in that context. Signed-off-by: Markus Metzger LKML-Reference: <20090318192700.A6038@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 5fd53333c1df..b1d6e1f502fa 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -255,8 +255,13 @@ static inline struct ds_context *ds_get_context(struct task_struct *task) struct ds_context *new_context = NULL; unsigned long irq; - /* Chances are small that we already have a context. */ - new_context = kzalloc(sizeof(*new_context), GFP_KERNEL); + /* + * Chances are small that we already have a context. + * + * Contexts for per-cpu tracing are allocated using + * smp_call_function(). We must not sleep. + */ + new_context = kzalloc(sizeof(*new_context), GFP_ATOMIC); if (!new_context) return NULL; @@ -662,8 +667,12 @@ struct bts_tracer *ds_request_bts(struct task_struct *task, if (ovfl) goto out; + /* + * Per-cpu tracing is typically requested using smp_call_function(). + * We must not sleep. + */ error = -ENOMEM; - tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); + tracer = kzalloc(sizeof(*tracer), GFP_ATOMIC); if (!tracer) goto out; tracer->ovfl = ovfl; @@ -722,8 +731,12 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, if (ovfl) goto out; + /* + * Per-cpu tracing is typically requested using smp_call_function(). + * We must not sleep. + */ error = -ENOMEM; - tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); + tracer = kzalloc(sizeof(*tracer), GFP_ATOMIC); if (!tracer) goto out; tracer->ovfl = ovfl; -- cgit v1.2.2 From 425480081e936d8725f0d44b8829d699bf088c6b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 24 Mar 2009 13:38:36 -0400 Subject: tracing: add handler to trace_stat Currently, if a trace_stat user wants a handle to some private data, the trace_stat infrastructure does not supply a way to do that. This patch passes the trace_stat structure to the start function of the trace_stat code. 
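[Editor's note, not part of the patch: a minimal sketch of what the new stat_start() signature enables. A stat provider can now embed its tracer_stat in a larger structure and recover its private data with container_of(); the wrapper struct and field names below are hypothetical, while tracer_stat, stat_start(), and register_stat_tracer() come from this series.]

#include <linux/kernel.h>
#include "trace_stat.h"

/* Hypothetical provider keeping private iterator state next to its tracer_stat. */
struct my_stat_session {
	struct tracer_stat	stat;		/* registered via register_stat_tracer(&session->stat) */
	void			*first_entry;	/* private data the iterator starts from */
};

static void *my_stat_start(struct tracer_stat *trace)
{
	/*
	 * stat_start() now receives its tracer_stat, so private data can
	 * be recovered without resorting to a global variable.
	 */
	struct my_stat_session *session =
		container_of(trace, struct my_stat_session, stat);

	return session->first_entry;
}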
Signed-off-by: Steven Rostedt --- kernel/trace/trace_branch.c | 4 ++-- kernel/trace/trace_stat.c | 2 +- kernel/trace/trace_stat.h | 2 +- kernel/trace/trace_workqueue.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index ad8c22efff41..e6e32912ffb8 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -263,7 +263,7 @@ static int branch_stat_show(struct seq_file *m, void *v) return 0; } -static void *annotated_branch_stat_start(void) +static void *annotated_branch_stat_start(struct tracer_stat *trace) { return __start_annotated_branch_profile; } @@ -338,7 +338,7 @@ static int all_branch_stat_headers(struct seq_file *m) return 0; } -static void *all_branch_stat_start(void) +static void *all_branch_stat_start(struct tracer_stat *trace) { return __start_branch_profile; } diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index f71b85b22cfe..f8f48d84b2c3 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -85,7 +85,7 @@ static int stat_seq_init(struct tracer_stat_session *session) if (!ts->stat_cmp) ts->stat_cmp = dummy_cmp; - stat = ts->stat_start(); + stat = ts->stat_start(ts); if (!stat) goto exit; diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h index 202274cf7f3d..f3546a2cd826 100644 --- a/kernel/trace/trace_stat.h +++ b/kernel/trace/trace_stat.h @@ -12,7 +12,7 @@ struct tracer_stat { /* The name of your stat file */ const char *name; /* Iteration over statistic entries */ - void *(*stat_start)(void); + void *(*stat_start)(struct tracer_stat *trace); void *(*stat_next)(void *prev, int idx); /* Compare two entries for stats sorting */ int (*stat_cmp)(void *p1, void *p2); diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index 9ab035b58cf1..ee533c2e161b 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -152,7 +152,7 @@ static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu) return ret; } -static void *workqueue_stat_start(void) +static void *workqueue_stat_start(struct tracer_stat *trace) { int cpu; void *ret = NULL; -- cgit v1.2.2 From bac429f037f1a51a74d62bad6d1518c3be065df3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 20 Mar 2009 12:50:56 -0400 Subject: tracing: add function profiler Impact: new profiling feature This patch adds a function profiler. In debugfs/tracing/ two new files are created. function_profile_enabled - to enable or disable profiling trace_stat/functions - the profiled functions. For example: echo 1 > /debugfs/tracing/function_profile_enabled ./hackbench 50 echo 0 > /debugfs/tracing/function_profile_enabled yields: cat /debugfs/tracing/trace_stat/functions Function Hit -------- --- _spin_lock 10106442 _spin_unlock 10097492 kfree 6013704 _spin_unlock_irqrestore 4423941 _spin_lock_irqsave 4406825 __phys_addr 4181686 __slab_free 4038222 dput 4030130 path_put 4023387 unroll_tree_refs 4019532 [...] The most hit functions are listed first. Functions that are not hit are not listed. This feature depends on and uses dynamic function tracing. When the function profiling is disabled, no overhead occurs. But it still takes up around 300KB to hold the data, thus it is not recommended to keep it enabled for systems low on memory. When a '1' is echoed into the function_profile_enabled file, the counters for each function are reset back to zero. Thus you can see what functions are hit most by different programs. 
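[Editor's note, not part of the patch: since the diff below is hard to read in this collapsed form, here is the profiler's hot path condensed into a few lines. Every traced call hashes the callee's instruction pointer into ftrace_profile_hash, looks up the matching dyn_ftrace record, and increments its counter. Interrupt disabling and the NULL checks of the real code are omitted; see function_profile_call() and ftrace_find_profiled_func() in the hunks below for the complete version.]

static void function_profile_call(unsigned long ip, unsigned long parent_ip)
{
	struct hlist_head *hhd;
	struct hlist_node *n;
	struct dyn_ftrace *rec;

	if (!ftrace_profile_enabled)
		return;

	/* Hash the callee's address into one of the profiler's buckets. */
	hhd = &ftrace_profile_hash[hash_long(ip, ftrace_profile_bits)];

	/* Find this function's record and count the hit. */
	hlist_for_each_entry_rcu(rec, n, hhd, node) {
		if (rec->ip == ip) {
			rec->counter++;
			break;
		}
	}
}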
Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 4 + kernel/trace/Kconfig | 19 +++ kernel/trace/ftrace.c | 313 ++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 334 insertions(+), 2 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 015a3d22cf74..0456c3a51c66 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -153,6 +153,10 @@ struct dyn_ftrace { unsigned long flags; struct dyn_ftrace *newlist; }; +#ifdef CONFIG_FUNCTION_PROFILER + unsigned long counter; + struct hlist_node node; +#endif struct dyn_arch_ftrace arch; }; diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8a4d72931042..95e9ad5735d9 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -105,6 +105,7 @@ config FUNCTION_GRAPH_TRACER This is done by setting the current return address on the current task structure into a stack of calls. + config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" default n @@ -376,6 +377,24 @@ config DYNAMIC_FTRACE were made. If so, it runs stop_machine (stops all CPUS) and modifies the code to jump over the call to ftrace. +config FUNCTION_PROFILER + bool "Kernel function profiler" + depends on DYNAMIC_FTRACE + default n + help + This option enables the kernel function profiler. When the dynamic + function tracing is enabled, a counter is added into the function + records used by the dynamic function tracer. A file is created in + debugfs called function_profile_enabled which defaults to zero. + When a 1 is echoed into this file profiling begins, and when a + zero is entered, profiling stops. A file in the trace_stats + directory called functions, that show the list of functions that + have been hit and their counters. + + This takes up around 320K more memory. + + If in doubt, say N + config FTRACE_MCOUNT_RECORD def_bool y depends on DYNAMIC_FTRACE diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7b8722baf153..11f364c776d5 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -34,6 +34,7 @@ #include #include "trace.h" +#include "trace_stat.h" #define FTRACE_WARN_ON(cond) \ do { \ @@ -261,7 +262,6 @@ struct ftrace_func_probe { struct rcu_head rcu; }; - enum { FTRACE_ENABLE_CALLS = (1 << 0), FTRACE_DISABLE_CALLS = (1 << 1), @@ -309,6 +309,307 @@ static struct dyn_ftrace *ftrace_free_records; } \ } +#ifdef CONFIG_FUNCTION_PROFILER +static struct hlist_head *ftrace_profile_hash; +static int ftrace_profile_bits; +static int ftrace_profile_enabled; +static DEFINE_MUTEX(ftrace_profile_lock); + +static void * +function_stat_next(void *v, int idx) +{ + struct dyn_ftrace *rec = v; + struct ftrace_page *pg; + + pg = (struct ftrace_page *)((unsigned long)rec & PAGE_MASK); + + again: + rec++; + if ((void *)rec >= (void *)&pg->records[pg->index]) { + pg = pg->next; + if (!pg) + return NULL; + rec = &pg->records[0]; + } + + if (rec->flags & FTRACE_FL_FREE || + rec->flags & FTRACE_FL_FAILED || + !(rec->flags & FTRACE_FL_CONVERTED) || + /* ignore non hit functions */ + !rec->counter) + goto again; + + return rec; +} + +static void *function_stat_start(struct tracer_stat *trace) +{ + return function_stat_next(&ftrace_pages_start->records[0], 0); +} + +static int function_stat_cmp(void *p1, void *p2) +{ + struct dyn_ftrace *a = p1; + struct dyn_ftrace *b = p2; + + if (a->counter < b->counter) + return -1; + if (a->counter > b->counter) + return 1; + else + return 0; +} + +static int function_stat_headers(struct seq_file *m) +{ + seq_printf(m, " Function Hit\n" + " -------- ---\n"); + return 0; 
+} + +static int function_stat_show(struct seq_file *m, void *v) +{ + struct dyn_ftrace *rec = v; + char str[KSYM_SYMBOL_LEN]; + + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + + seq_printf(m, " %-30.30s %10lu\n", str, rec->counter); + return 0; +} + +static struct tracer_stat function_stats = { + .name = "functions", + .stat_start = function_stat_start, + .stat_next = function_stat_next, + .stat_cmp = function_stat_cmp, + .stat_headers = function_stat_headers, + .stat_show = function_stat_show +}; + +static void ftrace_profile_init(int nr_funcs) +{ + unsigned long addr; + int order; + int size; + + /* + * We are profiling all functions, lets make it 1/4th of the + * number of functions that are in core kernel. So we have to + * iterate 4 times. + */ + order = (sizeof(struct hlist_head) * nr_funcs) / 4; + order = get_order(order); + size = 1 << (PAGE_SHIFT + order); + + pr_info("Allocating %d KB for profiler hash\n", size >> 10); + + addr = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order); + if (!addr) { + pr_warning("Could not allocate function profiler hash\n"); + return; + } + + ftrace_profile_hash = (void *)addr; + + /* + * struct hlist_head should be a pointer of 4 or 8 bytes. + * And a simple bit manipulation can be done, but if for + * some reason struct hlist_head is not a mulitple of 2, + * then we play it safe, and simply count. This function + * is done once at boot up, so it is not that critical in + * performance. + */ + + size--; + size /= sizeof(struct hlist_head); + + for (; size; size >>= 1) + ftrace_profile_bits++; + + pr_info("Function profiler has %d hash buckets\n", + 1 << ftrace_profile_bits); + + return; +} + +static ssize_t +ftrace_profile_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + int r; + + r = sprintf(buf, "%u\n", ftrace_profile_enabled); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static void ftrace_profile_reset(void) +{ + struct dyn_ftrace *rec; + struct ftrace_page *pg; + + do_for_each_ftrace_rec(pg, rec) { + rec->counter = 0; + } while_for_each_ftrace_rec(); +} + +static struct dyn_ftrace *ftrace_find_profiled_func(unsigned long ip) +{ + struct dyn_ftrace *rec; + struct hlist_head *hhd; + struct hlist_node *n; + unsigned long flags; + unsigned long key; + + if (!ftrace_profile_hash) + return NULL; + + key = hash_long(ip, ftrace_profile_bits); + hhd = &ftrace_profile_hash[key]; + + if (hlist_empty(hhd)) + return NULL; + + local_irq_save(flags); + hlist_for_each_entry_rcu(rec, n, hhd, node) { + if (rec->ip == ip) + goto out; + } + rec = NULL; + out: + local_irq_restore(flags); + + return rec; +} + +static void +function_profile_call(unsigned long ip, unsigned long parent_ip) +{ + struct dyn_ftrace *rec; + unsigned long flags; + + if (!ftrace_profile_enabled) + return; + + local_irq_save(flags); + rec = ftrace_find_profiled_func(ip); + if (!rec) + goto out; + + rec->counter++; + out: + local_irq_restore(flags); +} + +static struct ftrace_ops ftrace_profile_ops __read_mostly = +{ + .func = function_profile_call, +}; + +static ssize_t +ftrace_profile_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned long val; + char buf[64]; + int ret; + + if (!ftrace_profile_hash) { + pr_info("Can not enable hash due to earlier problems\n"); + return -ENODEV; + } + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + val = 
!!val; + + mutex_lock(&ftrace_profile_lock); + if (ftrace_profile_enabled ^ val) { + if (val) { + ftrace_profile_reset(); + register_ftrace_function(&ftrace_profile_ops); + ftrace_profile_enabled = 1; + } else { + ftrace_profile_enabled = 0; + unregister_ftrace_function(&ftrace_profile_ops); + } + } + mutex_unlock(&ftrace_profile_lock); + + filp->f_pos += cnt; + + return cnt; +} + +static const struct file_operations ftrace_profile_fops = { + .open = tracing_open_generic, + .read = ftrace_profile_read, + .write = ftrace_profile_write, +}; + +static void ftrace_profile_debugfs(struct dentry *d_tracer) +{ + struct dentry *entry; + int ret; + + ret = register_stat_tracer(&function_stats); + if (ret) { + pr_warning("Warning: could not register " + "function stats\n"); + return; + } + + entry = debugfs_create_file("function_profile_enabled", 0644, + d_tracer, NULL, &ftrace_profile_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'function_profile_enabled' entry\n"); +} + +static void ftrace_add_profile(struct dyn_ftrace *rec) +{ + unsigned long key; + + if (!ftrace_profile_hash) + return; + + key = hash_long(rec->ip, ftrace_profile_bits); + hlist_add_head_rcu(&rec->node, &ftrace_profile_hash[key]); +} + +static void ftrace_profile_release(struct dyn_ftrace *rec) +{ + mutex_lock(&ftrace_profile_lock); + hlist_del(&rec->node); + mutex_unlock(&ftrace_profile_lock); +} + +#else /* CONFIG_FUNCTION_PROFILER */ +static void ftrace_profile_init(int nr_funcs) +{ +} +static void ftrace_add_profile(struct dyn_ftrace *rec) +{ +} +static void ftrace_profile_debugfs(struct dentry *d_tracer) +{ +} +static void ftrace_profile_release(struct dyn_ftrace *rec) +{ +} +#endif /* CONFIG_FUNCTION_PROFILER */ + #ifdef CONFIG_KPROBES static int frozen_record_count; @@ -359,8 +660,10 @@ void ftrace_release(void *start, unsigned long size) mutex_lock(&ftrace_lock); do_for_each_ftrace_rec(pg, rec) { if ((rec->ip >= s) && (rec->ip < e) && - !(rec->flags & FTRACE_FL_FREE)) + !(rec->flags & FTRACE_FL_FREE)) { ftrace_free_rec(rec); + ftrace_profile_release(rec); + } } while_for_each_ftrace_rec(); mutex_unlock(&ftrace_lock); } @@ -414,6 +717,8 @@ ftrace_record_ip(unsigned long ip) rec->newlist = ftrace_new_addrs; ftrace_new_addrs = rec; + ftrace_add_profile(rec); + return rec; } @@ -2157,6 +2462,8 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) "'set_graph_function' entry\n"); #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + ftrace_profile_debugfs(d_tracer); + return 0; } @@ -2225,6 +2532,8 @@ void __init ftrace_init(void) if (ret) goto failed; + ftrace_profile_init(count); + last_ftrace_enabled = ftrace_enabled = 1; ret = ftrace_convert_nops(NULL, -- cgit v1.2.2 From 493762fc534c71d11d489f872c4b4a2c61173668 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 23 Mar 2009 17:12:36 -0400 Subject: tracing: move function profiler data out of function struct Impact: reduce size of memory in function profiler The function profiler originally introduces its counters into the function records itself. There is 20 thousand different functions on a normal system, and that is adding 20 thousand counters for profiling event when not needed. A normal run of the profiler yields only a couple of thousand functions executed, depending on what is being profiled. This means we have around 18 thousand useless counters. This patch rectifies this by moving the data out of the function records used by dynamic ftrace. Data is preallocated to hold the functions when the profiling begins. 
Checks are made during profiling to see if more recorcds should be allocated, and they are allocated if it is safe to do so. This also removes the dependency from using dynamic ftrace, and also removes the overhead by having it enabled. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 4 - kernel/trace/Kconfig | 10 +- kernel/trace/ftrace.c | 440 +++++++++++++++++++++++++++++-------------------- 3 files changed, 263 insertions(+), 191 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 0456c3a51c66..015a3d22cf74 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -153,10 +153,6 @@ struct dyn_ftrace { unsigned long flags; struct dyn_ftrace *newlist; }; -#ifdef CONFIG_FUNCTION_PROFILER - unsigned long counter; - struct hlist_node node; -#endif struct dyn_arch_ftrace arch; }; diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 95e9ad5735d9..8a4136096d7d 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -379,20 +379,16 @@ config DYNAMIC_FTRACE config FUNCTION_PROFILER bool "Kernel function profiler" - depends on DYNAMIC_FTRACE + depends on FUNCTION_TRACER default n help - This option enables the kernel function profiler. When the dynamic - function tracing is enabled, a counter is added into the function - records used by the dynamic function tracer. A file is created in - debugfs called function_profile_enabled which defaults to zero. + This option enables the kernel function profiler. A file is created + in debugfs called function_profile_enabled which defaults to zero. When a 1 is echoed into this file profiling begins, and when a zero is entered, profiling stops. A file in the trace_stats directory called functions, that show the list of functions that have been hit and their counters. - This takes up around 320K more memory. 
- If in doubt, say N config FTRACE_MCOUNT_RECORD diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 11f364c776d5..24dac448cdc9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -241,87 +241,48 @@ static void ftrace_update_pid_func(void) #endif } -/* set when tracing only a pid */ -struct pid *ftrace_pid_trace; -static struct pid * const ftrace_swapper_pid = &init_struct_pid; - -#ifdef CONFIG_DYNAMIC_FTRACE - -#ifndef CONFIG_FTRACE_MCOUNT_RECORD -# error Dynamic ftrace depends on MCOUNT_RECORD -#endif - -static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly; - -struct ftrace_func_probe { - struct hlist_node node; - struct ftrace_probe_ops *ops; - unsigned long flags; - unsigned long ip; - void *data; - struct rcu_head rcu; +#ifdef CONFIG_FUNCTION_PROFILER +struct ftrace_profile { + struct hlist_node node; + unsigned long ip; + unsigned long counter; }; -enum { - FTRACE_ENABLE_CALLS = (1 << 0), - FTRACE_DISABLE_CALLS = (1 << 1), - FTRACE_UPDATE_TRACE_FUNC = (1 << 2), - FTRACE_ENABLE_MCOUNT = (1 << 3), - FTRACE_DISABLE_MCOUNT = (1 << 4), - FTRACE_START_FUNC_RET = (1 << 5), - FTRACE_STOP_FUNC_RET = (1 << 6), +struct ftrace_profile_page { + struct ftrace_profile_page *next; + unsigned long index; + struct ftrace_profile records[]; }; -static int ftrace_filtered; +#define PROFILE_RECORDS_SIZE \ + (PAGE_SIZE - offsetof(struct ftrace_profile_page, records)) -static struct dyn_ftrace *ftrace_new_addrs; +#define PROFILES_PER_PAGE \ + (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile)) -static DEFINE_MUTEX(ftrace_regex_lock); - -struct ftrace_page { - struct ftrace_page *next; - int index; - struct dyn_ftrace records[]; -}; +/* TODO: make these percpu, to prevent cache line bouncing */ +static struct ftrace_profile_page *profile_pages_start; +static struct ftrace_profile_page *profile_pages; -#define ENTRIES_PER_PAGE \ - ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace)) - -/* estimate from running different kernels */ -#define NR_TO_INIT 10000 - -static struct ftrace_page *ftrace_pages_start; -static struct ftrace_page *ftrace_pages; - -static struct dyn_ftrace *ftrace_free_records; - -/* - * This is a double for. Do not use 'break' to break out of the loop, - * you must use a goto. 
- */ -#define do_for_each_ftrace_rec(pg, rec) \ - for (pg = ftrace_pages_start; pg; pg = pg->next) { \ - int _____i; \ - for (_____i = 0; _____i < pg->index; _____i++) { \ - rec = &pg->records[_____i]; - -#define while_for_each_ftrace_rec() \ - } \ - } - -#ifdef CONFIG_FUNCTION_PROFILER static struct hlist_head *ftrace_profile_hash; static int ftrace_profile_bits; static int ftrace_profile_enabled; static DEFINE_MUTEX(ftrace_profile_lock); +static DEFINE_PER_CPU(atomic_t, ftrace_profile_disable); + +#define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */ + +static raw_spinlock_t ftrace_profile_rec_lock = + (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + static void * function_stat_next(void *v, int idx) { - struct dyn_ftrace *rec = v; - struct ftrace_page *pg; + struct ftrace_profile *rec = v; + struct ftrace_profile_page *pg; - pg = (struct ftrace_page *)((unsigned long)rec & PAGE_MASK); + pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK); again: rec++; @@ -330,27 +291,22 @@ function_stat_next(void *v, int idx) if (!pg) return NULL; rec = &pg->records[0]; + if (!rec->counter) + goto again; } - if (rec->flags & FTRACE_FL_FREE || - rec->flags & FTRACE_FL_FAILED || - !(rec->flags & FTRACE_FL_CONVERTED) || - /* ignore non hit functions */ - !rec->counter) - goto again; - return rec; } static void *function_stat_start(struct tracer_stat *trace) { - return function_stat_next(&ftrace_pages_start->records[0], 0); + return function_stat_next(&profile_pages_start->records[0], 0); } static int function_stat_cmp(void *p1, void *p2) { - struct dyn_ftrace *a = p1; - struct dyn_ftrace *b = p2; + struct ftrace_profile *a = p1; + struct ftrace_profile *b = p2; if (a->counter < b->counter) return -1; @@ -369,7 +325,7 @@ static int function_stat_headers(struct seq_file *m) static int function_stat_show(struct seq_file *m, void *v) { - struct dyn_ftrace *rec = v; + struct ftrace_profile *rec = v; char str[KSYM_SYMBOL_LEN]; kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); @@ -387,115 +343,191 @@ static struct tracer_stat function_stats = { .stat_show = function_stat_show }; -static void ftrace_profile_init(int nr_funcs) +static void ftrace_profile_reset(void) { - unsigned long addr; - int order; - int size; + struct ftrace_profile_page *pg; - /* - * We are profiling all functions, lets make it 1/4th of the - * number of functions that are in core kernel. So we have to - * iterate 4 times. - */ - order = (sizeof(struct hlist_head) * nr_funcs) / 4; - order = get_order(order); - size = 1 << (PAGE_SHIFT + order); - - pr_info("Allocating %d KB for profiler hash\n", size >> 10); + pg = profile_pages = profile_pages_start; - addr = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order); - if (!addr) { - pr_warning("Could not allocate function profiler hash\n"); - return; + while (pg) { + memset(pg->records, 0, PROFILE_RECORDS_SIZE); + pg->index = 0; + pg = pg->next; } - ftrace_profile_hash = (void *)addr; + memset(ftrace_profile_hash, 0, + FTRACE_PROFILE_HASH_SIZE * sizeof(struct hlist_head)); +} - /* - * struct hlist_head should be a pointer of 4 or 8 bytes. - * And a simple bit manipulation can be done, but if for - * some reason struct hlist_head is not a mulitple of 2, - * then we play it safe, and simply count. This function - * is done once at boot up, so it is not that critical in - * performance. 
- */ +int ftrace_profile_pages_init(void) +{ + struct ftrace_profile_page *pg; + int i; - size--; - size /= sizeof(struct hlist_head); + /* If we already allocated, do nothing */ + if (profile_pages) + return 0; - for (; size; size >>= 1) - ftrace_profile_bits++; + profile_pages = (void *)get_zeroed_page(GFP_KERNEL); + if (!profile_pages) + return -ENOMEM; - pr_info("Function profiler has %d hash buckets\n", - 1 << ftrace_profile_bits); + pg = profile_pages_start = profile_pages; - return; + /* allocate 10 more pages to start */ + for (i = 0; i < 10; i++) { + pg->next = (void *)get_zeroed_page(GFP_KERNEL); + /* + * We only care about allocating profile_pages, if + * we failed to allocate here, hopefully we will allocate + * later. + */ + if (!pg->next) + break; + pg = pg->next; + } + + return 0; } -static ssize_t -ftrace_profile_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) +static int ftrace_profile_init(void) { - char buf[64]; - int r; + int size; - r = sprintf(buf, "%u\n", ftrace_profile_enabled); - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} + if (ftrace_profile_hash) { + /* If the profile is already created, simply reset it */ + ftrace_profile_reset(); + return 0; + } -static void ftrace_profile_reset(void) -{ - struct dyn_ftrace *rec; - struct ftrace_page *pg; + /* + * We are profiling all functions, but usually only a few thousand + * functions are hit. We'll make a hash of 1024 items. + */ + size = FTRACE_PROFILE_HASH_SIZE; - do_for_each_ftrace_rec(pg, rec) { - rec->counter = 0; - } while_for_each_ftrace_rec(); + ftrace_profile_hash = + kzalloc(sizeof(struct hlist_head) * size, GFP_KERNEL); + + if (!ftrace_profile_hash) + return -ENOMEM; + + size--; + + for (; size; size >>= 1) + ftrace_profile_bits++; + + /* Preallocate a few pages */ + if (ftrace_profile_pages_init() < 0) { + kfree(ftrace_profile_hash); + ftrace_profile_hash = NULL; + return -ENOMEM; + } + + return 0; } -static struct dyn_ftrace *ftrace_find_profiled_func(unsigned long ip) +/* interrupts must be disabled */ +static struct ftrace_profile *ftrace_find_profiled_func(unsigned long ip) { - struct dyn_ftrace *rec; + struct ftrace_profile *rec; struct hlist_head *hhd; struct hlist_node *n; - unsigned long flags; unsigned long key; - if (!ftrace_profile_hash) - return NULL; - key = hash_long(ip, ftrace_profile_bits); hhd = &ftrace_profile_hash[key]; if (hlist_empty(hhd)) return NULL; - local_irq_save(flags); hlist_for_each_entry_rcu(rec, n, hhd, node) { if (rec->ip == ip) - goto out; + return rec; + } + + return NULL; +} + +static void ftrace_add_profile(struct ftrace_profile *rec) +{ + unsigned long key; + + key = hash_long(rec->ip, ftrace_profile_bits); + hlist_add_head_rcu(&rec->node, &ftrace_profile_hash[key]); +} + +/* Interrupts must be disabled calling this */ +static struct ftrace_profile * +ftrace_profile_alloc(unsigned long ip, bool alloc_safe) +{ + struct ftrace_profile *rec = NULL; + + /* prevent recursion */ + if (atomic_inc_return(&__get_cpu_var(ftrace_profile_disable)) != 1) + goto out; + + __raw_spin_lock(&ftrace_profile_rec_lock); + + /* Try to always keep another page available */ + if (!profile_pages->next && alloc_safe) + profile_pages->next = (void *)get_zeroed_page(GFP_ATOMIC); + + /* + * Try to find the function again since another + * task on another CPU could have added it + */ + rec = ftrace_find_profiled_func(ip); + if (rec) + goto out_unlock; + + if (profile_pages->index == PROFILES_PER_PAGE) { + if (!profile_pages->next) + goto out_unlock; + 
profile_pages = profile_pages->next; } - rec = NULL; + + rec = &profile_pages->records[profile_pages->index++]; + rec->ip = ip; + ftrace_add_profile(rec); + + out_unlock: + __raw_spin_unlock(&ftrace_profile_rec_lock); out: - local_irq_restore(flags); + atomic_dec(&__get_cpu_var(ftrace_profile_disable)); return rec; } +/* + * If we are not in an interrupt, or softirq and + * and interrupts are disabled and preemption is not enabled + * (not in a spinlock) then it should be safe to allocate memory. + */ +static bool ftrace_safe_to_allocate(void) +{ + return !in_interrupt() && irqs_disabled() && !preempt_count(); +} + static void function_profile_call(unsigned long ip, unsigned long parent_ip) { - struct dyn_ftrace *rec; + struct ftrace_profile *rec; unsigned long flags; + bool alloc_safe; if (!ftrace_profile_enabled) return; + alloc_safe = ftrace_safe_to_allocate(); + local_irq_save(flags); rec = ftrace_find_profiled_func(ip); - if (!rec) - goto out; + if (!rec) { + rec = ftrace_profile_alloc(ip, alloc_safe); + if (!rec) + goto out; + } rec->counter++; out: @@ -515,11 +547,6 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, char buf[64]; int ret; - if (!ftrace_profile_hash) { - pr_info("Can not enable hash due to earlier problems\n"); - return -ENODEV; - } - if (cnt >= sizeof(buf)) return -EINVAL; @@ -537,7 +564,12 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, mutex_lock(&ftrace_profile_lock); if (ftrace_profile_enabled ^ val) { if (val) { - ftrace_profile_reset(); + ret = ftrace_profile_init(); + if (ret < 0) { + cnt = ret; + goto out; + } + register_ftrace_function(&ftrace_profile_ops); ftrace_profile_enabled = 1; } else { @@ -545,6 +577,7 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, unregister_ftrace_function(&ftrace_profile_ops); } } + out: mutex_unlock(&ftrace_profile_lock); filp->f_pos += cnt; @@ -552,6 +585,17 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, return cnt; } +static ssize_t +ftrace_profile_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + int r; + + r = sprintf(buf, "%u\n", ftrace_profile_enabled); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + static const struct file_operations ftrace_profile_fops = { .open = tracing_open_generic, .read = ftrace_profile_read, @@ -577,39 +621,80 @@ static void ftrace_profile_debugfs(struct dentry *d_tracer) "'function_profile_enabled' entry\n"); } -static void ftrace_add_profile(struct dyn_ftrace *rec) -{ - unsigned long key; - - if (!ftrace_profile_hash) - return; - - key = hash_long(rec->ip, ftrace_profile_bits); - hlist_add_head_rcu(&rec->node, &ftrace_profile_hash[key]); -} - -static void ftrace_profile_release(struct dyn_ftrace *rec) -{ - mutex_lock(&ftrace_profile_lock); - hlist_del(&rec->node); - mutex_unlock(&ftrace_profile_lock); -} - #else /* CONFIG_FUNCTION_PROFILER */ -static void ftrace_profile_init(int nr_funcs) -{ -} -static void ftrace_add_profile(struct dyn_ftrace *rec) -{ -} static void ftrace_profile_debugfs(struct dentry *d_tracer) { } -static void ftrace_profile_release(struct dyn_ftrace *rec) -{ -} #endif /* CONFIG_FUNCTION_PROFILER */ +/* set when tracing only a pid */ +struct pid *ftrace_pid_trace; +static struct pid * const ftrace_swapper_pid = &init_struct_pid; + +#ifdef CONFIG_DYNAMIC_FTRACE + +#ifndef CONFIG_FTRACE_MCOUNT_RECORD +# error Dynamic ftrace depends on MCOUNT_RECORD +#endif + +static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly; + 
+struct ftrace_func_probe { + struct hlist_node node; + struct ftrace_probe_ops *ops; + unsigned long flags; + unsigned long ip; + void *data; + struct rcu_head rcu; +}; + +enum { + FTRACE_ENABLE_CALLS = (1 << 0), + FTRACE_DISABLE_CALLS = (1 << 1), + FTRACE_UPDATE_TRACE_FUNC = (1 << 2), + FTRACE_ENABLE_MCOUNT = (1 << 3), + FTRACE_DISABLE_MCOUNT = (1 << 4), + FTRACE_START_FUNC_RET = (1 << 5), + FTRACE_STOP_FUNC_RET = (1 << 6), +}; + +static int ftrace_filtered; + +static struct dyn_ftrace *ftrace_new_addrs; + +static DEFINE_MUTEX(ftrace_regex_lock); + +struct ftrace_page { + struct ftrace_page *next; + int index; + struct dyn_ftrace records[]; +}; + +#define ENTRIES_PER_PAGE \ + ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace)) + +/* estimate from running different kernels */ +#define NR_TO_INIT 10000 + +static struct ftrace_page *ftrace_pages_start; +static struct ftrace_page *ftrace_pages; + +static struct dyn_ftrace *ftrace_free_records; + +/* + * This is a double for. Do not use 'break' to break out of the loop, + * you must use a goto. + */ +#define do_for_each_ftrace_rec(pg, rec) \ + for (pg = ftrace_pages_start; pg; pg = pg->next) { \ + int _____i; \ + for (_____i = 0; _____i < pg->index; _____i++) { \ + rec = &pg->records[_____i]; + +#define while_for_each_ftrace_rec() \ + } \ + } + #ifdef CONFIG_KPROBES static int frozen_record_count; @@ -660,10 +745,8 @@ void ftrace_release(void *start, unsigned long size) mutex_lock(&ftrace_lock); do_for_each_ftrace_rec(pg, rec) { if ((rec->ip >= s) && (rec->ip < e) && - !(rec->flags & FTRACE_FL_FREE)) { + !(rec->flags & FTRACE_FL_FREE)) ftrace_free_rec(rec); - ftrace_profile_release(rec); - } } while_for_each_ftrace_rec(); mutex_unlock(&ftrace_lock); } @@ -717,8 +800,6 @@ ftrace_record_ip(unsigned long ip) rec->newlist = ftrace_new_addrs; ftrace_new_addrs = rec; - ftrace_add_profile(rec); - return rec; } @@ -2462,8 +2543,6 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) "'set_graph_function' entry\n"); #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ - ftrace_profile_debugfs(d_tracer); - return 0; } @@ -2532,8 +2611,6 @@ void __init ftrace_init(void) if (ret) goto failed; - ftrace_profile_init(count); - last_ftrace_enabled = ftrace_enabled = 1; ret = ftrace_convert_nops(NULL, @@ -2734,6 +2811,9 @@ static __init int ftrace_init_debugfs(void) if (!entry) pr_warning("Could not create debugfs " "'set_ftrace_pid' entry\n"); + + ftrace_profile_debugfs(d_tracer); + return 0; } fs_initcall(ftrace_init_debugfs); -- cgit v1.2.2 From 0706f1c48ca8a7ab478090b4e38f2e578ae2bfe0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 23 Mar 2009 23:12:58 -0400 Subject: tracing: adding function timings to function profiler If the function graph trace is enabled, the function profiler will use it to take the timing of the functions. cat /debug/tracing/trace_stat/functions Function Hit Time -------- --- ---- mwait_idle 127 183028.4 us schedule 26 151997.7 us __schedule 31 151975.1 us sys_wait4 2 74080.53 us do_wait 2 74077.80 us sys_newlstat 138 39929.16 us do_path_lookup 179 39845.79 us vfs_lstat_fd 138 39761.97 us user_path_at 153 39469.58 us path_walk 179 39435.76 us __link_path_walk 189 39143.73 us [...] Note the times are skewed due to the function graph tracer not taking into account schedules. 
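As a rough usage sketch (not part of this patch; it assumes debugfs is mounted at /debug, as in the listing above), the profiler is driven entirely through debugfs:

    # start profiling, run a workload, then read the per-function timings
    echo 1 > /debug/tracing/function_profile_enabled
    cat /debug/tracing/trace_stat/functions
    echo 0 > /debug/tracing/function_profile_enabled    # stop profiling

A later patch in this series splits the stat file per cpu (function0, function1, ...), so the exact file name under trace_stat/ depends on where you are in the series.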
Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 93 ++++++++++++++++++++++++++++++++++-- kernel/trace/trace.c | 11 ----- kernel/trace/trace.h | 3 +- kernel/trace/trace_functions_graph.c | 17 +++++-- kernel/trace/trace_output.c | 10 ++++ kernel/trace/trace_output.h | 2 + 6 files changed, 117 insertions(+), 19 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 24dac448cdc9..a9ccd71fc922 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -33,7 +33,7 @@ #include -#include "trace.h" +#include "trace_output.h" #include "trace_stat.h" #define FTRACE_WARN_ON(cond) \ @@ -246,6 +246,9 @@ struct ftrace_profile { struct hlist_node node; unsigned long ip; unsigned long counter; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + unsigned long long time; +#endif }; struct ftrace_profile_page { @@ -303,6 +306,22 @@ static void *function_stat_start(struct tracer_stat *trace) return function_stat_next(&profile_pages_start->records[0], 0); } +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +/* function graph compares on total time */ +static int function_stat_cmp(void *p1, void *p2) +{ + struct ftrace_profile *a = p1; + struct ftrace_profile *b = p2; + + if (a->time < b->time) + return -1; + if (a->time > b->time) + return 1; + else + return 0; +} +#else +/* not function graph compares against hits */ static int function_stat_cmp(void *p1, void *p2) { struct ftrace_profile *a = p1; @@ -315,11 +334,17 @@ static int function_stat_cmp(void *p1, void *p2) else return 0; } +#endif static int function_stat_headers(struct seq_file *m) { +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + seq_printf(m, " Function Hit Time\n" + " -------- --- ----\n"); +#else seq_printf(m, " Function Hit\n" " -------- ---\n"); +#endif return 0; } @@ -327,10 +352,25 @@ static int function_stat_show(struct seq_file *m, void *v) { struct ftrace_profile *rec = v; char str[KSYM_SYMBOL_LEN]; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + static struct trace_seq s; + static DEFINE_MUTEX(mutex); + + mutex_lock(&mutex); + trace_seq_init(&s); + trace_print_graph_duration(rec->time, &s); +#endif kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + seq_printf(m, " %-30.30s %10lu", str, rec->counter); + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + seq_printf(m, " "); + trace_print_seq(m, &s); + mutex_unlock(&mutex); +#endif + seq_putc(m, '\n'); - seq_printf(m, " %-30.30s %10lu\n", str, rec->counter); return 0; } @@ -534,11 +574,52 @@ function_profile_call(unsigned long ip, unsigned long parent_ip) local_irq_restore(flags); } +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static int profile_graph_entry(struct ftrace_graph_ent *trace) +{ + function_profile_call(trace->func, 0); + return 1; +} + +static void profile_graph_return(struct ftrace_graph_ret *trace) +{ + unsigned long flags; + struct ftrace_profile *rec; + + local_irq_save(flags); + rec = ftrace_find_profiled_func(trace->func); + if (rec) + rec->time += trace->rettime - trace->calltime; + local_irq_restore(flags); +} + +static int register_ftrace_profiler(void) +{ + return register_ftrace_graph(&profile_graph_return, + &profile_graph_entry); +} + +static void unregister_ftrace_profiler(void) +{ + unregister_ftrace_graph(); +} +#else static struct ftrace_ops ftrace_profile_ops __read_mostly = { .func = function_profile_call, }; +static int register_ftrace_profiler(void) +{ + return register_ftrace_function(&ftrace_profile_ops); +} + +static void unregister_ftrace_profiler(void) +{ + unregister_ftrace_function(&ftrace_profile_ops); +} +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + static ssize_t 
ftrace_profile_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) @@ -570,11 +651,15 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, goto out; } - register_ftrace_function(&ftrace_profile_ops); + ret = register_ftrace_profiler(); + if (ret < 0) { + cnt = ret; + goto out; + } ftrace_profile_enabled = 1; } else { ftrace_profile_enabled = 0; - unregister_ftrace_function(&ftrace_profile_ops); + unregister_ftrace_profiler(); } } out: diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 67c6a21dd427..821bf49771d4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -402,17 +402,6 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) return cnt; } -static void -trace_print_seq(struct seq_file *m, struct trace_seq *s) -{ - int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; - - s->buffer[len] = 0; - seq_puts(m, s->buffer); - - trace_seq_init(s); -} - /** * update_max_tr - snapshot all trace buffers from global_trace to max_tr * @tr: tracer diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d7410bbb9a80..c66ca3b66050 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -605,6 +605,8 @@ extern unsigned long trace_flags; /* Standard output formatting function used for function return traces */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER extern enum print_line_t print_graph_function(struct trace_iterator *iter); +extern enum print_line_t +trace_print_graph_duration(unsigned long long duration, struct trace_seq *s); #ifdef CONFIG_DYNAMIC_FTRACE /* TODO: make this variable */ @@ -636,7 +638,6 @@ static inline int ftrace_graph_addr(unsigned long addr) return 1; } #endif /* CONFIG_DYNAMIC_FTRACE */ - #else /* CONFIG_FUNCTION_GRAPH_TRACER */ static inline enum print_line_t print_graph_function(struct trace_iterator *iter) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index d28687e7b3a7..85bba0f018b0 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -426,8 +426,8 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, return TRACE_TYPE_HANDLED; } -static enum print_line_t -print_graph_duration(unsigned long long duration, struct trace_seq *s) +enum print_line_t +trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) { unsigned long nsecs_rem = do_div(duration, 1000); /* log10(ULONG_MAX) + '\0' */ @@ -464,12 +464,23 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s) if (!ret) return TRACE_TYPE_PARTIAL_LINE; } + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t +print_graph_duration(unsigned long long duration, struct trace_seq *s) +{ + int ret; + + ret = trace_print_graph_duration(duration, s); + if (ret != TRACE_TYPE_HANDLED) + return ret; ret = trace_seq_printf(s, "| "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; - return TRACE_TYPE_HANDLED; + return TRACE_TYPE_HANDLED; } /* Case of a leaf function on its call entry */ diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 19261fdd2455..a3b6e3fd7044 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -19,6 +19,16 @@ static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; static int next_event_type = __TRACE_LAST_TYPE + 1; +void trace_print_seq(struct seq_file *m, struct trace_seq *s) +{ + int len = s->len >= PAGE_SIZE ? 
PAGE_SIZE - 1 : s->len; + + s->buffer[len] = 0; + seq_puts(m, s->buffer); + + trace_seq_init(s); +} + enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 35c422fb51a9..1eac2973374e 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -20,6 +20,8 @@ trace_print_bprintk_msg_only(struct trace_iterator *iter); extern enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter); +extern void trace_print_seq(struct seq_file *m, struct trace_seq *s); + extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) __attribute__ ((format (printf, 2, 3))); extern int -- cgit v1.2.2 From cafb168a1c92e4c9e1731fe3d666c39611762c49 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 24 Mar 2009 20:50:39 -0400 Subject: tracing: make the function profiler per cpu Impact: speed enhancement By making the function profiler record in per cpu data we not only get better readings, avoid races, we also do not have to take any locks. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 199 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 130 insertions(+), 69 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a9ccd71fc922..ed1fc5021d44 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -257,28 +257,28 @@ struct ftrace_profile_page { struct ftrace_profile records[]; }; +struct ftrace_profile_stat { + atomic_t disabled; + struct hlist_head *hash; + struct ftrace_profile_page *pages; + struct ftrace_profile_page *start; + struct tracer_stat stat; +}; + #define PROFILE_RECORDS_SIZE \ (PAGE_SIZE - offsetof(struct ftrace_profile_page, records)) #define PROFILES_PER_PAGE \ (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile)) -/* TODO: make these percpu, to prevent cache line bouncing */ -static struct ftrace_profile_page *profile_pages_start; -static struct ftrace_profile_page *profile_pages; - -static struct hlist_head *ftrace_profile_hash; static int ftrace_profile_bits; static int ftrace_profile_enabled; static DEFINE_MUTEX(ftrace_profile_lock); -static DEFINE_PER_CPU(atomic_t, ftrace_profile_disable); +static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats); #define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */ -static raw_spinlock_t ftrace_profile_rec_lock = - (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; - static void * function_stat_next(void *v, int idx) { @@ -303,7 +303,13 @@ function_stat_next(void *v, int idx) static void *function_stat_start(struct tracer_stat *trace) { - return function_stat_next(&profile_pages_start->records[0], 0); + struct ftrace_profile_stat *stat = + container_of(trace, struct ftrace_profile_stat, stat); + + if (!stat || !stat->start) + return NULL; + + return function_stat_next(&stat->start->records[0], 0); } #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -374,20 +380,11 @@ static int function_stat_show(struct seq_file *m, void *v) return 0; } -static struct tracer_stat function_stats = { - .name = "functions", - .stat_start = function_stat_start, - .stat_next = function_stat_next, - .stat_cmp = function_stat_cmp, - .stat_headers = function_stat_headers, - .stat_show = function_stat_show -}; - -static void ftrace_profile_reset(void) +static void ftrace_profile_reset(struct ftrace_profile_stat *stat) { struct ftrace_profile_page *pg; - pg = profile_pages = profile_pages_start; + pg = stat->pages = stat->start; while (pg) 
{ memset(pg->records, 0, PROFILE_RECORDS_SIZE); @@ -395,24 +392,24 @@ static void ftrace_profile_reset(void) pg = pg->next; } - memset(ftrace_profile_hash, 0, + memset(stat->hash, 0, FTRACE_PROFILE_HASH_SIZE * sizeof(struct hlist_head)); } -int ftrace_profile_pages_init(void) +int ftrace_profile_pages_init(struct ftrace_profile_stat *stat) { struct ftrace_profile_page *pg; int i; /* If we already allocated, do nothing */ - if (profile_pages) + if (stat->pages) return 0; - profile_pages = (void *)get_zeroed_page(GFP_KERNEL); - if (!profile_pages) + stat->pages = (void *)get_zeroed_page(GFP_KERNEL); + if (!stat->pages) return -ENOMEM; - pg = profile_pages_start = profile_pages; + pg = stat->start = stat->pages; /* allocate 10 more pages to start */ for (i = 0; i < 10; i++) { @@ -430,13 +427,16 @@ int ftrace_profile_pages_init(void) return 0; } -static int ftrace_profile_init(void) +static int ftrace_profile_init_cpu(int cpu) { + struct ftrace_profile_stat *stat; int size; - if (ftrace_profile_hash) { + stat = &per_cpu(ftrace_profile_stats, cpu); + + if (stat->hash) { /* If the profile is already created, simply reset it */ - ftrace_profile_reset(); + ftrace_profile_reset(stat); return 0; } @@ -446,29 +446,45 @@ static int ftrace_profile_init(void) */ size = FTRACE_PROFILE_HASH_SIZE; - ftrace_profile_hash = - kzalloc(sizeof(struct hlist_head) * size, GFP_KERNEL); + stat->hash = kzalloc(sizeof(struct hlist_head) * size, GFP_KERNEL); - if (!ftrace_profile_hash) + if (!stat->hash) return -ENOMEM; - size--; + if (!ftrace_profile_bits) { + size--; - for (; size; size >>= 1) - ftrace_profile_bits++; + for (; size; size >>= 1) + ftrace_profile_bits++; + } /* Preallocate a few pages */ - if (ftrace_profile_pages_init() < 0) { - kfree(ftrace_profile_hash); - ftrace_profile_hash = NULL; + if (ftrace_profile_pages_init(stat) < 0) { + kfree(stat->hash); + stat->hash = NULL; return -ENOMEM; } return 0; } +static int ftrace_profile_init(void) +{ + int cpu; + int ret = 0; + + for_each_online_cpu(cpu) { + ret = ftrace_profile_init_cpu(cpu); + if (ret) + break; + } + + return ret; +} + /* interrupts must be disabled */ -static struct ftrace_profile *ftrace_find_profiled_func(unsigned long ip) +static struct ftrace_profile * +ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip) { struct ftrace_profile *rec; struct hlist_head *hhd; @@ -476,7 +492,7 @@ static struct ftrace_profile *ftrace_find_profiled_func(unsigned long ip) unsigned long key; key = hash_long(ip, ftrace_profile_bits); - hhd = &ftrace_profile_hash[key]; + hhd = &stat->hash[key]; if (hlist_empty(hhd)) return NULL; @@ -489,52 +505,50 @@ static struct ftrace_profile *ftrace_find_profiled_func(unsigned long ip) return NULL; } -static void ftrace_add_profile(struct ftrace_profile *rec) +static void ftrace_add_profile(struct ftrace_profile_stat *stat, + struct ftrace_profile *rec) { unsigned long key; key = hash_long(rec->ip, ftrace_profile_bits); - hlist_add_head_rcu(&rec->node, &ftrace_profile_hash[key]); + hlist_add_head_rcu(&rec->node, &stat->hash[key]); } /* Interrupts must be disabled calling this */ static struct ftrace_profile * -ftrace_profile_alloc(unsigned long ip, bool alloc_safe) +ftrace_profile_alloc(struct ftrace_profile_stat *stat, + unsigned long ip, bool alloc_safe) { struct ftrace_profile *rec = NULL; /* prevent recursion */ - if (atomic_inc_return(&__get_cpu_var(ftrace_profile_disable)) != 1) + if (atomic_inc_return(&stat->disabled) != 1) goto out; - __raw_spin_lock(&ftrace_profile_rec_lock); - /* Try to 
always keep another page available */ - if (!profile_pages->next && alloc_safe) - profile_pages->next = (void *)get_zeroed_page(GFP_ATOMIC); + if (!stat->pages->next && alloc_safe) + stat->pages->next = (void *)get_zeroed_page(GFP_ATOMIC); /* * Try to find the function again since another * task on another CPU could have added it */ - rec = ftrace_find_profiled_func(ip); + rec = ftrace_find_profiled_func(stat, ip); if (rec) - goto out_unlock; + goto out; - if (profile_pages->index == PROFILES_PER_PAGE) { - if (!profile_pages->next) - goto out_unlock; - profile_pages = profile_pages->next; + if (stat->pages->index == PROFILES_PER_PAGE) { + if (!stat->pages->next) + goto out; + stat->pages = stat->pages->next; } - rec = &profile_pages->records[profile_pages->index++]; + rec = &stat->pages->records[stat->pages->index++]; rec->ip = ip; - ftrace_add_profile(rec); + ftrace_add_profile(stat, rec); - out_unlock: - __raw_spin_unlock(&ftrace_profile_rec_lock); out: - atomic_dec(&__get_cpu_var(ftrace_profile_disable)); + atomic_dec(&stat->disabled); return rec; } @@ -552,6 +566,7 @@ static bool ftrace_safe_to_allocate(void) static void function_profile_call(unsigned long ip, unsigned long parent_ip) { + struct ftrace_profile_stat *stat; struct ftrace_profile *rec; unsigned long flags; bool alloc_safe; @@ -562,9 +577,14 @@ function_profile_call(unsigned long ip, unsigned long parent_ip) alloc_safe = ftrace_safe_to_allocate(); local_irq_save(flags); - rec = ftrace_find_profiled_func(ip); + + stat = &__get_cpu_var(ftrace_profile_stats); + if (!stat->hash) + goto out; + + rec = ftrace_find_profiled_func(stat, ip); if (!rec) { - rec = ftrace_profile_alloc(ip, alloc_safe); + rec = ftrace_profile_alloc(stat, ip, alloc_safe); if (!rec) goto out; } @@ -583,13 +603,19 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace) static void profile_graph_return(struct ftrace_graph_ret *trace) { - unsigned long flags; + struct ftrace_profile_stat *stat; struct ftrace_profile *rec; + unsigned long flags; local_irq_save(flags); - rec = ftrace_find_profiled_func(trace->func); + stat = &__get_cpu_var(ftrace_profile_stats); + if (!stat->hash) + goto out; + + rec = ftrace_find_profiled_func(stat, trace->func); if (rec) rec->time += trace->rettime - trace->calltime; + out: local_irq_restore(flags); } @@ -687,16 +713,51 @@ static const struct file_operations ftrace_profile_fops = { .write = ftrace_profile_write, }; +/* used to initialize the real stat files */ +static struct tracer_stat function_stats __initdata = { + .name = "functions", + .stat_start = function_stat_start, + .stat_next = function_stat_next, + .stat_cmp = function_stat_cmp, + .stat_headers = function_stat_headers, + .stat_show = function_stat_show +}; + static void ftrace_profile_debugfs(struct dentry *d_tracer) { + struct ftrace_profile_stat *stat; struct dentry *entry; + char *name; int ret; + int cpu; - ret = register_stat_tracer(&function_stats); - if (ret) { - pr_warning("Warning: could not register " - "function stats\n"); - return; + for_each_possible_cpu(cpu) { + stat = &per_cpu(ftrace_profile_stats, cpu); + + /* allocate enough for function name + cpu number */ + name = kmalloc(32, GFP_KERNEL); + if (!name) { + /* + * The files created are permanent, if something happens + * we still do not free memory. 
+ */ + kfree(stat); + WARN(1, + "Could not allocate stat file for cpu %d\n", + cpu); + return; + } + stat->stat = function_stats; + snprintf(name, 32, "function%d", cpu); + stat->stat.name = name; + ret = register_stat_tracer(&stat->stat); + if (ret) { + WARN(1, + "Could not register function stat for cpu %d\n", + cpu); + kfree(name); + return; + } } entry = debugfs_create_file("function_profile_enabled", 0644, -- cgit v1.2.2 From a2a16d6a3156ef7309ca7328a20c35df9418e670 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 24 Mar 2009 23:17:58 -0400 Subject: function-graph: add option to calculate graph time or not graph time is the time that a function is executing another function. Thus if function A calls B, if graph-time is set, then the time for A includes B. This is the default behavior. But if graph-time is off, then the time spent executing B is subtracted from A. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 3 +-- kernel/trace/ftrace.c | 21 ++++++++++++++++++++- kernel/trace/trace.c | 4 +++- kernel/trace/trace.h | 1 + kernel/trace/trace_functions_graph.c | 8 ++++---- 5 files changed, 29 insertions(+), 8 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 015a3d22cf74..9e0a8d245e55 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -365,6 +365,7 @@ struct ftrace_ret_stack { unsigned long ret; unsigned long func; unsigned long long calltime; + unsigned long long subtime; }; /* @@ -376,8 +377,6 @@ extern void return_to_handler(void); extern int ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth); -extern void -ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret); /* * Sometimes we don't want to trace a function with the function diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ed1fc5021d44..71e5faef12ab 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -604,6 +604,7 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace) static void profile_graph_return(struct ftrace_graph_ret *trace) { struct ftrace_profile_stat *stat; + unsigned long long calltime; struct ftrace_profile *rec; unsigned long flags; @@ -612,9 +613,27 @@ static void profile_graph_return(struct ftrace_graph_ret *trace) if (!stat->hash) goto out; + calltime = trace->rettime - trace->calltime; + + if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) { + int index; + + index = trace->depth; + + /* Append this call time to the parent time to subtract */ + if (index) + current->ret_stack[index - 1].subtime += calltime; + + if (current->ret_stack[index].subtime < calltime) + calltime -= current->ret_stack[index].subtime; + else + calltime = 0; + } + rec = ftrace_find_profiled_func(stat, trace->func); if (rec) - rec->time += trace->rettime - trace->calltime; + rec->time += calltime; + out: local_irq_restore(flags); } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 821bf49771d4..5d1a16cae376 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -255,7 +255,8 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); /* trace_flags holds trace_options default values */ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | - TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME; + TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | + TRACE_ITER_GRAPH_TIME; /** * trace_wake_up - wake up tasks waiting for trace input @@ -317,6 +318,7 @@ static const char *trace_options[] = { "latency-format", "global-clock", "sleep-time", + 
"graph-time", NULL }; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c66ca3b66050..e3429a8ab059 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -685,6 +685,7 @@ enum trace_iterator_flags { TRACE_ITER_LATENCY_FMT = 0x40000, TRACE_ITER_GLOBAL_CLK = 0x80000, TRACE_ITER_SLEEP_TIME = 0x100000, + TRACE_ITER_GRAPH_TIME = 0x200000, }; /* diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 85bba0f018b0..10f6ad7d85f6 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -78,13 +78,14 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth) current->ret_stack[index].ret = ret; current->ret_stack[index].func = func; current->ret_stack[index].calltime = calltime; + current->ret_stack[index].subtime = 0; *depth = index; return 0; } /* Retrieve a function return address to the trace stack on thread info.*/ -void +static void ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) { int index; @@ -104,9 +105,6 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) trace->calltime = current->ret_stack[index].calltime; trace->overrun = atomic_read(¤t->trace_overrun); trace->depth = index; - barrier(); - current->curr_ret_stack--; - } /* @@ -121,6 +119,8 @@ unsigned long ftrace_return_to_handler(void) ftrace_pop_return_trace(&trace, &ret); trace.rettime = trace_clock_local(); ftrace_graph_return(&trace); + barrier(); + current->curr_ret_stack--; if (unlikely(!ret)) { ftrace_graph_stop(); -- cgit v1.2.2 From fb9fb015e92123fa3a8e0c2e2fff491d4a56b470 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 25 Mar 2009 13:26:41 -0400 Subject: tracing: clean up tracing profiler Ingo Molnar suggested clean ups for the profiling code. This patch makes those updates. 
Reported-by: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 71e5faef12ab..a141d8499ab0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -69,7 +69,7 @@ static DEFINE_MUTEX(ftrace_lock); static struct ftrace_ops ftrace_list_end __read_mostly = { - .func = ftrace_stub, + .func = ftrace_stub, }; static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; @@ -271,8 +271,10 @@ struct ftrace_profile_stat { #define PROFILES_PER_PAGE \ (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile)) -static int ftrace_profile_bits; -static int ftrace_profile_enabled; +static int ftrace_profile_bits __read_mostly; +static int ftrace_profile_enabled __read_mostly; + +/* ftrace_profile_lock - synchronize the enable and disable of the profiler */ static DEFINE_MUTEX(ftrace_profile_lock); static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats); @@ -651,7 +653,7 @@ static void unregister_ftrace_profiler(void) #else static struct ftrace_ops ftrace_profile_ops __read_mostly = { - .func = function_profile_call, + .func = function_profile_call, }; static int register_ftrace_profiler(void) @@ -670,7 +672,7 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { unsigned long val; - char buf[64]; + char buf[64]; /* big enough to hold a number */ int ret; if (cnt >= sizeof(buf)) @@ -719,7 +721,7 @@ static ssize_t ftrace_profile_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - char buf[64]; + char buf[64]; /* big enough to hold a number */ int r; r = sprintf(buf, "%u\n", ftrace_profile_enabled); @@ -734,12 +736,12 @@ static const struct file_operations ftrace_profile_fops = { /* used to initialize the real stat files */ static struct tracer_stat function_stats __initdata = { - .name = "functions", - .stat_start = function_stat_start, - .stat_next = function_stat_next, - .stat_cmp = function_stat_cmp, - .stat_headers = function_stat_headers, - .stat_show = function_stat_show + .name = "functions", + .stat_start = function_stat_start, + .stat_next = function_stat_next, + .stat_cmp = function_stat_cmp, + .stat_headers = function_stat_headers, + .stat_show = function_stat_show }; static void ftrace_profile_debugfs(struct dentry *d_tracer) @@ -1954,7 +1956,7 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip) static struct ftrace_ops trace_probe_ops __read_mostly = { - .func = function_trace_probe_call, + .func = function_trace_probe_call, }; static int ftrace_probe_registered; -- cgit v1.2.2 From 318e0a73c9e41b9a17241829bcd0605a39b87cb9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 25 Mar 2009 20:06:34 -0400 Subject: tracing: remove on the fly allocator from function profiler Impact: safer code The on the fly allocator for the function profiler was to save memory. But at the expense of stability. Although it survived several tests, allocating from the function tracer is just too risky, just to save space. This patch removes the allocator and simply allocates enough entries at start up. Each function gets a profiling structure of 40 bytes. With an average of 20K functions, and this is for each CPU, we have 800K per online CPU. This is not too bad, at least for non-embedded. 
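A back-of-the-envelope check of those numbers, assuming 4 KB pages and the ~40-byte struct ftrace_profile described above (roughly 102 records fit in a page after the small page header, so the exact figures depend on the configuration):

    # ~40 bytes per record, ~20000 functions, per online CPU
    echo $(( 20000 * 40 / 1024 ))      # ~781 KB of profile records
    echo $(( (20000 + 101) / 102 ))    # ~197 preallocated pages per CPU

The patch below computes the page count the same way, with DIV_ROUND_UP(functions, PROFILES_PER_PAGE).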
Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 76 +++++++++++++++++++++++++++++---------------------- 1 file changed, 43 insertions(+), 33 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a141d8499ab0..4d90c916b2bb 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -401,6 +401,8 @@ static void ftrace_profile_reset(struct ftrace_profile_stat *stat) int ftrace_profile_pages_init(struct ftrace_profile_stat *stat) { struct ftrace_profile_page *pg; + int functions; + int pages; int i; /* If we already allocated, do nothing */ @@ -411,22 +413,46 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat) if (!stat->pages) return -ENOMEM; +#ifdef CONFIG_DYNAMIC_FTRACE + functions = ftrace_update_tot_cnt; +#else + /* + * We do not know the number of functions that exist because + * dynamic tracing is what counts them. With past experience + * we have around 20K functions. That should be more than enough. + * It is highly unlikely we will execute every function in + * the kernel. + */ + functions = 20000; +#endif + pg = stat->start = stat->pages; - /* allocate 10 more pages to start */ - for (i = 0; i < 10; i++) { + pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE); + + for (i = 0; i < pages; i++) { pg->next = (void *)get_zeroed_page(GFP_KERNEL); - /* - * We only care about allocating profile_pages, if - * we failed to allocate here, hopefully we will allocate - * later. - */ if (!pg->next) - break; + goto out_free; pg = pg->next; } return 0; + + out_free: + pg = stat->start; + while (pg) { + unsigned long tmp = (unsigned long)pg; + + pg = pg->next; + free_page(tmp); + } + + free_page((unsigned long)stat->pages); + stat->pages = NULL; + stat->start = NULL; + + return -ENOMEM; } static int ftrace_profile_init_cpu(int cpu) @@ -460,7 +486,7 @@ static int ftrace_profile_init_cpu(int cpu) ftrace_profile_bits++; } - /* Preallocate a few pages */ + /* Preallocate the function profiling pages */ if (ftrace_profile_pages_init(stat) < 0) { kfree(stat->hash); stat->hash = NULL; @@ -516,24 +542,21 @@ static void ftrace_add_profile(struct ftrace_profile_stat *stat, hlist_add_head_rcu(&rec->node, &stat->hash[key]); } -/* Interrupts must be disabled calling this */ +/* + * The memory is already allocated, this simply finds a new record to use. + */ static struct ftrace_profile * -ftrace_profile_alloc(struct ftrace_profile_stat *stat, - unsigned long ip, bool alloc_safe) +ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip) { struct ftrace_profile *rec = NULL; - /* prevent recursion */ + /* prevent recursion (from NMIs) */ if (atomic_inc_return(&stat->disabled) != 1) goto out; - /* Try to always keep another page available */ - if (!stat->pages->next && alloc_safe) - stat->pages->next = (void *)get_zeroed_page(GFP_ATOMIC); - /* - * Try to find the function again since another - * task on another CPU could have added it + * Try to find the function again since an NMI + * could have added it */ rec = ftrace_find_profiled_func(stat, ip); if (rec) @@ -555,29 +578,16 @@ ftrace_profile_alloc(struct ftrace_profile_stat *stat, return rec; } -/* - * If we are not in an interrupt, or softirq and - * and interrupts are disabled and preemption is not enabled - * (not in a spinlock) then it should be safe to allocate memory. 
- */ -static bool ftrace_safe_to_allocate(void) -{ - return !in_interrupt() && irqs_disabled() && !preempt_count(); -} - static void function_profile_call(unsigned long ip, unsigned long parent_ip) { struct ftrace_profile_stat *stat; struct ftrace_profile *rec; unsigned long flags; - bool alloc_safe; if (!ftrace_profile_enabled) return; - alloc_safe = ftrace_safe_to_allocate(); - local_irq_save(flags); stat = &__get_cpu_var(ftrace_profile_stats); @@ -586,7 +596,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip) rec = ftrace_find_profiled_func(stat, ip); if (!rec) { - rec = ftrace_profile_alloc(stat, ip, alloc_safe); + rec = ftrace_profile_alloc(stat, ip); if (!rec) goto out; } -- cgit v1.2.2 From 34886c8bc590f078d4c0b88f50d061326639198d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 25 Mar 2009 21:00:47 -0400 Subject: tracing: add average time in function to function profiler Show the average time in the function (Time / Hit) Function Hit Time Avg -------- --- ---- --- mwait_idle 51 140326.6 us 2751.503 us smp_apic_timer_interrupt 47 3517.735 us 74.845 us schedule 10 2738.754 us 273.875 us __schedule 10 2732.857 us 273.285 us hrtimer_interrupt 47 1896.104 us 40.342 us irq_exit 56 1711.833 us 30.568 us __run_hrtimer 47 1315.589 us 27.991 us tick_sched_timer 47 1138.690 us 24.227 us do_softirq 56 1116.829 us 19.943 us __do_softirq 56 1066.932 us 19.052 us do_IRQ 9 926.153 us 102.905 us Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4d90c916b2bb..c7f4a4be05dc 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -347,8 +347,10 @@ static int function_stat_cmp(void *p1, void *p2) static int function_stat_headers(struct seq_file *m) { #ifdef CONFIG_FUNCTION_GRAPH_TRACER - seq_printf(m, " Function Hit Time\n" - " -------- --- ----\n"); + seq_printf(m, " Function " + "Hit Time Avg\n" + " -------- " + "--- ---- ---\n"); #else seq_printf(m, " Function Hit\n" " -------- ---\n"); @@ -361,12 +363,9 @@ static int function_stat_show(struct seq_file *m, void *v) struct ftrace_profile *rec = v; char str[KSYM_SYMBOL_LEN]; #ifdef CONFIG_FUNCTION_GRAPH_TRACER - static struct trace_seq s; static DEFINE_MUTEX(mutex); - - mutex_lock(&mutex); - trace_seq_init(&s); - trace_print_graph_duration(rec->time, &s); + static struct trace_seq s; + unsigned long long avg; #endif kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); @@ -374,6 +373,14 @@ static int function_stat_show(struct seq_file *m, void *v) #ifdef CONFIG_FUNCTION_GRAPH_TRACER seq_printf(m, " "); + avg = rec->time; + do_div(avg, rec->counter); + + mutex_lock(&mutex); + trace_seq_init(&s); + trace_print_graph_duration(rec->time, &s); + trace_seq_puts(&s, " "); + trace_print_graph_duration(avg, &s); trace_print_seq(m, &s); mutex_unlock(&mutex); #endif -- cgit v1.2.2 From cac94f979326212831c0ea44ed9ea1622b4f4e93 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:33 +0200 Subject: x86, bts: fix race when bts tracer is removed When the bts tracer is removed while the traced task is running, the write to clear the bts tracer pointer races with context switch code. Read the tracer once during a context switch. When a new tracer is installed, the bts tracer is set in the ds context before the tracer is initialized in order to claim the context for that tracer. 
This may result in write accesses using an uninitialized trace configuration when scheduling timestamps have been requested. Store active tracing flags separately and only set active flags after the tracing configuration has been initialized. Reviewed-by: Oleg Nesterov Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144548.881338000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 58 +++++++++++++++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 21 deletions(-) diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index b1d6e1f502fa..c730155bf54d 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -89,6 +89,9 @@ struct bts_tracer { /* Buffer overflow notification function: */ bts_ovfl_callback_t ovfl; + + /* Active flags affecting trace collection. */ + unsigned int flags; }; struct pebs_tracer { @@ -799,6 +802,8 @@ void ds_suspend_bts(struct bts_tracer *tracer) if (!tracer) return; + tracer->flags = 0; + task = tracer->ds.context->task; if (!task || (task == current)) @@ -820,6 +825,8 @@ void ds_resume_bts(struct bts_tracer *tracer) if (!tracer) return; + tracer->flags = tracer->trace.ds.flags; + task = tracer->ds.context->task; control = ds_cfg.ctl[dsf_bts]; @@ -1037,43 +1044,52 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) } } +static inline void ds_take_timestamp(struct ds_context *context, + enum bts_qualifier qualifier, + struct task_struct *task) +{ + struct bts_tracer *tracer = context->bts_master; + struct bts_struct ts; + + /* Prevent compilers from reading the tracer pointer twice. */ + barrier(); + + if (!tracer || !(tracer->flags & BTS_TIMESTAMPS)) + return; + + memset(&ts, 0, sizeof(ts)); + ts.qualifier = qualifier; + ts.variant.timestamp.jiffies = jiffies_64; + ts.variant.timestamp.pid = task->pid; + + bts_write(tracer, &ts); +} + /* * Change the DS configuration from tracing prev to tracing next. */ void ds_switch_to(struct task_struct *prev, struct task_struct *next) { - struct ds_context *prev_ctx = prev->thread.ds_ctx; - struct ds_context *next_ctx = next->thread.ds_ctx; + struct ds_context *prev_ctx = prev->thread.ds_ctx; + struct ds_context *next_ctx = next->thread.ds_ctx; + unsigned long debugctlmsr = next->thread.debugctlmsr; + + /* Make sure all data is read before we start. 
*/ + barrier(); if (prev_ctx) { update_debugctlmsr(0); - if (prev_ctx->bts_master && - (prev_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) { - struct bts_struct ts = { - .qualifier = bts_task_departs, - .variant.timestamp.jiffies = jiffies_64, - .variant.timestamp.pid = prev->pid - }; - bts_write(prev_ctx->bts_master, &ts); - } + ds_take_timestamp(prev_ctx, bts_task_departs, prev); } if (next_ctx) { - if (next_ctx->bts_master && - (next_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) { - struct bts_struct ts = { - .qualifier = bts_task_arrives, - .variant.timestamp.jiffies = jiffies_64, - .variant.timestamp.pid = next->pid - }; - bts_write(next_ctx->bts_master, &ts); - } + ds_take_timestamp(next_ctx, bts_task_arrives, next); wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds); } - update_debugctlmsr(next->thread.debugctlmsr); + update_debugctlmsr(debugctlmsr); } void ds_copy_thread(struct task_struct *tsk, struct task_struct *father) -- cgit v1.2.2 From a26b89f05d194413c7238e0bea071054f6b5d3c8 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:34 +0200 Subject: sched, hw-branch-tracer: add wait_task_context_switch() function to sched.h Add a function to wait until some other task has been switched out at least once. This differs from wait_task_inactive() subtly, in that the latter will wait until the task has left the CPU. Signed-off-by: Markus Metzger Cc: markus.t.metzger@gmail.com Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144549.794157000@intel.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ kernel/sched.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index b94f3541f67b..a5b9a83065fa 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1993,8 +1993,10 @@ extern void set_task_comm(struct task_struct *tsk, char *from); extern char *get_task_comm(char *to, struct task_struct *tsk); #ifdef CONFIG_SMP +extern void wait_task_context_switch(struct task_struct *p); extern unsigned long wait_task_inactive(struct task_struct *, long match_state); #else +static inline void wait_task_context_switch(struct task_struct *p) {} static inline unsigned long wait_task_inactive(struct task_struct *p, long match_state) { diff --git a/kernel/sched.c b/kernel/sched.c index 6cc1fd5d5072..f91bc8141dc3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2002,6 +2002,49 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) return 1; } +/* + * wait_task_context_switch - wait for a thread to complete at least one + * context switch. + * + * @p must not be current. + */ +void wait_task_context_switch(struct task_struct *p) +{ + unsigned long nvcsw, nivcsw, flags; + int running; + struct rq *rq; + + nvcsw = p->nvcsw; + nivcsw = p->nivcsw; + for (;;) { + /* + * The runqueue is assigned before the actual context + * switch. We need to take the runqueue lock. + * + * We could check initially without the lock but it is + * very likely that we need to take the lock in every + * iteration. + */ + rq = task_rq_lock(p, &flags); + running = task_running(rq, p); + task_rq_unlock(rq, &flags); + + if (likely(!running)) + break; + /* + * The switch count is incremented before the actual + * context switch. We thus wait for two switches to be + * sure at least one completed. 
+ */ + if ((p->nvcsw - nvcsw) > 1) + break; + if ((p->nivcsw - nivcsw) > 1) + break; + + cpu_relax(); + } +} + /* * wait_task_inactive - wait for a thread to unschedule. * -- cgit v1.2.2 From e2b371f00a6f529f6362654239bdec8dcd510760 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:35 +0200 Subject: mm, x86, ptrace, bts: defer branch trace stopping When a ptraced task is unlinked, we need to stop branch tracing for that task. Since the unlink is called with interrupts disabled, and we need interrupts enabled to stop branch tracing, we defer the work. Collect all branch tracing related stuff in a branch tracing context. Reviewed-by: Oleg Nesterov Signed-off-by: Markus Metzger Cc: Andrew Morton Cc: Peter Zijlstra Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144550.712401000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 4 - arch/x86/kernel/ptrace.c | 254 ++++++++++++++++++++++++++------------- include/linux/mm.h | 3 +- include/linux/sched.h | 9 +- mm/mlock.c | 13 +- 5 files changed, 179 insertions(+), 104 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 34c52370f2fe..2483807e06e7 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -458,10 +458,6 @@ struct thread_struct { /* Debug Store context; see include/asm-x86/ds.h; goes into MSR_IA32_DS_AREA */ struct ds_context *ds_ctx; #endif /* CONFIG_X86_DS */ -#ifdef CONFIG_X86_PTRACE_BTS -/* the signal to send on a bts buffer overflow */ - unsigned int bts_ovfl_signal; -#endif /* CONFIG_X86_PTRACE_BTS */ }; static inline unsigned long native_get_debugreg(int regno) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index fe9345c967de..7c21d1e8cae7 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -577,17 +578,119 @@ static int ioperm_get(struct task_struct *target, } #ifdef CONFIG_X86_PTRACE_BTS +/* + * A branch trace store context. + * + * Contexts may only be installed by ptrace_bts_config() and only for + * ptraced tasks. + * + * Contexts are destroyed when the tracee is detached from the tracer. + * The actual destruction work requires interrupts enabled, so the + * work is deferred and will be scheduled during __ptrace_unlink(). + * + * Contexts hold an additional task_struct reference on the traced + * task, as well as a reference on the tracer's mm. + * + * Ptrace already holds a task_struct for the duration of ptrace operations, + * but since destruction is deferred, it may be executed after both + * tracer and tracee exited. + */ +struct bts_context { + /* The branch trace handle. */ + struct bts_tracer *tracer; + + /* The buffer used to store the branch trace and its size. */ + void *buffer; + unsigned int size; + + /* The mm that paid for the above buffer. */ + struct mm_struct *mm; + + /* The task this context belongs to. */ + struct task_struct *task; + + /* The signal to send on a bts buffer overflow. */ + unsigned int bts_ovfl_signal; + + /* The work struct to destroy a context. 
*/ + struct work_struct work; +}; + +static inline void alloc_bts_buffer(struct bts_context *context, + unsigned int size) +{ + void *buffer; + + buffer = alloc_locked_buffer(size); + if (buffer) { + context->buffer = buffer; + context->size = size; + context->mm = get_task_mm(current); + } +} + +static inline void free_bts_buffer(struct bts_context *context) +{ + if (!context->buffer) + return; + + kfree(context->buffer); + context->buffer = NULL; + + refund_locked_buffer_memory(context->mm, context->size); + context->size = 0; + + mmput(context->mm); + context->mm = NULL; +} + +static void free_bts_context_work(struct work_struct *w) +{ + struct bts_context *context; + + context = container_of(w, struct bts_context, work); + + ds_release_bts(context->tracer); + put_task_struct(context->task); + free_bts_buffer(context); + kfree(context); +} + +static inline void free_bts_context(struct bts_context *context) +{ + INIT_WORK(&context->work, free_bts_context_work); + schedule_work(&context->work); +} + +static inline struct bts_context *alloc_bts_context(struct task_struct *task) +{ + struct bts_context *context = kzalloc(sizeof(*context), GFP_KERNEL); + if (context) { + context->task = task; + task->bts = context; + + get_task_struct(task); + } + + return context; +} + static int ptrace_bts_read_record(struct task_struct *child, size_t index, struct bts_struct __user *out) { + struct bts_context *context; const struct bts_trace *trace; struct bts_struct bts; const unsigned char *at; int error; - trace = ds_read_bts(child->bts); + context = child->bts; + if (!context) + return -ESRCH; + + trace = ds_read_bts(context->tracer); if (!trace) - return -EPERM; + return -ESRCH; at = trace->ds.top - ((index + 1) * trace->ds.size); if ((void *)at < trace->ds.begin) @@ -596,7 +699,7 @@ static int ptrace_bts_read_record(struct task_struct *child, size_t index, if (!trace->read) return -EOPNOTSUPP; - error = trace->read(child->bts, at, &bts); + error = trace->read(context->tracer, at, &bts); if (error < 0) return error; @@ -610,13 +713,18 @@ static int ptrace_bts_drain(struct task_struct *child, long size, struct bts_struct __user *out) { + struct bts_context *context; const struct bts_trace *trace; const unsigned char *at; int error, drained = 0; - trace = ds_read_bts(child->bts); + context = child->bts; + if (!context) + return -ESRCH; + + trace = ds_read_bts(context->tracer); if (!trace) - return -EPERM; + return -ESRCH; if (!trace->read) return -EOPNOTSUPP; @@ -627,9 +735,8 @@ static int ptrace_bts_drain(struct task_struct *child, for (at = trace->ds.begin; (void *)at < trace->ds.top; out++, drained++, at += trace->ds.size) { struct bts_struct bts; - int error; - error = trace->read(child->bts, at, &bts); + error = trace->read(context->tracer, at, &bts); if (error < 0) return error; @@ -639,35 +746,18 @@ static int ptrace_bts_drain(struct task_struct *child, memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size); - error = ds_reset_bts(child->bts); + error = ds_reset_bts(context->tracer); if (error < 0) return error; return drained; } -static int ptrace_bts_allocate_buffer(struct task_struct *child, size_t size) -{ - child->bts_buffer = alloc_locked_buffer(size); - if (!child->bts_buffer) - return -ENOMEM; - - child->bts_size = size; - - return 0; -} - -static void ptrace_bts_free_buffer(struct task_struct *child) -{ - free_locked_buffer(child->bts_buffer, child->bts_size); - child->bts_buffer = NULL; - child->bts_size = 0; -} - static int ptrace_bts_config(struct task_struct *child, long 
cfg_size, const struct ptrace_bts_config __user *ucfg) { + struct bts_context *context; struct ptrace_bts_config cfg; unsigned int flags = 0; @@ -677,28 +767,31 @@ static int ptrace_bts_config(struct task_struct *child, if (copy_from_user(&cfg, ucfg, sizeof(cfg))) return -EFAULT; - if (child->bts) { - ds_release_bts(child->bts); - child->bts = NULL; - } + context = child->bts; + if (!context) + context = alloc_bts_context(child); + if (!context) + return -ENOMEM; if (cfg.flags & PTRACE_BTS_O_SIGNAL) { if (!cfg.signal) return -EINVAL; - child->thread.bts_ovfl_signal = cfg.signal; return -EOPNOTSUPP; + context->bts_ovfl_signal = cfg.signal; } - if ((cfg.flags & PTRACE_BTS_O_ALLOC) && - (cfg.size != child->bts_size)) { - int error; + ds_release_bts(context->tracer); + context->tracer = NULL; - ptrace_bts_free_buffer(child); + if ((cfg.flags & PTRACE_BTS_O_ALLOC) && (cfg.size != context->size)) { + free_bts_buffer(context); + if (!cfg.size) + return 0; - error = ptrace_bts_allocate_buffer(child, cfg.size); - if (error < 0) - return error; + alloc_bts_buffer(context, cfg.size); + if (!context->buffer) + return -ENOMEM; } if (cfg.flags & PTRACE_BTS_O_TRACE) @@ -707,15 +800,13 @@ static int ptrace_bts_config(struct task_struct *child, if (cfg.flags & PTRACE_BTS_O_SCHED) flags |= BTS_TIMESTAMPS; - child->bts = ds_request_bts(child, child->bts_buffer, child->bts_size, - /* ovfl = */ NULL, /* th = */ (size_t)-1, - flags); - if (IS_ERR(child->bts)) { - int error = PTR_ERR(child->bts); - - ptrace_bts_free_buffer(child); - child->bts = NULL; + context->tracer = ds_request_bts(child, context->buffer, context->size, + NULL, (size_t)-1, flags); + if (unlikely(IS_ERR(context->tracer))) { + int error = PTR_ERR(context->tracer); + free_bts_buffer(context); + context->tracer = NULL; return error; } @@ -726,20 +817,25 @@ static int ptrace_bts_status(struct task_struct *child, long cfg_size, struct ptrace_bts_config __user *ucfg) { + struct bts_context *context; const struct bts_trace *trace; struct ptrace_bts_config cfg; + context = child->bts; + if (!context) + return -ESRCH; + if (cfg_size < sizeof(cfg)) return -EIO; - trace = ds_read_bts(child->bts); + trace = ds_read_bts(context->tracer); if (!trace) - return -EPERM; + return -ESRCH; memset(&cfg, 0, sizeof(cfg)); - cfg.size = trace->ds.end - trace->ds.begin; - cfg.signal = child->thread.bts_ovfl_signal; - cfg.bts_size = sizeof(struct bts_struct); + cfg.size = trace->ds.end - trace->ds.begin; + cfg.signal = context->bts_ovfl_signal; + cfg.bts_size = sizeof(struct bts_struct); if (cfg.signal) cfg.flags |= PTRACE_BTS_O_SIGNAL; @@ -758,67 +854,56 @@ static int ptrace_bts_status(struct task_struct *child, static int ptrace_bts_clear(struct task_struct *child) { + struct bts_context *context; const struct bts_trace *trace; - trace = ds_read_bts(child->bts); + context = child->bts; + if (!context) + return -ESRCH; + + trace = ds_read_bts(context->tracer); if (!trace) - return -EPERM; + return -ESRCH; memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size); - return ds_reset_bts(child->bts); + return ds_reset_bts(context->tracer); } static int ptrace_bts_size(struct task_struct *child) { + struct bts_context *context; const struct bts_trace *trace; - trace = ds_read_bts(child->bts); + context = child->bts; + if (!context) + return -ESRCH; + + trace = ds_read_bts(context->tracer); if (!trace) - return -EPERM; + return -ESRCH; return (trace->ds.top - trace->ds.begin) / trace->ds.size; } -static void ptrace_bts_fork(struct task_struct *tsk) +static inline void 
ptrace_bts_fork(struct task_struct *tsk) { tsk->bts = NULL; - tsk->bts_buffer = NULL; - tsk->bts_size = 0; - tsk->thread.bts_ovfl_signal = 0; } -static void ptrace_bts_untrace(struct task_struct *child) +/* + * Called from __ptrace_unlink() after the child has been moved back + * to its original parent. + */ +static inline void ptrace_bts_untrace(struct task_struct *child) { if (unlikely(child->bts)) { - ds_release_bts(child->bts); + free_bts_context(child->bts); child->bts = NULL; - - /* We cannot update total_vm and locked_vm since - child's mm is already gone. But we can reclaim the - memory. */ - kfree(child->bts_buffer); - child->bts_buffer = NULL; - child->bts_size = 0; } } - -static void ptrace_bts_detach(struct task_struct *child) -{ - /* - * Ptrace_detach() races with ptrace_untrace() in case - * the child dies and is reaped by another thread. - * - * We only do the memory accounting at this point and - * leave the buffer deallocation and the bts tracer - * release to ptrace_bts_untrace() which will be called - * later on with tasklist_lock held. - */ - release_locked_buffer(child->bts_buffer, child->bts_size); -} #else static inline void ptrace_bts_fork(struct task_struct *tsk) {} -static inline void ptrace_bts_detach(struct task_struct *child) {} static inline void ptrace_bts_untrace(struct task_struct *child) {} #endif /* CONFIG_X86_PTRACE_BTS */ @@ -843,7 +928,6 @@ void ptrace_disable(struct task_struct *child) #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); #endif - ptrace_bts_detach(child); } #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION diff --git a/include/linux/mm.h b/include/linux/mm.h index bff1f0d475c7..64d8ed2538ae 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -13,6 +13,7 @@ #include #include #include +#include struct mempolicy; struct anon_vma; @@ -1321,6 +1322,6 @@ void vmemmap_populate_print_last(void); extern void *alloc_locked_buffer(size_t size); extern void free_locked_buffer(void *buffer, size_t size); -extern void release_locked_buffer(void *buffer, size_t size); +extern void refund_locked_buffer_memory(struct mm_struct *mm, size_t size); #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index a5b9a83065fa..52b8cd049c2e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -96,8 +96,8 @@ struct exec_domain; struct futex_pi_state; struct robust_list_head; struct bio; -struct bts_tracer; struct fs_struct; +struct bts_context; /* * List of flags we want to share for kernel threads, @@ -1210,12 +1210,7 @@ struct task_struct { * This is the tracer handle for the ptrace BTS extension. * This field actually belongs to the ptracer task. */ - struct bts_tracer *bts; - /* - * The buffer to hold the BTS data. - */ - void *bts_buffer; - size_t bts_size; + struct bts_context *bts; #endif /* CONFIG_X86_PTRACE_BTS */ /* PID/PID hash table linkage. 
*/ diff --git a/mm/mlock.c b/mm/mlock.c index cbe9e0581b75..749383b442c7 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -660,21 +660,20 @@ void *alloc_locked_buffer(size_t size) return buffer; } -void release_locked_buffer(void *buffer, size_t size) +void refund_locked_buffer_memory(struct mm_struct *mm, size_t size) { unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; - down_write(¤t->mm->mmap_sem); + down_write(&mm->mmap_sem); - current->mm->total_vm -= pgsz; - current->mm->locked_vm -= pgsz; + mm->total_vm -= pgsz; + mm->locked_vm -= pgsz; - up_write(¤t->mm->mmap_sem); + up_write(&mm->mmap_sem); } void free_locked_buffer(void *buffer, size_t size) { - release_locked_buffer(buffer, size); - + refund_locked_buffer_memory(current->mm, size); kfree(buffer); } -- cgit v1.2.2 From 8d99b3ac2726e5edd97ad147fa5c1f2acb63a745 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:36 +0200 Subject: x86, bts: wait until traced task has been scheduled out In order to stop branch tracing for a running task, we need to first clear the branch tracing control bits before we may free the tracing buffer. If the traced task is running, the cpu might still trace that task after the branch trace control bits have cleared. Wait until the traced task has been scheduled out before proceeding. A similar problem affects the task debug store context. We first remove the context, then we need to wait until the task has been scheduled out before we can free the context memory. Reviewed-by: Oleg Nesterov Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144551.919636000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index c730155bf54d..5cd137ab2672 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -299,6 +299,7 @@ static inline struct ds_context *ds_get_context(struct task_struct *task) static inline void ds_put_context(struct ds_context *context) { + struct task_struct *task; unsigned long irq; if (!context) @@ -313,14 +314,20 @@ static inline void ds_put_context(struct ds_context *context) *(context->this) = NULL; - if (context->task) - clear_tsk_thread_flag(context->task, TIF_DS_AREA_MSR); + task = context->task; + + if (task) + clear_tsk_thread_flag(task, TIF_DS_AREA_MSR); - if (!context->task || (context->task == current)) + if (!task || (task == current)) wrmsrl(MSR_IA32_DS_AREA, 0); spin_unlock_irqrestore(&ds_lock, irq); + /* The context might still be in use for context switching. */ + if (task && (task != current)) + wait_task_context_switch(task); + kfree(context); } @@ -781,15 +788,23 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, void ds_release_bts(struct bts_tracer *tracer) { + struct task_struct *task; + if (!tracer) return; + task = tracer->ds.context->task; + ds_suspend_bts(tracer); WARN_ON_ONCE(tracer->ds.context->bts_master != tracer); tracer->ds.context->bts_master = NULL; - put_tracer(tracer->ds.context->task); + /* Make sure tracing stopped and the tracer is not in use. 
*/ + if (task && (task != current)) + wait_task_context_switch(task); + + put_tracer(task); ds_put_context(tracer->ds.context); kfree(tracer); -- cgit v1.2.2 From 38f801129ad07b9afa7f9bd3779f61b805416d8c Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:37 +0200 Subject: x86, bts: fix race between per-task and per-cpu branch tracing Per-task branch tracing installs a debug store context with the traced task. This immediately results in the branch trace control bits to be cleared for the next context switch of that task, if not set before. Either per-cpu or per-task tracing are allowed at the same time. An active per-cpu tracing would be disabled even if the per-task tracing request is rejected and the task debug store context removed. Check the tracing type (per-cpu or per-task) before installing a task debug store context. Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144552.856000000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 72 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 41 insertions(+), 31 deletions(-) diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 5cd137ab2672..f03f117eff8c 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -193,12 +193,28 @@ static DEFINE_SPINLOCK(ds_lock); */ static atomic_t tracers = ATOMIC_INIT(0); -static inline void get_tracer(struct task_struct *task) +static inline int get_tracer(struct task_struct *task) { - if (task) + int error; + + spin_lock_irq(&ds_lock); + + if (task) { + error = -EPERM; + if (atomic_read(&tracers) < 0) + goto out; atomic_inc(&tracers); - else + } else { + error = -EPERM; + if (atomic_read(&tracers) > 0) + goto out; atomic_dec(&tracers); + } + + error = 0; +out: + spin_unlock_irq(&ds_lock); + return error; } static inline void put_tracer(struct task_struct *task) @@ -209,14 +225,6 @@ static inline void put_tracer(struct task_struct *task) atomic_inc(&tracers); } -static inline int check_tracer(struct task_struct *task) -{ - return task ? - (atomic_read(&tracers) >= 0) : - (atomic_read(&tracers) <= 0); -} - - /* * The DS context is either attached to a thread or to a cpu: * - in the former case, the thread_struct contains a pointer to the @@ -677,6 +685,10 @@ struct bts_tracer *ds_request_bts(struct task_struct *task, if (ovfl) goto out; + error = get_tracer(task); + if (error < 0) + goto out; + /* * Per-cpu tracing is typically requested using smp_call_function(). * We must not sleep. 
@@ -684,7 +696,7 @@ struct bts_tracer *ds_request_bts(struct task_struct *task, error = -ENOMEM; tracer = kzalloc(sizeof(*tracer), GFP_ATOMIC); if (!tracer) - goto out; + goto out_put_tracer; tracer->ovfl = ovfl; error = ds_request(&tracer->ds, &tracer->trace.ds, @@ -695,14 +707,9 @@ struct bts_tracer *ds_request_bts(struct task_struct *task, spin_lock_irqsave(&ds_lock, irq); - error = -EPERM; - if (!check_tracer(task)) - goto out_unlock; - get_tracer(task); - error = -EPERM; if (tracer->ds.context->bts_master) - goto out_put_tracer; + goto out_unlock; tracer->ds.context->bts_master = tracer; spin_unlock_irqrestore(&ds_lock, irq); @@ -716,13 +723,13 @@ struct bts_tracer *ds_request_bts(struct task_struct *task, return tracer; - out_put_tracer: - put_tracer(task); out_unlock: spin_unlock_irqrestore(&ds_lock, irq); ds_put_context(tracer->ds.context); out_tracer: kfree(tracer); + out_put_tracer: + put_tracer(task); out: return ERR_PTR(error); } @@ -741,6 +748,10 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, if (ovfl) goto out; + error = get_tracer(task); + if (error < 0) + goto out; + /* * Per-cpu tracing is typically requested using smp_call_function(). * We must not sleep. @@ -748,7 +759,7 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, error = -ENOMEM; tracer = kzalloc(sizeof(*tracer), GFP_ATOMIC); if (!tracer) - goto out; + goto out_put_tracer; tracer->ovfl = ovfl; error = ds_request(&tracer->ds, &tracer->trace.ds, @@ -758,14 +769,9 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, spin_lock_irqsave(&ds_lock, irq); - error = -EPERM; - if (!check_tracer(task)) - goto out_unlock; - get_tracer(task); - error = -EPERM; if (tracer->ds.context->pebs_master) - goto out_put_tracer; + goto out_unlock; tracer->ds.context->pebs_master = tracer; spin_unlock_irqrestore(&ds_lock, irq); @@ -775,13 +781,13 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, return tracer; - out_put_tracer: - put_tracer(task); out_unlock: spin_unlock_irqrestore(&ds_lock, irq); ds_put_context(tracer->ds.context); out_tracer: kfree(tracer); + out_put_tracer: + put_tracer(task); out: return ERR_PTR(error); } @@ -804,8 +810,8 @@ void ds_release_bts(struct bts_tracer *tracer) if (task && (task != current)) wait_task_context_switch(task); - put_tracer(task); ds_put_context(tracer->ds.context); + put_tracer(task); kfree(tracer); } @@ -861,16 +867,20 @@ void ds_resume_bts(struct bts_tracer *tracer) void ds_release_pebs(struct pebs_tracer *tracer) { + struct task_struct *task; + if (!tracer) return; + task = tracer->ds.context->task; + ds_suspend_pebs(tracer); WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer); tracer->ds.context->pebs_master = NULL; - put_tracer(tracer->ds.context->task); ds_put_context(tracer->ds.context); + put_tracer(task); kfree(tracer); } -- cgit v1.2.2 From 15879d042164650b93d83281ad5f87ad323bfbfe Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:38 +0200 Subject: x86, bts: use trace_clock_global() for timestamps Rename the bts_struct timestamp field to event. Use trace_clock_global() for time measurement. 
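For illustration, with this change a scheduler event record is built from the renamed 'event' variant and the global trace clock; the sketch below mirrors the ds_take_timestamp() hunk further down and reuses the ds.c-local bts_write(), so it is illustrative only rather than additional code:

    /* Illustrative sketch only; mirrors ds_take_timestamp() below. */
    static void sketch_take_timestamp(struct bts_tracer *tracer,
                                      struct task_struct *task)
    {
            struct bts_struct ts;

            memset(&ts, 0, sizeof(ts));
            ts.qualifier            = bts_task_departs;
            ts.variant.event.clock  = trace_clock_global();
            ts.variant.event.pid    = task->pid;

            bts_write(tracer, &ts);         /* bts_write() is local to ds.c */
    }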
Reported-by: Ingo Molnar Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144553.773216000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ds.h | 4 ++-- arch/x86/kernel/ds.c | 17 +++++++++-------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h index a8f672ba100c..772f141afb9a 100644 --- a/arch/x86/include/asm/ds.h +++ b/arch/x86/include/asm/ds.h @@ -170,9 +170,9 @@ struct bts_struct { } lbr; /* BTS_TASK_ARRIVES or BTS_TASK_DEPARTS */ struct { - __u64 jiffies; + __u64 clock; pid_t pid; - } timestamp; + } event; } variant; }; diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index f03f117eff8c..2071b992c35c 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -25,6 +25,7 @@ #include #include #include +#include #include @@ -471,7 +472,7 @@ enum bts_field { bts_flags, bts_qual = bts_from, - bts_jiffies = bts_to, + bts_clock = bts_to, bts_pid = bts_flags, bts_qual_mask = (bts_qual_max - 1), @@ -517,8 +518,8 @@ bts_read(struct bts_tracer *tracer, const void *at, struct bts_struct *out) memset(out, 0, sizeof(*out)); if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) { out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask); - out->variant.timestamp.jiffies = bts_get(at, bts_jiffies); - out->variant.timestamp.pid = bts_get(at, bts_pid); + out->variant.event.clock = bts_get(at, bts_clock); + out->variant.event.pid = bts_get(at, bts_pid); } else { out->qualifier = bts_branch; out->variant.lbr.from = bts_get(at, bts_from); @@ -555,8 +556,8 @@ static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in) case bts_task_arrives: case bts_task_departs: bts_set(raw, bts_qual, (bts_escape | in->qualifier)); - bts_set(raw, bts_jiffies, in->variant.timestamp.jiffies); - bts_set(raw, bts_pid, in->variant.timestamp.pid); + bts_set(raw, bts_clock, in->variant.event.clock); + bts_set(raw, bts_pid, in->variant.event.pid); break; default: return -EINVAL; @@ -1083,9 +1084,9 @@ static inline void ds_take_timestamp(struct ds_context *context, return; memset(&ts, 0, sizeof(ts)); - ts.qualifier = qualifier; - ts.variant.timestamp.jiffies = jiffies_64; - ts.variant.timestamp.pid = task->pid; + ts.qualifier = qualifier; + ts.variant.event.clock = trace_clock_global(); + ts.variant.event.pid = task->pid; bts_write(tracer, &ts); } -- cgit v1.2.2 From 35bb7600c17762bb129588c1877d2717fe325289 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:39 +0200 Subject: x86, debugctlmsr: add _on_cpu variants to debugctlmsr functions Add functions to get and set the debugctlmsr on different cpus. 
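For illustration, the new accessors let a caller read-modify-write DEBUGCTL for a remote cpu without needing an IPI callback; the later _noirq patch uses exactly this pattern in ds_suspend_bts()/ds_resume_bts(). The sketch below assumes the ds.c-local BTS_CONTROL mask and is illustrative only:

    /* Illustrative sketch: clear the BTS control bits on a given cpu. */
    static void sketch_clear_bts_on_cpu(int cpu)
    {
            unsigned long debugctlmsr;

            debugctlmsr = get_debugctlmsr_on_cpu(cpu);
            debugctlmsr &= ~BTS_CONTROL;    /* BTS_CONTROL is ds.c-local */
            update_debugctlmsr_on_cpu(cpu, debugctlmsr);
    }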
Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144554.738772000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 2483807e06e7..1efeb497f1f9 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -785,6 +785,21 @@ static inline unsigned long get_debugctlmsr(void) return debugctlmsr; } +static inline unsigned long get_debugctlmsr_on_cpu(int cpu) +{ + u64 debugctlmsr = 0; + u32 val1, val2; + +#ifndef CONFIG_X86_DEBUGCTLMSR + if (boot_cpu_data.x86 < 6) + return 0; +#endif + rdmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR, &val1, &val2); + debugctlmsr = val1 | ((u64)val2 << 32); + + return debugctlmsr; +} + static inline void update_debugctlmsr(unsigned long debugctlmsr) { #ifndef CONFIG_X86_DEBUGCTLMSR @@ -794,6 +809,18 @@ static inline void update_debugctlmsr(unsigned long debugctlmsr) wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); } +static inline void update_debugctlmsr_on_cpu(int cpu, + unsigned long debugctlmsr) +{ +#ifndef CONFIG_X86_DEBUGCTLMSR + if (boot_cpu_data.x86 < 6) + return; +#endif + wrmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR, + (u32)((u64)debugctlmsr), + (u32)((u64)debugctlmsr >> 32)); +} + /* * from system description table in BIOS. Mostly for MCA use, but * others may find it useful: -- cgit v1.2.2 From de79f54f5347ad7ec6ff55ccbb6d4ab2a21f6a93 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:40 +0200 Subject: x86, bts, hw-branch-tracer: add _noirq variants to the debug store interface The hw-branch-tracer uses debug store functions from an on_each_cpu() context, which is simply wrong since the functions may sleep. Add _noirq variants for most functions, which may be called with interrupts disabled. Separate per-cpu and per-task tracing and allow per-cpu tracing to be controlled from any cpu. Make the hw-branch-tracer use the new debug store interface, synchronize with hotplug cpu event using get/put_online_cpus(), and remove the unnecessary spinlock. Make the ptrace bts and the ds selftest code use the new interface. Defer the ds selftest. Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144555.658136000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ds.h | 57 ++++- arch/x86/kernel/ds.c | 474 ++++++++++++++++++++++++++++++--------- arch/x86/kernel/ds_selftest.c | 9 +- arch/x86/kernel/ptrace.c | 5 +- kernel/trace/trace_hw_branches.c | 193 ++++++---------- 5 files changed, 492 insertions(+), 246 deletions(-) diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h index 772f141afb9a..413e127e567d 100644 --- a/arch/x86/include/asm/ds.h +++ b/arch/x86/include/asm/ds.h @@ -15,8 +15,8 @@ * - buffer allocation (memory accounting) * * - * Copyright (C) 2007-2008 Intel Corporation. - * Markus Metzger , 2007-2008 + * Copyright (C) 2007-2009 Intel Corporation. + * Markus Metzger , 2007-2009 */ #ifndef _ASM_X86_DS_H @@ -83,8 +83,10 @@ enum ds_feature { * The interrupt threshold is independent from the overflow callback * to allow users to use their own overflow interrupt handling mechanism. 
* - * task: the task to request recording for; - * NULL for per-cpu recording on the current cpu + * The function might sleep. + * + * task: the task to request recording for + * cpu: the cpu to request recording for * base: the base pointer for the (non-pageable) buffer; * size: the size of the provided buffer in bytes * ovfl: pointer to a function to be called on buffer overflow; @@ -93,19 +95,28 @@ enum ds_feature { * -1 if no interrupt threshold is requested. * flags: a bit-mask of the above flags */ -extern struct bts_tracer *ds_request_bts(struct task_struct *task, - void *base, size_t size, - bts_ovfl_callback_t ovfl, - size_t th, unsigned int flags); -extern struct pebs_tracer *ds_request_pebs(struct task_struct *task, - void *base, size_t size, - pebs_ovfl_callback_t ovfl, - size_t th, unsigned int flags); +extern struct bts_tracer *ds_request_bts_task(struct task_struct *task, + void *base, size_t size, + bts_ovfl_callback_t ovfl, + size_t th, unsigned int flags); +extern struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size, + bts_ovfl_callback_t ovfl, + size_t th, unsigned int flags); +extern struct pebs_tracer *ds_request_pebs_task(struct task_struct *task, + void *base, size_t size, + pebs_ovfl_callback_t ovfl, + size_t th, unsigned int flags); +extern struct pebs_tracer *ds_request_pebs_cpu(int cpu, + void *base, size_t size, + pebs_ovfl_callback_t ovfl, + size_t th, unsigned int flags); /* * Release BTS or PEBS resources * Suspend and resume BTS or PEBS tracing * + * Must be called with irq's enabled. + * * tracer: the tracer handle returned from ds_request_~() */ extern void ds_release_bts(struct bts_tracer *tracer); @@ -115,6 +126,28 @@ extern void ds_release_pebs(struct pebs_tracer *tracer); extern void ds_suspend_pebs(struct pebs_tracer *tracer); extern void ds_resume_pebs(struct pebs_tracer *tracer); +/* + * Release BTS or PEBS resources + * Suspend and resume BTS or PEBS tracing + * + * Cpu tracers must call this on the traced cpu. + * Task tracers must call ds_release_~_noirq() for themselves. + * + * May be called with irq's disabled. + * + * Returns 0 if successful; + * -EPERM if the cpu tracer does not trace the current cpu. + * -EPERM if the task tracer does not trace itself. + * + * tracer: the tracer handle returned from ds_request_~() + */ +extern int ds_release_bts_noirq(struct bts_tracer *tracer); +extern int ds_suspend_bts_noirq(struct bts_tracer *tracer); +extern int ds_resume_bts_noirq(struct bts_tracer *tracer); +extern int ds_release_pebs_noirq(struct pebs_tracer *tracer); +extern int ds_suspend_pebs_noirq(struct pebs_tracer *tracer); +extern int ds_resume_pebs_noirq(struct pebs_tracer *tracer); + /* * The raw DS buffer state as it is used for BTS and PEBS recording. 
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 2071b992c35c..21a3852abf68 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -245,60 +245,50 @@ struct ds_context { struct pebs_tracer *pebs_master; /* Use count: */ - unsigned long count; + unsigned long count; /* Pointer to the context pointer field: */ struct ds_context **this; - /* The traced task; NULL for current cpu: */ + /* The traced task; NULL for cpu tracing: */ struct task_struct *task; -}; -static DEFINE_PER_CPU(struct ds_context *, system_context_array); + /* The traced cpu; only valid if task is NULL: */ + int cpu; +}; -#define system_context per_cpu(system_context_array, smp_processor_id()) +static DEFINE_PER_CPU(struct ds_context *, cpu_context); -static inline struct ds_context *ds_get_context(struct task_struct *task) +static struct ds_context *ds_get_context(struct task_struct *task, int cpu) { struct ds_context **p_context = - (task ? &task->thread.ds_ctx : &system_context); + (task ? &task->thread.ds_ctx : &per_cpu(cpu_context, cpu)); struct ds_context *context = NULL; struct ds_context *new_context = NULL; - unsigned long irq; - /* - * Chances are small that we already have a context. - * - * Contexts for per-cpu tracing are allocated using - * smp_call_function(). We must not sleep. - */ - new_context = kzalloc(sizeof(*new_context), GFP_ATOMIC); + /* Chances are small that we already have a context. */ + new_context = kzalloc(sizeof(*new_context), GFP_KERNEL); if (!new_context) return NULL; - spin_lock_irqsave(&ds_lock, irq); + spin_lock_irq(&ds_lock); context = *p_context; - if (!context) { + if (likely(!context)) { context = new_context; context->this = p_context; context->task = task; + context->cpu = cpu; context->count = 0; - if (task) - set_tsk_thread_flag(task, TIF_DS_AREA_MSR); - - if (!task || (task == current)) - wrmsrl(MSR_IA32_DS_AREA, (unsigned long)context->ds); - *p_context = context; } context->count++; - spin_unlock_irqrestore(&ds_lock, irq); + spin_unlock_irq(&ds_lock); if (context != new_context) kfree(new_context); @@ -306,7 +296,7 @@ static inline struct ds_context *ds_get_context(struct task_struct *task) return context; } -static inline void ds_put_context(struct ds_context *context) +static void ds_put_context(struct ds_context *context) { struct task_struct *task; unsigned long irq; @@ -328,8 +318,15 @@ static inline void ds_put_context(struct ds_context *context) if (task) clear_tsk_thread_flag(task, TIF_DS_AREA_MSR); - if (!task || (task == current)) - wrmsrl(MSR_IA32_DS_AREA, 0); + /* + * We leave the (now dangling) pointer to the DS configuration in + * the DS_AREA msr. This is as good or as bad as replacing it with + * NULL - the hardware would crash if we enabled tracing. + * + * This saves us some problems with having to write an msr on a + * different cpu while preventing others from doing the same for the + * next context for that same cpu. + */ spin_unlock_irqrestore(&ds_lock, irq); @@ -340,6 +337,31 @@ static inline void ds_put_context(struct ds_context *context) kfree(context); } +static void ds_install_ds_area(struct ds_context *context) +{ + unsigned long ds; + + ds = (unsigned long)context->ds; + + /* + * There is a race between the bts master and the pebs master. + * + * The thread/cpu access is synchronized via get/put_cpu() for + * task tracing and via wrmsr_on_cpu for cpu tracing. + * + * If bts and pebs are collected for the same task or same cpu, + * the same confiuration is written twice. 
+ */ + if (context->task) { + get_cpu(); + if (context->task == current) + wrmsrl(MSR_IA32_DS_AREA, ds); + set_tsk_thread_flag(context->task, TIF_DS_AREA_MSR); + put_cpu(); + } else + wrmsr_on_cpu(context->cpu, MSR_IA32_DS_AREA, + (u32)((u64)ds), (u32)((u64)ds >> 32)); +} /* * Call the tracer's callback on a buffer overflow. @@ -622,6 +644,7 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual, * The value for 'no threshold' is -1, which will set the * threshold outside of the buffer, just like we want it. */ + ith *= ds_cfg.sizeof_rec[qual]; trace->ith = (void *)(buffer + size - ith); trace->flags = flags; @@ -630,7 +653,7 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual, static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace, enum ds_qualifier qual, struct task_struct *task, - void *base, size_t size, size_t th, unsigned int flags) + int cpu, void *base, size_t size, size_t th) { struct ds_context *context; int error; @@ -643,7 +666,7 @@ static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace, if (!base) goto out; - /* We require some space to do alignment adjustments below. */ + /* We need space for alignment adjustments in ds_init_ds_trace(). */ error = -EINVAL; if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual])) goto out; @@ -660,25 +683,27 @@ static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace, tracer->size = size; error = -ENOMEM; - context = ds_get_context(task); + context = ds_get_context(task, cpu); if (!context) goto out; tracer->context = context; - ds_init_ds_trace(trace, qual, base, size, th, flags); + /* + * Defer any tracer-specific initialization work for the context until + * context ownership has been clarified. + */ error = 0; out: return error; } -struct bts_tracer *ds_request_bts(struct task_struct *task, - void *base, size_t size, - bts_ovfl_callback_t ovfl, size_t th, - unsigned int flags) +static struct bts_tracer *ds_request_bts(struct task_struct *task, int cpu, + void *base, size_t size, + bts_ovfl_callback_t ovfl, size_t th, + unsigned int flags) { struct bts_tracer *tracer; - unsigned long irq; int error; /* Buffer overflow notification is not yet implemented. */ @@ -690,42 +715,46 @@ struct bts_tracer *ds_request_bts(struct task_struct *task, if (error < 0) goto out; - /* - * Per-cpu tracing is typically requested using smp_call_function(). - * We must not sleep. - */ error = -ENOMEM; - tracer = kzalloc(sizeof(*tracer), GFP_ATOMIC); + tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); if (!tracer) goto out_put_tracer; tracer->ovfl = ovfl; + /* Do some more error checking and acquire a tracing context. */ error = ds_request(&tracer->ds, &tracer->trace.ds, - ds_bts, task, base, size, th, flags); + ds_bts, task, cpu, base, size, th); if (error < 0) goto out_tracer; - - spin_lock_irqsave(&ds_lock, irq); + /* Claim the bts part of the tracing context we acquired above. */ + spin_lock_irq(&ds_lock); error = -EPERM; if (tracer->ds.context->bts_master) goto out_unlock; tracer->ds.context->bts_master = tracer; - spin_unlock_irqrestore(&ds_lock, irq); + spin_unlock_irq(&ds_lock); + /* + * Now that we own the bts part of the context, let's complete the + * initialization for that part. 
+ */ + ds_init_ds_trace(&tracer->trace.ds, ds_bts, base, size, th, flags); + ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts); + ds_install_ds_area(tracer->ds.context); tracer->trace.read = bts_read; tracer->trace.write = bts_write; - ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts); + /* Start tracing. */ ds_resume_bts(tracer); return tracer; out_unlock: - spin_unlock_irqrestore(&ds_lock, irq); + spin_unlock_irq(&ds_lock); ds_put_context(tracer->ds.context); out_tracer: kfree(tracer); @@ -735,13 +764,27 @@ struct bts_tracer *ds_request_bts(struct task_struct *task, return ERR_PTR(error); } -struct pebs_tracer *ds_request_pebs(struct task_struct *task, - void *base, size_t size, - pebs_ovfl_callback_t ovfl, size_t th, - unsigned int flags) +struct bts_tracer *ds_request_bts_task(struct task_struct *task, + void *base, size_t size, + bts_ovfl_callback_t ovfl, + size_t th, unsigned int flags) +{ + return ds_request_bts(task, 0, base, size, ovfl, th, flags); +} + +struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size, + bts_ovfl_callback_t ovfl, + size_t th, unsigned int flags) +{ + return ds_request_bts(NULL, cpu, base, size, ovfl, th, flags); +} + +static struct pebs_tracer *ds_request_pebs(struct task_struct *task, int cpu, + void *base, size_t size, + pebs_ovfl_callback_t ovfl, size_t th, + unsigned int flags) { struct pebs_tracer *tracer; - unsigned long irq; int error; /* Buffer overflow notification is not yet implemented. */ @@ -753,37 +796,43 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, if (error < 0) goto out; - /* - * Per-cpu tracing is typically requested using smp_call_function(). - * We must not sleep. - */ error = -ENOMEM; - tracer = kzalloc(sizeof(*tracer), GFP_ATOMIC); + tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); if (!tracer) goto out_put_tracer; tracer->ovfl = ovfl; + /* Do some more error checking and acquire a tracing context. */ error = ds_request(&tracer->ds, &tracer->trace.ds, - ds_pebs, task, base, size, th, flags); + ds_pebs, task, cpu, base, size, th); if (error < 0) goto out_tracer; - spin_lock_irqsave(&ds_lock, irq); + /* Claim the pebs part of the tracing context we acquired above. */ + spin_lock_irq(&ds_lock); error = -EPERM; if (tracer->ds.context->pebs_master) goto out_unlock; tracer->ds.context->pebs_master = tracer; - spin_unlock_irqrestore(&ds_lock, irq); + spin_unlock_irq(&ds_lock); + /* + * Now that we own the pebs part of the context, let's complete the + * initialization for that part. + */ + ds_init_ds_trace(&tracer->trace.ds, ds_pebs, base, size, th, flags); ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs); + ds_install_ds_area(tracer->ds.context); + + /* Start tracing. 
*/ ds_resume_pebs(tracer); return tracer; out_unlock: - spin_unlock_irqrestore(&ds_lock, irq); + spin_unlock_irq(&ds_lock); ds_put_context(tracer->ds.context); out_tracer: kfree(tracer); @@ -793,16 +842,26 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, return ERR_PTR(error); } -void ds_release_bts(struct bts_tracer *tracer) +struct pebs_tracer *ds_request_pebs_task(struct task_struct *task, + void *base, size_t size, + pebs_ovfl_callback_t ovfl, + size_t th, unsigned int flags) { - struct task_struct *task; + return ds_request_pebs(task, 0, base, size, ovfl, th, flags); +} - if (!tracer) - return; +struct pebs_tracer *ds_request_pebs_cpu(int cpu, void *base, size_t size, + pebs_ovfl_callback_t ovfl, + size_t th, unsigned int flags) +{ + return ds_request_pebs(NULL, cpu, base, size, ovfl, th, flags); +} - task = tracer->ds.context->task; +static void ds_free_bts(struct bts_tracer *tracer) +{ + struct task_struct *task; - ds_suspend_bts(tracer); + task = tracer->ds.context->task; WARN_ON_ONCE(tracer->ds.context->bts_master != tracer); tracer->ds.context->bts_master = NULL; @@ -817,9 +876,69 @@ void ds_release_bts(struct bts_tracer *tracer) kfree(tracer); } +void ds_release_bts(struct bts_tracer *tracer) +{ + might_sleep(); + + if (!tracer) + return; + + ds_suspend_bts(tracer); + ds_free_bts(tracer); +} + +int ds_release_bts_noirq(struct bts_tracer *tracer) +{ + struct task_struct *task; + unsigned long irq; + int error; + + if (!tracer) + return 0; + + task = tracer->ds.context->task; + + local_irq_save(irq); + + error = -EPERM; + if (!task && + (tracer->ds.context->cpu != smp_processor_id())) + goto out; + + error = -EPERM; + if (task && (task != current)) + goto out; + + ds_suspend_bts_noirq(tracer); + ds_free_bts(tracer); + + error = 0; + out: + local_irq_restore(irq); + return error; +} + +static void update_task_debugctlmsr(struct task_struct *task, + unsigned long debugctlmsr) +{ + task->thread.debugctlmsr = debugctlmsr; + + get_cpu(); + if (task == current) + update_debugctlmsr(debugctlmsr); + + if (task->thread.debugctlmsr) + set_tsk_thread_flag(task, TIF_DEBUGCTLMSR); + else + clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR); + put_cpu(); +} + void ds_suspend_bts(struct bts_tracer *tracer) { struct task_struct *task; + unsigned long debugctlmsr; + int cpu; if (!tracer) return; @@ -827,29 +946,60 @@ void ds_suspend_bts(struct bts_tracer *tracer) tracer->flags = 0; task = tracer->ds.context->task; + cpu = tracer->ds.context->cpu; - if (!task || (task == current)) - update_debugctlmsr(get_debugctlmsr() & ~BTS_CONTROL); + WARN_ON(!task && irqs_disabled()); - if (task) { - task->thread.debugctlmsr &= ~BTS_CONTROL; + debugctlmsr = (task ? + task->thread.debugctlmsr : + get_debugctlmsr_on_cpu(cpu)); + debugctlmsr &= ~BTS_CONTROL; - if (!task->thread.debugctlmsr) - clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR); - } + if (task) + update_task_debugctlmsr(task, debugctlmsr); + else + update_debugctlmsr_on_cpu(cpu, debugctlmsr); } -void ds_resume_bts(struct bts_tracer *tracer) +int ds_suspend_bts_noirq(struct bts_tracer *tracer) { struct task_struct *task; - unsigned long control; + unsigned long debugctlmsr, irq; + int cpu, error = 0; if (!tracer) - return; + return 0; - tracer->flags = tracer->trace.ds.flags; + tracer->flags = 0; task = tracer->ds.context->task; + cpu = tracer->ds.context->cpu; + + local_irq_save(irq); + + error = -EPERM; + if (!task && (cpu != smp_processor_id())) + goto out; + + debugctlmsr = (task ? 
+ task->thread.debugctlmsr : + get_debugctlmsr()); + debugctlmsr &= ~BTS_CONTROL; + + if (task) + update_task_debugctlmsr(task, debugctlmsr); + else + update_debugctlmsr(debugctlmsr); + + error = 0; + out: + local_irq_restore(irq); + return error; +} + +static unsigned long ds_bts_control(struct bts_tracer *tracer) +{ + unsigned long control; control = ds_cfg.ctl[dsf_bts]; if (!(tracer->trace.ds.flags & BTS_KERNEL)) @@ -857,25 +1007,77 @@ void ds_resume_bts(struct bts_tracer *tracer) if (!(tracer->trace.ds.flags & BTS_USER)) control |= ds_cfg.ctl[dsf_bts_user]; - if (task) { - task->thread.debugctlmsr |= control; - set_tsk_thread_flag(task, TIF_DEBUGCTLMSR); - } - - if (!task || (task == current)) - update_debugctlmsr(get_debugctlmsr() | control); + return control; } -void ds_release_pebs(struct pebs_tracer *tracer) +void ds_resume_bts(struct bts_tracer *tracer) { struct task_struct *task; + unsigned long debugctlmsr; + int cpu; if (!tracer) return; + tracer->flags = tracer->trace.ds.flags; + task = tracer->ds.context->task; + cpu = tracer->ds.context->cpu; - ds_suspend_pebs(tracer); + WARN_ON(!task && irqs_disabled()); + + debugctlmsr = (task ? + task->thread.debugctlmsr : + get_debugctlmsr_on_cpu(cpu)); + debugctlmsr |= ds_bts_control(tracer); + + if (task) + update_task_debugctlmsr(task, debugctlmsr); + else + update_debugctlmsr_on_cpu(cpu, debugctlmsr); +} + +int ds_resume_bts_noirq(struct bts_tracer *tracer) +{ + struct task_struct *task; + unsigned long debugctlmsr, irq; + int cpu, error = 0; + + if (!tracer) + return 0; + + tracer->flags = tracer->trace.ds.flags; + + task = tracer->ds.context->task; + cpu = tracer->ds.context->cpu; + + local_irq_save(irq); + + error = -EPERM; + if (!task && (cpu != smp_processor_id())) + goto out; + + debugctlmsr = (task ? 
+ task->thread.debugctlmsr : + get_debugctlmsr()); + debugctlmsr |= ds_bts_control(tracer); + + if (task) + update_task_debugctlmsr(task, debugctlmsr); + else + update_debugctlmsr(debugctlmsr); + + error = 0; + out: + local_irq_restore(irq); + return error; +} + +static void ds_free_pebs(struct pebs_tracer *tracer) +{ + struct task_struct *task; + + task = tracer->ds.context->task; WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer); tracer->ds.context->pebs_master = NULL; @@ -886,16 +1088,68 @@ void ds_release_pebs(struct pebs_tracer *tracer) kfree(tracer); } +void ds_release_pebs(struct pebs_tracer *tracer) +{ + might_sleep(); + + if (!tracer) + return; + + ds_suspend_pebs(tracer); + ds_free_pebs(tracer); +} + +int ds_release_pebs_noirq(struct pebs_tracer *tracer) +{ + struct task_struct *task; + unsigned long irq; + int error; + + if (!tracer) + return 0; + + task = tracer->ds.context->task; + + local_irq_save(irq); + + error = -EPERM; + if (!task && + (tracer->ds.context->cpu != smp_processor_id())) + goto out; + + error = -EPERM; + if (task && (task != current)) + goto out; + + ds_suspend_pebs_noirq(tracer); + ds_free_pebs(tracer); + + error = 0; + out: + local_irq_restore(irq); + return error; +} + void ds_suspend_pebs(struct pebs_tracer *tracer) { } +int ds_suspend_pebs_noirq(struct pebs_tracer *tracer) +{ + return 0; +} + void ds_resume_pebs(struct pebs_tracer *tracer) { } +int ds_resume_pebs_noirq(struct pebs_tracer *tracer) +{ + return 0; +} + const struct bts_trace *ds_read_bts(struct bts_tracer *tracer) { if (!tracer) @@ -1004,26 +1258,6 @@ ds_configure(const struct ds_configuration *cfg, printk(KERN_INFO "[ds] pebs not available\n"); } - if (ds_cfg.sizeof_rec[ds_bts]) { - int error; - - error = ds_selftest_bts(); - if (error) { - WARN(1, "[ds] selftest failed. disabling bts.\n"); - ds_cfg.sizeof_rec[ds_bts] = 0; - } - } - - if (ds_cfg.sizeof_rec[ds_pebs]) { - int error; - - error = ds_selftest_pebs(); - if (error) { - WARN(1, "[ds] selftest failed. disabling pebs.\n"); - ds_cfg.sizeof_rec[ds_pebs] = 0; - } - } - printk(KERN_INFO "[ds] sizes: address: %u bit, ", 8 * ds_cfg.sizeof_ptr_field); printk("bts/pebs record: %u/%u bytes\n", @@ -1127,3 +1361,29 @@ void ds_copy_thread(struct task_struct *tsk, struct task_struct *father) void ds_exit_thread(struct task_struct *tsk) { } + +static __init int ds_selftest(void) +{ + if (ds_cfg.sizeof_rec[ds_bts]) { + int error; + + error = ds_selftest_bts(); + if (error) { + WARN(1, "[ds] selftest failed. disabling bts.\n"); + ds_cfg.sizeof_rec[ds_bts] = 0; + } + } + + if (ds_cfg.sizeof_rec[ds_pebs]) { + int error; + + error = ds_selftest_pebs(); + if (error) { + WARN(1, "[ds] selftest failed. disabling pebs.\n"); + ds_cfg.sizeof_rec[ds_pebs] = 0; + } + } + + return 0; +} +device_initcall(ds_selftest); diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c index 8c46fbf38c46..e5a263c8a14c 100644 --- a/arch/x86/kernel/ds_selftest.c +++ b/arch/x86/kernel/ds_selftest.c @@ -10,11 +10,12 @@ #include #include +#include #include -#define DS_SELFTEST_BUFFER_SIZE 1021 /* Intentionally chose an odd size. */ +#define BUFFER_SIZE 1021 /* Intentionally chose an odd size. 
*/ static int ds_selftest_bts_consistency(const struct bts_trace *trace) @@ -125,12 +126,12 @@ int ds_selftest_bts(void) struct bts_tracer *tracer; int error = 0; void *top; - unsigned char buffer[DS_SELFTEST_BUFFER_SIZE]; + unsigned char buffer[BUFFER_SIZE]; printk(KERN_INFO "[ds] bts selftest..."); - tracer = ds_request_bts(NULL, buffer, DS_SELFTEST_BUFFER_SIZE, - NULL, (size_t)-1, BTS_KERNEL); + tracer = ds_request_bts_cpu(smp_processor_id(), buffer, BUFFER_SIZE, + NULL, (size_t)-1, BTS_KERNEL); if (IS_ERR(tracer)) { error = PTR_ERR(tracer); tracer = NULL; diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 7c21d1e8cae7..adbb24322d8f 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -800,8 +800,9 @@ static int ptrace_bts_config(struct task_struct *child, if (cfg.flags & PTRACE_BTS_O_SCHED) flags |= BTS_TIMESTAMPS; - context->tracer = ds_request_bts(child, context->buffer, context->size, - NULL, (size_t)-1, flags); + context->tracer = + ds_request_bts_task(child, context->buffer, context->size, + NULL, (size_t)-1, flags); if (unlikely(IS_ERR(context->tracer))) { int error = PTR_ERR(context->tracer); diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index 8b2109a6c61c..50565d8cd2ed 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -4,7 +4,6 @@ * Copyright (C) 2008-2009 Intel Corporation. * Markus Metzger , 2008-2009 */ -#include #include #include #include @@ -21,168 +20,113 @@ #define BTS_BUFFER_SIZE (1 << 13) -/* - * The tracer lock protects the below per-cpu tracer array. - * It needs to be held to: - * - start tracing on all cpus - * - stop tracing on all cpus - * - start tracing on a single hotplug cpu - * - stop tracing on a single hotplug cpu - * - read the trace from all cpus - * - read the trace from a single cpu - */ -static DEFINE_SPINLOCK(bts_tracer_lock); static DEFINE_PER_CPU(struct bts_tracer *, tracer); static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], buffer); #define this_tracer per_cpu(tracer, smp_processor_id()) -#define this_buffer per_cpu(buffer, smp_processor_id()) static int trace_hw_branches_enabled __read_mostly; static int trace_hw_branches_suspended __read_mostly; static struct trace_array *hw_branch_trace __read_mostly; -/* - * Initialize the tracer for the current cpu. - * The argument is ignored. - * - * pre: bts_tracer_lock must be locked. - */ -static void bts_trace_init_cpu(void *arg) +static void bts_trace_init_cpu(int cpu) { - if (this_tracer) - ds_release_bts(this_tracer); + per_cpu(tracer, cpu) = + ds_request_bts_cpu(cpu, per_cpu(buffer, cpu), BTS_BUFFER_SIZE, + NULL, (size_t)-1, BTS_KERNEL); - this_tracer = ds_request_bts(NULL, this_buffer, BTS_BUFFER_SIZE, - NULL, (size_t)-1, BTS_KERNEL); - if (IS_ERR(this_tracer)) { - this_tracer = NULL; - return; - } + if (IS_ERR(per_cpu(tracer, cpu))) + per_cpu(tracer, cpu) = NULL; } static int bts_trace_init(struct trace_array *tr) { - int cpu, avail; - - spin_lock(&bts_tracer_lock); + int cpu; hw_branch_trace = tr; + trace_hw_branches_enabled = 0; - on_each_cpu(bts_trace_init_cpu, NULL, 1); - - /* Check on how many cpus we could enable tracing */ - avail = 0; - for_each_online_cpu(cpu) - if (per_cpu(tracer, cpu)) - avail++; + get_online_cpus(); + for_each_online_cpu(cpu) { + bts_trace_init_cpu(cpu); - trace_hw_branches_enabled = (avail ? 
1 : 0); + if (likely(per_cpu(tracer, cpu))) + trace_hw_branches_enabled = 1; + } trace_hw_branches_suspended = 0; - - spin_unlock(&bts_tracer_lock); - + put_online_cpus(); /* If we could not enable tracing on a single cpu, we fail. */ - return avail ? 0 : -EOPNOTSUPP; -} - -/* - * Release the tracer for the current cpu. - * The argument is ignored. - * - * pre: bts_tracer_lock must be locked. - */ -static void bts_trace_release_cpu(void *arg) -{ - if (this_tracer) { - ds_release_bts(this_tracer); - this_tracer = NULL; - } + return trace_hw_branches_enabled ? 0 : -EOPNOTSUPP; } static void bts_trace_reset(struct trace_array *tr) { - spin_lock(&bts_tracer_lock); + int cpu; - on_each_cpu(bts_trace_release_cpu, NULL, 1); + get_online_cpus(); + for_each_online_cpu(cpu) { + if (likely(per_cpu(tracer, cpu))) { + ds_release_bts(per_cpu(tracer, cpu)); + per_cpu(tracer, cpu) = NULL; + } + } trace_hw_branches_enabled = 0; trace_hw_branches_suspended = 0; - - spin_unlock(&bts_tracer_lock); -} - -/* - * Resume tracing on the current cpu. - * The argument is ignored. - * - * pre: bts_tracer_lock must be locked. - */ -static void bts_trace_resume_cpu(void *arg) -{ - if (this_tracer) - ds_resume_bts(this_tracer); + put_online_cpus(); } static void bts_trace_start(struct trace_array *tr) { - spin_lock(&bts_tracer_lock); + int cpu; - on_each_cpu(bts_trace_resume_cpu, NULL, 1); + get_online_cpus(); + for_each_online_cpu(cpu) + if (likely(per_cpu(tracer, cpu))) + ds_resume_bts(per_cpu(tracer, cpu)); trace_hw_branches_suspended = 0; - - spin_unlock(&bts_tracer_lock); -} - -/* - * Suspend tracing on the current cpu. - * The argument is ignored. - * - * pre: bts_tracer_lock must be locked. - */ -static void bts_trace_suspend_cpu(void *arg) -{ - if (this_tracer) - ds_suspend_bts(this_tracer); + put_online_cpus(); } static void bts_trace_stop(struct trace_array *tr) { - spin_lock(&bts_tracer_lock); + int cpu; - on_each_cpu(bts_trace_suspend_cpu, NULL, 1); + get_online_cpus(); + for_each_online_cpu(cpu) + if (likely(per_cpu(tracer, cpu))) + ds_suspend_bts(per_cpu(tracer, cpu)); trace_hw_branches_suspended = 1; - - spin_unlock(&bts_tracer_lock); + put_online_cpus(); } static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb, unsigned long action, void *hcpu) { - unsigned int cpu = (unsigned long)hcpu; - - spin_lock(&bts_tracer_lock); - - if (!trace_hw_branches_enabled) - goto out; + int cpu = (long)hcpu; switch (action) { case CPU_ONLINE: case CPU_DOWN_FAILED: - smp_call_function_single(cpu, bts_trace_init_cpu, NULL, 1); - - if (trace_hw_branches_suspended) - smp_call_function_single(cpu, bts_trace_suspend_cpu, - NULL, 1); + /* The notification is sent with interrupts enabled. */ + if (trace_hw_branches_enabled) { + bts_trace_init_cpu(cpu); + + if (trace_hw_branches_suspended && + likely(per_cpu(tracer, cpu))) + ds_suspend_bts(per_cpu(tracer, cpu)); + } break; + case CPU_DOWN_PREPARE: - smp_call_function_single(cpu, bts_trace_release_cpu, NULL, 1); - break; + /* The notification is sent with interrupts enabled. */ + if (likely(per_cpu(tracer, cpu))) { + ds_release_bts(per_cpu(tracer, cpu)); + per_cpu(tracer, cpu) = NULL; + } } - out: - spin_unlock(&bts_tracer_lock); return NOTIFY_DONE; } @@ -274,7 +218,7 @@ static void trace_bts_at(const struct bts_trace *trace, void *at) /* * Collect the trace on the current cpu and write it into the ftrace buffer. 
* - * pre: bts_tracer_lock must be locked + * pre: tracing must be suspended on the current cpu */ static void trace_bts_cpu(void *arg) { @@ -291,10 +235,9 @@ static void trace_bts_cpu(void *arg) if (unlikely(!this_tracer)) return; - ds_suspend_bts(this_tracer); trace = ds_read_bts(this_tracer); if (!trace) - goto out; + return; for (at = trace->ds.top; (void *)at < trace->ds.end; at += trace->ds.size) @@ -303,18 +246,27 @@ static void trace_bts_cpu(void *arg) for (at = trace->ds.begin; (void *)at < trace->ds.top; at += trace->ds.size) trace_bts_at(trace, at); - -out: - ds_resume_bts(this_tracer); } static void trace_bts_prepare(struct trace_iterator *iter) { - spin_lock(&bts_tracer_lock); + int cpu; + get_online_cpus(); + for_each_online_cpu(cpu) + if (likely(per_cpu(tracer, cpu))) + ds_suspend_bts(per_cpu(tracer, cpu)); + /* + * We need to collect the trace on the respective cpu since ftrace + * implicitly adds the record for the current cpu. + * Once that is more flexible, we could collect the data from any cpu. + */ on_each_cpu(trace_bts_cpu, iter->tr, 1); - spin_unlock(&bts_tracer_lock); + for_each_online_cpu(cpu) + if (likely(per_cpu(tracer, cpu))) + ds_resume_bts(per_cpu(tracer, cpu)); + put_online_cpus(); } static void trace_bts_close(struct trace_iterator *iter) @@ -324,12 +276,11 @@ static void trace_bts_close(struct trace_iterator *iter) void trace_hw_branch_oops(void) { - spin_lock(&bts_tracer_lock); - - if (trace_hw_branches_enabled) + if (this_tracer) { + ds_suspend_bts_noirq(this_tracer); trace_bts_cpu(hw_branch_trace); - - spin_unlock(&bts_tracer_lock); + ds_resume_bts_noirq(this_tracer); + } } struct tracer bts_tracer __read_mostly = -- cgit v1.2.2 From 4d657e51dfc042216febd4a007c6f36881f9256d Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:41 +0200 Subject: x86, hw-branch-tracer: allocate selftest iterator on heap Allocate the trace_iterator for the hw-branch-tracer selftest on the heap. Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144556.578777000@intel.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_selftest.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 499d01c44cd1..00dd6485bdd7 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -757,7 +757,7 @@ int trace_selftest_startup_hw_branches(struct tracer *trace, struct trace_array *tr) { - struct trace_iterator iter; + struct trace_iterator *iter; struct tracer tracer; unsigned long count; int ret; @@ -777,17 +777,21 @@ trace_selftest_startup_hw_branches(struct tracer *trace, * The hw-branch tracer needs to collect the trace from the various * cpu trace buffers - before tracing is stopped. 
*/ - memset(&iter, 0, sizeof(iter)); + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return -ENOMEM; + memcpy(&tracer, trace, sizeof(tracer)); - iter.trace = &tracer; - iter.tr = tr; - iter.pos = -1; - mutex_init(&iter.mutex); + iter->trace = &tracer; + iter->tr = tr; + iter->pos = -1; + mutex_init(&iter->mutex); - trace->open(&iter); + trace->open(iter); - mutex_destroy(&iter.mutex); + mutex_destroy(&iter->mutex); + kfree(iter); tracing_stop(); -- cgit v1.2.2 From 353afeea24cc51aafc0ff21a72ec740b6f0af50c Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:42 +0200 Subject: x86, ds: fix compiler warning Size_t is defined differently on i386 and x86_64. Change type to avoid compiler warning. Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144557.523964000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds_selftest.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c index e5a263c8a14c..e1ba5101b576 100644 --- a/arch/x86/kernel/ds_selftest.c +++ b/arch/x86/kernel/ds_selftest.c @@ -87,7 +87,7 @@ static int ds_selftest_bts_read(struct bts_tracer *tracer, /* Now to the test itself. */ for (at = from; (void *)at < to; at += trace->ds.size) { struct bts_struct bts; - size_t index; + unsigned long index; int error; if (((void *)at - trace->ds.begin) % trace->ds.size) { -- cgit v1.2.2 From 84f201139245c30777ff858e71b8d7e134b8c3ed Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:43 +0200 Subject: x86, ds: fix bounds check in ds selftest Fix a bad bounds check in the debug store selftest. Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144558.450027000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds_selftest.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c index e1ba5101b576..cccc19a38f6d 100644 --- a/arch/x86/kernel/ds_selftest.c +++ b/arch/x86/kernel/ds_selftest.c @@ -47,8 +47,13 @@ static int ds_selftest_bts_consistency(const struct bts_trace *trace) printk(KERN_CONT "bad bts buffer setup..."); error = -1; } + /* + * We allow top in [begin; end], since its not clear when the + * overflow adjustment happens: after the increment or before the + * write. + */ if ((trace->ds.top < trace->ds.begin) || - (trace->ds.end <= trace->ds.top)) { + (trace->ds.end < trace->ds.top)) { printk(KERN_CONT "bts top out of bounds..."); error = -1; } -- cgit v1.2.2 From 01f6569ece6915616f6cae1d7d8b46ab8da9c1bd Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:44 +0200 Subject: x86, ds: selftest each cpu Perform debug store selftests on each cpu. Cover both the normal and the _noirq variant of the debug store interface. 
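For illustration, the per-cpu pattern used in the updated selftest is roughly: request a cpu tracer, exercise it, and release the _noirq variant on the traced cpu itself. The sketch below reuses ds_release_bts_noirq_wrap() from the hunk further down and is illustrative only:

    /* Illustrative sketch of the per-cpu selftest pattern. */
    static void sketch_selftest_one_cpu(int cpu, unsigned char *buf, size_t size)
    {
            struct bts_tracer *tracer;

            tracer = ds_request_bts_cpu(cpu, buf, size, NULL,
                                        (size_t)-1, BTS_KERNEL);
            if (IS_ERR(tracer))
                    return;

            /* ... exercise suspend/resume/read on the traced cpu ... */

            /* The _noirq release must run on the traced cpu. */
            smp_call_function_single(cpu, ds_release_bts_noirq_wrap,
                                     tracer, 1);
    }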
Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144559.394583000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds_selftest.c | 182 +++++++++++++++++++++++++++++++----------- 1 file changed, 135 insertions(+), 47 deletions(-) diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c index cccc19a38f6d..599a96300628 100644 --- a/arch/x86/kernel/ds_selftest.c +++ b/arch/x86/kernel/ds_selftest.c @@ -11,13 +11,21 @@ #include #include #include +#include #include -#define BUFFER_SIZE 1021 /* Intentionally chose an odd size. */ +#define BUFFER_SIZE 521 /* Intentionally chose an odd size. */ +struct ds_selftest_bts_conf { + struct bts_tracer *tracer; + int error; + int (*suspend)(struct bts_tracer *); + int (*resume)(struct bts_tracer *); +}; + static int ds_selftest_bts_consistency(const struct bts_trace *trace) { int error = 0; @@ -125,36 +133,32 @@ static int ds_selftest_bts_read(struct bts_tracer *tracer, return 0; } -int ds_selftest_bts(void) +static void ds_selftest_bts_cpu(void *arg) { + struct ds_selftest_bts_conf *conf = arg; const struct bts_trace *trace; - struct bts_tracer *tracer; - int error = 0; void *top; - unsigned char buffer[BUFFER_SIZE]; - printk(KERN_INFO "[ds] bts selftest..."); - - tracer = ds_request_bts_cpu(smp_processor_id(), buffer, BUFFER_SIZE, - NULL, (size_t)-1, BTS_KERNEL); - if (IS_ERR(tracer)) { - error = PTR_ERR(tracer); - tracer = NULL; + if (IS_ERR(conf->tracer)) { + conf->error = PTR_ERR(conf->tracer); + conf->tracer = NULL; printk(KERN_CONT - "initialization failed (err: %d)...", error); - goto out; + "initialization failed (err: %d)...", conf->error); + return; } - /* The return should already give us enough trace. */ - ds_suspend_bts(tracer); + /* We should meanwhile have enough trace. */ + conf->error = conf->suspend(conf->tracer); + if (conf->error < 0) + return; /* Let's see if we can access the trace. */ - trace = ds_read_bts(tracer); + trace = ds_read_bts(conf->tracer); - error = ds_selftest_bts_consistency(trace); - if (error < 0) - goto out; + conf->error = ds_selftest_bts_consistency(trace); + if (conf->error < 0) + return; /* If everything went well, we should have a few trace entries. */ if (trace->ds.top == trace->ds.begin) { @@ -168,10 +172,11 @@ int ds_selftest_bts(void) } /* Let's try to read the trace we collected. */ - error = ds_selftest_bts_read(tracer, trace, + conf->error = + ds_selftest_bts_read(conf->tracer, trace, trace->ds.begin, trace->ds.top); - if (error < 0) - goto out; + if (conf->error < 0) + return; /* * Let's read the trace again. @@ -179,26 +184,31 @@ int ds_selftest_bts(void) */ top = trace->ds.top; - trace = ds_read_bts(tracer); - error = ds_selftest_bts_consistency(trace); - if (error < 0) - goto out; + trace = ds_read_bts(conf->tracer); + conf->error = ds_selftest_bts_consistency(trace); + if (conf->error < 0) + return; if (top != trace->ds.top) { printk(KERN_CONT "suspend not working..."); - error = -1; - goto out; + conf->error = -1; + return; } /* Let's collect some more trace - see if resume is working. 
*/ - ds_resume_bts(tracer); - ds_suspend_bts(tracer); + conf->error = conf->resume(conf->tracer); + if (conf->error < 0) + return; + + conf->error = conf->suspend(conf->tracer); + if (conf->error < 0) + return; - trace = ds_read_bts(tracer); + trace = ds_read_bts(conf->tracer); - error = ds_selftest_bts_consistency(trace); - if (error < 0) - goto out; + conf->error = ds_selftest_bts_consistency(trace); + if (conf->error < 0) + return; if (trace->ds.top == top) { /* @@ -210,35 +220,113 @@ int ds_selftest_bts(void) printk(KERN_CONT "no resume progress/overflow..."); - error = ds_selftest_bts_read(tracer, trace, + conf->error = + ds_selftest_bts_read(conf->tracer, trace, trace->ds.begin, trace->ds.end); } else if (trace->ds.top < top) { /* * We had a buffer overflow - the entire buffer should * contain trace records. */ - error = ds_selftest_bts_read(tracer, trace, + conf->error = + ds_selftest_bts_read(conf->tracer, trace, trace->ds.begin, trace->ds.end); } else { /* * It is quite likely that the buffer did not overflow. * Let's just check the delta trace. */ - error = ds_selftest_bts_read(tracer, trace, - top, trace->ds.top); + conf->error = + ds_selftest_bts_read(conf->tracer, trace, top, + trace->ds.top); } - if (error < 0) - goto out; + if (conf->error < 0) + return; - error = 0; + conf->error = 0; +} - /* The final test: release the tracer while tracing is suspended. */ - out: - ds_release_bts(tracer); +static int ds_suspend_bts_wrap(struct bts_tracer *tracer) +{ + ds_suspend_bts(tracer); + return 0; +} + +static int ds_resume_bts_wrap(struct bts_tracer *tracer) +{ + ds_resume_bts(tracer); + return 0; +} - printk(KERN_CONT "%s.\n", (error ? "failed" : "passed")); +static void ds_release_bts_noirq_wrap(void *tracer) +{ + (void)ds_release_bts_noirq(tracer); +} - return error; +static int ds_selftest_bts_bad_release_noirq(int cpu, + struct bts_tracer *tracer) +{ + int error = -EPERM; + + /* Try to release the tracer on the wrong cpu. */ + get_cpu(); + if (cpu != smp_processor_id()) { + error = ds_release_bts_noirq(tracer); + if (error != -EPERM) + printk(KERN_CONT "release on wrong cpu..."); + } + put_cpu(); + + return error ? 0 : -1; +} + +int ds_selftest_bts(void) +{ + struct ds_selftest_bts_conf conf; + unsigned char buffer[BUFFER_SIZE]; + int cpu; + + printk(KERN_INFO "[ds] bts selftest..."); + conf.error = 0; + + get_online_cpus(); + for_each_online_cpu(cpu) { + conf.suspend = ds_suspend_bts_wrap; + conf.resume = ds_resume_bts_wrap; + conf.tracer = + ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, + NULL, (size_t)-1, BTS_KERNEL); + ds_selftest_bts_cpu(&conf); + ds_release_bts(conf.tracer); + if (conf.error < 0) + goto out; + + conf.suspend = ds_suspend_bts_noirq; + conf.resume = ds_resume_bts_noirq; + conf.tracer = + ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, + NULL, (size_t)-1, BTS_KERNEL); + smp_call_function_single(cpu, ds_selftest_bts_cpu, &conf, 1); + if (conf.error >= 0) { + conf.error = + ds_selftest_bts_bad_release_noirq(cpu, + conf.tracer); + /* We must not release the tracer twice. */ + if (conf.error < 0) + conf.tracer = NULL; + } + smp_call_function_single(cpu, ds_release_bts_noirq_wrap, + conf.tracer, 1); + if (conf.error < 0) + goto out; + } + + conf.error = 0; + out: + put_online_cpus(); + printk(KERN_CONT "%s.\n", (conf.error ? 
"failed" : "passed")); + + return conf.error; } int ds_selftest_pebs(void) -- cgit v1.2.2 From 3a68eef945b234f286406d96dc690fe17863c203 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:45 +0200 Subject: x86, ds: add task tracing selftest Add selftests to cover per-task branch tracing. Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144600.329346000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds_selftest.c | 71 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c index 599a96300628..a40b2533c71e 100644 --- a/arch/x86/kernel/ds_selftest.c +++ b/arch/x86/kernel/ds_selftest.c @@ -280,10 +280,51 @@ static int ds_selftest_bts_bad_release_noirq(int cpu, return error ? 0 : -1; } +static int ds_selftest_bts_bad_request_cpu(int cpu, void *buffer) +{ + struct bts_tracer *tracer; + int error; + + /* Try to request cpu tracing while task tracing is active. */ + tracer = ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, NULL, + (size_t)-1, BTS_KERNEL); + error = PTR_ERR(tracer); + if (!IS_ERR(tracer)) { + ds_release_bts(tracer); + error = 0; + } + + if (error != -EPERM) + printk(KERN_CONT "cpu/task tracing overlap..."); + + return error ? 0 : -1; +} + +static int ds_selftest_bts_bad_request_task(void *buffer) +{ + struct bts_tracer *tracer; + int error; + + /* Try to request cpu tracing while task tracing is active. */ + tracer = ds_request_bts_task(current, buffer, BUFFER_SIZE, NULL, + (size_t)-1, BTS_KERNEL); + error = PTR_ERR(tracer); + if (!IS_ERR(tracer)) { + error = 0; + ds_release_bts(tracer); + } + + if (error != -EPERM) + printk(KERN_CONT "task/cpu tracing overlap..."); + + return error ? 
0 : -1; +} + int ds_selftest_bts(void) { struct ds_selftest_bts_conf conf; unsigned char buffer[BUFFER_SIZE]; + unsigned long irq; int cpu; printk(KERN_INFO "[ds] bts selftest..."); @@ -297,6 +338,8 @@ int ds_selftest_bts(void) ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, NULL, (size_t)-1, BTS_KERNEL); ds_selftest_bts_cpu(&conf); + if (conf.error >= 0) + conf.error = ds_selftest_bts_bad_request_task(buffer); ds_release_bts(conf.tracer); if (conf.error < 0) goto out; @@ -315,12 +358,40 @@ int ds_selftest_bts(void) if (conf.error < 0) conf.tracer = NULL; } + if (conf.error >= 0) + conf.error = ds_selftest_bts_bad_request_task(buffer); smp_call_function_single(cpu, ds_release_bts_noirq_wrap, conf.tracer, 1); if (conf.error < 0) goto out; } + conf.suspend = ds_suspend_bts_wrap; + conf.resume = ds_resume_bts_wrap; + conf.tracer = + ds_request_bts_task(current, buffer, BUFFER_SIZE, + NULL, (size_t)-1, BTS_KERNEL); + ds_selftest_bts_cpu(&conf); + if (conf.error >= 0) + conf.error = ds_selftest_bts_bad_request_cpu(0, buffer); + ds_release_bts(conf.tracer); + if (conf.error < 0) + goto out; + + conf.suspend = ds_suspend_bts_noirq; + conf.resume = ds_resume_bts_noirq; + conf.tracer = + ds_request_bts_task(current, buffer, BUFFER_SIZE, + NULL, (size_t)-1, BTS_KERNEL); + local_irq_save(irq); + ds_selftest_bts_cpu(&conf); + if (conf.error >= 0) + conf.error = ds_selftest_bts_bad_request_cpu(0, buffer); + ds_release_bts_noirq(conf.tracer); + local_irq_restore(irq); + if (conf.error < 0) + goto out; + conf.error = 0; out: put_online_cpus(); -- cgit v1.2.2 From 2311f0de21c17b2a8b960677a9cccfbfa52beb35 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:46 +0200 Subject: x86, ds: add leakage warning Add a warning in case a debug store context is not removed before the task it is attached to is freed. Remove the old warning at thread exit. It is too early. Declare the debug store context field in thread_struct unconditionally. Remove ds_copy_thread() and ds_exit_thread() and do the work directly in process*.c. 
Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144601.254472000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ds.h | 9 --------- arch/x86/include/asm/processor.h | 4 +--- arch/x86/kernel/ds.c | 10 ---------- arch/x86/kernel/process.c | 5 +++-- arch/x86/kernel/process_32.c | 3 ++- arch/x86/kernel/process_64.c | 3 ++- 6 files changed, 8 insertions(+), 26 deletions(-) diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h index 413e127e567d..149e5208e967 100644 --- a/arch/x86/include/asm/ds.h +++ b/arch/x86/include/asm/ds.h @@ -285,21 +285,12 @@ extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *); */ extern void ds_switch_to(struct task_struct *prev, struct task_struct *next); -/* - * Task clone/init and cleanup work - */ -extern void ds_copy_thread(struct task_struct *tsk, struct task_struct *father); -extern void ds_exit_thread(struct task_struct *tsk); - #else /* CONFIG_X86_DS */ struct cpuinfo_x86; static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {} static inline void ds_switch_to(struct task_struct *prev, struct task_struct *next) {} -static inline void ds_copy_thread(struct task_struct *tsk, - struct task_struct *father) {} -static inline void ds_exit_thread(struct task_struct *tsk) {} #endif /* CONFIG_X86_DS */ #endif /* _ASM_X86_DS_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 1efeb497f1f9..7c39de7e709a 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -454,10 +454,8 @@ struct thread_struct { unsigned io_bitmap_max; /* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set. 
*/ unsigned long debugctlmsr; -#ifdef CONFIG_X86_DS -/* Debug Store context; see include/asm-x86/ds.h; goes into MSR_IA32_DS_AREA */ + /* Debug Store context; see asm/ds.h */ struct ds_context *ds_ctx; -#endif /* CONFIG_X86_DS */ }; static inline unsigned long native_get_debugreg(int regno) diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 21a3852abf68..71cab3b62dce 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -1352,16 +1352,6 @@ void ds_switch_to(struct task_struct *prev, struct task_struct *next) update_debugctlmsr(debugctlmsr); } -void ds_copy_thread(struct task_struct *tsk, struct task_struct *father) -{ - clear_tsk_thread_flag(tsk, TIF_DS_AREA_MSR); - tsk->thread.ds_ctx = NULL; -} - -void ds_exit_thread(struct task_struct *tsk) -{ -} - static __init int ds_selftest(void) { if (ds_cfg.sizeof_rec[ds_bts]) { diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ca989158e847..fb5dfb891f0f 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -14,6 +14,7 @@ #include #include #include +#include unsigned long idle_halt; EXPORT_SYMBOL(idle_halt); @@ -45,6 +46,8 @@ void free_thread_xstate(struct task_struct *tsk) kmem_cache_free(task_xstate_cachep, tsk->thread.xstate); tsk->thread.xstate = NULL; } + + WARN(tsk->thread.ds_ctx, "leaking DS context\n"); } void free_thread_info(struct thread_info *ti) @@ -83,8 +86,6 @@ void exit_thread(void) put_cpu(); kfree(bp); } - - ds_exit_thread(current); } void flush_thread(void) diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 76f8f84043a2..b5e4bfef4472 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -290,7 +290,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, p->thread.io_bitmap_max = 0; } - ds_copy_thread(p, current); + clear_tsk_thread_flag(p, TIF_DS_AREA_MSR); + p->thread.ds_ctx = NULL; clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR); p->thread.debugctlmsr = 0; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b751a41392b1..5a1a1de292ec 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -335,7 +335,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, goto out; } - ds_copy_thread(p, me); + clear_tsk_thread_flag(p, TIF_DS_AREA_MSR); + p->thread.ds_ctx = NULL; clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR); p->thread.debugctlmsr = 0; -- cgit v1.2.2 From ee811517a5604aa63fae803b7c044712699e1303 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:47 +0200 Subject: x86, ds: use single debug store cpu configuration Use a single configuration for all cpus. 
Reported-by: Ingo Molnar Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144602.191165000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 71cab3b62dce..443f415441da 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -47,9 +47,8 @@ struct ds_configuration { /* Control bit-masks indexed by enum ds_feature: */ unsigned long ctl[dsf_ctl_max]; }; -static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array); +static struct ds_configuration ds_cfg __read_mostly; -#define ds_cfg per_cpu(ds_cfg_array, smp_processor_id()) /* Maximal size of a DS configuration: */ #define MAX_SIZEOF_DS (12 * 8) @@ -1268,6 +1267,10 @@ ds_configure(const struct ds_configuration *cfg, void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) { + /* Only configure the first cpu. Others are identical. */ + if (ds_cfg.name) + return; + switch (c->x86) { case 0x6: switch (c->x86_model) { -- cgit v1.2.2 From 0f4814065ff8c24ca8bfd75c9b73502be152c287 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:48 +0200 Subject: x86, ptrace: add bts context unconditionally Add the ptrace bts context field to task_struct unconditionally. Initialize the field directly in copy_process(). Remove all the unneeded functionality used to initialize that field. Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144603.292754000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ptrace.h | 9 ++++----- arch/x86/kernel/ptrace.c | 20 +------------------- include/linux/ptrace.h | 10 ---------- include/linux/sched.h | 2 -- kernel/fork.c | 4 ++-- kernel/ptrace.c | 10 ---------- 6 files changed, 7 insertions(+), 48 deletions(-) diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index e304b66abeea..5cdd19f20b5b 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -235,12 +235,11 @@ extern int do_get_thread_area(struct task_struct *p, int idx, extern int do_set_thread_area(struct task_struct *p, int idx, struct user_desc __user *info, int can_allocate); -extern void x86_ptrace_untrace(struct task_struct *); -extern void x86_ptrace_fork(struct task_struct *child, - unsigned long clone_flags); +#ifdef CONFIG_X86_PTRACE_BTS +extern void ptrace_bts_untrace(struct task_struct *tsk); -#define arch_ptrace_untrace(tsk) x86_ptrace_untrace(tsk) -#define arch_ptrace_fork(child, flags) x86_ptrace_fork(child, flags) +#define arch_ptrace_untrace(tsk) ptrace_bts_untrace(tsk) +#endif /* CONFIG_X86_PTRACE_BTS */ #endif /* __KERNEL__ */ diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index adbb24322d8f..b32a8ee53381 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -887,37 +887,19 @@ static int ptrace_bts_size(struct task_struct *child) return (trace->ds.top - trace->ds.begin) / trace->ds.size; } -static inline void ptrace_bts_fork(struct task_struct *tsk) -{ - tsk->bts = NULL; -} - /* * Called from __ptrace_unlink() after the child has been moved back * to its original parent. 
*/ -static inline void ptrace_bts_untrace(struct task_struct *child) +void ptrace_bts_untrace(struct task_struct *child) { if (unlikely(child->bts)) { free_bts_context(child->bts); child->bts = NULL; } } -#else -static inline void ptrace_bts_fork(struct task_struct *tsk) {} -static inline void ptrace_bts_untrace(struct task_struct *child) {} #endif /* CONFIG_X86_PTRACE_BTS */ -void x86_ptrace_fork(struct task_struct *child, unsigned long clone_flags) -{ - ptrace_bts_fork(child); -} - -void x86_ptrace_untrace(struct task_struct *child) -{ - ptrace_bts_untrace(child); -} - /* * Called by kernel/ptrace.c when detaching.. * diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 67c15653fc23..59e133d39d50 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -95,7 +95,6 @@ extern void __ptrace_link(struct task_struct *child, struct task_struct *new_parent); extern void __ptrace_unlink(struct task_struct *child); extern void exit_ptrace(struct task_struct *tracer); -extern void ptrace_fork(struct task_struct *task, unsigned long clone_flags); #define PTRACE_MODE_READ 1 #define PTRACE_MODE_ATTACH 2 /* Returns 0 on success, -errno on denial. */ @@ -327,15 +326,6 @@ static inline void user_enable_block_step(struct task_struct *task) #define arch_ptrace_untrace(task) do { } while (0) #endif -#ifndef arch_ptrace_fork -/* - * Do machine-specific work to initialize a new task. - * - * This is called from copy_process(). - */ -#define arch_ptrace_fork(child, clone_flags) do { } while (0) -#endif - extern int task_current_syscall(struct task_struct *target, long *callno, unsigned long args[6], unsigned int maxargs, unsigned long *sp, unsigned long *pc); diff --git a/include/linux/sched.h b/include/linux/sched.h index 52b8cd049c2e..451186a22ef5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1205,13 +1205,11 @@ struct task_struct { struct list_head ptraced; struct list_head ptrace_entry; -#ifdef CONFIG_X86_PTRACE_BTS /* * This is the tracer handle for the ptrace BTS extension. * This field actually belongs to the ptracer task. */ struct bts_context *bts; -#endif /* CONFIG_X86_PTRACE_BTS */ /* PID/PID hash table linkage. */ struct pid_link pids[PIDTYPE_MAX]; diff --git a/kernel/fork.c b/kernel/fork.c index 660c2b8765bc..69bde7a22e9b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1086,8 +1086,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif - if (unlikely(current->ptrace)) - ptrace_fork(p, clone_flags); + + p->bts = NULL; /* Perform scheduler related setup. Assign this task to a CPU. */ sched_fork(p, clone_flags); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index aaad0ec34194..321127d965c2 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -26,16 +26,6 @@ #include -/* - * Initialize a new task whose father had been ptraced. - * - * Called from copy_process(). - */ -void ptrace_fork(struct task_struct *child, unsigned long clone_flags) -{ - arch_ptrace_fork(child, clone_flags); -} - /* * ptrace a task: make the debugger its new parent and * move it to the ptrace list. -- cgit v1.2.2 From 6047550d3d26fed88b18a208b31f8b90b5ef3e9b Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:49 +0200 Subject: x86, ds: dont use TIF_DEBUGCTLMSR Debug store already uses TIF_DS_AREA_MSR to trigger debug store context switch handling. No need to use TIF_DEBUGCTLMSR, as well. 
Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144604.256645000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 443f415441da..cab28320dac7 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -925,11 +925,6 @@ static void update_task_debugctlmsr(struct task_struct *task, get_cpu(); if (task == current) update_debugctlmsr(debugctlmsr); - - if (task->thread.debugctlmsr) - set_tsk_thread_flag(task, TIF_DEBUGCTLMSR); - else - clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR); put_cpu(); } -- cgit v1.2.2 From 608780a9048efa3e85fbc4d8649b26805cc588aa Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:50 +0200 Subject: x86, ds: fix bad ds_reset_pebs() Ds_reset_pebs() passed the wrong qualifier to a shared function resulting in a reset of bts, rather than pebs. Reported-by: Stephane Eranian Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144605.206510000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index cab28320dac7..ebfb0fde8e6f 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -1186,7 +1186,7 @@ int ds_reset_pebs(struct pebs_tracer *tracer) tracer->trace.ds.top = tracer->trace.ds.begin; - ds_set(tracer->ds.context->ds, ds_bts, ds_index, + ds_set(tracer->ds.context->ds, ds_pebs, ds_index, (unsigned long)tracer->trace.ds.top); return 0; -- cgit v1.2.2 From 150f5164c1258e05b7dea16f29e592f354c48f34 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:51 +0200 Subject: x86, ds: allow small debug store buffers Check the buffer size more precisely to allow buffers for exactly one element provided the base address is already properly aligned. Add a debug store selftest. Reported-by: Stephane Eranian Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144606.139137000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 9 +++++++-- arch/x86/kernel/ds_selftest.c | 6 +++--- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index ebfb0fde8e6f..4e05157506aa 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -656,6 +656,7 @@ static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace, { struct ds_context *context; int error; + size_t req_size; error = -EOPNOTSUPP; if (!ds_cfg.sizeof_rec[qual]) @@ -665,9 +666,13 @@ static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace, if (!base) goto out; - /* We need space for alignment adjustments in ds_init_ds_trace(). */ + req_size = ds_cfg.sizeof_rec[qual]; + /* We might need space for alignment adjustments. 
*/ + if (!IS_ALIGNED((unsigned long)base, DS_ALIGNMENT)) + req_size += DS_ALIGNMENT; + error = -EINVAL; - if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual])) + if (size < req_size) goto out; if (th != (size_t)-1) { diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c index a40b2533c71e..5f104a0ace66 100644 --- a/arch/x86/kernel/ds_selftest.c +++ b/arch/x86/kernel/ds_selftest.c @@ -16,8 +16,8 @@ #include -#define BUFFER_SIZE 521 /* Intentionally chose an odd size. */ - +#define BUFFER_SIZE 521 /* Intentionally chose an odd size. */ +#define SMALL_BUFFER_SIZE 24 /* A single bts entry. */ struct ds_selftest_bts_conf { struct bts_tracer *tracer; @@ -381,7 +381,7 @@ int ds_selftest_bts(void) conf.suspend = ds_suspend_bts_noirq; conf.resume = ds_resume_bts_noirq; conf.tracer = - ds_request_bts_task(current, buffer, BUFFER_SIZE, + ds_request_bts_task(current, buffer, SMALL_BUFFER_SIZE, NULL, (size_t)-1, BTS_KERNEL); local_irq_save(irq); ds_selftest_bts_cpu(&conf); -- cgit v1.2.2 From 017bc617657c928cb9a0c45a7a7e9f4e66695347 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:52 +0200 Subject: x86, ds: support Core i7 Add debug store support for Core i7. Core i7 adds a reset value for each performance counter and a new PEBS record format. Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144607.088997000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ds.h | 12 ++++++--- arch/x86/kernel/ds.c | 69 ++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 71 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h index 149e5208e967..70dac199b093 100644 --- a/arch/x86/include/asm/ds.h +++ b/arch/x86/include/asm/ds.h @@ -234,8 +234,12 @@ struct bts_trace { struct pebs_trace { struct ds_trace ds; - /* the PEBS reset value */ - unsigned long long reset_value; + /* the number of valid counters in the below array */ + unsigned int counters; + +#define MAX_PEBS_COUNTERS 4 + /* the counter reset value */ + unsigned long long counter_reset[MAX_PEBS_COUNTERS]; }; @@ -270,9 +274,11 @@ extern int ds_reset_pebs(struct pebs_tracer *tracer); * Returns 0 on success; -Eerrno on error * * tracer: the tracer handle returned from ds_request_pebs() + * counter: the index of the counter * value: the new counter reset value */ -extern int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value); +extern int ds_set_pebs_reset(struct pebs_tracer *tracer, + unsigned int counter, u64 value); /* * Initialization diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 4e05157506aa..48bfe1386038 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -44,6 +44,9 @@ struct ds_configuration { /* The size of a BTS/PEBS record in bytes: */ unsigned char sizeof_rec[2]; + /* The number of pebs counter reset values in the DS structure. 
*/ + unsigned char nr_counter_reset; + /* Control bit-masks indexed by enum ds_feature: */ unsigned long ctl[dsf_ctl_max]; }; @@ -51,7 +54,7 @@ static struct ds_configuration ds_cfg __read_mostly; /* Maximal size of a DS configuration: */ -#define MAX_SIZEOF_DS (12 * 8) +#define MAX_SIZEOF_DS 0x80 /* Maximal size of a BTS record: */ #define MAX_SIZEOF_BTS (3 * 8) @@ -59,6 +62,12 @@ static struct ds_configuration ds_cfg __read_mostly; /* BTS and PEBS buffer alignment: */ #define DS_ALIGNMENT (1 << 3) +/* Number of buffer pointers in DS: */ +#define NUM_DS_PTR_FIELDS 8 + +/* Size of a pebs reset value in DS: */ +#define PEBS_RESET_FIELD_SIZE 8 + /* Mask of control bits in the DS MSR register: */ #define BTS_CONTROL \ ( ds_cfg.ctl[dsf_bts] | \ @@ -1164,9 +1173,12 @@ const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer) return NULL; ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs); - tracer->trace.reset_value = - *(u64 *)(tracer->ds.context->ds + - (ds_cfg.sizeof_ptr_field * 8)); + + tracer->trace.counters = ds_cfg.nr_counter_reset; + memcpy(tracer->trace.counter_reset, + tracer->ds.context->ds + + (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field), + ds_cfg.nr_counter_reset * PEBS_RESET_FIELD_SIZE); return &tracer->trace; } @@ -1197,13 +1209,18 @@ int ds_reset_pebs(struct pebs_tracer *tracer) return 0; } -int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value) +int ds_set_pebs_reset(struct pebs_tracer *tracer, + unsigned int counter, u64 value) { if (!tracer) return -EINVAL; + if (ds_cfg.nr_counter_reset < counter) + return -EINVAL; + *(u64 *)(tracer->ds.context->ds + - (ds_cfg.sizeof_ptr_field * 8)) = value; + (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field) + + (counter * PEBS_RESET_FIELD_SIZE)) = value; return 0; } @@ -1213,16 +1230,26 @@ static const struct ds_configuration ds_cfg_netburst = { .ctl[dsf_bts] = (1 << 2) | (1 << 3), .ctl[dsf_bts_kernel] = (1 << 5), .ctl[dsf_bts_user] = (1 << 6), + .nr_counter_reset = 1, }; static const struct ds_configuration ds_cfg_pentium_m = { .name = "Pentium M", .ctl[dsf_bts] = (1 << 6) | (1 << 7), + .nr_counter_reset = 1, }; static const struct ds_configuration ds_cfg_core2_atom = { .name = "Core 2/Atom", .ctl[dsf_bts] = (1 << 6) | (1 << 7), .ctl[dsf_bts_kernel] = (1 << 9), .ctl[dsf_bts_user] = (1 << 10), + .nr_counter_reset = 1, +}; +static const struct ds_configuration ds_cfg_core_i7 = { + .name = "Core i7", + .ctl[dsf_bts] = (1 << 6) | (1 << 7), + .ctl[dsf_bts_kernel] = (1 << 9), + .ctl[dsf_bts_user] = (1 << 10), + .nr_counter_reset = 4, }; static void @@ -1239,6 +1266,32 @@ ds_configure(const struct ds_configuration *cfg, nr_pebs_fields = 18; #endif + /* + * Starting with version 2, architectural performance + * monitoring supports a format specifier. 
+ */ + if ((cpuid_eax(0xa) & 0xff) > 1) { + unsigned long perf_capabilities, format; + + rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_capabilities); + + format = (perf_capabilities >> 8) & 0xf; + + switch (format) { + case 0: + nr_pebs_fields = 18; + break; + case 1: + nr_pebs_fields = 22; + break; + default: + printk(KERN_INFO + "[ds] unknown PEBS format: %lu\n", format); + nr_pebs_fields = 0; + break; + } + } + memset(&ds_cfg, 0, sizeof(ds_cfg)); ds_cfg = *cfg; @@ -1262,7 +1315,7 @@ ds_configure(const struct ds_configuration *cfg, printk("bts/pebs record: %u/%u bytes\n", ds_cfg.sizeof_rec[ds_bts], ds_cfg.sizeof_rec[ds_pebs]); - WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_ptr_field)); + WARN_ON_ONCE(MAX_PEBS_COUNTERS < ds_cfg.nr_counter_reset); } void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) @@ -1284,6 +1337,8 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) ds_configure(&ds_cfg_core2_atom, c); break; case 0x1a: /* Core i7 */ + ds_configure(&ds_cfg_core_i7, c); + break; default: /* Sorry, don't know about them. */ break; -- cgit v1.2.2 From a5dec5573f3c7e63f2f9b5852b9759ea342a5ff9 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 27 Mar 2009 14:55:44 +0800 Subject: tracing: use macros to denote usec and nsec per second Impact: cleanup Use USEC_PER_SEC and NSEC_PER_SEC instead of 1000000 and 1000000000. Signed-off-by: Li Zefan LKML-Reference: <49CC7870.9000309@cn.fujitsu.com> Acked-by: Frederic Weisbecker Signed-off-by: Steven Rostedt --- kernel/trace/trace_boot.c | 5 +++-- kernel/trace/trace_mmiotrace.c | 6 ++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 7a30fc4c3642..a29ef23ffb47 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "trace.h" #include "trace_output.h" @@ -67,7 +68,7 @@ initcall_call_print_line(struct trace_iterator *iter) trace_assign_type(field, entry); call = &field->boot_call; ts = iter->ts; - nsec_rem = do_div(ts, 1000000000); + nsec_rem = do_div(ts, NSEC_PER_SEC); ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n", (unsigned long)ts, nsec_rem, call->func, call->caller); @@ -92,7 +93,7 @@ initcall_ret_print_line(struct trace_iterator *iter) trace_assign_type(field, entry); init_ret = &field->boot_ret; ts = iter->ts; - nsec_rem = do_div(ts, 1000000000); + nsec_rem = do_div(ts, NSEC_PER_SEC); ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s " "returned %d after %llu msecs\n", diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 8e37fcddd8b4..d53b45ed0806 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -9,6 +9,8 @@ #include #include #include +#include + #include #include "trace.h" @@ -174,7 +176,7 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter) struct mmiotrace_rw *rw; struct trace_seq *s = &iter->seq; unsigned long long t = ns2usecs(iter->ts); - unsigned long usec_rem = do_div(t, 1000000ULL); + unsigned long usec_rem = do_div(t, USEC_PER_SEC); unsigned secs = (unsigned long)t; int ret = 1; @@ -221,7 +223,7 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter) struct mmiotrace_map *m; struct trace_seq *s = &iter->seq; unsigned long long t = ns2usecs(iter->ts); - unsigned long usec_rem = do_div(t, 1000000ULL); + unsigned long usec_rem = do_div(t, USEC_PER_SEC); unsigned secs = (unsigned long)t; int ret; -- cgit v1.2.2 From 5452af664f6fba26b80eb2c8c4ceae2999d5cf56 Mon Sep 17 00:00:00 
2001 From: Frederic Weisbecker Date: Fri, 27 Mar 2009 00:25:38 +0100 Subject: tracing/ftrace: factorize the tracing files creation Impact: cleanup Most of the tracing files creation follow the same pattern: ret = debugfs_create_file(...) if (!ret) pr_warning("Couldn't create ... entry\n") Unify it! Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker LKML-Reference: <1238109938-11840-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 39 +++------ kernel/trace/ring_buffer.c | 7 +- kernel/trace/trace.c | 159 +++++++++++++++---------------------- kernel/trace/trace.h | 6 ++ kernel/trace/trace_event_profile.c | 1 - kernel/trace/trace_printk.c | 6 +- kernel/trace/trace_stack.c | 13 +-- kernel/trace/trace_sysprof.c | 6 +- 8 files changed, 86 insertions(+), 151 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 678e3d6caf85..6ea5a1ae6a98 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2698,38 +2698,23 @@ static const struct file_operations ftrace_graph_fops = { static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { - struct dentry *entry; - entry = debugfs_create_file("available_filter_functions", 0444, - d_tracer, NULL, &ftrace_avail_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'available_filter_functions' entry\n"); + trace_create_file("available_filter_functions", 0444, + d_tracer, NULL, &ftrace_avail_fops); - entry = debugfs_create_file("failures", 0444, - d_tracer, NULL, &ftrace_failures_fops); - if (!entry) - pr_warning("Could not create debugfs 'failures' entry\n"); + trace_create_file("failures", 0444, + d_tracer, NULL, &ftrace_failures_fops); - entry = debugfs_create_file("set_ftrace_filter", 0644, d_tracer, - NULL, &ftrace_filter_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'set_ftrace_filter' entry\n"); + trace_create_file("set_ftrace_filter", 0644, d_tracer, + NULL, &ftrace_filter_fops); - entry = debugfs_create_file("set_ftrace_notrace", 0644, d_tracer, + trace_create_file("set_ftrace_notrace", 0644, d_tracer, NULL, &ftrace_notrace_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'set_ftrace_notrace' entry\n"); #ifdef CONFIG_FUNCTION_GRAPH_TRACER - entry = debugfs_create_file("set_graph_function", 0444, d_tracer, + trace_create_file("set_graph_function", 0444, d_tracer, NULL, &ftrace_graph_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'set_graph_function' entry\n"); #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ return 0; @@ -2987,7 +2972,6 @@ static const struct file_operations ftrace_pid_fops = { static __init int ftrace_init_debugfs(void) { struct dentry *d_tracer; - struct dentry *entry; d_tracer = tracing_init_dentry(); if (!d_tracer) @@ -2995,11 +2979,8 @@ static __init int ftrace_init_debugfs(void) ftrace_init_dyn_debugfs(d_tracer); - entry = debugfs_create_file("set_ftrace_pid", 0644, d_tracer, - NULL, &ftrace_pid_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'set_ftrace_pid' entry\n"); + trace_create_file("set_ftrace_pid", 0644, d_tracer, + NULL, &ftrace_pid_fops); ftrace_profile_debugfs(d_tracer); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 960cbf44c844..74a11808c282 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2845,14 +2845,11 @@ static const struct file_operations rb_simple_fops = { static __init int rb_init_debugfs(void) { struct dentry *d_tracer; - struct dentry *entry; d_tracer = tracing_init_dentry(); - entry 
= debugfs_create_file("tracing_on", 0644, d_tracer, - &ring_buffer_flags, &rb_simple_fops); - if (!entry) - pr_warning("Could not create debugfs 'tracing_on' entry\n"); + trace_create_file("tracing_on", 0644, d_tracer, + &ring_buffer_flags, &rb_simple_fops); return 0; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 32653c8c6e26..0615751a3ed7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3581,7 +3581,7 @@ struct dentry *tracing_dentry_percpu(void) static void tracing_init_debugfs_percpu(long cpu) { struct dentry *d_percpu = tracing_dentry_percpu(); - struct dentry *entry, *d_cpu; + struct dentry *d_cpu; /* strlen(cpu) + MAX(log10(cpu)) + '\0' */ char cpu_dir[7]; @@ -3596,21 +3596,15 @@ static void tracing_init_debugfs_percpu(long cpu) } /* per cpu trace_pipe */ - entry = debugfs_create_file("trace_pipe", 0444, d_cpu, - (void *) cpu, &tracing_pipe_fops); - if (!entry) - pr_warning("Could not create debugfs 'trace_pipe' entry\n"); + trace_create_file("trace_pipe", 0444, d_cpu, + (void *) cpu, &tracing_pipe_fops); /* per cpu trace */ - entry = debugfs_create_file("trace", 0644, d_cpu, - (void *) cpu, &tracing_fops); - if (!entry) - pr_warning("Could not create debugfs 'trace' entry\n"); + trace_create_file("trace", 0644, d_cpu, + (void *) cpu, &tracing_fops); - entry = debugfs_create_file("trace_pipe_raw", 0444, d_cpu, - (void *) cpu, &tracing_buffers_fops); - if (!entry) - pr_warning("Could not create debugfs 'trace_pipe_raw' entry\n"); + trace_create_file("trace_pipe_raw", 0444, d_cpu, + (void *) cpu, &tracing_buffers_fops); } #ifdef CONFIG_FTRACE_SELFTEST @@ -3766,6 +3760,22 @@ static const struct file_operations trace_options_core_fops = { .write = trace_options_core_write, }; +struct dentry *trace_create_file(const char *name, + mode_t mode, + struct dentry *parent, + void *data, + const struct file_operations *fops) +{ + struct dentry *ret; + + ret = debugfs_create_file(name, mode, parent, data, fops); + if (!ret) + pr_warning("Could not create debugfs '%s' entry\n", name); + + return ret; +} + + static struct dentry *trace_options_init_dentry(void) { struct dentry *d_tracer; @@ -3793,7 +3803,6 @@ create_trace_option_file(struct trace_option_dentry *topt, struct tracer_opt *opt) { struct dentry *t_options; - struct dentry *entry; t_options = trace_options_init_dentry(); if (!t_options) @@ -3802,11 +3811,9 @@ create_trace_option_file(struct trace_option_dentry *topt, topt->flags = flags; topt->opt = opt; - entry = debugfs_create_file(opt->name, 0644, t_options, topt, + topt->entry = trace_create_file(opt->name, 0644, t_options, topt, &trace_options_fops); - topt->entry = entry; - } static struct trace_option_dentry * @@ -3861,123 +3868,81 @@ static struct dentry * create_trace_option_core_file(const char *option, long index) { struct dentry *t_options; - struct dentry *entry; t_options = trace_options_init_dentry(); if (!t_options) return NULL; - entry = debugfs_create_file(option, 0644, t_options, (void *)index, + return trace_create_file(option, 0644, t_options, (void *)index, &trace_options_core_fops); - - return entry; } static __init void create_trace_options_dir(void) { struct dentry *t_options; - struct dentry *entry; int i; t_options = trace_options_init_dentry(); if (!t_options) return; - for (i = 0; trace_options[i]; i++) { - entry = create_trace_option_core_file(trace_options[i], i); - if (!entry) - pr_warning("Could not create debugfs %s entry\n", - trace_options[i]); - } + for (i = 0; trace_options[i]; i++) + 
create_trace_option_core_file(trace_options[i], i); } static __init int tracer_init_debugfs(void) { struct dentry *d_tracer; - struct dentry *entry; int cpu; d_tracer = tracing_init_dentry(); - entry = debugfs_create_file("tracing_enabled", 0644, d_tracer, - &global_trace, &tracing_ctrl_fops); - if (!entry) - pr_warning("Could not create debugfs 'tracing_enabled' entry\n"); + trace_create_file("tracing_enabled", 0644, d_tracer, + &global_trace, &tracing_ctrl_fops); - entry = debugfs_create_file("trace_options", 0644, d_tracer, - NULL, &tracing_iter_fops); - if (!entry) - pr_warning("Could not create debugfs 'trace_options' entry\n"); + trace_create_file("trace_options", 0644, d_tracer, + NULL, &tracing_iter_fops); - create_trace_options_dir(); + trace_create_file("tracing_cpumask", 0644, d_tracer, + NULL, &tracing_cpumask_fops); + + trace_create_file("trace", 0644, d_tracer, + (void *) TRACE_PIPE_ALL_CPU, &tracing_fops); + + trace_create_file("available_tracers", 0444, d_tracer, + &global_trace, &show_traces_fops); + + trace_create_file("current_tracer", 0444, d_tracer, + &global_trace, &set_tracer_fops); + + trace_create_file("tracing_max_latency", 0644, d_tracer, + &tracing_max_latency, &tracing_max_lat_fops); + + trace_create_file("tracing_thresh", 0644, d_tracer, + &tracing_thresh, &tracing_max_lat_fops); - entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer, - NULL, &tracing_cpumask_fops); - if (!entry) - pr_warning("Could not create debugfs 'tracing_cpumask' entry\n"); - - entry = debugfs_create_file("trace", 0644, d_tracer, - (void *) TRACE_PIPE_ALL_CPU, &tracing_fops); - if (!entry) - pr_warning("Could not create debugfs 'trace' entry\n"); - - entry = debugfs_create_file("available_tracers", 0444, d_tracer, - &global_trace, &show_traces_fops); - if (!entry) - pr_warning("Could not create debugfs 'available_tracers' entry\n"); - - entry = debugfs_create_file("current_tracer", 0444, d_tracer, - &global_trace, &set_tracer_fops); - if (!entry) - pr_warning("Could not create debugfs 'current_tracer' entry\n"); - - entry = debugfs_create_file("tracing_max_latency", 0644, d_tracer, - &tracing_max_latency, - &tracing_max_lat_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'tracing_max_latency' entry\n"); - - entry = debugfs_create_file("tracing_thresh", 0644, d_tracer, - &tracing_thresh, &tracing_max_lat_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'tracing_thresh' entry\n"); - entry = debugfs_create_file("README", 0644, d_tracer, - NULL, &tracing_readme_fops); - if (!entry) - pr_warning("Could not create debugfs 'README' entry\n"); - - entry = debugfs_create_file("trace_pipe", 0444, d_tracer, + trace_create_file("README", 0644, d_tracer, + NULL, &tracing_readme_fops); + + trace_create_file("trace_pipe", 0444, d_tracer, (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'trace_pipe' entry\n"); - - entry = debugfs_create_file("buffer_size_kb", 0644, d_tracer, - &global_trace, &tracing_entries_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'buffer_size_kb' entry\n"); - - entry = debugfs_create_file("trace_marker", 0220, d_tracer, - NULL, &tracing_mark_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'trace_marker' entry\n"); + + trace_create_file("buffer_size_kb", 0644, d_tracer, + &global_trace, &tracing_entries_fops); + + trace_create_file("trace_marker", 0220, d_tracer, + NULL, &tracing_mark_fops); #ifdef CONFIG_DYNAMIC_FTRACE - entry = 
debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer, - &ftrace_update_tot_cnt, - &tracing_dyn_info_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'dyn_ftrace_total_info' entry\n"); + trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, + &ftrace_update_tot_cnt, &tracing_dyn_info_fops); #endif #ifdef CONFIG_SYSPROF_TRACER init_tracer_sysprof_debugfs(d_tracer); #endif + create_trace_options_dir(); + for_each_tracing_cpu(cpu) tracing_init_debugfs_percpu(cpu); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 47aa6d0c97a0..f76a8f8689d4 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -470,6 +470,12 @@ void trace_wake_up(void); void tracing_reset(struct trace_array *tr, int cpu); void tracing_reset_online_cpus(struct trace_array *tr); int tracing_open_generic(struct inode *inode, struct file *filp); +struct dentry *trace_create_file(const char *name, + mode_t mode, + struct dentry *parent, + void *data, + const struct file_operations *fops); + struct dentry *tracing_init_dentry(void); void init_tracer_sysprof_debugfs(struct dentry *d_tracer); diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index 22cba9970776..199de9c74229 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -28,4 +28,3 @@ void ftrace_profile_disable(int event_id) return event->profile_disable(event); } } - diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index eb81556107fe..9bece9687b62 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -245,17 +245,13 @@ static const struct file_operations ftrace_formats_fops = { static __init int init_trace_printk_function_export(void) { struct dentry *d_tracer; - struct dentry *entry; d_tracer = tracing_init_dentry(); if (!d_tracer) return 0; - entry = debugfs_create_file("printk_formats", 0444, d_tracer, + trace_create_file("printk_formats", 0444, d_tracer, NULL, &ftrace_formats_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'printk_formats' entry\n"); return 0; } diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index c750f65f9661..1796f00524e1 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -352,19 +352,14 @@ __setup("stacktrace", enable_stacktrace); static __init int stack_trace_init(void) { struct dentry *d_tracer; - struct dentry *entry; d_tracer = tracing_init_dentry(); - entry = debugfs_create_file("stack_max_size", 0644, d_tracer, - &max_stack_size, &stack_max_size_fops); - if (!entry) - pr_warning("Could not create debugfs 'stack_max_size' entry\n"); + trace_create_file("stack_max_size", 0644, d_tracer, + &max_stack_size, &stack_max_size_fops); - entry = debugfs_create_file("stack_trace", 0444, d_tracer, - NULL, &stack_trace_fops); - if (!entry) - pr_warning("Could not create debugfs 'stack_trace' entry\n"); + trace_create_file("stack_trace", 0444, d_tracer, + NULL, &stack_trace_fops); if (stack_tracer_enabled) register_ftrace_function(&trace_ops); diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index 91fd19c2149f..e04b76cc238a 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -321,11 +321,7 @@ static const struct file_operations sysprof_sample_fops = { void init_tracer_sysprof_debugfs(struct dentry *d_tracer) { - struct dentry *entry; - entry = debugfs_create_file("sysprof_sample_period", 0644, + trace_create_file("sysprof_sample_period", 0644, d_tracer, NULL, 
&sysprof_sample_fops); - if (entry) - return; - pr_warning("Could not create debugfs 'sysprof_sample_period' entry\n"); } -- cgit v1.2.2 From 597af81537654097b67fd7a0c92775e66d4a86fe Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 3 Apr 2009 15:24:12 -0400 Subject: function-graph: use int instead of atomic for ftrace_graph_active Impact: cleanup The variable ftrace_graph_active is only modified under the ftrace_lock mutex, thus an atomic is not necessary for modification. Reported-by: Andrew Morton Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6ea5a1ae6a98..8e6a0b5c9940 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3092,7 +3092,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, #ifdef CONFIG_FUNCTION_GRAPH_TRACER -static atomic_t ftrace_graph_active; +static int ftrace_graph_active; static struct notifier_block ftrace_suspend_notifier; int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) @@ -3244,7 +3244,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, mutex_lock(&ftrace_lock); /* we currently allow only one tracer registered at a time */ - if (atomic_read(&ftrace_graph_active)) { + if (ftrace_graph_active) { ret = -EBUSY; goto out; } @@ -3252,10 +3252,10 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call; register_pm_notifier(&ftrace_suspend_notifier); - atomic_inc(&ftrace_graph_active); + ftrace_graph_active++; ret = start_graph_tracing(); if (ret) { - atomic_dec(&ftrace_graph_active); + ftrace_graph_active--; goto out; } @@ -3273,10 +3273,10 @@ void unregister_ftrace_graph(void) { mutex_lock(&ftrace_lock); - if (!unlikely(atomic_read(&ftrace_graph_active))) + if (unlikely(!ftrace_graph_active)) goto out; - atomic_dec(&ftrace_graph_active); + ftrace_graph_active--; unregister_trace_sched_switch(ftrace_graph_probe_sched_switch); ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; ftrace_graph_entry = ftrace_graph_entry_stub; @@ -3290,7 +3290,7 @@ void unregister_ftrace_graph(void) /* Allocate a return stack for newly created task */ void ftrace_graph_init_task(struct task_struct *t) { - if (atomic_read(&ftrace_graph_active)) { + if (ftrace_graph_active) { t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH * sizeof(struct ftrace_ret_stack), GFP_KERNEL); -- cgit v1.2.2 From dcef788eb9659b61a2110284fcce3ca6e63480d2 Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Tue, 31 Mar 2009 15:26:14 +0800 Subject: ftrace: clean up enable logic for sched_switch Unify sched_switch and sched_wakeup's action to following logic: Do record_cmdline when start_cmdline_record() is called. Start tracing events when the tracer is started. 
Signed-off-by: Zhao Lei LKML-Reference: <49D1C596.5050203@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_sched_switch.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 9117cea6f1ae..9d8cccdfaa06 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -29,13 +29,13 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev, int cpu; int pc; - if (!sched_ref || sched_stopped) + if (unlikely(!sched_ref)) return; tracing_record_cmdline(prev); tracing_record_cmdline(next); - if (!tracer_enabled) + if (!tracer_enabled || sched_stopped) return; pc = preempt_count(); @@ -56,15 +56,15 @@ probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success) unsigned long flags; int cpu, pc; - if (!likely(tracer_enabled)) + if (unlikely(!sched_ref)) return; - pc = preempt_count(); tracing_record_cmdline(current); - if (sched_stopped) + if (!tracer_enabled || sched_stopped) return; + pc = preempt_count(); local_irq_save(flags); cpu = raw_smp_processor_id(); data = ctx_trace->data[cpu]; -- cgit v1.2.2 From 44bc9dc729e33a4ec6ebed4d0b6c08e8d20b42cf Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 10:47:17 +0200 Subject: mm, x86, ptrace, bts: defer branch trace stopping, cleanup Andrew Morton noticed that mm.h needlessly includes sched.h - remove it. Reported-by: Andrew Morton Signed-off-by: Ingo Molnar --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 64d8ed2538ae..776b641f37e3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -13,7 +13,6 @@ #include #include #include -#include struct mempolicy; struct anon_vma; -- cgit v1.2.2 From a34b50ddc265bae058c66661b096ef6384c5a8b1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 10:56:54 +0200 Subject: mm, x86, ptrace, bts: defer branch trace stopping, remove dead code Remove the unused free_locked_buffer() API. Signed-off-by: Ingo Molnar --- include/linux/mm.h | 1 - mm/mlock.c | 6 ------ 2 files changed, 7 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 776b641f37e3..a3963ba23a6d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1320,7 +1320,6 @@ int vmemmap_populate(struct page *start_page, unsigned long pages, int node); void vmemmap_populate_print_last(void); extern void *alloc_locked_buffer(size_t size); -extern void free_locked_buffer(void *buffer, size_t size); extern void refund_locked_buffer_memory(struct mm_struct *mm, size_t size); #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/mm/mlock.c b/mm/mlock.c index 749383b442c7..28be15ead9c1 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -671,9 +671,3 @@ void refund_locked_buffer_memory(struct mm_struct *mm, size_t size) up_write(&mm->mmap_sem); } - -void free_locked_buffer(void *buffer, size_t size) -{ - refund_locked_buffer_memory(current->mm, size); - kfree(buffer); -} -- cgit v1.2.2 From 2062501ae6505dbc5bff3a792246c2661d114050 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 6 Apr 2009 01:49:33 +0200 Subject: tracing/lockdep: report the time waited for a lock While trying to optimize the new lock on reiserfs to replace the bkl, I find the lock tracing very useful though it lacks something important for performance (and latency) instrumentation: the time a task waits for a lock. 
That's what this patch implements: bash-4816 [000] 202.652815: lock_contended: lock_contended: &sb->s_type->i_mutex_key bash-4816 [000] 202.652819: lock_acquired: &rq->lock (0.000 us) <...>-4787 [000] 202.652825: lock_acquired: &rq->lock (0.000 us) <...>-4787 [000] 202.652829: lock_acquired: &rq->lock (0.000 us) bash-4816 [000] 202.652833: lock_acquired: &sb->s_type->i_mutex_key (16.005 us) As shown above, the "lock acquired" field is followed by the time it has been waiting for the lock. Usually, a lock contended entry is followed by a near lock_acquired entry with a non-zero time waited. Signed-off-by: Frederic Weisbecker Acked-by: Peter Zijlstra Cc: Steven Rostedt LKML-Reference: <1238975373-15739-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/trace/lockdep_event_types.h | 23 ++++++++++++++++++----- kernel/lockdep.c | 8 ++++---- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/include/trace/lockdep_event_types.h b/include/trace/lockdep_event_types.h index adccfcd2ec8f..863f1e4583a6 100644 --- a/include/trace/lockdep_event_types.h +++ b/include/trace/lockdep_event_types.h @@ -32,11 +32,24 @@ TRACE_FORMAT(lock_contended, TP_FMT("%s", lock->name) ); -TRACE_FORMAT(lock_acquired, - TP_PROTO(struct lockdep_map *lock, unsigned long ip), - TP_ARGS(lock, ip), - TP_FMT("%s", lock->name) - ); +TRACE_EVENT(lock_acquired, + TP_PROTO(struct lockdep_map *lock, unsigned long ip, s64 waittime), + + TP_ARGS(lock, ip, waittime), + + TP_STRUCT__entry( + __field(const char *, name) + __field(unsigned long, wait_usec) + __field(unsigned long, wait_nsec_rem) + ), + TP_fast_assign( + __entry->name = lock->name; + __entry->wait_nsec_rem = do_div(waittime, NSEC_PER_USEC); + __entry->wait_usec = (unsigned long) waittime; + ), + TP_printk("%s (%lu.%03lu us)", __entry->name, __entry->wait_usec, + __entry->wait_nsec_rem) +); #endif #endif diff --git a/kernel/lockdep.c b/kernel/lockdep.c index b0f011866969..c4582a6ea953 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3061,6 +3061,8 @@ found_it: put_lock_stats(stats); } +DEFINE_TRACE(lock_acquired); + static void __lock_acquired(struct lockdep_map *lock, unsigned long ip) { @@ -3099,6 +3101,8 @@ found_it: hlock->holdtime_stamp = now; } + trace_lock_acquired(lock, ip, waittime); + stats = get_lock_stats(hlock_class(hlock)); if (waittime) { if (hlock->read) @@ -3137,14 +3141,10 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip) } EXPORT_SYMBOL_GPL(lock_contended); -DEFINE_TRACE(lock_acquired); - void lock_acquired(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; - trace_lock_acquired(lock, ip); - if (unlikely(!lock_stat)) return; -- cgit v1.2.2 From e71e99c294058a61b7a8b9bb6da2f745ac51aa4f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 25 Mar 2009 14:30:04 -0400 Subject: x86, function-graph: only save return values on x86_64 Impact: speed up The return to handler portion of the function graph tracer should only need to save the return values. The caller already saved off the registers that the callee can modify. The returning function already saved the registers it modified. When we call our own trace function it too will save the registers that the callee must restore. There's no reason to save off anything more that the registers used to return the values. Note, I did a complete kernel build with this modification and the function graph tracer running on x86_64. 
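For context, the x86_64 calling convention returns integer values in %rax, with %rdx carrying the upper half of a 128-bit return, so those two registers are all return_to_handler has to preserve around its call. A condensed sketch of the resulting sequence (illustrative only; the actual hunk below keeps the original 80-byte scratch frame):

	subq  $24, %rsp
	movq  %rax, (%rsp)		/* first return-value register */
	movq  %rdx, 8(%rsp)		/* second return-value register */
	call  ftrace_return_to_handler
	movq  %rax, 16(%rsp)		/* original return address */
	movq  8(%rsp), %rdx
	movq  (%rsp), %rax
	addq  $16, %rsp
	retq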
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index a331ec38af9e..1ac99865591c 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -147,27 +147,14 @@ END(ftrace_graph_caller) GLOBAL(return_to_handler) subq $80, %rsp + /* Save the return values */ movq %rax, (%rsp) - movq %rcx, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rsi, 24(%rsp) - movq %rdi, 32(%rsp) - movq %r8, 40(%rsp) - movq %r9, 48(%rsp) - movq %r10, 56(%rsp) - movq %r11, 64(%rsp) + movq %rdx, 8(%rsp) call ftrace_return_to_handler movq %rax, 72(%rsp) - movq 64(%rsp), %r11 - movq 56(%rsp), %r10 - movq 48(%rsp), %r9 - movq 40(%rsp), %r8 - movq 32(%rsp), %rdi - movq 24(%rsp), %rsi - movq 16(%rsp), %rdx - movq 8(%rsp), %rcx + movq 8(%rsp), %rdx movq (%rsp), %rax addq $72, %rsp retq -- cgit v1.2.2 From 5cb3d1d9d34ac04bcaa2034139345b2a5fea54c1 Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Thu, 9 Apr 2009 14:08:18 +0800 Subject: tracing, net, skb tracepoint: make skb tracepoint use the TRACE_EVENT() macro TRACE_EVENT is a more generic way to define a tracepoint. Doing so adds these new capabilities to this tracepoint: - zero-copy and per-cpu splice() tracing - binary tracing without printf overhead - structured logging records exposed under /debug/tracing/events - trace events embedded in function tracer output and other plugins - user-defined, per tracepoint filter expressions Signed-off-by: Zhao Lei Acked-by: Neil Horman Cc: "David S. Miller" Cc: Arnaldo Carvalho de Melo Cc: "Steven Rostedt ;" Cc: Frederic Weisbecker Cc: Tom Zanussi LKML-Reference: <49DD90D2.5020604@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- include/trace/skb.h | 4 +--- include/trace/skb_event_types.h | 38 ++++++++++++++++++++++++++++++++++++++ include/trace/trace_event_types.h | 1 + include/trace/trace_events.h | 1 + 4 files changed, 41 insertions(+), 3 deletions(-) create mode 100644 include/trace/skb_event_types.h diff --git a/include/trace/skb.h b/include/trace/skb.h index b66206d9be72..d2de7174a6e8 100644 --- a/include/trace/skb.h +++ b/include/trace/skb.h @@ -4,8 +4,6 @@ #include #include -DECLARE_TRACE(kfree_skb, - TP_PROTO(struct sk_buff *skb, void *location), - TP_ARGS(skb, location)); +#include #endif diff --git a/include/trace/skb_event_types.h b/include/trace/skb_event_types.h new file mode 100644 index 000000000000..4a1c504c0e16 --- /dev/null +++ b/include/trace/skb_event_types.h @@ -0,0 +1,38 @@ + +/* use instead */ +#ifndef TRACE_EVENT +# error Do not include this file directly. +# error Unless you know what you are doing. 
+#endif + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM skb + +/* + * Tracepoint for free an sk_buff: + */ +TRACE_EVENT(kfree_skb, + + TP_PROTO(struct sk_buff *skb, void *location), + + TP_ARGS(skb, location), + + TP_STRUCT__entry( + __field( void *, skbaddr ) + __field( unsigned short, protocol ) + __field( void *, location ) + ), + + TP_fast_assign( + __entry->skbaddr = skb; + if (skb) { + __entry->protocol = ntohs(skb->protocol); + } + __entry->location = location; + ), + + TP_printk("skbaddr=%p protocol=%u location=%p", + __entry->skbaddr, __entry->protocol, __entry->location) +); + +#undef TRACE_SYSTEM diff --git a/include/trace/trace_event_types.h b/include/trace/trace_event_types.h index df56f5694be6..33b6bfcba93b 100644 --- a/include/trace/trace_event_types.h +++ b/include/trace/trace_event_types.h @@ -3,3 +3,4 @@ #include #include #include +#include diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h index fd13750ca4ba..0e2aa80076d9 100644 --- a/include/trace/trace_events.h +++ b/include/trace/trace_events.h @@ -3,3 +3,4 @@ #include #include #include +#include -- cgit v1.2.2 From 56c49951747f250d8398582509e02ae5ce1d36d1 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 11 Apr 2009 15:51:19 -0400 Subject: tracing: Add documentation for the power tracer Signed-off-by: "Theodore Ts'o" Acked-by: Arjan van de Ven Cc: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <1239479479-2603-4-git-send-email-tytso@mit.edu> Signed-off-by: Ingo Molnar --- Documentation/trace/power.txt | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 Documentation/trace/power.txt diff --git a/Documentation/trace/power.txt b/Documentation/trace/power.txt new file mode 100644 index 000000000000..cd805e16dc27 --- /dev/null +++ b/Documentation/trace/power.txt @@ -0,0 +1,17 @@ +The power tracer collects detailed information about C-state and P-state +transitions, instead of just looking at the high-level "average" +information. + +There is a helper script found in scrips/tracing/power.pl in the kernel +sources which can be used to parse this information and create a +Scalable Vector Graphics (SVG) picture from the trace data. + +To use this tracer: + + echo 0 > /sys/kernel/debug/tracing/tracing_enabled + echo power > /sys/kernel/debug/tracing/current_tracer + echo 1 > /sys/kernel/debug/tracing/tracing_enabled + sleep 1 + echo 0 > /sys/kernel/debug/tracing/tracing_enabled + cat /sys/kernel/debug/tracing/trace | \ + perl scripts/tracing/power.pl > out.sv -- cgit v1.2.2 From abd41443ac76d3e9c29a8c1d9e9a3312306cc55e Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 11 Apr 2009 15:51:18 -0400 Subject: tracing: Document the event tracing system Signed-off-by: "Theodore Ts'o" Cc: Theodore Ts'o Cc: Steven Rostedt LKML-Reference: <1239479479-2603-3-git-send-email-tytso@mit.edu> Signed-off-by: Ingo Molnar --- Documentation/trace/events.txt | 135 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 Documentation/trace/events.txt diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt new file mode 100644 index 000000000000..abdee664c0f6 --- /dev/null +++ b/Documentation/trace/events.txt @@ -0,0 +1,135 @@ + Event Tracing + + Documentation written by Theodore Ts'o + +Introduction +============ + +Tracepoints (see Documentation/trace/tracepoints.txt) can be used +without creating custom kernel modules to register probe functions +using the event tracing infrastructure. 
+ +Not all tracepoints can be traced using the event tracing system; +the kernel developer must provide code snippets which define how the +tracing information is saved into the tracing buffer, and how the +the tracing information should be printed. + +Using Event Tracing +=================== + +The events which are available for tracing can be found in the file +/sys/kernel/debug/tracing/available_events. + +To enable a particular event, such as 'sched_wakeup', simply echo it +to /sys/debug/tracing/set_event. For example: + + # echo sched_wakeup > /sys/kernel/debug/tracing/set_event + +[ Note: events can also be enabled/disabled via the 'enabled' toggle + found in the /sys/kernel/tracing/events/ hierarchy of directories. ] + +To disable an event, echo the event name to the set_event file prefixed +with an exclamation point: + + # echo '!sched_wakeup' >> /sys/kernel/debug/tracing/set_event + +To disable events, echo an empty line to the set_event file: + + # echo > /sys/kernel/debug/tracing/set_event + +The events are organized into subsystems, such as ext4, irq, sched, +etc., and a full event name looks like this: :. The +subsystem name is optional, but it is displayed in the available_events +file. All of the events in a subsystem can be specified via the syntax +":*"; for example, to enable all irq events, you can use the +command: + + # echo 'irq:*' > /sys/kernel/debug/tracing/set_event + +Defining an event-enabled tracepoint +------------------------------------ + +A kernel developer which wishes to define an event-enabled tracepoint +must declare the tracepoint using TRACE_EVENT instead of DECLARE_TRACE. +This is done via two header files in include/trace. For example, to +event-enable the jbd2 subsystem, we must create two files, +include/trace/jbd2.h and include/trace/jbd2_event_types.h. The +include/trace/jbd2.h file should be included by kernel source files that +will have a tracepoint inserted, and might look like this: + +#ifndef _TRACE_JBD2_H +#define _TRACE_JBD2_H + +#include +#include + +#include + +#endif + +In a file that utilizes a jbd2 tracepoint, this header file would be +included. Note that you still have to use DEFINE_TRACE(). So for +example, if fs/jbd2/commit.c planned to use the jbd2_start_commit +tracepoint, it would have the following near the beginning of the file: + +#include + +DEFINE_TRACE(jbd2_start_commit); + +Then in the function that would call the tracepoint, it would call the +tracepoint function. (For more information, please see the tracepoint +documentation in Documentation/trace/tracepoints.txt): + + trace_jbd2_start_commit(journal, commit_transaction); + +The code snippets which allow jbd2_start_commit to be an event-enabled +tracepoint are placed in the file include/trace/jbd2_event_types.h: + +/* use instead */ +#ifndef TRACE_EVENT +# error Do not include this file directly. +# error Unless you know what you are doing. +#endif + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM jbd2 + +#include + +TRACE_EVENT(jbd2_start_commit, + TP_PROTO(journal_t *journal, transaction_t *commit_transaction), + TP_ARGS(journal, commit_transaction), + TP_STRUCT__entry( + __array( char, devname, BDEVNAME_SIZE+24 ) + __field( int, transaction ) + ), + TP_fast_assign( + memcpy(__entry->devname, journal->j_devname, BDEVNAME_SIZE+24); + __entry->transaction = commit_transaction->t_tid; + ), + TP_printk("dev %s transaction %d", + __entry->devname, __entry->transaction) +); + +The TP_PROTO and TP_ARGS are unchanged from DECLARE_TRACE. 
The new +arguments to TRACE_EVENT are TP_STRUCT__entry, TP_fast_assign, and +TP_printk. + +TP_STRUCT__entry defines the data structure which will be stored in the +trace buffer. Normally, fields in __entry will be arrays or simple +types. It is possible to place data structures in __entry --- however, +pointers in the data structure can not be trusted, since they will be +accessed sometime later by TP_printk, and if the data structure contains +fields that will not or cannot be used by TP_printk, this will waste +space in the trace buffer. In general, data structures should be +avoided, unless they do only contain non-pointer types and all of the +fields will be used by TP_printk. + +TP_fast_assign defines the code snippet which saves information into the +__entry data structure, using the passed-in arguments defined in +TP_PROTO and TP_ARGS. + +Finally, TP_printk will print the __entry data structure. At the time +when the code snippet defined by TP_printk is executed, it will not have +access to the TP_ARGS arguments; it can only use the information saved +in the __entry data structure. -- cgit v1.2.2 From 02af61bb50f5d5f0322dbe5ab2a0d75808d25c7b Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Fri, 10 Apr 2009 14:26:18 +0800 Subject: tracing, kmemtrace: Separate include/trace/kmemtrace.h to kmemtrace part and tracepoint part Impact: refactor code for future changes Current kmemtrace.h is used both as header file of kmemtrace and kmem's tracepoints definition. Tracepoints' definition file may be used by other code, and should only have definition of tracepoint. We can separate include/trace/kmemtrace.h into 2 files: include/linux/kmemtrace.h: header file for kmemtrace include/trace/kmem.h: definition of kmem tracepoints Signed-off-by: Zhao Lei Acked-by: Eduard - Gabriel Munteanu Acked-by: Pekka Enberg Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Tom Zanussi LKML-Reference: <49DEE68A.5040902@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- include/linux/kmemtrace.h | 25 +++++++++++++++++++ include/linux/slab_def.h | 2 +- include/linux/slub_def.h | 2 +- include/trace/kmem.h | 44 +++++++++++++++++++++++++++++++++ include/trace/kmemtrace.h | 63 ----------------------------------------------- init/main.c | 2 +- kernel/trace/kmemtrace.c | 2 +- kernel/trace/trace.h | 2 +- mm/slab.c | 2 +- mm/slob.c | 2 +- mm/slub.c | 2 +- 11 files changed, 77 insertions(+), 71 deletions(-) create mode 100644 include/linux/kmemtrace.h create mode 100644 include/trace/kmem.h delete mode 100644 include/trace/kmemtrace.h diff --git a/include/linux/kmemtrace.h b/include/linux/kmemtrace.h new file mode 100644 index 000000000000..15c45a27a925 --- /dev/null +++ b/include/linux/kmemtrace.h @@ -0,0 +1,25 @@ +/* + * Copyright (C) 2008 Eduard - Gabriel Munteanu + * + * This file is released under GPL version 2. + */ + +#ifndef _LINUX_KMEMTRACE_H +#define _LINUX_KMEMTRACE_H + +#ifdef __KERNEL__ + +#include + +#ifdef CONFIG_KMEMTRACE +extern void kmemtrace_init(void); +#else +static inline void kmemtrace_init(void) +{ +} +#endif + +#endif /* __KERNEL__ */ + +#endif /* _LINUX_KMEMTRACE_H */ + diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 5ac9b0bcaf9a..713f841ecaa9 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -14,7 +14,7 @@ #include /* kmalloc_sizes.h needs PAGE_SIZE */ #include /* kmalloc_sizes.h needs L1_CACHE_BYTES */ #include -#include +#include /* Size description struct for general caches. 
*/ struct cache_sizes { diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 5046f90c1171..be5d40c43bd2 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -10,7 +10,7 @@ #include #include #include -#include +#include enum stat_item { ALLOC_FASTPATH, /* Allocation from cpu slab */ diff --git a/include/trace/kmem.h b/include/trace/kmem.h new file mode 100644 index 000000000000..24d251928182 --- /dev/null +++ b/include/trace/kmem.h @@ -0,0 +1,44 @@ +#ifndef _TRACE_KMEM_H +#define _TRACE_KMEM_H + +#include +#include + +DECLARE_TRACE(kmalloc, + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags), + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags)); +DECLARE_TRACE(kmem_cache_alloc, + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags), + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags)); +DECLARE_TRACE(kmalloc_node, + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node), + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node)); +DECLARE_TRACE(kmem_cache_alloc_node, + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node), + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node)); +DECLARE_TRACE(kfree, + TP_PROTO(unsigned long call_site, const void *ptr), + TP_ARGS(call_site, ptr)); +DECLARE_TRACE(kmem_cache_free, + TP_PROTO(unsigned long call_site, const void *ptr), + TP_ARGS(call_site, ptr)); + +#endif /* _TRACE_KMEM_H */ diff --git a/include/trace/kmemtrace.h b/include/trace/kmemtrace.h deleted file mode 100644 index 28ee69f9cd46..000000000000 --- a/include/trace/kmemtrace.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (C) 2008 Eduard - Gabriel Munteanu - * - * This file is released under GPL version 2. 
- */ - -#ifndef _LINUX_KMEMTRACE_H -#define _LINUX_KMEMTRACE_H - -#ifdef __KERNEL__ - -#include -#include - -#ifdef CONFIG_KMEMTRACE -extern void kmemtrace_init(void); -#else -static inline void kmemtrace_init(void) -{ -} -#endif - -DECLARE_TRACE(kmalloc, - TP_PROTO(unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags), - TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags)); -DECLARE_TRACE(kmem_cache_alloc, - TP_PROTO(unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags), - TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags)); -DECLARE_TRACE(kmalloc_node, - TP_PROTO(unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node), - TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node)); -DECLARE_TRACE(kmem_cache_alloc_node, - TP_PROTO(unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node), - TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node)); -DECLARE_TRACE(kfree, - TP_PROTO(unsigned long call_site, const void *ptr), - TP_ARGS(call_site, ptr)); -DECLARE_TRACE(kmem_cache_free, - TP_PROTO(unsigned long call_site, const void *ptr), - TP_ARGS(call_site, ptr)); - -#endif /* __KERNEL__ */ - -#endif /* _LINUX_KMEMTRACE_H */ - diff --git a/init/main.c b/init/main.c index 3585f073d636..eece40cd8a64 100644 --- a/init/main.c +++ b/init/main.c @@ -64,6 +64,7 @@ #include #include #include +#include #include #include @@ -71,7 +72,6 @@ #include #include #include -#include #ifdef CONFIG_X86_LOCAL_APIC #include diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index 5011f4d91e37..7a0aa0e260db 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -12,7 +12,7 @@ #include #include -#include +#include #include "trace_output.h" #include "trace.h" diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f76a8f8689d4..34b94c3f40ad 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include enum trace_type { diff --git a/mm/slab.c b/mm/slab.c index 9a90b00d2f91..f85831da9080 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -102,7 +102,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/mm/slob.c b/mm/slob.c index a2d4ab32198d..494f05f19417 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -65,7 +65,7 @@ #include #include #include -#include +#include #include /* diff --git a/mm/slub.c b/mm/slub.c index 7ab54ecbd3f3..ea9e7160e2e7 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include -- cgit v1.2.2 From fc182a4330fc22ea1b68fa3d5064dd85a73a4c4a Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Fri, 10 Apr 2009 14:27:38 +0800 Subject: tracing, kmemtrace: Make kmem tracepoints use TRACE_EVENT macro TRACE_EVENT is a more generic way to define tracepoints. 
Doing so adds these new capabilities to this tracepoint: - zero-copy and per-cpu splice() tracing - binary tracing without printf overhead - structured logging records exposed under /debug/tracing/events - trace events embedded in function tracer output and other plugins - user-defined, per tracepoint filter expressions Signed-off-by: Zhao Lei Acked-by: Eduard - Gabriel Munteanu Acked-by: Pekka Enberg Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Tom Zanussi LKML-Reference: <49DEE6DA.80600@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- include/trace/kmem.h | 39 +------- include/trace/kmem_event_types.h | 193 ++++++++++++++++++++++++++++++++++++++ include/trace/trace_event_types.h | 1 + include/trace/trace_events.h | 1 + 4 files changed, 197 insertions(+), 37 deletions(-) create mode 100644 include/trace/kmem_event_types.h diff --git a/include/trace/kmem.h b/include/trace/kmem.h index 24d251928182..46efc2423f03 100644 --- a/include/trace/kmem.h +++ b/include/trace/kmem.h @@ -1,44 +1,9 @@ #ifndef _TRACE_KMEM_H #define _TRACE_KMEM_H -#include #include +#include -DECLARE_TRACE(kmalloc, - TP_PROTO(unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags), - TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags)); -DECLARE_TRACE(kmem_cache_alloc, - TP_PROTO(unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags), - TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags)); -DECLARE_TRACE(kmalloc_node, - TP_PROTO(unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node), - TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node)); -DECLARE_TRACE(kmem_cache_alloc_node, - TP_PROTO(unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node), - TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node)); -DECLARE_TRACE(kfree, - TP_PROTO(unsigned long call_site, const void *ptr), - TP_ARGS(call_site, ptr)); -DECLARE_TRACE(kmem_cache_free, - TP_PROTO(unsigned long call_site, const void *ptr), - TP_ARGS(call_site, ptr)); +#include #endif /* _TRACE_KMEM_H */ diff --git a/include/trace/kmem_event_types.h b/include/trace/kmem_event_types.h new file mode 100644 index 000000000000..4ff420fe4675 --- /dev/null +++ b/include/trace/kmem_event_types.h @@ -0,0 +1,193 @@ + +/* use instead */ +#ifndef TRACE_EVENT +# error Do not include this file directly. +# error Unless you know what you are doing. 
+#endif + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kmem + +TRACE_EVENT(kmalloc, + + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags), + + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + __field( size_t, bytes_req ) + __field( size_t, bytes_alloc ) + __field( gfp_t, gfp_flags ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + __entry->bytes_req = bytes_req; + __entry->bytes_alloc = bytes_alloc; + __entry->gfp_flags = gfp_flags; + ), + + TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x", + __entry->call_site, + __entry->ptr, + __entry->bytes_req, + __entry->bytes_alloc, + __entry->gfp_flags) +); + +TRACE_EVENT(kmem_cache_alloc, + + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags), + + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + __field( size_t, bytes_req ) + __field( size_t, bytes_alloc ) + __field( gfp_t, gfp_flags ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + __entry->bytes_req = bytes_req; + __entry->bytes_alloc = bytes_alloc; + __entry->gfp_flags = gfp_flags; + ), + + TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x", + __entry->call_site, + __entry->ptr, + __entry->bytes_req, + __entry->bytes_alloc, + __entry->gfp_flags) +); + +TRACE_EVENT(kmalloc_node, + + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node), + + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + __field( size_t, bytes_req ) + __field( size_t, bytes_alloc ) + __field( gfp_t, gfp_flags ) + __field( int, node ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + __entry->bytes_req = bytes_req; + __entry->bytes_alloc = bytes_alloc; + __entry->gfp_flags = gfp_flags; + __entry->node = node; + ), + + TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d", + __entry->call_site, + __entry->ptr, + __entry->bytes_req, + __entry->bytes_alloc, + __entry->gfp_flags, + __entry->node) +); + +TRACE_EVENT(kmem_cache_alloc_node, + + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node), + + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + __field( size_t, bytes_req ) + __field( size_t, bytes_alloc ) + __field( gfp_t, gfp_flags ) + __field( int, node ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + __entry->bytes_req = bytes_req; + __entry->bytes_alloc = bytes_alloc; + __entry->gfp_flags = gfp_flags; + __entry->node = node; + ), + + TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d", + __entry->call_site, + __entry->ptr, + __entry->bytes_req, + __entry->bytes_alloc, + __entry->gfp_flags, + __entry->node) +); + +TRACE_EVENT(kfree, + + TP_PROTO(unsigned long call_site, const void *ptr), + + TP_ARGS(call_site, ptr), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( 
const void *, ptr ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + ), + + TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr) +); + +TRACE_EVENT(kmem_cache_free, + + TP_PROTO(unsigned long call_site, const void *ptr), + + TP_ARGS(call_site, ptr), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + ), + + TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr) +); + +#undef TRACE_SYSTEM diff --git a/include/trace/trace_event_types.h b/include/trace/trace_event_types.h index 33b6bfcba93b..552a50e169a6 100644 --- a/include/trace/trace_event_types.h +++ b/include/trace/trace_event_types.h @@ -4,3 +4,4 @@ #include #include #include +#include diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h index 0e2aa80076d9..13d6b85668cf 100644 --- a/include/trace/trace_events.h +++ b/include/trace/trace_events.h @@ -4,3 +4,4 @@ #include #include #include +#include -- cgit v1.2.2 From b78825d608f30a47e3154ab6872a03f0de0c9d45 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 1 Apr 2009 16:18:53 +0800 Subject: blktrace: fix output of unknown events Not all events are pc (packet command) events. An event is a pc event only if it has BLK_TC_PC bit set. Signed-off-by: Li Zefan Acked-by: Arnaldo Carvalho de Melo Cc: Jens Axboe Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <49D3236D.3090705@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 921ef5d1f0ba..e45e1af13563 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1182,7 +1182,7 @@ static enum print_line_t print_one_line(struct trace_iterator *iter, } if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) - ret = trace_seq_printf(s, "Bad pc action %x\n", what); + ret = trace_seq_printf(s, "Unknown action %x\n", what); else { ret = log_action(iter, what2act[what].act[long_act]); if (ret) -- cgit v1.2.2 From 66de7792c02693b49671afe58c771fde3b092fc7 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 1 Apr 2009 16:19:19 +0800 Subject: blktrace: fix output of BLK_TC_PC events BLK_TC_PC events should be treated differently with BLK_TC_FS events. Before this patch: # echo 1 > /sys/block/sda/sda1/trace/enable # echo pc > /sys/block/sda/sda1/trace/act_mask # echo blk > /debugfs/tracing/current_tracer # (generate some BLK_TC_PC events) # cat trace bash-2184 [000] 1774.275413: 8,7 I N [bash] bash-2184 [000] 1774.275435: 8,7 D N [bash] bash-2184 [000] 1774.275540: 8,7 I R [bash] bash-2184 [000] 1774.275547: 8,7 D R [bash] ksoftirqd/0-4 [000] 1774.275580: 8,7 C N 0 [0] bash-2184 [000] 1774.275648: 8,7 I R [bash] bash-2184 [000] 1774.275653: 8,7 D R [bash] ksoftirqd/0-4 [000] 1774.275682: 8,7 C N 0 [0] bash-2184 [000] 1774.275739: 8,7 I R [bash] bash-2184 [000] 1774.275744: 8,7 D R [bash] ksoftirqd/0-4 [000] 1774.275771: 8,7 C N 0 [0] bash-2184 [000] 1774.275804: 8,7 I R [bash] bash-2184 [000] 1774.275808: 8,7 D R [bash] ksoftirqd/0-4 [000] 1774.275836: 8,7 C N 0 [0] After this patch: # cat trace bash-2263 [000] 366.782149: 8,7 I N 0 (00 ..) [bash] bash-2263 [000] 366.782323: 8,7 D N 0 (00 ..) [bash] bash-2263 [000] 366.782557: 8,7 I R 8 (25 00 ..) [bash] bash-2263 [000] 366.782560: 8,7 D R 8 (25 00 ..) [bash] ksoftirqd/0-4 [000] 366.782582: 8,7 C N (25 00 ..) 
[0] bash-2263 [000] 366.782648: 8,7 I R 8 (5a 00 3f 00) [bash] bash-2263 [000] 366.782650: 8,7 D R 8 (5a 00 3f 00) [bash] ksoftirqd/0-4 [000] 366.782669: 8,7 C N (5a 00 3f 00) [0] bash-2263 [000] 366.782710: 8,7 I R 8 (5a 00 08 00) [bash] bash-2263 [000] 366.782713: 8,7 D R 8 (5a 00 08 00) [bash] ksoftirqd/0-4 [000] 366.782730: 8,7 C N (5a 00 08 00) [0] bash-2263 [000] 366.783375: 8,7 I R 36 (5a 00 08 00) [bash] bash-2263 [000] 366.783379: 8,7 D R 36 (5a 00 08 00) [bash] ksoftirqd/0-4 [000] 366.783404: 8,7 C N (5a 00 08 00) [0] This is what we do with PC events in user-space blktrace. Signed-off-by: Li Zefan Acked-by: Arnaldo Carvalho de Melo Cc: Jens Axboe Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <49D32387.9040106@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 88 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 80 insertions(+), 8 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index e45e1af13563..2b98195b338b 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -971,6 +971,16 @@ static inline const void *pdu_start(const struct trace_entry *ent) return te_blk_io_trace(ent) + 1; } +static inline u32 t_action(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent)->action; +} + +static inline u32 t_bytes(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent)->bytes; +} + static inline u32 t_sec(const struct trace_entry *ent) { return te_blk_io_trace(ent)->bytes >> 9; @@ -1031,25 +1041,87 @@ static int blk_log_action(struct trace_iterator *iter, const char *act) MAJOR(t->device), MINOR(t->device), act, rwbs); } +static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent) +{ + const char *pdu_buf; + int pdu_len; + int i, end, ret; + + pdu_buf = pdu_start(ent); + pdu_len = te_blk_io_trace(ent)->pdu_len; + + if (!pdu_len) + return 1; + + /* find the last zero that needs to be printed */ + for (end = pdu_len - 1; end >= 0; end--) + if (pdu_buf[end]) + break; + end++; + + if (!trace_seq_putc(s, '(')) + return 0; + + for (i = 0; i < pdu_len; i++) { + + ret = trace_seq_printf(s, "%s%02x", + i == 0 ? "" : " ", pdu_buf[i]); + if (!ret) + return ret; + + /* + * stop when the rest is just zeroes and indicate so + * with a ".." appended + */ + if (i == end && end != pdu_len - 1) + return trace_seq_puts(s, " ..) 
"); + } + + return trace_seq_puts(s, ") "); +} + static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) { char cmd[TASK_COMM_LEN]; trace_find_cmdline(ent->pid, cmd); - if (t_sec(ent)) - return trace_seq_printf(s, "%llu + %u [%s]\n", - t_sector(ent), t_sec(ent), cmd); - return trace_seq_printf(s, "[%s]\n", cmd); + if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { + int ret; + + ret = trace_seq_printf(s, "%u ", t_bytes(ent)); + if (!ret) + return 0; + ret = blk_log_dump_pdu(s, ent); + if (!ret) + return 0; + return trace_seq_printf(s, "[%s]\n", cmd); + } else { + if (t_sec(ent)) + return trace_seq_printf(s, "%llu + %u [%s]\n", + t_sector(ent), t_sec(ent), cmd); + return trace_seq_printf(s, "[%s]\n", cmd); + } } static int blk_log_with_error(struct trace_seq *s, const struct trace_entry *ent) { - if (t_sec(ent)) - return trace_seq_printf(s, "%llu + %u [%d]\n", t_sector(ent), - t_sec(ent), t_error(ent)); - return trace_seq_printf(s, "%llu [%d]\n", t_sector(ent), t_error(ent)); + if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { + int ret; + + ret = blk_log_dump_pdu(s, ent); + if (ret) + return trace_seq_printf(s, "[%d]\n", t_error(ent)); + return 0; + } else { + if (t_sec(ent)) + return trace_seq_printf(s, "%llu + %u [%d]\n", + t_sector(ent), + t_sec(ent), t_error(ent)); + return trace_seq_printf(s, "%llu [%d]\n", + t_sector(ent), t_error(ent)); + } } static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) -- cgit v1.2.2 From e1112b4d96859367a93468027c9635e2ac04eb3f Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 31 Mar 2009 00:48:49 -0500 Subject: tracing/filters: add run-time field descriptions to TRACE_EVENT_FORMAT events This patch adds run-time field descriptions to all the event formats exported using TRACE_EVENT_FORMAT. It also hooks up all the tracers that use them (i.e. the tracers in the 'ftrace subsystem') so they can also have their output filtered by the event-filtering mechanism. When I was testing this, there were a couple of things that fooled me into thinking the filters weren't working, when actually they were - I'll mention them here so others don't make the same mistakes (and file bug reports. ;-) One is that some of the tracers trace multiple events e.g. the sched_switch tracer uses the context_switch and wakeup events, and if you don't set filters on all of the traced events, the unfiltered output from the events without filters on them can make it look like the filtering as a whole isn't working properly, when actually it is doing what it was asked to do - it just wasn't asked to do the right thing. The other is that for the really high-volume tracers e.g. the function tracer, the volume of filtered events can be so high that it pushes the unfiltered events out of the ring buffer before they can be read so e.g. cat'ing the trace file repeatedly shows either no output, or once in awhile some output but that isn't there the next time you read the trace, which isn't what you normally expect when reading the trace file. If you read from the trace_pipe file though, you can catch them before they disappear. 
Changes from v1: As suggested by Frederic Weisbecker: - get rid of externs in functions - added unlikely() to filter_check_discard() Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/kmemtrace.c | 6 ++++ kernel/trace/trace.c | 25 ++++++++++++++++ kernel/trace/trace.h | 20 +++++++++++++ kernel/trace/trace_branch.c | 3 ++ kernel/trace/trace_event_types.h | 6 ++-- kernel/trace/trace_events.c | 7 +++++ kernel/trace/trace_events_filter.c | 4 +-- kernel/trace/trace_events_stage_2.h | 7 ----- kernel/trace/trace_export.c | 57 +++++++++++++++++++++++++++++++++++-- kernel/trace/trace_hw_branches.c | 2 ++ kernel/trace/trace_power.c | 4 +++ 11 files changed, 127 insertions(+), 14 deletions(-) diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index 7a0aa0e260db..9419ad10541b 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -42,6 +42,7 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id, gfp_t gfp_flags, int node) { + struct ftrace_event_call *call = &event_kmem_alloc; struct trace_array *tr = kmemtrace_array; struct kmemtrace_alloc_entry *entry; struct ring_buffer_event *event; @@ -62,6 +63,8 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id, entry->gfp_flags = gfp_flags; entry->node = node; + filter_check_discard(call, entry, event); + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); @@ -71,6 +74,7 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id, unsigned long call_site, const void *ptr) { + struct ftrace_event_call *call = &event_kmem_free; struct trace_array *tr = kmemtrace_array; struct kmemtrace_free_entry *entry; struct ring_buffer_event *event; @@ -86,6 +90,8 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id, entry->call_site = call_site; entry->ptr = ptr; + filter_check_discard(call, entry, event); + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4865459f609f..962e6179994a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -898,6 +898,7 @@ trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc) { + struct ftrace_event_call *call = &event_function; struct ring_buffer_event *event; struct ftrace_entry *entry; @@ -912,6 +913,9 @@ trace_function(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->ip = ip; entry->parent_ip = parent_ip; + + filter_check_discard(call, entry, event); + ring_buffer_unlock_commit(tr->buffer, event); } @@ -921,6 +925,7 @@ static int __trace_graph_entry(struct trace_array *tr, unsigned long flags, int pc) { + struct ftrace_event_call *call = &event_funcgraph_entry; struct ring_buffer_event *event; struct ftrace_graph_ent_entry *entry; @@ -933,6 +938,7 @@ static int __trace_graph_entry(struct trace_array *tr, return 0; entry = ring_buffer_event_data(event); entry->graph_ent = *trace; + filter_check_discard(call, entry, event); ring_buffer_unlock_commit(global_trace.buffer, event); return 1; @@ -943,6 +949,7 @@ static void __trace_graph_return(struct trace_array *tr, unsigned long flags, int pc) { + struct ftrace_event_call *call = &event_funcgraph_exit; struct ring_buffer_event *event; struct ftrace_graph_ret_entry *entry; @@ -955,6 +962,7 @@ static void __trace_graph_return(struct trace_array *tr, return; entry = ring_buffer_event_data(event); entry->ret = *trace; + filter_check_discard(call, entry, event); 
ring_buffer_unlock_commit(global_trace.buffer, event); } #endif @@ -973,6 +981,7 @@ static void __ftrace_trace_stack(struct trace_array *tr, int skip, int pc) { #ifdef CONFIG_STACKTRACE + struct ftrace_event_call *call = &event_kernel_stack; struct ring_buffer_event *event; struct stack_entry *entry; struct stack_trace trace; @@ -990,6 +999,7 @@ static void __ftrace_trace_stack(struct trace_array *tr, trace.entries = entry->caller; save_stack_trace(&trace); + filter_check_discard(call, entry, event); ring_buffer_unlock_commit(tr->buffer, event); #endif } @@ -1015,6 +1025,7 @@ static void ftrace_trace_userstack(struct trace_array *tr, unsigned long flags, int pc) { #ifdef CONFIG_STACKTRACE + struct ftrace_event_call *call = &event_user_stack; struct ring_buffer_event *event; struct userstack_entry *entry; struct stack_trace trace; @@ -1036,6 +1047,7 @@ static void ftrace_trace_userstack(struct trace_array *tr, trace.entries = entry->caller; save_stack_trace_user(&trace); + filter_check_discard(call, entry, event); ring_buffer_unlock_commit(tr->buffer, event); #endif } @@ -1052,6 +1064,7 @@ ftrace_trace_special(void *__tr, unsigned long arg1, unsigned long arg2, unsigned long arg3, int pc) { + struct ftrace_event_call *call = &event_special; struct ring_buffer_event *event; struct trace_array *tr = __tr; struct special_entry *entry; @@ -1064,6 +1077,7 @@ ftrace_trace_special(void *__tr, entry->arg1 = arg1; entry->arg2 = arg2; entry->arg3 = arg3; + filter_check_discard(call, entry, event); trace_buffer_unlock_commit(tr, event, 0, pc); } @@ -1080,6 +1094,7 @@ tracing_sched_switch_trace(struct trace_array *tr, struct task_struct *next, unsigned long flags, int pc) { + struct ftrace_event_call *call = &event_context_switch; struct ring_buffer_event *event; struct ctx_switch_entry *entry; @@ -1095,6 +1110,9 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->next_prio = next->prio; entry->next_state = next->state; entry->next_cpu = task_cpu(next); + + filter_check_discard(call, entry, event); + trace_buffer_unlock_commit(tr, event, flags, pc); } @@ -1104,6 +1122,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, struct task_struct *curr, unsigned long flags, int pc) { + struct ftrace_event_call *call = &event_wakeup; struct ring_buffer_event *event; struct ctx_switch_entry *entry; @@ -1120,6 +1139,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->next_state = wakee->state; entry->next_cpu = task_cpu(wakee); + filter_check_discard(call, entry, event); + ring_buffer_unlock_commit(tr->buffer, event); ftrace_trace_stack(tr, flags, 6, pc); ftrace_trace_userstack(tr, flags, pc); @@ -1221,6 +1242,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; static u32 trace_buf[TRACE_BUF_SIZE]; + struct ftrace_event_call *call = &event_bprint; struct ring_buffer_event *event; struct trace_array *tr = &global_trace; struct trace_array_cpu *data; @@ -1260,6 +1282,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) entry->fmt = fmt; memcpy(entry->buf, trace_buf, sizeof(u32) * len); + filter_check_discard(call, entry, event); ring_buffer_unlock_commit(tr->buffer, event); out_unlock: @@ -1279,6 +1302,7 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; static char trace_buf[TRACE_BUF_SIZE]; + struct ftrace_event_call *call = &event_print; struct ring_buffer_event *event; struct trace_array *tr = &global_trace; struct 
trace_array_cpu *data; @@ -1314,6 +1338,7 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) memcpy(&entry->buf, trace_buf, len); entry->buf[len] = 0; + filter_check_discard(call, entry, event); ring_buffer_unlock_commit(tr->buffer, event); out_unlock: diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 34b94c3f40ad..e7737281953f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -866,6 +866,21 @@ extern void filter_free_subsystem_preds(struct event_subsystem *system); extern int filter_add_subsystem_pred(struct event_subsystem *system, struct filter_pred *pred); +static inline void +filter_check_discard(struct ftrace_event_call *call, void *rec, + struct ring_buffer_event *event) +{ + if (unlikely(call->preds) && !filter_match_preds(call, rec)) + ring_buffer_event_discard(event); +} + +#define __common_field(type, item) \ + ret = trace_define_field(event_call, #type, "common_" #item, \ + offsetof(typeof(field.ent), item), \ + sizeof(field.ent.item)); \ + if (ret) \ + return ret; + void event_trace_printk(unsigned long ip, const char *fmt, ...); extern struct ftrace_event_call __start_ftrace_events[]; extern struct ftrace_event_call __stop_ftrace_events[]; @@ -897,4 +912,9 @@ do { \ __trace_printk(ip, fmt, ##args); \ } while (0) +#undef TRACE_EVENT_FORMAT +#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ + extern struct ftrace_event_call event_##call; +#include "trace_event_types.h" + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index e6e32912ffb8..c95c25d838ef 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -30,6 +30,7 @@ static struct trace_array *branch_tracer; static void probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) { + struct ftrace_event_call *call = &event_branch; struct trace_array *tr = branch_tracer; struct ring_buffer_event *event; struct trace_branch *entry; @@ -73,6 +74,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) entry->line = f->line; entry->correct = val == expect; + filter_check_discard(call, entry, event); + ring_buffer_unlock_commit(tr->buffer, event); out: diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h index fd78bee71dd7..95b147aac229 100644 --- a/kernel/trace/trace_event_types.h +++ b/kernel/trace/trace_event_types.h @@ -122,8 +122,10 @@ TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore, TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore, TRACE_STRUCT( TRACE_FIELD(unsigned int, line, line) - TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func, func) - TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, file) + TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func, + TRACE_FUNC_SIZE+1, func) + TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, + TRACE_FUNC_SIZE+1, file) TRACE_FIELD(char, correct, correct) ), TP_RAW_FMT("%u:%s:%s (%u)") diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 64ec4d278ffb..be9299a53e2a 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -680,6 +680,7 @@ static struct dentry * event_subsystem_dir(const char *name, struct dentry *d_events) { struct event_subsystem *system; + struct dentry *entry; /* First see if we did not already create this dir */ list_for_each_entry(system, &event_subsystems, list) { @@ -708,6 +709,12 @@ event_subsystem_dir(const char *name, struct dentry *d_events) system->preds = NULL; + entry 
= debugfs_create_file("filter", 0644, system->entry, system, + &ftrace_subsystem_filter_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'%s/filter' entry\n", name); + return system->entry; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 026be412f356..470ad9487ecc 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -185,7 +185,7 @@ void filter_free_subsystem_preds(struct event_subsystem *system) } events_for_each(call) { - if (!call->name || !call->regfunc) + if (!call->define_fields) continue; if (!strcmp(call->system, system->name)) @@ -324,7 +324,7 @@ int filter_add_subsystem_pred(struct event_subsystem *system, events_for_each(call) { int err; - if (!call->name || !call->regfunc) + if (!call->define_fields) continue; if (strcmp(call->system, system->name)) diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h index 30743f7d4110..1c94b87c7180 100644 --- a/kernel/trace/trace_events_stage_2.h +++ b/kernel/trace/trace_events_stage_2.h @@ -146,13 +146,6 @@ ftrace_format_##call(struct trace_seq *s) \ if (ret) \ return ret; -#define __common_field(type, item) \ - ret = trace_define_field(event_call, #type, "common_" #item, \ - offsetof(typeof(field.ent), item), \ - sizeof(field.ent.item)); \ - if (ret) \ - return ret; - #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, func, print) \ int \ diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 07a22c33ebf3..f4e46616c48e 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -30,7 +30,7 @@ #undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ +#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \ ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \ "offset:%u;\tsize:%u;\n", \ (unsigned int)offsetof(typeof(field), item), \ @@ -85,18 +85,69 @@ ftrace_format_##call(struct trace_seq *s) \ #define TRACE_ENTRY entry #undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ +#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \ cmd; #undef TRACE_EVENT_FORMAT #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ +int ftrace_define_fields_##call(void); \ +static int ftrace_raw_init_event_##call(void); \ \ -static struct ftrace_event_call __used \ +struct ftrace_event_call __used \ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) event_##call = { \ .name = #call, \ .id = proto, \ .system = __stringify(TRACE_SYSTEM), \ + .raw_init = ftrace_raw_init_event_##call, \ .show_format = ftrace_format_##call, \ + .define_fields = ftrace_define_fields_##call, \ +}; \ +static int ftrace_raw_init_event_##call(void) \ +{ \ + INIT_LIST_HEAD(&event_##call.fields); \ + return 0; \ +} \ + +#include "trace_event_types.h" + +#undef TRACE_FIELD +#define TRACE_FIELD(type, item, assign) \ + ret = trace_define_field(event_call, #type, #item, \ + offsetof(typeof(field), item), \ + sizeof(field.item)); \ + if (ret) \ + return ret; + +#undef TRACE_FIELD_SPECIAL +#define TRACE_FIELD_SPECIAL(type, item, len, cmd) \ + ret = trace_define_field(event_call, #type "[" #len "]", #item, \ + offsetof(typeof(field), item), \ + sizeof(field.item)); \ + if (ret) \ + return ret; + +#undef TRACE_FIELD_ZERO_CHAR +#define TRACE_FIELD_ZERO_CHAR(item) + +#undef TRACE_EVENT_FORMAT +#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ +int \ +ftrace_define_fields_##call(void) \ +{ \ + 
struct ftrace_event_call *event_call = &event_##call; \ + struct args field; \ + int ret; \ + \ + __common_field(unsigned char, type); \ + __common_field(unsigned char, flags); \ + __common_field(unsigned char, preempt_count); \ + __common_field(int, pid); \ + __common_field(int, tgid); \ + \ + tstruct; \ + \ + return ret; \ } + #include "trace_event_types.h" diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index 7bfdf4c2347f..e6b275b22ac0 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -168,6 +168,7 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) void trace_hw_branch(u64 from, u64 to) { + struct ftrace_event_call *call = &event_hw_branch; struct trace_array *tr = hw_branch_trace; struct ring_buffer_event *event; struct hw_branch_entry *entry; @@ -194,6 +195,7 @@ void trace_hw_branch(u64 from, u64 to) entry->ent.type = TRACE_HW_BRANCHES; entry->from = from; entry->to = to; + filter_check_discard(call, entry, event); trace_buffer_unlock_commit(tr, event, 0, 0); out: diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c index bae791ebcc51..8ce7d7d62c07 100644 --- a/kernel/trace/trace_power.c +++ b/kernel/trace/trace_power.c @@ -36,6 +36,7 @@ static void probe_power_start(struct power_trace *it, unsigned int type, static void probe_power_end(struct power_trace *it) { + struct ftrace_event_call *call = &event_power; struct ring_buffer_event *event; struct trace_power *entry; struct trace_array_cpu *data; @@ -54,6 +55,7 @@ static void probe_power_end(struct power_trace *it) goto out; entry = ring_buffer_event_data(event); entry->state_data = *it; + filter_check_discard(call, entry, event); trace_buffer_unlock_commit(tr, event, 0, 0); out: preempt_enable(); @@ -62,6 +64,7 @@ static void probe_power_end(struct power_trace *it) static void probe_power_mark(struct power_trace *it, unsigned int type, unsigned int level) { + struct ftrace_event_call *call = &event_power; struct ring_buffer_event *event; struct trace_power *entry; struct trace_array_cpu *data; @@ -84,6 +87,7 @@ static void probe_power_mark(struct power_trace *it, unsigned int type, goto out; entry = ring_buffer_event_data(event); entry->state_data = *it; + filter_check_discard(call, entry, event); trace_buffer_unlock_commit(tr, event, 0, 0); out: preempt_enable(); -- cgit v1.2.2 From e45f2e2bd298e1ff687448e5fd15a3588b5807ec Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 31 Mar 2009 00:49:16 -0500 Subject: tracing/filters: add TRACE_EVENT_FORMAT_NOFILTER event macro Frederic Weisbecker suggested that the trace_special event shouldn't be filterable; this patch adds a TRACE_EVENT_FORMAT_NOFILTER event macro that allows an event format to be exported without having a filter attached, and removes filtering from the trace_special event. 
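Schematically, the difference between the two macros shows up in the struct ftrace_event_call each one emits, condensed here from the trace_export.c hunks below (not a literal macro expansion; "call" stands for the event name parameter):

/* TRACE_EVENT_FORMAT(call, ...): filterable -- fields are registered
 * at init time, so run-time predicates can be attached to them. */
struct ftrace_event_call event_call = {
	.name		= "call",
	.id		= proto,
	.system		= __stringify(TRACE_SYSTEM),
	.raw_init	= ftrace_raw_init_event_call,
	.show_format	= ftrace_format_call,
	.define_fields	= ftrace_define_fields_call,
};

/* TRACE_EVENT_FORMAT_NOFILTER(call, ...): the format file is still
 * exported, but no fields are defined, so no filter can match it. */
struct ftrace_event_call event_call = {
	.name		= "call",
	.id		= proto,
	.system		= __stringify(TRACE_SYSTEM),
	.show_format	= ftrace_format_call,
};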
Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 2 -- kernel/trace/trace.h | 2 ++ kernel/trace/trace_event_types.h | 2 +- kernel/trace/trace_export.c | 33 +++++++++++++++++++++++++++++++++ 4 files changed, 36 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 962e6179994a..c209d214169c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1064,7 +1064,6 @@ ftrace_trace_special(void *__tr, unsigned long arg1, unsigned long arg2, unsigned long arg3, int pc) { - struct ftrace_event_call *call = &event_special; struct ring_buffer_event *event; struct trace_array *tr = __tr; struct special_entry *entry; @@ -1077,7 +1076,6 @@ ftrace_trace_special(void *__tr, entry->arg1 = arg1; entry->arg2 = arg2; entry->arg3 = arg3; - filter_check_discard(call, entry, event); trace_buffer_unlock_commit(tr, event, 0, pc); } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index e7737281953f..3cf856fa597b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -915,6 +915,8 @@ do { \ #undef TRACE_EVENT_FORMAT #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ extern struct ftrace_event_call event_##call; +#undef TRACE_EVENT_FORMAT_NOFILTER +#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, tpfmt) #include "trace_event_types.h" #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h index 95b147aac229..cfcecc4fd86d 100644 --- a/kernel/trace/trace_event_types.h +++ b/kernel/trace/trace_event_types.h @@ -57,7 +57,7 @@ TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore, TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]") ); -TRACE_EVENT_FORMAT(special, TRACE_SPECIAL, special_entry, ignore, +TRACE_EVENT_FORMAT_NOFILTER(special, TRACE_SPECIAL, special_entry, ignore, TRACE_STRUCT( TRACE_FIELD(unsigned long, arg1, arg1) TRACE_FIELD(unsigned long, arg2, arg2) diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index f4e46616c48e..77c494f5e1d6 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -65,6 +65,22 @@ ftrace_format_##call(struct trace_seq *s) \ return ret; \ } +#undef TRACE_EVENT_FORMAT_NOFILTER +#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ + tpfmt) \ +static int \ +ftrace_format_##call(struct trace_seq *s) \ +{ \ + struct args field; \ + int ret; \ + \ + tstruct; \ + \ + trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \ + \ + return ret; \ +} + #include "trace_event_types.h" #undef TRACE_ZERO_CHAR @@ -109,6 +125,19 @@ static int ftrace_raw_init_event_##call(void) \ return 0; \ } \ +#undef TRACE_EVENT_FORMAT_NOFILTER +#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ + tpfmt) \ + \ +struct ftrace_event_call __used \ +__attribute__((__aligned__(4))) \ +__attribute__((section("_ftrace_events"))) event_##call = { \ + .name = #call, \ + .id = proto, \ + .system = __stringify(TRACE_SYSTEM), \ + .show_format = ftrace_format_##call, \ +}; + #include "trace_event_types.h" #undef TRACE_FIELD @@ -150,4 +179,8 @@ ftrace_define_fields_##call(void) \ return ret; \ } +#undef TRACE_EVENT_FORMAT_NOFILTER +#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ + tpfmt) + #include "trace_event_types.h" -- cgit v1.2.2 From fa1b47dd85453ec7d4bcfe4aa4a2d172ba452fc3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 2 Apr 2009 00:09:41 -0400 Subject: ring-buffer: add 
ring_buffer_discard_commit The ring_buffer_discard_commit is similar to ring_buffer_event_discard but it can only be done on an event that has yet to be commited. Unpredictable results can happen otherwise. The main difference between ring_buffer_discard_commit and ring_buffer_event_discard is that ring_buffer_discard_commit will try to free the data in the ring buffer if nothing has addded data after the reserved event. If something did, then it acts almost the same as ring_buffer_event_discard followed by a ring_buffer_unlock_commit. Note, either ring_buffer_commit_discard and ring_buffer_unlock_commit can be called on an event, not both. This commit also exports both discard functions to be usable by GPL modules. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/ring_buffer.h | 29 ++++++++++ kernel/trace/ring_buffer.c | 125 ++++++++++++++++++++++++++++++++++++-------- 2 files changed, 133 insertions(+), 21 deletions(-) diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index e1b7b2173885..f0aa486d131c 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -68,8 +68,37 @@ ring_buffer_event_time_delta(struct ring_buffer_event *event) return event->time_delta; } +/* + * ring_buffer_event_discard can discard any event in the ring buffer. + * it is up to the caller to protect against a reader from + * consuming it or a writer from wrapping and replacing it. + * + * No external protection is needed if this is called before + * the event is commited. But in that case it would be better to + * use ring_buffer_discard_commit. + * + * Note, if an event that has not been committed is discarded + * with ring_buffer_event_discard, it must still be committed. + */ void ring_buffer_event_discard(struct ring_buffer_event *event); +/* + * ring_buffer_discard_commit will remove an event that has not + * ben committed yet. If this is used, then ring_buffer_unlock_commit + * must not be called on the discarded event. This function + * will try to remove the event from the ring buffer completely + * if another event has not been written after it. + * + * Example use: + * + * if (some_condition) + * ring_buffer_discard_commit(buffer, event); + * else + * ring_buffer_unlock_commit(buffer, event); + */ +void ring_buffer_discard_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event); + /* * size is in bytes for each per CPU buffer. */ diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 74a11808c282..f935bd5ec3e8 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -205,27 +205,6 @@ static void rb_event_set_padding(struct ring_buffer_event *event) event->time_delta = 0; } -/** - * ring_buffer_event_discard - discard an event in the ring buffer - * @buffer: the ring buffer - * @event: the event to discard - * - * Sometimes a event that is in the ring buffer needs to be ignored. - * This function lets the user discard an event in the ring buffer - * and then that event will not be read later. - * - * Note, it is up to the user to be careful with this, and protect - * against races. If the user discards an event that has been consumed - * it is possible that it could corrupt the ring buffer. 
- */ -void ring_buffer_event_discard(struct ring_buffer_event *event) -{ - event->type = RINGBUF_TYPE_PADDING; - /* time delta must be non zero */ - if (!event->time_delta) - event->time_delta = 1; -} - static unsigned rb_event_data_length(struct ring_buffer_event *event) { @@ -1570,6 +1549,110 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); +/** + * ring_buffer_event_discard - discard any event in the ring buffer + * @event: the event to discard + * + * Sometimes a event that is in the ring buffer needs to be ignored. + * This function lets the user discard an event in the ring buffer + * and then that event will not be read later. + * + * Note, it is up to the user to be careful with this, and protect + * against races. If the user discards an event that has been consumed + * it is possible that it could corrupt the ring buffer. + */ +void ring_buffer_event_discard(struct ring_buffer_event *event) +{ + event->type = RINGBUF_TYPE_PADDING; + /* time delta must be non zero */ + if (!event->time_delta) + event->time_delta = 1; +} +EXPORT_SYMBOL_GPL(ring_buffer_event_discard); + +/** + * ring_buffer_commit_discard - discard an event that has not been committed + * @buffer: the ring buffer + * @event: non committed event to discard + * + * This is similar to ring_buffer_event_discard but must only be + * performed on an event that has not been committed yet. The difference + * is that this will also try to free the event from the ring buffer + * if another event has not been added behind it. + * + * If another event has been added behind it, it will set the event + * up as discarded, and perform the commit. + * + * If this function is called, do not call ring_buffer_unlock_commit on + * the event. + */ +void ring_buffer_discard_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long new_index, old_index; + struct buffer_page *bpage; + unsigned long index; + unsigned long addr; + int cpu; + + /* The event is discarded regardless */ + ring_buffer_event_discard(event); + + /* + * This must only be called if the event has not been + * committed yet. Thus we can assume that preemption + * is still disabled. + */ + RB_WARN_ON(buffer, !preempt_count()); + + cpu = smp_processor_id(); + cpu_buffer = buffer->buffers[cpu]; + + new_index = rb_event_index(event); + old_index = new_index + rb_event_length(event); + addr = (unsigned long)event; + addr &= PAGE_MASK; + + bpage = cpu_buffer->tail_page; + + if (bpage == (void *)addr && rb_page_write(bpage) == old_index) { + /* + * This is on the tail page. It is possible that + * a write could come in and move the tail page + * and write to the next page. That is fine + * because we just shorten what is on this page. + */ + index = local_cmpxchg(&bpage->write, old_index, new_index); + if (index == old_index) + goto out; + } + + /* + * The commit is still visible by the reader, so we + * must increment entries. + */ + cpu_buffer->entries++; + out: + /* + * If a write came in and pushed the tail page + * we still need to update the commit pointer + * if we were the commit. + */ + if (rb_is_commit(cpu_buffer, event)) + rb_set_commit_to_write(cpu_buffer); + + /* + * Only the last preempt count needs to restore preemption. 
+ */ + if (preempt_count() == 1) + ftrace_preempt_enable(per_cpu(rb_need_resched, cpu)); + else + preempt_enable_no_resched_notrace(); + +} +EXPORT_SYMBOL_GPL(ring_buffer_discard_commit); + /** * ring_buffer_write - write data to the buffer without reserving * @buffer: The ring buffer to write to. -- cgit v1.2.2 From 77d9f465d46fd67cdb82ee5e1ab99dd57a17c486 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 2 Apr 2009 01:16:59 -0400 Subject: tracing/filters: use ring_buffer_discard_commit for discarded events The ring_buffer_discard_commit makes better usage of the ring_buffer when an event has been discarded. It tries to remove it completely if possible. This patch converts the trace event filtering to use ring_buffer_discard_commit instead of the ring_buffer_event_discard. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 9 +++++++-- kernel/trace/trace.h | 1 + kernel/trace/trace_events_stage_3.h | 6 +++--- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c209d214169c..d880ab2772ce 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -884,13 +884,18 @@ trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, unsigned long flags, int pc) { - return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1); + __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1); } void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, unsigned long flags, int pc) { - return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0); + __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0); +} + +void trace_current_buffer_discard_commit(struct ring_buffer_event *event) +{ + ring_buffer_discard_commit(global_trace.buffer, event); } void diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3cf856fa597b..dfefffd7ae39 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -497,6 +497,7 @@ void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, unsigned long flags, int pc); void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, unsigned long flags, int pc); +void trace_current_buffer_discard_commit(struct ring_buffer_event *event); struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data); diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 9d2fa78cecca..d2f34bf30e59 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -223,9 +223,9 @@ static void ftrace_raw_event_##call(proto) \ assign; \ \ if (call->preds && !filter_match_preds(call, entry)) \ - ring_buffer_event_discard(event); \ - \ - trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \ + trace_current_buffer_discard_commit(event); \ + else \ + trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \ \ } \ \ -- cgit v1.2.2 From 5f77a88b3f8268b11940b51d2e03d26a663ceb90 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Wed, 8 Apr 2009 03:14:01 -0500 Subject: tracing/infrastructure: separate event tracer from event support Add a new config option, CONFIG_EVENT_TRACING that gets selected when CONFIG_TRACING is selected and adds everything needed by the stuff in trace_export - basically all the event tracing support needed by e.g. bprint, minus the actual events, which are only included if CONFIG_EVENT_TRACER is selected. 
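As an illustrative aside (not a hunk from this patch; example_for_each_event is a made-up name and the extern declarations are assumptions about how the section markers are declared), the practical effect is that any code walking the _ftrace_events section should now be compiled under CONFIG_EVENT_TRACING rather than CONFIG_EVENT_TRACER:

	#ifdef CONFIG_EVENT_TRACING
	/*
	 * The linker emits these section markers whenever the base event
	 * tracing support is built in (see the vmlinux.lds.h hunk below),
	 * independent of whether the generated events themselves
	 * (CONFIG_EVENT_TRACER) are selected.
	 */
	extern struct ftrace_event_call __start_ftrace_events[];
	extern struct ftrace_event_call __stop_ftrace_events[];

	static void example_for_each_event(void (*fn)(struct ftrace_event_call *))
	{
		struct ftrace_event_call *call;

		for (call = __start_ftrace_events; call < __stop_ftrace_events; call++)
			fn(call);
	}
	#endif /* CONFIG_EVENT_TRACING */
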
So CONFIG_EVENT_TRACER can be used to turn on or off the generated events (what I think of as the 'event tracer'), while CONFIG_EVENT_TRACING turns on or off the base event tracing support used by both the event tracer and the other things such as bprint that can't be configured out. Signed-off-by: Tom Zanussi Cc: Steven Rostedt Cc: fweisbec@gmail.com LKML-Reference: <1239178441.10295.34.camel@tropicana> Signed-off-by: Ingo Molnar --- include/asm-generic/vmlinux.lds.h | 2 +- kernel/trace/Kconfig | 4 ++++ kernel/trace/Makefile | 6 +++--- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 7fa660fd449c..7e9b1e9f711c 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -61,7 +61,7 @@ #define BRANCH_PROFILE() #endif -#ifdef CONFIG_EVENT_TRACER +#ifdef CONFIG_EVENT_TRACING #define FTRACE_EVENTS() VMLINUX_SYMBOL(__start_ftrace_events) = .; \ *(_ftrace_events) \ VMLINUX_SYMBOL(__stop_ftrace_events) = .; diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 23b96ebbf893..644606e899fa 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -48,6 +48,9 @@ config FTRACE_NMI_ENTER depends on HAVE_FTRACE_NMI_ENTER default y +config EVENT_TRACING + bool + config TRACING bool select DEBUG_FS @@ -56,6 +59,7 @@ config TRACING select TRACEPOINTS select NOP_TRACER select BINARY_PRINTF + select EVENT_TRACING # # Minimum requirements an architecture has to meet for us to diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 2630f5121ec1..3ad367e7c97f 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -40,11 +40,11 @@ obj-$(CONFIG_POWER_TRACER) += trace_power.o obj-$(CONFIG_KMEMTRACE) += kmemtrace.o obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o -obj-$(CONFIG_EVENT_TRACER) += trace_events.o +obj-$(CONFIG_EVENT_TRACING) += trace_events.o obj-$(CONFIG_EVENT_TRACER) += events.o -obj-$(CONFIG_EVENT_TRACER) += trace_export.o +obj-$(CONFIG_EVENT_TRACING) += trace_export.o obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o -obj-$(CONFIG_EVENT_TRACER) += trace_events_filter.o +obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o libftrace-y := ftrace.o -- cgit v1.2.2 From eb02ce017dd83985041a7e54c6449f92d53b026f Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Wed, 8 Apr 2009 03:15:54 -0500 Subject: tracing/filters: use ring_buffer_discard_commit() in filter_check_discard() This patch changes filter_check_discard() to make use of the new ring_buffer_discard_commit() function and modifies the current users to call the old commit function in the non-discard case. It also introduces a version of filter_check_discard() that uses the global trace buffer (filter_current_check_discard()) for those cases. 
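For illustration (this snippet is not a hunk from the patch; example_commit_or_discard is a made-up name, the other identifiers are the kernel objects the patch touches), every converted call site follows the same pattern: reserve and fill the event as before, then commit it only if the filter did not already discard-and-commit it:

	static void example_commit_or_discard(struct ftrace_event_call *call,
					      struct trace_array *tr,
					      void *entry,
					      struct ring_buffer_event *event)
	{
		/*
		 * filter_check_discard() now returns non-zero when it has
		 * already called ring_buffer_discard_commit() on the event,
		 * so the caller must not commit it a second time.
		 */
		if (!filter_check_discard(call, entry, tr->buffer, event))
			ring_buffer_unlock_commit(tr->buffer, event);
	}

Call sites that write to the global trace buffer use filter_current_check_discard(call, entry, event) instead, as the trace_events_stage_3.h hunk below shows.
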
v2 changes: - fix compile error noticed by Ingo Molnar Signed-off-by: Tom Zanussi Cc: Steven Rostedt Cc: fweisbec@gmail.com LKML-Reference: <1239178554.10295.36.camel@tropicana> Signed-off-by: Ingo Molnar --- kernel/trace/kmemtrace.c | 10 ++++----- kernel/trace/trace.c | 45 ++++++++++++++++++++----------------- kernel/trace/trace.h | 14 +++++++++--- kernel/trace/trace_branch.c | 5 ++--- kernel/trace/trace_events_stage_3.h | 5 +---- kernel/trace/trace_hw_branches.c | 4 ++-- kernel/trace/trace_power.c | 8 +++---- 7 files changed, 48 insertions(+), 43 deletions(-) diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index 9419ad10541b..86cdf671d7e2 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -63,9 +63,8 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id, entry->gfp_flags = gfp_flags; entry->node = node; - filter_check_discard(call, entry, event); - - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); } @@ -90,9 +89,8 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id, entry->call_site = call_site; entry->ptr = ptr; - filter_check_discard(call, entry, event); - - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d880ab2772ce..c0047fcf7076 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -171,6 +171,12 @@ static struct trace_array global_trace; static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); +int filter_current_check_discard(struct ftrace_event_call *call, void *rec, + struct ring_buffer_event *event) +{ + return filter_check_discard(call, rec, global_trace.buffer, event); +} + cycle_t ftrace_now(int cpu) { u64 ts; @@ -919,9 +925,8 @@ trace_function(struct trace_array *tr, entry->ip = ip; entry->parent_ip = parent_ip; - filter_check_discard(call, entry, event); - - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); } #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -943,8 +948,8 @@ static int __trace_graph_entry(struct trace_array *tr, return 0; entry = ring_buffer_event_data(event); entry->graph_ent = *trace; - filter_check_discard(call, entry, event); - ring_buffer_unlock_commit(global_trace.buffer, event); + if (!filter_current_check_discard(call, entry, event)) + ring_buffer_unlock_commit(global_trace.buffer, event); return 1; } @@ -967,8 +972,8 @@ static void __trace_graph_return(struct trace_array *tr, return; entry = ring_buffer_event_data(event); entry->ret = *trace; - filter_check_discard(call, entry, event); - ring_buffer_unlock_commit(global_trace.buffer, event); + if (!filter_current_check_discard(call, entry, event)) + ring_buffer_unlock_commit(global_trace.buffer, event); } #endif @@ -1004,8 +1009,8 @@ static void __ftrace_trace_stack(struct trace_array *tr, trace.entries = entry->caller; save_stack_trace(&trace); - filter_check_discard(call, entry, event); - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); #endif } @@ -1052,8 +1057,8 @@ static void ftrace_trace_userstack(struct trace_array *tr, trace.entries = entry->caller; save_stack_trace_user(&trace); - 
filter_check_discard(call, entry, event); - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); #endif } @@ -1114,9 +1119,8 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->next_state = next->state; entry->next_cpu = task_cpu(next); - filter_check_discard(call, entry, event); - - trace_buffer_unlock_commit(tr, event, flags, pc); + if (!filter_check_discard(call, entry, tr->buffer, event)) + trace_buffer_unlock_commit(tr, event, flags, pc); } void @@ -1142,9 +1146,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->next_state = wakee->state; entry->next_cpu = task_cpu(wakee); - filter_check_discard(call, entry, event); - - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); ftrace_trace_stack(tr, flags, 6, pc); ftrace_trace_userstack(tr, flags, pc); } @@ -1285,8 +1288,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) entry->fmt = fmt; memcpy(entry->buf, trace_buf, sizeof(u32) * len); - filter_check_discard(call, entry, event); - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); out_unlock: __raw_spin_unlock(&trace_buf_lock); @@ -1341,8 +1344,8 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) memcpy(&entry->buf, trace_buf, len); entry->buf[len] = 0; - filter_check_discard(call, entry, event); - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); out_unlock: __raw_spin_unlock(&trace_buf_lock); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index dfefffd7ae39..9729d14767d8 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -866,13 +866,21 @@ extern int filter_match_preds(struct ftrace_event_call *call, void *rec); extern void filter_free_subsystem_preds(struct event_subsystem *system); extern int filter_add_subsystem_pred(struct event_subsystem *system, struct filter_pred *pred); +extern int filter_current_check_discard(struct ftrace_event_call *call, + void *rec, + struct ring_buffer_event *event); -static inline void +static inline int filter_check_discard(struct ftrace_event_call *call, void *rec, + struct ring_buffer *buffer, struct ring_buffer_event *event) { - if (unlikely(call->preds) && !filter_match_preds(call, rec)) - ring_buffer_event_discard(event); + if (unlikely(call->preds) && !filter_match_preds(call, rec)) { + ring_buffer_discard_commit(buffer, event); + return 1; + } + + return 0; } #define __common_field(type, item) \ diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index c95c25d838ef..8e64e604f5a7 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -74,9 +74,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) entry->line = f->line; entry->correct = val == expect; - filter_check_discard(call, entry, event); - - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); out: atomic_dec(&tr->data[cpu]->disabled); diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index d2f34bf30e59..b2b298269eb0 100644 --- a/kernel/trace/trace_events_stage_3.h +++ 
b/kernel/trace/trace_events_stage_3.h @@ -222,11 +222,8 @@ static void ftrace_raw_event_##call(proto) \ \ assign; \ \ - if (call->preds && !filter_match_preds(call, entry)) \ - trace_current_buffer_discard_commit(event); \ - else \ + if (!filter_current_check_discard(call, entry, event)) \ trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \ - \ } \ \ static int ftrace_raw_reg_event_##call(void) \ diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index e6b275b22ac0..8683d50a753a 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -195,8 +195,8 @@ void trace_hw_branch(u64 from, u64 to) entry->ent.type = TRACE_HW_BRANCHES; entry->from = from; entry->to = to; - filter_check_discard(call, entry, event); - trace_buffer_unlock_commit(tr, event, 0, 0); + if (!filter_check_discard(call, entry, tr->buffer, event)) + trace_buffer_unlock_commit(tr, event, 0, 0); out: atomic_dec(&tr->data[cpu]->disabled); diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c index 8ce7d7d62c07..810a5b7cf1c5 100644 --- a/kernel/trace/trace_power.c +++ b/kernel/trace/trace_power.c @@ -55,8 +55,8 @@ static void probe_power_end(struct power_trace *it) goto out; entry = ring_buffer_event_data(event); entry->state_data = *it; - filter_check_discard(call, entry, event); - trace_buffer_unlock_commit(tr, event, 0, 0); + if (!filter_check_discard(call, entry, tr->buffer, event)) + trace_buffer_unlock_commit(tr, event, 0, 0); out: preempt_enable(); } @@ -87,8 +87,8 @@ static void probe_power_mark(struct power_trace *it, unsigned int type, goto out; entry = ring_buffer_event_data(event); entry->state_data = *it; - filter_check_discard(call, entry, event); - trace_buffer_unlock_commit(tr, event, 0, 0); + if (!filter_check_discard(call, entry, tr->buffer, event)) + trace_buffer_unlock_commit(tr, event, 0, 0); out: preempt_enable(); } -- cgit v1.2.2 From 0a19e53c1514ad8e9c3cbab40c6c3f52c86f403d Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 13 Apr 2009 03:17:50 -0500 Subject: tracing/filters: allow on-the-fly filter switching This patch allows event filters to be safely removed or switched on-the-fly while avoiding the use of rcu or the suspension of tracing of previous versions. It does it by adding a new filter_pred_none() predicate function which does nothing and by never deallocating either the predicates or any of the filter_pred members used in matching; the predicate lists are allocated and initialized during ftrace_event_calls initialization. Whenever a filter is removed or replaced, the filter_pred_* functions currently in use by the affected ftrace_event_call are immediately switched over to to the filter_pred_none() function, while the rest of the filter_pred members are left intact, allowing any currently executing filter_pred_* functions to finish up, using the values they're currently using. In the case of filter replacement, the new predicate values are copied into the old predicates after the above step, and the filter_pred_none() functions are replaced by the filter_pred_* functions for the new filter. In this case, it is possible though very unlikely that a previous filter_pred_* is still running even after the filter_pred_none() switch and the switch to the new filter_pred_*. 
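Condensed into code (an illustrative sketch, not a hunk from the patch; example_replace_filter is a made-up wrapper around the helpers added below, and the single-predicate bookkeeping is simplified), the replacement sequence is:

	static int example_replace_filter(struct ftrace_event_call *call,
					  struct filter_pred *new_pred,
					  filter_pred_fn_t fn)
	{
		/*
		 * Step 1: stop matching without freeing anything.  Every
		 * pre-allocated slot's ->fn becomes filter_pred_none(), so a
		 * filter_pred_* call already in flight keeps reading valid
		 * (old) values.
		 */
		filter_disable_preds(call);

		/*
		 * Step 2: copy the new predicate values into the
		 * pre-allocated slot, then install the real matching
		 * function.
		 */
		filter_clear_pred(call->preds[0]);
		if (filter_set_pred(call->preds[0], new_pred, fn))
			return -ENOMEM;
		call->n_preds = 1;

		/*
		 * A filter_pred_* call that started before step 1 may, very
		 * rarely, still be running here and see the new values.
		 */
		return 0;
	}
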
In that case, however, because nothing has been deallocated in the filter_pred, the worst that can happen is that the old filter_pred_* function sees the new values and as a result produces either a false positive or a false negative, depending on the values it finds. So one downside to this method is that rarely, it can produce a bad match during the filter switch, but it should be possible to live with that, IMHO. The other downside is that at least in this patch the predicate lists are always pre-allocated, taking up memory from the start. They could probably be allocated on first-use, and de-allocated when tracing is completely stopped - if this patch makes sense, I could create another one to do that later on. Oh, and it also places a restriction on the size of __arrays in events, currently set to 128, since they can't be larger than the now embedded str_val arrays in the filter_pred struct. Signed-off-by: Tom Zanussi Acked-by: Frederic Weisbecker Cc: Steven Rostedt Cc: paulmck@linux.vnet.ibm.com LKML-Reference: <1239610670.6660.49.camel@tropicana> Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 14 +- kernel/trace/trace_events.c | 9 +- kernel/trace/trace_events_filter.c | 252 +++++++++++++++++++----------------- kernel/trace/trace_events_stage_2.h | 1 + kernel/trace/trace_events_stage_3.h | 1 + kernel/trace/trace_export.c | 1 + 6 files changed, 150 insertions(+), 128 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9729d14767d8..b05b6ac982a1 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -813,6 +813,7 @@ struct ftrace_event_call { int (*show_format)(struct trace_seq *s); int (*define_fields)(void); struct list_head fields; + int n_preds; struct filter_pred **preds; #ifdef CONFIG_EVENT_PROFILE @@ -826,6 +827,7 @@ struct event_subsystem { struct list_head list; const char *name; struct dentry *entry; + int n_preds; struct filter_pred **preds; }; @@ -834,7 +836,8 @@ struct event_subsystem { (unsigned long)event < (unsigned long)__stop_ftrace_events; \ event++) -#define MAX_FILTER_PRED 8 +#define MAX_FILTER_PRED 8 +#define MAX_FILTER_STR_VAL 128 struct filter_pred; @@ -843,7 +846,7 @@ typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); struct filter_pred { filter_pred_fn_t fn; u64 val; - char *str_val; + char str_val[MAX_FILTER_STR_VAL]; int str_len; char *field_name; int offset; @@ -855,13 +858,14 @@ struct filter_pred { int trace_define_field(struct ftrace_event_call *call, char *type, char *name, int offset, int size); +extern int init_preds(struct ftrace_event_call *call); extern void filter_free_pred(struct filter_pred *pred); -extern void filter_print_preds(struct filter_pred **preds, +extern void filter_print_preds(struct filter_pred **preds, int n_preds, struct trace_seq *s); extern int filter_parse(char **pbuf, struct filter_pred *pred); extern int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred); -extern void filter_free_preds(struct ftrace_event_call *call); +extern void filter_disable_preds(struct ftrace_event_call *call); extern int filter_match_preds(struct ftrace_event_call *call, void *rec); extern void filter_free_subsystem_preds(struct event_subsystem *system); extern int filter_add_subsystem_pred(struct event_subsystem *system, @@ -875,7 +879,7 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, struct ring_buffer *buffer, struct ring_buffer_event *event) { - if (unlikely(call->preds) && !filter_match_preds(call, rec)) { + if (unlikely(call->n_preds) && 
!filter_match_preds(call, rec)) { ring_buffer_discard_commit(buffer, event); return 1; } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 789e14eb09a5..ead68ac99191 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -481,7 +481,7 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); - filter_print_preds(call->preds, s); + filter_print_preds(call->preds, call->n_preds, s); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); kfree(s); @@ -516,7 +516,7 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, } if (pred->clear) { - filter_free_preds(call); + filter_disable_preds(call); filter_free_pred(pred); return cnt; } @@ -527,6 +527,8 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, return err; } + filter_free_pred(pred); + *ppos += cnt; return cnt; @@ -549,7 +551,7 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); - filter_print_preds(system->preds, s); + filter_print_preds(system->preds, system->n_preds, s); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); kfree(s); @@ -712,6 +714,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events) list_add(&system->list, &event_subsystems); system->preds = NULL; + system->n_preds = 0; entry = debugfs_create_file("filter", 0644, system->entry, system, &ftrace_subsystem_filter_fops); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 9f8ecca34a59..de42dad42a88 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -82,25 +82,27 @@ static int filter_pred_string(struct filter_pred *pred, void *event) return match; } +static int filter_pred_none(struct filter_pred *pred, void *event) +{ + return 0; +} + /* return 1 if event matches, 0 otherwise (discard) */ int filter_match_preds(struct ftrace_event_call *call, void *rec) { int i, matched, and_failed = 0; struct filter_pred *pred; - for (i = 0; i < MAX_FILTER_PRED; i++) { - if (call->preds[i]) { - pred = call->preds[i]; - if (and_failed && !pred->or) - continue; - matched = pred->fn(pred, rec); - if (!matched && !pred->or) { - and_failed = 1; - continue; - } else if (matched && pred->or) - return 1; - } else - break; + for (i = 0; i < call->n_preds; i++) { + pred = call->preds[i]; + if (and_failed && !pred->or) + continue; + matched = pred->fn(pred, rec); + if (!matched && !pred->or) { + and_failed = 1; + continue; + } else if (matched && pred->or) + return 1; } if (and_failed) @@ -109,31 +111,29 @@ int filter_match_preds(struct ftrace_event_call *call, void *rec) return 1; } -void filter_print_preds(struct filter_pred **preds, struct trace_seq *s) +void filter_print_preds(struct filter_pred **preds, int n_preds, + struct trace_seq *s) { char *field_name; struct filter_pred *pred; int i; - if (!preds) { + if (!n_preds) { trace_seq_printf(s, "none\n"); return; } - for (i = 0; i < MAX_FILTER_PRED; i++) { - if (preds[i]) { - pred = preds[i]; - field_name = pred->field_name; - if (i) - trace_seq_printf(s, pred->or ? "|| " : "&& "); - trace_seq_printf(s, "%s ", field_name); - trace_seq_printf(s, pred->not ? "!= " : "== "); - if (pred->str_val) - trace_seq_printf(s, "%s\n", pred->str_val); - else - trace_seq_printf(s, "%llu\n", pred->val); - } else - break; + for (i = 0; i < n_preds; i++) { + pred = preds[i]; + field_name = pred->field_name; + if (i) + trace_seq_printf(s, pred->or ? 
"|| " : "&& "); + trace_seq_printf(s, "%s ", field_name); + trace_seq_printf(s, pred->not ? "!= " : "== "); + if (pred->str_len) + trace_seq_printf(s, "%s\n", pred->str_val); + else + trace_seq_printf(s, "%llu\n", pred->val); } } @@ -156,20 +156,69 @@ void filter_free_pred(struct filter_pred *pred) return; kfree(pred->field_name); - kfree(pred->str_val); kfree(pred); } -void filter_free_preds(struct ftrace_event_call *call) +static void filter_clear_pred(struct filter_pred *pred) +{ + kfree(pred->field_name); + pred->field_name = NULL; + pred->str_len = 0; +} + +static int filter_set_pred(struct filter_pred *dest, + struct filter_pred *src, + filter_pred_fn_t fn) +{ + *dest = *src; + dest->field_name = kstrdup(src->field_name, GFP_KERNEL); + if (!dest->field_name) + return -ENOMEM; + dest->fn = fn; + + return 0; +} + +void filter_disable_preds(struct ftrace_event_call *call) { int i; - if (call->preds) { - for (i = 0; i < MAX_FILTER_PRED; i++) + call->n_preds = 0; + + for (i = 0; i < MAX_FILTER_PRED; i++) + call->preds[i]->fn = filter_pred_none; +} + +int init_preds(struct ftrace_event_call *call) +{ + struct filter_pred *pred; + int i; + + call->n_preds = 0; + + call->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); + if (!call->preds) + return -ENOMEM; + + for (i = 0; i < MAX_FILTER_PRED; i++) { + pred = kzalloc(sizeof(*pred), GFP_KERNEL); + if (!pred) + goto oom; + pred->fn = filter_pred_none; + call->preds[i] = pred; + } + + return 0; + +oom: + for (i = 0; i < MAX_FILTER_PRED; i++) { + if (call->preds[i]) filter_free_pred(call->preds[i]); - kfree(call->preds); - call->preds = NULL; } + kfree(call->preds); + call->preds = NULL; + + return -ENOMEM; } void filter_free_subsystem_preds(struct event_subsystem *system) @@ -177,11 +226,12 @@ void filter_free_subsystem_preds(struct event_subsystem *system) struct ftrace_event_call *call = __start_ftrace_events; int i; - if (system->preds) { - for (i = 0; i < MAX_FILTER_PRED; i++) + if (system->n_preds) { + for (i = 0; i < system->n_preds; i++) filter_free_pred(system->preds[i]); kfree(system->preds); system->preds = NULL; + system->n_preds = 0; } events_for_each(call) { @@ -189,33 +239,31 @@ void filter_free_subsystem_preds(struct event_subsystem *system) continue; if (!strcmp(call->system, system->name)) - filter_free_preds(call); + filter_disable_preds(call); } } static int __filter_add_pred(struct ftrace_event_call *call, - struct filter_pred *pred) + struct filter_pred *pred, + filter_pred_fn_t fn) { - int i; + int idx, err; - if (call->preds && !pred->compound) - filter_free_preds(call); + if (call->n_preds && !pred->compound) + filter_disable_preds(call); - if (!call->preds) { - call->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), - GFP_KERNEL); - if (!call->preds) - return -ENOMEM; - } + if (call->n_preds == MAX_FILTER_PRED) + return -ENOSPC; - for (i = 0; i < MAX_FILTER_PRED; i++) { - if (!call->preds[i]) { - call->preds[i] = pred; - return 0; - } - } + idx = call->n_preds; + filter_clear_pred(call->preds[idx]); + err = filter_set_pred(call->preds[idx], pred, fn); + if (err) + return err; + + call->n_preds++; - return -ENOSPC; + return 0; } static int is_string_field(const char *type) @@ -229,98 +277,66 @@ static int is_string_field(const char *type) int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) { struct ftrace_event_field *field; + filter_pred_fn_t fn; field = find_event_field(call, pred->field_name); if (!field) return -EINVAL; + pred->fn = filter_pred_none; pred->offset = field->offset; if 
(is_string_field(field->type)) { - if (!pred->str_val) + if (!pred->str_len) return -EINVAL; - pred->fn = filter_pred_string; + fn = filter_pred_string; pred->str_len = field->size; - return __filter_add_pred(call, pred); + return __filter_add_pred(call, pred, fn); } else { - if (pred->str_val) + if (pred->str_len) return -EINVAL; } switch (field->size) { case 8: - pred->fn = filter_pred_64; + fn = filter_pred_64; break; case 4: - pred->fn = filter_pred_32; + fn = filter_pred_32; break; case 2: - pred->fn = filter_pred_16; + fn = filter_pred_16; break; case 1: - pred->fn = filter_pred_8; + fn = filter_pred_8; break; default: return -EINVAL; } - return __filter_add_pred(call, pred); -} - -static struct filter_pred *copy_pred(struct filter_pred *pred) -{ - struct filter_pred *new_pred = kmalloc(sizeof(*pred), GFP_KERNEL); - if (!new_pred) - return NULL; - - memcpy(new_pred, pred, sizeof(*pred)); - - if (pred->field_name) { - new_pred->field_name = kstrdup(pred->field_name, GFP_KERNEL); - if (!new_pred->field_name) { - kfree(new_pred); - return NULL; - } - } - - if (pred->str_val) { - new_pred->str_val = kstrdup(pred->str_val, GFP_KERNEL); - if (!new_pred->str_val) { - filter_free_pred(new_pred); - return NULL; - } - } - - return new_pred; + return __filter_add_pred(call, pred, fn); } int filter_add_subsystem_pred(struct event_subsystem *system, struct filter_pred *pred) { struct ftrace_event_call *call = __start_ftrace_events; - struct filter_pred *event_pred; - int i; - if (system->preds && !pred->compound) + if (system->n_preds && !pred->compound) filter_free_subsystem_preds(system); - if (!system->preds) { + if (!system->n_preds) { system->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); if (!system->preds) return -ENOMEM; } - for (i = 0; i < MAX_FILTER_PRED; i++) { - if (!system->preds[i]) { - system->preds[i] = pred; - break; - } - } - - if (i == MAX_FILTER_PRED) + if (system->n_preds == MAX_FILTER_PRED) return -ENOSPC; + system->preds[system->n_preds] = pred; + events_for_each(call) { int err; @@ -333,22 +349,16 @@ int filter_add_subsystem_pred(struct event_subsystem *system, if (!find_event_field(call, pred->field_name)) continue; - event_pred = copy_pred(pred); - if (!event_pred) - goto oom; - - err = filter_add_pred(call, event_pred); - if (err) - filter_free_pred(event_pred); - if (err == -ENOMEM) - goto oom; + err = filter_add_pred(call, pred); + if (err == -ENOMEM) { + system->preds[system->n_preds] = NULL; + return err; + } } - return 0; + system->n_preds++; -oom: - system->preds[i] = NULL; - return -ENOMEM; + return 0; } int filter_parse(char **pbuf, struct filter_pred *pred) @@ -410,7 +420,8 @@ int filter_parse(char **pbuf, struct filter_pred *pred) } } - if (!val_str) { + if (!val_str || !strlen(val_str) + || strlen(val_str) >= MAX_FILTER_STR_VAL) { pred->field_name = NULL; return -EINVAL; } @@ -419,11 +430,12 @@ int filter_parse(char **pbuf, struct filter_pred *pred) if (!pred->field_name) return -ENOMEM; + pred->str_len = 0; pred->val = simple_strtoull(val_str, &tmp, 0); if (tmp == val_str) { - pred->str_val = kstrdup(val_str, GFP_KERNEL); - if (!pred->str_val) - return -ENOMEM; + strncpy(pred->str_val, val_str, MAX_FILTER_STR_VAL); + pred->str_len = strlen(val_str); + pred->str_val[pred->str_len] = '\0'; } else if (*tmp != '\0') return -EINVAL; diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h index 02fb710193ed..59cfd7dfe68d 100644 --- a/kernel/trace/trace_events_stage_2.h +++ b/kernel/trace/trace_events_stage_2.h @@ -140,6 
+140,7 @@ ftrace_format_##call(struct trace_seq *s) \ #undef __array #define __array(type, item, len) \ + BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ ret = trace_define_field(event_call, #type "[" #len "]", #item, \ offsetof(typeof(field), item), \ sizeof(field.item)); \ diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index b2b298269eb0..5bb1b7ffbdb6 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -255,6 +255,7 @@ static int ftrace_raw_init_event_##call(void) \ return -ENODEV; \ event_##call.id = id; \ INIT_LIST_HEAD(&event_##call.fields); \ + init_preds(&event_##call); \ return 0; \ } \ \ diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 77c494f5e1d6..48fc02fe73a0 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -122,6 +122,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ static int ftrace_raw_init_event_##call(void) \ { \ INIT_LIST_HEAD(&event_##call.fields); \ + init_preds(&event_##call); \ return 0; \ } \ -- cgit v1.2.2 From ea20d9293ce423a39717ed4375393129a2e701f9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 10 Apr 2009 08:54:16 -0400 Subject: tracing: consolidate trace and trace_event headers Impact: clean up Neil Horman (et. al.) criticized the way the trace events were broken up into two files. The reason for that was that ftrace needed to separate out the declarations from where the #include was used. It then dawned on me that the tracepoint.h header only needs to define the TRACE_EVENT macro if it is not already defined. The solution is simply to test if TRACE_EVENT is defined, and if it is not then the linux/tracepoint.h header can define it. This change consolidates all the .h and _event_types.h into the .h file. Reported-by: Neil Horman Reported-by: Theodore Tso Reported-by: Jiaying Zhang Cc: Zhaolei Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Jason Baron Cc: Mathieu Desnoyers Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 9 +- include/trace/irq.h | 51 +++++- include/trace/irq_event_types.h | 55 ------ include/trace/kmem.h | 189 +++++++++++++++++++- include/trace/lockdep.h | 52 +++++- include/trace/lockdep_event_types.h | 57 ------ include/trace/sched.h | 333 ++++++++++++++++++++++++++++++++++- include/trace/sched_event_types.h | 337 ------------------------------------ include/trace/skb.h | 36 +++- include/trace/skb_event_types.h | 38 ---- include/trace/trace_event_types.h | 7 - kernel/trace/events.c | 1 + kernel/trace/trace_events_stage_1.h | 4 +- kernel/trace/trace_events_stage_2.h | 8 +- kernel/trace/trace_events_stage_3.h | 4 +- 15 files changed, 663 insertions(+), 518 deletions(-) delete mode 100644 include/trace/irq_event_types.h delete mode 100644 include/trace/lockdep_event_types.h delete mode 100644 include/trace/sched_event_types.h delete mode 100644 include/trace/skb_event_types.h delete mode 100644 include/trace/trace_event_types.h diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index d35a7ee7611f..4353f3f7e624 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -31,6 +31,8 @@ struct tracepoint { * Keep in sync with vmlinux.lds.h. */ +#ifndef DECLARE_TRACE + #define TP_PROTO(args...) args #define TP_ARGS(args...) args @@ -114,6 +116,7 @@ static inline void tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end) { } #endif /* CONFIG_TRACEPOINTS */ +#endif /* DECLARE_TRACE */ /* * Connect a probe to a tracepoint. 
@@ -154,10 +157,13 @@ static inline void tracepoint_synchronize_unregister(void) } #define PARAMS(args...) args + +#ifndef TRACE_FORMAT #define TRACE_FORMAT(name, proto, args, fmt) \ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) +#endif - +#ifndef TRACE_EVENT /* * For use with the TRACE_EVENT macro: * @@ -262,5 +268,6 @@ static inline void tracepoint_synchronize_unregister(void) #define TRACE_EVENT(name, proto, args, struct, assign, print) \ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) +#endif #endif diff --git a/include/trace/irq.h b/include/trace/irq.h index ff5d4495dc37..04ab4c652225 100644 --- a/include/trace/irq.h +++ b/include/trace/irq.h @@ -1,9 +1,54 @@ -#ifndef _TRACE_IRQ_H +#if !defined(_TRACE_IRQ_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_IRQ_H -#include #include +#include + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM irq + +/* + * Tracepoint for entry of interrupt handler: + */ +TRACE_FORMAT(irq_handler_entry, + TP_PROTO(int irq, struct irqaction *action), + TP_ARGS(irq, action), + TP_FMT("irq=%d handler=%s", irq, action->name) + ); + +/* + * Tracepoint for return of an interrupt handler: + */ +TRACE_EVENT(irq_handler_exit, + + TP_PROTO(int irq, struct irqaction *action, int ret), + + TP_ARGS(irq, action, ret), + + TP_STRUCT__entry( + __field( int, irq ) + __field( int, ret ) + ), + + TP_fast_assign( + __entry->irq = irq; + __entry->ret = ret; + ), + + TP_printk("irq=%d return=%s", + __entry->irq, __entry->ret ? "handled" : "unhandled") +); + +TRACE_FORMAT(softirq_entry, + TP_PROTO(struct softirq_action *h, struct softirq_action *vec), + TP_ARGS(h, vec), + TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec]) + ); -#include +TRACE_FORMAT(softirq_exit, + TP_PROTO(struct softirq_action *h, struct softirq_action *vec), + TP_ARGS(h, vec), + TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec]) + ); #endif diff --git a/include/trace/irq_event_types.h b/include/trace/irq_event_types.h deleted file mode 100644 index 85964ebd47ec..000000000000 --- a/include/trace/irq_event_types.h +++ /dev/null @@ -1,55 +0,0 @@ - -/* use instead */ -#ifndef TRACE_FORMAT -# error Do not include this file directly. -# error Unless you know what you are doing. -#endif - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM irq - -/* - * Tracepoint for entry of interrupt handler: - */ -TRACE_FORMAT(irq_handler_entry, - TP_PROTO(int irq, struct irqaction *action), - TP_ARGS(irq, action), - TP_FMT("irq=%d handler=%s", irq, action->name) - ); - -/* - * Tracepoint for return of an interrupt handler: - */ -TRACE_EVENT(irq_handler_exit, - - TP_PROTO(int irq, struct irqaction *action, int ret), - - TP_ARGS(irq, action, ret), - - TP_STRUCT__entry( - __field( int, irq ) - __field( int, ret ) - ), - - TP_fast_assign( - __entry->irq = irq; - __entry->ret = ret; - ), - - TP_printk("irq=%d return=%s", - __entry->irq, __entry->ret ? 
"handled" : "unhandled") -); - -TRACE_FORMAT(softirq_entry, - TP_PROTO(struct softirq_action *h, struct softirq_action *vec), - TP_ARGS(h, vec), - TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec]) - ); - -TRACE_FORMAT(softirq_exit, - TP_PROTO(struct softirq_action *h, struct softirq_action *vec), - TP_ARGS(h, vec), - TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec]) - ); - -#undef TRACE_SYSTEM diff --git a/include/trace/kmem.h b/include/trace/kmem.h index 46efc2423f03..d7d12189e5c8 100644 --- a/include/trace/kmem.h +++ b/include/trace/kmem.h @@ -1,9 +1,192 @@ -#ifndef _TRACE_KMEM_H +#if !defined(_TRACE_KMEM_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_KMEM_H #include #include -#include +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kmem -#endif /* _TRACE_KMEM_H */ +TRACE_EVENT(kmalloc, + + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags), + + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + __field( size_t, bytes_req ) + __field( size_t, bytes_alloc ) + __field( gfp_t, gfp_flags ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + __entry->bytes_req = bytes_req; + __entry->bytes_alloc = bytes_alloc; + __entry->gfp_flags = gfp_flags; + ), + + TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x", + __entry->call_site, + __entry->ptr, + __entry->bytes_req, + __entry->bytes_alloc, + __entry->gfp_flags) +); + +TRACE_EVENT(kmem_cache_alloc, + + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags), + + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + __field( size_t, bytes_req ) + __field( size_t, bytes_alloc ) + __field( gfp_t, gfp_flags ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + __entry->bytes_req = bytes_req; + __entry->bytes_alloc = bytes_alloc; + __entry->gfp_flags = gfp_flags; + ), + + TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x", + __entry->call_site, + __entry->ptr, + __entry->bytes_req, + __entry->bytes_alloc, + __entry->gfp_flags) +); + +TRACE_EVENT(kmalloc_node, + + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node), + + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + __field( size_t, bytes_req ) + __field( size_t, bytes_alloc ) + __field( gfp_t, gfp_flags ) + __field( int, node ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + __entry->bytes_req = bytes_req; + __entry->bytes_alloc = bytes_alloc; + __entry->gfp_flags = gfp_flags; + __entry->node = node; + ), + + TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d", + __entry->call_site, + __entry->ptr, + __entry->bytes_req, + __entry->bytes_alloc, + __entry->gfp_flags, + __entry->node) +); + +TRACE_EVENT(kmem_cache_alloc_node, + + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node), + + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) 
+ __field( const void *, ptr ) + __field( size_t, bytes_req ) + __field( size_t, bytes_alloc ) + __field( gfp_t, gfp_flags ) + __field( int, node ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + __entry->bytes_req = bytes_req; + __entry->bytes_alloc = bytes_alloc; + __entry->gfp_flags = gfp_flags; + __entry->node = node; + ), + + TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d", + __entry->call_site, + __entry->ptr, + __entry->bytes_req, + __entry->bytes_alloc, + __entry->gfp_flags, + __entry->node) +); + +TRACE_EVENT(kfree, + + TP_PROTO(unsigned long call_site, const void *ptr), + + TP_ARGS(call_site, ptr), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + ), + + TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr) +); + +TRACE_EVENT(kmem_cache_free, + + TP_PROTO(unsigned long call_site, const void *ptr), + + TP_ARGS(call_site, ptr), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + ), + + TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr) +); + +#endif diff --git a/include/trace/lockdep.h b/include/trace/lockdep.h index 5ca67df87f2a..8ee7900b38c4 100644 --- a/include/trace/lockdep.h +++ b/include/trace/lockdep.h @@ -1,9 +1,57 @@ -#ifndef _TRACE_LOCKDEP_H +#if !defined(_TRACE_LOCKDEP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_LOCKDEP_H #include #include -#include +#undef TRACE_SYSTEM +#define TRACE_SYSTEM lock + +#ifdef CONFIG_LOCKDEP + +TRACE_FORMAT(lock_acquire, + TP_PROTO(struct lockdep_map *lock, unsigned int subclass, + int trylock, int read, int check, + struct lockdep_map *next_lock, unsigned long ip), + TP_ARGS(lock, subclass, trylock, read, check, next_lock, ip), + TP_FMT("%s%s%s", trylock ? "try " : "", + read ? "read " : "", lock->name) + ); + +TRACE_FORMAT(lock_release, + TP_PROTO(struct lockdep_map *lock, int nested, unsigned long ip), + TP_ARGS(lock, nested, ip), + TP_FMT("%s", lock->name) + ); + +#ifdef CONFIG_LOCK_STAT + +TRACE_FORMAT(lock_contended, + TP_PROTO(struct lockdep_map *lock, unsigned long ip), + TP_ARGS(lock, ip), + TP_FMT("%s", lock->name) + ); + +TRACE_EVENT(lock_acquired, + TP_PROTO(struct lockdep_map *lock, unsigned long ip, s64 waittime), + + TP_ARGS(lock, ip, waittime), + + TP_STRUCT__entry( + __field(const char *, name) + __field(unsigned long, wait_usec) + __field(unsigned long, wait_nsec_rem) + ), + TP_fast_assign( + __entry->name = lock->name; + __entry->wait_nsec_rem = do_div(waittime, NSEC_PER_USEC); + __entry->wait_usec = (unsigned long) waittime; + ), + TP_printk("%s (%lu.%03lu us)", __entry->name, __entry->wait_usec, + __entry->wait_nsec_rem) +); #endif +#endif + +#endif /* _TRACE_LOCKDEP_H */ diff --git a/include/trace/lockdep_event_types.h b/include/trace/lockdep_event_types.h deleted file mode 100644 index 863f1e4583a6..000000000000 --- a/include/trace/lockdep_event_types.h +++ /dev/null @@ -1,57 +0,0 @@ - -#ifndef TRACE_FORMAT -# error Do not include this file directly. -# error Unless you know what you are doing. 
-#endif - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM lock - -#ifdef CONFIG_LOCKDEP - -TRACE_FORMAT(lock_acquire, - TP_PROTO(struct lockdep_map *lock, unsigned int subclass, - int trylock, int read, int check, - struct lockdep_map *next_lock, unsigned long ip), - TP_ARGS(lock, subclass, trylock, read, check, next_lock, ip), - TP_FMT("%s%s%s", trylock ? "try " : "", - read ? "read " : "", lock->name) - ); - -TRACE_FORMAT(lock_release, - TP_PROTO(struct lockdep_map *lock, int nested, unsigned long ip), - TP_ARGS(lock, nested, ip), - TP_FMT("%s", lock->name) - ); - -#ifdef CONFIG_LOCK_STAT - -TRACE_FORMAT(lock_contended, - TP_PROTO(struct lockdep_map *lock, unsigned long ip), - TP_ARGS(lock, ip), - TP_FMT("%s", lock->name) - ); - -TRACE_EVENT(lock_acquired, - TP_PROTO(struct lockdep_map *lock, unsigned long ip, s64 waittime), - - TP_ARGS(lock, ip, waittime), - - TP_STRUCT__entry( - __field(const char *, name) - __field(unsigned long, wait_usec) - __field(unsigned long, wait_nsec_rem) - ), - TP_fast_assign( - __entry->name = lock->name; - __entry->wait_nsec_rem = do_div(waittime, NSEC_PER_USEC); - __entry->wait_usec = (unsigned long) waittime; - ), - TP_printk("%s (%lu.%03lu us)", __entry->name, __entry->wait_usec, - __entry->wait_nsec_rem) -); - -#endif -#endif - -#undef TRACE_SYSTEM diff --git a/include/trace/sched.h b/include/trace/sched.h index 4e372a1a29bf..5b1cf4a28463 100644 --- a/include/trace/sched.h +++ b/include/trace/sched.h @@ -1,9 +1,336 @@ -#ifndef _TRACE_SCHED_H +#if !defined(_TRACE_SCHED_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_SCHED_H #include #include -#include +#undef TRACE_SYSTEM +#define TRACE_SYSTEM sched -#endif +/* + * Tracepoint for calling kthread_stop, performed to end a kthread: + */ +TRACE_EVENT(sched_kthread_stop, + + TP_PROTO(struct task_struct *t), + + TP_ARGS(t), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + ), + + TP_fast_assign( + memcpy(__entry->comm, t->comm, TASK_COMM_LEN); + __entry->pid = t->pid; + ), + + TP_printk("task %s:%d", __entry->comm, __entry->pid) +); + +/* + * Tracepoint for the return value of the kthread stopping: + */ +TRACE_EVENT(sched_kthread_stop_ret, + + TP_PROTO(int ret), + + TP_ARGS(ret), + + TP_STRUCT__entry( + __field( int, ret ) + ), + + TP_fast_assign( + __entry->ret = ret; + ), + + TP_printk("ret %d", __entry->ret) +); + +/* + * Tracepoint for waiting on task to unschedule: + * + * (NOTE: the 'rq' argument is not used by generic trace events, + * but used by the latency tracer plugin. ) + */ +TRACE_EVENT(sched_wait_task, + + TP_PROTO(struct rq *rq, struct task_struct *p), + + TP_ARGS(rq, p), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->prio = p->prio; + ), + + TP_printk("task %s:%d [%d]", + __entry->comm, __entry->pid, __entry->prio) +); + +/* + * Tracepoint for waking up a task: + * + * (NOTE: the 'rq' argument is not used by generic trace events, + * but used by the latency tracer plugin. 
) + */ +TRACE_EVENT(sched_wakeup, + + TP_PROTO(struct rq *rq, struct task_struct *p, int success), + + TP_ARGS(rq, p, success), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + __field( int, success ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->prio = p->prio; + __entry->success = success; + ), + + TP_printk("task %s:%d [%d] success=%d", + __entry->comm, __entry->pid, __entry->prio, + __entry->success) +); + +/* + * Tracepoint for waking up a new task: + * + * (NOTE: the 'rq' argument is not used by generic trace events, + * but used by the latency tracer plugin. ) + */ +TRACE_EVENT(sched_wakeup_new, + + TP_PROTO(struct rq *rq, struct task_struct *p, int success), + + TP_ARGS(rq, p, success), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + __field( int, success ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->prio = p->prio; + __entry->success = success; + ), + + TP_printk("task %s:%d [%d] success=%d", + __entry->comm, __entry->pid, __entry->prio, + __entry->success) +); + +/* + * Tracepoint for task switches, performed by the scheduler: + * + * (NOTE: the 'rq' argument is not used by generic trace events, + * but used by the latency tracer plugin. ) + */ +TRACE_EVENT(sched_switch, + + TP_PROTO(struct rq *rq, struct task_struct *prev, + struct task_struct *next), + + TP_ARGS(rq, prev, next), + + TP_STRUCT__entry( + __array( char, prev_comm, TASK_COMM_LEN ) + __field( pid_t, prev_pid ) + __field( int, prev_prio ) + __array( char, next_comm, TASK_COMM_LEN ) + __field( pid_t, next_pid ) + __field( int, next_prio ) + ), + + TP_fast_assign( + memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); + __entry->prev_pid = prev->pid; + __entry->prev_prio = prev->prio; + memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); + __entry->next_pid = next->pid; + __entry->next_prio = next->prio; + ), + + TP_printk("task %s:%d [%d] ==> %s:%d [%d]", + __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, + __entry->next_comm, __entry->next_pid, __entry->next_prio) +); + +/* + * Tracepoint for a task being migrated: + */ +TRACE_EVENT(sched_migrate_task, + + TP_PROTO(struct task_struct *p, int orig_cpu, int dest_cpu), + + TP_ARGS(p, orig_cpu, dest_cpu), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + __field( int, orig_cpu ) + __field( int, dest_cpu ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->prio = p->prio; + __entry->orig_cpu = orig_cpu; + __entry->dest_cpu = dest_cpu; + ), + + TP_printk("task %s:%d [%d] from: %d to: %d", + __entry->comm, __entry->pid, __entry->prio, + __entry->orig_cpu, __entry->dest_cpu) +); + +/* + * Tracepoint for freeing a task: + */ +TRACE_EVENT(sched_process_free, + + TP_PROTO(struct task_struct *p), + + TP_ARGS(p), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->prio = p->prio; + ), + + TP_printk("task %s:%d [%d]", + __entry->comm, __entry->pid, __entry->prio) +); + +/* + * Tracepoint for a task exiting: + */ +TRACE_EVENT(sched_process_exit, + + TP_PROTO(struct task_struct *p), + + TP_ARGS(p), + + TP_STRUCT__entry( + __array( char, 
comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->prio = p->prio; + ), + + TP_printk("task %s:%d [%d]", + __entry->comm, __entry->pid, __entry->prio) +); + +/* + * Tracepoint for a waiting task: + */ +TRACE_EVENT(sched_process_wait, + + TP_PROTO(struct pid *pid), + + TP_ARGS(pid), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + ), + + TP_fast_assign( + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + __entry->pid = pid_nr(pid); + __entry->prio = current->prio; + ), + + TP_printk("task %s:%d [%d]", + __entry->comm, __entry->pid, __entry->prio) +); + +/* + * Tracepoint for do_fork: + */ +TRACE_EVENT(sched_process_fork, + + TP_PROTO(struct task_struct *parent, struct task_struct *child), + + TP_ARGS(parent, child), + + TP_STRUCT__entry( + __array( char, parent_comm, TASK_COMM_LEN ) + __field( pid_t, parent_pid ) + __array( char, child_comm, TASK_COMM_LEN ) + __field( pid_t, child_pid ) + ), + + TP_fast_assign( + memcpy(__entry->parent_comm, parent->comm, TASK_COMM_LEN); + __entry->parent_pid = parent->pid; + memcpy(__entry->child_comm, child->comm, TASK_COMM_LEN); + __entry->child_pid = child->pid; + ), + + TP_printk("parent %s:%d child %s:%d", + __entry->parent_comm, __entry->parent_pid, + __entry->child_comm, __entry->child_pid) +); + +/* + * Tracepoint for sending a signal: + */ +TRACE_EVENT(sched_signal_send, + + TP_PROTO(int sig, struct task_struct *p), + + TP_ARGS(sig, p), + + TP_STRUCT__entry( + __field( int, sig ) + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->sig = sig; + ), + + TP_printk("sig: %d task %s:%d", + __entry->sig, __entry->comm, __entry->pid) +); + +#endif /* _TRACE_SCHED_H */ diff --git a/include/trace/sched_event_types.h b/include/trace/sched_event_types.h deleted file mode 100644 index 63547dc1125f..000000000000 --- a/include/trace/sched_event_types.h +++ /dev/null @@ -1,337 +0,0 @@ - -/* use instead */ -#ifndef TRACE_EVENT -# error Do not include this file directly. -# error Unless you know what you are doing. -#endif - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM sched - -/* - * Tracepoint for calling kthread_stop, performed to end a kthread: - */ -TRACE_EVENT(sched_kthread_stop, - - TP_PROTO(struct task_struct *t), - - TP_ARGS(t), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - ), - - TP_fast_assign( - memcpy(__entry->comm, t->comm, TASK_COMM_LEN); - __entry->pid = t->pid; - ), - - TP_printk("task %s:%d", __entry->comm, __entry->pid) -); - -/* - * Tracepoint for the return value of the kthread stopping: - */ -TRACE_EVENT(sched_kthread_stop_ret, - - TP_PROTO(int ret), - - TP_ARGS(ret), - - TP_STRUCT__entry( - __field( int, ret ) - ), - - TP_fast_assign( - __entry->ret = ret; - ), - - TP_printk("ret %d", __entry->ret) -); - -/* - * Tracepoint for waiting on task to unschedule: - * - * (NOTE: the 'rq' argument is not used by generic trace events, - * but used by the latency tracer plugin. 
) - */ -TRACE_EVENT(sched_wait_task, - - TP_PROTO(struct rq *rq, struct task_struct *p), - - TP_ARGS(rq, p), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( int, prio ) - ), - - TP_fast_assign( - memcpy(__entry->comm, p->comm, TASK_COMM_LEN); - __entry->pid = p->pid; - __entry->prio = p->prio; - ), - - TP_printk("task %s:%d [%d]", - __entry->comm, __entry->pid, __entry->prio) -); - -/* - * Tracepoint for waking up a task: - * - * (NOTE: the 'rq' argument is not used by generic trace events, - * but used by the latency tracer plugin. ) - */ -TRACE_EVENT(sched_wakeup, - - TP_PROTO(struct rq *rq, struct task_struct *p, int success), - - TP_ARGS(rq, p, success), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( int, prio ) - __field( int, success ) - ), - - TP_fast_assign( - memcpy(__entry->comm, p->comm, TASK_COMM_LEN); - __entry->pid = p->pid; - __entry->prio = p->prio; - __entry->success = success; - ), - - TP_printk("task %s:%d [%d] success=%d", - __entry->comm, __entry->pid, __entry->prio, - __entry->success) -); - -/* - * Tracepoint for waking up a new task: - * - * (NOTE: the 'rq' argument is not used by generic trace events, - * but used by the latency tracer plugin. ) - */ -TRACE_EVENT(sched_wakeup_new, - - TP_PROTO(struct rq *rq, struct task_struct *p, int success), - - TP_ARGS(rq, p, success), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( int, prio ) - __field( int, success ) - ), - - TP_fast_assign( - memcpy(__entry->comm, p->comm, TASK_COMM_LEN); - __entry->pid = p->pid; - __entry->prio = p->prio; - __entry->success = success; - ), - - TP_printk("task %s:%d [%d] success=%d", - __entry->comm, __entry->pid, __entry->prio, - __entry->success) -); - -/* - * Tracepoint for task switches, performed by the scheduler: - * - * (NOTE: the 'rq' argument is not used by generic trace events, - * but used by the latency tracer plugin. 
) - */ -TRACE_EVENT(sched_switch, - - TP_PROTO(struct rq *rq, struct task_struct *prev, - struct task_struct *next), - - TP_ARGS(rq, prev, next), - - TP_STRUCT__entry( - __array( char, prev_comm, TASK_COMM_LEN ) - __field( pid_t, prev_pid ) - __field( int, prev_prio ) - __array( char, next_comm, TASK_COMM_LEN ) - __field( pid_t, next_pid ) - __field( int, next_prio ) - ), - - TP_fast_assign( - memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); - __entry->prev_pid = prev->pid; - __entry->prev_prio = prev->prio; - memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); - __entry->next_pid = next->pid; - __entry->next_prio = next->prio; - ), - - TP_printk("task %s:%d [%d] ==> %s:%d [%d]", - __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, - __entry->next_comm, __entry->next_pid, __entry->next_prio) -); - -/* - * Tracepoint for a task being migrated: - */ -TRACE_EVENT(sched_migrate_task, - - TP_PROTO(struct task_struct *p, int orig_cpu, int dest_cpu), - - TP_ARGS(p, orig_cpu, dest_cpu), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( int, prio ) - __field( int, orig_cpu ) - __field( int, dest_cpu ) - ), - - TP_fast_assign( - memcpy(__entry->comm, p->comm, TASK_COMM_LEN); - __entry->pid = p->pid; - __entry->prio = p->prio; - __entry->orig_cpu = orig_cpu; - __entry->dest_cpu = dest_cpu; - ), - - TP_printk("task %s:%d [%d] from: %d to: %d", - __entry->comm, __entry->pid, __entry->prio, - __entry->orig_cpu, __entry->dest_cpu) -); - -/* - * Tracepoint for freeing a task: - */ -TRACE_EVENT(sched_process_free, - - TP_PROTO(struct task_struct *p), - - TP_ARGS(p), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( int, prio ) - ), - - TP_fast_assign( - memcpy(__entry->comm, p->comm, TASK_COMM_LEN); - __entry->pid = p->pid; - __entry->prio = p->prio; - ), - - TP_printk("task %s:%d [%d]", - __entry->comm, __entry->pid, __entry->prio) -); - -/* - * Tracepoint for a task exiting: - */ -TRACE_EVENT(sched_process_exit, - - TP_PROTO(struct task_struct *p), - - TP_ARGS(p), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( int, prio ) - ), - - TP_fast_assign( - memcpy(__entry->comm, p->comm, TASK_COMM_LEN); - __entry->pid = p->pid; - __entry->prio = p->prio; - ), - - TP_printk("task %s:%d [%d]", - __entry->comm, __entry->pid, __entry->prio) -); - -/* - * Tracepoint for a waiting task: - */ -TRACE_EVENT(sched_process_wait, - - TP_PROTO(struct pid *pid), - - TP_ARGS(pid), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( int, prio ) - ), - - TP_fast_assign( - memcpy(__entry->comm, current->comm, TASK_COMM_LEN); - __entry->pid = pid_nr(pid); - __entry->prio = current->prio; - ), - - TP_printk("task %s:%d [%d]", - __entry->comm, __entry->pid, __entry->prio) -); - -/* - * Tracepoint for do_fork: - */ -TRACE_EVENT(sched_process_fork, - - TP_PROTO(struct task_struct *parent, struct task_struct *child), - - TP_ARGS(parent, child), - - TP_STRUCT__entry( - __array( char, parent_comm, TASK_COMM_LEN ) - __field( pid_t, parent_pid ) - __array( char, child_comm, TASK_COMM_LEN ) - __field( pid_t, child_pid ) - ), - - TP_fast_assign( - memcpy(__entry->parent_comm, parent->comm, TASK_COMM_LEN); - __entry->parent_pid = parent->pid; - memcpy(__entry->child_comm, child->comm, TASK_COMM_LEN); - __entry->child_pid = child->pid; - ), - - TP_printk("parent %s:%d child %s:%d", - __entry->parent_comm, __entry->parent_pid, - 
__entry->child_comm, __entry->child_pid) -); - -/* - * Tracepoint for sending a signal: - */ -TRACE_EVENT(sched_signal_send, - - TP_PROTO(int sig, struct task_struct *p), - - TP_ARGS(sig, p), - - TP_STRUCT__entry( - __field( int, sig ) - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - ), - - TP_fast_assign( - memcpy(__entry->comm, p->comm, TASK_COMM_LEN); - __entry->pid = p->pid; - __entry->sig = sig; - ), - - TP_printk("sig: %d task %s:%d", - __entry->sig, __entry->comm, __entry->pid) -); - -#undef TRACE_SYSTEM diff --git a/include/trace/skb.h b/include/trace/skb.h index d2de7174a6e8..e6fd281f7f81 100644 --- a/include/trace/skb.h +++ b/include/trace/skb.h @@ -1,9 +1,37 @@ -#ifndef _TRACE_SKB_H_ -#define _TRACE_SKB_H_ +#if !defined(_TRACE_SKB_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_SKB_H #include #include -#include +#undef TRACE_SYSTEM +#define TRACE_SYSTEM skb -#endif +/* + * Tracepoint for free an sk_buff: + */ +TRACE_EVENT(kfree_skb, + + TP_PROTO(struct sk_buff *skb, void *location), + + TP_ARGS(skb, location), + + TP_STRUCT__entry( + __field( void *, skbaddr ) + __field( unsigned short, protocol ) + __field( void *, location ) + ), + + TP_fast_assign( + __entry->skbaddr = skb; + if (skb) { + __entry->protocol = ntohs(skb->protocol); + } + __entry->location = location; + ), + + TP_printk("skbaddr=%p protocol=%u location=%p", + __entry->skbaddr, __entry->protocol, __entry->location) +); + +#endif /* _TRACE_SKB_H */ diff --git a/include/trace/skb_event_types.h b/include/trace/skb_event_types.h deleted file mode 100644 index 4a1c504c0e16..000000000000 --- a/include/trace/skb_event_types.h +++ /dev/null @@ -1,38 +0,0 @@ - -/* use instead */ -#ifndef TRACE_EVENT -# error Do not include this file directly. -# error Unless you know what you are doing. -#endif - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM skb - -/* - * Tracepoint for free an sk_buff: - */ -TRACE_EVENT(kfree_skb, - - TP_PROTO(struct sk_buff *skb, void *location), - - TP_ARGS(skb, location), - - TP_STRUCT__entry( - __field( void *, skbaddr ) - __field( unsigned short, protocol ) - __field( void *, location ) - ), - - TP_fast_assign( - __entry->skbaddr = skb; - if (skb) { - __entry->protocol = ntohs(skb->protocol); - } - __entry->location = location; - ), - - TP_printk("skbaddr=%p protocol=%u location=%p", - __entry->skbaddr, __entry->protocol, __entry->location) -); - -#undef TRACE_SYSTEM diff --git a/include/trace/trace_event_types.h b/include/trace/trace_event_types.h deleted file mode 100644 index 552a50e169a6..000000000000 --- a/include/trace/trace_event_types.h +++ /dev/null @@ -1,7 +0,0 @@ -/* trace/_event_types.h here */ - -#include -#include -#include -#include -#include diff --git a/kernel/trace/events.c b/kernel/trace/events.c index 246f2aa6dc46..5a35a914f0e2 100644 --- a/kernel/trace/events.c +++ b/kernel/trace/events.c @@ -8,6 +8,7 @@ #include "trace_output.h" +#define TRACE_HEADER_MULTI_READ #include "trace_events_stage_1.h" #include "trace_events_stage_2.h" #include "trace_events_stage_3.h" diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h index 38985f9b379c..475f46a047ae 100644 --- a/kernel/trace/trace_events_stage_1.h +++ b/kernel/trace/trace_events_stage_1.h @@ -1,7 +1,7 @@ /* * Stage 1 of the trace events. 
* - * Override the macros in to include the following: + * Override the macros in to include the following: * * struct ftrace_raw_ { * struct trace_entry ent; @@ -36,4 +36,4 @@ }; \ static struct ftrace_event_call event_##name -#include +#include diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h index 59cfd7dfe68d..aa4a67a0656f 100644 --- a/kernel/trace/trace_events_stage_2.h +++ b/kernel/trace/trace_events_stage_2.h @@ -1,7 +1,7 @@ /* * Stage 2 of the trace events. * - * Override the macros in to include the following: + * Override the macros in to include the following: * * enum print_line_t * ftrace_raw_output_(struct trace_iterator *iter, int flags) @@ -64,7 +64,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ return TRACE_TYPE_HANDLED; \ } -#include +#include /* * Setup the showing format of trace point. @@ -128,7 +128,7 @@ ftrace_format_##call(struct trace_seq *s) \ return ret; \ } -#include +#include #undef __field #define __field(type, item) \ @@ -167,4 +167,4 @@ ftrace_define_fields_##call(void) \ return ret; \ } -#include +#include diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 5bb1b7ffbdb6..45c04e1f38db 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -1,7 +1,7 @@ /* * Stage 3 of the trace events. * - * Override the macros in to include the following: + * Override the macros in to include the following: * * static void ftrace_event_(proto) * { @@ -272,7 +272,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ _TRACE_PROFILE_INIT(call) \ } -#include +#include #undef _TRACE_PROFILE #undef _TRACE_PROFILE_INIT -- cgit v1.2.2 From 56449f437add737a1e5e1cb7e00f63ac8ead1938 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 14 Apr 2009 11:24:36 +0200 Subject: tracing: make the trace clocks available generally Jeremy Fitzhardinge reported this build failure: LD .tmp_vmlinux1 arch/x86/kernel/built-in.o: In function `ds_take_timestamp': git/linux/arch/x86/kernel/ds.c:1380: undefined reference to `trace_clock_global' git/linux/arch/x86/kernel/ds.c:1380: undefined reference to `trace_clock_global' Which is due to !CONFIG_TRACING && CONFIG_X86_DS=y. Expose the trace clock code to CONFIG_X86_DS as well. [ Unfortunately librarizing doesnt work well - ancient architectures with no raw_local_irq_save() primitive break the build. 
] Reported-by: Jeremy Fitzhardinge LKML-Reference: <49E4413F.7070700@goop.org> Signed-off-by: Ingo Molnar --- kernel/Makefile | 1 + kernel/trace/Makefile | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/Makefile b/kernel/Makefile index bab1dffe37e9..c8e1be5f0b00 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -92,6 +92,7 @@ obj-$(CONFIG_LATENCYTOP) += latencytop.o obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ +obj-$(CONFIG_X86_DS) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o obj-$(CONFIG_SLOW_WORK) += slow-work.o diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 2630f5121ec1..ecc671e9f142 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -15,11 +15,16 @@ ifdef CONFIG_TRACING_BRANCHES KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING endif +# +# Make the trace clocks available generally: it's infrastructure +# relied on by ptrace for example: +# +obj-y += trace_clock.o + obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o obj-$(CONFIG_RING_BUFFER) += ring_buffer.o obj-$(CONFIG_TRACING) += trace.o -obj-$(CONFIG_TRACING) += trace_clock.o obj-$(CONFIG_TRACING) += trace_output.o obj-$(CONFIG_TRACING) += trace_stat.o obj-$(CONFIG_TRACING) += trace_printk.o -- cgit v1.2.2 From a8d154b009168337494fbf345671bab74d3e4b8b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 10 Apr 2009 09:36:00 -0400 Subject: tracing: create automated trace defines This patch lowers the number of places a developer must modify to add new tracepoints. The current method to add a new tracepoint into an existing system is to write the trace point macro in the trace header with one of the macros TRACE_EVENT, TRACE_FORMAT or DECLARE_TRACE, then they must add the same named item into the C file with the macro DEFINE_TRACE(name) and then add the trace point. This change cuts out the needing to add the DEFINE_TRACE(name). Every file that uses the tracepoint must still include the trace/.h file, but the one C file must also add a define before the including of that file. #define CREATE_TRACE_POINTS #include This will cause the trace/mytrace.h file to also produce the C code necessary to implement the trace point. Note, if more than one trace/.h is used to create the C code it is best to list them all together. #define CREATE_TRACE_POINTS #include #include #include Thanks to Mathieu Desnoyers and Christoph Hellwig for coming up with the cleaner solution of the define above the includes over my first design to have the C code include a "special" header. This patch converts sched, irq and lockdep and skb to use this new method. 
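As a sketch of the convention just described (the include targets below are reconstructed from context, since the bracketed names were stripped from this mail; the call site reuses the sched_switch prototype declared earlier in this series):

    /* every file that uses the tracepoints keeps including the header */
    #include <trace/sched.h>

    /*
     * exactly one C file -- kernel/sched.c in this series -- defines
     * CREATE_TRACE_POINTS before the include, so that
     * include/trace/define_trace.h also emits the DEFINE_TRACE() bodies
     * for every event declared in that header
     */
    #define CREATE_TRACE_POINTS
    #include <trace/sched.h>

    /* tracepoint call sites themselves are unchanged, e.g.: */
    trace_sched_switch(rq, prev, next);
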
Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Neil Horman Cc: Zhao Lei Cc: Eduard - Gabriel Munteanu Cc: Pekka Enberg Signed-off-by: Steven Rostedt --- include/trace/define_trace.h | 75 ++++++++++++++++++++++++++++++++++++++++++++ include/trace/irq.h | 5 ++- include/trace/kmem.h | 4 ++- include/trace/lockdep.h | 3 ++ include/trace/sched.h | 3 ++ include/trace/skb.h | 3 ++ kernel/exit.c | 4 --- kernel/fork.c | 2 -- kernel/irq/handle.c | 7 ++--- kernel/kthread.c | 3 -- kernel/lockdep.c | 12 ++----- kernel/sched.c | 10 ++---- kernel/signal.c | 2 -- kernel/softirq.c | 3 -- mm/util.c | 11 ++----- net/core/net-traces.c | 4 +-- 16 files changed, 105 insertions(+), 46 deletions(-) create mode 100644 include/trace/define_trace.h diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h new file mode 100644 index 000000000000..de9dc7d8508b --- /dev/null +++ b/include/trace/define_trace.h @@ -0,0 +1,75 @@ +/* + * Trace files that want to automate creationg of all tracepoints defined + * in their file should include this file. The following are macros that the + * trace file may define: + * + * TRACE_SYSTEM defines the system the tracepoint is for + * + * TRACE_INCLUDE_FILE if the file name is something other than TRACE_SYSTEM.h + * This macro may be defined to tell define_trace.h what file to include. + * Note, leave off the ".h". + * + * TRACE_INCLUDE_PATH if the path is something other than core kernel include/trace + * then this macro can define the path to use. Note, the path is relative to + * define_trace.h, not the file including it. Full path names for out of tree + * modules must be used. + */ + +#ifdef CREATE_TRACE_POINTS + +/* Prevent recursion */ +#undef CREATE_TRACE_POINTS + +#include + +#undef TRACE_EVENT +#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \ + DEFINE_TRACE(name) + +#undef TRACE_FORMAT +#define TRACE_FORMAT(name, proto, args, print) \ + DEFINE_TRACE(name) + +#undef DECLARE_TRACE +#define DECLARE_TRACE(name, proto, args) \ + DEFINE_TRACE(name) + +#undef TRACE_INCLUDE +#undef __TRACE_INCLUDE + +#ifndef TRACE_INCLUDE_FILE +# define TRACE_INCLUDE_FILE TRACE_SYSTEM +# define UNDEF_TRACE_INCLUDE_FILE +#endif + +#ifndef TRACE_INCLUDE_PATH +# define __TRACE_INCLUDE(system) +# define UNDEF_TRACE_INCLUDE_FILE +#else +# define __TRACE_INCLUDE(system) __stringify(TRACE_INCLUDE_PATH/system.h) +#endif + +# define TRACE_INCLUDE(system) __TRACE_INCLUDE(system) + +/* Let the trace headers be reread */ +#define TRACE_HEADER_MULTI_READ + +#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) + +#undef TRACE_HEADER_MULTI_READ + +/* Only undef what we defined in this file */ +#ifdef UNDEF_TRACE_INCLUDE_FILE +# undef TRACE_INCLUDE_PATH +# undef UNDEF_TRACE_INCLUDE_FILE +#endif + +#ifdef UNDEF_TRACE_INCLUDE_FILE +# undef TRACE_INCLUDE_PATH +# undef UNDEF_TRACE_INCLUDE_FILE +#endif + +/* We may be processing more files */ +#define CREATE_TRACE_POINTS + +#endif /* CREATE_TRACE_POINTS */ diff --git a/include/trace/irq.h b/include/trace/irq.h index 04ab4c652225..75e3468e4493 100644 --- a/include/trace/irq.h +++ b/include/trace/irq.h @@ -51,4 +51,7 @@ TRACE_FORMAT(softirq_exit, TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec]) ); -#endif +#endif /* _TRACE_IRQ_H */ + +/* This part must be outside protection */ +#include diff --git a/include/trace/kmem.h b/include/trace/kmem.h index d7d12189e5c8..c22c42f980b5 100644 --- a/include/trace/kmem.h +++ b/include/trace/kmem.h @@ -188,5 +188,7 @@ TRACE_EVENT(kmem_cache_free, TP_printk("call_site=%lx ptr=%p", 
__entry->call_site, __entry->ptr) ); +#endif /* _TRACE_KMEM_H */ -#endif +/* This part must be outside protection */ +#include diff --git a/include/trace/lockdep.h b/include/trace/lockdep.h index 8ee7900b38c4..4d301e758de3 100644 --- a/include/trace/lockdep.h +++ b/include/trace/lockdep.h @@ -55,3 +55,6 @@ TRACE_EVENT(lock_acquired, #endif #endif /* _TRACE_LOCKDEP_H */ + +/* This part must be outside protection */ +#include diff --git a/include/trace/sched.h b/include/trace/sched.h index 5b1cf4a28463..ffa1cab586b9 100644 --- a/include/trace/sched.h +++ b/include/trace/sched.h @@ -334,3 +334,6 @@ TRACE_EVENT(sched_signal_send, ); #endif /* _TRACE_SCHED_H */ + +/* This part must be outside protection */ +#include diff --git a/include/trace/skb.h b/include/trace/skb.h index e6fd281f7f81..1e8fabb57c06 100644 --- a/include/trace/skb.h +++ b/include/trace/skb.h @@ -35,3 +35,6 @@ TRACE_EVENT(kfree_skb, ); #endif /* _TRACE_SKB_H */ + +/* This part must be outside protection */ +#include diff --git a/kernel/exit.c b/kernel/exit.c index abf9cf3b95c6..2fe9d2c7eeee 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -56,10 +56,6 @@ #include #include "cred-internals.h" -DEFINE_TRACE(sched_process_free); -DEFINE_TRACE(sched_process_exit); -DEFINE_TRACE(sched_process_wait); - static void exit_mm(struct task_struct * tsk); static void __unhash_process(struct task_struct *p) diff --git a/kernel/fork.c b/kernel/fork.c index b9e2edd00726..4bebf2639235 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -83,8 +83,6 @@ DEFINE_PER_CPU(unsigned long, process_counts) = 0; __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ -DEFINE_TRACE(sched_process_fork); - int nr_processes(void) { int cpu; diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index d82142be8dd2..983d8be8dff7 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -17,9 +17,11 @@ #include #include #include -#include #include +#define CREATE_TRACE_POINTS +#include + #include "internals.h" /* @@ -348,9 +350,6 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action) "but no thread function available.", irq, action->name); } -DEFINE_TRACE(irq_handler_entry); -DEFINE_TRACE(irq_handler_exit); - /** * handle_IRQ_event - irq action chain handler * @irq: the interrupt number diff --git a/kernel/kthread.c b/kernel/kthread.c index 4ebaf8519abf..e1c76924545b 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -21,9 +21,6 @@ static DEFINE_SPINLOCK(kthread_create_lock); static LIST_HEAD(kthread_create_list); struct task_struct *kthreadd_task; -DEFINE_TRACE(sched_kthread_stop); -DEFINE_TRACE(sched_kthread_stop_ret); - struct kthread_create_info { /* Information passed to kthread() from kthreadd. 
*/ diff --git a/kernel/lockdep.c b/kernel/lockdep.c index c4582a6ea953..257f21a76c52 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -42,12 +42,14 @@ #include #include #include -#include #include #include "lockdep_internals.h" +#define CREATE_TRACE_POINTS +#include + #ifdef CONFIG_PROVE_LOCKING int prove_locking = 1; module_param(prove_locking, int, 0644); @@ -2929,8 +2931,6 @@ void lock_set_class(struct lockdep_map *lock, const char *name, } EXPORT_SYMBOL_GPL(lock_set_class); -DEFINE_TRACE(lock_acquire); - /* * We are not always called with irqs disabled - do that here, * and also avoid lockdep recursion: @@ -2957,8 +2957,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, } EXPORT_SYMBOL_GPL(lock_acquire); -DEFINE_TRACE(lock_release); - void lock_release(struct lockdep_map *lock, int nested, unsigned long ip) { @@ -3061,8 +3059,6 @@ found_it: put_lock_stats(stats); } -DEFINE_TRACE(lock_acquired); - static void __lock_acquired(struct lockdep_map *lock, unsigned long ip) { @@ -3118,8 +3114,6 @@ found_it: lock->ip = ip; } -DEFINE_TRACE(lock_contended); - void lock_contended(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; diff --git a/kernel/sched.c b/kernel/sched.c index 5724508c3b66..e6d4518d47e0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -72,13 +72,15 @@ #include #include #include -#include #include #include #include "sched_cpupri.h" +#define CREATE_TRACE_POINTS +#include + /* * Convert user-nice values [ -20 ... 0 ... 19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], @@ -118,12 +120,6 @@ */ #define RUNTIME_INF ((u64)~0ULL) -DEFINE_TRACE(sched_wait_task); -DEFINE_TRACE(sched_wakeup); -DEFINE_TRACE(sched_wakeup_new); -DEFINE_TRACE(sched_switch); -DEFINE_TRACE(sched_migrate_task); - #ifdef CONFIG_SMP static void double_rq_lock(struct rq *rq1, struct rq *rq2); diff --git a/kernel/signal.c b/kernel/signal.c index d8034737db4c..1d5703ff003c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -41,8 +41,6 @@ static struct kmem_cache *sigqueue_cachep; -DEFINE_TRACE(sched_signal_send); - static void __user *sig_handler(struct task_struct *t, int sig) { return t->sighand->action[sig - 1].sa.sa_handler; diff --git a/kernel/softirq.c b/kernel/softirq.c index 2fecefacdc5b..a2d9b458ac2b 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -186,9 +186,6 @@ EXPORT_SYMBOL(local_bh_enable_ip); */ #define MAX_SOFTIRQ_RESTART 10 -DEFINE_TRACE(softirq_entry); -DEFINE_TRACE(softirq_exit); - asmlinkage void __do_softirq(void) { struct softirq_action *h; diff --git a/mm/util.c b/mm/util.c index 2599e83eea17..0e74a22791cb 100644 --- a/mm/util.c +++ b/mm/util.c @@ -4,9 +4,11 @@ #include #include #include -#include #include +#define CREATE_TRACE_POINTS +#include + /** * kstrdup - allocate space for and copy an existing string * @s: the string to duplicate @@ -239,13 +241,6 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start, EXPORT_SYMBOL_GPL(get_user_pages_fast); /* Tracepoints definitions. 
*/ -DEFINE_TRACE(kmalloc); -DEFINE_TRACE(kmem_cache_alloc); -DEFINE_TRACE(kmalloc_node); -DEFINE_TRACE(kmem_cache_alloc_node); -DEFINE_TRACE(kfree); -DEFINE_TRACE(kmem_cache_free); - EXPORT_TRACEPOINT_SYMBOL(kmalloc); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); EXPORT_TRACEPOINT_SYMBOL(kmalloc_node); diff --git a/net/core/net-traces.c b/net/core/net-traces.c index c8fb45665e4f..801772059474 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -19,11 +19,11 @@ #include #include #include -#include #include #include +#define CREATE_TRACE_POINTS +#include -DEFINE_TRACE(kfree_skb); EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); -- cgit v1.2.2 From 9504504cbab29ecb694186b1c5b15d3579c43c51 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 11 Apr 2009 12:59:57 -0400 Subject: tracing: make trace_seq operations available for core kernel In the process to make TRACE_EVENT macro work for modules, the trace_seq operations must be available for core kernel code. These operations are quite useful and can be used for other implementations. The main idea is that we create a trace_seq handle that acts very much like the seq_file handle. struct trace_seq *s = kmalloc(sizeof(*s, GFP_KERNEL); trace_seq_init(s); trace_seq_printf(s, "some data %d\n", variable); printk("%s", s->buffer); The main use is to allow a top level function call several other functions that may store printf like data into the buffer. Then at the end, the top level function can process all the data with any method it would like to. It could be passed to userspace, output via printk or even use seq_file: trace_seq_to_user(s, ubuf, cnt); seq_puts(m, s->buffer); Signed-off-by: Steven Rostedt --- include/linux/trace_seq.h | 89 +++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace.h | 15 +------- kernel/trace/trace_output.h | 16 +------- 3 files changed, 92 insertions(+), 28 deletions(-) create mode 100644 include/linux/trace_seq.h diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h new file mode 100644 index 000000000000..28051da876dd --- /dev/null +++ b/include/linux/trace_seq.h @@ -0,0 +1,89 @@ +#ifndef _LINUX_TRACE_SEQ_H +#define _LINUX_TRACE_SEQ_H + +/* + * Trace sequences are used to allow a function to call several other functions + * to create a string of data to use (up to a max of PAGE_SIZE. + */ + +struct trace_seq { + unsigned char buffer[PAGE_SIZE]; + unsigned int len; + unsigned int readpos; +}; + +static inline void +trace_seq_init(struct trace_seq *s) +{ + s->len = 0; + s->readpos = 0; +} + +/* + * Currently only defined when tracing is enabled. + */ +#ifdef CONFIG_TRACING +extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); +extern int +trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary); +extern void trace_print_seq(struct seq_file *m, struct trace_seq *s); +extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, + size_t cnt); +extern int trace_seq_puts(struct trace_seq *s, const char *str); +extern int trace_seq_putc(struct trace_seq *s, unsigned char c); +extern int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len); +extern int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, + size_t len); +extern void *trace_seq_reserve(struct trace_seq *s, size_t len); +extern int trace_seq_path(struct trace_seq *s, struct path *path); + +#else /* CONFIG_TRACING */ +static inline int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) 
+ __attribute__ ((format (printf, 2, 3))) +{ + return 0; +} +static inline int +trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) +{ + return 0; +} + +static inline void trace_print_seq(struct seq_file *m, struct trace_seq *s) +{ +} +static inline ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, + size_t cnt) +{ + return 0; +} +static inline int trace_seq_puts(struct trace_seq *s, const char *str) +{ + return 0; +} +static inline int trace_seq_putc(struct trace_seq *s, unsigned char c); +{ + return 0; +} +static inline int +trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len) +{ + return 0; +} +static inline int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, + size_t len) +{ + return 0; +} +static inline void *trace_seq_reserve(struct trace_seq *s, size_t len) +{ + return NULL; +} +static inline int trace_seq_path(struct trace_seq *s, struct path *path) +{ + return 0; +} +#endif /* CONFIG_TRACING */ + +#endif /* _LINUX_TRACE_SEQ_H */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b05b6ac982a1..1882846b7389 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -12,6 +12,8 @@ #include #include +#include + enum trace_type { __TRACE_FIRST_TYPE = 0, @@ -423,19 +425,6 @@ struct tracer { struct tracer_stat *stats; }; -struct trace_seq { - unsigned char buffer[PAGE_SIZE]; - unsigned int len; - unsigned int readpos; -}; - -static inline void -trace_seq_init(struct trace_seq *s) -{ - s->len = 0; - s->readpos = 0; -} - #define TRACE_PIPE_ALL_CPU -1 diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 91630217fb46..5c7cbfb65c71 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -1,6 +1,7 @@ #ifndef __TRACE_EVENTS_H #define __TRACE_EVENTS_H +#include #include "trace.h" typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter, @@ -20,24 +21,9 @@ trace_print_bprintk_msg_only(struct trace_iterator *iter); extern enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter); -extern void trace_print_seq(struct seq_file *m, struct trace_seq *s); - -extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern int -trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary); extern int seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags); -extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, - size_t cnt); -extern int trace_seq_puts(struct trace_seq *s, const char *str); -extern int trace_seq_putc(struct trace_seq *s, unsigned char c); -extern int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len); -extern int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, - size_t len); -extern void *trace_seq_reserve(struct trace_seq *s, size_t len); -extern int trace_seq_path(struct trace_seq *s, struct path *path); extern int seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, unsigned long sym_flags); extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, -- cgit v1.2.2 From 97f2025153499faa17267a0d4e18c7afaf73f39d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 13 Apr 2009 11:20:49 -0400 Subject: tracing/events: move declarations from trace directory to core include In preparation to allowing trace events to happen in modules, we need to move some of the local declarations in the kernel/trace directory into include/linux. 
This patch simply moves the declarations and performs no context changes. Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 146 +++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace.h | 120 +---------------------------------- kernel/trace/trace_output.h | 14 ----- 3 files changed, 147 insertions(+), 133 deletions(-) create mode 100644 include/linux/ftrace_event.h diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h new file mode 100644 index 000000000000..496b76d9f9d8 --- /dev/null +++ b/include/linux/ftrace_event.h @@ -0,0 +1,146 @@ +#ifndef _LINUX_FTRACE_EVENT_H +#define _LINUX_FTRACE_EVENT_H + +#include +#include + + +struct trace_array; +struct tracer; + +/* + * The trace entry - the most basic unit of tracing. This is what + * is printed in the end as a single line in the trace output, such as: + * + * bash-15816 [01] 235.197585: idle_cpu <- irq_enter + */ +struct trace_entry { + unsigned char type; + unsigned char flags; + unsigned char preempt_count; + int pid; + int tgid; +}; + +/* + * Trace iterator - used by printout routines who present trace + * results to users and which routines might sleep, etc: + */ +struct trace_iterator { + struct trace_array *tr; + struct tracer *trace; + void *private; + int cpu_file; + struct mutex mutex; + struct ring_buffer_iter *buffer_iter[NR_CPUS]; + + /* The below is zeroed out in pipe_read */ + struct trace_seq seq; + struct trace_entry *ent; + int cpu; + u64 ts; + + unsigned long iter_flags; + loff_t pos; + long idx; + + cpumask_var_t started; +}; + + +typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter, + int flags); +struct trace_event { + struct hlist_node node; + int type; + trace_print_func trace; + trace_print_func raw; + trace_print_func hex; + trace_print_func binary; +}; + +extern int register_ftrace_event(struct trace_event *event); +extern int unregister_ftrace_event(struct trace_event *event); + +/* Return values for print_line callback */ +enum print_line_t { + TRACE_TYPE_PARTIAL_LINE = 0, /* Retry after flushing the seq */ + TRACE_TYPE_HANDLED = 1, + TRACE_TYPE_UNHANDLED = 2, /* Relay to other output functions */ + TRACE_TYPE_NO_CONSUME = 3 /* Handled but ask to not consume */ +}; + + +struct ring_buffer_event * +trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, + unsigned long flags, int pc); +void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, + unsigned long flags, int pc); +void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, + unsigned long flags, int pc); +void trace_current_buffer_discard_commit(struct ring_buffer_event *event); + +void tracing_record_cmdline(struct task_struct *tsk); + +struct ftrace_event_call { + char *name; + char *system; + struct dentry *dir; + int enabled; + int (*regfunc)(void); + void (*unregfunc)(void); + int id; + int (*raw_init)(void); + int (*show_format)(struct trace_seq *s); + int (*define_fields)(void); + struct list_head fields; + int n_preds; + struct filter_pred **preds; + +#ifdef CONFIG_EVENT_PROFILE + atomic_t profile_count; + int (*profile_enable)(struct ftrace_event_call *); + void (*profile_disable)(struct ftrace_event_call *); +#endif +}; + +#define MAX_FILTER_PRED 8 +#define MAX_FILTER_STR_VAL 128 + +extern int init_preds(struct ftrace_event_call *call); +extern int filter_match_preds(struct ftrace_event_call *call, void *rec); +extern int filter_current_check_discard(struct ftrace_event_call *call, + void *rec, + struct ring_buffer_event *event); + 
+extern int trace_define_field(struct ftrace_event_call *call, char *type, + char *name, int offset, int size); + + +/* + * The double __builtin_constant_p is because gcc will give us an error + * if we try to allocate the static variable to fmt if it is not a + * constant. Even with the outer if statement optimizing out. + */ +#define event_trace_printk(ip, fmt, args...) \ +do { \ + __trace_printk_check_format(fmt, ##args); \ + tracing_record_cmdline(current); \ + if (__builtin_constant_p(fmt)) { \ + static const char *trace_printk_fmt \ + __attribute__((section("__trace_printk_fmt"))) = \ + __builtin_constant_p(fmt) ? fmt : NULL; \ + \ + __trace_bprintk(ip, trace_printk_fmt, ##args); \ + } else \ + __trace_printk(ip, fmt, ##args); \ +} while (0) + +#define __common_field(type, item) \ + ret = trace_define_field(event_call, #type, "common_" #item, \ + offsetof(typeof(field.ent), item), \ + sizeof(field.ent.item)); \ + if (ret) \ + return ret; + +#endif /* _LINUX_FTRACE_EVENT_H */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 1882846b7389..6bcdf4af9b2d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -13,6 +13,7 @@ #include #include +#include enum trace_type { __TRACE_FIRST_TYPE = 0, @@ -43,20 +44,6 @@ enum trace_type { __TRACE_LAST_TYPE, }; -/* - * The trace entry - the most basic unit of tracing. This is what - * is printed in the end as a single line in the trace output, such as: - * - * bash-15816 [01] 235.197585: idle_cpu <- irq_enter - */ -struct trace_entry { - unsigned char type; - unsigned char flags; - unsigned char preempt_count; - int pid; - int tgid; -}; - /* * Function trace entry - function address and parent function addres: */ @@ -265,8 +252,6 @@ struct trace_array_cpu { char comm[TASK_COMM_LEN]; }; -struct trace_iterator; - /* * The trace array - an array of per-CPU trace arrays. This is the * highest level data structure that individual tracers deal with. @@ -341,15 +326,6 @@ extern void __ftrace_bad_type(void); __ftrace_bad_type(); \ } while (0) -/* Return values for print_line callback */ -enum print_line_t { - TRACE_TYPE_PARTIAL_LINE = 0, /* Retry after flushing the seq */ - TRACE_TYPE_HANDLED = 1, - TRACE_TYPE_UNHANDLED = 2, /* Relay to other output functions */ - TRACE_TYPE_NO_CONSUME = 3 /* Handled but ask to not consume */ -}; - - /* * An option specific to a tracer. This is a boolean value. 
* The bit is the bit index that sets its value on the @@ -428,31 +404,6 @@ struct tracer { #define TRACE_PIPE_ALL_CPU -1 -/* - * Trace iterator - used by printout routines who present trace - * results to users and which routines might sleep, etc: - */ -struct trace_iterator { - struct trace_array *tr; - struct tracer *trace; - void *private; - int cpu_file; - struct mutex mutex; - struct ring_buffer_iter *buffer_iter[NR_CPUS]; - - /* The below is zeroed out in pipe_read */ - struct trace_seq seq; - struct trace_entry *ent; - int cpu; - u64 ts; - - unsigned long iter_flags; - loff_t pos; - long idx; - - cpumask_var_t started; -}; - int tracer_init(struct tracer *t, struct trace_array *tr); int tracing_is_enabled(void); void trace_wake_up(void); @@ -479,15 +430,6 @@ void trace_buffer_unlock_commit(struct trace_array *tr, struct ring_buffer_event *event, unsigned long flags, int pc); -struct ring_buffer_event * -trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, - unsigned long flags, int pc); -void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, - unsigned long flags, int pc); -void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, - unsigned long flags, int pc); -void trace_current_buffer_discard_commit(struct ring_buffer_event *event); - struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data); @@ -510,7 +452,6 @@ void tracing_sched_switch_trace(struct trace_array *tr, struct task_struct *prev, struct task_struct *next, unsigned long flags, int pc); -void tracing_record_cmdline(struct task_struct *tsk); void tracing_sched_wakeup_trace(struct trace_array *tr, struct task_struct *wakee, @@ -790,28 +731,6 @@ struct ftrace_event_field { int size; }; -struct ftrace_event_call { - char *name; - char *system; - struct dentry *dir; - int enabled; - int (*regfunc)(void); - void (*unregfunc)(void); - int id; - int (*raw_init)(void); - int (*show_format)(struct trace_seq *s); - int (*define_fields)(void); - struct list_head fields; - int n_preds; - struct filter_pred **preds; - -#ifdef CONFIG_EVENT_PROFILE - atomic_t profile_count; - int (*profile_enable)(struct ftrace_event_call *); - void (*profile_disable)(struct ftrace_event_call *); -#endif -}; - struct event_subsystem { struct list_head list; const char *name; @@ -825,9 +744,6 @@ struct event_subsystem { (unsigned long)event < (unsigned long)__stop_ftrace_events; \ event++) -#define MAX_FILTER_PRED 8 -#define MAX_FILTER_STR_VAL 128 - struct filter_pred; typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); @@ -845,9 +761,6 @@ struct filter_pred { int clear; }; -int trace_define_field(struct ftrace_event_call *call, char *type, - char *name, int offset, int size); -extern int init_preds(struct ftrace_event_call *call); extern void filter_free_pred(struct filter_pred *pred); extern void filter_print_preds(struct filter_pred **preds, int n_preds, struct trace_seq *s); @@ -855,13 +768,9 @@ extern int filter_parse(char **pbuf, struct filter_pred *pred); extern int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred); extern void filter_disable_preds(struct ftrace_event_call *call); -extern int filter_match_preds(struct ftrace_event_call *call, void *rec); extern void filter_free_subsystem_preds(struct event_subsystem *system); extern int filter_add_subsystem_pred(struct event_subsystem *system, struct filter_pred *pred); -extern int filter_current_check_discard(struct ftrace_event_call *call, - void *rec, - 
struct ring_buffer_event *event); static inline int filter_check_discard(struct ftrace_event_call *call, void *rec, @@ -876,14 +785,6 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, return 0; } -#define __common_field(type, item) \ - ret = trace_define_field(event_call, #type, "common_" #item, \ - offsetof(typeof(field.ent), item), \ - sizeof(field.ent.item)); \ - if (ret) \ - return ret; - -void event_trace_printk(unsigned long ip, const char *fmt, ...); extern struct ftrace_event_call __start_ftrace_events[]; extern struct ftrace_event_call __stop_ftrace_events[]; @@ -895,25 +796,6 @@ extern struct ftrace_event_call __stop_ftrace_events[]; extern const char *__start___trace_bprintk_fmt[]; extern const char *__stop___trace_bprintk_fmt[]; -/* - * The double __builtin_constant_p is because gcc will give us an error - * if we try to allocate the static variable to fmt if it is not a - * constant. Even with the outer if statement optimizing out. - */ -#define event_trace_printk(ip, fmt, args...) \ -do { \ - __trace_printk_check_format(fmt, ##args); \ - tracing_record_cmdline(current); \ - if (__builtin_constant_p(fmt)) { \ - static const char *trace_printk_fmt \ - __attribute__((section("__trace_printk_fmt"))) = \ - __builtin_constant_p(fmt) ? fmt : NULL; \ - \ - __trace_bprintk(ip, trace_printk_fmt, ##args); \ - } else \ - __trace_printk(ip, fmt, ##args); \ -} while (0) - #undef TRACE_EVENT_FORMAT #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ extern struct ftrace_event_call event_##call; diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 5c7cbfb65c71..6e220a8e5706 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -4,18 +4,6 @@ #include #include "trace.h" -typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter, - int flags); - -struct trace_event { - struct hlist_node node; - int type; - trace_print_func trace; - trace_print_func raw; - trace_print_func hex; - trace_print_func binary; -}; - extern enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter); extern enum print_line_t @@ -33,8 +21,6 @@ extern int trace_print_context(struct trace_iterator *iter); extern int trace_print_lat_context(struct trace_iterator *iter); extern struct trace_event *ftrace_find_event(int type); -extern int register_ftrace_event(struct trace_event *event); -extern int unregister_ftrace_event(struct trace_event *event); extern enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags); -- cgit v1.2.2 From f42c85e74faa422cf0bc747ed808681145448f88 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 13 Apr 2009 12:25:37 -0400 Subject: tracing/events: move the ftrace event tracing code to core This patch moves the ftrace creation into include/trace/ftrace.h and simplifies the work of developers in adding new tracepoints. Just the act of creating the trace points in include/trace and including define_trace.h will create the events in the debugfs/tracing/events directory. 
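To make that concrete, a tracepoint header under this scheme takes the shape below. This is a hedged sketch, not part of the patch: the system name "mysubsys", the event "my_event" and the linux/tracepoint.h include are placeholders/assumptions; the TRACE_HEADER_MULTI_READ guard and the trailing define_trace.h include mirror what this series does to trace/sched.h and trace/skb.h.

    #if !defined(_TRACE_MYSUBSYS_H) || defined(TRACE_HEADER_MULTI_READ)
    #define _TRACE_MYSUBSYS_H

    #include <linux/tracepoint.h>	/* assumed; provides TRACE_EVENT() */

    #undef TRACE_SYSTEM
    #define TRACE_SYSTEM mysubsys

    TRACE_EVENT(my_event,

    	TP_PROTO(int arg),

    	TP_ARGS(arg),

    	TP_STRUCT__entry(
    		__field(	int,	arg	)
    	),

    	TP_fast_assign(
    		__entry->arg = arg;
    	),

    	TP_printk("arg=%d", __entry->arg)
    );

    #endif /* _TRACE_MYSUBSYS_H */

    /* This part must be outside protection */
    #include <trace/define_trace.h>
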
This patch removes the need of include/trace/trace_events.h Signed-off-by: Steven Rostedt --- include/trace/define_trace.h | 4 + include/trace/ftrace.h | 492 ++++++++++++++++++++++++++++++++++++ include/trace/trace_events.h | 7 - kernel/trace/Makefile | 1 - kernel/trace/events.c | 15 -- kernel/trace/trace_events_stage_1.h | 39 --- kernel/trace/trace_events_stage_2.h | 170 ------------- kernel/trace/trace_events_stage_3.h | 279 -------------------- 8 files changed, 496 insertions(+), 511 deletions(-) create mode 100644 include/trace/ftrace.h delete mode 100644 include/trace/trace_events.h delete mode 100644 kernel/trace/events.c delete mode 100644 kernel/trace/trace_events_stage_1.h delete mode 100644 kernel/trace/trace_events_stage_2.h delete mode 100644 kernel/trace/trace_events_stage_3.h diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h index de9dc7d8508b..980eb66a6e38 100644 --- a/include/trace/define_trace.h +++ b/include/trace/define_trace.h @@ -56,6 +56,10 @@ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) +#ifdef CONFIG_EVENT_TRACER +#include +#endif + #undef TRACE_HEADER_MULTI_READ /* Only undef what we defined in this file */ diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h new file mode 100644 index 000000000000..955b967acd74 --- /dev/null +++ b/include/trace/ftrace.h @@ -0,0 +1,492 @@ +/* + * Stage 1 of the trace events. + * + * Override the macros in to include the following: + * + * struct ftrace_raw_ { + * struct trace_entry ent; + * ; + * []; + * [...] + * }; + * + * The is created by the __field(type, item) macro or + * the __array(type2, item2, len) macro. + * We simply do "type item;", and that will create the fields + * in the structure. + */ + +#include + +#undef TRACE_FORMAT +#define TRACE_FORMAT(call, proto, args, fmt) + +#undef __array +#define __array(type, item, len) type item[len]; + +#undef __field +#define __field(type, item) type item; + +#undef TP_STRUCT__entry +#define TP_STRUCT__entry(args...) args + +#undef TRACE_EVENT +#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \ + struct ftrace_raw_##name { \ + struct trace_entry ent; \ + tstruct \ + }; \ + static struct ftrace_event_call event_##name + +#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) + +/* + * Stage 2 of the trace events. + * + * Override the macros in to include the following: + * + * enum print_line_t + * ftrace_raw_output_(struct trace_iterator *iter, int flags) + * { + * struct trace_seq *s = &iter->seq; + * struct ftrace_raw_ *field; <-- defined in stage 1 + * struct trace_entry *entry; + * int ret; + * + * entry = iter->ent; + * + * if (entry->type != event_.id) { + * WARN_ON_ONCE(1); + * return TRACE_TYPE_UNHANDLED; + * } + * + * field = (typeof(field))entry; + * + * ret = trace_seq_printf(s, "\n"); + * if (!ret) + * return TRACE_TYPE_PARTIAL_LINE; + * + * return TRACE_TYPE_HANDLED; + * } + * + * This is the method used to print the raw event to the trace + * output format. Note, this is not needed if the data is read + * in binary. + */ + +#undef __entry +#define __entry field + +#undef TP_printk +#define TP_printk(fmt, args...) 
fmt "\n", args + +#undef TRACE_EVENT +#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ +enum print_line_t \ +ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ +{ \ + struct trace_seq *s = &iter->seq; \ + struct ftrace_raw_##call *field; \ + struct trace_entry *entry; \ + int ret; \ + \ + entry = iter->ent; \ + \ + if (entry->type != event_##call.id) { \ + WARN_ON_ONCE(1); \ + return TRACE_TYPE_UNHANDLED; \ + } \ + \ + field = (typeof(field))entry; \ + \ + ret = trace_seq_printf(s, #call ": " print); \ + if (!ret) \ + return TRACE_TYPE_PARTIAL_LINE; \ + \ + return TRACE_TYPE_HANDLED; \ +} + +#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) + +/* + * Setup the showing format of trace point. + * + * int + * ftrace_format_##call(struct trace_seq *s) + * { + * struct ftrace_raw_##call field; + * int ret; + * + * ret = trace_seq_printf(s, #type " " #item ";" + * " offset:%u; size:%u;\n", + * offsetof(struct ftrace_raw_##call, item), + * sizeof(field.type)); + * + * } + */ + +#undef TP_STRUCT__entry +#define TP_STRUCT__entry(args...) args + +#undef __field +#define __field(type, item) \ + ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ + "offset:%u;\tsize:%u;\n", \ + (unsigned int)offsetof(typeof(field), item), \ + (unsigned int)sizeof(field.item)); \ + if (!ret) \ + return 0; + +#undef __array +#define __array(type, item, len) \ + ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ + "offset:%u;\tsize:%u;\n", \ + (unsigned int)offsetof(typeof(field), item), \ + (unsigned int)sizeof(field.item)); \ + if (!ret) \ + return 0; + +#undef __entry +#define __entry REC + +#undef TP_printk +#define TP_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args) + +#undef TP_fast_assign +#define TP_fast_assign(args...) args + +#undef TRACE_EVENT +#define TRACE_EVENT(call, proto, args, tstruct, func, print) \ +static int \ +ftrace_format_##call(struct trace_seq *s) \ +{ \ + struct ftrace_raw_##call field; \ + int ret; \ + \ + tstruct; \ + \ + trace_seq_printf(s, "\nprint fmt: " print); \ + \ + return ret; \ +} + +#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) + +#undef __field +#define __field(type, item) \ + ret = trace_define_field(event_call, #type, #item, \ + offsetof(typeof(field), item), \ + sizeof(field.item)); \ + if (ret) \ + return ret; + +#undef __array +#define __array(type, item, len) \ + BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ + ret = trace_define_field(event_call, #type "[" #len "]", #item, \ + offsetof(typeof(field), item), \ + sizeof(field.item)); \ + if (ret) \ + return ret; + +#undef TRACE_EVENT +#define TRACE_EVENT(call, proto, args, tstruct, func, print) \ +int \ +ftrace_define_fields_##call(void) \ +{ \ + struct ftrace_raw_##call field; \ + struct ftrace_event_call *event_call = &event_##call; \ + int ret; \ + \ + __common_field(unsigned char, type); \ + __common_field(unsigned char, flags); \ + __common_field(unsigned char, preempt_count); \ + __common_field(int, pid); \ + __common_field(int, tgid); \ + \ + tstruct; \ + \ + return ret; \ +} + +#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) + +/* + * Stage 3 of the trace events. 
+ * + * Override the macros in to include the following: + * + * static void ftrace_event_(proto) + * { + * event_trace_printk(_RET_IP_, ": " ); + * } + * + * static int ftrace_reg_event_(void) + * { + * int ret; + * + * ret = register_trace_(ftrace_event_); + * if (!ret) + * pr_info("event trace: Could not activate trace point " + * "probe to "); + * return ret; + * } + * + * static void ftrace_unreg_event_(void) + * { + * unregister_trace_(ftrace_event_); + * } + * + * For those macros defined with TRACE_FORMAT: + * + * static struct ftrace_event_call __used + * __attribute__((__aligned__(4))) + * __attribute__((section("_ftrace_events"))) event_ = { + * .name = "", + * .regfunc = ftrace_reg_event_, + * .unregfunc = ftrace_unreg_event_, + * } + * + * + * For those macros defined with TRACE_EVENT: + * + * static struct ftrace_event_call event_; + * + * static void ftrace_raw_event_(proto) + * { + * struct ring_buffer_event *event; + * struct ftrace_raw_ *entry; <-- defined in stage 1 + * unsigned long irq_flags; + * int pc; + * + * local_save_flags(irq_flags); + * pc = preempt_count(); + * + * event = trace_current_buffer_lock_reserve(event_.id, + * sizeof(struct ftrace_raw_), + * irq_flags, pc); + * if (!event) + * return; + * entry = ring_buffer_event_data(event); + * + * ; <-- Here we assign the entries by the __field and + * __array macros. + * + * trace_current_buffer_unlock_commit(event, irq_flags, pc); + * } + * + * static int ftrace_raw_reg_event_(void) + * { + * int ret; + * + * ret = register_trace_(ftrace_raw_event_); + * if (!ret) + * pr_info("event trace: Could not activate trace point " + * "probe to "); + * return ret; + * } + * + * static void ftrace_unreg_event_(void) + * { + * unregister_trace_(ftrace_raw_event_); + * } + * + * static struct trace_event ftrace_event_type_ = { + * .trace = ftrace_raw_output_, <-- stage 2 + * }; + * + * static int ftrace_raw_init_event_(void) + * { + * int id; + * + * id = register_ftrace_event(&ftrace_event_type_); + * if (!id) + * return -ENODEV; + * event_.id = id; + * return 0; + * } + * + * static struct ftrace_event_call __used + * __attribute__((__aligned__(4))) + * __attribute__((section("_ftrace_events"))) event_ = { + * .name = "", + * .system = "", + * .raw_init = ftrace_raw_init_event_, + * .regfunc = ftrace_reg_event_, + * .unregfunc = ftrace_unreg_event_, + * .show_format = ftrace_format_, + * } + * + */ + +#undef TP_FMT +#define TP_FMT(fmt, args...) 
fmt "\n", ##args + +#ifdef CONFIG_EVENT_PROFILE +#define _TRACE_PROFILE(call, proto, args) \ +static void ftrace_profile_##call(proto) \ +{ \ + extern void perf_tpcounter_event(int); \ + perf_tpcounter_event(event_##call.id); \ +} \ + \ +static int ftrace_profile_enable_##call(struct ftrace_event_call *call) \ +{ \ + int ret = 0; \ + \ + if (!atomic_inc_return(&call->profile_count)) \ + ret = register_trace_##call(ftrace_profile_##call); \ + \ + return ret; \ +} \ + \ +static void ftrace_profile_disable_##call(struct ftrace_event_call *call) \ +{ \ + if (atomic_add_negative(-1, &call->profile_count)) \ + unregister_trace_##call(ftrace_profile_##call); \ +} + +#define _TRACE_PROFILE_INIT(call) \ + .profile_count = ATOMIC_INIT(-1), \ + .profile_enable = ftrace_profile_enable_##call, \ + .profile_disable = ftrace_profile_disable_##call, + +#else +#define _TRACE_PROFILE(call, proto, args) +#define _TRACE_PROFILE_INIT(call) +#endif + +#define _TRACE_FORMAT(call, proto, args, fmt) \ +static void ftrace_event_##call(proto) \ +{ \ + event_trace_printk(_RET_IP_, #call ": " fmt); \ +} \ + \ +static int ftrace_reg_event_##call(void) \ +{ \ + int ret; \ + \ + ret = register_trace_##call(ftrace_event_##call); \ + if (ret) \ + pr_info("event trace: Could not activate trace point " \ + "probe to " #call "\n"); \ + return ret; \ +} \ + \ +static void ftrace_unreg_event_##call(void) \ +{ \ + unregister_trace_##call(ftrace_event_##call); \ +} \ + \ +static struct ftrace_event_call event_##call; \ + \ +static int ftrace_init_event_##call(void) \ +{ \ + int id; \ + \ + id = register_ftrace_event(NULL); \ + if (!id) \ + return -ENODEV; \ + event_##call.id = id; \ + return 0; \ +} + +#undef TRACE_FORMAT +#define TRACE_FORMAT(call, proto, args, fmt) \ +_TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt)) \ +_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \ +static struct ftrace_event_call __used \ +__attribute__((__aligned__(4))) \ +__attribute__((section("_ftrace_events"))) event_##call = { \ + .name = #call, \ + .system = __stringify(TRACE_SYSTEM), \ + .raw_init = ftrace_init_event_##call, \ + .regfunc = ftrace_reg_event_##call, \ + .unregfunc = ftrace_unreg_event_##call, \ + _TRACE_PROFILE_INIT(call) \ +} + +#undef __entry +#define __entry entry + +#undef TRACE_EVENT +#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ +_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \ + \ +static struct ftrace_event_call event_##call; \ + \ +static void ftrace_raw_event_##call(proto) \ +{ \ + struct ftrace_event_call *call = &event_##call; \ + struct ring_buffer_event *event; \ + struct ftrace_raw_##call *entry; \ + unsigned long irq_flags; \ + int pc; \ + \ + local_save_flags(irq_flags); \ + pc = preempt_count(); \ + \ + event = trace_current_buffer_lock_reserve(event_##call.id, \ + sizeof(struct ftrace_raw_##call), \ + irq_flags, pc); \ + if (!event) \ + return; \ + entry = ring_buffer_event_data(event); \ + \ + assign; \ + \ + if (!filter_current_check_discard(call, entry, event)) \ + trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \ +} \ + \ +static int ftrace_raw_reg_event_##call(void) \ +{ \ + int ret; \ + \ + ret = register_trace_##call(ftrace_raw_event_##call); \ + if (ret) \ + pr_info("event trace: Could not activate trace point " \ + "probe to " #call "\n"); \ + return ret; \ +} \ + \ +static void ftrace_raw_unreg_event_##call(void) \ +{ \ + unregister_trace_##call(ftrace_raw_event_##call); \ +} \ + \ +static struct trace_event ftrace_event_type_##call = { \ + .trace = 
ftrace_raw_output_##call, \ +}; \ + \ +static int ftrace_raw_init_event_##call(void) \ +{ \ + int id; \ + \ + id = register_ftrace_event(&ftrace_event_type_##call); \ + if (!id) \ + return -ENODEV; \ + event_##call.id = id; \ + INIT_LIST_HEAD(&event_##call.fields); \ + init_preds(&event_##call); \ + return 0; \ +} \ + \ +static struct ftrace_event_call __used \ +__attribute__((__aligned__(4))) \ +__attribute__((section("_ftrace_events"))) event_##call = { \ + .name = #call, \ + .system = __stringify(TRACE_SYSTEM), \ + .raw_init = ftrace_raw_init_event_##call, \ + .regfunc = ftrace_raw_reg_event_##call, \ + .unregfunc = ftrace_raw_unreg_event_##call, \ + .show_format = ftrace_format_##call, \ + .define_fields = ftrace_define_fields_##call, \ + _TRACE_PROFILE_INIT(call) \ +} + +#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) + +#undef _TRACE_PROFILE +#undef _TRACE_PROFILE_INIT + diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h deleted file mode 100644 index 13d6b85668cf..000000000000 --- a/include/trace/trace_events.h +++ /dev/null @@ -1,7 +0,0 @@ -/* trace/.h here */ - -#include -#include -#include -#include -#include diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 3ad367e7c97f..fb9d7f964898 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -41,7 +41,6 @@ obj-$(CONFIG_KMEMTRACE) += kmemtrace.o obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o obj-$(CONFIG_EVENT_TRACING) += trace_events.o -obj-$(CONFIG_EVENT_TRACER) += events.o obj-$(CONFIG_EVENT_TRACING) += trace_export.o obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o diff --git a/kernel/trace/events.c b/kernel/trace/events.c deleted file mode 100644 index 5a35a914f0e2..000000000000 --- a/kernel/trace/events.c +++ /dev/null @@ -1,15 +0,0 @@ -/* - * This is the place to register all trace points as events. - */ - -#include - -#include - -#include "trace_output.h" - -#define TRACE_HEADER_MULTI_READ -#include "trace_events_stage_1.h" -#include "trace_events_stage_2.h" -#include "trace_events_stage_3.h" - diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h deleted file mode 100644 index 475f46a047ae..000000000000 --- a/kernel/trace/trace_events_stage_1.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Stage 1 of the trace events. - * - * Override the macros in to include the following: - * - * struct ftrace_raw_ { - * struct trace_entry ent; - * ; - * []; - * [...] - * }; - * - * The is created by the __field(type, item) macro or - * the __array(type2, item2, len) macro. - * We simply do "type item;", and that will create the fields - * in the structure. - */ - -#undef TRACE_FORMAT -#define TRACE_FORMAT(call, proto, args, fmt) - -#undef __array -#define __array(type, item, len) type item[len]; - -#undef __field -#define __field(type, item) type item; - -#undef TP_STRUCT__entry -#define TP_STRUCT__entry(args...) args - -#undef TRACE_EVENT -#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \ - struct ftrace_raw_##name { \ - struct trace_entry ent; \ - tstruct \ - }; \ - static struct ftrace_event_call event_##name - -#include diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h deleted file mode 100644 index aa4a67a0656f..000000000000 --- a/kernel/trace/trace_events_stage_2.h +++ /dev/null @@ -1,170 +0,0 @@ -/* - * Stage 2 of the trace events. 
- * - * Override the macros in to include the following: - * - * enum print_line_t - * ftrace_raw_output_(struct trace_iterator *iter, int flags) - * { - * struct trace_seq *s = &iter->seq; - * struct ftrace_raw_ *field; <-- defined in stage 1 - * struct trace_entry *entry; - * int ret; - * - * entry = iter->ent; - * - * if (entry->type != event_.id) { - * WARN_ON_ONCE(1); - * return TRACE_TYPE_UNHANDLED; - * } - * - * field = (typeof(field))entry; - * - * ret = trace_seq_printf(s, "\n"); - * if (!ret) - * return TRACE_TYPE_PARTIAL_LINE; - * - * return TRACE_TYPE_HANDLED; - * } - * - * This is the method used to print the raw event to the trace - * output format. Note, this is not needed if the data is read - * in binary. - */ - -#undef __entry -#define __entry field - -#undef TP_printk -#define TP_printk(fmt, args...) fmt "\n", args - -#undef TRACE_EVENT -#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ -enum print_line_t \ -ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ -{ \ - struct trace_seq *s = &iter->seq; \ - struct ftrace_raw_##call *field; \ - struct trace_entry *entry; \ - int ret; \ - \ - entry = iter->ent; \ - \ - if (entry->type != event_##call.id) { \ - WARN_ON_ONCE(1); \ - return TRACE_TYPE_UNHANDLED; \ - } \ - \ - field = (typeof(field))entry; \ - \ - ret = trace_seq_printf(s, #call ": " print); \ - if (!ret) \ - return TRACE_TYPE_PARTIAL_LINE; \ - \ - return TRACE_TYPE_HANDLED; \ -} - -#include - -/* - * Setup the showing format of trace point. - * - * int - * ftrace_format_##call(struct trace_seq *s) - * { - * struct ftrace_raw_##call field; - * int ret; - * - * ret = trace_seq_printf(s, #type " " #item ";" - * " offset:%u; size:%u;\n", - * offsetof(struct ftrace_raw_##call, item), - * sizeof(field.type)); - * - * } - */ - -#undef TP_STRUCT__entry -#define TP_STRUCT__entry(args...) args - -#undef __field -#define __field(type, item) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%u;\tsize:%u;\n", \ - (unsigned int)offsetof(typeof(field), item), \ - (unsigned int)sizeof(field.item)); \ - if (!ret) \ - return 0; - -#undef __array -#define __array(type, item, len) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ - "offset:%u;\tsize:%u;\n", \ - (unsigned int)offsetof(typeof(field), item), \ - (unsigned int)sizeof(field.item)); \ - if (!ret) \ - return 0; - -#undef __entry -#define __entry REC - -#undef TP_printk -#define TP_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args) - -#undef TP_fast_assign -#define TP_fast_assign(args...) 
args - -#undef TRACE_EVENT -#define TRACE_EVENT(call, proto, args, tstruct, func, print) \ -static int \ -ftrace_format_##call(struct trace_seq *s) \ -{ \ - struct ftrace_raw_##call field; \ - int ret; \ - \ - tstruct; \ - \ - trace_seq_printf(s, "\nprint fmt: " print); \ - \ - return ret; \ -} - -#include - -#undef __field -#define __field(type, item) \ - ret = trace_define_field(event_call, #type, #item, \ - offsetof(typeof(field), item), \ - sizeof(field.item)); \ - if (ret) \ - return ret; - -#undef __array -#define __array(type, item, len) \ - BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ - ret = trace_define_field(event_call, #type "[" #len "]", #item, \ - offsetof(typeof(field), item), \ - sizeof(field.item)); \ - if (ret) \ - return ret; - -#undef TRACE_EVENT -#define TRACE_EVENT(call, proto, args, tstruct, func, print) \ -int \ -ftrace_define_fields_##call(void) \ -{ \ - struct ftrace_raw_##call field; \ - struct ftrace_event_call *event_call = &event_##call; \ - int ret; \ - \ - __common_field(unsigned char, type); \ - __common_field(unsigned char, flags); \ - __common_field(unsigned char, preempt_count); \ - __common_field(int, pid); \ - __common_field(int, tgid); \ - \ - tstruct; \ - \ - return ret; \ -} - -#include diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h deleted file mode 100644 index 45c04e1f38db..000000000000 --- a/kernel/trace/trace_events_stage_3.h +++ /dev/null @@ -1,279 +0,0 @@ -/* - * Stage 3 of the trace events. - * - * Override the macros in to include the following: - * - * static void ftrace_event_(proto) - * { - * event_trace_printk(_RET_IP_, ": " ); - * } - * - * static int ftrace_reg_event_(void) - * { - * int ret; - * - * ret = register_trace_(ftrace_event_); - * if (!ret) - * pr_info("event trace: Could not activate trace point " - * "probe to "); - * return ret; - * } - * - * static void ftrace_unreg_event_(void) - * { - * unregister_trace_(ftrace_event_); - * } - * - * For those macros defined with TRACE_FORMAT: - * - * static struct ftrace_event_call __used - * __attribute__((__aligned__(4))) - * __attribute__((section("_ftrace_events"))) event_ = { - * .name = "", - * .regfunc = ftrace_reg_event_, - * .unregfunc = ftrace_unreg_event_, - * } - * - * - * For those macros defined with TRACE_EVENT: - * - * static struct ftrace_event_call event_; - * - * static void ftrace_raw_event_(proto) - * { - * struct ring_buffer_event *event; - * struct ftrace_raw_ *entry; <-- defined in stage 1 - * unsigned long irq_flags; - * int pc; - * - * local_save_flags(irq_flags); - * pc = preempt_count(); - * - * event = trace_current_buffer_lock_reserve(event_.id, - * sizeof(struct ftrace_raw_), - * irq_flags, pc); - * if (!event) - * return; - * entry = ring_buffer_event_data(event); - * - * ; <-- Here we assign the entries by the __field and - * __array macros. 
- * - * trace_current_buffer_unlock_commit(event, irq_flags, pc); - * } - * - * static int ftrace_raw_reg_event_(void) - * { - * int ret; - * - * ret = register_trace_(ftrace_raw_event_); - * if (!ret) - * pr_info("event trace: Could not activate trace point " - * "probe to "); - * return ret; - * } - * - * static void ftrace_unreg_event_(void) - * { - * unregister_trace_(ftrace_raw_event_); - * } - * - * static struct trace_event ftrace_event_type_ = { - * .trace = ftrace_raw_output_, <-- stage 2 - * }; - * - * static int ftrace_raw_init_event_(void) - * { - * int id; - * - * id = register_ftrace_event(&ftrace_event_type_); - * if (!id) - * return -ENODEV; - * event_.id = id; - * return 0; - * } - * - * static struct ftrace_event_call __used - * __attribute__((__aligned__(4))) - * __attribute__((section("_ftrace_events"))) event_ = { - * .name = "", - * .system = "", - * .raw_init = ftrace_raw_init_event_, - * .regfunc = ftrace_reg_event_, - * .unregfunc = ftrace_unreg_event_, - * .show_format = ftrace_format_, - * } - * - */ - -#undef TP_FMT -#define TP_FMT(fmt, args...) fmt "\n", ##args - -#ifdef CONFIG_EVENT_PROFILE -#define _TRACE_PROFILE(call, proto, args) \ -static void ftrace_profile_##call(proto) \ -{ \ - extern void perf_tpcounter_event(int); \ - perf_tpcounter_event(event_##call.id); \ -} \ - \ -static int ftrace_profile_enable_##call(struct ftrace_event_call *call) \ -{ \ - int ret = 0; \ - \ - if (!atomic_inc_return(&call->profile_count)) \ - ret = register_trace_##call(ftrace_profile_##call); \ - \ - return ret; \ -} \ - \ -static void ftrace_profile_disable_##call(struct ftrace_event_call *call) \ -{ \ - if (atomic_add_negative(-1, &call->profile_count)) \ - unregister_trace_##call(ftrace_profile_##call); \ -} - -#define _TRACE_PROFILE_INIT(call) \ - .profile_count = ATOMIC_INIT(-1), \ - .profile_enable = ftrace_profile_enable_##call, \ - .profile_disable = ftrace_profile_disable_##call, - -#else -#define _TRACE_PROFILE(call, proto, args) -#define _TRACE_PROFILE_INIT(call) -#endif - -#define _TRACE_FORMAT(call, proto, args, fmt) \ -static void ftrace_event_##call(proto) \ -{ \ - event_trace_printk(_RET_IP_, #call ": " fmt); \ -} \ - \ -static int ftrace_reg_event_##call(void) \ -{ \ - int ret; \ - \ - ret = register_trace_##call(ftrace_event_##call); \ - if (ret) \ - pr_info("event trace: Could not activate trace point " \ - "probe to " #call "\n"); \ - return ret; \ -} \ - \ -static void ftrace_unreg_event_##call(void) \ -{ \ - unregister_trace_##call(ftrace_event_##call); \ -} \ - \ -static struct ftrace_event_call event_##call; \ - \ -static int ftrace_init_event_##call(void) \ -{ \ - int id; \ - \ - id = register_ftrace_event(NULL); \ - if (!id) \ - return -ENODEV; \ - event_##call.id = id; \ - return 0; \ -} - -#undef TRACE_FORMAT -#define TRACE_FORMAT(call, proto, args, fmt) \ -_TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt)) \ -_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \ -static struct ftrace_event_call __used \ -__attribute__((__aligned__(4))) \ -__attribute__((section("_ftrace_events"))) event_##call = { \ - .name = #call, \ - .system = __stringify(TRACE_SYSTEM), \ - .raw_init = ftrace_init_event_##call, \ - .regfunc = ftrace_reg_event_##call, \ - .unregfunc = ftrace_unreg_event_##call, \ - _TRACE_PROFILE_INIT(call) \ -} - -#undef __entry -#define __entry entry - -#undef TRACE_EVENT -#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ -_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \ - \ -static struct ftrace_event_call 
event_##call; \ - \ -static void ftrace_raw_event_##call(proto) \ -{ \ - struct ftrace_event_call *call = &event_##call; \ - struct ring_buffer_event *event; \ - struct ftrace_raw_##call *entry; \ - unsigned long irq_flags; \ - int pc; \ - \ - local_save_flags(irq_flags); \ - pc = preempt_count(); \ - \ - event = trace_current_buffer_lock_reserve(event_##call.id, \ - sizeof(struct ftrace_raw_##call), \ - irq_flags, pc); \ - if (!event) \ - return; \ - entry = ring_buffer_event_data(event); \ - \ - assign; \ - \ - if (!filter_current_check_discard(call, entry, event)) \ - trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \ -} \ - \ -static int ftrace_raw_reg_event_##call(void) \ -{ \ - int ret; \ - \ - ret = register_trace_##call(ftrace_raw_event_##call); \ - if (ret) \ - pr_info("event trace: Could not activate trace point " \ - "probe to " #call "\n"); \ - return ret; \ -} \ - \ -static void ftrace_raw_unreg_event_##call(void) \ -{ \ - unregister_trace_##call(ftrace_raw_event_##call); \ -} \ - \ -static struct trace_event ftrace_event_type_##call = { \ - .trace = ftrace_raw_output_##call, \ -}; \ - \ -static int ftrace_raw_init_event_##call(void) \ -{ \ - int id; \ - \ - id = register_ftrace_event(&ftrace_event_type_##call); \ - if (!id) \ - return -ENODEV; \ - event_##call.id = id; \ - INIT_LIST_HEAD(&event_##call.fields); \ - init_preds(&event_##call); \ - return 0; \ -} \ - \ -static struct ftrace_event_call __used \ -__attribute__((__aligned__(4))) \ -__attribute__((section("_ftrace_events"))) event_##call = { \ - .name = #call, \ - .system = __stringify(TRACE_SYSTEM), \ - .raw_init = ftrace_raw_init_event_##call, \ - .regfunc = ftrace_raw_reg_event_##call, \ - .unregfunc = ftrace_raw_unreg_event_##call, \ - .show_format = ftrace_format_##call, \ - .define_fields = ftrace_define_fields_##call, \ - _TRACE_PROFILE_INIT(call) \ -} - -#include - -#undef _TRACE_PROFILE -#undef _TRACE_PROFILE_INIT - -- cgit v1.2.2 From a59fd6027218bd7c994e39d14afe0242f895144f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 10 Apr 2009 13:52:20 -0400 Subject: tracing/events: convert event call sites to use a link list Impact: makes it possible to define events in modules The events are created by reading down the section that they are linked in by the macros. But this is not scalable to modules. This patch converts the manipulations to use a global link list, and on boot up it adds the items in the section to the list. This change will allow modules to add their tracing events to the list as well. Note, this change alone does not permit modules to use the TRACE_EVENT macros, but the change is needed for them to eventually do so. 
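As a rough, self-contained sketch of the registration scheme this changelog describes (a userspace analogue with a hand-rolled intrusive list and illustrative names, not the kernel's <linux/list.h> API), each event carries a list node and is appended to one global list, so events that appear later, such as those from a module, can join the same list the core walked at boot:

/*
 * Userspace sketch: replace "walk a fixed linker-section array" with
 * "walk a global linked list that anyone can add to at runtime".
 */
#include <stdio.h>
#include <stddef.h>

struct list_head { struct list_head *next, *prev; };

#define LIST_HEAD_INIT(name) { &(name), &(name) }

static void list_add(struct list_head *new, struct list_head *head)
{
	/* insert 'new' right after 'head' */
	new->next = head->next;
	new->prev = head;
	head->next->prev = new;
	head->next = new;
}

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct event_call {
	struct list_head list;
	const char *name;
	int id;
};

static struct list_head events = LIST_HEAD_INIT(events);

/* built-in events: the analogue of the section contents added at boot */
static struct event_call builtin[] = {
	{ .name = "sched_switch",      .id = 1 },
	{ .name = "irq_handler_exit",  .id = 2 },
};

int main(void)
{
	struct list_head *p;
	size_t i;

	for (i = 0; i < sizeof(builtin) / sizeof(builtin[0]); i++)
		list_add(&builtin[i].list, &events);

	/* a "module" can now register its own event the same way, later */
	static struct event_call mod_event = { .name = "mymod_event", .id = 3 };
	list_add(&mod_event.list, &events);

	for (p = events.next; p != &events; p = p->next) {
		struct event_call *call = container_of(p, struct event_call, list);
		printf("%s (id %d)\n", call->name, call->id);
	}
	return 0;
}

The patch below does the kernel-side equivalent: it adds a list_head to struct ftrace_event_call, declares a global ftrace_events list, and converts the for_each_event()/events_for_each() section walks to list_for_each_entry().
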
Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 1 + kernel/trace/trace.h | 13 +--------- kernel/trace/trace_event_profile.c | 4 +-- kernel/trace/trace_events.c | 51 +++++++++++++++++++++++--------------- kernel/trace/trace_events_filter.c | 8 +++--- 5 files changed, 39 insertions(+), 38 deletions(-) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 496b76d9f9d8..17810853b4f8 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -83,6 +83,7 @@ void trace_current_buffer_discard_commit(struct ring_buffer_event *event); void tracing_record_cmdline(struct task_struct *tsk); struct ftrace_event_call { + struct list_head list; char *name; char *system; struct dentry *dir; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 6bcdf4af9b2d..8817c18ef97a 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -739,11 +739,6 @@ struct event_subsystem { struct filter_pred **preds; }; -#define events_for_each(event) \ - for (event = __start_ftrace_events; \ - (unsigned long)event < (unsigned long)__stop_ftrace_events; \ - event++) - struct filter_pred; typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); @@ -785,13 +780,7 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, return 0; } -extern struct ftrace_event_call __start_ftrace_events[]; -extern struct ftrace_event_call __stop_ftrace_events[]; - -#define for_each_event(event) \ - for (event = __start_ftrace_events; \ - (unsigned long)event < (unsigned long)__stop_ftrace_events; \ - event++) +extern struct list_head ftrace_events; extern const char *__start___trace_bprintk_fmt[]; extern const char *__stop___trace_bprintk_fmt[]; diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index 199de9c74229..7bf2ad65eee5 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -11,7 +11,7 @@ int ftrace_profile_enable(int event_id) { struct ftrace_event_call *event; - for_each_event(event) { + list_for_each_entry(event, &ftrace_events, list) { if (event->id == event_id) return event->profile_enable(event); } @@ -23,7 +23,7 @@ void ftrace_profile_disable(int event_id) { struct ftrace_event_call *event; - for_each_event(event) { + list_for_each_entry(event, &ftrace_events, list) { if (event->id == event_id) return event->profile_disable(event); } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index ead68ac99191..5c66aaff07c1 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -19,6 +19,8 @@ static DEFINE_MUTEX(event_mutex); +LIST_HEAD(ftrace_events); + int trace_define_field(struct ftrace_event_call *call, char *type, char *name, int offset, int size) { @@ -54,16 +56,14 @@ err: static void ftrace_clear_events(void) { - struct ftrace_event_call *call = (void *)__start_ftrace_events; - + struct ftrace_event_call *call; - while ((unsigned long)call < (unsigned long)__stop_ftrace_events) { + list_for_each_entry(call, &ftrace_events, list) { if (call->enabled) { call->enabled = 0; call->unregfunc(); } - call++; } } @@ -89,7 +89,7 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call, static int ftrace_set_clr_event(char *buf, int set) { - struct ftrace_event_call *call = __start_ftrace_events; + struct ftrace_event_call *call; char *event = NULL, *sub = NULL, *match; int ret = -EINVAL; @@ -118,7 +118,7 @@ static int ftrace_set_clr_event(char *buf, int set) } mutex_lock(&event_mutex); - for_each_event(call) { + 
list_for_each_entry(call, &ftrace_events, list) { if (!call->name || !call->regfunc) continue; @@ -224,15 +224,17 @@ ftrace_event_write(struct file *file, const char __user *ubuf, static void * t_next(struct seq_file *m, void *v, loff_t *pos) { - struct ftrace_event_call *call = m->private; - struct ftrace_event_call *next = call; + struct list_head *list = m->private; + struct ftrace_event_call *call; (*pos)++; for (;;) { - if ((unsigned long)call >= (unsigned long)__stop_ftrace_events) + if (list == &ftrace_events) return NULL; + call = list_entry(list, struct ftrace_event_call, list); + /* * The ftrace subsystem is for showing formats only. * They can not be enabled or disabled via the event files. @@ -240,11 +242,10 @@ t_next(struct seq_file *m, void *v, loff_t *pos) if (call->regfunc) break; - call++; - next = call; + list = list->next; } - m->private = ++next; + m->private = list->next; return call; } @@ -257,22 +258,23 @@ static void *t_start(struct seq_file *m, loff_t *pos) static void * s_next(struct seq_file *m, void *v, loff_t *pos) { - struct ftrace_event_call *call = m->private; - struct ftrace_event_call *next; + struct list_head *list = m->private; + struct ftrace_event_call *call; (*pos)++; retry: - if ((unsigned long)call >= (unsigned long)__stop_ftrace_events) + if (list == &ftrace_events) return NULL; + call = list_entry(list, struct ftrace_event_call, list); + if (!call->enabled) { - call++; + list = list->next; goto retry; } - next = call; - m->private = ++next; + m->private = list->next; return call; } @@ -312,7 +314,7 @@ ftrace_event_seq_open(struct inode *inode, struct file *file) if (!ret) { struct seq_file *m = file->private_data; - m->private = __start_ftrace_events; + m->private = ftrace_events.next; } return ret; } @@ -797,9 +799,17 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) return 0; } +extern struct ftrace_event_call __start_ftrace_events[]; +extern struct ftrace_event_call __stop_ftrace_events[]; + +#define for_each_event(event) \ + for (event = __start_ftrace_events; \ + (unsigned long)event < (unsigned long)__stop_ftrace_events; \ + event++) + static __init int event_trace_init(void) { - struct ftrace_event_call *call = __start_ftrace_events; + struct ftrace_event_call *call; struct dentry *d_tracer; struct dentry *entry; struct dentry *d_events; @@ -830,6 +840,7 @@ static __init int event_trace_init(void) /* The linker may leave blanks */ if (!call->name) continue; + list_add(&call->list, &ftrace_events); event_create_dir(call, d_events); } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index de42dad42a88..d30b06b02b4d 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -223,7 +223,7 @@ oom: void filter_free_subsystem_preds(struct event_subsystem *system) { - struct ftrace_event_call *call = __start_ftrace_events; + struct ftrace_event_call *call; int i; if (system->n_preds) { @@ -234,7 +234,7 @@ void filter_free_subsystem_preds(struct event_subsystem *system) system->n_preds = 0; } - events_for_each(call) { + list_for_each_entry(call, &ftrace_events, list) { if (!call->define_fields) continue; @@ -320,7 +320,7 @@ int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) int filter_add_subsystem_pred(struct event_subsystem *system, struct filter_pred *pred) { - struct ftrace_event_call *call = __start_ftrace_events; + struct ftrace_event_call *call; if (system->n_preds && !pred->compound) filter_free_subsystem_preds(system); 
@@ -337,7 +337,7 @@ int filter_add_subsystem_pred(struct event_subsystem *system, system->preds[system->n_preds] = pred; - events_for_each(call) { + list_for_each_entry(call, &ftrace_events, list) { int err; if (!call->define_fields) -- cgit v1.2.2 From 17c873ec280a03894bc718af817f7f24fa787ae1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 10 Apr 2009 18:12:50 -0400 Subject: tracing/events: add export symbols for trace events in modules Impact: let modules add trace events The trace event code requires some functions to be exported to allow modules to use TRACE_EVENT. This patch adds EXPORT_SYMBOL_GPL to the necessary functions. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 3 +++ kernel/trace/trace_events.c | 1 + kernel/trace/trace_events_filter.c | 2 ++ kernel/trace/trace_output.c | 3 +++ 4 files changed, 9 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c0047fcf7076..2d69b26b3cc9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -176,6 +176,7 @@ int filter_current_check_discard(struct ftrace_event_call *call, void *rec, { return filter_check_discard(call, rec, global_trace.buffer, event); } +EXPORT_SYMBOL_GPL(filter_current_check_discard); cycle_t ftrace_now(int cpu) { @@ -886,6 +887,7 @@ trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, return trace_buffer_lock_reserve(&global_trace, type, len, flags, pc); } +EXPORT_SYMBOL(trace_current_buffer_lock_reserve); void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, unsigned long flags, int pc) @@ -903,6 +905,7 @@ void trace_current_buffer_discard_commit(struct ring_buffer_event *event) { ring_buffer_discard_commit(global_trace.buffer, event); } +EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); void trace_function(struct trace_array *tr, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 5c66aaff07c1..8b9e621b80b4 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -53,6 +53,7 @@ err: return -ENOMEM; } +EXPORT_SYMBOL_GPL(trace_define_field); static void ftrace_clear_events(void) { diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index d30b06b02b4d..f8e5eab0424c 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -110,6 +110,7 @@ int filter_match_preds(struct ftrace_event_call *call, void *rec) return 1; } +EXPORT_SYMBOL_GPL(filter_match_preds); void filter_print_preds(struct filter_pred **preds, int n_preds, struct trace_seq *s) @@ -220,6 +221,7 @@ oom: return -ENOMEM; } +EXPORT_SYMBOL_GPL(init_preds); void filter_free_subsystem_preds(struct event_subsystem *system) { diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 0e70fb07ca78..83a8abb9640f 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -94,6 +94,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) 
return len; } +EXPORT_SYMBOL_GPL(trace_seq_printf); int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) { @@ -538,6 +539,7 @@ int register_ftrace_event(struct trace_event *event) return ret; } +EXPORT_SYMBOL_GPL(register_ftrace_event); /** * unregister_ftrace_event - remove a no longer used event @@ -551,6 +553,7 @@ int unregister_ftrace_event(struct trace_event *event) return 0; } +EXPORT_SYMBOL_GPL(unregister_ftrace_event); /* * Standard events -- cgit v1.2.2 From 6d723736e472f7a0cd5b62c84152fceead241328 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 10 Apr 2009 14:53:50 -0400 Subject: tracing/events: add support for modules to TRACE_EVENT Impact: allow modules to add TRACE_EVENTS on load This patch adds the final hooks to allow modules to use the TRACE_EVENT macro. A notifier and a data structure are used to link the TRACE_EVENTs defined in the module to connect them with the ftrace event tracing system. It also adds the necessary automated clean ups to the trace events when a module is removed. Cc: Rusty Russell Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 3 + include/linux/module.h | 4 ++ include/linux/trace_seq.h | 2 + include/trace/ftrace.h | 1 + kernel/module.c | 7 +++ kernel/trace/trace_events.c | 128 ++++++++++++++++++++++++++++++++----------- 6 files changed, 113 insertions(+), 32 deletions(-) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 17810853b4f8..75f3ac01a87c 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -7,6 +7,7 @@ struct trace_array; struct tracer; +struct dentry; /* * The trace entry - the most basic unit of tracing. This is what @@ -87,6 +88,7 @@ struct ftrace_event_call { char *name; char *system; struct dentry *dir; + struct trace_event *event; int enabled; int (*regfunc)(void); void (*unregfunc)(void); @@ -97,6 +99,7 @@ struct ftrace_event_call { struct list_head fields; int n_preds; struct filter_pred **preds; + void *mod; #ifdef CONFIG_EVENT_PROFILE atomic_t profile_count; diff --git a/include/linux/module.h b/include/linux/module.h index 627ac082e2a6..6155fa44168b 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -337,6 +337,10 @@ struct module const char **trace_bprintk_fmt_start; unsigned int num_trace_bprintk_fmt; #endif +#ifdef CONFIG_EVENT_TRACING + struct ftrace_event_call *trace_events; + unsigned int num_trace_events; +#endif #ifdef CONFIG_MODULE_UNLOAD /* What modules depend on me? */ diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index 28051da876dd..15ca2c71af13 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -1,6 +1,8 @@ #ifndef _LINUX_TRACE_SEQ_H #define _LINUX_TRACE_SEQ_H +#include + /* * Trace sequences are used to allow a function to call several other functions * to create a string of data to use (up to a max of PAGE_SIZE. 
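
A minimal, self-contained sketch of the module add/remove bookkeeping described in the changelog above (a userspace analogue; the names and the singly-linked list are illustrative, not the kernel API): on the load side each event is tagged with its owning module and linked into the global list, and on unload every event owned by that module is disabled and unlinked again, mirroring what trace_module_add_events()/trace_module_remove_events() do under the notifier.

#include <stdio.h>

struct event_call {
	struct event_call *next;
	const char *name;
	const void *mod;	/* owning module, NULL for built-in */
	int enabled;
};

static struct event_call *events;

/* MODULE_STATE_COMING: tag the module's events and add them to the list */
static void add_events(const void *mod, struct event_call *calls, int n)
{
	int i;

	for (i = 0; i < n; i++) {
		calls[i].mod = mod;
		calls[i].next = events;
		events = &calls[i];
	}
}

/* MODULE_STATE_GOING: disable and unlink everything the module owned */
static void remove_events(const void *mod)
{
	struct event_call **pp = &events;

	while (*pp) {
		if ((*pp)->mod == mod) {
			(*pp)->enabled = 0;	/* "unregister" it */
			*pp = (*pp)->next;	/* unlink */
		} else {
			pp = &(*pp)->next;
		}
	}
}

int main(void)
{
	static struct event_call core[] = { { .name = "sched_switch" } };
	static struct event_call mod[]  = { { .name = "mymod_event" } };
	struct event_call *c;
	int module_cookie;	/* only its address is used, as an identity token */

	add_events(NULL, core, 1);
	add_events(&module_cookie, mod, 1);	/* module loaded   */
	remove_events(&module_cookie);		/* module unloaded */

	for (c = events; c; c = c->next)
		printf("still registered: %s\n", c->name);
	return 0;
}
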
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 955b967acd74..60c5323bee64 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -477,6 +477,7 @@ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) event_##call = { \ .name = #call, \ .system = __stringify(TRACE_SYSTEM), \ + .event = &ftrace_event_type_##call, \ .raw_init = ftrace_raw_init_event_##call, \ .regfunc = ftrace_raw_reg_event_##call, \ .unregfunc = ftrace_raw_unreg_event_##call, \ diff --git a/kernel/module.c b/kernel/module.c index e797812a4d95..a0394706f10c 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -18,6 +18,7 @@ */ #include #include +#include #include #include #include @@ -2172,6 +2173,12 @@ static noinline struct module *load_module(void __user *umod, sizeof(*mod->tracepoints), &mod->num_tracepoints); #endif +#ifdef CONFIG_EVENT_TRACING + mod->trace_events = section_objs(hdr, sechdrs, secstrings, + "_ftrace_events", + sizeof(*mod->trace_events), + &mod->num_trace_events); +#endif #ifdef CONFIG_MODVERSIONS if ((mod->num_syms && !mod->crcs) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 8b9e621b80b4..a4b177720a6c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -713,7 +713,13 @@ event_subsystem_dir(const char *name, struct dentry *d_events) return d_events; } - system->name = name; + system->name = kstrdup(name, GFP_KERNEL); + if (!system->name) { + debugfs_remove(system->entry); + kfree(system); + return d_events; + } + list_add(&system->list, &event_subsystems); system->preds = NULL; @@ -738,7 +744,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) * If the trace point header did not define TRACE_SYSTEM * then the system would be called "TRACE_SYSTEM". 
*/ - if (strcmp(call->system, "TRACE_SYSTEM") != 0) + if (strcmp(call->system, TRACE_SYSTEM) != 0) d_events = event_subsystem_dir(call->system, d_events); if (call->raw_init) { @@ -757,21 +763,13 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) return -1; } - if (call->regfunc) { - entry = debugfs_create_file("enable", 0644, call->dir, call, - &ftrace_enable_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'%s/enable' entry\n", call->name); - } + if (call->regfunc) + entry = trace_create_file("enable", 0644, call->dir, call, + &ftrace_enable_fops); - if (call->id) { - entry = debugfs_create_file("id", 0444, call->dir, call, - &ftrace_event_id_fops); - if (!entry) - pr_warning("Could not create debugfs '%s/id' entry\n", - call->name); - } + if (call->id) + entry = trace_create_file("id", 0444, call->dir, call, + &ftrace_event_id_fops); if (call->define_fields) { ret = call->define_fields(); @@ -780,40 +778,102 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) " events/%s\n", call->name); return ret; } - entry = debugfs_create_file("filter", 0644, call->dir, call, - &ftrace_event_filter_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'%s/filter' entry\n", call->name); + entry = trace_create_file("filter", 0644, call->dir, call, + &ftrace_event_filter_fops); } /* A trace may not want to export its format */ if (!call->show_format) return 0; - entry = debugfs_create_file("format", 0444, call->dir, call, - &ftrace_event_format_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'%s/format' entry\n", call->name); + entry = trace_create_file("format", 0444, call->dir, call, + &ftrace_event_format_fops); + + return 0; +} + +#define for_each_event(event, start, end) \ + for (event = start; \ + (unsigned long)event < (unsigned long)end; \ + event++) + +static void trace_module_add_events(struct module *mod) +{ + struct ftrace_event_call *call, *start, *end; + struct dentry *d_events; + + start = mod->trace_events; + end = mod->trace_events + mod->num_trace_events; + + if (start == end) + return; + + d_events = event_trace_events_dir(); + if (!d_events) + return; + + for_each_event(call, start, end) { + /* The linker may leave blanks */ + if (!call->name) + continue; + call->mod = mod; + list_add(&call->list, &ftrace_events); + event_create_dir(call, d_events); + } +} + +static void trace_module_remove_events(struct module *mod) +{ + struct ftrace_event_call *call, *p; + + list_for_each_entry_safe(call, p, &ftrace_events, list) { + if (call->mod == mod) { + if (call->enabled) { + call->enabled = 0; + call->unregfunc(); + } + if (call->event) + unregister_ftrace_event(call->event); + debugfs_remove_recursive(call->dir); + list_del(&call->list); + } + } +} + +int trace_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + + mutex_lock(&event_mutex); + switch (val) { + case MODULE_STATE_COMING: + trace_module_add_events(mod); + break; + case MODULE_STATE_GOING: + trace_module_remove_events(mod); + break; + } + mutex_unlock(&event_mutex); return 0; } +struct notifier_block trace_module_nb = { + .notifier_call = trace_module_notify, + .priority = 0, +}; + extern struct ftrace_event_call __start_ftrace_events[]; extern struct ftrace_event_call __stop_ftrace_events[]; -#define for_each_event(event) \ - for (event = __start_ftrace_events; \ - (unsigned long)event < (unsigned long)__stop_ftrace_events; \ - event++) - static __init int 
event_trace_init(void) { struct ftrace_event_call *call; struct dentry *d_tracer; struct dentry *entry; struct dentry *d_events; + int ret; d_tracer = tracing_init_dentry(); if (!d_tracer) @@ -837,7 +897,7 @@ static __init int event_trace_init(void) if (!d_events) return 0; - for_each_event(call) { + for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { /* The linker may leave blanks */ if (!call->name) continue; @@ -845,6 +905,10 @@ static __init int event_trace_init(void) event_create_dir(call, d_events); } + ret = register_module_notifier(&trace_module_nb); + if (!ret) + pr_warning("Failed to register trace events module notifier\n"); + return 0; } fs_initcall(event_trace_init); -- cgit v1.2.2 From 61f919a12fbdc3fd20f980a34a118d597198a392 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 14 Apr 2009 18:22:32 -0400 Subject: tracing/events: fix compile for modules disabled Impact: compile fix The addition of TRACE_EVENT for modules breaks the build for when modules are disabled. This code fixes that. Reported-by: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index a4b177720a6c..6591d83e1e7a 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -797,6 +797,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) (unsigned long)event < (unsigned long)end; \ event++) +#ifdef CONFIG_MODULES static void trace_module_add_events(struct module *mod) { struct ftrace_event_call *call, *start, *end; @@ -840,8 +841,8 @@ static void trace_module_remove_events(struct module *mod) } } -int trace_module_notify(struct notifier_block *self, - unsigned long val, void *data) +static int trace_module_notify(struct notifier_block *self, + unsigned long val, void *data) { struct module *mod = data; @@ -858,6 +859,13 @@ int trace_module_notify(struct notifier_block *self, return 0; } +#else +static int trace_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + return 0; +} +#endif /* CONFIG_MODULES */ struct notifier_block trace_module_nb = { .notifier_call = trace_module_notify, -- cgit v1.2.2 From ecda8ae02a08ef065ff387f5cb2a2d4999da2408 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 14 Apr 2009 18:49:38 -0400 Subject: tracing/events: fix lockdep system name Impact: fix compile error of lockdep event tracer Ingo Molnar pointed out that the system name for the lockdep tracer was "lock" which is used to include the event trace file name. It should be "lockdep" Reported-by: Ingo Molnar Signed-off-by: Steven Rostedt --- include/trace/lockdep.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/trace/lockdep.h b/include/trace/lockdep.h index 4d301e758de3..45e326b5c7f3 100644 --- a/include/trace/lockdep.h +++ b/include/trace/lockdep.h @@ -5,7 +5,7 @@ #include #undef TRACE_SYSTEM -#define TRACE_SYSTEM lock +#define TRACE_SYSTEM lockdep #ifdef CONFIG_LOCKDEP -- cgit v1.2.2 From ad8d75fff811a6a230f7f43b05a6483099349533 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 14 Apr 2009 19:39:12 -0400 Subject: tracing/events: move trace point headers into include/trace/events Impact: clean up Create a sub directory in include/trace called events to keep the trace point headers in their own separate directory. Only headers that declare trace points should be defined in this directory. 
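Concretely, for users of these tracepoints the move amounts to switching the include path; for example, for the scheduler events (old and new file names as listed in the diffstat below, shown here only to make the relocation explicit):

/* before this patch */
#include <trace/sched.h>

/* after this patch: headers that declare trace points live in events/ */
#include <trace/events/sched.h>
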
Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Neil Horman Cc: Zhao Lei Cc: Eduard - Gabriel Munteanu Cc: Pekka Enberg Signed-off-by: Steven Rostedt --- include/linux/kmemtrace.h | 2 +- include/trace/define_trace.h | 2 +- include/trace/events/irq.h | 57 +++++++ include/trace/events/kmem.h | 194 ++++++++++++++++++++++ include/trace/events/lockdep.h | 60 +++++++ include/trace/events/sched.h | 339 ++++++++++++++++++++++++++++++++++++++ include/trace/events/skb.h | 40 +++++ include/trace/irq.h | 57 ------- include/trace/kmem.h | 194 ---------------------- include/trace/lockdep.h | 60 ------- include/trace/sched.h | 339 -------------------------------------- include/trace/skb.h | 40 ----- kernel/exit.c | 2 +- kernel/fork.c | 3 +- kernel/irq/handle.c | 2 +- kernel/kthread.c | 2 +- kernel/lockdep.c | 2 +- kernel/sched.c | 2 +- kernel/signal.c | 2 +- kernel/softirq.c | 2 +- kernel/trace/ftrace.c | 2 +- kernel/trace/trace_sched_switch.c | 2 +- kernel/trace/trace_sched_wakeup.c | 2 +- mm/util.c | 2 +- net/core/drop_monitor.c | 2 +- net/core/net-traces.c | 2 +- net/core/skbuff.c | 2 +- 27 files changed, 708 insertions(+), 707 deletions(-) create mode 100644 include/trace/events/irq.h create mode 100644 include/trace/events/kmem.h create mode 100644 include/trace/events/lockdep.h create mode 100644 include/trace/events/sched.h create mode 100644 include/trace/events/skb.h delete mode 100644 include/trace/irq.h delete mode 100644 include/trace/kmem.h delete mode 100644 include/trace/lockdep.h delete mode 100644 include/trace/sched.h delete mode 100644 include/trace/skb.h diff --git a/include/linux/kmemtrace.h b/include/linux/kmemtrace.h index 15c45a27a925..b616d3930c3b 100644 --- a/include/linux/kmemtrace.h +++ b/include/linux/kmemtrace.h @@ -9,7 +9,7 @@ #ifdef __KERNEL__ -#include +#include #ifdef CONFIG_KMEMTRACE extern void kmemtrace_init(void); diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h index 980eb66a6e38..18869417109c 100644 --- a/include/trace/define_trace.h +++ b/include/trace/define_trace.h @@ -43,7 +43,7 @@ #endif #ifndef TRACE_INCLUDE_PATH -# define __TRACE_INCLUDE(system) +# define __TRACE_INCLUDE(system) # define UNDEF_TRACE_INCLUDE_FILE #else # define __TRACE_INCLUDE(system) __stringify(TRACE_INCLUDE_PATH/system.h) diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h new file mode 100644 index 000000000000..75e3468e4493 --- /dev/null +++ b/include/trace/events/irq.h @@ -0,0 +1,57 @@ +#if !defined(_TRACE_IRQ_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_IRQ_H + +#include +#include + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM irq + +/* + * Tracepoint for entry of interrupt handler: + */ +TRACE_FORMAT(irq_handler_entry, + TP_PROTO(int irq, struct irqaction *action), + TP_ARGS(irq, action), + TP_FMT("irq=%d handler=%s", irq, action->name) + ); + +/* + * Tracepoint for return of an interrupt handler: + */ +TRACE_EVENT(irq_handler_exit, + + TP_PROTO(int irq, struct irqaction *action, int ret), + + TP_ARGS(irq, action, ret), + + TP_STRUCT__entry( + __field( int, irq ) + __field( int, ret ) + ), + + TP_fast_assign( + __entry->irq = irq; + __entry->ret = ret; + ), + + TP_printk("irq=%d return=%s", + __entry->irq, __entry->ret ? 
"handled" : "unhandled") +); + +TRACE_FORMAT(softirq_entry, + TP_PROTO(struct softirq_action *h, struct softirq_action *vec), + TP_ARGS(h, vec), + TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec]) + ); + +TRACE_FORMAT(softirq_exit, + TP_PROTO(struct softirq_action *h, struct softirq_action *vec), + TP_ARGS(h, vec), + TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec]) + ); + +#endif /* _TRACE_IRQ_H */ + +/* This part must be outside protection */ +#include diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h new file mode 100644 index 000000000000..c22c42f980b5 --- /dev/null +++ b/include/trace/events/kmem.h @@ -0,0 +1,194 @@ +#if !defined(_TRACE_KMEM_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_KMEM_H + +#include +#include + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kmem + +TRACE_EVENT(kmalloc, + + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags), + + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + __field( size_t, bytes_req ) + __field( size_t, bytes_alloc ) + __field( gfp_t, gfp_flags ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + __entry->bytes_req = bytes_req; + __entry->bytes_alloc = bytes_alloc; + __entry->gfp_flags = gfp_flags; + ), + + TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x", + __entry->call_site, + __entry->ptr, + __entry->bytes_req, + __entry->bytes_alloc, + __entry->gfp_flags) +); + +TRACE_EVENT(kmem_cache_alloc, + + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags), + + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + __field( size_t, bytes_req ) + __field( size_t, bytes_alloc ) + __field( gfp_t, gfp_flags ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + __entry->bytes_req = bytes_req; + __entry->bytes_alloc = bytes_alloc; + __entry->gfp_flags = gfp_flags; + ), + + TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x", + __entry->call_site, + __entry->ptr, + __entry->bytes_req, + __entry->bytes_alloc, + __entry->gfp_flags) +); + +TRACE_EVENT(kmalloc_node, + + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node), + + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + __field( size_t, bytes_req ) + __field( size_t, bytes_alloc ) + __field( gfp_t, gfp_flags ) + __field( int, node ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + __entry->bytes_req = bytes_req; + __entry->bytes_alloc = bytes_alloc; + __entry->gfp_flags = gfp_flags; + __entry->node = node; + ), + + TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d", + __entry->call_site, + __entry->ptr, + __entry->bytes_req, + __entry->bytes_alloc, + __entry->gfp_flags, + __entry->node) +); + +TRACE_EVENT(kmem_cache_alloc_node, + + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node), + + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node), + + TP_STRUCT__entry( + 
__field( unsigned long, call_site ) + __field( const void *, ptr ) + __field( size_t, bytes_req ) + __field( size_t, bytes_alloc ) + __field( gfp_t, gfp_flags ) + __field( int, node ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + __entry->bytes_req = bytes_req; + __entry->bytes_alloc = bytes_alloc; + __entry->gfp_flags = gfp_flags; + __entry->node = node; + ), + + TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d", + __entry->call_site, + __entry->ptr, + __entry->bytes_req, + __entry->bytes_alloc, + __entry->gfp_flags, + __entry->node) +); + +TRACE_EVENT(kfree, + + TP_PROTO(unsigned long call_site, const void *ptr), + + TP_ARGS(call_site, ptr), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + ), + + TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr) +); + +TRACE_EVENT(kmem_cache_free, + + TP_PROTO(unsigned long call_site, const void *ptr), + + TP_ARGS(call_site, ptr), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + ), + + TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr) +); +#endif /* _TRACE_KMEM_H */ + +/* This part must be outside protection */ +#include diff --git a/include/trace/events/lockdep.h b/include/trace/events/lockdep.h new file mode 100644 index 000000000000..45e326b5c7f3 --- /dev/null +++ b/include/trace/events/lockdep.h @@ -0,0 +1,60 @@ +#if !defined(_TRACE_LOCKDEP_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_LOCKDEP_H + +#include +#include + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM lockdep + +#ifdef CONFIG_LOCKDEP + +TRACE_FORMAT(lock_acquire, + TP_PROTO(struct lockdep_map *lock, unsigned int subclass, + int trylock, int read, int check, + struct lockdep_map *next_lock, unsigned long ip), + TP_ARGS(lock, subclass, trylock, read, check, next_lock, ip), + TP_FMT("%s%s%s", trylock ? "try " : "", + read ? 
"read " : "", lock->name) + ); + +TRACE_FORMAT(lock_release, + TP_PROTO(struct lockdep_map *lock, int nested, unsigned long ip), + TP_ARGS(lock, nested, ip), + TP_FMT("%s", lock->name) + ); + +#ifdef CONFIG_LOCK_STAT + +TRACE_FORMAT(lock_contended, + TP_PROTO(struct lockdep_map *lock, unsigned long ip), + TP_ARGS(lock, ip), + TP_FMT("%s", lock->name) + ); + +TRACE_EVENT(lock_acquired, + TP_PROTO(struct lockdep_map *lock, unsigned long ip, s64 waittime), + + TP_ARGS(lock, ip, waittime), + + TP_STRUCT__entry( + __field(const char *, name) + __field(unsigned long, wait_usec) + __field(unsigned long, wait_nsec_rem) + ), + TP_fast_assign( + __entry->name = lock->name; + __entry->wait_nsec_rem = do_div(waittime, NSEC_PER_USEC); + __entry->wait_usec = (unsigned long) waittime; + ), + TP_printk("%s (%lu.%03lu us)", __entry->name, __entry->wait_usec, + __entry->wait_nsec_rem) +); + +#endif +#endif + +#endif /* _TRACE_LOCKDEP_H */ + +/* This part must be outside protection */ +#include diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h new file mode 100644 index 000000000000..ffa1cab586b9 --- /dev/null +++ b/include/trace/events/sched.h @@ -0,0 +1,339 @@ +#if !defined(_TRACE_SCHED_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_SCHED_H + +#include +#include + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM sched + +/* + * Tracepoint for calling kthread_stop, performed to end a kthread: + */ +TRACE_EVENT(sched_kthread_stop, + + TP_PROTO(struct task_struct *t), + + TP_ARGS(t), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + ), + + TP_fast_assign( + memcpy(__entry->comm, t->comm, TASK_COMM_LEN); + __entry->pid = t->pid; + ), + + TP_printk("task %s:%d", __entry->comm, __entry->pid) +); + +/* + * Tracepoint for the return value of the kthread stopping: + */ +TRACE_EVENT(sched_kthread_stop_ret, + + TP_PROTO(int ret), + + TP_ARGS(ret), + + TP_STRUCT__entry( + __field( int, ret ) + ), + + TP_fast_assign( + __entry->ret = ret; + ), + + TP_printk("ret %d", __entry->ret) +); + +/* + * Tracepoint for waiting on task to unschedule: + * + * (NOTE: the 'rq' argument is not used by generic trace events, + * but used by the latency tracer plugin. ) + */ +TRACE_EVENT(sched_wait_task, + + TP_PROTO(struct rq *rq, struct task_struct *p), + + TP_ARGS(rq, p), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->prio = p->prio; + ), + + TP_printk("task %s:%d [%d]", + __entry->comm, __entry->pid, __entry->prio) +); + +/* + * Tracepoint for waking up a task: + * + * (NOTE: the 'rq' argument is not used by generic trace events, + * but used by the latency tracer plugin. ) + */ +TRACE_EVENT(sched_wakeup, + + TP_PROTO(struct rq *rq, struct task_struct *p, int success), + + TP_ARGS(rq, p, success), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + __field( int, success ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->prio = p->prio; + __entry->success = success; + ), + + TP_printk("task %s:%d [%d] success=%d", + __entry->comm, __entry->pid, __entry->prio, + __entry->success) +); + +/* + * Tracepoint for waking up a new task: + * + * (NOTE: the 'rq' argument is not used by generic trace events, + * but used by the latency tracer plugin. 
) + */ +TRACE_EVENT(sched_wakeup_new, + + TP_PROTO(struct rq *rq, struct task_struct *p, int success), + + TP_ARGS(rq, p, success), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + __field( int, success ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->prio = p->prio; + __entry->success = success; + ), + + TP_printk("task %s:%d [%d] success=%d", + __entry->comm, __entry->pid, __entry->prio, + __entry->success) +); + +/* + * Tracepoint for task switches, performed by the scheduler: + * + * (NOTE: the 'rq' argument is not used by generic trace events, + * but used by the latency tracer plugin. ) + */ +TRACE_EVENT(sched_switch, + + TP_PROTO(struct rq *rq, struct task_struct *prev, + struct task_struct *next), + + TP_ARGS(rq, prev, next), + + TP_STRUCT__entry( + __array( char, prev_comm, TASK_COMM_LEN ) + __field( pid_t, prev_pid ) + __field( int, prev_prio ) + __array( char, next_comm, TASK_COMM_LEN ) + __field( pid_t, next_pid ) + __field( int, next_prio ) + ), + + TP_fast_assign( + memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); + __entry->prev_pid = prev->pid; + __entry->prev_prio = prev->prio; + memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); + __entry->next_pid = next->pid; + __entry->next_prio = next->prio; + ), + + TP_printk("task %s:%d [%d] ==> %s:%d [%d]", + __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, + __entry->next_comm, __entry->next_pid, __entry->next_prio) +); + +/* + * Tracepoint for a task being migrated: + */ +TRACE_EVENT(sched_migrate_task, + + TP_PROTO(struct task_struct *p, int orig_cpu, int dest_cpu), + + TP_ARGS(p, orig_cpu, dest_cpu), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + __field( int, orig_cpu ) + __field( int, dest_cpu ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->prio = p->prio; + __entry->orig_cpu = orig_cpu; + __entry->dest_cpu = dest_cpu; + ), + + TP_printk("task %s:%d [%d] from: %d to: %d", + __entry->comm, __entry->pid, __entry->prio, + __entry->orig_cpu, __entry->dest_cpu) +); + +/* + * Tracepoint for freeing a task: + */ +TRACE_EVENT(sched_process_free, + + TP_PROTO(struct task_struct *p), + + TP_ARGS(p), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->prio = p->prio; + ), + + TP_printk("task %s:%d [%d]", + __entry->comm, __entry->pid, __entry->prio) +); + +/* + * Tracepoint for a task exiting: + */ +TRACE_EVENT(sched_process_exit, + + TP_PROTO(struct task_struct *p), + + TP_ARGS(p), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->prio = p->prio; + ), + + TP_printk("task %s:%d [%d]", + __entry->comm, __entry->pid, __entry->prio) +); + +/* + * Tracepoint for a waiting task: + */ +TRACE_EVENT(sched_process_wait, + + TP_PROTO(struct pid *pid), + + TP_ARGS(pid), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + ), + + TP_fast_assign( + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + __entry->pid = pid_nr(pid); + __entry->prio = current->prio; + ), + + TP_printk("task %s:%d [%d]", 
+ __entry->comm, __entry->pid, __entry->prio) +); + +/* + * Tracepoint for do_fork: + */ +TRACE_EVENT(sched_process_fork, + + TP_PROTO(struct task_struct *parent, struct task_struct *child), + + TP_ARGS(parent, child), + + TP_STRUCT__entry( + __array( char, parent_comm, TASK_COMM_LEN ) + __field( pid_t, parent_pid ) + __array( char, child_comm, TASK_COMM_LEN ) + __field( pid_t, child_pid ) + ), + + TP_fast_assign( + memcpy(__entry->parent_comm, parent->comm, TASK_COMM_LEN); + __entry->parent_pid = parent->pid; + memcpy(__entry->child_comm, child->comm, TASK_COMM_LEN); + __entry->child_pid = child->pid; + ), + + TP_printk("parent %s:%d child %s:%d", + __entry->parent_comm, __entry->parent_pid, + __entry->child_comm, __entry->child_pid) +); + +/* + * Tracepoint for sending a signal: + */ +TRACE_EVENT(sched_signal_send, + + TP_PROTO(int sig, struct task_struct *p), + + TP_ARGS(sig, p), + + TP_STRUCT__entry( + __field( int, sig ) + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->sig = sig; + ), + + TP_printk("sig: %d task %s:%d", + __entry->sig, __entry->comm, __entry->pid) +); + +#endif /* _TRACE_SCHED_H */ + +/* This part must be outside protection */ +#include diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h new file mode 100644 index 000000000000..1e8fabb57c06 --- /dev/null +++ b/include/trace/events/skb.h @@ -0,0 +1,40 @@ +#if !defined(_TRACE_SKB_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_SKB_H + +#include +#include + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM skb + +/* + * Tracepoint for free an sk_buff: + */ +TRACE_EVENT(kfree_skb, + + TP_PROTO(struct sk_buff *skb, void *location), + + TP_ARGS(skb, location), + + TP_STRUCT__entry( + __field( void *, skbaddr ) + __field( unsigned short, protocol ) + __field( void *, location ) + ), + + TP_fast_assign( + __entry->skbaddr = skb; + if (skb) { + __entry->protocol = ntohs(skb->protocol); + } + __entry->location = location; + ), + + TP_printk("skbaddr=%p protocol=%u location=%p", + __entry->skbaddr, __entry->protocol, __entry->location) +); + +#endif /* _TRACE_SKB_H */ + +/* This part must be outside protection */ +#include diff --git a/include/trace/irq.h b/include/trace/irq.h deleted file mode 100644 index 75e3468e4493..000000000000 --- a/include/trace/irq.h +++ /dev/null @@ -1,57 +0,0 @@ -#if !defined(_TRACE_IRQ_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_IRQ_H - -#include -#include - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM irq - -/* - * Tracepoint for entry of interrupt handler: - */ -TRACE_FORMAT(irq_handler_entry, - TP_PROTO(int irq, struct irqaction *action), - TP_ARGS(irq, action), - TP_FMT("irq=%d handler=%s", irq, action->name) - ); - -/* - * Tracepoint for return of an interrupt handler: - */ -TRACE_EVENT(irq_handler_exit, - - TP_PROTO(int irq, struct irqaction *action, int ret), - - TP_ARGS(irq, action, ret), - - TP_STRUCT__entry( - __field( int, irq ) - __field( int, ret ) - ), - - TP_fast_assign( - __entry->irq = irq; - __entry->ret = ret; - ), - - TP_printk("irq=%d return=%s", - __entry->irq, __entry->ret ? 
"handled" : "unhandled") -); - -TRACE_FORMAT(softirq_entry, - TP_PROTO(struct softirq_action *h, struct softirq_action *vec), - TP_ARGS(h, vec), - TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec]) - ); - -TRACE_FORMAT(softirq_exit, - TP_PROTO(struct softirq_action *h, struct softirq_action *vec), - TP_ARGS(h, vec), - TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec]) - ); - -#endif /* _TRACE_IRQ_H */ - -/* This part must be outside protection */ -#include diff --git a/include/trace/kmem.h b/include/trace/kmem.h deleted file mode 100644 index c22c42f980b5..000000000000 --- a/include/trace/kmem.h +++ /dev/null @@ -1,194 +0,0 @@ -#if !defined(_TRACE_KMEM_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_KMEM_H - -#include -#include - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM kmem - -TRACE_EVENT(kmalloc, - - TP_PROTO(unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags), - - TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags), - - TP_STRUCT__entry( - __field( unsigned long, call_site ) - __field( const void *, ptr ) - __field( size_t, bytes_req ) - __field( size_t, bytes_alloc ) - __field( gfp_t, gfp_flags ) - ), - - TP_fast_assign( - __entry->call_site = call_site; - __entry->ptr = ptr; - __entry->bytes_req = bytes_req; - __entry->bytes_alloc = bytes_alloc; - __entry->gfp_flags = gfp_flags; - ), - - TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x", - __entry->call_site, - __entry->ptr, - __entry->bytes_req, - __entry->bytes_alloc, - __entry->gfp_flags) -); - -TRACE_EVENT(kmem_cache_alloc, - - TP_PROTO(unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags), - - TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags), - - TP_STRUCT__entry( - __field( unsigned long, call_site ) - __field( const void *, ptr ) - __field( size_t, bytes_req ) - __field( size_t, bytes_alloc ) - __field( gfp_t, gfp_flags ) - ), - - TP_fast_assign( - __entry->call_site = call_site; - __entry->ptr = ptr; - __entry->bytes_req = bytes_req; - __entry->bytes_alloc = bytes_alloc; - __entry->gfp_flags = gfp_flags; - ), - - TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x", - __entry->call_site, - __entry->ptr, - __entry->bytes_req, - __entry->bytes_alloc, - __entry->gfp_flags) -); - -TRACE_EVENT(kmalloc_node, - - TP_PROTO(unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node), - - TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node), - - TP_STRUCT__entry( - __field( unsigned long, call_site ) - __field( const void *, ptr ) - __field( size_t, bytes_req ) - __field( size_t, bytes_alloc ) - __field( gfp_t, gfp_flags ) - __field( int, node ) - ), - - TP_fast_assign( - __entry->call_site = call_site; - __entry->ptr = ptr; - __entry->bytes_req = bytes_req; - __entry->bytes_alloc = bytes_alloc; - __entry->gfp_flags = gfp_flags; - __entry->node = node; - ), - - TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d", - __entry->call_site, - __entry->ptr, - __entry->bytes_req, - __entry->bytes_alloc, - __entry->gfp_flags, - __entry->node) -); - -TRACE_EVENT(kmem_cache_alloc_node, - - TP_PROTO(unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node), - - TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node), - - TP_STRUCT__entry( - __field( unsigned 
long, call_site ) - __field( const void *, ptr ) - __field( size_t, bytes_req ) - __field( size_t, bytes_alloc ) - __field( gfp_t, gfp_flags ) - __field( int, node ) - ), - - TP_fast_assign( - __entry->call_site = call_site; - __entry->ptr = ptr; - __entry->bytes_req = bytes_req; - __entry->bytes_alloc = bytes_alloc; - __entry->gfp_flags = gfp_flags; - __entry->node = node; - ), - - TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d", - __entry->call_site, - __entry->ptr, - __entry->bytes_req, - __entry->bytes_alloc, - __entry->gfp_flags, - __entry->node) -); - -TRACE_EVENT(kfree, - - TP_PROTO(unsigned long call_site, const void *ptr), - - TP_ARGS(call_site, ptr), - - TP_STRUCT__entry( - __field( unsigned long, call_site ) - __field( const void *, ptr ) - ), - - TP_fast_assign( - __entry->call_site = call_site; - __entry->ptr = ptr; - ), - - TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr) -); - -TRACE_EVENT(kmem_cache_free, - - TP_PROTO(unsigned long call_site, const void *ptr), - - TP_ARGS(call_site, ptr), - - TP_STRUCT__entry( - __field( unsigned long, call_site ) - __field( const void *, ptr ) - ), - - TP_fast_assign( - __entry->call_site = call_site; - __entry->ptr = ptr; - ), - - TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr) -); -#endif /* _TRACE_KMEM_H */ - -/* This part must be outside protection */ -#include diff --git a/include/trace/lockdep.h b/include/trace/lockdep.h deleted file mode 100644 index 45e326b5c7f3..000000000000 --- a/include/trace/lockdep.h +++ /dev/null @@ -1,60 +0,0 @@ -#if !defined(_TRACE_LOCKDEP_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_LOCKDEP_H - -#include -#include - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM lockdep - -#ifdef CONFIG_LOCKDEP - -TRACE_FORMAT(lock_acquire, - TP_PROTO(struct lockdep_map *lock, unsigned int subclass, - int trylock, int read, int check, - struct lockdep_map *next_lock, unsigned long ip), - TP_ARGS(lock, subclass, trylock, read, check, next_lock, ip), - TP_FMT("%s%s%s", trylock ? "try " : "", - read ? 
"read " : "", lock->name) - ); - -TRACE_FORMAT(lock_release, - TP_PROTO(struct lockdep_map *lock, int nested, unsigned long ip), - TP_ARGS(lock, nested, ip), - TP_FMT("%s", lock->name) - ); - -#ifdef CONFIG_LOCK_STAT - -TRACE_FORMAT(lock_contended, - TP_PROTO(struct lockdep_map *lock, unsigned long ip), - TP_ARGS(lock, ip), - TP_FMT("%s", lock->name) - ); - -TRACE_EVENT(lock_acquired, - TP_PROTO(struct lockdep_map *lock, unsigned long ip, s64 waittime), - - TP_ARGS(lock, ip, waittime), - - TP_STRUCT__entry( - __field(const char *, name) - __field(unsigned long, wait_usec) - __field(unsigned long, wait_nsec_rem) - ), - TP_fast_assign( - __entry->name = lock->name; - __entry->wait_nsec_rem = do_div(waittime, NSEC_PER_USEC); - __entry->wait_usec = (unsigned long) waittime; - ), - TP_printk("%s (%lu.%03lu us)", __entry->name, __entry->wait_usec, - __entry->wait_nsec_rem) -); - -#endif -#endif - -#endif /* _TRACE_LOCKDEP_H */ - -/* This part must be outside protection */ -#include diff --git a/include/trace/sched.h b/include/trace/sched.h deleted file mode 100644 index ffa1cab586b9..000000000000 --- a/include/trace/sched.h +++ /dev/null @@ -1,339 +0,0 @@ -#if !defined(_TRACE_SCHED_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_SCHED_H - -#include -#include - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM sched - -/* - * Tracepoint for calling kthread_stop, performed to end a kthread: - */ -TRACE_EVENT(sched_kthread_stop, - - TP_PROTO(struct task_struct *t), - - TP_ARGS(t), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - ), - - TP_fast_assign( - memcpy(__entry->comm, t->comm, TASK_COMM_LEN); - __entry->pid = t->pid; - ), - - TP_printk("task %s:%d", __entry->comm, __entry->pid) -); - -/* - * Tracepoint for the return value of the kthread stopping: - */ -TRACE_EVENT(sched_kthread_stop_ret, - - TP_PROTO(int ret), - - TP_ARGS(ret), - - TP_STRUCT__entry( - __field( int, ret ) - ), - - TP_fast_assign( - __entry->ret = ret; - ), - - TP_printk("ret %d", __entry->ret) -); - -/* - * Tracepoint for waiting on task to unschedule: - * - * (NOTE: the 'rq' argument is not used by generic trace events, - * but used by the latency tracer plugin. ) - */ -TRACE_EVENT(sched_wait_task, - - TP_PROTO(struct rq *rq, struct task_struct *p), - - TP_ARGS(rq, p), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( int, prio ) - ), - - TP_fast_assign( - memcpy(__entry->comm, p->comm, TASK_COMM_LEN); - __entry->pid = p->pid; - __entry->prio = p->prio; - ), - - TP_printk("task %s:%d [%d]", - __entry->comm, __entry->pid, __entry->prio) -); - -/* - * Tracepoint for waking up a task: - * - * (NOTE: the 'rq' argument is not used by generic trace events, - * but used by the latency tracer plugin. ) - */ -TRACE_EVENT(sched_wakeup, - - TP_PROTO(struct rq *rq, struct task_struct *p, int success), - - TP_ARGS(rq, p, success), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( int, prio ) - __field( int, success ) - ), - - TP_fast_assign( - memcpy(__entry->comm, p->comm, TASK_COMM_LEN); - __entry->pid = p->pid; - __entry->prio = p->prio; - __entry->success = success; - ), - - TP_printk("task %s:%d [%d] success=%d", - __entry->comm, __entry->pid, __entry->prio, - __entry->success) -); - -/* - * Tracepoint for waking up a new task: - * - * (NOTE: the 'rq' argument is not used by generic trace events, - * but used by the latency tracer plugin. 
) - */ -TRACE_EVENT(sched_wakeup_new, - - TP_PROTO(struct rq *rq, struct task_struct *p, int success), - - TP_ARGS(rq, p, success), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( int, prio ) - __field( int, success ) - ), - - TP_fast_assign( - memcpy(__entry->comm, p->comm, TASK_COMM_LEN); - __entry->pid = p->pid; - __entry->prio = p->prio; - __entry->success = success; - ), - - TP_printk("task %s:%d [%d] success=%d", - __entry->comm, __entry->pid, __entry->prio, - __entry->success) -); - -/* - * Tracepoint for task switches, performed by the scheduler: - * - * (NOTE: the 'rq' argument is not used by generic trace events, - * but used by the latency tracer plugin. ) - */ -TRACE_EVENT(sched_switch, - - TP_PROTO(struct rq *rq, struct task_struct *prev, - struct task_struct *next), - - TP_ARGS(rq, prev, next), - - TP_STRUCT__entry( - __array( char, prev_comm, TASK_COMM_LEN ) - __field( pid_t, prev_pid ) - __field( int, prev_prio ) - __array( char, next_comm, TASK_COMM_LEN ) - __field( pid_t, next_pid ) - __field( int, next_prio ) - ), - - TP_fast_assign( - memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); - __entry->prev_pid = prev->pid; - __entry->prev_prio = prev->prio; - memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); - __entry->next_pid = next->pid; - __entry->next_prio = next->prio; - ), - - TP_printk("task %s:%d [%d] ==> %s:%d [%d]", - __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, - __entry->next_comm, __entry->next_pid, __entry->next_prio) -); - -/* - * Tracepoint for a task being migrated: - */ -TRACE_EVENT(sched_migrate_task, - - TP_PROTO(struct task_struct *p, int orig_cpu, int dest_cpu), - - TP_ARGS(p, orig_cpu, dest_cpu), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( int, prio ) - __field( int, orig_cpu ) - __field( int, dest_cpu ) - ), - - TP_fast_assign( - memcpy(__entry->comm, p->comm, TASK_COMM_LEN); - __entry->pid = p->pid; - __entry->prio = p->prio; - __entry->orig_cpu = orig_cpu; - __entry->dest_cpu = dest_cpu; - ), - - TP_printk("task %s:%d [%d] from: %d to: %d", - __entry->comm, __entry->pid, __entry->prio, - __entry->orig_cpu, __entry->dest_cpu) -); - -/* - * Tracepoint for freeing a task: - */ -TRACE_EVENT(sched_process_free, - - TP_PROTO(struct task_struct *p), - - TP_ARGS(p), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( int, prio ) - ), - - TP_fast_assign( - memcpy(__entry->comm, p->comm, TASK_COMM_LEN); - __entry->pid = p->pid; - __entry->prio = p->prio; - ), - - TP_printk("task %s:%d [%d]", - __entry->comm, __entry->pid, __entry->prio) -); - -/* - * Tracepoint for a task exiting: - */ -TRACE_EVENT(sched_process_exit, - - TP_PROTO(struct task_struct *p), - - TP_ARGS(p), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( int, prio ) - ), - - TP_fast_assign( - memcpy(__entry->comm, p->comm, TASK_COMM_LEN); - __entry->pid = p->pid; - __entry->prio = p->prio; - ), - - TP_printk("task %s:%d [%d]", - __entry->comm, __entry->pid, __entry->prio) -); - -/* - * Tracepoint for a waiting task: - */ -TRACE_EVENT(sched_process_wait, - - TP_PROTO(struct pid *pid), - - TP_ARGS(pid), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( int, prio ) - ), - - TP_fast_assign( - memcpy(__entry->comm, current->comm, TASK_COMM_LEN); - __entry->pid = pid_nr(pid); - __entry->prio = current->prio; - ), - - TP_printk("task %s:%d [%d]", 
- __entry->comm, __entry->pid, __entry->prio) -); - -/* - * Tracepoint for do_fork: - */ -TRACE_EVENT(sched_process_fork, - - TP_PROTO(struct task_struct *parent, struct task_struct *child), - - TP_ARGS(parent, child), - - TP_STRUCT__entry( - __array( char, parent_comm, TASK_COMM_LEN ) - __field( pid_t, parent_pid ) - __array( char, child_comm, TASK_COMM_LEN ) - __field( pid_t, child_pid ) - ), - - TP_fast_assign( - memcpy(__entry->parent_comm, parent->comm, TASK_COMM_LEN); - __entry->parent_pid = parent->pid; - memcpy(__entry->child_comm, child->comm, TASK_COMM_LEN); - __entry->child_pid = child->pid; - ), - - TP_printk("parent %s:%d child %s:%d", - __entry->parent_comm, __entry->parent_pid, - __entry->child_comm, __entry->child_pid) -); - -/* - * Tracepoint for sending a signal: - */ -TRACE_EVENT(sched_signal_send, - - TP_PROTO(int sig, struct task_struct *p), - - TP_ARGS(sig, p), - - TP_STRUCT__entry( - __field( int, sig ) - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - ), - - TP_fast_assign( - memcpy(__entry->comm, p->comm, TASK_COMM_LEN); - __entry->pid = p->pid; - __entry->sig = sig; - ), - - TP_printk("sig: %d task %s:%d", - __entry->sig, __entry->comm, __entry->pid) -); - -#endif /* _TRACE_SCHED_H */ - -/* This part must be outside protection */ -#include diff --git a/include/trace/skb.h b/include/trace/skb.h deleted file mode 100644 index 1e8fabb57c06..000000000000 --- a/include/trace/skb.h +++ /dev/null @@ -1,40 +0,0 @@ -#if !defined(_TRACE_SKB_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_SKB_H - -#include -#include - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM skb - -/* - * Tracepoint for free an sk_buff: - */ -TRACE_EVENT(kfree_skb, - - TP_PROTO(struct sk_buff *skb, void *location), - - TP_ARGS(skb, location), - - TP_STRUCT__entry( - __field( void *, skbaddr ) - __field( unsigned short, protocol ) - __field( void *, location ) - ), - - TP_fast_assign( - __entry->skbaddr = skb; - if (skb) { - __entry->protocol = ntohs(skb->protocol); - } - __entry->location = location; - ), - - TP_printk("skbaddr=%p protocol=%u location=%p", - __entry->skbaddr, __entry->protocol, __entry->location) -); - -#endif /* _TRACE_SKB_H */ - -/* This part must be outside protection */ -#include diff --git a/kernel/exit.c b/kernel/exit.c index 2fe9d2c7eeee..cab535c427b8 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -48,7 +48,7 @@ #include #include #include -#include +#include #include #include diff --git a/kernel/fork.c b/kernel/fork.c index 4bebf2639235..085f73ebcea6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -61,7 +61,6 @@ #include #include #include -#include #include #include @@ -71,6 +70,8 @@ #include #include +#include + /* * Protected counters by write_lock_irq(&tasklist_lock) */ diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 983d8be8dff7..37c63633e78b 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -20,7 +20,7 @@ #include #define CREATE_TRACE_POINTS -#include +#include #include "internals.h" diff --git a/kernel/kthread.c b/kernel/kthread.c index e1c76924545b..41c88fe40500 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #define KTHREAD_NICE_LEVEL (-5) diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 257f21a76c52..47b201ecc6df 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -48,7 +48,7 @@ #include "lockdep_internals.h" #define CREATE_TRACE_POINTS -#include +#include #ifdef CONFIG_PROVE_LOCKING int prove_locking = 1; diff --git a/kernel/sched.c 
b/kernel/sched.c index e6d4518d47e0..9f7ffd00b6ea 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -79,7 +79,7 @@ #include "sched_cpupri.h" #define CREATE_TRACE_POINTS -#include +#include /* * Convert user-nice values [ -20 ... 0 ... 19 ] diff --git a/kernel/signal.c b/kernel/signal.c index 1d5703ff003c..94ec0a4dde0f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include #include diff --git a/kernel/softirq.c b/kernel/softirq.c index a2d9b458ac2b..7ab9dfd8d082 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include /* diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8e6a0b5c9940..a23488988581 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -29,7 +29,7 @@ #include #include -#include +#include #include diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 9d8cccdfaa06..a98106dd979c 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include "trace.h" diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 5bc00e8f153e..b8b13c5540fd 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include "trace.h" diff --git a/mm/util.c b/mm/util.c index 0e74a22791cb..6794a336e9af 100644 --- a/mm/util.c +++ b/mm/util.c @@ -7,7 +7,7 @@ #include #define CREATE_TRACE_POINTS -#include +#include /** * kstrdup - allocate space for and copy an existing string diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 9fd0dc3cca99..b75b6cea49da 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -23,7 +23,7 @@ #include #include -#include +#include #include diff --git a/net/core/net-traces.c b/net/core/net-traces.c index 801772059474..499a67eaf3ae 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -24,6 +24,6 @@ #include #define CREATE_TRACE_POINTS -#include +#include EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index ce6356cd9f71..12806b844456 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -65,7 +65,7 @@ #include #include -#include +#include #include "kmap_skb.h" -- cgit v1.2.2 From 9cfe06f8cd5c8c3ad6ab323973e87dde670642b8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 14 Apr 2009 21:37:03 -0400 Subject: tracing/events: add trace-events-sample This patch adds a sample to the samples directory on how to create and use TRACE_EVENT trace points. Signed-off-by: Steven Rostedt --- samples/Kconfig | 7 ++ samples/Makefile | 2 +- samples/trace_events/Makefile | 8 ++ samples/trace_events/trace-events-sample.c | 56 +++++++++++++ samples/trace_events/trace-events-sample.h | 124 +++++++++++++++++++++++++++++ 5 files changed, 196 insertions(+), 1 deletion(-) create mode 100644 samples/trace_events/Makefile create mode 100644 samples/trace_events/trace-events-sample.c create mode 100644 samples/trace_events/trace-events-sample.h diff --git a/samples/Kconfig b/samples/Kconfig index 4b02f5a0e656..93f41c051092 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -19,6 +19,13 @@ config SAMPLE_TRACEPOINTS help This build tracepoints example modules. 
+config SAMPLE_TRACE_EVENTS + tristate "Build trace_events examples" + depends on EVENT_TRACING + default m + help + This build trace event example modules. + config SAMPLE_KOBJECT tristate "Build kobject examples" help diff --git a/samples/Makefile b/samples/Makefile index 10eaca89fe17..13e4b470b539 100644 --- a/samples/Makefile +++ b/samples/Makefile @@ -1,3 +1,3 @@ # Makefile for Linux samples code -obj-$(CONFIG_SAMPLES) += markers/ kobject/ kprobes/ tracepoints/ +obj-$(CONFIG_SAMPLES) += markers/ kobject/ kprobes/ tracepoints/ trace_events/ diff --git a/samples/trace_events/Makefile b/samples/trace_events/Makefile new file mode 100644 index 000000000000..06c6dea1eb84 --- /dev/null +++ b/samples/trace_events/Makefile @@ -0,0 +1,8 @@ +# builds the trace events example kernel modules; +# then to use one (as root): insmod + +PWD := $(shell pwd) + +CFLAGS_trace-events-sample.o := -I$(PWD)/samples/trace_events/ + +obj-$(CONFIG_SAMPLE_TRACE_EVENTS) += trace-events-sample.o diff --git a/samples/trace_events/trace-events-sample.c b/samples/trace_events/trace-events-sample.c new file mode 100644 index 000000000000..f33b3ba744ac --- /dev/null +++ b/samples/trace_events/trace-events-sample.c @@ -0,0 +1,56 @@ +#include +#include + +/* + * Any file that uses trace points, must include the header. + * But only one file, must include the header by defining + * CREATE_TRACE_POINTS first. This will make the C code that + * creates the handles for the trace points. + */ +#define CREATE_TRACE_POINTS +#include "trace-events-sample.h" + + +static void simple_thread_func(int cnt) +{ + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ); + trace_foo_bar("hello", cnt); + + if (!(cnt % 10)) + /* It is really important that I say "hi!" */ + printk(KERN_EMERG "hi!\n"); +} + +static int simple_thread(void *arg) +{ + int cnt = 0; + + while (!kthread_should_stop()) + simple_thread_func(cnt++); + + return 0; +} + +static struct task_struct *simple_tsk; + +static int __init trace_event_init(void) +{ + simple_tsk = kthread_run(simple_thread, NULL, "event-sample"); + if (IS_ERR(simple_tsk)) + return -1; + + return 0; +} + +static void __exit trace_event_exit(void) +{ + kthread_stop(simple_tsk); +} + +module_init(trace_event_init); +module_exit(trace_event_exit); + +MODULE_AUTHOR("Steven Rostedt"); +MODULE_DESCRIPTION("trace-events-sample"); +MODULE_LICENSE("GPL"); diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h new file mode 100644 index 000000000000..eab46443e610 --- /dev/null +++ b/samples/trace_events/trace-events-sample.h @@ -0,0 +1,124 @@ +/* + * Notice that this file is not protected like a normal header. + * We also must allow for rereading of this file. The + * + * || defined(TRACE_HEADER_MULTI_READ) + * + * serves this purpose. + */ +#if !defined(_TRACE_EVENT_SAMPLE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_EVENT_SAMPLE_H + +/* + * All trace headers should include tracepoint.h, until we finally + * make it into a standard header. + */ +#include + +/* + * If TRACE_SYSTEM is defined, that will be the directory created + * in the ftrace directory under /debugfs/tracing/events/ + * + * The define_trace.h belowe will also look for a file name of + * TRACE_SYSTEM.h where TRACE_SYSTEM is what is defined here. 
+ * + * If you want a different system than file name, you can override + * the header name by defining TRACE_INCLUDE_FILE + * + * If this file was called, goofy.h, then we would define: + * + * #define TRACE_INCLUDE_FILE goofy + * + */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM trace-events-sample + +/* + * The TRACE_EVENT macro is broken up into 5 parts. + * + * name: name of the trace point. This is also how to enable the tracepoint. + * A function called trace_foo_bar() will be created. + * + * proto: the prototype of the function trace_foo_bar() + * Here it is trace_foo_bar(char *foo, int bar). + * + * args: must match the arguments in the prototype. + * Here it is simply "foo, bar". + * + * struct: This defines the way the data will be stored in the ring buffer. + * There are currently two types of elements. __field and __array. + * a __field is broken up into (type, name). Where type can be any + * type but an array. + * For an array. there are three fields. (type, name, size). The + * type of elements in the array, the name of the field and the size + * of the array. + * + * __array( char, foo, 10) is the same as saying char foo[10]. + * + * fast_assign: This is a C like function that is used to store the items + * into the ring buffer. + * + * printk: This is a way to print out the data in pretty print. This is + * useful if the system crashes and you are logging via a serial line, + * the data can be printed to the console using this "printk" method. + * + * Note, that for both the assign and the printk, __entry is the handler + * to the data structure in the ring buffer, and is defined by the + * TP_STRUCT__entry. + */ +TRACE_EVENT(foo_bar, + + TP_PROTO(char *foo, int bar), + + TP_ARGS(foo, bar), + + TP_STRUCT__entry( + __array( char, foo, 10 ) + __field( int, bar ) + ), + + TP_fast_assign( + strncpy(__entry->foo, foo, 10); + __entry->bar = bar; + ), + + TP_printk("foo %s %d", __entry->foo, __entry->bar) +); +#endif + +/***** NOTICE! The #if protection ends here. *****/ + + +/* + * There are several ways I could have done this. If I left out the + * TRACE_INCLUDE_PATH, then it would default to the kernel source + * include/trace/events directory. + * + * I could specify a path from the define_trace.h file back to this + * file. + * + * #define TRACE_INCLUDE_PATH ../../samples/trace_events + * + * But I chose to simply make it use the current directory and then in + * the Makefile I added: + * + * CFLAGS_trace-events-sample.o := -I$(PWD)/samples/trace_events/ + * + * This will make sure the current path is part of the include + * structure for our file so that we can find it. + * + * I could have made only the top level directory the include: + * + * CFLAGS_trace-events-sample.o := -I$(PWD) + * + * And then let the path to this directory be the TRACE_INCLUDE_PATH: + * + * #define TRACE_INCLUDE_PATH samples/trace_events + * + * But then if something defines "samples" or "trace_events" then we + * could risk that being converted too, and give us an unexpected + * result. + */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#include -- cgit v1.2.2 From d0deef5b14af7d5bbd0003a0a2a1a32326e20a6d Mon Sep 17 00:00:00 2001 From: Shawn Du Date: Tue, 14 Apr 2009 13:58:56 +0800 Subject: blktrace: support per-partition tracing Though one can specify '-d /dev/sda1' when using blktrace, it still traces the whole sda. To support per-partition tracing, when we start tracing, we initialize bt->start_lba and bt->end_lba to the start and end sector of that partition. 
Note some actions are per device, thus we don't filter 0-sector events. The original patch and discussion can be found here: http://marc.info/?l=linux-btrace&m=122949374214540&w=2 Signed-off-by: Shawn Du Signed-off-by: Li Zefan Acked-by: "Theodore Ts'o" Cc: Arnaldo Carvalho de Melo Cc: Jens Axboe LKML-Reference: <49E42620.4050701@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- block/compat_ioctl.c | 2 +- drivers/scsi/sg.c | 1 + include/linux/blktrace_api.h | 24 +++++++++++++----------- kernel/trace/blktrace.c | 29 +++++++++++++++++++++-------- 4 files changed, 36 insertions(+), 20 deletions(-) diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index f87615dea46b..f8c218cd08e1 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -568,7 +568,7 @@ static int compat_blk_trace_setup(struct block_device *bdev, char __user *arg) memcpy(&buts.name, &cbuts.name, 32); mutex_lock(&bdev->bd_mutex); - ret = do_blk_trace_setup(q, b, bdev->bd_dev, &buts); + ret = do_blk_trace_setup(q, b, bdev->bd_dev, bdev, &buts); mutex_unlock(&bdev->bd_mutex); if (ret) return ret; diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 82312df9b0bf..49c98730bb8d 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1065,6 +1065,7 @@ sg_ioctl(struct inode *inode, struct file *filp, return blk_trace_setup(sdp->device->request_queue, sdp->disk->disk_name, MKDEV(SCSI_GENERIC_MAJOR, sdp->index), + NULL, (char *)arg); case BLKTRACESTART: return blk_trace_startstop(sdp->device->request_queue, 1); diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index d960889e92ef..267edc4017ee 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -165,8 +165,9 @@ struct blk_trace { extern int blk_trace_ioctl(struct block_device *, unsigned, char __user *); extern void blk_trace_shutdown(struct request_queue *); -extern int do_blk_trace_setup(struct request_queue *q, - char *name, dev_t dev, struct blk_user_trace_setup *buts); +extern int do_blk_trace_setup(struct request_queue *q, char *name, + dev_t dev, struct block_device *bdev, + struct blk_user_trace_setup *buts); extern void __trace_note_message(struct blk_trace *, const char *fmt, ...); /** @@ -193,6 +194,7 @@ extern void __trace_note_message(struct blk_trace *, const char *fmt, ...); extern void blk_add_driver_data(struct request_queue *q, struct request *rq, void *data, size_t len); extern int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, + struct block_device *bdev, char __user *arg); extern int blk_trace_startstop(struct request_queue *q, int start); extern int blk_trace_remove(struct request_queue *q); @@ -200,15 +202,15 @@ extern int blk_trace_remove(struct request_queue *q); extern struct attribute_group blk_trace_attr_group; #else /* !CONFIG_BLK_DEV_IO_TRACE */ -#define blk_trace_ioctl(bdev, cmd, arg) (-ENOTTY) -#define blk_trace_shutdown(q) do { } while (0) -#define do_blk_trace_setup(q, name, dev, buts) (-ENOTTY) -#define blk_add_driver_data(q, rq, data, len) do {} while (0) -#define blk_trace_setup(q, name, dev, arg) (-ENOTTY) -#define blk_trace_startstop(q, start) (-ENOTTY) -#define blk_trace_remove(q) (-ENOTTY) -#define blk_add_trace_msg(q, fmt, ...) 
do { } while (0) - +# define blk_trace_ioctl(bdev, cmd, arg) (-ENOTTY) +# define blk_trace_shutdown(q) do { } while (0) +# define do_blk_trace_setup(q, name, dev, bdev, buts) (-ENOTTY) +# define blk_add_driver_data(q, rq, data, len) do {} while (0) +# define blk_trace_setup(q, name, dev, bdev, arg) (-ENOTTY) +# define blk_trace_startstop(q, start) (-ENOTTY) +# define blk_trace_remove(q) (-ENOTTY) +# define blk_add_trace_msg(q, fmt, ...) do { } while (0) #endif /* CONFIG_BLK_DEV_IO_TRACE */ + #endif /* __KERNEL__ */ #endif diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 2b98195b338b..e932654cf590 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -147,7 +147,7 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, { if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0) return 1; - if (sector < bt->start_lba || sector > bt->end_lba) + if (sector && (sector < bt->start_lba || sector > bt->end_lba)) return 1; if (bt->pid && pid != bt->pid) return 1; @@ -192,7 +192,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, what |= MASK_TC_BIT(rw, DISCARD); pid = tsk->pid; - if (unlikely(act_log_check(bt, what, sector, pid))) + if (act_log_check(bt, what, sector, pid)) return; cpu = raw_smp_processor_id(); @@ -407,11 +407,13 @@ static struct rchan_callbacks blk_relay_callbacks = { * Setup everything required to start tracing */ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, - struct blk_user_trace_setup *buts) + struct block_device *bdev, + struct blk_user_trace_setup *buts) { struct blk_trace *old_bt, *bt = NULL; struct dentry *dir = NULL; int ret, i; + struct hd_struct *part = NULL; if (!buts->buf_size || !buts->buf_nr) return -EINVAL; @@ -480,11 +482,21 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!bt->act_mask) bt->act_mask = (u16) -1; - bt->start_lba = buts->start_lba; - bt->end_lba = buts->end_lba; - if (!bt->end_lba) + if (bdev) + part = bdev->bd_part; + + if (part) { + bt->start_lba = part->start_sect; + bt->end_lba = part->start_sect + part->nr_sects; + } else bt->end_lba = -1ULL; + /* overwrite with user settings */ + if (buts->start_lba) + bt->start_lba = buts->start_lba; + if (buts->end_lba) + bt->end_lba = buts->end_lba; + bt->pid = buts->pid; bt->trace_state = Blktrace_setup; @@ -505,6 +517,7 @@ err: } int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, + struct block_device *bdev, char __user *arg) { struct blk_user_trace_setup buts; @@ -514,7 +527,7 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (ret) return -EFAULT; - ret = do_blk_trace_setup(q, name, dev, &buts); + ret = do_blk_trace_setup(q, name, dev, bdev, &buts); if (ret) return ret; @@ -582,7 +595,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) switch (cmd) { case BLKTRACESETUP: bdevname(bdev, b); - ret = blk_trace_setup(q, b, bdev->bd_dev, arg); + ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); break; case BLKTRACESTART: start = 1; -- cgit v1.2.2 From 9908c30997b8a73c95f836170b9998dae9aa3f4a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 14 Apr 2009 13:59:34 +0800 Subject: blktrace: support per-partition tracing for ftrace plugin The previous patch adds support to trace a single partition for relay+ioctl blktrace, and this patch is for ftrace plugin blktrace: # echo 1 > /sys/block/sda/sda7/enable # cat start_lba 102398373 # cat end_lba 102703545 Signed-off-by: Li Zefan Acked-by: "Theodore Ts'o" Cc: 
Arnaldo Carvalho de Melo Cc: Shawn Du Cc: Jens Axboe LKML-Reference: <49E42646.4060608@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index e932654cf590..d10989880520 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -403,6 +403,23 @@ static struct rchan_callbacks blk_relay_callbacks = { .remove_buf_file = blk_remove_buf_file_callback, }; +static void blk_trace_setup_lba(struct blk_trace *bt, + struct block_device *bdev) +{ + struct hd_struct *part = NULL; + + if (bdev) + part = bdev->bd_part; + + if (part) { + bt->start_lba = part->start_sect; + bt->end_lba = part->start_sect + part->nr_sects; + } else { + bt->start_lba = 0; + bt->end_lba = -1ULL; + } +} + /* * Setup everything required to start tracing */ @@ -413,7 +430,6 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct blk_trace *old_bt, *bt = NULL; struct dentry *dir = NULL; int ret, i; - struct hd_struct *part = NULL; if (!buts->buf_size || !buts->buf_nr) return -EINVAL; @@ -482,14 +498,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!bt->act_mask) bt->act_mask = (u16) -1; - if (bdev) - part = bdev->bd_part; - - if (part) { - bt->start_lba = part->start_sect; - bt->end_lba = part->start_sect + part->nr_sects; - } else - bt->end_lba = -1ULL; + blk_trace_setup_lba(bt, bdev); /* overwrite with user settings */ if (buts->start_lba) @@ -1370,7 +1379,8 @@ static int blk_trace_remove_queue(struct request_queue *q) /* * Setup everything required to start tracing */ -static int blk_trace_setup_queue(struct request_queue *q, dev_t dev) +static int blk_trace_setup_queue(struct request_queue *q, + struct block_device *bdev) { struct blk_trace *old_bt, *bt = NULL; int ret = -ENOMEM; @@ -1383,9 +1393,10 @@ static int blk_trace_setup_queue(struct request_queue *q, dev_t dev) if (!bt->msg_data) goto free_bt; - bt->dev = dev; + bt->dev = bdev->bd_dev; bt->act_mask = (u16)-1; - bt->end_lba = -1ULL; + + blk_trace_setup_lba(bt, bdev); old_bt = xchg(&q->blk_trace, bt); if (old_bt != NULL) { @@ -1602,7 +1613,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, if (attr == &dev_attr_enable) { if (value) - ret = blk_trace_setup_queue(q, bdev->bd_dev); + ret = blk_trace_setup_queue(q, bdev); else ret = blk_trace_remove_queue(q); goto out_unlock_bdev; @@ -1610,7 +1621,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, ret = 0; if (q->blk_trace == NULL) - ret = blk_trace_setup_queue(q, bdev->bd_dev); + ret = blk_trace_setup_queue(q, bdev); if (ret == 0) { if (attr == &dev_attr_act_mask) -- cgit v1.2.2 From 1d54ad6da9192fed5dd3b60224d9f2dfea0dcd82 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 14 Apr 2009 14:00:05 +0800 Subject: blktrace: add trace/ to /sys/block/sda Impact: allow ftrace-plugin blktrace to trace device-mapper devices To trace a single partition: # echo 1 > /sys/block/sda/sda1/enable To trace the whole sda instead: # echo 1 > /sys/block/sda/enable Thus we also fix an issue reported by Ted, that ftrace-plugin blktrace can't be used to trace device-mapper devices. 
Now: # echo 1 > /sys/block/dm-0/trace/enable echo: write error: No such device or address # mount -t ext4 /dev/dm-0 /mnt # echo 1 > /sys/block/dm-0/trace/enable # echo blk > /debug/tracing/current_tracer Reported-by: Theodore Tso Signed-off-by: Li Zefan Acked-by: "Theodore Ts'o" Cc: Arnaldo Carvalho de Melo Cc: Shawn Du Cc: Jens Axboe LKML-Reference: <49E42665.6020506@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- block/blk-sysfs.c | 7 ++++++- include/linux/blktrace_api.h | 6 ++++++ kernel/trace/blktrace.c | 5 +++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 73f36beff5cd..8653d710b39e 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -387,16 +387,21 @@ struct kobj_type blk_queue_ktype = { int blk_register_queue(struct gendisk *disk) { int ret; + struct device *dev = disk_to_dev(disk); struct request_queue *q = disk->queue; if (WARN_ON(!q)) return -ENXIO; + ret = blk_trace_init_sysfs(dev); + if (ret) + return ret; + if (!q->request_fn) return 0; - ret = kobject_add(&q->kobj, kobject_get(&disk_to_dev(disk)->kobj), + ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue"); if (ret < 0) return ret; diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 267edc4017ee..62763c952854 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -198,6 +198,7 @@ extern int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, char __user *arg); extern int blk_trace_startstop(struct request_queue *q, int start); extern int blk_trace_remove(struct request_queue *q); +extern int blk_trace_init_sysfs(struct device *dev); extern struct attribute_group blk_trace_attr_group; @@ -210,6 +211,11 @@ extern struct attribute_group blk_trace_attr_group; # define blk_trace_startstop(q, start) (-ENOTTY) # define blk_trace_remove(q) (-ENOTTY) # define blk_add_trace_msg(q, fmt, ...) do { } while (0) +static inline int blk_trace_init_sysfs(struct device *dev) +{ + return 0; +} + #endif /* CONFIG_BLK_DEV_IO_TRACE */ #endif /* __KERNEL__ */ diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index d10989880520..8e7c5da3a3e6 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1644,3 +1644,8 @@ out: return ret ? ret : count; } +int blk_trace_init_sysfs(struct device *dev) +{ + return sysfs_create_group(&dev->kobj, &blk_trace_attr_group); +} + -- cgit v1.2.2 From f3948f8857ef5de239f28a61dddb1554a0ae4c2c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 15 Apr 2009 11:02:56 +0800 Subject: blktrace: fix context-info when mixed-using blk tracer and trace events When current tracer is set to blk tracer, TRACE_ITER_CONTEXT_INFO is unset, but actually context-info is printed: pdflush-431 [000] 821.181576: 8,0 P N [pdflush] And then if we enable TRACE_ITER_CONTEXT_INFO: # echo context-info > trace_options We'll see context-info printed twice. What's worse, when we use blk tracer and trace events at the same time, we'll see no context-info for trace events at all: jbd2_commit_logging: dev dm-0:8 transaction 333227 jbd2_end_commit: dev dm-0:8 transaction 333227 head 332814 rm-25433 [001] 9578.307485: 8,18 m N cfq25433 slice expired t=0 rm-25433 [001] 9578.307486: 8,18 m N cfq25433 put_queue This patch adds blk_tracer->set_flags(), and context-info flag is unset only when we set the output to classic mode. 
Note after this patch, one should unset context-info explicitly if he wants to get binary output that can be parsed by blkparse: # echo nocontext-info > trace_options # echo bin > trace_options # echo blk > current_tracer # cat trace_pipe | blkparse -i - Reported-by: Theodore Ts'o Signed-off-by: Li Zefan Cc: Jens Axboe Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <49E54E60.50408@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 8e7c5da3a3e6..c32062bd10b3 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1211,7 +1211,6 @@ static void blk_tracer_print_header(struct seq_file *m) static void blk_tracer_start(struct trace_array *tr) { blk_tracer_enabled = true; - trace_flags &= ~TRACE_ITER_CONTEXT_INFO; } static int blk_tracer_init(struct trace_array *tr) @@ -1224,7 +1223,6 @@ static int blk_tracer_init(struct trace_array *tr) static void blk_tracer_stop(struct trace_array *tr) { blk_tracer_enabled = false; - trace_flags |= TRACE_ITER_CONTEXT_INFO; } static void blk_tracer_reset(struct trace_array *tr) @@ -1289,9 +1287,6 @@ out: static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, int flags) { - if (!trace_print_context(iter)) - return TRACE_TYPE_PARTIAL_LINE; - return print_one_line(iter, false); } @@ -1326,6 +1321,18 @@ static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) return print_one_line(iter, true); } +static int blk_tracer_set_flag(u32 old_flags, u32 bit, int set) +{ + /* don't output context-info for blk_classic output */ + if (bit == TRACE_BLK_OPT_CLASSIC) { + if (set) + trace_flags &= ~TRACE_ITER_CONTEXT_INFO; + else + trace_flags |= TRACE_ITER_CONTEXT_INFO; + } + return 0; +} + static struct tracer blk_tracer __read_mostly = { .name = "blk", .init = blk_tracer_init, @@ -1335,6 +1342,7 @@ static struct tracer blk_tracer __read_mostly = { .print_header = blk_tracer_print_header, .print_line = blk_tracer_print_line, .flags = &blk_tracer_flags, + .set_flag = blk_tracer_set_flag, }; static struct trace_event trace_blk_event = { -- cgit v1.2.2 From 93eb677d74a4f7d3edfb678c94f6c0544d9fbad2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 15 Apr 2009 13:24:06 -0400 Subject: ftrace: use module notifier for function tracer The hooks in the module code for the function tracer must be called before any of that module code runs. The function tracer hooks modify the module (replacing calls to mcount to nops). If the code is executed while the change occurs, then the CPU can take a GPF. To handle the above with a bit of paranoia, I originally implemented the hooks as calls directly from the module code. After examining the notifier calls, it looks as though the start up notify is called before any of the module's code is executed. This makes the use of the notify safe with ftrace. Only the startup notify is required to be "safe". The shutdown simply removes the entries from the ftrace function list, and does not modify any code. This change has another benefit. It removes a issue with a reverse dependency in the mutexes of ftrace_lock and module_mutex. 
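The notifier pattern the patch builds on can be sketched as a minimal, self-contained example. Everything below (example_module_notify, example_module_nb, the pr_debug calls) is made up for illustration only; the real ftrace hooks are in the diff that follows.

#include <linux/init.h>
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/printk.h>

/* Sketch only: a notifier that runs before any of the module's code executes
 * (MODULE_STATE_COMING) and again when the module is torn down
 * (MODULE_STATE_GOING). The ftrace patch performs its mcount patching and
 * record cleanup at exactly these two points. */
static int example_module_notify(struct notifier_block *self,
				 unsigned long val, void *data)
{
	struct module *mod = data;

	switch (val) {
	case MODULE_STATE_COMING:
		/* module text is loaded but has not run yet: safe to modify it */
		pr_debug("module %s coming\n", mod->name);
		break;
	case MODULE_STATE_GOING:
		/* module is going away: only bookkeeping, no code modification */
		pr_debug("module %s going\n", mod->name);
		break;
	}
	return 0;
}

static struct notifier_block example_module_nb = {
	.notifier_call	= example_module_notify,
	.priority	= 0,
};

static int __init example_notify_init(void)
{
	return register_module_notifier(&example_module_nb);
}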
[ Impact: fix lock dependency bug, cleanup ] Cc: Rusty Russell Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 7 ---- include/linux/module.h | 4 +++ kernel/module.c | 19 ++++------- kernel/trace/ftrace.c | 90 +++++++++++++++++++++++++++++++++++--------------- 4 files changed, 75 insertions(+), 45 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 53869bef6102..97c83e1bc589 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -233,8 +233,6 @@ extern int ftrace_arch_read_dyn_info(char *buf, int size); extern int skip_trace(unsigned long ip); -extern void ftrace_release(void *start, unsigned long size); - extern void ftrace_disable_daemon(void); extern void ftrace_enable_daemon(void); #else @@ -325,13 +323,8 @@ static inline void __ftrace_enabled_restore(int enabled) #ifdef CONFIG_FTRACE_MCOUNT_RECORD extern void ftrace_init(void); -extern void ftrace_init_module(struct module *mod, - unsigned long *start, unsigned long *end); #else static inline void ftrace_init(void) { } -static inline void -ftrace_init_module(struct module *mod, - unsigned long *start, unsigned long *end) { } #endif /* diff --git a/include/linux/module.h b/include/linux/module.h index 6155fa44168b..a8f2c0aa4c32 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -341,6 +341,10 @@ struct module struct ftrace_event_call *trace_events; unsigned int num_trace_events; #endif +#ifdef CONFIG_FTRACE_MCOUNT_RECORD + unsigned long *ftrace_callsites; + unsigned int num_ftrace_callsites; +#endif #ifdef CONFIG_MODULE_UNLOAD /* What modules depend on me? */ diff --git a/kernel/module.c b/kernel/module.c index a0394706f10c..2383e60fcf3f 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1490,9 +1490,6 @@ static void free_module(struct module *mod) /* Free any allocated parameters. 
*/ destroy_params(mod->kp, mod->num_kp); - /* release any pointers to mcount in this module */ - ftrace_release(mod->module_core, mod->core_size); - /* This may be NULL, but that's OK */ module_free(mod, mod->module_init); kfree(mod->args); @@ -1893,11 +1890,9 @@ static noinline struct module *load_module(void __user *umod, unsigned int symindex = 0; unsigned int strindex = 0; unsigned int modindex, versindex, infoindex, pcpuindex; - unsigned int num_mcount; struct module *mod; long err = 0; void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ - unsigned long *mseg; mm_segment_t old_fs; DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", @@ -2179,7 +2174,13 @@ static noinline struct module *load_module(void __user *umod, sizeof(*mod->trace_events), &mod->num_trace_events); #endif - +#ifdef CONFIG_FTRACE_MCOUNT_RECORD + /* sechdrs[0].sh_size is always zero */ + mod->ftrace_callsites = section_objs(hdr, sechdrs, secstrings, + "__mcount_loc", + sizeof(*mod->ftrace_callsites), + &mod->num_ftrace_callsites); +#endif #ifdef CONFIG_MODVERSIONS if ((mod->num_syms && !mod->crcs) || (mod->num_gpl_syms && !mod->gpl_crcs) @@ -2244,11 +2245,6 @@ static noinline struct module *load_module(void __user *umod, dynamic_debug_setup(debug, num_debug); } - /* sechdrs[0].sh_size is always zero */ - mseg = section_objs(hdr, sechdrs, secstrings, "__mcount_loc", - sizeof(*mseg), &num_mcount); - ftrace_init_module(mod, mseg, mseg + num_mcount); - err = module_finalize(hdr, sechdrs, mod); if (err < 0) goto cleanup; @@ -2309,7 +2305,6 @@ static noinline struct module *load_module(void __user *umod, cleanup: kobject_del(&mod->mkobj.kobj); kobject_put(&mod->mkobj.kobj); - ftrace_release(mod->module_core, mod->core_size); free_unload: module_unload_free(mod); #if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a23488988581..5b606f45b6c4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -916,30 +916,6 @@ static void ftrace_free_rec(struct dyn_ftrace *rec) rec->flags |= FTRACE_FL_FREE; } -void ftrace_release(void *start, unsigned long size) -{ - struct dyn_ftrace *rec; - struct ftrace_page *pg; - unsigned long s = (unsigned long)start; - unsigned long e = s + size; - - if (ftrace_disabled || !start) - return; - - mutex_lock(&ftrace_lock); - do_for_each_ftrace_rec(pg, rec) { - if ((rec->ip >= s) && (rec->ip < e)) { - /* - * rec->ip is changed in ftrace_free_rec() - * It should not between s and e if record was freed. - */ - FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE); - ftrace_free_rec(rec); - } - } while_for_each_ftrace_rec(); - mutex_unlock(&ftrace_lock); -} - static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) { struct dyn_ftrace *rec; @@ -2752,14 +2728,72 @@ static int ftrace_convert_nops(struct module *mod, return 0; } -void ftrace_init_module(struct module *mod, - unsigned long *start, unsigned long *end) +#ifdef CONFIG_MODULES +void ftrace_release(void *start, void *end) +{ + struct dyn_ftrace *rec; + struct ftrace_page *pg; + unsigned long s = (unsigned long)start; + unsigned long e = (unsigned long)end; + + if (ftrace_disabled || !start || start == end) + return; + + mutex_lock(&ftrace_lock); + do_for_each_ftrace_rec(pg, rec) { + if ((rec->ip >= s) && (rec->ip < e)) { + /* + * rec->ip is changed in ftrace_free_rec() + * It should not between s and e if record was freed. 
+ */ + FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE); + ftrace_free_rec(rec); + } + } while_for_each_ftrace_rec(); + mutex_unlock(&ftrace_lock); +} + +static void ftrace_init_module(struct module *mod, + unsigned long *start, unsigned long *end) { if (ftrace_disabled || start == end) return; ftrace_convert_nops(mod, start, end); } +static int ftrace_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + + switch (val) { + case MODULE_STATE_COMING: + ftrace_init_module(mod, mod->ftrace_callsites, + mod->ftrace_callsites + + mod->num_ftrace_callsites); + break; + case MODULE_STATE_GOING: + ftrace_release(mod->ftrace_callsites, + mod->ftrace_callsites + + mod->num_ftrace_callsites); + break; + } + + return 0; +} +#else +static int ftrace_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + return 0; +} +#endif /* CONFIG_MODULES */ + +struct notifier_block ftrace_module_nb = { + .notifier_call = ftrace_module_notify, + .priority = 0, +}; + extern unsigned long __start_mcount_loc[]; extern unsigned long __stop_mcount_loc[]; @@ -2791,6 +2825,10 @@ void __init ftrace_init(void) __start_mcount_loc, __stop_mcount_loc); + ret = register_module_notifier(&ftrace_module_nb); + if (!ret) + pr_warning("Failed to register trace ftrace module notifier\n"); + return; failed: ftrace_disabled = 1; -- cgit v1.2.2 From e6187007d6c365b551c69ea3df46f06fd1c8bd19 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 15 Apr 2009 13:36:40 -0400 Subject: tracing/events: add startup tests for events MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As events start to become popular, and the new way to add tracing infrastructure into ftrace, it is important to catch any problems that might happen with a mistake in the TRACE_EVENT macro. This patch introduces a startup self test on the registered trace events. Note, it can only do a generic test, any type of testing that needs more involement is needed to be implemented by the tracepoint creators. The test goes down one by one enabling a trace point and running some random tasks (random in the sense that I just made them up). Those tasks are creating threads, grabbing mutexes and spinlocks and using workqueues. After testing each event individually, it does the same test after enabling each system of trace points. Like sched, irq, lockdep. Then finally it enables all tracepoints and performs the tasks again. 
The output to the console on bootup will look like this when everything works: Running tests on trace events: Testing event kfree_skb: OK Testing event kmalloc: OK Testing event kmem_cache_alloc: OK Testing event kmalloc_node: OK Testing event kmem_cache_alloc_node: OK Testing event kfree: OK Testing event kmem_cache_free: OK Testing event irq_handler_exit: OK Testing event irq_handler_entry: OK Testing event softirq_entry: OK Testing event softirq_exit: OK Testing event lock_acquire: OK Testing event lock_release: OK Testing event sched_kthread_stop: OK Testing event sched_kthread_stop_ret: OK Testing event sched_wait_task: OK Testing event sched_wakeup: OK Testing event sched_wakeup_new: OK Testing event sched_switch: OK Testing event sched_migrate_task: OK Testing event sched_process_free: OK Testing event sched_process_exit: OK Testing event sched_process_wait: OK Testing event sched_process_fork: OK Testing event sched_signal_send: OK Running tests on trace event systems: Testing event system skb: OK Testing event system kmem: OK Testing event system irq: OK Testing event system lockdep: OK Testing event system sched: OK Running tests on all trace events: Testing all events: OK [ folded in: tracing: add #include to fix build failure in test_work() This build failure occured on a few rare configs: kernel/trace/trace_events.c: In function ‘test_work’: kernel/trace/trace_events.c:975: error: implicit declaration of function ‘udelay’ kernel/trace/trace_events.c:980: error: implicit declaration of function ‘msleep’ delay.h is included in way too many other headers, hiding cases where new usage is added without header inclusion. [ Impact: build fix ] Signed-off-by: Ingo Molnar ] [ Impact: add event tracer self-tests ] Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 178 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 178 insertions(+) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 6591d83e1e7a..f81d6eec4e43 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -8,10 +8,14 @@ * */ +#include +#include +#include #include #include #include #include +#include #include "trace_output.h" @@ -920,3 +924,177 @@ static __init int event_trace_init(void) return 0; } fs_initcall(event_trace_init); + +#ifdef CONFIG_FTRACE_STARTUP_TEST + +static DEFINE_SPINLOCK(test_spinlock); +static DEFINE_SPINLOCK(test_spinlock_irq); +static DEFINE_MUTEX(test_mutex); + +static __init void test_work(struct work_struct *dummy) +{ + spin_lock(&test_spinlock); + spin_lock_irq(&test_spinlock_irq); + udelay(1); + spin_unlock_irq(&test_spinlock_irq); + spin_unlock(&test_spinlock); + + mutex_lock(&test_mutex); + msleep(1); + mutex_unlock(&test_mutex); +} + +static __init int event_test_thread(void *unused) +{ + void *test_malloc; + + test_malloc = kmalloc(1234, GFP_KERNEL); + if (!test_malloc) + pr_info("failed to kmalloc\n"); + + schedule_on_each_cpu(test_work); + + kfree(test_malloc); + + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) + schedule(); + + return 0; +} + +/* + * Do various things that may trigger events. + */ +static __init void event_test_stuff(void) +{ + struct task_struct *test_thread; + + test_thread = kthread_run(event_test_thread, NULL, "test-events"); + msleep(1); + kthread_stop(test_thread); +} + +/* + * For every trace event defined, we will test each trace point separately, + * and then by groups, and finally all trace points. 
+ */ +static __init int event_trace_self_tests(void) +{ + struct ftrace_event_call *call; + struct event_subsystem *system; + char *sysname; + int ret; + + pr_info("Running tests on trace events:\n"); + + list_for_each_entry(call, &ftrace_events, list) { + + /* Only test those that have a regfunc */ + if (!call->regfunc) + continue; + + pr_info("Testing event %s: ", call->name); + + /* + * If an event is already enabled, someone is using + * it and the self test should not be on. + */ + if (call->enabled) { + pr_warning("Enabled event during self test!\n"); + WARN_ON_ONCE(1); + continue; + } + + call->enabled = 1; + call->regfunc(); + + event_test_stuff(); + + call->unregfunc(); + call->enabled = 0; + + pr_cont("OK\n"); + } + + /* Now test at the sub system level */ + + pr_info("Running tests on trace event systems:\n"); + + list_for_each_entry(system, &event_subsystems, list) { + + /* the ftrace system is special, skip it */ + if (strcmp(system->name, "ftrace") == 0) + continue; + + pr_info("Testing event system %s: ", system->name); + + /* ftrace_set_clr_event can modify the name passed in. */ + sysname = kstrdup(system->name, GFP_KERNEL); + if (WARN_ON(!sysname)) { + pr_warning("Can't allocate memory, giving up!\n"); + return 0; + } + ret = ftrace_set_clr_event(sysname, 1); + kfree(sysname); + if (WARN_ON_ONCE(ret)) { + pr_warning("error enabling system %s\n", + system->name); + continue; + } + + event_test_stuff(); + + sysname = kstrdup(system->name, GFP_KERNEL); + if (WARN_ON(!sysname)) { + pr_warning("Can't allocate memory, giving up!\n"); + return 0; + } + ret = ftrace_set_clr_event(sysname, 0); + kfree(sysname); + + if (WARN_ON_ONCE(ret)) + pr_warning("error disabling system %s\n", + system->name); + + pr_cont("OK\n"); + } + + /* Test with all events enabled */ + + pr_info("Running tests on all trace events:\n"); + pr_info("Testing all events: "); + + sysname = kmalloc(4, GFP_KERNEL); + if (WARN_ON(!sysname)) { + pr_warning("Can't allocate memory, giving up!\n"); + return 0; + } + memcpy(sysname, "*:*", 4); + ret = ftrace_set_clr_event(sysname, 1); + if (WARN_ON_ONCE(ret)) { + kfree(sysname); + pr_warning("error enabling all events\n"); + return 0; + } + + event_test_stuff(); + + /* reset sysname */ + memcpy(sysname, "*:*", 4); + ret = ftrace_set_clr_event(sysname, 0); + kfree(sysname); + + if (WARN_ON_ONCE(ret)) { + pr_warning("error disabling all events\n"); + return 0; + } + + pr_cont("OK\n"); + + return 0; +} + +late_initcall(event_trace_self_tests); + +#endif -- cgit v1.2.2 From d1b182a8d49ed6416325b4e0a1cb0f17cd4e702a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 15 Apr 2009 16:53:47 -0400 Subject: tracing/events/ring-buffer: expose format of ring buffer headers to users Currently, every thing needed to read the binary output from the ring buffers is available, with the exception of the way the ring buffers handles itself internally. This patch creates two special files in the debugfs/tracing/events directory: # cat /debug/tracing/events/header_page field: u64 timestamp; offset:0; size:8; field: local_t commit; offset:8; size:8; field: char data; offset:16; size:4080; # cat /debug/tracing/events/header_event type : 2 bits len : 3 bits time_delta : 27 bits array : 32 bits padding : type == 0 time_extend : type == 1 data : type == 3 This is to allow a userspace app to see if the ring buffer format changes or not. 
[ Impact: allow userspace apps to know of ringbuffer format changes ] Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 5 +++++ kernel/trace/ring_buffer.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace_events.c | 38 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 87 insertions(+) diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index f0aa486d131c..fac8f1ac6f49 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -166,6 +166,11 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data); int ring_buffer_read_page(struct ring_buffer *buffer, void **data_page, size_t len, int cpu, int full); +struct trace_seq; + +int ring_buffer_print_entry_header(struct trace_seq *s); +int ring_buffer_print_page_header(struct trace_seq *s); + enum ring_buffer_flags { RB_FL_OVERWRITE = 1 << 0, }; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f935bd5ec3e8..84a6055f37c9 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -21,6 +21,28 @@ #include "trace.h" +/* + * The ring buffer header is special. We must manually up keep it. + */ +int ring_buffer_print_entry_header(struct trace_seq *s) +{ + int ret; + + ret = trace_seq_printf(s, "\ttype : 2 bits\n"); + ret = trace_seq_printf(s, "\tlen : 3 bits\n"); + ret = trace_seq_printf(s, "\ttime_delta : 27 bits\n"); + ret = trace_seq_printf(s, "\tarray : 32 bits\n"); + ret = trace_seq_printf(s, "\n"); + ret = trace_seq_printf(s, "\tpadding : type == %d\n", + RINGBUF_TYPE_PADDING); + ret = trace_seq_printf(s, "\ttime_extend : type == %d\n", + RINGBUF_TYPE_TIME_EXTEND); + ret = trace_seq_printf(s, "\tdata : type == %d\n", + RINGBUF_TYPE_DATA); + + return ret; +} + /* * The ring buffer is made up of a list of pages. A separate list of pages is * allocated for each CPU. A writer may only write to a buffer that is @@ -340,6 +362,28 @@ static inline int test_time_stamp(u64 delta) #define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE) +int ring_buffer_print_page_header(struct trace_seq *s) +{ + struct buffer_data_page field; + int ret; + + ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t" + "offset:0;\tsize:%u;\n", + (unsigned int)sizeof(field.time_stamp)); + + ret = trace_seq_printf(s, "\tfield: local_t commit;\t" + "offset:%u;\tsize:%u;\n", + (unsigned int)offsetof(typeof(field), commit), + (unsigned int)sizeof(field.commit)); + + ret = trace_seq_printf(s, "\tfield: char data;\t" + "offset:%u;\tsize:%u;\n", + (unsigned int)offsetof(typeof(field), data), + (unsigned int)BUF_PAGE_SIZE); + + return ret; +} + /* * head_page == tail_page && head == tail then buffer is empty. 
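A consumer of these files only needs to remember what the exported description looked like and re-check it later. The fragment below is a rough user-space sketch of that idea; the /debug mount point follows the changelog above, while the cache file name and the fixed-size buffer are arbitrary choices for the example.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Read a small text file (such as events/header_page) into a heap buffer. */
static char *slurp(const char *path)
{
	char buf[4096];
	size_t n;
	FILE *f = fopen(path, "r");

	if (!f)
		return NULL;
	n = fread(buf, 1, sizeof(buf) - 1, f);
	fclose(f);
	buf[n] = '\0';
	return strdup(buf);
}

int main(void)
{
	char *now = slurp("/debug/tracing/events/header_page");
	char *old = slurp("header_page.cache");	/* copy saved by a previous run */

	if (!now)
		return 1;
	if (old && strcmp(now, old) != 0)
		fprintf(stderr, "ring buffer page format changed, re-parse the layout\n");
	free(now);
	free(old);
	return 0;
}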
*/ diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f81d6eec4e43..7163a2bb021a 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -610,6 +610,30 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, return cnt; } +static ssize_t +show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + int (*func)(struct trace_seq *s) = filp->private_data; + struct trace_seq *s; + int r; + + if (*ppos) + return 0; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + trace_seq_init(s); + + func(s); + r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); + + kfree(s); + + return r; +} + static const struct seq_operations show_event_seq_ops = { .start = t_start, .next = t_next, @@ -667,6 +691,11 @@ static const struct file_operations ftrace_subsystem_filter_fops = { .write = subsystem_filter_write, }; +static const struct file_operations ftrace_show_header_fops = { + .open = tracing_open_generic, + .read = show_header, +}; + static struct dentry *event_trace_events_dir(void) { static struct dentry *d_tracer; @@ -909,6 +938,15 @@ static __init int event_trace_init(void) if (!d_events) return 0; + /* ring buffer internal formats */ + trace_create_file("header_page", 0444, d_events, + ring_buffer_print_page_header, + &ftrace_show_header_fops); + + trace_create_file("header_event", 0444, d_events, + ring_buffer_print_entry_header, + &ftrace_show_header_fops); + for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { /* The linker may leave blanks */ if (!call->name) -- cgit v1.2.2 From 69abe6a5d18a9394baa325bab8f57748b037c517 Mon Sep 17 00:00:00 2001 From: Avadh Patel Date: Fri, 10 Apr 2009 16:04:48 -0400 Subject: tracing: add saved_cmdlines file to show cached task comms Export the cached task comms to userspace. This allows user apps to translate the pids from a trace into their respective task command lines. 
[ Impact: let userspace apps reading binary buffer know comm's of pids ] Signed-off-by: Avadh Patel [ added error checking and use of buf pointer to index file_buf ] Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2d69b26b3cc9..031c46f11bb9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2421,6 +2421,56 @@ static const struct file_operations tracing_readme_fops = { .read = tracing_readme_read, }; +static ssize_t +tracing_saved_cmdlines_read(struct file *file, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char *buf_comm; + char *file_buf; + char *buf; + int len = 0; + int pid; + int i; + + file_buf = kmalloc(SAVED_CMDLINES*(16+TASK_COMM_LEN), GFP_KERNEL); + if (!file_buf) + return -ENOMEM; + + buf_comm = kmalloc(TASK_COMM_LEN, GFP_KERNEL); + if (!buf_comm) { + kfree(file_buf); + return -ENOMEM; + } + + buf = file_buf; + + for (i = 0; i < SAVED_CMDLINES; i++) { + int r; + + pid = map_cmdline_to_pid[i]; + if (pid == -1 || pid == NO_CMDLINE_MAP) + continue; + + trace_find_cmdline(pid, buf_comm); + r = sprintf(buf, "%d %s\n", pid, buf_comm); + buf += r; + len += r; + } + + len = simple_read_from_buffer(ubuf, cnt, ppos, + file_buf, len); + + kfree(file_buf); + kfree(buf_comm); + + return len; +} + +static const struct file_operations tracing_saved_cmdlines_fops = { + .open = tracing_open_generic, + .read = tracing_saved_cmdlines_read, +}; + static ssize_t tracing_ctrl_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -3973,6 +4023,9 @@ static __init int tracer_init_debugfs(void) trace_create_file("trace_marker", 0220, d_tracer, NULL, &tracing_mark_fops); + trace_create_file("saved_cmdlines", 0444, d_tracer, + NULL, &tracing_saved_cmdlines_fops); + #ifdef CONFIG_DYNAMIC_FTRACE trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, &ftrace_update_tot_cnt, &tracing_dyn_info_fops); -- cgit v1.2.2 From 9ea21c1ecdb35ecdcac5fd9d95f62a1f6a7ffec0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 16 Apr 2009 12:15:44 -0400 Subject: tracing/events: perform function tracing in event selftests We can find some bugs in the trace events if we stress the writes as well. The function tracer is a good way to stress the events. [ Impact: extend scope of event tracer self-tests ] Signed-off-by: Steven Rostedt Cc: Andrew Morton Cc: Peter Zijlstra Cc: Frederic Weisbecker LKML-Reference: <20090416161746.604786131@goodmis.org> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events.c | 78 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 72 insertions(+), 6 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 7163a2bb021a..1137f951be42 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1017,7 +1017,7 @@ static __init void event_test_stuff(void) * For every trace event defined, we will test each trace point separately, * and then by groups, and finally all trace points. 
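Since each entry is emitted as "<pid> <comm>\n" (see the sprintf() in the hunk below), a user-space reader can be as simple as the following sketch. The program is hypothetical and the /debug/tracing path merely follows the convention used elsewhere in this series.

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/debug/tracing/saved_cmdlines", "r");
	int pid;
	char comm[17];	/* TASK_COMM_LEN is 16, plus the terminating NUL */

	if (!f) {
		perror("saved_cmdlines");
		return 1;
	}
	/* one "<pid> <comm>" pair per line, as written by tracing_saved_cmdlines_read() */
	while (fscanf(f, "%d %16s", &pid, comm) == 2)
		printf("pid %d -> %s\n", pid, comm);
	fclose(f);
	return 0;
}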
*/ -static __init int event_trace_self_tests(void) +static __init void event_trace_self_tests(void) { struct ftrace_event_call *call; struct event_subsystem *system; @@ -1071,7 +1071,7 @@ static __init int event_trace_self_tests(void) sysname = kstrdup(system->name, GFP_KERNEL); if (WARN_ON(!sysname)) { pr_warning("Can't allocate memory, giving up!\n"); - return 0; + return; } ret = ftrace_set_clr_event(sysname, 1); kfree(sysname); @@ -1086,7 +1086,7 @@ static __init int event_trace_self_tests(void) sysname = kstrdup(system->name, GFP_KERNEL); if (WARN_ON(!sysname)) { pr_warning("Can't allocate memory, giving up!\n"); - return 0; + return; } ret = ftrace_set_clr_event(sysname, 0); kfree(sysname); @@ -1106,14 +1106,14 @@ static __init int event_trace_self_tests(void) sysname = kmalloc(4, GFP_KERNEL); if (WARN_ON(!sysname)) { pr_warning("Can't allocate memory, giving up!\n"); - return 0; + return; } memcpy(sysname, "*:*", 4); ret = ftrace_set_clr_event(sysname, 1); if (WARN_ON_ONCE(ret)) { kfree(sysname); pr_warning("error enabling all events\n"); - return 0; + return; } event_test_stuff(); @@ -1125,10 +1125,76 @@ static __init int event_trace_self_tests(void) if (WARN_ON_ONCE(ret)) { pr_warning("error disabling all events\n"); - return 0; + return; } pr_cont("OK\n"); +} + +#ifdef CONFIG_FUNCTION_TRACER + +static DEFINE_PER_CPU(atomic_t, test_event_disable); + +static void +function_test_events_call(unsigned long ip, unsigned long parent_ip) +{ + struct ring_buffer_event *event; + struct ftrace_entry *entry; + unsigned long flags; + long disabled; + int resched; + int cpu; + int pc; + + pc = preempt_count(); + resched = ftrace_preempt_disable(); + cpu = raw_smp_processor_id(); + disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu)); + + if (disabled != 1) + goto out; + + local_save_flags(flags); + + event = trace_current_buffer_lock_reserve(TRACE_FN, sizeof(*entry), + flags, pc); + if (!event) + goto out; + entry = ring_buffer_event_data(event); + entry->ip = ip; + entry->parent_ip = parent_ip; + + trace_current_buffer_unlock_commit(event, flags, pc); + + out: + atomic_dec(&per_cpu(test_event_disable, cpu)); + ftrace_preempt_enable(resched); +} + +static struct ftrace_ops trace_ops __initdata = +{ + .func = function_test_events_call, +}; + +static __init void event_trace_self_test_with_function(void) +{ + register_ftrace_function(&trace_ops); + pr_info("Running tests again, along with the function tracer\n"); + event_trace_self_tests(); + unregister_ftrace_function(&trace_ops); +} +#else +static __init void event_trace_self_test_with_function(void) +{ +} +#endif + +static __init int event_trace_self_tests_init(void) +{ + + event_trace_self_tests(); + + event_trace_self_test_with_function(); return 0; } -- cgit v1.2.2 From 76aa81118ddfbb3dc31533030cf3ec329dd067a6 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 16 Apr 2009 23:35:39 -0700 Subject: tracing: avoid warnings from zero-arg tracepoints Tracepoints with no arguments can issue two warnings: "field" defined by not used "ret" is uninitialized in this function Mark field as being OK to leave unused, and initialize ret. [ Impact: fix false positive compiler warnings. 
] Signed-off-by: Jeremy Fitzhardinge Acked-by: Steven Rostedt Cc: mathieu.desnoyers@polymtl.ca LKML-Reference: <1239950139-1119-5-git-send-email-jeremy@goop.org> Signed-off-by: Ingo Molnar --- include/trace/ftrace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 60c5323bee64..39a3351f2e7f 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -160,8 +160,8 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ static int \ ftrace_format_##call(struct trace_seq *s) \ { \ - struct ftrace_raw_##call field; \ - int ret; \ + struct ftrace_raw_##call field __attribute__((unused)); \ + int ret = 0; \ \ tstruct; \ \ -- cgit v1.2.2 From 339ae5d3c3fc2025e3657637921495fd600027c7 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 17 Apr 2009 10:34:30 +0800 Subject: tracing: fix file mode of trace and README trace is read-write and README is read-only. [ Impact: fix /debug/tracing/ file permissions. ] Signed-off-by: Li Zefan Acked-by: Frederic Weisbecker Acked-by: Steven Rostedt LKML-Reference: <49E7EAB6.4070605@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 031c46f11bb9..f681f646aa07 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4002,7 +4002,7 @@ static __init int tracer_init_debugfs(void) trace_create_file("available_tracers", 0444, d_tracer, &global_trace, &show_traces_fops); - trace_create_file("current_tracer", 0444, d_tracer, + trace_create_file("current_tracer", 0644, d_tracer, &global_trace, &set_tracer_fops); trace_create_file("tracing_max_latency", 0644, d_tracer, @@ -4011,7 +4011,7 @@ static __init int tracer_init_debugfs(void) trace_create_file("tracing_thresh", 0644, d_tracer, &tracing_thresh, &tracing_max_lat_fops); - trace_create_file("README", 0644, d_tracer, + trace_create_file("README", 0444, d_tracer, NULL, &tracing_readme_fops); trace_create_file("trace_pipe", 0444, d_tracer, -- cgit v1.2.2 From 46de405f25f1d9fa73b657ffbb752aa0cc87a91d Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Fri, 17 Apr 2009 10:53:43 +0800 Subject: tracing: Remove include/trace/kmem_event_types.h kmem_event_types.h is no longer necessary since tracepoint definitions are put into include/trace/events/kmem.h [ Impact: remove now-unused file. ] Signed-off-by: Zhao Lei Acked-by: Steven Rostedt Cc: Frederic Weisbecker Cc: Tom Zanussi LKML-Reference: <49E7EF37.2080205@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- include/trace/kmem_event_types.h | 193 --------------------------------------- 1 file changed, 193 deletions(-) delete mode 100644 include/trace/kmem_event_types.h diff --git a/include/trace/kmem_event_types.h b/include/trace/kmem_event_types.h deleted file mode 100644 index 4ff420fe4675..000000000000 --- a/include/trace/kmem_event_types.h +++ /dev/null @@ -1,193 +0,0 @@ - -/* use instead */ -#ifndef TRACE_EVENT -# error Do not include this file directly. -# error Unless you know what you are doing. 
-#endif - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM kmem - -TRACE_EVENT(kmalloc, - - TP_PROTO(unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags), - - TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags), - - TP_STRUCT__entry( - __field( unsigned long, call_site ) - __field( const void *, ptr ) - __field( size_t, bytes_req ) - __field( size_t, bytes_alloc ) - __field( gfp_t, gfp_flags ) - ), - - TP_fast_assign( - __entry->call_site = call_site; - __entry->ptr = ptr; - __entry->bytes_req = bytes_req; - __entry->bytes_alloc = bytes_alloc; - __entry->gfp_flags = gfp_flags; - ), - - TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x", - __entry->call_site, - __entry->ptr, - __entry->bytes_req, - __entry->bytes_alloc, - __entry->gfp_flags) -); - -TRACE_EVENT(kmem_cache_alloc, - - TP_PROTO(unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags), - - TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags), - - TP_STRUCT__entry( - __field( unsigned long, call_site ) - __field( const void *, ptr ) - __field( size_t, bytes_req ) - __field( size_t, bytes_alloc ) - __field( gfp_t, gfp_flags ) - ), - - TP_fast_assign( - __entry->call_site = call_site; - __entry->ptr = ptr; - __entry->bytes_req = bytes_req; - __entry->bytes_alloc = bytes_alloc; - __entry->gfp_flags = gfp_flags; - ), - - TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x", - __entry->call_site, - __entry->ptr, - __entry->bytes_req, - __entry->bytes_alloc, - __entry->gfp_flags) -); - -TRACE_EVENT(kmalloc_node, - - TP_PROTO(unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node), - - TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node), - - TP_STRUCT__entry( - __field( unsigned long, call_site ) - __field( const void *, ptr ) - __field( size_t, bytes_req ) - __field( size_t, bytes_alloc ) - __field( gfp_t, gfp_flags ) - __field( int, node ) - ), - - TP_fast_assign( - __entry->call_site = call_site; - __entry->ptr = ptr; - __entry->bytes_req = bytes_req; - __entry->bytes_alloc = bytes_alloc; - __entry->gfp_flags = gfp_flags; - __entry->node = node; - ), - - TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d", - __entry->call_site, - __entry->ptr, - __entry->bytes_req, - __entry->bytes_alloc, - __entry->gfp_flags, - __entry->node) -); - -TRACE_EVENT(kmem_cache_alloc_node, - - TP_PROTO(unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node), - - TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node), - - TP_STRUCT__entry( - __field( unsigned long, call_site ) - __field( const void *, ptr ) - __field( size_t, bytes_req ) - __field( size_t, bytes_alloc ) - __field( gfp_t, gfp_flags ) - __field( int, node ) - ), - - TP_fast_assign( - __entry->call_site = call_site; - __entry->ptr = ptr; - __entry->bytes_req = bytes_req; - __entry->bytes_alloc = bytes_alloc; - __entry->gfp_flags = gfp_flags; - __entry->node = node; - ), - - TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d", - __entry->call_site, - __entry->ptr, - __entry->bytes_req, - __entry->bytes_alloc, - __entry->gfp_flags, - __entry->node) -); - -TRACE_EVENT(kfree, - - TP_PROTO(unsigned long call_site, const void *ptr), - - TP_ARGS(call_site, ptr), - - TP_STRUCT__entry( - __field( unsigned long, call_site ) - __field( 
const void *, ptr ) - ), - - TP_fast_assign( - __entry->call_site = call_site; - __entry->ptr = ptr; - ), - - TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr) -); - -TRACE_EVENT(kmem_cache_free, - - TP_PROTO(unsigned long call_site, const void *ptr), - - TP_ARGS(call_site, ptr), - - TP_STRUCT__entry( - __field( unsigned long, call_site ) - __field( const void *, ptr ) - ), - - TP_fast_assign( - __entry->call_site = call_site; - __entry->ptr = ptr; - ), - - TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr) -); - -#undef TRACE_SYSTEM -- cgit v1.2.2 From ac1adc55fc71c7515caa2eb0e63e49b3d1c6a47c Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 17 Apr 2009 00:27:08 -0500 Subject: tracing/filters: add filter_mutex to protect filter predicates This patch adds a filter_mutex to prevent the filter predicates from being accessed concurrently by various external functions. It's based on a previous patch by Li Zefan: "[PATCH 7/7] tracing/filters: make filter preds RCU safe" v2 changes: - fixed wrong value returned in a add_subsystem_pred() failure case noticed by Li Zefan. [ Impact: fix trace filter corruption/crashes on parallel access ] Signed-off-by: Tom Zanussi Reviewed-by: Li Zefan Tested-by: Li Zefan Cc: Frederic Weisbecker Cc: Steven Rostedt Cc: paulmck@linux.vnet.ibm.com LKML-Reference: <1239946028.6639.13.camel@tropicana> Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 4 +- kernel/trace/trace_events.c | 4 +- kernel/trace/trace_events_filter.c | 90 +++++++++++++++++++++++++++++--------- 3 files changed, 75 insertions(+), 23 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8817c18ef97a..247948e81b08 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -757,13 +757,15 @@ struct filter_pred { }; extern void filter_free_pred(struct filter_pred *pred); -extern void filter_print_preds(struct filter_pred **preds, int n_preds, +extern void filter_print_preds(struct ftrace_event_call *call, struct trace_seq *s); extern int filter_parse(char **pbuf, struct filter_pred *pred); extern int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred); extern void filter_disable_preds(struct ftrace_event_call *call); extern void filter_free_subsystem_preds(struct event_subsystem *system); +extern void filter_print_subsystem_preds(struct event_subsystem *system, + struct trace_seq *s); extern int filter_add_subsystem_pred(struct event_subsystem *system, struct filter_pred *pred); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 1137f951be42..64f9d6d2735b 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -488,7 +488,7 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); - filter_print_preds(call->preds, call->n_preds, s); + filter_print_preds(call, s); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); kfree(s); @@ -558,7 +558,7 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); - filter_print_preds(system->preds, system->n_preds, s); + filter_print_subsystem_preds(system, s); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); kfree(s); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index f8e5eab0424c..e0fcfd2a16d6 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -22,10 +22,13 @@ #include #include #include +#include #include "trace.h" #include "trace_output.h" +static 
DEFINE_MUTEX(filter_mutex); + static int filter_pred_64(struct filter_pred *pred, void *event) { u64 *addr = (u64 *)(event + pred->offset); @@ -112,8 +115,8 @@ int filter_match_preds(struct ftrace_event_call *call, void *rec) } EXPORT_SYMBOL_GPL(filter_match_preds); -void filter_print_preds(struct filter_pred **preds, int n_preds, - struct trace_seq *s) +static void __filter_print_preds(struct filter_pred **preds, int n_preds, + struct trace_seq *s) { char *field_name; struct filter_pred *pred; @@ -138,6 +141,21 @@ void filter_print_preds(struct filter_pred **preds, int n_preds, } } +void filter_print_preds(struct ftrace_event_call *call, struct trace_seq *s) +{ + mutex_lock(&filter_mutex); + __filter_print_preds(call->preds, call->n_preds, s); + mutex_unlock(&filter_mutex); +} + +void filter_print_subsystem_preds(struct event_subsystem *system, + struct trace_seq *s) +{ + mutex_lock(&filter_mutex); + __filter_print_preds(system->preds, system->n_preds, s); + mutex_unlock(&filter_mutex); +} + static struct ftrace_event_field * find_event_field(struct ftrace_event_call *call, char *name) { @@ -180,7 +198,7 @@ static int filter_set_pred(struct filter_pred *dest, return 0; } -void filter_disable_preds(struct ftrace_event_call *call) +static void __filter_disable_preds(struct ftrace_event_call *call) { int i; @@ -190,6 +208,13 @@ void filter_disable_preds(struct ftrace_event_call *call) call->preds[i]->fn = filter_pred_none; } +void filter_disable_preds(struct ftrace_event_call *call) +{ + mutex_lock(&filter_mutex); + __filter_disable_preds(call); + mutex_unlock(&filter_mutex); +} + int init_preds(struct ftrace_event_call *call) { struct filter_pred *pred; @@ -223,7 +248,7 @@ oom: } EXPORT_SYMBOL_GPL(init_preds); -void filter_free_subsystem_preds(struct event_subsystem *system) +static void __filter_free_subsystem_preds(struct event_subsystem *system) { struct ftrace_event_call *call; int i; @@ -241,18 +266,25 @@ void filter_free_subsystem_preds(struct event_subsystem *system) continue; if (!strcmp(call->system, system->name)) - filter_disable_preds(call); + __filter_disable_preds(call); } } -static int __filter_add_pred(struct ftrace_event_call *call, - struct filter_pred *pred, - filter_pred_fn_t fn) +void filter_free_subsystem_preds(struct event_subsystem *system) +{ + mutex_lock(&filter_mutex); + __filter_free_subsystem_preds(system); + mutex_unlock(&filter_mutex); +} + +static int filter_add_pred_fn(struct ftrace_event_call *call, + struct filter_pred *pred, + filter_pred_fn_t fn) { int idx, err; if (call->n_preds && !pred->compound) - filter_disable_preds(call); + __filter_disable_preds(call); if (call->n_preds == MAX_FILTER_PRED) return -ENOSPC; @@ -276,7 +308,8 @@ static int is_string_field(const char *type) return 0; } -int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) +static int __filter_add_pred(struct ftrace_event_call *call, + struct filter_pred *pred) { struct ftrace_event_field *field; filter_pred_fn_t fn; @@ -293,7 +326,7 @@ int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) return -EINVAL; fn = filter_pred_string; pred->str_len = field->size; - return __filter_add_pred(call, pred, fn); + return filter_add_pred_fn(call, pred, fn); } else { if (pred->str_len) return -EINVAL; @@ -316,7 +349,18 @@ int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) return -EINVAL; } - return __filter_add_pred(call, pred, fn); + return filter_add_pred_fn(call, pred, fn); +} + +int filter_add_pred(struct 
ftrace_event_call *call, struct filter_pred *pred) +{ + int err; + + mutex_lock(&filter_mutex); + err = __filter_add_pred(call, pred); + mutex_unlock(&filter_mutex); + + return err; } int filter_add_subsystem_pred(struct event_subsystem *system, @@ -324,20 +368,27 @@ int filter_add_subsystem_pred(struct event_subsystem *system, { struct ftrace_event_call *call; + mutex_lock(&filter_mutex); + if (system->n_preds && !pred->compound) - filter_free_subsystem_preds(system); + __filter_free_subsystem_preds(system); if (!system->n_preds) { system->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); - if (!system->preds) + if (!system->preds) { + mutex_unlock(&filter_mutex); return -ENOMEM; + } } - if (system->n_preds == MAX_FILTER_PRED) + if (system->n_preds == MAX_FILTER_PRED) { + mutex_unlock(&filter_mutex); return -ENOSPC; + } system->preds[system->n_preds] = pred; + system->n_preds++; list_for_each_entry(call, &ftrace_events, list) { int err; @@ -348,17 +399,16 @@ int filter_add_subsystem_pred(struct event_subsystem *system, if (strcmp(call->system, system->name)) continue; - if (!find_event_field(call, pred->field_name)) - continue; - - err = filter_add_pred(call, pred); + err = __filter_add_pred(call, pred); if (err == -ENOMEM) { system->preds[system->n_preds] = NULL; + system->n_preds--; + mutex_unlock(&filter_mutex); return err; } } - system->n_preds++; + mutex_unlock(&filter_mutex); return 0; } -- cgit v1.2.2 From b0afdc126d0515e76890f0a5f26b28501cfa298e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 17 Apr 2009 13:02:22 -0400 Subject: tracing/events: enable code with EVENT_TRACING not EVENT_TRACER The CONFIG_EVENT_TRACER is the way to turn on event tracing when no other tracing has been configured. All code to get enabled should depend on CONFIG_EVENT_TRACING. That is what is enabled when TRACING (or CONFIG_EVENT_TRACER) is selected. This patch enables the include/trace/ftrace.h file when CONFIG_EVENT_TRACING is enabled. [ Impact: fix warning in event tracer selftest ] Signed-off-by: Steven Rostedt --- include/trace/define_trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h index 18869417109c..7f1f23d601ec 100644 --- a/include/trace/define_trace.h +++ b/include/trace/define_trace.h @@ -56,7 +56,7 @@ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) -#ifdef CONFIG_EVENT_TRACER +#ifdef CONFIG_EVENT_TRACING #include #endif -- cgit v1.2.2 From 12acd473d45cf2e40de3782cb2de712e5cd4d715 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 17 Apr 2009 16:01:56 -0400 Subject: tracing: add EXPORT_SYMBOL_GPL for trace commits Not all the necessary symbols were exported to allow for tracing by modules. This patch adds them in. 
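To make that concrete, here is a hedged sketch (mine, not part of the patch) of the kind of module-side code these exports are meant to allow; it mirrors the function-tracer selftest added earlier in this series, assumes the reserve-side helper is exported as well, and omits the recursion/preemption guards a real caller would want:

    static void my_module_trace_hit(unsigned long ip, unsigned long parent_ip)
    {
            struct ring_buffer_event *event;
            struct ftrace_entry *entry;
            unsigned long flags;
            int pc = preempt_count();

            local_save_flags(flags);

            /* reserve space for one entry in the global trace buffer */
            event = trace_current_buffer_lock_reserve(TRACE_FN, sizeof(*entry),
                                                      flags, pc);
            if (!event)
                    return;

            entry = ring_buffer_event_data(event);
            entry->ip = ip;
            entry->parent_ip = parent_ip;

            /* commit helper exported by this patch */
            trace_current_buffer_unlock_commit(event, flags, pc);
    }
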
[ Impact: allow modules to commit data to the ring buffer ] Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f681f646aa07..183d788038e8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -894,18 +894,20 @@ void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, { __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1); } +EXPORT_SYMBOL(trace_current_buffer_unlock_commit); void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, unsigned long flags, int pc) { __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0); } +EXPORT_SYMBOL(trace_nowake_buffer_unlock_commit); void trace_current_buffer_discard_commit(struct ring_buffer_event *event) { ring_buffer_discard_commit(global_trace.buffer, event); } -EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); +EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit); void trace_function(struct trace_array *tr, -- cgit v1.2.2 From 261842b7c9099f56de2eb969c8ad65402d68e00e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 16 Apr 2009 21:41:52 -0400 Subject: tracing: add same level recursion detection The tracing infrastructure allows for recursion. That is, an interrupt may interrupt the act of tracing an event, and that interrupt may very well perform its own trace. This is a recursive trace, and is fine to do. The problem arises when there is a bug, and the utility doing the trace calls something that recurses back into the tracer. This recursion is not caused by an external event like an interrupt, but by code that is not expected to recurse. The result could be a lockup. This patch adds a bitmask to the task structure that keeps track of the trace recursion. To find the interrupt depth, the following algorithm is used: level = hardirq_count() + softirq_count() + in_nmi; Here, level will be the depth of interrutps and softirqs, and even handles the nmi. Then the corresponding bit is set in the recursion bitmask. If the bit was already set, we know we had a recursion at the same level and we warn about it and fail the writing to the buffer. After the data has been committed to the buffer, we clear the bit. No atomics are needed. The only races are with interrupts and they reset the bitmask before returning anywy. 
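A condensed model of the check described above; the real per-task implementation is in the ring_buffer.c hunk below, and later commits in this series refine the exact level calculation:

    /* Sketch only: one bit per interrupt level, kept in the task struct. */
    static int recursion_check_model(unsigned long *mask, int level)
    {
            if (*mask & (1UL << level))
                    return -1;      /* same-level re-entry: refuse to trace */
            *mask |= 1UL << level;  /* mark this level as currently tracing */
            return 0;
    }
    /* level 0: process context, 1: softirq, 2: hardirq, 3: NMI (roughly) */
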
[ Impact: detect same irq level trace recursion ] Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 7 +++++++ include/linux/init_task.h | 1 + include/linux/sched.h | 4 +++- kernel/trace/ring_buffer.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 53 insertions(+), 1 deletion(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 97c83e1bc589..39b95c56587e 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -488,8 +488,15 @@ static inline int test_tsk_trace_graph(struct task_struct *tsk) extern int ftrace_dump_on_oops; +#ifdef CONFIG_PREEMPT +#define INIT_TRACE_RECURSION .trace_recursion = 0, +#endif + #endif /* CONFIG_TRACING */ +#ifndef INIT_TRACE_RECURSION +#define INIT_TRACE_RECURSION +#endif #ifdef CONFIG_HW_BRANCH_TRACER diff --git a/include/linux/init_task.h b/include/linux/init_task.h index dcfb93337e9a..6fc218529863 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -187,6 +187,7 @@ extern struct cred init_cred; INIT_TRACE_IRQFLAGS \ INIT_LOCKDEP \ INIT_FTRACE_GRAPH \ + INIT_TRACE_RECURSION \ } diff --git a/include/linux/sched.h b/include/linux/sched.h index b4c38bc8049c..7ede5e490913 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1428,7 +1428,9 @@ struct task_struct { #ifdef CONFIG_TRACING /* state flags for use by tracers */ unsigned long trace; -#endif + /* bitmask of trace recursion */ + unsigned long trace_recursion; +#endif /* CONFIG_TRACING */ }; /* Future-safe accessor for struct task_struct's cpus_allowed. */ diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 84a6055f37c9..b421b0ea9112 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1481,6 +1481,40 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, return event; } +static int trace_irq_level(void) +{ + return hardirq_count() + softirq_count() + in_nmi(); +} + +static int trace_recursive_lock(void) +{ + int level; + + level = trace_irq_level(); + + if (unlikely(current->trace_recursion & (1 << level))) { + /* Disable all tracing before we do anything else */ + tracing_off_permanent(); + WARN_ON_ONCE(1); + return -1; + } + + current->trace_recursion |= 1 << level; + + return 0; +} + +static void trace_recursive_unlock(void) +{ + int level; + + level = trace_irq_level(); + + WARN_ON_ONCE(!current->trace_recursion & (1 << level)); + + current->trace_recursion &= ~(1 << level); +} + static DEFINE_PER_CPU(int, rb_need_resched); /** @@ -1514,6 +1548,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) /* If we are tracing schedule, we don't want to recurse */ resched = ftrace_preempt_disable(); + if (trace_recursive_lock()) + goto out_nocheck; + cpu = raw_smp_processor_id(); if (!cpumask_test_cpu(cpu, buffer->cpumask)) @@ -1543,6 +1580,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) return event; out: + trace_recursive_unlock(); + + out_nocheck: ftrace_preempt_enable(resched); return NULL; } @@ -1581,6 +1621,8 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, rb_commit(cpu_buffer, event); + trace_recursive_unlock(); + /* * Only the last preempt count needs to restore preemption. */ -- cgit v1.2.2 From 3189cdb31622f4e40688ce5a6fc5d940b42bc805 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 17 Apr 2009 16:13:55 -0400 Subject: tracing: protect trace_printk from recursion trace_printk can be called from any context, including NMIs. 
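For example (a hedged illustration, not from the patch), nothing stops a debugging call like the following from running in NMI context; the handler name is hypothetical:

    static int my_nmi_debug_hook(struct pt_regs *regs)
    {
            /* trace_printk() must therefore be safe in any context */
            trace_printk("nmi on cpu %d, ip %lx\n",
                         smp_processor_id(), instruction_pointer(regs));
            return 0;
    }
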
If this happens, then we must test for for recursion before grabbing any spinlocks. This patch prevents trace_printk from being called recursively. [ Impact: prevent hard lockup in lockdep event tracer ] Cc: Peter Zijlstra Cc: Frederic Weisbecker Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 183d788038e8..b9a3adce9221 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1259,6 +1259,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) struct trace_array_cpu *data; struct bprint_entry *entry; unsigned long flags; + int disable; int resched; int cpu, len = 0, size, pc; @@ -1273,7 +1274,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) cpu = raw_smp_processor_id(); data = tr->data[cpu]; - if (unlikely(atomic_read(&data->disabled))) + disable = atomic_inc_return(&data->disabled); + if (unlikely(disable != 1)) goto out; /* Lockdep uses trace_printk for lock tracing */ @@ -1301,6 +1303,7 @@ out_unlock: local_irq_restore(flags); out: + atomic_dec_return(&data->disabled); ftrace_preempt_enable(resched); unpause_graph_tracing(); @@ -1320,6 +1323,7 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) int cpu, len = 0, size, pc; struct print_entry *entry; unsigned long irq_flags; + int disable; if (tracing_disabled || tracing_selftest_running) return 0; @@ -1329,7 +1333,8 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) cpu = raw_smp_processor_id(); data = tr->data[cpu]; - if (unlikely(atomic_read(&data->disabled))) + disable = atomic_inc_return(&data->disabled); + if (unlikely(disable != 1)) goto out; pause_graph_tracing(); @@ -1357,6 +1362,7 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) raw_local_irq_restore(irq_flags); unpause_graph_tracing(); out: + atomic_dec_return(&data->disabled); preempt_enable_notrace(); return len; -- cgit v1.2.2 From 8e668b5b3455207e4540fc7ccab9ecf70142f288 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 17 Apr 2009 17:17:55 -0400 Subject: tracing: remove format attribute of inline function Due to a cut and paste error, I added the gcc attribute for printf format to the static inline stub of trace_seq_printf. This will cause a compile failure. [ Impact: fix compiler error when CONFIG_TRACING is off ] Reported-by: Ingo Molnar Signed-off-by: Steven Rostedt Cc: Andrew Morton Cc: =?ISO-8859-15?Q?Fr=E9d=E9ric_Weisbecker?= LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/trace_seq.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index 15ca2c71af13..37db9bdfbc1a 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -42,7 +42,6 @@ extern int trace_seq_path(struct trace_seq *s, struct path *path); #else /* CONFIG_TRACING */ static inline int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))) { return 0; } -- cgit v1.2.2 From e057a5e5647a1c9d0d0054fbd298bfa04b3d1cb4 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 19 Apr 2009 23:38:12 +0200 Subject: tracing/core: Add current context on tracing recursion warning In case of tracing recursion detection, we only get the stacktrace. But the current context may be very useful to debug the issue. This patch adds the softirq/hardirq/nmi context with the warning using lockdep context display to have a familiar output. 
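For illustration (values hypothetical), a recursion caught while a hard irq had interrupted softirq-context tracing would now be reported along the lines of "Tracing recursion: HC[1]:SC[1]:NMI[0]" ahead of the usual WARN_ON stack dump, matching the lockdep-style HC/SC/NMI notation.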
v2: Use printk_once() v3: drop {hardirq,softirq}_context which depend on lockdep, only keep what is part of current->trace_recursion, sufficient to debug the warning source. [ Impact: print context necessary to debug recursion ] Signed-off-by: Frederic Weisbecker --- kernel/trace/ring_buffer.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b421b0ea9112..bffde630c4e2 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1495,6 +1495,13 @@ static int trace_recursive_lock(void) if (unlikely(current->trace_recursion & (1 << level))) { /* Disable all tracing before we do anything else */ tracing_off_permanent(); + + printk_once(KERN_WARNING "Tracing recursion: " + "HC[%lu]:SC[%lu]:NMI[%lu]\n", + hardirq_count() >> HARDIRQ_SHIFT, + softirq_count() >> SOFTIRQ_SHIFT, + in_nmi()); + WARN_ON_ONCE(1); return -1; } -- cgit v1.2.2 From f3b9aae16219aaeca2dd5a9ca69f7a10faa063df Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 19 Apr 2009 23:39:33 +0200 Subject: tracing/ring-buffer: Add unlock recursion protection on discard The pair of helpers trace_recursive_lock() and trace_recursive_unlock() have been introduced recently to provide generic tracing recursion protection. They are used in a symetric way: - trace_recursive_lock() on buffer reserve - trace_recursive_unlock() on buffer commit However sometimes, we don't commit but discard on entry to the buffer, ie: in case of filter checking. Then we must also unlock the recursion protection on discard time, otherwise the tracing gets definitely deactivated and a warning is raised spuriously, such as: 111.119821] ------------[ cut here ]------------ [ 111.119829] WARNING: at kernel/trace/ring_buffer.c:1498 ring_buffer_lock_reserve+0x1b7/0x1d0() [ 111.119835] Hardware name: AMILO Li 2727 [ 111.119839] Modules linked in: [ 111.119846] Pid: 5731, comm: Xorg Tainted: G W 2.6.30-rc1 #69 [ 111.119851] Call Trace: [ 111.119863] [] warn_slowpath+0xd8/0x130 [ 111.119873] [] ? __lock_acquire+0x19f/0x1ae0 [ 111.119882] [] ? __lock_acquire+0x19f/0x1ae0 [ 111.119891] [] ? native_sched_clock+0x20/0x70 [ 111.119899] [] ? put_lock_stats+0xe/0x30 [ 111.119906] [] ? lock_release_holdtime+0xa8/0x150 [ 111.119913] [] ring_buffer_lock_reserve+0x1b7/0x1d0 [ 111.119921] [] trace_buffer_lock_reserve+0x30/0x70 [ 111.119930] [] trace_current_buffer_lock_reserve+0x20/0x30 [ 111.119939] [] ftrace_raw_event_sched_switch+0x58/0x100 [ 111.119948] [] __schedule+0x3a7/0x4cd [ 111.119957] [] ? ftrace_call+0x5/0x2b [ 111.119964] [] ? ftrace_call+0x5/0x2b [ 111.119971] [] schedule+0x18/0x40 [ 111.119977] [] preempt_schedule+0x39/0x60 [ 111.119985] [] _read_unlock+0x53/0x60 [ 111.119993] [] sock_def_readable+0x72/0x80 [ 111.120002] [] unix_stream_sendmsg+0x24d/0x3d0 [ 111.120011] [] sock_aio_write+0x143/0x160 [ 111.120019] [] ? ftrace_call+0x5/0x2b [ 111.120026] [] ? sock_aio_write+0x0/0x160 [ 111.120033] [] ? sock_aio_write+0x0/0x160 [ 111.120042] [] do_sync_readv_writev+0xf3/0x140 [ 111.120049] [] ? ftrace_call+0x5/0x2b [ 111.120057] [] ? autoremove_wake_function+0x0/0x40 [ 111.120067] [] ? cap_file_permission+0x9/0x10 [ 111.120074] [] ? security_file_permission+0x16/0x20 [ 111.120082] [] do_readv_writev+0xd4/0x1f0 [ 111.120089] [] ? ftrace_call+0x5/0x2b [ 111.120097] [] ? 
ftrace_call+0x5/0x2b [ 111.120105] [] vfs_writev+0x48/0x70 [ 111.120111] [] sys_writev+0x55/0xc0 [ 111.120119] [] system_call_fastpath+0x16/0x1b [ 111.120125] ---[ end trace 15605f4e98d5ccb5 ]--- [ Impact: fix spurious warning triggering tracing shutdown ] Signed-off-by: Frederic Weisbecker --- kernel/trace/ring_buffer.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bffde630c4e2..e145969a8eda 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1642,6 +1642,14 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); +static inline void rb_event_discard(struct ring_buffer_event *event) +{ + event->type = RINGBUF_TYPE_PADDING; + /* time delta must be non zero */ + if (!event->time_delta) + event->time_delta = 1; +} + /** * ring_buffer_event_discard - discard any event in the ring buffer * @event: the event to discard @@ -1656,10 +1664,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); */ void ring_buffer_event_discard(struct ring_buffer_event *event) { - event->type = RINGBUF_TYPE_PADDING; - /* time delta must be non zero */ - if (!event->time_delta) - event->time_delta = 1; + rb_event_discard(event); + trace_recursive_unlock(); } EXPORT_SYMBOL_GPL(ring_buffer_event_discard); @@ -1690,7 +1696,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, int cpu; /* The event is discarded regardless */ - ring_buffer_event_discard(event); + rb_event_discard(event); /* * This must only be called if the event has not been @@ -1735,6 +1741,8 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, if (rb_is_commit(cpu_buffer, event)) rb_set_commit_to_write(cpu_buffer); + trace_recursive_unlock(); + /* * Only the last preempt count needs to restore preemption. */ -- cgit v1.2.2 From 9ae5b8790037d05d32746f521af146c32089bfec Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Apr 2009 10:27:58 -0400 Subject: tracing: change branch profiling to a choice selection This patch makes the branch profiling into a choice selection: None - no branch profiling likely/unlikely - only profile likely/unlikely branches all - profile all branches The all profiler will also enable the likely/unlikely branches. This does not change the way the profiler works or the dependencies between the profilers. What this patch does, is keep the branch profiling from being selected by an allyesconfig make. The branch profiler is very intrusive and it is known to break various architecture builds when selected as an allyesconfig. [ Impact: prevent branch profiler from being selected in allyesconfig ] Reported-by: Heiko Carstens Reported-by: Al Viro Reported-by: Stephen Rothwell Reported-by: Andrew Morton Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 39 +++++++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 57981d338d1f..3ee28db69be6 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -212,8 +212,36 @@ config BOOT_TRACER to enable this on bootup. config TRACE_BRANCH_PROFILING - bool "Trace likely/unlikely profiler" + bool select TRACING + +choice + prompt "Branch Profiling" + default BRANCH_PROFILE_NONE + help + The branch profiling is a software profiler. It will add hooks + into the C conditionals to test which path a branch takes. 
+ + The likely/unlikely profiler only looks at the conditions that + are annotated with a likely or unlikely macro. + + The "all branch" profiler will profile every if statement in the + kernel. This profiler will also enable the likely/unlikely + profiler as well. + + Either of the above profilers add a bit of overhead to the system. + If unsure choose "No branch profiling". + +config BRANCH_PROFILE_NONE + bool "No branch profiling" + help + No branch profiling. Branch profiling adds a bit of overhead. + Only enable it if you want to analyse the branching behavior. + Otherwise keep it disabled. + +config PROFILE_ANNOTATED_BRANCHES + bool "Trace likely/unlikely profiler" + select TRACE_BRANCH_PROFILING help This tracer profiles all the the likely and unlikely macros in the kernel. It will display the results in: @@ -223,11 +251,9 @@ config TRACE_BRANCH_PROFILING Note: this will add a significant overhead, only turn this on if you need to profile the system's use of these macros. - Say N if unsure. - config PROFILE_ALL_BRANCHES bool "Profile all if conditionals" - depends on TRACE_BRANCH_PROFILING + select TRACE_BRANCH_PROFILING help This tracer profiles all branch conditions. Every if () taken in the kernel is recorded whether it hit or miss. @@ -235,11 +261,12 @@ config PROFILE_ALL_BRANCHES /debugfs/tracing/profile_branch + This option also enables the likely/unlikely profiler. + This configuration, when enabled, will impose a great overhead on the system. This should only be enabled when the system is to be analyzed - - Say N if unsure. +endchoice config TRACING_BRANCHES bool -- cgit v1.2.2 From 4ed9f0716e46bb9646f26e73f4a1b5b24db7947a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Apr 2009 10:47:36 -0400 Subject: tracing: create menuconfig for tracing infrastructure During testing we often use randconfig to test various kernels. The current configuration set up does not give an easy way to disable all tracing with a single config. The case where randconfig would test all tracing disabled is very unlikely. This patch adds a config option to enable or disable all tracing. It is hooked into the tracing menu just like other submenus are done. [ Impact: allow randconfig to easily produce all traces disabled ] Reported-by: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 3ee28db69be6..3fa36d2bc290 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -77,7 +77,12 @@ config TRACING_SUPPORT if TRACING_SUPPORT -menu "Tracers" +menuconfig FTRACE + bool "Tracers" + help + Enable the kernel tracing infrastructure. + +if FTRACE config FUNCTION_TRACER bool "Kernel Function Tracer" @@ -462,7 +467,7 @@ config MMIOTRACE_TEST Say N, unless you absolutely know what you are doing. -endmenu +endif # FTRACE endif # TRACING_SUPPORT -- cgit v1.2.2 From a7abe97fd8e7a6ccabba5a04a9f17be9211d418c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Apr 2009 10:59:34 -0400 Subject: tracing: rename EVENT_TRACER config to ENABLE_EVENT_TRACING Currently we have two configs: EVENT_TRACING and EVENT_TRACER. All tracers enable EVENT_TRACING. The EVENT_TRACER is only a convenience to enable the EVENT_TRACING when no other tracers are enabled. The names EVENT_TRACER and EVENT_TRACING are too similar and confusing. 
This patch renames EVENT_TRACER to ENABLE_EVENT_TRACING to be more appropriate to what it actually does, as well as add a comment in the help menu to explain the option's purpose. [ Impact: rename config option to reduce confusion ] Reported-by: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 3fa36d2bc290..450d3c2cfbd2 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -183,7 +183,7 @@ config CONTEXT_SWITCH_TRACER This tracer gets called from the context switch and records all switching of tasks. -config EVENT_TRACER +config ENABLE_EVENT_TRACING bool "Trace various events in the kernel" select TRACING help @@ -191,6 +191,10 @@ config EVENT_TRACER allowing the user to pick and choose which trace point they want to trace. + Note, all tracers enable event tracing. This option is + only a convenience to enable event tracing when no other + tracers are selected. + config FTRACE_SYSCALLS bool "Trace syscalls" depends on HAVE_FTRACE_SYSCALLS -- cgit v1.2.2 From 28d20e2d6e94434827e11c310788b87204b84559 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Apr 2009 12:12:44 -0400 Subject: tracing/events: call the correct event trace selftest init function The late_initcall calls a helper function instead of the proper init event selftest function. This update may have been lost due to conflicting merges. [ Impact: fix compiler warning and call extended event trace self tests ] Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 64f9d6d2735b..98daf5dc74a6 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1199,6 +1199,6 @@ static __init int event_trace_self_tests_init(void) return 0; } -late_initcall(event_trace_self_tests); +late_initcall(event_trace_self_tests_init); #endif -- cgit v1.2.2 From 23de29de2d8b227943be191d59fb6d983996d55e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Apr 2009 12:59:29 -0400 Subject: tracing: remove dangling semicolon Due to a cut and paste error, the trace_seq_putc had a semicolon after the prototype but before the stub function when tracing is disabled. [Impact: fix compile error ] Signed-off-by: Steven Rostedt --- include/linux/trace_seq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index 37db9bdfbc1a..ba9627f00d3f 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -63,7 +63,7 @@ static inline int trace_seq_puts(struct trace_seq *s, const char *str) { return 0; } -static inline int trace_seq_putc(struct trace_seq *s, unsigned char c); +static inline int trace_seq_putc(struct trace_seq *s, unsigned char c) { return 0; } -- cgit v1.2.2 From 17487bfeb6cfb05920e6a9d5a54f345f2917b4e7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Apr 2009 13:24:21 -0400 Subject: tracing: fix recursive test level calculation The recursive tests to detect same level recursion in the ring buffers did not account for the hard/softirq_counts to be shifted. Thus the numbers could be larger than then mask to be tested. This patch includes the shift for the calculation of the irq depth. 
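A worked example of the failure mode, assuming the usual preempt_count layout (HARDIRQ_SHIFT is 16 on this kernel; shown for illustration only):

    /*
     * With one hard irq in progress:
     *   hardirq_count() == 1 << HARDIRQ_SHIFT == 0x10000
     *   old: level = 0x10000            -> "1 << level" shifts far past the mask
     *   new: level = 0x10000 >> 16 == 1 -> bit 1, as intended
     */
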
[ Impact: stop false positives in trace recursion detection ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index e145969a8eda..aa40ae92233b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1483,7 +1483,9 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, static int trace_irq_level(void) { - return hardirq_count() + softirq_count() + in_nmi(); + return (hardirq_count() >> HARDIRQ_SHIFT) + + (softirq_count() >> + SOFTIRQ_SHIFT) + + !!in_nmi(); } static int trace_recursive_lock(void) -- cgit v1.2.2 From e395898e98119085f666febbc7b631dd69bc637f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Apr 2009 13:32:44 -0400 Subject: tracing: remove recursive test from ring_buffer_event_discard The ring_buffer_event_discard is not tied to ring_buffer_lock_reserve. It can be called inside or outside the reserve/commit. Even if it is called inside the reserve/commit the commit part must also be called. Only ring_buffer_discard_commit can be used as a replacement for ring_buffer_unlock_commit. This patch removes the trace_recursive_unlock from ring_buffer_event_discard since it would be the wrong place to do so. [Impact: prevent breakage in trace recursive testing ] Cc: Frederic Weisbecker Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index aa40ae92233b..a6997670cc46 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1667,7 +1667,6 @@ static inline void rb_event_discard(struct ring_buffer_event *event) void ring_buffer_event_discard(struct ring_buffer_event *event) { rb_event_discard(event); - trace_recursive_unlock(); } EXPORT_SYMBOL_GPL(ring_buffer_event_discard); -- cgit v1.2.2 From aa18efb2a2f07e1cf062039848e9d369bb358724 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Apr 2009 16:16:11 -0400 Subject: tracing: use recursive counter over irq level Althought using the irq level (hardirq_count, softirq_count and in_nmi) was nice to detect bad recursion right away, but since the counters are not atomically updated with respect to the interrupts, the function tracer might trigger the test from an interrupt handler before the hardirq_count is updated. This will trigger a false warning. This patch converts the recursive detection to a simple counter. If the depth is greater than 16 then the recursive detection will trigger. 16 is more than enough for any nested interrupts. 
[ Impact: fix false positive trace recursion detection ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 45 ++++++++++++++++----------------------------- 1 file changed, 16 insertions(+), 29 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a6997670cc46..7bcfd3e60537 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1481,47 +1481,34 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, return event; } -static int trace_irq_level(void) -{ - return (hardirq_count() >> HARDIRQ_SHIFT) + - (softirq_count() >> + SOFTIRQ_SHIFT) + - !!in_nmi(); -} +#define TRACE_RECURSIVE_DEPTH 16 static int trace_recursive_lock(void) { - int level; - - level = trace_irq_level(); + current->trace_recursion++; - if (unlikely(current->trace_recursion & (1 << level))) { - /* Disable all tracing before we do anything else */ - tracing_off_permanent(); + if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH)) + return 0; - printk_once(KERN_WARNING "Tracing recursion: " - "HC[%lu]:SC[%lu]:NMI[%lu]\n", - hardirq_count() >> HARDIRQ_SHIFT, - softirq_count() >> SOFTIRQ_SHIFT, - in_nmi()); + /* Disable all tracing before we do anything else */ + tracing_off_permanent(); - WARN_ON_ONCE(1); - return -1; - } + printk_once(KERN_WARNING "Tracing recursion: depth[%d]:" + "HC[%lu]:SC[%lu]:NMI[%lu]\n", + current->trace_recursion, + hardirq_count() >> HARDIRQ_SHIFT, + softirq_count() >> SOFTIRQ_SHIFT, + in_nmi()); - current->trace_recursion |= 1 << level; - - return 0; + WARN_ON_ONCE(1); + return -1; } static void trace_recursive_unlock(void) { - int level; - - level = trace_irq_level(); - - WARN_ON_ONCE(!current->trace_recursion & (1 << level)); + WARN_ON_ONCE(!current->trace_recursion); - current->trace_recursion &= ~(1 << level); + current->trace_recursion--; } static DEFINE_PER_CPU(int, rb_need_resched); -- cgit v1.2.2 From cb4764a6dbffd9bb3cf759421ae82384071a933d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Apr 2009 18:16:44 -0400 Subject: tracing: use nowakeup version of commit for function event trace tests The startup tests for the event tracer also runs with the function tracer enabled. The "wakeup" version of the trace commit was used which can grab spinlocks. If a task was preempted by an NMI that called a function being traced, it could deadlock due to the function tracer trying to grab the same lock. Thanks to Frederic Weisbecker for pointing out where the bug was. Reported-by: Ingo Molnar Reported-by: Frederic Weisbecker Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 98daf5dc74a6..672b195f86c9 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1164,7 +1164,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) entry->ip = ip; entry->parent_ip = parent_ip; - trace_current_buffer_unlock_commit(event, flags, pc); + trace_nowake_buffer_unlock_commit(event, flags, pc); out: atomic_dec(&per_cpu(test_event_disable, cpu)); -- cgit v1.2.2 From e8082f3f5a17d7a7bfc7dd1050a3f958dc034e9a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 21 Apr 2009 17:11:46 +0800 Subject: tracing/filters: don't remove old filters when failed to write subsys->filter If writing subsys->filter returns EINVAL or ENOSPC, the original filters in subsys/ and subsys/events/ will be removed. This is definitely wrong. 
[ Impact: fix filter setting semantics on error condition ] Signed-off-by: Li Zefan Cc: Tom Zanussi Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <49ED8DD2.2070700@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 672b195f86c9..9ea55a7dfdec 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -600,7 +600,6 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, err = filter_add_subsystem_pred(system, pred); if (err < 0) { - filter_free_subsystem_preds(system); filter_free_pred(pred); return err; } -- cgit v1.2.2 From f66578a7637b87810cbb9041c4e3a77fd2fa4706 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 21 Apr 2009 17:12:11 +0800 Subject: tracing/filters: allow user-input to be integer-like string Suppose we would like to trace all tasks named '123', but this will fail: # echo 'parent_comm == 123' > events/sched/sched_process_fork/filter bash: echo: write error: Invalid argument Don't guess the type of the filter pred in filter_parse(), but instead we check it in __filter_add_pred(). [ Impact: extend allowed filter field string values ] Signed-off-by: Li Zefan Cc: Tom Zanussi Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <49ED8DEB.6000700@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events_filter.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index e0fcfd2a16d6..65418288f957 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -313,6 +313,7 @@ static int __filter_add_pred(struct ftrace_event_call *call, { struct ftrace_event_field *field; filter_pred_fn_t fn; + unsigned long long val; field = find_event_field(call, pred->field_name); if (!field) @@ -322,14 +323,13 @@ static int __filter_add_pred(struct ftrace_event_call *call, pred->offset = field->offset; if (is_string_field(field->type)) { - if (!pred->str_len) - return -EINVAL; fn = filter_pred_string; pred->str_len = field->size; return filter_add_pred_fn(call, pred, fn); } else { - if (pred->str_len) + if (strict_strtoull(pred->str_val, 0, &val)) return -EINVAL; + pred->val = val; } switch (field->size) { @@ -413,12 +413,16 @@ int filter_add_subsystem_pred(struct event_subsystem *system, return 0; } +/* + * The filter format can be + * - 0, which means remove all filter preds + * - [||/&&] ==/!= + */ int filter_parse(char **pbuf, struct filter_pred *pred) { - char *tmp, *tok, *val_str = NULL; + char *tok, *val_str = NULL; int tok_n = 0; - /* field ==/!= number, or/and field ==/!= number, number */ while ((tok = strsep(pbuf, " \n"))) { if (tok_n == 0) { if (!strcmp(tok, "0")) { @@ -478,19 +482,13 @@ int filter_parse(char **pbuf, struct filter_pred *pred) return -EINVAL; } + strcpy(pred->str_val, val_str); + pred->str_len = strlen(val_str); + pred->field_name = kstrdup(pred->field_name, GFP_KERNEL); if (!pred->field_name) return -ENOMEM; - pred->str_len = 0; - pred->val = simple_strtoull(val_str, &tmp, 0); - if (tmp == val_str) { - strncpy(pred->str_val, val_str, MAX_FILTER_STR_VAL); - pred->str_len = strlen(val_str); - pred->str_val[pred->str_len] = '\0'; - } else if (*tmp != '\0') - return -EINVAL; - return 0; } -- cgit v1.2.2 From 3554228d4289098a8fe5cfd87512ec32a19bbe5a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 21 Apr 2009 
09:41:26 -0400 Subject: ring-buffer: only warn on wrap if buffer is bigger than two pages On boot up, to save memory, ftrace allocates the minimum buffer which is two pages. Ftrace also goes through a series of tests (when configured) on boot up. These tests can fill up a page within a single interrupt. The ring buffer also has a WARN_ON when it detects that the buffer was completely filled within a single commit (other commits are allowed to be nested). Combine the small buffer on start up, with the tests that can fill more than a single page within an interrupt, this can trigger the WARN_ON. This patch makes the WARN_ON only happen when the ring buffer consists of more than two pages. [ Impact: prevent false WARN_ON in ftrace startup tests ] Reported-by: Ingo Molnar LKML-Reference: <20090421094616.GA14561@elte.hu> Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7bcfd3e60537..61dbdf21cd32 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1241,7 +1241,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, * about it. */ if (unlikely(next_page == commit_page)) { - WARN_ON_ONCE(1); + /* This can easily happen on small ring buffers */ + WARN_ON_ONCE(buffer->pages > 2); goto out_reset; } -- cgit v1.2.2 From 7a4f453b6d7379a7c380825949977c5a838aa012 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 22 Apr 2009 16:53:34 +0800 Subject: tracing/events: make struct trace_entry->type to be int type struct trace_entry->type is unsigned char, while trace event's id is int type, thus for a event with id >= 256, it's entry->type is cast to (id % 256), and then we can't see the trace output of this event. 
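The truncation in one line, which is what produces the "Unknown type 0" entries in the session reproduced below:

    unsigned char type = 256;       /* stored as 256 % 256 == 0 */
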
# insmod trace-events-sample.ko # echo foo_bar > /mnt/tracing/set_event # cat /debug/tracing/events/trace-events-sample/foo_bar/id 256 # cat /mnt/tracing/trace_pipe <...>-3548 [001] 215.091142: Unknown type 0 <...>-3548 [001] 216.089207: Unknown type 0 <...>-3548 [001] 217.087271: Unknown type 0 <...>-3548 [001] 218.085332: Unknown type 0 [ Impact: fix output for trace events with id >= 256 ] Signed-off-by: Li Zefan Acked-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Tom Zanussi LKML-Reference: <49EEDB0E.5070207@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 4 ++-- include/trace/ftrace.h | 2 +- kernel/trace/trace.c | 4 ++-- kernel/trace/trace.h | 2 +- kernel/trace/trace_events.c | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 75f3ac01a87c..2a4a40749911 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -16,7 +16,7 @@ struct dentry; * bash-15816 [01] 235.197585: idle_cpu <- irq_enter */ struct trace_entry { - unsigned char type; + int type; unsigned char flags; unsigned char preempt_count; int pid; @@ -73,7 +73,7 @@ enum print_line_t { struct ring_buffer_event * -trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, +trace_current_buffer_lock_reserve(int type, unsigned long len, unsigned long flags, int pc); void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, unsigned long flags, int pc); diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 39a3351f2e7f..15ef08d9add1 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -198,7 +198,7 @@ ftrace_define_fields_##call(void) \ struct ftrace_event_call *event_call = &event_##call; \ int ret; \ \ - __common_field(unsigned char, type); \ + __common_field(int, type); \ __common_field(unsigned char, flags); \ __common_field(unsigned char, preempt_count); \ __common_field(int, pid); \ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b9a3adce9221..b6183bc9ecae 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -838,7 +838,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, } struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, - unsigned char type, + int type, unsigned long len, unsigned long flags, int pc) { @@ -881,7 +881,7 @@ void trace_buffer_unlock_commit(struct trace_array *tr, } struct ring_buffer_event * -trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, +trace_current_buffer_lock_reserve(int type, unsigned long len, unsigned long flags, int pc) { return trace_buffer_lock_reserve(&global_trace, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 247948e81b08..7d55bcf50e49 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -422,7 +422,7 @@ void init_tracer_sysprof_debugfs(struct dentry *d_tracer); struct ring_buffer_event; struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, - unsigned char type, + int type, unsigned long len, unsigned long flags, int pc); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 9ea55a7dfdec..5d6e879cf875 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -398,7 +398,7 @@ static int trace_write_header(struct trace_seq *s) "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" "\n", - FIELD(unsigned char, type), + FIELD(int, type), FIELD(unsigned char, flags), FIELD(unsigned 
char, preempt_count), FIELD(int, pid), -- cgit v1.2.2 From 9cbf117662e24c6d33245666804487f92c21b59d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 19 Apr 2009 04:51:29 +0200 Subject: tracing/events: provide string with undefined size support This patch provides the support for dynamic size strings on event tracing. The key concept is to use a structure with an ending char array field of undefined size and use such ability to allocate the minimal size on the ring buffer to make one or more string entries fit inside, as opposite to a fixed length strings with upper bound. The strings themselves are represented using fields which have an offset value from the beginning of the entry. This patch provides three new macros: __string(item, src) This one declares a string to the structure inside TP_STRUCT__entry. You need to provide the name of the string field and the source that will be copied inside. This will also add the dynamic size of the string needed for the ring buffer entry allocation. A stack allocated structure is used to temporarily store the offset of each strings, avoiding double calls to strlen() on each event insertion. __get_str(field) This one will give you a pointer to the string you have created. This is an abstract helper to resolve the absolute address given the field name which is a relative address from the beginning of the trace_structure. __assign_str(dst, src) Use this macro to automatically perform the string copy from src to dst. src must be a variable to assign and dst is the name of a __string field. Example on how to use it: TRACE_EVENT(my_event, TP_PROTO(char *src1, char *src2), TP_ARGS(src1, src2), TP_STRUCT__entry( __string(str1, src1) __string(str2, src2) ), TP_fast_assign( __assign_str(str1, src1); __assign_str(str2, src2); ), TP_printk("%s %s", __get_str(src1), __get_str(src2)) ) Of course you can mix-up any __field or __array inside this TRACE_EVENT. The position of the __string or __assign_str doesn't matter. Changes in v2: Address the suggestion of Steven Rostedt: drop the opening_string() macro and redefine __ending_string() to get the size of the string to be copied instead of overwritting the whole ring buffer allocation. Changes in v3: Address other suggestions of Steven Rostedt and Peter Zijlstra with some changes: drop the __ending_string and the need to have only one string field. Use offsets instead of absolute addresses. [ Impact: allow more compact memory usage for string tracing ] Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Li Zefan Cc: Peter Zijlstra --- include/trace/ftrace.h | 88 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 85 insertions(+), 3 deletions(-) diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 15ef08d9add1..5a7d18c43634 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -27,6 +27,9 @@ #undef __field #define __field(type, item) type item; +#undef __string +#define __string(item, src) int __str_loc_##item; + #undef TP_STRUCT__entry #define TP_STRUCT__entry(args...) args @@ -35,14 +38,53 @@ struct ftrace_raw_##name { \ struct trace_entry ent; \ tstruct \ + char __str_data[0]; \ }; \ static struct ftrace_event_call event_##name #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) + /* * Stage 2 of the trace events. * + * Include the following: + * + * struct ftrace_str_offsets_ { + * int ; + * int ; + * [...] 
+ * }; + * + * The __string() macro will create each int , this is to + * keep the offset of each string from the beggining of the event + * once we perform the strlen() of the src strings. + * + */ + +#undef TRACE_FORMAT +#define TRACE_FORMAT(call, proto, args, fmt) + +#undef __array +#define __array(type, item, len) + +#undef __field +#define __field(type, item); + +#undef __string +#define __string(item, src) int item; + +#undef TRACE_EVENT +#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ + struct ftrace_str_offsets_##call { \ + tstruct; \ + }; + +#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) + +/* + * Stage 3 of the trace events. + * * Override the macros in to include the following: * * enum print_line_t @@ -80,6 +122,9 @@ #undef TP_printk #define TP_printk(fmt, args...) fmt "\n", args +#undef __get_str +#define __get_str(field) (char *)__entry + __entry->__str_loc_##field + #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ enum print_line_t \ @@ -146,6 +191,16 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ if (!ret) \ return 0; +#undef __string +#define __string(item, src) \ + ret = trace_seq_printf(s, "\tfield: __str_loc " #item ";\t" \ + "offset:%u;tsize:%u;\n", \ + (unsigned int)offsetof(typeof(field), \ + __str_loc_##item), \ + (unsigned int)sizeof(field.__str_loc_##item)); \ + if (!ret) \ + return 0; + #undef __entry #define __entry REC @@ -189,6 +244,12 @@ ftrace_format_##call(struct trace_seq *s) \ if (ret) \ return ret; +#undef __string +#define __string(item, src) \ + ret = trace_define_field(event_call, "__str_loc", #item, \ + offsetof(typeof(field), __str_loc_##item), \ + sizeof(field.__str_loc_##item)); + #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, func, print) \ int \ @@ -212,7 +273,7 @@ ftrace_define_fields_##call(void) \ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) /* - * Stage 3 of the trace events. + * Stage 4 of the trace events. 
* * Override the macros in to include the following: * @@ -409,6 +470,23 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #undef __entry #define __entry entry +#undef __field +#define __field(type, item) + +#undef __array +#define __array(type, item, len) + +#undef __string +#define __string(item, src) \ + __str_offsets.item = __str_size + \ + offsetof(typeof(*entry), __str_data); \ + __str_size += strlen(src) + 1; + +#undef __assign_str +#define __assign_str(dst, src) \ + __entry->__str_loc_##dst = __str_offsets.dst; \ + strcpy(__get_str(dst), src); + #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ _TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \ @@ -417,18 +495,22 @@ static struct ftrace_event_call event_##call; \ \ static void ftrace_raw_event_##call(proto) \ { \ + struct ftrace_str_offsets_##call __maybe_unused __str_offsets; \ struct ftrace_event_call *call = &event_##call; \ struct ring_buffer_event *event; \ struct ftrace_raw_##call *entry; \ unsigned long irq_flags; \ + int __str_size = 0; \ int pc; \ \ local_save_flags(irq_flags); \ pc = preempt_count(); \ \ + tstruct; \ + \ event = trace_current_buffer_lock_reserve(event_##call.id, \ - sizeof(struct ftrace_raw_##call), \ - irq_flags, pc); \ + sizeof(struct ftrace_raw_##call) + __str_size,\ + irq_flags, pc); \ if (!event) \ return; \ entry = ring_buffer_event_data(event); \ -- cgit v1.2.2 From 7e7ca9a22dbbc5c91763cd16923c7509918709b6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 19 Apr 2009 04:54:49 +0200 Subject: tracing/lock: provide lock_acquired event support for dynamic size string Now that we can support the dynamic sized string, make the lock tracing able to use it, making it safe against modules removal and consuming the right amount of memory needed for each lock name Changes in v2: adapt to the __ending_string() updates and the opening_string() removal. [ Impact: protect lock tracer against module removal ] Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Steven Rostedt --- include/trace/events/lockdep.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/trace/events/lockdep.h b/include/trace/events/lockdep.h index 45e326b5c7f3..3ca315c1429d 100644 --- a/include/trace/events/lockdep.h +++ b/include/trace/events/lockdep.h @@ -38,16 +38,16 @@ TRACE_EVENT(lock_acquired, TP_ARGS(lock, ip, waittime), TP_STRUCT__entry( - __field(const char *, name) + __string(name, lock->name) __field(unsigned long, wait_usec) __field(unsigned long, wait_nsec_rem) ), TP_fast_assign( - __entry->name = lock->name; + __assign_str(name, lock->name); __entry->wait_nsec_rem = do_div(waittime, NSEC_PER_USEC); __entry->wait_usec = (unsigned long) waittime; ), - TP_printk("%s (%lu.%03lu us)", __entry->name, __entry->wait_usec, + TP_printk("%s (%lu.%03lu us)", __get_str(name), __entry->wait_usec, __entry->wait_nsec_rem) ); -- cgit v1.2.2 From 6a74aa40907757ec98d8710ff66cd4cfe064e7d8 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 22 Apr 2009 00:41:09 +0200 Subject: tracing/events: protect __get_str() The __get_str() macro is used in a code part then its content should be protected with parenthesis. 
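A small user-space sketch of why the expansion needs the outer parentheses (struct layout and names here are only illustrative, not the generated trace entry):

#include <stdio.h>
#include <stddef.h>
#include <string.h>

struct entry {
	int str_loc_name;	/* string offset, counted from &entry */
	char str_data[32];	/* packed string payload */
};

/* unparenthesized, as before this fix */
#define GET_STR_BAD(e, f)	(char *)(e) + (e)->str_loc_##f
/* parenthesized, as after this fix */
#define GET_STR_GOOD(e, f)	((char *)(e) + (e)->str_loc_##f)

int main(void)
{
	struct entry e = { .str_loc_name = offsetof(struct entry, str_data) };

	strcpy(e.str_data, "lock_acquired");

	/* fine: the whole pointer expression is grouped before [0] applies */
	printf("%c\n", GET_STR_GOOD(&e, name)[0]);

	/*
	 * GET_STR_BAD(&e, name)[0] would expand to
	 *	(char *)(&e) + (&e)->str_loc_name[0]
	 * which tries to subscript the int offset and does not compile.
	 */
	return 0;
}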
[ Impact: make macro definition more robust ] Reported-by: Steven Rostedt Signed-off-by: Frederic Weisbecker --- include/trace/ftrace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 5a7d18c43634..a77f71a46dbe 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -123,7 +123,7 @@ #define TP_printk(fmt, args...) fmt "\n", args #undef __get_str -#define __get_str(field) (char *)__entry + __entry->__str_loc_##field +#define __get_str(field) ((char *)__entry + __entry->__str_loc_##field) #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ -- cgit v1.2.2 From 9be24414aad047dcf9d8d2a9a929321536c7ebec Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 26 Mar 2009 10:25:24 -0400 Subject: tracing/wakeup: move access to wakeup_cpu into spinlock The code had the following outside the lock: if (next != wakeup_task) return; pc = preempt_count(); /* The task we are waiting for is waking up */ data = wakeup_trace->data[wakeup_cpu]; On initialization, wakeup_task is NULL and wakeup_cpu -1. This code is not under a lock. If wakeup_task is set on another CPU as that task is waking up, we can see the wakeup_task before wakeup_cpu is set. If we read wakeup_cpu while it is still -1 then we will have a bad data pointer. This patch moves the reading of wakeup_cpu within the protection of the spinlock used to protect the writing of wakeup_cpu and wakeup_task. [ Impact: remove possible race causing invalid pointer dereference ] Reported-by: Maneesh Soni Signed-off-by: Steven Rostedt --- kernel/trace/trace_sched_wakeup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index b8b13c5540fd..eacb27225173 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -138,9 +138,6 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, pc = preempt_count(); - /* The task we are waiting for is waking up */ - data = wakeup_trace->data[wakeup_cpu]; - /* disable local data, not wakeup_cpu data */ cpu = raw_smp_processor_id(); disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled); @@ -154,6 +151,9 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, if (unlikely(!tracer_enabled || next != wakeup_task)) goto out_unlock; + /* The task we are waiting for is waking up */ + data = wakeup_trace->data[wakeup_cpu]; + trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); -- cgit v1.2.2 From 89ec0dee9eba6275d47be0b878cf5f6d5c2fb6eb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 26 Mar 2009 11:03:29 -0400 Subject: tracing: increase size of number of possible events With the new event tracing registration, we must increase the number of events that can be registered. Currently the type field is only one byte, which leaves us only 256 possible events. Since we do not save the CPU number in the tracer anymore (it is determined by the per cpu ring buffer that is used) we have an extra byte to use. This patch increases the size of type from 1 byte (256 events) to 2 bytes (65,536 events). It also adds a WARN_ON_ONCE if we exceed that limit. 
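A hedged user-space illustration of deriving the limit from the field itself, mirroring the FTRACE_MAX_EVENT definition added below (the struct is copied from the patch for the sketch only):

#include <stdio.h>

struct trace_entry {
	unsigned short type;	/* was unsigned char: 255 usable ids; now 65535 */
	unsigned char flags;
	unsigned char preempt_count;
	int pid;
	int tgid;
};

/* same sizeof trick as the patch: 2 bytes -> (1 << 16) - 1 */
#define MAX_EVENT \
	((1 << (sizeof(((struct trace_entry *)0)->type) * 8)) - 1)

int main(void)
{
	printf("largest event id: %d\n", MAX_EVENT);	/* prints 65535 */
	return 0;
}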
[ Impact: allow more than 255 events ] Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 5 ++++- kernel/trace/trace_events.c | 2 +- kernel/trace/trace_output.c | 2 ++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 2a4a40749911..07e0a6d64a24 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -16,13 +16,16 @@ struct dentry; * bash-15816 [01] 235.197585: idle_cpu <- irq_enter */ struct trace_entry { - int type; + unsigned short type; unsigned char flags; unsigned char preempt_count; int pid; int tgid; }; +#define FTRACE_MAX_EVENT \ + ((1 << (sizeof(((struct trace_entry *)0)->type) * 8)) - 1) + /* * Trace iterator - used by printout routines who present trace * results to users and which routines might sleep, etc: diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 5d6e879cf875..9887131afa03 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -398,7 +398,7 @@ static int trace_write_header(struct trace_seq *s) "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" "\n", - FIELD(int, type), + FIELD(unsigned short, type), FIELD(unsigned char, flags), FIELD(unsigned char, preempt_count), FIELD(int, pid), diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 83a8abb9640f..06997e75114b 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -537,6 +537,8 @@ int register_ftrace_event(struct trace_event *event) out: mutex_unlock(&trace_event_mutex); + WARN_ON_ONCE(next_event_type > FTRACE_MAX_EVENT); + return ret; } EXPORT_SYMBOL_GPL(register_ftrace_event); -- cgit v1.2.2 From 75db37d2f4c0ad9466ead57d467277d097b4105c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 26 Mar 2009 11:43:36 -0400 Subject: tracing: add size checks for exported ftrace internal structures The events exported by TRACE_EVENT are automated and are guaranteed to be correct when used. The internal ftrace structures on the other hand are more manually exported. These require the ftrace maintainer to make sure they are up to date. This patch adds a size check to help flag when a type changes in an internal ftrace data structure, and the update needs to be reflected in the export. If a export is incorrect, then the only harm is that the user space tools will not know how to correctly read the internal structures of ftrace. [ Impact: help prevent inconsistent ftrace format print outs ] Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 3 +++ kernel/trace/trace_export.c | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 9887131afa03..b92081588088 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -381,8 +381,11 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, return cnt; } +extern char *__bad_type_size(void); + #undef FIELD #define FIELD(type, name) \ + sizeof(type) != sizeof(field.name) ? __bad_type_size() : \ #type, "common_" #name, offsetof(typeof(field), name), \ sizeof(field.name) diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 48fc02fe73a0..0cb1a142c74f 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -19,8 +19,12 @@ #undef TRACE_STRUCT #define TRACE_STRUCT(args...) 
args +extern void __bad_type_size(void); + #undef TRACE_FIELD #define TRACE_FIELD(type, item, assign) \ + if (sizeof(type) != sizeof(field.item)) \ + __bad_type_size(); \ ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ "offset:%u;\tsize:%u;\n", \ (unsigned int)offsetof(typeof(field), item), \ -- cgit v1.2.2 From d7285c6b5c54397fdf112c2fb98ee43193173aa9 Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Thu, 23 Apr 2009 10:21:38 -0700 Subject: x86: use native register access for native tlb flushing currently these are paravirtulaized, doesn't appear any callers rely on this (no pv_ops backends are using native_tlb and overriding cr3/4 access). [ Impact: fix lockdep warning with paravirt and function tracer ] Signed-off-by: Chris Wright LKML-Reference: <20090423172138.GR3036@sequoia.sous-sol.org> Signed-off-by: Steven Rostedt --- arch/x86/include/asm/tlbflush.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index d3539f998f88..e2927c5f45b1 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -17,7 +17,7 @@ static inline void __native_flush_tlb(void) { - write_cr3(read_cr3()); + native_write_cr3(native_read_cr3()); } static inline void __native_flush_tlb_global(void) @@ -32,11 +32,11 @@ static inline void __native_flush_tlb_global(void) */ raw_local_irq_save(flags); - cr4 = read_cr4(); + cr4 = native_read_cr4(); /* clear PGE */ - write_cr4(cr4 & ~X86_CR4_PGE); + native_write_cr4(cr4 & ~X86_CR4_PGE); /* write old PGE again and flush TLBs */ - write_cr4(cr4); + native_write_cr4(cr4); raw_local_irq_restore(flags); } -- cgit v1.2.2 From c2518c4366f087ebc10b3919cb2461bbe4f42d0c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 23 Apr 2009 23:26:18 -0400 Subject: tracing: fix cut and paste macro error In case a module uses the TRACE_EVENT macro for creating automated events in ftrace, it may choose to use a different file name than the defined system name, or choose to use a different path than the default "include/trace/events" include path. If this is done, then before including trace/define_trace.h the header would define either "TRACE_INCLUDE_FILE" for the file name or "TRACE_INCLUDE_PATH" for the include path. If it does not define these, then the define_trace.h defines them instead. If define trace defines them, then define_trace.h should also undefine them before exiting. To do this a macro is used to note this: #ifndef TRACE_INCLUDE_FILE # define TRACE_INCLUDE_FILE TRACE_SYSTEM # define UNDEF_TRACE_INCLUDE_FILE #endif [...] #ifdef UNDEF_TRACE_INCLUDE_FILE # undef TRACE_INCLUDE_FILE # undef UNDEF_TRACE_INCLUDE_FILE #endif The UNDEF_TRACE_INCLUDE_FILE acts as a CPP variable to know to undef the TRACE_INCLUDE_FILE before leaving define_trace.h. Unfortunately, due to cut and paste errors, the macros between FILE and PATH got mixed up. 
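A simplified sketch of the intended, symmetric pattern once the guards are fixed (this is the shape of define_trace.h, not a verbatim copy; the header expansion in the middle is elided):

#ifndef TRACE_INCLUDE_FILE
# define TRACE_INCLUDE_FILE TRACE_SYSTEM
# define UNDEF_TRACE_INCLUDE_FILE	/* remember that this file defined FILE */
#endif

#ifndef TRACE_INCLUDE_PATH
# define UNDEF_TRACE_INCLUDE_PATH	/* default path in use; clean up later */
#endif

/* ... expand the trace event headers here ... */

#ifdef UNDEF_TRACE_INCLUDE_FILE		/* only undef what we defined ourselves */
# undef TRACE_INCLUDE_FILE
# undef UNDEF_TRACE_INCLUDE_FILE
#endif

#ifdef UNDEF_TRACE_INCLUDE_PATH
# undef TRACE_INCLUDE_PATH
# undef UNDEF_TRACE_INCLUDE_PATH
#endif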
[ Impact: undef TRACE_INCLUDE_FILE and/or TRACE_INCLUDE_PATH when needed ] Signed-off-by: Steven Rostedt --- include/trace/define_trace.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h index 7f1f23d601ec..abc611feeb8c 100644 --- a/include/trace/define_trace.h +++ b/include/trace/define_trace.h @@ -44,7 +44,7 @@ #ifndef TRACE_INCLUDE_PATH # define __TRACE_INCLUDE(system) -# define UNDEF_TRACE_INCLUDE_FILE +# define UNDEF_TRACE_INCLUDE_PATH #else # define __TRACE_INCLUDE(system) __stringify(TRACE_INCLUDE_PATH/system.h) #endif @@ -64,13 +64,13 @@ /* Only undef what we defined in this file */ #ifdef UNDEF_TRACE_INCLUDE_FILE -# undef TRACE_INCLUDE_PATH +# undef TRACE_INCLUDE_FILE # undef UNDEF_TRACE_INCLUDE_FILE #endif -#ifdef UNDEF_TRACE_INCLUDE_FILE +#ifdef UNDEF_TRACE_INCLUDE_PATH # undef TRACE_INCLUDE_PATH -# undef UNDEF_TRACE_INCLUDE_FILE +# undef UNDEF_TRACE_INCLUDE_PATH #endif /* We may be processing more files */ -- cgit v1.2.2 From 334d4169a6592d3fcd863bbe822a8f6985ffa9af Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 24 Apr 2009 11:27:05 +0800 Subject: ring_buffer: compressed event header RB_MAX_SMALL_DATA = 28bytes is too small for most tracers, it wastes an 'u32' to save the actually length for events which data size > 28. This fix uses compressed event header and enlarges RB_MAX_SMALL_DATA. [ Impact: saves about 0%-12.5%(depends on tracer) memory in ring_buffer ] Signed-off-by: Lai Jiangshan LKML-Reference: <49F13189.3090000@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 16 +++++---- kernel/trace/ring_buffer.c | 83 ++++++++++++++++++++++----------------------- 2 files changed, 50 insertions(+), 49 deletions(-) diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index fac8f1ac6f49..1c2f80911fbe 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -11,7 +11,7 @@ struct ring_buffer_iter; * Don't refer to this struct directly, use functions below. */ struct ring_buffer_event { - u32 type:2, len:3, time_delta:27; + u32 type_len:5, time_delta:27; u32 array[]; }; @@ -24,7 +24,8 @@ struct ring_buffer_event { * size is variable depending on how much * padding is needed * If time_delta is non zero: - * everything else same as RINGBUF_TYPE_DATA + * array[0] holds the actual length + * size = 4 + length (bytes) * * @RINGBUF_TYPE_TIME_EXTEND: Extend the time delta * array[0] = time delta (28 .. 
59) @@ -35,22 +36,23 @@ struct ring_buffer_event { * array[1..2] = tv_sec * size = 16 bytes * - * @RINGBUF_TYPE_DATA: Data record - * If len is zero: + * <= @RINGBUF_TYPE_DATA_TYPE_LEN_MAX: + * Data record + * If type_len is zero: * array[0] holds the actual length * array[1..(length+3)/4] holds data - * size = 4 + 4 + length (bytes) + * size = 4 + length (bytes) * else - * length = len << 2 + * length = type_len << 2 * array[0..(length+3)/4-1] holds data * size = 4 + length (bytes) */ enum ring_buffer_type { + RINGBUF_TYPE_DATA_TYPE_LEN_MAX = 28, RINGBUF_TYPE_PADDING, RINGBUF_TYPE_TIME_EXTEND, /* FIXME: RINGBUF_TYPE_TIME_STAMP not implemented */ RINGBUF_TYPE_TIME_STAMP, - RINGBUF_TYPE_DATA, }; unsigned ring_buffer_event_length(struct ring_buffer_event *event); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 61dbdf21cd32..9692f100ec1a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -28,8 +28,8 @@ int ring_buffer_print_entry_header(struct trace_seq *s) { int ret; - ret = trace_seq_printf(s, "\ttype : 2 bits\n"); - ret = trace_seq_printf(s, "\tlen : 3 bits\n"); + ret = trace_seq_printf(s, "# compressed entry header\n"); + ret = trace_seq_printf(s, "\ttype_len : 5 bits\n"); ret = trace_seq_printf(s, "\ttime_delta : 27 bits\n"); ret = trace_seq_printf(s, "\tarray : 32 bits\n"); ret = trace_seq_printf(s, "\n"); @@ -37,8 +37,8 @@ int ring_buffer_print_entry_header(struct trace_seq *s) RINGBUF_TYPE_PADDING); ret = trace_seq_printf(s, "\ttime_extend : type == %d\n", RINGBUF_TYPE_TIME_EXTEND); - ret = trace_seq_printf(s, "\tdata : type == %d\n", - RINGBUF_TYPE_DATA); + ret = trace_seq_printf(s, "\tdata max type_len == %d\n", + RINGBUF_TYPE_DATA_TYPE_LEN_MAX); return ret; } @@ -204,7 +204,10 @@ EXPORT_SYMBOL_GPL(tracing_is_on); #define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) #define RB_ALIGNMENT 4U -#define RB_MAX_SMALL_DATA 28 +#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) + +/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ +#define RINGBUF_TYPE_DATA 0 ... 
RINGBUF_TYPE_DATA_TYPE_LEN_MAX enum { RB_LEN_TIME_EXTEND = 8, @@ -213,17 +216,18 @@ enum { static inline int rb_null_event(struct ring_buffer_event *event) { - return event->type == RINGBUF_TYPE_PADDING && event->time_delta == 0; + return event->type_len == RINGBUF_TYPE_PADDING + && event->time_delta == 0; } static inline int rb_discarded_event(struct ring_buffer_event *event) { - return event->type == RINGBUF_TYPE_PADDING && event->time_delta; + return event->type_len == RINGBUF_TYPE_PADDING && event->time_delta; } static void rb_event_set_padding(struct ring_buffer_event *event) { - event->type = RINGBUF_TYPE_PADDING; + event->type_len = RINGBUF_TYPE_PADDING; event->time_delta = 0; } @@ -232,8 +236,8 @@ rb_event_data_length(struct ring_buffer_event *event) { unsigned length; - if (event->len) - length = event->len * RB_ALIGNMENT; + if (event->type_len) + length = event->type_len * RB_ALIGNMENT; else length = event->array[0]; return length + RB_EVNT_HDR_SIZE; @@ -243,12 +247,12 @@ rb_event_data_length(struct ring_buffer_event *event) static unsigned rb_event_length(struct ring_buffer_event *event) { - switch (event->type) { + switch (event->type_len) { case RINGBUF_TYPE_PADDING: if (rb_null_event(event)) /* undefined */ return -1; - return rb_event_data_length(event); + return event->array[0] + RB_EVNT_HDR_SIZE; case RINGBUF_TYPE_TIME_EXTEND: return RB_LEN_TIME_EXTEND; @@ -272,7 +276,7 @@ rb_event_length(struct ring_buffer_event *event) unsigned ring_buffer_event_length(struct ring_buffer_event *event) { unsigned length = rb_event_length(event); - if (event->type != RINGBUF_TYPE_DATA) + if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) return length; length -= RB_EVNT_HDR_SIZE; if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0])) @@ -285,9 +289,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length); static void * rb_event_data(struct ring_buffer_event *event) { - BUG_ON(event->type != RINGBUF_TYPE_DATA); + BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX); /* If length is in len field, then array[0] has the data */ - if (event->len) + if (event->type_len) return (void *)&event->array[0]; /* Otherwise length is in array[0] and array[1] has the data */ return (void *)&event->array[1]; @@ -988,7 +992,7 @@ static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer) if (RB_WARN_ON(cpu_buffer, rb_null_event(event))) return; /* Only count data entries */ - if (event->type != RINGBUF_TYPE_DATA) + if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) continue; cpu_buffer->overrun++; cpu_buffer->entries--; @@ -1133,28 +1137,21 @@ static void rb_update_event(struct ring_buffer_event *event, unsigned type, unsigned length) { - event->type = type; + event->type_len = type; switch (type) { case RINGBUF_TYPE_PADDING: - break; - case RINGBUF_TYPE_TIME_EXTEND: - event->len = DIV_ROUND_UP(RB_LEN_TIME_EXTEND, RB_ALIGNMENT); - break; - case RINGBUF_TYPE_TIME_STAMP: - event->len = DIV_ROUND_UP(RB_LEN_TIME_STAMP, RB_ALIGNMENT); break; - case RINGBUF_TYPE_DATA: + case 0: length -= RB_EVNT_HDR_SIZE; - if (length > RB_MAX_SMALL_DATA) { - event->len = 0; + if (length > RB_MAX_SMALL_DATA) event->array[0] = length; - } else - event->len = DIV_ROUND_UP(length, RB_ALIGNMENT); + else + event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); break; default: BUG(); @@ -1562,7 +1559,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) if (length > BUF_PAGE_SIZE) goto out; - event = rb_reserve_next_event(cpu_buffer, RINGBUF_TYPE_DATA, length); + event = 
rb_reserve_next_event(cpu_buffer, 0, length); if (!event) goto out; @@ -1634,7 +1631,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); static inline void rb_event_discard(struct ring_buffer_event *event) { - event->type = RINGBUF_TYPE_PADDING; + /* array[0] holds the actual length for the discarded event */ + event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; + event->type_len = RINGBUF_TYPE_PADDING; /* time delta must be non zero */ if (!event->time_delta) event->time_delta = 1; @@ -1786,8 +1785,7 @@ int ring_buffer_write(struct ring_buffer *buffer, goto out; event_length = rb_calculate_event_length(length); - event = rb_reserve_next_event(cpu_buffer, - RINGBUF_TYPE_DATA, event_length); + event = rb_reserve_next_event(cpu_buffer, 0, event_length); if (!event) goto out; @@ -2035,7 +2033,7 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer, { u64 delta; - switch (event->type) { + switch (event->type_len) { case RINGBUF_TYPE_PADDING: return; @@ -2066,7 +2064,7 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter, { u64 delta; - switch (event->type) { + switch (event->type_len) { case RINGBUF_TYPE_PADDING: return; @@ -2181,7 +2179,8 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) event = rb_reader_event(cpu_buffer); - if (event->type == RINGBUF_TYPE_DATA || rb_discarded_event(event)) + if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX + || rb_discarded_event(event)) cpu_buffer->entries--; rb_update_read_stamp(cpu_buffer, event); @@ -2262,7 +2261,7 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) event = rb_reader_event(cpu_buffer); - switch (event->type) { + switch (event->type_len) { case RINGBUF_TYPE_PADDING: if (rb_null_event(event)) RB_WARN_ON(cpu_buffer, 1); @@ -2334,7 +2333,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) event = rb_iter_head_event(iter); - switch (event->type) { + switch (event->type_len) { case RINGBUF_TYPE_PADDING: if (rb_null_event(event)) { rb_inc_iter(iter); @@ -2393,7 +2392,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) event = rb_buffer_peek(buffer, cpu, ts); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - if (event && event->type == RINGBUF_TYPE_PADDING) { + if (event && event->type_len == RINGBUF_TYPE_PADDING) { cpu_relax(); goto again; } @@ -2421,7 +2420,7 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) event = rb_iter_peek(iter, ts); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - if (event && event->type == RINGBUF_TYPE_PADDING) { + if (event && event->type_len == RINGBUF_TYPE_PADDING) { cpu_relax(); goto again; } @@ -2466,7 +2465,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) out: preempt_enable(); - if (event && event->type == RINGBUF_TYPE_PADDING) { + if (event && event->type_len == RINGBUF_TYPE_PADDING) { cpu_relax(); goto again; } @@ -2559,7 +2558,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) out: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - if (event && event->type == RINGBUF_TYPE_PADDING) { + if (event && event->type_len == RINGBUF_TYPE_PADDING) { cpu_relax(); goto again; } @@ -2766,7 +2765,7 @@ static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer, if (RB_WARN_ON(cpu_buffer, rb_null_event(event))) return; /* Only count data entries */ - if (event->type != RINGBUF_TYPE_DATA) + if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) continue; cpu_buffer->entries--; } -- cgit v1.2.2 From 782cc5ae6331d63b4febaa312c9d14493aafa9b8 Mon Sep 17 
00:00:00 2001 From: Markus Metzger Date: Fri, 24 Apr 2009 09:43:09 +0200 Subject: x86, ds: fix buffer alignment in debug store selftest The debug store selftest code uses a stack-allocated buffer, which is not necessarily correctly aligned. For tests using a buffer to hold a single entry, the buffer that is passed to ds_request must already be suitably aligned. Pass a suitably aligned portion of the bigger buffer. [ Impact: fix hw-branch-tracer self-test failure ] Signed-off-by: Markus Metzger Cc: markus.t.metzger@gmail.com LKML-Reference: <20090424094309.A30145@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds_selftest.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c index 5f104a0ace66..6bc7c199ab99 100644 --- a/arch/x86/kernel/ds_selftest.c +++ b/arch/x86/kernel/ds_selftest.c @@ -323,13 +323,15 @@ static int ds_selftest_bts_bad_request_task(void *buffer) int ds_selftest_bts(void) { struct ds_selftest_bts_conf conf; - unsigned char buffer[BUFFER_SIZE]; + unsigned char buffer[BUFFER_SIZE], *small_buffer; unsigned long irq; int cpu; printk(KERN_INFO "[ds] bts selftest..."); conf.error = 0; + small_buffer = (unsigned char *)ALIGN((unsigned long)buffer, 8) + 8; + get_online_cpus(); for_each_online_cpu(cpu) { conf.suspend = ds_suspend_bts_wrap; @@ -381,7 +383,7 @@ int ds_selftest_bts(void) conf.suspend = ds_suspend_bts_noirq; conf.resume = ds_resume_bts_noirq; conf.tracer = - ds_request_bts_task(current, buffer, SMALL_BUFFER_SIZE, + ds_request_bts_task(current, small_buffer, SMALL_BUFFER_SIZE, NULL, (size_t)-1, BTS_KERNEL); local_irq_save(irq); ds_selftest_bts_cpu(&conf); -- cgit v1.2.2 From 7e0bfad24d85de7cf2202a7b0ce51de11a077b21 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 24 Apr 2009 09:44:48 +0200 Subject: x86, bts: reenable ptrace branch trace support The races found by Oleg Nesterov have been fixed. Reenable branch trace support. Signed-off-by: Markus Metzger Acked-by: Oleg Nesterov LKML-Reference: <20090424094448.A30216@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.cpu | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 8130334329c0..924e156a85ab 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -506,7 +506,6 @@ config X86_PTRACE_BTS bool "Branch Trace Store" default y depends on X86_DEBUGCTLMSR - depends on BROKEN ---help--- This adds a ptrace interface to the hardware's branch trace store. -- cgit v1.2.2 From 1cb81b143fa8f0e4629f10690862e2e52ca792ff Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 24 Apr 2009 09:51:43 +0200 Subject: x86, bts, mm: clean up buffer allocation The current mm interface is asymetric. One function allocates a locked buffer, another function only refunds the memory. Change this to have two functions for accounting and refunding locked memory, respectively; and do the actual buffer allocation in ptrace. 
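The new interface splits quota accounting from the allocation itself; a hedged, self-contained illustration of that charge/allocate/refund ordering (toy quota and illustrative names, not the kernel API):

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

static size_t quota_left = 4096;	/* stands in for the locked-memory rlimit */

static int account_quota(size_t size)
{
	if (size > quota_left)
		return -ENOMEM;
	quota_left -= size;
	return 0;
}

static void refund_quota(size_t size)
{
	quota_left += size;
}

/* charge first, then allocate, and refund if the allocation fails */
static void *alloc_tracked(size_t size)
{
	void *buf;

	if (account_quota(size))
		return NULL;

	buf = calloc(1, size);
	if (!buf)
		refund_quota(size);	/* undo the charge on the error path */
	return buf;
}

int main(void)
{
	void *buf = alloc_tracked(1024);

	if (buf) {
		printf("quota left after charge: %zu\n", quota_left);
		free(buf);
		refund_quota(1024);	/* matching refund when the buffer goes away */
	}
	printf("quota left at exit: %zu\n", quota_left);
	return 0;
}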
[ Impact: refactor BTS buffer allocation code ] Signed-off-by: Markus Metzger Acked-by: Andrew Morton Cc: Peter Zijlstra LKML-Reference: <20090424095143.A30265@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ptrace.c | 39 ++++++++++++++++++++++++++------------- include/linux/mm.h | 6 ++++-- mm/mlock.c | 36 +++++++++++++++++------------------- 3 files changed, 47 insertions(+), 34 deletions(-) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index d5252ae6c520..09ecbde91c13 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -617,17 +617,28 @@ struct bts_context { struct work_struct work; }; -static inline void alloc_bts_buffer(struct bts_context *context, - unsigned int size) +static int alloc_bts_buffer(struct bts_context *context, unsigned int size) { - void *buffer; + void *buffer = NULL; + int err = -ENOMEM; - buffer = alloc_locked_buffer(size); - if (buffer) { - context->buffer = buffer; - context->size = size; - context->mm = get_task_mm(current); - } + err = account_locked_memory(current->mm, current->signal->rlim, size); + if (err < 0) + return err; + + buffer = kzalloc(size, GFP_KERNEL); + if (!buffer) + goto out_refund; + + context->buffer = buffer; + context->size = size; + context->mm = get_task_mm(current); + + return 0; + + out_refund: + refund_locked_memory(current->mm, size); + return err; } static inline void free_bts_buffer(struct bts_context *context) @@ -638,7 +649,7 @@ static inline void free_bts_buffer(struct bts_context *context) kfree(context->buffer); context->buffer = NULL; - refund_locked_buffer_memory(context->mm, context->size); + refund_locked_memory(context->mm, context->size); context->size = 0; mmput(context->mm); @@ -786,13 +797,15 @@ static int ptrace_bts_config(struct task_struct *child, context->tracer = NULL; if ((cfg.flags & PTRACE_BTS_O_ALLOC) && (cfg.size != context->size)) { + int err; + free_bts_buffer(context); if (!cfg.size) return 0; - alloc_bts_buffer(context, cfg.size); - if (!context->buffer) - return -ENOMEM; + err = alloc_bts_buffer(context, cfg.size); + if (err < 0) + return err; } if (cfg.flags & PTRACE_BTS_O_TRACE) diff --git a/include/linux/mm.h b/include/linux/mm.h index a3963ba23a6d..009eabd3c21c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -19,6 +19,7 @@ struct anon_vma; struct file_ra_state; struct user_struct; struct writeback_control; +struct rlimit; #ifndef CONFIG_DISCONTIGMEM /* Don't use mapnrs, do it properly */ extern unsigned long max_mapnr; @@ -1319,7 +1320,8 @@ int vmemmap_populate_basepages(struct page *start_page, int vmemmap_populate(struct page *start_page, unsigned long pages, int node); void vmemmap_populate_print_last(void); -extern void *alloc_locked_buffer(size_t size); -extern void refund_locked_buffer_memory(struct mm_struct *mm, size_t size); +extern int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim, + size_t size); +extern void refund_locked_memory(struct mm_struct *mm, size_t size); #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/mm/mlock.c b/mm/mlock.c index 28be15ead9c1..ac130433c7d3 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -629,38 +629,36 @@ void user_shm_unlock(size_t size, struct user_struct *user) free_uid(user); } -void *alloc_locked_buffer(size_t size) +int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim, + size_t size) { - unsigned long rlim, vm, pgsz; - void *buffer = NULL; + unsigned long lim, vm, pgsz; + int error = -ENOMEM; pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; - 
down_write(¤t->mm->mmap_sem); - - rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; - vm = current->mm->total_vm + pgsz; - if (rlim < vm) - goto out; + down_write(&mm->mmap_sem); - rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; - vm = current->mm->locked_vm + pgsz; - if (rlim < vm) + lim = rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; + vm = mm->total_vm + pgsz; + if (lim < vm) goto out; - buffer = kzalloc(size, GFP_KERNEL); - if (!buffer) + lim = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; + vm = mm->locked_vm + pgsz; + if (lim < vm) goto out; - current->mm->total_vm += pgsz; - current->mm->locked_vm += pgsz; + mm->total_vm += pgsz; + mm->locked_vm += pgsz; + error = 0; out: - up_write(¤t->mm->mmap_sem); - return buffer; + up_write(&mm->mmap_sem); + return error; } -void refund_locked_buffer_memory(struct mm_struct *mm, size_t size) +void refund_locked_memory(struct mm_struct *mm, size_t size) { unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; -- cgit v1.2.2 From 39517091f88fae32b52254b561ced78da1eaf0a7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 24 Apr 2009 11:05:52 -0400 Subject: tracing/lockdep: convert lockdep to use TRACE_EVENT macro The TRACE_FORMAT will soon be deprecated. This patch converts it to the TRACE_EVENT macro. Note, this change should also speed up the tracing. [ Impact: remove a user of deprecated TRACE_FORMAT ] Cc: Peter Zijlstra Signed-off-by: Steven Rostedt --- include/trace/events/lockdep.h | 56 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 46 insertions(+), 10 deletions(-) diff --git a/include/trace/events/lockdep.h b/include/trace/events/lockdep.h index 3ca315c1429d..0e956c9dfd7e 100644 --- a/include/trace/events/lockdep.h +++ b/include/trace/events/lockdep.h @@ -9,28 +9,64 @@ #ifdef CONFIG_LOCKDEP -TRACE_FORMAT(lock_acquire, +TRACE_EVENT(lock_acquire, + TP_PROTO(struct lockdep_map *lock, unsigned int subclass, int trylock, int read, int check, struct lockdep_map *next_lock, unsigned long ip), + TP_ARGS(lock, subclass, trylock, read, check, next_lock, ip), - TP_FMT("%s%s%s", trylock ? "try " : "", - read ? "read " : "", lock->name) - ); -TRACE_FORMAT(lock_release, + TP_STRUCT__entry( + __field(unsigned int, flags) + __string(name, lock->name) + ), + + TP_fast_assign( + __entry->flags = (trylock ? 1 : 0) | (read ? 2 : 0); + __assign_str(name, lock->name); + ), + + TP_printk("%s%s%s", (__entry->flags & 1) ? "try " : "", + (__entry->flags & 2) ? "read " : "", + __get_str(name)) +); + +TRACE_EVENT(lock_release, + TP_PROTO(struct lockdep_map *lock, int nested, unsigned long ip), + TP_ARGS(lock, nested, ip), - TP_FMT("%s", lock->name) - ); + + TP_STRUCT__entry( + __string(name, lock->name) + ), + + TP_fast_assign( + __assign_str(name, lock->name); + ), + + TP_printk("%s", __get_str(name)) +); #ifdef CONFIG_LOCK_STAT -TRACE_FORMAT(lock_contended, +TRACE_EVENT(lock_contended, + TP_PROTO(struct lockdep_map *lock, unsigned long ip), + TP_ARGS(lock, ip), - TP_FMT("%s", lock->name) - ); + + TP_STRUCT__entry( + __string(name, lock->name) + ), + + TP_fast_assign( + __assign_str(name, lock->name); + ), + + TP_printk("%s", __get_str(name)) +); TRACE_EVENT(lock_acquired, TP_PROTO(struct lockdep_map *lock, unsigned long ip, s64 waittime), -- cgit v1.2.2 From 160031b556e93590fa8635210d73d93c3d3853a9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 24 Apr 2009 11:26:55 -0400 Subject: tracing/irq: convert irq traces to use TRACE_EVENT macro The TRACE_FORMAT will soon be deprecated. 
This patch converts it to the TRACE_EVENT macro. Note, this change should also speed up the tracing. [ Impact: remove a user of deprecated TRACE_FORMAT ] Cc: Jason Baron Signed-off-by: Steven Rostedt --- include/trace/events/irq.h | 57 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 48 insertions(+), 9 deletions(-) diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h index 75e3468e4493..768686467518 100644 --- a/include/trace/events/irq.h +++ b/include/trace/events/irq.h @@ -10,11 +10,24 @@ /* * Tracepoint for entry of interrupt handler: */ -TRACE_FORMAT(irq_handler_entry, +TRACE_EVENT(irq_handler_entry, + TP_PROTO(int irq, struct irqaction *action), + TP_ARGS(irq, action), - TP_FMT("irq=%d handler=%s", irq, action->name) - ); + + TP_STRUCT__entry( + __field( int, irq ) + __string( name, action->name ) + ), + + TP_fast_assign( + __entry->irq = irq; + __assign_str(name, action->name); + ), + + TP_printk("irq=%d handler=%s", __entry->irq, __get_str(name)) +); /* * Tracepoint for return of an interrupt handler: @@ -39,17 +52,43 @@ TRACE_EVENT(irq_handler_exit, __entry->irq, __entry->ret ? "handled" : "unhandled") ); -TRACE_FORMAT(softirq_entry, +TRACE_EVENT(softirq_entry, + TP_PROTO(struct softirq_action *h, struct softirq_action *vec), + TP_ARGS(h, vec), - TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec]) - ); -TRACE_FORMAT(softirq_exit, + TP_STRUCT__entry( + __field( int, vec ) + __string( name, softirq_to_name[h-vec] ) + ), + + TP_fast_assign( + __entry->vec = (int)(h - vec); + __assign_str(name, softirq_to_name[h-vec]); + ), + + TP_printk("softirq=%d action=%s", __entry->vec, __get_str(name)) +); + +TRACE_EVENT(softirq_exit, + TP_PROTO(struct softirq_action *h, struct softirq_action *vec), + TP_ARGS(h, vec), - TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec]) - ); + + TP_STRUCT__entry( + __field( int, vec ) + __string( name, softirq_to_name[h-vec] ) + ), + + TP_fast_assign( + __entry->vec = (int)(h - vec); + __assign_str(name, softirq_to_name[h-vec]); + ), + + TP_printk("softirq=%d action=%s", __entry->vec, __get_str(name)) +); #endif /* _TRACE_IRQ_H */ -- cgit v1.2.2 From b8e65554d80b4c560d201362d0e8fa02109d89fd Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 24 Apr 2009 11:50:39 -0400 Subject: tracing: remove deprecated TRACE_FORMAT The TRACE_FORMAT macro has been deprecated by the TRACE_EVENT macro. There are no more users. All new users must use the TRACE_EVENT macro. [ Impact: remove old functionality ] Cc: Peter Zijlstra Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 5 ---- include/trace/define_trace.h | 4 --- include/trace/ftrace.h | 66 -------------------------------------------- 3 files changed, 75 deletions(-) diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 4353f3f7e624..14df7e635d43 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -158,11 +158,6 @@ static inline void tracepoint_synchronize_unregister(void) #define PARAMS(args...) 
args -#ifndef TRACE_FORMAT -#define TRACE_FORMAT(name, proto, args, fmt) \ - DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) -#endif - #ifndef TRACE_EVENT /* * For use with the TRACE_EVENT macro: diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h index abc611feeb8c..f7a7ae1e8f90 100644 --- a/include/trace/define_trace.h +++ b/include/trace/define_trace.h @@ -26,10 +26,6 @@ #define TRACE_EVENT(name, proto, args, tstruct, assign, print) \ DEFINE_TRACE(name) -#undef TRACE_FORMAT -#define TRACE_FORMAT(name, proto, args, print) \ - DEFINE_TRACE(name) - #undef DECLARE_TRACE #define DECLARE_TRACE(name, proto, args) \ DEFINE_TRACE(name) diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index a77f71a46dbe..1e681142f1da 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -18,9 +18,6 @@ #include -#undef TRACE_FORMAT -#define TRACE_FORMAT(call, proto, args, fmt) - #undef __array #define __array(type, item, len) type item[len]; @@ -62,9 +59,6 @@ * */ -#undef TRACE_FORMAT -#define TRACE_FORMAT(call, proto, args, fmt) - #undef __array #define __array(type, item, len) @@ -298,16 +292,6 @@ ftrace_define_fields_##call(void) \ * unregister_trace_(ftrace_event_); * } * - * For those macros defined with TRACE_FORMAT: - * - * static struct ftrace_event_call __used - * __attribute__((__aligned__(4))) - * __attribute__((section("_ftrace_events"))) event_ = { - * .name = "", - * .regfunc = ftrace_reg_event_, - * .unregfunc = ftrace_unreg_event_, - * } - * * * For those macros defined with TRACE_EVENT: * @@ -417,56 +401,6 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *call) \ #define _TRACE_PROFILE_INIT(call) #endif -#define _TRACE_FORMAT(call, proto, args, fmt) \ -static void ftrace_event_##call(proto) \ -{ \ - event_trace_printk(_RET_IP_, #call ": " fmt); \ -} \ - \ -static int ftrace_reg_event_##call(void) \ -{ \ - int ret; \ - \ - ret = register_trace_##call(ftrace_event_##call); \ - if (ret) \ - pr_info("event trace: Could not activate trace point " \ - "probe to " #call "\n"); \ - return ret; \ -} \ - \ -static void ftrace_unreg_event_##call(void) \ -{ \ - unregister_trace_##call(ftrace_event_##call); \ -} \ - \ -static struct ftrace_event_call event_##call; \ - \ -static int ftrace_init_event_##call(void) \ -{ \ - int id; \ - \ - id = register_ftrace_event(NULL); \ - if (!id) \ - return -ENODEV; \ - event_##call.id = id; \ - return 0; \ -} - -#undef TRACE_FORMAT -#define TRACE_FORMAT(call, proto, args, fmt) \ -_TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt)) \ -_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \ -static struct ftrace_event_call __used \ -__attribute__((__aligned__(4))) \ -__attribute__((section("_ftrace_events"))) event_##call = { \ - .name = #call, \ - .system = __stringify(TRACE_SYSTEM), \ - .raw_init = ftrace_init_event_##call, \ - .regfunc = ftrace_reg_event_##call, \ - .unregfunc = ftrace_unreg_event_##call, \ - _TRACE_PROFILE_INIT(call) \ -} - #undef __entry #define __entry entry -- cgit v1.2.2 From 060fa5c83e67901ba47ab484cfcdb32737d630ba Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 24 Apr 2009 12:20:52 -0400 Subject: tracing/events: reuse trace event ids after overflow With modules being able to add trace events, and the max trace event counter is 16 bits (65536) we can overflow the counter easily with a simple while loop adding and removing modules that contain trace events. This patch links together the registered trace events and on overflow searches for available trace event ids. 
It will still fail if over 65536 events are registered, but considering that a typical kernel only has 22000 functions, 65000 events should be sufficient. Reported-by: Li Zefan Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 1 + kernel/trace/trace_output.c | 71 +++++++++++++++++++++++++++++++++++++------- 2 files changed, 61 insertions(+), 11 deletions(-) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 07e0a6d64a24..78a9ba24cbf6 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -56,6 +56,7 @@ typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter, int flags); struct trace_event { struct hlist_node node; + struct list_head list; int type; trace_print_func trace; trace_print_func raw; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 06997e75114b..5fc51f0f75fc 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -483,6 +483,36 @@ struct trace_event *ftrace_find_event(int type) return NULL; } +static LIST_HEAD(ftrace_event_list); + +static int trace_search_list(struct list_head **list) +{ + struct trace_event *e; + int last = __TRACE_LAST_TYPE; + + if (list_empty(&ftrace_event_list)) { + *list = &ftrace_event_list; + return last + 1; + } + + /* + * We used up all possible max events, + * lets see if somebody freed one. + */ + list_for_each_entry(e, &ftrace_event_list, list) { + if (e->type != last + 1) + break; + last++; + } + + /* Did we used up all 65 thousand events??? */ + if ((last + 1) > FTRACE_MAX_EVENT) + return 0; + + *list = &e->list; + return last + 1; +} + /** * register_ftrace_event - register output for an event type * @event: the event type to register @@ -505,20 +535,40 @@ int register_ftrace_event(struct trace_event *event) mutex_lock(&trace_event_mutex); - if (!event) { - ret = next_event_type++; + if (WARN_ON(!event)) goto out; - } - if (!event->type) - event->type = next_event_type++; - else if (event->type > __TRACE_LAST_TYPE) { + INIT_LIST_HEAD(&event->list); + + if (!event->type) { + struct list_head *list; + + if (next_event_type > FTRACE_MAX_EVENT) { + + event->type = trace_search_list(&list); + if (!event->type) + goto out; + + } else { + + event->type = next_event_type++; + list = &ftrace_event_list; + } + + if (WARN_ON(ftrace_find_event(event->type))) + goto out; + + list_add_tail(&event->list, list); + + } else if (event->type > __TRACE_LAST_TYPE) { printk(KERN_WARNING "Need to add type to trace.h\n"); WARN_ON(1); - } - - if (ftrace_find_event(event->type)) goto out; + } else { + /* Is this event already used */ + if (ftrace_find_event(event->type)) + goto out; + } if (event->trace == NULL) event->trace = trace_nop_print; @@ -537,8 +587,6 @@ int register_ftrace_event(struct trace_event *event) out: mutex_unlock(&trace_event_mutex); - WARN_ON_ONCE(next_event_type > FTRACE_MAX_EVENT); - return ret; } EXPORT_SYMBOL_GPL(register_ftrace_event); @@ -551,6 +599,7 @@ int unregister_ftrace_event(struct trace_event *event) { mutex_lock(&trace_event_mutex); hlist_del(&event->node); + list_del(&event->list); mutex_unlock(&trace_event_mutex); return 0; -- cgit v1.2.2 From 701970b3a83cc639c1ec8fc6f40a7871cb99426f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 24 Apr 2009 23:11:22 -0400 Subject: tracing/events: make modules have their own file_operations structure For proper module reference counting, the file_operations that modules use must have the "owner" field set to the module. 
Unfortunately, the trace events use share file_operations. The same file_operations are used by all both kernel core and all modules. This patch makes the modules allocate their own file_operations and copies the functions from the core kernel. This allows those file operations to be owned by the module. Care is taken to free this code on module unload. Thanks to Greg KH for reminding me that file_operations must be owned by the module to have reference counting take place. [ Impact: fix modular tracepoints / potential crash ] Signed-off-by: Steven Rostedt Acked-by: Greg Kroah-Hartman --- kernel/trace/trace_events.c | 95 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 88 insertions(+), 7 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index b92081588088..be4d3a437c17 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -770,7 +770,11 @@ event_subsystem_dir(const char *name, struct dentry *d_events) } static int -event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) +event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, + const struct file_operations *id, + const struct file_operations *enable, + const struct file_operations *filter, + const struct file_operations *format) { struct dentry *entry; int ret; @@ -800,11 +804,11 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) if (call->regfunc) entry = trace_create_file("enable", 0644, call->dir, call, - &ftrace_enable_fops); + enable); if (call->id) entry = trace_create_file("id", 0444, call->dir, call, - &ftrace_event_id_fops); + id); if (call->define_fields) { ret = call->define_fields(); @@ -814,7 +818,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) return ret; } entry = trace_create_file("filter", 0644, call->dir, call, - &ftrace_event_filter_fops); + filter); } /* A trace may not want to export its format */ @@ -822,7 +826,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) return 0; entry = trace_create_file("format", 0444, call->dir, call, - &ftrace_event_format_fops); + format); return 0; } @@ -833,8 +837,60 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) event++) #ifdef CONFIG_MODULES + +static LIST_HEAD(ftrace_module_file_list); + +/* + * Modules must own their file_operations to keep up with + * reference counting. + */ +struct ftrace_module_file_ops { + struct list_head list; + struct module *mod; + struct file_operations id; + struct file_operations enable; + struct file_operations format; + struct file_operations filter; +}; + +static struct ftrace_module_file_ops * +trace_create_file_ops(struct module *mod) +{ + struct ftrace_module_file_ops *file_ops; + + /* + * This is a bit of a PITA. To allow for correct reference + * counting, modules must "own" their file_operations. + * To do this, we allocate the file operations that will be + * used in the event directory. 
+ */ + + file_ops = kmalloc(sizeof(*file_ops), GFP_KERNEL); + if (!file_ops) + return NULL; + + file_ops->mod = mod; + + file_ops->id = ftrace_event_id_fops; + file_ops->id.owner = mod; + + file_ops->enable = ftrace_enable_fops; + file_ops->enable.owner = mod; + + file_ops->filter = ftrace_event_filter_fops; + file_ops->filter.owner = mod; + + file_ops->format = ftrace_event_format_fops; + file_ops->format.owner = mod; + + list_add(&file_ops->list, &ftrace_module_file_list); + + return file_ops; +} + static void trace_module_add_events(struct module *mod) { + struct ftrace_module_file_ops *file_ops = NULL; struct ftrace_event_call *call, *start, *end; struct dentry *d_events; @@ -852,14 +908,27 @@ static void trace_module_add_events(struct module *mod) /* The linker may leave blanks */ if (!call->name) continue; + + /* + * This module has events, create file ops for this module + * if not already done. + */ + if (!file_ops) { + file_ops = trace_create_file_ops(mod); + if (!file_ops) + return; + } call->mod = mod; list_add(&call->list, &ftrace_events); - event_create_dir(call, d_events); + event_create_dir(call, d_events, + &file_ops->id, &file_ops->enable, + &file_ops->filter, &file_ops->format); } } static void trace_module_remove_events(struct module *mod) { + struct ftrace_module_file_ops *file_ops; struct ftrace_event_call *call, *p; list_for_each_entry_safe(call, p, &ftrace_events, list) { @@ -874,6 +943,16 @@ static void trace_module_remove_events(struct module *mod) list_del(&call->list); } } + + /* Now free the file_operations */ + list_for_each_entry(file_ops, &ftrace_module_file_list, list) { + if (file_ops->mod == mod) + break; + } + if (&file_ops->list != &ftrace_module_file_list) { + list_del(&file_ops->list); + kfree(file_ops); + } } static int trace_module_notify(struct notifier_block *self, @@ -954,7 +1033,9 @@ static __init int event_trace_init(void) if (!call->name) continue; list_add(&call->list, &ftrace_events); - event_create_dir(call, d_events); + event_create_dir(call, d_events, &ftrace_event_id_fops, + &ftrace_enable_fops, &ftrace_event_filter_fops, + &ftrace_event_format_fops); } ret = register_module_notifier(&trace_module_nb); -- cgit v1.2.2 From cd891ae0305601bdb4d2e7e85282961c4ff256cd Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 28 Apr 2009 11:39:34 -0400 Subject: tracing: convert ftrace_dump spinlocks to raw ftrace_dump is used for printing out the contents of the ftrace ring buffer to the console on failure. Currently it uses a spinlock to synchronize the output from multiple failures on different CPUs. This spin lock currently is a normal spinlock and can cause issues with lockdep and lock tracing. This patch converts it to raw since it is for error handling only. The lock is local to the ftrace_dump and is not used by any other infrastructure. 
[ Impact: prevent ftrace_dump from locking up by internal tracing ] Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b6183bc9ecae..5d704a41f836 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4114,7 +4114,8 @@ trace_printk_seq(struct trace_seq *s) static void __ftrace_dump(bool disable_tracing) { - static DEFINE_SPINLOCK(ftrace_dump_lock); + static raw_spinlock_t ftrace_dump_lock = + (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; /* use static because iter can be a bit big for the stack */ static struct trace_iterator iter; unsigned int old_userobj; @@ -4123,7 +4124,8 @@ static void __ftrace_dump(bool disable_tracing) int cnt = 0, cpu; /* only one dump */ - spin_lock_irqsave(&ftrace_dump_lock, flags); + local_irq_save(flags); + __raw_spin_lock(&ftrace_dump_lock); if (dump_ran) goto out; @@ -4195,7 +4197,8 @@ static void __ftrace_dump(bool disable_tracing) } out: - spin_unlock_irqrestore(&ftrace_dump_lock, flags); + __raw_spin_unlock(&ftrace_dump_lock); + local_irq_restore(flags); } /* By default: disable tracing after the dump */ -- cgit v1.2.2 From 5beae6efd1004b44c3e257dc96087978e4c763c1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 29 Apr 2009 00:16:21 -0400 Subject: tracing: fix ref count in splice pages The pages allocated for the splice binary buffer did not initialize the ref count correctly. This caused pages not to be freed and causes a drastic memory leak. Thanks to logdev I was able to trace the tracer to find where the leak was. [ Impact: stop memory leak when using splice ] Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5d704a41f836..9058240c85ca 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3531,6 +3531,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, if (!ref) break; + ref->ref = 1; ref->buffer = info->tr->buffer; ref->page = ring_buffer_alloc_read_page(ref->buffer); if (!ref->page) { -- cgit v1.2.2 From 93459c6cb9816c52200993d29dd18cea1daee335 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 29 Apr 2009 00:23:13 -0400 Subject: tracing: only add splice page if entries exist The splice code allocates a page even when the ring buffer is empty. It detects the ring buffer being empty when it it fails to copy anything from the ring buffer into the page. This patch adds a check to see if there is anything in the ring buffer before allocating a page. Thanks to logdev for letting me trace the tracer to find this. 
[ Impact: speed up due to removing unnecessary allocation ] Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9058240c85ca..0aeb3b93414e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3508,7 +3508,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, .spd_release = buffer_spd_release, }; struct buffer_ref *ref; - int size, i; + int entries, size, i; size_t ret; if (*ppos & (PAGE_SIZE - 1)) { @@ -3523,7 +3523,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, len &= PAGE_MASK; } - for (i = 0; i < PIPE_BUFFERS && len; i++, len -= PAGE_SIZE) { + entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); + + for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) { struct page *page; int r; @@ -3564,6 +3566,8 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, spd.partial[i].private = (unsigned long)ref; spd.nr_pages++; *ppos += PAGE_SIZE; + + entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); } spd.nr_pages = i; -- cgit v1.2.2 From f2957f1f196b0217644a17c1379855a118a37d72 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 29 Apr 2009 00:26:30 -0400 Subject: tracing: have splice only copy full pages Splice works with pages; it is much more efficient to use an entire page than to copy bits over several pages. Using logdev to trace the internals of the splice mechanism, I was able to see that splice can be very aggressive. When tracing is occurring, and the reader caught up to the writer, and the writer is on the reader page, the reader will copy what is there into the splice page. Splice may iterate over several pages and if the writer is still writing to the page, the reader will keep copying bits to new pages to pass to userspace. This patch changes it to only pass data to userspace if the page is full (the writer has left the page). This has a small side effect that splice cannot read a partial page, and must wait for the page to fill. This should not be an issue. If tracing has stopped, then a use of "read" will still read all of the page. [ Impact: better performance for ring buffer splice code ] Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0aeb3b93414e..f5427e0fc982 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3542,7 +3542,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, } r = ring_buffer_read_page(ref->buffer, &ref->page, - len, info->cpu, 0); + len, info->cpu, 1); if (r < 0) { ring_buffer_free_read_page(ref->buffer, ref->page); -- cgit v1.2.2 From 7d7d2b803159d4edeb051b0e5efbc1a8d9ef1c67 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 27 Apr 2009 12:37:49 -0400 Subject: ring-buffer: fix printk output The warning output in trace_recursive_lock uses %d for a long when it should be %ld. 
[ Impact: fix compile warning ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9692f100ec1a..f4cc59040ebf 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1491,7 +1491,7 @@ static int trace_recursive_lock(void) /* Disable all tracing before we do anything else */ tracing_off_permanent(); - printk_once(KERN_WARNING "Tracing recursion: depth[%d]:" + printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:" "HC[%lu]:SC[%lu]:NMI[%lu]\n", current->trace_recursion, hardirq_count() >> HARDIRQ_SHIFT, -- cgit v1.2.2 From 0492e1bb8fe7d122901c9f3af75e537d4129712e Mon Sep 17 00:00:00 2001 From: Stuart Bennett Date: Tue, 28 Apr 2009 20:17:49 +0100 Subject: tracing: x86, mmiotrace: code consistency/legibility improvement kmmio_probe being *p and kmmio_fault_page being sometimes *f and sometimes *p is not helpful. [ Impact: cleanup ] Signed-off-by: Stuart Bennett Acked-by: Pekka Paalanen Cc: Steven Rostedt LKML-Reference: <1240946271-7083-3-git-send-email-stuart@freedesktop.org> Signed-off-by: Ingo Molnar --- arch/x86/mm/kmmio.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 4f115e00486b..869181a917d9 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -97,13 +97,13 @@ static struct kmmio_probe *get_kmmio_probe(unsigned long addr) static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page) { struct list_head *head; - struct kmmio_fault_page *p; + struct kmmio_fault_page *f; page &= PAGE_MASK; head = kmmio_page_list(page); - list_for_each_entry_rcu(p, head, list) { - if (p->page == page) - return p; + list_for_each_entry_rcu(f, head, list) { + if (f->page == page) + return f; } return NULL; } @@ -439,12 +439,12 @@ static void rcu_free_kmmio_fault_pages(struct rcu_head *head) head, struct kmmio_delayed_release, rcu); - struct kmmio_fault_page *p = dr->release_list; - while (p) { - struct kmmio_fault_page *next = p->release_next; - BUG_ON(p->count); - kfree(p); - p = next; + struct kmmio_fault_page *f = dr->release_list; + while (f) { + struct kmmio_fault_page *next = f->release_next; + BUG_ON(f->count); + kfree(f); + f = next; } kfree(dr); } @@ -453,19 +453,19 @@ static void remove_kmmio_fault_pages(struct rcu_head *head) { struct kmmio_delayed_release *dr = container_of(head, struct kmmio_delayed_release, rcu); - struct kmmio_fault_page *p = dr->release_list; + struct kmmio_fault_page *f = dr->release_list; struct kmmio_fault_page **prevp = &dr->release_list; unsigned long flags; spin_lock_irqsave(&kmmio_lock, flags); - while (p) { - if (!p->count) { - list_del_rcu(&p->list); - prevp = &p->release_next; + while (f) { + if (!f->count) { + list_del_rcu(&f->list); + prevp = &f->release_next; } else { - *prevp = p->release_next; + *prevp = f->release_next; } - p = p->release_next; + f = f->release_next; } spin_unlock_irqrestore(&kmmio_lock, flags); -- cgit v1.2.2 From 46e91d00b1165b14b484aa33800e1bba0794ae1a Mon Sep 17 00:00:00 2001 From: Stuart Bennett Date: Tue, 28 Apr 2009 20:17:50 +0100 Subject: tracing: x86, mmiotrace: refactor clearing/restore of page presence * change function names to clear_* from set_*: in reality we only clear and restore page presence, and never unconditionally set present. Using clear_*({true, false}, ...) is therefore more honest than set_*({false, true}, ...) 
* upgrade presence storage to pteval_t: doing user-space tracing will require saving and manipulation of the _PAGE_PROTNONE bit, in addition to the existing _PAGE_PRESENT changes, and having multiple bools stored and passed around does not seem optimal [ Impact: refactor, clean up mmiotrace code ] Signed-off-by: Stuart Bennett Acked-by: Pekka Paalanen Cc: Steven Rostedt LKML-Reference: <1240946271-7083-4-git-send-email-stuart@freedesktop.org> Signed-off-by: Ingo Molnar --- arch/x86/mm/kmmio.c | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 869181a917d9..a769d1a2d93b 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -32,7 +32,7 @@ struct kmmio_fault_page { struct list_head list; struct kmmio_fault_page *release_next; unsigned long page; /* location of the fault page */ - bool old_presence; /* page presence prior to arming */ + pteval_t old_presence; /* page presence prior to arming */ bool armed; /* @@ -108,49 +108,51 @@ static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page) return NULL; } -static void set_pmd_presence(pmd_t *pmd, bool present, bool *old) +static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old) { pmdval_t v = pmd_val(*pmd); - *old = !!(v & _PAGE_PRESENT); - v &= ~_PAGE_PRESENT; - if (present) - v |= _PAGE_PRESENT; + if (clear) { + *old = v & _PAGE_PRESENT; + v &= ~_PAGE_PRESENT; + } else /* presume this has been called with clear==true previously */ + v |= *old; set_pmd(pmd, __pmd(v)); } -static void set_pte_presence(pte_t *pte, bool present, bool *old) +static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old) { pteval_t v = pte_val(*pte); - *old = !!(v & _PAGE_PRESENT); - v &= ~_PAGE_PRESENT; - if (present) - v |= _PAGE_PRESENT; + if (clear) { + *old = v & _PAGE_PRESENT; + v &= ~_PAGE_PRESENT; + } else /* presume this has been called with clear==true previously */ + v |= *old; set_pte_atomic(pte, __pte(v)); } -static int set_page_presence(unsigned long addr, bool present, bool *old) +static int clear_page_presence(struct kmmio_fault_page *f, bool clear) { unsigned int level; - pte_t *pte = lookup_address(addr, &level); + pte_t *pte = lookup_address(f->page, &level); if (!pte) { - pr_err("kmmio: no pte for page 0x%08lx\n", addr); + pr_err("kmmio: no pte for page 0x%08lx\n", f->page); return -1; } switch (level) { case PG_LEVEL_2M: - set_pmd_presence((pmd_t *)pte, present, old); + clear_pmd_presence((pmd_t *)pte, clear, &f->old_presence); break; case PG_LEVEL_4K: - set_pte_presence(pte, present, old); + clear_pte_presence(pte, clear, &f->old_presence); break; default: pr_err("kmmio: unexpected page level 0x%x.\n", level); return -1; } - __flush_tlb_one(addr); + __flush_tlb_one(f->page); return 0; } @@ -171,9 +173,9 @@ static int arm_kmmio_fault_page(struct kmmio_fault_page *f) WARN_ONCE(f->armed, KERN_ERR "kmmio page already armed.\n"); if (f->armed) { pr_warning("kmmio double-arm: page 0x%08lx, ref %d, old %d\n", - f->page, f->count, f->old_presence); + f->page, f->count, !!f->old_presence); } - ret = set_page_presence(f->page, false, &f->old_presence); + ret = clear_page_presence(f, true); WARN_ONCE(ret < 0, KERN_ERR "kmmio arming 0x%08lx failed.\n", f->page); f->armed = true; return ret; @@ -182,8 +184,7 @@ static int arm_kmmio_fault_page(struct kmmio_fault_page *f) /** Restore the given page to saved presence state. 
*/ static void disarm_kmmio_fault_page(struct kmmio_fault_page *f) { - bool tmp; - int ret = set_page_presence(f->page, f->old_presence, &tmp); + int ret = clear_page_presence(f, false); WARN_ONCE(ret < 0, KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page); f->armed = false; -- cgit v1.2.2 From 0f9a623dd6c9b5b4dd00c232f29525bfc7a8ecf2 Mon Sep 17 00:00:00 2001 From: Stuart Bennett Date: Tue, 28 Apr 2009 20:17:51 +0100 Subject: tracing: x86, mmiotrace: only register for die notifier when tracer active Follow up to afcfe024aebd74b0984a41af9a34e009cf5badaf in Linus' tree ("x86: mmiotrace: quieten spurious warning message") Signed-off-by: Stuart Bennett Acked-by: Pekka Paalanen Cc: Steven Rostedt LKML-Reference: <1240946271-7083-5-git-send-email-stuart@freedesktop.org> Signed-off-by: Ingo Molnar --- arch/x86/mm/kmmio.c | 27 ++++++++++++++++++++++----- arch/x86/mm/mmio-mod.c | 2 ++ include/linux/mmiotrace.h | 2 ++ 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index a769d1a2d93b..256ce643b0ba 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -311,7 +311,12 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); if (!ctx->active) { - pr_debug("kmmio: spurious debug trap on CPU %d.\n", + /* + * debug traps without an active context are due to either + * something external causing them (f.e. using a debugger while + * mmio tracing enabled), or erroneous behaviour + */ + pr_warning("kmmio: unexpected debug trap on CPU %d.\n", smp_processor_id()); goto out; } @@ -529,8 +534,8 @@ void unregister_kmmio_probe(struct kmmio_probe *p) } EXPORT_SYMBOL(unregister_kmmio_probe); -static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val, - void *args) +static int +kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args) { struct die_args *arg = args; @@ -545,11 +550,23 @@ static struct notifier_block nb_die = { .notifier_call = kmmio_die_notifier }; -static int __init init_kmmio(void) +int kmmio_init(void) { int i; + for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) INIT_LIST_HEAD(&kmmio_page_table[i]); + return register_die_notifier(&nb_die); } -fs_initcall(init_kmmio); /* should be before device_initcall() */ + +void kmmio_cleanup(void) +{ + int i; + + unregister_die_notifier(&nb_die); + for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) { + WARN_ONCE(!list_empty(&kmmio_page_table[i]), + KERN_ERR "kmmio_page_table not empty at cleanup, any further tracing will leak memory.\n"); + } +} diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index c9342ed8b402..132772a8ec57 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -451,6 +451,7 @@ void enable_mmiotrace(void) if (nommiotrace) pr_info(NAME "MMIO tracing disabled.\n"); + kmmio_init(); enter_uniprocessor(); spin_lock_irq(&trace_lock); atomic_inc(&mmiotrace_enabled); @@ -473,6 +474,7 @@ void disable_mmiotrace(void) clear_trace_list(); /* guarantees: no more kmmio callbacks */ leave_uniprocessor(); + kmmio_cleanup(); pr_info(NAME "disabled.\n"); out: mutex_unlock(&mmiotrace_mutex); diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h index 3d1b7bde1283..97491f78b08c 100644 --- a/include/linux/mmiotrace.h +++ b/include/linux/mmiotrace.h @@ -30,6 +30,8 @@ extern unsigned int kmmio_count; extern int register_kmmio_probe(struct kmmio_probe *p); extern void unregister_kmmio_probe(struct kmmio_probe *p); +extern int kmmio_init(void); +extern void 
kmmio_cleanup(void); #ifdef CONFIG_MMIOTRACE /* kmmio is active by some kmmio_probes? */ -- cgit v1.2.2 From 30e673b230f9d556eb81ef68a7b1a08c8b3b142c Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 28 Apr 2009 03:04:47 -0500 Subject: tracing/filters: move preds into event_filter object Create a new event_filter object, and move the pred-related members out of the call and subsystem objects and into the filter object - the details of the filter implementation don't need to be exposed in the call and subsystem in any case, and it will also help make the new parser implementation a little cleaner. [ Impact: refactor trace-filter code to prepare for new features ] Signed-off-by: Tom Zanussi Acked-by: Steven Rostedt Cc: fweisbec@gmail.com Cc: Li Zefan LKML-Reference: <1240905887.6416.119.camel@tropicana> Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 4 +- kernel/trace/trace.h | 10 ++-- kernel/trace/trace_events.c | 3 +- kernel/trace/trace_events_filter.c | 107 +++++++++++++++++++++++-------------- 4 files changed, 76 insertions(+), 48 deletions(-) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 78a9ba24cbf6..46a27f2695a6 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -101,8 +101,8 @@ struct ftrace_event_call { int (*show_format)(struct trace_seq *s); int (*define_fields)(void); struct list_head fields; - int n_preds; - struct filter_pred **preds; + int filter_active; + void *filter; void *mod; #ifdef CONFIG_EVENT_PROFILE diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 7d55bcf50e49..1fb7d6ccadf4 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -731,12 +731,16 @@ struct ftrace_event_field { int size; }; +struct event_filter { + int n_preds; + struct filter_pred **preds; +}; + struct event_subsystem { struct list_head list; const char *name; struct dentry *entry; - int n_preds; - struct filter_pred **preds; + void *filter; }; struct filter_pred; @@ -774,7 +778,7 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, struct ring_buffer *buffer, struct ring_buffer_event *event) { - if (unlikely(call->n_preds) && !filter_match_preds(call, rec)) { + if (unlikely(call->filter_active) && !filter_match_preds(call, rec)) { ring_buffer_discard_commit(buffer, event); return 1; } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index be4d3a437c17..1cd1f37373dd 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -757,8 +757,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events) list_add(&system->list, &event_subsystems); - system->preds = NULL; - system->n_preds = 0; + system->filter = NULL; entry = debugfs_create_file("filter", 0644, system->entry, system, &ftrace_subsystem_filter_fops); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 65418288f957..1e861eca3d02 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -93,11 +93,12 @@ static int filter_pred_none(struct filter_pred *pred, void *event) /* return 1 if event matches, 0 otherwise (discard) */ int filter_match_preds(struct ftrace_event_call *call, void *rec) { + struct event_filter *filter = call->filter; int i, matched, and_failed = 0; struct filter_pred *pred; - for (i = 0; i < call->n_preds; i++) { - pred = call->preds[i]; + for (i = 0; i < filter->n_preds; i++) { + pred = filter->preds[i]; if (and_failed && !pred->or) continue; matched = pred->fn(pred, rec); @@ -115,20 +116,20 @@ int 
filter_match_preds(struct ftrace_event_call *call, void *rec) } EXPORT_SYMBOL_GPL(filter_match_preds); -static void __filter_print_preds(struct filter_pred **preds, int n_preds, +static void __filter_print_preds(struct event_filter *filter, struct trace_seq *s) { - char *field_name; struct filter_pred *pred; + char *field_name; int i; - if (!n_preds) { + if (!filter || !filter->n_preds) { trace_seq_printf(s, "none\n"); return; } - for (i = 0; i < n_preds; i++) { - pred = preds[i]; + for (i = 0; i < filter->n_preds; i++) { + pred = filter->preds[i]; field_name = pred->field_name; if (i) trace_seq_printf(s, pred->or ? "|| " : "&& "); @@ -144,7 +145,7 @@ static void __filter_print_preds(struct filter_pred **preds, int n_preds, void filter_print_preds(struct ftrace_event_call *call, struct trace_seq *s) { mutex_lock(&filter_mutex); - __filter_print_preds(call->preds, call->n_preds, s); + __filter_print_preds(call->filter, s); mutex_unlock(&filter_mutex); } @@ -152,7 +153,7 @@ void filter_print_subsystem_preds(struct event_subsystem *system, struct trace_seq *s) { mutex_lock(&filter_mutex); - __filter_print_preds(system->preds, system->n_preds, s); + __filter_print_preds(system->filter, s); mutex_unlock(&filter_mutex); } @@ -200,12 +201,14 @@ static int filter_set_pred(struct filter_pred *dest, static void __filter_disable_preds(struct ftrace_event_call *call) { + struct event_filter *filter = call->filter; int i; - call->n_preds = 0; + call->filter_active = 0; + filter->n_preds = 0; for (i = 0; i < MAX_FILTER_PRED; i++) - call->preds[i]->fn = filter_pred_none; + filter->preds[i]->fn = filter_pred_none; } void filter_disable_preds(struct ftrace_event_call *call) @@ -217,32 +220,39 @@ void filter_disable_preds(struct ftrace_event_call *call) int init_preds(struct ftrace_event_call *call) { + struct event_filter *filter; struct filter_pred *pred; int i; - call->n_preds = 0; - - call->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); - if (!call->preds) + filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL); + if (!call->filter) return -ENOMEM; + call->filter_active = 0; + filter->n_preds = 0; + + filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); + if (!filter->preds) + goto oom; + for (i = 0; i < MAX_FILTER_PRED; i++) { pred = kzalloc(sizeof(*pred), GFP_KERNEL); if (!pred) goto oom; pred->fn = filter_pred_none; - call->preds[i] = pred; + filter->preds[i] = pred; } return 0; oom: for (i = 0; i < MAX_FILTER_PRED; i++) { - if (call->preds[i]) - filter_free_pred(call->preds[i]); + if (filter->preds[i]) + filter_free_pred(filter->preds[i]); } - kfree(call->preds); - call->preds = NULL; + kfree(filter->preds); + kfree(call->filter); + call->filter = NULL; return -ENOMEM; } @@ -250,15 +260,16 @@ EXPORT_SYMBOL_GPL(init_preds); static void __filter_free_subsystem_preds(struct event_subsystem *system) { + struct event_filter *filter = system->filter; struct ftrace_event_call *call; int i; - if (system->n_preds) { - for (i = 0; i < system->n_preds; i++) - filter_free_pred(system->preds[i]); - kfree(system->preds); - system->preds = NULL; - system->n_preds = 0; + if (filter && filter->n_preds) { + for (i = 0; i < filter->n_preds; i++) + filter_free_pred(filter->preds[i]); + kfree(filter->preds); + kfree(filter); + system->filter = NULL; } list_for_each_entry(call, &ftrace_events, list) { @@ -281,21 +292,23 @@ static int filter_add_pred_fn(struct ftrace_event_call *call, struct filter_pred *pred, filter_pred_fn_t fn) { + struct event_filter *filter = call->filter; int 
idx, err; - if (call->n_preds && !pred->compound) + if (filter->n_preds && !pred->compound) __filter_disable_preds(call); - if (call->n_preds == MAX_FILTER_PRED) + if (filter->n_preds == MAX_FILTER_PRED) return -ENOSPC; - idx = call->n_preds; - filter_clear_pred(call->preds[idx]); - err = filter_set_pred(call->preds[idx], pred, fn); + idx = filter->n_preds; + filter_clear_pred(filter->preds[idx]); + err = filter_set_pred(filter->preds[idx], pred, fn); if (err) return err; - call->n_preds++; + filter->n_preds++; + call->filter_active = 1; return 0; } @@ -366,29 +379,41 @@ int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) int filter_add_subsystem_pred(struct event_subsystem *system, struct filter_pred *pred) { + struct event_filter *filter = system->filter; struct ftrace_event_call *call; mutex_lock(&filter_mutex); - if (system->n_preds && !pred->compound) + if (filter && filter->n_preds && !pred->compound) { __filter_free_subsystem_preds(system); + filter = NULL; + } - if (!system->n_preds) { - system->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), + if (!filter) { + system->filter = kzalloc(sizeof(*filter), GFP_KERNEL); + if (!system->filter) { + mutex_unlock(&filter_mutex); + return -ENOMEM; + } + filter = system->filter; + filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); - if (!system->preds) { + + if (!filter->preds) { + kfree(system->filter); + system->filter = NULL; mutex_unlock(&filter_mutex); return -ENOMEM; } } - if (system->n_preds == MAX_FILTER_PRED) { + if (filter->n_preds == MAX_FILTER_PRED) { mutex_unlock(&filter_mutex); return -ENOSPC; } - system->preds[system->n_preds] = pred; - system->n_preds++; + filter->preds[filter->n_preds] = pred; + filter->n_preds++; list_for_each_entry(call, &ftrace_events, list) { int err; @@ -401,8 +426,8 @@ int filter_add_subsystem_pred(struct event_subsystem *system, err = __filter_add_pred(call, pred); if (err == -ENOMEM) { - system->preds[system->n_preds] = NULL; - system->n_preds--; + filter->preds[filter->n_preds] = NULL; + filter->n_preds--; mutex_unlock(&filter_mutex); return err; } -- cgit v1.2.2 From a118e4d1402f1349fe3d953493e4168a300a752d Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 28 Apr 2009 03:04:53 -0500 Subject: tracing/filters: distinguish between signed and unsigned fields The new filter comparison ops need to be able to distinguish between signed and unsigned field types, so add an is_signed flag/param to the event field struct/trace_define_fields(). Also define a simple macro, is_signed_type() to determine the signedness at compile time, used in the trace macros. If the is_signed_type() macro won't work with a specific type, a new slightly modified version of TRACE_FIELD() called TRACE_FIELD_SIGN(), allows the signedness to be set explicitly. 
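As a rough illustration of how the signedness test behaves, here is a small stand-alone demo (user-space, for illustration only; the macro body is the one added by the patch below, everything else is assumed):

	#include <stdio.h>

	/* same test the kernel macro uses: cast -1 to the type and see if it is negative */
	#define is_signed_type(type)	(((type)(-1)) < 0)

	int main(void)
	{
		printf("int            -> %d\n", is_signed_type(int));		/* prints 1 */
		printf("long           -> %d\n", is_signed_type(long));		/* prints 1 */
		printf("unsigned char  -> %d\n", is_signed_type(unsigned char));	/* prints 0 */
		printf("unsigned long  -> %d\n", is_signed_type(unsigned long));	/* prints 0 */
		return 0;
	}
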
[ Impact: extend trace-filter code for new feature ] Signed-off-by: Tom Zanussi Acked-by: Steven Rostedt Cc: fweisbec@gmail.com Cc: Li Zefan LKML-Reference: <1240905893.6416.120.camel@tropicana> Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 7 ++++--- include/trace/ftrace.h | 16 ++++++++-------- kernel/trace/trace.h | 1 + kernel/trace/trace_event_types.h | 4 ++-- kernel/trace/trace_events.c | 3 ++- kernel/trace/trace_export.c | 29 ++++++++++++++++++++++------- 6 files changed, 39 insertions(+), 21 deletions(-) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 46a27f2695a6..e61a7403f3d0 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -122,8 +122,9 @@ extern int filter_current_check_discard(struct ftrace_event_call *call, struct ring_buffer_event *event); extern int trace_define_field(struct ftrace_event_call *call, char *type, - char *name, int offset, int size); + char *name, int offset, int size, int is_signed); +#define is_signed_type(type) (((type)(-1)) < 0) /* * The double __builtin_constant_p is because gcc will give us an error @@ -144,10 +145,10 @@ do { \ __trace_printk(ip, fmt, ##args); \ } while (0) -#define __common_field(type, item) \ +#define __common_field(type, item, is_signed) \ ret = trace_define_field(event_call, #type, "common_" #item, \ offsetof(typeof(field.ent), item), \ - sizeof(field.ent.item)); \ + sizeof(field.ent.item), is_signed); \ if (ret) \ return ret; diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 1e681142f1da..edb02bc9f8ff 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -225,7 +225,7 @@ ftrace_format_##call(struct trace_seq *s) \ #define __field(type, item) \ ret = trace_define_field(event_call, #type, #item, \ offsetof(typeof(field), item), \ - sizeof(field.item)); \ + sizeof(field.item), is_signed_type(type)); \ if (ret) \ return ret; @@ -234,7 +234,7 @@ ftrace_format_##call(struct trace_seq *s) \ BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ ret = trace_define_field(event_call, #type "[" #len "]", #item, \ offsetof(typeof(field), item), \ - sizeof(field.item)); \ + sizeof(field.item), 0); \ if (ret) \ return ret; @@ -242,7 +242,7 @@ ftrace_format_##call(struct trace_seq *s) \ #define __string(item, src) \ ret = trace_define_field(event_call, "__str_loc", #item, \ offsetof(typeof(field), __str_loc_##item), \ - sizeof(field.__str_loc_##item)); + sizeof(field.__str_loc_##item), 0); #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, func, print) \ @@ -253,11 +253,11 @@ ftrace_define_fields_##call(void) \ struct ftrace_event_call *event_call = &event_##call; \ int ret; \ \ - __common_field(int, type); \ - __common_field(unsigned char, flags); \ - __common_field(unsigned char, preempt_count); \ - __common_field(int, pid); \ - __common_field(int, tgid); \ + __common_field(int, type, 1); \ + __common_field(unsigned char, flags, 0); \ + __common_field(unsigned char, preempt_count, 0); \ + __common_field(int, pid, 1); \ + __common_field(int, tgid, 1); \ \ tstruct; \ \ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 1fb7d6ccadf4..866d0108fd2f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -729,6 +729,7 @@ struct ftrace_event_field { char *type; int offset; int size; + int is_signed; }; struct event_filter { diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h index cfcecc4fd86d..5e32e375134d 100644 --- a/kernel/trace/trace_event_types.h +++ b/kernel/trace/trace_event_types.h @@ 
-141,8 +141,8 @@ TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore, TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore, TRACE_STRUCT( - TRACE_FIELD(ktime_t, state_data.stamp, stamp) - TRACE_FIELD(ktime_t, state_data.end, end) + TRACE_FIELD_SIGN(ktime_t, state_data.stamp, stamp, 1) + TRACE_FIELD_SIGN(ktime_t, state_data.end, end, 1) TRACE_FIELD(int, state_data.type, type) TRACE_FIELD(int, state_data.state, state) ), diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 1cd1f37373dd..bbbea7479371 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -26,7 +26,7 @@ static DEFINE_MUTEX(event_mutex); LIST_HEAD(ftrace_events); int trace_define_field(struct ftrace_event_call *call, char *type, - char *name, int offset, int size) + char *name, int offset, int size, int is_signed) { struct ftrace_event_field *field; @@ -44,6 +44,7 @@ int trace_define_field(struct ftrace_event_call *call, char *type, field->offset = offset; field->size = size; + field->is_signed = is_signed; list_add(&field->link, &call->fields); return 0; diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 0cb1a142c74f..d06cf898dc86 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -50,6 +50,9 @@ extern void __bad_type_size(void); if (!ret) \ return 0; +#undef TRACE_FIELD_SIGN +#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ + TRACE_FIELD(type, item, assign) #undef TP_RAW_FMT #define TP_RAW_FMT(args...) args @@ -98,6 +101,10 @@ ftrace_format_##call(struct trace_seq *s) \ #define TRACE_FIELD(type, item, assign)\ entry->item = assign; +#undef TRACE_FIELD_SIGN +#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ + TRACE_FIELD(type, item, assign) + #undef TP_CMD #define TP_CMD(cmd...) cmd @@ -149,7 +156,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #define TRACE_FIELD(type, item, assign) \ ret = trace_define_field(event_call, #type, #item, \ offsetof(typeof(field), item), \ - sizeof(field.item)); \ + sizeof(field.item), is_signed_type(type)); \ if (ret) \ return ret; @@ -157,7 +164,15 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #define TRACE_FIELD_SPECIAL(type, item, len, cmd) \ ret = trace_define_field(event_call, #type "[" #len "]", #item, \ offsetof(typeof(field), item), \ - sizeof(field.item)); \ + sizeof(field.item), 0); \ + if (ret) \ + return ret; + +#undef TRACE_FIELD_SIGN +#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ + ret = trace_define_field(event_call, #type, #item, \ + offsetof(typeof(field), item), \ + sizeof(field.item), is_signed); \ if (ret) \ return ret; @@ -173,11 +188,11 @@ ftrace_define_fields_##call(void) \ struct args field; \ int ret; \ \ - __common_field(unsigned char, type); \ - __common_field(unsigned char, flags); \ - __common_field(unsigned char, preempt_count); \ - __common_field(int, pid); \ - __common_field(int, tgid); \ + __common_field(unsigned char, type, 0); \ + __common_field(unsigned char, flags, 0); \ + __common_field(unsigned char, preempt_count, 0); \ + __common_field(int, pid, 1); \ + __common_field(int, tgid, 1); \ \ tstruct; \ \ -- cgit v1.2.2 From 8b3725621074040d380664964ffbc40610aef8c6 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 28 Apr 2009 03:04:59 -0500 Subject: tracing/filters: a better event parser Replace the current event parser hack with a better one. 
Filters are no longer specified predicate by predicate, but all at once and can use parens and any of the following operators: numeric fields: ==, !=, <, <=, >, >= string fields: ==, != predicates can be combined with the logical operators: &&, || examples: "common_preempt_count > 4" > filter "((sig >= 10 && sig < 15) || sig == 17) && comm != bash" > filter If there was an error, the erroneous string along with an error message can be seen by looking at the filter e.g.: ((sig >= 10 && sig < 15) || dsig == 17) && comm != bash ^ parse_error: Field not found Currently the caret for an error always appears at the beginning of the filter; a real position should be used, but the error message should be useful even without it. To clear a filter, '0' can be written to the filter file. Filters can also be set or cleared for a complete subsystem by writing the same filter as would be written to an individual event to the filter file at the root of the subsytem. Note however, that if any event in the subsystem lacks a field specified in the filter being set, the set will fail and all filters in the subsytem are automatically cleared. This change from the previous version was made because using only the fields that happen to exist for a given event would most likely result in a meaningless filter. Because the logical operators are now implemented as predicates, the maximum number of predicates in a filter was increased from 8 to 16. [ Impact: add new, extended trace-filter implementation ] Signed-off-by: Tom Zanussi Acked-by: Steven Rostedt Cc: fweisbec@gmail.com Cc: Li Zefan LKML-Reference: <1240905899.6416.121.camel@tropicana> Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 2 +- kernel/trace/trace.h | 66 ++- kernel/trace/trace_events.c | 86 ++- kernel/trace/trace_events_filter.c | 1020 ++++++++++++++++++++++++++++-------- 4 files changed, 884 insertions(+), 290 deletions(-) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index e61a7403f3d0..5fff40c9ff59 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -112,7 +112,7 @@ struct ftrace_event_call { #endif }; -#define MAX_FILTER_PRED 8 +#define MAX_FILTER_PRED 32 #define MAX_FILTER_STR_VAL 128 extern int init_preds(struct ftrace_event_call *call); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 866d0108fd2f..7736fe8c1b76 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -735,6 +735,7 @@ struct ftrace_event_field { struct event_filter { int n_preds; struct filter_pred **preds; + char *filter_string; }; struct event_subsystem { @@ -746,7 +747,8 @@ struct event_subsystem { struct filter_pred; -typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); +typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event, + int val1, int val2); struct filter_pred { filter_pred_fn_t fn; @@ -756,23 +758,18 @@ struct filter_pred { char *field_name; int offset; int not; - int or; - int compound; - int clear; + int op; + int pop_n; }; -extern void filter_free_pred(struct filter_pred *pred); -extern void filter_print_preds(struct ftrace_event_call *call, +extern void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s); -extern int filter_parse(char **pbuf, struct filter_pred *pred); -extern int filter_add_pred(struct ftrace_event_call *call, - struct filter_pred *pred); -extern void filter_disable_preds(struct ftrace_event_call *call); -extern void filter_free_subsystem_preds(struct event_subsystem *system); -extern void 
filter_print_subsystem_preds(struct event_subsystem *system, +extern int apply_event_filter(struct ftrace_event_call *call, + char *filter_string); +extern int apply_subsystem_event_filter(struct event_subsystem *system, + char *filter_string); +extern void print_subsystem_event_filter(struct event_subsystem *system, struct trace_seq *s); -extern int filter_add_subsystem_pred(struct event_subsystem *system, - struct filter_pred *pred); static inline int filter_check_discard(struct ftrace_event_call *call, void *rec, @@ -787,6 +784,47 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, return 0; } +#define DEFINE_COMPARISON_PRED(type) \ +static int filter_pred_##type(struct filter_pred *pred, void *event, \ + int val1, int val2) \ +{ \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + int match = 0; \ + \ + switch (pred->op) { \ + case OP_LT: \ + match = (*addr < val); \ + break; \ + case OP_LE: \ + match = (*addr <= val); \ + break; \ + case OP_GT: \ + match = (*addr > val); \ + break; \ + case OP_GE: \ + match = (*addr >= val); \ + break; \ + default: \ + break; \ + } \ + \ + return match; \ +} + +#define DEFINE_EQUALITY_PRED(size) \ +static int filter_pred_##size(struct filter_pred *pred, void *event, \ + int val1, int val2) \ +{ \ + u##size *addr = (u##size *)(event + pred->offset); \ + u##size val = (u##size)pred->val; \ + int match; \ + \ + match = (val == *addr) ^ pred->not; \ + \ + return match; \ +} + extern struct list_head ftrace_events; extern const char *__start___trace_bprintk_fmt[]; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index bbbea7479371..f789ca540fe1 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -492,7 +492,7 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); - filter_print_preds(call, s); + print_event_filter(call, s); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); kfree(s); @@ -505,40 +505,26 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct ftrace_event_call *call = filp->private_data; - char buf[64], *pbuf = buf; - struct filter_pred *pred; + char *buf; int err; - if (cnt >= sizeof(buf)) + if (cnt >= PAGE_SIZE) return -EINVAL; - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - buf[cnt] = '\0'; - - pred = kzalloc(sizeof(*pred), GFP_KERNEL); - if (!pred) + buf = (char *)__get_free_page(GFP_TEMPORARY); + if (!buf) return -ENOMEM; - err = filter_parse(&pbuf, pred); - if (err < 0) { - filter_free_pred(pred); - return err; - } - - if (pred->clear) { - filter_disable_preds(call); - filter_free_pred(pred); - return cnt; + if (copy_from_user(buf, ubuf, cnt)) { + free_page((unsigned long) buf); + return -EFAULT; } + buf[cnt] = '\0'; - err = filter_add_pred(call, pred); - if (err < 0) { - filter_free_pred(pred); + err = apply_event_filter(call, buf); + free_page((unsigned long) buf); + if (err < 0) return err; - } - - filter_free_pred(pred); *ppos += cnt; @@ -562,7 +548,7 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); - filter_print_subsystem_preds(system, s); + print_subsystem_event_filter(system, s); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); kfree(s); @@ -575,38 +561,26 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct event_subsystem *system = filp->private_data; - char buf[64], *pbuf = buf; - struct filter_pred *pred; + char 
*buf; int err; - if (cnt >= sizeof(buf)) + if (cnt >= PAGE_SIZE) return -EINVAL; - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - buf[cnt] = '\0'; - - pred = kzalloc(sizeof(*pred), GFP_KERNEL); - if (!pred) + buf = (char *)__get_free_page(GFP_TEMPORARY); + if (!buf) return -ENOMEM; - err = filter_parse(&pbuf, pred); - if (err < 0) { - filter_free_pred(pred); - return err; - } - - if (pred->clear) { - filter_free_subsystem_preds(system); - filter_free_pred(pred); - return cnt; + if (copy_from_user(buf, ubuf, cnt)) { + free_page((unsigned long) buf); + return -EFAULT; } + buf[cnt] = '\0'; - err = filter_add_subsystem_pred(system, pred); - if (err < 0) { - filter_free_pred(pred); + err = apply_subsystem_event_filter(system, buf); + free_page((unsigned long) buf); + if (err < 0) return err; - } *ppos += cnt; @@ -760,11 +734,21 @@ event_subsystem_dir(const char *name, struct dentry *d_events) system->filter = NULL; + system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL); + if (!system->filter) { + pr_warning("Could not allocate filter for subsystem " + "'%s'\n", name); + return system->entry; + } + entry = debugfs_create_file("filter", 0644, system->entry, system, &ftrace_subsystem_filter_fops); - if (!entry) + if (!entry) { + kfree(system->filter); + system->filter = NULL; pr_warning("Could not create debugfs " "'%s/filter' entry\n", name); + } return system->entry; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 1e861eca3d02..f49486687ee2 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -29,51 +29,130 @@ static DEFINE_MUTEX(filter_mutex); -static int filter_pred_64(struct filter_pred *pred, void *event) +enum filter_op_ids { - u64 *addr = (u64 *)(event + pred->offset); - u64 val = (u64)pred->val; - int match; - - match = (val == *addr) ^ pred->not; - - return match; -} - -static int filter_pred_32(struct filter_pred *pred, void *event) -{ - u32 *addr = (u32 *)(event + pred->offset); - u32 val = (u32)pred->val; - int match; - - match = (val == *addr) ^ pred->not; - - return match; -} - -static int filter_pred_16(struct filter_pred *pred, void *event) + OP_OR, + OP_AND, + OP_NE, + OP_EQ, + OP_LT, + OP_LE, + OP_GT, + OP_GE, + OP_NONE, + OP_OPEN_PAREN, +}; + +struct filter_op { + int id; + char *string; + int precedence; +}; + +static struct filter_op filter_ops[] = { + { OP_OR, "||", 1 }, + { OP_AND, "&&", 2 }, + { OP_NE, "!=", 4 }, + { OP_EQ, "==", 4 }, + { OP_LT, "<", 5 }, + { OP_LE, "<=", 5 }, + { OP_GT, ">", 5 }, + { OP_GE, ">=", 5 }, + { OP_NONE, "OP_NONE", 0 }, + { OP_OPEN_PAREN, "(", 0 }, +}; + +enum { + FILT_ERR_NONE, + FILT_ERR_INVALID_OP, + FILT_ERR_UNBALANCED_PAREN, + FILT_ERR_TOO_MANY_OPERANDS, + FILT_ERR_OPERAND_TOO_LONG, + FILT_ERR_FIELD_NOT_FOUND, + FILT_ERR_ILLEGAL_FIELD_OP, + FILT_ERR_ILLEGAL_INTVAL, + FILT_ERR_BAD_SUBSYS_FILTER, + FILT_ERR_TOO_MANY_PREDS, + FILT_ERR_MISSING_FIELD, + FILT_ERR_INVALID_FILTER, +}; + +static char *err_text[] = { + "No error", + "Invalid operator", + "Unbalanced parens", + "Too many operands", + "Operand too long", + "Field not found", + "Illegal operation for field type", + "Illegal integer value", + "Couldn't find or set field in one of a subsystem's events", + "Too many terms in predicate expression", + "Missing field name and/or value", + "Meaningless filter expression", +}; + +struct opstack_op { + int op; + struct list_head list; +}; + +struct postfix_elt { + int op; + char *operand; + struct list_head list; +}; + +struct 
filter_parse_state { + struct filter_op *ops; + struct list_head opstack; + struct list_head postfix; + int lasterr; + int lasterr_pos; + + struct { + char *string; + unsigned int cnt; + unsigned int tail; + } infix; + + struct { + char string[MAX_FILTER_STR_VAL]; + int pos; + unsigned int tail; + } operand; +}; + +DEFINE_COMPARISON_PRED(s64); +DEFINE_COMPARISON_PRED(u64); +DEFINE_COMPARISON_PRED(s32); +DEFINE_COMPARISON_PRED(u32); +DEFINE_COMPARISON_PRED(s16); +DEFINE_COMPARISON_PRED(u16); +DEFINE_COMPARISON_PRED(s8); +DEFINE_COMPARISON_PRED(u8); + +DEFINE_EQUALITY_PRED(64); +DEFINE_EQUALITY_PRED(32); +DEFINE_EQUALITY_PRED(16); +DEFINE_EQUALITY_PRED(8); + +static int filter_pred_and(struct filter_pred *pred __attribute((unused)), + void *event __attribute((unused)), + int val1, int val2) { - u16 *addr = (u16 *)(event + pred->offset); - u16 val = (u16)pred->val; - int match; - - match = (val == *addr) ^ pred->not; - - return match; + return val1 && val2; } -static int filter_pred_8(struct filter_pred *pred, void *event) +static int filter_pred_or(struct filter_pred *pred __attribute((unused)), + void *event __attribute((unused)), + int val1, int val2) { - u8 *addr = (u8 *)(event + pred->offset); - u8 val = (u8)pred->val; - int match; - - match = (val == *addr) ^ pred->not; - - return match; + return val1 || val2; } -static int filter_pred_string(struct filter_pred *pred, void *event) +static int filter_pred_string(struct filter_pred *pred, void *event, + int val1, int val2) { char *addr = (char *)(event + pred->offset); int cmp, match; @@ -85,7 +164,8 @@ static int filter_pred_string(struct filter_pred *pred, void *event) return match; } -static int filter_pred_none(struct filter_pred *pred, void *event) +static int filter_pred_none(struct filter_pred *pred, void *event, + int val1, int val2) { return 0; } @@ -94,66 +174,119 @@ static int filter_pred_none(struct filter_pred *pred, void *event) int filter_match_preds(struct ftrace_event_call *call, void *rec) { struct event_filter *filter = call->filter; - int i, matched, and_failed = 0; + int match, top = 0, val1 = 0, val2 = 0; + int stack[MAX_FILTER_PRED]; struct filter_pred *pred; + int i; for (i = 0; i < filter->n_preds; i++) { pred = filter->preds[i]; - if (and_failed && !pred->or) + if (!pred->pop_n) { + match = pred->fn(pred, rec, val1, val2); + stack[top++] = match; continue; - matched = pred->fn(pred, rec); - if (!matched && !pred->or) { - and_failed = 1; - continue; - } else if (matched && pred->or) - return 1; + } + if (pred->pop_n > top) { + WARN_ON_ONCE(1); + return 0; + } + val1 = stack[--top]; + val2 = stack[--top]; + match = pred->fn(pred, rec, val1, val2); + stack[top++] = match; } - if (and_failed) - return 0; - - return 1; + return stack[--top]; } EXPORT_SYMBOL_GPL(filter_match_preds); -static void __filter_print_preds(struct event_filter *filter, - struct trace_seq *s) +static void parse_error(struct filter_parse_state *ps, int err, int pos) { - struct filter_pred *pred; - char *field_name; - int i; + ps->lasterr = err; + ps->lasterr_pos = pos; +} - if (!filter || !filter->n_preds) { - trace_seq_printf(s, "none\n"); +static void remove_filter_string(struct event_filter *filter) +{ + kfree(filter->filter_string); + filter->filter_string = NULL; +} + +static int replace_filter_string(struct event_filter *filter, + char *filter_string) +{ + kfree(filter->filter_string); + filter->filter_string = kstrdup(filter_string, GFP_KERNEL); + if (!filter->filter_string) + return -ENOMEM; + + return 0; +} + +static int 
append_filter_string(struct event_filter *filter, + char *string) +{ + int newlen; + char *new_filter_string; + + BUG_ON(!filter->filter_string); + newlen = strlen(filter->filter_string) + strlen(string) + 1; + new_filter_string = kmalloc(newlen, GFP_KERNEL); + if (!new_filter_string) + return -ENOMEM; + + strcpy(new_filter_string, filter->filter_string); + strcat(new_filter_string, string); + kfree(filter->filter_string); + filter->filter_string = new_filter_string; + + return 0; +} + +static void append_filter_err(struct filter_parse_state *ps, + struct event_filter *filter) +{ + int pos = ps->lasterr_pos; + char *buf, *pbuf; + + buf = (char *)__get_free_page(GFP_TEMPORARY); + if (!buf) return; - } - for (i = 0; i < filter->n_preds; i++) { - pred = filter->preds[i]; - field_name = pred->field_name; - if (i) - trace_seq_printf(s, pred->or ? "|| " : "&& "); - trace_seq_printf(s, "%s ", field_name); - trace_seq_printf(s, pred->not ? "!= " : "== "); - if (pred->str_len) - trace_seq_printf(s, "%s\n", pred->str_val); - else - trace_seq_printf(s, "%llu\n", pred->val); - } + append_filter_string(filter, "\n"); + memset(buf, ' ', PAGE_SIZE); + if (pos > PAGE_SIZE - 128) + pos = 0; + buf[pos] = '^'; + pbuf = &buf[pos] + 1; + + sprintf(pbuf, "\nparse_error: %s\n", err_text[ps->lasterr]); + append_filter_string(filter, buf); + free_page((unsigned long) buf); } -void filter_print_preds(struct ftrace_event_call *call, struct trace_seq *s) +void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) { + struct event_filter *filter = call->filter; + mutex_lock(&filter_mutex); - __filter_print_preds(call->filter, s); + if (filter->filter_string) + trace_seq_printf(s, "%s\n", filter->filter_string); + else + trace_seq_printf(s, "none\n"); mutex_unlock(&filter_mutex); } -void filter_print_subsystem_preds(struct event_subsystem *system, +void print_subsystem_event_filter(struct event_subsystem *system, struct trace_seq *s) { + struct event_filter *filter = system->filter; + mutex_lock(&filter_mutex); - __filter_print_preds(system->filter, s); + if (filter->filter_string) + trace_seq_printf(s, "%s\n", filter->filter_string); + else + trace_seq_printf(s, "none\n"); mutex_unlock(&filter_mutex); } @@ -170,7 +303,7 @@ find_event_field(struct ftrace_event_call *call, char *name) return NULL; } -void filter_free_pred(struct filter_pred *pred) +static void filter_free_pred(struct filter_pred *pred) { if (!pred) return; @@ -191,15 +324,17 @@ static int filter_set_pred(struct filter_pred *dest, filter_pred_fn_t fn) { *dest = *src; - dest->field_name = kstrdup(src->field_name, GFP_KERNEL); - if (!dest->field_name) - return -ENOMEM; + if (src->field_name) { + dest->field_name = kstrdup(src->field_name, GFP_KERNEL); + if (!dest->field_name) + return -ENOMEM; + } dest->fn = fn; return 0; } -static void __filter_disable_preds(struct ftrace_event_call *call) +static void filter_disable_preds(struct ftrace_event_call *call) { struct event_filter *filter = call->filter; int i; @@ -211,13 +346,6 @@ static void __filter_disable_preds(struct ftrace_event_call *call) filter->preds[i]->fn = filter_pred_none; } -void filter_disable_preds(struct ftrace_event_call *call) -{ - mutex_lock(&filter_mutex); - __filter_disable_preds(call); - mutex_unlock(&filter_mutex); -} - int init_preds(struct ftrace_event_call *call) { struct event_filter *filter; @@ -258,48 +386,43 @@ oom: } EXPORT_SYMBOL_GPL(init_preds); -static void __filter_free_subsystem_preds(struct event_subsystem *system) +static void 
filter_free_subsystem_preds(struct event_subsystem *system) { struct event_filter *filter = system->filter; struct ftrace_event_call *call; int i; - if (filter && filter->n_preds) { + if (filter->n_preds) { for (i = 0; i < filter->n_preds; i++) filter_free_pred(filter->preds[i]); kfree(filter->preds); - kfree(filter); - system->filter = NULL; + filter->preds = NULL; + filter->n_preds = 0; } list_for_each_entry(call, &ftrace_events, list) { if (!call->define_fields) continue; - if (!strcmp(call->system, system->name)) - __filter_disable_preds(call); + if (!strcmp(call->system, system->name)) { + filter_disable_preds(call); + remove_filter_string(call->filter); + } } } -void filter_free_subsystem_preds(struct event_subsystem *system) -{ - mutex_lock(&filter_mutex); - __filter_free_subsystem_preds(system); - mutex_unlock(&filter_mutex); -} - -static int filter_add_pred_fn(struct ftrace_event_call *call, +static int filter_add_pred_fn(struct filter_parse_state *ps, + struct ftrace_event_call *call, struct filter_pred *pred, filter_pred_fn_t fn) { struct event_filter *filter = call->filter; int idx, err; - if (filter->n_preds && !pred->compound) - __filter_disable_preds(call); - - if (filter->n_preds == MAX_FILTER_PRED) + if (filter->n_preds == MAX_FILTER_PRED) { + parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); return -ENOSPC; + } idx = filter->n_preds; filter_clear_pred(filter->preds[idx]); @@ -321,94 +444,132 @@ static int is_string_field(const char *type) return 0; } -static int __filter_add_pred(struct ftrace_event_call *call, - struct filter_pred *pred) +static int is_legal_op(struct ftrace_event_field *field, int op) +{ + if (is_string_field(field->type) && (op != OP_EQ && op != OP_NE)) + return 0; + + return 1; +} + +static filter_pred_fn_t select_comparison_fn(int op, int field_size, + int field_is_signed) +{ + filter_pred_fn_t fn = NULL; + + switch (field_size) { + case 8: + if (op == OP_EQ || op == OP_NE) + fn = filter_pred_64; + else if (field_is_signed) + fn = filter_pred_s64; + else + fn = filter_pred_u64; + break; + case 4: + if (op == OP_EQ || op == OP_NE) + fn = filter_pred_32; + else if (field_is_signed) + fn = filter_pred_s32; + else + fn = filter_pred_u32; + break; + case 2: + if (op == OP_EQ || op == OP_NE) + fn = filter_pred_16; + else if (field_is_signed) + fn = filter_pred_s16; + else + fn = filter_pred_u16; + break; + case 1: + if (op == OP_EQ || op == OP_NE) + fn = filter_pred_8; + else if (field_is_signed) + fn = filter_pred_s8; + else + fn = filter_pred_u8; + break; + } + + return fn; +} + +static int filter_add_pred(struct filter_parse_state *ps, + struct ftrace_event_call *call, + struct filter_pred *pred) { struct ftrace_event_field *field; filter_pred_fn_t fn; unsigned long long val; + pred->fn = filter_pred_none; + + if (pred->op == OP_AND) { + pred->pop_n = 2; + return filter_add_pred_fn(ps, call, pred, filter_pred_and); + } else if (pred->op == OP_OR) { + pred->pop_n = 2; + return filter_add_pred_fn(ps, call, pred, filter_pred_or); + } + field = find_event_field(call, pred->field_name); - if (!field) + if (!field) { + parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0); return -EINVAL; + } - pred->fn = filter_pred_none; pred->offset = field->offset; + if (!is_legal_op(field, pred->op)) { + parse_error(ps, FILT_ERR_ILLEGAL_FIELD_OP, 0); + return -EINVAL; + } + if (is_string_field(field->type)) { fn = filter_pred_string; pred->str_len = field->size; - return filter_add_pred_fn(call, pred, fn); + if (pred->op == OP_NE) + pred->not = 1; + return filter_add_pred_fn(ps, call, 
pred, fn); } else { - if (strict_strtoull(pred->str_val, 0, &val)) + if (strict_strtoull(pred->str_val, 0, &val)) { + parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); return -EINVAL; + } pred->val = val; } - switch (field->size) { - case 8: - fn = filter_pred_64; - break; - case 4: - fn = filter_pred_32; - break; - case 2: - fn = filter_pred_16; - break; - case 1: - fn = filter_pred_8; - break; - default: + fn = select_comparison_fn(pred->op, field->size, field->is_signed); + if (!fn) { + parse_error(ps, FILT_ERR_INVALID_OP, 0); return -EINVAL; } - return filter_add_pred_fn(call, pred, fn); -} - -int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) -{ - int err; - - mutex_lock(&filter_mutex); - err = __filter_add_pred(call, pred); - mutex_unlock(&filter_mutex); + if (pred->op == OP_NE) + pred->not = 1; - return err; + return filter_add_pred_fn(ps, call, pred, fn); } -int filter_add_subsystem_pred(struct event_subsystem *system, - struct filter_pred *pred) +static int filter_add_subsystem_pred(struct filter_parse_state *ps, + struct event_subsystem *system, + struct filter_pred *pred, + char *filter_string) { struct event_filter *filter = system->filter; struct ftrace_event_call *call; - mutex_lock(&filter_mutex); - - if (filter && filter->n_preds && !pred->compound) { - __filter_free_subsystem_preds(system); - filter = NULL; - } - - if (!filter) { - system->filter = kzalloc(sizeof(*filter), GFP_KERNEL); - if (!system->filter) { - mutex_unlock(&filter_mutex); - return -ENOMEM; - } - filter = system->filter; + if (!filter->preds) { filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); - if (!filter->preds) { - kfree(system->filter); - system->filter = NULL; - mutex_unlock(&filter_mutex); + if (!filter->preds) return -ENOMEM; - } } if (filter->n_preds == MAX_FILTER_PRED) { - mutex_unlock(&filter_mutex); + parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); return -ENOSPC; } @@ -424,97 +585,508 @@ int filter_add_subsystem_pred(struct event_subsystem *system, if (strcmp(call->system, system->name)) continue; - err = __filter_add_pred(call, pred); - if (err == -ENOMEM) { - filter->preds[filter->n_preds] = NULL; - filter->n_preds--; - mutex_unlock(&filter_mutex); + err = filter_add_pred(ps, call, pred); + if (err) { + filter_free_subsystem_preds(system); + parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); return err; } + replace_filter_string(call->filter, filter_string); } - mutex_unlock(&filter_mutex); + return 0; +} + +static void parse_init(struct filter_parse_state *ps, + struct filter_op *ops, + char *infix_string) +{ + memset(ps, '\0', sizeof(*ps)); + + ps->infix.string = infix_string; + ps->infix.cnt = strlen(infix_string); + ps->ops = ops; + + INIT_LIST_HEAD(&ps->opstack); + INIT_LIST_HEAD(&ps->postfix); +} + +static char infix_next(struct filter_parse_state *ps) +{ + ps->infix.cnt--; + + return ps->infix.string[ps->infix.tail++]; +} + +static char infix_peek(struct filter_parse_state *ps) +{ + if (ps->infix.tail == strlen(ps->infix.string)) + return 0; + + return ps->infix.string[ps->infix.tail]; +} + +static void infix_advance(struct filter_parse_state *ps) +{ + ps->infix.cnt--; + ps->infix.tail++; +} + +static inline int is_precedence_lower(struct filter_parse_state *ps, + int a, int b) +{ + return ps->ops[a].precedence < ps->ops[b].precedence; +} + +static inline int is_op_char(struct filter_parse_state *ps, char c) +{ + int i; + + for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) { + if (ps->ops[i].string[0] == c) + return 1; + } return 0; } -/* - * 
The filter format can be - * - 0, which means remove all filter preds - * - [||/&&] ==/!= - */ -int filter_parse(char **pbuf, struct filter_pred *pred) -{ - char *tok, *val_str = NULL; - int tok_n = 0; - - while ((tok = strsep(pbuf, " \n"))) { - if (tok_n == 0) { - if (!strcmp(tok, "0")) { - pred->clear = 1; - return 0; - } else if (!strcmp(tok, "&&")) { - pred->or = 0; - pred->compound = 1; - } else if (!strcmp(tok, "||")) { - pred->or = 1; - pred->compound = 1; - } else - pred->field_name = tok; - tok_n = 1; - continue; +static int infix_get_op(struct filter_parse_state *ps, char firstc) +{ + char nextc = infix_peek(ps); + char opstr[3]; + int i; + + opstr[0] = firstc; + opstr[1] = nextc; + opstr[2] = '\0'; + + for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) { + if (!strcmp(opstr, ps->ops[i].string)) { + infix_advance(ps); + return ps->ops[i].id; } - if (tok_n == 1) { - if (!pred->field_name) - pred->field_name = tok; - else if (!strcmp(tok, "!=")) - pred->not = 1; - else if (!strcmp(tok, "==")) - pred->not = 0; - else { - pred->field_name = NULL; + } + + opstr[1] = '\0'; + + for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) { + if (!strcmp(opstr, ps->ops[i].string)) + return ps->ops[i].id; + } + + return OP_NONE; +} + +static inline void clear_operand_string(struct filter_parse_state *ps) +{ + memset(ps->operand.string, '\0', MAX_FILTER_STR_VAL); + ps->operand.tail = 0; +} + +static inline int append_operand_char(struct filter_parse_state *ps, char c) +{ + if (ps->operand.tail == MAX_FILTER_STR_VAL) + return -EINVAL; + + ps->operand.string[ps->operand.tail++] = c; + + return 0; +} + +static int filter_opstack_push(struct filter_parse_state *ps, int op) +{ + struct opstack_op *opstack_op; + + opstack_op = kmalloc(sizeof(*opstack_op), GFP_KERNEL); + if (!opstack_op) + return -ENOMEM; + + opstack_op->op = op; + list_add(&opstack_op->list, &ps->opstack); + + return 0; +} + +static int filter_opstack_empty(struct filter_parse_state *ps) +{ + return list_empty(&ps->opstack); +} + +static int filter_opstack_top(struct filter_parse_state *ps) +{ + struct opstack_op *opstack_op; + + if (filter_opstack_empty(ps)) + return OP_NONE; + + opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list); + + return opstack_op->op; +} + +static int filter_opstack_pop(struct filter_parse_state *ps) +{ + struct opstack_op *opstack_op; + int op; + + if (filter_opstack_empty(ps)) + return OP_NONE; + + opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list); + op = opstack_op->op; + list_del(&opstack_op->list); + + kfree(opstack_op); + + return op; +} + +static void filter_opstack_clear(struct filter_parse_state *ps) +{ + while (!filter_opstack_empty(ps)) + filter_opstack_pop(ps); +} + +static char *curr_operand(struct filter_parse_state *ps) +{ + return ps->operand.string; +} + +static int postfix_append_operand(struct filter_parse_state *ps, char *operand) +{ + struct postfix_elt *elt; + + elt = kmalloc(sizeof(*elt), GFP_KERNEL); + if (!elt) + return -ENOMEM; + + elt->op = OP_NONE; + elt->operand = kstrdup(operand, GFP_KERNEL); + if (!elt->operand) { + kfree(elt); + return -ENOMEM; + } + + list_add_tail(&elt->list, &ps->postfix); + + return 0; +} + +static int postfix_append_op(struct filter_parse_state *ps, int op) +{ + struct postfix_elt *elt; + + elt = kmalloc(sizeof(*elt), GFP_KERNEL); + if (!elt) + return -ENOMEM; + + elt->op = op; + elt->operand = NULL; + + list_add_tail(&elt->list, &ps->postfix); + + return 0; +} + +static void postfix_clear(struct filter_parse_state 
*ps) +{ + struct postfix_elt *elt; + + while (!list_empty(&ps->postfix)) { + elt = list_first_entry(&ps->postfix, struct postfix_elt, list); + kfree(elt->operand); + list_del(&elt->list); + } +} + +static int filter_parse(struct filter_parse_state *ps) +{ + int op, top_op; + char ch; + + while ((ch = infix_next(ps))) { + if (isspace(ch)) + continue; + + if (is_op_char(ps, ch)) { + op = infix_get_op(ps, ch); + if (op == OP_NONE) { + parse_error(ps, FILT_ERR_INVALID_OP, 0); return -EINVAL; } - tok_n = 2; + + if (strlen(curr_operand(ps))) { + postfix_append_operand(ps, curr_operand(ps)); + clear_operand_string(ps); + } + + while (!filter_opstack_empty(ps)) { + top_op = filter_opstack_top(ps); + if (!is_precedence_lower(ps, top_op, op)) { + top_op = filter_opstack_pop(ps); + postfix_append_op(ps, top_op); + continue; + } + break; + } + + filter_opstack_push(ps, op); continue; } - if (tok_n == 2) { - if (pred->compound) { - if (!strcmp(tok, "!=")) - pred->not = 1; - else if (!strcmp(tok, "==")) - pred->not = 0; - else { - pred->field_name = NULL; - return -EINVAL; - } - } else { - val_str = tok; - break; /* done */ + + if (ch == '(') { + filter_opstack_push(ps, OP_OPEN_PAREN); + continue; + } + + if (ch == ')') { + if (strlen(curr_operand(ps))) { + postfix_append_operand(ps, curr_operand(ps)); + clear_operand_string(ps); + } + + top_op = filter_opstack_pop(ps); + while (top_op != OP_NONE) { + if (top_op == OP_OPEN_PAREN) + break; + postfix_append_op(ps, top_op); + top_op = filter_opstack_pop(ps); + } + if (top_op == OP_NONE) { + parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0); + return -EINVAL; } - tok_n = 3; continue; } - if (tok_n == 3) { - val_str = tok; - break; /* done */ + if (append_operand_char(ps, ch)) { + parse_error(ps, FILT_ERR_OPERAND_TOO_LONG, 0); + return -EINVAL; + } + } + + if (strlen(curr_operand(ps))) + postfix_append_operand(ps, curr_operand(ps)); + + while (!filter_opstack_empty(ps)) { + top_op = filter_opstack_pop(ps); + if (top_op == OP_NONE) + break; + if (top_op == OP_OPEN_PAREN) { + parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0); + return -EINVAL; + } + postfix_append_op(ps, top_op); + } + + return 0; +} + +static struct filter_pred *create_pred(int op, char *operand1, char *operand2) +{ + struct filter_pred *pred; + + pred = kzalloc(sizeof(*pred), GFP_KERNEL); + if (!pred) + return NULL; + + pred->field_name = kstrdup(operand1, GFP_KERNEL); + if (!pred->field_name) { + kfree(pred); + return NULL; + } + + strcpy(pred->str_val, operand2); + pred->str_len = strlen(operand2); + + pred->op = op; + + return pred; +} + +static struct filter_pred *create_logical_pred(int op) +{ + struct filter_pred *pred; + + pred = kzalloc(sizeof(*pred), GFP_KERNEL); + if (!pred) + return NULL; + + pred->op = op; + + return pred; +} + +static int check_preds(struct filter_parse_state *ps) +{ + int n_normal_preds = 0, n_logical_preds = 0; + struct postfix_elt *elt; + + list_for_each_entry(elt, &ps->postfix, list) { + if (elt->op == OP_NONE) + continue; + + if (elt->op == OP_AND || elt->op == OP_OR) { + n_logical_preds++; + continue; } + n_normal_preds++; } - if (!val_str || !strlen(val_str) - || strlen(val_str) >= MAX_FILTER_STR_VAL) { - pred->field_name = NULL; + if (!n_normal_preds || n_logical_preds >= n_normal_preds) { + parse_error(ps, FILT_ERR_INVALID_FILTER, 0); return -EINVAL; } - strcpy(pred->str_val, val_str); - pred->str_len = strlen(val_str); + return 0; +} - pred->field_name = kstrdup(pred->field_name, GFP_KERNEL); - if (!pred->field_name) - return -ENOMEM; +static int 
replace_preds(struct event_subsystem *system, + struct ftrace_event_call *call, + struct filter_parse_state *ps, + char *filter_string) +{ + char *operand1 = NULL, *operand2 = NULL; + struct filter_pred *pred; + struct postfix_elt *elt; + int err; + + err = check_preds(ps); + if (err) + return err; + + list_for_each_entry(elt, &ps->postfix, list) { + if (elt->op == OP_NONE) { + if (!operand1) + operand1 = elt->operand; + else if (!operand2) + operand2 = elt->operand; + else { + parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0); + return -EINVAL; + } + continue; + } + + if (elt->op == OP_AND || elt->op == OP_OR) { + pred = create_logical_pred(elt->op); + if (call) { + err = filter_add_pred(ps, call, pred); + filter_free_pred(pred); + } else + err = filter_add_subsystem_pred(ps, system, + pred, filter_string); + if (err) + return err; + + operand1 = operand2 = NULL; + continue; + } + + if (!operand1 || !operand2) { + parse_error(ps, FILT_ERR_MISSING_FIELD, 0); + return -EINVAL; + } + + pred = create_pred(elt->op, operand1, operand2); + if (call) { + err = filter_add_pred(ps, call, pred); + filter_free_pred(pred); + } else + err = filter_add_subsystem_pred(ps, system, pred, + filter_string); + if (err) + return err; + + operand1 = operand2 = NULL; + } return 0; } +int apply_event_filter(struct ftrace_event_call *call, char *filter_string) +{ + int err; + + struct filter_parse_state *ps; + + mutex_lock(&filter_mutex); + + if (!strcmp(strstrip(filter_string), "0")) { + filter_disable_preds(call); + remove_filter_string(call->filter); + mutex_unlock(&filter_mutex); + return 0; + } + + ps = kzalloc(sizeof(*ps), GFP_KERNEL); + if (!ps) + return -ENOMEM; + + filter_disable_preds(call); + replace_filter_string(call->filter, filter_string); + + parse_init(ps, filter_ops, filter_string); + err = filter_parse(ps); + if (err) { + append_filter_err(ps, call->filter); + goto out; + } + + err = replace_preds(NULL, call, ps, filter_string); + if (err) + append_filter_err(ps, call->filter); + +out: + filter_opstack_clear(ps); + postfix_clear(ps); + kfree(ps); + + mutex_unlock(&filter_mutex); + + return err; +} + +int apply_subsystem_event_filter(struct event_subsystem *system, + char *filter_string) +{ + int err; + + struct filter_parse_state *ps; + + mutex_lock(&filter_mutex); + + if (!strcmp(strstrip(filter_string), "0")) { + filter_free_subsystem_preds(system); + remove_filter_string(system->filter); + mutex_unlock(&filter_mutex); + return 0; + } + + ps = kzalloc(sizeof(*ps), GFP_KERNEL); + if (!ps) + return -ENOMEM; + + filter_free_subsystem_preds(system); + replace_filter_string(system->filter, filter_string); + + parse_init(ps, filter_ops, filter_string); + err = filter_parse(ps); + if (err) { + append_filter_err(ps, system->filter); + goto out; + } + + err = replace_preds(system, NULL, ps, filter_string); + if (err) + append_filter_err(ps, system->filter); + +out: + filter_opstack_clear(ps); + postfix_clear(ps); + kfree(ps); + + mutex_unlock(&filter_mutex); + + return err; +} -- cgit v1.2.2 From a0e39ed378fb6ba916522764cd508fa7d42ad495 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 29 Apr 2009 13:51:39 +0200 Subject: tracing: fix build failure on s390 "tracing: create automated trace defines" causes this compile error on s390, as reported by Sachin Sant against linux-next: kernel/built-in.o: In function `__do_softirq': (.text+0x1c680): undefined reference to `__tracepoint_softirq_entry' This happens because the definitions of the softirq tracepoints were moved from kernel/softirq.c to 
kernel/irq/handle.c. Since s390 doesn't support generic hardirqs handle.c doesn't get compiled and the definitions are missing. So move the tracepoints to softirq.c again. [ Impact: fix build failure on s390 ] Reported-by: Sachin Sant Signed-off-by: Heiko Carstens Cc: Steven Rostedt Cc: fweisbec@gmail.com LKML-Reference: <20090429135139.5fac79b8@osiris.boeblingen.de.ibm.com> Signed-off-by: Ingo Molnar --- kernel/irq/handle.c | 2 -- kernel/softirq.c | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 37c63633e78b..e68bb5aebe02 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -18,8 +18,6 @@ #include #include #include - -#define CREATE_TRACE_POINTS #include #include "internals.h" diff --git a/kernel/softirq.c b/kernel/softirq.c index 7ab9dfd8d082..d4ba347a872d 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -24,6 +24,8 @@ #include #include #include + +#define CREATE_TRACE_POINTS #include #include -- cgit v1.2.2 From 56afb0f8823650f53a5f0e96d69a282e8892c61b Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Thu, 30 Apr 2009 13:29:36 -0400 Subject: kerneldoc, tracing: make kernel-doc understand TRACE_EVENT() macro (take #2) Add support to kernel-doc for tracepoint comments above TRACE_EVENT() macro definitions. Paves the way for tracepoint docbook. [ Impact: extend DocBook infrastructure ] Signed-off-by: Jason Baron Acked-by: Randy Dunlap Cc: akpm@linux-foundation.org Cc: rostedt@goodmis.org Cc: fweisbec@gmail.com Cc: mathieu.desnoyers@polymtl.ca Cc: wcohen@redhat.com LKML-Reference: Signed-off-by: Ingo Molnar --- scripts/kernel-doc | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/scripts/kernel-doc b/scripts/kernel-doc index 0f11870116dc..2b53a55fbece 100755 --- a/scripts/kernel-doc +++ b/scripts/kernel-doc @@ -1827,6 +1827,25 @@ sub reset_state { $state = 0; } +sub tracepoint_munge($) { + my $file = shift; + my $tracepointname = 0; + my $tracepointargs = 0; + + if($prototype =~ m/TRACE_EVENT\((.*?),/) { + $tracepointname = $1; + } + if($prototype =~ m/TP_PROTO\((.*?)\)/) { + $tracepointargs = $1; + } + if (($tracepointname eq 0) || ($tracepointargs eq 0)) { + print STDERR "Warning(${file}:$.): Unrecognized tracepoint format: \n". + "$prototype\n"; + } else { + $prototype = "static inline void trace_$tracepointname($tracepointargs)"; + } +} + sub syscall_munge() { my $void = 0; @@ -1881,6 +1900,9 @@ sub process_state3_function($$) { if ($prototype =~ /SYSCALL_DEFINE/) { syscall_munge(); } + if ($prototype =~ /TRACE_EVENT/) { + tracepoint_munge($file); + } dump_function($prototype, $file); reset_state(); } -- cgit v1.2.2 From a76f8c6da1e48fd4ef025f42c736389532ff30ba Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Thu, 30 Apr 2009 13:29:42 -0400 Subject: tracing: add new tracepoints docbook Add tracepoint docbook. This will help us document and understand what tracepoints are in the kernel. Since there are multiple macros, and files that contain tracepoints. 
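As a rough illustration of the input this docbook consumes (building on the kernel-doc support added above; the names below mirror the irq header documented later in this series), a kernel-doc comment placed over a TRACE_EVENT() definition such as

	/**
	 * irq_handler_entry - called immediately before the irq action handler
	 * @irq: irq number
	 * @action: pointer to struct irqaction
	 */
	TRACE_EVENT(irq_handler_entry,
		TP_PROTO(int irq, struct irqaction *action),
		...

is munged by tracepoint_munge() into the prototype

	static inline void trace_irq_handler_entry(int irq, struct irqaction *action)

which kernel-doc then documents like any other function.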
[ Impact: add documentation ] Signed-off-by: Jason Baron Acked-by: Randy Dunlap Cc: akpm@linux-foundation.org Cc: rostedt@goodmis.org Cc: fweisbec@gmail.com Cc: mathieu.desnoyers@polymtl.ca Cc: wcohen@redhat.com LKML-Reference: <84160b6bd94aff02455da7e12bad054d34c579a0.1241107197.git.jbaron@redhat.com> Signed-off-by: Ingo Molnar --- Documentation/DocBook/Makefile | 3 +- Documentation/DocBook/tracepoint.tmpl | 84 +++++++++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+), 1 deletion(-) create mode 100644 Documentation/DocBook/tracepoint.tmpl diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile index 8918a32c6b3a..4c8f4d6e114a 100644 --- a/Documentation/DocBook/Makefile +++ b/Documentation/DocBook/Makefile @@ -13,7 +13,8 @@ DOCBOOKS := z8530book.xml mcabook.xml device-drivers.xml \ gadget.xml libata.xml mtdnand.xml librs.xml rapidio.xml \ genericirq.xml s390-drivers.xml uio-howto.xml scsi.xml \ mac80211.xml debugobjects.xml sh.xml regulator.xml \ - alsa-driver-api.xml writing-an-alsa-driver.xml + alsa-driver-api.xml writing-an-alsa-driver.xml \ + tracepoint.xml ### # The build process is as follows (targets): diff --git a/Documentation/DocBook/tracepoint.tmpl b/Documentation/DocBook/tracepoint.tmpl new file mode 100644 index 000000000000..70891bc68491 --- /dev/null +++ b/Documentation/DocBook/tracepoint.tmpl @@ -0,0 +1,84 @@ + + + + + + The Linux Kernel Tracepoint API + + + + Jason + Baron + +
+ jbaron@redhat.com +
+
+
+
+ + + + This documentation is free software; you can redistribute + it and/or modify it under the terms of the GNU General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later + version. + + + + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. + + + + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + + + + For more details see the file COPYING in the source + distribution of Linux. + + +
+ + + + Introduction + + Tracepoints are static probe points that are located in strategic points + throughout the kernel. 'Probes' register/unregister with tracepoints + via a callback mechanism. The 'probes' are strictly typed functions that + are passed a unique set of parameters defined by each tracepoint. + + + + From this simple callback mechanism, 'probes' can be used to profile, debug, + and understand kernel behavior. There are a number of tools that provide a + framework for using 'probes'. These tools include Systemtap, ftrace, and + LTTng. + + + + Tracepoints are defined in a number of header files via various macros. Thus, + the purpose of this document is to provide a clear accounting of the available + tracepoints. The intention is to understand not only what tracepoints are + available but also to understand where future tracepoints might be added. + + + + The API presented has functions of the form: + trace_tracepointname(function parameters). These are the + tracepoints callbacks that are found throughout the code. Registering and + unregistering probes with these callback sites is covered in the + Documentation/trace/* directory. + + + +
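(Illustrative aside, not part of the template: at their call sites the callbacks described above look like ordinary function calls; the irq handler path in kernel/irq/handle.c roughly does

	trace_irq_handler_entry(irq, action);
	ret = action->handler(irq, action->dev_id);
	trace_irq_handler_exit(irq, action, ret);

with the trace_* calls reducing to a cheap disabled branch when no probe is registered.)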
-- cgit v1.2.2 From 9ee1983c9aa18f12388ef660d0c76a23dc112959 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Thu, 30 Apr 2009 13:29:47 -0400 Subject: tracing: add irq tracepoint documentation Document irqs for the newly created docbook. [ Impact: add documentation ] Signed-off-by: Jason Baron Acked-by: Randy Dunlap Cc: akpm@linux-foundation.org Cc: rostedt@goodmis.org Cc: fweisbec@gmail.com Cc: mathieu.desnoyers@polymtl.ca Cc: wcohen@redhat.com LKML-Reference: <73ff42be3420157667ec548e9b0e409c3cfad05f.1241107197.git.jbaron@redhat.com> Signed-off-by: Ingo Molnar --- Documentation/DocBook/tracepoint.tmpl | 5 ++++ include/trace/events/irq.h | 46 ++++++++++++++++++++++++++++++++--- 2 files changed, 47 insertions(+), 4 deletions(-) diff --git a/Documentation/DocBook/tracepoint.tmpl b/Documentation/DocBook/tracepoint.tmpl index 70891bc68491..b0756d0fd579 100644 --- a/Documentation/DocBook/tracepoint.tmpl +++ b/Documentation/DocBook/tracepoint.tmpl @@ -81,4 +81,9 @@ + + IRQ +!Iinclude/trace/events/irq.h + + diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h index 768686467518..32a9f7ef432b 100644 --- a/include/trace/events/irq.h +++ b/include/trace/events/irq.h @@ -7,8 +7,16 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM irq -/* - * Tracepoint for entry of interrupt handler: +/** + * irq_handler_entry - called immediately before the irq action handler + * @irq: irq number + * @action: pointer to struct irqaction + * + * The struct irqaction pointed to by @action contains various + * information about the handler, including the device name, + * @action->name, and the device id, @action->dev_id. When used in + * conjunction with the irq_handler_exit tracepoint, we can figure + * out irq handler latencies. */ TRACE_EVENT(irq_handler_entry, @@ -29,8 +37,16 @@ TRACE_EVENT(irq_handler_entry, TP_printk("irq=%d handler=%s", __entry->irq, __get_str(name)) ); -/* - * Tracepoint for return of an interrupt handler: +/** + * irq_handler_exit - called immediately after the irq action handler returns + * @irq: irq number + * @action: pointer to struct irqaction + * @ret: return value + * + * If the @ret value is set to IRQ_HANDLED, then we know that the corresponding + * @action->handler scuccessully handled this irq. Otherwise, the irq might be + * a shared irq line, or the irq was not handled successfully. Can be used in + * conjunction with the irq_handler_entry to understand irq handler latencies. */ TRACE_EVENT(irq_handler_exit, @@ -52,6 +68,17 @@ TRACE_EVENT(irq_handler_exit, __entry->irq, __entry->ret ? "handled" : "unhandled") ); +/** + * softirq_entry - called immediately before the softirq handler + * @h: pointer to struct softirq_action + * @vec: pointer to first struct softirq_action in softirq_vec array + * + * The @h parameter, contains a pointer to the struct softirq_action + * which has a pointer to the action handler that is called. By subtracting + * the @vec pointer from the @h pointer, we can determine the softirq + * number. Also, when used in combination with the softirq_exit tracepoint + * we can determine the softirq latency. 
+ */ TRACE_EVENT(softirq_entry, TP_PROTO(struct softirq_action *h, struct softirq_action *vec), @@ -71,6 +98,17 @@ TRACE_EVENT(softirq_entry, TP_printk("softirq=%d action=%s", __entry->vec, __get_str(name)) ); +/** + * softirq_exit - called immediately after the softirq handler returns + * @h: pointer to struct softirq_action + * @vec: pointer to first struct softirq_action in softirq_vec array + * + * The @h parameter contains a pointer to the struct softirq_action + * that has handled the softirq. By subtracting the @vec pointer from + * the @h pointer, we can determine the softirq number. Also, when used in + * combination with the softirq_entry tracepoint we can determine the softirq + * latency. + */ TRACE_EVENT(softirq_exit, TP_PROTO(struct softirq_action *h, struct softirq_action *vec), -- cgit v1.2.2 From d6ce96dabe2c4409fd009ec14250a1fdbab4b133 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 5 May 2009 01:15:24 -0400 Subject: ring-buffer: export symbols I'm adding a module to do a series of tests on the ring buffer as well as benchmarks. This module needs to have more of the ring buffer API exported. There's nothing wrong with reading the ring buffer from a module. [ Impact: allow modules to read pages from the ring buffer ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f4cc59040ebf..3e86da9b2a09 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2802,6 +2802,7 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer) return bpage; } +EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page); /** * ring_buffer_free_read_page - free an allocated read page @@ -2814,6 +2815,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data) { free_page((unsigned long)data); } +EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); /** * ring_buffer_read_page - extract a page from the ring buffer @@ -2959,6 +2961,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, out: return ret; } +EXPORT_SYMBOL_GPL(ring_buffer_read_page); static ssize_t rb_simple_read(struct file *filp, char __user *ubuf, -- cgit v1.2.2 From f0d2c681ac0a85142fc8abe65fc33fcad35cb9b7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 29 Apr 2009 13:43:37 -0400 Subject: ring-buffer: add counters for commit overrun and nmi dropped entries The WARN_ON in the ring buffer when a commit is preempted and the buffer is filled by preceding writes can happen in normal operations. The WARN_ON makes it look like a bug, not to mention, because it does not stop tracing and calls printk which can also recurse, this is prone to deadlock (the WARN_ON is not in a position to recurse). This patch removes the WARN_ON and replaces it with a counter that can be retrieved by a tracer. This counter is called commit_overrun. While at it, I added a nmi_dropped counter to count any time an NMI entry is dropped because the NMI could not take the spinlock. 
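For illustration only (not part of this patch), a tracer can dump the new counters through the accessors added below, along the lines of:

	unsigned long nmi_dropped, commit_overrun;

	nmi_dropped    = ring_buffer_nmi_dropped_cpu(buffer, cpu);
	commit_overrun = ring_buffer_commit_overrun_cpu(buffer, cpu);
	pr_info("cpu%d: nmi dropped: %lu, commit overrun: %lu\n",
		cpu, nmi_dropped, commit_overrun);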
[ Impact: prevent deadlock by printing normal case warning ] Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 2 ++ kernel/trace/ring_buffer.c | 52 ++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 51 insertions(+), 3 deletions(-) diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 1c2f80911fbe..f1345828c7c5 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -153,6 +153,8 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer); unsigned long ring_buffer_overruns(struct ring_buffer *buffer); unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu); unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu); +unsigned long ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu); +unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu); u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu); void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer, diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 3e86da9b2a09..26e1359fe193 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -402,6 +402,8 @@ struct ring_buffer_per_cpu { struct buffer_page *tail_page; /* write to tail */ struct buffer_page *commit_page; /* committed pages */ struct buffer_page *reader_page; + unsigned long nmi_dropped; + unsigned long commit_overrun; unsigned long overrun; unsigned long entries; u64 write_stamp; @@ -1216,8 +1218,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, * simply fail. */ if (unlikely(in_nmi())) { - if (!__raw_spin_trylock(&cpu_buffer->lock)) + if (!__raw_spin_trylock(&cpu_buffer->lock)) { + cpu_buffer->nmi_dropped++; goto out_reset; + } } else __raw_spin_lock(&cpu_buffer->lock); @@ -1238,8 +1242,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, * about it. 
*/ if (unlikely(next_page == commit_page)) { - /* This can easily happen on small ring buffers */ - WARN_ON_ONCE(buffer->pages > 2); + cpu_buffer->commit_overrun++; goto out_reset; } @@ -1925,6 +1928,47 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu) } EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); +/** + * ring_buffer_nmi_dropped_cpu - get the number of nmis that were dropped + * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the number of overruns from + */ +unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long ret; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + ret = cpu_buffer->nmi_dropped; + + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_nmi_dropped_cpu); + +/** + * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits + * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the number of overruns from + */ +unsigned long +ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long ret; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + ret = cpu_buffer->commit_overrun; + + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu); + /** * ring_buffer_entries - get the number of entries in a buffer * @buffer: The ring buffer @@ -2595,6 +2639,8 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) local_set(&cpu_buffer->reader_page->page->commit, 0); cpu_buffer->reader_page->read = 0; + cpu_buffer->nmi_dropped = 0; + cpu_buffer->commit_overrun = 0; cpu_buffer->overrun = 0; cpu_buffer->entries = 0; -- cgit v1.2.2 From c8d771835e18c938dae8690611d65fe98ad30f58 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 29 Apr 2009 18:03:45 -0400 Subject: tracing: export stats of ring buffers to userspace This patch adds stats to the ftrace ring buffers: # cat /debugfs/tracing/per_cpu/cpu0/stats entries: 42360 overrun: 30509326 commit overrun: 0 nmi dropped: 0 Where entries are the total number of data entries in the buffer. overrun is the number of entries not consumed and were overwritten by the writer. commit overrun is the number of entries dropped due to nested writers wrapping the buffer before the initial writer finished the commit. nmi dropped is the number of entries dropped due to the ring buffer lock being held when an nmi was going to write to the ring buffer. Note, this field will be meaningless and will go away when the ring buffer becomes lockless. 
[ Impact: let userspace know what is happening in the ring buffers ] Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f5427e0fc982..74df029056b0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3595,6 +3595,45 @@ static const struct file_operations tracing_buffers_fops = { .llseek = no_llseek, }; +static ssize_t +tracing_stats_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + unsigned long cpu = (unsigned long)filp->private_data; + struct trace_array *tr = &global_trace; + struct trace_seq *s; + unsigned long cnt; + + s = kmalloc(sizeof(*s), GFP_ATOMIC); + if (!s) + return ENOMEM; + + trace_seq_init(s); + + cnt = ring_buffer_entries_cpu(tr->buffer, cpu); + trace_seq_printf(s, "entries: %ld\n", cnt); + + cnt = ring_buffer_overrun_cpu(tr->buffer, cpu); + trace_seq_printf(s, "overrun: %ld\n", cnt); + + cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); + trace_seq_printf(s, "commit overrun: %ld\n", cnt); + + cnt = ring_buffer_nmi_dropped_cpu(tr->buffer, cpu); + trace_seq_printf(s, "nmi dropped: %ld\n", cnt); + + count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); + + kfree(s); + + return count; +} + +static const struct file_operations tracing_stats_fops = { + .open = tracing_open_generic, + .read = tracing_stats_read, +}; + #ifdef CONFIG_DYNAMIC_FTRACE int __weak ftrace_arch_read_dyn_info(char *buf, int size) @@ -3708,6 +3747,9 @@ static void tracing_init_debugfs_percpu(long cpu) trace_create_file("trace_pipe_raw", 0444, d_cpu, (void *) cpu, &tracing_buffers_fops); + + trace_create_file("stats", 0444, d_cpu, + (void *) cpu, &tracing_stats_fops); } #ifdef CONFIG_FTRACE_SELFTEST -- cgit v1.2.2 From e4906eff9e6fbd2d311abcbcc53d5a531773c982 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 30 Apr 2009 20:49:44 -0400 Subject: ring-buffer: convert cpu buffer entries to local_t The entries counter in cpu buffer is not atomic. It can be updated by other interrupts or from another CPU (readers). But making entries into "atomic_t" causes an atomic operation that can hurt performance. Instead we convert it to a local_t that will increment a counter with a local CPU atomic operation (if the arch supports it). Instead of fighting with readers and overwrites that decrement the counter, I added a "read" counter. Every time a reader reads an entry it is incremented. We already have a overrun counter and with that, the entries counter and the read counter, we can calculate the total number of entries in the buffer with: (entries - overrun) - read As long as the total number of entries in the ring buffer is less than the word size, this will work. But since the entries counter was previously a long, this is no different than what we had before. Thanks to Andrew Morton for pointing out in the first version that atomic_t does not replace unsigned long. I switched to atomic_long_t even though it is signed. A negative count is most likely a bug. 
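In short, the counting scheme described above (all taken from the hunks below) boils down to:

	local_inc(&cpu_buffer->entries);	/* writer commits an event        */
	cpu_buffer->read++;			/* reader consumes an event       */
	cpu_buffer->overrun++;			/* writer overwrites an unread one */

	/* entries currently in the buffer */
	ret = (local_read(&cpu_buffer->entries) - cpu_buffer->overrun)
		- cpu_buffer->read;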
[ Impact: keep accurate count of cpu buffer entries ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 26e1359fe193..c792ea893b01 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -405,7 +405,8 @@ struct ring_buffer_per_cpu { unsigned long nmi_dropped; unsigned long commit_overrun; unsigned long overrun; - unsigned long entries; + unsigned long read; + local_t entries; u64 write_stamp; u64 read_stamp; atomic_t record_disabled; @@ -997,7 +998,6 @@ static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer) if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) continue; cpu_buffer->overrun++; - cpu_buffer->entries--; } } @@ -1588,7 +1588,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { - cpu_buffer->entries++; + local_inc(&cpu_buffer->entries); /* Only process further if we own the commit */ if (!rb_is_commit(cpu_buffer, event)) @@ -1722,7 +1722,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, * The commit is still visible by the reader, so we * must increment entries. */ - cpu_buffer->entries++; + local_inc(&cpu_buffer->entries); out: /* * If a write came in and pushed the tail page @@ -1902,7 +1902,8 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) return 0; cpu_buffer = buffer->buffers[cpu]; - ret = cpu_buffer->entries; + ret = (local_read(&cpu_buffer->entries) - cpu_buffer->overrun) + - cpu_buffer->read; return ret; } @@ -1985,7 +1986,8 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer) /* if you care about this being correct, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; - entries += cpu_buffer->entries; + entries += (local_read(&cpu_buffer->entries) - + cpu_buffer->overrun) - cpu_buffer->read; } return entries; @@ -2225,7 +2227,7 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX || rb_discarded_event(event)) - cpu_buffer->entries--; + cpu_buffer->read++; rb_update_read_stamp(cpu_buffer, event); @@ -2642,7 +2644,8 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->nmi_dropped = 0; cpu_buffer->commit_overrun = 0; cpu_buffer->overrun = 0; - cpu_buffer->entries = 0; + cpu_buffer->read = 0; + local_set(&cpu_buffer->entries, 0); cpu_buffer->write_stamp = 0; cpu_buffer->read_stamp = 0; @@ -2813,7 +2816,7 @@ static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer, /* Only count data entries */ if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) continue; - cpu_buffer->entries--; + cpu_buffer->read++; } __raw_spin_unlock(&cpu_buffer->lock); } -- cgit v1.2.2 From 778c55d44eb4f5f658915ed631d68ed9d1ac3ad1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 1 May 2009 18:44:45 -0400 Subject: ring-buffer: record page entries in buffer page descriptor Currently, when the ring buffer writer overflows the buffer and must write over non consumed data, we increment the overrun counter by reading the entries on the page we are about to overwrite. This reads the entries one by one. This is not very effecient. This patch adds another entry counter into each buffer page descriptor that keeps track of the number of entries on the page. 
Now on overwrite, the overrun counter simply needs to add the number of entries that is on the page it is about to overwrite. [ Impact: speed up of ring buffer in overwrite mode ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 39 +++++++++++++-------------------------- 1 file changed, 13 insertions(+), 26 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index c792ea893b01..342eacc4baa8 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -321,9 +321,10 @@ struct buffer_data_page { }; struct buffer_page { + struct list_head list; /* list of buffer pages */ local_t write; /* index for next write */ unsigned read; /* index for next read */ - struct list_head list; /* list of free pages */ + local_t entries; /* entries on this page */ struct buffer_data_page *page; /* Actual data page */ }; @@ -977,30 +978,6 @@ static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer) return rb_page_commit(cpu_buffer->head_page); } -/* - * When the tail hits the head and the buffer is in overwrite mode, - * the head jumps to the next page and all content on the previous - * page is discarded. But before doing so, we update the overrun - * variable of the buffer. - */ -static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer) -{ - struct ring_buffer_event *event; - unsigned long head; - - for (head = 0; head < rb_head_size(cpu_buffer); - head += rb_event_length(event)) { - - event = __rb_page_index(cpu_buffer->head_page, head); - if (RB_WARN_ON(cpu_buffer, rb_null_event(event))) - return; - /* Only count data entries */ - if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) - continue; - cpu_buffer->overrun++; - } -} - static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page **bpage) { @@ -1253,7 +1230,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, /* tail_page has not moved yet? */ if (tail_page == cpu_buffer->tail_page) { /* count overflows */ - rb_update_overflow(cpu_buffer); + cpu_buffer->overrun += + local_read(&head_page->entries); rb_inc_page(cpu_buffer, &head_page); cpu_buffer->head_page = head_page; @@ -1268,6 +1246,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, */ if (tail_page == cpu_buffer->tail_page) { local_set(&next_page->write, 0); + local_set(&next_page->entries, 0); local_set(&next_page->page->commit, 0); cpu_buffer->tail_page = next_page; @@ -1313,6 +1292,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, event = __rb_page_index(tail_page, tail); rb_update_event(event, type, length); + /* The passed in type is zero for DATA */ + if (likely(!type)) + local_inc(&tail_page->entries); + /* * If this is a commit and the tail is zero, then update * this page's time stamp. 
@@ -2183,6 +2166,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->reader_page->list.prev = reader->list.prev; local_set(&cpu_buffer->reader_page->write, 0); + local_set(&cpu_buffer->reader_page->entries, 0); local_set(&cpu_buffer->reader_page->page->commit, 0); /* Make the reader page now replace the head */ @@ -2629,6 +2613,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->head_page = list_entry(cpu_buffer->pages.next, struct buffer_page, list); local_set(&cpu_buffer->head_page->write, 0); + local_set(&cpu_buffer->head_page->entries, 0); local_set(&cpu_buffer->head_page->page->commit, 0); cpu_buffer->head_page->read = 0; @@ -2638,6 +2623,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) INIT_LIST_HEAD(&cpu_buffer->reader_page->list); local_set(&cpu_buffer->reader_page->write, 0); + local_set(&cpu_buffer->reader_page->entries, 0); local_set(&cpu_buffer->reader_page->page->commit, 0); cpu_buffer->reader_page->read = 0; @@ -2996,6 +2982,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, bpage = reader->page; reader->page = *data_page; local_set(&reader->write, 0); + local_set(&reader->entries, 0); reader->read = 0; *data_page = bpage; -- cgit v1.2.2 From afbab76a62b69ea6197e19727d4b8a8aef8deb25 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 1 May 2009 19:40:05 -0400 Subject: ring-buffer: have read page swap increment counter with page entries In the swap page ring buffer code that is used by the ftrace splice code, we scan the page to increment the counter of entries read. With the number of entries already in the page we simply need to add it. [ Impact: speed up reading page from ring buffer ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 28 +++------------------------- 1 file changed, 3 insertions(+), 25 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 342eacc4baa8..9e42a742a3f9 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2785,28 +2785,6 @@ out: } EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); -static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer, - struct buffer_data_page *bpage, - unsigned int offset) -{ - struct ring_buffer_event *event; - unsigned long head; - - __raw_spin_lock(&cpu_buffer->lock); - for (head = offset; head < local_read(&bpage->commit); - head += rb_event_length(event)) { - - event = __rb_data_page_index(bpage, head); - if (RB_WARN_ON(cpu_buffer, rb_null_event(event))) - return; - /* Only count data entries */ - if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) - continue; - cpu_buffer->read++; - } - __raw_spin_unlock(&cpu_buffer->lock); -} - /** * ring_buffer_alloc_read_page - allocate a page to read from buffer * @buffer: the buffer to allocate for. 
@@ -2977,6 +2955,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer, /* we copied everything to the beginning */ read = 0; } else { + /* update the entry counter */ + cpu_buffer->read += local_read(&reader->entries); + /* swap the pages */ rb_init_page(bpage); bpage = reader->page; @@ -2985,9 +2966,6 @@ int ring_buffer_read_page(struct ring_buffer *buffer, local_set(&reader->entries, 0); reader->read = 0; *data_page = bpage; - - /* update the entry counter */ - rb_remove_entries(cpu_buffer, bpage, read); } ret = read; -- cgit v1.2.2 From 41ede23eded40832c955d98d4b71bc244809abb3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 1 May 2009 20:26:54 -0400 Subject: ring-buffer: disable writers when resetting buffers As a precaution, it is best to disable writing to the ring buffers when reseting them. [ Impact: prevent weird things if write happens during reset ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9e42a742a3f9..7876df00695f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2650,6 +2650,8 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; + atomic_inc(&cpu_buffer->record_disabled); + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); __raw_spin_lock(&cpu_buffer->lock); @@ -2659,6 +2661,8 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) __raw_spin_unlock(&cpu_buffer->lock); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + atomic_dec(&cpu_buffer->record_disabled); } EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); -- cgit v1.2.2 From 31b6e76e21b2ffd3cb2f6fe4149790a9fdadce2d Mon Sep 17 00:00:00 2001 From: Tim Abbott Date: Thu, 30 Apr 2009 20:06:11 -0400 Subject: ftrace: use .sched.text, not .text.sched in recordmcount.pl The only references in the kernel to the .text.sched section are in recordmcount.pl. Since the code it has is intended to be example code it should refer to real kernel sections. So change it to .sched.text instead. [ Impact: consistency in comments ] Signed-off-by: Tim Abbott LKML-Reference: <1241136371-10768-1-git-send-email-tabbott@mit.edu> Acked-by: Sam Ravnborg Signed-off-by: Steven Rostedt --- scripts/recordmcount.pl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index 409596eca124..0fae7da0529c 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -26,7 +26,7 @@ # which will also be the location of that section after final link. # e.g. # -# .section ".text.sched" +# .section ".sched.text", "ax" # .globl my_func # my_func: # [...] @@ -39,7 +39,7 @@ # [...] # # Both relocation offsets for the mcounts in the above example will be -# offset from .text.sched. If we make another file called tmp.s with: +# offset from .sched.text. If we make another file called tmp.s with: # # .section __mcount_loc # .quad my_func + 0x5 @@ -51,7 +51,7 @@ # But this gets hard if my_func is not globl (a static function). # In such a case we have: # -# .section ".text.sched" +# .section ".sched.text", "ax" # my_func: # [...] 
# call mcount (offset: 0x5) -- cgit v1.2.2 From 94487d6d53af5acae10cf9fd52f74498994d46b1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 5 May 2009 19:22:53 -0400 Subject: tracing: use proper export symbol for tracing api When adding the EXPORT_SYMBOL to some of the tracing API, I accidently used EXPORT_SYMBOL instead of EXPORT_SYMBOL_GPL. This patch fixes that mistake. [ Impact: export the tracing code only for GPL modules ] Reported-by: Christoph Hellwig Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 74df029056b0..4164a344e72a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -887,21 +887,21 @@ trace_current_buffer_lock_reserve(int type, unsigned long len, return trace_buffer_lock_reserve(&global_trace, type, len, flags, pc); } -EXPORT_SYMBOL(trace_current_buffer_lock_reserve); +EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve); void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, unsigned long flags, int pc) { __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1); } -EXPORT_SYMBOL(trace_current_buffer_unlock_commit); +EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit); void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, unsigned long flags, int pc) { __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0); } -EXPORT_SYMBOL(trace_nowake_buffer_unlock_commit); +EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); void trace_current_buffer_discard_commit(struct ring_buffer_event *event) { -- cgit v1.2.2 From aa20ae8444fc6c318272c643f856d8d8ad3e198d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 5 May 2009 21:16:11 -0400 Subject: ring-buffer: move big if statement down In the hot path of the ring buffer "__rb_reserve_next" there's a big if statement that does not even return back to the work flow. code; if (cross to next page) { [ lots of code ] return; } more code; The condition is even the unlikely path, although we do not denote it with an unlikely because gcc is fine with it. The condition is true when the write crosses a page boundary, and we need to start at a new page. Having this if statement makes it hard to read, but calling another function to do the work is also not appropriate, because we are using a lot of variables that were set before the if statement, and we do not want to send them as parameters. This patch changes it to a goto: code; if (cross to next page) goto next_page; more code; return; next_page: [ lots of code] This makes the code easier to understand, and a bit more obvious. The output from gcc is practically identical. For some reason, gcc decided to use different registers when I switched it to a goto. But other than that, the logic is the same. 
[ Impact: easier to read code ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 218 +++++++++++++++++++++++---------------------- 1 file changed, 111 insertions(+), 107 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7876df00695f..424129eb20a4 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1159,6 +1159,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, unsigned type, unsigned long length, u64 *ts) { struct buffer_page *tail_page, *head_page, *reader_page, *commit_page; + struct buffer_page *next_page; unsigned long tail, write; struct ring_buffer *buffer = cpu_buffer->buffer; struct ring_buffer_event *event; @@ -1173,137 +1174,140 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, tail = write - length; /* See if we shot pass the end of this buffer page */ - if (write > BUF_PAGE_SIZE) { - struct buffer_page *next_page = tail_page; + if (write > BUF_PAGE_SIZE) + goto next_page; - local_irq_save(flags); - /* - * Since the write to the buffer is still not - * fully lockless, we must be careful with NMIs. - * The locks in the writers are taken when a write - * crosses to a new page. The locks protect against - * races with the readers (this will soon be fixed - * with a lockless solution). - * - * Because we can not protect against NMIs, and we - * want to keep traces reentrant, we need to manage - * what happens when we are in an NMI. - * - * NMIs can happen after we take the lock. - * If we are in an NMI, only take the lock - * if it is not already taken. Otherwise - * simply fail. - */ - if (unlikely(in_nmi())) { - if (!__raw_spin_trylock(&cpu_buffer->lock)) { - cpu_buffer->nmi_dropped++; - goto out_reset; - } - } else - __raw_spin_lock(&cpu_buffer->lock); - - lock_taken = true; + /* We reserved something on the buffer */ - rb_inc_page(cpu_buffer, &next_page); + if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE)) + return NULL; - head_page = cpu_buffer->head_page; - reader_page = cpu_buffer->reader_page; + event = __rb_page_index(tail_page, tail); + rb_update_event(event, type, length); - /* we grabbed the lock before incrementing */ - if (RB_WARN_ON(cpu_buffer, next_page == reader_page)) - goto out_reset; + /* The passed in type is zero for DATA */ + if (likely(!type)) + local_inc(&tail_page->entries); - /* - * If for some reason, we had an interrupt storm that made - * it all the way around the buffer, bail, and warn - * about it. - */ - if (unlikely(next_page == commit_page)) { - cpu_buffer->commit_overrun++; - goto out_reset; - } + /* + * If this is a commit and the tail is zero, then update + * this page's time stamp. + */ + if (!tail && rb_is_commit(cpu_buffer, event)) + cpu_buffer->commit_page->page->time_stamp = *ts; - if (next_page == head_page) { - if (!(buffer->flags & RB_FL_OVERWRITE)) - goto out_reset; + return event; - /* tail_page has not moved yet? */ - if (tail_page == cpu_buffer->tail_page) { - /* count overflows */ - cpu_buffer->overrun += - local_read(&head_page->entries); + next_page: - rb_inc_page(cpu_buffer, &head_page); - cpu_buffer->head_page = head_page; - cpu_buffer->head_page->read = 0; - } - } + next_page = tail_page; - /* - * If the tail page is still the same as what we think - * it is, then it is up to us to update the tail - * pointer. 
- */ - if (tail_page == cpu_buffer->tail_page) { - local_set(&next_page->write, 0); - local_set(&next_page->entries, 0); - local_set(&next_page->page->commit, 0); - cpu_buffer->tail_page = next_page; - - /* reread the time stamp */ - *ts = ring_buffer_time_stamp(buffer, cpu_buffer->cpu); - cpu_buffer->tail_page->page->time_stamp = *ts; + local_irq_save(flags); + /* + * Since the write to the buffer is still not + * fully lockless, we must be careful with NMIs. + * The locks in the writers are taken when a write + * crosses to a new page. The locks protect against + * races with the readers (this will soon be fixed + * with a lockless solution). + * + * Because we can not protect against NMIs, and we + * want to keep traces reentrant, we need to manage + * what happens when we are in an NMI. + * + * NMIs can happen after we take the lock. + * If we are in an NMI, only take the lock + * if it is not already taken. Otherwise + * simply fail. + */ + if (unlikely(in_nmi())) { + if (!__raw_spin_trylock(&cpu_buffer->lock)) { + cpu_buffer->nmi_dropped++; + goto out_reset; } + } else + __raw_spin_lock(&cpu_buffer->lock); - /* - * The actual tail page has moved forward. - */ - if (tail < BUF_PAGE_SIZE) { - /* Mark the rest of the page with padding */ - event = __rb_page_index(tail_page, tail); - rb_event_set_padding(event); - } + lock_taken = true; - if (tail <= BUF_PAGE_SIZE) - /* Set the write back to the previous setting */ - local_set(&tail_page->write, tail); + rb_inc_page(cpu_buffer, &next_page); - /* - * If this was a commit entry that failed, - * increment that too - */ - if (tail_page == cpu_buffer->commit_page && - tail == rb_commit_index(cpu_buffer)) { - rb_set_commit_to_write(cpu_buffer); - } + head_page = cpu_buffer->head_page; + reader_page = cpu_buffer->reader_page; - __raw_spin_unlock(&cpu_buffer->lock); - local_irq_restore(flags); + /* we grabbed the lock before incrementing */ + if (RB_WARN_ON(cpu_buffer, next_page == reader_page)) + goto out_reset; - /* fail and let the caller try again */ - return ERR_PTR(-EAGAIN); + /* + * If for some reason, we had an interrupt storm that made + * it all the way around the buffer, bail, and warn + * about it. + */ + if (unlikely(next_page == commit_page)) { + cpu_buffer->commit_overrun++; + goto out_reset; } - /* We reserved something on the buffer */ + if (next_page == head_page) { + if (!(buffer->flags & RB_FL_OVERWRITE)) + goto out_reset; - if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE)) - return NULL; + /* tail_page has not moved yet? */ + if (tail_page == cpu_buffer->tail_page) { + /* count overflows */ + cpu_buffer->overrun += + local_read(&head_page->entries); - event = __rb_page_index(tail_page, tail); - rb_update_event(event, type, length); + rb_inc_page(cpu_buffer, &head_page); + cpu_buffer->head_page = head_page; + cpu_buffer->head_page->read = 0; + } + } - /* The passed in type is zero for DATA */ - if (likely(!type)) - local_inc(&tail_page->entries); + /* + * If the tail page is still the same as what we think + * it is, then it is up to us to update the tail + * pointer. + */ + if (tail_page == cpu_buffer->tail_page) { + local_set(&next_page->write, 0); + local_set(&next_page->entries, 0); + local_set(&next_page->page->commit, 0); + cpu_buffer->tail_page = next_page; + + /* reread the time stamp */ + *ts = ring_buffer_time_stamp(buffer, cpu_buffer->cpu); + cpu_buffer->tail_page->page->time_stamp = *ts; + } /* - * If this is a commit and the tail is zero, then update - * this page's time stamp. 
+ * The actual tail page has moved forward. */ - if (!tail && rb_is_commit(cpu_buffer, event)) - cpu_buffer->commit_page->page->time_stamp = *ts; + if (tail < BUF_PAGE_SIZE) { + /* Mark the rest of the page with padding */ + event = __rb_page_index(tail_page, tail); + rb_event_set_padding(event); + } - return event; + if (tail <= BUF_PAGE_SIZE) + /* Set the write back to the previous setting */ + local_set(&tail_page->write, tail); + + /* + * If this was a commit entry that failed, + * increment that too + */ + if (tail_page == cpu_buffer->commit_page && + tail == rb_commit_index(cpu_buffer)) { + rb_set_commit_to_write(cpu_buffer); + } + + __raw_spin_unlock(&cpu_buffer->lock); + local_irq_restore(flags); + + /* fail and let the caller try again */ + return ERR_PTR(-EAGAIN); out_reset: /* reset write */ -- cgit v1.2.2 From 5092dbc96f3acdac5433b27c06860352dc6d23b9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 5 May 2009 22:47:18 -0400 Subject: ring-buffer: add benchmark and tester This patch adds code that can benchmark the ring buffer as well as test it. This code can be compiled into the kernel (not recommended) or as a module. A separate ring buffer is used to not interfer with other users, like ftrace. It creates a producer and a consumer (option to disable creation of the consumer) and will run for 10 seconds, then sleep for 10 seconds and then repeat. While running, the producer will write 10 byte loads into the ring buffer with just putting in the current CPU number. The reader will continually try to read the buffer. The reader will alternate from reading the buffer via event by event, or by full pages. The output is a pr_info, thus it will fill up the syslogs. Starting ring buffer hammer End ring buffer hammer Time: 9000349 (usecs) Overruns: 12578640 Read: 5358440 (by events) Entries: 0 Total: 17937080 Missed: 0 Hit: 17937080 Entries per millisec: 1993 501 ns per entry Sleeping for 10 secs Starting ring buffer hammer End ring buffer hammer Time: 9936350 (usecs) Overruns: 0 Read: 28146644 (by pages) Entries: 74 Total: 28146718 Missed: 0 Hit: 28146718 Entries per millisec: 2832 353 ns per entry Sleeping for 10 secs Time: is the time the test ran Overruns: the number of events that were overwritten and not read Read: the number of events read (either by pages or events) Entries: the number of entries left in the buffer (the by pages will only read full pages) Total: Entries + Read + Overruns Missed: the number of entries that failed to write Hit: the number of entries that were written The above example shows that it takes ~353 nanosecs per entry when there is a reader, reading by pages (and no overruns) The event by event reader slowed the producer down to 501 nanosecs. [ Impact: see how changes to the ring buffer affect stability and performance ] Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 16 ++ kernel/trace/Makefile | 1 + kernel/trace/ring_buffer_benchmark.c | 379 +++++++++++++++++++++++++++++++++++ 3 files changed, 396 insertions(+) create mode 100644 kernel/trace/ring_buffer_benchmark.c diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 450d3c2cfbd2..50f62a296e1d 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -471,6 +471,22 @@ config MMIOTRACE_TEST Say N, unless you absolutely know what you are doing. +config RING_BUFFER_BENCHMARK + tristate "Ring buffer benchmark stress tester" + depends on RING_BUFFER + help + This option creates a test to stress the ring buffer and bench mark it. 
+ It creates its own ring buffer such that it will not interfer with + any other users of the ring buffer (such as ftrace). It then creates + a producer and consumer that will run for 10 seconds and sleep for + 10 seconds. Each interval it will print out the number of events + it recorded and give a rough estimate of how long each iteration took. + + It does not disable interrupts or raise its priority, so it may be + affected by processes that are running. + + If unsure, say N + endif # FTRACE endif # TRACING_SUPPORT diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index fb9d7f964898..7c34cbfff96e 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -17,6 +17,7 @@ endif obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o obj-$(CONFIG_RING_BUFFER) += ring_buffer.o +obj-$(CONFIG_RING_BUFFER_BENCHMARK) += ring_buffer_benchmark.o obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_TRACING) += trace_clock.o diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c new file mode 100644 index 000000000000..747244acb8fd --- /dev/null +++ b/kernel/trace/ring_buffer_benchmark.c @@ -0,0 +1,379 @@ +/* + * ring buffer tester and benchmark + * + * Copyright (C) 2009 Steven Rostedt + */ +#include +#include +#include +#include +#include + +struct rb_page { + u64 ts; + local_t commit; + char data[4080]; +}; + +/* run time and sleep time in seconds */ +#define RUN_TIME 10 +#define SLEEP_TIME 10 + +/* number of events for writer to wake up the reader */ +static int wakeup_interval = 100; + +static int reader_finish; +static struct completion read_start; +static struct completion read_done; + +static struct ring_buffer *buffer; +static struct task_struct *producer; +static struct task_struct *consumer; +static unsigned long read; + +static int disable_reader; +module_param(disable_reader, uint, 0644); +MODULE_PARM_DESC(disable_reader, "only run producer"); + +static int read_events; + +static int kill_test; + +#define KILL_TEST() \ + do { \ + if (!kill_test) { \ + kill_test = 1; \ + WARN_ON(1); \ + } \ + } while (0) + +enum event_status { + EVENT_FOUND, + EVENT_DROPPED, +}; + +static enum event_status read_event(int cpu) +{ + struct ring_buffer_event *event; + int *entry; + u64 ts; + + event = ring_buffer_consume(buffer, cpu, &ts); + if (!event) + return EVENT_DROPPED; + + entry = ring_buffer_event_data(event); + if (*entry != cpu) { + KILL_TEST(); + return EVENT_DROPPED; + } + + read++; + return EVENT_FOUND; +} + +static enum event_status read_page(int cpu) +{ + struct ring_buffer_event *event; + struct rb_page *rpage; + unsigned long commit; + void *bpage; + int *entry; + int ret; + int inc; + int i; + + bpage = ring_buffer_alloc_read_page(buffer); + ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1); + if (ret >= 0) { + rpage = bpage; + commit = local_read(&rpage->commit); + for (i = 0; i < commit && !kill_test; i += inc) { + + if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) { + KILL_TEST(); + break; + } + + inc = -1; + event = (void *)&rpage->data[i]; + switch (event->type_len) { + case RINGBUF_TYPE_PADDING: + /* We don't expect any padding */ + KILL_TEST(); + break; + case RINGBUF_TYPE_TIME_EXTEND: + inc = 8; + break; + case 0: + entry = ring_buffer_event_data(event); + if (*entry != cpu) { + KILL_TEST(); + break; + } + read++; + if (!event->array[0]) { + KILL_TEST(); + break; + } + inc = event->array[0]; + break; + default: + entry = ring_buffer_event_data(event); + if (*entry != cpu) { + KILL_TEST(); + break; + } + read++; + inc = 
((event->type_len + 1) * 4); + } + if (kill_test) + break; + + if (inc <= 0) { + KILL_TEST(); + break; + } + } + } + ring_buffer_free_read_page(buffer, bpage); + + if (ret < 0) + return EVENT_DROPPED; + return EVENT_FOUND; +} + +static void ring_buffer_consumer(void) +{ + /* toggle between reading pages and events */ + read_events ^= 1; + + read = 0; + while (!reader_finish && !kill_test) { + int found; + + do { + int cpu; + + found = 0; + for_each_online_cpu(cpu) { + enum event_status stat; + + if (read_events) + stat = read_event(cpu); + else + stat = read_page(cpu); + + if (kill_test) + break; + if (stat == EVENT_FOUND) + found = 1; + } + } while (found && !kill_test); + + set_current_state(TASK_INTERRUPTIBLE); + if (reader_finish) + break; + + schedule(); + __set_current_state(TASK_RUNNING); + } + reader_finish = 0; + complete(&read_done); +} + +static void ring_buffer_producer(void) +{ + struct timeval start_tv; + struct timeval end_tv; + unsigned long long time; + unsigned long long entries; + unsigned long long overruns; + unsigned long missed = 0; + unsigned long hit = 0; + unsigned long avg; + int cnt = 0; + + /* + * Hammer the buffer for 10 secs (this may + * make the system stall) + */ + pr_info("Starting ring buffer hammer\n"); + do_gettimeofday(&start_tv); + do { + struct ring_buffer_event *event; + int *entry; + + event = ring_buffer_lock_reserve(buffer, 10); + if (!event) { + missed++; + } else { + hit++; + entry = ring_buffer_event_data(event); + *entry = smp_processor_id(); + ring_buffer_unlock_commit(buffer, event); + } + do_gettimeofday(&end_tv); + + if (consumer && !(++cnt % wakeup_interval)) + wake_up_process(consumer); + + } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test); + pr_info("End ring buffer hammer\n"); + + if (consumer) { + /* Init both completions here to avoid races */ + init_completion(&read_start); + init_completion(&read_done); + /* the completions must be visible before the finish var */ + smp_wmb(); + reader_finish = 1; + /* finish var visible before waking up the consumer */ + smp_wmb(); + wake_up_process(consumer); + wait_for_completion(&read_done); + } + + time = end_tv.tv_sec - start_tv.tv_sec; + time *= 1000000; + time += (long long)((long)end_tv.tv_usec - (long)start_tv.tv_usec); + + entries = ring_buffer_entries(buffer); + overruns = ring_buffer_overruns(buffer); + + if (kill_test) + pr_info("ERROR!\n"); + pr_info("Time: %lld (usecs)\n", time); + pr_info("Overruns: %lld\n", overruns); + if (disable_reader) + pr_info("Read: (reader disabled)\n"); + else + pr_info("Read: %ld (by %s)\n", read, + read_events ? 
"events" : "pages"); + pr_info("Entries: %lld\n", entries); + pr_info("Total: %lld\n", entries + overruns + read); + pr_info("Missed: %ld\n", missed); + pr_info("Hit: %ld\n", hit); + + do_div(time, 1000); + if (time) + hit /= (long)time; + else + pr_info("TIME IS ZERO??\n"); + + pr_info("Entries per millisec: %ld\n", hit); + + if (hit) { + avg = 1000000 / hit; + pr_info("%ld ns per entry\n", avg); + } +} + +static void wait_to_die(void) +{ + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) { + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); +} + +static int ring_buffer_consumer_thread(void *arg) +{ + while (!kthread_should_stop() && !kill_test) { + complete(&read_start); + + ring_buffer_consumer(); + + set_current_state(TASK_INTERRUPTIBLE); + if (kthread_should_stop() || kill_test) + break; + + schedule(); + __set_current_state(TASK_RUNNING); + } + __set_current_state(TASK_RUNNING); + + if (kill_test) + wait_to_die(); + + return 0; +} + +static int ring_buffer_producer_thread(void *arg) +{ + init_completion(&read_start); + + while (!kthread_should_stop() && !kill_test) { + ring_buffer_reset(buffer); + + if (consumer) { + smp_wmb(); + wake_up_process(consumer); + wait_for_completion(&read_start); + } + + ring_buffer_producer(); + + pr_info("Sleeping for 10 secs\n"); + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ * SLEEP_TIME); + __set_current_state(TASK_RUNNING); + } + + if (kill_test) + wait_to_die(); + + return 0; +} + +static int __init ring_buffer_benchmark_init(void) +{ + int ret; + + /* make a one meg buffer in overwite mode */ + buffer = ring_buffer_alloc(1000000, RB_FL_OVERWRITE); + if (!buffer) + return -ENOMEM; + + if (!disable_reader) { + consumer = kthread_create(ring_buffer_consumer_thread, + NULL, "rb_consumer"); + ret = PTR_ERR(consumer); + if (IS_ERR(consumer)) + goto out_fail; + } + + producer = kthread_run(ring_buffer_producer_thread, + NULL, "rb_producer"); + ret = PTR_ERR(producer); + + if (IS_ERR(producer)) + goto out_kill; + + return 0; + + out_kill: + if (consumer) + kthread_stop(consumer); + + out_fail: + ring_buffer_free(buffer); + return ret; +} + +static void __exit ring_buffer_benchmark_exit(void) +{ + kthread_stop(producer); + if (consumer) + kthread_stop(consumer); + ring_buffer_free(buffer); +} + +module_init(ring_buffer_benchmark_init); +module_exit(ring_buffer_benchmark_exit); + +MODULE_AUTHOR("Steven Rostedt"); +MODULE_DESCRIPTION("ring_buffer_benchmark"); +MODULE_LICENSE("GPL"); -- cgit v1.2.2 From fd6da10a617f483348ee32bcfe53fd20c302eca1 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 6 May 2009 10:32:13 +0800 Subject: tracing/events: don't say hi when loading the trace event sample The sample is useful for testing, and I'm using it. But after loading the module, it keeps saying hi every 10 seconds, this may be disturbing. Also Steven said commenting out the "hi" helped in causing races. 
:) [ Impact: make testing a bit easier ] Signed-off-by: Li Zefan Acked-by: Steven Rostedt Acked-by: Frederic Weisbecker LKML-Reference: <4A00F6AD.2070008@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- samples/trace_events/trace-events-sample.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/samples/trace_events/trace-events-sample.c b/samples/trace_events/trace-events-sample.c index f33b3ba744ac..aabc4e970911 100644 --- a/samples/trace_events/trace-events-sample.c +++ b/samples/trace_events/trace-events-sample.c @@ -16,10 +16,6 @@ static void simple_thread_func(int cnt) set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(HZ); trace_foo_bar("hello", cnt); - - if (!(cnt % 10)) - /* It is really important that I say "hi!" */ - printk(KERN_EMERG "hi!\n"); } static int simple_thread(void *arg) -- cgit v1.2.2 From 96d17980fabeb757706d2d6db5a28580a6156bfc Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 6 May 2009 10:32:32 +0800 Subject: tracing/events: make SAMPLE_TRACE_EVENTS default to n Normally a config should be default to n. This patch also makes the sample module-only, like SAMPLE_MARKERS and SAMPLE_TRACEPOINTS. [ Impact: don't build trace event sample by default ] Signed-off-by: Li Zefan Acked-by: Steven Rostedt Acked-by: Frederic Weisbecker LKML-Reference: <4A00F6C0.8090803@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- samples/Kconfig | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/samples/Kconfig b/samples/Kconfig index 93f41c051092..b75d28cba3f7 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -20,9 +20,8 @@ config SAMPLE_TRACEPOINTS This build tracepoints example modules. config SAMPLE_TRACE_EVENTS - tristate "Build trace_events examples" - depends on EVENT_TRACING - default m + tristate "Build trace_events examples -- loadable modules only" + depends on EVENT_TRACING && m help This build trace event example modules. -- cgit v1.2.2 From 2df75e415709ad12862028916c772c1f377f6a7c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 6 May 2009 10:33:04 +0800 Subject: tracing/events: fix memory leak when unloading module When unloading a module, memory allocated by init_preds() and trace_define_field() is not freed. 
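The leak comes from what those two helpers allocate. Roughly (a simplified sketch, error handling omitted), each trace_define_field() call does:

	field = kzalloc(sizeof(*field), GFP_KERNEL);
	field->name = kstrdup(name, GFP_KERNEL);
	field->type = kstrdup(type, GFP_KERNEL);
	list_add(&field->link, &call->fields);

and init_preds() allocates the event's filter and its array of predicates, so unloading a module has to walk call->fields freeing each entry and tear the filter down again, which is what the patch below adds.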
[ Impact: fix memory leak ] Signed-off-by: Li Zefan Acked-by: Frederic Weisbecker Acked-by: Steven Rostedt Cc: Tom Zanussi LKML-Reference: <4A00F6E0.3040503@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 1 + kernel/trace/trace_events.c | 18 ++++++++++++++++++ kernel/trace/trace_events_filter.c | 22 +++++++++++++++------- 3 files changed, 34 insertions(+), 7 deletions(-) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 5fff40c9ff59..662c1becf367 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -116,6 +116,7 @@ struct ftrace_event_call { #define MAX_FILTER_STR_VAL 128 extern int init_preds(struct ftrace_event_call *call); +extern void destroy_preds(struct ftrace_event_call *call); extern int filter_match_preds(struct ftrace_event_call *call, void *rec); extern int filter_current_check_discard(struct ftrace_event_call *call, void *rec, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f789ca540fe1..f251a150e75e 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -60,6 +60,22 @@ err: } EXPORT_SYMBOL_GPL(trace_define_field); +#ifdef CONFIG_MODULES + +static void trace_destroy_fields(struct ftrace_event_call *call) +{ + struct ftrace_event_field *field, *next; + + list_for_each_entry_safe(field, next, &call->fields, link) { + list_del(&field->link); + kfree(field->type); + kfree(field->name); + kfree(field); + } +} + +#endif /* CONFIG_MODULES */ + static void ftrace_clear_events(void) { struct ftrace_event_call *call; @@ -925,6 +941,8 @@ static void trace_module_remove_events(struct module *mod) unregister_ftrace_event(call->event); debugfs_remove_recursive(call->dir); list_del(&call->list); + trace_destroy_fields(call); + destroy_preds(call); } } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index f49486687ee2..ce07b8186710 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -346,6 +346,20 @@ static void filter_disable_preds(struct ftrace_event_call *call) filter->preds[i]->fn = filter_pred_none; } +void destroy_preds(struct ftrace_event_call *call) +{ + struct event_filter *filter = call->filter; + int i; + + for (i = 0; i < MAX_FILTER_PRED; i++) { + if (filter->preds[i]) + filter_free_pred(filter->preds[i]); + } + kfree(filter->preds); + kfree(filter); + call->filter = NULL; +} + int init_preds(struct ftrace_event_call *call) { struct event_filter *filter; @@ -374,13 +388,7 @@ int init_preds(struct ftrace_event_call *call) return 0; oom: - for (i = 0; i < MAX_FILTER_PRED; i++) { - if (filter->preds[i]) - filter_free_pred(filter->preds[i]); - } - kfree(filter->preds); - kfree(call->filter); - call->filter = NULL; + destroy_preds(call); return -ENOMEM; } -- cgit v1.2.2 From 20c8928abe70e204bd077ab6cfe23002d7788983 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 6 May 2009 10:33:45 +0800 Subject: tracing/events: fix concurrent access to ftrace_events list A module will add/remove its trace events when it gets loaded/unloaded, so the ftrace_events list is not "const", and concurrent access needs to be protected. This patch thus fixes races between loading/unloding modules and read 'available_events' or read/write 'set_event', etc. 
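The fix applied at each of those spots is roughly the same pattern (a sketch of the shape, not the exact hunks):

	mutex_lock(&event_mutex);
	list_for_each_entry(call, &ftrace_events, list) {
		/* read or update the event */
	}
	mutex_unlock(&event_mutex);

with the seq_file t_start()/s_start() and t_stop() callbacks taking and dropping event_mutex around a whole read.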
Below shows how to reproduce the race: # for ((; ;)) { cat /mnt/tracing/available_events; } > /dev/null & # for ((; ;)) { insmod trace-events-sample.ko; rmmod sample; } & After a while: BUG: unable to handle kernel paging request at 0010011c IP: [] t_next+0x1b/0x2d ... Call Trace: [] ? seq_read+0x217/0x30d [] ? seq_read+0x0/0x30d [] ? vfs_read+0x8f/0x136 [] ? sys_read+0x40/0x65 [] ? sysenter_do_call+0x12/0x36 [ Impact: fix races when concurrent accessing ftrace_events list ] Signed-off-by: Li Zefan Acked-by: Steven Rostedt Acked-by: Frederic Weisbecker Cc: Tom Zanussi Cc: Peter Zijlstra LKML-Reference: <4A00F709.3080800@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 1 + kernel/trace/trace_event_profile.c | 19 ++++++++++++++----- kernel/trace/trace_events.c | 20 +++++++++++--------- kernel/trace/trace_events_filter.c | 10 +++++++--- 4 files changed, 33 insertions(+), 17 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 7736fe8c1b76..777c6c3a0cde 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -825,6 +825,7 @@ static int filter_pred_##size(struct filter_pred *pred, void *event, \ return match; \ } +extern struct mutex event_mutex; extern struct list_head ftrace_events; extern const char *__start___trace_bprintk_fmt[]; diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index 7bf2ad65eee5..5b5895afecfe 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -10,21 +10,30 @@ int ftrace_profile_enable(int event_id) { struct ftrace_event_call *event; + int ret = -EINVAL; + mutex_lock(&event_mutex); list_for_each_entry(event, &ftrace_events, list) { - if (event->id == event_id) - return event->profile_enable(event); + if (event->id == event_id) { + ret = event->profile_enable(event); + break; + } } + mutex_unlock(&event_mutex); - return -EINVAL; + return ret; } void ftrace_profile_disable(int event_id) { struct ftrace_event_call *event; + mutex_lock(&event_mutex); list_for_each_entry(event, &ftrace_events, list) { - if (event->id == event_id) - return event->profile_disable(event); + if (event->id == event_id) { + event->profile_disable(event); + break; + } } + mutex_unlock(&event_mutex); } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f251a150e75e..8d579ff23610 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -21,7 +21,7 @@ #define TRACE_SYSTEM "TRACE_SYSTEM" -static DEFINE_MUTEX(event_mutex); +DEFINE_MUTEX(event_mutex); LIST_HEAD(ftrace_events); @@ -80,6 +80,7 @@ static void ftrace_clear_events(void) { struct ftrace_event_call *call; + mutex_lock(&event_mutex); list_for_each_entry(call, &ftrace_events, list) { if (call->enabled) { @@ -87,6 +88,7 @@ static void ftrace_clear_events(void) call->unregfunc(); } } + mutex_unlock(&event_mutex); } static void ftrace_event_enable_disable(struct ftrace_event_call *call, @@ -274,6 +276,9 @@ t_next(struct seq_file *m, void *v, loff_t *pos) static void *t_start(struct seq_file *m, loff_t *pos) { + mutex_lock(&event_mutex); + if (*pos == 0) + m->private = ftrace_events.next; return t_next(m, NULL, pos); } @@ -303,6 +308,9 @@ s_next(struct seq_file *m, void *v, loff_t *pos) static void *s_start(struct seq_file *m, loff_t *pos) { + mutex_lock(&event_mutex); + if (*pos == 0) + m->private = ftrace_events.next; return s_next(m, NULL, pos); } @@ -319,12 +327,12 @@ static int t_show(struct seq_file *m, void *v) static void t_stop(struct seq_file *m, void *p) { + 
mutex_unlock(&event_mutex); } static int ftrace_event_seq_open(struct inode *inode, struct file *file) { - int ret; const struct seq_operations *seq_ops; if ((file->f_mode & FMODE_WRITE) && @@ -332,13 +340,7 @@ ftrace_event_seq_open(struct inode *inode, struct file *file) ftrace_clear_events(); seq_ops = inode->i_private; - ret = seq_open(file, seq_ops); - if (!ret) { - struct seq_file *m = file->private_data; - - m->private = ftrace_events.next; - } - return ret; + return seq_open(file, seq_ops); } static ssize_t diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index ce07b8186710..7ac691085276 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -408,6 +408,7 @@ static void filter_free_subsystem_preds(struct event_subsystem *system) filter->n_preds = 0; } + mutex_lock(&event_mutex); list_for_each_entry(call, &ftrace_events, list) { if (!call->define_fields) continue; @@ -417,6 +418,7 @@ static void filter_free_subsystem_preds(struct event_subsystem *system) remove_filter_string(call->filter); } } + mutex_unlock(&event_mutex); } static int filter_add_pred_fn(struct filter_parse_state *ps, @@ -567,6 +569,7 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps, { struct event_filter *filter = system->filter; struct ftrace_event_call *call; + int err = 0; if (!filter->preds) { filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), @@ -584,8 +587,8 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps, filter->preds[filter->n_preds] = pred; filter->n_preds++; + mutex_lock(&event_mutex); list_for_each_entry(call, &ftrace_events, list) { - int err; if (!call->define_fields) continue; @@ -597,12 +600,13 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps, if (err) { filter_free_subsystem_preds(system); parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); - return err; + break; } replace_filter_string(call->filter, filter_string); } + mutex_unlock(&event_mutex); - return 0; + return err; } static void parse_init(struct filter_parse_state *ps, -- cgit v1.2.2 From de1d7286060430e79a1d50ad6e5fee8fe863c5f6 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 5 May 2009 16:49:59 +0800 Subject: tracepoint: trace_sched_migrate_task(): remove parameter The orig_cpu parameter in trace_sched_migrate_task() is not necessary, it can be got by using task_cpu(p) in the probe. [ Impact: micro-optimization ] Signed-off-by: Mathieu Desnoyers [ modified from Mathieu's patch. 
The original patch is at: http://marc.info/?l=linux-kernel&m=123791201716239&w=2 ] Signed-off-by: Xiao Guangrong Cc: fweisbec@gmail.com Cc: rostedt@goodmis.org Cc: Li Zefan Cc: zhaolei@cn.fujitsu.com Cc: laijs@cn.fujitsu.com LKML-Reference: <49FFFDB7.1050402@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- include/trace/events/sched.h | 6 +++--- kernel/sched.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index ffa1cab586b9..dd4033cf5b09 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -180,9 +180,9 @@ TRACE_EVENT(sched_switch, */ TRACE_EVENT(sched_migrate_task, - TP_PROTO(struct task_struct *p, int orig_cpu, int dest_cpu), + TP_PROTO(struct task_struct *p, int dest_cpu), - TP_ARGS(p, orig_cpu, dest_cpu), + TP_ARGS(p, dest_cpu), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) @@ -196,7 +196,7 @@ TRACE_EVENT(sched_migrate_task, memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->prio = p->prio; - __entry->orig_cpu = orig_cpu; + __entry->orig_cpu = task_cpu(p); __entry->dest_cpu = dest_cpu; ), diff --git a/kernel/sched.c b/kernel/sched.c index 9f7ffd00b6ea..9cdedbd181ce 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1954,7 +1954,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) clock_offset = old_rq->clock - new_rq->clock; - trace_sched_migrate_task(p, task_cpu(p), new_cpu); + trace_sched_migrate_task(p, new_cpu); #ifdef CONFIG_SCHEDSTATS if (p->se.wait_start) -- cgit v1.2.2 From a42aaa3bbce85ac487ad4fad5db99e8e91b7aac1 Mon Sep 17 00:00:00 2001 From: "Alan D. Brunelle" Date: Mon, 4 May 2009 16:27:26 -0400 Subject: blktrace: correct remap names This attempts to clarify names utilized during block I/O remap operations (partition, volume manager). It correctly matches up the /from/ information for both device & sector. This takes in the concept from Kosaki Motohiro and extends it to include better naming for the "device_from" field. [ Impact: cleanup ] Signed-off-by: Alan D. 
Brunelle Reviewed-by: Li Zefan Reviewed-by: KOSAKI Motohiro Cc: Jens Axboe Cc: Arnaldo Carvalho de Melo LKML-Reference: <49FF4FAE.3000301@hp.com> Signed-off-by: Ingo Molnar --- include/linux/blktrace_api.h | 4 ++-- include/trace/block.h | 4 ++-- kernel/trace/blktrace.c | 24 ++++++++++++------------ 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 62763c952854..82b4636030e9 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -116,9 +116,9 @@ struct blk_io_trace { * The remap event */ struct blk_io_trace_remap { - __be32 device; __be32 device_from; - __be64 sector; + __be32 device_to; + __be64 sector_from; }; enum { diff --git a/include/trace/block.h b/include/trace/block.h index 25b7068b819e..87f6456fd32e 100644 --- a/include/trace/block.h +++ b/include/trace/block.h @@ -70,7 +70,7 @@ DECLARE_TRACE(block_split, DECLARE_TRACE(block_remap, TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev, - sector_t from, sector_t to), - TP_ARGS(q, bio, dev, from, to)); + sector_t to, sector_t from), + TP_ARGS(q, bio, dev, to, from)); #endif diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index c32062bd10b3..f8d46d6f5d34 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -830,8 +830,8 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio, * @q: queue the io is for * @bio: the source bio * @dev: target device - * @from: source sector * @to: target sector + * @from: source sector * * Description: * Device mapper or raid target sometimes need to split a bio because @@ -839,7 +839,7 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio, * **/ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, - dev_t dev, sector_t from, sector_t to) + dev_t dev, sector_t to, sector_t from) { struct blk_trace *bt = q->blk_trace; struct blk_io_trace_remap r; @@ -847,9 +847,9 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, if (likely(!bt)) return; - r.device = cpu_to_be32(dev); - r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev); - r.sector = cpu_to_be64(to); + r.device_from = cpu_to_be32(dev); + r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev); + r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r); @@ -1028,11 +1028,11 @@ static void get_pdu_remap(const struct trace_entry *ent, struct blk_io_trace_remap *r) { const struct blk_io_trace_remap *__r = pdu_start(ent); - __u64 sector = __r->sector; + __u64 sector_from = __r->sector_from; - r->device = be32_to_cpu(__r->device); r->device_from = be32_to_cpu(__r->device_from); - r->sector = be64_to_cpu(sector); + r->device_to = be32_to_cpu(__r->device_to); + r->sector_from = be64_to_cpu(sector_from); } typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act); @@ -1148,13 +1148,13 @@ static int blk_log_with_error(struct trace_seq *s, static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) { - struct blk_io_trace_remap r = { .device = 0, }; + struct blk_io_trace_remap r = { .device_from = 0, }; get_pdu_remap(ent, &r); return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", - t_sector(ent), - t_sec(ent), MAJOR(r.device), MINOR(r.device), - (unsigned long long)r.sector); + t_sector(ent), t_sec(ent), + MAJOR(r.device_from), MINOR(r.device_from), + (unsigned long long)r.sector_from); } static int 
blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) -- cgit v1.2.2 From 22a7c31a9659deaddafbbcec6562d44141e84474 Mon Sep 17 00:00:00 2001 From: "Alan D. Brunelle" Date: Mon, 4 May 2009 16:35:08 -0400 Subject: blktrace: from-sector redundant in trace_block_remap Remove redundant from-sector parameter: it's /always/ the bio's sector passed in. [ Impact: cleanup ] Signed-off-by: Alan D. Brunelle Reviewed-by: Li Zefan Reviewed-by: KOSAKI Motohiro Cc: Jens Axboe Cc: Arnaldo Carvalho de Melo LKML-Reference: <49FF517C.7000503@hp.com> Signed-off-by: Ingo Molnar --- block/blk-core.c | 5 ++--- drivers/md/dm.c | 3 +-- include/trace/block.h | 4 ++-- kernel/trace/blktrace.c | 8 ++++---- 4 files changed, 9 insertions(+), 11 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 07ab75403e1a..a5f747a8312e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1275,7 +1275,7 @@ static inline void blk_partition_remap(struct bio *bio) bio->bi_bdev = bdev->bd_contains; trace_block_remap(bdev_get_queue(bio->bi_bdev), bio, - bdev->bd_dev, bio->bi_sector, + bdev->bd_dev, bio->bi_sector - p->start_sect); } } @@ -1444,8 +1444,7 @@ static inline void __generic_make_request(struct bio *bio) goto end_io; if (old_sector != -1) - trace_block_remap(q, bio, old_dev, bio->bi_sector, - old_sector); + trace_block_remap(q, bio, old_dev, old_sector); trace_block_bio_queue(q, bio); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 8a994be035ba..b01514afb6b5 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -657,8 +657,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone, /* the bio has been remapped so dispatch it */ trace_block_remap(bdev_get_queue(clone->bi_bdev), clone, - tio->io->bio->bi_bdev->bd_dev, - clone->bi_sector, sector); + tio->io->bio->bi_bdev->bd_dev, sector); generic_make_request(clone); } else if (r < 0 || r == DM_MAPIO_REQUEUE) { diff --git a/include/trace/block.h b/include/trace/block.h index 87f6456fd32e..8ac945b7746e 100644 --- a/include/trace/block.h +++ b/include/trace/block.h @@ -70,7 +70,7 @@ DECLARE_TRACE(block_split, DECLARE_TRACE(block_remap, TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev, - sector_t to, sector_t from), - TP_ARGS(q, bio, dev, to, from)); + sector_t to), + TP_ARGS(q, bio, dev, to)); #endif diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index f8d46d6f5d34..e099f8cc1d1c 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -830,7 +830,6 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio, * @q: queue the io is for * @bio: the source bio * @dev: target device - * @to: target sector * @from: source sector * * Description: @@ -839,7 +838,7 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio, * **/ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, - dev_t dev, sector_t to, sector_t from) + dev_t dev, sector_t from) { struct blk_trace *bt = q->blk_trace; struct blk_io_trace_remap r; @@ -851,8 +850,9 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev); r.sector_from = cpu_to_be64(from); - __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, - !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r); + __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, + BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), + sizeof(r), &r); } /** -- cgit v1.2.2 From 48dd0fed90e2b1f1ba87401439b85942181c6df3 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput 
Date: Wed, 6 May 2009 15:45:45 +0530 Subject: tracing: trace_output.c, fix false positive compiler warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This compiler warning: CC kernel/trace/trace_output.o kernel/trace/trace_output.c: In function ‘register_ftrace_event’: kernel/trace/trace_output.c:544: warning: ‘list’ may be used uninitialized in this function Is wrong as 'list' is always initialized - but GCC (4.3.2) does not recognize this relationship properly. Work around the warning by initializing the variable to NULL. [ Impact: fix false positive compiler warning ] Signed-off-by: Jaswinder Singh Rajput Acked-by: Steven Rostedt LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/trace/trace_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 5fc51f0f75fc..8bd9a2c1a46a 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -541,7 +541,7 @@ int register_ftrace_event(struct trace_event *event) INIT_LIST_HEAD(&event->list); if (!event->type) { - struct list_head *list; + struct list_head *list = NULL; if (next_event_type > FTRACE_MAX_EVENT) { -- cgit v1.2.2 From 35cf723e99c0e26ddf51f037dffaa4ff2c2c9106 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 6 May 2009 12:33:38 +0200 Subject: tracing: small trave_events sample Makefile cleanup Use -I$(src) to add the current directory the include path. [ Impact: cleanup ] Signed-off-by: Christoph Hellwig Acked-by: Steven Rostedt LKML-Reference: Signed-off-by: Ingo Molnar --- samples/trace_events/Makefile | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/samples/trace_events/Makefile b/samples/trace_events/Makefile index 06c6dea1eb84..0d428dc67283 100644 --- a/samples/trace_events/Makefile +++ b/samples/trace_events/Makefile @@ -1,8 +1,6 @@ # builds the trace events example kernel modules; # then to use one (as root): insmod -PWD := $(shell pwd) - -CFLAGS_trace-events-sample.o := -I$(PWD)/samples/trace_events/ +CFLAGS_trace-events-sample.o := -I$(src) obj-$(CONFIG_SAMPLE_TRACE_EVENTS) += trace-events-sample.o -- cgit v1.2.2 From 8e7abf1c62941ebb7a1416cbc62392c8a0902625 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 6 May 2009 10:26:45 -0400 Subject: ring-buffer: remove unneeded conditional in rb_reserve_next The code in __rb_reserve_next checks on page overflow if it is the original commiter and then resets the page back to the original setting. Although this is fine, and the code is correct, it is a bit fragil. Some experimental work I did breaks it easily. The better and more robust solution is to have all commiters that overflow the page, simply subtract what they added. 
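Condensed, the change amounts to the following (a sketch of the before/after, see the diff below):

	/* before: only the writer whose reservation started inside the
	 * page (tail <= BUF_PAGE_SIZE) could restore the old value */
	if (tail <= BUF_PAGE_SIZE)
		local_set(&tail_page->write, tail);

	/* after: every writer that overflows simply backs out the length
	 * it added, which nests safely however many interrupted writers
	 * do the same */
	local_sub(length, &tail_page->write);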
[ Impact: more robust ring buffer account management ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 424129eb20a4..03ed52b67db3 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1290,9 +1290,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, rb_event_set_padding(event); } - if (tail <= BUF_PAGE_SIZE) - /* Set the write back to the previous setting */ - local_set(&tail_page->write, tail); + /* Set the write back to the previous setting */ + local_sub(length, &tail_page->write); /* * If this was a commit entry that failed, @@ -1311,8 +1310,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, out_reset: /* reset write */ - if (tail <= BUF_PAGE_SIZE) - local_set(&tail_page->write, tail); + local_sub(length, &tail_page->write); if (likely(lock_taken)) __raw_spin_unlock(&cpu_buffer->lock); -- cgit v1.2.2 From 00c81a58c5b4e0de14ee33bfbc3d71c90f69f9ea Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 6 May 2009 12:40:51 -0400 Subject: ring-buffer: check for failed allocation in ring buffer benchmark The result of the allocation of the ring buffer read page in the ring buffer bench mark does not check the return to see if a page was actually allocated. This patch fixes that. [ Impact: avoid NULL dereference ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer_benchmark.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 747244acb8fd..dcd75e9e49f4 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -84,6 +84,9 @@ static enum event_status read_page(int cpu) int i; bpage = ring_buffer_alloc_read_page(buffer); + if (!bpage) + return EVENT_DROPPED; + ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1); if (ret >= 0) { rpage = bpage; -- cgit v1.2.2 From 6634ff26cce2da04e5c2a5481bcb8888e7d01786 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 6 May 2009 15:30:07 -0400 Subject: ring-buffer: make moving the tail page a separate function Ingo Molnar thought the code would be cleaner if we used a function call instead of a goto for moving the tail page. After implementing this, it seems that gcc still inlines the result and the output is pretty much the same. Since this is considered a cleaner approach, might as well implement it. 
[ Impact: code clean up ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 89 +++++++++++++++++++++++++--------------------- 1 file changed, 49 insertions(+), 40 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 03ed52b67db3..3ae5ccf2c0fc 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1154,51 +1154,18 @@ static unsigned rb_calculate_event_length(unsigned length) return length; } + static struct ring_buffer_event * -__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, - unsigned type, unsigned long length, u64 *ts) +rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, + unsigned long length, unsigned long tail, + struct buffer_page *commit_page, + struct buffer_page *tail_page, u64 *ts) { - struct buffer_page *tail_page, *head_page, *reader_page, *commit_page; - struct buffer_page *next_page; - unsigned long tail, write; + struct buffer_page *next_page, *head_page, *reader_page; struct ring_buffer *buffer = cpu_buffer->buffer; struct ring_buffer_event *event; - unsigned long flags; bool lock_taken = false; - - commit_page = cpu_buffer->commit_page; - /* we just need to protect against interrupts */ - barrier(); - tail_page = cpu_buffer->tail_page; - write = local_add_return(length, &tail_page->write); - tail = write - length; - - /* See if we shot pass the end of this buffer page */ - if (write > BUF_PAGE_SIZE) - goto next_page; - - /* We reserved something on the buffer */ - - if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE)) - return NULL; - - event = __rb_page_index(tail_page, tail); - rb_update_event(event, type, length); - - /* The passed in type is zero for DATA */ - if (likely(!type)) - local_inc(&tail_page->entries); - - /* - * If this is a commit and the tail is zero, then update - * this page's time stamp. - */ - if (!tail && rb_is_commit(cpu_buffer, event)) - cpu_buffer->commit_page->page->time_stamp = *ts; - - return event; - - next_page: + unsigned long flags; next_page = tail_page; @@ -1318,6 +1285,48 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, return NULL; } +static struct ring_buffer_event * +__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, + unsigned type, unsigned long length, u64 *ts) +{ + struct buffer_page *tail_page, *commit_page; + struct ring_buffer_event *event; + unsigned long tail, write; + + commit_page = cpu_buffer->commit_page; + /* we just need to protect against interrupts */ + barrier(); + tail_page = cpu_buffer->tail_page; + write = local_add_return(length, &tail_page->write); + tail = write - length; + + /* See if we shot pass the end of this buffer page */ + if (write > BUF_PAGE_SIZE) + return rb_move_tail(cpu_buffer, length, tail, + commit_page, tail_page, ts); + + /* We reserved something on the buffer */ + + if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE)) + return NULL; + + event = __rb_page_index(tail_page, tail); + rb_update_event(event, type, length); + + /* The passed in type is zero for DATA */ + if (likely(!type)) + local_inc(&tail_page->entries); + + /* + * If this is a commit and the tail is zero, then update + * this page's time stamp. 
+ */ + if (!tail && rb_is_commit(cpu_buffer, event)) + cpu_buffer->commit_page->page->time_stamp = *ts; + + return event; +} + static int rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, u64 *delta) -- cgit v1.2.2 From 3e07a4f680adc66dfa175aa5021aedf340251b12 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 6 May 2009 18:36:59 -0400 Subject: ring-buffer: change test to be more latency friendly The ring buffer benchmark/test runs a producer for 10 seconds. This is done with preemption and interrupts enabled. But if the kernel is not compiled with CONFIG_PREEMPT, it basically stops everything but interrupts for 10 seconds. Although this is just a test and is not for production, this attribute can be quite annoying. It can also spawn badness elsewhere. This patch solves the issues by calling "cond_resched" when the system is not compiled with CONFIG_PREEMPT. It also keeps track of the time spent to call cond_resched such that it does not go against the time calculations. That is, if the task schedules away, the time scheduled out is removed from the test data. Note, this only works for non PREEMPT because we do not know when the task is scheduled out if we have PREEMPT enabled. [ Impact: prevent test from stopping the world for 10 seconds ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer_benchmark.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index dcd75e9e49f4..a26fc67b63bb 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -185,6 +185,35 @@ static void ring_buffer_consumer(void) complete(&read_done); } +/* + * If we are a non preempt kernel, the 10 second run will + * stop everything while it runs. Instead, we will call cond_resched + * and also add any time that was lost by a rescedule. + */ +#ifdef CONFIG_PREEMPT +static void sched_if_needed(struct timeval *start_tv, struct timeval *end_tv) +{ +} +#else +static void sched_if_needed(struct timeval *start_tv, struct timeval *end_tv) +{ + struct timeval tv; + + cond_resched(); + do_gettimeofday(&tv); + if (tv.tv_usec < end_tv->tv_usec) { + tv.tv_usec += 1000000; + tv.tv_sec--; + } + start_tv->tv_sec += tv.tv_sec - end_tv->tv_sec; + start_tv->tv_usec += tv.tv_usec - end_tv->tv_usec; + if (start_tv->tv_usec > 1000000) { + start_tv->tv_usec -= 1000000; + start_tv->tv_sec++; + } +} +#endif + static void ring_buffer_producer(void) { struct timeval start_tv; @@ -221,6 +250,8 @@ static void ring_buffer_producer(void) if (consumer && !(++cnt % wakeup_interval)) wake_up_process(consumer); + sched_if_needed(&start_tv, &end_tv); + } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test); pr_info("End ring buffer hammer\n"); -- cgit v1.2.2 From 71e1c8ac42ae4038ddb1367cce7097ab868dc532 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 6 May 2009 21:20:39 -0400 Subject: tracing: update sample with TRACE_INCLUDE_FILE When creating trace events for ftrace, the header file with the TRACE_EVENT macros must also have a macro called TRACE_SYSTEM. This macro describes the name of the system the TRACE_EVENTS are defined for. It also doubles as a way for the define_trace.h file to include the file that included it. For example: in irq.h #define TRACE_SYSTEM irq [...] #include The define_trace will use TRACE_SYSTEM to include irq.h. 
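Such a header usually has roughly this shape (a sketch, not the actual irq.h):

	#undef TRACE_SYSTEM
	#define TRACE_SYSTEM irq

	/* TRACE_EVENT(...) definitions for the irq events go here */

	/*
	 * This include must come last: define_trace.h re-reads this header
	 * to generate the event code, locating it via TRACE_SYSTEM (or via
	 * TRACE_INCLUDE_FILE when the two names differ, as described below).
	 */
	#include <trace/define_trace.h>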
But if the name of the trace system does not match the name of the trace header file, one can override it with: #define TRACE_INCLUDE_FILE foo_trace Which will change define_trace.h to include foo_trace.h instead of foo.h The sample comments this, but people that use the sample code will more likely use the code and not read the comments. This patch changes the sample code to use the TRACE_INCLUDE_FILE to better show developers how to use it. [ Impact: make sample less confusing to developers ] Reported-by: Christoph Hellwig Signed-off-by: Steven Rostedt --- samples/trace_events/trace-events-sample.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h index eab46443e610..128a897687c5 100644 --- a/samples/trace_events/trace-events-sample.h +++ b/samples/trace_events/trace-events-sample.h @@ -31,7 +31,7 @@ * */ #undef TRACE_SYSTEM -#define TRACE_SYSTEM trace-events-sample +#define TRACE_SYSTEM sample /* * The TRACE_EVENT macro is broken up into 5 parts. @@ -120,5 +120,10 @@ TRACE_EVENT(foo_bar, * result. */ #undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_PATH . +/* + * TRACE_INCLUDE_FILE is not needed if the filename and TRACE_SYSTEM are equal + */ +#define TRACE_INCLUDE_FILE trace-events-sample +#include -- cgit v1.2.2 From 9456f0fa6d3cb944d3b9fc31c9a244e0362c26ea Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 6 May 2009 21:54:09 -0400 Subject: tracing: reset ring buffer when removing modules with events Li Zefan found that there's a race using the event ids of events and modules. When a module is loaded, an event id is incremented. We only have 16 bits for event ids (65536) and there is a possible (but highly unlikely) race that we could load and unload a module that registers events so many times that the event id counter overflows. When it overflows, it then restarts and goes looking for available ids. An id is available if it was added by a module and released. The race is if you have one module add an id, and then it is removed. Another module loaded can use that same event id. But if the old module still had events in the ring buffer, the new module's callback would get bogus data. At best (and most likely) the output would just be garbage. But if the module for some reason used pointers (not recommended) then this could potentially crash. The safest thing to do is just reset the ring buffer if a module that registered events is removed.
[ Impact: prevent unpredictable results of event id overflows ] Reported-by: Li Zefan LKML-Reference: <49FEAFD0.30106@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 10 ++++++++++ kernel/trace/trace.h | 2 ++ kernel/trace/trace_events.c | 9 +++++++++ 3 files changed, 21 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4164a344e72a..dd40d2320346 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -639,6 +639,16 @@ void tracing_reset_online_cpus(struct trace_array *tr) tracing_reset(tr, cpu); } +void tracing_reset_current(int cpu) +{ + tracing_reset(&global_trace, cpu); +} + +void tracing_reset_current_online_cpus(void) +{ + tracing_reset_online_cpus(&global_trace); +} + #define SAVED_CMDLINES 128 #define NO_CMDLINE_MAP UINT_MAX static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 777c6c3a0cde..ba25793ffe67 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -409,6 +409,8 @@ int tracing_is_enabled(void); void trace_wake_up(void); void tracing_reset(struct trace_array *tr, int cpu); void tracing_reset_online_cpus(struct trace_array *tr); +void tracing_reset_current(int cpu); +void tracing_reset_current_online_cpus(void); int tracing_open_generic(struct inode *inode, struct file *filp); struct dentry *trace_create_file(const char *name, mode_t mode, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 8d579ff23610..6d2c842a0248 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -932,9 +932,11 @@ static void trace_module_remove_events(struct module *mod) { struct ftrace_module_file_ops *file_ops; struct ftrace_event_call *call, *p; + bool found = false; list_for_each_entry_safe(call, p, &ftrace_events, list) { if (call->mod == mod) { + found = true; if (call->enabled) { call->enabled = 0; call->unregfunc(); @@ -957,6 +959,13 @@ static void trace_module_remove_events(struct module *mod) list_del(&file_ops->list); kfree(file_ops); } + + /* + * It is safest to reset the ring buffer if the module being unloaded + * registered any events. + */ + if (found) + tracing_reset_current_online_cpus(); } static int trace_module_notify(struct notifier_block *self, -- cgit v1.2.2 From 8ae79a138e88aceeeb07077bff2883245fb7c218 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 6 May 2009 22:52:15 -0400 Subject: tracing: add hierarchical enabling of events With the current event directory, you can only enable individual events. The file debugfs/tracing/set_event is used to be able to enable or disable several events at once. But that can still be awkward. This patch adds hierarchical enabling of events. That is, each directory in debugfs/tracing/events has an "enable" file. This file can enable or disable all events within the directory and below. # echo 1 > /debugfs/tracing/events/enable will enable all events. # echo 1 > /debugfs/tracing/events/sched/enable will enable all events in the sched subsystem. # echo 1 > /debugfs/tracing/events/enable # echo 0 > /debugfs/tracing/events/irq/enable will enable all events, but then disable just the irq subsystem events. When reading one of these enable files, there are four results: 0 - all events this file affects are disabled 1 - all events this file affects are enabled X - there is a mixture of events enabled and disabled ? 
- this file does not affect any event Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 140 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 140 insertions(+) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 6d2c842a0248..87feb0117ce2 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -400,6 +400,133 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, return cnt; } +static ssize_t +system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + const char *system = filp->private_data; + struct ftrace_event_call *call; + char buf[2]; + int set = -1; + int all = 0; + int ret; + + if (system[0] == '*') + all = 1; + + mutex_lock(&event_mutex); + list_for_each_entry(call, &ftrace_events, list) { + if (!call->name || !call->regfunc) + continue; + + if (!all && strcmp(call->system, system) != 0) + continue; + + /* + * We need to find out if all the events are set + * or if all events or cleared, or if we have + * a mixture. + */ + if (call->enabled) { + switch (set) { + case -1: + set = 1; + break; + case 0: + set = 2; + break; + } + } else { + switch (set) { + case -1: + set = 0; + break; + case 1: + set = 2; + break; + } + } + /* + * If we have a mixture, no need to look further. + */ + if (set == 2) + break; + } + mutex_unlock(&event_mutex); + + buf[1] = '\n'; + switch (set) { + case 0: + buf[0] = '0'; + break; + case 1: + buf[0] = '1'; + break; + case 2: + buf[0] = 'X'; + break; + default: + buf[0] = '?'; + } + + ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); + + return ret; +} + +static ssize_t +system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + const char *system = filp->private_data; + unsigned long val; + char *command; + char buf[64]; + ssize_t ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + ret = tracing_update_buffers(); + if (ret < 0) + return ret; + + switch (val) { + case 0: + case 1: + break; + + default: + return -EINVAL; + } + + command = kstrdup(system, GFP_KERNEL); + if (!command) + return -ENOMEM; + + ret = ftrace_set_clr_event(command, val); + if (ret) + goto out_free; + + ret = cnt; + + out_free: + kfree(command); + + *ppos += cnt; + + return ret; +} + extern char *__bad_type_size(void); #undef FIELD @@ -686,6 +813,12 @@ static const struct file_operations ftrace_subsystem_filter_fops = { .write = subsystem_filter_write, }; +static const struct file_operations ftrace_system_enable_fops = { + .open = tracing_open_generic, + .read = system_enable_read, + .write = system_enable_write, +}; + static const struct file_operations ftrace_show_header_fops = { .open = tracing_open_generic, .read = show_header, @@ -768,6 +901,10 @@ event_subsystem_dir(const char *name, struct dentry *d_events) "'%s/filter' entry\n", name); } + entry = trace_create_file("enable", 0644, system->entry, + (void *)system->name, + &ftrace_system_enable_fops); + return system->entry; } @@ -1041,6 +1178,9 @@ static __init int event_trace_init(void) ring_buffer_print_entry_header, &ftrace_show_header_fops); + trace_create_file("enable", 0644, d_events, + "*:*", &ftrace_system_enable_fops); + for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { /* The linker may leave blanks */ if (!call->name) -- cgit v1.2.2 From e8808c1019b048a43686dbd25c188a035842c2e2 Mon Sep 
17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 3 May 2009 02:48:52 +0200 Subject: tracing/filters: support for filters of dynamic sized arrays Currently the filtering infrastructure supports well the numeric types and fixed sized array types. But the recently added __string() field uses a specific indirect offset mechanism which requires a specific predicate. Until now it wasn't supported. This patch adds this support and implies very few changes, only a new predicate is needed, the management of this specific field can be done through the usual string helpers in the filtering infrastructure. [ Impact: support all kinds of strings in the tracing filters ] Cc: Tom Zanussi Cc: Steven Rostedt Cc: Li Zefan Cc: Zhaolei Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_events_filter.c | 44 +++++++++++++++++++++++++++++++++++--- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 7ac691085276..01c76eb3e168 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -151,6 +151,7 @@ static int filter_pred_or(struct filter_pred *pred __attribute((unused)), return val1 || val2; } +/* Filter predicate for fixed sized arrays of characters */ static int filter_pred_string(struct filter_pred *pred, void *event, int val1, int val2) { @@ -164,6 +165,30 @@ static int filter_pred_string(struct filter_pred *pred, void *event, return match; } +/* + * Filter predicate for dynamic sized arrays of characters. + * These are implemented through a list of strings at the end + * of the entry. + * Also each of these strings have a field in the entry which + * contains its offset from the beginning of the entry. + * We have then first to get this field, dereference it + * and add it to the address of the entry, and at last we have + * the address of the string. 
+ */ +static int filter_pred_strloc(struct filter_pred *pred, void *event, + int val1, int val2) +{ + int str_loc = *(int *)(event + pred->offset); + char *addr = (char *)(event + str_loc); + int cmp, match; + + cmp = strncmp(addr, pred->str_val, pred->str_len); + + match = (!cmp) ^ pred->not; + + return match; +} + static int filter_pred_none(struct filter_pred *pred, void *event, int val1, int val2) { @@ -446,10 +471,18 @@ static int filter_add_pred_fn(struct filter_parse_state *ps, return 0; } +enum { + FILTER_STATIC_STRING = 1, + FILTER_DYN_STRING +}; + static int is_string_field(const char *type) { if (strchr(type, '[') && strstr(type, "char")) - return 1; + return FILTER_STATIC_STRING; + + if (!strcmp(type, "__str_loc")) + return FILTER_DYN_STRING; return 0; } @@ -512,6 +545,7 @@ static int filter_add_pred(struct filter_parse_state *ps, struct ftrace_event_field *field; filter_pred_fn_t fn; unsigned long long val; + int string_type; pred->fn = filter_pred_none; @@ -536,8 +570,12 @@ static int filter_add_pred(struct filter_parse_state *ps, return -EINVAL; } - if (is_string_field(field->type)) { - fn = filter_pred_string; + string_type = is_string_field(field->type); + if (string_type) { + if (string_type == FILTER_STATIC_STRING) + fn = filter_pred_string; + else + fn = filter_pred_strloc; pred->str_len = field->size; if (pred->op == OP_NE) pred->not = 1; -- cgit v1.2.2 From 5928c3cc0ffcb6894bbab6be591b7ae1786b2d87 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 3 May 2009 03:03:57 +0200 Subject: tracing/filters: support for operator reserved characters in strings When we set a filter for an event, such as: echo "name == my_lock_name" > \ /debug/tracing/events/lockdep/lock_acquired/filter then the following order of token type is parsed: - space - operator - parentheses - operand Because the operators and parentheses have a higher precedence than the operand characters, which is normal, then we can't use any string containing such special characters: ()=<>!&| To get this support and also avoid ambiguous intepretation from the parser or the human, we can do it using double quotes so that we keep the usual languages habits. Then after this patch you can still declare string condition like before: echo name == myname But if you want to compare against a string containing an operator character, you can use double quotes: echo 'name == "&myname"' Don't forget to include the whole expression into single quotes or the double ones will be eaten by echo. 
[ Impact: support strings with special characters for tracing filters ] Cc: Tom Zanussi Cc: Steven Rostedt Cc: Li Zefan Cc: Zhaolei Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_events_filter.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 01c76eb3e168..8c62e5bdff09 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -851,10 +851,19 @@ static void postfix_clear(struct filter_parse_state *ps) static int filter_parse(struct filter_parse_state *ps) { + int in_string = 0; int op, top_op; char ch; while ((ch = infix_next(ps))) { + if (ch == '"') { + in_string ^= 1; + continue; + } + + if (in_string) + goto parse_operand; + if (isspace(ch)) continue; @@ -908,6 +917,7 @@ static int filter_parse(struct filter_parse_state *ps) } continue; } +parse_operand: if (append_operand_char(ps, ch)) { parse_error(ps, FILT_ERR_OPERAND_TOO_LONG, 0); return -EINVAL; -- cgit v1.2.2 From d94fc523f3c35bd8013f04827e94756cbc0212f4 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 7 May 2009 15:11:15 +0800 Subject: tracing/events: fix concurrent access to ftrace_events list, fix In filter_add_subsystem_pred() we should release event_mutex before calling filter_free_subsystem_preds(), since both functions hold event_mutex. [ Impact: fix deadlock when writing invalid pred into subsystem filter ] Signed-off-by: Li Zefan Cc: tzanussi@gmail.com Cc: a.p.zijlstra@chello.nl Cc: fweisbec@gmail.com Cc: rostedt@goodmis.org LKML-Reference: <4A028993.7020509@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events_filter.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 8c62e5bdff09..85ad6a8939ad 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -636,14 +636,15 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps, err = filter_add_pred(ps, call, pred); if (err) { + mutex_unlock(&event_mutex); filter_free_subsystem_preds(system); parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); - break; + goto out; } replace_filter_string(call->filter, filter_string); } mutex_unlock(&event_mutex); - +out: return err; } -- cgit v1.2.2 From 29c8000ee7da3a6756d26143991e573eaaf2a9f6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 7 May 2009 11:13:42 -0400 Subject: ring-buffer: remove complex calculations in ring-buffer-test Ingo Molnar thought that the code to calculate the time in cond_resched is a bit too ugly and is not needed. This patch removes it and replaces it with a simple call to cond_resched. I kept the comment that explains the reason for the cond_resched. [ Impact: remove ugly code ] Reported-by: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer_benchmark.c | 37 +++++++----------------------------- 1 file changed, 7 insertions(+), 30 deletions(-) diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index a26fc67b63bb..f4ceb453c7dd 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -185,35 +185,6 @@ static void ring_buffer_consumer(void) complete(&read_done); } -/* - * If we are a non preempt kernel, the 10 second run will - * stop everything while it runs. Instead, we will call cond_resched - * and also add any time that was lost by a rescedule. 
- */ -#ifdef CONFIG_PREEMPT -static void sched_if_needed(struct timeval *start_tv, struct timeval *end_tv) -{ -} -#else -static void sched_if_needed(struct timeval *start_tv, struct timeval *end_tv) -{ - struct timeval tv; - - cond_resched(); - do_gettimeofday(&tv); - if (tv.tv_usec < end_tv->tv_usec) { - tv.tv_usec += 1000000; - tv.tv_sec--; - } - start_tv->tv_sec += tv.tv_sec - end_tv->tv_sec; - start_tv->tv_usec += tv.tv_usec - end_tv->tv_usec; - if (start_tv->tv_usec > 1000000) { - start_tv->tv_usec -= 1000000; - start_tv->tv_sec++; - } -} -#endif - static void ring_buffer_producer(void) { struct timeval start_tv; @@ -250,7 +221,13 @@ static void ring_buffer_producer(void) if (consumer && !(++cnt % wakeup_interval)) wake_up_process(consumer); - sched_if_needed(&start_tv, &end_tv); + /* + * If we are a non preempt kernel, the 10 second run will + * stop everything while it runs. Instead, we will call + * cond_resched and also add any time that was lost by a + * rescedule. + */ + cond_resched(); } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test); pr_info("End ring buffer hammer\n"); -- cgit v1.2.2 From d6bf81ef0f7474434c2a049e8bf3c9146a14dd96 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 7 May 2009 11:49:35 -0400 Subject: tracing: append ":*" to internal setting of system events The system enabling of events uses the same code as the set_event file. It passes in the name of the system to the parser and that will enable all the events that has that system as a name. The problem is that it will also enable events with the same name as the system. If you have system name foo, and system name bar, but within the system bar, there exists an event called foo. By setting the system name foo, you will also be enabling the event foo in the system bar. This is not an expected result. The solution is to pass in "foo:*", which will only enable the system foo and not events called foo. [ Impact: prevent accidental enabling of events with same name as a system ] Reported-by: Li Zefan Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 87feb0117ce2..8d0fae3af595 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -509,9 +509,11 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, return -EINVAL; } - command = kstrdup(system, GFP_KERNEL); + /* +3 for the ":*\0" */ + command = kmalloc(strlen(system)+3, GFP_KERNEL); if (!command) return -ENOMEM; + sprintf(command, "%s:*", system); ret = ftrace_set_clr_event(command, val); if (ret) @@ -1179,7 +1181,7 @@ static __init int event_trace_init(void) &ftrace_show_header_fops); trace_create_file("enable", 0644, d_events, - "*:*", &ftrace_system_enable_fops); + "*", &ftrace_system_enable_fops); for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { /* The linker may leave blanks */ -- cgit v1.2.2 From 65b77242043f74bca6a0d733c0e48ef03a8c9893 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 7 May 2009 12:49:27 -0400 Subject: tracing: have menu default enabled when kernel debug is configured Tracing can be very helpful to debug the kernel. When DEBUG_KERNEL is enabled it is nice to enable the trace menu as well. This patch only make the tracing menu enabled by default, it does not make any of the tracers enabled. And the menu is only enabled by default if DEBUG_KERNEL is enabled. 
[ Impact: show tracing options to those debugging the kernel ] Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 50f62a296e1d..f61be3015783 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -79,6 +79,7 @@ if TRACING_SUPPORT menuconfig FTRACE bool "Tracers" + default y if DEBUG_KERNEL help Enable the kernel tracing infrastructure. -- cgit v1.2.2 From 0574ea421b90e0e45a72c447dd3c2c79ffd8c153 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 7 May 2009 14:20:28 -0400 Subject: ring-buffer: only periodically call cond_resched to ring-buffer-benchmark Calling cond_resched at every iteration of the loop adds a bit of overhead to the benchmark. This patch does two things. 1) only calls cond-resched when CONFIG_PREEMPT is not enabled 2) only calls cond-resched after so many traces has been performed. [ Impact: less overhead to the ring-buffer-benchmark ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer_benchmark.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index f4ceb453c7dd..a7c048bb4466 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -218,16 +218,23 @@ static void ring_buffer_producer(void) } do_gettimeofday(&end_tv); - if (consumer && !(++cnt % wakeup_interval)) + cnt++; + if (consumer && !(cnt % wakeup_interval)) wake_up_process(consumer); +#ifndef CONFIG_PREEMPT /* * If we are a non preempt kernel, the 10 second run will * stop everything while it runs. Instead, we will call * cond_resched and also add any time that was lost by a * rescedule. + * + * Do a cond resched at the same frequency we would wake up + * the reader. */ - cond_resched(); + if (cnt % wakeup_interval) + cond_resched(); +#endif } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test); pr_info("End ring buffer hammer\n"); -- cgit v1.2.2 From 7da3046d6ce6ea97494020081c509b642b7016af Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 7 May 2009 19:52:20 -0400 Subject: ring-buffer: add total count in ring-buffer-benchmark It is nice to see the overhead of the benchmark test when tracing is disabled. That is, we turn off the ring buffer just to see what the cost of running the loop that calls into the ring buffer is. Currently, if no entries wer made, we get 0. This is not informative. This patch changes it to check if we had any "missed" (non recorded) events. If so, a total count is also reported. 
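With hypothetical numbers, a 10 second run with the ring buffer disabled might end with hit = 0 and missed = 20,000,000 over roughly 10,000 ms. Where the old output only reported 0 entries, the new report works out to:

	(0 + 20,000,000) / 10,000 ms = 2,000 iterations per millisecond
	1,000,000 ns / 2,000         = ~500 ns of loop overhead per iteration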
[ Impact: evaluate the over head of the ring buffer benchmark test ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer_benchmark.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index a7c048bb4466..a21aa7b3d05e 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -285,6 +285,17 @@ static void ring_buffer_producer(void) avg = 1000000 / hit; pr_info("%ld ns per entry\n", avg); } + + + if (missed) { + if (time) + missed /= (long)time; + + pr_info("Total iterations per millisec: %ld\n", hit + missed); + + avg = 1000000 / (hit + missed); + pr_info("%ld ns per entry\n", avg); + } } static void wait_to_die(void) -- cgit v1.2.2 From 74f4fd21664148b8c454cc07bfe74e4dd51cf07b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 7 May 2009 19:58:55 -0400 Subject: ring-buffer: change WARN_ON from checking preempt_count to preemptible There's a WARN_ON in the ring buffer code that makes sure preemption is disabled. It checks "!preempt_count()". But when CONFIG_PREEMPT is not enabled, preempt_count() is always zero, and this will trigger the warning. [ Impact: prevent false warning on non preemptible kernels ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 3ae5ccf2c0fc..361170609bd0 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1688,7 +1688,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, * committed yet. Thus we can assume that preemption * is still disabled. */ - RB_WARN_ON(buffer, !preempt_count()); + RB_WARN_ON(buffer, preemptible()); cpu = smp_processor_id(); cpu_buffer = buffer->buffers[cpu]; -- cgit v1.2.2 From 8f31bfe538ebafac187d2d4465a92e1d9ee6d8c2 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 8 May 2009 10:31:42 +0800 Subject: tracing/events: clean up for ftrace_set_clr_event() Add a helper function __ftrace_set_clr_event(), and replace some ftrace_set_clr_event() calls with this helper, thus we don't need any kstrdup() or kmalloc(). As a side effect, this patch fixes an issue in self tests code, which is similar to the one fixed in commit d6bf81ef0f7474434c2a049e8bf3c9146a14dd96 ("tracing: append ":*" to internal setting of system events") It's a small issue and won't cause any bug in fact, but we should do things right anyway. [ Impact: prevent spurious event-enabling in tracing self-tests ] Signed-off-by: Li Zefan Acked-by: Steven Rostedt Acked-by: Frederic Weisbecker LKML-Reference: <4A03998E.3020503@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events.c | 126 ++++++++++++++++---------------------------- 1 file changed, 46 insertions(+), 80 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 8d0fae3af595..45f1099386b6 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -111,11 +111,44 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call, } } -static int ftrace_set_clr_event(char *buf, int set) +/* + * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. 
+ */ +static int __ftrace_set_clr_event(const char *match, const char *sub, + const char *event, int set) { struct ftrace_event_call *call; + int ret; + + mutex_lock(&event_mutex); + list_for_each_entry(call, &ftrace_events, list) { + + if (!call->name || !call->regfunc) + continue; + + if (match && + strcmp(match, call->name) != 0 && + strcmp(match, call->system) != 0) + continue; + + if (sub && strcmp(sub, call->system) != 0) + continue; + + if (event && strcmp(event, call->name) != 0) + continue; + + ftrace_event_enable_disable(call, set); + + ret = 0; + } + mutex_unlock(&event_mutex); + + return ret; +} + +static int ftrace_set_clr_event(char *buf, int set) +{ char *event = NULL, *sub = NULL, *match; - int ret = -EINVAL; /* * The buf format can be : @@ -141,30 +174,7 @@ static int ftrace_set_clr_event(char *buf, int set) event = NULL; } - mutex_lock(&event_mutex); - list_for_each_entry(call, &ftrace_events, list) { - - if (!call->name || !call->regfunc) - continue; - - if (match && - strcmp(match, call->name) != 0 && - strcmp(match, call->system) != 0) - continue; - - if (sub && strcmp(sub, call->system) != 0) - continue; - - if (event && strcmp(event, call->name) != 0) - continue; - - ftrace_event_enable_disable(call, set); - - ret = 0; - } - mutex_unlock(&event_mutex); - - return ret; + return __ftrace_set_clr_event(match, sub, event, set); } /* 128 should be much more than enough */ @@ -408,18 +418,14 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, struct ftrace_event_call *call; char buf[2]; int set = -1; - int all = 0; int ret; - if (system[0] == '*') - all = 1; - mutex_lock(&event_mutex); list_for_each_entry(call, &ftrace_events, list) { if (!call->name || !call->regfunc) continue; - if (!all && strcmp(call->system, system) != 0) + if (system && strcmp(call->system, system) != 0) continue; /* @@ -480,7 +486,6 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, { const char *system = filp->private_data; unsigned long val; - char *command; char buf[64]; ssize_t ret; @@ -500,30 +505,16 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, if (ret < 0) return ret; - switch (val) { - case 0: - case 1: - break; - - default: + if (val != 0 && val != 1) return -EINVAL; - } - /* +3 for the ":*\0" */ - command = kmalloc(strlen(system)+3, GFP_KERNEL); - if (!command) - return -ENOMEM; - sprintf(command, "%s:*", system); - - ret = ftrace_set_clr_event(command, val); + ret = __ftrace_set_clr_event(NULL, system, NULL, val); if (ret) - goto out_free; + goto out; ret = cnt; - out_free: - kfree(command); - +out: *ppos += cnt; return ret; @@ -1181,7 +1172,7 @@ static __init int event_trace_init(void) &ftrace_show_header_fops); trace_create_file("enable", 0644, d_events, - "*", &ftrace_system_enable_fops); + NULL, &ftrace_system_enable_fops); for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { /* The linker may leave blanks */ @@ -1259,7 +1250,6 @@ static __init void event_trace_self_tests(void) { struct ftrace_event_call *call; struct event_subsystem *system; - char *sysname; int ret; pr_info("Running tests on trace events:\n"); @@ -1305,14 +1295,7 @@ static __init void event_trace_self_tests(void) pr_info("Testing event system %s: ", system->name); - /* ftrace_set_clr_event can modify the name passed in. 
*/ - sysname = kstrdup(system->name, GFP_KERNEL); - if (WARN_ON(!sysname)) { - pr_warning("Can't allocate memory, giving up!\n"); - return; - } - ret = ftrace_set_clr_event(sysname, 1); - kfree(sysname); + ret = __ftrace_set_clr_event(NULL, system->name, NULL, 1); if (WARN_ON_ONCE(ret)) { pr_warning("error enabling system %s\n", system->name); @@ -1321,14 +1304,7 @@ static __init void event_trace_self_tests(void) event_test_stuff(); - sysname = kstrdup(system->name, GFP_KERNEL); - if (WARN_ON(!sysname)) { - pr_warning("Can't allocate memory, giving up!\n"); - return; - } - ret = ftrace_set_clr_event(sysname, 0); - kfree(sysname); - + ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0); if (WARN_ON_ONCE(ret)) pr_warning("error disabling system %s\n", system->name); @@ -1341,15 +1317,8 @@ static __init void event_trace_self_tests(void) pr_info("Running tests on all trace events:\n"); pr_info("Testing all events: "); - sysname = kmalloc(4, GFP_KERNEL); - if (WARN_ON(!sysname)) { - pr_warning("Can't allocate memory, giving up!\n"); - return; - } - memcpy(sysname, "*:*", 4); - ret = ftrace_set_clr_event(sysname, 1); + ret = __ftrace_set_clr_event(NULL, NULL, NULL, 1); if (WARN_ON_ONCE(ret)) { - kfree(sysname); pr_warning("error enabling all events\n"); return; } @@ -1357,10 +1326,7 @@ static __init void event_trace_self_tests(void) event_test_stuff(); /* reset sysname */ - memcpy(sysname, "*:*", 4); - ret = ftrace_set_clr_event(sysname, 0); - kfree(sysname); - + ret = __ftrace_set_clr_event(NULL, NULL, NULL, 0); if (WARN_ON_ONCE(ret)) { pr_warning("error disabling all events\n"); return; -- cgit v1.2.2 From c142b15dc56ee6d55cb97a062e3c8e9c61e384c0 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 8 May 2009 10:32:05 +0800 Subject: tracing/events: simplify system_enable_read() A smarter way to figure out the output of an enable file. [ Impact: clean up ] Signed-off-by: Li Zefan Acked-by: Steven Rostedt Acked-by: Frederic Weisbecker LKML-Reference: <4A0399A5.2080603@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events.c | 40 ++++++---------------------------------- 1 file changed, 6 insertions(+), 34 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 45f1099386b6..df394bc6d54b 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -414,10 +414,11 @@ static ssize_t system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { + const char set_to_char[4] = { '?', '0', '1', 'X' }; const char *system = filp->private_data; struct ftrace_event_call *call; char buf[2]; - int set = -1; + int set = 0; int ret; mutex_lock(&event_mutex); @@ -433,47 +434,18 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, * or if all events or cleared, or if we have * a mixture. */ - if (call->enabled) { - switch (set) { - case -1: - set = 1; - break; - case 0: - set = 2; - break; - } - } else { - switch (set) { - case -1: - set = 0; - break; - case 1: - set = 2; - break; - } - } + set |= (1 << !!call->enabled); + /* * If we have a mixture, no need to look further. 
*/ - if (set == 2) + if (set == 3) break; } mutex_unlock(&event_mutex); + buf[0] = set_to_char[set]; buf[1] = '\n'; - switch (set) { - case 0: - buf[0] = '0'; - break; - case 1: - buf[0] = '1'; - break; - case 2: - buf[0] = 'X'; - break; - default: - buf[0] = '?'; - } ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); -- cgit v1.2.2 From 29f93943d1916d1a3faa3f10f4a06994347ac990 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 8 May 2009 16:06:47 -0400 Subject: tracing: initialize return value for __ftrace_set_clr_event Commit 8f31bfe538ebafac187d2d4465a92e1d9ee6d8c2 tracing/events: clean up for ftrace_set_clr_event() Moved out the code for ftrace_set_clr_event into a helper funciton but did not initialize the return value. As a result, we do not warn about a typo in the echoing of events in set_event. This patch restores the old warning: # echo foobar > set_event -bash: echo: write error: Invalid argument [ Impact: restore warning of invalid entries to set_event ] Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index df394bc6d54b..2eecb87e42d3 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -118,7 +118,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub, const char *event, int set) { struct ftrace_event_call *call; - int ret; + int ret = -EINVAL; mutex_lock(&event_mutex); list_for_each_entry(call, &ftrace_events, list) { -- cgit v1.2.2 From 4671c79408a3f8a5a6a45e39c4c164dada3a5678 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 8 May 2009 16:27:41 -0400 Subject: tracing: add trace_set_clr_event to export event enabling function Other parts of the kernel may need to be able to enable or disable specific events. Especially parts that create trace events. [ Impact: allow enabling of trace events by those that create the event ] Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 2 ++ kernel/trace/trace_events.c | 17 +++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 662c1becf367..bae51ddfabd3 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -127,6 +127,8 @@ extern int trace_define_field(struct ftrace_event_call *call, char *type, #define is_signed_type(type) (((type)(-1)) < 0) +int trace_set_clr_event(const char *system, const char *event, int set); + /* * The double __builtin_constant_p is because gcc will give us an error * if we try to allocate the static variable to fmt if it is not a diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 2eecb87e42d3..0eec0c55dd87 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -177,6 +177,23 @@ static int ftrace_set_clr_event(char *buf, int set) return __ftrace_set_clr_event(match, sub, event, set); } +/** + * trace_set_clr_event - enable or disable an event + * @system: system name to match (NULL for any system) + * @event: event name to match (NULL for all events, within system) + * @set: 1 to enable, 0 to disable + * + * This is a way for other parts of the kernel to enable or disable + * event recording. + * + * Returns 0 on success, -EINVAL if the parameters do not match any + * registered events. 
+ */ +int trace_set_clr_event(const char *system, const char *event, int set) +{ + return __ftrace_set_clr_event(NULL, system, event, set); +} + /* 128 should be much more than enough */ #define EVENT_BUF_SIZE 127 -- cgit v1.2.2 From 049862579333cc6cd9e6edfd6987cd0addfd8c59 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 11 May 2009 14:33:23 +0800 Subject: blktrace: pdu_buf of pc events should be unsigned I got this: 8,0 1 305.417782332 2037 I R 32 (ffffff9e 10 00 ...) [bash] It should be: 8,0 1 305.417782332 2037 I R 32 (9e 10 00 ...) [bash] [ Impact: fix output of pc events ] Signed-off-by: Li Zefan Cc: Jens Axboe Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A07C6B3.9080802@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index e099f8cc1d1c..05b4747fd873 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1065,7 +1065,7 @@ static int blk_log_action(struct trace_iterator *iter, const char *act) static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent) { - const char *pdu_buf; + const unsigned char *pdu_buf; int pdu_len; int i, end, ret; -- cgit v1.2.2 From 79c5d3ce614d8fe706545c7bca2158b63db6bb5e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 11 May 2009 15:06:46 +0800 Subject: blktrace: from-sector redundant in trace_block_remap, cleanup The last argument of block_remap prober is the original sector before remap, so it should be 'from', not 'to'. [ Impact: clean up ] Signed-off-by: Li Zefan Cc: "Alan D. Brunelle" Cc: Jens Axboe Cc: Arnaldo Carvalho de Melo Cc: KOSAKI Motohiro LKML-Reference: <4A07CE86.5090301@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- include/trace/block.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/trace/block.h b/include/trace/block.h index 8ac945b7746e..5b12efa096b6 100644 --- a/include/trace/block.h +++ b/include/trace/block.h @@ -70,7 +70,7 @@ DECLARE_TRACE(block_split, DECLARE_TRACE(block_remap, TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev, - sector_t to), - TP_ARGS(q, bio, dev, to)); + sector_t from), + TP_ARGS(q, bio, dev, from)); #endif -- cgit v1.2.2 From 5a772b2b3c68e7e0b503c5a48469113bb0634314 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 8 May 2009 10:56:33 -0400 Subject: ring-buffer: replace constants with time macros in ring-buffer-benchmark The use of numeric constants is discouraged. It is cleaner and more descriptive to use macros for constant time conversions. This patch also removes an extra new line. 
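For reference, and assuming the usual macro values (USEC_PER_SEC = 1000000, USEC_PER_MSEC = 1000, NSEC_PER_MSEC = 1000000), the conversion chain condenses to roughly:

	time = end_tv.tv_sec - start_tv.tv_sec;
	time *= USEC_PER_SEC;              /* seconds -> microseconds */
	time += end_tv.tv_usec - start_tv.tv_usec;

	do_div(time, USEC_PER_MSEC);       /* microseconds -> milliseconds */
	hit /= (long)time;                 /* entries per millisecond */
	avg = NSEC_PER_MSEC / hit;         /* nanoseconds per entry */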
[ Impact: more descriptive time conversions ] Reported-by: Andrew Morton Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer_benchmark.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index a21aa7b3d05e..7d3aef93c49f 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -253,7 +253,7 @@ static void ring_buffer_producer(void) } time = end_tv.tv_sec - start_tv.tv_sec; - time *= 1000000; + time *= USEC_PER_SEC; time += (long long)((long)end_tv.tv_usec - (long)start_tv.tv_usec); entries = ring_buffer_entries(buffer); @@ -273,7 +273,8 @@ static void ring_buffer_producer(void) pr_info("Missed: %ld\n", missed); pr_info("Hit: %ld\n", hit); - do_div(time, 1000); + /* Convert time from usecs to millisecs */ + do_div(time, USEC_PER_MSEC); if (time) hit /= (long)time; else @@ -282,18 +283,19 @@ static void ring_buffer_producer(void) pr_info("Entries per millisec: %ld\n", hit); if (hit) { - avg = 1000000 / hit; + /* Calculate the average time in nanosecs */ + avg = NSEC_PER_MSEC / hit; pr_info("%ld ns per entry\n", avg); } - if (missed) { if (time) missed /= (long)time; pr_info("Total iterations per millisec: %ld\n", hit + missed); - avg = 1000000 / (hit + missed); + /* Caculate the average time in nanosecs */ + avg = NSEC_PER_MSEC / (hit + missed); pr_info("%ld ns per entry\n", avg); } } -- cgit v1.2.2 From d988ff94c1074c4c914235c8591bcceafb585ecf Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 8 May 2009 11:03:57 -0400 Subject: ring-buffer: check for divide by zero in ring-buffer-benchmark Although we check if "missed" is not zero, we divide by hit + missed, and the addition can possible overflow and become a divide by zero. This patch checks for this case, and will report it when it happens then modify "hit" to make the calculation be non zero. [ Impact: prevent possible divide by zero in ring-buffer-benchmark ] Reported-by: Andrew Morton Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer_benchmark.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 7d3aef93c49f..8d68e149a8b3 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -294,6 +294,12 @@ static void ring_buffer_producer(void) pr_info("Total iterations per millisec: %ld\n", hit + missed); + /* it is possible that hit + missed will overflow and be zero */ + if (!(hit + missed)) { + pr_info("hit + missed overflowed and totalled zero!\n"); + hit--; /* make it non zero */ + } + /* Caculate the average time in nanosecs */ avg = NSEC_PER_MSEC / (hit + missed); pr_info("%ld ns per entry\n", avg); -- cgit v1.2.2 From 1cd8d7358948909ab80b254eb14bcebc555ad417 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 11 May 2009 14:08:09 -0400 Subject: ring-buffer: remove type parameter from rb_reserve_next_event The rb_reserve_next_event is only called for the data type (type = 0). There is no reason to pass in the type to the function. 
Before: text data bss dec hex filename 16554 24 12 16590 40ce kernel/trace/ring_buffer.o After: text data bss dec hex filename 16538 24 12 16574 40be kernel/trace/ring_buffer.o [ Impact: cleaner, smaller and slightly more efficient code ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 361170609bd0..fe40f6c3507c 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1389,7 +1389,7 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, static struct ring_buffer_event * rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, - unsigned type, unsigned long length) + unsigned long length) { struct ring_buffer_event *event; u64 ts, delta; @@ -1448,7 +1448,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, /* Non commits have zero deltas */ delta = 0; - event = __rb_reserve_next(cpu_buffer, type, length, &ts); + event = __rb_reserve_next(cpu_buffer, 0, length, &ts); if (PTR_ERR(event) == -EAGAIN) goto again; @@ -1556,7 +1556,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) if (length > BUF_PAGE_SIZE) goto out; - event = rb_reserve_next_event(cpu_buffer, 0, length); + event = rb_reserve_next_event(cpu_buffer, length); if (!event) goto out; @@ -1782,7 +1782,7 @@ int ring_buffer_write(struct ring_buffer *buffer, goto out; event_length = rb_calculate_event_length(length); - event = rb_reserve_next_event(cpu_buffer, 0, event_length); + event = rb_reserve_next_event(cpu_buffer, event_length); if (!event) goto out; -- cgit v1.2.2 From be957c447f7233a67904a1b11eb3ab61e702bf4d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 11 May 2009 14:42:53 -0400 Subject: ring-buffer: move calculation of event length The event length is calculated and passed in to rb_reserve_next_event in two different locations. Having rb_reserve_next_event do the calculations directly makes only one location to do the change and causes the calculation to be inlined by gcc. Before: text data bss dec hex filename 16538 24 12 16574 40be kernel/trace/ring_buffer.o After: text data bss dec hex filename 16490 24 12 16526 408e kernel/trace/ring_buffer.o [ Impact: smaller more efficient code ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index fe40f6c3507c..493cba46abc9 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -367,6 +367,9 @@ static inline int test_time_stamp(u64 delta) #define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE) +/* Max payload is BUF_PAGE_SIZE - header (8bytes) */ +#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) + int ring_buffer_print_page_header(struct trace_seq *s) { struct buffer_data_page field; @@ -1396,6 +1399,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, int commit = 0; int nr_loops = 0; + length = rb_calculate_event_length(length); again: /* * We allow for interrupts to reenter here and do a trace. 
@@ -1552,8 +1556,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) if (atomic_read(&cpu_buffer->record_disabled)) goto out; - length = rb_calculate_event_length(length); - if (length > BUF_PAGE_SIZE) + if (length > BUF_MAX_DATA_SIZE) goto out; event = rb_reserve_next_event(cpu_buffer, length); @@ -1758,7 +1761,6 @@ int ring_buffer_write(struct ring_buffer *buffer, { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; - unsigned long event_length; void *body; int ret = -EBUSY; int cpu, resched; @@ -1781,8 +1783,10 @@ int ring_buffer_write(struct ring_buffer *buffer, if (atomic_read(&cpu_buffer->record_disabled)) goto out; - event_length = rb_calculate_event_length(length); - event = rb_reserve_next_event(cpu_buffer, event_length); + if (length > BUF_MAX_DATA_SIZE) + goto out; + + event = rb_reserve_next_event(cpu_buffer, length); if (!event) goto out; -- cgit v1.2.2 From 0f0c85fc80adbbd2265d89867d743f929d516805 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 11 May 2009 16:08:00 -0400 Subject: ring-buffer: small optimizations Doing some small changes in the fast path of the ring buffer recording saves over 3% in the ring-buffer-benchmark test. [ Impact: a little faster ring buffer recording ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 493cba46abc9..f452de2ce490 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1000,7 +1000,7 @@ rb_event_index(struct ring_buffer_event *event) return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE); } -static int +static inline int rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { @@ -1423,9 +1423,9 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, * also be made. But only the entry that did the actual * commit will be something other than zero. */ - if (cpu_buffer->tail_page == cpu_buffer->commit_page && - rb_page_write(cpu_buffer->tail_page) == - rb_commit_index(cpu_buffer)) { + if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page && + rb_page_write(cpu_buffer->tail_page) == + rb_commit_index(cpu_buffer))) { delta = ts - cpu_buffer->write_stamp; @@ -1436,7 +1436,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, if (unlikely(ts < cpu_buffer->write_stamp)) delta = 0; - if (test_time_stamp(delta)) { + else if (unlikely(test_time_stamp(delta))) { commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); @@ -1470,7 +1470,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, * If the timestamp was commited, make the commit our entry * now so that we will update it when needed. */ - if (commit) + if (unlikely(commit)) rb_set_commit_event(cpu_buffer, event); else if (!rb_is_commit(cpu_buffer, event)) delta = 0; -- cgit v1.2.2 From 88eb0125362f2ab272cbaf84252cf101ddc2dec9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 11 May 2009 16:28:23 -0400 Subject: ring-buffer: use internal time stamp function The ring_buffer_time_stamp that is exported adds a little more overhead than is needed for using it internally. This patch adds an internal timestamp function that can be inlined (a single line function) and used internally for the ring buffer. 
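Condensed from the hunk below, the internal helper is just the shift of the buffer's clock (a sketch of the added function, nothing more):

	static inline u64 rb_time_stamp(struct ring_buffer *buffer, int cpu)
	{
		/* shift to debug/test normalization and TIME_EXTENTS */
		return buffer->clock() << DEBUG_SHIFT;
	}

The exported ring_buffer_time_stamp() still wraps this call in the preempt_disable_notrace()/preempt_enable_no_resched_notrace() pair for external callers; the internal fast path simply skips that bracketing.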
[ Impact: a little less overhead to the ring buffer ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f452de2ce490..a9e645a5bc10 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -454,13 +454,18 @@ struct ring_buffer_iter { /* Up this if you want to test the TIME_EXTENTS and normalization */ #define DEBUG_SHIFT 0 +static inline u64 rb_time_stamp(struct ring_buffer *buffer, int cpu) +{ + /* shift to debug/test normalization and TIME_EXTENTS */ + return buffer->clock() << DEBUG_SHIFT; +} + u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu) { u64 time; preempt_disable_notrace(); - /* shift to debug/test normalization and TIME_EXTENTS */ - time = buffer->clock() << DEBUG_SHIFT; + time = rb_time_stamp(buffer, cpu); preempt_enable_no_resched_notrace(); return time; @@ -1247,7 +1252,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, cpu_buffer->tail_page = next_page; /* reread the time stamp */ - *ts = ring_buffer_time_stamp(buffer, cpu_buffer->cpu); + *ts = rb_time_stamp(buffer, cpu_buffer->cpu); cpu_buffer->tail_page->page->time_stamp = *ts; } @@ -1413,7 +1418,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) return NULL; - ts = ring_buffer_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu); + ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu); /* * Only the first commit can update the timestamp. -- cgit v1.2.2 From 168b6b1d0594c7866caa73b12f3b8d91075695f2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 11 May 2009 22:11:05 -0400 Subject: ring-buffer: move code around to remove some branches This is a bit of micro-optimizations. But since the ring buffer is used in tracing every function call, it is an extreme hot path. Every nanosecond counts. This change shows over 5% improvement in the ring-buffer-benchmark. [ Impact: more efficient code ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a9e645a5bc10..16b24d49604c 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1400,7 +1400,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, unsigned long length) { struct ring_buffer_event *event; - u64 ts, delta; + u64 ts, delta = 0; int commit = 0; int nr_loops = 0; @@ -1431,20 +1431,21 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page && rb_page_write(cpu_buffer->tail_page) == rb_commit_index(cpu_buffer))) { + u64 diff; - delta = ts - cpu_buffer->write_stamp; + diff = ts - cpu_buffer->write_stamp; - /* make sure this delta is calculated here */ + /* make sure this diff is calculated here */ barrier(); /* Did the write stamp get updated already? 
*/ if (unlikely(ts < cpu_buffer->write_stamp)) - delta = 0; + goto get_event; - else if (unlikely(test_time_stamp(delta))) { + delta = diff; + if (unlikely(test_time_stamp(delta))) { commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); - if (commit == -EBUSY) return NULL; @@ -1453,12 +1454,11 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, RB_WARN_ON(cpu_buffer, commit < 0); } - } else - /* Non commits have zero deltas */ - delta = 0; + } + get_event: event = __rb_reserve_next(cpu_buffer, 0, length, &ts); - if (PTR_ERR(event) == -EAGAIN) + if (unlikely(PTR_ERR(event) == -EAGAIN)) goto again; if (!event) { -- cgit v1.2.2 From 29a679754b1a2581ee456eada6c2de7ce95068bb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 14 May 2009 23:19:09 -0400 Subject: x86/stacktrace: return 0 instead of -1 for stack ops If we return -1 in the ops->stack for the stacktrace saving, we end up breaking out of the loop if the stack we are tracing is in the exception stack. This causes traces like: -0 [002] 34263.745825: raise_softirq_irqoff <-__blk_complete_request -0 [002] 34263.745826: <= 0 <= 0 <= 0 <= 0 <= 0 <= 0 <= 0 By returning "0" instead, the irq stack is saved as well, and we see: -0 [003] 883.280992: raise_softirq_irqoff <-__hrtimer_star t_range_ns -0 [003] 883.280992: <= hrtimer_start_range_ns <= tick_nohz_restart_sched_tick <= cpu_idle <= start_secondary <= <= 0 <= 0 [ Impact: record stacks from interrupts ] Signed-off-by: Steven Rostedt --- arch/x86/kernel/stacktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index f7bddc2e37d1..4aaf7e48394f 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -20,7 +20,7 @@ save_stack_warning_symbol(void *data, char *msg, unsigned long symbol) static int save_stack_stack(void *data, char *name) { - return -1; + return 0; } static void save_stack_address(void *data, unsigned long addr, int reliable) -- cgit v1.2.2 From 1ec7c4849c214fc78b023230264399836ea3b245 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 14 May 2009 23:40:06 -0400 Subject: tracing: stop stack trace on first empty entry The stack tracer stores eight entries in the ring buffer when an event traces the stack. The output outputs all eight entries regardless of how many entries were recorded. This patch breaks out of the loop when a null entry is discovered. 
[ Impact: only print the stack that is recorded ] Signed-off-by: Steven Rostedt --- kernel/trace/trace_output.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 8bd9a2c1a46a..489c0e8ada09 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -898,6 +898,8 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { + if (!field->caller[i]) + break; if (i) { if (!trace_seq_puts(s, " <= ")) goto partial; -- cgit v1.2.2 From 8cd995b6deedf98b7694ed32a786ee7f793d1eec Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 15 May 2009 11:07:27 +0800 Subject: tracing/filters: add missing unlock in a failure path [ Impact: fix deadlock in a rare case we fail to allocate memory ] Signed-off-by: Li Zefan LKML-Reference: <4A0CDC6F.7070200@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 85ad6a8939ad..22c29984fe0e 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1079,9 +1079,10 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) return 0; } + err = -ENOMEM; ps = kzalloc(sizeof(*ps), GFP_KERNEL); if (!ps) - return -ENOMEM; + goto out_unlock; filter_disable_preds(call); replace_filter_string(call->filter, filter_string); @@ -1101,7 +1102,7 @@ out: filter_opstack_clear(ps); postfix_clear(ps); kfree(ps); - +out_unlock: mutex_unlock(&filter_mutex); return err; @@ -1123,9 +1124,10 @@ int apply_subsystem_event_filter(struct event_subsystem *system, return 0; } + err = -ENOMEM; ps = kzalloc(sizeof(*ps), GFP_KERNEL); if (!ps) - return -ENOMEM; + goto out_unlock; filter_free_subsystem_preds(system); replace_filter_string(system->filter, filter_string); @@ -1145,7 +1147,7 @@ out: filter_opstack_clear(ps); postfix_clear(ps); kfree(ps); - +out_unlock: mutex_unlock(&filter_mutex); return err; -- cgit v1.2.2 From 5872144f64b34a5942f6b4acedc90b02de72c58b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 15 May 2009 11:07:56 +0800 Subject: tracing/filters: fix off-by-one bug We should leave the last slot for the ending '\0'. [ Impact: fix possible crash when the length of an operand is 128 ] Signed-off-by: Li Zefan LKML-Reference: <4A0CDC8C.30602@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 22c29984fe0e..a7430b16d243 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -736,7 +736,7 @@ static inline void clear_operand_string(struct filter_parse_state *ps) static inline int append_operand_char(struct filter_parse_state *ps, char c) { - if (ps->operand.tail == MAX_FILTER_STR_VAL) + if (ps->operand.tail == MAX_FILTER_STR_VAL - 1) return -EINVAL; ps->operand.string[ps->operand.tail++] = c; -- cgit v1.2.2 From 24ed0c4bfc7d2d7507bb9d50f7f3bbdcd85d76dd Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 17 May 2009 15:31:38 +0800 Subject: tracing: fix check for return value of register_module_notifier return zero should be correct, so fix it. 
[ Impact: eliminate incorrect syslog message ] Signed-off-by: Ming Lei Acked-by: Frederic Weisbecker Acked-by: Li Zefan Cc: rostedt@goodmis.org LKML-Reference: <1242545498-7285-1-git-send-email-tom.leiming@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5b606f45b6c4..140699a9a8a7 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2826,7 +2826,7 @@ void __init ftrace_init(void) __stop_mcount_loc); ret = register_module_notifier(&ftrace_module_nb); - if (!ret) + if (ret) pr_warning("Failed to register trace ftrace module notifier\n"); return; -- cgit v1.2.2 From 143c145e3a475065a4be661468d0df1bd0b25f74 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 19 May 2009 14:43:15 +0800 Subject: tracing/events: Documentation updates - fix some typos - document the difference between '>' and '>>' - document the 'enable' toggle - remove section "Defining an event-enabled tracepoint", since it's out-dated and sample/trace_events/ already serves this purpose. v2: add "Updated by Li Zefan" [ Impact: make documentation up-to-date ] Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: "Theodore Ts'o" LKML-Reference: <4A125503.5060406@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- Documentation/trace/events.txt | 159 +++++++++++++++-------------------------- 1 file changed, 57 insertions(+), 102 deletions(-) diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt index abdee664c0f6..f157d7594ea7 100644 --- a/Documentation/trace/events.txt +++ b/Documentation/trace/events.txt @@ -1,9 +1,10 @@ Event Tracing Documentation written by Theodore Ts'o + Updated by Li Zefan -Introduction -============ +1. Introduction +=============== Tracepoints (see Documentation/trace/tracepoints.txt) can be used without creating custom kernel modules to register probe functions @@ -12,30 +13,37 @@ using the event tracing infrastructure. Not all tracepoints can be traced using the event tracing system; the kernel developer must provide code snippets which define how the tracing information is saved into the tracing buffer, and how the -the tracing information should be printed. +tracing information should be printed. -Using Event Tracing -=================== +2. Using Event Tracing +====================== + +2.1 Via the 'set_event' interface +--------------------------------- The events which are available for tracing can be found in the file -/sys/kernel/debug/tracing/available_events. +/debug/tracing/available_events. To enable a particular event, such as 'sched_wakeup', simply echo it -to /sys/debug/tracing/set_event. For example: +to /debug/tracing/set_event. For example: - # echo sched_wakeup > /sys/kernel/debug/tracing/set_event + # echo sched_wakeup >> /debug/tracing/set_event -[ Note: events can also be enabled/disabled via the 'enabled' toggle - found in the /sys/kernel/tracing/events/ hierarchy of directories. ] +[ Note: '>>' is necessary, otherwise it will firstly disable + all the events. 
] To disable an event, echo the event name to the set_event file prefixed with an exclamation point: - # echo '!sched_wakeup' >> /sys/kernel/debug/tracing/set_event + # echo '!sched_wakeup' >> /debug/tracing/set_event + +To disable all events, echo an empty line to the set_event file: + + # echo > /debug/tracing/set_event -To disable events, echo an empty line to the set_event file: +To enable all events, echo '*:*' or '*:' to the set_event file: - # echo > /sys/kernel/debug/tracing/set_event + # echo *:* > /debug/tracing/set_event The events are organized into subsystems, such as ext4, irq, sched, etc., and a full event name looks like this: :. The @@ -44,92 +52,39 @@ file. All of the events in a subsystem can be specified via the syntax ":*"; for example, to enable all irq events, you can use the command: - # echo 'irq:*' > /sys/kernel/debug/tracing/set_event - -Defining an event-enabled tracepoint ------------------------------------- - -A kernel developer which wishes to define an event-enabled tracepoint -must declare the tracepoint using TRACE_EVENT instead of DECLARE_TRACE. -This is done via two header files in include/trace. For example, to -event-enable the jbd2 subsystem, we must create two files, -include/trace/jbd2.h and include/trace/jbd2_event_types.h. The -include/trace/jbd2.h file should be included by kernel source files that -will have a tracepoint inserted, and might look like this: - -#ifndef _TRACE_JBD2_H -#define _TRACE_JBD2_H - -#include -#include - -#include - -#endif - -In a file that utilizes a jbd2 tracepoint, this header file would be -included. Note that you still have to use DEFINE_TRACE(). So for -example, if fs/jbd2/commit.c planned to use the jbd2_start_commit -tracepoint, it would have the following near the beginning of the file: - -#include - -DEFINE_TRACE(jbd2_start_commit); - -Then in the function that would call the tracepoint, it would call the -tracepoint function. (For more information, please see the tracepoint -documentation in Documentation/trace/tracepoints.txt): - - trace_jbd2_start_commit(journal, commit_transaction); - -The code snippets which allow jbd2_start_commit to be an event-enabled -tracepoint are placed in the file include/trace/jbd2_event_types.h: - -/* use instead */ -#ifndef TRACE_EVENT -# error Do not include this file directly. -# error Unless you know what you are doing. -#endif - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM jbd2 - -#include - -TRACE_EVENT(jbd2_start_commit, - TP_PROTO(journal_t *journal, transaction_t *commit_transaction), - TP_ARGS(journal, commit_transaction), - TP_STRUCT__entry( - __array( char, devname, BDEVNAME_SIZE+24 ) - __field( int, transaction ) - ), - TP_fast_assign( - memcpy(__entry->devname, journal->j_devname, BDEVNAME_SIZE+24); - __entry->transaction = commit_transaction->t_tid; - ), - TP_printk("dev %s transaction %d", - __entry->devname, __entry->transaction) -); - -The TP_PROTO and TP_ARGS are unchanged from DECLARE_TRACE. The new -arguments to TRACE_EVENT are TP_STRUCT__entry, TP_fast_assign, and -TP_printk. - -TP_STRUCT__entry defines the data structure which will be stored in the -trace buffer. Normally, fields in __entry will be arrays or simple -types. It is possible to place data structures in __entry --- however, -pointers in the data structure can not be trusted, since they will be -accessed sometime later by TP_printk, and if the data structure contains -fields that will not or cannot be used by TP_printk, this will waste -space in the trace buffer. 
In general, data structures should be -avoided, unless they do only contain non-pointer types and all of the -fields will be used by TP_printk. - -TP_fast_assign defines the code snippet which saves information into the -__entry data structure, using the passed-in arguments defined in -TP_PROTO and TP_ARGS. - -Finally, TP_printk will print the __entry data structure. At the time -when the code snippet defined by TP_printk is executed, it will not have -access to the TP_ARGS arguments; it can only use the information saved -in the __entry data structure. + # echo 'irq:*' > /debug/tracing/set_event + +2.2 Via the 'enable' toggle +--------------------------- + +The events available are also listed in /debug/tracing/events/ hierarchy +of directories. + +To enable event 'sched_wakeup': + + # echo 1 > /debug/tracing/events/sched/sched_wakeup/enable + +To disable it: + + # echo 0 > /debug/tracing/events/sched/sched_wakeup/enable + +To enable all events in sched subsystem: + + # echo 1 > /debug/tracing/events/sched/enable + +To eanble all events: + + # echo 1 > /debug/tracing/events/enable + +When reading one of these enable files, there are four results: + + 0 - all events this file affects are disabled + 1 - all events this file affects are enabled + X - there is a mixture of events enabled and disabled + ? - this file does not affect any event + +3. Defining an event-enabled tracepoint +======================================= + +See The example provided in samples/trace_events + -- cgit v1.2.2 From fd51d251e4cdb21f68e9dbc4336514d64a105a79 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Tue, 19 May 2009 09:59:08 +0200 Subject: blktrace: remove debugfs entries on bad path debugfs directory entries for devices are not removed on some of the failure pathes in do_blk_trace_setup(). One way to reproduce is to start blktrace on multiple devices with insufficient Vmalloc space: Devices will fail with a message like this: BLKTRACESETUP(2) /dev/sdu failed: 5/Input/output error If so, the respective entries in debugfs (e.g. /sys/kernel/debug/block/sdu) will remain and subsequent attempts to start blktrace on the respective devices will not succeed due to existing directories. [ Impact: fix /debug/tracing file cleanup corner case ] Signed-off-by: Stefan Raspl Acked-by: Li Zefan Cc: Li Zefan Cc: schwidefsky@de.ibm.com Cc: heiko.carstens@de.ibm.com LKML-Reference: <4A1266CC.5040801@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 05b4747fd873..e3abf55bc8e5 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -262,6 +262,7 @@ static void blk_trace_free(struct blk_trace *bt) { debugfs_remove(bt->msg_file); debugfs_remove(bt->dropped_file); + debugfs_remove(bt->dir); relay_close(bt->rchan); free_percpu(bt->sequence); free_percpu(bt->msg_data); -- cgit v1.2.2 From 5537937696c55530447c20aa27daccb8d0d29b33 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 18 May 2009 23:04:46 +0800 Subject: ftrace: fix check for return value of register_module_notifier in event_trace_init register_module_notifier() returns zero in the success case. So fix the inverted fail case check in trace events modules handler. 
[ Impact: fix spurious warning on ftrace initialization] Reported-by: Li Zefan Signed-off-by: Ming Lei Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 0eec0c55dd87..9e91c4ad7c8b 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1174,7 +1174,7 @@ static __init int event_trace_init(void) } ret = register_module_notifier(&trace_module_nb); - if (!ret) + if (ret) pr_warning("Failed to register trace events module notifier\n"); return 0; -- cgit v1.2.2 From 4f5359685af6de7dca101393dc606620adbe963f Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 18 May 2009 19:35:34 +0800 Subject: tracing: add trace_event_read_lock() I found that there is nothing to protect event_hash in ftrace_find_event(). Rcu protects the event hashlist but not the event itself while we use it after its extraction through ftrace_find_event(). This lack of a proper locking in this spot opens a race window between any event dereferencing and module removal. Eg: --Task A-- print_trace_line(trace) { event = find_ftrace_event(trace) --Task B-- trace_module_remove_events(mod) { list_trace_events_module(ev, mod) { unregister_ftrace_event(ev->event) { hlist_del(ev->event->node) list_del(....) } } } |--> module removed, the event has been dropped --Task A-- event->print(trace); // Dereferencing freed memory If the event retrieved belongs to a module and this module is concurrently removed, we may end up dereferencing a data from a freed module. RCU could solve this, but it would add latency to the kernel and forbid tracers output callbacks to call any sleepable code. So this fix converts 'trace_event_mutex' to a read/write semaphore, and adds trace_event_read_lock() to protect ftrace_find_event(). [ Impact: fix possible freed memory dereference in ftrace ] Signed-off-by: Lai Jiangshan Acked-by: Steven Rostedt LKML-Reference: <4A114806.7090302@cn.fujitsu.com> Signed-off-by: Frederic Weisbecker --- kernel/trace/trace.c | 8 ++++++++ kernel/trace/trace_output.c | 25 ++++++++++++++++++------- kernel/trace/trace_output.h | 2 ++ 3 files changed, 28 insertions(+), 7 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dd40d2320346..02d32baa23ac 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1569,12 +1569,14 @@ static void *s_start(struct seq_file *m, loff_t *pos) p = s_next(m, p, &l); } + trace_event_read_lock(); return p; } static void s_stop(struct seq_file *m, void *p) { atomic_dec(&trace_record_cmdline_disabled); + trace_event_read_unlock(); } static void print_lat_help_header(struct seq_file *m) @@ -1817,6 +1819,7 @@ static int trace_empty(struct trace_iterator *iter) return 1; } +/* Called with trace_event_read_lock() held. */ static enum print_line_t print_trace_line(struct trace_iterator *iter) { enum print_line_t ret; @@ -3008,6 +3011,7 @@ waitagain: offsetof(struct trace_iterator, seq)); iter->pos = -1; + trace_event_read_lock(); while (find_next_entry_inc(iter) != NULL) { enum print_line_t ret; int len = iter->seq.len; @@ -3024,6 +3028,7 @@ waitagain: if (iter->seq.len >= cnt) break; } + trace_event_read_unlock(); /* Now copy what we have to the user */ sret = trace_seq_to_user(&iter->seq, ubuf, cnt); @@ -3146,6 +3151,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, goto out_err; } + trace_event_read_lock(); + /* Fill as many pages as possible. 
*/ for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) { pages[i] = alloc_page(GFP_KERNEL); @@ -3168,6 +3175,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, trace_seq_init(&iter->seq); } + trace_event_read_unlock(); mutex_unlock(&iter->mutex); spd.nr_pages = i; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 489c0e8ada09..7136420603aa 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -14,7 +14,7 @@ /* must be a power of 2 */ #define EVENT_HASHSIZE 128 -static DEFINE_MUTEX(trace_event_mutex); +static DECLARE_RWSEM(trace_event_mutex); static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; static int next_event_type = __TRACE_LAST_TYPE + 1; @@ -466,6 +466,7 @@ static int task_state_char(unsigned long state) * @type: the type of event to look for * * Returns an event of type @type otherwise NULL + * Called with trace_event_read_lock() held. */ struct trace_event *ftrace_find_event(int type) { @@ -475,7 +476,7 @@ struct trace_event *ftrace_find_event(int type) key = type & (EVENT_HASHSIZE - 1); - hlist_for_each_entry_rcu(event, n, &event_hash[key], node) { + hlist_for_each_entry(event, n, &event_hash[key], node) { if (event->type == type) return event; } @@ -513,6 +514,16 @@ static int trace_search_list(struct list_head **list) return last + 1; } +void trace_event_read_lock(void) +{ + down_read(&trace_event_mutex); +} + +void trace_event_read_unlock(void) +{ + up_read(&trace_event_mutex); +} + /** * register_ftrace_event - register output for an event type * @event: the event type to register @@ -533,7 +544,7 @@ int register_ftrace_event(struct trace_event *event) unsigned key; int ret = 0; - mutex_lock(&trace_event_mutex); + down_write(&trace_event_mutex); if (WARN_ON(!event)) goto out; @@ -581,11 +592,11 @@ int register_ftrace_event(struct trace_event *event) key = event->type & (EVENT_HASHSIZE - 1); - hlist_add_head_rcu(&event->node, &event_hash[key]); + hlist_add_head(&event->node, &event_hash[key]); ret = event->type; out: - mutex_unlock(&trace_event_mutex); + up_write(&trace_event_mutex); return ret; } @@ -597,10 +608,10 @@ EXPORT_SYMBOL_GPL(register_ftrace_event); */ int unregister_ftrace_event(struct trace_event *event) { - mutex_lock(&trace_event_mutex); + down_write(&trace_event_mutex); hlist_del(&event->node); list_del(&event->list); - mutex_unlock(&trace_event_mutex); + up_write(&trace_event_mutex); return 0; } diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 6e220a8e5706..ac240e76eb01 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -20,6 +20,8 @@ extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, extern int trace_print_context(struct trace_iterator *iter); extern int trace_print_lat_context(struct trace_iterator *iter); +extern void trace_event_read_lock(void); +extern void trace_event_read_unlock(void); extern struct trace_event *ftrace_find_event(int type); extern enum print_line_t trace_nop_print(struct trace_iterator *iter, -- cgit v1.2.2 From b0aae68cc5508f3c2fbf728988c954db4c8b8a53 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 21 May 2009 13:59:18 +0800 Subject: tracing/events: change the type of __str_loc_item to unsigned short When defining a dynamic size string, we add __str_loc_##item to the trace entry, and it stores the location of the actual string in entry->_str_data[] 'unsigned short' should be sufficient to store this information, thus we save 2 bytes per dyn-size string in the ring buffer. 
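As a sketch of the effect on a generated entry (a hypothetical event with a single __string() field, not the exact macro output):

	struct ftrace_raw_example {
		struct trace_entry	ent;
		unsigned short		__str_loc_msg;	/* was: int */
		char			__str_data[0];	/* actual string bytes */
	};

Since the stored value is only an offset to the string within the entry, 16 bits is plenty, and every dynamic string field costs 2 bytes less of ring buffer space per recorded event.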
[ Impact: reduce memory occupied by dyn-size strings in ring buffer ] Signed-off-by: Li Zefan Cc: Steven Rostedt LKML-Reference: <4A14EDB6.2050507@cn.fujitsu.com> Signed-off-by: Frederic Weisbecker --- include/trace/ftrace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index edb02bc9f8ff..b5ff2e8229ec 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -25,7 +25,7 @@ #define __field(type, item) type item; #undef __string -#define __string(item, src) int __str_loc_##item; +#define __string(item, src) unsigned short __str_loc_##item; #undef TP_STRUCT__entry #define TP_STRUCT__entry(args...) args -- cgit v1.2.2 From 29fcefba8a2f0fea11e2b721fe174a1832801284 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Sun, 24 May 2009 11:13:17 +0300 Subject: kmemtrace: fix kernel parameter documentation The kmemtrace.enable kernel parameter no longer works. To enable kmemtrace at boot-time, you must pass "ftrace=kmemtrace" instead. [ Impact: remove obsolete kernel parameter documentation ] Cc: Eduard - Gabriel Munteanu Signed-off-by: Pekka Enberg LKML-Reference: Signed-off-by: Frederic Weisbecker --- Documentation/kernel-parameters.txt | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index e87bdbfbcc75..9243dd84f4d6 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -56,7 +56,6 @@ parameter is applicable: ISAPNP ISA PnP code is enabled. ISDN Appropriate ISDN support is enabled. JOY Appropriate joystick support is enabled. - KMEMTRACE kmemtrace is enabled. LIBATA Libata driver is enabled LP Printer support is enabled. LOOP Loopback device support is enabled. @@ -1054,15 +1053,6 @@ and is between 256 and 4096 characters. It is defined in the file use the HighMem zone if it exists, and the Normal zone if it does not. - kmemtrace.enable= [KNL,KMEMTRACE] Format: { yes | no } - Controls whether kmemtrace is enabled - at boot-time. - - kmemtrace.subbufs=n [KNL,KMEMTRACE] Overrides the number of - subbufs kmemtrace's relay channel has. Set this - higher than default (KMEMTRACE_N_SUBBUFS in code) if - you experience buffer overruns. - kgdboc= [HW] kgdb over consoles. Requires a tty driver that supports console polling. (only serial suported for now) -- cgit v1.2.2 From b11c53e12f94a46b50bccc7a1a953d7ca1d54a31 Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Mon, 25 May 2009 18:11:59 +0800 Subject: ftrace: Add task_comm support for trace_event If we enable a trace event alone without any tracer running (such as function tracer, sched switch tracer, etc...) it can't output enough task command information. We need to use the tracing_{start/stop}_cmdline_record() helpers which are designed to keep track of cmdlines for any tasks that were scheduled during the tracing. 
Before this patch: # echo 1 > debugfs/tracing/events/sched/sched_switch/enable # cat debugfs/tracing/trace # tracer: nop # # TASK-PID CPU# TIMESTAMP FUNCTION # | | | | | <...>-2289 [000] 526276.724790: sched_switch: task bash:2289 [120] ==> sshd:2287 [120] <...>-2287 [000] 526276.725231: sched_switch: task sshd:2287 [120] ==> bash:2289 [120] <...>-2289 [000] 526276.725452: sched_switch: task bash:2289 [120] ==> sshd:2287 [120] <...>-2287 [000] 526276.727181: sched_switch: task sshd:2287 [120] ==> swapper:0 [140] -0 [000] 526277.032734: sched_switch: task swapper:0 [140] ==> events/0:5 [115] <...>-5 [000] 526277.032782: sched_switch: task events/0:5 [115] ==> swapper:0 [140] ... After this patch: # tracer: nop # # TASK-PID CPU# TIMESTAMP FUNCTION # | | | | | bash-2269 [000] 527347.989229: sched_switch: task bash:2269 [120] ==> sshd:2267 [120] sshd-2267 [000] 527347.990960: sched_switch: task sshd:2267 [120] ==> bash:2269 [120] bash-2269 [000] 527347.991143: sched_switch: task bash:2269 [120] ==> sshd:2267 [120] sshd-2267 [000] 527347.992959: sched_switch: task sshd:2267 [120] ==> swapper:0 [140] -0 [000] 527348.531989: sched_switch: task swapper:0 [140] ==> events/0:5 [115] events/0-5 [000] 527348.532115: sched_switch: task events/0:5 [115] ==> swapper:0 [140] ... Changelog: v1->v2: Update Kconfig to select CONTEXT_SWITCH_TRACER in ENABLE_EVENT_TRACING v2->v3: v2 can solve problem that was caused by config EVENT_TRACING alone, but when CONFIG_FTRACE is off and CONFIG_TRACING is selected by other config, compile fail happened again. This version solves it. [ Impact: fix incomplete output of event tracing ] Signed-off-by: Zhao Lei Cc: Tom Zanussi Cc: Steven Rostedt LKML-Reference: <4A14FDFE.2080402@cn.fujitsu.com> Signed-off-by: Frederic Weisbecker --- kernel/trace/Kconfig | 9 +++++++-- kernel/trace/trace_events.c | 6 ++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index f61be3015783..a508b9d2adb8 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -49,6 +49,11 @@ config FTRACE_NMI_ENTER default y config EVENT_TRACING + select CONTEXT_SWITCH_TRACER + bool + +config CONTEXT_SWITCH_TRACER + select MARKERS bool config TRACING @@ -176,10 +181,10 @@ config SCHED_TRACER This tracer tracks the latency of the highest priority task to be scheduled in, starting from the point it has woken up. -config CONTEXT_SWITCH_TRACER +config ENABLE_CONTEXT_SWITCH_TRACER bool "Trace process context switches" select TRACING - select MARKERS + select CONTEXT_SWITCH_TRACER help This tracer gets called from the context switch and records all switching of tasks. 
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 9e91c4ad7c8b..9b246eb01d5f 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -85,6 +85,7 @@ static void ftrace_clear_events(void) if (call->enabled) { call->enabled = 0; + tracing_stop_cmdline_record(); call->unregfunc(); } } @@ -99,12 +100,14 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call, case 0: if (call->enabled) { call->enabled = 0; + tracing_stop_cmdline_record(); call->unregfunc(); } break; case 1: if (!call->enabled) { call->enabled = 1; + tracing_start_cmdline_record(); call->regfunc(); } break; @@ -1058,6 +1061,7 @@ static void trace_module_remove_events(struct module *mod) found = true; if (call->enabled) { call->enabled = 0; + tracing_stop_cmdline_record(); call->unregfunc(); } if (call->event) @@ -1262,11 +1266,13 @@ static __init void event_trace_self_tests(void) } call->enabled = 1; + tracing_start_cmdline_record(); call->regfunc(); event_test_stuff(); call->unregfunc(); + tracing_stop_cmdline_record(); call->enabled = 0; pr_cont("OK\n"); -- cgit v1.2.2 From 0e907c99391362385c8e3af2c43b904dd1fd5d73 Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Mon, 25 May 2009 18:13:59 +0800 Subject: ftrace: clean up of using ftrace_event_enable_disable() Always use ftrace_event_enable_disable() to enable/disable an event so that we can factorize out the event toggling code. [ Impact: factorize and cleanup event tracing code ] Signed-off-by: Zhao Lei Cc: Steven Rostedt Cc: Tom Zanussi LKML-Reference: <4A14FDFE.2080402@cn.fujitsu.com> Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_events.c | 44 ++++++++++++++------------------------------ 1 file changed, 14 insertions(+), 30 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 9b246eb01d5f..6c81f9c21426 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -76,26 +76,9 @@ static void trace_destroy_fields(struct ftrace_event_call *call) #endif /* CONFIG_MODULES */ -static void ftrace_clear_events(void) -{ - struct ftrace_event_call *call; - - mutex_lock(&event_mutex); - list_for_each_entry(call, &ftrace_events, list) { - - if (call->enabled) { - call->enabled = 0; - tracing_stop_cmdline_record(); - call->unregfunc(); - } - } - mutex_unlock(&event_mutex); -} - static void ftrace_event_enable_disable(struct ftrace_event_call *call, int enable) { - switch (enable) { case 0: if (call->enabled) { @@ -114,6 +97,17 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call, } } +static void ftrace_clear_events(void) +{ + struct ftrace_event_call *call; + + mutex_lock(&event_mutex); + list_for_each_entry(call, &ftrace_events, list) { + ftrace_event_enable_disable(call, 0); + } + mutex_unlock(&event_mutex); +} + /* * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. 
*/ @@ -1059,11 +1053,7 @@ static void trace_module_remove_events(struct module *mod) list_for_each_entry_safe(call, p, &ftrace_events, list) { if (call->mod == mod) { found = true; - if (call->enabled) { - call->enabled = 0; - tracing_stop_cmdline_record(); - call->unregfunc(); - } + ftrace_event_enable_disable(call, 0); if (call->event) unregister_ftrace_event(call->event); debugfs_remove_recursive(call->dir); @@ -1265,15 +1255,9 @@ static __init void event_trace_self_tests(void) continue; } - call->enabled = 1; - tracing_start_cmdline_record(); - call->regfunc(); - + ftrace_event_enable_disable(call, 1); event_test_stuff(); - - call->unregfunc(); - tracing_stop_cmdline_record(); - call->enabled = 0; + ftrace_event_enable_disable(call, 0); pr_cont("OK\n"); } -- cgit v1.2.2 From be74b73a57645cc253d881ab0c1014eb64b9cf22 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 26 May 2009 20:25:22 +0200 Subject: tracing: add __print_flags for events Developers have been asking for the ability in the ftrace event tracer to display names of bits in a flags variable. Instead of printing out c2, it would be easier to read FOO|BAR|GOO, assuming that FOO is bit 1, BAR is bit 6 and GOO is bit 7. Some examples where this would be useful are the state flags in a context switch, kmalloc flags, and even permision flags in accessing files. [ v2 changes include: Frederic Weisbecker's idea of using a mask instead of bits, thus we can output GFP_KERNEL instead of GPF_WAIT|GFP_IO|GFP_FS. Li Zefan's idea of allowing the caller of __print_flags to add their own delimiter (or no delimiter) where we can get for file permissions rwx instead of r|w|x. ] [ v3 changes: Christoph Hellwig's idea of using an array instead of va_args. ] [ Impact: better displaying of flags in trace output ] Signed-off-by: Steven Rostedt Signed-off-by: Frederic Weisbecker --- include/linux/ftrace_event.h | 13 ++++++++++++- include/trace/ftrace.h | 14 ++++++++++++++ kernel/trace/trace_output.c | 39 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 65 insertions(+), 1 deletion(-) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index bae51ddfabd3..4b58cf1a11c2 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -3,12 +3,23 @@ #include #include - +#include struct trace_array; struct tracer; struct dentry; +DECLARE_PER_CPU(struct trace_seq, ftrace_event_seq); + +struct trace_print_flags { + unsigned long mask; + const char *name; +}; + +const char *ftrace_print_flags_seq(struct trace_seq *p, const char *delim, + unsigned long flags, + const struct trace_print_flags *flag_array); + /* * The trace entry - the most basic unit of tracing. This is what * is printed in the end as a single line in the trace output, such as: diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index b5ff2e8229ec..22c94719c569 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -87,6 +87,7 @@ * struct trace_seq *s = &iter->seq; * struct ftrace_raw_ *field; <-- defined in stage 1 * struct trace_entry *entry; + * struct trace_seq *p; * int ret; * * entry = iter->ent; @@ -98,7 +99,9 @@ * * field = (typeof(field))entry; * + * p = get_cpu_var(ftrace_event_seq); * ret = trace_seq_printf(s, "\n"); + * put_cpu(); * if (!ret) * return TRACE_TYPE_PARTIAL_LINE; * @@ -119,6 +122,14 @@ #undef __get_str #define __get_str(field) ((char *)__entry + __entry->__str_loc_##field) +#undef __print_flags +#define __print_flags(flag, delim, flag_array...) 
\ + ({ \ + static const struct trace_print_flags flags[] = \ + { flag_array, { -1, NULL }}; \ + ftrace_print_flags_seq(p, delim, flag, flags); \ + }) + #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ enum print_line_t \ @@ -127,6 +138,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ struct trace_seq *s = &iter->seq; \ struct ftrace_raw_##call *field; \ struct trace_entry *entry; \ + struct trace_seq *p; \ int ret; \ \ entry = iter->ent; \ @@ -138,7 +150,9 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ \ field = (typeof(field))entry; \ \ + p = &get_cpu_var(ftrace_event_seq); \ ret = trace_seq_printf(s, #call ": " print); \ + put_cpu(); \ if (!ret) \ return TRACE_TYPE_PARTIAL_LINE; \ \ diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 7136420603aa..a4840c260c89 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -15,6 +15,9 @@ #define EVENT_HASHSIZE 128 static DECLARE_RWSEM(trace_event_mutex); + +DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq); + static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; static int next_event_type = __TRACE_LAST_TYPE + 1; @@ -212,6 +215,42 @@ int trace_seq_path(struct trace_seq *s, struct path *path) return 0; } +const char * +ftrace_print_flags_seq(struct trace_seq *p, const char *delim, + unsigned long flags, + const struct trace_print_flags *flag_array) +{ + unsigned long mask; + const char *str; + int i; + + trace_seq_init(p); + + for (i = 0; flag_array[i].name && flags; i++) { + + mask = flag_array[i].mask; + if ((flags & mask) != mask) + continue; + + str = flag_array[i].name; + flags &= ~mask; + if (p->len && delim) + trace_seq_puts(p, delim); + trace_seq_puts(p, str); + } + + /* check for left over flags */ + if (flags) { + if (p->len && delim) + trace_seq_puts(p, delim); + trace_seq_printf(p, "0x%lx", flags); + } + + trace_seq_putc(p, 0); + + return p->buffer; +} + #ifdef CONFIG_KRETPROBES static inline const char *kretprobed(const char *name) { -- cgit v1.2.2 From 937cdb9db7f59278d0cb1582e6e64e3dfd73b4fc Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 15 May 2009 10:51:13 -0400 Subject: tracing: add previous task state info to sched switch event It is useful to see the state of a task that is being switched out. This patch adds the output of the state of the previous task in the context switch event. 
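With the extra field and the flag table below, a switch away from a task sleeping interruptibly (prev->state == 1) would render roughly as follows; the pids and timestamp are illustrative, and a runnable task keeps printing plain "R":

	bash-2269 [000] 527349.000001: sched_switch: task bash:2269 [120] (S) ==> sshd:2267 [120]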
[ Impact: see state of switched out task in context switch ] Signed-off-by: Steven Rostedt Signed-off-by: Frederic Weisbecker --- include/trace/events/sched.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index dd4033cf5b09..24ab5bcff7b2 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -156,6 +156,7 @@ TRACE_EVENT(sched_switch, __array( char, prev_comm, TASK_COMM_LEN ) __field( pid_t, prev_pid ) __field( int, prev_prio ) + __field( long, prev_state ) __array( char, next_comm, TASK_COMM_LEN ) __field( pid_t, next_pid ) __field( int, next_prio ) @@ -165,13 +166,19 @@ TRACE_EVENT(sched_switch, memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); __entry->prev_pid = prev->pid; __entry->prev_prio = prev->prio; + __entry->prev_state = prev->state; memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); __entry->next_pid = next->pid; __entry->next_prio = next->prio; ), - TP_printk("task %s:%d [%d] ==> %s:%d [%d]", + TP_printk("task %s:%d [%d] (%s) ==> %s:%d [%d]", __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, + __entry->prev_state ? + __print_flags(__entry->prev_state, "|", + { 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" }, + { 16, "Z" }, { 32, "X" }, { 64, "x" }, + { 128, "W" }) : "R", __entry->next_comm, __entry->next_pid, __entry->next_prio) ); -- cgit v1.2.2 From 62ba180e80f4194a498585ac0e4c07daa8ca08d1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 15 May 2009 16:16:30 -0400 Subject: tracing: add flag output for kmem events This patch changes the output for gfp_flags from being a simple hex value to the actual names. gfp_flags=GFP_ATOMIC instead of gfp_flags=00000020 And even gfp_flags=GFP_KERNEL instead of gfp_flags=000000d0 (Thanks to Frederic Weisbecker for pointing out that the first version had a bad order of GFP masks) [ Impact: more human readable output from tracer ] Acked-by: Eduard - Gabriel Munteanu Signed-off-by: Steven Rostedt Signed-off-by: Frederic Weisbecker --- include/trace/events/kmem.h | 53 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 45 insertions(+), 8 deletions(-) diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index c22c42f980b5..9baba50d6512 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -7,6 +7,43 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM kmem +/* + * The order of these masks is important. Matching masks will be seen + * first and the left over flags will end up showing by themselves. + * + * For example, if we have GFP_KERNEL before GFP_USER we wil get: + * + * GFP_KERNEL|GFP_HARDWALL + * + * Thus most bits set go first. + */ +#define show_gfp_flags(flags) \ + (flags) ? 
__print_flags(flags, "|", \ + {(unsigned long)GFP_HIGHUSER_MOVABLE, "GFP_HIGHUSER_MOVABLE"}, \ + {(unsigned long)GFP_HIGHUSER, "GFP_HIGHUSER"}, \ + {(unsigned long)GFP_USER, "GFP_USER"}, \ + {(unsigned long)GFP_TEMPORARY, "GFP_TEMPORARY"}, \ + {(unsigned long)GFP_KERNEL, "GFP_KERNEL"}, \ + {(unsigned long)GFP_NOFS, "GFP_NOFS"}, \ + {(unsigned long)GFP_ATOMIC, "GFP_ATOMIC"}, \ + {(unsigned long)GFP_NOIO, "GFP_NOIO"}, \ + {(unsigned long)__GFP_HIGH, "GFP_HIGH"}, \ + {(unsigned long)__GFP_WAIT, "GFP_WAIT"}, \ + {(unsigned long)__GFP_IO, "GFP_IO"}, \ + {(unsigned long)__GFP_COLD, "GFP_COLD"}, \ + {(unsigned long)__GFP_NOWARN, "GFP_NOWARN"}, \ + {(unsigned long)__GFP_REPEAT, "GFP_REPEAT"}, \ + {(unsigned long)__GFP_NOFAIL, "GFP_NOFAIL"}, \ + {(unsigned long)__GFP_NORETRY, "GFP_NORETRY"}, \ + {(unsigned long)__GFP_COMP, "GFP_COMP"}, \ + {(unsigned long)__GFP_ZERO, "GFP_ZERO"}, \ + {(unsigned long)__GFP_NOMEMALLOC, "GFP_NOMEMALLOC"}, \ + {(unsigned long)__GFP_HARDWALL, "GFP_HARDWALL"}, \ + {(unsigned long)__GFP_THISNODE, "GFP_THISNODE"}, \ + {(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \ + {(unsigned long)__GFP_MOVABLE, "GFP_MOVABLE"} \ + ) : "GFP_NOWAIT" + TRACE_EVENT(kmalloc, TP_PROTO(unsigned long call_site, @@ -33,12 +70,12 @@ TRACE_EVENT(kmalloc, __entry->gfp_flags = gfp_flags; ), - TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x", + TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s", __entry->call_site, __entry->ptr, __entry->bytes_req, __entry->bytes_alloc, - __entry->gfp_flags) + show_gfp_flags(__entry->gfp_flags)) ); TRACE_EVENT(kmem_cache_alloc, @@ -67,12 +104,12 @@ TRACE_EVENT(kmem_cache_alloc, __entry->gfp_flags = gfp_flags; ), - TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x", + TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s", __entry->call_site, __entry->ptr, __entry->bytes_req, __entry->bytes_alloc, - __entry->gfp_flags) + show_gfp_flags(__entry->gfp_flags)) ); TRACE_EVENT(kmalloc_node, @@ -104,12 +141,12 @@ TRACE_EVENT(kmalloc_node, __entry->node = node; ), - TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d", + TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d", __entry->call_site, __entry->ptr, __entry->bytes_req, __entry->bytes_alloc, - __entry->gfp_flags, + show_gfp_flags(__entry->gfp_flags), __entry->node) ); @@ -142,12 +179,12 @@ TRACE_EVENT(kmem_cache_alloc_node, __entry->node = node; ), - TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d", + TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d", __entry->call_site, __entry->ptr, __entry->bytes_req, __entry->bytes_alloc, - __entry->gfp_flags, + show_gfp_flags(__entry->gfp_flags), __entry->node) ); -- cgit v1.2.2 From 0f4fc29dd68dfab9c6ddd5d087d34a5b6818cb00 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 20 May 2009 19:21:47 -0400 Subject: tracing: add __print_symbolic to trace events This patch adds __print_symbolic which is similar to __print_flags but works for an enumeration type instead. That is, there is only a one to one mapping between the values and the symbols. When a match is made, then it is printed, otherwise the hex value is outputed. 
[ Impact: add interface for showing symbol names in events ] Signed-off-by: Steven Rostedt Signed-off-by: Frederic Weisbecker --- include/linux/ftrace_event.h | 3 +++ include/trace/ftrace.h | 8 ++++++++ kernel/trace/trace_output.c | 25 +++++++++++++++++++++++++ 3 files changed, 36 insertions(+) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 4b58cf1a11c2..bbf40f624fc8 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -20,6 +20,9 @@ const char *ftrace_print_flags_seq(struct trace_seq *p, const char *delim, unsigned long flags, const struct trace_print_flags *flag_array); +const char *ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, + const struct trace_print_flags *symbol_array); + /* * The trace entry - the most basic unit of tracing. This is what * is printed in the end as a single line in the trace output, such as: diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 22c94719c569..87fc227c6fbe 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -130,6 +130,14 @@ ftrace_print_flags_seq(p, delim, flag, flags); \ }) +#undef __print_symbolic +#define __print_symbolic(value, symbol_array...) \ + ({ \ + static const struct trace_print_flags symbols[] = \ + { symbol_array, { -1, NULL }}; \ + ftrace_print_symbols_seq(p, value, symbols); \ + }) + #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ enum print_line_t \ diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index a4840c260c89..c12d95db2f56 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -251,6 +251,31 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim, return p->buffer; } +const char * +ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, + const struct trace_print_flags *symbol_array) +{ + int i; + + trace_seq_init(p); + + for (i = 0; symbol_array[i].name; i++) { + + if (val != symbol_array[i].mask) + continue; + + trace_seq_puts(p, symbol_array[i].name); + break; + } + + if (!p->len) + trace_seq_printf(p, "0x%lx", val); + + trace_seq_putc(p, 0); + + return p->buffer; +} + #ifdef CONFIG_KRETPROBES static inline const char *kretprobed(const char *name) { -- cgit v1.2.2 From c2adae0970ca1db8adb92fb56ae3bcabd916e8bd Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 20 May 2009 19:56:19 -0400 Subject: tracing: convert irq events to use __print_symbolic The recording of the names at trace time is inefficient. This patch implements the softirq event recording to only record the vector and then use the __print_symbolic interface to print out the names. 
[ Impact: faster recording of softirq events ] Signed-off-by: Steven Rostedt Signed-off-by: Frederic Weisbecker --- include/trace/events/irq.h | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h index 32a9f7ef432b..683fb36a9943 100644 --- a/include/trace/events/irq.h +++ b/include/trace/events/irq.h @@ -7,6 +7,19 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM irq +#define softirq_name(sirq) { sirq, #sirq } +#define show_softirq_name(val) \ + __print_symbolic(val, \ + softirq_name(HI_SOFTIRQ), \ + softirq_name(TIMER_SOFTIRQ), \ + softirq_name(NET_TX_SOFTIRQ), \ + softirq_name(NET_RX_SOFTIRQ), \ + softirq_name(BLOCK_SOFTIRQ), \ + softirq_name(TASKLET_SOFTIRQ), \ + softirq_name(SCHED_SOFTIRQ), \ + softirq_name(HRTIMER_SOFTIRQ), \ + softirq_name(RCU_SOFTIRQ)) + /** * irq_handler_entry - called immediately before the irq action handler * @irq: irq number @@ -87,15 +100,14 @@ TRACE_EVENT(softirq_entry, TP_STRUCT__entry( __field( int, vec ) - __string( name, softirq_to_name[h-vec] ) ), TP_fast_assign( __entry->vec = (int)(h - vec); - __assign_str(name, softirq_to_name[h-vec]); ), - TP_printk("softirq=%d action=%s", __entry->vec, __get_str(name)) + TP_printk("softirq=%d action=%s", __entry->vec, + show_softirq_name(__entry->vec)) ); /** @@ -117,15 +129,14 @@ TRACE_EVENT(softirq_exit, TP_STRUCT__entry( __field( int, vec ) - __string( name, softirq_to_name[h-vec] ) ), TP_fast_assign( __entry->vec = (int)(h - vec); - __assign_str(name, softirq_to_name[h-vec]); ), - TP_printk("softirq=%d action=%s", __entry->vec, __get_str(name)) + TP_printk("softirq=%d action=%s", __entry->vec, + show_softirq_name(__entry->vec)) ); #endif /* _TRACE_IRQ_H */ -- cgit v1.2.2 From 5b6045a906f48d37591365c5dcdd6d1d146bfd4a Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 26 May 2009 17:28:02 +0200 Subject: trace: disable preemption before taking raw spinlocks s390 code uses smp_processor_id() in __raw_spin_lock() code which reveals that a (raw) spinlock is taken without preemption disabled. This can potentially deadlock. To fix this explicitly disable and enable preemption. 
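The pattern being enforced, as a sketch: __raw_spin_lock() does not touch the preempt count the way spin_lock() does, so a caller that can run in preemptible context must bracket the critical section itself:

	preempt_disable();
	__raw_spin_lock(&trace_cmdline_lock);
	/* ... look up or update the cmdline map ... */
	__raw_spin_unlock(&trace_cmdline_lock);
	preempt_enable();

The splat below is what CONFIG_DEBUG_PREEMPT reports when the lock is taken without that bracketing.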
BUG: using smp_processor_id() in preemptible [00000000] code: cat/2278 caller is trace_find_cmdline+0x40/0xfc CPU: 0 Not tainted 2.6.30-rc7-dirty #39 Process cat (pid: 2278, task: 000000003faedb68, ksp: 000000003b33b988) 000000003b33b988 000000003b33bae0 0000000000000002 0000000000000000 000000003b33bb80 000000003b33baf8 000000003b33baf8 00000000000175d6 0000000000000001 000000003b33b988 000000003f9b0000 000000000000000b 000000000000000c 000000003b33bb40 000000003b33bae0 0000000000000000 0000000000000000 00000000000175d6 000000003b33bae0 000000003b33bb28 Call Trace: ([<00000000000174b2>] show_trace+0x112/0x170) [<0000000000017582>] show_stack+0x72/0x100 [<0000000000441538>] dump_stack+0xc8/0xd8 [<000000000025c350>] debug_smp_processor_id+0x114/0x130 [<00000000000bf0e4>] trace_find_cmdline+0x40/0xfc [<00000000000c35d4>] trace_print_context+0x58/0xac [<00000000000bb676>] print_trace_line+0x416/0x470 [<00000000000bc8fe>] s_show+0x4e/0x428 [<000000000013834e>] seq_read+0x36a/0x5d4 [<0000000000112a78>] vfs_read+0xc8/0x174 [<0000000000112c58>] SyS_read+0x74/0xc4 [<000000000002c7ae>] sysc_noemu+0x10/0x16 [<000002000012436c>] 0x2000012436c 1 lock held by cat/2278: #0: (&p->lock){+.+.+.}, at: [<0000000000138056>] seq_read+0x72/0x5d4 [ Impact: fix preempt-unsafe raw spinlock ] Signed-off-by: Heiko Carstens Acked-by: Steven Rostedt Signed-off-by: Frederic Weisbecker --- kernel/trace/trace.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 02d32baa23ac..a3a8a87d7e91 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -808,6 +808,7 @@ void trace_find_cmdline(int pid, char comm[]) return; } + preempt_disable(); __raw_spin_lock(&trace_cmdline_lock); map = map_pid_to_cmdline[pid]; if (map != NO_CMDLINE_MAP) @@ -816,6 +817,7 @@ void trace_find_cmdline(int pid, char comm[]) strcpy(comm, "<...>"); __raw_spin_unlock(&trace_cmdline_lock); + preempt_enable(); } void tracing_record_cmdline(struct task_struct *tsk) -- cgit v1.2.2 From f2aebaee653a35b01c3665de2cbb1e31456b8ea8 Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Wed, 27 May 2009 21:36:02 +0800 Subject: ftrace: don't convert function's local variable name in macro "call" is an argument of macro, but it is also used as a local variable name of function in macro. We should keep this local variable name distinct from any CPP macro parameter name if both are in the same macro scope, although it hasn't caused any problem yet. 
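A minimal sketch of the hazard (the macro and helper names here are made up for illustration): because "call" is also the macro parameter, the preprocessor substitutes the argument into the declaration, so the local variable ends up named after the event rather than "call":

	/* fragile: DEFINE_EVENT_FUNC(sched_switch) expands the local to
	 * "struct ftrace_event_call *sched_switch = &event_sched_switch;" */
	#define DEFINE_EVENT_FUNC(call)					\
	static void ftrace_raw_event_##call(void)			\
	{								\
		struct ftrace_event_call *call = &event_##call;		\
		do_filter(call);					\
	}

Renaming the local to event_call, as this patch does, keeps the expansion predictable and the source readable.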
[ Impact: robustify macro ] Signed-off-by: Zhao Lei Acked-by: Steven Rostedt Signed-off-by: Frederic Weisbecker --- include/trace/ftrace.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 87fc227c6fbe..b4ec83ae711f 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -397,19 +397,19 @@ static void ftrace_profile_##call(proto) \ perf_tpcounter_event(event_##call.id); \ } \ \ -static int ftrace_profile_enable_##call(struct ftrace_event_call *call) \ +static int ftrace_profile_enable_##call(struct ftrace_event_call *event_call) \ { \ int ret = 0; \ \ - if (!atomic_inc_return(&call->profile_count)) \ + if (!atomic_inc_return(&event_call->profile_count)) \ ret = register_trace_##call(ftrace_profile_##call); \ \ return ret; \ } \ \ -static void ftrace_profile_disable_##call(struct ftrace_event_call *call) \ +static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\ { \ - if (atomic_add_negative(-1, &call->profile_count)) \ + if (atomic_add_negative(-1, &event_call->profile_count)) \ unregister_trace_##call(ftrace_profile_##call); \ } @@ -433,9 +433,9 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *call) \ #define __array(type, item, len) #undef __string -#define __string(item, src) \ - __str_offsets.item = __str_size + \ - offsetof(typeof(*entry), __str_data); \ +#define __string(item, src) \ + __str_offsets.item = __str_size + \ + offsetof(typeof(*entry), __str_data); \ __str_size += strlen(src) + 1; #undef __assign_str @@ -451,8 +451,8 @@ static struct ftrace_event_call event_##call; \ \ static void ftrace_raw_event_##call(proto) \ { \ - struct ftrace_str_offsets_##call __maybe_unused __str_offsets; \ - struct ftrace_event_call *call = &event_##call; \ + struct ftrace_str_offsets_##call __maybe_unused __str_offsets; \ + struct ftrace_event_call *event_call = &event_##call; \ struct ring_buffer_event *event; \ struct ftrace_raw_##call *entry; \ unsigned long irq_flags; \ @@ -473,7 +473,7 @@ static void ftrace_raw_event_##call(proto) \ \ assign; \ \ - if (!filter_current_check_discard(call, entry, event)) \ + if (!filter_current_check_discard(event_call, entry, event)) \ trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \ } \ \ -- cgit v1.2.2 From fb39125fd79a25c5002f3b45cf4c80e3fa6b961b Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Fri, 17 Apr 2009 15:15:51 +0800 Subject: ftrace, workqueuetrace: make workqueue tracepoints use TRACE_EVENT macro v3: zhaolei@cn.fujitsu.com: Change TRACE_EVENT definition to new format introduced by Steven Rostedt: consolidate trace and trace_event headers v2: kosaki@jp.fujitsu.com: print the function names instead of addr, and zap the work addr v1: zhaolei@cn.fujitsu.com: Make workqueue tracepoints use TRACE_EVENT macro TRACE_EVENT is a more generic way to define tracepoints. Doing so adds these new capabilities to the tracepoints: - zero-copy and per-cpu splice() tracing - binary tracing without printf overhead - structured logging records exposed under /debug/tracing/events - trace events embedded in function tracer output and other plugins - user-defined, per tracepoint filter expressions Then, this patch converts DEFINE_TRACE to TRACE_EVENT in workqueue related tracepoints. 
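Note that consumers of these tracepoints are unaffected: a probe still attaches through the same register_trace_*() call generated for the tracepoint, which is why kernel/trace/trace_workqueue.c below only needs its include path updated. A sketch of that unchanged consumer side:

	/* probe signature matches TP_PROTO of workqueue_insertion */
	static void probe_workqueue_insertion(struct task_struct *wq_thread,
					      struct work_struct *work)
	{
		/* per-cpu accounting, as in kernel/trace/trace_workqueue.c */
	}

	ret = register_trace_workqueue_insertion(probe_workqueue_insertion);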
[ Impact: expand workqueue tracer to events tracing ] Signed-off-by: Zhao Lei Cc: Steven Rostedt Cc: Tom Zanussi Cc: Oleg Nesterov Cc: Andrew Morton Signed-off-by: KOSAKI Motohiro Signed-off-by: Frederic Weisbecker --- include/trace/events/workqueue.h | 100 +++++++++++++++++++++++++++++++++++++++ include/trace/workqueue.h | 25 ---------- kernel/trace/trace_workqueue.c | 2 +- kernel/workqueue.c | 11 +---- 4 files changed, 103 insertions(+), 35 deletions(-) create mode 100644 include/trace/events/workqueue.h delete mode 100644 include/trace/workqueue.h diff --git a/include/trace/events/workqueue.h b/include/trace/events/workqueue.h new file mode 100644 index 000000000000..035f1bff288e --- /dev/null +++ b/include/trace/events/workqueue.h @@ -0,0 +1,100 @@ +#if !defined(_TRACE_WORKQUEUE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_WORKQUEUE_H + +#include +#include +#include + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM workqueue + +TRACE_EVENT(workqueue_insertion, + + TP_PROTO(struct task_struct *wq_thread, struct work_struct *work), + + TP_ARGS(wq_thread, work), + + TP_STRUCT__entry( + __array(char, thread_comm, TASK_COMM_LEN) + __field(pid_t, thread_pid) + __field(work_func_t, func) + ), + + TP_fast_assign( + memcpy(__entry->thread_comm, wq_thread->comm, TASK_COMM_LEN); + __entry->thread_pid = wq_thread->pid; + __entry->func = work->func; + ), + + TP_printk("thread=%s:%d func=%pF", __entry->thread_comm, + __entry->thread_pid, __entry->func) +); + +TRACE_EVENT(workqueue_execution, + + TP_PROTO(struct task_struct *wq_thread, struct work_struct *work), + + TP_ARGS(wq_thread, work), + + TP_STRUCT__entry( + __array(char, thread_comm, TASK_COMM_LEN) + __field(pid_t, thread_pid) + __field(work_func_t, func) + ), + + TP_fast_assign( + memcpy(__entry->thread_comm, wq_thread->comm, TASK_COMM_LEN); + __entry->thread_pid = wq_thread->pid; + __entry->func = work->func; + ), + + TP_printk("thread=%s:%d func=%pF", __entry->thread_comm, + __entry->thread_pid, __entry->func) +); + +/* Trace the creation of one workqueue thread on a cpu */ +TRACE_EVENT(workqueue_creation, + + TP_PROTO(struct task_struct *wq_thread, int cpu), + + TP_ARGS(wq_thread, cpu), + + TP_STRUCT__entry( + __array(char, thread_comm, TASK_COMM_LEN) + __field(pid_t, thread_pid) + __field(int, cpu) + ), + + TP_fast_assign( + memcpy(__entry->thread_comm, wq_thread->comm, TASK_COMM_LEN); + __entry->thread_pid = wq_thread->pid; + __entry->cpu = cpu; + ), + + TP_printk("thread=%s:%d cpu=%d", __entry->thread_comm, + __entry->thread_pid, __entry->cpu) +); + +TRACE_EVENT(workqueue_destruction, + + TP_PROTO(struct task_struct *wq_thread), + + TP_ARGS(wq_thread), + + TP_STRUCT__entry( + __array(char, thread_comm, TASK_COMM_LEN) + __field(pid_t, thread_pid) + ), + + TP_fast_assign( + memcpy(__entry->thread_comm, wq_thread->comm, TASK_COMM_LEN); + __entry->thread_pid = wq_thread->pid; + ), + + TP_printk("thread=%s:%d", __entry->thread_comm, __entry->thread_pid) +); + +#endif /* _TRACE_WORKQUEUE_H */ + +/* This part must be outside protection */ +#include diff --git a/include/trace/workqueue.h b/include/trace/workqueue.h deleted file mode 100644 index 7626523deeba..000000000000 --- a/include/trace/workqueue.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef __TRACE_WORKQUEUE_H -#define __TRACE_WORKQUEUE_H - -#include -#include -#include - -DECLARE_TRACE(workqueue_insertion, - TP_PROTO(struct task_struct *wq_thread, struct work_struct *work), - TP_ARGS(wq_thread, work)); - -DECLARE_TRACE(workqueue_execution, - TP_PROTO(struct task_struct *wq_thread, 
struct work_struct *work), - TP_ARGS(wq_thread, work)); - -/* Trace the creation of one workqueue thread on a cpu */ -DECLARE_TRACE(workqueue_creation, - TP_PROTO(struct task_struct *wq_thread, int cpu), - TP_ARGS(wq_thread, cpu)); - -DECLARE_TRACE(workqueue_destruction, - TP_PROTO(struct task_struct *wq_thread), - TP_ARGS(wq_thread)); - -#endif /* __TRACE_WORKQUEUE_H */ diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index 984b9175c13d..cfe56d31d85b 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -6,7 +6,7 @@ */ -#include +#include #include #include #include "trace_stat.h" diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f71fb2a08950..0668795d8818 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -33,7 +33,8 @@ #include #include #include -#include +#define CREATE_TRACE_POINTS +#include /* * The per-CPU workqueue (if single thread, we always use the first @@ -124,8 +125,6 @@ struct cpu_workqueue_struct *get_wq_data(struct work_struct *work) return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK); } -DEFINE_TRACE(workqueue_insertion); - static void insert_work(struct cpu_workqueue_struct *cwq, struct work_struct *work, struct list_head *head) { @@ -262,8 +261,6 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, } EXPORT_SYMBOL_GPL(queue_delayed_work_on); -DEFINE_TRACE(workqueue_execution); - static void run_workqueue(struct cpu_workqueue_struct *cwq) { spin_lock_irq(&cwq->lock); @@ -753,8 +750,6 @@ init_cpu_workqueue(struct workqueue_struct *wq, int cpu) return cwq; } -DEFINE_TRACE(workqueue_creation); - static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) { struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; @@ -860,8 +855,6 @@ struct workqueue_struct *__create_workqueue_key(const char *name, } EXPORT_SYMBOL_GPL(__create_workqueue_key); -DEFINE_TRACE(workqueue_destruction); - static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) { /* -- cgit v1.2.2 From 1fdfca9c577aac96a559c1ea68f5c9156f17d636 Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Mon, 20 Apr 2009 14:58:26 +0800 Subject: trace_workqueue: use list_for_each_entry() instead of list_for_each_entry_safe() No need to use list_for_each_entry_safe() in iteration without deleting any node, we can use list_for_each_entry() instead. 
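The distinction, as a short sketch (obsolete() is an illustrative predicate): the _safe variant keeps a second cursor so the node under the cursor may be freed during the walk; a lookup or update pass like the ones in this file never deletes, so the plain iterator is enough:

	/* deletion during iteration: _safe caches the next node first */
	list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list, list) {
		if (obsolete(node)) {
			list_del(&node->list);
			kfree(node);
		}
	}

	/* pure lookup: no deletion, no extra cursor needed */
	list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list)
		if (node->pid == wq_thread->pid)
			atomic_inc(&node->inserted);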
[ Impact: cleanup ] Signed-off-by: Zhao Lei Cc: Steven Rostedt Cc: Tom Zanussi Cc: Oleg Nesterov Cc: Andrew Morton Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_workqueue.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index cfe56d31d85b..128b64b93f14 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -47,12 +47,11 @@ probe_workqueue_insertion(struct task_struct *wq_thread, struct work_struct *work) { int cpu = cpumask_first(&wq_thread->cpus_allowed); - struct cpu_workqueue_stats *node, *next; + struct cpu_workqueue_stats *node; unsigned long flags; spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list, - list) { + list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) { if (node->pid == wq_thread->pid) { atomic_inc(&node->inserted); goto found; @@ -69,12 +68,11 @@ probe_workqueue_execution(struct task_struct *wq_thread, struct work_struct *work) { int cpu = cpumask_first(&wq_thread->cpus_allowed); - struct cpu_workqueue_stats *node, *next; + struct cpu_workqueue_stats *node; unsigned long flags; spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list, - list) { + list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) { if (node->pid == wq_thread->pid) { node->executed++; goto found; -- cgit v1.2.2 From b8867164f05791a6b5363bd51c1274e03600886e Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Mon, 20 Apr 2009 14:59:36 +0800 Subject: trace_workqueue: remove cpu_workqueue_stats->first_entry cpu_workqueue_stats->first_entry is useless because we can retrieve the header of a cpu workqueue using: if (&cpu_workqueue_stats->list == workqueue_cpu_stat(cpu)->list.next) [ Impact: cleanup ] Signed-off-by: Zhao Lei Cc: Steven Rostedt Cc: Tom Zanussi Cc: Oleg Nesterov Cc: Andrew Morton Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_workqueue.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index 128b64b93f14..890974aed64d 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -16,8 +16,6 @@ /* A cpu workqueue thread */ struct cpu_workqueue_stats { struct list_head list; -/* Useful to know if we print the cpu headers */ - bool first_entry; int cpu; pid_t pid; /* Can be inserted from interrupt or user context, need to be atomic */ @@ -103,8 +101,6 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu) cws->pid = wq_thread->pid; spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - if (list_empty(&workqueue_cpu_stat(cpu)->list)) - cws->first_entry = true; list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list); spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); } -- cgit v1.2.2 From f3c4ae26e93d354152196b62797ba86ad86dd0cc Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Mon, 20 Apr 2009 15:02:17 +0800 Subject: trace_workqueue: remove blank line between each cpu The blankline between each cpu's workqueue stat is not necessary, because the cpu number is enough to part them by eye. Old style also caused a blankline below headline, and made code complex by using lock, disableirq and get cpu var. Old style: # CPU INSERTED EXECUTED NAME # | | | | 0 8644 8644 events/0 0 0 0 cpuset ... 0 1 1 kdmflush 1 35365 35365 events/1 ... 
New style: # CPU INSERTED EXECUTED NAME # | | | | 0 8644 8644 events/0 0 0 0 cpuset ... 0 1 1 kdmflush 1 35365 35365 events/1 ... [ Impact: provide more readable code ] Signed-off-by: Zhao Lei Cc: KOSAKI Motohiro Cc: Steven Rostedt Cc: Tom Zanussi Cc: Oleg Nesterov Cc: Andrew Morton Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_workqueue.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index 890974aed64d..97fcea4acce1 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -185,16 +185,9 @@ static void *workqueue_stat_next(void *prev, int idx) static int workqueue_stat_show(struct seq_file *s, void *p) { struct cpu_workqueue_stats *cws = p; - unsigned long flags; - int cpu = cws->cpu; struct pid *pid; struct task_struct *tsk; - spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); - if (&cws->list == workqueue_cpu_stat(cpu)->list.next) - seq_printf(s, "\n"); - spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); - pid = find_get_pid(cws->pid); if (pid) { tsk = get_pid_task(pid, PIDTYPE_PID); -- cgit v1.2.2 From 0d64f8342de26d02451900b1aad94716fe92c4ab Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 16 May 2009 05:58:49 +0200 Subject: tracing/stat: replace trace_stat_session by stat_session The "trace" prefix in struct trace_stat_session type is annoying while reading the trace_stat.c file. It makes the lines longer, and is not that much useful to explain the sense of this type. Just keep "struct stat_session" for this type. [ Impact: make the code a bit more readable ] Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_stat.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index fdde3a4a94cd..3b6816be825d 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -22,7 +22,7 @@ struct trace_stat_list { }; /* A stat session is the stats output in one file */ -struct tracer_stat_session { +struct stat_session { struct list_head session_list; struct tracer_stat *ts; struct list_head stat_list; @@ -38,7 +38,7 @@ static DEFINE_MUTEX(all_stat_sessions_mutex); static struct dentry *stat_dir; -static void reset_stat_session(struct tracer_stat_session *session) +static void reset_stat_session(struct stat_session *session) { struct trace_stat_list *node, *next; @@ -48,7 +48,7 @@ static void reset_stat_session(struct tracer_stat_session *session) INIT_LIST_HEAD(&session->stat_list); } -static void destroy_session(struct tracer_stat_session *session) +static void destroy_session(struct stat_session *session) { debugfs_remove(session->file); reset_stat_session(session); @@ -71,7 +71,7 @@ static int dummy_cmp(void *p1, void *p2) * All of these copies and sorting are required on all opening * since the stats could have changed between two file sessions. 
*/ -static int stat_seq_init(struct tracer_stat_session *session) +static int stat_seq_init(struct stat_session *session) { struct trace_stat_list *iter_entry, *new_entry; struct tracer_stat *ts = session->ts; @@ -154,7 +154,7 @@ exit_free_list: static void *stat_seq_start(struct seq_file *s, loff_t *pos) { - struct tracer_stat_session *session = s->private; + struct stat_session *session = s->private; /* Prevent from tracer switch or stat_list modification */ mutex_lock(&session->stat_mutex); @@ -168,7 +168,7 @@ static void *stat_seq_start(struct seq_file *s, loff_t *pos) static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos) { - struct tracer_stat_session *session = s->private; + struct stat_session *session = s->private; if (p == SEQ_START_TOKEN) return seq_list_start(&session->stat_list, *pos); @@ -178,13 +178,13 @@ static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos) static void stat_seq_stop(struct seq_file *s, void *p) { - struct tracer_stat_session *session = s->private; + struct stat_session *session = s->private; mutex_unlock(&session->stat_mutex); } static int stat_seq_show(struct seq_file *s, void *v) { - struct tracer_stat_session *session = s->private; + struct stat_session *session = s->private; struct trace_stat_list *l = list_entry(v, struct trace_stat_list, list); if (v == SEQ_START_TOKEN) @@ -205,7 +205,7 @@ static int tracing_stat_open(struct inode *inode, struct file *file) { int ret; - struct tracer_stat_session *session = inode->i_private; + struct stat_session *session = inode->i_private; ret = seq_open(file, &trace_stat_seq_ops); if (!ret) { @@ -222,7 +222,7 @@ static int tracing_stat_open(struct inode *inode, struct file *file) */ static int tracing_stat_release(struct inode *i, struct file *f) { - struct tracer_stat_session *session = i->i_private; + struct stat_session *session = i->i_private; mutex_lock(&session->stat_mutex); reset_stat_session(session); @@ -251,7 +251,7 @@ static int tracing_stat_init(void) return 0; } -static int init_stat_file(struct tracer_stat_session *session) +static int init_stat_file(struct stat_session *session) { if (!stat_dir && tracing_stat_init()) return -ENODEV; @@ -266,7 +266,7 @@ static int init_stat_file(struct tracer_stat_session *session) int register_stat_tracer(struct tracer_stat *trace) { - struct tracer_stat_session *session, *node, *tmp; + struct stat_session *session, *node, *tmp; int ret; if (!trace) @@ -286,7 +286,7 @@ int register_stat_tracer(struct tracer_stat *trace) mutex_unlock(&all_stat_sessions_mutex); /* Init the session */ - session = kmalloc(sizeof(struct tracer_stat_session), GFP_KERNEL); + session = kmalloc(sizeof(struct stat_session), GFP_KERNEL); if (!session) return -ENOMEM; @@ -312,7 +312,7 @@ int register_stat_tracer(struct tracer_stat *trace) void unregister_stat_tracer(struct tracer_stat *trace) { - struct tracer_stat_session *node, *tmp; + struct stat_session *node, *tmp; mutex_lock(&all_stat_sessions_mutex); list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) { -- cgit v1.2.2 From 8f184f27300f66f6dcc8296c2dae7a1fbe8429c9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 16 May 2009 06:24:36 +0200 Subject: tracing/stat: replace linked list by an rbtree for sorting When the stat tracing framework prepares the entries from a tracer to output them to the user, it starts by computing a linear sort through a linked list to give the entries ordered by relevance to the user. 
This is quite ugly and causes a small latency when we begin to read the file. This patch changes that by turning the linked list into a red-black tree. Athough the whole iteration using the start and next tracer callbacks while opening the file remain the same, it is now much more fast and scalable. The rbtree guarantees O(log(n)) insertions whereas a linked list with linear sorting brought us a O(n) despair. Now the (visible) latency has disapeared. [ Impact: kill the latency while starting to read a stat tracer file ] Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_stat.c | 140 +++++++++++++++++++++++++++++++++------------- 1 file changed, 100 insertions(+), 40 deletions(-) diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 3b6816be825d..0bd0fc82da5d 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -1,7 +1,7 @@ /* * Infrastructure for statistic tracing (histogram output). * - * Copyright (C) 2008 Frederic Weisbecker + * Copyright (C) 2008-2009 Frederic Weisbecker * * Based on the code from trace_branch.c which is * Copyright (C) 2008 Steven Rostedt @@ -10,14 +10,19 @@ #include +#include #include #include "trace_stat.h" #include "trace.h" -/* List of stat entries from a tracer */ -struct trace_stat_list { - struct list_head list; +/* + * List of stat red-black nodes from a tracer + * We use a such tree to sort quickly the stat + * entries from the tracer. + */ +struct stat_node { + struct rb_node node; void *stat; }; @@ -25,7 +30,7 @@ struct trace_stat_list { struct stat_session { struct list_head session_list; struct tracer_stat *ts; - struct list_head stat_list; + struct rb_root stat_root; struct mutex stat_mutex; struct dentry *file; }; @@ -37,15 +42,45 @@ static DEFINE_MUTEX(all_stat_sessions_mutex); /* The root directory for all stat files */ static struct dentry *stat_dir; +/* + * Iterate through the rbtree using a post order traversal path + * to release the next node. + * It won't necessary release one at each iteration + * but it will at least advance closer to the next one + * to be released. 
+ */ +static struct rb_node *release_next(struct rb_node *node) +{ + struct stat_node *snode; + struct rb_node *parent = rb_parent(node); + + if (node->rb_left) + return node->rb_left; + else if (node->rb_right) + return node->rb_right; + else { + if (!parent) + return NULL; + if (parent->rb_left == node) + parent->rb_left = NULL; + else + parent->rb_right = NULL; + + snode = container_of(node, struct stat_node, node); + kfree(snode); + + return parent; + } +} static void reset_stat_session(struct stat_session *session) { - struct trace_stat_list *node, *next; + struct rb_node *node = session->stat_root.rb_node; - list_for_each_entry_safe(node, next, &session->stat_list, list) - kfree(node); + while (node) + node = release_next(node); - INIT_LIST_HEAD(&session->stat_list); + session->stat_root = RB_ROOT; } static void destroy_session(struct stat_session *session) @@ -56,6 +91,35 @@ static void destroy_session(struct stat_session *session) kfree(session); } +typedef int (*cmp_stat_t)(void *, void *); + +static void +insert_stat(struct rb_root *root, struct stat_node *data, cmp_stat_t cmp) +{ + struct rb_node **new = &(root->rb_node), *parent = NULL; + + /* + * Figure out where to put new node + * This is a descendent sorting + */ + while (*new) { + struct stat_node *this; + int result; + + this = container_of(*new, struct stat_node, node); + result = cmp(data->stat, this->stat); + + parent = *new; + if (result >= 0) + new = &((*new)->rb_left); + else + new = &((*new)->rb_right); + } + + rb_link_node(&data->node, parent, new); + rb_insert_color(&data->node, root); +} + /* * For tracers that don't provide a stat_cmp callback. * This one will force an immediate insertion on tail of @@ -73,8 +137,9 @@ static int dummy_cmp(void *p1, void *p2) */ static int stat_seq_init(struct stat_session *session) { - struct trace_stat_list *iter_entry, *new_entry; struct tracer_stat *ts = session->ts; + struct stat_node *new_entry; + struct rb_root *root; void *stat; int ret = 0; int i; @@ -93,15 +158,13 @@ static int stat_seq_init(struct stat_session *session) * The first entry. Actually this is the second, but the first * one (the stat_list head) is pointless. 
*/ - new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL); + new_entry = kzalloc(sizeof(*new_entry), GFP_KERNEL); if (!new_entry) { ret = -ENOMEM; goto exit; } - - INIT_LIST_HEAD(&new_entry->list); - - list_add(&new_entry->list, &session->stat_list); + root = &session->stat_root; + insert_stat(root, new_entry, dummy_cmp); new_entry->stat = stat; @@ -116,31 +179,17 @@ static int stat_seq_init(struct stat_session *session) if (!stat) break; - new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL); + new_entry = kzalloc(sizeof(*new_entry), GFP_KERNEL); if (!new_entry) { ret = -ENOMEM; goto exit_free_list; } - INIT_LIST_HEAD(&new_entry->list); new_entry->stat = stat; - list_for_each_entry_reverse(iter_entry, &session->stat_list, - list) { - - /* Insertion with a descendent sorting */ - if (ts->stat_cmp(iter_entry->stat, - new_entry->stat) >= 0) { - - list_add(&new_entry->list, &iter_entry->list); - break; - } - } - - /* The current larger value */ - if (list_empty(&new_entry->list)) - list_add(&new_entry->list, &session->stat_list); + insert_stat(root, new_entry, ts->stat_cmp); } + exit: mutex_unlock(&session->stat_mutex); return ret; @@ -155,25 +204,38 @@ exit_free_list: static void *stat_seq_start(struct seq_file *s, loff_t *pos) { struct stat_session *session = s->private; + struct rb_node *node; + int i; /* Prevent from tracer switch or stat_list modification */ mutex_lock(&session->stat_mutex); /* If we are in the beginning of the file, print the headers */ - if (!*pos && session->ts->stat_headers) + if (!*pos && session->ts->stat_headers) { + (*pos)++; return SEQ_START_TOKEN; + } - return seq_list_start(&session->stat_list, *pos); + node = rb_first(&session->stat_root); + for (i = 0; node && i < *pos; i++) + node = rb_next(node); + + (*pos)++; + + return node; } static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos) { struct stat_session *session = s->private; + struct rb_node *node = p; + + (*pos)++; if (p == SEQ_START_TOKEN) - return seq_list_start(&session->stat_list, *pos); + return rb_first(&session->stat_root); - return seq_list_next(p, &session->stat_list, pos); + return rb_next(node); } static void stat_seq_stop(struct seq_file *s, void *p) @@ -185,7 +247,7 @@ static void stat_seq_stop(struct seq_file *s, void *p) static int stat_seq_show(struct seq_file *s, void *v) { struct stat_session *session = s->private; - struct trace_stat_list *l = list_entry(v, struct trace_stat_list, list); + struct stat_node *l = container_of(v, struct stat_node, node); if (v == SEQ_START_TOKEN) return session->ts->stat_headers(s); @@ -286,15 +348,13 @@ int register_stat_tracer(struct tracer_stat *trace) mutex_unlock(&all_stat_sessions_mutex); /* Init the session */ - session = kmalloc(sizeof(struct stat_session), GFP_KERNEL); + session = kzalloc(sizeof(*session), GFP_KERNEL); if (!session) return -ENOMEM; session->ts = trace; INIT_LIST_HEAD(&session->session_list); - INIT_LIST_HEAD(&session->stat_list); mutex_init(&session->stat_mutex); - session->file = NULL; ret = init_stat_file(session); if (ret) { -- cgit v1.2.2 From b3dd7ba7d862707800c7ac45068f14ade2b65155 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 27 May 2009 11:04:26 +0800 Subject: tracing/stat: change dummpy_cmp() to return -1 Currently the output of trace_stat/workqueues is totally reversed: # cat /debug/tracing/trace_stat/workqueues ... 1 17 17 210 37 `-blk_unplug_work+0x0/0x57 1 3779 3779 181 11 |-cfq_kick_queue+0x0/0x2f 1 3796 3796 kblockd/1:120 ... 
The correct output should be: 1 3796 3796 kblockd/1:120 1 3779 3779 181 11 |-cfq_kick_queue+0x0/0x2f 1 17 17 210 37 `-blk_unplug_work+0x0/0x57 It's caused by "tracing/stat: replace linked list by an rbtree for sorting" (53059c9b67a62a3dc8c80204d3da42b9267ea5a0). dummpy_cmp() should return -1, so rb_node will always be inserted as right-most node in the rbtree, thus we sort the output in ascending order. [ Impact: fix the output of trace_stat/workqueues ] Signed-off-by: Li Zefan Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_stat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 0bd0fc82da5d..5816d1aebcc9 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -127,7 +127,7 @@ insert_stat(struct rb_root *root, struct stat_node *data, cmp_stat_t cmp) */ static int dummy_cmp(void *p1, void *p2) { - return 1; + return -1; } /* -- cgit v1.2.2 From e16228069083a2f6b94383ac5739aea7a0f38ce4 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 27 May 2009 11:04:48 +0800 Subject: tracing/stat: remember to free root node When closing a trace_stat file, we destroy the rbtree constructed during file open, but there is memory leak that the root node is not freed. [ Impact: fix memory leak when closing a trace_stat file ] Signed-off-by: Li Zefan Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_stat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 5816d1aebcc9..8030ec98dbad 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -60,8 +60,8 @@ static struct rb_node *release_next(struct rb_node *node) return node->rb_right; else { if (!parent) - return NULL; - if (parent->rb_left == node) + ; + else if (parent->rb_left == node) parent->rb_left = NULL; else parent->rb_right = NULL; -- cgit v1.2.2 From dbd3fbdfeecfad4e71139db05d72560c3583e2a9 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 27 May 2009 11:42:46 +0800 Subject: tracing/stat: do some cleanups - remove duplicate code in stat_seq_init() - update comments to reflect the change from stat list to stat rbtree [ Impact: clean up ] Signed-off-by: Li Zefan Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_stat.c | 54 ++++++++++++++++++----------------------------- 1 file changed, 21 insertions(+), 33 deletions(-) diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 8030ec98dbad..17f20ebdad2a 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -93,10 +93,15 @@ static void destroy_session(struct stat_session *session) typedef int (*cmp_stat_t)(void *, void *); -static void -insert_stat(struct rb_root *root, struct stat_node *data, cmp_stat_t cmp) +static int insert_stat(struct rb_root *root, void *stat, cmp_stat_t cmp) { struct rb_node **new = &(root->rb_node), *parent = NULL; + struct stat_node *data; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + data->stat = stat; /* * Figure out where to put new node @@ -118,12 +123,13 @@ insert_stat(struct rb_root *root, struct stat_node *data, cmp_stat_t cmp) rb_link_node(&data->node, parent, new); rb_insert_color(&data->node, root); + return 0; } /* * For tracers that don't provide a stat_cmp callback. - * This one will force an immediate insertion on tail of - * the list. + * This one will force an insertion as right-most node + * in the rbtree. 
*/ static int dummy_cmp(void *p1, void *p2) { @@ -131,15 +137,14 @@ static int dummy_cmp(void *p1, void *p2) } /* - * Initialize the stat list at each trace_stat file opening. + * Initialize the stat rbtree at each trace_stat file opening. * All of these copies and sorting are required on all opening * since the stats could have changed between two file sessions. */ static int stat_seq_init(struct stat_session *session) { struct tracer_stat *ts = session->ts; - struct stat_node *new_entry; - struct rb_root *root; + struct rb_root *root = &session->stat_root; void *stat; int ret = 0; int i; @@ -154,23 +159,12 @@ static int stat_seq_init(struct stat_session *session) if (!stat) goto exit; - /* - * The first entry. Actually this is the second, but the first - * one (the stat_list head) is pointless. - */ - new_entry = kzalloc(sizeof(*new_entry), GFP_KERNEL); - if (!new_entry) { - ret = -ENOMEM; + ret = insert_stat(root, stat, ts->stat_cmp); + if (ret) goto exit; - } - root = &session->stat_root; - insert_stat(root, new_entry, dummy_cmp); - - new_entry->stat = stat; /* - * Iterate over the tracer stat entries and store them in a sorted - * list. + * Iterate over the tracer stat entries and store them in an rbtree. */ for (i = 1; ; i++) { stat = ts->stat_next(stat, i); @@ -179,22 +173,16 @@ static int stat_seq_init(struct stat_session *session) if (!stat) break; - new_entry = kzalloc(sizeof(*new_entry), GFP_KERNEL); - if (!new_entry) { - ret = -ENOMEM; - goto exit_free_list; - } - - new_entry->stat = stat; - - insert_stat(root, new_entry, ts->stat_cmp); + ret = insert_stat(root, stat, ts->stat_cmp); + if (ret) + goto exit_free_rbtree; } exit: mutex_unlock(&session->stat_mutex); return ret; -exit_free_list: +exit_free_rbtree: reset_stat_session(session); mutex_unlock(&session->stat_mutex); return ret; @@ -207,7 +195,7 @@ static void *stat_seq_start(struct seq_file *s, loff_t *pos) struct rb_node *node; int i; - /* Prevent from tracer switch or stat_list modification */ + /* Prevent from tracer switch or rbtree modification */ mutex_lock(&session->stat_mutex); /* If we are in the beginning of the file, print the headers */ @@ -280,7 +268,7 @@ static int tracing_stat_open(struct inode *inode, struct file *file) } /* - * Avoid consuming memory with our now useless list. + * Avoid consuming memory with our now useless rbtree. */ static int tracing_stat_release(struct inode *i, struct file *f) { -- cgit v1.2.2 From 43bd1236234cacbc18d1476a9b57e7a306efddf5 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 30 May 2009 04:25:30 +0200 Subject: tracing/stat: remove unappropriate safe walk on list register_stat_tracer() uses list_for_each_entry_safe to check whether a tracer is already present in the list. But we don't delete anything from the list here, so we don't need the safe version [ Impact: cleanup list use is stat tracing ] Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_stat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 17f20ebdad2a..c00643733f4c 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -316,7 +316,7 @@ static int init_stat_file(struct stat_session *session) int register_stat_tracer(struct tracer_stat *trace) { - struct stat_session *session, *node, *tmp; + struct stat_session *session, *node; int ret; if (!trace) @@ -327,7 +327,7 @@ int register_stat_tracer(struct tracer_stat *trace) /* Already registered? 
*/ mutex_lock(&all_stat_sessions_mutex); - list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) { + list_for_each_entry(node, &all_stat_sessions, session_list) { if (node->ts == trace) { mutex_unlock(&all_stat_sessions_mutex); return -EINVAL; -- cgit v1.2.2 From 2af15d6a44b871ad4c2a651302374cde8f335480 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 28 May 2009 13:37:24 -0400 Subject: ftrace: add kernel command line function filtering When using ftrace=function on the command line to trace functions on boot up, one can not filter out functions that are commonly called. This patch adds two new ftrace command line commands. ftrace_notrace=function-list ftrace_filter=function-list Where function-list is a comma separated list of functions to filter. The ftrace_notrace will make the functions listed not be included in the function tracing, and ftrace_filter will only trace the functions listed. These two act the same as the debugfs/tracing/set_ftrace_notrace and debugfs/tracing/set_ftrace_filter respectively. The simple glob expressions that are allowed by the filter files can also be used by the command line interface. ftrace_notrace=rcu*,*lock,*spin* Will not trace any function that starts with rcu, ends with lock, or has the word spin in it. Note, if the self tests are enabled, they may interfere with the filtering set by the command lines. Signed-off-by: Steven Rostedt --- Documentation/kernel-parameters.txt | 17 +++++++++++++-- kernel/trace/ftrace.c | 42 +++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 2 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 9243dd84f4d6..fcd3bfbe74e8 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -751,12 +751,25 @@ and is between 256 and 4096 characters. It is defined in the file ia64_pal_cache_flush instead of SAL_CACHE_FLUSH. ftrace=[tracer] - [ftrace] will set and start the specified tracer + [FTRACE] will set and start the specified tracer as early as possible in order to facilitate early boot debugging. ftrace_dump_on_oops - [ftrace] will dump the trace buffers on oops. + [FTRACE] will dump the trace buffers on oops. + + ftrace_filter=[function-list] + [FTRACE] Limit the functions traced by the function + tracer at boot up. function-list is a comma separated + list of functions. This list can be changed at run + time by the set_ftrace_filter file in the debugfs + tracing directory. + + ftrace_notrace=[function-list] + [FTRACE] Do not trace the functions specified in + function-list. This list can be changed at run time + by the set_ftrace_notrace file in the debugfs + tracing directory. gamecon.map[2|3]= [HW,JOY] Multisystem joystick and NES/SNES/PSX pad diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 140699a9a8a7..2074e5b7766b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -32,6 +32,7 @@ #include #include +#include #include "trace_output.h" #include "trace_stat.h" @@ -2369,6 +2370,45 @@ void ftrace_set_notrace(unsigned char *buf, int len, int reset) ftrace_set_regex(buf, len, reset, 0); } +/* + * command line interface to allow users to set filters on boot up. 
+ */ +#define FTRACE_FILTER_SIZE COMMAND_LINE_SIZE +static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata; +static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata; + +static int __init set_ftrace_notrace(char *str) +{ + strncpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); + return 1; +} +__setup("ftrace_notrace=", set_ftrace_notrace); + +static int __init set_ftrace_filter(char *str) +{ + strncpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); + return 1; +} +__setup("ftrace_filter=", set_ftrace_filter); + +static void __init set_ftrace_early_filter(char *buf, int enable) +{ + char *func; + + while (buf) { + func = strsep(&buf, ","); + ftrace_set_regex(func, strlen(func), 0, enable); + } +} + +static void __init set_ftrace_early_filters(void) +{ + if (ftrace_filter_buf[0]) + set_ftrace_early_filter(ftrace_filter_buf, 1); + if (ftrace_notrace_buf[0]) + set_ftrace_early_filter(ftrace_notrace_buf, 0); +} + static int ftrace_regex_release(struct inode *inode, struct file *file, int enable) { @@ -2829,6 +2869,8 @@ void __init ftrace_init(void) if (ret) pr_warning("Failed to register trace ftrace module notifier\n"); + set_ftrace_early_filters(); + return; failed: ftrace_disabled = 1; -- cgit v1.2.2 From 5e0a093910876882f91f1d4b8a1635a099e6c7ba Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 28 May 2009 15:50:13 -0400 Subject: tracing: fix config options to not show when automatically selected There are two options that are selected by all tracers, but we want to have those options available when no tracer is selected. These are The event tracer and sched switch tracer. The are enabled by all tracers, but if a tracer is not selected we want the options to appear. All tracers including them select TRACING. Thus what we would like to do is: config EVENT_TRACER bool "prompt" depends on TRACING select TRACING But that gives us a bug in the kbuild system since we just created a circular dependency. We only want the prompt to show when TRACING is off. This patch adds GENERIC_TRACER that all tracers will select instead of TRACING. The two options (sched switch and event tracer) will select TRACING directly and depend on !GENERIC_TRACER. This solves the cicular dependency. [ Impact: hide options that are selected by default ] Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 43 ++++++++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a508b9d2adb8..6e55cc3ac49d 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -56,6 +56,13 @@ config CONTEXT_SWITCH_TRACER select MARKERS bool +# All tracer options should select GENERIC_TRACER. For those options that are +# enabled by all tracers (context switch and event tracer) they select TRACING. +# This allows those options to appear when no other tracer is selected. But the +# options do not appear when something else selects it. We need the two options +# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the +# hidding of the automatic options options. 
+ config TRACING bool select DEBUG_FS @@ -66,6 +73,10 @@ config TRACING select BINARY_PRINTF select EVENT_TRACING +config GENERIC_TRACER + bool + select TRACING + # # Minimum requirements an architecture has to meet for us to # be able to offer generic tracing facilities: @@ -95,7 +106,7 @@ config FUNCTION_TRACER depends on HAVE_FUNCTION_TRACER select FRAME_POINTER select KALLSYMS - select TRACING + select GENERIC_TRACER select CONTEXT_SWITCH_TRACER help Enable the kernel to trace every kernel function. This is done @@ -126,7 +137,7 @@ config IRQSOFF_TRACER depends on TRACE_IRQFLAGS_SUPPORT depends on GENERIC_TIME select TRACE_IRQFLAGS - select TRACING + select GENERIC_TRACER select TRACER_MAX_TRACE help This option measures the time spent in irqs-off critical @@ -147,7 +158,7 @@ config PREEMPT_TRACER default n depends on GENERIC_TIME depends on PREEMPT - select TRACING + select GENERIC_TRACER select TRACER_MAX_TRACE help This option measures the time spent in preemption off critical @@ -166,7 +177,7 @@ config PREEMPT_TRACER config SYSPROF_TRACER bool "Sysprof Tracer" depends on X86 - select TRACING + select GENERIC_TRACER select CONTEXT_SWITCH_TRACER help This tracer provides the trace needed by the 'Sysprof' userspace @@ -174,7 +185,7 @@ config SYSPROF_TRACER config SCHED_TRACER bool "Scheduling Latency Tracer" - select TRACING + select GENERIC_TRACER select CONTEXT_SWITCH_TRACER select TRACER_MAX_TRACE help @@ -183,6 +194,7 @@ config SCHED_TRACER config ENABLE_CONTEXT_SWITCH_TRACER bool "Trace process context switches" + depends on !GENERIC_TRACER select TRACING select CONTEXT_SWITCH_TRACER help @@ -191,6 +203,7 @@ config ENABLE_CONTEXT_SWITCH_TRACER config ENABLE_EVENT_TRACING bool "Trace various events in the kernel" + depends on !GENERIC_TRACER select TRACING help This tracer hooks to various trace points in the kernel @@ -204,14 +217,14 @@ config ENABLE_EVENT_TRACING config FTRACE_SYSCALLS bool "Trace syscalls" depends on HAVE_FTRACE_SYSCALLS - select TRACING + select GENERIC_TRACER select KALLSYMS help Basic tracer to catch the syscall entry and exit events. config BOOT_TRACER bool "Trace boot initcalls" - select TRACING + select GENERIC_TRACER select CONTEXT_SWITCH_TRACER help This tracer helps developers to optimize boot times: it records @@ -228,7 +241,7 @@ config BOOT_TRACER config TRACE_BRANCH_PROFILING bool - select TRACING + select GENERIC_TRACER choice prompt "Branch Profiling" @@ -308,7 +321,7 @@ config BRANCH_TRACER config POWER_TRACER bool "Trace power consumption behavior" depends on X86 - select TRACING + select GENERIC_TRACER help This tracer helps developers to analyze and optimize the kernels power management decisions, specifically the C-state and P-state @@ -342,14 +355,14 @@ config STACK_TRACER config HW_BRANCH_TRACER depends on HAVE_HW_BRANCH_TRACER bool "Trace hw branches" - select TRACING + select GENERIC_TRACER help This tracer records all branches on the system in a circular buffer giving access to the last N branches for each cpu. config KMEMTRACE bool "Trace SLAB allocations" - select TRACING + select GENERIC_TRACER help kmemtrace provides tracing for slab allocator functions, such as kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. 
Collected @@ -369,7 +382,7 @@ config KMEMTRACE config WORKQUEUE_TRACER bool "Trace workqueues" - select TRACING + select GENERIC_TRACER help The workqueue tracer provides some statistical informations about each cpu workqueue thread such as the number of the @@ -385,7 +398,7 @@ config BLK_DEV_IO_TRACE select RELAY select DEBUG_FS select TRACEPOINTS - select TRACING + select GENERIC_TRACER select STACKTRACE help Say Y here if you want to be able to trace the block layer actions @@ -446,7 +459,7 @@ config FTRACE_SELFTEST config FTRACE_STARTUP_TEST bool "Perform a startup test on ftrace" - depends on TRACING + depends on GENERIC_TRACER select FTRACE_SELFTEST help This option performs a series of startup tests on ftrace. On bootup @@ -457,7 +470,7 @@ config FTRACE_STARTUP_TEST config MMIOTRACE bool "Memory mapped IO tracing" depends on HAVE_MMIOTRACE_SUPPORT && PCI - select TRACING + select GENERIC_TRACER help Mmiotrace traces Memory Mapped I/O access and is meant for debugging and reverse engineering. It is called from the ioremap -- cgit v1.2.2 From 897f17a65389a26509bd0c79a9812d1c9ea8ea6f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 28 May 2009 16:31:21 -0400 Subject: tracing: combine the default tracers into one config Both event tracer and sched switch plugin are selected by default by all generic tracers. But if no generic tracer is enabled, their options appear. But ether one of them will select the other, thus it only makes sense to have the default tracers be selected by one option. [ Impact: clean up kconfig menu ] Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 6e55cc3ac49d..4a13e5a01ce3 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -192,27 +192,14 @@ config SCHED_TRACER This tracer tracks the latency of the highest priority task to be scheduled in, starting from the point it has woken up. -config ENABLE_CONTEXT_SWITCH_TRACER - bool "Trace process context switches" - depends on !GENERIC_TRACER - select TRACING - select CONTEXT_SWITCH_TRACER - help - This tracer gets called from the context switch and records - all switching of tasks. - -config ENABLE_EVENT_TRACING - bool "Trace various events in the kernel" +config ENABLE_DEFAULT_TRACERS + bool "Trace process context switches and events" depends on !GENERIC_TRACER select TRACING help This tracer hooks to various trace points in the kernel allowing the user to pick and choose which trace point they - want to trace. - - Note, all tracers enable event tracing. This option is - only a convenience to enable event tracing when no other - tracers are selected. + want to trace. It also includes the sched_switch tracer plugin. config FTRACE_SYSCALLS bool "Trace syscalls" -- cgit v1.2.2 From 6e25db44a7ad7eb380f4ec774ec00a8fcddea112 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 29 May 2009 11:24:59 +0800 Subject: tracing/events: fix a typo in __string() format output "tsize" should be "\tsize". Also remove the space before "__str_loc". Before: # cat tracing/events/irq/irq_handler_entry/format ... field:int irq; offset:12; size:4; field: __str_loc name; offset:16;tsize:2; ... After: # cat tracing/events/irq/irq_handler_entry/format ... field:int irq; offset:12; size:4; field:__str_loc name; offset:16; size:2; ... 
[ Impact: standardize __string field description in events format file ] Signed-off-by: Li Zefan Signed-off-by: Frederic Weisbecker Signed-off-by: Steven Rostedt --- include/trace/ftrace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index b4ec83ae711f..9276ec4f34de 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -209,8 +209,8 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ #undef __string #define __string(item, src) \ - ret = trace_seq_printf(s, "\tfield: __str_loc " #item ";\t" \ - "offset:%u;tsize:%u;\n", \ + ret = trace_seq_printf(s, "\tfield:__str_loc " #item ";\t" \ + "offset:%u;\tsize:%u;\n", \ (unsigned int)offsetof(typeof(field), \ __str_loc_##item), \ (unsigned int)sizeof(field.__str_loc_##item)); \ -- cgit v1.2.2 From a9c1c3abe1160a5632e48c929b02b740556bf423 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 1 Jun 2009 15:35:13 +0800 Subject: tracing/events: put TP_fast_assign into braces Currently TP_fast_assign has a limitation that we can't define local variables in it. Here's one use case when we introduce __dynamic_array(): TP_fast_assign( type *p = __get_dynamic_array(item); foo(p); bar(p); ), [ Impact: allow defining local variables in TP_fast_assign ] Signed-off-by: Li Zefan LKML-Reference: <4A2384B1.90100@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- include/trace/ftrace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 9276ec4f34de..ee9268222448 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -471,7 +471,7 @@ static void ftrace_raw_event_##call(proto) \ return; \ entry = ring_buffer_event_data(event); \ \ - assign; \ + { assign; } \ \ if (!filter_current_check_discard(event_call, entry, event)) \ trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \ -- cgit v1.2.2 From 7fcb7c472f455d1711eb5a7633204dba8800a6d6 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 1 Jun 2009 15:35:46 +0800 Subject: tracing/events: introduce __dynamic_array() __string() is limited: - it's a char array, but we may want to define array with other types - a source string should be available, but we may just know the string size We introduce __dynamic_array() to break those limitations, and __string() becomes a wrapper of it. As a side effect, now __get_str() can be used in TP_fast_assign but not only TP_print. Take XFS for example, we have the string length in the dirent, but the string itself is not NULL-terminated, so __dynamic_array() can be used: TRACE_EVENT(xfs_dir2, TP_PROTO(struct xfs_da_args *args), TP_ARGS(args), TP_STRUCT__entry( __field(int, namelen) __dynamic_array(char, name, args->namelen + 1) ... ), TP_fast_assign( char *name = __get_str(name); if (args->namelen) memcpy(name, args->name, args->namelen); name[args->namelen] = '\0'; __entry->namelen = args->namelen; ), TP_printk("name %.*s namelen %d", __entry->namelen ? 
__get_str(name) : NULL __entry->namelen) ); [ Impact: allow defining dynamic size arrays ] Signed-off-by: Li Zefan LKML-Reference: <4A2384D2.3080403@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- include/trace/ftrace.h | 122 ++++++++++++++++++++++++++----------- kernel/trace/trace_events_filter.c | 6 +- 2 files changed, 91 insertions(+), 37 deletions(-) diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index ee9268222448..b5478dab579b 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -18,14 +18,17 @@ #include +#undef __field +#define __field(type, item) type item; + #undef __array #define __array(type, item, len) type item[len]; -#undef __field -#define __field(type, item) type item; +#undef __dynamic_array +#define __dynamic_array(type, item, len) unsigned short __data_loc_##item; #undef __string -#define __string(item, src) unsigned short __str_loc_##item; +#define __string(item, src) __dynamic_array(char, item, -1) #undef TP_STRUCT__entry #define TP_STRUCT__entry(args...) args @@ -35,7 +38,7 @@ struct ftrace_raw_##name { \ struct trace_entry ent; \ tstruct \ - char __str_data[0]; \ + char __data[0]; \ }; \ static struct ftrace_event_call event_##name @@ -47,30 +50,31 @@ * * Include the following: * - * struct ftrace_str_offsets_ { - * int ; - * int ; + * struct ftrace_data_offsets_ { + * int ; + * int ; * [...] * }; * - * The __string() macro will create each int , this is to - * keep the offset of each string from the beggining of the event - * once we perform the strlen() of the src strings. - * + * The __dynamic_array() macro will create each int , this is + * to keep the offset of each array from the beginning of the event. */ +#undef __field +#define __field(type, item); + #undef __array #define __array(type, item, len) -#undef __field -#define __field(type, item); +#undef __dynamic_array +#define __dynamic_array(type, item, len) int item; #undef __string -#define __string(item, src) int item; +#define __string(item, src) __dynamic_array(char, item, -1) #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ - struct ftrace_str_offsets_##call { \ + struct ftrace_data_offsets_##call { \ tstruct; \ }; @@ -119,8 +123,12 @@ #undef TP_printk #define TP_printk(fmt, args...) fmt "\n", args +#undef __get_dynamic_array +#define __get_dynamic_array(field) \ + ((void *)__entry + __entry->__data_loc_##field) + #undef __get_str -#define __get_str(field) ((char *)__entry + __entry->__str_loc_##field) +#define __get_str(field) (char *)__get_dynamic_array(field) #undef __print_flags #define __print_flags(flag, delim, flag_array...) 
\ @@ -207,16 +215,19 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ if (!ret) \ return 0; -#undef __string -#define __string(item, src) \ - ret = trace_seq_printf(s, "\tfield:__str_loc " #item ";\t" \ +#undef __dynamic_array +#define __dynamic_array(type, item, len) \ + ret = trace_seq_printf(s, "\tfield:__data_loc " #item ";\t" \ "offset:%u;\tsize:%u;\n", \ (unsigned int)offsetof(typeof(field), \ - __str_loc_##item), \ - (unsigned int)sizeof(field.__str_loc_##item)); \ + __data_loc_##item), \ + (unsigned int)sizeof(field.__data_loc_##item)); \ if (!ret) \ return 0; +#undef __string +#define __string(item, src) __dynamic_array(char, item, -1) + #undef __entry #define __entry REC @@ -260,11 +271,14 @@ ftrace_format_##call(struct trace_seq *s) \ if (ret) \ return ret; +#undef __dynamic_array +#define __dynamic_array(type, item, len) \ + ret = trace_define_field(event_call, "__data_loc" "[" #type "]", #item,\ + offsetof(typeof(field), __data_loc_##item), \ + sizeof(field.__data_loc_##item), 0); + #undef __string -#define __string(item, src) \ - ret = trace_define_field(event_call, "__str_loc", #item, \ - offsetof(typeof(field), __str_loc_##item), \ - sizeof(field.__str_loc_##item), 0); +#define __string(item, src) __dynamic_array(char, item, -1) #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, func, print) \ @@ -288,6 +302,43 @@ ftrace_define_fields_##call(void) \ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) +/* + * remember the offset of each array from the beginning of the event. + */ + +#undef __entry +#define __entry entry + +#undef __field +#define __field(type, item) + +#undef __array +#define __array(type, item, len) + +#undef __dynamic_array +#define __dynamic_array(type, item, len) \ + __data_offsets->item = __data_size + \ + offsetof(typeof(*entry), __data); \ + __data_size += (len) * sizeof(type); + +#undef __string +#define __string(item, src) __dynamic_array(char, item, strlen(src) + 1) \ + +#undef TRACE_EVENT +#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ +static inline int ftrace_get_offsets_##call( \ + struct ftrace_data_offsets_##call *__data_offsets, proto) \ +{ \ + int __data_size = 0; \ + struct ftrace_raw_##call __maybe_unused *entry; \ + \ + tstruct; \ + \ + return __data_size; \ +} + +#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) + /* * Stage 4 of the trace events. 
* @@ -432,15 +483,15 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\ #undef __array #define __array(type, item, len) +#undef __dynamic_array +#define __dynamic_array(type, item, len) \ + __entry->__data_loc_##item = __data_offsets.item; + #undef __string -#define __string(item, src) \ - __str_offsets.item = __str_size + \ - offsetof(typeof(*entry), __str_data); \ - __str_size += strlen(src) + 1; +#define __string(item, src) __dynamic_array(char, item, -1) \ #undef __assign_str #define __assign_str(dst, src) \ - __entry->__str_loc_##dst = __str_offsets.dst; \ strcpy(__get_str(dst), src); #undef TRACE_EVENT @@ -451,26 +502,29 @@ static struct ftrace_event_call event_##call; \ \ static void ftrace_raw_event_##call(proto) \ { \ - struct ftrace_str_offsets_##call __maybe_unused __str_offsets; \ + struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\ struct ftrace_event_call *event_call = &event_##call; \ struct ring_buffer_event *event; \ struct ftrace_raw_##call *entry; \ unsigned long irq_flags; \ - int __str_size = 0; \ + int __data_size; \ int pc; \ \ local_save_flags(irq_flags); \ pc = preempt_count(); \ \ - tstruct; \ + __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \ \ event = trace_current_buffer_lock_reserve(event_##call.id, \ - sizeof(struct ftrace_raw_##call) + __str_size,\ + sizeof(*entry) + __data_size, \ irq_flags, pc); \ if (!event) \ return; \ entry = ring_buffer_event_data(event); \ \ + \ + tstruct \ + \ { assign; } \ \ if (!filter_current_check_discard(event_call, entry, event)) \ diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index a7430b16d243..db6e54bdb596 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -478,12 +478,12 @@ enum { static int is_string_field(const char *type) { + if (strstr(type, "__data_loc") && strstr(type, "char")) + return FILTER_DYN_STRING; + if (strchr(type, '[') && strstr(type, "char")) return FILTER_STATIC_STRING; - if (!strcmp(type, "__str_loc")) - return FILTER_DYN_STRING; - return 0; } -- cgit v1.2.2 From ec081ddc3d90aab35bc0de19a358b964978837cf Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Mon, 1 Jun 2009 15:53:35 +0100 Subject: tracing: add exports to use __print_symbolic and __print_flags from a module A patch to allow the use of __print_symbolic and __print_flags from a module. This allows the current GFS2 tracing patch to build. 
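For illustration, a minimal sketch of what a module-side event could look like once these symbols are exported. Every name below is made up for the example (the real user is the GFS2 tracing code mentioned above), the usual TRACE_SYSTEM/define_trace boilerplate is omitted, and the macro usage follows the existing TRACE_EVENT/__print_symbolic conventions:

#define show_my_state(state)					\
	__print_symbolic(state,					\
		{ 0, "UNLOCKED"  },				\
		{ 1, "SHARED"    },				\
		{ 2, "EXCLUSIVE" })

TRACE_EVENT(mymod_state_change,

	TP_PROTO(int state, unsigned long flags),

	TP_ARGS(state, flags),

	TP_STRUCT__entry(
		__field(	int,		state	)
		__field(	unsigned long,	flags	)
	),

	TP_fast_assign(
		__entry->state	= state;
		__entry->flags	= flags;
	),

	TP_printk("state %s flags 0x%lx",
		  show_my_state(__entry->state), __entry->flags)
);

Since __print_symbolic expands to a call to ftrace_print_symbols_seq() on the per-cpu ftrace_event_seq, the ftrace_event_seq and ftrace_print_symbols_seq exports are what such a header needs in order to link when the event is defined in a module (and likewise ftrace_print_flags_seq for __print_flags).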
Signed-off-by: Steven Whitehouse LKML-Reference: <1243868015.29604.542.camel@localhost.localdomain> Signed-off-by: Steven Rostedt --- kernel/trace/trace_output.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index c12d95db2f56..0fe3b223f7ed 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -17,6 +17,7 @@ static DECLARE_RWSEM(trace_event_mutex); DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq); +EXPORT_PER_CPU_SYMBOL(ftrace_event_seq); static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; @@ -250,6 +251,7 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim, return p->buffer; } +EXPORT_SYMBOL(ftrace_print_flags_seq); const char * ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, @@ -275,6 +277,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, return p->buffer; } +EXPORT_SYMBOL(ftrace_print_symbols_seq); #ifdef CONFIG_KRETPROBES static inline const char *kretprobed(const char *name) -- cgit v1.2.2 From 1d080d6c3141623c92caaebe20e847cb99ccbb60 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 1 Jun 2009 12:20:40 -0400 Subject: tracing: remove redundant SOFTIRQ from softirq event traces After converting the softirq tracer to use te flags options, this caused a regression with the name. Since the flag was used directly it was printed out (i.e. HRTIMER_SOFTIRQ). This patch only shows the softirq name without the SOFTIRQ part. [ Impact: fix regression of output from softirq events ] Signed-off-by: Steven Rostedt --- include/trace/events/irq.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h index 683fb36a9943..b0c7ede55eb1 100644 --- a/include/trace/events/irq.h +++ b/include/trace/events/irq.h @@ -7,18 +7,18 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM irq -#define softirq_name(sirq) { sirq, #sirq } -#define show_softirq_name(val) \ - __print_symbolic(val, \ - softirq_name(HI_SOFTIRQ), \ - softirq_name(TIMER_SOFTIRQ), \ - softirq_name(NET_TX_SOFTIRQ), \ - softirq_name(NET_RX_SOFTIRQ), \ - softirq_name(BLOCK_SOFTIRQ), \ - softirq_name(TASKLET_SOFTIRQ), \ - softirq_name(SCHED_SOFTIRQ), \ - softirq_name(HRTIMER_SOFTIRQ), \ - softirq_name(RCU_SOFTIRQ)) +#define softirq_name(sirq) { sirq##_SOFTIRQ, #sirq } +#define show_softirq_name(val) \ + __print_symbolic(val, \ + softirq_name(HI), \ + softirq_name(TIMER), \ + softirq_name(NET_TX), \ + softirq_name(NET_RX), \ + softirq_name(BLOCK), \ + softirq_name(TASKLET), \ + softirq_name(SCHED), \ + softirq_name(HRTIMER), \ + softirq_name(RCU)) /** * irq_handler_entry - called immediately before the irq action handler -- cgit v1.2.2 From 112f38a7e36e9d688b389507136bf3af3e6d159b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 1 Jun 2009 15:16:05 -0400 Subject: tracing: make trace pipe recognize latency format flag The trace_pipe did not recognize the latency format flag and would produce different output than the trace file. The problem was partly due that the trace flags in the iterator was not set as well as the trace_pipe zeros out part of the iterator (including the flags) to be able to use the same routines as the trace file. trace_flags of the iterator should not cause any problems when not zeroed out by for trace_pipe. 
Reported-by: Johannes Berg Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 2 +- kernel/trace/trace.c | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index bbf40f624fc8..5c093ffc655b 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -51,6 +51,7 @@ struct trace_iterator { int cpu_file; struct mutex mutex; struct ring_buffer_iter *buffer_iter[NR_CPUS]; + unsigned long iter_flags; /* The below is zeroed out in pipe_read */ struct trace_seq seq; @@ -58,7 +59,6 @@ struct trace_iterator { int cpu; u64 ts; - unsigned long iter_flags; loff_t pos; long idx; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a3a8a87d7e91..cae34c69752f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2826,6 +2826,9 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) /* trace pipe does not show start of buffer */ cpumask_setall(iter->started); + if (trace_flags & TRACE_ITER_LATENCY_FMT) + iter->iter_flags |= TRACE_FILE_LAT_FMT; + iter->cpu_file = cpu_file; iter->tr = &global_trace; mutex_init(&iter->mutex); -- cgit v1.2.2 From 0f6ce3de4ef6ff940308087c49760d068851c1a7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 1 Jun 2009 21:51:28 -0400 Subject: ftrace: do not profile functions when disabled A race was found that if one were to enable and disable the function profiler repeatedly, then the system can panic. This was because a profiled function may be preempted just before disabling interrupts. While the profiler is disabled and then reenabled, the preempted function could start again, and access the hash as it is being initialized. This just adds a check in the irq disabled part to check if the profiler is enabled, and if it is not then it will just exit. When the system is disabled, the profile_enabled variable is cleared before calling the unregistering of the function profiler. This unregistering calls stop machine which also acts as a synchronize schedule. [ Impact: fix panic in enabling/disabling function profiler ] Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2074e5b7766b..d6973dfadb36 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -599,7 +599,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip) local_irq_save(flags); stat = &__get_cpu_var(ftrace_profile_stats); - if (!stat->hash) + if (!stat->hash || !ftrace_profile_enabled) goto out; rec = ftrace_find_profiled_func(stat, ip); @@ -630,7 +630,7 @@ static void profile_graph_return(struct ftrace_graph_ret *trace) local_irq_save(flags); stat = &__get_cpu_var(ftrace_profile_stats); - if (!stat->hash) + if (!stat->hash || !ftrace_profile_enabled) goto out; calltime = trace->rettime - trace->calltime; @@ -724,6 +724,10 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, ftrace_profile_enabled = 1; } else { ftrace_profile_enabled = 0; + /* + * unregister_ftrace_profiler calls stop_machine + * so this acts like an synchronize_sched. + */ unregister_ftrace_profiler(); } } -- cgit v1.2.2 From a2023556409cf7fec5d67a26f7fcfa57c5a4086d Mon Sep 17 00:00:00 2001 From: Tim Bird Date: Tue, 2 Jun 2009 17:06:54 -0700 Subject: ring-buffer: fix bug in ring_buffer_discard_commit There's a bug in ring_buffer_discard_commit. 
The wrong pointer is being compared in order to check if the event can be freed from the buffer rather than discarded (i.e. marked as PAD). I noticed this when I was working on duration filtering. The bug is not deadly - it just results in lots of wasted space in the buffer. All filtered events are left in the buffer and marked as discarded, rather than being removed from the buffer to make space for other events. Unfortunately, when I fixed this bug, I got errors doing a filtered function trace. Multiple TIME_EXTEND events pile up in the buffer, and trigger the following loop overage warning in rb_iter_peek(): again: ... if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10)) return NULL; I'm not sure what the best way is to fix this. I don't know if I should extend the loop threshhold, or if I should make the test more complex (ignore TIME_EXTEND events), or just get rid of this loop check completely. Note that if I implement a workaround for this, then I see another problem from rb_advance_iter(). I haven't tracked that one down yet. In general, it seems like the case of removing filtered events has not been working properly, and so some assumptions about buffer invariant conditions need to be revisited. Here's the patch for the simple fix: Compare correct pointer for checking if an event can be freed rather than left as discarded in the buffer. Signed-off-by: Tim Bird LKML-Reference: <4A25BE9E.5090909@am.sony.com> Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 16b24d49604c..945302368691 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1708,7 +1708,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, bpage = cpu_buffer->tail_page; - if (bpage == (void *)addr && rb_page_write(bpage) == old_index) { + if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { /* * This is on the tail page. It is possible that * a write could come in and move the tail page -- cgit v1.2.2 From edd813bffc62a980bb4fb9b1243f31c1cce78da3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 2 Jun 2009 23:00:53 -0400 Subject: ring-buffer: try to discard unneeded timestamps There are times that a race may happen that we add a timestamp in a nested write. This timestamp would just contain a zero delta and serves no purpose. Now that we have a way to discard events, this patch will try to discard the timestamp instead of just wasting the space in the ring buffer. Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 67 ++++++++++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 26 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 945302368691..50926601a28d 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1335,6 +1335,38 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, return event; } +static inline int +rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + unsigned long new_index, old_index; + struct buffer_page *bpage; + unsigned long index; + unsigned long addr; + + new_index = rb_event_index(event); + old_index = new_index + rb_event_length(event); + addr = (unsigned long)event; + addr &= PAGE_MASK; + + bpage = cpu_buffer->tail_page; + + if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { + /* + * This is on the tail page. 
It is possible that + * a write could come in and move the tail page + * and write to the next page. That is fine + * because we just shorten what is on this page. + */ + index = local_cmpxchg(&bpage->write, old_index, new_index); + if (index == old_index) + return 1; + } + + /* could not discard */ + return 0; +} + static int rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, u64 *delta) @@ -1384,10 +1416,13 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, /* let the caller know this was the commit */ ret = 1; } else { - /* Darn, this is just wasted space */ - event->time_delta = 0; - event->array[0] = 0; - ret = 0; + /* Try to discard the event */ + if (!rb_try_to_discard(cpu_buffer, event)) { + /* Darn, this is just wasted space */ + event->time_delta = 0; + event->array[0] = 0; + ret = 0; + } } *delta = 0; @@ -1682,10 +1717,6 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) { struct ring_buffer_per_cpu *cpu_buffer; - unsigned long new_index, old_index; - struct buffer_page *bpage; - unsigned long index; - unsigned long addr; int cpu; /* The event is discarded regardless */ @@ -1701,24 +1732,8 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, cpu = smp_processor_id(); cpu_buffer = buffer->buffers[cpu]; - new_index = rb_event_index(event); - old_index = new_index + rb_event_length(event); - addr = (unsigned long)event; - addr &= PAGE_MASK; - - bpage = cpu_buffer->tail_page; - - if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { - /* - * This is on the tail page. It is possible that - * a write could come in and move the tail page - * and write to the next page. That is fine - * because we just shorten what is on this page. - */ - index = local_cmpxchg(&bpage->write, old_index, new_index); - if (index == old_index) - goto out; - } + if (!rb_try_to_discard(cpu_buffer, event)) + goto out; /* * The commit is still visible by the reader, so we -- cgit v1.2.2 From ea05b57cc19234d8de9887c8a32c2e58e84b56ba Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 3 Jun 2009 09:30:10 -0400 Subject: ring-buffer: discard timestamps that are at the start of the buffer Every buffer page in the ring buffer includes its own time stamp. When an event is recorded to the ring buffer with a delta time greater than what can be held in the event header, a time stamp event is created. If the the create timestamp falls over to the next buffer page, it is redundant because the buffer page holds a full time stamp. This patch will try to discard the time stamp when it falls to the start of the next page. This change also fixes a issues with disarding events. If most events are discarded, timestamps will start to creep into the ring buffer. If we do not discard the timestamps then they can fill up the ring buffer over time and waste space. This change will keep time stamps from filling up over another page. If something is recorded in the buffer page, and the rest is filtered, then the time stamps can only fill up to the end of the page. 
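Concretely, each buffer page begins with a full timestamp of its own, which is why a time-extend event landing at offset zero of a page carries no information. A rough sketch of the page layout (field names as used by ring_buffer.c, shown here only for illustration):

struct buffer_data_page {
	u64		time_stamp;	/* full time stamp for this page */
	local_t		commit;		/* write committed index */
	unsigned char	data[];		/* events start here, at offset 0 */
};

A timestamp whose event would begin at data[0] can therefore be discarded outright instead of being written as a zero-delta filler.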
[ Impact: prevent time stamps from filling ring buffer ] Reported-by: Tim Bird Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 50926601a28d..7102d7a2fadb 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -370,6 +370,9 @@ static inline int test_time_stamp(u64 delta) /* Max payload is BUF_PAGE_SIZE - header (8bytes) */ #define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) +/* Max number of timestamps that can fit on a page */ +#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_STAMP) + int ring_buffer_print_page_header(struct trace_seq *s) { struct buffer_data_page field; @@ -1409,8 +1412,12 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, event->array[0] = *delta >> TS_SHIFT; } else { cpu_buffer->commit_page->page->time_stamp = *ts; - event->time_delta = 0; - event->array[0] = 0; + /* try to discard, since we do not need this */ + if (!rb_try_to_discard(cpu_buffer, event)) { + /* nope, just zero it */ + event->time_delta = 0; + event->array[0] = 0; + } } cpu_buffer->write_stamp = *ts; /* let the caller know this was the commit */ @@ -2268,8 +2275,8 @@ static void rb_advance_iter(struct ring_buffer_iter *iter) * Check if we are at the end of the buffer. */ if (iter->head >= rb_page_size(iter->head_page)) { - if (RB_WARN_ON(buffer, - iter->head_page == cpu_buffer->commit_page)) + /* discarded commits can make the page empty */ + if (iter->head_page == cpu_buffer->commit_page) return; rb_inc_iter(iter); return; @@ -2312,12 +2319,10 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) /* * We repeat when a timestamp is encountered. It is possible * to get multiple timestamps from an interrupt entering just - * as one timestamp is about to be written. The max times - * that this can happen is the number of nested interrupts we - * can have. Nesting 10 deep of interrupts is clearly - * an anomaly. + * as one timestamp is about to be written, or from discarded + * commits. The most that we can have is the number on a single page. */ - if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10)) + if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) return NULL; reader = rb_get_reader_page(cpu_buffer); @@ -2383,14 +2388,14 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) again: /* - * We repeat when a timestamp is encountered. It is possible - * to get multiple timestamps from an interrupt entering just - * as one timestamp is about to be written. The max times - * that this can happen is the number of nested interrupts we - * can have. Nesting 10 deep of interrupts is clearly - * an anomaly. + * We repeat when a timestamp is encountered. + * We can get multiple timestamps by nested interrupts or also + * if filtering is on (discarding commits). Since discarding + * commits can be frequent we can get a lot of timestamps. + * But we limit them by not adding timestamps if they begin + * at the start of a page. 
*/ - if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10)) + if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) return NULL; if (rb_per_cpu_empty(cpu_buffer)) -- cgit v1.2.2 From 083a63b48e4dd0a6a2d44216720076dc81ebb255 Mon Sep 17 00:00:00 2001 From: walimis Date: Wed, 3 Jun 2009 16:01:28 +0800 Subject: tracing/trace_stack: fix the number of entries in the header The last entry in the stack_dump_trace is ULONG_MAX, which is not a valid entry, but max_stack_trace.nr_entries has accounted for it. So when printing the header, we should decrease it by one. Before fix, print as following, for example: Depth Size Location (53 entries) <--- should be 52 ----- ---- -------- 0) 3264 108 update_wall_time+0x4d5/0x9a0 ... 51) 80 80 syscall_call+0x7/0xb ^^^ it's correct. Signed-off-by: walimis LKML-Reference: <1244016090-7814-1-git-send-email-walimisdev@gmail.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_stack.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 1796f00524e1..2d7aebd71dbd 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -265,7 +265,7 @@ static int t_show(struct seq_file *m, void *v) seq_printf(m, " Depth Size Location" " (%d entries)\n" " ----- ---- --------\n", - max_stack_trace.nr_entries); + max_stack_trace.nr_entries - 1); if (!stack_tracer_enabled && !max_stack_size) print_disabled(m); -- cgit v1.2.2 From f11b3f4e2932bfdcfc458ab8d1ece62724ceabfc Mon Sep 17 00:00:00 2001 From: walimis Date: Wed, 3 Jun 2009 16:01:29 +0800 Subject: tracing/events: fix output format of kernel stack According to "events/ftrace/kernel_stack/format", output format of kernel stack should use "=>" instead of "<=". The second problem is that we shouldn't skip the first entry in the stack, although it seems to be duplicated when used in the "function" tracer, but events also use it. If we skip the first one, we will drop the topmost entry of the stack. The last problem is that if the last entry is ULONG_MAX(0xffffffff), we should drop it, otherwise it will print a NULL name line. 
before fix: sh-1072 [000] 26.957239: sched_process_fork: parent sh:1072 child sh:1073 sh-1072 [000] 26.957262: <= syscall_call <= sh-1072 [000] 26.957744: sched_switch: task sh:1072 [120] (R) ==> sh:1073 [120] sh-1072 [000] 26.957752: <= preempt_schedule <= wake_up_new_task <= do_fork <= sys_clone <= syscall_call <= After fix: sh-1075 [000] 39.791848: sched_process_fork: parent sh:1075 child sh:1076 sh-1075 [000] 39.791871: => sys_clone => syscall_call sh-1075 [000] 39.792713: sched_switch: task sh:1075 [120] (R) ==> sh:1076 [120] sh-1075 [000] 39.792722: => schedule => preempt_schedule => wake_up_new_task => do_fork => sys_clone => syscall_call Signed-off-by: walimis LKML-Reference: <1244016090-7814-2-git-send-email-walimisdev@gmail.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_output.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 0fe3b223f7ed..64596a571609 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -975,16 +975,16 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); + if (!trace_seq_puts(s, "\n")) + goto partial; for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { - if (!field->caller[i]) + if (!field->caller[i] || (field->caller[i] == ULONG_MAX)) break; - if (i) { - if (!trace_seq_puts(s, " <= ")) - goto partial; + if (!trace_seq_puts(s, " => ")) + goto partial; - if (!seq_print_ip_sym(s, field->caller[i], flags)) - goto partial; - } + if (!seq_print_ip_sym(s, field->caller[i], flags)) + goto partial; if (!trace_seq_puts(s, "\n")) goto partial; } -- cgit v1.2.2 From 048dc50c5e7eada19ebabbad70b7966d14283d41 Mon Sep 17 00:00:00 2001 From: walimis Date: Wed, 3 Jun 2009 16:01:30 +0800 Subject: tracing/events: fix output format of user stack According to "events/ftrace/user_stack/format", fix the output of user stack. 
before fix: sh-1073 [000] 31.137561: <- <0804e33c> <- <080835c1> after fix: sh-1072 [000] 37.039329: => => <0804e33c> => <080835c1> Signed-off-by: walimis LKML-Reference: <1244016090-7814-3-git-send-email-walimisdev@gmail.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_output.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 64596a571609..8dadbbbd2d5c 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -389,17 +389,20 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, if (ip == ULONG_MAX || !ret) break; - if (i && ret) - ret = trace_seq_puts(s, " <- "); + if (ret) + ret = trace_seq_puts(s, " => "); if (!ip) { if (ret) ret = trace_seq_puts(s, "??"); + if (ret) + ret = trace_seq_puts(s, "\n"); continue; } if (!ret) break; if (ret) ret = seq_print_user_ip(s, mm, ip, sym_flags); + ret = trace_seq_puts(s, "\n"); } if (mm) @@ -1012,10 +1015,10 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); - if (!seq_print_userip_objs(field, s, flags)) + if (!trace_seq_putc(s, '\n')) goto partial; - if (!trace_seq_putc(s, '\n')) + if (!seq_print_userip_objs(field, s, flags)) goto partial; return TRACE_TYPE_HANDLED; -- cgit v1.2.2 From 56d8bd3f0b98972312cad683947ec90b21011199 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 3 Jun 2009 14:52:03 +0100 Subject: tracing: fix multiple use of __print_flags and __print_symbolic Here is an updated patch to include the extra call to trace_seq_init() as requested. This is vs. the latest -tip tree and fixes the use of multiple __print_flags and __print_symbolic in a single tracer. Also tested to ensure its working now: mount.gfs2-2534 [000] 235.850587: gfs2_glock_queue: 8.7 glock 1:2 dequeue PR mount.gfs2-2534 [000] 235.850591: gfs2_demote_rq: 8.7 glock 1:0 demote EX to NL flags:DI mount.gfs2-2534 [000] 235.850591: gfs2_glock_queue: 8.7 glock 1:0 dequeue EX glock_workqueue-2529 [000] 235.850666: gfs2_glock_state_change: 8.7 glock 1:0 state EX => NL tgt:NL dmt:NL flags:lDpI glock_workqueue-2529 [000] 235.850672: gfs2_glock_put: 8.7 glock 1:0 state NL => IV flags:I Signed-off-by: Steven Whitehouse LKML-Reference: <1244037123.29604.603.camel@localhost.localdomain> Signed-off-by: Steven Rostedt --- include/trace/ftrace.h | 2 ++ kernel/trace/trace_output.c | 10 ++++------ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index b5478dab579b..40ede4db4d88 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -104,6 +104,7 @@ * field = (typeof(field))entry; * * p = get_cpu_var(ftrace_event_seq); + * trace_seq_init(p); * ret = trace_seq_printf(s, "\n"); * put_cpu(); * if (!ret) @@ -167,6 +168,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ field = (typeof(field))entry; \ \ p = &get_cpu_var(ftrace_event_seq); \ + trace_seq_init(p); \ ret = trace_seq_printf(s, #call ": " print); \ put_cpu(); \ if (!ret) \ diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 8dadbbbd2d5c..8afeea412e77 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -223,10 +223,9 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim, { unsigned long mask; const char *str; + const char *ret = p->buffer + p->len; int i; - trace_seq_init(p); - for (i = 0; flag_array[i].name && flags; i++) { mask = flag_array[i].mask; @@ 
-249,7 +248,7 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim, trace_seq_putc(p, 0); - return p->buffer; + return ret; } EXPORT_SYMBOL(ftrace_print_flags_seq); @@ -258,8 +257,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, const struct trace_print_flags *symbol_array) { int i; - - trace_seq_init(p); + const char *ret = p->buffer + p->len; for (i = 0; symbol_array[i].name; i++) { @@ -275,7 +273,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, trace_seq_putc(p, 0); - return p->buffer; + return ret; } EXPORT_SYMBOL(ftrace_print_symbols_seq); -- cgit v1.2.2 From 563af16c30ede41eda2d614195d88e07f7c7103d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 3 Jun 2009 11:10:44 -0400 Subject: tracing: add annotation to what type of stack trace is recorded The current method of printing out a stack trace is to add a new line and print out the trace: yum-updatesd-3120 [002] 573.691303: => do_softirq => irq_exit => smp_apic_timer_interrupt => apic_timer_interrupt This looks a bit awkward, and if we have both stack and user stack traces running, it would be nice to have a title to tell them apart, although it is easy to tell by the output. This patch adds an annotation to the start of the stack traces: init-1 [003] 929.304979: => user_path_at => vfs_fstatat => vfs_stat => sys_newstat => system_call_fastpath cat-3459 [002] 1016.824040: => <0000003aae6c0250> => <00007ffff4b06ae4> => <69636172742f6775> Signed-off-by: Steven Rostedt --- kernel/trace/trace_output.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 8afeea412e77..425725c1622d 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -976,7 +976,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); - if (!trace_seq_puts(s, "\n")) + if (!trace_seq_puts(s, "\n")) goto partial; for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { if (!field->caller[i] || (field->caller[i] == ULONG_MAX)) @@ -1013,7 +1013,7 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); - if (!trace_seq_putc(s, '\n')) + if (!trace_seq_puts(s, "\n")) goto partial; if (!seq_print_userip_objs(field, s, flags)) -- cgit v1.2.2 From 1f8a6a10fb9437eac3f516ea4324a19087872f30 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 8 Jun 2009 18:18:39 +0200 Subject: ring-buffer: pass in lockdep class key for reader_lock On Sun, 7 Jun 2009, Ingo Molnar wrote: > Testing tracer sched_switch: <6>Starting ring buffer hammer > PASSED > Testing tracer sysprof: PASSED > Testing tracer function: PASSED > Testing tracer irqsoff: > ============================================= > PASSED > Testing tracer preemptoff: PASSED > Testing tracer preemptirqsoff: [ INFO: possible recursive locking detected ] > PASSED > Testing tracer branch: 2.6.30-rc8-tip-01972-ge5b9078-dirty #5760 > --------------------------------------------- > rb_consumer/431 is trying to acquire lock: > (&cpu_buffer->reader_lock){......}, at: [] ring_buffer_reset_cpu+0x37/0x70 > > but task is already holding lock: > (&cpu_buffer->reader_lock){......}, at: [] ring_buffer_consume+0x7e/0xc0 > > other info that might help us debug this: > 1 lock held by rb_consumer/431: > #0: (&cpu_buffer->reader_lock){......}, at: [] ring_buffer_consume+0x7e/0xc0 The ring buffer is a generic structure, and can be used outside of ftrace. 
If ftrace traces within the use of the ring buffer, it can produce false positives with lockdep. This patch passes in a static lock key into the allocation of the ring buffer, so that different ring buffers will have their own lock class. Reported-by: Ingo Molnar Signed-off-by: Peter Zijlstra LKML-Reference: <1244477919.13761.9042.camel@twins> [ store key in ring buffer descriptor ] Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 14 +++++++++++++- kernel/trace/ring_buffer.c | 9 +++++++-- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index f1345828c7c5..8670f1575fe1 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -105,7 +105,19 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, * size is in bytes for each per CPU buffer. */ struct ring_buffer * -ring_buffer_alloc(unsigned long size, unsigned flags); +__ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *key); + +/* + * Because the ring buffer is generic, if other users of the ring buffer get + * traced by ftrace, it can produce lockdep warnings. We need to keep each + * ring buffer's lock class separate. + */ +#define ring_buffer_alloc(size, flags) \ +({ \ + static struct lock_class_key __key; \ + __ring_buffer_alloc((size), (flags), &__key); \ +}) + void ring_buffer_free(struct ring_buffer *buffer); int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7102d7a2fadb..22878b0d370c 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -426,6 +426,8 @@ struct ring_buffer { atomic_t record_disabled; cpumask_var_t cpumask; + struct lock_class_key *reader_lock_key; + struct mutex mutex; struct ring_buffer_per_cpu **buffers; @@ -565,6 +567,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) cpu_buffer->cpu = cpu; cpu_buffer->buffer = buffer; spin_lock_init(&cpu_buffer->reader_lock); + lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; INIT_LIST_HEAD(&cpu_buffer->pages); @@ -635,7 +638,8 @@ static int rb_cpu_notify(struct notifier_block *self, * when the buffer wraps. If this flag is not set, the buffer will * drop data when the tail hits the head. */ -struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) +struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, + struct lock_class_key *key) { struct ring_buffer *buffer; int bsize; @@ -658,6 +662,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); buffer->flags = flags; buffer->clock = trace_clock_local; + buffer->reader_lock_key = key; /* need at least two pages */ if (buffer->pages == 1) @@ -715,7 +720,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags) kfree(buffer); return NULL; } -EXPORT_SYMBOL_GPL(ring_buffer_alloc); +EXPORT_SYMBOL_GPL(__ring_buffer_alloc); /** * ring_buffer_free - free a ring buffer. -- cgit v1.2.2 From f57a8a1911342265e7acdc190333c4e9235a6632 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 5 Jun 2009 14:11:30 -0400 Subject: ring-buffer: fix ret in rb_add_time_stamp The update of ret got mistakenly added to the if statement of rb_try_to_discard. The variable ret should be 1 on commit and zero otherwise. 
[ Impact: fix compiler warning and real bug ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 22878b0d370c..2e642b2b7253 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1433,8 +1433,8 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, /* Darn, this is just wasted space */ event->time_delta = 0; event->array[0] = 0; - ret = 0; } + ret = 0; } *delta = 0; -- cgit v1.2.2 From 55782138e47d9baf2f7d3a7af9e7cf42adf72c56 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 9 Jun 2009 13:43:05 +0800 Subject: tracing/events: convert block trace points to TRACE_EVENT() TRACE_EVENT is a more generic way to define tracepoints. Doing so adds these new capabilities to this tracepoint: - zero-copy and per-cpu splice() tracing - binary tracing without printf overhead - structured logging records exposed under /debug/tracing/events - trace events embedded in function tracer output and other plugins - user-defined, per tracepoint filter expressions ... Cons: - no dev_t info for the output of plug, unplug_timer and unplug_io events. no dev_t info for getrq and sleeprq events if bio == NULL. no dev_t info for rq_abort,...,rq_requeue events if rq->rq_disk == NULL. This is mainly because we can't get the deivce from a request queue. But this may change in the future. - A packet command is converted to a string in TP_assign, not TP_print. While blktrace do the convertion just before output. Since pc requests should be rather rare, this is not a big issue. - In blktrace, an event can have 2 different print formats, but a TRACE_EVENT has a unique format, which means we have some unused data in a trace entry. The overhead is minimized by using __dynamic_array() instead of __array(). I've benchmarked the ioctl blktrace vs the splice based TRACE_EVENT tracing: dd dd + ioctl blktrace dd + TRACE_EVENT (splice) 1 7.36s, 42.7 MB/s 7.50s, 42.0 MB/s 7.41s, 42.5 MB/s 2 7.43s, 42.3 MB/s 7.48s, 42.1 MB/s 7.43s, 42.4 MB/s 3 7.38s, 42.6 MB/s 7.45s, 42.2 MB/s 7.41s, 42.5 MB/s So the overhead of tracing is very small, and no regression when using those trace events vs blktrace. 
And the binary output of TRACE_EVENT is much smaller than blktrace: # ls -l -h -rw-r--r-- 1 root root 8.8M 06-09 13:24 sda.blktrace.0 -rw-r--r-- 1 root root 195K 06-09 13:24 sda.blktrace.1 -rw-r--r-- 1 root root 2.7M 06-09 13:25 trace_splice.out Following are some comparisons between TRACE_EVENT and blktrace: plug: kjournald-480 [000] 303.084981: block_plug: [kjournald] kjournald-480 [000] 303.084981: 8,0 P N [kjournald] unplug_io: kblockd/0-118 [000] 300.052973: block_unplug_io: [kblockd/0] 1 kblockd/0-118 [000] 300.052974: 8,0 U N [kblockd/0] 1 remap: kjournald-480 [000] 303.085042: block_remap: 8,0 W 102736992 + 8 <- (8,8) 33384 kjournald-480 [000] 303.085043: 8,0 A W 102736992 + 8 <- (8,8) 33384 bio_backmerge: kjournald-480 [000] 303.085086: block_bio_backmerge: 8,0 W 102737032 + 8 [kjournald] kjournald-480 [000] 303.085086: 8,0 M W 102737032 + 8 [kjournald] getrq: kjournald-480 [000] 303.084974: block_getrq: 8,0 W 102736984 + 8 [kjournald] kjournald-480 [000] 303.084975: 8,0 G W 102736984 + 8 [kjournald] bash-2066 [001] 1072.953770: 8,0 G N [bash] bash-2066 [001] 1072.953773: block_getrq: 0,0 N 0 + 0 [bash] rq_complete: konsole-2065 [001] 300.053184: block_rq_complete: 8,0 W () 103669040 + 16 [0] konsole-2065 [001] 300.053191: 8,0 C W 103669040 + 16 [0] ksoftirqd/1-7 [001] 1072.953811: 8,0 C N (5a 00 08 00 00 00 00 00 24 00) [0] ksoftirqd/1-7 [001] 1072.953813: block_rq_complete: 0,0 N (5a 00 08 00 00 00 00 00 24 00) 0 + 0 [0] rq_insert: kjournald-480 [000] 303.084985: block_rq_insert: 8,0 W 0 () 102736984 + 8 [kjournald] kjournald-480 [000] 303.084986: 8,0 I W 102736984 + 8 [kjournald] Changelog from v2 -> v3: - use the newly introduced __dynamic_array(). Changelog from v1 -> v2: - use __string() instead of __array() to minimize the memory required to store hex dump of rq->cmd(). - support large pc requests. - add missing blk_fill_rwbs_rq() in block_rq_requeue TRACE_EVENT. - some cleanups. 
Signed-off-by: Li Zefan LKML-Reference: <4A2DF669.5070905@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- block/blk-core.c | 16 +- block/elevator.c | 8 +- drivers/md/dm.c | 5 +- fs/bio.c | 3 +- include/linux/blktrace_api.h | 13 ++ include/trace/block.h | 76 ------- include/trace/events/block.h | 483 +++++++++++++++++++++++++++++++++++++++++++ kernel/trace/Makefile | 5 +- kernel/trace/blktrace.c | 78 ++++++- mm/bounce.c | 5 +- 10 files changed, 588 insertions(+), 104 deletions(-) delete mode 100644 include/trace/block.h create mode 100644 include/trace/events/block.h diff --git a/block/blk-core.c b/block/blk-core.c index 1306de9cce04..9475bf99b891 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -28,22 +28,14 @@ #include #include #include -#include + +#define CREATE_TRACE_POINTS +#include #include "blk.h" -DEFINE_TRACE(block_plug); -DEFINE_TRACE(block_unplug_io); -DEFINE_TRACE(block_unplug_timer); -DEFINE_TRACE(block_getrq); -DEFINE_TRACE(block_sleeprq); -DEFINE_TRACE(block_rq_requeue); -DEFINE_TRACE(block_bio_backmerge); -DEFINE_TRACE(block_bio_frontmerge); -DEFINE_TRACE(block_bio_queue); -DEFINE_TRACE(block_rq_complete); -DEFINE_TRACE(block_remap); /* Also used in drivers/md/dm.c */ EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap); +EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); static int __make_request(struct request_queue *q, struct bio *bio); diff --git a/block/elevator.c b/block/elevator.c index 7073a9072577..e220f0c543e3 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -33,17 +33,16 @@ #include #include #include -#include #include #include +#include + #include "blk.h" static DEFINE_SPINLOCK(elv_list_lock); static LIST_HEAD(elv_list); -DEFINE_TRACE(block_rq_abort); - /* * Merge hash stuff. */ @@ -55,9 +54,6 @@ static const int elv_hash_shift = 6; #define rq_hash_key(rq) ((rq)->sector + (rq)->nr_sectors) #define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash)) -DEFINE_TRACE(block_rq_insert); -DEFINE_TRACE(block_rq_issue); - /* * Query io scheduler to see if the current process issuing bio may be * merged with rq. diff --git a/drivers/md/dm.c b/drivers/md/dm.c index e2ee4a79ea2c..3fd8b1e65483 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -20,7 +20,8 @@ #include #include #include -#include + +#include #define DM_MSG_PREFIX "core" @@ -53,8 +54,6 @@ struct dm_target_io { union map_info info; }; -DEFINE_TRACE(block_bio_complete); - /* * For request-based dm. * One of these is allocated per request. diff --git a/fs/bio.c b/fs/bio.c index 98711647ece4..740699c4f90c 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -26,10 +26,9 @@ #include #include #include -#include #include /* for struct sg_iovec */ -DEFINE_TRACE(block_split); +#include /* * Test patch to inline a certain number of bi_io_vec's inside the bio diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 82b4636030e9..c7ec31dd04c9 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -218,5 +218,18 @@ static inline int blk_trace_init_sysfs(struct device *dev) #endif /* CONFIG_BLK_DEV_IO_TRACE */ +#ifdef CONFIG_EVENT_TRACING + +static inline int blk_cmd_buf_len(struct request *rq) +{ + return blk_pc_request(rq) ? 
rq->cmd_len * 3 : 1; +} + +extern void blk_dump_cmd(char *buf, struct request *rq); +extern void blk_fill_rwbs(char *rwbs, u32 rw, int bytes); +extern void blk_fill_rwbs_rq(char *rwbs, struct request *rq); + +#endif /* CONFIG_EVENT_TRACING */ + #endif /* __KERNEL__ */ #endif diff --git a/include/trace/block.h b/include/trace/block.h deleted file mode 100644 index 5b12efa096b6..000000000000 --- a/include/trace/block.h +++ /dev/null @@ -1,76 +0,0 @@ -#ifndef _TRACE_BLOCK_H -#define _TRACE_BLOCK_H - -#include -#include - -DECLARE_TRACE(block_rq_abort, - TP_PROTO(struct request_queue *q, struct request *rq), - TP_ARGS(q, rq)); - -DECLARE_TRACE(block_rq_insert, - TP_PROTO(struct request_queue *q, struct request *rq), - TP_ARGS(q, rq)); - -DECLARE_TRACE(block_rq_issue, - TP_PROTO(struct request_queue *q, struct request *rq), - TP_ARGS(q, rq)); - -DECLARE_TRACE(block_rq_requeue, - TP_PROTO(struct request_queue *q, struct request *rq), - TP_ARGS(q, rq)); - -DECLARE_TRACE(block_rq_complete, - TP_PROTO(struct request_queue *q, struct request *rq), - TP_ARGS(q, rq)); - -DECLARE_TRACE(block_bio_bounce, - TP_PROTO(struct request_queue *q, struct bio *bio), - TP_ARGS(q, bio)); - -DECLARE_TRACE(block_bio_complete, - TP_PROTO(struct request_queue *q, struct bio *bio), - TP_ARGS(q, bio)); - -DECLARE_TRACE(block_bio_backmerge, - TP_PROTO(struct request_queue *q, struct bio *bio), - TP_ARGS(q, bio)); - -DECLARE_TRACE(block_bio_frontmerge, - TP_PROTO(struct request_queue *q, struct bio *bio), - TP_ARGS(q, bio)); - -DECLARE_TRACE(block_bio_queue, - TP_PROTO(struct request_queue *q, struct bio *bio), - TP_ARGS(q, bio)); - -DECLARE_TRACE(block_getrq, - TP_PROTO(struct request_queue *q, struct bio *bio, int rw), - TP_ARGS(q, bio, rw)); - -DECLARE_TRACE(block_sleeprq, - TP_PROTO(struct request_queue *q, struct bio *bio, int rw), - TP_ARGS(q, bio, rw)); - -DECLARE_TRACE(block_plug, - TP_PROTO(struct request_queue *q), - TP_ARGS(q)); - -DECLARE_TRACE(block_unplug_timer, - TP_PROTO(struct request_queue *q), - TP_ARGS(q)); - -DECLARE_TRACE(block_unplug_io, - TP_PROTO(struct request_queue *q), - TP_ARGS(q)); - -DECLARE_TRACE(block_split, - TP_PROTO(struct request_queue *q, struct bio *bio, unsigned int pdu), - TP_ARGS(q, bio, pdu)); - -DECLARE_TRACE(block_remap, - TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev, - sector_t from), - TP_ARGS(q, bio, dev, from)); - -#endif diff --git a/include/trace/events/block.h b/include/trace/events/block.h new file mode 100644 index 000000000000..a99d1e565bb0 --- /dev/null +++ b/include/trace/events/block.h @@ -0,0 +1,483 @@ +#if !defined(_TRACE_BLOCK_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_BLOCK_H + +#include +#include +#include + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM block + +TRACE_EVENT(block_rq_abort, + + TP_PROTO(struct request_queue *q, struct request *rq), + + TP_ARGS(q, rq), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, sector ) + __field( unsigned int, nr_sector ) + __field( int, errors ) + __array( char, rwbs, 6 ) + __dynamic_array( char, cmd, blk_cmd_buf_len(rq) ) + ), + + TP_fast_assign( + __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0; + __entry->sector = blk_pc_request(rq) ? 0 : rq->hard_sector; + __entry->nr_sector = blk_pc_request(rq) ? 
+ 0 : rq->hard_nr_sectors; + __entry->errors = rq->errors; + + blk_fill_rwbs_rq(__entry->rwbs, rq); + blk_dump_cmd(__get_str(cmd), rq); + ), + + TP_printk("%d,%d %s (%s) %llu + %u [%d]", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rwbs, __get_str(cmd), + __entry->sector, __entry->nr_sector, __entry->errors) +); + +TRACE_EVENT(block_rq_insert, + + TP_PROTO(struct request_queue *q, struct request *rq), + + TP_ARGS(q, rq), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, sector ) + __field( unsigned int, nr_sector ) + __field( unsigned int, bytes ) + __array( char, rwbs, 6 ) + __array( char, comm, TASK_COMM_LEN ) + __dynamic_array( char, cmd, blk_cmd_buf_len(rq) ) + ), + + TP_fast_assign( + __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0; + __entry->sector = blk_pc_request(rq) ? 0 : rq->hard_sector; + __entry->nr_sector = blk_pc_request(rq) ? + 0 : rq->hard_nr_sectors; + __entry->bytes = blk_pc_request(rq) ? rq->data_len : 0; + + blk_fill_rwbs_rq(__entry->rwbs, rq); + blk_dump_cmd(__get_str(cmd), rq); + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + ), + + TP_printk("%d,%d %s %u (%s) %llu + %u [%s]", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rwbs, __entry->bytes, __get_str(cmd), + __entry->sector, __entry->nr_sector, __entry->comm) +); + +TRACE_EVENT(block_rq_issue, + + TP_PROTO(struct request_queue *q, struct request *rq), + + TP_ARGS(q, rq), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, sector ) + __field( unsigned int, nr_sector ) + __field( unsigned int, bytes ) + __array( char, rwbs, 6 ) + __array( char, comm, TASK_COMM_LEN ) + __dynamic_array( char, cmd, blk_cmd_buf_len(rq) ) + ), + + TP_fast_assign( + __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0; + __entry->sector = blk_pc_request(rq) ? 0 : rq->hard_sector; + __entry->nr_sector = blk_pc_request(rq) ? + 0 : rq->hard_nr_sectors; + __entry->bytes = blk_pc_request(rq) ? rq->data_len : 0; + + blk_fill_rwbs_rq(__entry->rwbs, rq); + blk_dump_cmd(__get_str(cmd), rq); + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + ), + + TP_printk("%d,%d %s %u (%s) %llu + %u [%s]", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rwbs, __entry->bytes, __get_str(cmd), + __entry->sector, __entry->nr_sector, __entry->comm) +); + +TRACE_EVENT(block_rq_requeue, + + TP_PROTO(struct request_queue *q, struct request *rq), + + TP_ARGS(q, rq), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, sector ) + __field( unsigned int, nr_sector ) + __field( int, errors ) + __array( char, rwbs, 6 ) + __dynamic_array( char, cmd, blk_cmd_buf_len(rq) ) + ), + + TP_fast_assign( + __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0; + __entry->sector = blk_pc_request(rq) ? 0 : rq->hard_sector; + __entry->nr_sector = blk_pc_request(rq) ? + 0 : rq->hard_nr_sectors; + __entry->errors = rq->errors; + + blk_fill_rwbs_rq(__entry->rwbs, rq); + blk_dump_cmd(__get_str(cmd), rq); + ), + + TP_printk("%d,%d %s (%s) %llu + %u [%d]", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rwbs, __get_str(cmd), + __entry->sector, __entry->nr_sector, __entry->errors) +); + +TRACE_EVENT(block_rq_complete, + + TP_PROTO(struct request_queue *q, struct request *rq), + + TP_ARGS(q, rq), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, sector ) + __field( unsigned int, nr_sector ) + __field( int, errors ) + __array( char, rwbs, 6 ) + __dynamic_array( char, cmd, blk_cmd_buf_len(rq) ) + ), + + TP_fast_assign( + __entry->dev = rq->rq_disk ? 
disk_devt(rq->rq_disk) : 0; + __entry->sector = blk_pc_request(rq) ? 0 : rq->hard_sector; + __entry->nr_sector = blk_pc_request(rq) ? + 0 : rq->hard_nr_sectors; + __entry->errors = rq->errors; + + blk_fill_rwbs_rq(__entry->rwbs, rq); + blk_dump_cmd(__get_str(cmd), rq); + ), + + TP_printk("%d,%d %s (%s) %llu + %u [%d]", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rwbs, __get_str(cmd), + __entry->sector, __entry->nr_sector, __entry->errors) +); +TRACE_EVENT(block_bio_bounce, + + TP_PROTO(struct request_queue *q, struct bio *bio), + + TP_ARGS(q, bio), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, sector ) + __field( unsigned int, nr_sector ) + __array( char, rwbs, 6 ) + __array( char, comm, TASK_COMM_LEN ) + ), + + TP_fast_assign( + __entry->dev = bio->bi_bdev->bd_dev; + __entry->sector = bio->bi_sector; + __entry->nr_sector = bio->bi_size >> 9; + blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + ), + + TP_printk("%d,%d %s %llu + %u [%s]", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, + __entry->sector, __entry->nr_sector, __entry->comm) +); + +TRACE_EVENT(block_bio_complete, + + TP_PROTO(struct request_queue *q, struct bio *bio), + + TP_ARGS(q, bio), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, sector ) + __field( unsigned, nr_sector ) + __field( int, error ) + __array( char, rwbs, 6 ) + ), + + TP_fast_assign( + __entry->dev = bio->bi_bdev->bd_dev; + __entry->sector = bio->bi_sector; + __entry->nr_sector = bio->bi_size >> 9; + blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); + ), + + TP_printk("%d,%d %s %llu + %u [%d]", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, + __entry->sector, __entry->nr_sector, __entry->error) +); + +TRACE_EVENT(block_bio_backmerge, + + TP_PROTO(struct request_queue *q, struct bio *bio), + + TP_ARGS(q, bio), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, sector ) + __field( unsigned int, nr_sector ) + __array( char, rwbs, 6 ) + __array( char, comm, TASK_COMM_LEN ) + ), + + TP_fast_assign( + __entry->dev = bio->bi_bdev->bd_dev; + __entry->sector = bio->bi_sector; + __entry->nr_sector = bio->bi_size >> 9; + blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + ), + + TP_printk("%d,%d %s %llu + %u [%s]", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, + __entry->sector, __entry->nr_sector, __entry->comm) +); + +TRACE_EVENT(block_bio_frontmerge, + + TP_PROTO(struct request_queue *q, struct bio *bio), + + TP_ARGS(q, bio), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, sector ) + __field( unsigned, nr_sector ) + __array( char, rwbs, 6 ) + __array( char, comm, TASK_COMM_LEN ) + ), + + TP_fast_assign( + __entry->dev = bio->bi_bdev->bd_dev; + __entry->sector = bio->bi_sector; + __entry->nr_sector = bio->bi_size >> 9; + blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + ), + + TP_printk("%d,%d %s %llu + %u [%s]", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, + __entry->sector, __entry->nr_sector, __entry->comm) +); + +TRACE_EVENT(block_bio_queue, + + TP_PROTO(struct request_queue *q, struct bio *bio), + + TP_ARGS(q, bio), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, sector ) + __field( unsigned int, nr_sector ) + __array( char, rwbs, 6 ) + __array( char, comm, TASK_COMM_LEN ) + ), + + TP_fast_assign( + __entry->dev = 
bio->bi_bdev->bd_dev; + __entry->sector = bio->bi_sector; + __entry->nr_sector = bio->bi_size >> 9; + blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + ), + + TP_printk("%d,%d %s %llu + %u [%s]", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, + __entry->sector, __entry->nr_sector, __entry->comm) +); + +TRACE_EVENT(block_getrq, + + TP_PROTO(struct request_queue *q, struct bio *bio, int rw), + + TP_ARGS(q, bio, rw), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, sector ) + __field( unsigned int, nr_sector ) + __array( char, rwbs, 6 ) + __array( char, comm, TASK_COMM_LEN ) + ), + + TP_fast_assign( + __entry->dev = bio ? bio->bi_bdev->bd_dev : 0; + __entry->sector = bio ? bio->bi_sector : 0; + __entry->nr_sector = bio ? bio->bi_size >> 9 : 0; + blk_fill_rwbs(__entry->rwbs, + bio ? bio->bi_rw : 0, __entry->nr_sector); + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + ), + + TP_printk("%d,%d %s %llu + %u [%s]", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, + __entry->sector, __entry->nr_sector, __entry->comm) +); + +TRACE_EVENT(block_sleeprq, + + TP_PROTO(struct request_queue *q, struct bio *bio, int rw), + + TP_ARGS(q, bio, rw), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, sector ) + __field( unsigned int, nr_sector ) + __array( char, rwbs, 6 ) + __array( char, comm, TASK_COMM_LEN ) + ), + + TP_fast_assign( + __entry->dev = bio ? bio->bi_bdev->bd_dev : 0; + __entry->sector = bio ? bio->bi_sector : 0; + __entry->nr_sector = bio ? bio->bi_size >> 9 : 0; + blk_fill_rwbs(__entry->rwbs, + bio ? bio->bi_rw : 0, __entry->nr_sector); + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + ), + + TP_printk("%d,%d %s %llu + %u [%s]", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, + __entry->sector, __entry->nr_sector, __entry->comm) +); + +TRACE_EVENT(block_plug, + + TP_PROTO(struct request_queue *q), + + TP_ARGS(q), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + ), + + TP_fast_assign( + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + ), + + TP_printk("[%s]", __entry->comm) +); + +TRACE_EVENT(block_unplug_timer, + + TP_PROTO(struct request_queue *q), + + TP_ARGS(q), + + TP_STRUCT__entry( + __field( int, nr_rq ) + __array( char, comm, TASK_COMM_LEN ) + ), + + TP_fast_assign( + __entry->nr_rq = q->rq.count[READ] + q->rq.count[WRITE]; + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + ), + + TP_printk("[%s] %d", __entry->comm, __entry->nr_rq) +); + +TRACE_EVENT(block_unplug_io, + + TP_PROTO(struct request_queue *q), + + TP_ARGS(q), + + TP_STRUCT__entry( + __field( int, nr_rq ) + __array( char, comm, TASK_COMM_LEN ) + ), + + TP_fast_assign( + __entry->nr_rq = q->rq.count[READ] + q->rq.count[WRITE]; + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + ), + + TP_printk("[%s] %d", __entry->comm, __entry->nr_rq) +); + +TRACE_EVENT(block_split, + + TP_PROTO(struct request_queue *q, struct bio *bio, + unsigned int new_sector), + + TP_ARGS(q, bio, new_sector), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, sector ) + __field( sector_t, new_sector ) + __array( char, rwbs, 6 ) + __array( char, comm, TASK_COMM_LEN ) + ), + + TP_fast_assign( + __entry->dev = bio->bi_bdev->bd_dev; + __entry->sector = bio->bi_sector; + __entry->new_sector = new_sector; + blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + ), + + TP_printk("%d,%d %s %llu / %llu [%s]", + 
MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, + __entry->sector, __entry->new_sector, __entry->comm) +); + +TRACE_EVENT(block_remap, + + TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev, + sector_t from), + + TP_ARGS(q, bio, dev, from), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, sector ) + __field( unsigned int, nr_sector ) + __field( dev_t, old_dev ) + __field( sector_t, old_sector ) + __array( char, rwbs, 6 ) + ), + + TP_fast_assign( + __entry->dev = bio->bi_bdev->bd_dev; + __entry->sector = bio->bi_sector; + __entry->nr_sector = bio->bi_size >> 9; + __entry->old_dev = dev; + __entry->old_sector = from; + blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); + ), + + TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, + __entry->sector, __entry->nr_sector, + MAJOR(__entry->old_dev), MINOR(__entry->old_dev), + __entry->old_sector) +); + +#endif /* _TRACE_BLOCK_H */ + +/* This part must be outside protection */ +#include + diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 06b85850fab4..844164dca90a 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -45,7 +45,10 @@ obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o obj-$(CONFIG_POWER_TRACER) += trace_power.o obj-$(CONFIG_KMEMTRACE) += kmemtrace.o obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o -obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o +obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o +ifeq ($(CONFIG_BLOCK),y) +obj-$(CONFIG_EVENT_TRACING) += blktrace.o +endif obj-$(CONFIG_EVENT_TRACING) += trace_events.o obj-$(CONFIG_EVENT_TRACING) += trace_export.o obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index e3abf55bc8e5..7bd6a9893c24 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -23,10 +23,14 @@ #include #include #include -#include #include + +#include + #include "trace_output.h" +#ifdef CONFIG_BLK_DEV_IO_TRACE + static unsigned int blktrace_seq __read_mostly = 1; static struct trace_array *blk_tr; @@ -1658,3 +1662,75 @@ int blk_trace_init_sysfs(struct device *dev) return sysfs_create_group(&dev->kobj, &blk_trace_attr_group); } +#endif /* CONFIG_BLK_DEV_IO_TRACE */ + +#ifdef CONFIG_EVENT_TRACING + +void blk_dump_cmd(char *buf, struct request *rq) +{ + int i, end; + int len = rq->cmd_len; + unsigned char *cmd = rq->cmd; + + if (!blk_pc_request(rq)) { + buf[0] = '\0'; + return; + } + + for (end = len - 1; end >= 0; end--) + if (cmd[end]) + break; + end++; + + for (i = 0; i < len; i++) { + buf += sprintf(buf, "%s%02x", i == 0 ? 
"" : " ", cmd[i]); + if (i == end && end != len - 1) { + sprintf(buf, " .."); + break; + } + } +} + +void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) +{ + int i = 0; + + if (rw & WRITE) + rwbs[i++] = 'W'; + else if (rw & 1 << BIO_RW_DISCARD) + rwbs[i++] = 'D'; + else if (bytes) + rwbs[i++] = 'R'; + else + rwbs[i++] = 'N'; + + if (rw & 1 << BIO_RW_AHEAD) + rwbs[i++] = 'A'; + if (rw & 1 << BIO_RW_BARRIER) + rwbs[i++] = 'B'; + if (rw & 1 << BIO_RW_SYNCIO) + rwbs[i++] = 'S'; + if (rw & 1 << BIO_RW_META) + rwbs[i++] = 'M'; + + rwbs[i] = '\0'; +} + +void blk_fill_rwbs_rq(char *rwbs, struct request *rq) +{ + int rw = rq->cmd_flags & 0x03; + int bytes; + + if (blk_discard_rq(rq)) + rw |= (1 << BIO_RW_DISCARD); + + if (blk_pc_request(rq)) + bytes = rq->data_len; + else + bytes = rq->hard_nr_sectors << 9; + + blk_fill_rwbs(rwbs, rw, bytes); +} + +#endif /* CONFIG_EVENT_TRACING */ + diff --git a/mm/bounce.c b/mm/bounce.c index e590272fe7a8..65f5e17e411a 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -14,16 +14,15 @@ #include #include #include -#include #include +#include + #define POOL_SIZE 64 #define ISA_POOL_SIZE 16 static mempool_t *page_pool, *isa_page_pool; -DEFINE_TRACE(block_bio_bounce); - #ifdef CONFIG_HIGHMEM static __init int init_emergency_pool(void) { -- cgit v1.2.2 From 6556d1df88fe68f9836beeb43342a336691cb67c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 9 Jun 2009 14:04:26 -0400 Subject: tracing: fix the block trace points print size The sector field is either u64 or unsigned long depending on the arch. This patch casts the sector to unsigned long long to prevent the printf warnings. [ Impact: remove compile warnings ] Signed-off-by: Steven Rostedt --- include/trace/events/block.h | 45 +++++++++++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 15 deletions(-) diff --git a/include/trace/events/block.h b/include/trace/events/block.h index a99d1e565bb0..53effd496a50 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -37,7 +37,8 @@ TRACE_EVENT(block_rq_abort, TP_printk("%d,%d %s (%s) %llu + %u [%d]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, __get_str(cmd), - __entry->sector, __entry->nr_sector, __entry->errors) + (unsigned long long)__entry->sector, + __entry->nr_sector, __entry->errors) ); TRACE_EVENT(block_rq_insert, @@ -71,7 +72,8 @@ TRACE_EVENT(block_rq_insert, TP_printk("%d,%d %s %u (%s) %llu + %u [%s]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, __entry->bytes, __get_str(cmd), - __entry->sector, __entry->nr_sector, __entry->comm) + (unsigned long long)__entry->sector, + __entry->nr_sector, __entry->comm) ); TRACE_EVENT(block_rq_issue, @@ -105,7 +107,8 @@ TRACE_EVENT(block_rq_issue, TP_printk("%d,%d %s %u (%s) %llu + %u [%s]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, __entry->bytes, __get_str(cmd), - __entry->sector, __entry->nr_sector, __entry->comm) + (unsigned long long)__entry->sector, + __entry->nr_sector, __entry->comm) ); TRACE_EVENT(block_rq_requeue, @@ -137,7 +140,8 @@ TRACE_EVENT(block_rq_requeue, TP_printk("%d,%d %s (%s) %llu + %u [%d]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, __get_str(cmd), - __entry->sector, __entry->nr_sector, __entry->errors) + (unsigned long long)__entry->sector, + __entry->nr_sector, __entry->errors) ); TRACE_EVENT(block_rq_complete, @@ -169,7 +173,8 @@ TRACE_EVENT(block_rq_complete, TP_printk("%d,%d %s (%s) %llu + %u [%d]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, __get_str(cmd), - __entry->sector, 
__entry->nr_sector, __entry->errors) + (unsigned long long)__entry->sector, + __entry->nr_sector, __entry->errors) ); TRACE_EVENT(block_bio_bounce, @@ -195,7 +200,8 @@ TRACE_EVENT(block_bio_bounce, TP_printk("%d,%d %s %llu + %u [%s]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, - __entry->sector, __entry->nr_sector, __entry->comm) + (unsigned long long)__entry->sector, + __entry->nr_sector, __entry->comm) ); TRACE_EVENT(block_bio_complete, @@ -221,7 +227,8 @@ TRACE_EVENT(block_bio_complete, TP_printk("%d,%d %s %llu + %u [%d]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, - __entry->sector, __entry->nr_sector, __entry->error) + (unsigned long long)__entry->sector, + __entry->nr_sector, __entry->error) ); TRACE_EVENT(block_bio_backmerge, @@ -248,7 +255,8 @@ TRACE_EVENT(block_bio_backmerge, TP_printk("%d,%d %s %llu + %u [%s]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, - __entry->sector, __entry->nr_sector, __entry->comm) + (unsigned long long)__entry->sector, + __entry->nr_sector, __entry->comm) ); TRACE_EVENT(block_bio_frontmerge, @@ -275,7 +283,8 @@ TRACE_EVENT(block_bio_frontmerge, TP_printk("%d,%d %s %llu + %u [%s]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, - __entry->sector, __entry->nr_sector, __entry->comm) + (unsigned long long)__entry->sector, + __entry->nr_sector, __entry->comm) ); TRACE_EVENT(block_bio_queue, @@ -302,7 +311,8 @@ TRACE_EVENT(block_bio_queue, TP_printk("%d,%d %s %llu + %u [%s]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, - __entry->sector, __entry->nr_sector, __entry->comm) + (unsigned long long)__entry->sector, + __entry->nr_sector, __entry->comm) ); TRACE_EVENT(block_getrq, @@ -330,7 +340,8 @@ TRACE_EVENT(block_getrq, TP_printk("%d,%d %s %llu + %u [%s]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, - __entry->sector, __entry->nr_sector, __entry->comm) + (unsigned long long)__entry->sector, + __entry->nr_sector, __entry->comm) ); TRACE_EVENT(block_sleeprq, @@ -358,7 +369,8 @@ TRACE_EVENT(block_sleeprq, TP_printk("%d,%d %s %llu + %u [%s]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, - __entry->sector, __entry->nr_sector, __entry->comm) + (unsigned long long)__entry->sector, + __entry->nr_sector, __entry->comm) ); TRACE_EVENT(block_plug, @@ -441,7 +453,9 @@ TRACE_EVENT(block_split, TP_printk("%d,%d %s %llu / %llu [%s]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, - __entry->sector, __entry->new_sector, __entry->comm) + (unsigned long long)__entry->sector, + (unsigned long long)__entry->new_sector, + __entry->comm) ); TRACE_EVENT(block_remap, @@ -471,9 +485,10 @@ TRACE_EVENT(block_remap, TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, - __entry->sector, __entry->nr_sector, + (unsigned long long)__entry->sector, + __entry->nr_sector, MAJOR(__entry->old_dev), MINOR(__entry->old_dev), - __entry->old_sector) + (unsigned long long)__entry->old_sector) ); #endif /* _TRACE_BLOCK_H */ -- cgit v1.2.2 From 725c624a58a10ef90a2ff889e122158fabf36147 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 8 Jun 2009 19:09:45 -0400 Subject: tracing: add trace_seq_vprint interface The code to update the print formats for events requires a vprintf format in the trace_seq. This patch adds that interface. 
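For illustration only (this wrapper is not part of the patch; the helper name and placement are made up), a caller that already holds a va_list can forward it straight into the sequence buffer through the new interface:

	#include <linux/kernel.h>
	#include <linux/trace_seq.h>

	/* Hypothetical helper, shown only to demonstrate trace_seq_vprintf(). */
	static int example_seq_printf(struct trace_seq *s, const char *fmt, ...)
	{
		va_list args;
		int ret;

		va_start(args, fmt);
		ret = trace_seq_vprintf(s, fmt, args);
		va_end(args);

		return ret;
	}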
Signed-off-by: Steven Rostedt --- include/linux/trace_seq.h | 2 ++ kernel/trace/trace_output.c | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index ba9627f00d3f..c68bccba2074 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -27,6 +27,8 @@ trace_seq_init(struct trace_seq *s) #ifdef CONFIG_TRACING extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) __attribute__ ((format (printf, 2, 3))); +extern int trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args) + __attribute__ ((format (printf, 2, 0))); extern int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary); extern void trace_print_seq(struct seq_file *m, struct trace_seq *s); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 425725c1622d..c05aff465dc9 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -100,6 +100,38 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) } EXPORT_SYMBOL_GPL(trace_seq_printf); +/** + * trace_seq_vprintf - sequence printing of trace information + * @s: trace sequence descriptor + * @fmt: printf format string + * + * The tracer may use either sequence operations or its own + * copy to user routines. To simplify formating of a trace + * trace_seq_printf is used to store strings into a special + * buffer (@s). Then the output may be either used by + * the sequencer or pulled into another buffer. + */ +int +trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args) +{ + int len = (PAGE_SIZE - 1) - s->len; + int ret; + + if (!len) + return 0; + + ret = vsnprintf(s->buffer + s->len, len, fmt, args); + + /* If we can't write it all, don't bother writing anything */ + if (ret >= len) + return 0; + + s->len += ret; + + return len; +} +EXPORT_SYMBOL_GPL(trace_seq_vprintf); + int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) { int len = (PAGE_SIZE - 1) - s->len; -- cgit v1.2.2 From 110bf2b764eb6026b868d84499263cb24b1bcc8d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 9 Jun 2009 17:29:07 -0400 Subject: tracing: add protection around module events unload When reading the trace buffer, there is a race that when a module is unloaded it removes events that is stilled referenced in the buffers. This patch adds the protection around the unloading of the events from modules and the reading of the trace buffers. 
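The intended locking discipline, sketched below for illustration only (the read-side placement is an assumption of the sketch and is not taken from this patch; it assumes a file under kernel/trace/ where trace_output.h is in scope):

	#include <linux/ftrace_event.h>
	#include "trace_output.h"

	/* Read side (assumed): hold trace_event_mutex for read while the
	 * buffer is printed, so registered events cannot disappear. */
	static void example_read_side(int type)
	{
		struct trace_event *event;

		down_read(&trace_event_mutex);
		event = ftrace_find_event(type);
		/* ... format the record via 'event'; it cannot be
		 * unregistered while the semaphore is held for read ... */
		up_read(&trace_event_mutex);
	}

	/* Write side (as in this patch): the lock is taken once around the
	 * whole teardown, so the lock-free __unregister_ftrace_event() is
	 * used rather than unregister_ftrace_event(). */
	static void example_unload_side(struct trace_event *event)
	{
		down_write(&trace_event_mutex);
		__unregister_ftrace_event(event);
		up_write(&trace_event_mutex);
	}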
Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 4 +++- kernel/trace/trace_output.c | 15 ++++++++++++--- kernel/trace/trace_output.h | 4 ++++ 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 6c81f9c21426..aa08be69a1b6 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1050,12 +1050,13 @@ static void trace_module_remove_events(struct module *mod) struct ftrace_event_call *call, *p; bool found = false; + down_write(&trace_event_mutex); list_for_each_entry_safe(call, p, &ftrace_events, list) { if (call->mod == mod) { found = true; ftrace_event_enable_disable(call, 0); if (call->event) - unregister_ftrace_event(call->event); + __unregister_ftrace_event(call->event); debugfs_remove_recursive(call->dir); list_del(&call->list); trace_destroy_fields(call); @@ -1079,6 +1080,7 @@ static void trace_module_remove_events(struct module *mod) */ if (found) tracing_reset_current_online_cpus(); + up_write(&trace_event_mutex); } static int trace_module_notify(struct notifier_block *self, diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index c05aff465dc9..7938f3ae93e3 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -14,7 +14,7 @@ /* must be a power of 2 */ #define EVENT_HASHSIZE 128 -static DECLARE_RWSEM(trace_event_mutex); +DECLARE_RWSEM(trace_event_mutex); DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq); EXPORT_PER_CPU_SYMBOL(ftrace_event_seq); @@ -702,6 +702,16 @@ int register_ftrace_event(struct trace_event *event) } EXPORT_SYMBOL_GPL(register_ftrace_event); +/* + * Used by module code with the trace_event_mutex held for write. + */ +int __unregister_ftrace_event(struct trace_event *event) +{ + hlist_del(&event->node); + list_del(&event->list); + return 0; +} + /** * unregister_ftrace_event - remove a no longer used event * @event: the event to remove @@ -709,8 +719,7 @@ EXPORT_SYMBOL_GPL(register_ftrace_event); int unregister_ftrace_event(struct trace_event *event) { down_write(&trace_event_mutex); - hlist_del(&event->node); - list_del(&event->list); + __unregister_ftrace_event(event); up_write(&trace_event_mutex); return 0; diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index ac240e76eb01..d38bec4a9c30 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -27,6 +27,10 @@ extern struct trace_event *ftrace_find_event(int type); extern enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags); +/* used by module unregistering */ +extern int __unregister_ftrace_event(struct trace_event *event); +extern struct rw_semaphore trace_event_mutex; + #define MAX_MEMHEX_BYTES 8 #define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) -- cgit v1.2.2 From f1db457ce6e2f63cb01022f58c0c023838958bd1 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 10 Jun 2009 10:06:24 +0800 Subject: tracing/events: convert block trace points to TRACE_EVENT(), fix !CONFIG_BLOCK Fix building failures when CONFIG_BLOCK == n. 
Signed-off-by: Li Zefan LKML-Reference: <4A2F1520.8020003@cn.fujitsu.com> Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/blktrace_api.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index c7ec31dd04c9..7e4350ece0f8 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -218,7 +218,7 @@ static inline int blk_trace_init_sysfs(struct device *dev) #endif /* CONFIG_BLK_DEV_IO_TRACE */ -#ifdef CONFIG_EVENT_TRACING +#if defined(CONFIG_EVENT_TRACING) && defined(CONFIG_BLOCK) static inline int blk_cmd_buf_len(struct request *rq) { @@ -229,7 +229,7 @@ extern void blk_dump_cmd(char *buf, struct request *rq); extern void blk_fill_rwbs(char *rwbs, u32 rw, int bytes); extern void blk_fill_rwbs_rq(char *rwbs, struct request *rq); -#endif /* CONFIG_EVENT_TRACING */ +#endif /* CONFIG_EVENT_TRACING && CONFIG_BLOCK */ #endif /* __KERNEL__ */ #endif -- cgit v1.2.2 From bc5c6c043d8381676339fb3da59cc4cc5921d368 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Wed, 10 Jun 2009 04:48:41 -0400 Subject: ftrace/documentation: fix typo in function grapher name The function graph tracer is called just "function_graph" (no trailing "_tracer" needed). Signed-off-by: Mike Frysinger LKML-Reference: <1244623722-6325-1-git-send-email-vapier@gentoo.org> Signed-off-by: Steven Rostedt --- Documentation/trace/ftrace.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index fd9a3e693813..5ad2ded8aa63 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt @@ -179,7 +179,7 @@ Here is the list of current tracers that may be configured. Function call tracer to trace all kernel functions. - "function_graph_tracer" + "function_graph" Similar to the function tracer except that the function tracer probes the functions on their entry -- cgit v1.2.2 From 6ff9a64d2aaa6eae396adc95e9c91c0cbfa6dbe4 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 10 Jun 2009 14:28:34 -0400 Subject: tracing: do not translate event helper macros in print format By moving the macro that creates the print format code above the defining of the event macro helpers (__get_str, __print_symbolic, and __get_dynamic_array), we get a little cleaner print format. Instead of: (char *)((void *)REC + REC->__data_loc_name) we get: __get_str(name) Instead of: ({ static const struct trace_print_flags symbols[] = { { HI_SOFTIRQ, "HI" }, { we get: __print_symbolic(REC->vec, { HI_SOFTIRQ, "HI" }, { Signed-off-by: Steven Rostedt --- include/trace/ftrace.h | 158 +++++++++++++++++++++++++------------------------ 1 file changed, 81 insertions(+), 77 deletions(-) diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 40ede4db4d88..1867553c61e5 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -80,6 +80,87 @@ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) +/* + * Setup the showing format of trace point. + * + * int + * ftrace_format_##call(struct trace_seq *s) + * { + * struct ftrace_raw_##call field; + * int ret; + * + * ret = trace_seq_printf(s, #type " " #item ";" + * " offset:%u; size:%u;\n", + * offsetof(struct ftrace_raw_##call, item), + * sizeof(field.type)); + * + * } + */ + +#undef TP_STRUCT__entry +#define TP_STRUCT__entry(args...) 
args + +#undef __field +#define __field(type, item) \ + ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ + "offset:%u;\tsize:%u;\n", \ + (unsigned int)offsetof(typeof(field), item), \ + (unsigned int)sizeof(field.item)); \ + if (!ret) \ + return 0; + +#undef __array +#define __array(type, item, len) \ + ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ + "offset:%u;\tsize:%u;\n", \ + (unsigned int)offsetof(typeof(field), item), \ + (unsigned int)sizeof(field.item)); \ + if (!ret) \ + return 0; + +#undef __dynamic_array +#define __dynamic_array(type, item, len) \ + ret = trace_seq_printf(s, "\tfield:__data_loc " #item ";\t" \ + "offset:%u;\tsize:%u;\n", \ + (unsigned int)offsetof(typeof(field), \ + __data_loc_##item), \ + (unsigned int)sizeof(field.__data_loc_##item)); \ + if (!ret) \ + return 0; + +#undef __string +#define __string(item, src) __dynamic_array(char, item, -1) + +#undef __entry +#define __entry REC + +#undef __print_symbolic +#undef __get_dynamic_array +#undef __get_str + +#undef TP_printk +#define TP_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args) + +#undef TP_fast_assign +#define TP_fast_assign(args...) args + +#undef TRACE_EVENT +#define TRACE_EVENT(call, proto, args, tstruct, func, print) \ +static int \ +ftrace_format_##call(struct trace_seq *s) \ +{ \ + struct ftrace_raw_##call field __attribute__((unused)); \ + int ret = 0; \ + \ + tstruct; \ + \ + trace_seq_printf(s, "\nprint fmt: " print); \ + \ + return ret; \ +} + +#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) + /* * Stage 3 of the trace events. * @@ -179,83 +260,6 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) -/* - * Setup the showing format of trace point. - * - * int - * ftrace_format_##call(struct trace_seq *s) - * { - * struct ftrace_raw_##call field; - * int ret; - * - * ret = trace_seq_printf(s, #type " " #item ";" - * " offset:%u; size:%u;\n", - * offsetof(struct ftrace_raw_##call, item), - * sizeof(field.type)); - * - * } - */ - -#undef TP_STRUCT__entry -#define TP_STRUCT__entry(args...) args - -#undef __field -#define __field(type, item) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%u;\tsize:%u;\n", \ - (unsigned int)offsetof(typeof(field), item), \ - (unsigned int)sizeof(field.item)); \ - if (!ret) \ - return 0; - -#undef __array -#define __array(type, item, len) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ - "offset:%u;\tsize:%u;\n", \ - (unsigned int)offsetof(typeof(field), item), \ - (unsigned int)sizeof(field.item)); \ - if (!ret) \ - return 0; - -#undef __dynamic_array -#define __dynamic_array(type, item, len) \ - ret = trace_seq_printf(s, "\tfield:__data_loc " #item ";\t" \ - "offset:%u;\tsize:%u;\n", \ - (unsigned int)offsetof(typeof(field), \ - __data_loc_##item), \ - (unsigned int)sizeof(field.__data_loc_##item)); \ - if (!ret) \ - return 0; - -#undef __string -#define __string(item, src) __dynamic_array(char, item, -1) - -#undef __entry -#define __entry REC - -#undef TP_printk -#define TP_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args) - -#undef TP_fast_assign -#define TP_fast_assign(args...) 
args - -#undef TRACE_EVENT -#define TRACE_EVENT(call, proto, args, tstruct, func, print) \ -static int \ -ftrace_format_##call(struct trace_seq *s) \ -{ \ - struct ftrace_raw_##call field __attribute__((unused)); \ - int ret = 0; \ - \ - tstruct; \ - \ - trace_seq_printf(s, "\nprint fmt: " print); \ - \ - return ret; \ -} - -#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) - #undef __field #define __field(type, item) \ ret = trace_define_field(event_call, #type, #item, \ -- cgit v1.2.2 From 511b01bdf64ad8a38414096eab283c7784aebfc4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Jun 2009 00:32:00 +0200 Subject: Revert "x86, bts: reenable ptrace branch trace support" This reverts commit 7e0bfad24d85de7cf2202a7b0ce51de11a077b21. A late objection to the ABI has arrived: http://lkml.org/lkml/2009/6/10/253 Keep the ABI disabled out of caution, to not create premature user-space expectations. While the hw-branch-tracing variant uses and tests the BTS code. Cc: Peter Zijlstra Cc: Markus Metzger Cc: Oleg Nesterov Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.cpu | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 924e156a85ab..8130334329c0 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -506,6 +506,7 @@ config X86_PTRACE_BTS bool "Branch Trace Store" default y depends on X86_DEBUGCTLMSR + depends on BROKEN ---help--- This adds a ptrace interface to the hardware's branch trace store. -- cgit v1.2.2