author	Lai Jiangshan <laijs@cn.fujitsu.com>	2010-01-06 07:08:50 -0500
committer	Steven Rostedt <rostedt@goodmis.org>	2010-01-06 12:51:34 -0500
commit	7e53bd42d14c75192b99674c40fcc359392da59d (patch)
tree	65c3638604a2c03947da5cbd7ffb3e4dfee66370 /kernel/trace/trace.c
parent	0fa0edaf32b9a78b9854f1da98d4511a501089b0 (diff)
tracing: Consolidate protection of reader access to the ring buffer
Originally, access to the ring buffer was fully serialized by trace_types_lock.
Patch d7350c3f4569 gave readers more freedom, and patch b04cc6b1f6 added code
to protect trace_pipe and cpu#/trace_pipe.
But that is not enough: ring buffer readers are not always read-only, they may
also consume data.
This patch serializes accesses to trace, trace_pipe, trace_pipe_raw,
cpu#/trace, cpu#/trace_pipe and cpu#/trace_pipe_raw, and removes
tracing_reader_cpumask, which was used to protect trace_pipe.
Details:
The ring buffer serializes its readers, but that is only low-level protection.
The validity of the events returned by ring_buffer_peek() etc. is not
protected by the ring buffer.
The content of those events may become garbage if another process is allowed
to consume them concurrently:
A) The page holding the consumed events may become a normal page
(no longer the reader page) in the ring buffer, and that page will be
rewritten by the event producer.
B) The page holding the consumed events may become a page used for
splice_read, and that page will be returned to the system.
This patch adds the trace_access_lock() and trace_access_unlock() primitives.
They allow multiple processes to access different per-cpu ring buffers
concurrently.
The primitives do not distinguish between read-only and read-consume access;
multiple read-only accesses to the same cpu buffer are also serialized.
These primitives are not taken when files are opened, only when files are
read.
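As an aside, a minimal user-space sketch of this two-level scheme may make the
pairing clearer. This is a pthread model for illustration only, not the kernel
code; names such as NR_CPUS_MODEL and ALL_CPUS are invented for the sketch.
Whole-buffer access takes an exclusive lock, per-cpu access takes the shared
lock plus that cpu's mutex, so readers of different cpu buffers proceed
concurrently while a whole-buffer reader excludes everyone.

	#include <pthread.h>

	#define NR_CPUS_MODEL	8
	#define ALL_CPUS	(-1)	/* plays the role of TRACE_PIPE_ALL_CPU */

	static pthread_rwlock_t all_cpu_lock = PTHREAD_RWLOCK_INITIALIZER;
	static pthread_mutex_t cpu_lock[NR_CPUS_MODEL];

	static void model_access_lock_init(void)
	{
		int cpu;

		for (cpu = 0; cpu < NR_CPUS_MODEL; cpu++)
			pthread_mutex_init(&cpu_lock[cpu], NULL);
	}

	static void model_access_lock(int cpu)
	{
		if (cpu == ALL_CPUS) {
			/* exclusive access to every cpu buffer */
			pthread_rwlock_wrlock(&all_cpu_lock);
		} else {
			/* block whole-buffer readers, then serialize this cpu only */
			pthread_rwlock_rdlock(&all_cpu_lock);
			pthread_mutex_lock(&cpu_lock[cpu]);
		}
	}

	static void model_access_unlock(int cpu)
	{
		if (cpu == ALL_CPUS) {
			pthread_rwlock_unlock(&all_cpu_lock);
		} else {
			pthread_mutex_unlock(&cpu_lock[cpu]);
			pthread_rwlock_unlock(&all_cpu_lock);
		}
	}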
Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
LKML-Reference: <4B447D52.1050602@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Diffstat (limited to 'kernel/trace/trace.c')
-rw-r--r--	kernel/trace/trace.c | 136
1 file changed, 97 insertions(+), 39 deletions(-)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0df1b0f2cb9e..abdd333a0825 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -32,6 +32,7 @@
 #include <linux/splice.h>
 #include <linux/kdebug.h>
 #include <linux/string.h>
+#include <linux/rwsem.h>
 #include <linux/ctype.h>
 #include <linux/init.h>
 #include <linux/poll.h>
@@ -102,9 +103,6 @@ static inline void ftrace_enable_cpu(void)
 
 static cpumask_var_t __read_mostly tracing_buffer_mask;
 
-/* Define which cpu buffers are currently read in trace_pipe */
-static cpumask_var_t tracing_reader_cpumask;
-
 #define for_each_tracing_cpu(cpu)	\
 	for_each_cpu(cpu, tracing_buffer_mask)
 
@@ -243,12 +241,91 @@ static struct tracer *current_trace __read_mostly;
 
 /*
  * trace_types_lock is used to protect the trace_types list.
- * This lock is also used to keep user access serialized.
- * Accesses from userspace will grab this lock while userspace
- * activities happen inside the kernel.
 */
 static DEFINE_MUTEX(trace_types_lock);
 
+/*
+ * serialize the access of the ring buffer
+ *
+ * ring buffer serializes readers, but it is low level protection.
+ * The validity of the events (which returns by ring_buffer_peek() ..etc)
+ * are not protected by ring buffer.
+ *
+ * The content of events may become garbage if we allow other process consumes
+ * these events concurrently:
+ *   A) the page of the consumed events may become a normal page
+ *      (not reader page) in ring buffer, and this page will be rewrited
+ *      by events producer.
+ *   B) The page of the consumed events may become a page for splice_read,
+ *      and this page will be returned to system.
+ *
+ * These primitives allow multi process access to different cpu ring buffer
+ * concurrently.
+ *
+ * These primitives don't distinguish read-only and read-consume access.
+ * Multi read-only access are also serialized.
+ */
+
+#ifdef CONFIG_SMP
+static DECLARE_RWSEM(all_cpu_access_lock);
+static DEFINE_PER_CPU(struct mutex, cpu_access_lock);
+
+static inline void trace_access_lock(int cpu)
+{
+	if (cpu == TRACE_PIPE_ALL_CPU) {
+		/* gain it for accessing the whole ring buffer. */
+		down_write(&all_cpu_access_lock);
+	} else {
+		/* gain it for accessing a cpu ring buffer. */
+
+		/* Firstly block other trace_access_lock(TRACE_PIPE_ALL_CPU). */
+		down_read(&all_cpu_access_lock);
+
+		/* Secondly block other access to this @cpu ring buffer. */
+		mutex_lock(&per_cpu(cpu_access_lock, cpu));
+	}
+}
+
+static inline void trace_access_unlock(int cpu)
+{
+	if (cpu == TRACE_PIPE_ALL_CPU) {
+		up_write(&all_cpu_access_lock);
+	} else {
+		mutex_unlock(&per_cpu(cpu_access_lock, cpu));
+		up_read(&all_cpu_access_lock);
+	}
+}
+
+static inline void trace_access_lock_init(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		mutex_init(&per_cpu(cpu_access_lock, cpu));
+}
+
+#else
+
+static DEFINE_MUTEX(access_lock);
+
+static inline void trace_access_lock(int cpu)
+{
+	(void)cpu;
+	mutex_lock(&access_lock);
+}
+
+static inline void trace_access_unlock(int cpu)
+{
+	(void)cpu;
+	mutex_unlock(&access_lock);
+}
+
+static inline void trace_access_lock_init(void)
+{
+}
+
+#endif
+
 /* trace_wait is a waitqueue for tasks blocked on trace_poll */
 static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
 
@@ -1580,12 +1657,6 @@ static void tracing_iter_reset(struct trace_iterator *iter, int cpu)
 }
 
 /*
- * No necessary locking here. The worst thing which can
- * happen is loosing events consumed at the same time
- * by a trace_pipe reader.
- * Other than that, we don't risk to crash the ring buffer
- * because it serializes the readers.
- *
  * The current tracer is copied to avoid a global locking
  * all around.
  */
@@ -1640,12 +1711,16 @@ static void *s_start(struct seq_file *m, loff_t *pos)
 	}
 
 	trace_event_read_lock();
+	trace_access_lock(cpu_file);
 	return p;
 }
 
 static void s_stop(struct seq_file *m, void *p)
 {
+	struct trace_iterator *iter = m->private;
+
 	atomic_dec(&trace_record_cmdline_disabled);
+	trace_access_unlock(iter->cpu_file);
 	trace_event_read_unlock();
 }
 
@@ -2836,22 +2911,6 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
 
 	mutex_lock(&trace_types_lock);
 
-	/* We only allow one reader per cpu */
-	if (cpu_file == TRACE_PIPE_ALL_CPU) {
-		if (!cpumask_empty(tracing_reader_cpumask)) {
-			ret = -EBUSY;
-			goto out;
-		}
-		cpumask_setall(tracing_reader_cpumask);
-	} else {
-		if (!cpumask_test_cpu(cpu_file, tracing_reader_cpumask))
-			cpumask_set_cpu(cpu_file, tracing_reader_cpumask);
-		else {
-			ret = -EBUSY;
-			goto out;
-		}
-	}
-
 	/* create a buffer to store the information to pass to userspace */
 	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
 	if (!iter) {
@@ -2907,12 +2966,6 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
 
 	mutex_lock(&trace_types_lock);
 
-	if (iter->cpu_file == TRACE_PIPE_ALL_CPU)
-		cpumask_clear(tracing_reader_cpumask);
-	else
-		cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask);
-
-
 	if (iter->trace->pipe_close)
 		iter->trace->pipe_close(iter);
 
@@ -3074,6 +3127,7 @@ waitagain:
 	iter->pos = -1;
 
 	trace_event_read_lock();
+	trace_access_lock(iter->cpu_file);
 	while (find_next_entry_inc(iter) != NULL) {
 		enum print_line_t ret;
 		int len = iter->seq.len;
@@ -3090,6 +3144,7 @@ waitagain:
 		if (iter->seq.len >= cnt)
 			break;
 	}
+	trace_access_unlock(iter->cpu_file);
 	trace_event_read_unlock();
 
 	/* Now copy what we have to the user */
@@ -3215,6 +3270,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
 	}
 
 	trace_event_read_lock();
+	trace_access_lock(iter->cpu_file);
 
 	/* Fill as many pages as possible. */
 	for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) {
@@ -3238,6 +3294,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
 		trace_seq_init(&iter->seq);
 	}
 
+	trace_access_unlock(iter->cpu_file);
 	trace_event_read_unlock();
 	mutex_unlock(&iter->mutex);
 
@@ -3539,10 +3596,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
 
 	info->read = 0;
 
+	trace_access_lock(info->cpu);
 	ret = ring_buffer_read_page(info->tr->buffer,
 				    &info->spare,
 				    count,
 				    info->cpu, 0);
+	trace_access_unlock(info->cpu);
 	if (ret < 0)
 		return 0;
 
@@ -3670,6 +3729,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		len &= PAGE_MASK;
 	}
 
+	trace_access_lock(info->cpu);
 	entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
 
 	for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) {
@@ -3717,6 +3777,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
 	}
 
+	trace_access_unlock(info->cpu);
 	spd.nr_pages = i;
 
 	/* did we read anything? */
@@ -4153,6 +4214,8 @@ static __init int tracer_init_debugfs(void)
 	struct dentry *d_tracer;
 	int cpu;
 
+	trace_access_lock_init();
+
 	d_tracer = tracing_init_dentry();
 
 	trace_create_file("tracing_enabled", 0644, d_tracer,
@@ -4387,9 +4450,6 @@ __init static int tracer_alloc_buffers(void)
 	if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL))
 		goto out_free_buffer_mask;
 
-	if (!zalloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL))
-		goto out_free_tracing_cpumask;
-
 	/* To save memory, keep the ring buffer size to its minimum */
 	if (ring_buffer_expanded)
 		ring_buf_size = trace_buf_size;
@@ -4447,8 +4507,6 @@ __init static int tracer_alloc_buffers(void)
 	return 0;
 
 out_free_cpumask:
-	free_cpumask_var(tracing_reader_cpumask);
-out_free_tracing_cpumask:
 	free_cpumask_var(tracing_cpumask);
 out_free_buffer_mask:
 	free_cpumask_var(tracing_buffer_mask);