author	Lai Jiangshan <laijs@cn.fujitsu.com>	2010-01-06 07:08:50 -0500
committer	Steven Rostedt <rostedt@goodmis.org>	2010-01-06 12:51:34 -0500
commit	7e53bd42d14c75192b99674c40fcc359392da59d (patch)
tree	65c3638604a2c03947da5cbd7ffb3e4dfee66370
parent	0fa0edaf32b9a78b9854f1da98d4511a501089b0 (diff)
tracing: Consolidate protection of reader access to the ring buffer
Originally, access to the ring buffer was fully serialized by trace_types_lock. Patch d7350c3f4569 gave readers more freedom, and patch b04cc6b1f6 added code to protect trace_pipe and cpu#/trace_pipe. That is not enough, however: ring buffer readers are not always read-only; they may consume data.

This patch serializes accesses to trace, trace_pipe, trace_pipe_raw, cpu#/trace, cpu#/trace_pipe and cpu#/trace_pipe_raw, and removes tracing_reader_cpumask, which was used to protect trace_pipe.

Details: the ring buffer serializes its readers, but that is only low-level protection. The validity of the events returned by ring_buffer_peek() etc. is not protected by the ring buffer. The content of events may become garbage if another process is allowed to consume them concurrently:

  A) the page holding the consumed events may become a normal page (no longer the reader page) in the ring buffer, and that page will be rewritten by the event producer.

  B) the page holding the consumed events may become a page used for splice_read, and that page will be returned to the system.

This patch adds the trace_access_lock() and trace_access_unlock() primitives. They allow multiple processes to access different cpu ring buffers concurrently. They do not distinguish read-only from read-consume access; multiple read-only accesses are also serialized. The primitives are not taken when the files are opened, only when they are read.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
LKML-Reference: <4B447D52.1050602@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
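For readers following along, below is a minimal userspace sketch of the locking pattern the patch introduces: the "all cpus" reader takes a reader/writer lock for writing, while a per-cpu reader takes it for reading and then its cpu's mutex. It uses pthreads in place of the kernel's rwsem and per-cpu mutexes; NR_CPUS_DEMO and ALL_CPUS are illustrative stand-ins for this sketch only, not symbols from the patch.

/*
 * Illustrative model of trace_access_lock()/trace_access_unlock():
 * an "all cpus" reader takes the rwlock for writing; a per-cpu reader
 * takes it for reading and then the mutex of its cpu.
 */
#include <pthread.h>

#define NR_CPUS_DEMO	4
#define ALL_CPUS	(-1)	/* plays the role of TRACE_PIPE_ALL_CPU */

static pthread_rwlock_t all_cpu_access_lock = PTHREAD_RWLOCK_INITIALIZER;
static pthread_mutex_t cpu_access_lock[NR_CPUS_DEMO];

void trace_access_lock_init(void)
{
	int cpu;

	for (cpu = 0; cpu < NR_CPUS_DEMO; cpu++)
		pthread_mutex_init(&cpu_access_lock[cpu], NULL);
}

void trace_access_lock(int cpu)
{
	if (cpu == ALL_CPUS) {
		/* Exclude every per-cpu reader while the whole buffer is read. */
		pthread_rwlock_wrlock(&all_cpu_access_lock);
	} else {
		/* First block any ALL_CPUS reader ... */
		pthread_rwlock_rdlock(&all_cpu_access_lock);
		/* ... then serialize other readers of this cpu's buffer. */
		pthread_mutex_lock(&cpu_access_lock[cpu]);
	}
}

void trace_access_unlock(int cpu)
{
	if (cpu == ALL_CPUS) {
		pthread_rwlock_unlock(&all_cpu_access_lock);
	} else {
		pthread_mutex_unlock(&cpu_access_lock[cpu]);
		pthread_rwlock_unlock(&all_cpu_access_lock);
	}
}

With this arrangement, readers of different cpu buffers proceed in parallel (each holds the lock shared plus its own mutex), while an ALL_CPUS reader excludes them all; this mirrors the CONFIG_SMP branch added in the diff below.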
-rw-r--r--	kernel/trace/trace.c	| 136
1 file changed, 97 insertions(+), 39 deletions(-)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0df1b0f2cb9e..abdd333a0825 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -32,6 +32,7 @@
 #include <linux/splice.h>
 #include <linux/kdebug.h>
 #include <linux/string.h>
+#include <linux/rwsem.h>
 #include <linux/ctype.h>
 #include <linux/init.h>
 #include <linux/poll.h>
@@ -102,9 +103,6 @@ static inline void ftrace_enable_cpu(void)
 
 static cpumask_var_t __read_mostly tracing_buffer_mask;
 
-/* Define which cpu buffers are currently read in trace_pipe */
-static cpumask_var_t tracing_reader_cpumask;
-
 #define for_each_tracing_cpu(cpu)	\
 	for_each_cpu(cpu, tracing_buffer_mask)
 
@@ -243,12 +241,91 @@ static struct tracer *current_trace __read_mostly;
 
 /*
  * trace_types_lock is used to protect the trace_types list.
- * This lock is also used to keep user access serialized.
- * Accesses from userspace will grab this lock while userspace
- * activities happen inside the kernel.
  */
 static DEFINE_MUTEX(trace_types_lock);
 
+/*
+ * serialize the access of the ring buffer
+ *
+ * ring buffer serializes readers, but it is low level protection.
+ * The validity of the events (which returns by ring_buffer_peek() ..etc)
+ * are not protected by ring buffer.
+ *
+ * The content of events may become garbage if we allow other process consumes
+ * these events concurrently:
+ *   A) the page of the consumed events may become a normal page
+ *      (not reader page) in ring buffer, and this page will be rewrited
+ *      by events producer.
+ *   B) The page of the consumed events may become a page for splice_read,
+ *      and this page will be returned to system.
+ *
+ * These primitives allow multi process access to different cpu ring buffer
+ * concurrently.
+ *
+ * These primitives don't distinguish read-only and read-consume access.
+ * Multi read-only access are also serialized.
+ */
+
+#ifdef CONFIG_SMP
+static DECLARE_RWSEM(all_cpu_access_lock);
+static DEFINE_PER_CPU(struct mutex, cpu_access_lock);
+
+static inline void trace_access_lock(int cpu)
+{
+	if (cpu == TRACE_PIPE_ALL_CPU) {
+		/* gain it for accessing the whole ring buffer. */
+		down_write(&all_cpu_access_lock);
+	} else {
+		/* gain it for accessing a cpu ring buffer. */
+
+		/* Firstly block other trace_access_lock(TRACE_PIPE_ALL_CPU). */
+		down_read(&all_cpu_access_lock);
+
+		/* Secondly block other access to this @cpu ring buffer. */
+		mutex_lock(&per_cpu(cpu_access_lock, cpu));
+	}
+}
+
+static inline void trace_access_unlock(int cpu)
+{
+	if (cpu == TRACE_PIPE_ALL_CPU) {
+		up_write(&all_cpu_access_lock);
+	} else {
+		mutex_unlock(&per_cpu(cpu_access_lock, cpu));
+		up_read(&all_cpu_access_lock);
+	}
+}
+
+static inline void trace_access_lock_init(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		mutex_init(&per_cpu(cpu_access_lock, cpu));
+}
+
+#else
+
+static DEFINE_MUTEX(access_lock);
+
+static inline void trace_access_lock(int cpu)
+{
+	(void)cpu;
+	mutex_lock(&access_lock);
+}
+
+static inline void trace_access_unlock(int cpu)
+{
+	(void)cpu;
+	mutex_unlock(&access_lock);
+}
+
+static inline void trace_access_lock_init(void)
+{
+}
+
+#endif
+
 /* trace_wait is a waitqueue for tasks blocked on trace_poll */
 static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
 
@@ -1580,12 +1657,6 @@ static void tracing_iter_reset(struct trace_iterator *iter, int cpu)
 }
 
 /*
- * No necessary locking here. The worst thing which can
- * happen is loosing events consumed at the same time
- * by a trace_pipe reader.
- * Other than that, we don't risk to crash the ring buffer
- * because it serializes the readers.
- *
  * The current tracer is copied to avoid a global locking
  * all around.
  */
@@ -1640,12 +1711,16 @@ static void *s_start(struct seq_file *m, loff_t *pos)
 	}
 
 	trace_event_read_lock();
+	trace_access_lock(cpu_file);
 	return p;
 }
 
 static void s_stop(struct seq_file *m, void *p)
 {
+	struct trace_iterator *iter = m->private;
+
 	atomic_dec(&trace_record_cmdline_disabled);
+	trace_access_unlock(iter->cpu_file);
 	trace_event_read_unlock();
 }
 
@@ -2836,22 +2911,6 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
 
 	mutex_lock(&trace_types_lock);
 
-	/* We only allow one reader per cpu */
-	if (cpu_file == TRACE_PIPE_ALL_CPU) {
-		if (!cpumask_empty(tracing_reader_cpumask)) {
-			ret = -EBUSY;
-			goto out;
-		}
-		cpumask_setall(tracing_reader_cpumask);
-	} else {
-		if (!cpumask_test_cpu(cpu_file, tracing_reader_cpumask))
-			cpumask_set_cpu(cpu_file, tracing_reader_cpumask);
-		else {
-			ret = -EBUSY;
-			goto out;
-		}
-	}
-
 	/* create a buffer to store the information to pass to userspace */
 	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
 	if (!iter) {
@@ -2907,12 +2966,6 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
 
 	mutex_lock(&trace_types_lock);
 
-	if (iter->cpu_file == TRACE_PIPE_ALL_CPU)
-		cpumask_clear(tracing_reader_cpumask);
-	else
-		cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask);
-
-
 	if (iter->trace->pipe_close)
 		iter->trace->pipe_close(iter);
 
@@ -3074,6 +3127,7 @@ waitagain:
 	iter->pos = -1;
 
 	trace_event_read_lock();
+	trace_access_lock(iter->cpu_file);
 	while (find_next_entry_inc(iter) != NULL) {
 		enum print_line_t ret;
 		int len = iter->seq.len;
@@ -3090,6 +3144,7 @@ waitagain:
 		if (iter->seq.len >= cnt)
 			break;
 	}
+	trace_access_unlock(iter->cpu_file);
 	trace_event_read_unlock();
 
 	/* Now copy what we have to the user */
@@ -3215,6 +3270,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
 	}
 
 	trace_event_read_lock();
+	trace_access_lock(iter->cpu_file);
 
 	/* Fill as many pages as possible. */
 	for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) {
@@ -3238,6 +3294,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
 		trace_seq_init(&iter->seq);
 	}
 
+	trace_access_unlock(iter->cpu_file);
 	trace_event_read_unlock();
 	mutex_unlock(&iter->mutex);
 
@@ -3539,10 +3596,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
 
 	info->read = 0;
 
+	trace_access_lock(info->cpu);
 	ret = ring_buffer_read_page(info->tr->buffer,
 				    &info->spare,
 				    count,
 				    info->cpu, 0);
+	trace_access_unlock(info->cpu);
 	if (ret < 0)
 		return 0;
 
@@ -3670,6 +3729,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		len &= PAGE_MASK;
 	}
 
+	trace_access_lock(info->cpu);
 	entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
 
 	for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) {
@@ -3717,6 +3777,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
 	}
 
+	trace_access_unlock(info->cpu);
 	spd.nr_pages = i;
 
 	/* did we read anything? */
@@ -4153,6 +4214,8 @@ static __init int tracer_init_debugfs(void)
 	struct dentry *d_tracer;
 	int cpu;
 
+	trace_access_lock_init();
+
 	d_tracer = tracing_init_dentry();
 
 	trace_create_file("tracing_enabled", 0644, d_tracer,
@@ -4387,9 +4450,6 @@ __init static int tracer_alloc_buffers(void)
 	if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL))
 		goto out_free_buffer_mask;
 
-	if (!zalloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL))
-		goto out_free_tracing_cpumask;
-
 	/* To save memory, keep the ring buffer size to its minimum */
 	if (ring_buffer_expanded)
 		ring_buf_size = trace_buf_size;
@@ -4447,8 +4507,6 @@ __init static int tracer_alloc_buffers(void)
 	return 0;
 
 out_free_cpumask:
-	free_cpumask_var(tracing_reader_cpumask);
-out_free_tracing_cpumask:
	free_cpumask_var(tracing_cpumask);
 out_free_buffer_mask:
 	free_cpumask_var(tracing_buffer_mask);