aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Steven Rostedt <srostedt@redhat.com> 2011-09-22 11:50:27 -0400
committer Steven Rostedt <rostedt@goodmis.org> 2011-10-11 09:13:53 -0400
commit d696b58ca2c3ca76e784ef89a7e0453d9b7ab187 (patch)
tree e5f02a13dc8fdec22e9223189122e5938e9ce77e
parent e0a413f619ef8bc366dafc6f8221674993b8d85f (diff)
tracing: Do not allocate buffer for trace_marker
When doing intense tracing, the kmalloc inside trace_marker can introduce side effects to what is being traced. As trace_marker() is used by userspace to inject data into the kernel ring buffer, it needs to do so with the least amount of intrusion to the operations of the kernel or the user space application.

As the ring buffer is designed to write directly into the buffer without the need to make a temporary buffer, and userspace already went through the hassle of knowing how big the write will be, we can simply pin the userspace pages and write the data directly into the buffer.

This reduces the impact of tracing via trace_marker tremendously!

Thanks to Peter Zijlstra and Thomas Gleixner for pointing out the use of get_user_pages_fast() and kmap_atomic().

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Suggested-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
-rw-r--r-- kernel/trace/trace.c 111
1 files changed, 83 insertions, 28 deletions
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 13f2b8472fed..f86efe90ca45 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3628,22 +3628,24 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp)
3628 return 0; 3628 return 0;
3629} 3629}
3630 3630
3631static int mark_printk(const char *fmt, ...)
3632{
3633 int ret;
3634 va_list args;
3635 va_start(args, fmt);
3636 ret = trace_vprintk(0, fmt, args);
3637 va_end(args);
3638 return ret;
3639}
3640
3641static ssize_t 3631static ssize_t
3642tracing_mark_write(struct file *filp, const char __user *ubuf, 3632tracing_mark_write(struct file *filp, const char __user *ubuf,
3643 size_t cnt, loff_t *fpos) 3633 size_t cnt, loff_t *fpos)
3644{ 3634{
3645 char *buf; 3635 unsigned long addr = (unsigned long)ubuf;
3646 size_t written; 3636 struct ring_buffer_event *event;
3637 struct ring_buffer *buffer;
3638 struct print_entry *entry;
3639 unsigned long irq_flags;
3640 struct page *pages[2];
3641 int nr_pages = 1;
3642 ssize_t written;
3643 void *page1;
3644 void *page2;
3645 int offset;
3646 int size;
3647 int len;
3648 int ret;
3647 3649
3648 if (tracing_disabled) 3650 if (tracing_disabled)
3649 return -EINVAL; 3651 return -EINVAL;
@@ -3651,28 +3653,81 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3651 if (cnt > TRACE_BUF_SIZE) 3653 if (cnt > TRACE_BUF_SIZE)
3652 cnt = TRACE_BUF_SIZE; 3654 cnt = TRACE_BUF_SIZE;
3653 3655
3654 buf = kmalloc(cnt + 2, GFP_KERNEL); 3656 /*
3655 if (buf == NULL) 3657 * Userspace is injecting traces into the kernel trace buffer.
3656 return -ENOMEM; 3658 * We want to be as non intrusive as possible.
3659 * To do so, we do not want to allocate any special buffers
3660 * or take any locks, but instead write the userspace data
3661 * straight into the ring buffer.
3662 *
3663 * First we need to pin the userspace buffer into memory,
3664 * which, most likely it is, because it just referenced it.
3665 * But there's no guarantee that it is. By using get_user_pages_fast()
3666 * and kmap_atomic/kunmap_atomic() we can get access to the
3667 * pages directly. We then write the data directly into the
3668 * ring buffer.
3669 */
3670 BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
3657 3671
3658 if (copy_from_user(buf, ubuf, cnt)) { 3672 /* check if we cross pages */
3659 kfree(buf); 3673 if ((addr & PAGE_MASK) != ((addr + cnt) & PAGE_MASK))
3660 return -EFAULT; 3674 nr_pages = 2;
3675
3676 offset = addr & (PAGE_SIZE - 1);
3677 addr &= PAGE_MASK;
3678
3679 ret = get_user_pages_fast(addr, nr_pages, 0, pages);
3680 if (ret < nr_pages) {
3681 while (--ret >= 0)
3682 put_page(pages[ret]);
3683 written = -EFAULT;
3684 goto out;
3661 } 3685 }
3662 if (buf[cnt-1] != '\n') { 3686
3663 buf[cnt] = '\n'; 3687 page1 = kmap_atomic(pages[0]);
3664 buf[cnt+1] = '\0'; 3688 if (nr_pages == 2)
3689 page2 = kmap_atomic(pages[1]);
3690
3691 local_save_flags(irq_flags);
3692 size = sizeof(*entry) + cnt + 2; /* possible \n added */
3693 buffer = global_trace.buffer;
3694 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
3695 irq_flags, preempt_count());
3696 if (!event) {
3697 /* Ring buffer disabled, return as if not open for write */
3698 written = -EBADF;
3699 goto out_unlock;
3700 }
3701
3702 entry = ring_buffer_event_data(event);
3703 entry->ip = _THIS_IP_;
3704
3705 if (nr_pages == 2) {
3706 len = PAGE_SIZE - offset;
3707 memcpy(&entry->buf, page1 + offset, len);
3708 memcpy(&entry->buf[len], page2, cnt - len);
3665 } else 3709 } else
3666 buf[cnt] = '\0'; 3710 memcpy(&entry->buf, page1 + offset, cnt);
3667 3711
3668 written = mark_printk("%s", buf); 3712 if (entry->buf[cnt - 1] != '\n') {
3669 kfree(buf); 3713 entry->buf[cnt] = '\n';
3670 *fpos += written; 3714 entry->buf[cnt + 1] = '\0';
3715 } else
3716 entry->buf[cnt] = '\0';
3717
3718 ring_buffer_unlock_commit(buffer, event);
3671 3719
3672 /* don't tell userspace we wrote more - it might confuse them */ 3720 written = cnt;
3673 if (written > cnt)
3674 written = cnt;
3675 3721
3722 *fpos += written;
3723
3724 out_unlock:
3725 if (nr_pages == 2)
3726 kunmap_atomic(page2);
3727 kunmap_atomic(page1);
3728 while (nr_pages > 0)
3729 put_page(pages[--nr_pages]);
3730 out:
3676 return written; 3731 return written;
3677} 3732}
3678 3733