aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2010-10-27 21:48:00 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2010-10-27 21:48:00 -0400
commita042e26137d7674ac04b1cd2d5c06b9ebc1ee2d5 (patch)
treec1a7a8bda41b99caa4b4a0fe320fc73278879f7d /kernel
parentf66dd539feb849a3a00f7fac67c026e0935e373a (diff)
parente25804a0327dad954f7d43803178fdef2fd35b4e (diff)
Merge branch 'perf-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'perf-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (50 commits) perf python scripting: Add futex-contention script perf python scripting: Fixup cut'n'paste error in sctop script perf scripting: Shut up 'perf record' final status perf record: Remove newline character from perror() argument perf python scripting: Support fedora 11 (audit 1.7.17) perf python scripting: Improve the syscalls-by-pid script perf python scripting: print the syscall name on sctop perf python scripting: Improve the syscalls-counts script perf python scripting: Improve the failed-syscalls-by-pid script kprobes: Remove redundant text_mutex lock in optimize x86/oprofile: Fix uninitialized variable use in debug printk tracing: Fix 'faild' -> 'failed' typo perf probe: Fix format specified for Dwarf_Off parameter perf trace: Fix detection of script extension perf trace: Use $PERF_EXEC_PATH in canned report scripts perf tools: Document event modifiers perf tools: Remove direct slang.h include perf_events: Fix for transaction recovery in group_sched_in() perf_events: Revert: Fix transaction recovery in group_sched_in() perf, x86: Use NUMA aware allocations for PEBS/BTS/DS allocations ...
Diffstat (limited to 'kernel')
-rw-r--r--kernel/kprobes.c7
-rw-r--r--kernel/perf_event.c94
-rw-r--r--kernel/softirq.c16
-rw-r--r--kernel/trace/ring_buffer.c335
-rw-r--r--kernel/trace/trace.c8
5 files changed, 218 insertions, 242 deletions
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 56a891914273..99865c33a60d 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -74,7 +74,8 @@ static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
74/* NOTE: change this value only with kprobe_mutex held */ 74/* NOTE: change this value only with kprobe_mutex held */
75static bool kprobes_all_disarmed; 75static bool kprobes_all_disarmed;
76 76
77static DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ 77/* This protects kprobe_table and optimizing_list */
78static DEFINE_MUTEX(kprobe_mutex);
78static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 79static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
79static struct { 80static struct {
80 spinlock_t lock ____cacheline_aligned_in_smp; 81 spinlock_t lock ____cacheline_aligned_in_smp;
@@ -595,6 +596,7 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
595} 596}
596 597
597#ifdef CONFIG_SYSCTL 598#ifdef CONFIG_SYSCTL
599/* This should be called with kprobe_mutex locked */
598static void __kprobes optimize_all_kprobes(void) 600static void __kprobes optimize_all_kprobes(void)
599{ 601{
600 struct hlist_head *head; 602 struct hlist_head *head;
@@ -607,17 +609,16 @@ static void __kprobes optimize_all_kprobes(void)
607 return; 609 return;
608 610
609 kprobes_allow_optimization = true; 611 kprobes_allow_optimization = true;
610 mutex_lock(&text_mutex);
611 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 612 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
612 head = &kprobe_table[i]; 613 head = &kprobe_table[i];
613 hlist_for_each_entry_rcu(p, node, head, hlist) 614 hlist_for_each_entry_rcu(p, node, head, hlist)
614 if (!kprobe_disabled(p)) 615 if (!kprobe_disabled(p))
615 optimize_kprobe(p); 616 optimize_kprobe(p);
616 } 617 }
617 mutex_unlock(&text_mutex);
618 printk(KERN_INFO "Kprobes globally optimized\n"); 618 printk(KERN_INFO "Kprobes globally optimized\n");
619} 619}
620 620
621/* This should be called with kprobe_mutex locked */
621static void __kprobes unoptimize_all_kprobes(void) 622static void __kprobes unoptimize_all_kprobes(void)
622{ 623{
623 struct hlist_head *head; 624 struct hlist_head *head;
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index f309e8014c78..517d827f4982 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -417,8 +417,8 @@ event_filter_match(struct perf_event *event)
417 return event->cpu == -1 || event->cpu == smp_processor_id(); 417 return event->cpu == -1 || event->cpu == smp_processor_id();
418} 418}
419 419
420static int 420static void
421__event_sched_out(struct perf_event *event, 421event_sched_out(struct perf_event *event,
422 struct perf_cpu_context *cpuctx, 422 struct perf_cpu_context *cpuctx,
423 struct perf_event_context *ctx) 423 struct perf_event_context *ctx)
424{ 424{
@@ -437,13 +437,14 @@ __event_sched_out(struct perf_event *event,
437 } 437 }
438 438
439 if (event->state != PERF_EVENT_STATE_ACTIVE) 439 if (event->state != PERF_EVENT_STATE_ACTIVE)
440 return 0; 440 return;
441 441
442 event->state = PERF_EVENT_STATE_INACTIVE; 442 event->state = PERF_EVENT_STATE_INACTIVE;
443 if (event->pending_disable) { 443 if (event->pending_disable) {
444 event->pending_disable = 0; 444 event->pending_disable = 0;
445 event->state = PERF_EVENT_STATE_OFF; 445 event->state = PERF_EVENT_STATE_OFF;
446 } 446 }
447 event->tstamp_stopped = ctx->time;
447 event->pmu->del(event, 0); 448 event->pmu->del(event, 0);
448 event->oncpu = -1; 449 event->oncpu = -1;
449 450
@@ -452,19 +453,6 @@ __event_sched_out(struct perf_event *event,
452 ctx->nr_active--; 453 ctx->nr_active--;
453 if (event->attr.exclusive || !cpuctx->active_oncpu) 454 if (event->attr.exclusive || !cpuctx->active_oncpu)
454 cpuctx->exclusive = 0; 455 cpuctx->exclusive = 0;
455 return 1;
456}
457
458static void
459event_sched_out(struct perf_event *event,
460 struct perf_cpu_context *cpuctx,
461 struct perf_event_context *ctx)
462{
463 int ret;
464
465 ret = __event_sched_out(event, cpuctx, ctx);
466 if (ret)
467 event->tstamp_stopped = ctx->time;
468} 456}
469 457
470static void 458static void
@@ -664,7 +652,7 @@ retry:
664} 652}
665 653
666static int 654static int
667__event_sched_in(struct perf_event *event, 655event_sched_in(struct perf_event *event,
668 struct perf_cpu_context *cpuctx, 656 struct perf_cpu_context *cpuctx,
669 struct perf_event_context *ctx) 657 struct perf_event_context *ctx)
670{ 658{
@@ -684,6 +672,8 @@ __event_sched_in(struct perf_event *event,
684 return -EAGAIN; 672 return -EAGAIN;
685 } 673 }
686 674
675 event->tstamp_running += ctx->time - event->tstamp_stopped;
676
687 if (!is_software_event(event)) 677 if (!is_software_event(event))
688 cpuctx->active_oncpu++; 678 cpuctx->active_oncpu++;
689 ctx->nr_active++; 679 ctx->nr_active++;
@@ -694,35 +684,6 @@ __event_sched_in(struct perf_event *event,
694 return 0; 684 return 0;
695} 685}
696 686
697static inline int
698event_sched_in(struct perf_event *event,
699 struct perf_cpu_context *cpuctx,
700 struct perf_event_context *ctx)
701{
702 int ret = __event_sched_in(event, cpuctx, ctx);
703 if (ret)
704 return ret;
705 event->tstamp_running += ctx->time - event->tstamp_stopped;
706 return 0;
707}
708
709static void
710group_commit_event_sched_in(struct perf_event *group_event,
711 struct perf_cpu_context *cpuctx,
712 struct perf_event_context *ctx)
713{
714 struct perf_event *event;
715 u64 now = ctx->time;
716
717 group_event->tstamp_running += now - group_event->tstamp_stopped;
718 /*
719 * Schedule in siblings as one group (if any):
720 */
721 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
722 event->tstamp_running += now - event->tstamp_stopped;
723 }
724}
725
726static int 687static int
727group_sched_in(struct perf_event *group_event, 688group_sched_in(struct perf_event *group_event,
728 struct perf_cpu_context *cpuctx, 689 struct perf_cpu_context *cpuctx,
@@ -730,19 +691,15 @@ group_sched_in(struct perf_event *group_event,
730{ 691{
731 struct perf_event *event, *partial_group = NULL; 692 struct perf_event *event, *partial_group = NULL;
732 struct pmu *pmu = group_event->pmu; 693 struct pmu *pmu = group_event->pmu;
694 u64 now = ctx->time;
695 bool simulate = false;
733 696
734 if (group_event->state == PERF_EVENT_STATE_OFF) 697 if (group_event->state == PERF_EVENT_STATE_OFF)
735 return 0; 698 return 0;
736 699
737 pmu->start_txn(pmu); 700 pmu->start_txn(pmu);
738 701
739 /* 702 if (event_sched_in(group_event, cpuctx, ctx)) {
740 * use __event_sched_in() to delay updating tstamp_running
741 * until the transaction is committed. In case of failure
742 * we will keep an unmodified tstamp_running which is a
743 * requirement to get correct timing information
744 */
745 if (__event_sched_in(group_event, cpuctx, ctx)) {
746 pmu->cancel_txn(pmu); 703 pmu->cancel_txn(pmu);
747 return -EAGAIN; 704 return -EAGAIN;
748 } 705 }
@@ -751,31 +708,42 @@ group_sched_in(struct perf_event *group_event,
751 * Schedule in siblings as one group (if any): 708 * Schedule in siblings as one group (if any):
752 */ 709 */
753 list_for_each_entry(event, &group_event->sibling_list, group_entry) { 710 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
754 if (__event_sched_in(event, cpuctx, ctx)) { 711 if (event_sched_in(event, cpuctx, ctx)) {
755 partial_group = event; 712 partial_group = event;
756 goto group_error; 713 goto group_error;
757 } 714 }
758 } 715 }
759 716
760 if (!pmu->commit_txn(pmu)) { 717 if (!pmu->commit_txn(pmu))
761 /* commit tstamp_running */
762 group_commit_event_sched_in(group_event, cpuctx, ctx);
763 return 0; 718 return 0;
764 } 719
765group_error: 720group_error:
766 /* 721 /*
767 * Groups can be scheduled in as one unit only, so undo any 722 * Groups can be scheduled in as one unit only, so undo any
768 * partial group before returning: 723 * partial group before returning:
724 * The events up to the failed event are scheduled out normally,
725 * tstamp_stopped will be updated.
769 * 726 *
770 * use __event_sched_out() to avoid updating tstamp_stopped 727 * The failed events and the remaining siblings need to have
771 * because the event never actually ran 728 * their timings updated as if they had gone thru event_sched_in()
729 * and event_sched_out(). This is required to get consistent timings
730 * across the group. This also takes care of the case where the group
731 * could never be scheduled by ensuring tstamp_stopped is set to mark
732 * the time the event was actually stopped, such that time delta
733 * calculation in update_event_times() is correct.
772 */ 734 */
773 list_for_each_entry(event, &group_event->sibling_list, group_entry) { 735 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
774 if (event == partial_group) 736 if (event == partial_group)
775 break; 737 simulate = true;
776 __event_sched_out(event, cpuctx, ctx); 738
739 if (simulate) {
740 event->tstamp_running += now - event->tstamp_stopped;
741 event->tstamp_stopped = now;
742 } else {
743 event_sched_out(event, cpuctx, ctx);
744 }
777 } 745 }
778 __event_sched_out(group_event, cpuctx, ctx); 746 event_sched_out(group_event, cpuctx, ctx);
779 747
780 pmu->cancel_txn(pmu); 748 pmu->cancel_txn(pmu);
781 749
diff --git a/kernel/softirq.c b/kernel/softirq.c
index f02a9dfa19bc..18f4be0d5fe0 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -229,18 +229,20 @@ restart:
229 229
230 do { 230 do {
231 if (pending & 1) { 231 if (pending & 1) {
232 unsigned int vec_nr = h - softirq_vec;
232 int prev_count = preempt_count(); 233 int prev_count = preempt_count();
233 kstat_incr_softirqs_this_cpu(h - softirq_vec);
234 234
235 trace_softirq_entry(h, softirq_vec); 235 kstat_incr_softirqs_this_cpu(vec_nr);
236
237 trace_softirq_entry(vec_nr);
236 h->action(h); 238 h->action(h);
237 trace_softirq_exit(h, softirq_vec); 239 trace_softirq_exit(vec_nr);
238 if (unlikely(prev_count != preempt_count())) { 240 if (unlikely(prev_count != preempt_count())) {
239 printk(KERN_ERR "huh, entered softirq %td %s %p" 241 printk(KERN_ERR "huh, entered softirq %u %s %p"
240 "with preempt_count %08x," 242 "with preempt_count %08x,"
241 " exited with %08x?\n", h - softirq_vec, 243 " exited with %08x?\n", vec_nr,
242 softirq_to_name[h - softirq_vec], 244 softirq_to_name[vec_nr], h->action,
243 h->action, prev_count, preempt_count()); 245 prev_count, preempt_count());
244 preempt_count() = prev_count; 246 preempt_count() = prev_count;
245 } 247 }
246 248
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index c3dab054d18e..9ed509a015d8 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -224,6 +224,9 @@ enum {
224 RB_LEN_TIME_STAMP = 16, 224 RB_LEN_TIME_STAMP = 16,
225}; 225};
226 226
227#define skip_time_extend(event) \
228 ((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND))
229
227static inline int rb_null_event(struct ring_buffer_event *event) 230static inline int rb_null_event(struct ring_buffer_event *event)
228{ 231{
229 return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta; 232 return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
@@ -248,8 +251,12 @@ rb_event_data_length(struct ring_buffer_event *event)
248 return length + RB_EVNT_HDR_SIZE; 251 return length + RB_EVNT_HDR_SIZE;
249} 252}
250 253
251/* inline for ring buffer fast paths */ 254/*
252static unsigned 255 * Return the length of the given event. Will return
256 * the length of the time extend if the event is a
257 * time extend.
258 */
259static inline unsigned
253rb_event_length(struct ring_buffer_event *event) 260rb_event_length(struct ring_buffer_event *event)
254{ 261{
255 switch (event->type_len) { 262 switch (event->type_len) {
@@ -274,13 +281,41 @@ rb_event_length(struct ring_buffer_event *event)
274 return 0; 281 return 0;
275} 282}
276 283
284/*
285 * Return total length of time extend and data,
286 * or just the event length for all other events.
287 */
288static inline unsigned
289rb_event_ts_length(struct ring_buffer_event *event)
290{
291 unsigned len = 0;
292
293 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
294 /* time extends include the data event after it */
295 len = RB_LEN_TIME_EXTEND;
296 event = skip_time_extend(event);
297 }
298 return len + rb_event_length(event);
299}
300
277/** 301/**
278 * ring_buffer_event_length - return the length of the event 302 * ring_buffer_event_length - return the length of the event
279 * @event: the event to get the length of 303 * @event: the event to get the length of
304 *
305 * Returns the size of the data load of a data event.
306 * If the event is something other than a data event, it
307 * returns the size of the event itself. With the exception
308 * of a TIME EXTEND, where it still returns the size of the
309 * data load of the data event after it.
280 */ 310 */
281unsigned ring_buffer_event_length(struct ring_buffer_event *event) 311unsigned ring_buffer_event_length(struct ring_buffer_event *event)
282{ 312{
283 unsigned length = rb_event_length(event); 313 unsigned length;
314
315 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
316 event = skip_time_extend(event);
317
318 length = rb_event_length(event);
284 if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) 319 if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
285 return length; 320 return length;
286 length -= RB_EVNT_HDR_SIZE; 321 length -= RB_EVNT_HDR_SIZE;
@@ -294,6 +329,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length);
294static void * 329static void *
295rb_event_data(struct ring_buffer_event *event) 330rb_event_data(struct ring_buffer_event *event)
296{ 331{
332 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
333 event = skip_time_extend(event);
297 BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX); 334 BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
298 /* If length is in len field, then array[0] has the data */ 335 /* If length is in len field, then array[0] has the data */
299 if (event->type_len) 336 if (event->type_len)
@@ -404,9 +441,6 @@ static inline int test_time_stamp(u64 delta)
404/* Max payload is BUF_PAGE_SIZE - header (8bytes) */ 441/* Max payload is BUF_PAGE_SIZE - header (8bytes) */
405#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) 442#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
406 443
407/* Max number of timestamps that can fit on a page */
408#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_EXTEND)
409
410int ring_buffer_print_page_header(struct trace_seq *s) 444int ring_buffer_print_page_header(struct trace_seq *s)
411{ 445{
412 struct buffer_data_page field; 446 struct buffer_data_page field;
@@ -1546,6 +1580,25 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
1546 iter->head = 0; 1580 iter->head = 0;
1547} 1581}
1548 1582
1583/* Slow path, do not inline */
1584static noinline struct ring_buffer_event *
1585rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
1586{
1587 event->type_len = RINGBUF_TYPE_TIME_EXTEND;
1588
1589 /* Not the first event on the page? */
1590 if (rb_event_index(event)) {
1591 event->time_delta = delta & TS_MASK;
1592 event->array[0] = delta >> TS_SHIFT;
1593 } else {
1594 /* nope, just zero it */
1595 event->time_delta = 0;
1596 event->array[0] = 0;
1597 }
1598
1599 return skip_time_extend(event);
1600}
1601
1549/** 1602/**
1550 * ring_buffer_update_event - update event type and data 1603 * ring_buffer_update_event - update event type and data
1551 * @event: the even to update 1604 * @event: the even to update
@@ -1558,28 +1611,31 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
1558 * data field. 1611 * data field.
1559 */ 1612 */
1560static void 1613static void
1561rb_update_event(struct ring_buffer_event *event, 1614rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
1562 unsigned type, unsigned length) 1615 struct ring_buffer_event *event, unsigned length,
1616 int add_timestamp, u64 delta)
1563{ 1617{
1564 event->type_len = type; 1618 /* Only a commit updates the timestamp */
1565 1619 if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
1566 switch (type) { 1620 delta = 0;
1567
1568 case RINGBUF_TYPE_PADDING:
1569 case RINGBUF_TYPE_TIME_EXTEND:
1570 case RINGBUF_TYPE_TIME_STAMP:
1571 break;
1572 1621
1573 case 0: 1622 /*
1574 length -= RB_EVNT_HDR_SIZE; 1623 * If we need to add a timestamp, then we
1575 if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) 1624 * add it to the start of the resevered space.
1576 event->array[0] = length; 1625 */
1577 else 1626 if (unlikely(add_timestamp)) {
1578 event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); 1627 event = rb_add_time_stamp(event, delta);
1579 break; 1628 length -= RB_LEN_TIME_EXTEND;
1580 default: 1629 delta = 0;
1581 BUG();
1582 } 1630 }
1631
1632 event->time_delta = delta;
1633 length -= RB_EVNT_HDR_SIZE;
1634 if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) {
1635 event->type_len = 0;
1636 event->array[0] = length;
1637 } else
1638 event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
1583} 1639}
1584 1640
1585/* 1641/*
@@ -1823,10 +1879,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1823 local_sub(length, &tail_page->write); 1879 local_sub(length, &tail_page->write);
1824} 1880}
1825 1881
1826static struct ring_buffer_event * 1882/*
1883 * This is the slow path, force gcc not to inline it.
1884 */
1885static noinline struct ring_buffer_event *
1827rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, 1886rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1828 unsigned long length, unsigned long tail, 1887 unsigned long length, unsigned long tail,
1829 struct buffer_page *tail_page, u64 *ts) 1888 struct buffer_page *tail_page, u64 ts)
1830{ 1889{
1831 struct buffer_page *commit_page = cpu_buffer->commit_page; 1890 struct buffer_page *commit_page = cpu_buffer->commit_page;
1832 struct ring_buffer *buffer = cpu_buffer->buffer; 1891 struct ring_buffer *buffer = cpu_buffer->buffer;
@@ -1909,8 +1968,8 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1909 * Nested commits always have zero deltas, so 1968 * Nested commits always have zero deltas, so
1910 * just reread the time stamp 1969 * just reread the time stamp
1911 */ 1970 */
1912 *ts = rb_time_stamp(buffer); 1971 ts = rb_time_stamp(buffer);
1913 next_page->page->time_stamp = *ts; 1972 next_page->page->time_stamp = ts;
1914 } 1973 }
1915 1974
1916 out_again: 1975 out_again:
@@ -1929,12 +1988,21 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1929 1988
1930static struct ring_buffer_event * 1989static struct ring_buffer_event *
1931__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, 1990__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1932 unsigned type, unsigned long length, u64 *ts) 1991 unsigned long length, u64 ts,
1992 u64 delta, int add_timestamp)
1933{ 1993{
1934 struct buffer_page *tail_page; 1994 struct buffer_page *tail_page;
1935 struct ring_buffer_event *event; 1995 struct ring_buffer_event *event;
1936 unsigned long tail, write; 1996 unsigned long tail, write;
1937 1997
1998 /*
1999 * If the time delta since the last event is too big to
2000 * hold in the time field of the event, then we append a
2001 * TIME EXTEND event ahead of the data event.
2002 */
2003 if (unlikely(add_timestamp))
2004 length += RB_LEN_TIME_EXTEND;
2005
1938 tail_page = cpu_buffer->tail_page; 2006 tail_page = cpu_buffer->tail_page;
1939 write = local_add_return(length, &tail_page->write); 2007 write = local_add_return(length, &tail_page->write);
1940 2008
@@ -1943,7 +2011,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1943 tail = write - length; 2011 tail = write - length;
1944 2012
1945 /* See if we shot pass the end of this buffer page */ 2013 /* See if we shot pass the end of this buffer page */
1946 if (write > BUF_PAGE_SIZE) 2014 if (unlikely(write > BUF_PAGE_SIZE))
1947 return rb_move_tail(cpu_buffer, length, tail, 2015 return rb_move_tail(cpu_buffer, length, tail,
1948 tail_page, ts); 2016 tail_page, ts);
1949 2017
@@ -1951,18 +2019,16 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1951 2019
1952 event = __rb_page_index(tail_page, tail); 2020 event = __rb_page_index(tail_page, tail);
1953 kmemcheck_annotate_bitfield(event, bitfield); 2021 kmemcheck_annotate_bitfield(event, bitfield);
1954 rb_update_event(event, type, length); 2022 rb_update_event(cpu_buffer, event, length, add_timestamp, delta);
1955 2023
1956 /* The passed in type is zero for DATA */ 2024 local_inc(&tail_page->entries);
1957 if (likely(!type))
1958 local_inc(&tail_page->entries);
1959 2025
1960 /* 2026 /*
1961 * If this is the first commit on the page, then update 2027 * If this is the first commit on the page, then update
1962 * its timestamp. 2028 * its timestamp.
1963 */ 2029 */
1964 if (!tail) 2030 if (!tail)
1965 tail_page->page->time_stamp = *ts; 2031 tail_page->page->time_stamp = ts;
1966 2032
1967 return event; 2033 return event;
1968} 2034}
@@ -1977,7 +2043,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
1977 unsigned long addr; 2043 unsigned long addr;
1978 2044
1979 new_index = rb_event_index(event); 2045 new_index = rb_event_index(event);
1980 old_index = new_index + rb_event_length(event); 2046 old_index = new_index + rb_event_ts_length(event);
1981 addr = (unsigned long)event; 2047 addr = (unsigned long)event;
1982 addr &= PAGE_MASK; 2048 addr &= PAGE_MASK;
1983 2049
@@ -2003,76 +2069,13 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
2003 return 0; 2069 return 0;
2004} 2070}
2005 2071
2006static int
2007rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
2008 u64 *ts, u64 *delta)
2009{
2010 struct ring_buffer_event *event;
2011 int ret;
2012
2013 WARN_ONCE(*delta > (1ULL << 59),
2014 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
2015 (unsigned long long)*delta,
2016 (unsigned long long)*ts,
2017 (unsigned long long)cpu_buffer->write_stamp);
2018
2019 /*
2020 * The delta is too big, we to add a
2021 * new timestamp.
2022 */
2023 event = __rb_reserve_next(cpu_buffer,
2024 RINGBUF_TYPE_TIME_EXTEND,
2025 RB_LEN_TIME_EXTEND,
2026 ts);
2027 if (!event)
2028 return -EBUSY;
2029
2030 if (PTR_ERR(event) == -EAGAIN)
2031 return -EAGAIN;
2032
2033 /* Only a commited time event can update the write stamp */
2034 if (rb_event_is_commit(cpu_buffer, event)) {
2035 /*
2036 * If this is the first on the page, then it was
2037 * updated with the page itself. Try to discard it
2038 * and if we can't just make it zero.
2039 */
2040 if (rb_event_index(event)) {
2041 event->time_delta = *delta & TS_MASK;
2042 event->array[0] = *delta >> TS_SHIFT;
2043 } else {
2044 /* try to discard, since we do not need this */
2045 if (!rb_try_to_discard(cpu_buffer, event)) {
2046 /* nope, just zero it */
2047 event->time_delta = 0;
2048 event->array[0] = 0;
2049 }
2050 }
2051 cpu_buffer->write_stamp = *ts;
2052 /* let the caller know this was the commit */
2053 ret = 1;
2054 } else {
2055 /* Try to discard the event */
2056 if (!rb_try_to_discard(cpu_buffer, event)) {
2057 /* Darn, this is just wasted space */
2058 event->time_delta = 0;
2059 event->array[0] = 0;
2060 }
2061 ret = 0;
2062 }
2063
2064 *delta = 0;
2065
2066 return ret;
2067}
2068
2069static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) 2072static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
2070{ 2073{
2071 local_inc(&cpu_buffer->committing); 2074 local_inc(&cpu_buffer->committing);
2072 local_inc(&cpu_buffer->commits); 2075 local_inc(&cpu_buffer->commits);
2073} 2076}
2074 2077
2075static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) 2078static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
2076{ 2079{
2077 unsigned long commits; 2080 unsigned long commits;
2078 2081
@@ -2110,9 +2113,10 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2110 unsigned long length) 2113 unsigned long length)
2111{ 2114{
2112 struct ring_buffer_event *event; 2115 struct ring_buffer_event *event;
2113 u64 ts, delta = 0; 2116 u64 ts, delta;
2114 int commit = 0;
2115 int nr_loops = 0; 2117 int nr_loops = 0;
2118 int add_timestamp;
2119 u64 diff;
2116 2120
2117 rb_start_commit(cpu_buffer); 2121 rb_start_commit(cpu_buffer);
2118 2122
@@ -2133,6 +2137,9 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2133 2137
2134 length = rb_calculate_event_length(length); 2138 length = rb_calculate_event_length(length);
2135 again: 2139 again:
2140 add_timestamp = 0;
2141 delta = 0;
2142
2136 /* 2143 /*
2137 * We allow for interrupts to reenter here and do a trace. 2144 * We allow for interrupts to reenter here and do a trace.
2138 * If one does, it will cause this original code to loop 2145 * If one does, it will cause this original code to loop
@@ -2146,56 +2153,32 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2146 goto out_fail; 2153 goto out_fail;
2147 2154
2148 ts = rb_time_stamp(cpu_buffer->buffer); 2155 ts = rb_time_stamp(cpu_buffer->buffer);
2156 diff = ts - cpu_buffer->write_stamp;
2149 2157
2150 /* 2158 /* make sure this diff is calculated here */
2151 * Only the first commit can update the timestamp. 2159 barrier();
2152 * Yes there is a race here. If an interrupt comes in
2153 * just after the conditional and it traces too, then it
2154 * will also check the deltas. More than one timestamp may
2155 * also be made. But only the entry that did the actual
2156 * commit will be something other than zero.
2157 */
2158 if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page &&
2159 rb_page_write(cpu_buffer->tail_page) ==
2160 rb_commit_index(cpu_buffer))) {
2161 u64 diff;
2162
2163 diff = ts - cpu_buffer->write_stamp;
2164
2165 /* make sure this diff is calculated here */
2166 barrier();
2167
2168 /* Did the write stamp get updated already? */
2169 if (unlikely(ts < cpu_buffer->write_stamp))
2170 goto get_event;
2171 2160
2161 /* Did the write stamp get updated already? */
2162 if (likely(ts >= cpu_buffer->write_stamp)) {
2172 delta = diff; 2163 delta = diff;
2173 if (unlikely(test_time_stamp(delta))) { 2164 if (unlikely(test_time_stamp(delta))) {
2174 2165 WARN_ONCE(delta > (1ULL << 59),
2175 commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); 2166 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
2176 if (commit == -EBUSY) 2167 (unsigned long long)delta,
2177 goto out_fail; 2168 (unsigned long long)ts,
2178 2169 (unsigned long long)cpu_buffer->write_stamp);
2179 if (commit == -EAGAIN) 2170 add_timestamp = 1;
2180 goto again;
2181
2182 RB_WARN_ON(cpu_buffer, commit < 0);
2183 } 2171 }
2184 } 2172 }
2185 2173
2186 get_event: 2174 event = __rb_reserve_next(cpu_buffer, length, ts,
2187 event = __rb_reserve_next(cpu_buffer, 0, length, &ts); 2175 delta, add_timestamp);
2188 if (unlikely(PTR_ERR(event) == -EAGAIN)) 2176 if (unlikely(PTR_ERR(event) == -EAGAIN))
2189 goto again; 2177 goto again;
2190 2178
2191 if (!event) 2179 if (!event)
2192 goto out_fail; 2180 goto out_fail;
2193 2181
2194 if (!rb_event_is_commit(cpu_buffer, event))
2195 delta = 0;
2196
2197 event->time_delta = delta;
2198
2199 return event; 2182 return event;
2200 2183
2201 out_fail: 2184 out_fail:
@@ -2207,13 +2190,9 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2207 2190
2208#define TRACE_RECURSIVE_DEPTH 16 2191#define TRACE_RECURSIVE_DEPTH 16
2209 2192
2210static int trace_recursive_lock(void) 2193/* Keep this code out of the fast path cache */
2194static noinline void trace_recursive_fail(void)
2211{ 2195{
2212 current->trace_recursion++;
2213
2214 if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
2215 return 0;
2216
2217 /* Disable all tracing before we do anything else */ 2196 /* Disable all tracing before we do anything else */
2218 tracing_off_permanent(); 2197 tracing_off_permanent();
2219 2198
@@ -2225,10 +2204,21 @@ static int trace_recursive_lock(void)
2225 in_nmi()); 2204 in_nmi());
2226 2205
2227 WARN_ON_ONCE(1); 2206 WARN_ON_ONCE(1);
2207}
2208
2209static inline int trace_recursive_lock(void)
2210{
2211 current->trace_recursion++;
2212
2213 if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
2214 return 0;
2215
2216 trace_recursive_fail();
2217
2228 return -1; 2218 return -1;
2229} 2219}
2230 2220
2231static void trace_recursive_unlock(void) 2221static inline void trace_recursive_unlock(void)
2232{ 2222{
2233 WARN_ON_ONCE(!current->trace_recursion); 2223 WARN_ON_ONCE(!current->trace_recursion);
2234 2224
@@ -2308,12 +2298,28 @@ static void
2308rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, 2298rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
2309 struct ring_buffer_event *event) 2299 struct ring_buffer_event *event)
2310{ 2300{
2301 u64 delta;
2302
2311 /* 2303 /*
2312 * The event first in the commit queue updates the 2304 * The event first in the commit queue updates the
2313 * time stamp. 2305 * time stamp.
2314 */ 2306 */
2315 if (rb_event_is_commit(cpu_buffer, event)) 2307 if (rb_event_is_commit(cpu_buffer, event)) {
2316 cpu_buffer->write_stamp += event->time_delta; 2308 /*
2309 * A commit event that is first on a page
2310 * updates the write timestamp with the page stamp
2311 */
2312 if (!rb_event_index(event))
2313 cpu_buffer->write_stamp =
2314 cpu_buffer->commit_page->page->time_stamp;
2315 else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
2316 delta = event->array[0];
2317 delta <<= TS_SHIFT;
2318 delta += event->time_delta;
2319 cpu_buffer->write_stamp += delta;
2320 } else
2321 cpu_buffer->write_stamp += event->time_delta;
2322 }
2317} 2323}
2318 2324
2319static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, 2325static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
@@ -2353,6 +2359,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
2353 2359
2354static inline void rb_event_discard(struct ring_buffer_event *event) 2360static inline void rb_event_discard(struct ring_buffer_event *event)
2355{ 2361{
2362 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
2363 event = skip_time_extend(event);
2364
2356 /* array[0] holds the actual length for the discarded event */ 2365 /* array[0] holds the actual length for the discarded event */
2357 event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; 2366 event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
2358 event->type_len = RINGBUF_TYPE_PADDING; 2367 event->type_len = RINGBUF_TYPE_PADDING;
@@ -3049,12 +3058,12 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
3049 3058
3050 again: 3059 again:
3051 /* 3060 /*
3052 * We repeat when a timestamp is encountered. It is possible 3061 * We repeat when a time extend is encountered.
3053 * to get multiple timestamps from an interrupt entering just 3062 * Since the time extend is always attached to a data event,
3054 * as one timestamp is about to be written, or from discarded 3063 * we should never loop more than once.
3055 * commits. The most that we can have is the number on a single page. 3064 * (We never hit the following condition more than twice).
3056 */ 3065 */
3057 if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) 3066 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
3058 return NULL; 3067 return NULL;
3059 3068
3060 reader = rb_get_reader_page(cpu_buffer); 3069 reader = rb_get_reader_page(cpu_buffer);
@@ -3130,14 +3139,12 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3130 return NULL; 3139 return NULL;
3131 3140
3132 /* 3141 /*
3133 * We repeat when a timestamp is encountered. 3142 * We repeat when a time extend is encountered.
3134 * We can get multiple timestamps by nested interrupts or also 3143 * Since the time extend is always attached to a data event,
3135 * if filtering is on (discarding commits). Since discarding 3144 * we should never loop more than once.
3136 * commits can be frequent we can get a lot of timestamps. 3145 * (We never hit the following condition more than twice).
3137 * But we limit them by not adding timestamps if they begin
3138 * at the start of a page.
3139 */ 3146 */
3140 if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) 3147 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
3141 return NULL; 3148 return NULL;
3142 3149
3143 if (rb_per_cpu_empty(cpu_buffer)) 3150 if (rb_per_cpu_empty(cpu_buffer))
@@ -3835,7 +3842,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3835 if (len > (commit - read)) 3842 if (len > (commit - read))
3836 len = (commit - read); 3843 len = (commit - read);
3837 3844
3838 size = rb_event_length(event); 3845 /* Always keep the time extend and data together */
3846 size = rb_event_ts_length(event);
3839 3847
3840 if (len < size) 3848 if (len < size)
3841 goto out_unlock; 3849 goto out_unlock;
@@ -3857,7 +3865,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3857 break; 3865 break;
3858 3866
3859 event = rb_reader_event(cpu_buffer); 3867 event = rb_reader_event(cpu_buffer);
3860 size = rb_event_length(event); 3868 /* Always keep the time extend and data together */
3869 size = rb_event_ts_length(event);
3861 } while (len > size); 3870 } while (len > size);
3862 3871
3863 /* update bpage */ 3872 /* update bpage */
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 001bcd2ccf4a..82d9b8106cd0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3996,13 +3996,9 @@ static void tracing_init_debugfs_percpu(long cpu)
3996{ 3996{
3997 struct dentry *d_percpu = tracing_dentry_percpu(); 3997 struct dentry *d_percpu = tracing_dentry_percpu();
3998 struct dentry *d_cpu; 3998 struct dentry *d_cpu;
3999 /* strlen(cpu) + MAX(log10(cpu)) + '\0' */ 3999 char cpu_dir[30]; /* 30 characters should be more than enough */
4000 char cpu_dir[7];
4001 4000
4002 if (cpu > 999 || cpu < 0) 4001 snprintf(cpu_dir, 30, "cpu%ld", cpu);
4003 return;
4004
4005 sprintf(cpu_dir, "cpu%ld", cpu);
4006 d_cpu = debugfs_create_dir(cpu_dir, d_percpu); 4002 d_cpu = debugfs_create_dir(cpu_dir, d_percpu);
4007 if (!d_cpu) { 4003 if (!d_cpu) {
4008 pr_warning("Could not create debugfs '%s' entry\n", cpu_dir); 4004 pr_warning("Could not create debugfs '%s' entry\n", cpu_dir);