diff options
author | Steven Rostedt <rostedt@goodmis.org> | 2008-10-04 02:00:59 -0400 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2008-10-14 04:39:19 -0400 |
commit | bf41a158cacba6ca5fc6407a54e7ad8ce1567e2e (patch) | |
tree | e9424b4927f99cfb3acce3cfab2635ae8f8c8ba1 /kernel | |
parent | 6f807acd27734197b11d42829d3cbb9c0937b572 (diff) |
ring-buffer: make reentrant
This patch replaces the local_irq_save/restore with preempt_disable/enable.
This allows interrupts to come in while an event is being recorded.
To write to the ring buffer, you must reserve data, and then
commit it. During this time, an interrupt may call a trace function
that will also record into the buffer before the commit is made.
The interrupt will reserve its entry after the first entry, even
though the first entry did not finish yet.
The time stamp delta of the interrupt entry will be zero, since
from the point of view of the trace, the interrupt happened during
the first entry anyway.
Locking still takes place when the tail/write moves from one page
to the next. The reader always takes the locks.
A new page pointer is added, called the commit. The write/tail will
always point to the end of all entries. The commit field will
point to the last committed entry. Only this commit entry may
update the write time stamp.
The reader can only go up to the commit. It cannot go past it.
If a lot of interrupts come in during a commit and fill up the
buffer, so that the write happens to make it all the way around the
buffer back to the commit, then a warning is printed and new events
will be dropped.
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/trace/ring_buffer.c | 487 |
1 files changed, 374 insertions, 113 deletions
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 09d4f0d879a7..94af1fe56bb4 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
@@ -116,8 +116,8 @@ void *ring_buffer_event_data(struct ring_buffer_event *event) | |||
116 | */ | 116 | */ |
117 | struct buffer_page { | 117 | struct buffer_page { |
118 | u64 time_stamp; /* page time stamp */ | 118 | u64 time_stamp; /* page time stamp */ |
119 | unsigned size; /* size of page data */ | 119 | local_t write; /* index for next write */ |
120 | unsigned write; /* index for next write */ | 120 | local_t commit; /* write commited index */ |
121 | unsigned read; /* index for next read */ | 121 | unsigned read; /* index for next read */ |
122 | struct list_head list; /* list of free pages */ | 122 | struct list_head list; /* list of free pages */ |
123 | void *page; /* Actual data page */ | 123 | void *page; /* Actual data page */ |
@@ -157,6 +157,7 @@ struct ring_buffer_per_cpu { | |||
157 | struct list_head pages; | 157 | struct list_head pages; |
158 | struct buffer_page *head_page; /* read from head */ | 158 | struct buffer_page *head_page; /* read from head */ |
159 | struct buffer_page *tail_page; /* write to tail */ | 159 | struct buffer_page *tail_page; /* write to tail */ |
160 | struct buffer_page *commit_page; /* commited pages */ | ||
160 | struct buffer_page *reader_page; | 161 | struct buffer_page *reader_page; |
161 | unsigned long overrun; | 162 | unsigned long overrun; |
162 | unsigned long entries; | 163 | unsigned long entries; |
@@ -185,12 +186,32 @@ struct ring_buffer_iter { | |||
185 | u64 read_stamp; | 186 | u64 read_stamp; |
186 | }; | 187 | }; |
187 | 188 | ||
188 | #define RB_WARN_ON(buffer, cond) \ | 189 | #define RB_WARN_ON(buffer, cond) \ |
189 | if (unlikely(cond)) { \ | 190 | do { \ |
190 | atomic_inc(&buffer->record_disabled); \ | 191 | if (unlikely(cond)) { \ |
191 | WARN_ON(1); \ | 192 | atomic_inc(&buffer->record_disabled); \ |
192 | return -1; \ | 193 | WARN_ON(1); \ |
193 | } | 194 | } \ |
195 | } while (0) | ||
196 | |||
197 | #define RB_WARN_ON_RET(buffer, cond) \ | ||
198 | do { \ | ||
199 | if (unlikely(cond)) { \ | ||
200 | atomic_inc(&buffer->record_disabled); \ | ||
201 | WARN_ON(1); \ | ||
202 | return -1; \ | ||
203 | } \ | ||
204 | } while (0) | ||
205 | |||
206 | #define RB_WARN_ON_ONCE(buffer, cond) \ | ||
207 | do { \ | ||
208 | static int once; \ | ||
209 | if (unlikely(cond) && !once) { \ | ||
210 | once++; \ | ||
211 | atomic_inc(&buffer->record_disabled); \ | ||
212 | WARN_ON(1); \ | ||
213 | } \ | ||
214 | } while (0) | ||
194 | 215 | ||
195 | /** | 216 | /** |
196 | * check_pages - integrity check of buffer pages | 217 | * check_pages - integrity check of buffer pages |
@@ -204,22 +225,19 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) | |||
204 | struct list_head *head = &cpu_buffer->pages; | 225 | struct list_head *head = &cpu_buffer->pages; |
205 | struct buffer_page *page, *tmp; | 226 | struct buffer_page *page, *tmp; |
206 | 227 | ||
207 | RB_WARN_ON(cpu_buffer, head->next->prev != head); | 228 | RB_WARN_ON_RET(cpu_buffer, head->next->prev != head); |
208 | RB_WARN_ON(cpu_buffer, head->prev->next != head); | 229 | RB_WARN_ON_RET(cpu_buffer, head->prev->next != head); |
209 | 230 | ||
210 | list_for_each_entry_safe(page, tmp, head, list) { | 231 | list_for_each_entry_safe(page, tmp, head, list) { |
211 | RB_WARN_ON(cpu_buffer, page->list.next->prev != &page->list); | 232 | RB_WARN_ON_RET(cpu_buffer, |
212 | RB_WARN_ON(cpu_buffer, page->list.prev->next != &page->list); | 233 | page->list.next->prev != &page->list); |
234 | RB_WARN_ON_RET(cpu_buffer, | ||
235 | page->list.prev->next != &page->list); | ||
213 | } | 236 | } |
214 | 237 | ||
215 | return 0; | 238 | return 0; |
216 | } | 239 | } |
217 | 240 | ||
218 | static unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer) | ||
219 | { | ||
220 | return cpu_buffer->head_page->size; | ||
221 | } | ||
222 | |||
223 | static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, | 241 | static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, |
224 | unsigned nr_pages) | 242 | unsigned nr_pages) |
225 | { | 243 | { |
@@ -286,7 +304,6 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) | |||
286 | page->page = (void *)addr; | 304 | page->page = (void *)addr; |
287 | 305 | ||
288 | INIT_LIST_HEAD(&cpu_buffer->reader_page->list); | 306 | INIT_LIST_HEAD(&cpu_buffer->reader_page->list); |
289 | cpu_buffer->reader_page->size = 0; | ||
290 | 307 | ||
291 | ret = rb_allocate_pages(cpu_buffer, buffer->pages); | 308 | ret = rb_allocate_pages(cpu_buffer, buffer->pages); |
292 | if (ret < 0) | 309 | if (ret < 0) |
@@ -294,8 +311,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) | |||
294 | 311 | ||
295 | cpu_buffer->head_page | 312 | cpu_buffer->head_page |
296 | = list_entry(cpu_buffer->pages.next, struct buffer_page, list); | 313 | = list_entry(cpu_buffer->pages.next, struct buffer_page, list); |
297 | cpu_buffer->tail_page | 314 | cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; |
298 | = list_entry(cpu_buffer->pages.next, struct buffer_page, list); | ||
299 | 315 | ||
300 | return cpu_buffer; | 316 | return cpu_buffer; |
301 | 317 | ||
@@ -563,15 +579,6 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) | |||
563 | return -ENOMEM; | 579 | return -ENOMEM; |
564 | } | 580 | } |
565 | 581 | ||
566 | static inline int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) | ||
567 | { | ||
568 | return cpu_buffer->reader_page->read == cpu_buffer->reader_page->size && | ||
569 | (cpu_buffer->tail_page == cpu_buffer->reader_page || | ||
570 | (cpu_buffer->tail_page == cpu_buffer->head_page && | ||
571 | cpu_buffer->head_page->read == | ||
572 | cpu_buffer->tail_page->write)); | ||
573 | } | ||
574 | |||
575 | static inline int rb_null_event(struct ring_buffer_event *event) | 582 | static inline int rb_null_event(struct ring_buffer_event *event) |
576 | { | 583 | { |
577 | return event->type == RINGBUF_TYPE_PADDING; | 584 | return event->type == RINGBUF_TYPE_PADDING; |
@@ -602,6 +609,33 @@ rb_iter_head_event(struct ring_buffer_iter *iter) | |||
602 | return __rb_page_index(iter->head_page, iter->head); | 609 | return __rb_page_index(iter->head_page, iter->head); |
603 | } | 610 | } |
604 | 611 | ||
612 | static inline unsigned rb_page_write(struct buffer_page *bpage) | ||
613 | { | ||
614 | return local_read(&bpage->write); | ||
615 | } | ||
616 | |||
617 | static inline unsigned rb_page_commit(struct buffer_page *bpage) | ||
618 | { | ||
619 | return local_read(&bpage->commit); | ||
620 | } | ||
621 | |||
622 | /* Size is determined by what has been commited */ | ||
623 | static inline unsigned rb_page_size(struct buffer_page *bpage) | ||
624 | { | ||
625 | return rb_page_commit(bpage); | ||
626 | } | ||
627 | |||
628 | static inline unsigned | ||
629 | rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer) | ||
630 | { | ||
631 | return rb_page_commit(cpu_buffer->commit_page); | ||
632 | } | ||
633 | |||
634 | static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer) | ||
635 | { | ||
636 | return rb_page_commit(cpu_buffer->head_page); | ||
637 | } | ||
638 | |||
605 | /* | 639 | /* |
606 | * When the tail hits the head and the buffer is in overwrite mode, | 640 | * When the tail hits the head and the buffer is in overwrite mode, |
607 | * the head jumps to the next page and all content on the previous | 641 | * the head jumps to the next page and all content on the previous |
@@ -637,16 +671,76 @@ static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer, | |||
637 | *page = list_entry(p, struct buffer_page, list); | 671 | *page = list_entry(p, struct buffer_page, list); |
638 | } | 672 | } |
639 | 673 | ||
640 | static inline void | 674 | static inline unsigned |
641 | rb_add_stamp(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts) | 675 | rb_event_index(struct ring_buffer_event *event) |
642 | { | 676 | { |
643 | cpu_buffer->tail_page->time_stamp = *ts; | 677 | unsigned long addr = (unsigned long)event; |
644 | cpu_buffer->write_stamp = *ts; | 678 | |
679 | return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE); | ||
645 | } | 680 | } |
646 | 681 | ||
647 | static void rb_reset_head_page(struct ring_buffer_per_cpu *cpu_buffer) | 682 | static inline int |
683 | rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer, | ||
684 | struct ring_buffer_event *event) | ||
648 | { | 685 | { |
649 | cpu_buffer->head_page->read = 0; | 686 | unsigned long addr = (unsigned long)event; |
687 | unsigned long index; | ||
688 | |||
689 | index = rb_event_index(event); | ||
690 | addr &= PAGE_MASK; | ||
691 | |||
692 | return cpu_buffer->commit_page->page == (void *)addr && | ||
693 | rb_commit_index(cpu_buffer) == index; | ||
694 | } | ||
695 | |||
696 | static inline void | ||
697 | rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer, | ||
698 | struct ring_buffer_event *event) | ||
699 | { | ||
700 | unsigned long addr = (unsigned long)event; | ||
701 | unsigned long index; | ||
702 | |||
703 | index = rb_event_index(event); | ||
704 | addr &= PAGE_MASK; | ||
705 | |||
706 | while (cpu_buffer->commit_page->page != (void *)addr) { | ||
707 | RB_WARN_ON(cpu_buffer, | ||
708 | cpu_buffer->commit_page == cpu_buffer->tail_page); | ||
709 | cpu_buffer->commit_page->commit = | ||
710 | cpu_buffer->commit_page->write; | ||
711 | rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); | ||
712 | cpu_buffer->write_stamp = cpu_buffer->commit_page->time_stamp; | ||
713 | } | ||
714 | |||
715 | /* Now set the commit to the event's index */ | ||
716 | local_set(&cpu_buffer->commit_page->commit, index); | ||
717 | } | ||
718 | |||
719 | static inline void | ||
720 | rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) | ||
721 | { | ||
722 | /* | ||
723 | * We only race with interrupts and NMIs on this CPU. | ||
724 | * If we own the commit event, then we can commit | ||
725 | * all others that interrupted us, since the interruptions | ||
726 | * are in stack format (they finish before they come | ||
727 | * back to us). This allows us to do a simple loop to | ||
728 | * assign the commit to the tail. | ||
729 | */ | ||
730 | while (cpu_buffer->commit_page != cpu_buffer->tail_page) { | ||
731 | cpu_buffer->commit_page->commit = | ||
732 | cpu_buffer->commit_page->write; | ||
733 | rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); | ||
734 | cpu_buffer->write_stamp = cpu_buffer->commit_page->time_stamp; | ||
735 | /* add barrier to keep gcc from optimizing too much */ | ||
736 | barrier(); | ||
737 | } | ||
738 | while (rb_commit_index(cpu_buffer) != | ||
739 | rb_page_write(cpu_buffer->commit_page)) { | ||
740 | cpu_buffer->commit_page->commit = | ||
741 | cpu_buffer->commit_page->write; | ||
742 | barrier(); | ||
743 | } | ||
650 | } | 744 | } |
651 | 745 | ||
652 | static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer) | 746 | static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer) |
@@ -745,61 +839,120 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, | |||
745 | unsigned type, unsigned long length, u64 *ts) | 839 | unsigned type, unsigned long length, u64 *ts) |
746 | { | 840 | { |
747 | struct buffer_page *tail_page, *head_page, *reader_page; | 841 | struct buffer_page *tail_page, *head_page, *reader_page; |
748 | unsigned long tail; | 842 | unsigned long tail, write; |
749 | struct ring_buffer *buffer = cpu_buffer->buffer; | 843 | struct ring_buffer *buffer = cpu_buffer->buffer; |
750 | struct ring_buffer_event *event; | 844 | struct ring_buffer_event *event; |
845 | unsigned long flags; | ||
751 | 846 | ||
752 | tail_page = cpu_buffer->tail_page; | 847 | tail_page = cpu_buffer->tail_page; |
753 | tail = cpu_buffer->tail_page->write; | 848 | write = local_add_return(length, &tail_page->write); |
849 | tail = write - length; | ||
754 | 850 | ||
755 | if (tail + length > BUF_PAGE_SIZE) { | 851 | /* See if we shot pass the end of this buffer page */ |
852 | if (write > BUF_PAGE_SIZE) { | ||
756 | struct buffer_page *next_page = tail_page; | 853 | struct buffer_page *next_page = tail_page; |
757 | 854 | ||
758 | spin_lock(&cpu_buffer->lock); | 855 | spin_lock_irqsave(&cpu_buffer->lock, flags); |
856 | |||
759 | rb_inc_page(cpu_buffer, &next_page); | 857 | rb_inc_page(cpu_buffer, &next_page); |
760 | 858 | ||
761 | head_page = cpu_buffer->head_page; | 859 | head_page = cpu_buffer->head_page; |
762 | reader_page = cpu_buffer->reader_page; | 860 | reader_page = cpu_buffer->reader_page; |
763 | 861 | ||
764 | /* we grabbed the lock before incrementing */ | 862 | /* we grabbed the lock before incrementing */ |
765 | WARN_ON(next_page == reader_page); | 863 | RB_WARN_ON(cpu_buffer, next_page == reader_page); |
864 | |||
865 | /* | ||
866 | * If for some reason, we had an interrupt storm that made | ||
867 | * it all the way around the buffer, bail, and warn | ||
868 | * about it. | ||
869 | */ | ||
870 | if (unlikely(next_page == cpu_buffer->commit_page)) { | ||
871 | WARN_ON_ONCE(1); | ||
872 | goto out_unlock; | ||
873 | } | ||
766 | 874 | ||
767 | if (next_page == head_page) { | 875 | if (next_page == head_page) { |
768 | if (!(buffer->flags & RB_FL_OVERWRITE)) { | 876 | if (!(buffer->flags & RB_FL_OVERWRITE)) { |
769 | spin_unlock(&cpu_buffer->lock); | 877 | /* reset write */ |
770 | return NULL; | 878 | if (tail <= BUF_PAGE_SIZE) |
879 | local_set(&tail_page->write, tail); | ||
880 | goto out_unlock; | ||
771 | } | 881 | } |
772 | 882 | ||
773 | /* count overflows */ | 883 | /* tail_page has not moved yet? */ |
774 | rb_update_overflow(cpu_buffer); | 884 | if (tail_page == cpu_buffer->tail_page) { |
885 | /* count overflows */ | ||
886 | rb_update_overflow(cpu_buffer); | ||
887 | |||
888 | rb_inc_page(cpu_buffer, &head_page); | ||
889 | cpu_buffer->head_page = head_page; | ||
890 | cpu_buffer->head_page->read = 0; | ||
891 | } | ||
892 | } | ||
775 | 893 | ||
776 | rb_inc_page(cpu_buffer, &head_page); | 894 | /* |
777 | cpu_buffer->head_page = head_page; | 895 | * If the tail page is still the same as what we think |
778 | rb_reset_head_page(cpu_buffer); | 896 | * it is, then it is up to us to update the tail |
897 | * pointer. | ||
898 | */ | ||
899 | if (tail_page == cpu_buffer->tail_page) { | ||
900 | local_set(&next_page->write, 0); | ||
901 | local_set(&next_page->commit, 0); | ||
902 | cpu_buffer->tail_page = next_page; | ||
903 | |||
904 | /* reread the time stamp */ | ||
905 | *ts = ring_buffer_time_stamp(cpu_buffer->cpu); | ||
906 | cpu_buffer->tail_page->time_stamp = *ts; | ||
779 | } | 907 | } |
780 | 908 | ||
781 | if (tail != BUF_PAGE_SIZE) { | 909 | /* |
910 | * The actual tail page has moved forward. | ||
911 | */ | ||
912 | if (tail < BUF_PAGE_SIZE) { | ||
913 | /* Mark the rest of the page with padding */ | ||
782 | event = __rb_page_index(tail_page, tail); | 914 | event = __rb_page_index(tail_page, tail); |
783 | /* page padding */ | ||
784 | event->type = RINGBUF_TYPE_PADDING; | 915 | event->type = RINGBUF_TYPE_PADDING; |
785 | } | 916 | } |
786 | 917 | ||
787 | tail_page->size = tail; | 918 | if (tail <= BUF_PAGE_SIZE) |
788 | tail_page = next_page; | 919 | /* Set the write back to the previous setting */ |
789 | tail_page->size = 0; | 920 | local_set(&tail_page->write, tail); |
790 | tail = 0; | 921 | |
791 | cpu_buffer->tail_page = tail_page; | 922 | /* |
792 | cpu_buffer->tail_page->write = tail; | 923 | * If this was a commit entry that failed, |
793 | rb_add_stamp(cpu_buffer, ts); | 924 | * increment that too |
794 | spin_unlock(&cpu_buffer->lock); | 925 | */ |
926 | if (tail_page == cpu_buffer->commit_page && | ||
927 | tail == rb_commit_index(cpu_buffer)) { | ||
928 | rb_set_commit_to_write(cpu_buffer); | ||
929 | } | ||
930 | |||
931 | spin_unlock_irqrestore(&cpu_buffer->lock, flags); | ||
932 | |||
933 | /* fail and let the caller try again */ | ||
934 | return ERR_PTR(-EAGAIN); | ||
795 | } | 935 | } |
796 | 936 | ||
797 | BUG_ON(tail + length > BUF_PAGE_SIZE); | 937 | /* We reserved something on the buffer */ |
938 | |||
939 | BUG_ON(write > BUF_PAGE_SIZE); | ||
798 | 940 | ||
799 | event = __rb_page_index(tail_page, tail); | 941 | event = __rb_page_index(tail_page, tail); |
800 | rb_update_event(event, type, length); | 942 | rb_update_event(event, type, length); |
801 | 943 | ||
944 | /* | ||
945 | * If this is a commit and the tail is zero, then update | ||
946 | * this page's time stamp. | ||
947 | */ | ||
948 | if (!tail && rb_is_commit(cpu_buffer, event)) | ||
949 | cpu_buffer->commit_page->time_stamp = *ts; | ||
950 | |||
802 | return event; | 951 | return event; |
952 | |||
953 | out_unlock: | ||
954 | spin_unlock_irqrestore(&cpu_buffer->lock, flags); | ||
955 | return NULL; | ||
803 | } | 956 | } |
804 | 957 | ||
805 | static int | 958 | static int |
@@ -808,6 +961,7 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, | |||
808 | { | 961 | { |
809 | struct ring_buffer_event *event; | 962 | struct ring_buffer_event *event; |
810 | static int once; | 963 | static int once; |
964 | int ret; | ||
811 | 965 | ||
812 | if (unlikely(*delta > (1ULL << 59) && !once++)) { | 966 | if (unlikely(*delta > (1ULL << 59) && !once++)) { |
813 | printk(KERN_WARNING "Delta way too big! %llu" | 967 | printk(KERN_WARNING "Delta way too big! %llu" |
@@ -825,21 +979,38 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, | |||
825 | RB_LEN_TIME_EXTEND, | 979 | RB_LEN_TIME_EXTEND, |
826 | ts); | 980 | ts); |
827 | if (!event) | 981 | if (!event) |
828 | return -1; | 982 | return -EBUSY; |
829 | 983 | ||
830 | /* check to see if we went to the next page */ | 984 | if (PTR_ERR(event) == -EAGAIN) |
831 | if (cpu_buffer->tail_page->write) { | 985 | return -EAGAIN; |
832 | /* Still on same page, update timestamp */ | 986 | |
833 | event->time_delta = *delta & TS_MASK; | 987 | /* Only a commited time event can update the write stamp */ |
834 | event->array[0] = *delta >> TS_SHIFT; | 988 | if (rb_is_commit(cpu_buffer, event)) { |
835 | /* commit the time event */ | 989 | /* |
836 | cpu_buffer->tail_page->write += | 990 | * If this is the first on the page, then we need to |
837 | rb_event_length(event); | 991 | * update the page itself, and just put in a zero. |
992 | */ | ||
993 | if (rb_event_index(event)) { | ||
994 | event->time_delta = *delta & TS_MASK; | ||
995 | event->array[0] = *delta >> TS_SHIFT; | ||
996 | } else { | ||
997 | cpu_buffer->commit_page->time_stamp = *ts; | ||
998 | event->time_delta = 0; | ||
999 | event->array[0] = 0; | ||
1000 | } | ||
838 | cpu_buffer->write_stamp = *ts; | 1001 | cpu_buffer->write_stamp = *ts; |
839 | *delta = 0; | 1002 | /* let the caller know this was the commit */ |
1003 | ret = 1; | ||
1004 | } else { | ||
1005 | /* Darn, this is just wasted space */ | ||
1006 | event->time_delta = 0; | ||
1007 | event->array[0] = 0; | ||
1008 | ret = 0; | ||
840 | } | 1009 | } |
841 | 1010 | ||
842 | return 0; | 1011 | *delta = 0; |
1012 | |||
1013 | return ret; | ||
843 | } | 1014 | } |
844 | 1015 | ||
845 | static struct ring_buffer_event * | 1016 | static struct ring_buffer_event * |
@@ -848,32 +1019,69 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, | |||
848 | { | 1019 | { |
849 | struct ring_buffer_event *event; | 1020 | struct ring_buffer_event *event; |
850 | u64 ts, delta; | 1021 | u64 ts, delta; |
1022 | int commit = 0; | ||
851 | 1023 | ||
1024 | again: | ||
852 | ts = ring_buffer_time_stamp(cpu_buffer->cpu); | 1025 | ts = ring_buffer_time_stamp(cpu_buffer->cpu); |
853 | 1026 | ||
854 | if (cpu_buffer->tail_page->write) { | 1027 | /* |
1028 | * Only the first commit can update the timestamp. | ||
1029 | * Yes there is a race here. If an interrupt comes in | ||
1030 | * just after the conditional and it traces too, then it | ||
1031 | * will also check the deltas. More than one timestamp may | ||
1032 | * also be made. But only the entry that did the actual | ||
1033 | * commit will be something other than zero. | ||
1034 | */ | ||
1035 | if (cpu_buffer->tail_page == cpu_buffer->commit_page && | ||
1036 | rb_page_write(cpu_buffer->tail_page) == | ||
1037 | rb_commit_index(cpu_buffer)) { | ||
1038 | |||
855 | delta = ts - cpu_buffer->write_stamp; | 1039 | delta = ts - cpu_buffer->write_stamp; |
856 | 1040 | ||
1041 | /* make sure this delta is calculated here */ | ||
1042 | barrier(); | ||
1043 | |||
1044 | /* Did the write stamp get updated already? */ | ||
1045 | if (unlikely(ts < cpu_buffer->write_stamp)) | ||
1046 | goto again; | ||
1047 | |||
857 | if (test_time_stamp(delta)) { | 1048 | if (test_time_stamp(delta)) { |
858 | int ret; | ||
859 | 1049 | ||
860 | ret = rb_add_time_stamp(cpu_buffer, &ts, &delta); | 1050 | commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); |
861 | if (ret < 0) | 1051 | |
1052 | if (commit == -EBUSY) | ||
862 | return NULL; | 1053 | return NULL; |
1054 | |||
1055 | if (commit == -EAGAIN) | ||
1056 | goto again; | ||
1057 | |||
1058 | RB_WARN_ON(cpu_buffer, commit < 0); | ||
863 | } | 1059 | } |
864 | } else { | 1060 | } else |
865 | spin_lock(&cpu_buffer->lock); | 1061 | /* Non commits have zero deltas */ |
866 | rb_add_stamp(cpu_buffer, &ts); | ||
867 | spin_unlock(&cpu_buffer->lock); | ||
868 | delta = 0; | 1062 | delta = 0; |
869 | } | ||
870 | 1063 | ||
871 | event = __rb_reserve_next(cpu_buffer, type, length, &ts); | 1064 | event = __rb_reserve_next(cpu_buffer, type, length, &ts); |
872 | if (!event) | 1065 | if (PTR_ERR(event) == -EAGAIN) |
1066 | goto again; | ||
1067 | |||
1068 | if (!event) { | ||
1069 | if (unlikely(commit)) | ||
1070 | /* | ||
1071 | * Ouch! We needed a timestamp and it was commited. But | ||
1072 | * we didn't get our event reserved. | ||
1073 | */ | ||
1074 | rb_set_commit_to_write(cpu_buffer); | ||
873 | return NULL; | 1075 | return NULL; |
1076 | } | ||
874 | 1077 | ||
875 | /* If the reserve went to the next page, our delta is zero */ | 1078 | /* |
876 | if (!cpu_buffer->tail_page->write) | 1079 | * If the timestamp was commited, make the commit our entry |
1080 | * now so that we will update it when needed. | ||
1081 | */ | ||
1082 | if (commit) | ||
1083 | rb_set_commit_event(cpu_buffer, event); | ||
1084 | else if (!rb_is_commit(cpu_buffer, event)) | ||
877 | delta = 0; | 1085 | delta = 0; |
878 | 1086 | ||
879 | event->time_delta = delta; | 1087 | event->time_delta = delta; |
@@ -881,6 +1089,8 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, | |||
881 | return event; | 1089 | return event; |
882 | } | 1090 | } |
883 | 1091 | ||
1092 | static DEFINE_PER_CPU(int, rb_need_resched); | ||
1093 | |||
884 | /** | 1094 | /** |
885 | * ring_buffer_lock_reserve - reserve a part of the buffer | 1095 | * ring_buffer_lock_reserve - reserve a part of the buffer |
886 | * @buffer: the ring buffer to reserve from | 1096 | * @buffer: the ring buffer to reserve from |
@@ -904,12 +1114,15 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, | |||
904 | { | 1114 | { |
905 | struct ring_buffer_per_cpu *cpu_buffer; | 1115 | struct ring_buffer_per_cpu *cpu_buffer; |
906 | struct ring_buffer_event *event; | 1116 | struct ring_buffer_event *event; |
907 | int cpu; | 1117 | int cpu, resched; |
908 | 1118 | ||
909 | if (atomic_read(&buffer->record_disabled)) | 1119 | if (atomic_read(&buffer->record_disabled)) |
910 | return NULL; | 1120 | return NULL; |
911 | 1121 | ||
912 | local_irq_save(*flags); | 1122 | /* If we are tracing schedule, we don't want to recurse */ |
1123 | resched = need_resched(); | ||
1124 | preempt_disable_notrace(); | ||
1125 | |||
913 | cpu = raw_smp_processor_id(); | 1126 | cpu = raw_smp_processor_id(); |
914 | 1127 | ||
915 | if (!cpu_isset(cpu, buffer->cpumask)) | 1128 | if (!cpu_isset(cpu, buffer->cpumask)) |
@@ -922,26 +1135,42 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, | |||
922 | 1135 | ||
923 | length = rb_calculate_event_length(length); | 1136 | length = rb_calculate_event_length(length); |
924 | if (length > BUF_PAGE_SIZE) | 1137 | if (length > BUF_PAGE_SIZE) |
925 | return NULL; | 1138 | goto out; |
926 | 1139 | ||
927 | event = rb_reserve_next_event(cpu_buffer, RINGBUF_TYPE_DATA, length); | 1140 | event = rb_reserve_next_event(cpu_buffer, RINGBUF_TYPE_DATA, length); |
928 | if (!event) | 1141 | if (!event) |
929 | goto out; | 1142 | goto out; |
930 | 1143 | ||
1144 | /* | ||
1145 | * Need to store resched state on this cpu. | ||
1146 | * Only the first needs to. | ||
1147 | */ | ||
1148 | |||
1149 | if (preempt_count() == 1) | ||
1150 | per_cpu(rb_need_resched, cpu) = resched; | ||
1151 | |||
931 | return event; | 1152 | return event; |
932 | 1153 | ||
933 | out: | 1154 | out: |
934 | local_irq_restore(*flags); | 1155 | if (resched) |
1156 | preempt_enable_notrace(); | ||
1157 | else | ||
1158 | preempt_enable_notrace(); | ||
935 | return NULL; | 1159 | return NULL; |
936 | } | 1160 | } |
937 | 1161 | ||
938 | static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, | 1162 | static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, |
939 | struct ring_buffer_event *event) | 1163 | struct ring_buffer_event *event) |
940 | { | 1164 | { |
941 | cpu_buffer->tail_page->write += rb_event_length(event); | ||
942 | cpu_buffer->tail_page->size = cpu_buffer->tail_page->write; | ||
943 | cpu_buffer->write_stamp += event->time_delta; | ||
944 | cpu_buffer->entries++; | 1165 | cpu_buffer->entries++; |
1166 | |||
1167 | /* Only process further if we own the commit */ | ||
1168 | if (!rb_is_commit(cpu_buffer, event)) | ||
1169 | return; | ||
1170 | |||
1171 | cpu_buffer->write_stamp += event->time_delta; | ||
1172 | |||
1173 | rb_set_commit_to_write(cpu_buffer); | ||
945 | } | 1174 | } |
946 | 1175 | ||
947 | /** | 1176 | /** |
@@ -965,7 +1194,16 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, | |||
965 | 1194 | ||
966 | rb_commit(cpu_buffer, event); | 1195 | rb_commit(cpu_buffer, event); |
967 | 1196 | ||
968 | local_irq_restore(flags); | 1197 | /* |
1198 | * Only the last preempt count needs to restore preemption. | ||
1199 | */ | ||
1200 | if (preempt_count() == 1) { | ||
1201 | if (per_cpu(rb_need_resched, cpu)) | ||
1202 | preempt_enable_no_resched_notrace(); | ||
1203 | else | ||
1204 | preempt_enable_notrace(); | ||
1205 | } else | ||
1206 | preempt_enable_no_resched_notrace(); | ||
969 | 1207 | ||
970 | return 0; | 1208 | return 0; |
971 | } | 1209 | } |
@@ -989,15 +1227,17 @@ int ring_buffer_write(struct ring_buffer *buffer, | |||
989 | { | 1227 | { |
990 | struct ring_buffer_per_cpu *cpu_buffer; | 1228 | struct ring_buffer_per_cpu *cpu_buffer; |
991 | struct ring_buffer_event *event; | 1229 | struct ring_buffer_event *event; |
992 | unsigned long event_length, flags; | 1230 | unsigned long event_length; |
993 | void *body; | 1231 | void *body; |
994 | int ret = -EBUSY; | 1232 | int ret = -EBUSY; |
995 | int cpu; | 1233 | int cpu, resched; |
996 | 1234 | ||
997 | if (atomic_read(&buffer->record_disabled)) | 1235 | if (atomic_read(&buffer->record_disabled)) |
998 | return -EBUSY; | 1236 | return -EBUSY; |
999 | 1237 | ||
1000 | local_irq_save(flags); | 1238 | resched = need_resched(); |
1239 | preempt_disable_notrace(); | ||
1240 | |||
1001 | cpu = raw_smp_processor_id(); | 1241 | cpu = raw_smp_processor_id(); |
1002 | 1242 | ||
1003 | if (!cpu_isset(cpu, buffer->cpumask)) | 1243 | if (!cpu_isset(cpu, buffer->cpumask)) |
@@ -1022,11 +1262,26 @@ int ring_buffer_write(struct ring_buffer *buffer, | |||
1022 | 1262 | ||
1023 | ret = 0; | 1263 | ret = 0; |
1024 | out: | 1264 | out: |
1025 | local_irq_restore(flags); | 1265 | if (resched) |
1266 | preempt_enable_no_resched_notrace(); | ||
1267 | else | ||
1268 | preempt_enable_notrace(); | ||
1026 | 1269 | ||
1027 | return ret; | 1270 | return ret; |
1028 | } | 1271 | } |
1029 | 1272 | ||
1273 | static inline int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) | ||
1274 | { | ||
1275 | struct buffer_page *reader = cpu_buffer->reader_page; | ||
1276 | struct buffer_page *head = cpu_buffer->head_page; | ||
1277 | struct buffer_page *commit = cpu_buffer->commit_page; | ||
1278 | |||
1279 | return reader->read == rb_page_commit(reader) && | ||
1280 | (commit == reader || | ||
1281 | (commit == head && | ||
1282 | head->read == rb_page_commit(commit))); | ||
1283 | } | ||
1284 | |||
1030 | /** | 1285 | /** |
1031 | * ring_buffer_record_disable - stop all writes into the buffer | 1286 | * ring_buffer_record_disable - stop all writes into the buffer |
1032 | * @buffer: The ring buffer to stop writes to. | 1287 | * @buffer: The ring buffer to stop writes to. |
@@ -1204,8 +1459,8 @@ int ring_buffer_iter_empty(struct ring_buffer_iter *iter) | |||
1204 | 1459 | ||
1205 | cpu_buffer = iter->cpu_buffer; | 1460 | cpu_buffer = iter->cpu_buffer; |
1206 | 1461 | ||
1207 | return iter->head_page == cpu_buffer->tail_page && | 1462 | return iter->head_page == cpu_buffer->commit_page && |
1208 | iter->head == cpu_buffer->tail_page->write; | 1463 | iter->head == rb_commit_index(cpu_buffer); |
1209 | } | 1464 | } |
1210 | 1465 | ||
1211 | static void | 1466 | static void |
@@ -1282,15 +1537,16 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) | |||
1282 | reader = cpu_buffer->reader_page; | 1537 | reader = cpu_buffer->reader_page; |
1283 | 1538 | ||
1284 | /* If there's more to read, return this page */ | 1539 | /* If there's more to read, return this page */ |
1285 | if (cpu_buffer->reader_page->read < reader->size) | 1540 | if (cpu_buffer->reader_page->read < rb_page_size(reader)) |
1286 | goto out; | 1541 | goto out; |
1287 | 1542 | ||
1288 | /* Never should we have an index greater than the size */ | 1543 | /* Never should we have an index greater than the size */ |
1289 | WARN_ON(cpu_buffer->reader_page->read > reader->size); | 1544 | RB_WARN_ON(cpu_buffer, |
1545 | cpu_buffer->reader_page->read > rb_page_size(reader)); | ||
1290 | 1546 | ||
1291 | /* check if we caught up to the tail */ | 1547 | /* check if we caught up to the tail */ |
1292 | reader = NULL; | 1548 | reader = NULL; |
1293 | if (cpu_buffer->tail_page == cpu_buffer->reader_page) | 1549 | if (cpu_buffer->commit_page == cpu_buffer->reader_page) |
1294 | goto out; | 1550 | goto out; |
1295 | 1551 | ||
1296 | /* | 1552 | /* |
@@ -1301,7 +1557,9 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) | |||
1301 | reader = cpu_buffer->head_page; | 1557 | reader = cpu_buffer->head_page; |
1302 | cpu_buffer->reader_page->list.next = reader->list.next; | 1558 | cpu_buffer->reader_page->list.next = reader->list.next; |
1303 | cpu_buffer->reader_page->list.prev = reader->list.prev; | 1559 | cpu_buffer->reader_page->list.prev = reader->list.prev; |
1304 | cpu_buffer->reader_page->size = 0; | 1560 | |
1561 | local_set(&cpu_buffer->reader_page->write, 0); | ||
1562 | local_set(&cpu_buffer->reader_page->commit, 0); | ||
1305 | 1563 | ||
1306 | /* Make the reader page now replace the head */ | 1564 | /* Make the reader page now replace the head */ |
1307 | reader->list.prev->next = &cpu_buffer->reader_page->list; | 1565 | reader->list.prev->next = &cpu_buffer->reader_page->list; |
@@ -1313,7 +1571,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) | |||
1313 | */ | 1571 | */ |
1314 | cpu_buffer->head_page = cpu_buffer->reader_page; | 1572 | cpu_buffer->head_page = cpu_buffer->reader_page; |
1315 | 1573 | ||
1316 | if (cpu_buffer->tail_page != reader) | 1574 | if (cpu_buffer->commit_page != reader) |
1317 | rb_inc_page(cpu_buffer, &cpu_buffer->head_page); | 1575 | rb_inc_page(cpu_buffer, &cpu_buffer->head_page); |
1318 | 1576 | ||
1319 | /* Finally update the reader page to the new head */ | 1577 | /* Finally update the reader page to the new head */ |
@@ -1363,8 +1621,8 @@ static void rb_advance_iter(struct ring_buffer_iter *iter) | |||
1363 | /* | 1621 | /* |
1364 | * Check if we are at the end of the buffer. | 1622 | * Check if we are at the end of the buffer. |
1365 | */ | 1623 | */ |
1366 | if (iter->head >= iter->head_page->size) { | 1624 | if (iter->head >= rb_page_size(iter->head_page)) { |
1367 | BUG_ON(iter->head_page == cpu_buffer->tail_page); | 1625 | BUG_ON(iter->head_page == cpu_buffer->commit_page); |
1368 | rb_inc_iter(iter); | 1626 | rb_inc_iter(iter); |
1369 | return; | 1627 | return; |
1370 | } | 1628 | } |
@@ -1377,16 +1635,16 @@ static void rb_advance_iter(struct ring_buffer_iter *iter) | |||
1377 | * This should not be called to advance the header if we are | 1635 | * This should not be called to advance the header if we are |
1378 | * at the tail of the buffer. | 1636 | * at the tail of the buffer. |
1379 | */ | 1637 | */ |
1380 | BUG_ON((iter->head_page == cpu_buffer->tail_page) && | 1638 | BUG_ON((iter->head_page == cpu_buffer->commit_page) && |
1381 | (iter->head + length > cpu_buffer->tail_page->write)); | 1639 | (iter->head + length > rb_commit_index(cpu_buffer))); |
1382 | 1640 | ||
1383 | rb_update_iter_read_stamp(iter, event); | 1641 | rb_update_iter_read_stamp(iter, event); |
1384 | 1642 | ||
1385 | iter->head += length; | 1643 | iter->head += length; |
1386 | 1644 | ||
1387 | /* check for end of page padding */ | 1645 | /* check for end of page padding */ |
1388 | if ((iter->head >= iter->head_page->size) && | 1646 | if ((iter->head >= rb_page_size(iter->head_page)) && |
1389 | (iter->head_page != cpu_buffer->tail_page)) | 1647 | (iter->head_page != cpu_buffer->commit_page)) |
1390 | rb_advance_iter(iter); | 1648 | rb_advance_iter(iter); |
1391 | } | 1649 | } |
1392 | 1650 | ||
@@ -1420,7 +1678,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) | |||
1420 | 1678 | ||
1421 | switch (event->type) { | 1679 | switch (event->type) { |
1422 | case RINGBUF_TYPE_PADDING: | 1680 | case RINGBUF_TYPE_PADDING: |
1423 | WARN_ON(1); | 1681 | RB_WARN_ON(cpu_buffer, 1); |
1424 | rb_advance_reader(cpu_buffer); | 1682 | rb_advance_reader(cpu_buffer); |
1425 | return NULL; | 1683 | return NULL; |
1426 | 1684 | ||
@@ -1622,14 +1880,17 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) | |||
1622 | { | 1880 | { |
1623 | cpu_buffer->head_page | 1881 | cpu_buffer->head_page |
1624 | = list_entry(cpu_buffer->pages.next, struct buffer_page, list); | 1882 | = list_entry(cpu_buffer->pages.next, struct buffer_page, list); |
1625 | cpu_buffer->head_page->size = 0; | 1883 | local_set(&cpu_buffer->head_page->write, 0); |
1626 | cpu_buffer->tail_page = cpu_buffer->head_page; | 1884 | local_set(&cpu_buffer->head_page->commit, 0); |
1627 | cpu_buffer->tail_page->size = 0; | ||
1628 | INIT_LIST_HEAD(&cpu_buffer->reader_page->list); | ||
1629 | cpu_buffer->reader_page->size = 0; | ||
1630 | 1885 | ||
1631 | cpu_buffer->head_page->read = 0; | 1886 | cpu_buffer->head_page->read = 0; |
1632 | cpu_buffer->tail_page->write = 0; | 1887 | |
1888 | cpu_buffer->tail_page = cpu_buffer->head_page; | ||
1889 | cpu_buffer->commit_page = cpu_buffer->head_page; | ||
1890 | |||
1891 | INIT_LIST_HEAD(&cpu_buffer->reader_page->list); | ||
1892 | local_set(&cpu_buffer->reader_page->write, 0); | ||
1893 | local_set(&cpu_buffer->reader_page->commit, 0); | ||
1633 | cpu_buffer->reader_page->read = 0; | 1894 | cpu_buffer->reader_page->read = 0; |
1634 | 1895 | ||
1635 | cpu_buffer->overrun = 0; | 1896 | cpu_buffer->overrun = 0; |