author     Steven Rostedt <rostedt@goodmis.org>        2008-10-04 02:00:59 -0400
committer  Ingo Molnar <mingo@elte.hu>                 2008-10-14 04:39:19 -0400
commit     bf41a158cacba6ca5fc6407a54e7ad8ce1567e2e (patch)
tree       e9424b4927f99cfb3acce3cfab2635ae8f8c8ba1 /kernel/trace
parent     6f807acd27734197b11d42829d3cbb9c0937b572 (diff)
ring-buffer: make reentrant
This patch replaces the local_irq_save/restore with preempt_disable/enable.
This allows for interrupts to enter while recording.

To write to the ring buffer, you must reserve data, and then commit it.
During this time, an interrupt may call a trace function that will also
record into the buffer before the commit is made. The interrupt will reserve
its entry after the first entry, even though the first entry did not finish
yet. The time stamp delta of the interrupt entry will be zero, since in the
view of the trace, the interrupt happened during the first field anyway.

Locking still takes place when the tail/write moves from one page to the
next. The reader always takes the locks.

A new page pointer is added, called the commit. The write/tail will always
point to the end of all entries. The commit field will point to the last
committed entry. Only this commit entry may update the write time stamp.
The reader can only go up to the commit. It cannot go past it.

If a lot of interrupts come in during a commit that fills up the buffer, and
it happens to make it all the way around the buffer back to the commit, then
a warning is printed and new events will be dropped.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
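For reference, the write-side pattern this change makes reentrant looks
roughly like the sketch below: reserve an event, fill in its payload, then
commit. This is a minimal illustration assuming the API as it stands at this
commit (ring_buffer_lock_reserve() and ring_buffer_unlock_commit() still
carry a flags argument here); struct my_entry and trace_my_event() are
hypothetical names used only for this example.

#include <linux/ring_buffer.h>

/* Hypothetical payload, for illustration only. */
struct my_entry {
	unsigned long	ip;
	unsigned long	parent_ip;
};

static void trace_my_event(struct ring_buffer *buffer,
			   unsigned long ip, unsigned long parent_ip)
{
	struct ring_buffer_event *event;
	struct my_entry *entry;
	unsigned long flags;

	/*
	 * Reserve space on this CPU's buffer.  With this patch, an
	 * interrupt arriving after this point may reserve and commit
	 * its own entry behind ours before we commit below.
	 */
	event = ring_buffer_lock_reserve(buffer, sizeof(*entry), &flags);
	if (!event)
		return;

	entry = ring_buffer_event_data(event);
	entry->ip = ip;
	entry->parent_ip = parent_ip;

	/*
	 * Commit.  Only when the oldest outstanding reservation on this
	 * CPU is committed does the commit pointer (and write_stamp)
	 * advance to the tail, making nested entries visible to readers.
	 */
	ring_buffer_unlock_commit(buffer, event, flags);
}

An interrupting writer that lands between the reserve and the commit simply
nests the same pattern; its entry is committed first, and the outermost
commit then advances the commit page up to the tail.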
Diffstat (limited to 'kernel/trace')
-rw-r--r--  kernel/trace/ring_buffer.c | 487
1 file changed, 374 insertions(+), 113 deletions(-)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 09d4f0d879a7..94af1fe56bb4 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -116,8 +116,8 @@ void *ring_buffer_event_data(struct ring_buffer_event *event)
116 */ 116 */
117struct buffer_page { 117struct buffer_page {
118 u64 time_stamp; /* page time stamp */ 118 u64 time_stamp; /* page time stamp */
119 unsigned size; /* size of page data */ 119 local_t write; /* index for next write */
120 unsigned write; /* index for next write */ 120 local_t commit; /* write commited index */
121 unsigned read; /* index for next read */ 121 unsigned read; /* index for next read */
122 struct list_head list; /* list of free pages */ 122 struct list_head list; /* list of free pages */
123 void *page; /* Actual data page */ 123 void *page; /* Actual data page */
@@ -157,6 +157,7 @@ struct ring_buffer_per_cpu {
157 struct list_head pages; 157 struct list_head pages;
158 struct buffer_page *head_page; /* read from head */ 158 struct buffer_page *head_page; /* read from head */
159 struct buffer_page *tail_page; /* write to tail */ 159 struct buffer_page *tail_page; /* write to tail */
160 struct buffer_page *commit_page; /* commited pages */
160 struct buffer_page *reader_page; 161 struct buffer_page *reader_page;
161 unsigned long overrun; 162 unsigned long overrun;
162 unsigned long entries; 163 unsigned long entries;
@@ -185,12 +186,32 @@ struct ring_buffer_iter {
185 u64 read_stamp; 186 u64 read_stamp;
186}; 187};
187 188
188#define RB_WARN_ON(buffer, cond) \ 189#define RB_WARN_ON(buffer, cond) \
189 if (unlikely(cond)) { \ 190 do { \
190 atomic_inc(&buffer->record_disabled); \ 191 if (unlikely(cond)) { \
191 WARN_ON(1); \ 192 atomic_inc(&buffer->record_disabled); \
192 return -1; \ 193 WARN_ON(1); \
193 } 194 } \
195 } while (0)
196
197#define RB_WARN_ON_RET(buffer, cond) \
198 do { \
199 if (unlikely(cond)) { \
200 atomic_inc(&buffer->record_disabled); \
201 WARN_ON(1); \
202 return -1; \
203 } \
204 } while (0)
205
206#define RB_WARN_ON_ONCE(buffer, cond) \
207 do { \
208 static int once; \
209 if (unlikely(cond) && !once) { \
210 once++; \
211 atomic_inc(&buffer->record_disabled); \
212 WARN_ON(1); \
213 } \
214 } while (0)
194 215
195/** 216/**
196 * check_pages - integrity check of buffer pages 217 * check_pages - integrity check of buffer pages
@@ -204,22 +225,19 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
204 struct list_head *head = &cpu_buffer->pages; 225 struct list_head *head = &cpu_buffer->pages;
205 struct buffer_page *page, *tmp; 226 struct buffer_page *page, *tmp;
206 227
207 RB_WARN_ON(cpu_buffer, head->next->prev != head); 228 RB_WARN_ON_RET(cpu_buffer, head->next->prev != head);
208 RB_WARN_ON(cpu_buffer, head->prev->next != head); 229 RB_WARN_ON_RET(cpu_buffer, head->prev->next != head);
209 230
210 list_for_each_entry_safe(page, tmp, head, list) { 231 list_for_each_entry_safe(page, tmp, head, list) {
211 RB_WARN_ON(cpu_buffer, page->list.next->prev != &page->list); 232 RB_WARN_ON_RET(cpu_buffer,
212 RB_WARN_ON(cpu_buffer, page->list.prev->next != &page->list); 233 page->list.next->prev != &page->list);
234 RB_WARN_ON_RET(cpu_buffer,
235 page->list.prev->next != &page->list);
213 } 236 }
214 237
215 return 0; 238 return 0;
216} 239}
217 240
218static unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer)
219{
220 return cpu_buffer->head_page->size;
221}
222
223static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, 241static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
224 unsigned nr_pages) 242 unsigned nr_pages)
225{ 243{
@@ -286,7 +304,6 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
286 page->page = (void *)addr; 304 page->page = (void *)addr;
287 305
288 INIT_LIST_HEAD(&cpu_buffer->reader_page->list); 306 INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
289 cpu_buffer->reader_page->size = 0;
290 307
291 ret = rb_allocate_pages(cpu_buffer, buffer->pages); 308 ret = rb_allocate_pages(cpu_buffer, buffer->pages);
292 if (ret < 0) 309 if (ret < 0)
@@ -294,8 +311,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
294 311
295 cpu_buffer->head_page 312 cpu_buffer->head_page
296 = list_entry(cpu_buffer->pages.next, struct buffer_page, list); 313 = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
297 cpu_buffer->tail_page 314 cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;
298 = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
299 315
300 return cpu_buffer; 316 return cpu_buffer;
301 317
@@ -563,15 +579,6 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
563 return -ENOMEM; 579 return -ENOMEM;
564} 580}
565 581
566static inline int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
567{
568 return cpu_buffer->reader_page->read == cpu_buffer->reader_page->size &&
569 (cpu_buffer->tail_page == cpu_buffer->reader_page ||
570 (cpu_buffer->tail_page == cpu_buffer->head_page &&
571 cpu_buffer->head_page->read ==
572 cpu_buffer->tail_page->write));
573}
574
575static inline int rb_null_event(struct ring_buffer_event *event) 582static inline int rb_null_event(struct ring_buffer_event *event)
576{ 583{
577 return event->type == RINGBUF_TYPE_PADDING; 584 return event->type == RINGBUF_TYPE_PADDING;
@@ -602,6 +609,33 @@ rb_iter_head_event(struct ring_buffer_iter *iter)
602 return __rb_page_index(iter->head_page, iter->head); 609 return __rb_page_index(iter->head_page, iter->head);
603} 610}
604 611
612static inline unsigned rb_page_write(struct buffer_page *bpage)
613{
614 return local_read(&bpage->write);
615}
616
617static inline unsigned rb_page_commit(struct buffer_page *bpage)
618{
619 return local_read(&bpage->commit);
620}
621
622/* Size is determined by what has been commited */
623static inline unsigned rb_page_size(struct buffer_page *bpage)
624{
625 return rb_page_commit(bpage);
626}
627
628static inline unsigned
629rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer)
630{
631 return rb_page_commit(cpu_buffer->commit_page);
632}
633
634static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer)
635{
636 return rb_page_commit(cpu_buffer->head_page);
637}
638
605/* 639/*
606 * When the tail hits the head and the buffer is in overwrite mode, 640 * When the tail hits the head and the buffer is in overwrite mode,
607 * the head jumps to the next page and all content on the previous 641 * the head jumps to the next page and all content on the previous
@@ -637,16 +671,76 @@ static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
637 *page = list_entry(p, struct buffer_page, list); 671 *page = list_entry(p, struct buffer_page, list);
638} 672}
639 673
640static inline void 674static inline unsigned
641rb_add_stamp(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts) 675rb_event_index(struct ring_buffer_event *event)
642{ 676{
643 cpu_buffer->tail_page->time_stamp = *ts; 677 unsigned long addr = (unsigned long)event;
644 cpu_buffer->write_stamp = *ts; 678
679 return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE);
645} 680}
646 681
647static void rb_reset_head_page(struct ring_buffer_per_cpu *cpu_buffer) 682static inline int
683rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
684 struct ring_buffer_event *event)
648{ 685{
649 cpu_buffer->head_page->read = 0; 686 unsigned long addr = (unsigned long)event;
687 unsigned long index;
688
689 index = rb_event_index(event);
690 addr &= PAGE_MASK;
691
692 return cpu_buffer->commit_page->page == (void *)addr &&
693 rb_commit_index(cpu_buffer) == index;
694}
695
696static inline void
697rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer,
698 struct ring_buffer_event *event)
699{
700 unsigned long addr = (unsigned long)event;
701 unsigned long index;
702
703 index = rb_event_index(event);
704 addr &= PAGE_MASK;
705
706 while (cpu_buffer->commit_page->page != (void *)addr) {
707 RB_WARN_ON(cpu_buffer,
708 cpu_buffer->commit_page == cpu_buffer->tail_page);
709 cpu_buffer->commit_page->commit =
710 cpu_buffer->commit_page->write;
711 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
712 cpu_buffer->write_stamp = cpu_buffer->commit_page->time_stamp;
713 }
714
715 /* Now set the commit to the event's index */
716 local_set(&cpu_buffer->commit_page->commit, index);
717}
718
719static inline void
720rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
721{
722 /*
723 * We only race with interrupts and NMIs on this CPU.
724 * If we own the commit event, then we can commit
725 * all others that interrupted us, since the interruptions
726 * are in stack format (they finish before they come
727 * back to us). This allows us to do a simple loop to
728 * assign the commit to the tail.
729 */
730 while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
731 cpu_buffer->commit_page->commit =
732 cpu_buffer->commit_page->write;
733 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
734 cpu_buffer->write_stamp = cpu_buffer->commit_page->time_stamp;
735 /* add barrier to keep gcc from optimizing too much */
736 barrier();
737 }
738 while (rb_commit_index(cpu_buffer) !=
739 rb_page_write(cpu_buffer->commit_page)) {
740 cpu_buffer->commit_page->commit =
741 cpu_buffer->commit_page->write;
742 barrier();
743 }
650} 744}
651 745
652static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer) 746static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
@@ -745,61 +839,120 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
745 unsigned type, unsigned long length, u64 *ts) 839 unsigned type, unsigned long length, u64 *ts)
746{ 840{
747 struct buffer_page *tail_page, *head_page, *reader_page; 841 struct buffer_page *tail_page, *head_page, *reader_page;
748 unsigned long tail; 842 unsigned long tail, write;
749 struct ring_buffer *buffer = cpu_buffer->buffer; 843 struct ring_buffer *buffer = cpu_buffer->buffer;
750 struct ring_buffer_event *event; 844 struct ring_buffer_event *event;
845 unsigned long flags;
751 846
752 tail_page = cpu_buffer->tail_page; 847 tail_page = cpu_buffer->tail_page;
753 tail = cpu_buffer->tail_page->write; 848 write = local_add_return(length, &tail_page->write);
849 tail = write - length;
754 850
755 if (tail + length > BUF_PAGE_SIZE) { 851 /* See if we shot pass the end of this buffer page */
852 if (write > BUF_PAGE_SIZE) {
756 struct buffer_page *next_page = tail_page; 853 struct buffer_page *next_page = tail_page;
757 854
758 spin_lock(&cpu_buffer->lock); 855 spin_lock_irqsave(&cpu_buffer->lock, flags);
856
759 rb_inc_page(cpu_buffer, &next_page); 857 rb_inc_page(cpu_buffer, &next_page);
760 858
761 head_page = cpu_buffer->head_page; 859 head_page = cpu_buffer->head_page;
762 reader_page = cpu_buffer->reader_page; 860 reader_page = cpu_buffer->reader_page;
763 861
764 /* we grabbed the lock before incrementing */ 862 /* we grabbed the lock before incrementing */
765 WARN_ON(next_page == reader_page); 863 RB_WARN_ON(cpu_buffer, next_page == reader_page);
864
865 /*
866 * If for some reason, we had an interrupt storm that made
867 * it all the way around the buffer, bail, and warn
868 * about it.
869 */
870 if (unlikely(next_page == cpu_buffer->commit_page)) {
871 WARN_ON_ONCE(1);
872 goto out_unlock;
873 }
766 874
767 if (next_page == head_page) { 875 if (next_page == head_page) {
768 if (!(buffer->flags & RB_FL_OVERWRITE)) { 876 if (!(buffer->flags & RB_FL_OVERWRITE)) {
769 spin_unlock(&cpu_buffer->lock); 877 /* reset write */
770 return NULL; 878 if (tail <= BUF_PAGE_SIZE)
879 local_set(&tail_page->write, tail);
880 goto out_unlock;
771 } 881 }
772 882
773 /* count overflows */ 883 /* tail_page has not moved yet? */
774 rb_update_overflow(cpu_buffer); 884 if (tail_page == cpu_buffer->tail_page) {
885 /* count overflows */
886 rb_update_overflow(cpu_buffer);
887
888 rb_inc_page(cpu_buffer, &head_page);
889 cpu_buffer->head_page = head_page;
890 cpu_buffer->head_page->read = 0;
891 }
892 }
775 893
776 rb_inc_page(cpu_buffer, &head_page); 894 /*
777 cpu_buffer->head_page = head_page; 895 * If the tail page is still the same as what we think
778 rb_reset_head_page(cpu_buffer); 896 * it is, then it is up to us to update the tail
897 * pointer.
898 */
899 if (tail_page == cpu_buffer->tail_page) {
900 local_set(&next_page->write, 0);
901 local_set(&next_page->commit, 0);
902 cpu_buffer->tail_page = next_page;
903
904 /* reread the time stamp */
905 *ts = ring_buffer_time_stamp(cpu_buffer->cpu);
906 cpu_buffer->tail_page->time_stamp = *ts;
779 } 907 }
780 908
781 if (tail != BUF_PAGE_SIZE) { 909 /*
910 * The actual tail page has moved forward.
911 */
912 if (tail < BUF_PAGE_SIZE) {
913 /* Mark the rest of the page with padding */
782 event = __rb_page_index(tail_page, tail); 914 event = __rb_page_index(tail_page, tail);
783 /* page padding */
784 event->type = RINGBUF_TYPE_PADDING; 915 event->type = RINGBUF_TYPE_PADDING;
785 } 916 }
786 917
787 tail_page->size = tail; 918 if (tail <= BUF_PAGE_SIZE)
788 tail_page = next_page; 919 /* Set the write back to the previous setting */
789 tail_page->size = 0; 920 local_set(&tail_page->write, tail);
790 tail = 0; 921
791 cpu_buffer->tail_page = tail_page; 922 /*
792 cpu_buffer->tail_page->write = tail; 923 * If this was a commit entry that failed,
793 rb_add_stamp(cpu_buffer, ts); 924 * increment that too
794 spin_unlock(&cpu_buffer->lock); 925 */
926 if (tail_page == cpu_buffer->commit_page &&
927 tail == rb_commit_index(cpu_buffer)) {
928 rb_set_commit_to_write(cpu_buffer);
929 }
930
931 spin_unlock_irqrestore(&cpu_buffer->lock, flags);
932
933 /* fail and let the caller try again */
934 return ERR_PTR(-EAGAIN);
795 } 935 }
796 936
797 BUG_ON(tail + length > BUF_PAGE_SIZE); 937 /* We reserved something on the buffer */
938
939 BUG_ON(write > BUF_PAGE_SIZE);
798 940
799 event = __rb_page_index(tail_page, tail); 941 event = __rb_page_index(tail_page, tail);
800 rb_update_event(event, type, length); 942 rb_update_event(event, type, length);
801 943
944 /*
945 * If this is a commit and the tail is zero, then update
946 * this page's time stamp.
947 */
948 if (!tail && rb_is_commit(cpu_buffer, event))
949 cpu_buffer->commit_page->time_stamp = *ts;
950
802 return event; 951 return event;
952
953 out_unlock:
954 spin_unlock_irqrestore(&cpu_buffer->lock, flags);
955 return NULL;
803} 956}
804 957
805static int 958static int
@@ -808,6 +961,7 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
808{ 961{
809 struct ring_buffer_event *event; 962 struct ring_buffer_event *event;
810 static int once; 963 static int once;
964 int ret;
811 965
812 if (unlikely(*delta > (1ULL << 59) && !once++)) { 966 if (unlikely(*delta > (1ULL << 59) && !once++)) {
813 printk(KERN_WARNING "Delta way too big! %llu" 967 printk(KERN_WARNING "Delta way too big! %llu"
@@ -825,21 +979,38 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
825 RB_LEN_TIME_EXTEND, 979 RB_LEN_TIME_EXTEND,
826 ts); 980 ts);
827 if (!event) 981 if (!event)
828 return -1; 982 return -EBUSY;
829 983
830 /* check to see if we went to the next page */ 984 if (PTR_ERR(event) == -EAGAIN)
831 if (cpu_buffer->tail_page->write) { 985 return -EAGAIN;
832 /* Still on same page, update timestamp */ 986
833 event->time_delta = *delta & TS_MASK; 987 /* Only a commited time event can update the write stamp */
834 event->array[0] = *delta >> TS_SHIFT; 988 if (rb_is_commit(cpu_buffer, event)) {
835 /* commit the time event */ 989 /*
836 cpu_buffer->tail_page->write += 990 * If this is the first on the page, then we need to
837 rb_event_length(event); 991 * update the page itself, and just put in a zero.
992 */
993 if (rb_event_index(event)) {
994 event->time_delta = *delta & TS_MASK;
995 event->array[0] = *delta >> TS_SHIFT;
996 } else {
997 cpu_buffer->commit_page->time_stamp = *ts;
998 event->time_delta = 0;
999 event->array[0] = 0;
1000 }
838 cpu_buffer->write_stamp = *ts; 1001 cpu_buffer->write_stamp = *ts;
839 *delta = 0; 1002 /* let the caller know this was the commit */
1003 ret = 1;
1004 } else {
1005 /* Darn, this is just wasted space */
1006 event->time_delta = 0;
1007 event->array[0] = 0;
1008 ret = 0;
840 } 1009 }
841 1010
842 return 0; 1011 *delta = 0;
1012
1013 return ret;
843} 1014}
844 1015
845static struct ring_buffer_event * 1016static struct ring_buffer_event *
@@ -848,32 +1019,69 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
848{ 1019{
849 struct ring_buffer_event *event; 1020 struct ring_buffer_event *event;
850 u64 ts, delta; 1021 u64 ts, delta;
1022 int commit = 0;
851 1023
1024 again:
852 ts = ring_buffer_time_stamp(cpu_buffer->cpu); 1025 ts = ring_buffer_time_stamp(cpu_buffer->cpu);
853 1026
854 if (cpu_buffer->tail_page->write) { 1027 /*
1028 * Only the first commit can update the timestamp.
1029 * Yes there is a race here. If an interrupt comes in
1030 * just after the conditional and it traces too, then it
1031 * will also check the deltas. More than one timestamp may
1032 * also be made. But only the entry that did the actual
1033 * commit will be something other than zero.
1034 */
1035 if (cpu_buffer->tail_page == cpu_buffer->commit_page &&
1036 rb_page_write(cpu_buffer->tail_page) ==
1037 rb_commit_index(cpu_buffer)) {
1038
855 delta = ts - cpu_buffer->write_stamp; 1039 delta = ts - cpu_buffer->write_stamp;
856 1040
1041 /* make sure this delta is calculated here */
1042 barrier();
1043
1044 /* Did the write stamp get updated already? */
1045 if (unlikely(ts < cpu_buffer->write_stamp))
1046 goto again;
1047
857 if (test_time_stamp(delta)) { 1048 if (test_time_stamp(delta)) {
858 int ret;
859 1049
860 ret = rb_add_time_stamp(cpu_buffer, &ts, &delta); 1050 commit = rb_add_time_stamp(cpu_buffer, &ts, &delta);
861 if (ret < 0) 1051
1052 if (commit == -EBUSY)
862 return NULL; 1053 return NULL;
1054
1055 if (commit == -EAGAIN)
1056 goto again;
1057
1058 RB_WARN_ON(cpu_buffer, commit < 0);
863 } 1059 }
864 } else { 1060 } else
865 spin_lock(&cpu_buffer->lock); 1061 /* Non commits have zero deltas */
866 rb_add_stamp(cpu_buffer, &ts);
867 spin_unlock(&cpu_buffer->lock);
868 delta = 0; 1062 delta = 0;
869 }
870 1063
871 event = __rb_reserve_next(cpu_buffer, type, length, &ts); 1064 event = __rb_reserve_next(cpu_buffer, type, length, &ts);
872 if (!event) 1065 if (PTR_ERR(event) == -EAGAIN)
1066 goto again;
1067
1068 if (!event) {
1069 if (unlikely(commit))
1070 /*
1071 * Ouch! We needed a timestamp and it was commited. But
1072 * we didn't get our event reserved.
1073 */
1074 rb_set_commit_to_write(cpu_buffer);
873 return NULL; 1075 return NULL;
1076 }
874 1077
875 /* If the reserve went to the next page, our delta is zero */ 1078 /*
876 if (!cpu_buffer->tail_page->write) 1079 * If the timestamp was commited, make the commit our entry
1080 * now so that we will update it when needed.
1081 */
1082 if (commit)
1083 rb_set_commit_event(cpu_buffer, event);
1084 else if (!rb_is_commit(cpu_buffer, event))
877 delta = 0; 1085 delta = 0;
878 1086
879 event->time_delta = delta; 1087 event->time_delta = delta;
@@ -881,6 +1089,8 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
881 return event; 1089 return event;
882} 1090}
883 1091
1092static DEFINE_PER_CPU(int, rb_need_resched);
1093
884/** 1094/**
885 * ring_buffer_lock_reserve - reserve a part of the buffer 1095 * ring_buffer_lock_reserve - reserve a part of the buffer
886 * @buffer: the ring buffer to reserve from 1096 * @buffer: the ring buffer to reserve from
@@ -904,12 +1114,15 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer,
904{ 1114{
905 struct ring_buffer_per_cpu *cpu_buffer; 1115 struct ring_buffer_per_cpu *cpu_buffer;
906 struct ring_buffer_event *event; 1116 struct ring_buffer_event *event;
907 int cpu; 1117 int cpu, resched;
908 1118
909 if (atomic_read(&buffer->record_disabled)) 1119 if (atomic_read(&buffer->record_disabled))
910 return NULL; 1120 return NULL;
911 1121
912 local_irq_save(*flags); 1122 /* If we are tracing schedule, we don't want to recurse */
1123 resched = need_resched();
1124 preempt_disable_notrace();
1125
913 cpu = raw_smp_processor_id(); 1126 cpu = raw_smp_processor_id();
914 1127
915 if (!cpu_isset(cpu, buffer->cpumask)) 1128 if (!cpu_isset(cpu, buffer->cpumask))
@@ -922,26 +1135,42 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer,
922 1135
923 length = rb_calculate_event_length(length); 1136 length = rb_calculate_event_length(length);
924 if (length > BUF_PAGE_SIZE) 1137 if (length > BUF_PAGE_SIZE)
925 return NULL; 1138 goto out;
926 1139
927 event = rb_reserve_next_event(cpu_buffer, RINGBUF_TYPE_DATA, length); 1140 event = rb_reserve_next_event(cpu_buffer, RINGBUF_TYPE_DATA, length);
928 if (!event) 1141 if (!event)
929 goto out; 1142 goto out;
930 1143
1144 /*
1145 * Need to store resched state on this cpu.
1146 * Only the first needs to.
1147 */
1148
1149 if (preempt_count() == 1)
1150 per_cpu(rb_need_resched, cpu) = resched;
1151
931 return event; 1152 return event;
932 1153
933 out: 1154 out:
934 local_irq_restore(*flags); 1155 if (resched)
1156 preempt_enable_notrace();
1157 else
1158 preempt_enable_notrace();
935 return NULL; 1159 return NULL;
936} 1160}
937 1161
938static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, 1162static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
939 struct ring_buffer_event *event) 1163 struct ring_buffer_event *event)
940{ 1164{
941 cpu_buffer->tail_page->write += rb_event_length(event);
942 cpu_buffer->tail_page->size = cpu_buffer->tail_page->write;
943 cpu_buffer->write_stamp += event->time_delta;
944 cpu_buffer->entries++; 1165 cpu_buffer->entries++;
1166
1167 /* Only process further if we own the commit */
1168 if (!rb_is_commit(cpu_buffer, event))
1169 return;
1170
1171 cpu_buffer->write_stamp += event->time_delta;
1172
1173 rb_set_commit_to_write(cpu_buffer);
945} 1174}
946 1175
947/** 1176/**
@@ -965,7 +1194,16 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
965 1194
966 rb_commit(cpu_buffer, event); 1195 rb_commit(cpu_buffer, event);
967 1196
968 local_irq_restore(flags); 1197 /*
1198 * Only the last preempt count needs to restore preemption.
1199 */
1200 if (preempt_count() == 1) {
1201 if (per_cpu(rb_need_resched, cpu))
1202 preempt_enable_no_resched_notrace();
1203 else
1204 preempt_enable_notrace();
1205 } else
1206 preempt_enable_no_resched_notrace();
969 1207
970 return 0; 1208 return 0;
971} 1209}
@@ -989,15 +1227,17 @@ int ring_buffer_write(struct ring_buffer *buffer,
989{ 1227{
990 struct ring_buffer_per_cpu *cpu_buffer; 1228 struct ring_buffer_per_cpu *cpu_buffer;
991 struct ring_buffer_event *event; 1229 struct ring_buffer_event *event;
992 unsigned long event_length, flags; 1230 unsigned long event_length;
993 void *body; 1231 void *body;
994 int ret = -EBUSY; 1232 int ret = -EBUSY;
995 int cpu; 1233 int cpu, resched;
996 1234
997 if (atomic_read(&buffer->record_disabled)) 1235 if (atomic_read(&buffer->record_disabled))
998 return -EBUSY; 1236 return -EBUSY;
999 1237
1000 local_irq_save(flags); 1238 resched = need_resched();
1239 preempt_disable_notrace();
1240
1001 cpu = raw_smp_processor_id(); 1241 cpu = raw_smp_processor_id();
1002 1242
1003 if (!cpu_isset(cpu, buffer->cpumask)) 1243 if (!cpu_isset(cpu, buffer->cpumask))
@@ -1022,11 +1262,26 @@ int ring_buffer_write(struct ring_buffer *buffer,
1022 1262
1023 ret = 0; 1263 ret = 0;
1024 out: 1264 out:
1025 local_irq_restore(flags); 1265 if (resched)
1266 preempt_enable_no_resched_notrace();
1267 else
1268 preempt_enable_notrace();
1026 1269
1027 return ret; 1270 return ret;
1028} 1271}
1029 1272
1273static inline int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
1274{
1275 struct buffer_page *reader = cpu_buffer->reader_page;
1276 struct buffer_page *head = cpu_buffer->head_page;
1277 struct buffer_page *commit = cpu_buffer->commit_page;
1278
1279 return reader->read == rb_page_commit(reader) &&
1280 (commit == reader ||
1281 (commit == head &&
1282 head->read == rb_page_commit(commit)));
1283}
1284
1030/** 1285/**
1031 * ring_buffer_record_disable - stop all writes into the buffer 1286 * ring_buffer_record_disable - stop all writes into the buffer
1032 * @buffer: The ring buffer to stop writes to. 1287 * @buffer: The ring buffer to stop writes to.
@@ -1204,8 +1459,8 @@ int ring_buffer_iter_empty(struct ring_buffer_iter *iter)
1204 1459
1205 cpu_buffer = iter->cpu_buffer; 1460 cpu_buffer = iter->cpu_buffer;
1206 1461
1207 return iter->head_page == cpu_buffer->tail_page && 1462 return iter->head_page == cpu_buffer->commit_page &&
1208 iter->head == cpu_buffer->tail_page->write; 1463 iter->head == rb_commit_index(cpu_buffer);
1209} 1464}
1210 1465
1211static void 1466static void
@@ -1282,15 +1537,16 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
1282 reader = cpu_buffer->reader_page; 1537 reader = cpu_buffer->reader_page;
1283 1538
1284 /* If there's more to read, return this page */ 1539 /* If there's more to read, return this page */
1285 if (cpu_buffer->reader_page->read < reader->size) 1540 if (cpu_buffer->reader_page->read < rb_page_size(reader))
1286 goto out; 1541 goto out;
1287 1542
1288 /* Never should we have an index greater than the size */ 1543 /* Never should we have an index greater than the size */
1289 WARN_ON(cpu_buffer->reader_page->read > reader->size); 1544 RB_WARN_ON(cpu_buffer,
1545 cpu_buffer->reader_page->read > rb_page_size(reader));
1290 1546
1291 /* check if we caught up to the tail */ 1547 /* check if we caught up to the tail */
1292 reader = NULL; 1548 reader = NULL;
1293 if (cpu_buffer->tail_page == cpu_buffer->reader_page) 1549 if (cpu_buffer->commit_page == cpu_buffer->reader_page)
1294 goto out; 1550 goto out;
1295 1551
1296 /* 1552 /*
@@ -1301,7 +1557,9 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
1301 reader = cpu_buffer->head_page; 1557 reader = cpu_buffer->head_page;
1302 cpu_buffer->reader_page->list.next = reader->list.next; 1558 cpu_buffer->reader_page->list.next = reader->list.next;
1303 cpu_buffer->reader_page->list.prev = reader->list.prev; 1559 cpu_buffer->reader_page->list.prev = reader->list.prev;
1304 cpu_buffer->reader_page->size = 0; 1560
1561 local_set(&cpu_buffer->reader_page->write, 0);
1562 local_set(&cpu_buffer->reader_page->commit, 0);
1305 1563
1306 /* Make the reader page now replace the head */ 1564 /* Make the reader page now replace the head */
1307 reader->list.prev->next = &cpu_buffer->reader_page->list; 1565 reader->list.prev->next = &cpu_buffer->reader_page->list;
@@ -1313,7 +1571,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
1313 */ 1571 */
1314 cpu_buffer->head_page = cpu_buffer->reader_page; 1572 cpu_buffer->head_page = cpu_buffer->reader_page;
1315 1573
1316 if (cpu_buffer->tail_page != reader) 1574 if (cpu_buffer->commit_page != reader)
1317 rb_inc_page(cpu_buffer, &cpu_buffer->head_page); 1575 rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
1318 1576
1319 /* Finally update the reader page to the new head */ 1577 /* Finally update the reader page to the new head */
@@ -1363,8 +1621,8 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
1363 /* 1621 /*
1364 * Check if we are at the end of the buffer. 1622 * Check if we are at the end of the buffer.
1365 */ 1623 */
1366 if (iter->head >= iter->head_page->size) { 1624 if (iter->head >= rb_page_size(iter->head_page)) {
1367 BUG_ON(iter->head_page == cpu_buffer->tail_page); 1625 BUG_ON(iter->head_page == cpu_buffer->commit_page);
1368 rb_inc_iter(iter); 1626 rb_inc_iter(iter);
1369 return; 1627 return;
1370 } 1628 }
@@ -1377,16 +1635,16 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
1377 * This should not be called to advance the header if we are 1635 * This should not be called to advance the header if we are
1378 * at the tail of the buffer. 1636 * at the tail of the buffer.
1379 */ 1637 */
1380 BUG_ON((iter->head_page == cpu_buffer->tail_page) && 1638 BUG_ON((iter->head_page == cpu_buffer->commit_page) &&
1381 (iter->head + length > cpu_buffer->tail_page->write)); 1639 (iter->head + length > rb_commit_index(cpu_buffer)));
1382 1640
1383 rb_update_iter_read_stamp(iter, event); 1641 rb_update_iter_read_stamp(iter, event);
1384 1642
1385 iter->head += length; 1643 iter->head += length;
1386 1644
1387 /* check for end of page padding */ 1645 /* check for end of page padding */
1388 if ((iter->head >= iter->head_page->size) && 1646 if ((iter->head >= rb_page_size(iter->head_page)) &&
1389 (iter->head_page != cpu_buffer->tail_page)) 1647 (iter->head_page != cpu_buffer->commit_page))
1390 rb_advance_iter(iter); 1648 rb_advance_iter(iter);
1391} 1649}
1392 1650
@@ -1420,7 +1678,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1420 1678
1421 switch (event->type) { 1679 switch (event->type) {
1422 case RINGBUF_TYPE_PADDING: 1680 case RINGBUF_TYPE_PADDING:
1423 WARN_ON(1); 1681 RB_WARN_ON(cpu_buffer, 1);
1424 rb_advance_reader(cpu_buffer); 1682 rb_advance_reader(cpu_buffer);
1425 return NULL; 1683 return NULL;
1426 1684
@@ -1622,14 +1880,17 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
1622{ 1880{
1623 cpu_buffer->head_page 1881 cpu_buffer->head_page
1624 = list_entry(cpu_buffer->pages.next, struct buffer_page, list); 1882 = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
1625 cpu_buffer->head_page->size = 0; 1883 local_set(&cpu_buffer->head_page->write, 0);
1626 cpu_buffer->tail_page = cpu_buffer->head_page; 1884 local_set(&cpu_buffer->head_page->commit, 0);
1627 cpu_buffer->tail_page->size = 0;
1628 INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
1629 cpu_buffer->reader_page->size = 0;
1630 1885
1631 cpu_buffer->head_page->read = 0; 1886 cpu_buffer->head_page->read = 0;
1632 cpu_buffer->tail_page->write = 0; 1887
1888 cpu_buffer->tail_page = cpu_buffer->head_page;
1889 cpu_buffer->commit_page = cpu_buffer->head_page;
1890
1891 INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
1892 local_set(&cpu_buffer->reader_page->write, 0);
1893 local_set(&cpu_buffer->reader_page->commit, 0);
1633 cpu_buffer->reader_page->read = 0; 1894 cpu_buffer->reader_page->read = 0;
1634 1895
1635 cpu_buffer->overrun = 0; 1896 cpu_buffer->overrun = 0;