author    Ingo Molnar <mingo@elte.hu>    2009-09-07 02:19:51 -0400
committer Ingo Molnar <mingo@elte.hu>    2009-09-07 02:19:51 -0400
commit    a1922ed661ab2c1637d0b10cde933bd9cd33d965 (patch)
tree      0f1777542b385ebefd30b3586d830fd8ed6fda5b /kernel/trace/ring_buffer.c
parent    75e33751ca8bbb72dd6f1a74d2810ddc8cbe4bdf (diff)
parent    d28daf923ac5e4a0d7cecebae56f3e339189366b (diff)

Merge branch 'tracing/core' into tracing/hw-breakpoints

Conflicts:
	arch/Kconfig
	kernel/trace/trace.h

Merge reason: resolve the conflicts, plus adopt to the new ring-buffer APIs.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

Diffstat (limited to 'kernel/trace/ring_buffer.c')
 -rw-r--r--  kernel/trace/ring_buffer.c | 1437
 1 file changed, 1090 insertions(+), 347 deletions(-)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index dc4dc70171ce..454e74e718cf 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -206,6 +206,7 @@ EXPORT_SYMBOL_GPL(tracing_is_on);
206#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) 206#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
207#define RB_ALIGNMENT 4U 207#define RB_ALIGNMENT 4U
208#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) 208#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
209#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */
209 210
210/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ 211/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
211#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX 212#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
@@ -217,17 +218,12 @@ enum {
217 218
218static inline int rb_null_event(struct ring_buffer_event *event) 219static inline int rb_null_event(struct ring_buffer_event *event)
219{ 220{
220 return event->type_len == RINGBUF_TYPE_PADDING 221 return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
221 && event->time_delta == 0;
222}
223
224static inline int rb_discarded_event(struct ring_buffer_event *event)
225{
226 return event->type_len == RINGBUF_TYPE_PADDING && event->time_delta;
227} 222}
228 223
229static void rb_event_set_padding(struct ring_buffer_event *event) 224static void rb_event_set_padding(struct ring_buffer_event *event)
230{ 225{
226 /* padding has a NULL time_delta */
231 event->type_len = RINGBUF_TYPE_PADDING; 227 event->type_len = RINGBUF_TYPE_PADDING;
232 event->time_delta = 0; 228 event->time_delta = 0;
233} 229}
@@ -321,6 +317,14 @@ struct buffer_data_page {
321 unsigned char data[]; /* data of buffer page */ 317 unsigned char data[]; /* data of buffer page */
322}; 318};
323 319
320/*
321 * Note, the buffer_page list must be first. The buffer pages
322 * are allocated in cache lines, which means that each buffer
323 * page will be at the beginning of a cache line, and thus
324 * the least significant bits will be zero. We use this to
325 * add flags in the list struct pointers, to make the ring buffer
326 * lockless.
327 */
324struct buffer_page { 328struct buffer_page {
325 struct list_head list; /* list of buffer pages */ 329 struct list_head list; /* list of buffer pages */
326 local_t write; /* index for next write */ 330 local_t write; /* index for next write */
@@ -329,6 +333,21 @@ struct buffer_page {
329 struct buffer_data_page *page; /* Actual data page */ 333 struct buffer_data_page *page; /* Actual data page */
330}; 334};
331 335
336/*
337 * The buffer page counters, write and entries, must be reset
338 * atomically when crossing page boundaries. To synchronize this
339 * update, two counters are inserted into the number. One is
340 * the actual counter for the write position or count on the page.
341 *
342 * The other is a counter of updaters. Before an update happens
343 * the update partition of the counter is incremented. This will
344 * allow the updater to update the counter atomically.
345 *
346 * The counter is 20 bits, and the state data is 12.
347 */
348#define RB_WRITE_MASK 0xfffff
349#define RB_WRITE_INTCNT (1 << 20)
350
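To make the split described in the comment above concrete, here is a minimal standalone sketch (not part of the patch itself): the low 20 bits of the write field carry the index, the bits at and above RB_WRITE_INTCNT count in-flight updaters, and masking recovers either half. A plain unsigned long stands in for local_t, and the constants simply mirror RB_WRITE_MASK and RB_WRITE_INTCNT.

#include <assert.h>
#include <stdio.h>

#define RB_WRITE_MASK   0xfffffUL    /* low 20 bits: write index        */
#define RB_WRITE_INTCNT (1UL << 20)  /* each updater adds one unit here */

int main(void)
{
	unsigned long write = 0x234;     /* index 0x234, no updaters in flight */

	/* announce an update, as local_add_return(RB_WRITE_INTCNT, ...) does */
	write += RB_WRITE_INTCNT;

	printf("index    = 0x%lx\n", write & RB_WRITE_MASK);  /* still 0x234 */
	printf("updaters = %lu\n",  write >> 20);             /* now 1       */

	/* zero the index for the new tail page but keep the updater count,
	 * mirroring "old_write & ~RB_WRITE_MASK" in rb_tail_page_update() */
	unsigned long val = write & ~RB_WRITE_MASK;
	assert((val & RB_WRITE_MASK) == 0 && (val >> 20) == 1);
	return 0;
}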
332static void rb_init_page(struct buffer_data_page *bpage) 351static void rb_init_page(struct buffer_data_page *bpage)
333{ 352{
334 local_set(&bpage->commit, 0); 353 local_set(&bpage->commit, 0);
@@ -402,19 +421,20 @@ int ring_buffer_print_page_header(struct trace_seq *s)
402struct ring_buffer_per_cpu { 421struct ring_buffer_per_cpu {
403 int cpu; 422 int cpu;
404 struct ring_buffer *buffer; 423 struct ring_buffer *buffer;
405 spinlock_t reader_lock; /* serialize readers */ 424 spinlock_t reader_lock; /* serialize readers */
406 raw_spinlock_t lock; 425 raw_spinlock_t lock;
407 struct lock_class_key lock_key; 426 struct lock_class_key lock_key;
408 struct list_head pages; 427 struct list_head *pages;
409 struct buffer_page *head_page; /* read from head */ 428 struct buffer_page *head_page; /* read from head */
410 struct buffer_page *tail_page; /* write to tail */ 429 struct buffer_page *tail_page; /* write to tail */
411 struct buffer_page *commit_page; /* committed pages */ 430 struct buffer_page *commit_page; /* committed pages */
412 struct buffer_page *reader_page; 431 struct buffer_page *reader_page;
413 unsigned long nmi_dropped; 432 local_t commit_overrun;
414 unsigned long commit_overrun; 433 local_t overrun;
415 unsigned long overrun;
416 unsigned long read;
417 local_t entries; 434 local_t entries;
435 local_t committing;
436 local_t commits;
437 unsigned long read;
418 u64 write_stamp; 438 u64 write_stamp;
419 u64 read_stamp; 439 u64 read_stamp;
420 atomic_t record_disabled; 440 atomic_t record_disabled;
@@ -447,14 +467,19 @@ struct ring_buffer_iter {
447}; 467};
448 468
449/* buffer may be either ring_buffer or ring_buffer_per_cpu */ 469/* buffer may be either ring_buffer or ring_buffer_per_cpu */
450#define RB_WARN_ON(buffer, cond) \ 470#define RB_WARN_ON(b, cond) \
451 ({ \ 471 ({ \
452 int _____ret = unlikely(cond); \ 472 int _____ret = unlikely(cond); \
453 if (_____ret) { \ 473 if (_____ret) { \
454 atomic_inc(&buffer->record_disabled); \ 474 if (__same_type(*(b), struct ring_buffer_per_cpu)) { \
455 WARN_ON(1); \ 475 struct ring_buffer_per_cpu *__b = \
456 } \ 476 (void *)b; \
457 _____ret; \ 477 atomic_inc(&__b->buffer->record_disabled); \
478 } else \
479 atomic_inc(&b->record_disabled); \
480 WARN_ON(1); \
481 } \
482 _____ret; \
458 }) 483 })
459 484
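The reworked RB_WARN_ON() above can be handed either a struct ring_buffer * or a struct ring_buffer_per_cpu *. In the kernel, __same_type() is a thin wrapper around the GCC builtin __builtin_types_compatible_p(); the following standalone sketch (simplified structures, hypothetical DISABLE_RECORDING macro, not the real API) shows how that builtin lets one macro branch on the argument's type while both branches still compile for either type.

#include <stdio.h>

struct ring_buffer         { int record_disabled; };
struct ring_buffer_per_cpu { struct ring_buffer *buffer; };

/* simplified stand-in for the kernel's __same_type() */
#define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b))

#define DISABLE_RECORDING(b)						\
	do {								\
		if (__same_type(*(b), struct ring_buffer_per_cpu)) {	\
			struct ring_buffer_per_cpu *__b = (void *)(b);	\
			__b->buffer->record_disabled++;			\
		} else {						\
			struct ring_buffer *__rb = (void *)(b);		\
			__rb->record_disabled++;			\
		}							\
	} while (0)

int main(void)
{
	struct ring_buffer rb = { 0 };
	struct ring_buffer_per_cpu cpu_rb = { .buffer = &rb };

	DISABLE_RECORDING(&rb);       /* takes the else branch  */
	DISABLE_RECORDING(&cpu_rb);   /* takes the first branch */
	printf("record_disabled = %d\n", rb.record_disabled);   /* 2 */
	return 0;
}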
460/* Up this if you want to test the TIME_EXTENTS and normalization */ 485/* Up this if you want to test the TIME_EXTENTS and normalization */
@@ -486,6 +511,390 @@ void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer,
486} 511}
487EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); 512EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
488 513
514/*
515 * Making the ring buffer lockless makes things tricky.
 516 * Writes only happen on the CPU that they are on,
517 * and they only need to worry about interrupts. Reads can
518 * happen on any CPU.
519 *
520 * The reader page is always off the ring buffer, but when the
521 * reader finishes with a page, it needs to swap its page with
522 * a new one from the buffer. The reader needs to take from
523 * the head (writes go to the tail). But if a writer is in overwrite
524 * mode and wraps, it must push the head page forward.
525 *
526 * Here lies the problem.
527 *
528 * The reader must be careful to replace only the head page, and
529 * not another one. As described at the top of the file in the
530 * ASCII art, the reader sets its old page to point to the next
531 * page after head. It then sets the page after head to point to
532 * the old reader page. But if the writer moves the head page
533 * during this operation, the reader could end up with the tail.
534 *
535 * We use cmpxchg to help prevent this race. We also do something
536 * special with the page before head. We set the LSB to 1.
537 *
538 * When the writer must push the page forward, it will clear the
539 * bit that points to the head page, move the head, and then set
540 * the bit that points to the new head page.
541 *
542 * We also don't want an interrupt coming in and moving the head
543 * page on another writer. Thus we use the second LSB to catch
544 * that too. Thus:
545 *
546 * head->list->prev->next bit 1 bit 0
547 * ------- -------
548 * Normal page 0 0
549 * Points to head page 0 1
550 * New head page 1 0
551 *
552 * Note we can not trust the prev pointer of the head page, because:
553 *
554 * +----+ +-----+ +-----+
555 * | |------>| T |---X--->| N |
556 * | |<------| | | |
557 * +----+ +-----+ +-----+
558 * ^ ^ |
559 * | +-----+ | |
560 * +----------| R |----------+ |
561 * | |<-----------+
562 * +-----+
563 *
564 * Key: ---X--> HEAD flag set in pointer
565 * T Tail page
566 * R Reader page
567 * N Next page
568 *
569 * (see __rb_reserve_next() to see where this happens)
570 *
571 * What the above shows is that the reader just swapped out
572 * the reader page with a page in the buffer, but before it
573 * could make the new header point back to the new page added
574 * it was preempted by a writer. The writer moved forward onto
575 * the new page added by the reader and is about to move forward
576 * again.
577 *
578 * You can see, it is legitimate for the previous pointer of
579 * the head (or any page) not to point back to itself. But only
 580 * temporarily.
581 */
582
583#define RB_PAGE_NORMAL 0UL
584#define RB_PAGE_HEAD 1UL
585#define RB_PAGE_UPDATE 2UL
586
587
588#define RB_FLAG_MASK 3UL
589
590/* PAGE_MOVED is not part of the mask */
591#define RB_PAGE_MOVED 4UL
592
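Because buffer pages are allocated cache-line aligned (see the struct buffer_page comment earlier in this patch), the two low bits of every page pointer are free, and the table above defines what they mean. A compact userspace model of the scheme, assuming a 64-byte cache line and using a GCC __sync compare-and-swap in place of the kernel's cmpxchg(); it mirrors what rb_list_head() and rb_head_page_set() below do, and is only an illustration, not part of the patch:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define RB_PAGE_NORMAL 0UL
#define RB_PAGE_HEAD   1UL
#define RB_PAGE_UPDATE 2UL
#define RB_FLAG_MASK   3UL

struct page { struct page *next; };   /* stand-in for struct buffer_page */

static struct page *strip(struct page *p)   /* rb_list_head() analogue */
{
	return (struct page *)((uintptr_t)p & ~RB_FLAG_MASK);
}

static unsigned long flags(struct page *p)
{
	return (uintptr_t)p & RB_FLAG_MASK;
}

int main(void)
{
	/* cache-line-aligned pages: the two low pointer bits are always zero */
	struct page *head = aligned_alloc(64, 64);
	struct page *prev = aligned_alloc(64, 64);
	assert(head && prev && ((uintptr_t)head & RB_FLAG_MASK) == 0);

	/* prev->next points to head and carries the HEAD flag */
	prev->next = (struct page *)((uintptr_t)head | RB_PAGE_HEAD);

	assert(strip(prev->next) == head);
	assert(flags(prev->next) == RB_PAGE_HEAD);

	/* a writer claiming the head flips HEAD -> UPDATE atomically,
	 * the way rb_head_page_set_update() does with cmpxchg() */
	uintptr_t expect = (uintptr_t)head | RB_PAGE_HEAD;
	uintptr_t want   = (uintptr_t)head | RB_PAGE_UPDATE;
	uintptr_t got = __sync_val_compare_and_swap((uintptr_t *)&prev->next,
						    expect, want);
	assert(got == expect && flags(prev->next) == RB_PAGE_UPDATE);

	printf("flag transition ok\n");
	free(head);
	free(prev);
	return 0;
}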
593/*
594 * rb_list_head - remove any bit
595 */
596static struct list_head *rb_list_head(struct list_head *list)
597{
598 unsigned long val = (unsigned long)list;
599
600 return (struct list_head *)(val & ~RB_FLAG_MASK);
601}
602
603/*
 604 * rb_is_head_page - test if the given page is the head page
605 *
606 * Because the reader may move the head_page pointer, we can
607 * not trust what the head page is (it may be pointing to
608 * the reader page). But if the next page is a header page,
609 * its flags will be non zero.
610 */
611static int inline
612rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer,
613 struct buffer_page *page, struct list_head *list)
614{
615 unsigned long val;
616
617 val = (unsigned long)list->next;
618
619 if ((val & ~RB_FLAG_MASK) != (unsigned long)&page->list)
620 return RB_PAGE_MOVED;
621
622 return val & RB_FLAG_MASK;
623}
624
625/*
626 * rb_is_reader_page
627 *
 628 * The unique thing about the reader page is that, if the
629 * writer is ever on it, the previous pointer never points
630 * back to the reader page.
631 */
632static int rb_is_reader_page(struct buffer_page *page)
633{
634 struct list_head *list = page->list.prev;
635
636 return rb_list_head(list->next) != &page->list;
637}
638
639/*
640 * rb_set_list_to_head - set a list_head to be pointing to head.
641 */
642static void rb_set_list_to_head(struct ring_buffer_per_cpu *cpu_buffer,
643 struct list_head *list)
644{
645 unsigned long *ptr;
646
647 ptr = (unsigned long *)&list->next;
648 *ptr |= RB_PAGE_HEAD;
649 *ptr &= ~RB_PAGE_UPDATE;
650}
651
652/*
653 * rb_head_page_activate - sets up head page
654 */
655static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer)
656{
657 struct buffer_page *head;
658
659 head = cpu_buffer->head_page;
660 if (!head)
661 return;
662
663 /*
664 * Set the previous list pointer to have the HEAD flag.
665 */
666 rb_set_list_to_head(cpu_buffer, head->list.prev);
667}
668
669static void rb_list_head_clear(struct list_head *list)
670{
671 unsigned long *ptr = (unsigned long *)&list->next;
672
673 *ptr &= ~RB_FLAG_MASK;
674}
675
676/*
 677 * rb_head_page_deactivate - clears head page ptr (for free list)
678 */
679static void
680rb_head_page_deactivate(struct ring_buffer_per_cpu *cpu_buffer)
681{
682 struct list_head *hd;
683
684 /* Go through the whole list and clear any pointers found. */
685 rb_list_head_clear(cpu_buffer->pages);
686
687 list_for_each(hd, cpu_buffer->pages)
688 rb_list_head_clear(hd);
689}
690
691static int rb_head_page_set(struct ring_buffer_per_cpu *cpu_buffer,
692 struct buffer_page *head,
693 struct buffer_page *prev,
694 int old_flag, int new_flag)
695{
696 struct list_head *list;
697 unsigned long val = (unsigned long)&head->list;
698 unsigned long ret;
699
700 list = &prev->list;
701
702 val &= ~RB_FLAG_MASK;
703
704 ret = (unsigned long)cmpxchg(&list->next,
705 val | old_flag, val | new_flag);
706
707 /* check if the reader took the page */
708 if ((ret & ~RB_FLAG_MASK) != val)
709 return RB_PAGE_MOVED;
710
711 return ret & RB_FLAG_MASK;
712}
713
714static int rb_head_page_set_update(struct ring_buffer_per_cpu *cpu_buffer,
715 struct buffer_page *head,
716 struct buffer_page *prev,
717 int old_flag)
718{
719 return rb_head_page_set(cpu_buffer, head, prev,
720 old_flag, RB_PAGE_UPDATE);
721}
722
723static int rb_head_page_set_head(struct ring_buffer_per_cpu *cpu_buffer,
724 struct buffer_page *head,
725 struct buffer_page *prev,
726 int old_flag)
727{
728 return rb_head_page_set(cpu_buffer, head, prev,
729 old_flag, RB_PAGE_HEAD);
730}
731
732static int rb_head_page_set_normal(struct ring_buffer_per_cpu *cpu_buffer,
733 struct buffer_page *head,
734 struct buffer_page *prev,
735 int old_flag)
736{
737 return rb_head_page_set(cpu_buffer, head, prev,
738 old_flag, RB_PAGE_NORMAL);
739}
740
741static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
742 struct buffer_page **bpage)
743{
744 struct list_head *p = rb_list_head((*bpage)->list.next);
745
746 *bpage = list_entry(p, struct buffer_page, list);
747}
748
749static struct buffer_page *
750rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer)
751{
752 struct buffer_page *head;
753 struct buffer_page *page;
754 struct list_head *list;
755 int i;
756
757 if (RB_WARN_ON(cpu_buffer, !cpu_buffer->head_page))
758 return NULL;
759
760 /* sanity check */
761 list = cpu_buffer->pages;
762 if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev->next) != list))
763 return NULL;
764
765 page = head = cpu_buffer->head_page;
766 /*
767 * It is possible that the writer moves the header behind
768 * where we started, and we miss in one loop.
769 * A second loop should grab the header, but we'll do
770 * three loops just because I'm paranoid.
771 */
772 for (i = 0; i < 3; i++) {
773 do {
774 if (rb_is_head_page(cpu_buffer, page, page->list.prev)) {
775 cpu_buffer->head_page = page;
776 return page;
777 }
778 rb_inc_page(cpu_buffer, &page);
779 } while (page != head);
780 }
781
782 RB_WARN_ON(cpu_buffer, 1);
783
784 return NULL;
785}
786
787static int rb_head_page_replace(struct buffer_page *old,
788 struct buffer_page *new)
789{
790 unsigned long *ptr = (unsigned long *)&old->list.prev->next;
791 unsigned long val;
792 unsigned long ret;
793
794 val = *ptr & ~RB_FLAG_MASK;
795 val |= RB_PAGE_HEAD;
796
797 ret = cmpxchg(ptr, val, &new->list);
798
799 return ret == val;
800}
801
802/*
803 * rb_tail_page_update - move the tail page forward
804 *
805 * Returns 1 if moved tail page, 0 if someone else did.
806 */
807static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
808 struct buffer_page *tail_page,
809 struct buffer_page *next_page)
810{
811 struct buffer_page *old_tail;
812 unsigned long old_entries;
813 unsigned long old_write;
814 int ret = 0;
815
816 /*
817 * The tail page now needs to be moved forward.
818 *
819 * We need to reset the tail page, but without messing
820 * with possible erasing of data brought in by interrupts
821 * that have moved the tail page and are currently on it.
822 *
823 * We add a counter to the write field to denote this.
824 */
825 old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write);
826 old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries);
827
828 /*
829 * Just make sure we have seen our old_write and synchronize
830 * with any interrupts that come in.
831 */
832 barrier();
833
834 /*
835 * If the tail page is still the same as what we think
836 * it is, then it is up to us to update the tail
837 * pointer.
838 */
839 if (tail_page == cpu_buffer->tail_page) {
840 /* Zero the write counter */
841 unsigned long val = old_write & ~RB_WRITE_MASK;
842 unsigned long eval = old_entries & ~RB_WRITE_MASK;
843
844 /*
845 * This will only succeed if an interrupt did
 846 * not come in and change it, in which case we
 847 * do not want to modify it.
848 *
849 * We add (void) to let the compiler know that we do not care
850 * about the return value of these functions. We use the
851 * cmpxchg to only update if an interrupt did not already
852 * do it for us. If the cmpxchg fails, we don't care.
853 */
854 (void)local_cmpxchg(&next_page->write, old_write, val);
855 (void)local_cmpxchg(&next_page->entries, old_entries, eval);
856
857 /*
858 * No need to worry about races with clearing out the commit.
859 * it only can increment when a commit takes place. But that
860 * only happens in the outer most nested commit.
861 */
862 local_set(&next_page->page->commit, 0);
863
864 old_tail = cmpxchg(&cpu_buffer->tail_page,
865 tail_page, next_page);
866
867 if (old_tail == tail_page)
868 ret = 1;
869 }
870
871 return ret;
872}
873
874static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
875 struct buffer_page *bpage)
876{
877 unsigned long val = (unsigned long)bpage;
878
879 if (RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK))
880 return 1;
881
882 return 0;
883}
884
885/**
886 * rb_check_list - make sure a pointer to a list has the last bits zero
887 */
888static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer,
889 struct list_head *list)
890{
891 if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev) != list->prev))
892 return 1;
893 if (RB_WARN_ON(cpu_buffer, rb_list_head(list->next) != list->next))
894 return 1;
895 return 0;
896}
897
489/** 898/**
490 * check_pages - integrity check of buffer pages 899 * check_pages - integrity check of buffer pages
491 * @cpu_buffer: CPU buffer with pages to test 900 * @cpu_buffer: CPU buffer with pages to test
@@ -495,14 +904,19 @@ EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
495 */ 904 */
496static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) 905static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
497{ 906{
498 struct list_head *head = &cpu_buffer->pages; 907 struct list_head *head = cpu_buffer->pages;
499 struct buffer_page *bpage, *tmp; 908 struct buffer_page *bpage, *tmp;
500 909
910 rb_head_page_deactivate(cpu_buffer);
911
501 if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) 912 if (RB_WARN_ON(cpu_buffer, head->next->prev != head))
502 return -1; 913 return -1;
503 if (RB_WARN_ON(cpu_buffer, head->prev->next != head)) 914 if (RB_WARN_ON(cpu_buffer, head->prev->next != head))
504 return -1; 915 return -1;
505 916
917 if (rb_check_list(cpu_buffer, head))
918 return -1;
919
506 list_for_each_entry_safe(bpage, tmp, head, list) { 920 list_for_each_entry_safe(bpage, tmp, head, list) {
507 if (RB_WARN_ON(cpu_buffer, 921 if (RB_WARN_ON(cpu_buffer,
508 bpage->list.next->prev != &bpage->list)) 922 bpage->list.next->prev != &bpage->list))
@@ -510,25 +924,33 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
510 if (RB_WARN_ON(cpu_buffer, 924 if (RB_WARN_ON(cpu_buffer,
511 bpage->list.prev->next != &bpage->list)) 925 bpage->list.prev->next != &bpage->list))
512 return -1; 926 return -1;
927 if (rb_check_list(cpu_buffer, &bpage->list))
928 return -1;
513 } 929 }
514 930
931 rb_head_page_activate(cpu_buffer);
932
515 return 0; 933 return 0;
516} 934}
517 935
518static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, 936static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
519 unsigned nr_pages) 937 unsigned nr_pages)
520{ 938{
521 struct list_head *head = &cpu_buffer->pages;
522 struct buffer_page *bpage, *tmp; 939 struct buffer_page *bpage, *tmp;
523 unsigned long addr; 940 unsigned long addr;
524 LIST_HEAD(pages); 941 LIST_HEAD(pages);
525 unsigned i; 942 unsigned i;
526 943
944 WARN_ON(!nr_pages);
945
527 for (i = 0; i < nr_pages; i++) { 946 for (i = 0; i < nr_pages; i++) {
528 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 947 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
529 GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); 948 GFP_KERNEL, cpu_to_node(cpu_buffer->cpu));
530 if (!bpage) 949 if (!bpage)
531 goto free_pages; 950 goto free_pages;
951
952 rb_check_bpage(cpu_buffer, bpage);
953
532 list_add(&bpage->list, &pages); 954 list_add(&bpage->list, &pages);
533 955
534 addr = __get_free_page(GFP_KERNEL); 956 addr = __get_free_page(GFP_KERNEL);
@@ -538,7 +960,13 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
538 rb_init_page(bpage->page); 960 rb_init_page(bpage->page);
539 } 961 }
540 962
541 list_splice(&pages, head); 963 /*
964 * The ring buffer page list is a circular list that does not
965 * start and end with a list head. All page list items point to
966 * other pages.
967 */
968 cpu_buffer->pages = pages.next;
969 list_del(&pages);
542 970
543 rb_check_pages(cpu_buffer); 971 rb_check_pages(cpu_buffer);
544 972
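The two added lines above are what make the page list a pure ring: LIST_HEAD(pages) is only a temporary handle while the pages are allocated, the first real page is recorded in cpu_buffer->pages, and list_del() then unlinks the handle so every node in the circle is a buffer_page. A small standalone sketch of that handle-removal step, with a minimal open-coded list in place of <linux/list.h> (illustration only):

#include <assert.h>
#include <stdio.h>

/* minimal doubly linked circular list, same shape as <linux/list.h> */
struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

static void list_add_tail(struct list_head *new, struct list_head *head)
{
	new->prev = head->prev;
	new->next = head;
	head->prev->next = new;
	head->prev = new;
}

static void list_del(struct list_head *entry)
{
	entry->prev->next = entry->next;
	entry->next->prev = entry->prev;
}

struct buffer_page { struct list_head list; };  /* only the list member matters here */

int main(void)
{
	struct list_head pages;                 /* temporary handle, like LIST_HEAD(pages) */
	struct buffer_page bp[3];
	struct list_head *first, *p;
	int count = 0;

	INIT_LIST_HEAD(&pages);
	for (int i = 0; i < 3; i++)
		list_add_tail(&bp[i].list, &pages);

	first = pages.next;                     /* cpu_buffer->pages = pages.next;  */
	list_del(&pages);                       /* drop the handle: the pages now   */
	                                        /* form a pure circle of buffer_pages */
	p = first;
	do {
		count++;
		p = p->next;
	} while (p != first);

	assert(count == 3);                     /* no list_head left in the circle */
	printf("circular list of %d pages, no anchor node\n", count);
	return 0;
}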
@@ -570,13 +998,14 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
570 spin_lock_init(&cpu_buffer->reader_lock); 998 spin_lock_init(&cpu_buffer->reader_lock);
571 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); 999 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
572 cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 1000 cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
573 INIT_LIST_HEAD(&cpu_buffer->pages);
574 1001
575 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 1002 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
576 GFP_KERNEL, cpu_to_node(cpu)); 1003 GFP_KERNEL, cpu_to_node(cpu));
577 if (!bpage) 1004 if (!bpage)
578 goto fail_free_buffer; 1005 goto fail_free_buffer;
579 1006
1007 rb_check_bpage(cpu_buffer, bpage);
1008
580 cpu_buffer->reader_page = bpage; 1009 cpu_buffer->reader_page = bpage;
581 addr = __get_free_page(GFP_KERNEL); 1010 addr = __get_free_page(GFP_KERNEL);
582 if (!addr) 1011 if (!addr)
@@ -591,9 +1020,11 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
591 goto fail_free_reader; 1020 goto fail_free_reader;
592 1021
593 cpu_buffer->head_page 1022 cpu_buffer->head_page
594 = list_entry(cpu_buffer->pages.next, struct buffer_page, list); 1023 = list_entry(cpu_buffer->pages, struct buffer_page, list);
595 cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; 1024 cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;
596 1025
1026 rb_head_page_activate(cpu_buffer);
1027
597 return cpu_buffer; 1028 return cpu_buffer;
598 1029
599 fail_free_reader: 1030 fail_free_reader:
@@ -606,24 +1037,25 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
606 1037
607static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) 1038static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
608{ 1039{
609 struct list_head *head = &cpu_buffer->pages; 1040 struct list_head *head = cpu_buffer->pages;
610 struct buffer_page *bpage, *tmp; 1041 struct buffer_page *bpage, *tmp;
611 1042
612 free_buffer_page(cpu_buffer->reader_page); 1043 free_buffer_page(cpu_buffer->reader_page);
613 1044
614 list_for_each_entry_safe(bpage, tmp, head, list) { 1045 rb_head_page_deactivate(cpu_buffer);
615 list_del_init(&bpage->list); 1046
1047 if (head) {
1048 list_for_each_entry_safe(bpage, tmp, head, list) {
1049 list_del_init(&bpage->list);
1050 free_buffer_page(bpage);
1051 }
1052 bpage = list_entry(head, struct buffer_page, list);
616 free_buffer_page(bpage); 1053 free_buffer_page(bpage);
617 } 1054 }
1055
618 kfree(cpu_buffer); 1056 kfree(cpu_buffer);
619} 1057}
620 1058
621/*
622 * Causes compile errors if the struct buffer_page gets bigger
623 * than the struct page.
624 */
625extern int ring_buffer_page_too_big(void);
626
627#ifdef CONFIG_HOTPLUG_CPU 1059#ifdef CONFIG_HOTPLUG_CPU
628static int rb_cpu_notify(struct notifier_block *self, 1060static int rb_cpu_notify(struct notifier_block *self,
629 unsigned long action, void *hcpu); 1061 unsigned long action, void *hcpu);
@@ -646,11 +1078,6 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
646 int bsize; 1078 int bsize;
647 int cpu; 1079 int cpu;
648 1080
649 /* Paranoid! Optimizes out when all is well */
650 if (sizeof(struct buffer_page) > sizeof(struct page))
651 ring_buffer_page_too_big();
652
653
654 /* keep it in its own cache line */ 1081 /* keep it in its own cache line */
655 buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), 1082 buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
656 GFP_KERNEL); 1083 GFP_KERNEL);
@@ -666,8 +1093,8 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
666 buffer->reader_lock_key = key; 1093 buffer->reader_lock_key = key;
667 1094
668 /* need at least two pages */ 1095 /* need at least two pages */
669 if (buffer->pages == 1) 1096 if (buffer->pages < 2)
670 buffer->pages++; 1097 buffer->pages = 2;
671 1098
672 /* 1099 /*
673 * In case of non-hotplug cpu, if the ring-buffer is allocated 1100 * In case of non-hotplug cpu, if the ring-buffer is allocated
@@ -743,6 +1170,7 @@ ring_buffer_free(struct ring_buffer *buffer)
743 1170
744 put_online_cpus(); 1171 put_online_cpus();
745 1172
1173 kfree(buffer->buffers);
746 free_cpumask_var(buffer->cpumask); 1174 free_cpumask_var(buffer->cpumask);
747 1175
748 kfree(buffer); 1176 kfree(buffer);
@@ -767,15 +1195,17 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
767 atomic_inc(&cpu_buffer->record_disabled); 1195 atomic_inc(&cpu_buffer->record_disabled);
768 synchronize_sched(); 1196 synchronize_sched();
769 1197
1198 rb_head_page_deactivate(cpu_buffer);
1199
770 for (i = 0; i < nr_pages; i++) { 1200 for (i = 0; i < nr_pages; i++) {
771 if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages))) 1201 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
772 return; 1202 return;
773 p = cpu_buffer->pages.next; 1203 p = cpu_buffer->pages->next;
774 bpage = list_entry(p, struct buffer_page, list); 1204 bpage = list_entry(p, struct buffer_page, list);
775 list_del_init(&bpage->list); 1205 list_del_init(&bpage->list);
776 free_buffer_page(bpage); 1206 free_buffer_page(bpage);
777 } 1207 }
778 if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages))) 1208 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
779 return; 1209 return;
780 1210
781 rb_reset_cpu(cpu_buffer); 1211 rb_reset_cpu(cpu_buffer);
@@ -797,15 +1227,19 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
797 atomic_inc(&cpu_buffer->record_disabled); 1227 atomic_inc(&cpu_buffer->record_disabled);
798 synchronize_sched(); 1228 synchronize_sched();
799 1229
1230 spin_lock_irq(&cpu_buffer->reader_lock);
1231 rb_head_page_deactivate(cpu_buffer);
1232
800 for (i = 0; i < nr_pages; i++) { 1233 for (i = 0; i < nr_pages; i++) {
801 if (RB_WARN_ON(cpu_buffer, list_empty(pages))) 1234 if (RB_WARN_ON(cpu_buffer, list_empty(pages)))
802 return; 1235 return;
803 p = pages->next; 1236 p = pages->next;
804 bpage = list_entry(p, struct buffer_page, list); 1237 bpage = list_entry(p, struct buffer_page, list);
805 list_del_init(&bpage->list); 1238 list_del_init(&bpage->list);
806 list_add_tail(&bpage->list, &cpu_buffer->pages); 1239 list_add_tail(&bpage->list, cpu_buffer->pages);
807 } 1240 }
808 rb_reset_cpu(cpu_buffer); 1241 rb_reset_cpu(cpu_buffer);
1242 spin_unlock_irq(&cpu_buffer->reader_lock);
809 1243
810 rb_check_pages(cpu_buffer); 1244 rb_check_pages(cpu_buffer);
811 1245
@@ -956,21 +1390,14 @@ rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer)
956} 1390}
957 1391
958static inline struct ring_buffer_event * 1392static inline struct ring_buffer_event *
959rb_head_event(struct ring_buffer_per_cpu *cpu_buffer)
960{
961 return __rb_page_index(cpu_buffer->head_page,
962 cpu_buffer->head_page->read);
963}
964
965static inline struct ring_buffer_event *
966rb_iter_head_event(struct ring_buffer_iter *iter) 1393rb_iter_head_event(struct ring_buffer_iter *iter)
967{ 1394{
968 return __rb_page_index(iter->head_page, iter->head); 1395 return __rb_page_index(iter->head_page, iter->head);
969} 1396}
970 1397
971static inline unsigned rb_page_write(struct buffer_page *bpage) 1398static inline unsigned long rb_page_write(struct buffer_page *bpage)
972{ 1399{
973 return local_read(&bpage->write); 1400 return local_read(&bpage->write) & RB_WRITE_MASK;
974} 1401}
975 1402
976static inline unsigned rb_page_commit(struct buffer_page *bpage) 1403static inline unsigned rb_page_commit(struct buffer_page *bpage)
@@ -978,6 +1405,11 @@ static inline unsigned rb_page_commit(struct buffer_page *bpage)
978 return local_read(&bpage->page->commit); 1405 return local_read(&bpage->page->commit);
979} 1406}
980 1407
1408static inline unsigned long rb_page_entries(struct buffer_page *bpage)
1409{
1410 return local_read(&bpage->entries) & RB_WRITE_MASK;
1411}
1412
981/* Size is determined by what has been committed */ 1413
982static inline unsigned rb_page_size(struct buffer_page *bpage) 1414static inline unsigned rb_page_size(struct buffer_page *bpage)
983{ 1415{
@@ -990,33 +1422,17 @@ rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer)
990 return rb_page_commit(cpu_buffer->commit_page); 1422 return rb_page_commit(cpu_buffer->commit_page);
991} 1423}
992 1424
993static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer)
994{
995 return rb_page_commit(cpu_buffer->head_page);
996}
997
998static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
999 struct buffer_page **bpage)
1000{
1001 struct list_head *p = (*bpage)->list.next;
1002
1003 if (p == &cpu_buffer->pages)
1004 p = p->next;
1005
1006 *bpage = list_entry(p, struct buffer_page, list);
1007}
1008
1009static inline unsigned 1425static inline unsigned
1010rb_event_index(struct ring_buffer_event *event) 1426rb_event_index(struct ring_buffer_event *event)
1011{ 1427{
1012 unsigned long addr = (unsigned long)event; 1428 unsigned long addr = (unsigned long)event;
1013 1429
1014 return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE); 1430 return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE;
1015} 1431}
1016 1432
1017static inline int 1433static inline int
1018rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer, 1434rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
1019 struct ring_buffer_event *event) 1435 struct ring_buffer_event *event)
1020{ 1436{
1021 unsigned long addr = (unsigned long)event; 1437 unsigned long addr = (unsigned long)event;
1022 unsigned long index; 1438 unsigned long index;
@@ -1029,33 +1445,10 @@ rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
1029} 1445}
1030 1446
1031static void 1447static void
1032rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer,
1033 struct ring_buffer_event *event)
1034{
1035 unsigned long addr = (unsigned long)event;
1036 unsigned long index;
1037
1038 index = rb_event_index(event);
1039 addr &= PAGE_MASK;
1040
1041 while (cpu_buffer->commit_page->page != (void *)addr) {
1042 if (RB_WARN_ON(cpu_buffer,
1043 cpu_buffer->commit_page == cpu_buffer->tail_page))
1044 return;
1045 cpu_buffer->commit_page->page->commit =
1046 cpu_buffer->commit_page->write;
1047 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
1048 cpu_buffer->write_stamp =
1049 cpu_buffer->commit_page->page->time_stamp;
1050 }
1051
1052 /* Now set the commit to the event's index */
1053 local_set(&cpu_buffer->commit_page->page->commit, index);
1054}
1055
1056static void
1057rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) 1448rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
1058{ 1449{
1450 unsigned long max_count;
1451
1059 /* 1452 /*
1060 * We only race with interrupts and NMIs on this CPU. 1453 * We only race with interrupts and NMIs on this CPU.
1061 * If we own the commit event, then we can commit 1454 * If we own the commit event, then we can commit
@@ -1065,9 +1458,16 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
1065 * assign the commit to the tail. 1458 * assign the commit to the tail.
1066 */ 1459 */
1067 again: 1460 again:
1461 max_count = cpu_buffer->buffer->pages * 100;
1462
1068 while (cpu_buffer->commit_page != cpu_buffer->tail_page) { 1463 while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
1069 cpu_buffer->commit_page->page->commit = 1464 if (RB_WARN_ON(cpu_buffer, !(--max_count)))
1070 cpu_buffer->commit_page->write; 1465 return;
1466 if (RB_WARN_ON(cpu_buffer,
1467 rb_is_reader_page(cpu_buffer->tail_page)))
1468 return;
1469 local_set(&cpu_buffer->commit_page->page->commit,
1470 rb_page_write(cpu_buffer->commit_page));
1071 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); 1471 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
1072 cpu_buffer->write_stamp = 1472 cpu_buffer->write_stamp =
1073 cpu_buffer->commit_page->page->time_stamp; 1473 cpu_buffer->commit_page->page->time_stamp;
@@ -1076,8 +1476,12 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
1076 } 1476 }
1077 while (rb_commit_index(cpu_buffer) != 1477 while (rb_commit_index(cpu_buffer) !=
1078 rb_page_write(cpu_buffer->commit_page)) { 1478 rb_page_write(cpu_buffer->commit_page)) {
1079 cpu_buffer->commit_page->page->commit = 1479
1080 cpu_buffer->commit_page->write; 1480 local_set(&cpu_buffer->commit_page->page->commit,
1481 rb_page_write(cpu_buffer->commit_page));
1482 RB_WARN_ON(cpu_buffer,
1483 local_read(&cpu_buffer->commit_page->page->commit) &
1484 ~RB_WRITE_MASK);
1081 barrier(); 1485 barrier();
1082 } 1486 }
1083 1487
@@ -1110,7 +1514,7 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
1110 * to the head page instead of next. 1514 * to the head page instead of next.
1111 */ 1515 */
1112 if (iter->head_page == cpu_buffer->reader_page) 1516 if (iter->head_page == cpu_buffer->reader_page)
1113 iter->head_page = cpu_buffer->head_page; 1517 iter->head_page = rb_set_head_page(cpu_buffer);
1114 else 1518 else
1115 rb_inc_page(cpu_buffer, &iter->head_page); 1519 rb_inc_page(cpu_buffer, &iter->head_page);
1116 1520
@@ -1154,6 +1558,163 @@ rb_update_event(struct ring_buffer_event *event,
1154 } 1558 }
1155} 1559}
1156 1560
1561/*
1562 * rb_handle_head_page - writer hit the head page
1563 *
1564 * Returns: +1 to retry page
1565 * 0 to continue
1566 * -1 on error
1567 */
1568static int
1569rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
1570 struct buffer_page *tail_page,
1571 struct buffer_page *next_page)
1572{
1573 struct buffer_page *new_head;
1574 int entries;
1575 int type;
1576 int ret;
1577
1578 entries = rb_page_entries(next_page);
1579
1580 /*
1581 * The hard part is here. We need to move the head
1582 * forward, and protect against both readers on
1583 * other CPUs and writers coming in via interrupts.
1584 */
1585 type = rb_head_page_set_update(cpu_buffer, next_page, tail_page,
1586 RB_PAGE_HEAD);
1587
1588 /*
1589 * type can be one of four:
1590 * NORMAL - an interrupt already moved it for us
1591 * HEAD - we are the first to get here.
1592 * UPDATE - we are the interrupt interrupting
1593 * a current move.
1594 * MOVED - a reader on another CPU moved the next
1595 * pointer to its reader page. Give up
1596 * and try again.
1597 */
1598
1599 switch (type) {
1600 case RB_PAGE_HEAD:
1601 /*
1602 * We changed the head to UPDATE, thus
1603 * it is our responsibility to update
1604 * the counters.
1605 */
1606 local_add(entries, &cpu_buffer->overrun);
1607
1608 /*
1609 * The entries will be zeroed out when we move the
1610 * tail page.
1611 */
1612
1613 /* still more to do */
1614 break;
1615
1616 case RB_PAGE_UPDATE:
1617 /*
 1618 * This is an interrupt that interrupted the
1619 * previous update. Still more to do.
1620 */
1621 break;
1622 case RB_PAGE_NORMAL:
1623 /*
1624 * An interrupt came in before the update
1625 * and processed this for us.
1626 * Nothing left to do.
1627 */
1628 return 1;
1629 case RB_PAGE_MOVED:
1630 /*
1631 * The reader is on another CPU and just did
1632 * a swap with our next_page.
1633 * Try again.
1634 */
1635 return 1;
1636 default:
1637 RB_WARN_ON(cpu_buffer, 1); /* WTF??? */
1638 return -1;
1639 }
1640
1641 /*
1642 * Now that we are here, the old head pointer is
1643 * set to UPDATE. This will keep the reader from
1644 * swapping the head page with the reader page.
1645 * The reader (on another CPU) will spin till
1646 * we are finished.
1647 *
1648 * We just need to protect against interrupts
1649 * doing the job. We will set the next pointer
1650 * to HEAD. After that, we set the old pointer
1651 * to NORMAL, but only if it was HEAD before.
 1652 * Otherwise we are an interrupt, and only
 1653 * want the outermost commit to reset it.
1654 */
1655 new_head = next_page;
1656 rb_inc_page(cpu_buffer, &new_head);
1657
1658 ret = rb_head_page_set_head(cpu_buffer, new_head, next_page,
1659 RB_PAGE_NORMAL);
1660
1661 /*
1662 * Valid returns are:
1663 * HEAD - an interrupt came in and already set it.
1664 * NORMAL - One of two things:
1665 * 1) We really set it.
1666 * 2) A bunch of interrupts came in and moved
1667 * the page forward again.
1668 */
1669 switch (ret) {
1670 case RB_PAGE_HEAD:
1671 case RB_PAGE_NORMAL:
1672 /* OK */
1673 break;
1674 default:
1675 RB_WARN_ON(cpu_buffer, 1);
1676 return -1;
1677 }
1678
1679 /*
1680 * It is possible that an interrupt came in,
1681 * set the head up, then more interrupts came in
1682 * and moved it again. When we get back here,
1683 * the page would have been set to NORMAL but we
1684 * just set it back to HEAD.
1685 *
1686 * How do you detect this? Well, if that happened
1687 * the tail page would have moved.
1688 */
1689 if (ret == RB_PAGE_NORMAL) {
1690 /*
 1691 * If the tail had moved past next, then we need
1692 * to reset the pointer.
1693 */
1694 if (cpu_buffer->tail_page != tail_page &&
1695 cpu_buffer->tail_page != next_page)
1696 rb_head_page_set_normal(cpu_buffer, new_head,
1697 next_page,
1698 RB_PAGE_HEAD);
1699 }
1700
1701 /*
 1702 * If this was the outermost commit (the one that
1703 * changed the original pointer from HEAD to UPDATE),
1704 * then it is up to us to reset it to NORMAL.
1705 */
1706 if (type == RB_PAGE_HEAD) {
1707 ret = rb_head_page_set_normal(cpu_buffer, next_page,
1708 tail_page,
1709 RB_PAGE_UPDATE);
1710 if (RB_WARN_ON(cpu_buffer,
1711 ret != RB_PAGE_UPDATE))
1712 return -1;
1713 }
1714
1715 return 0;
1716}
1717
1157static unsigned rb_calculate_event_length(unsigned length) 1718static unsigned rb_calculate_event_length(unsigned length)
1158{ 1719{
1159 struct ring_buffer_event event; /* Used only for sizeof array */ 1720 struct ring_buffer_event event; /* Used only for sizeof array */
@@ -1171,6 +1732,57 @@ static unsigned rb_calculate_event_length(unsigned length)
1171 return length; 1732 return length;
1172} 1733}
1173 1734
1735static inline void
1736rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1737 struct buffer_page *tail_page,
1738 unsigned long tail, unsigned long length)
1739{
1740 struct ring_buffer_event *event;
1741
1742 /*
1743 * Only the event that crossed the page boundary
1744 * must fill the old tail_page with padding.
1745 */
1746 if (tail >= BUF_PAGE_SIZE) {
1747 local_sub(length, &tail_page->write);
1748 return;
1749 }
1750
1751 event = __rb_page_index(tail_page, tail);
1752 kmemcheck_annotate_bitfield(event, bitfield);
1753
1754 /*
1755 * If this event is bigger than the minimum size, then
1756 * we need to be careful that we don't subtract the
1757 * write counter enough to allow another writer to slip
1758 * in on this page.
1759 * We put in a discarded commit instead, to make sure
1760 * that this space is not used again.
1761 *
1762 * If we are less than the minimum size, we don't need to
1763 * worry about it.
1764 */
1765 if (tail > (BUF_PAGE_SIZE - RB_EVNT_MIN_SIZE)) {
1766 /* No room for any events */
1767
1768 /* Mark the rest of the page with padding */
1769 rb_event_set_padding(event);
1770
1771 /* Set the write back to the previous setting */
1772 local_sub(length, &tail_page->write);
1773 return;
1774 }
1775
1776 /* Put in a discarded event */
1777 event->array[0] = (BUF_PAGE_SIZE - tail) - RB_EVNT_HDR_SIZE;
1778 event->type_len = RINGBUF_TYPE_PADDING;
1779 /* time delta must be non zero */
1780 event->time_delta = 1;
1781
1782 /* Set write to end of buffer */
1783 length = (tail + length) - BUF_PAGE_SIZE;
1784 local_sub(length, &tail_page->write);
1785}
1174 1786
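A worked example of the arithmetic in rb_reset_tail() above. The header sizes are illustrative assumptions (16 bytes for the buffer_data_page header and 4 bytes for the event header, as on a typical 64-bit build); the point is how the discarded padding event's length and the write rollback are computed when an event crosses the page boundary:

#include <stdio.h>

/* assumed sizes for illustration only */
#define PAGE_SIZE         4096UL
#define BUF_PAGE_HDR_SIZE 16UL
#define BUF_PAGE_SIZE     (PAGE_SIZE - BUF_PAGE_HDR_SIZE)   /* 4080 */
#define RB_EVNT_HDR_SIZE  4UL
#define RB_EVNT_MIN_SIZE  8UL

int main(void)
{
	/* the event that crossed the page boundary */
	unsigned long tail   = 4000;   /* index where the event started   */
	unsigned long length = 120;    /* reserved length, ends past 4080 */

	if (tail > BUF_PAGE_SIZE - RB_EVNT_MIN_SIZE) {
		/* too close to the end for even a discarded event:
		 * pad the remainder and roll write all the way back */
		printf("no room: pad to end, roll write back by %lu\n", length);
		return 0;
	}

	/* discarded padding event fills the rest of the page ... */
	unsigned long pad_payload = (BUF_PAGE_SIZE - tail) - RB_EVNT_HDR_SIZE;
	/* ... and write is trimmed back to exactly the end of the page */
	unsigned long rollback = (tail + length) - BUF_PAGE_SIZE;

	printf("padding payload = %lu bytes\n", pad_payload);   /* 76 */
	printf("write rolled back by %lu -> index %lu\n",
	       rollback, tail + length - rollback);             /* 40 -> 4080 */
	return 0;
}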
1175static struct ring_buffer_event * 1787static struct ring_buffer_event *
1176rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, 1788rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
@@ -1178,128 +1790,101 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1178 struct buffer_page *commit_page, 1790 struct buffer_page *commit_page,
1179 struct buffer_page *tail_page, u64 *ts) 1791 struct buffer_page *tail_page, u64 *ts)
1180{ 1792{
1181 struct buffer_page *next_page, *head_page, *reader_page;
1182 struct ring_buffer *buffer = cpu_buffer->buffer; 1793 struct ring_buffer *buffer = cpu_buffer->buffer;
1183 struct ring_buffer_event *event; 1794 struct buffer_page *next_page;
1184 bool lock_taken = false; 1795 int ret;
1185 unsigned long flags;
1186 1796
1187 next_page = tail_page; 1797 next_page = tail_page;
1188 1798
1189 local_irq_save(flags);
1190 /*
1191 * Since the write to the buffer is still not
1192 * fully lockless, we must be careful with NMIs.
1193 * The locks in the writers are taken when a write
1194 * crosses to a new page. The locks protect against
1195 * races with the readers (this will soon be fixed
1196 * with a lockless solution).
1197 *
1198 * Because we can not protect against NMIs, and we
1199 * want to keep traces reentrant, we need to manage
1200 * what happens when we are in an NMI.
1201 *
1202 * NMIs can happen after we take the lock.
1203 * If we are in an NMI, only take the lock
1204 * if it is not already taken. Otherwise
1205 * simply fail.
1206 */
1207 if (unlikely(in_nmi())) {
1208 if (!__raw_spin_trylock(&cpu_buffer->lock)) {
1209 cpu_buffer->nmi_dropped++;
1210 goto out_reset;
1211 }
1212 } else
1213 __raw_spin_lock(&cpu_buffer->lock);
1214
1215 lock_taken = true;
1216
1217 rb_inc_page(cpu_buffer, &next_page); 1799 rb_inc_page(cpu_buffer, &next_page);
1218 1800
1219 head_page = cpu_buffer->head_page;
1220 reader_page = cpu_buffer->reader_page;
1221
1222 /* we grabbed the lock before incrementing */
1223 if (RB_WARN_ON(cpu_buffer, next_page == reader_page))
1224 goto out_reset;
1225
1226 /* 1801 /*
1227 * If for some reason, we had an interrupt storm that made 1802 * If for some reason, we had an interrupt storm that made
1228 * it all the way around the buffer, bail, and warn 1803 * it all the way around the buffer, bail, and warn
1229 * about it. 1804 * about it.
1230 */ 1805 */
1231 if (unlikely(next_page == commit_page)) { 1806 if (unlikely(next_page == commit_page)) {
1232 cpu_buffer->commit_overrun++; 1807 local_inc(&cpu_buffer->commit_overrun);
1233 goto out_reset; 1808 goto out_reset;
1234 } 1809 }
1235 1810
1236 if (next_page == head_page) {
1237 if (!(buffer->flags & RB_FL_OVERWRITE))
1238 goto out_reset;
1239
1240 /* tail_page has not moved yet? */
1241 if (tail_page == cpu_buffer->tail_page) {
1242 /* count overflows */
1243 cpu_buffer->overrun +=
1244 local_read(&head_page->entries);
1245
1246 rb_inc_page(cpu_buffer, &head_page);
1247 cpu_buffer->head_page = head_page;
1248 cpu_buffer->head_page->read = 0;
1249 }
1250 }
1251
1252 /* 1811 /*
1253 * If the tail page is still the same as what we think 1812 * This is where the fun begins!
1254 * it is, then it is up to us to update the tail 1813 *
1255 * pointer. 1814 * We are fighting against races between a reader that
1815 * could be on another CPU trying to swap its reader
1816 * page with the buffer head.
1817 *
1818 * We are also fighting against interrupts coming in and
1819 * moving the head or tail on us as well.
1820 *
1821 * If the next page is the head page then we have filled
1822 * the buffer, unless the commit page is still on the
1823 * reader page.
1256 */ 1824 */
1257 if (tail_page == cpu_buffer->tail_page) { 1825 if (rb_is_head_page(cpu_buffer, next_page, &tail_page->list)) {
1258 local_set(&next_page->write, 0);
1259 local_set(&next_page->entries, 0);
1260 local_set(&next_page->page->commit, 0);
1261 cpu_buffer->tail_page = next_page;
1262 1826
1263 /* reread the time stamp */ 1827 /*
1264 *ts = rb_time_stamp(buffer, cpu_buffer->cpu); 1828 * If the commit is not on the reader page, then
1265 cpu_buffer->tail_page->page->time_stamp = *ts; 1829 * move the header page.
1830 */
1831 if (!rb_is_reader_page(cpu_buffer->commit_page)) {
1832 /*
1833 * If we are not in overwrite mode,
1834 * this is easy, just stop here.
1835 */
1836 if (!(buffer->flags & RB_FL_OVERWRITE))
1837 goto out_reset;
1838
1839 ret = rb_handle_head_page(cpu_buffer,
1840 tail_page,
1841 next_page);
1842 if (ret < 0)
1843 goto out_reset;
1844 if (ret)
1845 goto out_again;
1846 } else {
1847 /*
1848 * We need to be careful here too. The
1849 * commit page could still be on the reader
1850 * page. We could have a small buffer, and
1851 * have filled up the buffer with events
1852 * from interrupts and such, and wrapped.
1853 *
 1854 * Note, if the tail page is also on the
1855 * reader_page, we let it move out.
1856 */
1857 if (unlikely((cpu_buffer->commit_page !=
1858 cpu_buffer->tail_page) &&
1859 (cpu_buffer->commit_page ==
1860 cpu_buffer->reader_page))) {
1861 local_inc(&cpu_buffer->commit_overrun);
1862 goto out_reset;
1863 }
1864 }
1266 } 1865 }
1267 1866
1268 /* 1867 ret = rb_tail_page_update(cpu_buffer, tail_page, next_page);
1269 * The actual tail page has moved forward. 1868 if (ret) {
1270 */ 1869 /*
1271 if (tail < BUF_PAGE_SIZE) { 1870 * Nested commits always have zero deltas, so
1272 /* Mark the rest of the page with padding */ 1871 * just reread the time stamp
1273 event = __rb_page_index(tail_page, tail); 1872 */
1274 kmemcheck_annotate_bitfield(event, bitfield); 1873 *ts = rb_time_stamp(buffer, cpu_buffer->cpu);
1275 rb_event_set_padding(event); 1874 next_page->page->time_stamp = *ts;
1276 } 1875 }
1277 1876
1278 /* Set the write back to the previous setting */ 1877 out_again:
1279 local_sub(length, &tail_page->write);
1280 1878
1281 /* 1879 rb_reset_tail(cpu_buffer, tail_page, tail, length);
1282 * If this was a commit entry that failed,
1283 * increment that too
1284 */
1285 if (tail_page == cpu_buffer->commit_page &&
1286 tail == rb_commit_index(cpu_buffer)) {
1287 rb_set_commit_to_write(cpu_buffer);
1288 }
1289
1290 __raw_spin_unlock(&cpu_buffer->lock);
1291 local_irq_restore(flags);
1292 1880
1293 /* fail and let the caller try again */ 1881 /* fail and let the caller try again */
1294 return ERR_PTR(-EAGAIN); 1882 return ERR_PTR(-EAGAIN);
1295 1883
1296 out_reset: 1884 out_reset:
1297 /* reset write */ 1885 /* reset write */
1298 local_sub(length, &tail_page->write); 1886 rb_reset_tail(cpu_buffer, tail_page, tail, length);
1299 1887
1300 if (likely(lock_taken))
1301 __raw_spin_unlock(&cpu_buffer->lock);
1302 local_irq_restore(flags);
1303 return NULL; 1888 return NULL;
1304} 1889}
1305 1890
@@ -1316,6 +1901,9 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1316 barrier(); 1901 barrier();
1317 tail_page = cpu_buffer->tail_page; 1902 tail_page = cpu_buffer->tail_page;
1318 write = local_add_return(length, &tail_page->write); 1903 write = local_add_return(length, &tail_page->write);
1904
1905 /* set write to only the index of the write */
1906 write &= RB_WRITE_MASK;
1319 tail = write - length; 1907 tail = write - length;
1320 1908
 1321 /* See if we shot past the end of this buffer page */ 1909
@@ -1325,9 +1913,6 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1325 1913
1326 /* We reserved something on the buffer */ 1914 /* We reserved something on the buffer */
1327 1915
1328 if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE))
1329 return NULL;
1330
1331 event = __rb_page_index(tail_page, tail); 1916 event = __rb_page_index(tail_page, tail);
1332 kmemcheck_annotate_bitfield(event, bitfield); 1917 kmemcheck_annotate_bitfield(event, bitfield);
1333 rb_update_event(event, type, length); 1918 rb_update_event(event, type, length);
@@ -1337,11 +1922,11 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1337 local_inc(&tail_page->entries); 1922 local_inc(&tail_page->entries);
1338 1923
1339 /* 1924 /*
1340 * If this is a commit and the tail is zero, then update 1925 * If this is the first commit on the page, then update
1341 * this page's time stamp. 1926 * its timestamp.
1342 */ 1927 */
1343 if (!tail && rb_is_commit(cpu_buffer, event)) 1928 if (!tail)
1344 cpu_buffer->commit_page->page->time_stamp = *ts; 1929 tail_page->page->time_stamp = *ts;
1345 1930
1346 return event; 1931 return event;
1347} 1932}
@@ -1363,12 +1948,16 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
1363 bpage = cpu_buffer->tail_page; 1948 bpage = cpu_buffer->tail_page;
1364 1949
1365 if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { 1950 if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
1951 unsigned long write_mask =
1952 local_read(&bpage->write) & ~RB_WRITE_MASK;
1366 /* 1953 /*
1367 * This is on the tail page. It is possible that 1954 * This is on the tail page. It is possible that
1368 * a write could come in and move the tail page 1955 * a write could come in and move the tail page
1369 * and write to the next page. That is fine 1956 * and write to the next page. That is fine
1370 * because we just shorten what is on this page. 1957 * because we just shorten what is on this page.
1371 */ 1958 */
1959 old_index += write_mask;
1960 new_index += write_mask;
1372 index = local_cmpxchg(&bpage->write, old_index, new_index); 1961 index = local_cmpxchg(&bpage->write, old_index, new_index);
1373 if (index == old_index) 1962 if (index == old_index)
1374 return 1; 1963 return 1;
@@ -1410,16 +1999,16 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1410 return -EAGAIN; 1999 return -EAGAIN;
1411 2000
 1412 /* Only a committed time event can update the write stamp */ 2001
1413 if (rb_is_commit(cpu_buffer, event)) { 2002 if (rb_event_is_commit(cpu_buffer, event)) {
1414 /* 2003 /*
1415 * If this is the first on the page, then we need to 2004 * If this is the first on the page, then it was
1416 * update the page itself, and just put in a zero. 2005 * updated with the page itself. Try to discard it
2006 * and if we can't just make it zero.
1417 */ 2007 */
1418 if (rb_event_index(event)) { 2008 if (rb_event_index(event)) {
1419 event->time_delta = *delta & TS_MASK; 2009 event->time_delta = *delta & TS_MASK;
1420 event->array[0] = *delta >> TS_SHIFT; 2010 event->array[0] = *delta >> TS_SHIFT;
1421 } else { 2011 } else {
1422 cpu_buffer->commit_page->page->time_stamp = *ts;
1423 /* try to discard, since we do not need this */ 2012 /* try to discard, since we do not need this */
1424 if (!rb_try_to_discard(cpu_buffer, event)) { 2013 if (!rb_try_to_discard(cpu_buffer, event)) {
1425 /* nope, just zero it */ 2014 /* nope, just zero it */
@@ -1445,8 +2034,47 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1445 return ret; 2034 return ret;
1446} 2035}
1447 2036
2037static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
2038{
2039 local_inc(&cpu_buffer->committing);
2040 local_inc(&cpu_buffer->commits);
2041}
2042
2043static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
2044{
2045 unsigned long commits;
2046
2047 if (RB_WARN_ON(cpu_buffer,
2048 !local_read(&cpu_buffer->committing)))
2049 return;
2050
2051 again:
2052 commits = local_read(&cpu_buffer->commits);
2053 /* synchronize with interrupts */
2054 barrier();
2055 if (local_read(&cpu_buffer->committing) == 1)
2056 rb_set_commit_to_write(cpu_buffer);
2057
2058 local_dec(&cpu_buffer->committing);
2059
2060 /* synchronize with interrupts */
2061 barrier();
2062
2063 /*
2064 * Need to account for interrupts coming in between the
2065 * updating of the commit page and the clearing of the
2066 * committing counter.
2067 */
2068 if (unlikely(local_read(&cpu_buffer->commits) != commits) &&
2069 !local_read(&cpu_buffer->committing)) {
2070 local_inc(&cpu_buffer->committing);
2071 goto again;
2072 }
2073}
2074
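The committing/commits pair above gives each CPU buffer a nesting count: every reservation bumps both, only the outermost rb_end_commit() (committing == 1) pushes the commit page forward, and re-reading commits after the decrement catches an interrupt that reserved an event between those two steps. A single-threaded sketch of just that counting discipline (plain longs instead of local_t; the real code races against interrupts and NMIs):

#include <assert.h>
#include <stdio.h>

static long committing;   /* nesting depth of writers on this CPU */
static long commits;      /* total reservations seen on this CPU  */

static void rb_start_commit(void) { committing++; commits++; }

static void rb_end_commit(void)
{
	long seen;
again:
	seen = commits;
	if (committing == 1)
		printf("outermost commit: advance the commit page\n");
	committing--;

	/* an interrupt that ran rb_start_commit()/rb_end_commit() right here
	 * would bump 'commits' while 'committing' is back at 0; the re-check
	 * below then redoes the commit on its behalf */
	if (commits != seen && committing == 0) {
		committing++;
		goto again;
	}
}

int main(void)
{
	rb_start_commit();    /* normal context starts an event                */
	rb_start_commit();    /* "interrupt" nests a second event              */
	rb_end_commit();      /* inner end: committing 2 -> 1, no advance      */
	rb_end_commit();      /* outer end: committing == 1, advances commit   */
	assert(committing == 0 && commits == 2);
	return 0;
}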
1448static struct ring_buffer_event * 2075static struct ring_buffer_event *
1449rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, 2076rb_reserve_next_event(struct ring_buffer *buffer,
2077 struct ring_buffer_per_cpu *cpu_buffer,
1450 unsigned long length) 2078 unsigned long length)
1451{ 2079{
1452 struct ring_buffer_event *event; 2080 struct ring_buffer_event *event;
@@ -1454,6 +2082,23 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1454 int commit = 0; 2082 int commit = 0;
1455 int nr_loops = 0; 2083 int nr_loops = 0;
1456 2084
2085 rb_start_commit(cpu_buffer);
2086
2087#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
2088 /*
2089 * Due to the ability to swap a cpu buffer from a buffer
2090 * it is possible it was swapped before we committed.
2091 * (committing stops a swap). We check for it here and
2092 * if it happened, we have to fail the write.
2093 */
2094 barrier();
2095 if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) {
2096 local_dec(&cpu_buffer->committing);
2097 local_dec(&cpu_buffer->commits);
2098 return NULL;
2099 }
2100#endif
2101
1457 length = rb_calculate_event_length(length); 2102 length = rb_calculate_event_length(length);
1458 again: 2103 again:
1459 /* 2104 /*
@@ -1466,7 +2111,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1466 * Bail! 2111 * Bail!
1467 */ 2112 */
1468 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) 2113 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
1469 return NULL; 2114 goto out_fail;
1470 2115
1471 ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu); 2116 ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu);
1472 2117
@@ -1497,7 +2142,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1497 2142
1498 commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); 2143 commit = rb_add_time_stamp(cpu_buffer, &ts, &delta);
1499 if (commit == -EBUSY) 2144 if (commit == -EBUSY)
1500 return NULL; 2145 goto out_fail;
1501 2146
1502 if (commit == -EAGAIN) 2147 if (commit == -EAGAIN)
1503 goto again; 2148 goto again;
@@ -1511,30 +2156,23 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1511 if (unlikely(PTR_ERR(event) == -EAGAIN)) 2156 if (unlikely(PTR_ERR(event) == -EAGAIN))
1512 goto again; 2157 goto again;
1513 2158
1514 if (!event) { 2159 if (!event)
1515 if (unlikely(commit)) 2160 goto out_fail;
1516 /*
1517 * Ouch! We needed a timestamp and it was commited. But
1518 * we didn't get our event reserved.
1519 */
1520 rb_set_commit_to_write(cpu_buffer);
1521 return NULL;
1522 }
1523 2161
1524 /* 2162 if (!rb_event_is_commit(cpu_buffer, event))
1525 * If the timestamp was commited, make the commit our entry
1526 * now so that we will update it when needed.
1527 */
1528 if (unlikely(commit))
1529 rb_set_commit_event(cpu_buffer, event);
1530 else if (!rb_is_commit(cpu_buffer, event))
1531 delta = 0; 2163 delta = 0;
1532 2164
1533 event->time_delta = delta; 2165 event->time_delta = delta;
1534 2166
1535 return event; 2167 return event;
2168
2169 out_fail:
2170 rb_end_commit(cpu_buffer);
2171 return NULL;
1536} 2172}
1537 2173
2174#ifdef CONFIG_TRACING
2175
1538#define TRACE_RECURSIVE_DEPTH 16 2176#define TRACE_RECURSIVE_DEPTH 16
1539 2177
1540static int trace_recursive_lock(void) 2178static int trace_recursive_lock(void)
@@ -1565,6 +2203,13 @@ static void trace_recursive_unlock(void)
1565 current->trace_recursion--; 2203 current->trace_recursion--;
1566} 2204}
1567 2205
2206#else
2207
2208#define trace_recursive_lock() (0)
2209#define trace_recursive_unlock() do { } while (0)
2210
2211#endif
2212
1568static DEFINE_PER_CPU(int, rb_need_resched); 2213static DEFINE_PER_CPU(int, rb_need_resched);
1569 2214
1570/** 2215/**
@@ -1614,7 +2259,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
1614 if (length > BUF_MAX_DATA_SIZE) 2259 if (length > BUF_MAX_DATA_SIZE)
1615 goto out; 2260 goto out;
1616 2261
1617 event = rb_reserve_next_event(cpu_buffer, length); 2262 event = rb_reserve_next_event(buffer, cpu_buffer, length);
1618 if (!event) 2263 if (!event)
1619 goto out; 2264 goto out;
1620 2265
@@ -1637,18 +2282,24 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
1637} 2282}
1638EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); 2283EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
1639 2284
2285static void
2286rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
2287 struct ring_buffer_event *event)
2288{
2289 /*
2290 * The event first in the commit queue updates the
2291 * time stamp.
2292 */
2293 if (rb_event_is_commit(cpu_buffer, event))
2294 cpu_buffer->write_stamp += event->time_delta;
2295}
2296
1640static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, 2297static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
1641 struct ring_buffer_event *event) 2298 struct ring_buffer_event *event)
1642{ 2299{
1643 local_inc(&cpu_buffer->entries); 2300 local_inc(&cpu_buffer->entries);
1644 2301 rb_update_write_stamp(cpu_buffer, event);
1645 /* Only process further if we own the commit */ 2302 rb_end_commit(cpu_buffer);
1646 if (!rb_is_commit(cpu_buffer, event))
1647 return;
1648
1649 cpu_buffer->write_stamp += event->time_delta;
1650
1651 rb_set_commit_to_write(cpu_buffer);
1652} 2303}
1653 2304
1654/** 2305/**
@@ -1694,32 +2345,57 @@ static inline void rb_event_discard(struct ring_buffer_event *event)
1694 event->time_delta = 1; 2345 event->time_delta = 1;
1695} 2346}
1696 2347
1697/** 2348/*
1698 * ring_buffer_event_discard - discard any event in the ring buffer 2349 * Decrement the entries to the page that an event is on.
1699 * @event: the event to discard 2350 * The event does not even need to exist, only the pointer
1700 * 2351 * to the page it is on. This may only be called before the commit
1701 * Sometimes a event that is in the ring buffer needs to be ignored. 2352 * takes place.
1702 * This function lets the user discard an event in the ring buffer
1703 * and then that event will not be read later.
1704 *
1705 * Note, it is up to the user to be careful with this, and protect
1706 * against races. If the user discards an event that has been consumed
1707 * it is possible that it could corrupt the ring buffer.
1708 */ 2353 */
1709void ring_buffer_event_discard(struct ring_buffer_event *event) 2354static inline void
2355rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer,
2356 struct ring_buffer_event *event)
1710{ 2357{
1711 rb_event_discard(event); 2358 unsigned long addr = (unsigned long)event;
2359 struct buffer_page *bpage = cpu_buffer->commit_page;
2360 struct buffer_page *start;
2361
2362 addr &= PAGE_MASK;
2363
2364 /* Do the likely case first */
2365 if (likely(bpage->page == (void *)addr)) {
2366 local_dec(&bpage->entries);
2367 return;
2368 }
2369
2370 /*
2371 * Because the commit page may be on the reader page we
 2373 * start with the next page and do the end-of-loop check there.
2373 */
2374 rb_inc_page(cpu_buffer, &bpage);
2375 start = bpage;
2376 do {
2377 if (bpage->page == (void *)addr) {
2378 local_dec(&bpage->entries);
2379 return;
2380 }
2381 rb_inc_page(cpu_buffer, &bpage);
2382 } while (bpage != start);
2383
2384 /* commit not part of this buffer?? */
2385 RB_WARN_ON(cpu_buffer, 1);
1712} 2386}
1713EXPORT_SYMBOL_GPL(ring_buffer_event_discard);
1714 2387
1715/** 2388/**
1716 * ring_buffer_commit_discard - discard an event that has not been committed 2389 * ring_buffer_commit_discard - discard an event that has not been committed
1717 * @buffer: the ring buffer 2390 * @buffer: the ring buffer
1718 * @event: non committed event to discard 2391 * @event: non committed event to discard
1719 * 2392 *
1720 * This is similar to ring_buffer_event_discard but must only be 2393 * Sometimes an event that is in the ring buffer needs to be ignored.
1721 * performed on an event that has not been committed yet. The difference 2394 * This function lets the user discard an event in the ring buffer
1722 * is that this will also try to free the event from the ring buffer 2395 * and then that event will not be read later.
2396 *
 2397 * This function only works if it is called before the item has been
2398 * committed. It will try to free the event from the ring buffer
1723 * if another event has not been added behind it. 2399 * if another event has not been added behind it.
1724 * 2400 *
1725 * If another event has been added behind it, it will set the event 2401 * If another event has been added behind it, it will set the event
@@ -1737,32 +2413,27 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
1737 /* The event is discarded regardless */ 2413 /* The event is discarded regardless */
1738 rb_event_discard(event); 2414 rb_event_discard(event);
1739 2415
2416 cpu = smp_processor_id();
2417 cpu_buffer = buffer->buffers[cpu];
2418
1740 /* 2419 /*
1741 * This must only be called if the event has not been 2420 * This must only be called if the event has not been
1742 * committed yet. Thus we can assume that preemption 2421 * committed yet. Thus we can assume that preemption
1743 * is still disabled. 2422 * is still disabled.
1744 */ 2423 */
1745 RB_WARN_ON(buffer, preemptible()); 2424 RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing));
1746 2425
1747 cpu = smp_processor_id(); 2426 rb_decrement_entry(cpu_buffer, event);
1748 cpu_buffer = buffer->buffers[cpu]; 2427 if (rb_try_to_discard(cpu_buffer, event))
1749
1750 if (!rb_try_to_discard(cpu_buffer, event))
1751 goto out; 2428 goto out;
1752 2429
1753 /* 2430 /*
1754 * The commit is still visible by the reader, so we 2431 * The commit is still visible by the reader, so we
1755 * must increment entries. 2432 * must still update the timestamp.
1756 */ 2433 */
1757 local_inc(&cpu_buffer->entries); 2434 rb_update_write_stamp(cpu_buffer, event);
1758 out: 2435 out:
1759 /* 2436 rb_end_commit(cpu_buffer);
1760 * If a write came in and pushed the tail page
1761 * we still need to update the commit pointer
1762 * if we were the commit.
1763 */
1764 if (rb_is_commit(cpu_buffer, event))
1765 rb_set_commit_to_write(cpu_buffer);
1766 2437
1767 trace_recursive_unlock(); 2438 trace_recursive_unlock();
1768 2439
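For orientation, the expected call pattern is: reserve, fill in the payload, then either commit or discard from the same (still preemption-disabled) context. A hedged kernel-context sketch is below; struct my_entry and write_or_drop() are made up for illustration, while ring_buffer_lock_reserve(), ring_buffer_event_data(), ring_buffer_unlock_commit() and ring_buffer_discard_commit() are interfaces exported by this file.

#include <linux/ring_buffer.h>
#include <linux/types.h>

struct my_entry {                     /* hypothetical payload */
    int value;
};

/* Reserve a record, fill it, then keep or drop it. Reserve and
 * commit/discard must happen in the same context. */
static void write_or_drop(struct ring_buffer *buffer, int value, bool keep)
{
    struct ring_buffer_event *event;
    struct my_entry *entry;

    event = ring_buffer_lock_reserve(buffer, sizeof(*entry));
    if (!event)
        return;

    entry = ring_buffer_event_data(event);
    entry->value = value;

    if (keep)
        ring_buffer_unlock_commit(buffer, event);
    else
        /* Frees the space if nothing was written behind the
         * event, otherwise turns it into padding. */
        ring_buffer_discard_commit(buffer, event);
}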
@@ -1821,7 +2492,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
1821 if (length > BUF_MAX_DATA_SIZE) 2492 if (length > BUF_MAX_DATA_SIZE)
1822 goto out; 2493 goto out;
1823 2494
1824 event = rb_reserve_next_event(cpu_buffer, length); 2495 event = rb_reserve_next_event(buffer, cpu_buffer, length);
1825 if (!event) 2496 if (!event)
1826 goto out; 2497 goto out;
1827 2498
@@ -1842,9 +2513,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_write);
1842static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) 2513static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
1843{ 2514{
1844 struct buffer_page *reader = cpu_buffer->reader_page; 2515 struct buffer_page *reader = cpu_buffer->reader_page;
1845 struct buffer_page *head = cpu_buffer->head_page; 2516 struct buffer_page *head = rb_set_head_page(cpu_buffer);
1846 struct buffer_page *commit = cpu_buffer->commit_page; 2517 struct buffer_page *commit = cpu_buffer->commit_page;
1847 2518
2519 /* In case of error, head will be NULL */
2520 if (unlikely(!head))
2521 return 1;
2522
1848 return reader->read == rb_page_commit(reader) && 2523 return reader->read == rb_page_commit(reader) &&
1849 (commit == reader || 2524 (commit == reader ||
1850 (commit == head && 2525 (commit == head &&
@@ -1935,7 +2610,7 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
1935 return 0; 2610 return 0;
1936 2611
1937 cpu_buffer = buffer->buffers[cpu]; 2612 cpu_buffer = buffer->buffers[cpu];
1938 ret = (local_read(&cpu_buffer->entries) - cpu_buffer->overrun) 2613 ret = (local_read(&cpu_buffer->entries) - local_read(&cpu_buffer->overrun))
1939 - cpu_buffer->read; 2614 - cpu_buffer->read;
1940 2615
1941 return ret; 2616 return ret;
@@ -1956,33 +2631,13 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
1956 return 0; 2631 return 0;
1957 2632
1958 cpu_buffer = buffer->buffers[cpu]; 2633 cpu_buffer = buffer->buffers[cpu];
1959 ret = cpu_buffer->overrun; 2634 ret = local_read(&cpu_buffer->overrun);
1960 2635
1961 return ret; 2636 return ret;
1962} 2637}
1963EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); 2638EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
1964 2639
1965/** 2640/**
1966 * ring_buffer_nmi_dropped_cpu - get the number of nmis that were dropped
1967 * @buffer: The ring buffer
1968 * @cpu: The per CPU buffer to get the number of overruns from
1969 */
1970unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu)
1971{
1972 struct ring_buffer_per_cpu *cpu_buffer;
1973 unsigned long ret;
1974
1975 if (!cpumask_test_cpu(cpu, buffer->cpumask))
1976 return 0;
1977
1978 cpu_buffer = buffer->buffers[cpu];
1979 ret = cpu_buffer->nmi_dropped;
1980
1981 return ret;
1982}
1983EXPORT_SYMBOL_GPL(ring_buffer_nmi_dropped_cpu);
1984
1985/**
1986 * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits 2641 * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits
1987 * @buffer: The ring buffer 2642 * @buffer: The ring buffer
1988 * @cpu: The per CPU buffer to get the number of overruns from 2643 * @cpu: The per CPU buffer to get the number of overruns from
@@ -1997,7 +2652,7 @@ ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu)
1997 return 0; 2652 return 0;
1998 2653
1999 cpu_buffer = buffer->buffers[cpu]; 2654 cpu_buffer = buffer->buffers[cpu];
2000 ret = cpu_buffer->commit_overrun; 2655 ret = local_read(&cpu_buffer->commit_overrun);
2001 2656
2002 return ret; 2657 return ret;
2003} 2658}
@@ -2020,7 +2675,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
2020 for_each_buffer_cpu(buffer, cpu) { 2675 for_each_buffer_cpu(buffer, cpu) {
2021 cpu_buffer = buffer->buffers[cpu]; 2676 cpu_buffer = buffer->buffers[cpu];
2022 entries += (local_read(&cpu_buffer->entries) - 2677 entries += (local_read(&cpu_buffer->entries) -
2023 cpu_buffer->overrun) - cpu_buffer->read; 2678 local_read(&cpu_buffer->overrun)) - cpu_buffer->read;
2024 } 2679 }
2025 2680
2026 return entries; 2681 return entries;
@@ -2043,7 +2698,7 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
2043 /* if you care about this being correct, lock the buffer */ 2698 /* if you care about this being correct, lock the buffer */
2044 for_each_buffer_cpu(buffer, cpu) { 2699 for_each_buffer_cpu(buffer, cpu) {
2045 cpu_buffer = buffer->buffers[cpu]; 2700 cpu_buffer = buffer->buffers[cpu];
2046 overruns += cpu_buffer->overrun; 2701 overruns += local_read(&cpu_buffer->overrun);
2047 } 2702 }
2048 2703
2049 return overruns; 2704 return overruns;
@@ -2056,8 +2711,10 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
2056 2711
2057 /* Iterator usage is expected to have record disabled */ 2712 /* Iterator usage is expected to have record disabled */
2058 if (list_empty(&cpu_buffer->reader_page->list)) { 2713 if (list_empty(&cpu_buffer->reader_page->list)) {
2059 iter->head_page = cpu_buffer->head_page; 2714 iter->head_page = rb_set_head_page(cpu_buffer);
2060 iter->head = cpu_buffer->head_page->read; 2715 if (unlikely(!iter->head_page))
2716 return;
2717 iter->head = iter->head_page->read;
2061 } else { 2718 } else {
2062 iter->head_page = cpu_buffer->reader_page; 2719 iter->head_page = cpu_buffer->reader_page;
2063 iter->head = cpu_buffer->reader_page->read; 2720 iter->head = cpu_buffer->reader_page->read;
@@ -2174,6 +2831,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2174 struct buffer_page *reader = NULL; 2831 struct buffer_page *reader = NULL;
2175 unsigned long flags; 2832 unsigned long flags;
2176 int nr_loops = 0; 2833 int nr_loops = 0;
2834 int ret;
2177 2835
2178 local_irq_save(flags); 2836 local_irq_save(flags);
2179 __raw_spin_lock(&cpu_buffer->lock); 2837 __raw_spin_lock(&cpu_buffer->lock);
@@ -2207,30 +2865,56 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2207 goto out; 2865 goto out;
2208 2866
2209 /* 2867 /*
2210 * Splice the empty reader page into the list around the head.
2211 * Reset the reader page to size zero. 2868 * Reset the reader page to size zero.
2212 */ 2869 */
2870 local_set(&cpu_buffer->reader_page->write, 0);
2871 local_set(&cpu_buffer->reader_page->entries, 0);
2872 local_set(&cpu_buffer->reader_page->page->commit, 0);
2213 2873
2214 reader = cpu_buffer->head_page; 2874 spin:
2875 /*
2876 * Splice the empty reader page into the list around the head.
2877 */
2878 reader = rb_set_head_page(cpu_buffer);
2215 cpu_buffer->reader_page->list.next = reader->list.next; 2879 cpu_buffer->reader_page->list.next = reader->list.next;
2216 cpu_buffer->reader_page->list.prev = reader->list.prev; 2880 cpu_buffer->reader_page->list.prev = reader->list.prev;
2217 2881
2218 local_set(&cpu_buffer->reader_page->write, 0); 2882 /*
2219 local_set(&cpu_buffer->reader_page->entries, 0); 2883 * cpu_buffer->pages just needs to point to the buffer, it
 2220 local_set(&cpu_buffer->reader_page->page->commit, 0); 2884 * has no specific buffer page to point to. Let's move it out
 2885 * of our way so we don't accidentally swap it.
2886 */
2887 cpu_buffer->pages = reader->list.prev;
2221 2888
2222 /* Make the reader page now replace the head */ 2889 /* The reader page will be pointing to the new head */
2223 reader->list.prev->next = &cpu_buffer->reader_page->list; 2890 rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list);
2224 reader->list.next->prev = &cpu_buffer->reader_page->list; 2891
2892 /*
2893 * Here's the tricky part.
2894 *
2895 * We need to move the pointer past the header page.
2896 * But we can only do that if a writer is not currently
2897 * moving it. The page before the header page has the
2898 * flag bit '1' set if it is pointing to the page we want.
2899 * but if the writer is in the process of moving it
2900 * than it will be '2' or already moved '0'.
2901 */
2902
2903 ret = rb_head_page_replace(reader, cpu_buffer->reader_page);
2225 2904
2226 /* 2905 /*
2227 * If the tail is on the reader, then we must set the head 2906 * If we did not convert it, then we must try again.
2228 * to the inserted page, otherwise we set it one before.
2229 */ 2907 */
2230 cpu_buffer->head_page = cpu_buffer->reader_page; 2908 if (!ret)
2909 goto spin;
2231 2910
2232 if (cpu_buffer->commit_page != reader) 2911 /*
2233 rb_inc_page(cpu_buffer, &cpu_buffer->head_page); 2912 * Yeah! We succeeded in replacing the page.
2913 *
2914 * Now make the new head point back to the reader page.
2915 */
2916 reader->list.next->prev = &cpu_buffer->reader_page->list;
2917 rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
2234 2918
2235 /* Finally update the reader page to the new head */ 2919 /* Finally update the reader page to the new head */
2236 cpu_buffer->reader_page = reader; 2920 cpu_buffer->reader_page = reader;
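The flags referred to above live in the otherwise-zero low bits of the link that points at the head page (buffer pages are allocated cacheline aligned). A small userspace model of the "replace the head only if nobody is moving it" step, built on a C11 compare-and-swap over a tagged uintptr_t, is sketched here; the names and exact flag values are illustrative, not the kernel's.

#include <stdatomic.h>
#include <stdalign.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_HEAD   1UL     /* link currently designates the head page */
#define PAGE_UPDATE 2UL     /* a writer is in the middle of moving the head */

struct page {
    alignas(4) int id;      /* alignment keeps the two low bits free for flags */
};

/* The ->next link of the page just before the head, with flags in the low bits. */
static _Atomic uintptr_t next_tagged;

/* Install new_head in place of old_head, but only while the link still
 * carries PAGE_HEAD; fails if a writer tagged it PAGE_UPDATE or moved it. */
static int head_page_replace(struct page *old_head, struct page *new_head)
{
    uintptr_t expect = (uintptr_t)old_head | PAGE_HEAD;
    uintptr_t want   = (uintptr_t)new_head | PAGE_HEAD;

    return atomic_compare_exchange_strong(&next_tagged, &expect, want);
}

int main(void)
{
    static struct page a, b, reader;

    atomic_store(&next_tagged, (uintptr_t)&a | PAGE_HEAD);
    printf("replaced: %d\n", head_page_replace(&a, &reader));   /* 1: success */

    atomic_store(&next_tagged, (uintptr_t)&b | PAGE_UPDATE);    /* writer busy */
    printf("replaced: %d\n", head_page_replace(&b, &reader));   /* 0: retry */
    return 0;
}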
@@ -2259,8 +2943,7 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
2259 2943
2260 event = rb_reader_event(cpu_buffer); 2944 event = rb_reader_event(cpu_buffer);
2261 2945
2262 if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX 2946 if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
2263 || rb_discarded_event(event))
2264 cpu_buffer->read++; 2947 cpu_buffer->read++;
2265 2948
2266 rb_update_read_stamp(cpu_buffer, event); 2949 rb_update_read_stamp(cpu_buffer, event);
@@ -2351,7 +3034,6 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
2351 * the box. Return the padding, and we will release 3034 * the box. Return the padding, and we will release
2352 * the current locks, and try again. 3035 * the current locks, and try again.
2353 */ 3036 */
2354 rb_advance_reader(cpu_buffer);
2355 return event; 3037 return event;
2356 3038
2357 case RINGBUF_TYPE_TIME_EXTEND: 3039 case RINGBUF_TYPE_TIME_EXTEND:
@@ -2446,6 +3128,21 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
2446} 3128}
2447EXPORT_SYMBOL_GPL(ring_buffer_iter_peek); 3129EXPORT_SYMBOL_GPL(ring_buffer_iter_peek);
2448 3130
3131static inline int rb_ok_to_lock(void)
3132{
3133 /*
3134 * If an NMI die dumps out the content of the ring buffer
3135 * do not grab locks. We also permanently disable the ring
 3136 * buffer. A one-time deal is all you get from reading
3137 * the ring buffer from an NMI.
3138 */
3139 if (likely(!in_nmi()))
3140 return 1;
3141
3142 tracing_off_permanent();
3143 return 0;
3144}
3145
2449/** 3146/**
2450 * ring_buffer_peek - peek at the next event to be read 3147 * ring_buffer_peek - peek at the next event to be read
2451 * @buffer: The ring buffer to read 3148 * @buffer: The ring buffer to read
@@ -2461,19 +3158,25 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
2461 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; 3158 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
2462 struct ring_buffer_event *event; 3159 struct ring_buffer_event *event;
2463 unsigned long flags; 3160 unsigned long flags;
3161 int dolock;
2464 3162
2465 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 3163 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2466 return NULL; 3164 return NULL;
2467 3165
3166 dolock = rb_ok_to_lock();
2468 again: 3167 again:
2469 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3168 local_irq_save(flags);
3169 if (dolock)
3170 spin_lock(&cpu_buffer->reader_lock);
2470 event = rb_buffer_peek(buffer, cpu, ts); 3171 event = rb_buffer_peek(buffer, cpu, ts);
2471 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3172 if (event && event->type_len == RINGBUF_TYPE_PADDING)
3173 rb_advance_reader(cpu_buffer);
3174 if (dolock)
3175 spin_unlock(&cpu_buffer->reader_lock);
3176 local_irq_restore(flags);
2472 3177
2473 if (event && event->type_len == RINGBUF_TYPE_PADDING) { 3178 if (event && event->type_len == RINGBUF_TYPE_PADDING)
2474 cpu_relax();
2475 goto again; 3179 goto again;
2476 }
2477 3180
2478 return event; 3181 return event;
2479} 3182}
@@ -2498,10 +3201,8 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
2498 event = rb_iter_peek(iter, ts); 3201 event = rb_iter_peek(iter, ts);
2499 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3202 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2500 3203
2501 if (event && event->type_len == RINGBUF_TYPE_PADDING) { 3204 if (event && event->type_len == RINGBUF_TYPE_PADDING)
2502 cpu_relax();
2503 goto again; 3205 goto again;
2504 }
2505 3206
2506 return event; 3207 return event;
2507} 3208}
@@ -2520,6 +3221,9 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
2520 struct ring_buffer_per_cpu *cpu_buffer; 3221 struct ring_buffer_per_cpu *cpu_buffer;
2521 struct ring_buffer_event *event = NULL; 3222 struct ring_buffer_event *event = NULL;
2522 unsigned long flags; 3223 unsigned long flags;
3224 int dolock;
3225
3226 dolock = rb_ok_to_lock();
2523 3227
2524 again: 3228 again:
2525 /* might be called in atomic */ 3229 /* might be called in atomic */
@@ -2529,24 +3233,23 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
2529 goto out; 3233 goto out;
2530 3234
2531 cpu_buffer = buffer->buffers[cpu]; 3235 cpu_buffer = buffer->buffers[cpu];
2532 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3236 local_irq_save(flags);
3237 if (dolock)
3238 spin_lock(&cpu_buffer->reader_lock);
2533 3239
2534 event = rb_buffer_peek(buffer, cpu, ts); 3240 event = rb_buffer_peek(buffer, cpu, ts);
2535 if (!event) 3241 if (event)
2536 goto out_unlock; 3242 rb_advance_reader(cpu_buffer);
2537
2538 rb_advance_reader(cpu_buffer);
2539 3243
2540 out_unlock: 3244 if (dolock)
2541 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3245 spin_unlock(&cpu_buffer->reader_lock);
3246 local_irq_restore(flags);
2542 3247
2543 out: 3248 out:
2544 preempt_enable(); 3249 preempt_enable();
2545 3250
2546 if (event && event->type_len == RINGBUF_TYPE_PADDING) { 3251 if (event && event->type_len == RINGBUF_TYPE_PADDING)
2547 cpu_relax();
2548 goto again; 3252 goto again;
2549 }
2550 3253
2551 return event; 3254 return event;
2552} 3255}
@@ -2626,21 +3329,19 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
2626 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; 3329 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
2627 unsigned long flags; 3330 unsigned long flags;
2628 3331
2629 again:
2630 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3332 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3333 again:
2631 event = rb_iter_peek(iter, ts); 3334 event = rb_iter_peek(iter, ts);
2632 if (!event) 3335 if (!event)
2633 goto out; 3336 goto out;
2634 3337
3338 if (event->type_len == RINGBUF_TYPE_PADDING)
3339 goto again;
3340
2635 rb_advance_iter(iter); 3341 rb_advance_iter(iter);
2636 out: 3342 out:
2637 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3343 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2638 3344
2639 if (event && event->type_len == RINGBUF_TYPE_PADDING) {
2640 cpu_relax();
2641 goto again;
2642 }
2643
2644 return event; 3345 return event;
2645} 3346}
2646EXPORT_SYMBOL_GPL(ring_buffer_read); 3347EXPORT_SYMBOL_GPL(ring_buffer_read);
@@ -2658,8 +3359,10 @@ EXPORT_SYMBOL_GPL(ring_buffer_size);
2658static void 3359static void
2659rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) 3360rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
2660{ 3361{
3362 rb_head_page_deactivate(cpu_buffer);
3363
2661 cpu_buffer->head_page 3364 cpu_buffer->head_page
2662 = list_entry(cpu_buffer->pages.next, struct buffer_page, list); 3365 = list_entry(cpu_buffer->pages, struct buffer_page, list);
2663 local_set(&cpu_buffer->head_page->write, 0); 3366 local_set(&cpu_buffer->head_page->write, 0);
2664 local_set(&cpu_buffer->head_page->entries, 0); 3367 local_set(&cpu_buffer->head_page->entries, 0);
2665 local_set(&cpu_buffer->head_page->page->commit, 0); 3368 local_set(&cpu_buffer->head_page->page->commit, 0);
@@ -2675,14 +3378,17 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
2675 local_set(&cpu_buffer->reader_page->page->commit, 0); 3378 local_set(&cpu_buffer->reader_page->page->commit, 0);
2676 cpu_buffer->reader_page->read = 0; 3379 cpu_buffer->reader_page->read = 0;
2677 3380
2678 cpu_buffer->nmi_dropped = 0; 3381 local_set(&cpu_buffer->commit_overrun, 0);
2679 cpu_buffer->commit_overrun = 0; 3382 local_set(&cpu_buffer->overrun, 0);
2680 cpu_buffer->overrun = 0;
2681 cpu_buffer->read = 0;
2682 local_set(&cpu_buffer->entries, 0); 3383 local_set(&cpu_buffer->entries, 0);
3384 local_set(&cpu_buffer->committing, 0);
3385 local_set(&cpu_buffer->commits, 0);
3386 cpu_buffer->read = 0;
2683 3387
2684 cpu_buffer->write_stamp = 0; 3388 cpu_buffer->write_stamp = 0;
2685 cpu_buffer->read_stamp = 0; 3389 cpu_buffer->read_stamp = 0;
3390
3391 rb_head_page_activate(cpu_buffer);
2686} 3392}
2687 3393
2688/** 3394/**
@@ -2702,12 +3408,16 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
2702 3408
2703 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3409 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2704 3410
3411 if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
3412 goto out;
3413
2705 __raw_spin_lock(&cpu_buffer->lock); 3414 __raw_spin_lock(&cpu_buffer->lock);
2706 3415
2707 rb_reset_cpu(cpu_buffer); 3416 rb_reset_cpu(cpu_buffer);
2708 3417
2709 __raw_spin_unlock(&cpu_buffer->lock); 3418 __raw_spin_unlock(&cpu_buffer->lock);
2710 3419
3420 out:
2711 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3421 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2712 3422
2713 atomic_dec(&cpu_buffer->record_disabled); 3423 atomic_dec(&cpu_buffer->record_disabled);
@@ -2734,12 +3444,25 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset);
2734int ring_buffer_empty(struct ring_buffer *buffer) 3444int ring_buffer_empty(struct ring_buffer *buffer)
2735{ 3445{
2736 struct ring_buffer_per_cpu *cpu_buffer; 3446 struct ring_buffer_per_cpu *cpu_buffer;
3447 unsigned long flags;
3448 int dolock;
2737 int cpu; 3449 int cpu;
3450 int ret;
3451
3452 dolock = rb_ok_to_lock();
2738 3453
2739 /* yes this is racy, but if you don't like the race, lock the buffer */ 3454 /* yes this is racy, but if you don't like the race, lock the buffer */
2740 for_each_buffer_cpu(buffer, cpu) { 3455 for_each_buffer_cpu(buffer, cpu) {
2741 cpu_buffer = buffer->buffers[cpu]; 3456 cpu_buffer = buffer->buffers[cpu];
2742 if (!rb_per_cpu_empty(cpu_buffer)) 3457 local_irq_save(flags);
3458 if (dolock)
3459 spin_lock(&cpu_buffer->reader_lock);
3460 ret = rb_per_cpu_empty(cpu_buffer);
3461 if (dolock)
3462 spin_unlock(&cpu_buffer->reader_lock);
3463 local_irq_restore(flags);
3464
3465 if (!ret)
2743 return 0; 3466 return 0;
2744 } 3467 }
2745 3468
@@ -2755,19 +3478,29 @@ EXPORT_SYMBOL_GPL(ring_buffer_empty);
2755int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) 3478int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
2756{ 3479{
2757 struct ring_buffer_per_cpu *cpu_buffer; 3480 struct ring_buffer_per_cpu *cpu_buffer;
3481 unsigned long flags;
3482 int dolock;
2758 int ret; 3483 int ret;
2759 3484
2760 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 3485 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2761 return 1; 3486 return 1;
2762 3487
3488 dolock = rb_ok_to_lock();
3489
2763 cpu_buffer = buffer->buffers[cpu]; 3490 cpu_buffer = buffer->buffers[cpu];
3491 local_irq_save(flags);
3492 if (dolock)
3493 spin_lock(&cpu_buffer->reader_lock);
2764 ret = rb_per_cpu_empty(cpu_buffer); 3494 ret = rb_per_cpu_empty(cpu_buffer);
2765 3495 if (dolock)
3496 spin_unlock(&cpu_buffer->reader_lock);
3497 local_irq_restore(flags);
2766 3498
2767 return ret; 3499 return ret;
2768} 3500}
2769EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu); 3501EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu);
2770 3502
3503#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
2771/** 3504/**
2772 * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers 3505 * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers
2773 * @buffer_a: One buffer to swap with 3506 * @buffer_a: One buffer to swap with
@@ -2822,20 +3555,28 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
2822 atomic_inc(&cpu_buffer_a->record_disabled); 3555 atomic_inc(&cpu_buffer_a->record_disabled);
2823 atomic_inc(&cpu_buffer_b->record_disabled); 3556 atomic_inc(&cpu_buffer_b->record_disabled);
2824 3557
3558 ret = -EBUSY;
3559 if (local_read(&cpu_buffer_a->committing))
3560 goto out_dec;
3561 if (local_read(&cpu_buffer_b->committing))
3562 goto out_dec;
3563
2825 buffer_a->buffers[cpu] = cpu_buffer_b; 3564 buffer_a->buffers[cpu] = cpu_buffer_b;
2826 buffer_b->buffers[cpu] = cpu_buffer_a; 3565 buffer_b->buffers[cpu] = cpu_buffer_a;
2827 3566
2828 cpu_buffer_b->buffer = buffer_a; 3567 cpu_buffer_b->buffer = buffer_a;
2829 cpu_buffer_a->buffer = buffer_b; 3568 cpu_buffer_a->buffer = buffer_b;
2830 3569
3570 ret = 0;
3571
3572out_dec:
2831 atomic_dec(&cpu_buffer_a->record_disabled); 3573 atomic_dec(&cpu_buffer_a->record_disabled);
2832 atomic_dec(&cpu_buffer_b->record_disabled); 3574 atomic_dec(&cpu_buffer_b->record_disabled);
2833
2834 ret = 0;
2835out: 3575out:
2836 return ret; 3576 return ret;
2837} 3577}
2838EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); 3578EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
3579#endif /* CONFIG_RING_BUFFER_ALLOW_SWAP */
2839 3580
2840/** 3581/**
2841 * ring_buffer_alloc_read_page - allocate a page to read from buffer 3582 * ring_buffer_alloc_read_page - allocate a page to read from buffer
@@ -3008,7 +3749,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3008 read = 0; 3749 read = 0;
3009 } else { 3750 } else {
3010 /* update the entry counter */ 3751 /* update the entry counter */
3011 cpu_buffer->read += local_read(&reader->entries); 3752 cpu_buffer->read += rb_page_entries(reader);
3012 3753
3013 /* swap the pages */ 3754 /* swap the pages */
3014 rb_init_page(bpage); 3755 rb_init_page(bpage);
@@ -3029,6 +3770,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3029} 3770}
3030EXPORT_SYMBOL_GPL(ring_buffer_read_page); 3771EXPORT_SYMBOL_GPL(ring_buffer_read_page);
3031 3772
3773#ifdef CONFIG_TRACING
3032static ssize_t 3774static ssize_t
3033rb_simple_read(struct file *filp, char __user *ubuf, 3775rb_simple_read(struct file *filp, char __user *ubuf,
3034 size_t cnt, loff_t *ppos) 3776 size_t cnt, loff_t *ppos)
@@ -3096,6 +3838,7 @@ static __init int rb_init_debugfs(void)
3096} 3838}
3097 3839
3098fs_initcall(rb_init_debugfs); 3840fs_initcall(rb_init_debugfs);
3841#endif
3099 3842
3100#ifdef CONFIG_HOTPLUG_CPU 3843#ifdef CONFIG_HOTPLUG_CPU
3101static int rb_cpu_notify(struct notifier_block *self, 3844static int rb_cpu_notify(struct notifier_block *self,
@@ -3108,7 +3851,7 @@ static int rb_cpu_notify(struct notifier_block *self,
3108 switch (action) { 3851 switch (action) {
3109 case CPU_UP_PREPARE: 3852 case CPU_UP_PREPARE:
3110 case CPU_UP_PREPARE_FROZEN: 3853 case CPU_UP_PREPARE_FROZEN:
3111 if (cpu_isset(cpu, *buffer->cpumask)) 3854 if (cpumask_test_cpu(cpu, buffer->cpumask))
3112 return NOTIFY_OK; 3855 return NOTIFY_OK;
3113 3856
3114 buffer->buffers[cpu] = 3857 buffer->buffers[cpu] =
@@ -3119,7 +3862,7 @@ static int rb_cpu_notify(struct notifier_block *self,
3119 return NOTIFY_OK; 3862 return NOTIFY_OK;
3120 } 3863 }
3121 smp_wmb(); 3864 smp_wmb();
3122 cpu_set(cpu, *buffer->cpumask); 3865 cpumask_set_cpu(cpu, buffer->cpumask);
3123 break; 3866 break;
3124 case CPU_DOWN_PREPARE: 3867 case CPU_DOWN_PREPARE:
3125 case CPU_DOWN_PREPARE_FROZEN: 3868 case CPU_DOWN_PREPARE_FROZEN: