author     Steven Rostedt <srostedt@redhat.com>    2009-03-27 11:00:29 -0400
committer  Steven Rostedt <rostedt@goodmis.org>    2009-07-07 18:36:12 -0400
commit     77ae365eca895061c8bf2b2e3ae1d9ea62869739
tree       fc808698a4f2869b45ef4e8c958cbab183a7ad37 /kernel/trace
parent     3adc54fa82a68be1cd1ac82ad786ee362796e50a
ring-buffer: make lockless
This patch converts the ring buffers into a completely lockless buffer recording system. The read side still takes locks since we still serialize readers. But the writers are the ones that must be lockless (those can happen in NMIs).

The main change is to the "head_page" pointer. We write to the tail, and read from the head. The "head_page" pointer in the cpu buffer is now just a reference to where to look. The real head page is now kept in the head_page->list->prev->next pointer. That is, in the list head of the previous page we set flags.

The list pages are allocated to be aligned such that the least significant bits of a pointer to them are always zero. This gives us room to put flags in those pointers:

 bit 0: set when the page is a head page
 bit 1: set when the writer is moving the page (for overwrite mode)

cmpxchg is used to update the pointer.

When the writer wraps the buffer and the tail meets the head, in overwrite mode, the writer must move the head page forward. It first uses cmpxchg to change the pointer flag from 1 to 2. Once this is done, the reader on another CPU will not take the page from the buffer.

The writers need to protect against interrupts (we don't bother with disabling interrupts because NMIs are allowed to write too). After the writer sets the pointer flag to 2, it takes care to manage interrupts coming in. This is described in detail within the comments of the code.

Changes in version 2:

 - Let the reader reset the entries value of the header page.
 - Fix the tail page passing the commit page in the reader-page test.
 - Always increment the entries and write counters in rb_tail_page_update.
 - Add a safety check in rb_set_commit_to_write to break out of an infinite loop.
 - Add a mask in rb_is_reader_page.

[ Impact: lock free writing to the ring buffer ]

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
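
As an illustration of the scheme described above (HEAD/UPDATE flags kept in the low bits of an aligned list pointer and changed with cmpxchg), here is a minimal standalone C11 sketch. It is not the kernel code: the three-page ring and the writer_claim_head() helper are invented for the example; only the flag values and the compare-and-swap idea mirror the patch.

/* Sketch: HEAD/UPDATE flags stored in the low bits of an aligned pointer.
 * Standalone C11 model, not the kernel implementation. */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_NORMAL 0UL   /* plain next pointer          */
#define PAGE_HEAD   1UL   /* next page is the head page  */
#define PAGE_UPDATE 2UL   /* writer is moving the head   */
#define FLAG_MASK   3UL

/* 64-byte alignment keeps the low bits of every page address zero,
 * so they are free to hold the HEAD/UPDATE flags. */
struct page {
    _Alignas(64) _Atomic uintptr_t next; /* next page, low 2 bits = flags */
    int id;
};

static struct page ring[3];

static struct page *next_page(struct page *p)
{
    return (struct page *)(atomic_load(&p->next) & ~FLAG_MASK);
}

static unsigned long next_flags(struct page *p)
{
    return (unsigned long)(atomic_load(&p->next) & FLAG_MASK);
}

/* Writer: flip HEAD -> UPDATE on the pointer in the previous page, so a
 * concurrent reader can no longer swap the head page out from under us. */
static int writer_claim_head(struct page *prev)
{
    uintptr_t base = atomic_load(&prev->next) & ~FLAG_MASK;
    uintptr_t expected = base | PAGE_HEAD;

    return atomic_compare_exchange_strong(&prev->next, &expected,
                                          base | PAGE_UPDATE);
}

int main(void)
{
    /* ring 0 -> 1 -> 2 -> 0, page 1 is the head; the flag lives in
     * ring[0].next because ring[0] is the page before the head. */
    ring[0].id = 0; ring[1].id = 1; ring[2].id = 2;
    atomic_store(&ring[0].next, (uintptr_t)&ring[1] | PAGE_HEAD);
    atomic_store(&ring[1].next, (uintptr_t)&ring[2] | PAGE_NORMAL);
    atomic_store(&ring[2].next, (uintptr_t)&ring[0] | PAGE_NORMAL);

    printf("head page: %d, flag on prev: %lu\n",
           next_page(&ring[0])->id, next_flags(&ring[0]));

    if (writer_claim_head(&ring[0]))
        printf("writer claimed head, flag on prev: %lu\n",
               next_flags(&ring[0]));
    return 0;
}
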
Diffstat (limited to 'kernel/trace')
-rw-r--r--  kernel/trace/ring_buffer.c  886
-rw-r--r--  kernel/trace/trace.c          3
2 files changed, 738 insertions, 151 deletions
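
The diff below also splits each buffer page's write and entries fields into a 20-bit index (RB_WRITE_MASK) plus an updater count in the upper bits (RB_WRITE_INTCNT), so rb_tail_page_update() can zero the index with a cmpxchg that fails harmlessly if an interrupt reserved space in between. The following standalone C11 sketch models that split under the same 20-bit mask; reserve() and tail_move() are illustrative names, not kernel functions.

/* Sketch of the split write counter used in rb_tail_page_update():
 * low 20 bits = write index, upper bits = nested-updater count.
 * Standalone C11 model, not the kernel code. */
#include <stdatomic.h>
#include <stdio.h>

#define RB_WRITE_MASK   0xfffffUL
#define RB_WRITE_INTCNT (1UL << 20)

static _Atomic unsigned long page_write;

/* A writer that lands on this page just adds its length; the updater
 * count in the upper bits is untouched. */
static unsigned long reserve(unsigned long length)
{
    return atomic_fetch_add(&page_write, length) + length;
}

/* Moving the tail onto this page: bump the updater count first, then try
 * to zero the index.  If an interrupt reserved space in between, the
 * compare-exchange fails and its data is deliberately left alone. */
static void tail_move(void)
{
    unsigned long old = atomic_fetch_add(&page_write, RB_WRITE_INTCNT)
                        + RB_WRITE_INTCNT;
    unsigned long val = old & ~RB_WRITE_MASK;  /* keep count, zero index */

    if (atomic_compare_exchange_strong(&page_write, &old, val))
        printf("reset index, counter now %#lx\n", val);
    else
        printf("interrupted, index left as is (%#lx)\n",
               atomic_load(&page_write));
}

int main(void)
{
    printf("write index after reserve: %lu\n",
           reserve(24) & RB_WRITE_MASK);
    tail_move();
    printf("write index after tail move: %lu\n",
           atomic_load(&page_write) & RB_WRITE_MASK);
    return 0;
}
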
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 7c0168ad6d51..e648ba4f70e0 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -322,6 +322,14 @@ struct buffer_data_page {
322 unsigned char data[]; /* data of buffer page */ 322 unsigned char data[]; /* data of buffer page */
323}; 323};
324 324
325/*
326 * Note, the buffer_page list must be first. The buffer pages
327 * are allocated in cache lines, which means that each buffer
328 * page will be at the beginning of a cache line, and thus
329 * the least significant bits will be zero. We use this to
330 * add flags in the list struct pointers, to make the ring buffer
331 * lockless.
332 */
325struct buffer_page { 333struct buffer_page {
326 struct list_head list; /* list of buffer pages */ 334 struct list_head list; /* list of buffer pages */
327 local_t write; /* index for next write */ 335 local_t write; /* index for next write */
@@ -330,6 +338,21 @@ struct buffer_page {
330 struct buffer_data_page *page; /* Actual data page */ 338 struct buffer_data_page *page; /* Actual data page */
331}; 339};
332 340
341/*
342 * The buffer page counters, write and entries, must be reset
343 * atomically when crossing page boundaries. To synchronize this
344 * update, two counters are inserted into the number. One is
345 * the actual counter for the write position or count on the page.
346 *
347 * The other is a counter of updaters. Before an update happens
348 * the update partition of the counter is incremented. This will
349 * allow the updater to update the counter atomically.
350 *
351 * The counter is 20 bits, and the state data is 12.
352 */
353#define RB_WRITE_MASK 0xfffff
354#define RB_WRITE_INTCNT (1 << 20)
355
333static void rb_init_page(struct buffer_data_page *bpage) 356static void rb_init_page(struct buffer_data_page *bpage)
334{ 357{
335 local_set(&bpage->commit, 0); 358 local_set(&bpage->commit, 0);
@@ -403,7 +426,7 @@ int ring_buffer_print_page_header(struct trace_seq *s)
403struct ring_buffer_per_cpu { 426struct ring_buffer_per_cpu {
404 int cpu; 427 int cpu;
405 struct ring_buffer *buffer; 428 struct ring_buffer *buffer;
406 spinlock_t reader_lock; /* serialize readers */ 429 spinlock_t reader_lock; /* serialize readers */
407 raw_spinlock_t lock; 430 raw_spinlock_t lock;
408 struct lock_class_key lock_key; 431 struct lock_class_key lock_key;
409 struct list_head *pages; 432 struct list_head *pages;
@@ -411,13 +434,12 @@ struct ring_buffer_per_cpu {
411 struct buffer_page *tail_page; /* write to tail */ 434 struct buffer_page *tail_page; /* write to tail */
412 struct buffer_page *commit_page; /* committed pages */ 435 struct buffer_page *commit_page; /* committed pages */
413 struct buffer_page *reader_page; 436 struct buffer_page *reader_page;
414 unsigned long nmi_dropped; 437 local_t commit_overrun;
415 unsigned long commit_overrun; 438 local_t overrun;
416 unsigned long overrun;
417 unsigned long read;
418 local_t entries; 439 local_t entries;
419 local_t committing; 440 local_t committing;
420 local_t commits; 441 local_t commits;
442 unsigned long read;
421 u64 write_stamp; 443 u64 write_stamp;
422 u64 read_stamp; 444 u64 read_stamp;
423 atomic_t record_disabled; 445 atomic_t record_disabled;
@@ -489,6 +511,385 @@ void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer,
489} 511}
490EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); 512EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
491 513
514/*
515 * Making the ring buffer lockless makes things tricky.
516 * Although writes only happen on the CPU that they are on,
517 * and they only need to worry about interrupts. Reads can
518 * happen on any CPU.
519 *
520 * The reader page is always off the ring buffer, but when the
521 * reader finishes with a page, it needs to swap its page with
522 * a new one from the buffer. The reader needs to take from
523 * the head (writes go to the tail). But if a writer is in overwrite
524 * mode and wraps, it must push the head page forward.
525 *
526 * Here lies the problem.
527 *
528 * The reader must be careful to replace only the head page, and
529 * not another one. As described at the top of the file in the
530 * ASCII art, the reader sets its old page to point to the next
531 * page after head. It then sets the page after head to point to
532 * the old reader page. But if the writer moves the head page
533 * during this operation, the reader could end up with the tail.
534 *
535 * We use cmpxchg to help prevent this race. We also do something
536 * special with the page before head. We set the LSB to 1.
537 *
538 * When the writer must push the page forward, it will clear the
539 * bit that points to the head page, move the head, and then set
540 * the bit that points to the new head page.
541 *
542 * We also don't want an interrupt coming in and moving the head
543 * page on another writer. Thus we use the second LSB to catch
544 * that too. Thus:
545 *
546 * head->list->prev->next bit 1 bit 0
547 * ------- -------
548 * Normal page 0 0
549 * Points to head page 0 1
550 * New head page 1 0
551 *
552 * Note we can not trust the prev pointer of the head page, because:
553 *
554 * +----+ +-----+ +-----+
555 * | |------>| T |---X--->| N |
556 * | |<------| | | |
557 * +----+ +-----+ +-----+
558 * ^ ^ |
559 * | +-----+ | |
560 * +----------| R |----------+ |
561 * | |<-----------+
562 * +-----+
563 *
564 * Key: ---X--> HEAD flag set in pointer
565 * T Tail page
566 * R Reader page
567 * N Next page
568 *
569 * (see __rb_reserve_next() to see where this happens)
570 *
571 * What the above shows is that the reader just swapped out
572 * the reader page with a page in the buffer, but before it
573 * could make the new header point back to the new page added
574 * it was preempted by a writer. The writer moved forward onto
575 * the new page added by the reader and is about to move forward
576 * again.
577 *
578 * You can see, it is legitimate for the previous pointer of
579 * the head (or any page) not to point back to itself. But only
 580 * temporarily.
581 */
582
583#define RB_PAGE_NORMAL 0UL
584#define RB_PAGE_HEAD 1UL
585#define RB_PAGE_UPDATE 2UL
586
587
588#define RB_FLAG_MASK 3UL
589
590/* PAGE_MOVED is not part of the mask */
591#define RB_PAGE_MOVED 4UL
592
593/*
594 * rb_list_head - remove any bit
595 */
596static struct list_head *rb_list_head(struct list_head *list)
597{
598 unsigned long val = (unsigned long)list;
599
600 return (struct list_head *)(val & ~RB_FLAG_MASK);
601}
602
603/*
 604 * rb_is_head_page - test if the given page is the head page
605 *
606 * Because the reader may move the head_page pointer, we can
607 * not trust what the head page is (it may be pointing to
608 * the reader page). But if the next page is a header page,
609 * its flags will be non zero.
610 */
611static int inline
612rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer,
613 struct buffer_page *page, struct list_head *list)
614{
615 unsigned long val;
616
617 val = (unsigned long)list->next;
618
619 if ((val & ~RB_FLAG_MASK) != (unsigned long)&page->list)
620 return RB_PAGE_MOVED;
621
622 return val & RB_FLAG_MASK;
623}
624
625/*
626 * rb_is_reader_page
627 *
628 * The unique thing about the reader page, is that, if the
629 * writer is ever on it, the previous pointer never points
630 * back to the reader page.
631 */
632static int rb_is_reader_page(struct buffer_page *page)
633{
634 struct list_head *list = page->list.prev;
635
636 return rb_list_head(list->next) != &page->list;
637}
638
639/*
640 * rb_set_list_to_head - set a list_head to be pointing to head.
641 */
642static void rb_set_list_to_head(struct ring_buffer_per_cpu *cpu_buffer,
643 struct list_head *list)
644{
645 unsigned long *ptr;
646
647 ptr = (unsigned long *)&list->next;
648 *ptr |= RB_PAGE_HEAD;
649 *ptr &= ~RB_PAGE_UPDATE;
650}
651
652/*
653 * rb_head_page_activate - sets up head page
654 */
655static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer)
656{
657 struct buffer_page *head;
658
659 head = cpu_buffer->head_page;
660 if (!head)
661 return;
662
663 /*
664 * Set the previous list pointer to have the HEAD flag.
665 */
666 rb_set_list_to_head(cpu_buffer, head->list.prev);
667}
668
669static void rb_list_head_clear(struct list_head *list)
670{
671 unsigned long *ptr = (unsigned long *)&list->next;
672
673 *ptr &= ~RB_FLAG_MASK;
674}
675
676/*
 677 * rb_head_page_deactivate - clears head page ptr (for free list)
678 */
679static void
680rb_head_page_deactivate(struct ring_buffer_per_cpu *cpu_buffer)
681{
682 struct list_head *hd;
683
684 /* Go through the whole list and clear any pointers found. */
685 rb_list_head_clear(cpu_buffer->pages);
686
687 list_for_each(hd, cpu_buffer->pages)
688 rb_list_head_clear(hd);
689}
690
691static int rb_head_page_set(struct ring_buffer_per_cpu *cpu_buffer,
692 struct buffer_page *head,
693 struct buffer_page *prev,
694 int old_flag, int new_flag)
695{
696 struct list_head *list;
697 unsigned long val = (unsigned long)&head->list;
698 unsigned long ret;
699
700 list = &prev->list;
701
702 val &= ~RB_FLAG_MASK;
703
704 ret = (unsigned long)cmpxchg(&list->next,
705 val | old_flag, val | new_flag);
706
707 /* check if the reader took the page */
708 if ((ret & ~RB_FLAG_MASK) != val)
709 return RB_PAGE_MOVED;
710
711 return ret & RB_FLAG_MASK;
712}
713
714static int rb_head_page_set_update(struct ring_buffer_per_cpu *cpu_buffer,
715 struct buffer_page *head,
716 struct buffer_page *prev,
717 int old_flag)
718{
719 return rb_head_page_set(cpu_buffer, head, prev,
720 old_flag, RB_PAGE_UPDATE);
721}
722
723static int rb_head_page_set_head(struct ring_buffer_per_cpu *cpu_buffer,
724 struct buffer_page *head,
725 struct buffer_page *prev,
726 int old_flag)
727{
728 return rb_head_page_set(cpu_buffer, head, prev,
729 old_flag, RB_PAGE_HEAD);
730}
731
732static int rb_head_page_set_normal(struct ring_buffer_per_cpu *cpu_buffer,
733 struct buffer_page *head,
734 struct buffer_page *prev,
735 int old_flag)
736{
737 return rb_head_page_set(cpu_buffer, head, prev,
738 old_flag, RB_PAGE_NORMAL);
739}
740
741static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
742 struct buffer_page **bpage)
743{
744 struct list_head *p = rb_list_head((*bpage)->list.next);
745
746 *bpage = list_entry(p, struct buffer_page, list);
747}
748
749static struct buffer_page *
750rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer)
751{
752 struct buffer_page *head;
753 struct buffer_page *page;
754 struct list_head *list;
755 int i;
756
757 if (RB_WARN_ON(cpu_buffer, !cpu_buffer->head_page))
758 return NULL;
759
760 /* sanity check */
761 list = cpu_buffer->pages;
762 if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev->next) != list))
763 return NULL;
764
765 page = head = cpu_buffer->head_page;
766 /*
767 * It is possible that the writer moves the header behind
768 * where we started, and we miss in one loop.
769 * A second loop should grab the header, but we'll do
770 * three loops just because I'm paranoid.
771 */
772 for (i = 0; i < 3; i++) {
773 do {
774 if (rb_is_head_page(cpu_buffer, page, page->list.prev)) {
775 cpu_buffer->head_page = page;
776 return page;
777 }
778 rb_inc_page(cpu_buffer, &page);
779 } while (page != head);
780 }
781
782 RB_WARN_ON(cpu_buffer, 1);
783
784 return NULL;
785}
786
787static int rb_head_page_replace(struct buffer_page *old,
788 struct buffer_page *new)
789{
790 unsigned long *ptr = (unsigned long *)&old->list.prev->next;
791 unsigned long val;
792 unsigned long ret;
793
794 val = *ptr & ~RB_FLAG_MASK;
795 val |= RB_PAGE_HEAD;
796
797 ret = cmpxchg(ptr, val, &new->list);
798
799 return ret == val;
800}
801
802/*
803 * rb_tail_page_update - move the tail page forward
804 *
805 * Returns 1 if moved tail page, 0 if someone else did.
806 */
807static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
808 struct buffer_page *tail_page,
809 struct buffer_page *next_page)
810{
811 struct buffer_page *old_tail;
812 unsigned long old_entries;
813 unsigned long old_write;
814 int ret = 0;
815
816 /*
817 * The tail page now needs to be moved forward.
818 *
819 * We need to reset the tail page, but without messing
820 * with possible erasing of data brought in by interrupts
821 * that have moved the tail page and are currently on it.
822 *
823 * We add a counter to the write field to denote this.
824 */
825 old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write);
826 old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries);
827
828 /*
829 * Just make sure we have seen our old_write and synchronize
830 * with any interrupts that come in.
831 */
832 barrier();
833
834 /*
835 * If the tail page is still the same as what we think
836 * it is, then it is up to us to update the tail
837 * pointer.
838 */
839 if (tail_page == cpu_buffer->tail_page) {
840 /* Zero the write counter */
841 unsigned long val = old_write & ~RB_WRITE_MASK;
842 unsigned long eval = old_entries & ~RB_WRITE_MASK;
843
844 /*
845 * This will only succeed if an interrupt did
846 * not come in and change it. In which case, we
847 * do not want to modify it.
848 */
849 local_cmpxchg(&next_page->write, old_write, val);
850 local_cmpxchg(&next_page->entries, old_entries, eval);
851
852 /*
853 * No need to worry about races with clearing out the commit.
854 * it only can increment when a commit takes place. But that
855 * only happens in the outer most nested commit.
856 */
857 local_set(&next_page->page->commit, 0);
858
859 old_tail = cmpxchg(&cpu_buffer->tail_page,
860 tail_page, next_page);
861
862 if (old_tail == tail_page)
863 ret = 1;
864 }
865
866 return ret;
867}
868
869static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
870 struct buffer_page *bpage)
871{
872 unsigned long val = (unsigned long)bpage;
873
874 if (RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK))
875 return 1;
876
877 return 0;
878}
879
880/**
881 * rb_check_list - make sure a pointer to a list has the last bits zero
882 */
883static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer,
884 struct list_head *list)
885{
886 if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev) != list->prev))
887 return 1;
888 if (RB_WARN_ON(cpu_buffer, rb_list_head(list->next) != list->next))
889 return 1;
890 return 0;
891}
892
492/** 893/**
493 * check_pages - integrity check of buffer pages 894 * check_pages - integrity check of buffer pages
494 * @cpu_buffer: CPU buffer with pages to test 895 * @cpu_buffer: CPU buffer with pages to test
@@ -501,11 +902,16 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
501 struct list_head *head = cpu_buffer->pages; 902 struct list_head *head = cpu_buffer->pages;
502 struct buffer_page *bpage, *tmp; 903 struct buffer_page *bpage, *tmp;
503 904
905 rb_head_page_deactivate(cpu_buffer);
906
504 if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) 907 if (RB_WARN_ON(cpu_buffer, head->next->prev != head))
505 return -1; 908 return -1;
506 if (RB_WARN_ON(cpu_buffer, head->prev->next != head)) 909 if (RB_WARN_ON(cpu_buffer, head->prev->next != head))
507 return -1; 910 return -1;
508 911
912 if (rb_check_list(cpu_buffer, head))
913 return -1;
914
509 list_for_each_entry_safe(bpage, tmp, head, list) { 915 list_for_each_entry_safe(bpage, tmp, head, list) {
510 if (RB_WARN_ON(cpu_buffer, 916 if (RB_WARN_ON(cpu_buffer,
511 bpage->list.next->prev != &bpage->list)) 917 bpage->list.next->prev != &bpage->list))
@@ -513,8 +919,12 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
513 if (RB_WARN_ON(cpu_buffer, 919 if (RB_WARN_ON(cpu_buffer,
514 bpage->list.prev->next != &bpage->list)) 920 bpage->list.prev->next != &bpage->list))
515 return -1; 921 return -1;
922 if (rb_check_list(cpu_buffer, &bpage->list))
923 return -1;
516 } 924 }
517 925
926 rb_head_page_activate(cpu_buffer);
927
518 return 0; 928 return 0;
519} 929}
520 930
@@ -533,6 +943,9 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
533 GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); 943 GFP_KERNEL, cpu_to_node(cpu_buffer->cpu));
534 if (!bpage) 944 if (!bpage)
535 goto free_pages; 945 goto free_pages;
946
947 rb_check_bpage(cpu_buffer, bpage);
948
536 list_add(&bpage->list, &pages); 949 list_add(&bpage->list, &pages);
537 950
538 addr = __get_free_page(GFP_KERNEL); 951 addr = __get_free_page(GFP_KERNEL);
@@ -586,6 +999,8 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
586 if (!bpage) 999 if (!bpage)
587 goto fail_free_buffer; 1000 goto fail_free_buffer;
588 1001
1002 rb_check_bpage(cpu_buffer, bpage);
1003
589 cpu_buffer->reader_page = bpage; 1004 cpu_buffer->reader_page = bpage;
590 addr = __get_free_page(GFP_KERNEL); 1005 addr = __get_free_page(GFP_KERNEL);
591 if (!addr) 1006 if (!addr)
@@ -603,6 +1018,8 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
603 = list_entry(cpu_buffer->pages, struct buffer_page, list); 1018 = list_entry(cpu_buffer->pages, struct buffer_page, list);
604 cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; 1019 cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;
605 1020
1021 rb_head_page_activate(cpu_buffer);
1022
606 return cpu_buffer; 1023 return cpu_buffer;
607 1024
608 fail_free_reader: 1025 fail_free_reader:
@@ -620,6 +1037,8 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
620 1037
621 free_buffer_page(cpu_buffer->reader_page); 1038 free_buffer_page(cpu_buffer->reader_page);
622 1039
1040 rb_head_page_deactivate(cpu_buffer);
1041
623 if (head) { 1042 if (head) {
624 list_for_each_entry_safe(bpage, tmp, head, list) { 1043 list_for_each_entry_safe(bpage, tmp, head, list) {
625 list_del_init(&bpage->list); 1044 list_del_init(&bpage->list);
@@ -770,6 +1189,8 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
770 atomic_inc(&cpu_buffer->record_disabled); 1189 atomic_inc(&cpu_buffer->record_disabled);
771 synchronize_sched(); 1190 synchronize_sched();
772 1191
1192 rb_head_page_deactivate(cpu_buffer);
1193
773 for (i = 0; i < nr_pages; i++) { 1194 for (i = 0; i < nr_pages; i++) {
774 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) 1195 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
775 return; 1196 return;
@@ -800,6 +1221,9 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
800 atomic_inc(&cpu_buffer->record_disabled); 1221 atomic_inc(&cpu_buffer->record_disabled);
801 synchronize_sched(); 1222 synchronize_sched();
802 1223
1224 spin_lock_irq(&cpu_buffer->reader_lock);
1225 rb_head_page_deactivate(cpu_buffer);
1226
803 for (i = 0; i < nr_pages; i++) { 1227 for (i = 0; i < nr_pages; i++) {
804 if (RB_WARN_ON(cpu_buffer, list_empty(pages))) 1228 if (RB_WARN_ON(cpu_buffer, list_empty(pages)))
805 return; 1229 return;
@@ -809,6 +1233,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
809 list_add_tail(&bpage->list, cpu_buffer->pages); 1233 list_add_tail(&bpage->list, cpu_buffer->pages);
810 } 1234 }
811 rb_reset_cpu(cpu_buffer); 1235 rb_reset_cpu(cpu_buffer);
1236 spin_unlock_irq(&cpu_buffer->reader_lock);
812 1237
813 rb_check_pages(cpu_buffer); 1238 rb_check_pages(cpu_buffer);
814 1239
@@ -959,21 +1384,14 @@ rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer)
959} 1384}
960 1385
961static inline struct ring_buffer_event * 1386static inline struct ring_buffer_event *
962rb_head_event(struct ring_buffer_per_cpu *cpu_buffer)
963{
964 return __rb_page_index(cpu_buffer->head_page,
965 cpu_buffer->head_page->read);
966}
967
968static inline struct ring_buffer_event *
969rb_iter_head_event(struct ring_buffer_iter *iter) 1387rb_iter_head_event(struct ring_buffer_iter *iter)
970{ 1388{
971 return __rb_page_index(iter->head_page, iter->head); 1389 return __rb_page_index(iter->head_page, iter->head);
972} 1390}
973 1391
974static inline unsigned rb_page_write(struct buffer_page *bpage) 1392static inline unsigned long rb_page_write(struct buffer_page *bpage)
975{ 1393{
976 return local_read(&bpage->write); 1394 return local_read(&bpage->write) & RB_WRITE_MASK;
977} 1395}
978 1396
979static inline unsigned rb_page_commit(struct buffer_page *bpage) 1397static inline unsigned rb_page_commit(struct buffer_page *bpage)
@@ -981,6 +1399,11 @@ static inline unsigned rb_page_commit(struct buffer_page *bpage)
981 return local_read(&bpage->page->commit); 1399 return local_read(&bpage->page->commit);
982} 1400}
983 1401
1402static inline unsigned long rb_page_entries(struct buffer_page *bpage)
1403{
1404 return local_read(&bpage->entries) & RB_WRITE_MASK;
1405}
1406
984/* Size is determined by what has been commited */ 1407/* Size is determined by what has been commited */
985static inline unsigned rb_page_size(struct buffer_page *bpage) 1408static inline unsigned rb_page_size(struct buffer_page *bpage)
986{ 1409{
@@ -993,19 +1416,6 @@ rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer)
993 return rb_page_commit(cpu_buffer->commit_page); 1416 return rb_page_commit(cpu_buffer->commit_page);
994} 1417}
995 1418
996static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer)
997{
998 return rb_page_commit(cpu_buffer->head_page);
999}
1000
1001static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
1002 struct buffer_page **bpage)
1003{
1004 struct list_head *p = (*bpage)->list.next;
1005
1006 *bpage = list_entry(p, struct buffer_page, list);
1007}
1008
1009static inline unsigned 1419static inline unsigned
1010rb_event_index(struct ring_buffer_event *event) 1420rb_event_index(struct ring_buffer_event *event)
1011{ 1421{
@@ -1031,6 +1441,8 @@ rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
1031static void 1441static void
1032rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) 1442rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
1033{ 1443{
1444 unsigned long max_count;
1445
1034 /* 1446 /*
1035 * We only race with interrupts and NMIs on this CPU. 1447 * We only race with interrupts and NMIs on this CPU.
1036 * If we own the commit event, then we can commit 1448 * If we own the commit event, then we can commit
@@ -1040,9 +1452,16 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
1040 * assign the commit to the tail. 1452 * assign the commit to the tail.
1041 */ 1453 */
1042 again: 1454 again:
1455 max_count = cpu_buffer->buffer->pages * 100;
1456
1043 while (cpu_buffer->commit_page != cpu_buffer->tail_page) { 1457 while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
1044 cpu_buffer->commit_page->page->commit = 1458 if (RB_WARN_ON(cpu_buffer, !(--max_count)))
1045 cpu_buffer->commit_page->write; 1459 return;
1460 if (RB_WARN_ON(cpu_buffer,
1461 rb_is_reader_page(cpu_buffer->tail_page)))
1462 return;
1463 local_set(&cpu_buffer->commit_page->page->commit,
1464 rb_page_write(cpu_buffer->commit_page));
1046 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); 1465 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
1047 cpu_buffer->write_stamp = 1466 cpu_buffer->write_stamp =
1048 cpu_buffer->commit_page->page->time_stamp; 1467 cpu_buffer->commit_page->page->time_stamp;
@@ -1051,8 +1470,12 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
1051 } 1470 }
1052 while (rb_commit_index(cpu_buffer) != 1471 while (rb_commit_index(cpu_buffer) !=
1053 rb_page_write(cpu_buffer->commit_page)) { 1472 rb_page_write(cpu_buffer->commit_page)) {
1054 cpu_buffer->commit_page->page->commit = 1473
1055 cpu_buffer->commit_page->write; 1474 local_set(&cpu_buffer->commit_page->page->commit,
1475 rb_page_write(cpu_buffer->commit_page));
1476 RB_WARN_ON(cpu_buffer,
1477 local_read(&cpu_buffer->commit_page->page->commit) &
1478 ~RB_WRITE_MASK);
1056 barrier(); 1479 barrier();
1057 } 1480 }
1058 1481
@@ -1085,7 +1508,7 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
1085 * to the head page instead of next. 1508 * to the head page instead of next.
1086 */ 1509 */
1087 if (iter->head_page == cpu_buffer->reader_page) 1510 if (iter->head_page == cpu_buffer->reader_page)
1088 iter->head_page = cpu_buffer->head_page; 1511 iter->head_page = rb_set_head_page(cpu_buffer);
1089 else 1512 else
1090 rb_inc_page(cpu_buffer, &iter->head_page); 1513 rb_inc_page(cpu_buffer, &iter->head_page);
1091 1514
@@ -1129,6 +1552,163 @@ rb_update_event(struct ring_buffer_event *event,
1129 } 1552 }
1130} 1553}
1131 1554
1555/*
1556 * rb_handle_head_page - writer hit the head page
1557 *
1558 * Returns: +1 to retry page
1559 * 0 to continue
1560 * -1 on error
1561 */
1562static int
1563rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
1564 struct buffer_page *tail_page,
1565 struct buffer_page *next_page)
1566{
1567 struct buffer_page *new_head;
1568 int entries;
1569 int type;
1570 int ret;
1571
1572 entries = rb_page_entries(next_page);
1573
1574 /*
1575 * The hard part is here. We need to move the head
1576 * forward, and protect against both readers on
1577 * other CPUs and writers coming in via interrupts.
1578 */
1579 type = rb_head_page_set_update(cpu_buffer, next_page, tail_page,
1580 RB_PAGE_HEAD);
1581
1582 /*
1583 * type can be one of four:
1584 * NORMAL - an interrupt already moved it for us
1585 * HEAD - we are the first to get here.
1586 * UPDATE - we are the interrupt interrupting
1587 * a current move.
1588 * MOVED - a reader on another CPU moved the next
1589 * pointer to its reader page. Give up
1590 * and try again.
1591 */
1592
1593 switch (type) {
1594 case RB_PAGE_HEAD:
1595 /*
1596 * We changed the head to UPDATE, thus
1597 * it is our responsibility to update
1598 * the counters.
1599 */
1600 local_add(entries, &cpu_buffer->overrun);
1601
1602 /*
1603 * The entries will be zeroed out when we move the
1604 * tail page.
1605 */
1606
1607 /* still more to do */
1608 break;
1609
1610 case RB_PAGE_UPDATE:
1611 /*
 1612 * This is an interrupt that interrupted the
1613 * previous update. Still more to do.
1614 */
1615 break;
1616 case RB_PAGE_NORMAL:
1617 /*
1618 * An interrupt came in before the update
1619 * and processed this for us.
1620 * Nothing left to do.
1621 */
1622 return 1;
1623 case RB_PAGE_MOVED:
1624 /*
1625 * The reader is on another CPU and just did
1626 * a swap with our next_page.
1627 * Try again.
1628 */
1629 return 1;
1630 default:
1631 RB_WARN_ON(cpu_buffer, 1); /* WTF??? */
1632 return -1;
1633 }
1634
1635 /*
1636 * Now that we are here, the old head pointer is
1637 * set to UPDATE. This will keep the reader from
1638 * swapping the head page with the reader page.
1639 * The reader (on another CPU) will spin till
1640 * we are finished.
1641 *
1642 * We just need to protect against interrupts
1643 * doing the job. We will set the next pointer
1644 * to HEAD. After that, we set the old pointer
1645 * to NORMAL, but only if it was HEAD before.
1646 * otherwise we are an interrupt, and only
1647 * want the outer most commit to reset it.
1648 */
1649 new_head = next_page;
1650 rb_inc_page(cpu_buffer, &new_head);
1651
1652 ret = rb_head_page_set_head(cpu_buffer, new_head, next_page,
1653 RB_PAGE_NORMAL);
1654
1655 /*
1656 * Valid returns are:
1657 * HEAD - an interrupt came in and already set it.
1658 * NORMAL - One of two things:
1659 * 1) We really set it.
1660 * 2) A bunch of interrupts came in and moved
1661 * the page forward again.
1662 */
1663 switch (ret) {
1664 case RB_PAGE_HEAD:
1665 case RB_PAGE_NORMAL:
1666 /* OK */
1667 break;
1668 default:
1669 RB_WARN_ON(cpu_buffer, 1);
1670 return -1;
1671 }
1672
1673 /*
1674 * It is possible that an interrupt came in,
1675 * set the head up, then more interrupts came in
1676 * and moved it again. When we get back here,
1677 * the page would have been set to NORMAL but we
1678 * just set it back to HEAD.
1679 *
1680 * How do you detect this? Well, if that happened
1681 * the tail page would have moved.
1682 */
1683 if (ret == RB_PAGE_NORMAL) {
1684 /*
 1685 * If the tail had moved past next, then we need
1686 * to reset the pointer.
1687 */
1688 if (cpu_buffer->tail_page != tail_page &&
1689 cpu_buffer->tail_page != next_page)
1690 rb_head_page_set_normal(cpu_buffer, new_head,
1691 next_page,
1692 RB_PAGE_HEAD);
1693 }
1694
1695 /*
1696 * If this was the outer most commit (the one that
1697 * changed the original pointer from HEAD to UPDATE),
1698 * then it is up to us to reset it to NORMAL.
1699 */
1700 if (type == RB_PAGE_HEAD) {
1701 ret = rb_head_page_set_normal(cpu_buffer, next_page,
1702 tail_page,
1703 RB_PAGE_UPDATE);
1704 if (RB_WARN_ON(cpu_buffer,
1705 ret != RB_PAGE_UPDATE))
1706 return -1;
1707 }
1708
1709 return 0;
1710}
1711
1132static unsigned rb_calculate_event_length(unsigned length) 1712static unsigned rb_calculate_event_length(unsigned length)
1133{ 1713{
1134 struct ring_buffer_event event; /* Used only for sizeof array */ 1714 struct ring_buffer_event event; /* Used only for sizeof array */
@@ -1207,96 +1787,93 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1207 struct buffer_page *commit_page, 1787 struct buffer_page *commit_page,
1208 struct buffer_page *tail_page, u64 *ts) 1788 struct buffer_page *tail_page, u64 *ts)
1209{ 1789{
1210 struct buffer_page *next_page, *head_page, *reader_page;
1211 struct ring_buffer *buffer = cpu_buffer->buffer; 1790 struct ring_buffer *buffer = cpu_buffer->buffer;
1212 bool lock_taken = false; 1791 struct buffer_page *next_page;
1213 unsigned long flags; 1792 int ret;
1214 1793
1215 next_page = tail_page; 1794 next_page = tail_page;
1216 1795
1217 local_irq_save(flags);
1218 /*
1219 * Since the write to the buffer is still not
1220 * fully lockless, we must be careful with NMIs.
1221 * The locks in the writers are taken when a write
1222 * crosses to a new page. The locks protect against
1223 * races with the readers (this will soon be fixed
1224 * with a lockless solution).
1225 *
1226 * Because we can not protect against NMIs, and we
1227 * want to keep traces reentrant, we need to manage
1228 * what happens when we are in an NMI.
1229 *
1230 * NMIs can happen after we take the lock.
1231 * If we are in an NMI, only take the lock
1232 * if it is not already taken. Otherwise
1233 * simply fail.
1234 */
1235 if (unlikely(in_nmi())) {
1236 if (!__raw_spin_trylock(&cpu_buffer->lock)) {
1237 cpu_buffer->nmi_dropped++;
1238 goto out_reset;
1239 }
1240 } else
1241 __raw_spin_lock(&cpu_buffer->lock);
1242
1243 lock_taken = true;
1244
1245 rb_inc_page(cpu_buffer, &next_page); 1796 rb_inc_page(cpu_buffer, &next_page);
1246 1797
1247 head_page = cpu_buffer->head_page;
1248 reader_page = cpu_buffer->reader_page;
1249
1250 /* we grabbed the lock before incrementing */
1251 if (RB_WARN_ON(cpu_buffer, next_page == reader_page))
1252 goto out_reset;
1253
1254 /* 1798 /*
1255 * If for some reason, we had an interrupt storm that made 1799 * If for some reason, we had an interrupt storm that made
1256 * it all the way around the buffer, bail, and warn 1800 * it all the way around the buffer, bail, and warn
1257 * about it. 1801 * about it.
1258 */ 1802 */
1259 if (unlikely(next_page == commit_page)) { 1803 if (unlikely(next_page == commit_page)) {
1260 cpu_buffer->commit_overrun++; 1804 local_inc(&cpu_buffer->commit_overrun);
1261 goto out_reset; 1805 goto out_reset;
1262 } 1806 }
1263 1807
1264 if (next_page == head_page) { 1808 /*
1265 if (!(buffer->flags & RB_FL_OVERWRITE)) 1809 * This is where the fun begins!
1266 goto out_reset; 1810 *
1267 1811 * We are fighting against races between a reader that
1268 /* tail_page has not moved yet? */ 1812 * could be on another CPU trying to swap its reader
1269 if (tail_page == cpu_buffer->tail_page) { 1813 * page with the buffer head.
1270 /* count overflows */ 1814 *
1271 cpu_buffer->overrun += 1815 * We are also fighting against interrupts coming in and
1272 local_read(&head_page->entries); 1816 * moving the head or tail on us as well.
1817 *
1818 * If the next page is the head page then we have filled
1819 * the buffer, unless the commit page is still on the
1820 * reader page.
1821 */
1822 if (rb_is_head_page(cpu_buffer, next_page, &tail_page->list)) {
1273 1823
1274 rb_inc_page(cpu_buffer, &head_page); 1824 /*
1275 cpu_buffer->head_page = head_page; 1825 * If the commit is not on the reader page, then
1276 cpu_buffer->head_page->read = 0; 1826 * move the header page.
1827 */
1828 if (!rb_is_reader_page(cpu_buffer->commit_page)) {
1829 /*
1830 * If we are not in overwrite mode,
1831 * this is easy, just stop here.
1832 */
1833 if (!(buffer->flags & RB_FL_OVERWRITE))
1834 goto out_reset;
1835
1836 ret = rb_handle_head_page(cpu_buffer,
1837 tail_page,
1838 next_page);
1839 if (ret < 0)
1840 goto out_reset;
1841 if (ret)
1842 goto out_again;
1843 } else {
1844 /*
1845 * We need to be careful here too. The
1846 * commit page could still be on the reader
1847 * page. We could have a small buffer, and
1848 * have filled up the buffer with events
1849 * from interrupts and such, and wrapped.
1850 *
 1851 * Note, if the tail page is also on the
1852 * reader_page, we let it move out.
1853 */
1854 if (unlikely((cpu_buffer->commit_page !=
1855 cpu_buffer->tail_page) &&
1856 (cpu_buffer->commit_page ==
1857 cpu_buffer->reader_page))) {
1858 local_inc(&cpu_buffer->commit_overrun);
1859 goto out_reset;
1860 }
1277 } 1861 }
1278 } 1862 }
1279 1863
1280 /* 1864 ret = rb_tail_page_update(cpu_buffer, tail_page, next_page);
1281 * If the tail page is still the same as what we think 1865 if (ret) {
1282 * it is, then it is up to us to update the tail 1866 /*
1283 * pointer. 1867 * Nested commits always have zero deltas, so
1284 */ 1868 * just reread the time stamp
1285 if (tail_page == cpu_buffer->tail_page) { 1869 */
1286 local_set(&next_page->write, 0);
1287 local_set(&next_page->entries, 0);
1288 local_set(&next_page->page->commit, 0);
1289 cpu_buffer->tail_page = next_page;
1290
1291 /* reread the time stamp */
1292 *ts = rb_time_stamp(buffer, cpu_buffer->cpu); 1870 *ts = rb_time_stamp(buffer, cpu_buffer->cpu);
1293 cpu_buffer->tail_page->page->time_stamp = *ts; 1871 next_page->page->time_stamp = *ts;
1294 } 1872 }
1295 1873
1296 rb_reset_tail(cpu_buffer, tail_page, tail, length); 1874 out_again:
1297 1875
1298 __raw_spin_unlock(&cpu_buffer->lock); 1876 rb_reset_tail(cpu_buffer, tail_page, tail, length);
1299 local_irq_restore(flags);
1300 1877
1301 /* fail and let the caller try again */ 1878 /* fail and let the caller try again */
1302 return ERR_PTR(-EAGAIN); 1879 return ERR_PTR(-EAGAIN);
@@ -1305,9 +1882,6 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1305 /* reset write */ 1882 /* reset write */
1306 rb_reset_tail(cpu_buffer, tail_page, tail, length); 1883 rb_reset_tail(cpu_buffer, tail_page, tail, length);
1307 1884
1308 if (likely(lock_taken))
1309 __raw_spin_unlock(&cpu_buffer->lock);
1310 local_irq_restore(flags);
1311 return NULL; 1885 return NULL;
1312} 1886}
1313 1887
@@ -1324,6 +1898,9 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1324 barrier(); 1898 barrier();
1325 tail_page = cpu_buffer->tail_page; 1899 tail_page = cpu_buffer->tail_page;
1326 write = local_add_return(length, &tail_page->write); 1900 write = local_add_return(length, &tail_page->write);
1901
1902 /* set write to only the index of the write */
1903 write &= RB_WRITE_MASK;
1327 tail = write - length; 1904 tail = write - length;
1328 1905
1329 /* See if we shot pass the end of this buffer page */ 1906 /* See if we shot pass the end of this buffer page */
@@ -1368,12 +1945,16 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
1368 bpage = cpu_buffer->tail_page; 1945 bpage = cpu_buffer->tail_page;
1369 1946
1370 if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { 1947 if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
1948 unsigned long write_mask =
1949 local_read(&bpage->write) & ~RB_WRITE_MASK;
1371 /* 1950 /*
1372 * This is on the tail page. It is possible that 1951 * This is on the tail page. It is possible that
1373 * a write could come in and move the tail page 1952 * a write could come in and move the tail page
1374 * and write to the next page. That is fine 1953 * and write to the next page. That is fine
1375 * because we just shorten what is on this page. 1954 * because we just shorten what is on this page.
1376 */ 1955 */
1956 old_index += write_mask;
1957 new_index += write_mask;
1377 index = local_cmpxchg(&bpage->write, old_index, new_index); 1958 index = local_cmpxchg(&bpage->write, old_index, new_index);
1378 if (index == old_index) 1959 if (index == old_index)
1379 return 1; 1960 return 1;
@@ -1882,9 +2463,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_write);
1882static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) 2463static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
1883{ 2464{
1884 struct buffer_page *reader = cpu_buffer->reader_page; 2465 struct buffer_page *reader = cpu_buffer->reader_page;
1885 struct buffer_page *head = cpu_buffer->head_page; 2466 struct buffer_page *head = rb_set_head_page(cpu_buffer);
1886 struct buffer_page *commit = cpu_buffer->commit_page; 2467 struct buffer_page *commit = cpu_buffer->commit_page;
1887 2468
2469 /* In case of error, head will be NULL */
2470 if (unlikely(!head))
2471 return 1;
2472
1888 return reader->read == rb_page_commit(reader) && 2473 return reader->read == rb_page_commit(reader) &&
1889 (commit == reader || 2474 (commit == reader ||
1890 (commit == head && 2475 (commit == head &&
@@ -1975,7 +2560,7 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
1975 return 0; 2560 return 0;
1976 2561
1977 cpu_buffer = buffer->buffers[cpu]; 2562 cpu_buffer = buffer->buffers[cpu];
1978 ret = (local_read(&cpu_buffer->entries) - cpu_buffer->overrun) 2563 ret = (local_read(&cpu_buffer->entries) - local_read(&cpu_buffer->overrun))
1979 - cpu_buffer->read; 2564 - cpu_buffer->read;
1980 2565
1981 return ret; 2566 return ret;
@@ -1996,33 +2581,13 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
1996 return 0; 2581 return 0;
1997 2582
1998 cpu_buffer = buffer->buffers[cpu]; 2583 cpu_buffer = buffer->buffers[cpu];
1999 ret = cpu_buffer->overrun; 2584 ret = local_read(&cpu_buffer->overrun);
2000 2585
2001 return ret; 2586 return ret;
2002} 2587}
2003EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); 2588EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
2004 2589
2005/** 2590/**
2006 * ring_buffer_nmi_dropped_cpu - get the number of nmis that were dropped
2007 * @buffer: The ring buffer
2008 * @cpu: The per CPU buffer to get the number of overruns from
2009 */
2010unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu)
2011{
2012 struct ring_buffer_per_cpu *cpu_buffer;
2013 unsigned long ret;
2014
2015 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2016 return 0;
2017
2018 cpu_buffer = buffer->buffers[cpu];
2019 ret = cpu_buffer->nmi_dropped;
2020
2021 return ret;
2022}
2023EXPORT_SYMBOL_GPL(ring_buffer_nmi_dropped_cpu);
2024
2025/**
2026 * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits 2591 * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits
2027 * @buffer: The ring buffer 2592 * @buffer: The ring buffer
2028 * @cpu: The per CPU buffer to get the number of overruns from 2593 * @cpu: The per CPU buffer to get the number of overruns from
@@ -2037,7 +2602,7 @@ ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu)
2037 return 0; 2602 return 0;
2038 2603
2039 cpu_buffer = buffer->buffers[cpu]; 2604 cpu_buffer = buffer->buffers[cpu];
2040 ret = cpu_buffer->commit_overrun; 2605 ret = local_read(&cpu_buffer->commit_overrun);
2041 2606
2042 return ret; 2607 return ret;
2043} 2608}
@@ -2060,7 +2625,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
2060 for_each_buffer_cpu(buffer, cpu) { 2625 for_each_buffer_cpu(buffer, cpu) {
2061 cpu_buffer = buffer->buffers[cpu]; 2626 cpu_buffer = buffer->buffers[cpu];
2062 entries += (local_read(&cpu_buffer->entries) - 2627 entries += (local_read(&cpu_buffer->entries) -
2063 cpu_buffer->overrun) - cpu_buffer->read; 2628 local_read(&cpu_buffer->overrun)) - cpu_buffer->read;
2064 } 2629 }
2065 2630
2066 return entries; 2631 return entries;
@@ -2083,7 +2648,7 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
2083 /* if you care about this being correct, lock the buffer */ 2648 /* if you care about this being correct, lock the buffer */
2084 for_each_buffer_cpu(buffer, cpu) { 2649 for_each_buffer_cpu(buffer, cpu) {
2085 cpu_buffer = buffer->buffers[cpu]; 2650 cpu_buffer = buffer->buffers[cpu];
2086 overruns += cpu_buffer->overrun; 2651 overruns += local_read(&cpu_buffer->overrun);
2087 } 2652 }
2088 2653
2089 return overruns; 2654 return overruns;
@@ -2096,8 +2661,10 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
2096 2661
2097 /* Iterator usage is expected to have record disabled */ 2662 /* Iterator usage is expected to have record disabled */
2098 if (list_empty(&cpu_buffer->reader_page->list)) { 2663 if (list_empty(&cpu_buffer->reader_page->list)) {
2099 iter->head_page = cpu_buffer->head_page; 2664 iter->head_page = rb_set_head_page(cpu_buffer);
2100 iter->head = cpu_buffer->head_page->read; 2665 if (unlikely(!iter->head_page))
2666 return;
2667 iter->head = iter->head_page->read;
2101 } else { 2668 } else {
2102 iter->head_page = cpu_buffer->reader_page; 2669 iter->head_page = cpu_buffer->reader_page;
2103 iter->head = cpu_buffer->reader_page->read; 2670 iter->head = cpu_buffer->reader_page->read;
@@ -2214,6 +2781,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2214 struct buffer_page *reader = NULL; 2781 struct buffer_page *reader = NULL;
2215 unsigned long flags; 2782 unsigned long flags;
2216 int nr_loops = 0; 2783 int nr_loops = 0;
2784 int ret;
2217 2785
2218 local_irq_save(flags); 2786 local_irq_save(flags);
2219 __raw_spin_lock(&cpu_buffer->lock); 2787 __raw_spin_lock(&cpu_buffer->lock);
@@ -2247,11 +2815,17 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2247 goto out; 2815 goto out;
2248 2816
2249 /* 2817 /*
2250 * Splice the empty reader page into the list around the head.
2251 * Reset the reader page to size zero. 2818 * Reset the reader page to size zero.
2252 */ 2819 */
2820 local_set(&cpu_buffer->reader_page->write, 0);
2821 local_set(&cpu_buffer->reader_page->entries, 0);
2822 local_set(&cpu_buffer->reader_page->page->commit, 0);
2253 2823
2254 reader = cpu_buffer->head_page; 2824 spin:
2825 /*
2826 * Splice the empty reader page into the list around the head.
2827 */
2828 reader = rb_set_head_page(cpu_buffer);
2255 cpu_buffer->reader_page->list.next = reader->list.next; 2829 cpu_buffer->reader_page->list.next = reader->list.next;
2256 cpu_buffer->reader_page->list.prev = reader->list.prev; 2830 cpu_buffer->reader_page->list.prev = reader->list.prev;
2257 2831
@@ -2262,22 +2836,35 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2262 */ 2836 */
2263 cpu_buffer->pages = reader->list.prev; 2837 cpu_buffer->pages = reader->list.prev;
2264 2838
2265 local_set(&cpu_buffer->reader_page->write, 0); 2839 /* The reader page will be pointing to the new head */
2266 local_set(&cpu_buffer->reader_page->entries, 0); 2840 rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list);
2267 local_set(&cpu_buffer->reader_page->page->commit, 0);
2268 2841
2269 /* Make the reader page now replace the head */ 2842 /*
2270 reader->list.prev->next = &cpu_buffer->reader_page->list; 2843 * Here's the tricky part.
2271 reader->list.next->prev = &cpu_buffer->reader_page->list; 2844 *
2845 * We need to move the pointer past the header page.
2846 * But we can only do that if a writer is not currently
2847 * moving it. The page before the header page has the
2848 * flag bit '1' set if it is pointing to the page we want.
2849 * but if the writer is in the process of moving it
2850 * than it will be '2' or already moved '0'.
2851 */
2852
2853 ret = rb_head_page_replace(reader, cpu_buffer->reader_page);
2272 2854
2273 /* 2855 /*
2274 * If the tail is on the reader, then we must set the head 2856 * If we did not convert it, then we must try again.
2275 * to the inserted page, otherwise we set it one before.
2276 */ 2857 */
2277 cpu_buffer->head_page = cpu_buffer->reader_page; 2858 if (!ret)
2859 goto spin;
2278 2860
2279 if (cpu_buffer->commit_page != reader) 2861 /*
2280 rb_inc_page(cpu_buffer, &cpu_buffer->head_page); 2862 * Yeah! We succeeded in replacing the page.
2863 *
2864 * Now make the new head point back to the reader page.
2865 */
2866 reader->list.next->prev = &cpu_buffer->reader_page->list;
2867 rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
2281 2868
2282 /* Finally update the reader page to the new head */ 2869 /* Finally update the reader page to the new head */
2283 cpu_buffer->reader_page = reader; 2870 cpu_buffer->reader_page = reader;
@@ -2733,6 +3320,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_size);
2733static void 3320static void
2734rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) 3321rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
2735{ 3322{
3323 rb_head_page_deactivate(cpu_buffer);
3324
2736 cpu_buffer->head_page 3325 cpu_buffer->head_page
2737 = list_entry(cpu_buffer->pages, struct buffer_page, list); 3326 = list_entry(cpu_buffer->pages, struct buffer_page, list);
2738 local_set(&cpu_buffer->head_page->write, 0); 3327 local_set(&cpu_buffer->head_page->write, 0);
@@ -2750,16 +3339,17 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
2750 local_set(&cpu_buffer->reader_page->page->commit, 0); 3339 local_set(&cpu_buffer->reader_page->page->commit, 0);
2751 cpu_buffer->reader_page->read = 0; 3340 cpu_buffer->reader_page->read = 0;
2752 3341
2753 cpu_buffer->nmi_dropped = 0; 3342 local_set(&cpu_buffer->commit_overrun, 0);
2754 cpu_buffer->commit_overrun = 0; 3343 local_set(&cpu_buffer->overrun, 0);
2755 cpu_buffer->overrun = 0;
2756 cpu_buffer->read = 0;
2757 local_set(&cpu_buffer->entries, 0); 3344 local_set(&cpu_buffer->entries, 0);
2758 local_set(&cpu_buffer->committing, 0); 3345 local_set(&cpu_buffer->committing, 0);
2759 local_set(&cpu_buffer->commits, 0); 3346 local_set(&cpu_buffer->commits, 0);
3347 cpu_buffer->read = 0;
2760 3348
2761 cpu_buffer->write_stamp = 0; 3349 cpu_buffer->write_stamp = 0;
2762 cpu_buffer->read_stamp = 0; 3350 cpu_buffer->read_stamp = 0;
3351
3352 rb_head_page_activate(cpu_buffer);
2763} 3353}
2764 3354
2765/** 3355/**
@@ -3107,7 +3697,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3107 read = 0; 3697 read = 0;
3108 } else { 3698 } else {
3109 /* update the entry counter */ 3699 /* update the entry counter */
3110 cpu_buffer->read += local_read(&reader->entries); 3700 cpu_buffer->read += rb_page_entries(reader);
3111 3701
3112 /* swap the pages */ 3702 /* swap the pages */
3113 rb_init_page(bpage); 3703 rb_init_page(bpage);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index bdb3afc8b306..b591f7a1bd7b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3630,9 +3630,6 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
3630 cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); 3630 cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu);
3631 trace_seq_printf(s, "commit overrun: %ld\n", cnt); 3631 trace_seq_printf(s, "commit overrun: %ld\n", cnt);
3632 3632
3633 cnt = ring_buffer_nmi_dropped_cpu(tr->buffer, cpu);
3634 trace_seq_printf(s, "nmi dropped: %ld\n", cnt);
3635
3636 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); 3633 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
3637 3634
3638 kfree(s); 3635 kfree(s);
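
To round out the picture from the reader side, the following standalone C11 sketch models the single cmpxchg that rb_head_page_replace() performs when rb_get_reader_page() splices the reader's spare page in place of the head page. The layout mirrors the writer-side sketch near the top of this page; head_page_replace() and the three-page ring are illustrative, not the kernel implementation.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_HEAD 1UL
#define FLAG_MASK 3UL

/* 64-byte alignment keeps the low bits of every page address free for flags */
struct page {
    _Alignas(64) _Atomic uintptr_t next; /* next page, low 2 bits = flags */
    int id;
};

static struct page ring[3];   /* buffer pages: 0 -> 1 -> 2 -> 0, head is 1 */
static struct page spare;     /* the reader's private page                 */

static struct page *strip(uintptr_t v)
{
    return (struct page *)(v & ~FLAG_MASK);
}

/* One cmpxchg swaps the reader's spare page in for the head page: it only
 * succeeds if the pointer in the page before the head still carries the
 * HEAD flag, i.e. no writer is moving the head at this moment. */
static int head_page_replace(struct page *prev, struct page *old_head,
                             struct page *new_page)
{
    uintptr_t expected = ((uintptr_t)old_head & ~FLAG_MASK) | PAGE_HEAD;

    return atomic_compare_exchange_strong(&prev->next, &expected,
                                          (uintptr_t)new_page);
}

int main(void)
{
    ring[0].id = 0; ring[1].id = 1; ring[2].id = 2; spare.id = 99;

    atomic_store(&ring[0].next, (uintptr_t)&ring[1] | PAGE_HEAD);
    atomic_store(&ring[1].next, (uintptr_t)&ring[2]);
    atomic_store(&ring[2].next, (uintptr_t)&ring[0]);

    /* Like rb_get_reader_page(): point the spare page at the page after
     * the head and mark it as pointing to the new head. */
    atomic_store(&spare.next,
                 (atomic_load(&ring[1].next) & ~FLAG_MASK) | PAGE_HEAD);

    if (head_page_replace(&ring[0], &ring[1], &spare))
        printf("reader took page %d; page 0 now links to page %d\n",
               ring[1].id, strip(atomic_load(&ring[0].next))->id);
    else
        printf("a writer moved the head, look it up again and retry\n");

    return 0;
}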