path: root/kernel/trace/ring_buffer.c
Diffstat (limited to 'kernel/trace/ring_buffer.c')
-rw-r--r--  kernel/trace/ring_buffer.c  | 655
1 file changed, 417 insertions(+), 238 deletions(-)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index edefe3b2801b..bd1c35a4fbcc 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -14,12 +14,14 @@
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/percpu.h> 15#include <linux/percpu.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <linux/slab.h>
17#include <linux/init.h> 18#include <linux/init.h>
18#include <linux/hash.h> 19#include <linux/hash.h>
19#include <linux/list.h> 20#include <linux/list.h>
20#include <linux/cpu.h> 21#include <linux/cpu.h>
21#include <linux/fs.h> 22#include <linux/fs.h>
22 23
24#include <asm/local.h>
23#include "trace.h" 25#include "trace.h"
24 26
25/* 27/*
@@ -206,6 +208,14 @@ EXPORT_SYMBOL_GPL(tracing_is_on);
206#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) 208#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
207#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ 209#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */
208 210
211#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
212# define RB_FORCE_8BYTE_ALIGNMENT 0
213# define RB_ARCH_ALIGNMENT RB_ALIGNMENT
214#else
215# define RB_FORCE_8BYTE_ALIGNMENT 1
216# define RB_ARCH_ALIGNMENT 8U
217#endif
218
209/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ 219/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
210#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX 220#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
211 221
@@ -214,6 +224,9 @@ enum {
214 RB_LEN_TIME_STAMP = 16, 224 RB_LEN_TIME_STAMP = 16,
215}; 225};
216 226
227#define skip_time_extend(event) \
228 ((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND))
229
217static inline int rb_null_event(struct ring_buffer_event *event) 230static inline int rb_null_event(struct ring_buffer_event *event)
218{ 231{
219 return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta; 232 return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
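
For reference, a minimal userspace sketch (not part of this patch) of the event layout the new skip_time_extend() macro relies on: a time extend is an 8-byte header (RB_LEN_TIME_EXTEND) placed directly in front of the data event it stamps, so skipping it is plain pointer arithmetic. The struct below is a simplified stand-in for struct ring_buffer_event.

#include <stdio.h>
#include <stdint.h>

#define RB_LEN_TIME_EXTEND 8    /* two 32-bit words, as in the enum above */

struct rb_event {               /* simplified stand-in for struct ring_buffer_event */
        uint32_t type_len:5, time_delta:27;
        uint32_t array[];
};

/* Same idea as the skip_time_extend() macro added above. */
static struct rb_event *skip_time_extend(struct rb_event *event)
{
        return (struct rb_event *)((char *)event + RB_LEN_TIME_EXTEND);
}

int main(void)
{
        unsigned char page[64] = { 0 };
        struct rb_event *extend = (struct rb_event *)page;

        /* The data event starts right after the 8-byte time extend. */
        printf("data event at page offset %td\n",
               (char *)skip_time_extend(extend) - (char *)page);
        return 0;
}
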
@@ -238,8 +251,12 @@ rb_event_data_length(struct ring_buffer_event *event)
238 return length + RB_EVNT_HDR_SIZE; 251 return length + RB_EVNT_HDR_SIZE;
239} 252}
240 253
241/* inline for ring buffer fast paths */ 254/*
242static unsigned 255 * Return the length of the given event. Will return
256 * the length of the time extend if the event is a
257 * time extend.
258 */
259static inline unsigned
243rb_event_length(struct ring_buffer_event *event) 260rb_event_length(struct ring_buffer_event *event)
244{ 261{
245 switch (event->type_len) { 262 switch (event->type_len) {
@@ -264,13 +281,41 @@ rb_event_length(struct ring_buffer_event *event)
264 return 0; 281 return 0;
265} 282}
266 283
284/*
285 * Return total length of time extend and data,
286 * or just the event length for all other events.
287 */
288static inline unsigned
289rb_event_ts_length(struct ring_buffer_event *event)
290{
291 unsigned len = 0;
292
293 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
294 /* time extends include the data event after it */
295 len = RB_LEN_TIME_EXTEND;
296 event = skip_time_extend(event);
297 }
298 return len + rb_event_length(event);
299}
300
267/** 301/**
268 * ring_buffer_event_length - return the length of the event 302 * ring_buffer_event_length - return the length of the event
269 * @event: the event to get the length of 303 * @event: the event to get the length of
304 *
305 * Returns the size of the data load of a data event.
306 * If the event is something other than a data event, it
307 * returns the size of the event itself. With the exception
308 * of a TIME EXTEND, where it still returns the size of the
309 * data load of the data event after it.
270 */ 310 */
271unsigned ring_buffer_event_length(struct ring_buffer_event *event) 311unsigned ring_buffer_event_length(struct ring_buffer_event *event)
272{ 312{
273 unsigned length = rb_event_length(event); 313 unsigned length;
314
315 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
316 event = skip_time_extend(event);
317
318 length = rb_event_length(event);
274 if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) 319 if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
275 return length; 320 return length;
276 length -= RB_EVNT_HDR_SIZE; 321 length -= RB_EVNT_HDR_SIZE;
@@ -284,6 +329,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length);
284static void * 329static void *
285rb_event_data(struct ring_buffer_event *event) 330rb_event_data(struct ring_buffer_event *event)
286{ 331{
332 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
333 event = skip_time_extend(event);
287 BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX); 334 BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
288 /* If length is in len field, then array[0] has the data */ 335 /* If length is in len field, then array[0] has the data */
289 if (event->type_len) 336 if (event->type_len)
@@ -309,6 +356,11 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
309#define TS_MASK ((1ULL << TS_SHIFT) - 1) 356#define TS_MASK ((1ULL << TS_SHIFT) - 1)
310#define TS_DELTA_TEST (~TS_MASK) 357#define TS_DELTA_TEST (~TS_MASK)
311 358
359/* Flag when events were overwritten */
360#define RB_MISSED_EVENTS (1 << 31)
361/* Missed count stored at end */
362#define RB_MISSED_STORED (1 << 30)
363
312struct buffer_data_page { 364struct buffer_data_page {
313 u64 time_stamp; /* page time stamp */ 365 u64 time_stamp; /* page time stamp */
314 local_t commit; /* write committed index */ 366 local_t commit; /* write committed index */
@@ -328,6 +380,7 @@ struct buffer_page {
328 local_t write; /* index for next write */ 380 local_t write; /* index for next write */
329 unsigned read; /* index for next read */ 381 unsigned read; /* index for next read */
330 local_t entries; /* entries on this page */ 382 local_t entries; /* entries on this page */
383 unsigned long real_end; /* real end of data */
331 struct buffer_data_page *page; /* Actual data page */ 384 struct buffer_data_page *page; /* Actual data page */
332}; 385};
333 386
@@ -388,9 +441,6 @@ static inline int test_time_stamp(u64 delta)
388/* Max payload is BUF_PAGE_SIZE - header (8bytes) */ 441/* Max payload is BUF_PAGE_SIZE - header (8bytes) */
389#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) 442#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
390 443
391/* Max number of timestamps that can fit on a page */
392#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_STAMP)
393
394int ring_buffer_print_page_header(struct trace_seq *s) 444int ring_buffer_print_page_header(struct trace_seq *s)
395{ 445{
396 struct buffer_data_page field; 446 struct buffer_data_page field;
@@ -407,6 +457,12 @@ int ring_buffer_print_page_header(struct trace_seq *s)
407 (unsigned int)sizeof(field.commit), 457 (unsigned int)sizeof(field.commit),
408 (unsigned int)is_signed_type(long)); 458 (unsigned int)is_signed_type(long));
409 459
460 ret = trace_seq_printf(s, "\tfield: int overwrite;\t"
461 "offset:%u;\tsize:%u;\tsigned:%u;\n",
462 (unsigned int)offsetof(typeof(field), commit),
463 1,
464 (unsigned int)is_signed_type(long));
465
410 ret = trace_seq_printf(s, "\tfield: char data;\t" 466 ret = trace_seq_printf(s, "\tfield: char data;\t"
411 "offset:%u;\tsize:%u;\tsigned:%u;\n", 467 "offset:%u;\tsize:%u;\tsigned:%u;\n",
412 (unsigned int)offsetof(typeof(field), data), 468 (unsigned int)offsetof(typeof(field), data),
@@ -421,6 +477,7 @@ int ring_buffer_print_page_header(struct trace_seq *s)
421 */ 477 */
422struct ring_buffer_per_cpu { 478struct ring_buffer_per_cpu {
423 int cpu; 479 int cpu;
480 atomic_t record_disabled;
424 struct ring_buffer *buffer; 481 struct ring_buffer *buffer;
425 spinlock_t reader_lock; /* serialize readers */ 482 spinlock_t reader_lock; /* serialize readers */
426 arch_spinlock_t lock; 483 arch_spinlock_t lock;
@@ -430,6 +487,8 @@ struct ring_buffer_per_cpu {
430 struct buffer_page *tail_page; /* write to tail */ 487 struct buffer_page *tail_page; /* write to tail */
431 struct buffer_page *commit_page; /* committed pages */ 488 struct buffer_page *commit_page; /* committed pages */
432 struct buffer_page *reader_page; 489 struct buffer_page *reader_page;
490 unsigned long lost_events;
491 unsigned long last_overrun;
433 local_t commit_overrun; 492 local_t commit_overrun;
434 local_t overrun; 493 local_t overrun;
435 local_t entries; 494 local_t entries;
@@ -438,7 +497,6 @@ struct ring_buffer_per_cpu {
438 unsigned long read; 497 unsigned long read;
439 u64 write_stamp; 498 u64 write_stamp;
440 u64 read_stamp; 499 u64 read_stamp;
441 atomic_t record_disabled;
442}; 500};
443 501
444struct ring_buffer { 502struct ring_buffer {
@@ -464,6 +522,8 @@ struct ring_buffer_iter {
464 struct ring_buffer_per_cpu *cpu_buffer; 522 struct ring_buffer_per_cpu *cpu_buffer;
465 unsigned long head; 523 unsigned long head;
466 struct buffer_page *head_page; 524 struct buffer_page *head_page;
525 struct buffer_page *cache_reader_page;
526 unsigned long cache_read;
467 u64 read_stamp; 527 u64 read_stamp;
468}; 528};
469 529
@@ -1198,18 +1258,19 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
1198 1258
1199 for (i = 0; i < nr_pages; i++) { 1259 for (i = 0; i < nr_pages; i++) {
1200 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) 1260 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
1201 return; 1261 goto out;
1202 p = cpu_buffer->pages->next; 1262 p = cpu_buffer->pages->next;
1203 bpage = list_entry(p, struct buffer_page, list); 1263 bpage = list_entry(p, struct buffer_page, list);
1204 list_del_init(&bpage->list); 1264 list_del_init(&bpage->list);
1205 free_buffer_page(bpage); 1265 free_buffer_page(bpage);
1206 } 1266 }
1207 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) 1267 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
1208 return; 1268 goto out;
1209 1269
1210 rb_reset_cpu(cpu_buffer); 1270 rb_reset_cpu(cpu_buffer);
1211 rb_check_pages(cpu_buffer); 1271 rb_check_pages(cpu_buffer);
1212 1272
1273out:
1213 spin_unlock_irq(&cpu_buffer->reader_lock); 1274 spin_unlock_irq(&cpu_buffer->reader_lock);
1214} 1275}
1215 1276
@@ -1226,7 +1287,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
1226 1287
1227 for (i = 0; i < nr_pages; i++) { 1288 for (i = 0; i < nr_pages; i++) {
1228 if (RB_WARN_ON(cpu_buffer, list_empty(pages))) 1289 if (RB_WARN_ON(cpu_buffer, list_empty(pages)))
1229 return; 1290 goto out;
1230 p = pages->next; 1291 p = pages->next;
1231 bpage = list_entry(p, struct buffer_page, list); 1292 bpage = list_entry(p, struct buffer_page, list);
1232 list_del_init(&bpage->list); 1293 list_del_init(&bpage->list);
@@ -1235,6 +1296,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
1235 rb_reset_cpu(cpu_buffer); 1296 rb_reset_cpu(cpu_buffer);
1236 rb_check_pages(cpu_buffer); 1297 rb_check_pages(cpu_buffer);
1237 1298
1299out:
1238 spin_unlock_irq(&cpu_buffer->reader_lock); 1300 spin_unlock_irq(&cpu_buffer->reader_lock);
1239} 1301}
1240 1302
@@ -1518,6 +1580,25 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
1518 iter->head = 0; 1580 iter->head = 0;
1519} 1581}
1520 1582
1583/* Slow path, do not inline */
1584static noinline struct ring_buffer_event *
1585rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
1586{
1587 event->type_len = RINGBUF_TYPE_TIME_EXTEND;
1588
1589 /* Not the first event on the page? */
1590 if (rb_event_index(event)) {
1591 event->time_delta = delta & TS_MASK;
1592 event->array[0] = delta >> TS_SHIFT;
1593 } else {
1594 /* nope, just zero it */
1595 event->time_delta = 0;
1596 event->array[0] = 0;
1597 }
1598
1599 return skip_time_extend(event);
1600}
1601
1521/** 1602/**
1522 * ring_buffer_update_event - update event type and data 1603 * ring_buffer_update_event - update event type and data
1523 * @event: the even to update 1604 * @event: the even to update
@@ -1530,28 +1611,31 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
1530 * data field. 1611 * data field.
1531 */ 1612 */
1532static void 1613static void
1533rb_update_event(struct ring_buffer_event *event, 1614rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
1534 unsigned type, unsigned length) 1615 struct ring_buffer_event *event, unsigned length,
1616 int add_timestamp, u64 delta)
1535{ 1617{
1536 event->type_len = type; 1618 /* Only a commit updates the timestamp */
1537 1619 if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
1538 switch (type) { 1620 delta = 0;
1539
1540 case RINGBUF_TYPE_PADDING:
1541 case RINGBUF_TYPE_TIME_EXTEND:
1542 case RINGBUF_TYPE_TIME_STAMP:
1543 break;
1544 1621
1545 case 0: 1622 /*
1546 length -= RB_EVNT_HDR_SIZE; 1623 * If we need to add a timestamp, then we
1547 if (length > RB_MAX_SMALL_DATA) 1624 * add it to the start of the resevered space.
1548 event->array[0] = length; 1625 */
1549 else 1626 if (unlikely(add_timestamp)) {
1550 event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); 1627 event = rb_add_time_stamp(event, delta);
1551 break; 1628 length -= RB_LEN_TIME_EXTEND;
1552 default: 1629 delta = 0;
1553 BUG();
1554 } 1630 }
1631
1632 event->time_delta = delta;
1633 length -= RB_EVNT_HDR_SIZE;
1634 if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) {
1635 event->type_len = 0;
1636 event->array[0] = length;
1637 } else
1638 event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
1555} 1639}
1556 1640
1557/* 1641/*
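
A small userspace sketch of the arithmetic rb_add_time_stamp() above performs: a delta too large for the 27-bit time_delta field is split across time_delta and array[0], and the reader rebuilds it the same way rb_update_write_stamp() does later in this patch. TS_SHIFT is not shown in this hunk; its mainline value of 27 is assumed here.

#include <assert.h>
#include <stdint.h>

#define TS_SHIFT 27                             /* assumed mainline value */
#define TS_MASK  ((1ULL << TS_SHIFT) - 1)

int main(void)
{
        uint64_t delta = 5000000000ULL;         /* ~5s in ns, too big for 27 bits */

        /* Writer side (rb_add_time_stamp): split the delta. */
        uint32_t time_delta = delta & TS_MASK;          /* low 27 bits */
        uint32_t array0     = delta >> TS_SHIFT;        /* remaining high bits */

        /* Reader side (rb_update_write_stamp): put it back together. */
        uint64_t rebuilt = ((uint64_t)array0 << TS_SHIFT) + time_delta;

        assert(rebuilt == delta);
        return 0;
}
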
@@ -1719,11 +1803,11 @@ static unsigned rb_calculate_event_length(unsigned length)
1719 if (!length) 1803 if (!length)
1720 length = 1; 1804 length = 1;
1721 1805
1722 if (length > RB_MAX_SMALL_DATA) 1806 if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
1723 length += sizeof(event.array[0]); 1807 length += sizeof(event.array[0]);
1724 1808
1725 length += RB_EVNT_HDR_SIZE; 1809 length += RB_EVNT_HDR_SIZE;
1726 length = ALIGN(length, RB_ALIGNMENT); 1810 length = ALIGN(length, RB_ARCH_ALIGNMENT);
1727 1811
1728 return length; 1812 return length;
1729} 1813}
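
To make the RB_FORCE_8BYTE_ALIGNMENT change above concrete, here is a standalone sketch of rb_calculate_event_length() with the mainline constants assumed (RB_EVNT_HDR_SIZE = 4, RB_ALIGNMENT = 4, RB_MAX_SMALL_DATA = 4 * 28) and the forced-alignment case selected: a 3-byte payload now always gets the 4-byte length word and rounds up to 16 bytes.

#include <stdio.h>

#define RB_ALIGNMENT             4U
#define RB_MAX_SMALL_DATA        (RB_ALIGNMENT * 28)    /* assumed mainline value */
#define RB_EVNT_HDR_SIZE         4U                     /* offsetof(event, array) */
#define RB_FORCE_8BYTE_ALIGNMENT 1                      /* the forced case */
#define RB_ARCH_ALIGNMENT        8U
#define ALIGN(x, a)              (((x) + (a) - 1) & ~((a) - 1))

static unsigned rb_calculate_event_length(unsigned length)
{
        if (!length)
                length = 1;
        if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
                length += sizeof(unsigned int);         /* room for event.array[0] */
        length += RB_EVNT_HDR_SIZE;
        return ALIGN(length, RB_ARCH_ALIGNMENT);
}

int main(void)
{
        /* 3-byte payload: 3 + 4 (array[0]) + 4 (header) = 11, aligned up to 16.
         * Without forced alignment it would be 3 + 4 = 7, aligned up to 8. */
        printf("%u\n", rb_calculate_event_length(3));
        return 0;
}
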
@@ -1740,6 +1824,14 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1740 * must fill the old tail_page with padding. 1824 * must fill the old tail_page with padding.
1741 */ 1825 */
1742 if (tail >= BUF_PAGE_SIZE) { 1826 if (tail >= BUF_PAGE_SIZE) {
1827 /*
1828 * If the page was filled, then we still need
1829 * to update the real_end. Reset it to zero
1830 * and the reader will ignore it.
1831 */
1832 if (tail == BUF_PAGE_SIZE)
1833 tail_page->real_end = 0;
1834
1743 local_sub(length, &tail_page->write); 1835 local_sub(length, &tail_page->write);
1744 return; 1836 return;
1745 } 1837 }
@@ -1748,6 +1840,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1748 kmemcheck_annotate_bitfield(event, bitfield); 1840 kmemcheck_annotate_bitfield(event, bitfield);
1749 1841
1750 /* 1842 /*
1843 * Save the original length to the meta data.
1844 * This will be used by the reader to add the lost event
1845 * counter.
1846 */
1847 tail_page->real_end = tail;
1848
1849 /*
1751 * If this event is bigger than the minimum size, then 1850 * If this event is bigger than the minimum size, then
1752 * we need to be careful that we don't subtract the 1851 * we need to be careful that we don't subtract the
1753 * write counter enough to allow another writer to slip 1852 * write counter enough to allow another writer to slip
@@ -1780,10 +1879,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1780 local_sub(length, &tail_page->write); 1879 local_sub(length, &tail_page->write);
1781} 1880}
1782 1881
1783static struct ring_buffer_event * 1882/*
1883 * This is the slow path, force gcc not to inline it.
1884 */
1885static noinline struct ring_buffer_event *
1784rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, 1886rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1785 unsigned long length, unsigned long tail, 1887 unsigned long length, unsigned long tail,
1786 struct buffer_page *tail_page, u64 *ts) 1888 struct buffer_page *tail_page, u64 ts)
1787{ 1889{
1788 struct buffer_page *commit_page = cpu_buffer->commit_page; 1890 struct buffer_page *commit_page = cpu_buffer->commit_page;
1789 struct ring_buffer *buffer = cpu_buffer->buffer; 1891 struct ring_buffer *buffer = cpu_buffer->buffer;
@@ -1866,8 +1968,8 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1866 * Nested commits always have zero deltas, so 1968 * Nested commits always have zero deltas, so
1867 * just reread the time stamp 1969 * just reread the time stamp
1868 */ 1970 */
1869 *ts = rb_time_stamp(buffer); 1971 ts = rb_time_stamp(buffer);
1870 next_page->page->time_stamp = *ts; 1972 next_page->page->time_stamp = ts;
1871 } 1973 }
1872 1974
1873 out_again: 1975 out_again:
@@ -1886,12 +1988,21 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1886 1988
1887static struct ring_buffer_event * 1989static struct ring_buffer_event *
1888__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, 1990__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1889 unsigned type, unsigned long length, u64 *ts) 1991 unsigned long length, u64 ts,
1992 u64 delta, int add_timestamp)
1890{ 1993{
1891 struct buffer_page *tail_page; 1994 struct buffer_page *tail_page;
1892 struct ring_buffer_event *event; 1995 struct ring_buffer_event *event;
1893 unsigned long tail, write; 1996 unsigned long tail, write;
1894 1997
1998 /*
1999 * If the time delta since the last event is too big to
2000 * hold in the time field of the event, then we append a
2001 * TIME EXTEND event ahead of the data event.
2002 */
2003 if (unlikely(add_timestamp))
2004 length += RB_LEN_TIME_EXTEND;
2005
1895 tail_page = cpu_buffer->tail_page; 2006 tail_page = cpu_buffer->tail_page;
1896 write = local_add_return(length, &tail_page->write); 2007 write = local_add_return(length, &tail_page->write);
1897 2008
@@ -1900,7 +2011,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1900 tail = write - length; 2011 tail = write - length;
1901 2012
1902 /* See if we shot pass the end of this buffer page */ 2013 /* See if we shot pass the end of this buffer page */
1903 if (write > BUF_PAGE_SIZE) 2014 if (unlikely(write > BUF_PAGE_SIZE))
1904 return rb_move_tail(cpu_buffer, length, tail, 2015 return rb_move_tail(cpu_buffer, length, tail,
1905 tail_page, ts); 2016 tail_page, ts);
1906 2017
@@ -1908,18 +2019,16 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1908 2019
1909 event = __rb_page_index(tail_page, tail); 2020 event = __rb_page_index(tail_page, tail);
1910 kmemcheck_annotate_bitfield(event, bitfield); 2021 kmemcheck_annotate_bitfield(event, bitfield);
1911 rb_update_event(event, type, length); 2022 rb_update_event(cpu_buffer, event, length, add_timestamp, delta);
1912 2023
1913 /* The passed in type is zero for DATA */ 2024 local_inc(&tail_page->entries);
1914 if (likely(!type))
1915 local_inc(&tail_page->entries);
1916 2025
1917 /* 2026 /*
1918 * If this is the first commit on the page, then update 2027 * If this is the first commit on the page, then update
1919 * its timestamp. 2028 * its timestamp.
1920 */ 2029 */
1921 if (!tail) 2030 if (!tail)
1922 tail_page->page->time_stamp = *ts; 2031 tail_page->page->time_stamp = ts;
1923 2032
1924 return event; 2033 return event;
1925} 2034}
@@ -1934,7 +2043,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
1934 unsigned long addr; 2043 unsigned long addr;
1935 2044
1936 new_index = rb_event_index(event); 2045 new_index = rb_event_index(event);
1937 old_index = new_index + rb_event_length(event); 2046 old_index = new_index + rb_event_ts_length(event);
1938 addr = (unsigned long)event; 2047 addr = (unsigned long)event;
1939 addr &= PAGE_MASK; 2048 addr &= PAGE_MASK;
1940 2049
@@ -1960,80 +2069,13 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
1960 return 0; 2069 return 0;
1961} 2070}
1962 2071
1963static int
1964rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1965 u64 *ts, u64 *delta)
1966{
1967 struct ring_buffer_event *event;
1968 static int once;
1969 int ret;
1970
1971 if (unlikely(*delta > (1ULL << 59) && !once++)) {
1972 printk(KERN_WARNING "Delta way too big! %llu"
1973 " ts=%llu write stamp = %llu\n",
1974 (unsigned long long)*delta,
1975 (unsigned long long)*ts,
1976 (unsigned long long)cpu_buffer->write_stamp);
1977 WARN_ON(1);
1978 }
1979
1980 /*
1981 * The delta is too big, we to add a
1982 * new timestamp.
1983 */
1984 event = __rb_reserve_next(cpu_buffer,
1985 RINGBUF_TYPE_TIME_EXTEND,
1986 RB_LEN_TIME_EXTEND,
1987 ts);
1988 if (!event)
1989 return -EBUSY;
1990
1991 if (PTR_ERR(event) == -EAGAIN)
1992 return -EAGAIN;
1993
1994 /* Only a commited time event can update the write stamp */
1995 if (rb_event_is_commit(cpu_buffer, event)) {
1996 /*
1997 * If this is the first on the page, then it was
1998 * updated with the page itself. Try to discard it
1999 * and if we can't just make it zero.
2000 */
2001 if (rb_event_index(event)) {
2002 event->time_delta = *delta & TS_MASK;
2003 event->array[0] = *delta >> TS_SHIFT;
2004 } else {
2005 /* try to discard, since we do not need this */
2006 if (!rb_try_to_discard(cpu_buffer, event)) {
2007 /* nope, just zero it */
2008 event->time_delta = 0;
2009 event->array[0] = 0;
2010 }
2011 }
2012 cpu_buffer->write_stamp = *ts;
2013 /* let the caller know this was the commit */
2014 ret = 1;
2015 } else {
2016 /* Try to discard the event */
2017 if (!rb_try_to_discard(cpu_buffer, event)) {
2018 /* Darn, this is just wasted space */
2019 event->time_delta = 0;
2020 event->array[0] = 0;
2021 }
2022 ret = 0;
2023 }
2024
2025 *delta = 0;
2026
2027 return ret;
2028}
2029
2030static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer) 2072static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
2031{ 2073{
2032 local_inc(&cpu_buffer->committing); 2074 local_inc(&cpu_buffer->committing);
2033 local_inc(&cpu_buffer->commits); 2075 local_inc(&cpu_buffer->commits);
2034} 2076}
2035 2077
2036static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) 2078static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
2037{ 2079{
2038 unsigned long commits; 2080 unsigned long commits;
2039 2081
@@ -2071,9 +2113,10 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2071 unsigned long length) 2113 unsigned long length)
2072{ 2114{
2073 struct ring_buffer_event *event; 2115 struct ring_buffer_event *event;
2074 u64 ts, delta = 0; 2116 u64 ts, delta;
2075 int commit = 0;
2076 int nr_loops = 0; 2117 int nr_loops = 0;
2118 int add_timestamp;
2119 u64 diff;
2077 2120
2078 rb_start_commit(cpu_buffer); 2121 rb_start_commit(cpu_buffer);
2079 2122
@@ -2094,6 +2137,9 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2094 2137
2095 length = rb_calculate_event_length(length); 2138 length = rb_calculate_event_length(length);
2096 again: 2139 again:
2140 add_timestamp = 0;
2141 delta = 0;
2142
2097 /* 2143 /*
2098 * We allow for interrupts to reenter here and do a trace. 2144 * We allow for interrupts to reenter here and do a trace.
2099 * If one does, it will cause this original code to loop 2145 * If one does, it will cause this original code to loop
@@ -2107,56 +2153,32 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2107 goto out_fail; 2153 goto out_fail;
2108 2154
2109 ts = rb_time_stamp(cpu_buffer->buffer); 2155 ts = rb_time_stamp(cpu_buffer->buffer);
2156 diff = ts - cpu_buffer->write_stamp;
2110 2157
2111 /* 2158 /* make sure this diff is calculated here */
2112 * Only the first commit can update the timestamp. 2159 barrier();
2113 * Yes there is a race here. If an interrupt comes in
2114 * just after the conditional and it traces too, then it
2115 * will also check the deltas. More than one timestamp may
2116 * also be made. But only the entry that did the actual
2117 * commit will be something other than zero.
2118 */
2119 if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page &&
2120 rb_page_write(cpu_buffer->tail_page) ==
2121 rb_commit_index(cpu_buffer))) {
2122 u64 diff;
2123
2124 diff = ts - cpu_buffer->write_stamp;
2125
2126 /* make sure this diff is calculated here */
2127 barrier();
2128
2129 /* Did the write stamp get updated already? */
2130 if (unlikely(ts < cpu_buffer->write_stamp))
2131 goto get_event;
2132 2160
2161 /* Did the write stamp get updated already? */
2162 if (likely(ts >= cpu_buffer->write_stamp)) {
2133 delta = diff; 2163 delta = diff;
2134 if (unlikely(test_time_stamp(delta))) { 2164 if (unlikely(test_time_stamp(delta))) {
2135 2165 WARN_ONCE(delta > (1ULL << 59),
2136 commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); 2166 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
2137 if (commit == -EBUSY) 2167 (unsigned long long)delta,
2138 goto out_fail; 2168 (unsigned long long)ts,
2139 2169 (unsigned long long)cpu_buffer->write_stamp);
2140 if (commit == -EAGAIN) 2170 add_timestamp = 1;
2141 goto again;
2142
2143 RB_WARN_ON(cpu_buffer, commit < 0);
2144 } 2171 }
2145 } 2172 }
2146 2173
2147 get_event: 2174 event = __rb_reserve_next(cpu_buffer, length, ts,
2148 event = __rb_reserve_next(cpu_buffer, 0, length, &ts); 2175 delta, add_timestamp);
2149 if (unlikely(PTR_ERR(event) == -EAGAIN)) 2176 if (unlikely(PTR_ERR(event) == -EAGAIN))
2150 goto again; 2177 goto again;
2151 2178
2152 if (!event) 2179 if (!event)
2153 goto out_fail; 2180 goto out_fail;
2154 2181
2155 if (!rb_event_is_commit(cpu_buffer, event))
2156 delta = 0;
2157
2158 event->time_delta = delta;
2159
2160 return event; 2182 return event;
2161 2183
2162 out_fail: 2184 out_fail:
@@ -2168,13 +2190,9 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2168 2190
2169#define TRACE_RECURSIVE_DEPTH 16 2191#define TRACE_RECURSIVE_DEPTH 16
2170 2192
2171static int trace_recursive_lock(void) 2193/* Keep this code out of the fast path cache */
2194static noinline void trace_recursive_fail(void)
2172{ 2195{
2173 current->trace_recursion++;
2174
2175 if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
2176 return 0;
2177
2178 /* Disable all tracing before we do anything else */ 2196 /* Disable all tracing before we do anything else */
2179 tracing_off_permanent(); 2197 tracing_off_permanent();
2180 2198
@@ -2186,10 +2204,21 @@ static int trace_recursive_lock(void)
2186 in_nmi()); 2204 in_nmi());
2187 2205
2188 WARN_ON_ONCE(1); 2206 WARN_ON_ONCE(1);
2207}
2208
2209static inline int trace_recursive_lock(void)
2210{
2211 current->trace_recursion++;
2212
2213 if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
2214 return 0;
2215
2216 trace_recursive_fail();
2217
2189 return -1; 2218 return -1;
2190} 2219}
2191 2220
2192static void trace_recursive_unlock(void) 2221static inline void trace_recursive_unlock(void)
2193{ 2222{
2194 WARN_ON_ONCE(!current->trace_recursion); 2223 WARN_ON_ONCE(!current->trace_recursion);
2195 2224
@@ -2203,8 +2232,6 @@ static void trace_recursive_unlock(void)
2203 2232
2204#endif 2233#endif
2205 2234
2206static DEFINE_PER_CPU(int, rb_need_resched);
2207
2208/** 2235/**
2209 * ring_buffer_lock_reserve - reserve a part of the buffer 2236 * ring_buffer_lock_reserve - reserve a part of the buffer
2210 * @buffer: the ring buffer to reserve from 2237 * @buffer: the ring buffer to reserve from
@@ -2225,16 +2252,16 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
2225{ 2252{
2226 struct ring_buffer_per_cpu *cpu_buffer; 2253 struct ring_buffer_per_cpu *cpu_buffer;
2227 struct ring_buffer_event *event; 2254 struct ring_buffer_event *event;
2228 int cpu, resched; 2255 int cpu;
2229 2256
2230 if (ring_buffer_flags != RB_BUFFERS_ON) 2257 if (ring_buffer_flags != RB_BUFFERS_ON)
2231 return NULL; 2258 return NULL;
2232 2259
2233 if (atomic_read(&buffer->record_disabled))
2234 return NULL;
2235
2236 /* If we are tracing schedule, we don't want to recurse */ 2260 /* If we are tracing schedule, we don't want to recurse */
2237 resched = ftrace_preempt_disable(); 2261 preempt_disable_notrace();
2262
2263 if (atomic_read(&buffer->record_disabled))
2264 goto out_nocheck;
2238 2265
2239 if (trace_recursive_lock()) 2266 if (trace_recursive_lock())
2240 goto out_nocheck; 2267 goto out_nocheck;
@@ -2256,21 +2283,13 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
2256 if (!event) 2283 if (!event)
2257 goto out; 2284 goto out;
2258 2285
2259 /*
2260 * Need to store resched state on this cpu.
2261 * Only the first needs to.
2262 */
2263
2264 if (preempt_count() == 1)
2265 per_cpu(rb_need_resched, cpu) = resched;
2266
2267 return event; 2286 return event;
2268 2287
2269 out: 2288 out:
2270 trace_recursive_unlock(); 2289 trace_recursive_unlock();
2271 2290
2272 out_nocheck: 2291 out_nocheck:
2273 ftrace_preempt_enable(resched); 2292 preempt_enable_notrace();
2274 return NULL; 2293 return NULL;
2275} 2294}
2276EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); 2295EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
@@ -2279,12 +2298,28 @@ static void
2279rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer, 2298rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
2280 struct ring_buffer_event *event) 2299 struct ring_buffer_event *event)
2281{ 2300{
2301 u64 delta;
2302
2282 /* 2303 /*
2283 * The event first in the commit queue updates the 2304 * The event first in the commit queue updates the
2284 * time stamp. 2305 * time stamp.
2285 */ 2306 */
2286 if (rb_event_is_commit(cpu_buffer, event)) 2307 if (rb_event_is_commit(cpu_buffer, event)) {
2287 cpu_buffer->write_stamp += event->time_delta; 2308 /*
2309 * A commit event that is first on a page
2310 * updates the write timestamp with the page stamp
2311 */
2312 if (!rb_event_index(event))
2313 cpu_buffer->write_stamp =
2314 cpu_buffer->commit_page->page->time_stamp;
2315 else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
2316 delta = event->array[0];
2317 delta <<= TS_SHIFT;
2318 delta += event->time_delta;
2319 cpu_buffer->write_stamp += delta;
2320 } else
2321 cpu_buffer->write_stamp += event->time_delta;
2322 }
2288} 2323}
2289 2324
2290static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, 2325static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
@@ -2316,13 +2351,7 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
2316 2351
2317 trace_recursive_unlock(); 2352 trace_recursive_unlock();
2318 2353
2319 /* 2354 preempt_enable_notrace();
2320 * Only the last preempt count needs to restore preemption.
2321 */
2322 if (preempt_count() == 1)
2323 ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
2324 else
2325 preempt_enable_no_resched_notrace();
2326 2355
2327 return 0; 2356 return 0;
2328} 2357}
@@ -2330,6 +2359,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
2330 2359
2331static inline void rb_event_discard(struct ring_buffer_event *event) 2360static inline void rb_event_discard(struct ring_buffer_event *event)
2332{ 2361{
2362 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
2363 event = skip_time_extend(event);
2364
2333 /* array[0] holds the actual length for the discarded event */ 2365 /* array[0] holds the actual length for the discarded event */
2334 event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; 2366 event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
2335 event->type_len = RINGBUF_TYPE_PADDING; 2367 event->type_len = RINGBUF_TYPE_PADDING;
@@ -2430,13 +2462,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
2430 2462
2431 trace_recursive_unlock(); 2463 trace_recursive_unlock();
2432 2464
2433 /* 2465 preempt_enable_notrace();
2434 * Only the last preempt count needs to restore preemption.
2435 */
2436 if (preempt_count() == 1)
2437 ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
2438 else
2439 preempt_enable_no_resched_notrace();
2440 2466
2441} 2467}
2442EXPORT_SYMBOL_GPL(ring_buffer_discard_commit); 2468EXPORT_SYMBOL_GPL(ring_buffer_discard_commit);
@@ -2462,15 +2488,15 @@ int ring_buffer_write(struct ring_buffer *buffer,
2462 struct ring_buffer_event *event; 2488 struct ring_buffer_event *event;
2463 void *body; 2489 void *body;
2464 int ret = -EBUSY; 2490 int ret = -EBUSY;
2465 int cpu, resched; 2491 int cpu;
2466 2492
2467 if (ring_buffer_flags != RB_BUFFERS_ON) 2493 if (ring_buffer_flags != RB_BUFFERS_ON)
2468 return -EBUSY; 2494 return -EBUSY;
2469 2495
2470 if (atomic_read(&buffer->record_disabled)) 2496 preempt_disable_notrace();
2471 return -EBUSY;
2472 2497
2473 resched = ftrace_preempt_disable(); 2498 if (atomic_read(&buffer->record_disabled))
2499 goto out;
2474 2500
2475 cpu = raw_smp_processor_id(); 2501 cpu = raw_smp_processor_id();
2476 2502
@@ -2497,7 +2523,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
2497 2523
2498 ret = 0; 2524 ret = 0;
2499 out: 2525 out:
2500 ftrace_preempt_enable(resched); 2526 preempt_enable_notrace();
2501 2527
2502 return ret; 2528 return ret;
2503} 2529}
@@ -2539,7 +2565,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable);
2539 * @buffer: The ring buffer to enable writes 2565 * @buffer: The ring buffer to enable writes
2540 * 2566 *
2541 * Note, multiple disables will need the same number of enables 2567 * Note, multiple disables will need the same number of enables
2542 * to truely enable the writing (much like preempt_disable). 2568 * to truly enable the writing (much like preempt_disable).
2543 */ 2569 */
2544void ring_buffer_record_enable(struct ring_buffer *buffer) 2570void ring_buffer_record_enable(struct ring_buffer *buffer)
2545{ 2571{
@@ -2575,7 +2601,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu);
2575 * @cpu: The CPU to enable. 2601 * @cpu: The CPU to enable.
2576 * 2602 *
2577 * Note, multiple disables will need the same number of enables 2603 * Note, multiple disables will need the same number of enables
2578 * to truely enable the writing (much like preempt_disable). 2604 * to truly enable the writing (much like preempt_disable).
2579 */ 2605 */
2580void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) 2606void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
2581{ 2607{
@@ -2589,6 +2615,19 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
2589} 2615}
2590EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); 2616EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
2591 2617
2618/*
2619 * The total entries in the ring buffer is the running counter
2620 * of entries entered into the ring buffer, minus the sum of
2621 * the entries read from the ring buffer and the number of
2622 * entries that were overwritten.
2623 */
2624static inline unsigned long
2625rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer)
2626{
2627 return local_read(&cpu_buffer->entries) -
2628 (local_read(&cpu_buffer->overrun) + cpu_buffer->read);
2629}
2630
2592/** 2631/**
2593 * ring_buffer_entries_cpu - get the number of entries in a cpu buffer 2632 * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
2594 * @buffer: The ring buffer 2633 * @buffer: The ring buffer
@@ -2597,16 +2636,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
2597unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) 2636unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
2598{ 2637{
2599 struct ring_buffer_per_cpu *cpu_buffer; 2638 struct ring_buffer_per_cpu *cpu_buffer;
2600 unsigned long ret;
2601 2639
2602 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 2640 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2603 return 0; 2641 return 0;
2604 2642
2605 cpu_buffer = buffer->buffers[cpu]; 2643 cpu_buffer = buffer->buffers[cpu];
2606 ret = (local_read(&cpu_buffer->entries) - local_read(&cpu_buffer->overrun))
2607 - cpu_buffer->read;
2608 2644
2609 return ret; 2645 return rb_num_of_entries(cpu_buffer);
2610} 2646}
2611EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); 2647EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
2612 2648
@@ -2667,8 +2703,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
2667 /* if you care about this being correct, lock the buffer */ 2703 /* if you care about this being correct, lock the buffer */
2668 for_each_buffer_cpu(buffer, cpu) { 2704 for_each_buffer_cpu(buffer, cpu) {
2669 cpu_buffer = buffer->buffers[cpu]; 2705 cpu_buffer = buffer->buffers[cpu];
2670 entries += (local_read(&cpu_buffer->entries) - 2706 entries += rb_num_of_entries(cpu_buffer);
2671 local_read(&cpu_buffer->overrun)) - cpu_buffer->read;
2672 } 2707 }
2673 2708
2674 return entries; 2709 return entries;
@@ -2716,6 +2751,8 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
2716 iter->read_stamp = cpu_buffer->read_stamp; 2751 iter->read_stamp = cpu_buffer->read_stamp;
2717 else 2752 else
2718 iter->read_stamp = iter->head_page->page->time_stamp; 2753 iter->read_stamp = iter->head_page->page->time_stamp;
2754 iter->cache_reader_page = cpu_buffer->reader_page;
2755 iter->cache_read = cpu_buffer->read;
2719} 2756}
2720 2757
2721/** 2758/**
@@ -2822,6 +2859,7 @@ static struct buffer_page *
2822rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) 2859rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2823{ 2860{
2824 struct buffer_page *reader = NULL; 2861 struct buffer_page *reader = NULL;
2862 unsigned long overwrite;
2825 unsigned long flags; 2863 unsigned long flags;
2826 int nr_loops = 0; 2864 int nr_loops = 0;
2827 int ret; 2865 int ret;
@@ -2863,6 +2901,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2863 local_set(&cpu_buffer->reader_page->write, 0); 2901 local_set(&cpu_buffer->reader_page->write, 0);
2864 local_set(&cpu_buffer->reader_page->entries, 0); 2902 local_set(&cpu_buffer->reader_page->entries, 0);
2865 local_set(&cpu_buffer->reader_page->page->commit, 0); 2903 local_set(&cpu_buffer->reader_page->page->commit, 0);
2904 cpu_buffer->reader_page->real_end = 0;
2866 2905
2867 spin: 2906 spin:
2868 /* 2907 /*
@@ -2883,6 +2922,18 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2883 rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list); 2922 rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list);
2884 2923
2885 /* 2924 /*
2925 * We want to make sure we read the overruns after we set up our
2926 * pointers to the next object. The writer side does a
2927 * cmpxchg to cross pages which acts as the mb on the writer
2928 * side. Note, the reader will constantly fail the swap
2929 * while the writer is updating the pointers, so this
2930 * guarantees that the overwrite recorded here is the one we
2931 * want to compare with the last_overrun.
2932 */
2933 smp_mb();
2934 overwrite = local_read(&(cpu_buffer->overrun));
2935
2936 /*
2886 * Here's the tricky part. 2937 * Here's the tricky part.
2887 * 2938 *
2888 * We need to move the pointer past the header page. 2939 * We need to move the pointer past the header page.
@@ -2913,6 +2964,11 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2913 cpu_buffer->reader_page = reader; 2964 cpu_buffer->reader_page = reader;
2914 rb_reset_reader_page(cpu_buffer); 2965 rb_reset_reader_page(cpu_buffer);
2915 2966
2967 if (overwrite != cpu_buffer->last_overrun) {
2968 cpu_buffer->lost_events = overwrite - cpu_buffer->last_overrun;
2969 cpu_buffer->last_overrun = overwrite;
2970 }
2971
2916 goto again; 2972 goto again;
2917 2973
2918 out: 2974 out:
@@ -2947,13 +3003,11 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
2947 3003
2948static void rb_advance_iter(struct ring_buffer_iter *iter) 3004static void rb_advance_iter(struct ring_buffer_iter *iter)
2949{ 3005{
2950 struct ring_buffer *buffer;
2951 struct ring_buffer_per_cpu *cpu_buffer; 3006 struct ring_buffer_per_cpu *cpu_buffer;
2952 struct ring_buffer_event *event; 3007 struct ring_buffer_event *event;
2953 unsigned length; 3008 unsigned length;
2954 3009
2955 cpu_buffer = iter->cpu_buffer; 3010 cpu_buffer = iter->cpu_buffer;
2956 buffer = cpu_buffer->buffer;
2957 3011
2958 /* 3012 /*
2959 * Check if we are at the end of the buffer. 3013 * Check if we are at the end of the buffer.
@@ -2989,8 +3043,14 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
2989 rb_advance_iter(iter); 3043 rb_advance_iter(iter);
2990} 3044}
2991 3045
3046static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer)
3047{
3048 return cpu_buffer->lost_events;
3049}
3050
2992static struct ring_buffer_event * 3051static struct ring_buffer_event *
2993rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts) 3052rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
3053 unsigned long *lost_events)
2994{ 3054{
2995 struct ring_buffer_event *event; 3055 struct ring_buffer_event *event;
2996 struct buffer_page *reader; 3056 struct buffer_page *reader;
@@ -2998,12 +3058,12 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts)
2998 3058
2999 again: 3059 again:
3000 /* 3060 /*
3001 * We repeat when a timestamp is encountered. It is possible 3061 * We repeat when a time extend is encountered.
3002 * to get multiple timestamps from an interrupt entering just 3062 * Since the time extend is always attached to a data event,
3003 * as one timestamp is about to be written, or from discarded 3063 * we should never loop more than once.
3004 * commits. The most that we can have is the number on a single page. 3064 * (We never hit the following condition more than twice).
3005 */ 3065 */
3006 if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) 3066 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
3007 return NULL; 3067 return NULL;
3008 3068
3009 reader = rb_get_reader_page(cpu_buffer); 3069 reader = rb_get_reader_page(cpu_buffer);
@@ -3042,6 +3102,8 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts)
3042 ring_buffer_normalize_time_stamp(cpu_buffer->buffer, 3102 ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
3043 cpu_buffer->cpu, ts); 3103 cpu_buffer->cpu, ts);
3044 } 3104 }
3105 if (lost_events)
3106 *lost_events = rb_lost_events(cpu_buffer);
3045 return event; 3107 return event;
3046 3108
3047 default: 3109 default:
@@ -3060,27 +3122,39 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3060 struct ring_buffer_event *event; 3122 struct ring_buffer_event *event;
3061 int nr_loops = 0; 3123 int nr_loops = 0;
3062 3124
3063 if (ring_buffer_iter_empty(iter))
3064 return NULL;
3065
3066 cpu_buffer = iter->cpu_buffer; 3125 cpu_buffer = iter->cpu_buffer;
3067 buffer = cpu_buffer->buffer; 3126 buffer = cpu_buffer->buffer;
3068 3127
3128 /*
3129 * Check if someone performed a consuming read to
3130 * the buffer. A consuming read invalidates the iterator
3131 * and we need to reset the iterator in this case.
3132 */
3133 if (unlikely(iter->cache_read != cpu_buffer->read ||
3134 iter->cache_reader_page != cpu_buffer->reader_page))
3135 rb_iter_reset(iter);
3136
3069 again: 3137 again:
3138 if (ring_buffer_iter_empty(iter))
3139 return NULL;
3140
3070 /* 3141 /*
3071 * We repeat when a timestamp is encountered. 3142 * We repeat when a time extend is encountered.
3072 * We can get multiple timestamps by nested interrupts or also 3143 * Since the time extend is always attached to a data event,
3073 * if filtering is on (discarding commits). Since discarding 3144 * we should never loop more than once.
3074 * commits can be frequent we can get a lot of timestamps. 3145 * (We never hit the following condition more than twice).
3075 * But we limit them by not adding timestamps if they begin
3076 * at the start of a page.
3077 */ 3146 */
3078 if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE)) 3147 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
3079 return NULL; 3148 return NULL;
3080 3149
3081 if (rb_per_cpu_empty(cpu_buffer)) 3150 if (rb_per_cpu_empty(cpu_buffer))
3082 return NULL; 3151 return NULL;
3083 3152
3153 if (iter->head >= local_read(&iter->head_page->page->commit)) {
3154 rb_inc_iter(iter);
3155 goto again;
3156 }
3157
3084 event = rb_iter_head_event(iter); 3158 event = rb_iter_head_event(iter);
3085 3159
3086 switch (event->type_len) { 3160 switch (event->type_len) {
@@ -3138,12 +3212,14 @@ static inline int rb_ok_to_lock(void)
3138 * @buffer: The ring buffer to read 3212 * @buffer: The ring buffer to read
3139 * @cpu: The cpu to peak at 3213 * @cpu: The cpu to peak at
3140 * @ts: The timestamp counter of this event. 3214 * @ts: The timestamp counter of this event.
3215 * @lost_events: a variable to store if events were lost (may be NULL)
3141 * 3216 *
3142 * This will return the event that will be read next, but does 3217 * This will return the event that will be read next, but does
3143 * not consume the data. 3218 * not consume the data.
3144 */ 3219 */
3145struct ring_buffer_event * 3220struct ring_buffer_event *
3146ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) 3221ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
3222 unsigned long *lost_events)
3147{ 3223{
3148 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; 3224 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
3149 struct ring_buffer_event *event; 3225 struct ring_buffer_event *event;
@@ -3158,7 +3234,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
3158 local_irq_save(flags); 3234 local_irq_save(flags);
3159 if (dolock) 3235 if (dolock)
3160 spin_lock(&cpu_buffer->reader_lock); 3236 spin_lock(&cpu_buffer->reader_lock);
3161 event = rb_buffer_peek(cpu_buffer, ts); 3237 event = rb_buffer_peek(cpu_buffer, ts, lost_events);
3162 if (event && event->type_len == RINGBUF_TYPE_PADDING) 3238 if (event && event->type_len == RINGBUF_TYPE_PADDING)
3163 rb_advance_reader(cpu_buffer); 3239 rb_advance_reader(cpu_buffer);
3164 if (dolock) 3240 if (dolock)
@@ -3200,13 +3276,17 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3200/** 3276/**
3201 * ring_buffer_consume - return an event and consume it 3277 * ring_buffer_consume - return an event and consume it
3202 * @buffer: The ring buffer to get the next event from 3278 * @buffer: The ring buffer to get the next event from
3279 * @cpu: the cpu to read the buffer from
3280 * @ts: a variable to store the timestamp (may be NULL)
3281 * @lost_events: a variable to store if events were lost (may be NULL)
3203 * 3282 *
3204 * Returns the next event in the ring buffer, and that event is consumed. 3283 * Returns the next event in the ring buffer, and that event is consumed.
3205 * Meaning, that sequential reads will keep returning a different event, 3284 * Meaning, that sequential reads will keep returning a different event,
3206 * and eventually empty the ring buffer if the producer is slower. 3285 * and eventually empty the ring buffer if the producer is slower.
3207 */ 3286 */
3208struct ring_buffer_event * 3287struct ring_buffer_event *
3209ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) 3288ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
3289 unsigned long *lost_events)
3210{ 3290{
3211 struct ring_buffer_per_cpu *cpu_buffer; 3291 struct ring_buffer_per_cpu *cpu_buffer;
3212 struct ring_buffer_event *event = NULL; 3292 struct ring_buffer_event *event = NULL;
@@ -3227,9 +3307,11 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
3227 if (dolock) 3307 if (dolock)
3228 spin_lock(&cpu_buffer->reader_lock); 3308 spin_lock(&cpu_buffer->reader_lock);
3229 3309
3230 event = rb_buffer_peek(cpu_buffer, ts); 3310 event = rb_buffer_peek(cpu_buffer, ts, lost_events);
3231 if (event) 3311 if (event) {
3312 cpu_buffer->lost_events = 0;
3232 rb_advance_reader(cpu_buffer); 3313 rb_advance_reader(cpu_buffer);
3314 }
3233 3315
3234 if (dolock) 3316 if (dolock)
3235 spin_unlock(&cpu_buffer->reader_lock); 3317 spin_unlock(&cpu_buffer->reader_lock);
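
A kernel-context sketch (an assumption-laden illustration, not code from this patch) of how a caller drains one CPU with the extended ring_buffer_consume() signature used just above and inspects the new lost_events count; 'buffer' is assumed to be a valid ring buffer.

#include <linux/kernel.h>
#include <linux/ring_buffer.h>

static void drain_cpu(struct ring_buffer *buffer, int cpu)
{
        struct ring_buffer_event *event;
        unsigned long lost_events;
        u64 ts;

        while ((event = ring_buffer_consume(buffer, cpu, &ts, &lost_events))) {
                if (lost_events)
                        pr_info("cpu%d: %lu events lost before this one\n",
                                cpu, lost_events);
                pr_info("cpu%d: ts=%llu payload=%u bytes\n", cpu,
                        (unsigned long long)ts,
                        ring_buffer_event_length(event));
        }
}
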
@@ -3246,23 +3328,30 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
3246EXPORT_SYMBOL_GPL(ring_buffer_consume); 3328EXPORT_SYMBOL_GPL(ring_buffer_consume);
3247 3329
3248/** 3330/**
3249 * ring_buffer_read_start - start a non consuming read of the buffer 3331 * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer
3250 * @buffer: The ring buffer to read from 3332 * @buffer: The ring buffer to read from
3251 * @cpu: The cpu buffer to iterate over 3333 * @cpu: The cpu buffer to iterate over
3252 * 3334 *
3253 * This starts up an iteration through the buffer. It also disables 3335 * This performs the initial preparations necessary to iterate
3254 * the recording to the buffer until the reading is finished. 3336 * through the buffer. Memory is allocated, buffer recording
3255 * This prevents the reading from being corrupted. This is not 3337 * is disabled, and the iterator pointer is returned to the caller.
3256 * a consuming read, so a producer is not expected.
3257 * 3338 *
3258 * Must be paired with ring_buffer_finish. 3339 * Disabling buffer recording prevents the reading from being
3340 * corrupted. This is not a consuming read, so a producer is not
3341 * expected.
3342 *
3343 * After a sequence of ring_buffer_read_prepare calls, the user is
3344 * expected to make at least one call to ring_buffer_read_prepare_sync.
3345 * Afterwards, ring_buffer_read_start is invoked to get things going
3346 * for real.
3347 *
3348 * This overall must be paired with ring_buffer_finish.
3259 */ 3349 */
3260struct ring_buffer_iter * 3350struct ring_buffer_iter *
3261ring_buffer_read_start(struct ring_buffer *buffer, int cpu) 3351ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu)
3262{ 3352{
3263 struct ring_buffer_per_cpu *cpu_buffer; 3353 struct ring_buffer_per_cpu *cpu_buffer;
3264 struct ring_buffer_iter *iter; 3354 struct ring_buffer_iter *iter;
3265 unsigned long flags;
3266 3355
3267 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 3356 if (!cpumask_test_cpu(cpu, buffer->cpumask))
3268 return NULL; 3357 return NULL;
@@ -3276,15 +3365,52 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
3276 iter->cpu_buffer = cpu_buffer; 3365 iter->cpu_buffer = cpu_buffer;
3277 3366
3278 atomic_inc(&cpu_buffer->record_disabled); 3367 atomic_inc(&cpu_buffer->record_disabled);
3368
3369 return iter;
3370}
3371EXPORT_SYMBOL_GPL(ring_buffer_read_prepare);
3372
3373/**
3374 * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls
3375 *
3376 * All previously invoked ring_buffer_read_prepare calls to prepare
3377 * iterators will be synchronized. Afterwards, ring_buffer_read_start
3378 * calls on those iterators are allowed.
3379 */
3380void
3381ring_buffer_read_prepare_sync(void)
3382{
3279 synchronize_sched(); 3383 synchronize_sched();
3384}
3385EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);
3386
3387/**
3388 * ring_buffer_read_start - start a non consuming read of the buffer
3389 * @iter: The iterator returned by ring_buffer_read_prepare
3390 *
3391 * This finalizes the startup of an iteration through the buffer.
3392 * The iterator comes from a call to ring_buffer_read_prepare and
3393 * an intervening ring_buffer_read_prepare_sync must have been
3394 * performed.
3395 *
3396 * Must be paired with ring_buffer_finish.
3397 */
3398void
3399ring_buffer_read_start(struct ring_buffer_iter *iter)
3400{
3401 struct ring_buffer_per_cpu *cpu_buffer;
3402 unsigned long flags;
3403
3404 if (!iter)
3405 return;
3406
3407 cpu_buffer = iter->cpu_buffer;
3280 3408
3281 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3409 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3282 arch_spin_lock(&cpu_buffer->lock); 3410 arch_spin_lock(&cpu_buffer->lock);
3283 rb_iter_reset(iter); 3411 rb_iter_reset(iter);
3284 arch_spin_unlock(&cpu_buffer->lock); 3412 arch_spin_unlock(&cpu_buffer->lock);
3285 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3413 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3286
3287 return iter;
3288} 3414}
3289EXPORT_SYMBOL_GPL(ring_buffer_read_start); 3415EXPORT_SYMBOL_GPL(ring_buffer_read_start);
3290 3416
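
A kernel-context sketch of the split iterator API introduced above: the point of the split is that a caller can prepare several per-cpu iterators and pay for only one synchronize_sched(). The tear-down call is assumed to be ring_buffer_read_finish(), and 'buffer' is assumed valid.

#include <linux/kernel.h>
#include <linux/ring_buffer.h>

static void walk_cpu(struct ring_buffer *buffer, int cpu)
{
        struct ring_buffer_iter *iter;
        struct ring_buffer_event *event;
        u64 ts;

        iter = ring_buffer_read_prepare(buffer, cpu);
        if (!iter)
                return;

        /* One sync covers every iterator prepared so far. */
        ring_buffer_read_prepare_sync();
        ring_buffer_read_start(iter);

        /* Non-consuming read: events stay in the buffer for other readers. */
        while ((event = ring_buffer_read(iter, &ts)))
                pr_info("cpu%d: event at ts=%llu\n", cpu,
                        (unsigned long long)ts);

        ring_buffer_read_finish(iter);  /* assumed tear-down counterpart */
}
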
@@ -3378,6 +3504,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
3378 cpu_buffer->write_stamp = 0; 3504 cpu_buffer->write_stamp = 0;
3379 cpu_buffer->read_stamp = 0; 3505 cpu_buffer->read_stamp = 0;
3380 3506
3507 cpu_buffer->lost_events = 0;
3508 cpu_buffer->last_overrun = 0;
3509
3381 rb_head_page_activate(cpu_buffer); 3510 rb_head_page_activate(cpu_buffer);
3382} 3511}
3383 3512
@@ -3653,6 +3782,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3653 struct ring_buffer_event *event; 3782 struct ring_buffer_event *event;
3654 struct buffer_data_page *bpage; 3783 struct buffer_data_page *bpage;
3655 struct buffer_page *reader; 3784 struct buffer_page *reader;
3785 unsigned long missed_events;
3656 unsigned long flags; 3786 unsigned long flags;
3657 unsigned int commit; 3787 unsigned int commit;
3658 unsigned int read; 3788 unsigned int read;
@@ -3689,6 +3819,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3689 read = reader->read; 3819 read = reader->read;
3690 commit = rb_page_commit(reader); 3820 commit = rb_page_commit(reader);
3691 3821
3822 /* Check if any events were dropped */
3823 missed_events = cpu_buffer->lost_events;
3824
3692 /* 3825 /*
3693 * If this page has been partially read or 3826 * If this page has been partially read or
3694 * if len is not big enough to read the rest of the page or 3827 * if len is not big enough to read the rest of the page or
@@ -3709,7 +3842,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3709 if (len > (commit - read)) 3842 if (len > (commit - read))
3710 len = (commit - read); 3843 len = (commit - read);
3711 3844
3712 size = rb_event_length(event); 3845 /* Always keep the time extend and data together */
3846 size = rb_event_ts_length(event);
3713 3847
3714 if (len < size) 3848 if (len < size)
3715 goto out_unlock; 3849 goto out_unlock;
@@ -3719,6 +3853,13 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3719 3853
3720 /* Need to copy one event at a time */ 3854 /* Need to copy one event at a time */
3721 do { 3855 do {
3856 /* We need the size of one event, because
3857 * rb_advance_reader only advances by one event,
3858 * whereas rb_event_ts_length may include the size of
3859 * one or two events.
3860 * We have already ensured there's enough space if this
3861 * is a time extend. */
3862 size = rb_event_length(event);
3722 memcpy(bpage->data + pos, rpage->data + rpos, size); 3863 memcpy(bpage->data + pos, rpage->data + rpos, size);
3723 3864
3724 len -= size; 3865 len -= size;
@@ -3727,9 +3868,13 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3727 rpos = reader->read; 3868 rpos = reader->read;
3728 pos += size; 3869 pos += size;
3729 3870
3871 if (rpos >= commit)
3872 break;
3873
3730 event = rb_reader_event(cpu_buffer); 3874 event = rb_reader_event(cpu_buffer);
3731 size = rb_event_length(event); 3875 /* Always keep the time extend and data together */
3732 } while (len > size); 3876 size = rb_event_ts_length(event);
3877 } while (len >= size);
3733 3878
3734 /* update bpage */ 3879 /* update bpage */
3735 local_set(&bpage->commit, pos); 3880 local_set(&bpage->commit, pos);
@@ -3749,9 +3894,42 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3749 local_set(&reader->entries, 0); 3894 local_set(&reader->entries, 0);
3750 reader->read = 0; 3895 reader->read = 0;
3751 *data_page = bpage; 3896 *data_page = bpage;
3897
3898 /*
3899 * Use the real_end for the data size.
3900 * This gives us a chance to store the lost events
3901 * on the page.
3902 */
3903 if (reader->real_end)
3904 local_set(&bpage->commit, reader->real_end);
3752 } 3905 }
3753 ret = read; 3906 ret = read;
3754 3907
3908 cpu_buffer->lost_events = 0;
3909
3910 commit = local_read(&bpage->commit);
3911 /*
3912 * Set a flag in the commit field if we lost events
3913 */
3914 if (missed_events) {
3915 /* If there is room at the end of the page to save the
3916 * missed events, then record it there.
3917 */
3918 if (BUF_PAGE_SIZE - commit >= sizeof(missed_events)) {
3919 memcpy(&bpage->data[commit], &missed_events,
3920 sizeof(missed_events));
3921 local_add(RB_MISSED_STORED, &bpage->commit);
3922 commit += sizeof(missed_events);
3923 }
3924 local_add(RB_MISSED_EVENTS, &bpage->commit);
3925 }
3926
3927 /*
3928 * This page may be off to user land. Zero it out here.
3929 */
3930 if (commit < BUF_PAGE_SIZE)
3931 memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit);
3932
3755 out_unlock: 3933 out_unlock:
3756 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3934 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3757 3935
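
A sketch (kernel context, relying on the struct buffer_data_page and RB_MISSED_* definitions shown earlier in this file) of how a consumer of ring_buffer_read_page() can decode the commit word written above: the low bits hold the length of the copied data, the two flag bits report lost events, and the count itself, when it fit, sits right after the data.

static unsigned long page_lost_events(struct buffer_data_page *bpage)
{
        unsigned int commit = local_read(&bpage->commit);
        unsigned long missed = 0;

        if (commit & RB_MISSED_EVENTS) {
                if (commit & RB_MISSED_STORED) {
                        /* Data length is the commit word minus the flag bits. */
                        unsigned int len =
                                commit & ~(RB_MISSED_EVENTS | RB_MISSED_STORED);

                        memcpy(&missed, &bpage->data[len], sizeof(missed));
                } else {
                        /* Events were lost but there was no room for the count. */
                        missed = ULONG_MAX;
                }
        }
        return missed;
}
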
@@ -3812,6 +3990,7 @@ static const struct file_operations rb_simple_fops = {
3812 .open = tracing_open_generic, 3990 .open = tracing_open_generic,
3813 .read = rb_simple_read, 3991 .read = rb_simple_read,
3814 .write = rb_simple_write, 3992 .write = rb_simple_write,
3993 .llseek = default_llseek,
3815}; 3994};
3816 3995
3817 3996