author    Steven Rostedt <srostedt@redhat.com>    2010-10-07 18:18:05 -0400
committer Steven Rostedt <rostedt@goodmis.org>    2010-10-20 15:17:16 -0400
commit    69d1b839f7eee347e357b3f6cce7f630cc6ff93d (patch)
tree      5aeefde6c8c0234ccea22169817cef07fcf9c692 /kernel
parent    afcc5c6872f0215d515a637041bb51f8691a8ea7 (diff)
ring-buffer: Bind time extend and data events together
When the time between two timestamps is greater than 2^27 nanosecs (~134 ms)
a time extend event is added that extends the time difference to 59 bits
(~18 years). This is due to events only having a 27 bit field to store time.

Currently this time extend is a separate event. We add it just before the
event data that is being written to the buffer. But before the event data is
committed, the event data can also be discarded (as with the case of
filters). But because the time extend has already been committed, it will
stay in the buffer.

If lots of events are being filtered and no event is being written, then
every 134ms a time extend can be added to the buffer without any data
attached. To keep from filling the entire buffer with time extends, a time
extend will never be the first event in a page because the page timestamp
can be used. Time extends can only fill the rest of a page with some data
at the beginning.

This patch binds the time extend with the data. The difference here is that
the time extend is not committed before the data is added. Instead, when a
time extend is needed, the space reserved on the ring buffer is the time
extend + the data event size. The time extend is added to the first part of
the reserved block and the data is added to the second. The time extend
event is passed back to the reserver, but since the reserver also uses a
function to find the data portion of the reserved block, no changes to the
ring buffer interface need to be made.

When a commit is discarded, we now remove both the time extend and the
event. With this approach no more than one time extend can be in the buffer
in a row. Data must always follow a time extend.

Thanks to Mathieu Desnoyers for suggesting this idea.

Suggested-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
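To make the timestamp arithmetic above concrete, here is a small standalone
sketch (not part of the patch) of how a large delta is split across a time
extend and later reassembled. TS_SHIFT and TS_MASK mirror their definitions
in kernel/trace/ring_buffer.c, where TS_SHIFT is 27; everything else here is
purely illustrative userspace code.

#include <stdio.h>
#include <stdint.h>

/* Mirrors the ring buffer's constants: events store a 27-bit delta
 * (TS_SHIFT is 27 in kernel/trace/ring_buffer.c). */
#define TS_SHIFT 27
#define TS_MASK  ((1ULL << TS_SHIFT) - 1)

int main(void)
{
	/* A delta too large for the 27-bit time_delta field. */
	uint64_t delta = 1ULL << 40;

	/* Split as rb_add_time_stamp() does: the low 27 bits go in
	 * time_delta, the remaining high bits in array[0]. */
	uint64_t time_delta = delta & TS_MASK;
	uint64_t high = delta >> TS_SHIFT;

	/* Reassembled the way rb_update_write_stamp() now does it. */
	uint64_t rebuilt = (high << TS_SHIFT) + time_delta;

	printf("27-bit field covers ~%.0f ms\n", (double)TS_MASK / 1e6);
	printf("59-bit extend covers ~%.1f years\n",
	       (double)(1ULL << 59) / 1e9 / (3600.0 * 24 * 365));
	printf("round trip ok: %s\n", rebuilt == delta ? "yes" : "no");
	return 0;
}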
Diffstat (limited to 'kernel')
-rw-r--r--	kernel/trace/ring_buffer.c	266
1 file changed, 142 insertions(+), 124 deletions(-)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 3af77cd47f21..f50f43107e93 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -224,6 +224,9 @@ enum {
 	RB_LEN_TIME_STAMP = 16,
 };
 
+#define skip_time_extend(event) \
+	((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND))
+
 static inline int rb_null_event(struct ring_buffer_event *event)
 {
 	return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
@@ -248,8 +251,12 @@ rb_event_data_length(struct ring_buffer_event *event)
 	return length + RB_EVNT_HDR_SIZE;
 }
 
-/* inline for ring buffer fast paths */
-static unsigned
+/*
+ * Return the length of the given event. Will return
+ * the length of the time extend if the event is a
+ * time extend.
+ */
+static inline unsigned
 rb_event_length(struct ring_buffer_event *event)
 {
 	switch (event->type_len) {
@@ -274,13 +281,41 @@ rb_event_length(struct ring_buffer_event *event)
 	return 0;
 }
 
+/*
+ * Return total length of time extend and data,
+ * or just the event length for all other events.
+ */
+static inline unsigned
+rb_event_ts_length(struct ring_buffer_event *event)
+{
+	unsigned len = 0;
+
+	if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
+		/* time extends include the data event after it */
+		len = RB_LEN_TIME_EXTEND;
+		event = skip_time_extend(event);
+	}
+	return len + rb_event_length(event);
+}
+
 /**
  * ring_buffer_event_length - return the length of the event
  * @event: the event to get the length of
+ *
+ * Returns the size of the data load of a data event.
+ * If the event is something other than a data event, it
+ * returns the size of the event itself. With the exception
+ * of a TIME EXTEND, where it still returns the size of the
+ * data load of the data event after it.
  */
 unsigned ring_buffer_event_length(struct ring_buffer_event *event)
 {
-	unsigned length = rb_event_length(event);
+	unsigned length;
+
+	if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
+		event = skip_time_extend(event);
+
+	length = rb_event_length(event);
 	if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
 		return length;
 	length -= RB_EVNT_HDR_SIZE;
@@ -294,6 +329,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length);
 static void *
 rb_event_data(struct ring_buffer_event *event)
 {
+	if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
+		event = skip_time_extend(event);
 	BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
 	/* If length is in len field, then array[0] has the data */
 	if (event->type_len)
@@ -1546,6 +1583,25 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
 	iter->head = 0;
 }
 
+/* Slow path, do not inline */
+static noinline struct ring_buffer_event *
+rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
+{
+	event->type_len = RINGBUF_TYPE_TIME_EXTEND;
+
+	/* Not the first event on the page? */
+	if (rb_event_index(event)) {
+		event->time_delta = delta & TS_MASK;
+		event->array[0] = delta >> TS_SHIFT;
+	} else {
+		/* nope, just zero it */
+		event->time_delta = 0;
+		event->array[0] = 0;
+	}
+
+	return skip_time_extend(event);
+}
+
 /**
  * ring_buffer_update_event - update event type and data
  * @event: the event to update
@@ -1558,28 +1614,31 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
  * data field.
  */
 static void
-rb_update_event(struct ring_buffer_event *event,
-		unsigned type, unsigned length)
+rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
+		struct ring_buffer_event *event, unsigned length,
+		int add_timestamp, u64 delta)
 {
-	event->type_len = type;
-
-	switch (type) {
-
-	case RINGBUF_TYPE_PADDING:
-	case RINGBUF_TYPE_TIME_EXTEND:
-	case RINGBUF_TYPE_TIME_STAMP:
-		break;
+	/* Only a commit updates the timestamp */
+	if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
+		delta = 0;
 
-	case 0:
-		length -= RB_EVNT_HDR_SIZE;
-		if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
-			event->array[0] = length;
-		else
-			event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
-		break;
-	default:
-		BUG();
+	/*
+	 * If we need to add a timestamp, then we
+	 * add it to the start of the reserved space.
+	 */
+	if (unlikely(add_timestamp)) {
+		event = rb_add_time_stamp(event, delta);
+		length -= RB_LEN_TIME_EXTEND;
+		delta = 0;
 	}
+
+	event->time_delta = delta;
+	length -= RB_EVNT_HDR_SIZE;
+	if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) {
+		event->type_len = 0;
+		event->array[0] = length;
+	} else
+		event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
 }
 
 /*
@@ -1932,12 +1991,21 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
 
 static struct ring_buffer_event *
 __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
-		  unsigned type, unsigned long length, u64 ts)
+		  unsigned long length, u64 ts,
+		  u64 delta, int add_timestamp)
 {
 	struct buffer_page *tail_page;
 	struct ring_buffer_event *event;
 	unsigned long tail, write;
 
+	/*
+	 * If the time delta since the last event is too big to
+	 * hold in the time field of the event, then we append a
+	 * TIME EXTEND event ahead of the data event.
+	 */
+	if (unlikely(add_timestamp))
+		length += RB_LEN_TIME_EXTEND;
+
 	tail_page = cpu_buffer->tail_page;
 	write = local_add_return(length, &tail_page->write);
 
@@ -1954,11 +2022,9 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 
 	event = __rb_page_index(tail_page, tail);
 	kmemcheck_annotate_bitfield(event, bitfield);
-	rb_update_event(event, type, length);
+	rb_update_event(cpu_buffer, event, length, add_timestamp, delta);
 
-	/* The passed in type is zero for DATA */
-	if (likely(!type))
-		local_inc(&tail_page->entries);
+	local_inc(&tail_page->entries);
 
 	/*
 	 * If this is the first commit on the page, then update
@@ -1980,7 +2046,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
 	unsigned long addr;
 
 	new_index = rb_event_index(event);
-	old_index = new_index + rb_event_length(event);
+	old_index = new_index + rb_event_ts_length(event);
 	addr = (unsigned long)event;
 	addr &= PAGE_MASK;
 
@@ -2006,67 +2072,6 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
 	return 0;
 }
 
-static int
-rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
-		  u64 ts, u64 delta)
-{
-	struct ring_buffer_event *event;
-	int ret;
-
-	WARN_ONCE(delta > (1ULL << 59),
-		  KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
-		  (unsigned long long)delta,
-		  (unsigned long long)ts,
-		  (unsigned long long)cpu_buffer->write_stamp);
-
-	/*
-	 * The delta is too big, we to add a
-	 * new timestamp.
-	 */
-	event = __rb_reserve_next(cpu_buffer,
-				  RINGBUF_TYPE_TIME_EXTEND,
-				  RB_LEN_TIME_EXTEND,
-				  ts);
-	if (!event)
-		return -EBUSY;
-
-	if (PTR_ERR(event) == -EAGAIN)
-		return -EAGAIN;
-
-	/* Only a commited time event can update the write stamp */
-	if (rb_event_is_commit(cpu_buffer, event)) {
-		/*
-		 * If this is the first on the page, then it was
-		 * updated with the page itself. Try to discard it
-		 * and if we can't just make it zero.
-		 */
-		if (rb_event_index(event)) {
-			event->time_delta = delta & TS_MASK;
-			event->array[0] = delta >> TS_SHIFT;
-		} else {
-			/* try to discard, since we do not need this */
-			if (!rb_try_to_discard(cpu_buffer, event)) {
-				/* nope, just zero it */
-				event->time_delta = 0;
-				event->array[0] = 0;
-			}
-		}
-		cpu_buffer->write_stamp = ts;
-		/* let the caller know this was the commit */
-		ret = 1;
-	} else {
-		/* Try to discard the event */
-		if (!rb_try_to_discard(cpu_buffer, event)) {
-			/* Darn, this is just wasted space */
-			event->time_delta = 0;
-			event->array[0] = 0;
-		}
-		ret = 0;
-	}
-
-	return ret;
-}
-
 static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
 {
 	local_inc(&cpu_buffer->committing);
@@ -2111,9 +2116,9 @@ rb_reserve_next_event(struct ring_buffer *buffer,
 			  unsigned long length)
 {
 	struct ring_buffer_event *event;
-	u64 ts, delta = 0;
-	int commit = 0;
+	u64 ts, delta;
 	int nr_loops = 0;
+	int add_timestamp;
 
 	rb_start_commit(cpu_buffer);
 
@@ -2134,6 +2139,9 @@ rb_reserve_next_event(struct ring_buffer *buffer,
 
 	length = rb_calculate_event_length(length);
  again:
+	add_timestamp = 0;
+	delta = 0;
+
 	/*
 	 * We allow for interrupts to reenter here and do a trace.
 	 * If one does, it will cause this original code to loop
@@ -2172,33 +2180,24 @@ rb_reserve_next_event(struct ring_buffer *buffer,
 
 		delta = diff;
 		if (unlikely(test_time_stamp(delta))) {
-
-			commit = rb_add_time_stamp(cpu_buffer, ts, delta);
-			delta = 0;
-
-			if (commit == -EBUSY)
-				goto out_fail;
-
-			if (commit == -EAGAIN)
-				goto again;
-
-			RB_WARN_ON(cpu_buffer, commit < 0);
+			WARN_ONCE(delta > (1ULL << 59),
+				  KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
+				  (unsigned long long)delta,
+				  (unsigned long long)ts,
+				  (unsigned long long)cpu_buffer->write_stamp);
+			add_timestamp = 1;
 		}
 	}
 
  get_event:
-	event = __rb_reserve_next(cpu_buffer, 0, length, ts);
+	event = __rb_reserve_next(cpu_buffer, length, ts,
+				  delta, add_timestamp);
 	if (unlikely(PTR_ERR(event) == -EAGAIN))
 		goto again;
 
 	if (!event)
 		goto out_fail;
 
-	if (!rb_event_is_commit(cpu_buffer, event))
-		delta = 0;
-
-	event->time_delta = delta;
-
 	return event;
 
  out_fail:
@@ -2311,12 +2310,28 @@ static void
 rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
 		      struct ring_buffer_event *event)
 {
+	u64 delta;
+
 	/*
 	 * The event first in the commit queue updates the
 	 * time stamp.
 	 */
-	if (rb_event_is_commit(cpu_buffer, event))
-		cpu_buffer->write_stamp += event->time_delta;
+	if (rb_event_is_commit(cpu_buffer, event)) {
+		/*
+		 * A commit event that is first on a page
+		 * updates the write timestamp with the page stamp
+		 */
+		if (!rb_event_index(event))
+			cpu_buffer->write_stamp =
+				cpu_buffer->commit_page->page->time_stamp;
+		else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
+			delta = event->array[0];
+			delta <<= TS_SHIFT;
+			delta += event->time_delta;
+			cpu_buffer->write_stamp += delta;
+		} else
+			cpu_buffer->write_stamp += event->time_delta;
+	}
 }
 
 static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
@@ -2356,6 +2371,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
 
 static inline void rb_event_discard(struct ring_buffer_event *event)
 {
+	if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
+		event = skip_time_extend(event);
+
 	/* array[0] holds the actual length for the discarded event */
 	event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
 	event->type_len = RINGBUF_TYPE_PADDING;
@@ -3043,12 +3061,12 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
 
  again:
 	/*
-	 * We repeat when a timestamp is encountered. It is possible
-	 * to get multiple timestamps from an interrupt entering just
-	 * as one timestamp is about to be written, or from discarded
-	 * commits. The most that we can have is the number on a single page.
+	 * We repeat when a time extend is encountered.
+	 * Since the time extend is always attached to a data event,
+	 * we should never loop more than once.
+	 * (We never hit the following condition more than twice).
 	 */
-	if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
+	if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
 		return NULL;
 
 	reader = rb_get_reader_page(cpu_buffer);
@@ -3124,14 +3142,12 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 		return NULL;
 
 	/*
-	 * We repeat when a timestamp is encountered.
-	 * We can get multiple timestamps by nested interrupts or also
-	 * if filtering is on (discarding commits). Since discarding
-	 * commits can be frequent we can get a lot of timestamps.
-	 * But we limit them by not adding timestamps if they begin
-	 * at the start of a page.
+	 * We repeat when a time extend is encountered.
+	 * Since the time extend is always attached to a data event,
+	 * we should never loop more than once.
+	 * (We never hit the following condition more than twice).
 	 */
-	if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
+	if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
 		return NULL;
 
 	if (rb_per_cpu_empty(cpu_buffer))
@@ -3829,7 +3845,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 	if (len > (commit - read))
 		len = (commit - read);
 
-	size = rb_event_length(event);
+	/* Always keep the time extend and data together */
+	size = rb_event_ts_length(event);
 
 	if (len < size)
 		goto out_unlock;
@@ -3851,7 +3868,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 			break;
 
 		event = rb_reader_event(cpu_buffer);
-		size = rb_event_length(event);
+		/* Always keep the time extend and data together */
+		size = rb_event_ts_length(event);
 	} while (len > size);
 
 	/* update bpage */
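
As a closing illustration of the layout this patch establishes, the sketch
below (standalone userspace C, not kernel code; struct rb_event is a
simplified stand-in, and RINGBUF_TYPE_TIME_EXTEND is taken to be 30, its
value in the ring buffer ABI) lays a time extend and its data event out in
one reserved block and locates the data portion with the same pointer
arithmetic as the skip_time_extend() macro added above.

#include <stdio.h>
#include <stdint.h>

#define RINGBUF_TYPE_TIME_EXTEND 30	/* value from the ring buffer ABI */
#define RB_LEN_TIME_EXTEND 8

/* Simplified stand-in for struct ring_buffer_event. */
struct rb_event {
	uint32_t type_len:5, time_delta:27;
	uint32_t array[];
};

/* Same arithmetic as the patch's skip_time_extend() macro. */
static struct rb_event *skip_time_extend(struct rb_event *event)
{
	return (struct rb_event *)((char *)event + RB_LEN_TIME_EXTEND);
}

int main(void)
{
	/* One reserved block: a time extend followed by its data event. */
	uint32_t block[8] = { 0 };

	struct rb_event *extend = (struct rb_event *)block;
	extend->type_len = RINGBUF_TYPE_TIME_EXTEND;
	extend->time_delta = 123;	/* low 27 bits of the big delta */
	extend->array[0] = 45;		/* high bits of the big delta */

	/* The reserver hands back `extend`, but callers locate their
	 * data through the same skip, so the interface is unchanged. */
	struct rb_event *data = skip_time_extend(extend);
	data->time_delta = 0;		/* the extend consumed the delta */

	printf("data event sits %ld bytes into the block\n",
	       (long)((char *)data - (char *)block));	/* prints 8 */
	return 0;
}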