path: root/kernel/trace/ring_buffer.c
Diffstat (limited to 'kernel/trace/ring_buffer.c')
-rw-r--r--  kernel/trace/ring_buffer.c | 780
1 file changed, 509 insertions, 271 deletions
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 960cbf44c844..dc4dc70171ce 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -10,6 +10,7 @@
 #include <linux/debugfs.h>
 #include <linux/uaccess.h>
 #include <linux/hardirq.h>
+#include <linux/kmemcheck.h>
 #include <linux/module.h>
 #include <linux/percpu.h>
 #include <linux/mutex.h>
@@ -22,6 +23,28 @@
 #include "trace.h"
 
 /*
+ * The ring buffer header is special. We must manually up keep it.
+ */
+int ring_buffer_print_entry_header(struct trace_seq *s)
+{
+        int ret;
+
+        ret = trace_seq_printf(s, "# compressed entry header\n");
+        ret = trace_seq_printf(s, "\ttype_len : 5 bits\n");
+        ret = trace_seq_printf(s, "\ttime_delta : 27 bits\n");
+        ret = trace_seq_printf(s, "\tarray : 32 bits\n");
+        ret = trace_seq_printf(s, "\n");
+        ret = trace_seq_printf(s, "\tpadding : type == %d\n",
+                               RINGBUF_TYPE_PADDING);
+        ret = trace_seq_printf(s, "\ttime_extend : type == %d\n",
+                               RINGBUF_TYPE_TIME_EXTEND);
+        ret = trace_seq_printf(s, "\tdata max type_len == %d\n",
+                               RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
+
+        return ret;
+}
+
+/*
  * The ring buffer is made up of a list of pages. A separate list of pages is
  * allocated for each CPU. A writer may only write to a buffer that is
  * associated with the CPU it is currently executing on. A reader may read
@@ -182,7 +205,10 @@ EXPORT_SYMBOL_GPL(tracing_is_on);
 
 #define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
 #define RB_ALIGNMENT 4U
-#define RB_MAX_SMALL_DATA 28
+#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
+
+/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
+#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
 
 enum {
         RB_LEN_TIME_EXTEND = 8,
@@ -191,48 +217,28 @@ enum {
 
 static inline int rb_null_event(struct ring_buffer_event *event)
 {
-        return event->type == RINGBUF_TYPE_PADDING && event->time_delta == 0;
+        return event->type_len == RINGBUF_TYPE_PADDING
+                        && event->time_delta == 0;
 }
 
 static inline int rb_discarded_event(struct ring_buffer_event *event)
 {
-        return event->type == RINGBUF_TYPE_PADDING && event->time_delta;
+        return event->type_len == RINGBUF_TYPE_PADDING && event->time_delta;
 }
 
 static void rb_event_set_padding(struct ring_buffer_event *event)
 {
-        event->type = RINGBUF_TYPE_PADDING;
+        event->type_len = RINGBUF_TYPE_PADDING;
         event->time_delta = 0;
 }
 
-/**
- * ring_buffer_event_discard - discard an event in the ring buffer
- * @buffer: the ring buffer
- * @event: the event to discard
- *
- * Sometimes a event that is in the ring buffer needs to be ignored.
- * This function lets the user discard an event in the ring buffer
- * and then that event will not be read later.
- *
- * Note, it is up to the user to be careful with this, and protect
- * against races. If the user discards an event that has been consumed
- * it is possible that it could corrupt the ring buffer.
- */
-void ring_buffer_event_discard(struct ring_buffer_event *event)
-{
-        event->type = RINGBUF_TYPE_PADDING;
-        /* time delta must be non zero */
-        if (!event->time_delta)
-                event->time_delta = 1;
-}
-
 static unsigned
 rb_event_data_length(struct ring_buffer_event *event)
 {
         unsigned length;
 
-        if (event->len)
-                length = event->len * RB_ALIGNMENT;
+        if (event->type_len)
+                length = event->type_len * RB_ALIGNMENT;
         else
                 length = event->array[0];
         return length + RB_EVNT_HDR_SIZE;
@@ -242,12 +248,12 @@ rb_event_data_length(struct ring_buffer_event *event)
 static unsigned
 rb_event_length(struct ring_buffer_event *event)
 {
-        switch (event->type) {
+        switch (event->type_len) {
         case RINGBUF_TYPE_PADDING:
                 if (rb_null_event(event))
                         /* undefined */
                         return -1;
-                return rb_event_data_length(event);
+                return event->array[0] + RB_EVNT_HDR_SIZE;
 
         case RINGBUF_TYPE_TIME_EXTEND:
                 return RB_LEN_TIME_EXTEND;
@@ -271,7 +277,7 @@ rb_event_length(struct ring_buffer_event *event)
 unsigned ring_buffer_event_length(struct ring_buffer_event *event)
 {
         unsigned length = rb_event_length(event);
-        if (event->type != RINGBUF_TYPE_DATA)
+        if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
                 return length;
         length -= RB_EVNT_HDR_SIZE;
         if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0]))
@@ -284,9 +290,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length);
 static void *
 rb_event_data(struct ring_buffer_event *event)
 {
-        BUG_ON(event->type != RINGBUF_TYPE_DATA);
+        BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
         /* If length is in len field, then array[0] has the data */
-        if (event->len)
+        if (event->type_len)
                 return (void *)&event->array[0];
         /* Otherwise length is in array[0] and array[1] has the data */
         return (void *)&event->array[1];
@@ -316,9 +322,10 @@ struct buffer_data_page {
 };
 
 struct buffer_page {
+        struct list_head list;          /* list of buffer pages */
         local_t          write;         /* index for next write */
         unsigned         read;          /* index for next read */
-        struct list_head list;          /* list of free pages */
+        local_t          entries;       /* entries on this page */
         struct buffer_data_page *page;  /* Actual data page */
 };
 
@@ -361,6 +368,34 @@ static inline int test_time_stamp(u64 delta)
 
 #define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE)
 
+/* Max payload is BUF_PAGE_SIZE - header (8bytes) */
+#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
+
+/* Max number of timestamps that can fit on a page */
+#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_STAMP)
+
+int ring_buffer_print_page_header(struct trace_seq *s)
+{
+        struct buffer_data_page field;
+        int ret;
+
+        ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t"
+                               "offset:0;\tsize:%u;\n",
+                               (unsigned int)sizeof(field.time_stamp));
+
+        ret = trace_seq_printf(s, "\tfield: local_t commit;\t"
+                               "offset:%u;\tsize:%u;\n",
+                               (unsigned int)offsetof(typeof(field), commit),
+                               (unsigned int)sizeof(field.commit));
+
+        ret = trace_seq_printf(s, "\tfield: char data;\t"
+                               "offset:%u;\tsize:%u;\n",
+                               (unsigned int)offsetof(typeof(field), data),
+                               (unsigned int)BUF_PAGE_SIZE);
+
+        return ret;
+}
+
 /*
  * head_page == tail_page && head == tail then buffer is empty.
  */
@@ -375,8 +410,11 @@ struct ring_buffer_per_cpu {
         struct buffer_page      *tail_page;     /* write to tail */
         struct buffer_page      *commit_page;   /* committed pages */
         struct buffer_page      *reader_page;
+        unsigned long           nmi_dropped;
+        unsigned long           commit_overrun;
         unsigned long           overrun;
-        unsigned long           entries;
+        unsigned long           read;
+        local_t                 entries;
         u64                     write_stamp;
         u64                     read_stamp;
         atomic_t                record_disabled;
@@ -389,6 +427,8 @@ struct ring_buffer {
         atomic_t                record_disabled;
         cpumask_var_t           cpumask;
 
+        struct lock_class_key   *reader_lock_key;
+
         struct mutex            mutex;
 
         struct ring_buffer_per_cpu **buffers;
@@ -420,13 +460,18 @@ struct ring_buffer_iter {
 /* Up this if you want to test the TIME_EXTENTS and normalization */
 #define DEBUG_SHIFT 0
 
+static inline u64 rb_time_stamp(struct ring_buffer *buffer, int cpu)
+{
+        /* shift to debug/test normalization and TIME_EXTENTS */
+        return buffer->clock() << DEBUG_SHIFT;
+}
+
 u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu)
 {
         u64 time;
 
         preempt_disable_notrace();
-        /* shift to debug/test normalization and TIME_EXTENTS */
-        time = buffer->clock() << DEBUG_SHIFT;
+        time = rb_time_stamp(buffer, cpu);
         preempt_enable_no_resched_notrace();
 
         return time;
@@ -523,6 +568,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
         cpu_buffer->cpu = cpu;
         cpu_buffer->buffer = buffer;
         spin_lock_init(&cpu_buffer->reader_lock);
+        lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
         cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
         INIT_LIST_HEAD(&cpu_buffer->pages);
 
@@ -593,7 +639,8 @@ static int rb_cpu_notify(struct notifier_block *self,
  * when the buffer wraps. If this flag is not set, the buffer will
  * drop data when the tail hits the head.
  */
-struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
+struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
+                                        struct lock_class_key *key)
 {
         struct ring_buffer *buffer;
         int bsize;
@@ -616,6 +663,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
         buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
         buffer->flags = flags;
         buffer->clock = trace_clock_local;
+        buffer->reader_lock_key = key;
 
         /* need at least two pages */
         if (buffer->pages == 1)
@@ -673,7 +721,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
         kfree(buffer);
         return NULL;
 }
-EXPORT_SYMBOL_GPL(ring_buffer_alloc);
+EXPORT_SYMBOL_GPL(__ring_buffer_alloc);
 
 /**
  * ring_buffer_free - free a ring buffer.
@@ -947,31 +995,6 @@ static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer)
         return rb_page_commit(cpu_buffer->head_page);
 }
 
-/*
- * When the tail hits the head and the buffer is in overwrite mode,
- * the head jumps to the next page and all content on the previous
- * page is discarded. But before doing so, we update the overrun
- * variable of the buffer.
- */
-static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer)
-{
-        struct ring_buffer_event *event;
-        unsigned long head;
-
-        for (head = 0; head < rb_head_size(cpu_buffer);
-             head += rb_event_length(event)) {
-
-                event = __rb_page_index(cpu_buffer->head_page, head);
-                if (RB_WARN_ON(cpu_buffer, rb_null_event(event)))
-                        return;
-                /* Only count data entries */
-                if (event->type != RINGBUF_TYPE_DATA)
-                        continue;
-                cpu_buffer->overrun++;
-                cpu_buffer->entries--;
-        }
-}
-
 static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
                                struct buffer_page **bpage)
 {
@@ -991,7 +1014,7 @@ rb_event_index(struct ring_buffer_event *event)
         return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE);
 }
 
-static int
+static inline int
 rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
              struct ring_buffer_event *event)
 {
@@ -1110,28 +1133,21 @@ static void
 rb_update_event(struct ring_buffer_event *event,
                 unsigned type, unsigned length)
 {
-        event->type = type;
+        event->type_len = type;
 
         switch (type) {
 
         case RINGBUF_TYPE_PADDING:
-                break;
-
         case RINGBUF_TYPE_TIME_EXTEND:
-                event->len = DIV_ROUND_UP(RB_LEN_TIME_EXTEND, RB_ALIGNMENT);
-                break;
-
         case RINGBUF_TYPE_TIME_STAMP:
-                event->len = DIV_ROUND_UP(RB_LEN_TIME_STAMP, RB_ALIGNMENT);
                 break;
 
-        case RINGBUF_TYPE_DATA:
+        case 0:
                 length -= RB_EVNT_HDR_SIZE;
-                if (length > RB_MAX_SMALL_DATA) {
-                        event->len = 0;
+                if (length > RB_MAX_SMALL_DATA)
                         event->array[0] = length;
-                } else
-                        event->len = DIV_ROUND_UP(length, RB_ALIGNMENT);
+                else
+                        event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
                 break;
         default:
                 BUG();
@@ -1155,131 +1171,157 @@ static unsigned rb_calculate_event_length(unsigned length)
         return length;
 }
 
+
 static struct ring_buffer_event *
-__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
-                  unsigned type, unsigned long length, u64 *ts)
+rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
+             unsigned long length, unsigned long tail,
+             struct buffer_page *commit_page,
+             struct buffer_page *tail_page, u64 *ts)
 {
-        struct buffer_page *tail_page, *head_page, *reader_page, *commit_page;
-        unsigned long tail, write;
+        struct buffer_page *next_page, *head_page, *reader_page;
         struct ring_buffer *buffer = cpu_buffer->buffer;
         struct ring_buffer_event *event;
-        unsigned long flags;
         bool lock_taken = false;
+        unsigned long flags;
 
-        commit_page = cpu_buffer->commit_page;
-        /* we just need to protect against interrupts */
-        barrier();
-        tail_page = cpu_buffer->tail_page;
-        write = local_add_return(length, &tail_page->write);
-        tail = write - length;
+        next_page = tail_page;
 
-        /* See if we shot pass the end of this buffer page */
-        if (write > BUF_PAGE_SIZE) {
-                struct buffer_page *next_page = tail_page;
+        local_irq_save(flags);
+        /*
+         * Since the write to the buffer is still not
+         * fully lockless, we must be careful with NMIs.
+         * The locks in the writers are taken when a write
+         * crosses to a new page. The locks protect against
+         * races with the readers (this will soon be fixed
+         * with a lockless solution).
+         *
+         * Because we can not protect against NMIs, and we
+         * want to keep traces reentrant, we need to manage
+         * what happens when we are in an NMI.
+         *
+         * NMIs can happen after we take the lock.
+         * If we are in an NMI, only take the lock
+         * if it is not already taken. Otherwise
+         * simply fail.
+         */
+        if (unlikely(in_nmi())) {
+                if (!__raw_spin_trylock(&cpu_buffer->lock)) {
+                        cpu_buffer->nmi_dropped++;
+                        goto out_reset;
+                }
+        } else
+                __raw_spin_lock(&cpu_buffer->lock);
 
-                local_irq_save(flags);
-                /*
-                 * Since the write to the buffer is still not
-                 * fully lockless, we must be careful with NMIs.
-                 * The locks in the writers are taken when a write
-                 * crosses to a new page. The locks protect against
-                 * races with the readers (this will soon be fixed
-                 * with a lockless solution).
-                 *
-                 * Because we can not protect against NMIs, and we
-                 * want to keep traces reentrant, we need to manage
-                 * what happens when we are in an NMI.
-                 *
-                 * NMIs can happen after we take the lock.
-                 * If we are in an NMI, only take the lock
-                 * if it is not already taken. Otherwise
-                 * simply fail.
-                 */
-                if (unlikely(in_nmi())) {
-                        if (!__raw_spin_trylock(&cpu_buffer->lock))
-                                goto out_reset;
-                } else
-                        __raw_spin_lock(&cpu_buffer->lock);
+        lock_taken = true;
 
-                lock_taken = true;
+        rb_inc_page(cpu_buffer, &next_page);
 
-                rb_inc_page(cpu_buffer, &next_page);
+        head_page = cpu_buffer->head_page;
+        reader_page = cpu_buffer->reader_page;
 
-                head_page = cpu_buffer->head_page;
-                reader_page = cpu_buffer->reader_page;
+        /* we grabbed the lock before incrementing */
+        if (RB_WARN_ON(cpu_buffer, next_page == reader_page))
+                goto out_reset;
 
-                /* we grabbed the lock before incrementing */
-                if (RB_WARN_ON(cpu_buffer, next_page == reader_page))
-                        goto out_reset;
+        /*
+         * If for some reason, we had an interrupt storm that made
+         * it all the way around the buffer, bail, and warn
+         * about it.
+         */
+        if (unlikely(next_page == commit_page)) {
+                cpu_buffer->commit_overrun++;
+                goto out_reset;
+        }
 
-                /*
-                 * If for some reason, we had an interrupt storm that made
-                 * it all the way around the buffer, bail, and warn
-                 * about it.
-                 */
-                if (unlikely(next_page == commit_page)) {
-                        WARN_ON_ONCE(1);
+        if (next_page == head_page) {
+                if (!(buffer->flags & RB_FL_OVERWRITE))
                         goto out_reset;
-                }
 
-                if (next_page == head_page) {
-                        if (!(buffer->flags & RB_FL_OVERWRITE))
-                                goto out_reset;
-
-                        /* tail_page has not moved yet? */
-                        if (tail_page == cpu_buffer->tail_page) {
-                                /* count overflows */
-                                rb_update_overflow(cpu_buffer);
+                /* tail_page has not moved yet? */
+                if (tail_page == cpu_buffer->tail_page) {
+                        /* count overflows */
+                        cpu_buffer->overrun +=
+                                local_read(&head_page->entries);
 
                         rb_inc_page(cpu_buffer, &head_page);
                         cpu_buffer->head_page = head_page;
                         cpu_buffer->head_page->read = 0;
-                        }
                 }
+        }
 
         /*
          * If the tail page is still the same as what we think
         * it is, then it is up to us to update the tail
         * pointer.
         */
         if (tail_page == cpu_buffer->tail_page) {
                 local_set(&next_page->write, 0);
-                        local_set(&next_page->page->commit, 0);
-                        cpu_buffer->tail_page = next_page;
+                local_set(&next_page->entries, 0);
+                local_set(&next_page->page->commit, 0);
+                cpu_buffer->tail_page = next_page;
+
+                /* reread the time stamp */
+                *ts = rb_time_stamp(buffer, cpu_buffer->cpu);
+                cpu_buffer->tail_page->page->time_stamp = *ts;
+        }
 
-                        /* reread the time stamp */
-                        *ts = ring_buffer_time_stamp(buffer, cpu_buffer->cpu);
-                        cpu_buffer->tail_page->page->time_stamp = *ts;
-                }
+        /*
+         * The actual tail page has moved forward.
+         */
+        if (tail < BUF_PAGE_SIZE) {
+                /* Mark the rest of the page with padding */
+                event = __rb_page_index(tail_page, tail);
+                kmemcheck_annotate_bitfield(event, bitfield);
+                rb_event_set_padding(event);
+        }
 
-                /*
-                 * The actual tail page has moved forward.
-                 */
-                if (tail < BUF_PAGE_SIZE) {
-                        /* Mark the rest of the page with padding */
-                        event = __rb_page_index(tail_page, tail);
-                        rb_event_set_padding(event);
-                }
+        /* Set the write back to the previous setting */
+        local_sub(length, &tail_page->write);
 
-                if (tail <= BUF_PAGE_SIZE)
-                        /* Set the write back to the previous setting */
-                        local_set(&tail_page->write, tail);
+        /*
+         * If this was a commit entry that failed,
+         * increment that too
+         */
+        if (tail_page == cpu_buffer->commit_page &&
+            tail == rb_commit_index(cpu_buffer)) {
+                rb_set_commit_to_write(cpu_buffer);
+        }
 
-                /*
-                 * If this was a commit entry that failed,
-                 * increment that too
-                 */
-                if (tail_page == cpu_buffer->commit_page &&
-                    tail == rb_commit_index(cpu_buffer)) {
-                        rb_set_commit_to_write(cpu_buffer);
-                }
+        __raw_spin_unlock(&cpu_buffer->lock);
+        local_irq_restore(flags);
+
+        /* fail and let the caller try again */
+        return ERR_PTR(-EAGAIN);
+
+ out_reset:
+        /* reset write */
+        local_sub(length, &tail_page->write);
 
+        if (likely(lock_taken))
                 __raw_spin_unlock(&cpu_buffer->lock);
-                local_irq_restore(flags);
+        local_irq_restore(flags);
+        return NULL;
+}
 
-                /* fail and let the caller try again */
-                return ERR_PTR(-EAGAIN);
-        }
+static struct ring_buffer_event *
+__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
+                  unsigned type, unsigned long length, u64 *ts)
+{
+        struct buffer_page *tail_page, *commit_page;
+        struct ring_buffer_event *event;
+        unsigned long tail, write;
+
+        commit_page = cpu_buffer->commit_page;
+        /* we just need to protect against interrupts */
+        barrier();
+        tail_page = cpu_buffer->tail_page;
+        write = local_add_return(length, &tail_page->write);
+        tail = write - length;
+
+        /* See if we shot pass the end of this buffer page */
+        if (write > BUF_PAGE_SIZE)
+                return rb_move_tail(cpu_buffer, length, tail,
+                                    commit_page, tail_page, ts);
 
         /* We reserved something on the buffer */
 
@@ -1287,8 +1329,13 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
                 return NULL;
 
         event = __rb_page_index(tail_page, tail);
+        kmemcheck_annotate_bitfield(event, bitfield);
         rb_update_event(event, type, length);
 
+        /* The passed in type is zero for DATA */
+        if (likely(!type))
+                local_inc(&tail_page->entries);
+
         /*
          * If this is a commit and the tail is zero, then update
          * this page's time stamp.
@@ -1297,16 +1344,38 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
                 cpu_buffer->commit_page->page->time_stamp = *ts;
 
         return event;
+}
 
- out_reset:
-        /* reset write */
-        if (tail <= BUF_PAGE_SIZE)
-                local_set(&tail_page->write, tail);
+static inline int
+rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
+                  struct ring_buffer_event *event)
+{
+        unsigned long new_index, old_index;
+        struct buffer_page *bpage;
+        unsigned long index;
+        unsigned long addr;
 
-        if (likely(lock_taken))
-                __raw_spin_unlock(&cpu_buffer->lock);
-        local_irq_restore(flags);
-        return NULL;
+        new_index = rb_event_index(event);
+        old_index = new_index + rb_event_length(event);
+        addr = (unsigned long)event;
+        addr &= PAGE_MASK;
+
+        bpage = cpu_buffer->tail_page;
+
+        if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
+                /*
+                 * This is on the tail page. It is possible that
+                 * a write could come in and move the tail page
+                 * and write to the next page. That is fine
+                 * because we just shorten what is on this page.
+                 */
+                index = local_cmpxchg(&bpage->write, old_index, new_index);
+                if (index == old_index)
+                        return 1;
+        }
+
+        /* could not discard */
+        return 0;
 }
 
 static int
@@ -1351,16 +1420,23 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
                         event->array[0] = *delta >> TS_SHIFT;
                 } else {
                         cpu_buffer->commit_page->page->time_stamp = *ts;
-                        event->time_delta = 0;
-                        event->array[0] = 0;
+                        /* try to discard, since we do not need this */
+                        if (!rb_try_to_discard(cpu_buffer, event)) {
+                                /* nope, just zero it */
+                                event->time_delta = 0;
+                                event->array[0] = 0;
+                        }
                 }
                 cpu_buffer->write_stamp = *ts;
                 /* let the caller know this was the commit */
                 ret = 1;
         } else {
-                /* Darn, this is just wasted space */
-                event->time_delta = 0;
-                event->array[0] = 0;
+                /* Try to discard the event */
+                if (!rb_try_to_discard(cpu_buffer, event)) {
+                        /* Darn, this is just wasted space */
+                        event->time_delta = 0;
+                        event->array[0] = 0;
+                }
                 ret = 0;
         }
 
@@ -1371,13 +1447,14 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
 
 static struct ring_buffer_event *
 rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
-                      unsigned type, unsigned long length)
+                      unsigned long length)
 {
         struct ring_buffer_event *event;
-        u64 ts, delta;
+        u64 ts, delta = 0;
         int commit = 0;
         int nr_loops = 0;
 
+        length = rb_calculate_event_length(length);
  again:
         /*
          * We allow for interrupts to reenter here and do a trace.
@@ -1391,7 +1468,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
         if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
                 return NULL;
 
-        ts = ring_buffer_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu);
+        ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu);
 
         /*
          * Only the first commit can update the timestamp.
@@ -1401,23 +1478,24 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
          * also be made. But only the entry that did the actual
          * commit will be something other than zero.
          */
-        if (cpu_buffer->tail_page == cpu_buffer->commit_page &&
+        if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page &&
             rb_page_write(cpu_buffer->tail_page) ==
-            rb_commit_index(cpu_buffer)) {
+            rb_commit_index(cpu_buffer))) {
+                u64 diff;
 
-                delta = ts - cpu_buffer->write_stamp;
+                diff = ts - cpu_buffer->write_stamp;
 
-                /* make sure this delta is calculated here */
+                /* make sure this diff is calculated here */
                 barrier();
 
                 /* Did the write stamp get updated already? */
                 if (unlikely(ts < cpu_buffer->write_stamp))
-                        delta = 0;
+                        goto get_event;
 
-                if (test_time_stamp(delta)) {
+                delta = diff;
+                if (unlikely(test_time_stamp(delta))) {
 
                         commit = rb_add_time_stamp(cpu_buffer, &ts, &delta);
-
                         if (commit == -EBUSY)
                                 return NULL;
 
@@ -1426,12 +1504,11 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
 
                         RB_WARN_ON(cpu_buffer, commit < 0);
                 }
-        } else
-                /* Non commits have zero deltas */
-                delta = 0;
+        }
 
-        event = __rb_reserve_next(cpu_buffer, type, length, &ts);
-        if (PTR_ERR(event) == -EAGAIN)
+ get_event:
+        event = __rb_reserve_next(cpu_buffer, 0, length, &ts);
+        if (unlikely(PTR_ERR(event) == -EAGAIN))
                 goto again;
 
         if (!event) {
@@ -1448,7 +1525,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
          * If the timestamp was commited, make the commit our entry
          * now so that we will update it when needed.
          */
-        if (commit)
+        if (unlikely(commit))
                 rb_set_commit_event(cpu_buffer, event);
         else if (!rb_is_commit(cpu_buffer, event))
                 delta = 0;
@@ -1458,6 +1535,36 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
         return event;
 }
 
+#define TRACE_RECURSIVE_DEPTH 16
+
+static int trace_recursive_lock(void)
+{
+        current->trace_recursion++;
+
+        if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
+                return 0;
+
+        /* Disable all tracing before we do anything else */
+        tracing_off_permanent();
+
+        printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:"
+                    "HC[%lu]:SC[%lu]:NMI[%lu]\n",
+                    current->trace_recursion,
+                    hardirq_count() >> HARDIRQ_SHIFT,
+                    softirq_count() >> SOFTIRQ_SHIFT,
+                    in_nmi());
+
+        WARN_ON_ONCE(1);
+        return -1;
+}
+
+static void trace_recursive_unlock(void)
+{
+        WARN_ON_ONCE(!current->trace_recursion);
+
+        current->trace_recursion--;
+}
+
 static DEFINE_PER_CPU(int, rb_need_resched);
 
 /**
@@ -1491,6 +1598,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
         /* If we are tracing schedule, we don't want to recurse */
         resched = ftrace_preempt_disable();
 
+        if (trace_recursive_lock())
+                goto out_nocheck;
+
         cpu = raw_smp_processor_id();
 
         if (!cpumask_test_cpu(cpu, buffer->cpumask))
@@ -1501,11 +1611,10 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
         if (atomic_read(&cpu_buffer->record_disabled))
                 goto out;
 
-        length = rb_calculate_event_length(length);
-        if (length > BUF_PAGE_SIZE)
+        if (length > BUF_MAX_DATA_SIZE)
                 goto out;
 
-        event = rb_reserve_next_event(cpu_buffer, RINGBUF_TYPE_DATA, length);
+        event = rb_reserve_next_event(cpu_buffer, length);
         if (!event)
                 goto out;
 
@@ -1520,6 +1629,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
         return event;
 
  out:
+        trace_recursive_unlock();
+
+ out_nocheck:
         ftrace_preempt_enable(resched);
         return NULL;
 }
@@ -1528,7 +1640,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
 static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
                       struct ring_buffer_event *event)
 {
-        cpu_buffer->entries++;
+        local_inc(&cpu_buffer->entries);
 
         /* Only process further if we own the commit */
         if (!rb_is_commit(cpu_buffer, event))
@@ -1558,6 +1670,8 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
 
         rb_commit(cpu_buffer, event);
 
+        trace_recursive_unlock();
+
         /*
          * Only the last preempt count needs to restore preemption.
          */
@@ -1570,6 +1684,99 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
 }
 EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
 
+static inline void rb_event_discard(struct ring_buffer_event *event)
+{
+        /* array[0] holds the actual length for the discarded event */
+        event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
+        event->type_len = RINGBUF_TYPE_PADDING;
+        /* time delta must be non zero */
+        if (!event->time_delta)
+                event->time_delta = 1;
+}
+
+/**
+ * ring_buffer_event_discard - discard any event in the ring buffer
+ * @event: the event to discard
+ *
+ * Sometimes a event that is in the ring buffer needs to be ignored.
+ * This function lets the user discard an event in the ring buffer
+ * and then that event will not be read later.
+ *
+ * Note, it is up to the user to be careful with this, and protect
+ * against races. If the user discards an event that has been consumed
+ * it is possible that it could corrupt the ring buffer.
+ */
+void ring_buffer_event_discard(struct ring_buffer_event *event)
+{
+        rb_event_discard(event);
+}
+EXPORT_SYMBOL_GPL(ring_buffer_event_discard);
+
+/**
+ * ring_buffer_commit_discard - discard an event that has not been committed
+ * @buffer: the ring buffer
+ * @event: non committed event to discard
+ *
+ * This is similar to ring_buffer_event_discard but must only be
+ * performed on an event that has not been committed yet. The difference
+ * is that this will also try to free the event from the ring buffer
+ * if another event has not been added behind it.
+ *
+ * If another event has been added behind it, it will set the event
+ * up as discarded, and perform the commit.
+ *
+ * If this function is called, do not call ring_buffer_unlock_commit on
+ * the event.
+ */
+void ring_buffer_discard_commit(struct ring_buffer *buffer,
+                                struct ring_buffer_event *event)
+{
+        struct ring_buffer_per_cpu *cpu_buffer;
+        int cpu;
+
+        /* The event is discarded regardless */
+        rb_event_discard(event);
+
+        /*
+         * This must only be called if the event has not been
+         * committed yet. Thus we can assume that preemption
+         * is still disabled.
+         */
+        RB_WARN_ON(buffer, preemptible());
+
+        cpu = smp_processor_id();
+        cpu_buffer = buffer->buffers[cpu];
+
+        if (!rb_try_to_discard(cpu_buffer, event))
+                goto out;
+
+        /*
+         * The commit is still visible by the reader, so we
+         * must increment entries.
+         */
+        local_inc(&cpu_buffer->entries);
+ out:
+        /*
+         * If a write came in and pushed the tail page
+         * we still need to update the commit pointer
+         * if we were the commit.
+         */
+        if (rb_is_commit(cpu_buffer, event))
+                rb_set_commit_to_write(cpu_buffer);
+
+        trace_recursive_unlock();
+
+        /*
+         * Only the last preempt count needs to restore preemption.
+         */
+        if (preempt_count() == 1)
+                ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
+        else
+                preempt_enable_no_resched_notrace();
+
+}
+EXPORT_SYMBOL_GPL(ring_buffer_discard_commit);
+
 /**
  * ring_buffer_write - write data to the buffer without reserving
  * @buffer: The ring buffer to write to.
@@ -1589,7 +1796,6 @@ int ring_buffer_write(struct ring_buffer *buffer,
 {
         struct ring_buffer_per_cpu *cpu_buffer;
         struct ring_buffer_event *event;
-        unsigned long event_length;
         void *body;
         int ret = -EBUSY;
         int cpu, resched;
@@ -1612,9 +1818,10 @@ int ring_buffer_write(struct ring_buffer *buffer,
         if (atomic_read(&cpu_buffer->record_disabled))
                 goto out;
 
-        event_length = rb_calculate_event_length(length);
-        event = rb_reserve_next_event(cpu_buffer,
-                                      RINGBUF_TYPE_DATA, event_length);
+        if (length > BUF_MAX_DATA_SIZE)
+                goto out;
+
+        event = rb_reserve_next_event(cpu_buffer, length);
         if (!event)
                 goto out;
 
@@ -1728,7 +1935,8 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
                 return 0;
 
         cpu_buffer = buffer->buffers[cpu];
-        ret = cpu_buffer->entries;
+        ret = (local_read(&cpu_buffer->entries) - cpu_buffer->overrun)
+                - cpu_buffer->read;
 
         return ret;
 }
@@ -1755,6 +1963,47 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
 EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
 
 /**
+ * ring_buffer_nmi_dropped_cpu - get the number of nmis that were dropped
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the number of overruns from
+ */
+unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu)
+{
+        struct ring_buffer_per_cpu *cpu_buffer;
+        unsigned long ret;
+
+        if (!cpumask_test_cpu(cpu, buffer->cpumask))
+                return 0;
+
+        cpu_buffer = buffer->buffers[cpu];
+        ret = cpu_buffer->nmi_dropped;
+
+        return ret;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_nmi_dropped_cpu);
+
+/**
+ * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the number of overruns from
+ */
+unsigned long
+ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu)
+{
+        struct ring_buffer_per_cpu *cpu_buffer;
+        unsigned long ret;
+
+        if (!cpumask_test_cpu(cpu, buffer->cpumask))
+                return 0;
+
+        cpu_buffer = buffer->buffers[cpu];
+        ret = cpu_buffer->commit_overrun;
+
+        return ret;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu);
+
+/**
  * ring_buffer_entries - get the number of entries in a buffer
  * @buffer: The ring buffer
  *
@@ -1770,7 +2019,8 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
         /* if you care about this being correct, lock the buffer */
         for_each_buffer_cpu(buffer, cpu) {
                 cpu_buffer = buffer->buffers[cpu];
-                entries += cpu_buffer->entries;
+                entries += (local_read(&cpu_buffer->entries) -
+                            cpu_buffer->overrun) - cpu_buffer->read;
         }
 
         return entries;
@@ -1862,7 +2112,7 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
 {
         u64 delta;
 
-        switch (event->type) {
+        switch (event->type_len) {
         case RINGBUF_TYPE_PADDING:
                 return;
 
@@ -1893,7 +2143,7 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
 {
         u64 delta;
 
-        switch (event->type) {
+        switch (event->type_len) {
         case RINGBUF_TYPE_PADDING:
                 return;
 
@@ -1966,6 +2216,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
         cpu_buffer->reader_page->list.prev = reader->list.prev;
 
         local_set(&cpu_buffer->reader_page->write, 0);
+        local_set(&cpu_buffer->reader_page->entries, 0);
         local_set(&cpu_buffer->reader_page->page->commit, 0);
 
         /* Make the reader page now replace the head */
@@ -2008,8 +2259,9 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
 
         event = rb_reader_event(cpu_buffer);
 
-        if (event->type == RINGBUF_TYPE_DATA || rb_discarded_event(event))
-                cpu_buffer->entries--;
+        if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX
+                        || rb_discarded_event(event))
+                cpu_buffer->read++;
 
         rb_update_read_stamp(cpu_buffer, event);
 
@@ -2031,8 +2283,8 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
          * Check if we are at the end of the buffer.
          */
         if (iter->head >= rb_page_size(iter->head_page)) {
-                if (RB_WARN_ON(buffer,
-                               iter->head_page == cpu_buffer->commit_page))
+                /* discarded commits can make the page empty */
+                if (iter->head_page == cpu_buffer->commit_page)
                         return;
                 rb_inc_iter(iter);
                 return;
@@ -2075,12 +2327,10 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
         /*
          * We repeat when a timestamp is encountered. It is possible
          * to get multiple timestamps from an interrupt entering just
-         * as one timestamp is about to be written. The max times
-         * that this can happen is the number of nested interrupts we
-         * can have.  Nesting 10 deep of interrupts is clearly
-         * an anomaly.
+         * as one timestamp is about to be written, or from discarded
+         * commits. The most that we can have is the number on a single page.
          */
-        if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10))
+        if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
                 return NULL;
 
         reader = rb_get_reader_page(cpu_buffer);
@@ -2089,7 +2339,7 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
 
         event = rb_reader_event(cpu_buffer);
 
-        switch (event->type) {
+        switch (event->type_len) {
         case RINGBUF_TYPE_PADDING:
                 if (rb_null_event(event))
                         RB_WARN_ON(cpu_buffer, 1);
@@ -2146,14 +2396,14 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 
  again:
         /*
-         * We repeat when a timestamp is encountered. It is possible
-         * to get multiple timestamps from an interrupt entering just
-         * as one timestamp is about to be written. The max times
-         * that this can happen is the number of nested interrupts we
-         * can have.  Nesting 10 deep of interrupts is clearly
-         * an anomaly.
+         * We repeat when a timestamp is encountered.
+         * We can get multiple timestamps by nested interrupts or also
+         * if filtering is on (discarding commits). Since discarding
+         * commits can be frequent we can get a lot of timestamps.
+         * But we limit them by not adding timestamps if they begin
+         * at the start of a page.
          */
-        if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10))
+        if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
                 return NULL;
 
         if (rb_per_cpu_empty(cpu_buffer))
@@ -2161,7 +2411,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 
         event = rb_iter_head_event(iter);
 
-        switch (event->type) {
+        switch (event->type_len) {
         case RINGBUF_TYPE_PADDING:
                 if (rb_null_event(event)) {
                         rb_inc_iter(iter);
@@ -2220,7 +2470,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
         event = rb_buffer_peek(buffer, cpu, ts);
         spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
-        if (event && event->type == RINGBUF_TYPE_PADDING) {
+        if (event && event->type_len == RINGBUF_TYPE_PADDING) {
                 cpu_relax();
                 goto again;
         }
@@ -2248,7 +2498,7 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
         event = rb_iter_peek(iter, ts);
         spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
-        if (event && event->type == RINGBUF_TYPE_PADDING) {
+        if (event && event->type_len == RINGBUF_TYPE_PADDING) {
                 cpu_relax();
                 goto again;
         }
@@ -2293,7 +2543,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
  out:
         preempt_enable();
 
-        if (event && event->type == RINGBUF_TYPE_PADDING) {
+        if (event && event->type_len == RINGBUF_TYPE_PADDING) {
                 cpu_relax();
                 goto again;
         }
@@ -2386,7 +2636,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
  out:
         spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
-        if (event && event->type == RINGBUF_TYPE_PADDING) {
+        if (event && event->type_len == RINGBUF_TYPE_PADDING) {
                 cpu_relax();
                 goto again;
         }
@@ -2411,6 +2661,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
         cpu_buffer->head_page
                 = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
         local_set(&cpu_buffer->head_page->write, 0);
+        local_set(&cpu_buffer->head_page->entries, 0);
         local_set(&cpu_buffer->head_page->page->commit, 0);
 
         cpu_buffer->head_page->read = 0;
@@ -2420,11 +2671,15 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
 
         INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
         local_set(&cpu_buffer->reader_page->write, 0);
+        local_set(&cpu_buffer->reader_page->entries, 0);
         local_set(&cpu_buffer->reader_page->page->commit, 0);
         cpu_buffer->reader_page->read = 0;
 
+        cpu_buffer->nmi_dropped = 0;
+        cpu_buffer->commit_overrun = 0;
         cpu_buffer->overrun = 0;
-        cpu_buffer->entries = 0;
+        cpu_buffer->read = 0;
+        local_set(&cpu_buffer->entries, 0);
 
         cpu_buffer->write_stamp = 0;
         cpu_buffer->read_stamp = 0;
@@ -2443,6 +2698,8 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
         if (!cpumask_test_cpu(cpu, buffer->cpumask))
                 return;
 
+        atomic_inc(&cpu_buffer->record_disabled);
+
         spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
 
         __raw_spin_lock(&cpu_buffer->lock);
@@ -2452,6 +2709,8 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
         __raw_spin_unlock(&cpu_buffer->lock);
 
         spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+        atomic_dec(&cpu_buffer->record_disabled);
 }
 EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
 
@@ -2578,28 +2837,6 @@ out:
 }
 EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
 
-static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer,
-                              struct buffer_data_page *bpage,
-                              unsigned int offset)
-{
-        struct ring_buffer_event *event;
-        unsigned long head;
-
-        __raw_spin_lock(&cpu_buffer->lock);
-        for (head = offset; head < local_read(&bpage->commit);
-             head += rb_event_length(event)) {
-
-                event = __rb_data_page_index(bpage, head);
-                if (RB_WARN_ON(cpu_buffer, rb_null_event(event)))
-                        return;
-                /* Only count data entries */
-                if (event->type != RINGBUF_TYPE_DATA)
-                        continue;
-                cpu_buffer->entries--;
-        }
-        __raw_spin_unlock(&cpu_buffer->lock);
-}
-
 /**
  * ring_buffer_alloc_read_page - allocate a page to read from buffer
  * @buffer: the buffer to allocate for.
@@ -2630,6 +2867,7 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer)
 
         return bpage;
 }
+EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page);
 
 /**
  * ring_buffer_free_read_page - free an allocated read page
@@ -2642,6 +2880,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
 {
         free_page((unsigned long)data);
 }
+EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
 
 /**
  * ring_buffer_read_page - extract a page from the ring buffer
@@ -2768,16 +3007,17 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
                 /* we copied everything to the beginning */
                 read = 0;
         } else {
+                /* update the entry counter */
+                cpu_buffer->read += local_read(&reader->entries);
+
                 /* swap the pages */
                 rb_init_page(bpage);
                 bpage = reader->page;
                 reader->page = *data_page;
                 local_set(&reader->write, 0);
+                local_set(&reader->entries, 0);
                 reader->read = 0;
                 *data_page = bpage;
-
-                /* update the entry counter */
-                rb_remove_entries(cpu_buffer, bpage, read);
         }
         ret = read;
 
@@ -2787,6 +3027,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
  out:
         return ret;
 }
+EXPORT_SYMBOL_GPL(ring_buffer_read_page);
 
 static ssize_t
 rb_simple_read(struct file *filp, char __user *ubuf,
@@ -2845,14 +3086,11 @@ static const struct file_operations rb_simple_fops = {
 static __init int rb_init_debugfs(void)
 {
         struct dentry *d_tracer;
-        struct dentry *entry;
 
         d_tracer = tracing_init_dentry();
 
-        entry = debugfs_create_file("tracing_on", 0644, d_tracer,
-                                    &ring_buffer_flags, &rb_simple_fops);
-        if (!entry)
-                pr_warning("Could not create debugfs 'tracing_on' entry\n");
+        trace_create_file("tracing_on", 0644, d_tracer,
+                          &ring_buffer_flags, &rb_simple_fops);
 
         return 0;
 }