-rw-r--r--Documentation/kernel-parameters.txt5
-rw-r--r--Documentation/trace/events.txt9
-rw-r--r--Documentation/trace/ring-buffer-design.txt955
-rw-r--r--arch/x86/kernel/ftrace.c4
-rw-r--r--include/linux/ftrace_event.h4
-rw-r--r--include/linux/ring_buffer.h1
-rw-r--r--include/trace/ftrace.h18
-rw-r--r--kernel/kprobes.c30
-rw-r--r--kernel/trace/ftrace.c92
-rw-r--r--kernel/trace/kmemtrace.c145
-rw-r--r--kernel/trace/ring_buffer.c940
-rw-r--r--kernel/trace/trace.c208
-rw-r--r--kernel/trace/trace.h37
-rw-r--r--kernel/trace/trace_events.c69
-rw-r--r--kernel/trace/trace_events_filter.c169
-rw-r--r--kernel/trace/trace_functions.c4
-rw-r--r--kernel/trace/trace_functions_graph.c164
-rw-r--r--kernel/trace/trace_sched_switch.c57
-rw-r--r--kernel/trace/trace_selftest.c1
-rw-r--r--kernel/trace/trace_stack.c9
-rw-r--r--kernel/trace/trace_stat.c7
-rw-r--r--kernel/trace/trace_stat.h2
-rw-r--r--kernel/trace/trace_workqueue.c32
-rwxr-xr-xscripts/recordmcount.pl1
24 files changed, 2302 insertions, 661 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index dd1a6d4bb747..81cdb7d5e380 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2476,6 +2476,11 @@ and is between 256 and 4096 characters. It is defined in the file
2476 trace_buf_size=nn[KMG] 2476 trace_buf_size=nn[KMG]
2477 [FTRACE] will set tracing buffer size. 2477 [FTRACE] will set tracing buffer size.
2478 2478
2479 trace_event=[event-list]
2480 [FTRACE] Set and start specified trace events in order
2481 to facilitate early boot debugging.
2482 See also Documentation/trace/events.txt
2483
2479 trix= [HW,OSS] MediaTrix AudioTrix Pro 2484 trix= [HW,OSS] MediaTrix AudioTrix Pro
2480 Format: 2485 Format:
2481 <io>,<irq>,<dma>,<dma2>,<sb_io>,<sb_irq>,<sb_dma>,<mpu_io>,<mpu_irq> 2486 <io>,<irq>,<dma>,<dma2>,<sb_io>,<sb_irq>,<sb_dma>,<mpu_io>,<mpu_irq>
diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt
index f157d7594ea7..2bcc8d4dea29 100644
--- a/Documentation/trace/events.txt
+++ b/Documentation/trace/events.txt
@@ -83,6 +83,15 @@ When reading one of these enable files, there are four results:
83 X - there is a mixture of events enabled and disabled 83 X - there is a mixture of events enabled and disabled
84 ? - this file does not affect any event 84 ? - this file does not affect any event
85 85
862.3 Boot option
87---------------
88
89In order to facilitate early boot debugging, use boot option:
90
91 trace_event=[event-list]
92
93The format of this boot option is the same as described in section 2.1.
94
863. Defining an event-enabled tracepoint 953. Defining an event-enabled tracepoint
87======================================= 96=======================================
88 97
diff --git a/Documentation/trace/ring-buffer-design.txt b/Documentation/trace/ring-buffer-design.txt
new file mode 100644
index 000000000000..5b1d23d604c5
--- /dev/null
+++ b/Documentation/trace/ring-buffer-design.txt
@@ -0,0 +1,955 @@
1 Lockless Ring Buffer Design
2 ===========================
3
4Copyright 2009 Red Hat Inc.
5 Author: Steven Rostedt <srostedt@redhat.com>
6 License: The GNU Free Documentation License, Version 1.2
7 (dual licensed under the GPL v2)
8Reviewers: Mathieu Desnoyers, Huang Ying, Hidetoshi Seto,
9 and Frederic Weisbecker.
10
11
12Written for: 2.6.31
13
14Terminology used in this Document
15---------------------------------
16
17tail - where new writes happen in the ring buffer.
18
19head - where new reads happen in the ring buffer.
20
21producer - the task that writes into the ring buffer (same as writer)
22
23writer - same as producer
24
25consumer - the task that reads from the buffer (same as reader)
26
27reader - same as consumer.
28
29reader_page - A page outside the ring buffer used solely (for the most part)
30 by the reader.
31
32head_page - a pointer to the page that the reader will use next
33
34tail_page - a pointer to the page that will be written to next
35
 36commit_page - a pointer to the page with the last finished non-nested write.
37
38cmpxchg - hardware assisted atomic transaction that performs the following:
39
40 A = B iff previous A == C
41
42 R = cmpxchg(A, C, B) is saying that we replace A with B if and only if
43 current A is equal to C, and we put the old (current) A into R
44
 45 R gets the previous A regardless of whether A is updated with B or not.
46
47 To see if the update was successful a compare of R == C may be used.
48
49The Generic Ring Buffer
50-----------------------
51
52The ring buffer can be used in either an overwrite mode or in
53producer/consumer mode.
54
 55Producer/consumer mode is where, if the producer were to fill up the
 56buffer before the consumer could free up anything, the producer
 57will stop writing to the buffer. This will lose the most recent events.
 58
 59Overwrite mode is where, if the producer were to fill up the buffer
 60before the consumer could free up anything, the producer will
 61overwrite the older data. This will lose the oldest events.
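
As a usage sketch, the mode is chosen when the buffer is allocated; this
assumes the ring_buffer_alloc()/RB_FL_OVERWRITE interface declared in
include/linux/ring_buffer.h (error handling omitted):

    #include <linux/ring_buffer.h>

    static struct ring_buffer *alloc_buffers(void)
    {
            /* Overwrite mode: when full, the oldest events are lost. */
            struct ring_buffer *rb_overwrite =
                    ring_buffer_alloc(1 << 16, RB_FL_OVERWRITE);

            /* Producer/consumer mode: when full, new writes are dropped. */
            struct ring_buffer *rb_prodcons =
                    ring_buffer_alloc(1 << 16, 0);

            ring_buffer_free(rb_prodcons);          /* just a sketch */
            return rb_overwrite;
    }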
62
 63No two writers can write at the same time (on the same per-cpu buffer),
 64but a writer may interrupt another writer. The interrupting writer must
 65finish writing before the previous writer may continue. This is very
 66important to the algorithm. The writers act like a "stack". The way
 67interrupts work enforces this behavior.
68
69
70 writer1 start
71 <preempted> writer2 start
72 <preempted> writer3 start
73 writer3 finishes
74 writer2 finishes
75 writer1 finishes
76
77This is very much like a writer being preempted by an interrupt and
78the interrupt doing a write as well.
79
 80Readers can happen at any time. But no two readers may run at the
 81same time, nor can a reader preempt/interrupt another reader. A reader
 82can not preempt/interrupt a writer, but it may read/consume from the
 83buffer at the same time as a writer is writing, provided the reader
 84is on another processor. A reader may read on its own processor
 85and can be preempted by a writer.
86
87A writer can preempt a reader, but a reader can not preempt a writer.
88But a reader can read the buffer at the same time (on another processor)
89as a writer.
90
 91The ring buffer is made up of a list of pages held together by a linked list.
92
93At initialization a reader page is allocated for the reader that is not
94part of the ring buffer.
95
96The head_page, tail_page and commit_page are all initialized to point
97to the same page.
98
99The reader page is initialized to have its next pointer pointing to
100the head page, and its previous pointer pointing to a page before
101the head page.
102
103The reader has its own page to use. At start up time, this page is
104allocated but is not attached to the list. When the reader wants
105to read from the buffer, if its page is empty (like it is on start up)
106it will swap its page with the head_page. The old reader page will
107become part of the ring buffer and the head_page will be removed.
108The page after the inserted page (old reader_page) will become the
109new head page.
110
111Once the new page is given to the reader, the reader could do what
112it wants with it, as long as a writer has left that page.
113
114A sample of how the reader page is swapped: Note this does not
115show the head page in the buffer, it is for demonstrating a swap
116only.
117
118 +------+
119 |reader| RING BUFFER
120 |page |
121 +------+
122 +---+ +---+ +---+
123 | |-->| |-->| |
124 | |<--| |<--| |
125 +---+ +---+ +---+
126 ^ | ^ |
127 | +-------------+ |
128 +-----------------+
129
130
131 +------+
132 |reader| RING BUFFER
133 |page |-------------------+
134 +------+ v
135 | +---+ +---+ +---+
136 | | |-->| |-->| |
137 | | |<--| |<--| |<-+
138 | +---+ +---+ +---+ |
139 | ^ | ^ | |
140 | | +-------------+ | |
141 | +-----------------+ |
142 +------------------------------------+
143
144 +------+
145 |reader| RING BUFFER
146 |page |-------------------+
147 +------+ <---------------+ v
148 | ^ +---+ +---+ +---+
149 | | | |-->| |-->| |
150 | | | | | |<--| |<-+
151 | | +---+ +---+ +---+ |
152 | | | ^ | |
153 | | +-------------+ | |
154 | +-----------------------------+ |
155 +------------------------------------+
156
157 +------+
158 |buffer| RING BUFFER
159 |page |-------------------+
160 +------+ <---------------+ v
161 | ^ +---+ +---+ +---+
162 | | | | | |-->| |
163 | | New | | | |<--| |<-+
164 | | Reader +---+ +---+ +---+ |
165 | | page ----^ | |
166 | | | |
167 | +-----------------------------+ |
168 +------------------------------------+
169
170
171
172It is possible that the page swapped is the commit page and the tail page,
173if what is in the ring buffer is less than what is held in a buffer page.
174
175
176 reader page commit page tail page
177 | | |
178 v | |
179 +---+ | |
180 | |<----------+ |
181 | |<------------------------+
182 | |------+
183 +---+ |
184 |
185 v
186 +---+ +---+ +---+ +---+
187<---| |--->| |--->| |--->| |--->
188--->| |<---| |<---| |<---| |<---
189 +---+ +---+ +---+ +---+
190
191This case is still valid for this algorithm.
192When the writer leaves the page, it simply goes into the ring buffer
193since the reader page still points to the next location in the ring
194buffer.
195
196
197The main pointers:
198
199 reader page - The page used solely by the reader and is not part
200 of the ring buffer (may be swapped in)
201
202 head page - the next page in the ring buffer that will be swapped
203 with the reader page.
204
205 tail page - the page where the next write will take place.
206
207 commit page - the page that last finished a write.
208
209The commit page is only updated by the outermost writer in the
210writer stack. A writer that preempts another writer will not move the
211commit page.
212
213When data is written into the ring buffer, a position is reserved
214in the ring buffer and passed back to the writer. When the writer
215is finished writing data into that position, it commits the write.
216
217Another write (or a read) may take place at any time during this
218transaction. If another write happens, it must finish before continuing
219with the previous write.
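
A minimal sketch of this reserve/commit transaction at the API level,
assuming the ring_buffer_lock_reserve()/ring_buffer_unlock_commit()
interface from include/linux/ring_buffer.h:

    #include <linux/ring_buffer.h>

    static int write_sample(struct ring_buffer *buffer, u64 value)
    {
            struct ring_buffer_event *event;
            u64 *body;

            /* Reserve a position on the tail page. */
            event = ring_buffer_lock_reserve(buffer, sizeof(value));
            if (!event)
                    return -EBUSY;  /* e.g. producer/consumer mode and full */

            body = ring_buffer_event_data(event);
            *body = value;

            /* Commit; only the outermost writer moves the commit page. */
            return ring_buffer_unlock_commit(buffer, event);
    }

The diagrams below show what happens inside the buffer page during these
two steps.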
220
221
222 Write reserve:
223
224 Buffer page
225 +---------+
226 |written |
227 +---------+ <--- given back to writer (current commit)
228 |reserved |
229 +---------+ <--- tail pointer
230 | empty |
231 +---------+
232
233 Write commit:
234
235 Buffer page
236 +---------+
237 |written |
238 +---------+
239 |written |
 240 +---------+ <--- next position for write (current commit)
241 | empty |
242 +---------+
243
244
245 If a write happens after the first reserve:
246
247 Buffer page
248 +---------+
249 |written |
250 +---------+ <-- current commit
251 |reserved |
252 +---------+ <--- given back to second writer
253 |reserved |
254 +---------+ <--- tail pointer
255
256 After second writer commits:
257
258
259 Buffer page
260 +---------+
261 |written |
262 +---------+ <--(last full commit)
263 |reserved |
264 +---------+
265 |pending |
266 |commit |
267 +---------+ <--- tail pointer
268
269 When the first writer commits:
270
271 Buffer page
272 +---------+
273 |written |
274 +---------+
275 |written |
276 +---------+
277 |written |
278 +---------+ <--(last full commit and tail pointer)
279
280
281The commit pointer points to the last write location that was
282committed without preempting another write. When a write that
283preempted another write is committed, it only becomes a pending commit
284and will not be a full commit till all writes have been committed.
285
286The commit page points to the page that has the last full commit.
287The tail page points to the page with the last write (before
288committing).
289
290The tail page is always equal to or after the commit page. It may
291be several pages ahead. If the tail page catches up to the commit
292page then no more writes may take place (regardless of the mode
293of the ring buffer: overwrite or producer/consumer).
294
295The order of pages is:
296
297 head page
298 commit page
299 tail page
300
301Possible scenario:
302 tail page
303 head page commit page |
304 | | |
305 v v v
306 +---+ +---+ +---+ +---+
307<---| |--->| |--->| |--->| |--->
308--->| |<---| |<---| |<---| |<---
309 +---+ +---+ +---+ +---+
310
311There is a special case where the head page is after either the commit page
312or possibly the tail page. That is when the commit (and tail) page has been
313swapped with the reader page. This is because the head page is always
314part of the ring buffer, but the reader page is not. Whenever less than
315a full page has been committed inside the ring buffer
316and a reader swaps out a page, it will be swapping out the commit page.
317
318
319 reader page commit page tail page
320 | | |
321 v | |
322 +---+ | |
323 | |<----------+ |
324 | |<------------------------+
325 | |------+
326 +---+ |
327 |
328 v
329 +---+ +---+ +---+ +---+
330<---| |--->| |--->| |--->| |--->
331--->| |<---| |<---| |<---| |<---
332 +---+ +---+ +---+ +---+
333 ^
334 |
335 head page
336
337
338In this case, the head page will not move when the tail and commit
339move back into the ring buffer.
340
341The reader can not swap a page into the ring buffer if the commit page
342is still on that page. If the reader meets the last commit (real commit
343not pending or reserved), then there is nothing more to read.
344The buffer is considered empty until another full commit finishes.
345
346When the tail meets the head page, if the buffer is in overwrite mode,
347the head page will be pushed ahead one. If the buffer is in producer/consumer
348mode, the write will fail.
349
350Overwrite mode:
351
352 tail page
353 |
354 v
355 +---+ +---+ +---+ +---+
356<---| |--->| |--->| |--->| |--->
357--->| |<---| |<---| |<---| |<---
358 +---+ +---+ +---+ +---+
359 ^
360 |
361 head page
362
363
364 tail page
365 |
366 v
367 +---+ +---+ +---+ +---+
368<---| |--->| |--->| |--->| |--->
369--->| |<---| |<---| |<---| |<---
370 +---+ +---+ +---+ +---+
371 ^
372 |
373 head page
374
375
376 tail page
377 |
378 v
379 +---+ +---+ +---+ +---+
380<---| |--->| |--->| |--->| |--->
381--->| |<---| |<---| |<---| |<---
382 +---+ +---+ +---+ +---+
383 ^
384 |
385 head page
386
387Note, the reader page will still point to the previous head page.
388But when a swap takes place, it will use the most recent head page.
389
390
391Making the Ring Buffer Lockless:
392--------------------------------
393
394The main idea behind the lockless algorithm is to combine the moving
395of the head_page pointer with the swapping of pages with the reader.
396State flags are placed inside the pointer to the page. To do this,
397each page must be aligned in memory by 4 bytes. This will allow the 2
398least significant bits of the address to be used as flags, since
399they will always be zero for the address. To get the address,
400simply mask out the flags.
401
402 MASK = ~3
403
404 address & MASK
405
406Two flags will be kept by these two bits:
407
408 HEADER - the page being pointed to is a head page
409
410 UPDATE - the page being pointed to is being updated by a writer
411 and was or is about to be a head page.
412
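
A sketch of how the flags are encoded and masked out in C (the flag
names below follow kernel/trace/ring_buffer.c, where HEADER is called
RB_PAGE_HEAD; the helpers are illustrative):

    #define RB_PAGE_NORMAL  0UL
    #define RB_PAGE_HEAD    1UL     /* "HEADER" in this document */
    #define RB_PAGE_UPDATE  2UL
    #define RB_FLAG_MASK    3UL

    /* Recover the real pointer by masking out the flag bits. */
    static struct list_head *rb_list_head(struct list_head *list)
    {
            unsigned long val = (unsigned long)list;

            return (struct list_head *)(val & ~RB_FLAG_MASK);
    }

    /* True if this next pointer marks the page it points to as the head page. */
    static int rb_is_head_pointer(struct list_head *next)
    {
            return ((unsigned long)next & RB_FLAG_MASK) == RB_PAGE_HEAD;
    }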
413
414 reader page
415 |
416 v
417 +---+
418 | |------+
419 +---+ |
420 |
421 v
422 +---+ +---+ +---+ +---+
423<---| |--->| |-H->| |--->| |--->
424--->| |<---| |<---| |<---| |<---
425 +---+ +---+ +---+ +---+
426
427
428The above pointer "-H->" would have the HEADER flag set. That is,
429the page it points to is the next page to be swapped out by the reader.
430This pointer means the next page is the head page.
431
432When the tail page meets the head pointer, it will use cmpxchg to
433change the pointer to the UPDATE state:
434
435
436 tail page
437 |
438 v
439 +---+ +---+ +---+ +---+
440<---| |--->| |-H->| |--->| |--->
441--->| |<---| |<---| |<---| |<---
442 +---+ +---+ +---+ +---+
443
444 tail page
445 |
446 v
447 +---+ +---+ +---+ +---+
448<---| |--->| |-U->| |--->| |--->
449--->| |<---| |<---| |<---| |<---
450 +---+ +---+ +---+ +---+
451
452"-U->" represents a pointer in the UPDATE state.
453
454Any access to the reader will need to take some sort of lock to serialize
455the readers. But the writers will never take a lock to write to the
456ring buffer. This means we only need to worry about a single reader,
457and writes only preempt in "stack" formation.
458
459When the reader tries to swap the page with the ring buffer, it
460will also use cmpxchg. If the flag bit in the pointer to the
461head page does not have the HEADER flag set, the compare will fail
462and the reader will need to look for the new head page and try again.
463Note, the UPDATE and HEADER flags are never set at the same time.
464
465The reader swaps the reader page as follows:
466
467 +------+
468 |reader| RING BUFFER
469 |page |
470 +------+
471 +---+ +---+ +---+
472 | |--->| |--->| |
473 | |<---| |<---| |
474 +---+ +---+ +---+
475 ^ | ^ |
476 | +---------------+ |
477 +-----H-------------+
478
479The reader sets the reader page's next pointer, with the HEADER flag, to
480the page after the head page.
481
482
483 +------+
484 |reader| RING BUFFER
485 |page |-------H-----------+
486 +------+ v
487 | +---+ +---+ +---+
488 | | |--->| |--->| |
489 | | |<---| |<---| |<-+
490 | +---+ +---+ +---+ |
491 | ^ | ^ | |
492 | | +---------------+ | |
493 | +-----H-------------+ |
494 +--------------------------------------+
495
496It does a cmpxchg with the pointer to the previous head page to make it
497point to the reader page. Note that the new pointer does not have the HEADER
498flag set. This action atomically moves the head page forward.
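
A simplified sketch of that cmpxchg, reusing the flag helpers from the
earlier sketch (the real reader-page swap in kernel/trace/ring_buffer.c
handles more corner cases):

    /* 'prev' is the page before the head page; its next pointer holds
     * the head page address with the HEADER flag set. */
    struct list_head *prev = reader_page->list.prev;
    struct list_head *old  = prev->next;            /* head page | HEADER */
    struct list_head *new  = &reader_page->list;    /* no flag bits set   */

    if (!rb_is_head_pointer(old) ||
        cmpxchg(&prev->next, old, new) != old) {
            /* A writer turned HEADER into UPDATE or moved the head:
             * find the new head page and try again. */
            goto retry;
    }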
499
500 +------+
501 |reader| RING BUFFER
502 |page |-------H-----------+
503 +------+ v
504 | ^ +---+ +---+ +---+
505 | | | |-->| |-->| |
506 | | | |<--| |<--| |<-+
507 | | +---+ +---+ +---+ |
508 | | | ^ | |
509 | | +-------------+ | |
510 | +-----------------------------+ |
511 +------------------------------------+
512
513After the new head page is set, the previous pointer of the head page is
514updated to the reader page.
515
516 +------+
517 |reader| RING BUFFER
518 |page |-------H-----------+
519 +------+ <---------------+ v
520 | ^ +---+ +---+ +---+
521 | | | |-->| |-->| |
522 | | | | | |<--| |<-+
523 | | +---+ +---+ +---+ |
524 | | | ^ | |
525 | | +-------------+ | |
526 | +-----------------------------+ |
527 +------------------------------------+
528
529 +------+
530 |buffer| RING BUFFER
531 |page |-------H-----------+ <--- New head page
532 +------+ <---------------+ v
533 | ^ +---+ +---+ +---+
534 | | | | | |-->| |
535 | | New | | | |<--| |<-+
536 | | Reader +---+ +---+ +---+ |
537 | | page ----^ | |
538 | | | |
539 | +-----------------------------+ |
540 +------------------------------------+
541
542Another important point: the page that the reader page points back to
543by its previous pointer (the one that now points to the new head page)
544never points back to the reader page. That is because the reader page is
545not part of the ring buffer. Traversing the ring buffer via the next pointers
546will always stay in the ring buffer. Traversing the ring buffer via the
547prev pointers may not.
548
549Note, the way to determine a reader page is simply by examining the previous
550pointer of the page. If the next pointer of the previous page does not
551point back to the original page, then the original page is a reader page:
552
553
554 +--------+
555 | reader | next +----+
556 | page |-------->| |<====== (buffer page)
557 +--------+ +----+
558 | | ^
559 | v | next
560 prev | +----+
561 +------------->| |
562 +----+
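
A sketch of that check as a predicate (the kernel has an equivalent
helper in kernel/trace/ring_buffer.c; the name here is illustrative):

    static int page_is_reader_page(struct buffer_page *page)
    {
            struct list_head *prev = rb_list_head(page->list.prev);

            /* If the previous page's next pointer does not point back
             * to this page, this page is the reader page. */
            return rb_list_head(prev->next) != &page->list;
    }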
563
564The way the head page moves forward:
565
566When the tail page meets the head page and the buffer is in overwrite mode
567and more writes take place, the head page must be moved forward before the
568writer may move the tail page. The way this is done is that the writer
569performs a cmpxchg to convert the pointer to the head page from the HEADER
570flag to have the UPDATE flag set. Once this is done, the reader will
571not be able to swap the head page from the buffer, nor will it be able to
572move the head page, until the writer is finished with the move.
573
574This eliminates any races that the reader can have on the writer. The reader
575must spin, and this is why the reader can not preempt the writer.
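
A sketch of that HEADER-to-UPDATE conversion, the transition shown in the
diagrams that follow (illustrative only; the kernel's helper also handles
the later UPDATE-to-NORMAL transition):

    static int rb_head_to_update(struct buffer_page *tail_page,
                                 struct buffer_page *head_page)
    {
            unsigned long old = (unsigned long)&head_page->list | RB_PAGE_HEAD;
            unsigned long new = (unsigned long)&head_page->list | RB_PAGE_UPDATE;

            /* The pointer to the head page lives in the tail page's next
             * field.  The cmpxchg fails if the reader swapped the head
             * page out first; the writer must then re-read the head. */
            return cmpxchg((unsigned long *)&tail_page->list.next,
                           old, new) == old;
    }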
576
577 tail page
578 |
579 v
580 +---+ +---+ +---+ +---+
581<---| |--->| |-H->| |--->| |--->
582--->| |<---| |<---| |<---| |<---
583 +---+ +---+ +---+ +---+
584
585 tail page
586 |
587 v
588 +---+ +---+ +---+ +---+
589<---| |--->| |-U->| |--->| |--->
590--->| |<---| |<---| |<---| |<---
591 +---+ +---+ +---+ +---+
592
593The following page will be made into the new head page.
594
595 tail page
596 |
597 v
598 +---+ +---+ +---+ +---+
599<---| |--->| |-U->| |-H->| |--->
600--->| |<---| |<---| |<---| |<---
601 +---+ +---+ +---+ +---+
602
603After the new head page has been set, we can set the old head page
604pointer back to NORMAL.
605
606 tail page
607 |
608 v
609 +---+ +---+ +---+ +---+
610<---| |--->| |--->| |-H->| |--->
611--->| |<---| |<---| |<---| |<---
612 +---+ +---+ +---+ +---+
613
614After the head page has been moved, the tail page may now move forward.
615
616 tail page
617 |
618 v
619 +---+ +---+ +---+ +---+
620<---| |--->| |--->| |-H->| |--->
621--->| |<---| |<---| |<---| |<---
622 +---+ +---+ +---+ +---+
623
624
625The above are the trivial updates. Now for the more complex scenarios.
626
627
628As stated before, if enough writes preempt the first write, the
629tail page may make it all the way around the buffer and meet the commit
630page. At this time, we must start dropping writes (usually with some kind
631of warning to the user). But what happens if the commit was still on the
632reader page? The commit page is not part of the ring buffer. The tail page
633must account for this.
634
635
636 reader page commit page
637 | |
638 v |
639 +---+ |
640 | |<----------+
641 | |
642 | |------+
643 +---+ |
644 |
645 v
646 +---+ +---+ +---+ +---+
647<---| |--->| |-H->| |--->| |--->
648--->| |<---| |<---| |<---| |<---
649 +---+ +---+ +---+ +---+
650 ^
651 |
652 tail page
653
654If the tail page were to simply push the head page forward, the commit,
655when leaving the reader page, would not be pointing to the correct page.
656
657The solution to this is to test if the commit page is on the reader page
658before pushing the head page. If it is, then it can be assumed that the
659tail page wrapped the buffer, and we must drop new writes.
660
661This is not a race condition, because the commit page can only be moved
662by the outermost writer (the writer that was preempted).
663This means that the commit will not move while a writer is moving the
664tail page. The reader can not swap the reader page if it is also being
665used as the commit page. The reader can simply check that the commit
666is off the reader page. Once the commit page leaves the reader page
667it will never go back on it unless a reader does another swap with the
668buffer page that is also the commit page.
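
A sketch of that test at the point where the tail would push the head
(field names as used in this document; simplified):

    if (cpu_buffer->commit_page == cpu_buffer->reader_page) {
            /* The tail has wrapped the whole buffer while the commit
             * is still on the reader page: drop this write. */
            return NULL;
    }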
669
670
671Nested writes
672-------------
673
674When pushing the tail page forward, we must first push forward
675the head page if the head page is the next page. If the head page
676is not the next page, the tail page is simply updated with a cmpxchg.
677
678Only writers move the tail page. This must be done atomically to protect
679against nested writers.
680
681 temp_page = tail_page
682 next_page = temp_page->next
683 cmpxchg(tail_page, temp_page, next_page)
684
685The above will update the tail page if it is still pointing to the expected
686page. If this fails, a nested write pushed it forward, and the current write
687does not need to push it.
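
Rendered as C, that update looks roughly like the following (simplified;
the kernel's version must also preserve the flag bits carried in the
pointers and reset the new page's write counters):

    static void rb_move_tail_simple(struct ring_buffer_per_cpu *cpu_buffer,
                                    struct buffer_page *tail_page,
                                    struct buffer_page *next_page)
    {
            /* Succeeds only if no nested writer moved the tail first;
             * on failure the tail already points past tail_page. */
            cmpxchg(&cpu_buffer->tail_page, tail_page, next_page);
    }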
688
689
690 temp page
691 |
692 v
693 tail page
694 |
695 v
696 +---+ +---+ +---+ +---+
697<---| |--->| |--->| |--->| |--->
698--->| |<---| |<---| |<---| |<---
699 +---+ +---+ +---+ +---+
700
701Nested write comes in and moves the tail page forward:
702
703 tail page (moved by nested writer)
704 temp page |
705 | |
706 v v
707 +---+ +---+ +---+ +---+
708<---| |--->| |--->| |--->| |--->
709--->| |<---| |<---| |<---| |<---
710 +---+ +---+ +---+ +---+
711
712The above would fail the cmpxchg, but since the tail page has already
713been moved forward, the writer will just try again to reserve storage
714on the new tail page.
715
716But the moving of the head page is a bit more complex.
717
718 tail page
719 |
720 v
721 +---+ +---+ +---+ +---+
722<---| |--->| |-H->| |--->| |--->
723--->| |<---| |<---| |<---| |<---
724 +---+ +---+ +---+ +---+
725
726The write converts the head page pointer to UPDATE.
727
728 tail page
729 |
730 v
731 +---+ +---+ +---+ +---+
732<---| |--->| |-U->| |--->| |--->
733--->| |<---| |<---| |<---| |<---
734 +---+ +---+ +---+ +---+
735
736But a nested writer may preempt here. It will see that the next
737page is a head page, but it is also nested. It will detect that
738it is nested and will save that information. The detection is the
739fact that it sees the UPDATE flag instead of a HEADER or NORMAL
740pointer.
741
742The nested writer will set the new head page pointer.
743
744 tail page
745 |
746 v
747 +---+ +---+ +---+ +---+
748<---| |--->| |-U->| |-H->| |--->
749--->| |<---| |<---| |<---| |<---
750 +---+ +---+ +---+ +---+
751
752But it will not reset the update back to normal. Only the writer
753that converted a pointer from HEAD to UPDATE will convert it back
754to NORMAL.
755
756 tail page
757 |
758 v
759 +---+ +---+ +---+ +---+
760<---| |--->| |-U->| |-H->| |--->
761--->| |<---| |<---| |<---| |<---
762 +---+ +---+ +---+ +---+
763
764After the nested writer finishes, the outermost writer will convert
765the UPDATE pointer to NORMAL.
766
767
768 tail page
769 |
770 v
771 +---+ +---+ +---+ +---+
772<---| |--->| |--->| |-H->| |--->
773--->| |<---| |<---| |<---| |<---
774 +---+ +---+ +---+ +---+
775
776
777It can be even more complex if several nested writes came in and moved
778the tail page ahead several pages:
779
780
781(first writer)
782
783 tail page
784 |
785 v
786 +---+ +---+ +---+ +---+
787<---| |--->| |-H->| |--->| |--->
788--->| |<---| |<---| |<---| |<---
789 +---+ +---+ +---+ +---+
790
791The write converts the head page pointer to UPDATE.
792
793 tail page
794 |
795 v
796 +---+ +---+ +---+ +---+
797<---| |--->| |-U->| |--->| |--->
798--->| |<---| |<---| |<---| |<---
799 +---+ +---+ +---+ +---+
800
801Next writer comes in, and sees the update and sets up the new
802head page.
803
804(second writer)
805
806 tail page
807 |
808 v
809 +---+ +---+ +---+ +---+
810<---| |--->| |-U->| |-H->| |--->
811--->| |<---| |<---| |<---| |<---
812 +---+ +---+ +---+ +---+
813
814The nested writer moves the tail page forward, but does not set the old
815update page to NORMAL because it is not the outermost writer.
816
817 tail page
818 |
819 v
820 +---+ +---+ +---+ +---+
821<---| |--->| |-U->| |-H->| |--->
822--->| |<---| |<---| |<---| |<---
823 +---+ +---+ +---+ +---+
824
825Another writer preempts and sees the page after the tail page is a head page.
826It changes it from HEAD to UPDATE.
827
828(third writer)
829
830 tail page
831 |
832 v
833 +---+ +---+ +---+ +---+
834<---| |--->| |-U->| |-U->| |--->
835--->| |<---| |<---| |<---| |<---
836 +---+ +---+ +---+ +---+
837
838The writer will move the head page forward:
839
840
841(third writer)
842
843 tail page
844 |
845 v
846 +---+ +---+ +---+ +---+
847<---| |--->| |-U->| |-U->| |-H->
848--->| |<---| |<---| |<---| |<---
849 +---+ +---+ +---+ +---+
850
851But since it was the third writer that changed the HEAD flag to UPDATE,
852it will also convert it back to NORMAL:
853
854
855(third writer)
856
857 tail page
858 |
859 v
860 +---+ +---+ +---+ +---+
861<---| |--->| |-U->| |--->| |-H->
862--->| |<---| |<---| |<---| |<---
863 +---+ +---+ +---+ +---+
864
865
866Then it will move the tail page and return to the second writer.
867
868
869(second writer)
870
871 tail page
872 |
873 v
874 +---+ +---+ +---+ +---+
875<---| |--->| |-U->| |--->| |-H->
876--->| |<---| |<---| |<---| |<---
877 +---+ +---+ +---+ +---+
878
879
880The second writer will fail to move the tail page because it was already
881moved, so it will try again and add its data to the new tail page.
882It will return to the first writer.
883
884
885(first writer)
886
887 tail page
888 |
889 v
890 +---+ +---+ +---+ +---+
891<---| |--->| |-U->| |--->| |-H->
892--->| |<---| |<---| |<---| |<---
893 +---+ +---+ +---+ +---+
894
895The first writer can not atomically test whether the tail page moved
896while it updates the HEAD page. It will then update the head page to
897what it thinks is the new head page.
898
899
900(first writer)
901
902 tail page
903 |
904 v
905 +---+ +---+ +---+ +---+
906<---| |--->| |-U->| |-H->| |-H->
907--->| |<---| |<---| |<---| |<---
908 +---+ +---+ +---+ +---+
909
910Since the cmpxchg returns the old value of the pointer, the first writer
911will see it succeeded in updating the pointer from NORMAL to HEAD.
912But as we can see, this is not good enough. It must also check to see
913if the tail page is either where it used to be or on the next page:
914
915
916(first writer)
917
918 A B tail page
919 | | |
920 v v v
921 +---+ +---+ +---+ +---+
922<---| |--->| |-U->| |-H->| |-H->
923--->| |<---| |<---| |<---| |<---
924 +---+ +---+ +---+ +---+
925
926If the tail page is neither A nor B, then it must reset the
927pointer back to NORMAL. Since it only needs to worry about
928nested writers, it only needs to check this after setting the HEAD page.
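
A sketch of that check (illustrative; rb_set_list_to_normal() is a
hypothetical helper standing in for the pointer reset):

    if (cpu_buffer->tail_page != page_A && cpu_buffer->tail_page != page_B)
            rb_set_list_to_normal(new_head);        /* undo the HEAD flag */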
929
930
931(first writer)
932
933 A B tail page
934 | | |
935 v v v
936 +---+ +---+ +---+ +---+
937<---| |--->| |-U->| |--->| |-H->
938--->| |<---| |<---| |<---| |<---
939 +---+ +---+ +---+ +---+
940
941Now the writer can update the head page. This is also why the head page must
942remain in UPDATE and only be reset by the outermost writer. This prevents
943the reader from seeing the incorrect head page.
944
945
946(first writer)
947
948 A B tail page
949 | | |
950 v v v
951 +---+ +---+ +---+ +---+
952<---| |--->| |--->| |--->| |-H->
953--->| |<---| |<---| |<---| |<---
954 +---+ +---+ +---+ +---+
955
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index d94e1ea3b9fe..8e9663413b7f 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -417,10 +417,6 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
417 unsigned long return_hooker = (unsigned long) 417 unsigned long return_hooker = (unsigned long)
418 &return_to_handler; 418 &return_to_handler;
419 419
420 /* Nmi's are currently unsupported */
421 if (unlikely(in_nmi()))
422 return;
423
424 if (unlikely(atomic_read(&current->tracing_graph_pause))) 420 if (unlikely(atomic_read(&current->tracing_graph_pause)))
425 return; 421 return;
426 422
diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index a81170de7f6b..ac8c6f8cf242 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -103,6 +103,8 @@ void trace_current_buffer_discard_commit(struct ring_buffer_event *event);
103 103
104void tracing_record_cmdline(struct task_struct *tsk); 104void tracing_record_cmdline(struct task_struct *tsk);
105 105
106struct event_filter;
107
106struct ftrace_event_call { 108struct ftrace_event_call {
107 struct list_head list; 109 struct list_head list;
108 char *name; 110 char *name;
@@ -118,7 +120,7 @@ struct ftrace_event_call {
118 int (*define_fields)(void); 120 int (*define_fields)(void);
119 struct list_head fields; 121 struct list_head fields;
120 int filter_active; 122 int filter_active;
121 void *filter; 123 struct event_filter *filter;
122 void *mod; 124 void *mod;
123 125
124 atomic_t profile_count; 126 atomic_t profile_count;
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 29f8599e6bea..7fca71693ae7 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -170,7 +170,6 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer);
170unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu); 170unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu);
171unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu); 171unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu);
172unsigned long ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu); 172unsigned long ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu);
173unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu);
174 173
175u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu); 174u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu);
176void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer, 175void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer,
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index f64fbaae781a..25d3b02a06f8 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -25,7 +25,7 @@
25#define __array(type, item, len) type item[len]; 25#define __array(type, item, len) type item[len];
26 26
27#undef __dynamic_array 27#undef __dynamic_array
28#define __dynamic_array(type, item, len) unsigned short __data_loc_##item; 28#define __dynamic_array(type, item, len) u32 __data_loc_##item;
29 29
30#undef __string 30#undef __string
31#define __string(item, src) __dynamic_array(char, item, -1) 31#define __string(item, src) __dynamic_array(char, item, -1)
@@ -51,13 +51,14 @@
51 * Include the following: 51 * Include the following:
52 * 52 *
53 * struct ftrace_data_offsets_<call> { 53 * struct ftrace_data_offsets_<call> {
54 * int <item1>; 54 * u32 <item1>;
55 * int <item2>; 55 * u32 <item2>;
56 * [...] 56 * [...]
57 * }; 57 * };
58 * 58 *
59 * The __dynamic_array() macro will create each int <item>, this is 59 * The __dynamic_array() macro will create each u32 <item>, this is
60 * to keep the offset of each array from the beginning of the event. 60 * to keep the offset of each array from the beginning of the event.
61 * The size of an array is also encoded, in the higher 16 bits of <item>.
61 */ 62 */
62 63
63#undef __field 64#undef __field
@@ -67,7 +68,7 @@
67#define __array(type, item, len) 68#define __array(type, item, len)
68 69
69#undef __dynamic_array 70#undef __dynamic_array
70#define __dynamic_array(type, item, len) int item; 71#define __dynamic_array(type, item, len) u32 item;
71 72
72#undef __string 73#undef __string
73#define __string(item, src) __dynamic_array(char, item, -1) 74#define __string(item, src) __dynamic_array(char, item, -1)
@@ -120,7 +121,7 @@
120 121
121#undef __dynamic_array 122#undef __dynamic_array
122#define __dynamic_array(type, item, len) \ 123#define __dynamic_array(type, item, len) \
123 ret = trace_seq_printf(s, "\tfield:__data_loc " #item ";\t" \ 124 ret = trace_seq_printf(s, "\tfield:__data_loc " #type "[] " #item ";\t"\
124 "offset:%u;\tsize:%u;\n", \ 125 "offset:%u;\tsize:%u;\n", \
125 (unsigned int)offsetof(typeof(field), \ 126 (unsigned int)offsetof(typeof(field), \
126 __data_loc_##item), \ 127 __data_loc_##item), \
@@ -210,7 +211,7 @@ ftrace_format_##call(struct trace_seq *s) \
210 211
211#undef __get_dynamic_array 212#undef __get_dynamic_array
212#define __get_dynamic_array(field) \ 213#define __get_dynamic_array(field) \
213 ((void *)__entry + __entry->__data_loc_##field) 214 ((void *)__entry + (__entry->__data_loc_##field & 0xffff))
214 215
215#undef __get_str 216#undef __get_str
216#define __get_str(field) (char *)__get_dynamic_array(field) 217#define __get_str(field) (char *)__get_dynamic_array(field)
@@ -282,7 +283,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \
282 283
283#undef __dynamic_array 284#undef __dynamic_array
284#define __dynamic_array(type, item, len) \ 285#define __dynamic_array(type, item, len) \
285 ret = trace_define_field(event_call, "__data_loc" "[" #type "]", #item,\ 286 ret = trace_define_field(event_call, "__data_loc " #type "[]", #item, \
286 offsetof(typeof(field), __data_loc_##item), \ 287 offsetof(typeof(field), __data_loc_##item), \
287 sizeof(field.__data_loc_##item), 0); 288 sizeof(field.__data_loc_##item), 0);
288 289
@@ -328,6 +329,7 @@ ftrace_define_fields_##call(void) \
328#define __dynamic_array(type, item, len) \ 329#define __dynamic_array(type, item, len) \
329 __data_offsets->item = __data_size + \ 330 __data_offsets->item = __data_size + \
330 offsetof(typeof(*entry), __data); \ 331 offsetof(typeof(*entry), __data); \
332 __data_offsets->item |= (len * sizeof(type)) << 16; \
331 __data_size += (len) * sizeof(type); 333 __data_size += (len) * sizeof(type);
332 334
333#undef __string 335#undef __string
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 0540948e29ab..ef177d653b2c 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -103,7 +103,7 @@ static struct kprobe_blackpoint kprobe_blacklist[] = {
103#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t))) 103#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
104 104
105struct kprobe_insn_page { 105struct kprobe_insn_page {
106 struct hlist_node hlist; 106 struct list_head list;
107 kprobe_opcode_t *insns; /* Page of instruction slots */ 107 kprobe_opcode_t *insns; /* Page of instruction slots */
108 char slot_used[INSNS_PER_PAGE]; 108 char slot_used[INSNS_PER_PAGE];
109 int nused; 109 int nused;
@@ -117,7 +117,7 @@ enum kprobe_slot_state {
117}; 117};
118 118
119static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */ 119static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */
120static struct hlist_head kprobe_insn_pages; 120static LIST_HEAD(kprobe_insn_pages);
121static int kprobe_garbage_slots; 121static int kprobe_garbage_slots;
122static int collect_garbage_slots(void); 122static int collect_garbage_slots(void);
123 123
@@ -152,10 +152,9 @@ loop_end:
152static kprobe_opcode_t __kprobes *__get_insn_slot(void) 152static kprobe_opcode_t __kprobes *__get_insn_slot(void)
153{ 153{
154 struct kprobe_insn_page *kip; 154 struct kprobe_insn_page *kip;
155 struct hlist_node *pos;
156 155
157 retry: 156 retry:
158 hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) { 157 list_for_each_entry(kip, &kprobe_insn_pages, list) {
159 if (kip->nused < INSNS_PER_PAGE) { 158 if (kip->nused < INSNS_PER_PAGE) {
160 int i; 159 int i;
161 for (i = 0; i < INSNS_PER_PAGE; i++) { 160 for (i = 0; i < INSNS_PER_PAGE; i++) {
@@ -189,8 +188,8 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(void)
189 kfree(kip); 188 kfree(kip);
190 return NULL; 189 return NULL;
191 } 190 }
192 INIT_HLIST_NODE(&kip->hlist); 191 INIT_LIST_HEAD(&kip->list);
193 hlist_add_head(&kip->hlist, &kprobe_insn_pages); 192 list_add(&kip->list, &kprobe_insn_pages);
194 memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE); 193 memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE);
195 kip->slot_used[0] = SLOT_USED; 194 kip->slot_used[0] = SLOT_USED;
196 kip->nused = 1; 195 kip->nused = 1;
@@ -219,12 +218,8 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
219 * so as not to have to set it up again the 218 * so as not to have to set it up again the
220 * next time somebody inserts a probe. 219 * next time somebody inserts a probe.
221 */ 220 */
222 hlist_del(&kip->hlist); 221 if (!list_is_singular(&kprobe_insn_pages)) {
223 if (hlist_empty(&kprobe_insn_pages)) { 222 list_del(&kip->list);
224 INIT_HLIST_NODE(&kip->hlist);
225 hlist_add_head(&kip->hlist,
226 &kprobe_insn_pages);
227 } else {
228 module_free(NULL, kip->insns); 223 module_free(NULL, kip->insns);
229 kfree(kip); 224 kfree(kip);
230 } 225 }
@@ -235,14 +230,13 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
235 230
236static int __kprobes collect_garbage_slots(void) 231static int __kprobes collect_garbage_slots(void)
237{ 232{
238 struct kprobe_insn_page *kip; 233 struct kprobe_insn_page *kip, *next;
239 struct hlist_node *pos, *next;
240 234
241 /* Ensure no-one is preepmted on the garbages */ 235 /* Ensure no-one is preepmted on the garbages */
242 if (check_safety()) 236 if (check_safety())
243 return -EAGAIN; 237 return -EAGAIN;
244 238
245 hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) { 239 list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) {
246 int i; 240 int i;
247 if (kip->ngarbage == 0) 241 if (kip->ngarbage == 0)
248 continue; 242 continue;
@@ -260,19 +254,17 @@ static int __kprobes collect_garbage_slots(void)
260void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) 254void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
261{ 255{
262 struct kprobe_insn_page *kip; 256 struct kprobe_insn_page *kip;
263 struct hlist_node *pos;
264 257
265 mutex_lock(&kprobe_insn_mutex); 258 mutex_lock(&kprobe_insn_mutex);
266 hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) { 259 list_for_each_entry(kip, &kprobe_insn_pages, list) {
267 if (kip->insns <= slot && 260 if (kip->insns <= slot &&
268 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { 261 slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) {
269 int i = (slot - kip->insns) / MAX_INSN_SIZE; 262 int i = (slot - kip->insns) / MAX_INSN_SIZE;
270 if (dirty) { 263 if (dirty) {
271 kip->slot_used[i] = SLOT_DIRTY; 264 kip->slot_used[i] = SLOT_DIRTY;
272 kip->ngarbage++; 265 kip->ngarbage++;
273 } else { 266 } else
274 collect_one_slot(kip, i); 267 collect_one_slot(kip, i);
275 }
276 break; 268 break;
277 } 269 }
278 } 270 }
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 1e1d23c26308..094863416b2e 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1016,71 +1016,35 @@ static int
1016__ftrace_replace_code(struct dyn_ftrace *rec, int enable) 1016__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1017{ 1017{
1018 unsigned long ftrace_addr; 1018 unsigned long ftrace_addr;
1019 unsigned long ip, fl; 1019 unsigned long flag = 0UL;
1020 1020
1021 ftrace_addr = (unsigned long)FTRACE_ADDR; 1021 ftrace_addr = (unsigned long)FTRACE_ADDR;
1022 1022
1023 ip = rec->ip;
1024
1025 /* 1023 /*
1026 * If this record is not to be traced and 1024 * If this record is not to be traced or we want to disable it,
1027 * it is not enabled then do nothing. 1025 * then disable it.
1028 * 1026 *
1029 * If this record is not to be traced and 1027 * If we want to enable it and filtering is off, then enable it.
1030 * it is enabled then disable it.
1031 * 1028 *
1029 * If we want to enable it and filtering is on, enable it only if
1030 * it's filtered
1032 */ 1031 */
1033 if (rec->flags & FTRACE_FL_NOTRACE) { 1032 if (enable && !(rec->flags & FTRACE_FL_NOTRACE)) {
1034 if (rec->flags & FTRACE_FL_ENABLED) 1033 if (!ftrace_filtered || (rec->flags & FTRACE_FL_FILTER))
1035 rec->flags &= ~FTRACE_FL_ENABLED; 1034 flag = FTRACE_FL_ENABLED;
1036 else 1035 }
1037 return 0;
1038
1039 } else if (ftrace_filtered && enable) {
1040 /*
1041 * Filtering is on:
1042 */
1043
1044 fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED);
1045
1046 /* Record is filtered and enabled, do nothing */
1047 if (fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED))
1048 return 0;
1049
1050 /* Record is not filtered or enabled, do nothing */
1051 if (!fl)
1052 return 0;
1053
1054 /* Record is not filtered but enabled, disable it */
1055 if (fl == FTRACE_FL_ENABLED)
1056 rec->flags &= ~FTRACE_FL_ENABLED;
1057 else
1058 /* Otherwise record is filtered but not enabled, enable it */
1059 rec->flags |= FTRACE_FL_ENABLED;
1060 } else {
1061 /* Disable or not filtered */
1062
1063 if (enable) {
1064 /* if record is enabled, do nothing */
1065 if (rec->flags & FTRACE_FL_ENABLED)
1066 return 0;
1067
1068 rec->flags |= FTRACE_FL_ENABLED;
1069
1070 } else {
1071 1036
1072 /* if record is not enabled, do nothing */ 1037 /* If the state of this record hasn't changed, then do nothing */
1073 if (!(rec->flags & FTRACE_FL_ENABLED)) 1038 if ((rec->flags & FTRACE_FL_ENABLED) == flag)
1074 return 0; 1039 return 0;
1075 1040
1076 rec->flags &= ~FTRACE_FL_ENABLED; 1041 if (flag) {
1077 } 1042 rec->flags |= FTRACE_FL_ENABLED;
1043 return ftrace_make_call(rec, ftrace_addr);
1078 } 1044 }
1079 1045
1080 if (rec->flags & FTRACE_FL_ENABLED) 1046 rec->flags &= ~FTRACE_FL_ENABLED;
1081 return ftrace_make_call(rec, ftrace_addr); 1047 return ftrace_make_nop(NULL, rec, ftrace_addr);
1082 else
1083 return ftrace_make_nop(NULL, rec, ftrace_addr);
1084} 1048}
1085 1049
1086static void ftrace_replace_code(int enable) 1050static void ftrace_replace_code(int enable)
@@ -1375,7 +1339,6 @@ struct ftrace_iterator {
1375 unsigned flags; 1339 unsigned flags;
1376 unsigned char buffer[FTRACE_BUFF_MAX+1]; 1340 unsigned char buffer[FTRACE_BUFF_MAX+1];
1377 unsigned buffer_idx; 1341 unsigned buffer_idx;
1378 unsigned filtered;
1379}; 1342};
1380 1343
1381static void * 1344static void *
@@ -1438,18 +1401,13 @@ static int t_hash_show(struct seq_file *m, void *v)
1438{ 1401{
1439 struct ftrace_func_probe *rec; 1402 struct ftrace_func_probe *rec;
1440 struct hlist_node *hnd = v; 1403 struct hlist_node *hnd = v;
1441 char str[KSYM_SYMBOL_LEN];
1442 1404
1443 rec = hlist_entry(hnd, struct ftrace_func_probe, node); 1405 rec = hlist_entry(hnd, struct ftrace_func_probe, node);
1444 1406
1445 if (rec->ops->print) 1407 if (rec->ops->print)
1446 return rec->ops->print(m, rec->ip, rec->ops, rec->data); 1408 return rec->ops->print(m, rec->ip, rec->ops, rec->data);
1447 1409
1448 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); 1410 seq_printf(m, "%pf:%pf", (void *)rec->ip, (void *)rec->ops->func);
1449 seq_printf(m, "%s:", str);
1450
1451 kallsyms_lookup((unsigned long)rec->ops->func, NULL, NULL, NULL, str);
1452 seq_printf(m, "%s", str);
1453 1411
1454 if (rec->data) 1412 if (rec->data)
1455 seq_printf(m, ":%p", rec->data); 1413 seq_printf(m, ":%p", rec->data);
@@ -1547,7 +1505,6 @@ static int t_show(struct seq_file *m, void *v)
1547{ 1505{
1548 struct ftrace_iterator *iter = m->private; 1506 struct ftrace_iterator *iter = m->private;
1549 struct dyn_ftrace *rec = v; 1507 struct dyn_ftrace *rec = v;
1550 char str[KSYM_SYMBOL_LEN];
1551 1508
1552 if (iter->flags & FTRACE_ITER_HASH) 1509 if (iter->flags & FTRACE_ITER_HASH)
1553 return t_hash_show(m, v); 1510 return t_hash_show(m, v);
@@ -1560,9 +1517,7 @@ static int t_show(struct seq_file *m, void *v)
1560 if (!rec) 1517 if (!rec)
1561 return 0; 1518 return 0;
1562 1519
1563 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); 1520 seq_printf(m, "%pf\n", (void *)rec->ip);
1564
1565 seq_printf(m, "%s\n", str);
1566 1521
1567 return 0; 1522 return 0;
1568} 1523}
@@ -2312,7 +2267,6 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2312 } 2267 }
2313 2268
2314 if (isspace(ch)) { 2269 if (isspace(ch)) {
2315 iter->filtered++;
2316 iter->buffer[iter->buffer_idx] = 0; 2270 iter->buffer[iter->buffer_idx] = 0;
2317 ret = ftrace_process_regex(iter->buffer, 2271 ret = ftrace_process_regex(iter->buffer,
2318 iter->buffer_idx, enable); 2272 iter->buffer_idx, enable);
@@ -2443,7 +2397,6 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
2443 iter = file->private_data; 2397 iter = file->private_data;
2444 2398
2445 if (iter->buffer_idx) { 2399 if (iter->buffer_idx) {
2446 iter->filtered++;
2447 iter->buffer[iter->buffer_idx] = 0; 2400 iter->buffer[iter->buffer_idx] = 0;
2448 ftrace_match_records(iter->buffer, iter->buffer_idx, enable); 2401 ftrace_match_records(iter->buffer, iter->buffer_idx, enable);
2449 } 2402 }
@@ -2543,7 +2496,6 @@ static void g_stop(struct seq_file *m, void *p)
2543static int g_show(struct seq_file *m, void *v) 2496static int g_show(struct seq_file *m, void *v)
2544{ 2497{
2545 unsigned long *ptr = v; 2498 unsigned long *ptr = v;
2546 char str[KSYM_SYMBOL_LEN];
2547 2499
2548 if (!ptr) 2500 if (!ptr)
2549 return 0; 2501 return 0;
@@ -2553,9 +2505,7 @@ static int g_show(struct seq_file *m, void *v)
2553 return 0; 2505 return 0;
2554 } 2506 }
2555 2507
2556 kallsyms_lookup(*ptr, NULL, NULL, NULL, str); 2508 seq_printf(m, "%pf\n", v);
2557
2558 seq_printf(m, "%s\n", str);
2559 2509
2560 return 0; 2510 return 0;
2561} 2511}
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index 1edaa9516e81..dda53ccf749b 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -239,12 +239,52 @@ struct kmemtrace_user_event_alloc {
239}; 239};
240 240
241static enum print_line_t 241static enum print_line_t
242kmemtrace_print_alloc_user(struct trace_iterator *iter, 242kmemtrace_print_alloc(struct trace_iterator *iter, int flags)
243 struct kmemtrace_alloc_entry *entry)
244{ 243{
245 struct kmemtrace_user_event_alloc *ev_alloc;
246 struct trace_seq *s = &iter->seq; 244 struct trace_seq *s = &iter->seq;
245 struct kmemtrace_alloc_entry *entry;
246 int ret;
247
248 trace_assign_type(entry, iter->ent);
249
250 ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu "
251 "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n",
252 entry->type_id, (void *)entry->call_site, (unsigned long)entry->ptr,
253 (unsigned long)entry->bytes_req, (unsigned long)entry->bytes_alloc,
254 (unsigned long)entry->gfp_flags, entry->node);
255
256 if (!ret)
257 return TRACE_TYPE_PARTIAL_LINE;
258 return TRACE_TYPE_HANDLED;
259}
260
261static enum print_line_t
262kmemtrace_print_free(struct trace_iterator *iter, int flags)
263{
264 struct trace_seq *s = &iter->seq;
265 struct kmemtrace_free_entry *entry;
266 int ret;
267
268 trace_assign_type(entry, iter->ent);
269
270 ret = trace_seq_printf(s, "type_id %d call_site %pF ptr %lu\n",
271 entry->type_id, (void *)entry->call_site,
272 (unsigned long)entry->ptr);
273
274 if (!ret)
275 return TRACE_TYPE_PARTIAL_LINE;
276 return TRACE_TYPE_HANDLED;
277}
278
279static enum print_line_t
280kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags)
281{
282 struct trace_seq *s = &iter->seq;
283 struct kmemtrace_alloc_entry *entry;
247 struct kmemtrace_user_event *ev; 284 struct kmemtrace_user_event *ev;
285 struct kmemtrace_user_event_alloc *ev_alloc;
286
287 trace_assign_type(entry, iter->ent);
248 288
249 ev = trace_seq_reserve(s, sizeof(*ev)); 289 ev = trace_seq_reserve(s, sizeof(*ev));
250 if (!ev) 290 if (!ev)
@@ -271,12 +311,14 @@ kmemtrace_print_alloc_user(struct trace_iterator *iter,
271} 311}
272 312
273static enum print_line_t 313static enum print_line_t
274kmemtrace_print_free_user(struct trace_iterator *iter, 314kmemtrace_print_free_user(struct trace_iterator *iter, int flags)
275 struct kmemtrace_free_entry *entry)
276{ 315{
277 struct trace_seq *s = &iter->seq; 316 struct trace_seq *s = &iter->seq;
317 struct kmemtrace_free_entry *entry;
278 struct kmemtrace_user_event *ev; 318 struct kmemtrace_user_event *ev;
279 319
320 trace_assign_type(entry, iter->ent);
321
280 ev = trace_seq_reserve(s, sizeof(*ev)); 322 ev = trace_seq_reserve(s, sizeof(*ev));
281 if (!ev) 323 if (!ev)
282 return TRACE_TYPE_PARTIAL_LINE; 324 return TRACE_TYPE_PARTIAL_LINE;
@@ -294,12 +336,14 @@ kmemtrace_print_free_user(struct trace_iterator *iter,
294 336
295/* The two other following provide a more minimalistic output */ 337/* The two other following provide a more minimalistic output */
296static enum print_line_t 338static enum print_line_t
297kmemtrace_print_alloc_compress(struct trace_iterator *iter, 339kmemtrace_print_alloc_compress(struct trace_iterator *iter)
298 struct kmemtrace_alloc_entry *entry)
299{ 340{
341 struct kmemtrace_alloc_entry *entry;
300 struct trace_seq *s = &iter->seq; 342 struct trace_seq *s = &iter->seq;
301 int ret; 343 int ret;
302 344
345 trace_assign_type(entry, iter->ent);
346
303 /* Alloc entry */ 347 /* Alloc entry */
304 ret = trace_seq_printf(s, " + "); 348 ret = trace_seq_printf(s, " + ");
305 if (!ret) 349 if (!ret)
@@ -345,29 +389,24 @@ kmemtrace_print_alloc_compress(struct trace_iterator *iter,
345 if (!ret) 389 if (!ret)
346 return TRACE_TYPE_PARTIAL_LINE; 390 return TRACE_TYPE_PARTIAL_LINE;
347 391
348 /* Node */ 392 /* Node and call site*/
349 ret = trace_seq_printf(s, "%4d ", entry->node); 393 ret = trace_seq_printf(s, "%4d %pf\n", entry->node,
350 if (!ret) 394 (void *)entry->call_site);
351 return TRACE_TYPE_PARTIAL_LINE;
352
353 /* Call site */
354 ret = seq_print_ip_sym(s, entry->call_site, 0);
355 if (!ret) 395 if (!ret)
356 return TRACE_TYPE_PARTIAL_LINE; 396 return TRACE_TYPE_PARTIAL_LINE;
357 397
358 if (!trace_seq_printf(s, "\n"))
359 return TRACE_TYPE_PARTIAL_LINE;
360
361 return TRACE_TYPE_HANDLED; 398 return TRACE_TYPE_HANDLED;
362} 399}
363 400
364static enum print_line_t 401static enum print_line_t
365kmemtrace_print_free_compress(struct trace_iterator *iter, 402kmemtrace_print_free_compress(struct trace_iterator *iter)
366 struct kmemtrace_free_entry *entry)
367{ 403{
404 struct kmemtrace_free_entry *entry;
368 struct trace_seq *s = &iter->seq; 405 struct trace_seq *s = &iter->seq;
369 int ret; 406 int ret;
370 407
408 trace_assign_type(entry, iter->ent);
409
371 /* Free entry */ 410 /* Free entry */
372 ret = trace_seq_printf(s, " - "); 411 ret = trace_seq_printf(s, " - ");
373 if (!ret) 412 if (!ret)
@@ -401,19 +440,11 @@ kmemtrace_print_free_compress(struct trace_iterator *iter,
401 if (!ret) 440 if (!ret)
402 return TRACE_TYPE_PARTIAL_LINE; 441 return TRACE_TYPE_PARTIAL_LINE;
403 442
404 /* Skip node */ 443 /* Skip node and print call site*/
405 ret = trace_seq_printf(s, " "); 444 ret = trace_seq_printf(s, " %pf\n", (void *)entry->call_site);
406 if (!ret) 445 if (!ret)
407 return TRACE_TYPE_PARTIAL_LINE; 446 return TRACE_TYPE_PARTIAL_LINE;
408 447
409 /* Call site */
410 ret = seq_print_ip_sym(s, entry->call_site, 0);
411 if (!ret)
412 return TRACE_TYPE_PARTIAL_LINE;
413
414 if (!trace_seq_printf(s, "\n"))
415 return TRACE_TYPE_PARTIAL_LINE;
416
417 return TRACE_TYPE_HANDLED; 448 return TRACE_TYPE_HANDLED;
418} 449}
419 450
@@ -421,32 +452,31 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
421{ 452{
422 struct trace_entry *entry = iter->ent; 453 struct trace_entry *entry = iter->ent;
423 454
424 switch (entry->type) { 455 if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL))
425 case TRACE_KMEM_ALLOC: { 456 return TRACE_TYPE_UNHANDLED;
426 struct kmemtrace_alloc_entry *field;
427
428 trace_assign_type(field, entry);
429 if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
430 return kmemtrace_print_alloc_compress(iter, field);
431 else
432 return kmemtrace_print_alloc_user(iter, field);
433 }
434
435 case TRACE_KMEM_FREE: {
436 struct kmemtrace_free_entry *field;
437
438 trace_assign_type(field, entry);
439 if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
440 return kmemtrace_print_free_compress(iter, field);
441 else
442 return kmemtrace_print_free_user(iter, field);
443 }
444 457
458 switch (entry->type) {
459 case TRACE_KMEM_ALLOC:
460 return kmemtrace_print_alloc_compress(iter);
461 case TRACE_KMEM_FREE:
462 return kmemtrace_print_free_compress(iter);
445 default: 463 default:
446 return TRACE_TYPE_UNHANDLED; 464 return TRACE_TYPE_UNHANDLED;
447 } 465 }
448} 466}
449 467
468static struct trace_event kmem_trace_alloc = {
469 .type = TRACE_KMEM_ALLOC,
470 .trace = kmemtrace_print_alloc,
471 .binary = kmemtrace_print_alloc_user,
472};
473
474static struct trace_event kmem_trace_free = {
475 .type = TRACE_KMEM_FREE,
476 .trace = kmemtrace_print_free,
477 .binary = kmemtrace_print_free_user,
478};
479
450static struct tracer kmem_tracer __read_mostly = { 480static struct tracer kmem_tracer __read_mostly = {
451 .name = "kmemtrace", 481 .name = "kmemtrace",
452 .init = kmem_trace_init, 482 .init = kmem_trace_init,
@@ -463,6 +493,21 @@ void kmemtrace_init(void)
463 493
464static int __init init_kmem_tracer(void) 494static int __init init_kmem_tracer(void)
465{ 495{
466 return register_tracer(&kmem_tracer); 496 if (!register_ftrace_event(&kmem_trace_alloc)) {
497 pr_warning("Warning: could not register kmem events\n");
498 return 1;
499 }
500
501 if (!register_ftrace_event(&kmem_trace_free)) {
502 pr_warning("Warning: could not register kmem events\n");
503 return 1;
504 }
505
506 if (!register_tracer(&kmem_tracer)) {
507 pr_warning("Warning: could not register the kmem tracer\n");
508 return 1;
509 }
510
511 return 0;
467} 512}
468device_initcall(init_kmem_tracer); 513device_initcall(init_kmem_tracer);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index a330513d96ce..da2c59d8f486 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -322,6 +322,14 @@ struct buffer_data_page {
322 unsigned char data[]; /* data of buffer page */ 322 unsigned char data[]; /* data of buffer page */
323}; 323};
324 324
325/*
326 * Note, the buffer_page list must be first. The buffer pages
327 * are allocated in cache lines, which means that each buffer
328 * page will be at the beginning of a cache line, and thus
329 * the least significant bits will be zero. We use this to
330 * add flags in the list struct pointers, to make the ring buffer
331 * lockless.
332 */
325struct buffer_page { 333struct buffer_page {
326 struct list_head list; /* list of buffer pages */ 334 struct list_head list; /* list of buffer pages */
327 local_t write; /* index for next write */ 335 local_t write; /* index for next write */
@@ -330,6 +338,21 @@ struct buffer_page {
330 struct buffer_data_page *page; /* Actual data page */ 338 struct buffer_data_page *page; /* Actual data page */
331}; 339};
332 340
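The cache-line alignment noted in the comment above is what makes the lockless scheme workable: every buffer_page starts on a cache-line boundary, so the two least-significant bits of any pointer to it are guaranteed zero and can carry flags. A minimal, self-contained sketch of that pointer-tagging technique (the names below are illustrative, not the kernel's; rb_list_head() further down does the same masking on real list_head pointers):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define FLAG_MASK 3UL	/* two low bits are free on aligned pointers */

/* Store a small flag in the low bits of an aligned pointer. */
static void *tag_ptr(void *p, unsigned long flag)
{
	return (void *)((uintptr_t)p | (flag & FLAG_MASK));
}

/* Strip the flag bits to recover the real pointer. */
static void *untag_ptr(void *p)
{
	return (void *)((uintptr_t)p & ~FLAG_MASK);
}

static unsigned long ptr_flag(void *p)
{
	return (uintptr_t)p & FLAG_MASK;
}

int main(void)
{
	void *page = aligned_alloc(64, 64);	/* cache-line aligned, low bits zero */
	void *tagged = tag_ptr(page, 1UL);	/* e.g. "this points at the head page" */

	assert(untag_ptr(tagged) == page);
	printf("flag = %lu\n", ptr_flag(tagged));
	free(page);
	return 0;
}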
341/*
342 * The buffer page counters, write and entries, must be reset
343 * atomically when crossing page boundaries. To synchronize this
344 * update, two counters are packed into one word. One is
345 * the actual counter for the write position or entry count on the page.
346 *
347 * The other is a counter of updaters. Before an update happens,
348 * the updater portion of the counter is incremented. This
349 * allows the updater to update the counter atomically.
350 *
351 * The write counter is 20 bits, and the updater count is 12.
352 */
353#define RB_WRITE_MASK 0xfffff
354#define RB_WRITE_INTCNT (1 << 20)
355
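As a rough illustration of the split described above (assuming the 20/12 layout that RB_WRITE_MASK and RB_WRITE_INTCNT imply), the low 20 bits hold the write index while the upper bits count how many nested updaters have touched the page:

#include <stdio.h>

#define WRITE_MASK   0xfffffUL		/* low 20 bits: write index */
#define WRITE_INTCNT (1UL << 20)	/* upper bits: nested-updater count */

int main(void)
{
	unsigned long write = 100;	/* index 100, no updaters yet */

	write += WRITE_INTCNT;		/* an updater announces itself */
	write += WRITE_INTCNT;		/* a nested interrupt does the same */

	printf("index    = %lu\n", write & WRITE_MASK);	/* 100 */
	printf("updaters = %lu\n", write >> 20);	/* 2 */
	return 0;
}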
333static void rb_init_page(struct buffer_data_page *bpage) 356static void rb_init_page(struct buffer_data_page *bpage)
334{ 357{
335 local_set(&bpage->commit, 0); 358 local_set(&bpage->commit, 0);
@@ -403,21 +426,20 @@ int ring_buffer_print_page_header(struct trace_seq *s)
403struct ring_buffer_per_cpu { 426struct ring_buffer_per_cpu {
404 int cpu; 427 int cpu;
405 struct ring_buffer *buffer; 428 struct ring_buffer *buffer;
406 spinlock_t reader_lock; /* serialize readers */ 429 spinlock_t reader_lock; /* serialize readers */
407 raw_spinlock_t lock; 430 raw_spinlock_t lock;
408 struct lock_class_key lock_key; 431 struct lock_class_key lock_key;
409 struct list_head pages; 432 struct list_head *pages;
410 struct buffer_page *head_page; /* read from head */ 433 struct buffer_page *head_page; /* read from head */
411 struct buffer_page *tail_page; /* write to tail */ 434 struct buffer_page *tail_page; /* write to tail */
412 struct buffer_page *commit_page; /* committed pages */ 435 struct buffer_page *commit_page; /* committed pages */
413 struct buffer_page *reader_page; 436 struct buffer_page *reader_page;
414 unsigned long nmi_dropped; 437 local_t commit_overrun;
415 unsigned long commit_overrun; 438 local_t overrun;
416 unsigned long overrun;
417 unsigned long read;
418 local_t entries; 439 local_t entries;
419 local_t committing; 440 local_t committing;
420 local_t commits; 441 local_t commits;
442 unsigned long read;
421 u64 write_stamp; 443 u64 write_stamp;
422 u64 read_stamp; 444 u64 read_stamp;
423 atomic_t record_disabled; 445 atomic_t record_disabled;
@@ -489,6 +511,390 @@ void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer,
489} 511}
490EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); 512EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
491 513
514/*
515 * Making the ring buffer lockless makes things tricky.
516 * Writes only happen on the CPU that they are on, so
517 * they only need to worry about interrupts. Reads, however,
518 * can happen on any CPU.
519 *
520 * The reader page is always off the ring buffer, but when the
521 * reader finishes with a page, it needs to swap its page with
522 * a new one from the buffer. The reader needs to take from
523 * the head (writes go to the tail). But if a writer is in overwrite
524 * mode and wraps, it must push the head page forward.
525 *
526 * Here lies the problem.
527 *
528 * The reader must be careful to replace only the head page, and
529 * not another one. As described at the top of the file in the
530 * ASCII art, the reader sets its old page to point to the next
531 * page after head. It then sets the page after head to point to
532 * the old reader page. But if the writer moves the head page
533 * during this operation, the reader could end up with the tail.
534 *
535 * We use cmpxchg to help prevent this race. We also do something
536 * special with the page before head. We set the LSB to 1.
537 *
538 * When the writer must push the page forward, it will clear the
539 * bit that points to the head page, move the head, and then set
540 * the bit that points to the new head page.
541 *
542 * We also don't want an interrupt coming in and moving the head
543 * page on another writer, so we use the second LSB to catch
544 * that too. Thus:
545 *
546 * head->list->prev->next bit 1 bit 0
547 * ------- -------
548 * Normal page 0 0
549 * Points to head page 0 1
550 * New head page 1 0
551 *
552 * Note we can not trust the prev pointer of the head page, because:
553 *
554 * +----+ +-----+ +-----+
555 * | |------>| T |---X--->| N |
556 * | |<------| | | |
557 * +----+ +-----+ +-----+
558 * ^ ^ |
559 * | +-----+ | |
560 * +----------| R |----------+ |
561 * | |<-----------+
562 * +-----+
563 *
564 * Key: ---X--> HEAD flag set in pointer
565 * T Tail page
566 * R Reader page
567 * N Next page
568 *
569 * (see __rb_reserve_next() to see where this happens)
570 *
571 * What the above shows is that the reader just swapped out
572 * the reader page with a page in the buffer, but before it
573 * could make the new header point back to the new page added
574 * it was preempted by a writer. The writer moved forward onto
575 * the new page added by the reader and is about to move forward
576 * again.
577 *
578 * You can see that it is legitimate for the previous pointer of
579 * the head (or any page) not to point back to itself, but only
580 * temporarily.
581 */
582
583#define RB_PAGE_NORMAL 0UL
584#define RB_PAGE_HEAD 1UL
585#define RB_PAGE_UPDATE 2UL
586
587
588#define RB_FLAG_MASK 3UL
589
590/* PAGE_MOVED is not part of the mask */
591#define RB_PAGE_MOVED 4UL
592
593/*
594 * rb_list_head - remove any bit
595 */
596static struct list_head *rb_list_head(struct list_head *list)
597{
598 unsigned long val = (unsigned long)list;
599
600 return (struct list_head *)(val & ~RB_FLAG_MASK);
601}
602
603/*
604 * rb_is_head_page - test if the given page is the head page
605 *
606 * Because the reader may move the head_page pointer, we can
607 * not trust what the head page is (it may be pointing to
608 * the reader page). But if the next page is a header page,
609 * its flags will be non zero.
610 */
611static inline int
612rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer,
613 struct buffer_page *page, struct list_head *list)
614{
615 unsigned long val;
616
617 val = (unsigned long)list->next;
618
619 if ((val & ~RB_FLAG_MASK) != (unsigned long)&page->list)
620 return RB_PAGE_MOVED;
621
622 return val & RB_FLAG_MASK;
623}
624
625/*
626 * rb_is_reader_page
627 *
628 * The unique thing about the reader page is that, if the
629 * writer is ever on it, the previous pointer never points
630 * back to the reader page.
631 */
632static int rb_is_reader_page(struct buffer_page *page)
633{
634 struct list_head *list = page->list.prev;
635
636 return rb_list_head(list->next) != &page->list;
637}
638
639/*
640 * rb_set_list_to_head - set a list_head to be pointing to head.
641 */
642static void rb_set_list_to_head(struct ring_buffer_per_cpu *cpu_buffer,
643 struct list_head *list)
644{
645 unsigned long *ptr;
646
647 ptr = (unsigned long *)&list->next;
648 *ptr |= RB_PAGE_HEAD;
649 *ptr &= ~RB_PAGE_UPDATE;
650}
651
652/*
653 * rb_head_page_activate - sets up head page
654 */
655static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer)
656{
657 struct buffer_page *head;
658
659 head = cpu_buffer->head_page;
660 if (!head)
661 return;
662
663 /*
664 * Set the previous list pointer to have the HEAD flag.
665 */
666 rb_set_list_to_head(cpu_buffer, head->list.prev);
667}
668
669static void rb_list_head_clear(struct list_head *list)
670{
671 unsigned long *ptr = (unsigned long *)&list->next;
672
673 *ptr &= ~RB_FLAG_MASK;
674}
675
676/*
677 * rb_head_page_deactivate - clears head page ptr (for free list)
678 */
679static void
680rb_head_page_deactivate(struct ring_buffer_per_cpu *cpu_buffer)
681{
682 struct list_head *hd;
683
684 /* Go through the whole list and clear any pointers found. */
685 rb_list_head_clear(cpu_buffer->pages);
686
687 list_for_each(hd, cpu_buffer->pages)
688 rb_list_head_clear(hd);
689}
690
691static int rb_head_page_set(struct ring_buffer_per_cpu *cpu_buffer,
692 struct buffer_page *head,
693 struct buffer_page *prev,
694 int old_flag, int new_flag)
695{
696 struct list_head *list;
697 unsigned long val = (unsigned long)&head->list;
698 unsigned long ret;
699
700 list = &prev->list;
701
702 val &= ~RB_FLAG_MASK;
703
704 ret = (unsigned long)cmpxchg(&list->next,
705 val | old_flag, val | new_flag);
706
707 /* check if the reader took the page */
708 if ((ret & ~RB_FLAG_MASK) != val)
709 return RB_PAGE_MOVED;
710
711 return ret & RB_FLAG_MASK;
712}
713
714static int rb_head_page_set_update(struct ring_buffer_per_cpu *cpu_buffer,
715 struct buffer_page *head,
716 struct buffer_page *prev,
717 int old_flag)
718{
719 return rb_head_page_set(cpu_buffer, head, prev,
720 old_flag, RB_PAGE_UPDATE);
721}
722
723static int rb_head_page_set_head(struct ring_buffer_per_cpu *cpu_buffer,
724 struct buffer_page *head,
725 struct buffer_page *prev,
726 int old_flag)
727{
728 return rb_head_page_set(cpu_buffer, head, prev,
729 old_flag, RB_PAGE_HEAD);
730}
731
732static int rb_head_page_set_normal(struct ring_buffer_per_cpu *cpu_buffer,
733 struct buffer_page *head,
734 struct buffer_page *prev,
735 int old_flag)
736{
737 return rb_head_page_set(cpu_buffer, head, prev,
738 old_flag, RB_PAGE_NORMAL);
739}
740
741static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
742 struct buffer_page **bpage)
743{
744 struct list_head *p = rb_list_head((*bpage)->list.next);
745
746 *bpage = list_entry(p, struct buffer_page, list);
747}
748
749static struct buffer_page *
750rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer)
751{
752 struct buffer_page *head;
753 struct buffer_page *page;
754 struct list_head *list;
755 int i;
756
757 if (RB_WARN_ON(cpu_buffer, !cpu_buffer->head_page))
758 return NULL;
759
760 /* sanity check */
761 list = cpu_buffer->pages;
762 if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev->next) != list))
763 return NULL;
764
765 page = head = cpu_buffer->head_page;
766 /*
767 * It is possible that the writer moves the head page behind
768 * where we started, and we miss it in one loop.
769 * A second loop should grab the head page, but we'll do
770 * three loops just because I'm paranoid.
771 */
772 for (i = 0; i < 3; i++) {
773 do {
774 if (rb_is_head_page(cpu_buffer, page, page->list.prev)) {
775 cpu_buffer->head_page = page;
776 return page;
777 }
778 rb_inc_page(cpu_buffer, &page);
779 } while (page != head);
780 }
781
782 RB_WARN_ON(cpu_buffer, 1);
783
784 return NULL;
785}
786
787static int rb_head_page_replace(struct buffer_page *old,
788 struct buffer_page *new)
789{
790 unsigned long *ptr = (unsigned long *)&old->list.prev->next;
791 unsigned long val;
792 unsigned long ret;
793
794 val = *ptr & ~RB_FLAG_MASK;
795 val |= RB_PAGE_HEAD;
796
797 ret = cmpxchg(ptr, val, &new->list);
798
799 return ret == val;
800}
801
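rb_head_page_replace() above is the reader's half of the protocol: the swap only succeeds if the pointer before the head still carries the HEAD flag and still points at the page the reader expects. A stand-alone sketch of that compare-and-swap check, with a GCC/Clang builtin standing in for the kernel's cmpxchg() (names are illustrative):

#include <stdint.h>
#include <stdio.h>

#define FLAG_MASK 3UL
#define PAGE_HEAD 1UL

/*
 * Swap 'new' into *slot, but only if *slot still points at 'old'
 * with the HEAD flag set, i.e. no writer moved the head page in
 * the meantime. Returns 1 on success, 0 if the caller must retry.
 */
static int head_replace(uintptr_t *slot, void *old, void *new)
{
	uintptr_t expect = ((uintptr_t)old & ~FLAG_MASK) | PAGE_HEAD;

	return __sync_bool_compare_and_swap(slot, expect, (uintptr_t)new);
}

int main(void)
{
	long head_page, reader_page;		/* stand-ins for buffer pages */
	uintptr_t next = (uintptr_t)&head_page | PAGE_HEAD;

	if (head_replace(&next, &head_page, &reader_page))
		printf("reader took the head page\n");
	else
		printf("writer moved the head, try again\n");
	return 0;
}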
802/*
803 * rb_tail_page_update - move the tail page forward
804 *
805 * Returns 1 if moved tail page, 0 if someone else did.
806 */
807static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
808 struct buffer_page *tail_page,
809 struct buffer_page *next_page)
810{
811 struct buffer_page *old_tail;
812 unsigned long old_entries;
813 unsigned long old_write;
814 int ret = 0;
815
816 /*
817 * The tail page now needs to be moved forward.
818 *
819 * We need to reset the tail page, but without erasing
820 * data brought in by interrupts that have already moved
821 * the tail page and are currently writing to it.
822 *
823 * We add a counter to the write field to denote this.
824 */
825 old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write);
826 old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries);
827
828 /*
829 * Just make sure we have seen our old_write and synchronize
830 * with any interrupts that come in.
831 */
832 barrier();
833
834 /*
835 * If the tail page is still the same as what we think
836 * it is, then it is up to us to update the tail
837 * pointer.
838 */
839 if (tail_page == cpu_buffer->tail_page) {
840 /* Zero the write counter */
841 unsigned long val = old_write & ~RB_WRITE_MASK;
842 unsigned long eval = old_entries & ~RB_WRITE_MASK;
843
844 /*
845 * This will only succeed if an interrupt did
846 * not come in and change it. If one did, we
847 * do not want to modify it.
848 *
849 * We add (void) to let the compiler know that we do not care
850 * about the return value of these functions. We use the
851 * cmpxchg to only update if an interrupt did not already
852 * do it for us. If the cmpxchg fails, we don't care.
853 */
854 (void)local_cmpxchg(&next_page->write, old_write, val);
855 (void)local_cmpxchg(&next_page->entries, old_entries, eval);
856
857 /*
858 * No need to worry about races with clearing out the commit.
859 * It can only increment when a commit takes place, and that
860 * only happens in the outermost nested commit.
861 */
862 local_set(&next_page->page->commit, 0);
863
864 old_tail = cmpxchg(&cpu_buffer->tail_page,
865 tail_page, next_page);
866
867 if (old_tail == tail_page)
868 ret = 1;
869 }
870
871 return ret;
872}
873
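The write/entries reset in rb_tail_page_update() above follows a pattern worth spelling out: atomically bump the updater count while observing the value, then try to publish the zeroed index with a compare-and-swap that only succeeds if no nested updater touched the counter in between. A simplified userspace sketch with GCC/Clang builtins standing in for local_add_return()/local_cmpxchg() (illustrative only, not the kernel's local_t API):

#include <stdio.h>

#define WRITE_MASK   0xfffffUL
#define WRITE_INTCNT (1UL << 20)

/* Zero the write index unless a nested updater already handled it. */
static void reset_write(unsigned long *write)
{
	/* Announce our update and observe the resulting value. */
	unsigned long old = __sync_add_and_fetch(write, WRITE_INTCNT);
	/* Keep only the updater count; the index part becomes zero. */
	unsigned long val = old & ~WRITE_MASK;

	/*
	 * Publish only if nothing changed in between; if the swap
	 * fails, a nested update got there first and we don't care.
	 */
	(void)__sync_val_compare_and_swap(write, old, val);
}

int main(void)
{
	unsigned long write = 123;	/* stale index left on the next page */

	reset_write(&write);
	printf("index after reset = %lu\n", write & WRITE_MASK);	/* 0 */
	return 0;
}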
874static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
875 struct buffer_page *bpage)
876{
877 unsigned long val = (unsigned long)bpage;
878
879 if (RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK))
880 return 1;
881
882 return 0;
883}
884
885/**
886 * rb_check_list - make sure a pointer to a list has its flag bits zero
887 */
888static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer,
889 struct list_head *list)
890{
891 if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev) != list->prev))
892 return 1;
893 if (RB_WARN_ON(cpu_buffer, rb_list_head(list->next) != list->next))
894 return 1;
895 return 0;
896}
897
492/** 898/**
493 * check_pages - integrity check of buffer pages 899 * check_pages - integrity check of buffer pages
494 * @cpu_buffer: CPU buffer with pages to test 900 * @cpu_buffer: CPU buffer with pages to test
@@ -498,14 +904,19 @@ EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
498 */ 904 */
499static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) 905static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
500{ 906{
501 struct list_head *head = &cpu_buffer->pages; 907 struct list_head *head = cpu_buffer->pages;
502 struct buffer_page *bpage, *tmp; 908 struct buffer_page *bpage, *tmp;
503 909
910 rb_head_page_deactivate(cpu_buffer);
911
504 if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) 912 if (RB_WARN_ON(cpu_buffer, head->next->prev != head))
505 return -1; 913 return -1;
506 if (RB_WARN_ON(cpu_buffer, head->prev->next != head)) 914 if (RB_WARN_ON(cpu_buffer, head->prev->next != head))
507 return -1; 915 return -1;
508 916
917 if (rb_check_list(cpu_buffer, head))
918 return -1;
919
509 list_for_each_entry_safe(bpage, tmp, head, list) { 920 list_for_each_entry_safe(bpage, tmp, head, list) {
510 if (RB_WARN_ON(cpu_buffer, 921 if (RB_WARN_ON(cpu_buffer,
511 bpage->list.next->prev != &bpage->list)) 922 bpage->list.next->prev != &bpage->list))
@@ -513,25 +924,33 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
513 if (RB_WARN_ON(cpu_buffer, 924 if (RB_WARN_ON(cpu_buffer,
514 bpage->list.prev->next != &bpage->list)) 925 bpage->list.prev->next != &bpage->list))
515 return -1; 926 return -1;
927 if (rb_check_list(cpu_buffer, &bpage->list))
928 return -1;
516 } 929 }
517 930
931 rb_head_page_activate(cpu_buffer);
932
518 return 0; 933 return 0;
519} 934}
520 935
521static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, 936static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
522 unsigned nr_pages) 937 unsigned nr_pages)
523{ 938{
524 struct list_head *head = &cpu_buffer->pages;
525 struct buffer_page *bpage, *tmp; 939 struct buffer_page *bpage, *tmp;
526 unsigned long addr; 940 unsigned long addr;
527 LIST_HEAD(pages); 941 LIST_HEAD(pages);
528 unsigned i; 942 unsigned i;
529 943
944 WARN_ON(!nr_pages);
945
530 for (i = 0; i < nr_pages; i++) { 946 for (i = 0; i < nr_pages; i++) {
531 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 947 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
532 GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); 948 GFP_KERNEL, cpu_to_node(cpu_buffer->cpu));
533 if (!bpage) 949 if (!bpage)
534 goto free_pages; 950 goto free_pages;
951
952 rb_check_bpage(cpu_buffer, bpage);
953
535 list_add(&bpage->list, &pages); 954 list_add(&bpage->list, &pages);
536 955
537 addr = __get_free_page(GFP_KERNEL); 956 addr = __get_free_page(GFP_KERNEL);
@@ -541,7 +960,13 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
541 rb_init_page(bpage->page); 960 rb_init_page(bpage->page);
542 } 961 }
543 962
544 list_splice(&pages, head); 963 /*
964 * The ring buffer page list is a circular list that does not
965 * start and end with a list head. All page list items point to
966 * other pages.
967 */
968 cpu_buffer->pages = pages.next;
969 list_del(&pages);
545 970
546 rb_check_pages(cpu_buffer); 971 rb_check_pages(cpu_buffer);
547 972
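The assignment of cpu_buffer->pages above is the point where the temporary, sentinel-anchored list becomes a fully circular list with no list head: the sentinel is unlinked and cpu_buffer->pages simply points at one of the pages (which is why the code warns when nr_pages is zero). A self-contained sketch of dropping the sentinel from a circular doubly linked list (the struct and helper below are stand-ins, not the kernel's list API):

#include <stdio.h>

struct node {
	struct node *next, *prev;
	int id;
};

/* Unlink the sentinel so the remaining nodes form a headless ring. */
static struct node *drop_sentinel(struct node *sentinel)
{
	struct node *first = sentinel->next;	/* needs at least one real node */

	sentinel->prev->next = sentinel->next;
	sentinel->next->prev = sentinel->prev;
	return first;				/* any node can serve as the entry point */
}

int main(void)
{
	struct node head, a = { .id = 1 }, b = { .id = 2 };

	/* Build sentinel <-> a <-> b <-> back to sentinel. */
	head.next = &a; a.prev = &head;
	a.next = &b;    b.prev = &a;
	b.next = &head; head.prev = &b;

	struct node *ring = drop_sentinel(&head);
	struct node *p = ring;

	do {					/* walk the ring exactly once */
		printf("page %d\n", p->id);
		p = p->next;
	} while (p != ring);
	return 0;
}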
@@ -573,13 +998,14 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
573 spin_lock_init(&cpu_buffer->reader_lock); 998 spin_lock_init(&cpu_buffer->reader_lock);
574 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); 999 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
575 cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 1000 cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
576 INIT_LIST_HEAD(&cpu_buffer->pages);
577 1001
578 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 1002 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
579 GFP_KERNEL, cpu_to_node(cpu)); 1003 GFP_KERNEL, cpu_to_node(cpu));
580 if (!bpage) 1004 if (!bpage)
581 goto fail_free_buffer; 1005 goto fail_free_buffer;
582 1006
1007 rb_check_bpage(cpu_buffer, bpage);
1008
583 cpu_buffer->reader_page = bpage; 1009 cpu_buffer->reader_page = bpage;
584 addr = __get_free_page(GFP_KERNEL); 1010 addr = __get_free_page(GFP_KERNEL);
585 if (!addr) 1011 if (!addr)
@@ -594,9 +1020,11 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
594 goto fail_free_reader; 1020 goto fail_free_reader;
595 1021
596 cpu_buffer->head_page 1022 cpu_buffer->head_page
597 = list_entry(cpu_buffer->pages.next, struct buffer_page, list); 1023 = list_entry(cpu_buffer->pages, struct buffer_page, list);
598 cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; 1024 cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;
599 1025
1026 rb_head_page_activate(cpu_buffer);
1027
600 return cpu_buffer; 1028 return cpu_buffer;
601 1029
602 fail_free_reader: 1030 fail_free_reader:
@@ -609,15 +1037,22 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
609 1037
610static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) 1038static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
611{ 1039{
612 struct list_head *head = &cpu_buffer->pages; 1040 struct list_head *head = cpu_buffer->pages;
613 struct buffer_page *bpage, *tmp; 1041 struct buffer_page *bpage, *tmp;
614 1042
615 free_buffer_page(cpu_buffer->reader_page); 1043 free_buffer_page(cpu_buffer->reader_page);
616 1044
617 list_for_each_entry_safe(bpage, tmp, head, list) { 1045 rb_head_page_deactivate(cpu_buffer);
618 list_del_init(&bpage->list); 1046
1047 if (head) {
1048 list_for_each_entry_safe(bpage, tmp, head, list) {
1049 list_del_init(&bpage->list);
1050 free_buffer_page(bpage);
1051 }
1052 bpage = list_entry(head, struct buffer_page, list);
619 free_buffer_page(bpage); 1053 free_buffer_page(bpage);
620 } 1054 }
1055
621 kfree(cpu_buffer); 1056 kfree(cpu_buffer);
622} 1057}
623 1058
@@ -760,15 +1195,17 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
760 atomic_inc(&cpu_buffer->record_disabled); 1195 atomic_inc(&cpu_buffer->record_disabled);
761 synchronize_sched(); 1196 synchronize_sched();
762 1197
1198 rb_head_page_deactivate(cpu_buffer);
1199
763 for (i = 0; i < nr_pages; i++) { 1200 for (i = 0; i < nr_pages; i++) {
764 if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages))) 1201 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
765 return; 1202 return;
766 p = cpu_buffer->pages.next; 1203 p = cpu_buffer->pages->next;
767 bpage = list_entry(p, struct buffer_page, list); 1204 bpage = list_entry(p, struct buffer_page, list);
768 list_del_init(&bpage->list); 1205 list_del_init(&bpage->list);
769 free_buffer_page(bpage); 1206 free_buffer_page(bpage);
770 } 1207 }
771 if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages))) 1208 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
772 return; 1209 return;
773 1210
774 rb_reset_cpu(cpu_buffer); 1211 rb_reset_cpu(cpu_buffer);
@@ -790,15 +1227,19 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
790 atomic_inc(&cpu_buffer->record_disabled); 1227 atomic_inc(&cpu_buffer->record_disabled);
791 synchronize_sched(); 1228 synchronize_sched();
792 1229
1230 spin_lock_irq(&cpu_buffer->reader_lock);
1231 rb_head_page_deactivate(cpu_buffer);
1232
793 for (i = 0; i < nr_pages; i++) { 1233 for (i = 0; i < nr_pages; i++) {
794 if (RB_WARN_ON(cpu_buffer, list_empty(pages))) 1234 if (RB_WARN_ON(cpu_buffer, list_empty(pages)))
795 return; 1235 return;
796 p = pages->next; 1236 p = pages->next;
797 bpage = list_entry(p, struct buffer_page, list); 1237 bpage = list_entry(p, struct buffer_page, list);
798 list_del_init(&bpage->list); 1238 list_del_init(&bpage->list);
799 list_add_tail(&bpage->list, &cpu_buffer->pages); 1239 list_add_tail(&bpage->list, cpu_buffer->pages);
800 } 1240 }
801 rb_reset_cpu(cpu_buffer); 1241 rb_reset_cpu(cpu_buffer);
1242 spin_unlock_irq(&cpu_buffer->reader_lock);
802 1243
803 rb_check_pages(cpu_buffer); 1244 rb_check_pages(cpu_buffer);
804 1245
@@ -949,21 +1390,14 @@ rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer)
949} 1390}
950 1391
951static inline struct ring_buffer_event * 1392static inline struct ring_buffer_event *
952rb_head_event(struct ring_buffer_per_cpu *cpu_buffer)
953{
954 return __rb_page_index(cpu_buffer->head_page,
955 cpu_buffer->head_page->read);
956}
957
958static inline struct ring_buffer_event *
959rb_iter_head_event(struct ring_buffer_iter *iter) 1393rb_iter_head_event(struct ring_buffer_iter *iter)
960{ 1394{
961 return __rb_page_index(iter->head_page, iter->head); 1395 return __rb_page_index(iter->head_page, iter->head);
962} 1396}
963 1397
964static inline unsigned rb_page_write(struct buffer_page *bpage) 1398static inline unsigned long rb_page_write(struct buffer_page *bpage)
965{ 1399{
966 return local_read(&bpage->write); 1400 return local_read(&bpage->write) & RB_WRITE_MASK;
967} 1401}
968 1402
969static inline unsigned rb_page_commit(struct buffer_page *bpage) 1403static inline unsigned rb_page_commit(struct buffer_page *bpage)
@@ -971,6 +1405,11 @@ static inline unsigned rb_page_commit(struct buffer_page *bpage)
971 return local_read(&bpage->page->commit); 1405 return local_read(&bpage->page->commit);
972} 1406}
973 1407
1408static inline unsigned long rb_page_entries(struct buffer_page *bpage)
1409{
1410 return local_read(&bpage->entries) & RB_WRITE_MASK;
1411}
1412
974/* Size is determined by what has been committed */ 1413
975static inline unsigned rb_page_size(struct buffer_page *bpage) 1414static inline unsigned rb_page_size(struct buffer_page *bpage)
976{ 1415{
@@ -983,22 +1422,6 @@ rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer)
983 return rb_page_commit(cpu_buffer->commit_page); 1422 return rb_page_commit(cpu_buffer->commit_page);
984} 1423}
985 1424
986static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer)
987{
988 return rb_page_commit(cpu_buffer->head_page);
989}
990
991static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
992 struct buffer_page **bpage)
993{
994 struct list_head *p = (*bpage)->list.next;
995
996 if (p == &cpu_buffer->pages)
997 p = p->next;
998
999 *bpage = list_entry(p, struct buffer_page, list);
1000}
1001
1002static inline unsigned 1425static inline unsigned
1003rb_event_index(struct ring_buffer_event *event) 1426rb_event_index(struct ring_buffer_event *event)
1004{ 1427{
@@ -1024,6 +1447,8 @@ rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
1024static void 1447static void
1025rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) 1448rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
1026{ 1449{
1450 unsigned long max_count;
1451
1027 /* 1452 /*
1028 * We only race with interrupts and NMIs on this CPU. 1453 * We only race with interrupts and NMIs on this CPU.
1029 * If we own the commit event, then we can commit 1454 * If we own the commit event, then we can commit
@@ -1033,9 +1458,16 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
1033 * assign the commit to the tail. 1458 * assign the commit to the tail.
1034 */ 1459 */
1035 again: 1460 again:
1461 max_count = cpu_buffer->buffer->pages * 100;
1462
1036 while (cpu_buffer->commit_page != cpu_buffer->tail_page) { 1463 while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
1037 cpu_buffer->commit_page->page->commit = 1464 if (RB_WARN_ON(cpu_buffer, !(--max_count)))
1038 cpu_buffer->commit_page->write; 1465 return;
1466 if (RB_WARN_ON(cpu_buffer,
1467 rb_is_reader_page(cpu_buffer->tail_page)))
1468 return;
1469 local_set(&cpu_buffer->commit_page->page->commit,
1470 rb_page_write(cpu_buffer->commit_page));
1039 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); 1471 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
1040 cpu_buffer->write_stamp = 1472 cpu_buffer->write_stamp =
1041 cpu_buffer->commit_page->page->time_stamp; 1473 cpu_buffer->commit_page->page->time_stamp;
@@ -1044,8 +1476,12 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
1044 } 1476 }
1045 while (rb_commit_index(cpu_buffer) != 1477 while (rb_commit_index(cpu_buffer) !=
1046 rb_page_write(cpu_buffer->commit_page)) { 1478 rb_page_write(cpu_buffer->commit_page)) {
1047 cpu_buffer->commit_page->page->commit = 1479
1048 cpu_buffer->commit_page->write; 1480 local_set(&cpu_buffer->commit_page->page->commit,
1481 rb_page_write(cpu_buffer->commit_page));
1482 RB_WARN_ON(cpu_buffer,
1483 local_read(&cpu_buffer->commit_page->page->commit) &
1484 ~RB_WRITE_MASK);
1049 barrier(); 1485 barrier();
1050 } 1486 }
1051 1487
@@ -1078,7 +1514,7 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
1078 * to the head page instead of next. 1514 * to the head page instead of next.
1079 */ 1515 */
1080 if (iter->head_page == cpu_buffer->reader_page) 1516 if (iter->head_page == cpu_buffer->reader_page)
1081 iter->head_page = cpu_buffer->head_page; 1517 iter->head_page = rb_set_head_page(cpu_buffer);
1082 else 1518 else
1083 rb_inc_page(cpu_buffer, &iter->head_page); 1519 rb_inc_page(cpu_buffer, &iter->head_page);
1084 1520
@@ -1122,6 +1558,163 @@ rb_update_event(struct ring_buffer_event *event,
1122 } 1558 }
1123} 1559}
1124 1560
1561/*
1562 * rb_handle_head_page - writer hit the head page
1563 *
1564 * Returns: +1 to retry the page move
1565 * 0 to continue
1566 * -1 on error
1567 */
1568static int
1569rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
1570 struct buffer_page *tail_page,
1571 struct buffer_page *next_page)
1572{
1573 struct buffer_page *new_head;
1574 int entries;
1575 int type;
1576 int ret;
1577
1578 entries = rb_page_entries(next_page);
1579
1580 /*
1581 * The hard part is here. We need to move the head
1582 * forward, and protect against both readers on
1583 * other CPUs and writers coming in via interrupts.
1584 */
1585 type = rb_head_page_set_update(cpu_buffer, next_page, tail_page,
1586 RB_PAGE_HEAD);
1587
1588 /*
1589 * type can be one of four:
1590 * NORMAL - an interrupt already moved it for us
1591 * HEAD - we are the first to get here.
1592 * UPDATE - we are the interrupt interrupting
1593 * a current move.
1594 * MOVED - a reader on another CPU moved the next
1595 * pointer to its reader page. Give up
1596 * and try again.
1597 */
1598
1599 switch (type) {
1600 case RB_PAGE_HEAD:
1601 /*
1602 * We changed the head to UPDATE, thus
1603 * it is our responsibility to update
1604 * the counters.
1605 */
1606 local_add(entries, &cpu_buffer->overrun);
1607
1608 /*
1609 * The entries will be zeroed out when we move the
1610 * tail page.
1611 */
1612
1613 /* still more to do */
1614 break;
1615
1616 case RB_PAGE_UPDATE:
1617 /*
1618 * This is an interrupt that interrupted the
1619 * previous update. Still more to do.
1620 */
1621 break;
1622 case RB_PAGE_NORMAL:
1623 /*
1624 * An interrupt came in before the update
1625 * and processed this for us.
1626 * Nothing left to do.
1627 */
1628 return 1;
1629 case RB_PAGE_MOVED:
1630 /*
1631 * The reader is on another CPU and just did
1632 * a swap with our next_page.
1633 * Try again.
1634 */
1635 return 1;
1636 default:
1637 RB_WARN_ON(cpu_buffer, 1); /* WTF??? */
1638 return -1;
1639 }
1640
1641 /*
1642 * Now that we are here, the old head pointer is
1643 * set to UPDATE. This will keep the reader from
1644 * swapping the head page with the reader page.
1645 * The reader (on another CPU) will spin till
1646 * we are finished.
1647 *
1648 * We just need to protect against interrupts
1649 * doing the job. We will set the next pointer
1650 * to HEAD. After that, we set the old pointer
1651 * to NORMAL, but only if it was HEAD before;
1652 * otherwise we are an interrupt, and only
1653 * want the outermost commit to reset it.
1654 */
1655 new_head = next_page;
1656 rb_inc_page(cpu_buffer, &new_head);
1657
1658 ret = rb_head_page_set_head(cpu_buffer, new_head, next_page,
1659 RB_PAGE_NORMAL);
1660
1661 /*
1662 * Valid returns are:
1663 * HEAD - an interrupt came in and already set it.
1664 * NORMAL - One of two things:
1665 * 1) We really set it.
1666 * 2) A bunch of interrupts came in and moved
1667 * the page forward again.
1668 */
1669 switch (ret) {
1670 case RB_PAGE_HEAD:
1671 case RB_PAGE_NORMAL:
1672 /* OK */
1673 break;
1674 default:
1675 RB_WARN_ON(cpu_buffer, 1);
1676 return -1;
1677 }
1678
1679 /*
1680 * It is possible that an interrupt came in,
1681 * set the head up, then more interrupts came in
1682 * and moved it again. When we get back here,
1683 * the page would have been set to NORMAL but we
1684 * just set it back to HEAD.
1685 *
1686 * How do you detect this? Well, if that happened
1687 * the tail page would have moved.
1688 */
1689 if (ret == RB_PAGE_NORMAL) {
1690 /*
1691 * If the tail had moved past next, then we need
1692 * to reset the pointer.
1693 */
1694 if (cpu_buffer->tail_page != tail_page &&
1695 cpu_buffer->tail_page != next_page)
1696 rb_head_page_set_normal(cpu_buffer, new_head,
1697 next_page,
1698 RB_PAGE_HEAD);
1699 }
1700
1701 /*
1702 * If this was the outermost commit (the one that
1703 * changed the original pointer from HEAD to UPDATE),
1704 * then it is up to us to reset it to NORMAL.
1705 */
1706 if (type == RB_PAGE_HEAD) {
1707 ret = rb_head_page_set_normal(cpu_buffer, next_page,
1708 tail_page,
1709 RB_PAGE_UPDATE);
1710 if (RB_WARN_ON(cpu_buffer,
1711 ret != RB_PAGE_UPDATE))
1712 return -1;
1713 }
1714
1715 return 0;
1716}
1717
1125static unsigned rb_calculate_event_length(unsigned length) 1718static unsigned rb_calculate_event_length(unsigned length)
1126{ 1719{
1127 struct ring_buffer_event event; /* Used only for sizeof array */ 1720 struct ring_buffer_event event; /* Used only for sizeof array */
@@ -1200,96 +1793,93 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1200 struct buffer_page *commit_page, 1793 struct buffer_page *commit_page,
1201 struct buffer_page *tail_page, u64 *ts) 1794 struct buffer_page *tail_page, u64 *ts)
1202{ 1795{
1203 struct buffer_page *next_page, *head_page, *reader_page;
1204 struct ring_buffer *buffer = cpu_buffer->buffer; 1796 struct ring_buffer *buffer = cpu_buffer->buffer;
1205 bool lock_taken = false; 1797 struct buffer_page *next_page;
1206 unsigned long flags; 1798 int ret;
1207 1799
1208 next_page = tail_page; 1800 next_page = tail_page;
1209 1801
1210 local_irq_save(flags);
1211 /*
1212 * Since the write to the buffer is still not
1213 * fully lockless, we must be careful with NMIs.
1214 * The locks in the writers are taken when a write
1215 * crosses to a new page. The locks protect against
1216 * races with the readers (this will soon be fixed
1217 * with a lockless solution).
1218 *
1219 * Because we can not protect against NMIs, and we
1220 * want to keep traces reentrant, we need to manage
1221 * what happens when we are in an NMI.
1222 *
1223 * NMIs can happen after we take the lock.
1224 * If we are in an NMI, only take the lock
1225 * if it is not already taken. Otherwise
1226 * simply fail.
1227 */
1228 if (unlikely(in_nmi())) {
1229 if (!__raw_spin_trylock(&cpu_buffer->lock)) {
1230 cpu_buffer->nmi_dropped++;
1231 goto out_reset;
1232 }
1233 } else
1234 __raw_spin_lock(&cpu_buffer->lock);
1235
1236 lock_taken = true;
1237
1238 rb_inc_page(cpu_buffer, &next_page); 1802 rb_inc_page(cpu_buffer, &next_page);
1239 1803
1240 head_page = cpu_buffer->head_page;
1241 reader_page = cpu_buffer->reader_page;
1242
1243 /* we grabbed the lock before incrementing */
1244 if (RB_WARN_ON(cpu_buffer, next_page == reader_page))
1245 goto out_reset;
1246
1247 /* 1804 /*
1248 * If for some reason, we had an interrupt storm that made 1805 * If for some reason, we had an interrupt storm that made
1249 * it all the way around the buffer, bail, and warn 1806 * it all the way around the buffer, bail, and warn
1250 * about it. 1807 * about it.
1251 */ 1808 */
1252 if (unlikely(next_page == commit_page)) { 1809 if (unlikely(next_page == commit_page)) {
1253 cpu_buffer->commit_overrun++; 1810 local_inc(&cpu_buffer->commit_overrun);
1254 goto out_reset; 1811 goto out_reset;
1255 } 1812 }
1256 1813
1257 if (next_page == head_page) { 1814 /*
1258 if (!(buffer->flags & RB_FL_OVERWRITE)) 1815 * This is where the fun begins!
1259 goto out_reset; 1816 *
1260 1817 * We are fighting against races between a reader that
1261 /* tail_page has not moved yet? */ 1818 * could be on another CPU trying to swap its reader
1262 if (tail_page == cpu_buffer->tail_page) { 1819 * page with the buffer head.
1263 /* count overflows */ 1820 *
1264 cpu_buffer->overrun += 1821 * We are also fighting against interrupts coming in and
1265 local_read(&head_page->entries); 1822 * moving the head or tail on us as well.
1823 *
1824 * If the next page is the head page then we have filled
1825 * the buffer, unless the commit page is still on the
1826 * reader page.
1827 */
1828 if (rb_is_head_page(cpu_buffer, next_page, &tail_page->list)) {
1266 1829
1267 rb_inc_page(cpu_buffer, &head_page); 1830 /*
1268 cpu_buffer->head_page = head_page; 1831 * If the commit is not on the reader page, then
1269 cpu_buffer->head_page->read = 0; 1832 * move the header page.
1833 */
1834 if (!rb_is_reader_page(cpu_buffer->commit_page)) {
1835 /*
1836 * If we are not in overwrite mode,
1837 * this is easy, just stop here.
1838 */
1839 if (!(buffer->flags & RB_FL_OVERWRITE))
1840 goto out_reset;
1841
1842 ret = rb_handle_head_page(cpu_buffer,
1843 tail_page,
1844 next_page);
1845 if (ret < 0)
1846 goto out_reset;
1847 if (ret)
1848 goto out_again;
1849 } else {
1850 /*
1851 * We need to be careful here too. The
1852 * commit page could still be on the reader
1853 * page. We could have a small buffer, and
1854 * have filled up the buffer with events
1855 * from interrupts and such, and wrapped.
1856 *
1857 * Note, if the tail page is also on the
1858 * reader_page, we let it move out.
1859 */
1860 if (unlikely((cpu_buffer->commit_page !=
1861 cpu_buffer->tail_page) &&
1862 (cpu_buffer->commit_page ==
1863 cpu_buffer->reader_page))) {
1864 local_inc(&cpu_buffer->commit_overrun);
1865 goto out_reset;
1866 }
1270 } 1867 }
1271 } 1868 }
1272 1869
1273 /* 1870 ret = rb_tail_page_update(cpu_buffer, tail_page, next_page);
1274 * If the tail page is still the same as what we think 1871 if (ret) {
1275 * it is, then it is up to us to update the tail 1872 /*
1276 * pointer. 1873 * Nested commits always have zero deltas, so
1277 */ 1874 * just reread the time stamp
1278 if (tail_page == cpu_buffer->tail_page) { 1875 */
1279 local_set(&next_page->write, 0);
1280 local_set(&next_page->entries, 0);
1281 local_set(&next_page->page->commit, 0);
1282 cpu_buffer->tail_page = next_page;
1283
1284 /* reread the time stamp */
1285 *ts = rb_time_stamp(buffer, cpu_buffer->cpu); 1876 *ts = rb_time_stamp(buffer, cpu_buffer->cpu);
1286 cpu_buffer->tail_page->page->time_stamp = *ts; 1877 next_page->page->time_stamp = *ts;
1287 } 1878 }
1288 1879
1289 rb_reset_tail(cpu_buffer, tail_page, tail, length); 1880 out_again:
1290 1881
1291 __raw_spin_unlock(&cpu_buffer->lock); 1882 rb_reset_tail(cpu_buffer, tail_page, tail, length);
1292 local_irq_restore(flags);
1293 1883
1294 /* fail and let the caller try again */ 1884 /* fail and let the caller try again */
1295 return ERR_PTR(-EAGAIN); 1885 return ERR_PTR(-EAGAIN);
@@ -1298,9 +1888,6 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1298 /* reset write */ 1888 /* reset write */
1299 rb_reset_tail(cpu_buffer, tail_page, tail, length); 1889 rb_reset_tail(cpu_buffer, tail_page, tail, length);
1300 1890
1301 if (likely(lock_taken))
1302 __raw_spin_unlock(&cpu_buffer->lock);
1303 local_irq_restore(flags);
1304 return NULL; 1891 return NULL;
1305} 1892}
1306 1893
@@ -1317,6 +1904,9 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
1317 barrier(); 1904 barrier();
1318 tail_page = cpu_buffer->tail_page; 1905 tail_page = cpu_buffer->tail_page;
1319 write = local_add_return(length, &tail_page->write); 1906 write = local_add_return(length, &tail_page->write);
1907
1908 /* set write to only the index of the write */
1909 write &= RB_WRITE_MASK;
1320 tail = write - length; 1910 tail = write - length;
1321 1911
1322 /* See if we shot past the end of this buffer page */ 1912
@@ -1361,12 +1951,16 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
1361 bpage = cpu_buffer->tail_page; 1951 bpage = cpu_buffer->tail_page;
1362 1952
1363 if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { 1953 if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
1954 unsigned long write_mask =
1955 local_read(&bpage->write) & ~RB_WRITE_MASK;
1364 /* 1956 /*
1365 * This is on the tail page. It is possible that 1957 * This is on the tail page. It is possible that
1366 * a write could come in and move the tail page 1958 * a write could come in and move the tail page
1367 * and write to the next page. That is fine 1959 * and write to the next page. That is fine
1368 * because we just shorten what is on this page. 1960 * because we just shorten what is on this page.
1369 */ 1961 */
1962 old_index += write_mask;
1963 new_index += write_mask;
1370 index = local_cmpxchg(&bpage->write, old_index, new_index); 1964 index = local_cmpxchg(&bpage->write, old_index, new_index);
1371 if (index == old_index) 1965 if (index == old_index)
1372 return 1; 1966 return 1;
@@ -1875,9 +2469,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_write);
1875static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) 2469static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
1876{ 2470{
1877 struct buffer_page *reader = cpu_buffer->reader_page; 2471 struct buffer_page *reader = cpu_buffer->reader_page;
1878 struct buffer_page *head = cpu_buffer->head_page; 2472 struct buffer_page *head = rb_set_head_page(cpu_buffer);
1879 struct buffer_page *commit = cpu_buffer->commit_page; 2473 struct buffer_page *commit = cpu_buffer->commit_page;
1880 2474
2475 /* In case of error, head will be NULL */
2476 if (unlikely(!head))
2477 return 1;
2478
1881 return reader->read == rb_page_commit(reader) && 2479 return reader->read == rb_page_commit(reader) &&
1882 (commit == reader || 2480 (commit == reader ||
1883 (commit == head && 2481 (commit == head &&
@@ -1968,7 +2566,7 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
1968 return 0; 2566 return 0;
1969 2567
1970 cpu_buffer = buffer->buffers[cpu]; 2568 cpu_buffer = buffer->buffers[cpu];
1971 ret = (local_read(&cpu_buffer->entries) - cpu_buffer->overrun) 2569 ret = (local_read(&cpu_buffer->entries) - local_read(&cpu_buffer->overrun))
1972 - cpu_buffer->read; 2570 - cpu_buffer->read;
1973 2571
1974 return ret; 2572 return ret;
@@ -1989,33 +2587,13 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
1989 return 0; 2587 return 0;
1990 2588
1991 cpu_buffer = buffer->buffers[cpu]; 2589 cpu_buffer = buffer->buffers[cpu];
1992 ret = cpu_buffer->overrun; 2590 ret = local_read(&cpu_buffer->overrun);
1993 2591
1994 return ret; 2592 return ret;
1995} 2593}
1996EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); 2594EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
1997 2595
1998/** 2596/**
1999 * ring_buffer_nmi_dropped_cpu - get the number of nmis that were dropped
2000 * @buffer: The ring buffer
2001 * @cpu: The per CPU buffer to get the number of overruns from
2002 */
2003unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu)
2004{
2005 struct ring_buffer_per_cpu *cpu_buffer;
2006 unsigned long ret;
2007
2008 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2009 return 0;
2010
2011 cpu_buffer = buffer->buffers[cpu];
2012 ret = cpu_buffer->nmi_dropped;
2013
2014 return ret;
2015}
2016EXPORT_SYMBOL_GPL(ring_buffer_nmi_dropped_cpu);
2017
2018/**
2019 * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits 2597 * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits
2020 * @buffer: The ring buffer 2598 * @buffer: The ring buffer
2021 * @cpu: The per CPU buffer to get the number of overruns from 2599 * @cpu: The per CPU buffer to get the number of overruns from
@@ -2030,7 +2608,7 @@ ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu)
2030 return 0; 2608 return 0;
2031 2609
2032 cpu_buffer = buffer->buffers[cpu]; 2610 cpu_buffer = buffer->buffers[cpu];
2033 ret = cpu_buffer->commit_overrun; 2611 ret = local_read(&cpu_buffer->commit_overrun);
2034 2612
2035 return ret; 2613 return ret;
2036} 2614}
@@ -2053,7 +2631,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
2053 for_each_buffer_cpu(buffer, cpu) { 2631 for_each_buffer_cpu(buffer, cpu) {
2054 cpu_buffer = buffer->buffers[cpu]; 2632 cpu_buffer = buffer->buffers[cpu];
2055 entries += (local_read(&cpu_buffer->entries) - 2633 entries += (local_read(&cpu_buffer->entries) -
2056 cpu_buffer->overrun) - cpu_buffer->read; 2634 local_read(&cpu_buffer->overrun)) - cpu_buffer->read;
2057 } 2635 }
2058 2636
2059 return entries; 2637 return entries;
@@ -2076,7 +2654,7 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
2076 /* if you care about this being correct, lock the buffer */ 2654 /* if you care about this being correct, lock the buffer */
2077 for_each_buffer_cpu(buffer, cpu) { 2655 for_each_buffer_cpu(buffer, cpu) {
2078 cpu_buffer = buffer->buffers[cpu]; 2656 cpu_buffer = buffer->buffers[cpu];
2079 overruns += cpu_buffer->overrun; 2657 overruns += local_read(&cpu_buffer->overrun);
2080 } 2658 }
2081 2659
2082 return overruns; 2660 return overruns;
@@ -2089,8 +2667,10 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
2089 2667
2090 /* Iterator usage is expected to have record disabled */ 2668 /* Iterator usage is expected to have record disabled */
2091 if (list_empty(&cpu_buffer->reader_page->list)) { 2669 if (list_empty(&cpu_buffer->reader_page->list)) {
2092 iter->head_page = cpu_buffer->head_page; 2670 iter->head_page = rb_set_head_page(cpu_buffer);
2093 iter->head = cpu_buffer->head_page->read; 2671 if (unlikely(!iter->head_page))
2672 return;
2673 iter->head = iter->head_page->read;
2094 } else { 2674 } else {
2095 iter->head_page = cpu_buffer->reader_page; 2675 iter->head_page = cpu_buffer->reader_page;
2096 iter->head = cpu_buffer->reader_page->read; 2676 iter->head = cpu_buffer->reader_page->read;
@@ -2207,6 +2787,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2207 struct buffer_page *reader = NULL; 2787 struct buffer_page *reader = NULL;
2208 unsigned long flags; 2788 unsigned long flags;
2209 int nr_loops = 0; 2789 int nr_loops = 0;
2790 int ret;
2210 2791
2211 local_irq_save(flags); 2792 local_irq_save(flags);
2212 __raw_spin_lock(&cpu_buffer->lock); 2793 __raw_spin_lock(&cpu_buffer->lock);
@@ -2240,30 +2821,56 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2240 goto out; 2821 goto out;
2241 2822
2242 /* 2823 /*
2243 * Splice the empty reader page into the list around the head.
2244 * Reset the reader page to size zero. 2824 * Reset the reader page to size zero.
2245 */ 2825 */
2826 local_set(&cpu_buffer->reader_page->write, 0);
2827 local_set(&cpu_buffer->reader_page->entries, 0);
2828 local_set(&cpu_buffer->reader_page->page->commit, 0);
2246 2829
2247 reader = cpu_buffer->head_page; 2830 spin:
2831 /*
2832 * Splice the empty reader page into the list around the head.
2833 */
2834 reader = rb_set_head_page(cpu_buffer);
2248 cpu_buffer->reader_page->list.next = reader->list.next; 2835 cpu_buffer->reader_page->list.next = reader->list.next;
2249 cpu_buffer->reader_page->list.prev = reader->list.prev; 2836 cpu_buffer->reader_page->list.prev = reader->list.prev;
2250 2837
2251 local_set(&cpu_buffer->reader_page->write, 0); 2838 /*
2252 local_set(&cpu_buffer->reader_page->entries, 0); 2839 * cpu_buffer->pages just needs to point to the buffer, it
2253 local_set(&cpu_buffer->reader_page->page->commit, 0); 2840 * has no specific buffer page to point to. Let's move it out
2841 * of our way so we don't accidentally swap it.
2842 */
2843 cpu_buffer->pages = reader->list.prev;
2254 2844
2255 /* Make the reader page now replace the head */ 2845 /* The reader page will be pointing to the new head */
2256 reader->list.prev->next = &cpu_buffer->reader_page->list; 2846 rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list);
2257 reader->list.next->prev = &cpu_buffer->reader_page->list; 2847
2848 /*
2849 * Here's the tricky part.
2850 *
2851 * We need to move the pointer past the header page.
2852 * But we can only do that if a writer is not currently
2853 * moving it. The page before the header page has the
2854 * flag bit '1' set if it is pointing to the page we want,
2855 * but if the writer is in the process of moving it
2856 * then it will be '2' or already moved '0'.
2857 */
2858
2859 ret = rb_head_page_replace(reader, cpu_buffer->reader_page);
2258 2860
2259 /* 2861 /*
2260 * If the tail is on the reader, then we must set the head 2862 * If we did not convert it, then we must try again.
2261 * to the inserted page, otherwise we set it one before.
2262 */ 2863 */
2263 cpu_buffer->head_page = cpu_buffer->reader_page; 2864 if (!ret)
2865 goto spin;
2264 2866
2265 if (cpu_buffer->commit_page != reader) 2867 /*
2266 rb_inc_page(cpu_buffer, &cpu_buffer->head_page); 2868 * Yeah! We succeeded in replacing the page.
2869 *
2870 * Now make the new head point back to the reader page.
2871 */
2872 reader->list.next->prev = &cpu_buffer->reader_page->list;
2873 rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
2267 2874
2268 /* Finally update the reader page to the new head */ 2875 /* Finally update the reader page to the new head */
2269 cpu_buffer->reader_page = reader; 2876 cpu_buffer->reader_page = reader;
@@ -2717,8 +3324,10 @@ EXPORT_SYMBOL_GPL(ring_buffer_size);
2717static void 3324static void
2718rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) 3325rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
2719{ 3326{
3327 rb_head_page_deactivate(cpu_buffer);
3328
2720 cpu_buffer->head_page 3329 cpu_buffer->head_page
2721 = list_entry(cpu_buffer->pages.next, struct buffer_page, list); 3330 = list_entry(cpu_buffer->pages, struct buffer_page, list);
2722 local_set(&cpu_buffer->head_page->write, 0); 3331 local_set(&cpu_buffer->head_page->write, 0);
2723 local_set(&cpu_buffer->head_page->entries, 0); 3332 local_set(&cpu_buffer->head_page->entries, 0);
2724 local_set(&cpu_buffer->head_page->page->commit, 0); 3333 local_set(&cpu_buffer->head_page->page->commit, 0);
@@ -2734,16 +3343,17 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
2734 local_set(&cpu_buffer->reader_page->page->commit, 0); 3343 local_set(&cpu_buffer->reader_page->page->commit, 0);
2735 cpu_buffer->reader_page->read = 0; 3344 cpu_buffer->reader_page->read = 0;
2736 3345
2737 cpu_buffer->nmi_dropped = 0; 3346 local_set(&cpu_buffer->commit_overrun, 0);
2738 cpu_buffer->commit_overrun = 0; 3347 local_set(&cpu_buffer->overrun, 0);
2739 cpu_buffer->overrun = 0;
2740 cpu_buffer->read = 0;
2741 local_set(&cpu_buffer->entries, 0); 3348 local_set(&cpu_buffer->entries, 0);
2742 local_set(&cpu_buffer->committing, 0); 3349 local_set(&cpu_buffer->committing, 0);
2743 local_set(&cpu_buffer->commits, 0); 3350 local_set(&cpu_buffer->commits, 0);
3351 cpu_buffer->read = 0;
2744 3352
2745 cpu_buffer->write_stamp = 0; 3353 cpu_buffer->write_stamp = 0;
2746 cpu_buffer->read_stamp = 0; 3354 cpu_buffer->read_stamp = 0;
3355
3356 rb_head_page_activate(cpu_buffer);
2747} 3357}
2748 3358
2749/** 3359/**
@@ -3091,7 +3701,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3091 read = 0; 3701 read = 0;
3092 } else { 3702 } else {
3093 /* update the entry counter */ 3703 /* update the entry counter */
3094 cpu_buffer->read += local_read(&reader->entries); 3704 cpu_buffer->read += rb_page_entries(reader);
3095 3705
3096 /* swap the pages */ 3706 /* swap the pages */
3097 rb_init_page(bpage); 3707 rb_init_page(bpage);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c22b40f8f576..e793cda91dd3 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -50,7 +50,7 @@ unsigned long __read_mostly tracing_thresh;
50 * On boot up, the ring buffer is set to the minimum size, so that 50 * On boot up, the ring buffer is set to the minimum size, so that
51 * we do not waste memory on systems that are not using tracing. 51 * we do not waste memory on systems that are not using tracing.
52 */ 52 */
53static int ring_buffer_expanded; 53int ring_buffer_expanded;
54 54
55/* 55/*
56 * We need to change this state when a selftest is running. 56 * We need to change this state when a selftest is running.
@@ -64,7 +64,7 @@ static bool __read_mostly tracing_selftest_running;
64/* 64/*
65 * If a tracer is running, we do not want to run SELFTEST. 65 * If a tracer is running, we do not want to run SELFTEST.
66 */ 66 */
67static bool __read_mostly tracing_selftest_disabled; 67bool __read_mostly tracing_selftest_disabled;
68 68
69/* For tracers that don't implement custom flags */ 69/* For tracers that don't implement custom flags */
70static struct tracer_opt dummy_tracer_opt[] = { 70static struct tracer_opt dummy_tracer_opt[] = {
@@ -89,7 +89,7 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set)
89 */ 89 */
90static int tracing_disabled = 1; 90static int tracing_disabled = 1;
91 91
92static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled); 92DEFINE_PER_CPU(local_t, ftrace_cpu_disabled);
93 93
94static inline void ftrace_disable_cpu(void) 94static inline void ftrace_disable_cpu(void)
95{ 95{
@@ -867,10 +867,6 @@ struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
867 867
868 return event; 868 return event;
869} 869}
870static void ftrace_trace_stack(struct trace_array *tr,
871 unsigned long flags, int skip, int pc);
872static void ftrace_trace_userstack(struct trace_array *tr,
873 unsigned long flags, int pc);
874 870
875static inline void __trace_buffer_unlock_commit(struct trace_array *tr, 871static inline void __trace_buffer_unlock_commit(struct trace_array *tr,
876 struct ring_buffer_event *event, 872 struct ring_buffer_event *event,
@@ -947,54 +943,6 @@ trace_function(struct trace_array *tr,
947 ring_buffer_unlock_commit(tr->buffer, event); 943 ring_buffer_unlock_commit(tr->buffer, event);
948} 944}
949 945
950#ifdef CONFIG_FUNCTION_GRAPH_TRACER
951static int __trace_graph_entry(struct trace_array *tr,
952 struct ftrace_graph_ent *trace,
953 unsigned long flags,
954 int pc)
955{
956 struct ftrace_event_call *call = &event_funcgraph_entry;
957 struct ring_buffer_event *event;
958 struct ftrace_graph_ent_entry *entry;
959
960 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
961 return 0;
962
963 event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_ENT,
964 sizeof(*entry), flags, pc);
965 if (!event)
966 return 0;
967 entry = ring_buffer_event_data(event);
968 entry->graph_ent = *trace;
969 if (!filter_current_check_discard(call, entry, event))
970 ring_buffer_unlock_commit(global_trace.buffer, event);
971
972 return 1;
973}
974
975static void __trace_graph_return(struct trace_array *tr,
976 struct ftrace_graph_ret *trace,
977 unsigned long flags,
978 int pc)
979{
980 struct ftrace_event_call *call = &event_funcgraph_exit;
981 struct ring_buffer_event *event;
982 struct ftrace_graph_ret_entry *entry;
983
984 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
985 return;
986
987 event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_RET,
988 sizeof(*entry), flags, pc);
989 if (!event)
990 return;
991 entry = ring_buffer_event_data(event);
992 entry->ret = *trace;
993 if (!filter_current_check_discard(call, entry, event))
994 ring_buffer_unlock_commit(global_trace.buffer, event);
995}
996#endif
997
998void 946void
999ftrace(struct trace_array *tr, struct trace_array_cpu *data, 947ftrace(struct trace_array *tr, struct trace_array_cpu *data,
1000 unsigned long ip, unsigned long parent_ip, unsigned long flags, 948 unsigned long ip, unsigned long parent_ip, unsigned long flags,
@@ -1004,11 +952,11 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data,
1004 trace_function(tr, ip, parent_ip, flags, pc); 952 trace_function(tr, ip, parent_ip, flags, pc);
1005} 953}
1006 954
955#ifdef CONFIG_STACKTRACE
1007static void __ftrace_trace_stack(struct trace_array *tr, 956static void __ftrace_trace_stack(struct trace_array *tr,
1008 unsigned long flags, 957 unsigned long flags,
1009 int skip, int pc) 958 int skip, int pc)
1010{ 959{
1011#ifdef CONFIG_STACKTRACE
1012 struct ftrace_event_call *call = &event_kernel_stack; 960 struct ftrace_event_call *call = &event_kernel_stack;
1013 struct ring_buffer_event *event; 961 struct ring_buffer_event *event;
1014 struct stack_entry *entry; 962 struct stack_entry *entry;
@@ -1029,12 +977,10 @@ static void __ftrace_trace_stack(struct trace_array *tr,
1029 save_stack_trace(&trace); 977 save_stack_trace(&trace);
1030 if (!filter_check_discard(call, entry, tr->buffer, event)) 978 if (!filter_check_discard(call, entry, tr->buffer, event))
1031 ring_buffer_unlock_commit(tr->buffer, event); 979 ring_buffer_unlock_commit(tr->buffer, event);
1032#endif
1033} 980}
1034 981
1035static void ftrace_trace_stack(struct trace_array *tr, 982void ftrace_trace_stack(struct trace_array *tr, unsigned long flags, int skip,
1036 unsigned long flags, 983 int pc)
1037 int skip, int pc)
1038{ 984{
1039 if (!(trace_flags & TRACE_ITER_STACKTRACE)) 985 if (!(trace_flags & TRACE_ITER_STACKTRACE))
1040 return; 986 return;
@@ -1042,17 +988,14 @@ static void ftrace_trace_stack(struct trace_array *tr,
1042 __ftrace_trace_stack(tr, flags, skip, pc); 988 __ftrace_trace_stack(tr, flags, skip, pc);
1043} 989}
1044 990
1045void __trace_stack(struct trace_array *tr, 991void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
1046 unsigned long flags, 992 int pc)
1047 int skip, int pc)
1048{ 993{
1049 __ftrace_trace_stack(tr, flags, skip, pc); 994 __ftrace_trace_stack(tr, flags, skip, pc);
1050} 995}
1051 996
1052static void ftrace_trace_userstack(struct trace_array *tr, 997void ftrace_trace_userstack(struct trace_array *tr, unsigned long flags, int pc)
1053 unsigned long flags, int pc)
1054{ 998{
1055#ifdef CONFIG_STACKTRACE
1056 struct ftrace_event_call *call = &event_user_stack; 999 struct ftrace_event_call *call = &event_user_stack;
1057 struct ring_buffer_event *event; 1000 struct ring_buffer_event *event;
1058 struct userstack_entry *entry; 1001 struct userstack_entry *entry;
@@ -1077,7 +1020,6 @@ static void ftrace_trace_userstack(struct trace_array *tr,
1077 save_stack_trace_user(&trace); 1020 save_stack_trace_user(&trace);
1078 if (!filter_check_discard(call, entry, tr->buffer, event)) 1021 if (!filter_check_discard(call, entry, tr->buffer, event))
1079 ring_buffer_unlock_commit(tr->buffer, event); 1022 ring_buffer_unlock_commit(tr->buffer, event);
1080#endif
1081} 1023}
1082 1024
1083#ifdef UNUSED 1025#ifdef UNUSED
@@ -1087,6 +1029,8 @@ static void __trace_userstack(struct trace_array *tr, unsigned long flags)
1087} 1029}
1088#endif /* UNUSED */ 1030#endif /* UNUSED */
1089 1031
1032#endif /* CONFIG_STACKTRACE */
1033
1090static void 1034static void
1091ftrace_trace_special(void *__tr, 1035ftrace_trace_special(void *__tr,
1092 unsigned long arg1, unsigned long arg2, unsigned long arg3, 1036 unsigned long arg1, unsigned long arg2, unsigned long arg3,
@@ -1115,62 +1059,6 @@ __trace_special(void *__tr, void *__data,
1115} 1059}
1116 1060
1117void 1061void
1118tracing_sched_switch_trace(struct trace_array *tr,
1119 struct task_struct *prev,
1120 struct task_struct *next,
1121 unsigned long flags, int pc)
1122{
1123 struct ftrace_event_call *call = &event_context_switch;
1124 struct ring_buffer_event *event;
1125 struct ctx_switch_entry *entry;
1126
1127 event = trace_buffer_lock_reserve(tr, TRACE_CTX,
1128 sizeof(*entry), flags, pc);
1129 if (!event)
1130 return;
1131 entry = ring_buffer_event_data(event);
1132 entry->prev_pid = prev->pid;
1133 entry->prev_prio = prev->prio;
1134 entry->prev_state = prev->state;
1135 entry->next_pid = next->pid;
1136 entry->next_prio = next->prio;
1137 entry->next_state = next->state;
1138 entry->next_cpu = task_cpu(next);
1139
1140 if (!filter_check_discard(call, entry, tr->buffer, event))
1141 trace_buffer_unlock_commit(tr, event, flags, pc);
1142}
1143
1144void
1145tracing_sched_wakeup_trace(struct trace_array *tr,
1146 struct task_struct *wakee,
1147 struct task_struct *curr,
1148 unsigned long flags, int pc)
1149{
1150 struct ftrace_event_call *call = &event_wakeup;
1151 struct ring_buffer_event *event;
1152 struct ctx_switch_entry *entry;
1153
1154 event = trace_buffer_lock_reserve(tr, TRACE_WAKE,
1155 sizeof(*entry), flags, pc);
1156 if (!event)
1157 return;
1158 entry = ring_buffer_event_data(event);
1159 entry->prev_pid = curr->pid;
1160 entry->prev_prio = curr->prio;
1161 entry->prev_state = curr->state;
1162 entry->next_pid = wakee->pid;
1163 entry->next_prio = wakee->prio;
1164 entry->next_state = wakee->state;
1165 entry->next_cpu = task_cpu(wakee);
1166
1167 if (!filter_check_discard(call, entry, tr->buffer, event))
1168 ring_buffer_unlock_commit(tr->buffer, event);
1169 ftrace_trace_stack(tr, flags, 6, pc);
1170 ftrace_trace_userstack(tr, flags, pc);
1171}
1172
1173void
1174ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) 1062ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
1175{ 1063{
1176 struct trace_array *tr = &global_trace; 1064 struct trace_array *tr = &global_trace;
@@ -1194,68 +1082,6 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
1194 local_irq_restore(flags); 1082 local_irq_restore(flags);
1195} 1083}
1196 1084
1197#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1198int trace_graph_entry(struct ftrace_graph_ent *trace)
1199{
1200 struct trace_array *tr = &global_trace;
1201 struct trace_array_cpu *data;
1202 unsigned long flags;
1203 long disabled;
1204 int ret;
1205 int cpu;
1206 int pc;
1207
1208 if (!ftrace_trace_task(current))
1209 return 0;
1210
1211 if (!ftrace_graph_addr(trace->func))
1212 return 0;
1213
1214 local_irq_save(flags);
1215 cpu = raw_smp_processor_id();
1216 data = tr->data[cpu];
1217 disabled = atomic_inc_return(&data->disabled);
1218 if (likely(disabled == 1)) {
1219 pc = preempt_count();
1220 ret = __trace_graph_entry(tr, trace, flags, pc);
1221 } else {
1222 ret = 0;
1223 }
1224 /* Only do the atomic if it is not already set */
1225 if (!test_tsk_trace_graph(current))
1226 set_tsk_trace_graph(current);
1227
1228 atomic_dec(&data->disabled);
1229 local_irq_restore(flags);
1230
1231 return ret;
1232}
1233
1234void trace_graph_return(struct ftrace_graph_ret *trace)
1235{
1236 struct trace_array *tr = &global_trace;
1237 struct trace_array_cpu *data;
1238 unsigned long flags;
1239 long disabled;
1240 int cpu;
1241 int pc;
1242
1243 local_irq_save(flags);
1244 cpu = raw_smp_processor_id();
1245 data = tr->data[cpu];
1246 disabled = atomic_inc_return(&data->disabled);
1247 if (likely(disabled == 1)) {
1248 pc = preempt_count();
1249 __trace_graph_return(tr, trace, flags, pc);
1250 }
1251 if (!trace->depth)
1252 clear_tsk_trace_graph(current);
1253 atomic_dec(&data->disabled);
1254 local_irq_restore(flags);
1255}
1256#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
1257
1258
1259/** 1085/**
1260 * trace_vbprintk - write binary msg to tracing buffer 1086 * trace_vbprintk - write binary msg to tracing buffer
1261 * 1087 *
@@ -2257,8 +2083,8 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf,
2257 len += 3; /* "no" and newline */ 2083 len += 3; /* "no" and newline */
2258 } 2084 }
2259 2085
2260 /* +2 for \n and \0 */ 2086 /* +1 for \0 */
2261 buf = kmalloc(len + 2, GFP_KERNEL); 2087 buf = kmalloc(len + 1, GFP_KERNEL);
2262 if (!buf) { 2088 if (!buf) {
2263 mutex_unlock(&trace_types_lock); 2089 mutex_unlock(&trace_types_lock);
2264 return -ENOMEM; 2090 return -ENOMEM;
@@ -2281,7 +2107,7 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf,
2281 } 2107 }
2282 mutex_unlock(&trace_types_lock); 2108 mutex_unlock(&trace_types_lock);
2283 2109
2284 WARN_ON(r >= len + 2); 2110 WARN_ON(r >= len + 1);
2285 2111
2286 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 2112 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2287 2113
@@ -3633,9 +3459,6 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
3633 cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); 3459 cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu);
3634 trace_seq_printf(s, "commit overrun: %ld\n", cnt); 3460 trace_seq_printf(s, "commit overrun: %ld\n", cnt);
3635 3461
3636 cnt = ring_buffer_nmi_dropped_cpu(tr->buffer, cpu);
3637 trace_seq_printf(s, "nmi dropped: %ld\n", cnt);
3638
3639 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); 3462 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
3640 3463
3641 kfree(s); 3464 kfree(s);
@@ -4273,7 +4096,6 @@ void ftrace_dump(void)
4273 4096
4274__init static int tracer_alloc_buffers(void) 4097__init static int tracer_alloc_buffers(void)
4275{ 4098{
4276 struct trace_array_cpu *data;
4277 int ring_buf_size; 4099 int ring_buf_size;
4278 int i; 4100 int i;
4279 int ret = -ENOMEM; 4101 int ret = -ENOMEM;
@@ -4323,7 +4145,7 @@ __init static int tracer_alloc_buffers(void)
4323 4145
4324 /* Allocate the first page for all buffers */ 4146 /* Allocate the first page for all buffers */
4325 for_each_tracing_cpu(i) { 4147 for_each_tracing_cpu(i) {
4326 data = global_trace.data[i] = &per_cpu(global_trace_cpu, i); 4148 global_trace.data[i] = &per_cpu(global_trace_cpu, i);
4327 max_tr.data[i] = &per_cpu(max_data, i); 4149 max_tr.data[i] = &per_cpu(max_data, i);
4328 } 4150 }
4329 4151
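
The tracing_trace_options_read() hunk above trims the read buffer to len + 1: each option line costs strlen(name) + 1 byte for the '\n' (plus 2 more for a "no" prefix when the option is disabled), and only a single extra byte is needed for the trailing '\0'. A minimal userspace sketch of that sizing, under the assumption of a made-up option table (the names happen to match real trace options, but the loop is illustrative only):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    int main(void)
    {
            const char *opts[] = { "print-parent", "sym-offset", "stacktrace" };
            int enabled[]      = { 1, 0, 1 };
            size_t len = 0, r = 0, i;
            char *buf;

            /* Each option line is "name\n" or "noname\n": strlen + 1 or + 3. */
            for (i = 0; i < 3; i++)
                    len += strlen(opts[i]) + (enabled[i] ? 1 : 3);

            buf = malloc(len + 1);          /* +1 for the trailing '\0' only */
            if (!buf)
                    return 1;

            for (i = 0; i < 3; i++)
                    r += snprintf(buf + r, len + 1 - r, "%s%s\n",
                                  enabled[i] ? "" : "no", opts[i]);

            printf("%s", buf);              /* r == len; buf[len] is '\0' */
            free(buf);
            return 0;
    }
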
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 8b9f4f6e9559..d682357e4b1f 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -467,6 +467,7 @@ void trace_function(struct trace_array *tr,
467 467
468void trace_graph_return(struct ftrace_graph_ret *trace); 468void trace_graph_return(struct ftrace_graph_ret *trace);
469int trace_graph_entry(struct ftrace_graph_ent *trace); 469int trace_graph_entry(struct ftrace_graph_ent *trace);
470void set_graph_array(struct trace_array *tr);
470 471
471void tracing_start_cmdline_record(void); 472void tracing_start_cmdline_record(void);
472void tracing_stop_cmdline_record(void); 473void tracing_stop_cmdline_record(void);
@@ -485,9 +486,31 @@ void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
485void update_max_tr_single(struct trace_array *tr, 486void update_max_tr_single(struct trace_array *tr,
486 struct task_struct *tsk, int cpu); 487 struct task_struct *tsk, int cpu);
487 488
488void __trace_stack(struct trace_array *tr, 489#ifdef CONFIG_STACKTRACE
489 unsigned long flags, 490void ftrace_trace_stack(struct trace_array *tr, unsigned long flags,
490 int skip, int pc); 491 int skip, int pc);
492
493void ftrace_trace_userstack(struct trace_array *tr, unsigned long flags,
494 int pc);
495
496void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
497 int pc);
498#else
499static inline void ftrace_trace_stack(struct trace_array *tr,
500 unsigned long flags, int skip, int pc)
501{
502}
503
504static inline void ftrace_trace_userstack(struct trace_array *tr,
505 unsigned long flags, int pc)
506{
507}
508
509static inline void __trace_stack(struct trace_array *tr, unsigned long flags,
510 int skip, int pc)
511{
512}
513#endif /* CONFIG_STACKTRACE */
491 514
492extern cycle_t ftrace_now(int cpu); 515extern cycle_t ftrace_now(int cpu);
493 516
@@ -513,6 +536,10 @@ extern unsigned long ftrace_update_tot_cnt;
513extern int DYN_FTRACE_TEST_NAME(void); 536extern int DYN_FTRACE_TEST_NAME(void);
514#endif 537#endif
515 538
539extern int ring_buffer_expanded;
540extern bool tracing_selftest_disabled;
541DECLARE_PER_CPU(local_t, ftrace_cpu_disabled);
542
516#ifdef CONFIG_FTRACE_STARTUP_TEST 543#ifdef CONFIG_FTRACE_STARTUP_TEST
517extern int trace_selftest_startup_function(struct tracer *trace, 544extern int trace_selftest_startup_function(struct tracer *trace,
518 struct trace_array *tr); 545 struct trace_array *tr);
@@ -743,13 +770,15 @@ struct event_filter {
743 int n_preds; 770 int n_preds;
744 struct filter_pred **preds; 771 struct filter_pred **preds;
745 char *filter_string; 772 char *filter_string;
773 bool no_reset;
746}; 774};
747 775
748struct event_subsystem { 776struct event_subsystem {
749 struct list_head list; 777 struct list_head list;
750 const char *name; 778 const char *name;
751 struct dentry *entry; 779 struct dentry *entry;
752 void *filter; 780 struct event_filter *filter;
781 int nr_events;
753}; 782};
754 783
755struct filter_pred; 784struct filter_pred;
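
The CONFIG_STACKTRACE block added to trace.h uses a common kernel idiom for optional code: real prototypes when the option is enabled, empty static inline stubs when it is not, so callers such as tracing_sched_wakeup_trace() need no #ifdef of their own. A userspace sketch of the same idiom, where the HAVE_STACKTRACE switch and the record_stack() helper are made-up names for illustration:

    #include <stdio.h>

    #ifdef HAVE_STACKTRACE                  /* e.g. cc -DHAVE_STACKTRACE demo.c */
    static void record_stack(int skip)
    {
            printf("recording stack, skipping %d frames\n", skip);
    }
    #else
    /* Compiled-out variant: an empty stub, so call sites need no #ifdef. */
    static inline void record_stack(int skip) { (void)skip; }
    #endif

    int main(void)
    {
            record_stack(2);                /* always safe to call */
            return 0;
    }
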
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index e75276a49cf5..e0cbede96783 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -17,6 +17,8 @@
17#include <linux/ctype.h> 17#include <linux/ctype.h>
18#include <linux/delay.h> 18#include <linux/delay.h>
19 19
20#include <asm/setup.h>
21
20#include "trace_output.h" 22#include "trace_output.h"
21 23
22#define TRACE_SYSTEM "TRACE_SYSTEM" 24#define TRACE_SYSTEM "TRACE_SYSTEM"
@@ -849,8 +851,10 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
849 851
850 /* First see if we did not already create this dir */ 852 /* First see if we did not already create this dir */
851 list_for_each_entry(system, &event_subsystems, list) { 853 list_for_each_entry(system, &event_subsystems, list) {
852 if (strcmp(system->name, name) == 0) 854 if (strcmp(system->name, name) == 0) {
855 system->nr_events++;
853 return system->entry; 856 return system->entry;
857 }
854 } 858 }
855 859
856 /* need to create new entry */ 860 /* need to create new entry */
@@ -869,6 +873,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
869 return d_events; 873 return d_events;
870 } 874 }
871 875
876 system->nr_events = 1;
872 system->name = kstrdup(name, GFP_KERNEL); 877 system->name = kstrdup(name, GFP_KERNEL);
873 if (!system->name) { 878 if (!system->name) {
874 debugfs_remove(system->entry); 879 debugfs_remove(system->entry);
@@ -987,6 +992,32 @@ struct ftrace_module_file_ops {
987 struct file_operations filter; 992 struct file_operations filter;
988}; 993};
989 994
995static void remove_subsystem_dir(const char *name)
996{
997 struct event_subsystem *system;
998
999 if (strcmp(name, TRACE_SYSTEM) == 0)
1000 return;
1001
1002 list_for_each_entry(system, &event_subsystems, list) {
1003 if (strcmp(system->name, name) == 0) {
1004 if (!--system->nr_events) {
1005 struct event_filter *filter = system->filter;
1006
1007 debugfs_remove_recursive(system->entry);
1008 list_del(&system->list);
1009 if (filter) {
1010 kfree(filter->filter_string);
1011 kfree(filter);
1012 }
1013 kfree(system->name);
1014 kfree(system);
1015 }
1016 break;
1017 }
1018 }
1019}
1020
990static struct ftrace_module_file_ops * 1021static struct ftrace_module_file_ops *
991trace_create_file_ops(struct module *mod) 1022trace_create_file_ops(struct module *mod)
992{ 1023{
@@ -1077,6 +1108,7 @@ static void trace_module_remove_events(struct module *mod)
1077 list_del(&call->list); 1108 list_del(&call->list);
1078 trace_destroy_fields(call); 1109 trace_destroy_fields(call);
1079 destroy_preds(call); 1110 destroy_preds(call);
1111 remove_subsystem_dir(call->system);
1080 } 1112 }
1081 } 1113 }
1082 1114
@@ -1133,6 +1165,18 @@ struct notifier_block trace_module_nb = {
1133extern struct ftrace_event_call __start_ftrace_events[]; 1165extern struct ftrace_event_call __start_ftrace_events[];
1134extern struct ftrace_event_call __stop_ftrace_events[]; 1166extern struct ftrace_event_call __stop_ftrace_events[];
1135 1167
1168static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;
1169
1170static __init int setup_trace_event(char *str)
1171{
1172 strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE);
1173 ring_buffer_expanded = 1;
1174 tracing_selftest_disabled = 1;
1175
1176 return 1;
1177}
1178__setup("trace_event=", setup_trace_event);
1179
1136static __init int event_trace_init(void) 1180static __init int event_trace_init(void)
1137{ 1181{
1138 struct ftrace_event_call *call; 1182 struct ftrace_event_call *call;
@@ -1140,6 +1184,8 @@ static __init int event_trace_init(void)
1140 struct dentry *entry; 1184 struct dentry *entry;
1141 struct dentry *d_events; 1185 struct dentry *d_events;
1142 int ret; 1186 int ret;
1187 char *buf = bootup_event_buf;
1188 char *token;
1143 1189
1144 d_tracer = tracing_init_dentry(); 1190 d_tracer = tracing_init_dentry();
1145 if (!d_tracer) 1191 if (!d_tracer)
@@ -1185,6 +1231,19 @@ static __init int event_trace_init(void)
1185 &ftrace_event_format_fops); 1231 &ftrace_event_format_fops);
1186 } 1232 }
1187 1233
1234 while (true) {
1235 token = strsep(&buf, ",");
1236
1237 if (!token)
1238 break;
1239 if (!*token)
1240 continue;
1241
1242 ret = ftrace_set_clr_event(token, 1);
1243 if (ret)
1244 pr_warning("Failed to enable trace event: %s\n", token);
1245 }
1246
1188 ret = register_module_notifier(&trace_module_nb); 1247 ret = register_module_notifier(&trace_module_nb);
1189 if (ret) 1248 if (ret)
1190 pr_warning("Failed to register trace events module notifier\n"); 1249 pr_warning("Failed to register trace events module notifier\n");
@@ -1392,10 +1451,10 @@ static __init void event_trace_self_test_with_function(void)
1392 1451
1393static __init int event_trace_self_tests_init(void) 1452static __init int event_trace_self_tests_init(void)
1394{ 1453{
1395 1454 if (!tracing_selftest_disabled) {
1396 event_trace_self_tests(); 1455 event_trace_self_tests();
1397 1456 event_trace_self_test_with_function();
1398 event_trace_self_test_with_function(); 1457 }
1399 1458
1400 return 0; 1459 return 0;
1401} 1460}
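
setup_trace_event() above only copies the trace_event= argument into bootup_event_buf; event_trace_init() later walks that buffer with strsep(), skipping empty tokens and enabling each named event. A userspace sketch of that tokenizing loop, with enable_event() standing in for ftrace_set_clr_event() and the event list being an example value:

    #define _DEFAULT_SOURCE                 /* for strsep() on glibc */
    #include <stdio.h>
    #include <string.h>

    static int enable_event(const char *name)
    {
            printf("enabling %s\n", name);
            return 0;
    }

    int main(void)
    {
            char list[] = "sched:sched_switch,,irq:irq_handler_entry";
            char *buf = list;
            char *token;

            while (1) {
                    token = strsep(&buf, ",");
                    if (!token)
                            break;          /* list exhausted */
                    if (!*token)
                            continue;       /* skip empty entries such as ",," */
                    if (enable_event(token))
                            fprintf(stderr, "Failed to enable trace event: %s\n",
                                    token);
            }
            return 0;
    }
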
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index f32dc9d1ea7b..490337abed75 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -176,11 +176,13 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
176static int filter_pred_strloc(struct filter_pred *pred, void *event, 176static int filter_pred_strloc(struct filter_pred *pred, void *event,
177 int val1, int val2) 177 int val1, int val2)
178{ 178{
179 unsigned short str_loc = *(unsigned short *)(event + pred->offset); 179 u32 str_item = *(u32 *)(event + pred->offset);
180 int str_loc = str_item & 0xffff;
181 int str_len = str_item >> 16;
180 char *addr = (char *)(event + str_loc); 182 char *addr = (char *)(event + str_loc);
181 int cmp, match; 183 int cmp, match;
182 184
183 cmp = strncmp(addr, pred->str_val, pred->str_len); 185 cmp = strncmp(addr, pred->str_val, str_len);
184 186
185 match = (!cmp) ^ pred->not; 187 match = (!cmp) ^ pred->not;
186 188
@@ -418,24 +420,29 @@ oom:
418} 420}
419EXPORT_SYMBOL_GPL(init_preds); 421EXPORT_SYMBOL_GPL(init_preds);
420 422
421static void filter_free_subsystem_preds(struct event_subsystem *system) 423enum {
424 FILTER_DISABLE_ALL,
425 FILTER_INIT_NO_RESET,
426 FILTER_SKIP_NO_RESET,
427};
428
429static void filter_free_subsystem_preds(struct event_subsystem *system,
430 int flag)
422{ 431{
423 struct event_filter *filter = system->filter;
424 struct ftrace_event_call *call; 432 struct ftrace_event_call *call;
425 int i;
426
427 if (filter->n_preds) {
428 for (i = 0; i < filter->n_preds; i++)
429 filter_free_pred(filter->preds[i]);
430 kfree(filter->preds);
431 filter->preds = NULL;
432 filter->n_preds = 0;
433 }
434 433
435 list_for_each_entry(call, &ftrace_events, list) { 434 list_for_each_entry(call, &ftrace_events, list) {
436 if (!call->define_fields) 435 if (!call->define_fields)
437 continue; 436 continue;
438 437
438 if (flag == FILTER_INIT_NO_RESET) {
439 call->filter->no_reset = false;
440 continue;
441 }
442
443 if (flag == FILTER_SKIP_NO_RESET && call->filter->no_reset)
444 continue;
445
439 if (!strcmp(call->system, system->name)) { 446 if (!strcmp(call->system, system->name)) {
440 filter_disable_preds(call); 447 filter_disable_preds(call);
441 remove_filter_string(call->filter); 448 remove_filter_string(call->filter);
@@ -537,7 +544,8 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size,
537 544
538static int filter_add_pred(struct filter_parse_state *ps, 545static int filter_add_pred(struct filter_parse_state *ps,
539 struct ftrace_event_call *call, 546 struct ftrace_event_call *call,
540 struct filter_pred *pred) 547 struct filter_pred *pred,
548 bool dry_run)
541{ 549{
542 struct ftrace_event_field *field; 550 struct ftrace_event_field *field;
543 filter_pred_fn_t fn; 551 filter_pred_fn_t fn;
@@ -549,10 +557,12 @@ static int filter_add_pred(struct filter_parse_state *ps,
549 557
550 if (pred->op == OP_AND) { 558 if (pred->op == OP_AND) {
551 pred->pop_n = 2; 559 pred->pop_n = 2;
552 return filter_add_pred_fn(ps, call, pred, filter_pred_and); 560 fn = filter_pred_and;
561 goto add_pred_fn;
553 } else if (pred->op == OP_OR) { 562 } else if (pred->op == OP_OR) {
554 pred->pop_n = 2; 563 pred->pop_n = 2;
555 return filter_add_pred_fn(ps, call, pred, filter_pred_or); 564 fn = filter_pred_or;
565 goto add_pred_fn;
556 } 566 }
557 567
558 field = find_event_field(call, pred->field_name); 568 field = find_event_field(call, pred->field_name);
@@ -575,9 +585,6 @@ static int filter_add_pred(struct filter_parse_state *ps,
575 else 585 else
576 fn = filter_pred_strloc; 586 fn = filter_pred_strloc;
577 pred->str_len = field->size; 587 pred->str_len = field->size;
578 if (pred->op == OP_NE)
579 pred->not = 1;
580 return filter_add_pred_fn(ps, call, pred, fn);
581 } else { 588 } else {
582 if (field->is_signed) 589 if (field->is_signed)
583 ret = strict_strtoll(pred->str_val, 0, &val); 590 ret = strict_strtoll(pred->str_val, 0, &val);
@@ -588,41 +595,33 @@ static int filter_add_pred(struct filter_parse_state *ps,
588 return -EINVAL; 595 return -EINVAL;
589 } 596 }
590 pred->val = val; 597 pred->val = val;
591 }
592 598
593 fn = select_comparison_fn(pred->op, field->size, field->is_signed); 599 fn = select_comparison_fn(pred->op, field->size,
594 if (!fn) { 600 field->is_signed);
595 parse_error(ps, FILT_ERR_INVALID_OP, 0); 601 if (!fn) {
596 return -EINVAL; 602 parse_error(ps, FILT_ERR_INVALID_OP, 0);
603 return -EINVAL;
604 }
597 } 605 }
598 606
599 if (pred->op == OP_NE) 607 if (pred->op == OP_NE)
600 pred->not = 1; 608 pred->not = 1;
601 609
602 return filter_add_pred_fn(ps, call, pred, fn); 610add_pred_fn:
611 if (!dry_run)
612 return filter_add_pred_fn(ps, call, pred, fn);
613 return 0;
603} 614}
604 615
605static int filter_add_subsystem_pred(struct filter_parse_state *ps, 616static int filter_add_subsystem_pred(struct filter_parse_state *ps,
606 struct event_subsystem *system, 617 struct event_subsystem *system,
607 struct filter_pred *pred, 618 struct filter_pred *pred,
608 char *filter_string) 619 char *filter_string,
620 bool dry_run)
609{ 621{
610 struct event_filter *filter = system->filter;
611 struct ftrace_event_call *call; 622 struct ftrace_event_call *call;
612 int err = 0; 623 int err = 0;
613 624 bool fail = true;
614 if (!filter->preds) {
615 filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
616 GFP_KERNEL);
617
618 if (!filter->preds)
619 return -ENOMEM;
620 }
621
622 if (filter->n_preds == MAX_FILTER_PRED) {
623 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
624 return -ENOSPC;
625 }
626 625
627 list_for_each_entry(call, &ftrace_events, list) { 626 list_for_each_entry(call, &ftrace_events, list) {
628 627
@@ -632,19 +631,24 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps,
632 if (strcmp(call->system, system->name)) 631 if (strcmp(call->system, system->name))
633 continue; 632 continue;
634 633
635 err = filter_add_pred(ps, call, pred); 634 if (call->filter->no_reset)
636 if (err) { 635 continue;
637 filter_free_subsystem_preds(system); 636
638 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); 637 err = filter_add_pred(ps, call, pred, dry_run);
639 goto out; 638 if (err)
640 } 639 call->filter->no_reset = true;
641 replace_filter_string(call->filter, filter_string); 640 else
641 fail = false;
642
643 if (!dry_run)
644 replace_filter_string(call->filter, filter_string);
642 } 645 }
643 646
644 filter->preds[filter->n_preds] = pred; 647 if (fail) {
645 filter->n_preds++; 648 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
646out: 649 return err;
647 return err; 650 }
651 return 0;
648} 652}
649 653
650static void parse_init(struct filter_parse_state *ps, 654static void parse_init(struct filter_parse_state *ps,
@@ -1003,12 +1007,14 @@ static int check_preds(struct filter_parse_state *ps)
1003static int replace_preds(struct event_subsystem *system, 1007static int replace_preds(struct event_subsystem *system,
1004 struct ftrace_event_call *call, 1008 struct ftrace_event_call *call,
1005 struct filter_parse_state *ps, 1009 struct filter_parse_state *ps,
1006 char *filter_string) 1010 char *filter_string,
1011 bool dry_run)
1007{ 1012{
1008 char *operand1 = NULL, *operand2 = NULL; 1013 char *operand1 = NULL, *operand2 = NULL;
1009 struct filter_pred *pred; 1014 struct filter_pred *pred;
1010 struct postfix_elt *elt; 1015 struct postfix_elt *elt;
1011 int err; 1016 int err;
1017 int n_preds = 0;
1012 1018
1013 err = check_preds(ps); 1019 err = check_preds(ps);
1014 if (err) 1020 if (err)
@@ -1027,24 +1033,14 @@ static int replace_preds(struct event_subsystem *system,
1027 continue; 1033 continue;
1028 } 1034 }
1029 1035
1036 if (n_preds++ == MAX_FILTER_PRED) {
1037 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
1038 return -ENOSPC;
1039 }
1040
1030 if (elt->op == OP_AND || elt->op == OP_OR) { 1041 if (elt->op == OP_AND || elt->op == OP_OR) {
1031 pred = create_logical_pred(elt->op); 1042 pred = create_logical_pred(elt->op);
1032 if (!pred) 1043 goto add_pred;
1033 return -ENOMEM;
1034 if (call) {
1035 err = filter_add_pred(ps, call, pred);
1036 filter_free_pred(pred);
1037 } else {
1038 err = filter_add_subsystem_pred(ps, system,
1039 pred, filter_string);
1040 if (err)
1041 filter_free_pred(pred);
1042 }
1043 if (err)
1044 return err;
1045
1046 operand1 = operand2 = NULL;
1047 continue;
1048 } 1044 }
1049 1045
1050 if (!operand1 || !operand2) { 1046 if (!operand1 || !operand2) {
@@ -1053,17 +1049,15 @@ static int replace_preds(struct event_subsystem *system,
1053 } 1049 }
1054 1050
1055 pred = create_pred(elt->op, operand1, operand2); 1051 pred = create_pred(elt->op, operand1, operand2);
1052add_pred:
1056 if (!pred) 1053 if (!pred)
1057 return -ENOMEM; 1054 return -ENOMEM;
1058 if (call) { 1055 if (call)
1059 err = filter_add_pred(ps, call, pred); 1056 err = filter_add_pred(ps, call, pred, false);
1060 filter_free_pred(pred); 1057 else
1061 } else {
1062 err = filter_add_subsystem_pred(ps, system, pred, 1058 err = filter_add_subsystem_pred(ps, system, pred,
1063 filter_string); 1059 filter_string, dry_run);
1064 if (err) 1060 filter_free_pred(pred);
1065 filter_free_pred(pred);
1066 }
1067 if (err) 1061 if (err)
1068 return err; 1062 return err;
1069 1063
@@ -1103,7 +1097,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1103 goto out; 1097 goto out;
1104 } 1098 }
1105 1099
1106 err = replace_preds(NULL, call, ps, filter_string); 1100 err = replace_preds(NULL, call, ps, filter_string, false);
1107 if (err) 1101 if (err)
1108 append_filter_err(ps, call->filter); 1102 append_filter_err(ps, call->filter);
1109 1103
@@ -1127,7 +1121,7 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1127 mutex_lock(&event_mutex); 1121 mutex_lock(&event_mutex);
1128 1122
1129 if (!strcmp(strstrip(filter_string), "0")) { 1123 if (!strcmp(strstrip(filter_string), "0")) {
1130 filter_free_subsystem_preds(system); 1124 filter_free_subsystem_preds(system, FILTER_DISABLE_ALL);
1131 remove_filter_string(system->filter); 1125 remove_filter_string(system->filter);
1132 mutex_unlock(&event_mutex); 1126 mutex_unlock(&event_mutex);
1133 return 0; 1127 return 0;
@@ -1138,7 +1132,6 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1138 if (!ps) 1132 if (!ps)
1139 goto out_unlock; 1133 goto out_unlock;
1140 1134
1141 filter_free_subsystem_preds(system);
1142 replace_filter_string(system->filter, filter_string); 1135 replace_filter_string(system->filter, filter_string);
1143 1136
1144 parse_init(ps, filter_ops, filter_string); 1137 parse_init(ps, filter_ops, filter_string);
@@ -1148,9 +1141,23 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1148 goto out; 1141 goto out;
1149 } 1142 }
1150 1143
1151 err = replace_preds(system, NULL, ps, filter_string); 1144 filter_free_subsystem_preds(system, FILTER_INIT_NO_RESET);
1152 if (err) 1145
 1146	/* see which events the filter can be applied to */
1147 err = replace_preds(system, NULL, ps, filter_string, true);
1148 if (err) {
1153 append_filter_err(ps, system->filter); 1149 append_filter_err(ps, system->filter);
1150 goto out;
1151 }
1152
1153 filter_free_subsystem_preds(system, FILTER_SKIP_NO_RESET);
1154
1155 /* really apply the filter to the events */
1156 err = replace_preds(system, NULL, ps, filter_string, false);
1157 if (err) {
1158 append_filter_err(ps, system->filter);
1159 filter_free_subsystem_preds(system, 2);
1160 }
1154 1161
1155out: 1162out:
1156 filter_opstack_clear(ps); 1163 filter_opstack_clear(ps);
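
filter_pred_strloc() now reads a single u32 for a dynamic string field: the low 16 bits give the string's offset inside the event record and the high 16 bits its length. A small sketch of that packing and unpacking; pack_strloc() and the example values are illustrative, only the unpacking mirrors the code above:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* low 16 bits: offset of the string inside the event; high 16 bits: length */
    static uint32_t pack_strloc(uint16_t offset, uint16_t len)
    {
            return (uint32_t)len << 16 | offset;
    }

    int main(void)
    {
            uint32_t str_item = pack_strloc(40, 13);        /* made-up values */

            int str_loc = str_item & 0xffff;    /* as in filter_pred_strloc() */
            int str_len = str_item >> 16;

            assert(str_loc == 40 && str_len == 13);
            printf("offset=%d len=%d\n", str_loc, str_len);
            return 0;
    }
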
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 75ef000613c3..5b01b94518fc 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -288,11 +288,9 @@ static int
288ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, 288ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
289 struct ftrace_probe_ops *ops, void *data) 289 struct ftrace_probe_ops *ops, void *data)
290{ 290{
291 char str[KSYM_SYMBOL_LEN];
292 long count = (long)data; 291 long count = (long)data;
293 292
294 kallsyms_lookup(ip, NULL, NULL, NULL, str); 293 seq_printf(m, "%pf:", (void *)ip);
295 seq_printf(m, "%s:", str);
296 294
297 if (ops == &traceon_probe_ops) 295 if (ops == &traceon_probe_ops)
298 seq_printf(m, "traceon"); 296 seq_printf(m, "traceon");
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 420ec3487579..3f4a251b7d16 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -52,7 +52,7 @@ static struct tracer_flags tracer_flags = {
52 .opts = trace_opts 52 .opts = trace_opts
53}; 53};
54 54
55/* pid on the last trace processed */ 55static struct trace_array *graph_array;
56 56
57 57
58/* Add a function return address to the trace stack on thread info.*/ 58/* Add a function return address to the trace stack on thread info.*/
@@ -166,10 +166,121 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
166 return ret; 166 return ret;
167} 167}
168 168
169static int __trace_graph_entry(struct trace_array *tr,
170 struct ftrace_graph_ent *trace,
171 unsigned long flags,
172 int pc)
173{
174 struct ftrace_event_call *call = &event_funcgraph_entry;
175 struct ring_buffer_event *event;
176 struct ftrace_graph_ent_entry *entry;
177
178 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
179 return 0;
180
181 event = trace_buffer_lock_reserve(tr, TRACE_GRAPH_ENT,
182 sizeof(*entry), flags, pc);
183 if (!event)
184 return 0;
185 entry = ring_buffer_event_data(event);
186 entry->graph_ent = *trace;
187 if (!filter_current_check_discard(call, entry, event))
188 ring_buffer_unlock_commit(tr->buffer, event);
189
190 return 1;
191}
192
193int trace_graph_entry(struct ftrace_graph_ent *trace)
194{
195 struct trace_array *tr = graph_array;
196 struct trace_array_cpu *data;
197 unsigned long flags;
198 long disabled;
199 int ret;
200 int cpu;
201 int pc;
202
203 if (unlikely(!tr))
204 return 0;
205
206 if (!ftrace_trace_task(current))
207 return 0;
208
209 if (!ftrace_graph_addr(trace->func))
210 return 0;
211
212 local_irq_save(flags);
213 cpu = raw_smp_processor_id();
214 data = tr->data[cpu];
215 disabled = atomic_inc_return(&data->disabled);
216 if (likely(disabled == 1)) {
217 pc = preempt_count();
218 ret = __trace_graph_entry(tr, trace, flags, pc);
219 } else {
220 ret = 0;
221 }
222 /* Only do the atomic if it is not already set */
223 if (!test_tsk_trace_graph(current))
224 set_tsk_trace_graph(current);
225
226 atomic_dec(&data->disabled);
227 local_irq_restore(flags);
228
229 return ret;
230}
231
232static void __trace_graph_return(struct trace_array *tr,
233 struct ftrace_graph_ret *trace,
234 unsigned long flags,
235 int pc)
236{
237 struct ftrace_event_call *call = &event_funcgraph_exit;
238 struct ring_buffer_event *event;
239 struct ftrace_graph_ret_entry *entry;
240
241 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
242 return;
243
244 event = trace_buffer_lock_reserve(tr, TRACE_GRAPH_RET,
245 sizeof(*entry), flags, pc);
246 if (!event)
247 return;
248 entry = ring_buffer_event_data(event);
249 entry->ret = *trace;
250 if (!filter_current_check_discard(call, entry, event))
251 ring_buffer_unlock_commit(tr->buffer, event);
252}
253
254void trace_graph_return(struct ftrace_graph_ret *trace)
255{
256 struct trace_array *tr = graph_array;
257 struct trace_array_cpu *data;
258 unsigned long flags;
259 long disabled;
260 int cpu;
261 int pc;
262
263 local_irq_save(flags);
264 cpu = raw_smp_processor_id();
265 data = tr->data[cpu];
266 disabled = atomic_inc_return(&data->disabled);
267 if (likely(disabled == 1)) {
268 pc = preempt_count();
269 __trace_graph_return(tr, trace, flags, pc);
270 }
271 if (!trace->depth)
272 clear_tsk_trace_graph(current);
273 atomic_dec(&data->disabled);
274 local_irq_restore(flags);
275}
276
169static int graph_trace_init(struct trace_array *tr) 277static int graph_trace_init(struct trace_array *tr)
170{ 278{
171 int ret = register_ftrace_graph(&trace_graph_return, 279 int ret;
172 &trace_graph_entry); 280
281 graph_array = tr;
282 ret = register_ftrace_graph(&trace_graph_return,
283 &trace_graph_entry);
173 if (ret) 284 if (ret)
174 return ret; 285 return ret;
175 tracing_start_cmdline_record(); 286 tracing_start_cmdline_record();
@@ -177,49 +288,30 @@ static int graph_trace_init(struct trace_array *tr)
177 return 0; 288 return 0;
178} 289}
179 290
291void set_graph_array(struct trace_array *tr)
292{
293 graph_array = tr;
294}
295
180static void graph_trace_reset(struct trace_array *tr) 296static void graph_trace_reset(struct trace_array *tr)
181{ 297{
182 tracing_stop_cmdline_record(); 298 tracing_stop_cmdline_record();
183 unregister_ftrace_graph(); 299 unregister_ftrace_graph();
184} 300}
185 301
186static inline int log10_cpu(int nb) 302static int max_bytes_for_cpu;
187{
188 if (nb / 100)
189 return 3;
190 if (nb / 10)
191 return 2;
192 return 1;
193}
194 303
195static enum print_line_t 304static enum print_line_t
196print_graph_cpu(struct trace_seq *s, int cpu) 305print_graph_cpu(struct trace_seq *s, int cpu)
197{ 306{
198 int i;
199 int ret; 307 int ret;
200 int log10_this = log10_cpu(cpu);
201 int log10_all = log10_cpu(cpumask_weight(cpu_online_mask));
202
203 308
204 /* 309 /*
205 * Start with a space character - to make it stand out 310 * Start with a space character - to make it stand out
206 * to the right a bit when trace output is pasted into 311 * to the right a bit when trace output is pasted into
207 * email: 312 * email:
208 */ 313 */
209 ret = trace_seq_printf(s, " "); 314 ret = trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu);
210
211 /*
212 * Tricky - we space the CPU field according to the max
213 * number of online CPUs. On a 2-cpu system it would take
214 * a maximum of 1 digit - on a 128 cpu system it would
215 * take up to 3 digits:
216 */
217 for (i = 0; i < log10_all - log10_this; i++) {
218 ret = trace_seq_printf(s, " ");
219 if (!ret)
220 return TRACE_TYPE_PARTIAL_LINE;
221 }
222 ret = trace_seq_printf(s, "%d) ", cpu);
223 if (!ret) 315 if (!ret)
224 return TRACE_TYPE_PARTIAL_LINE; 316 return TRACE_TYPE_PARTIAL_LINE;
225 317
@@ -565,11 +657,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
565 return TRACE_TYPE_PARTIAL_LINE; 657 return TRACE_TYPE_PARTIAL_LINE;
566 } 658 }
567 659
568 ret = seq_print_ip_sym(s, call->func, 0); 660 ret = trace_seq_printf(s, "%pf();\n", (void *)call->func);
569 if (!ret)
570 return TRACE_TYPE_PARTIAL_LINE;
571
572 ret = trace_seq_printf(s, "();\n");
573 if (!ret) 661 if (!ret)
574 return TRACE_TYPE_PARTIAL_LINE; 662 return TRACE_TYPE_PARTIAL_LINE;
575 663
@@ -612,11 +700,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
612 return TRACE_TYPE_PARTIAL_LINE; 700 return TRACE_TYPE_PARTIAL_LINE;
613 } 701 }
614 702
615 ret = seq_print_ip_sym(s, call->func, 0); 703 ret = trace_seq_printf(s, "%pf() {\n", (void *)call->func);
616 if (!ret)
617 return TRACE_TYPE_PARTIAL_LINE;
618
619 ret = trace_seq_printf(s, "() {\n");
620 if (!ret) 704 if (!ret)
621 return TRACE_TYPE_PARTIAL_LINE; 705 return TRACE_TYPE_PARTIAL_LINE;
622 706
@@ -934,6 +1018,8 @@ static struct tracer graph_trace __read_mostly = {
934 1018
935static __init int init_graph_trace(void) 1019static __init int init_graph_trace(void)
936{ 1020{
1021 max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
1022
937 return register_tracer(&graph_trace); 1023 return register_tracer(&graph_trace);
938} 1024}
939 1025
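
print_graph_cpu() replaces the hand-rolled log10_cpu() padding with a width computed once in init_graph_trace(): snprintf(NULL, 0, "%d", nr_cpu_ids - 1) returns how many characters the largest CPU number needs, and "%*d" pads every CPU field to that width. A userspace sketch of the same trick, with nr_cpus standing in for nr_cpu_ids and the output line being a placeholder:

    #include <stdio.h>

    int main(void)
    {
            int nr_cpus = 128;                  /* stand-in for nr_cpu_ids */
            int max_bytes_for_cpu;
            int cpu;

            /* snprintf(NULL, 0, ...) returns the length the text would need,
             * i.e. the width of the largest CPU number. */
            max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpus - 1);

            for (cpu = 0; cpu < nr_cpus; cpu += 37)
                    printf(" %*d) example_function();\n", max_bytes_for_cpu, cpu);
            return 0;
    }
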
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index a98106dd979c..e1285d7b5488 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -20,6 +20,34 @@ static int sched_ref;
20static DEFINE_MUTEX(sched_register_mutex); 20static DEFINE_MUTEX(sched_register_mutex);
21static int sched_stopped; 21static int sched_stopped;
22 22
23
24void
25tracing_sched_switch_trace(struct trace_array *tr,
26 struct task_struct *prev,
27 struct task_struct *next,
28 unsigned long flags, int pc)
29{
30 struct ftrace_event_call *call = &event_context_switch;
31 struct ring_buffer_event *event;
32 struct ctx_switch_entry *entry;
33
34 event = trace_buffer_lock_reserve(tr, TRACE_CTX,
35 sizeof(*entry), flags, pc);
36 if (!event)
37 return;
38 entry = ring_buffer_event_data(event);
39 entry->prev_pid = prev->pid;
40 entry->prev_prio = prev->prio;
41 entry->prev_state = prev->state;
42 entry->next_pid = next->pid;
43 entry->next_prio = next->prio;
44 entry->next_state = next->state;
45 entry->next_cpu = task_cpu(next);
46
47 if (!filter_check_discard(call, entry, tr->buffer, event))
48 trace_buffer_unlock_commit(tr, event, flags, pc);
49}
50
23static void 51static void
24probe_sched_switch(struct rq *__rq, struct task_struct *prev, 52probe_sched_switch(struct rq *__rq, struct task_struct *prev,
25 struct task_struct *next) 53 struct task_struct *next)
@@ -49,6 +77,35 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,
49 local_irq_restore(flags); 77 local_irq_restore(flags);
50} 78}
51 79
80void
81tracing_sched_wakeup_trace(struct trace_array *tr,
82 struct task_struct *wakee,
83 struct task_struct *curr,
84 unsigned long flags, int pc)
85{
86 struct ftrace_event_call *call = &event_wakeup;
87 struct ring_buffer_event *event;
88 struct ctx_switch_entry *entry;
89
90 event = trace_buffer_lock_reserve(tr, TRACE_WAKE,
91 sizeof(*entry), flags, pc);
92 if (!event)
93 return;
94 entry = ring_buffer_event_data(event);
95 entry->prev_pid = curr->pid;
96 entry->prev_prio = curr->prio;
97 entry->prev_state = curr->state;
98 entry->next_pid = wakee->pid;
99 entry->next_prio = wakee->prio;
100 entry->next_state = wakee->state;
101 entry->next_cpu = task_cpu(wakee);
102
103 if (!filter_check_discard(call, entry, tr->buffer, event))
104 ring_buffer_unlock_commit(tr->buffer, event);
105 ftrace_trace_stack(tr, flags, 6, pc);
106 ftrace_trace_userstack(tr, flags, pc);
107}
108
52static void 109static void
53probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success) 110probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success)
54{ 111{
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 00dd6485bdd7..d2cdbabb4ead 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -288,6 +288,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
288 * to detect and recover from possible hangs 288 * to detect and recover from possible hangs
289 */ 289 */
290 tracing_reset_online_cpus(tr); 290 tracing_reset_online_cpus(tr);
291 set_graph_array(tr);
291 ret = register_ftrace_graph(&trace_graph_return, 292 ret = register_ftrace_graph(&trace_graph_return,
292 &trace_graph_entry_watchdog); 293 &trace_graph_entry_watchdog);
293 if (ret) { 294 if (ret) {
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 6a2a9d484cd6..0da1cff08d67 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -234,15 +234,8 @@ static void t_stop(struct seq_file *m, void *p)
234static int trace_lookup_stack(struct seq_file *m, long i) 234static int trace_lookup_stack(struct seq_file *m, long i)
235{ 235{
236 unsigned long addr = stack_dump_trace[i]; 236 unsigned long addr = stack_dump_trace[i];
237#ifdef CONFIG_KALLSYMS
238 char str[KSYM_SYMBOL_LEN];
239 237
240 sprint_symbol(str, addr); 238 return seq_printf(m, "%pF\n", (void *)addr);
241
242 return seq_printf(m, "%s\n", str);
243#else
244 return seq_printf(m, "%p\n", (void*)addr);
245#endif
246} 239}
247 240
248static void print_disabled(struct seq_file *m) 241static void print_disabled(struct seq_file *m)
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index aea321c82fa0..07c60b09258f 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -49,7 +49,8 @@ static struct dentry *stat_dir;
49 * but it will at least advance closer to the next one 49 * but it will at least advance closer to the next one
50 * to be released. 50 * to be released.
51 */ 51 */
52static struct rb_node *release_next(struct rb_node *node) 52static struct rb_node *release_next(struct tracer_stat *ts,
53 struct rb_node *node)
53{ 54{
54 struct stat_node *snode; 55 struct stat_node *snode;
55 struct rb_node *parent = rb_parent(node); 56 struct rb_node *parent = rb_parent(node);
@@ -67,6 +68,8 @@ static struct rb_node *release_next(struct rb_node *node)
67 parent->rb_right = NULL; 68 parent->rb_right = NULL;
68 69
69 snode = container_of(node, struct stat_node, node); 70 snode = container_of(node, struct stat_node, node);
71 if (ts->stat_release)
72 ts->stat_release(snode->stat);
70 kfree(snode); 73 kfree(snode);
71 74
72 return parent; 75 return parent;
@@ -78,7 +81,7 @@ static void __reset_stat_session(struct stat_session *session)
78 struct rb_node *node = session->stat_root.rb_node; 81 struct rb_node *node = session->stat_root.rb_node;
79 82
80 while (node) 83 while (node)
81 node = release_next(node); 84 node = release_next(session->ts, node);
82 85
83 session->stat_root = RB_ROOT; 86 session->stat_root = RB_ROOT;
84} 87}
diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h
index f3546a2cd826..8f03914b9a6a 100644
--- a/kernel/trace/trace_stat.h
+++ b/kernel/trace/trace_stat.h
@@ -18,6 +18,8 @@ struct tracer_stat {
18 int (*stat_cmp)(void *p1, void *p2); 18 int (*stat_cmp)(void *p1, void *p2);
19 /* Print a stat entry */ 19 /* Print a stat entry */
20 int (*stat_show)(struct seq_file *s, void *p); 20 int (*stat_show)(struct seq_file *s, void *p);
21 /* Release an entry */
22 void (*stat_release)(void *stat);
21 /* Print the headers of your stat entries */ 23 /* Print the headers of your stat entries */
22 int (*stat_headers)(struct seq_file *s); 24 int (*stat_headers)(struct seq_file *s);
23}; 25};
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index 97fcea4acce1..40cafb07dffd 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -9,6 +9,7 @@
9#include <trace/events/workqueue.h> 9#include <trace/events/workqueue.h>
10#include <linux/list.h> 10#include <linux/list.h>
11#include <linux/percpu.h> 11#include <linux/percpu.h>
12#include <linux/kref.h>
12#include "trace_stat.h" 13#include "trace_stat.h"
13#include "trace.h" 14#include "trace.h"
14 15
@@ -16,6 +17,7 @@
16/* A cpu workqueue thread */ 17/* A cpu workqueue thread */
17struct cpu_workqueue_stats { 18struct cpu_workqueue_stats {
18 struct list_head list; 19 struct list_head list;
20 struct kref kref;
19 int cpu; 21 int cpu;
20 pid_t pid; 22 pid_t pid;
21/* Can be inserted from interrupt or user context, need to be atomic */ 23/* Can be inserted from interrupt or user context, need to be atomic */
@@ -39,6 +41,11 @@ struct workqueue_global_stats {
39static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat); 41static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat);
40#define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu)) 42#define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu))
41 43
44static void cpu_workqueue_stat_free(struct kref *kref)
45{
46 kfree(container_of(kref, struct cpu_workqueue_stats, kref));
47}
48
42/* Insertion of a work */ 49/* Insertion of a work */
43static void 50static void
44probe_workqueue_insertion(struct task_struct *wq_thread, 51probe_workqueue_insertion(struct task_struct *wq_thread,
@@ -96,8 +103,8 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
96 return; 103 return;
97 } 104 }
98 INIT_LIST_HEAD(&cws->list); 105 INIT_LIST_HEAD(&cws->list);
106 kref_init(&cws->kref);
99 cws->cpu = cpu; 107 cws->cpu = cpu;
100
101 cws->pid = wq_thread->pid; 108 cws->pid = wq_thread->pid;
102 109
103 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); 110 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
@@ -118,7 +125,7 @@ static void probe_workqueue_destruction(struct task_struct *wq_thread)
118 list) { 125 list) {
119 if (node->pid == wq_thread->pid) { 126 if (node->pid == wq_thread->pid) {
120 list_del(&node->list); 127 list_del(&node->list);
121 kfree(node); 128 kref_put(&node->kref, cpu_workqueue_stat_free);
122 goto found; 129 goto found;
123 } 130 }
124 } 131 }
@@ -137,9 +144,11 @@ static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu)
137 144
138 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); 145 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
139 146
140 if (!list_empty(&workqueue_cpu_stat(cpu)->list)) 147 if (!list_empty(&workqueue_cpu_stat(cpu)->list)) {
141 ret = list_entry(workqueue_cpu_stat(cpu)->list.next, 148 ret = list_entry(workqueue_cpu_stat(cpu)->list.next,
142 struct cpu_workqueue_stats, list); 149 struct cpu_workqueue_stats, list);
150 kref_get(&ret->kref);
151 }
143 152
144 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); 153 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
145 154
@@ -162,9 +171,9 @@ static void *workqueue_stat_start(struct tracer_stat *trace)
162static void *workqueue_stat_next(void *prev, int idx) 171static void *workqueue_stat_next(void *prev, int idx)
163{ 172{
164 struct cpu_workqueue_stats *prev_cws = prev; 173 struct cpu_workqueue_stats *prev_cws = prev;
174 struct cpu_workqueue_stats *ret;
165 int cpu = prev_cws->cpu; 175 int cpu = prev_cws->cpu;
166 unsigned long flags; 176 unsigned long flags;
167 void *ret = NULL;
168 177
169 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags); 178 spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
170 if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) { 179 if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) {
@@ -175,11 +184,14 @@ static void *workqueue_stat_next(void *prev, int idx)
175 return NULL; 184 return NULL;
176 } while (!(ret = workqueue_stat_start_cpu(cpu))); 185 } while (!(ret = workqueue_stat_start_cpu(cpu)));
177 return ret; 186 return ret;
187 } else {
188 ret = list_entry(prev_cws->list.next,
189 struct cpu_workqueue_stats, list);
190 kref_get(&ret->kref);
178 } 191 }
179 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags); 192 spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
180 193
181 return list_entry(prev_cws->list.next, struct cpu_workqueue_stats, 194 return ret;
182 list);
183} 195}
184 196
185static int workqueue_stat_show(struct seq_file *s, void *p) 197static int workqueue_stat_show(struct seq_file *s, void *p)
@@ -203,6 +215,13 @@ static int workqueue_stat_show(struct seq_file *s, void *p)
203 return 0; 215 return 0;
204} 216}
205 217
218static void workqueue_stat_release(void *stat)
219{
220 struct cpu_workqueue_stats *node = stat;
221
222 kref_put(&node->kref, cpu_workqueue_stat_free);
223}
224
206static int workqueue_stat_headers(struct seq_file *s) 225static int workqueue_stat_headers(struct seq_file *s)
207{ 226{
208 seq_printf(s, "# CPU INSERTED EXECUTED NAME\n"); 227 seq_printf(s, "# CPU INSERTED EXECUTED NAME\n");
@@ -215,6 +234,7 @@ struct tracer_stat workqueue_stats __read_mostly = {
215 .stat_start = workqueue_stat_start, 234 .stat_start = workqueue_stat_start,
216 .stat_next = workqueue_stat_next, 235 .stat_next = workqueue_stat_next,
217 .stat_show = workqueue_stat_show, 236 .stat_show = workqueue_stat_show,
237 .stat_release = workqueue_stat_release,
218 .stat_headers = workqueue_stat_headers 238 .stat_headers = workqueue_stat_headers
219}; 239};
220 240
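
The workqueue tracer hunks close a use-after-free by reference counting cpu_workqueue_stats: kref_init() on creation, kref_get() whenever the stat iterator hands an entry out, and kref_put() from both probe_workqueue_destruction() and the new stat_release() callback, so whichever side finishes last frees the entry. A userspace sketch of the same get/put ownership rules, using a plain (non-atomic) counter where the kernel uses struct kref; stat_get()/stat_put() are illustrative names, not kernel APIs:

    #include <stdio.h>
    #include <stdlib.h>

    struct stat_entry {
            int refcount;
            int cpu;
    };

    static struct stat_entry *stat_get(struct stat_entry *e)
    {
            e->refcount++;
            return e;
    }

    static void stat_put(struct stat_entry *e)
    {
            if (--e->refcount == 0) {
                    printf("freeing entry for cpu %d\n", e->cpu);
                    free(e);
            }
    }

    int main(void)
    {
            struct stat_entry *e = malloc(sizeof(*e));

            if (!e)
                    return 1;
            e->refcount = 1;        /* creator's reference, like kref_init()   */
            e->cpu = 0;

            stat_get(e);            /* iterator takes its own reference        */
            stat_put(e);            /* destruction path drops the creator's    */
            stat_put(e);            /* last put (the release callback) frees it */
            return 0;
    }
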
diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl
index 911ba7ffab84..090d300d7394 100755
--- a/scripts/recordmcount.pl
+++ b/scripts/recordmcount.pl
@@ -57,7 +57,6 @@
57# call mcount (offset: 0x5) 57# call mcount (offset: 0x5)
58# [...] 58# [...]
59# ret 59# ret
60# .globl my_func
61# other_func: 60# other_func:
62# [...] 61# [...]
63# call mcount (offset: 0x1b) 62# call mcount (offset: 0x1b)