author | Ingo Molnar <mingo@elte.hu> | 2009-07-10 07:30:06 -0400 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2009-07-10 07:30:06 -0400 |
commit | e202687927c132b1e1ff36b526b5e78ac33de840 (patch) | |
tree | 8c1cadd560913e41237af48fc0058a9ed640dda7 | |
parent | a35780005eb256eb5ec83ffcc802967295887a45 (diff) | |
parent | 8b2c70d1e43074cc06afe99b0de12b686d9c9d02 (diff) |
Merge branch 'tip/tracing/ring-buffer-3' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-2.6-trace into tracing/core
-rw-r--r-- | Documentation/trace/ring-buffer-design.txt | 955 | ||||
-rw-r--r-- | include/linux/ring_buffer.h | 1 | ||||
-rw-r--r-- | kernel/trace/ring_buffer.c | 935 | ||||
-rw-r--r-- | kernel/trace/trace.c | 3 |
4 files changed, 1725 insertions, 169 deletions
diff --git a/Documentation/trace/ring-buffer-design.txt b/Documentation/trace/ring-buffer-design.txt new file mode 100644 index 000000000000..5b1d23d604c5 --- /dev/null +++ b/Documentation/trace/ring-buffer-design.txt | |||
@@ -0,0 +1,955 @@ | |||
1 | Lockless Ring Buffer Design | ||
2 | =========================== | ||
3 | |||
4 | Copyright 2009 Red Hat Inc. | ||
5 | Author: Steven Rostedt <srostedt@redhat.com> | ||
6 | License: The GNU Free Documentation License, Version 1.2 | ||
7 | (dual licensed under the GPL v2) | ||
8 | Reviewers: Mathieu Desnoyers, Huang Ying, Hidetoshi Seto, | ||
9 | and Frederic Weisbecker. | ||
10 | |||
11 | |||
12 | Written for: 2.6.31 | ||
13 | |||
14 | Terminology used in this Document | ||
15 | --------------------------------- | ||
16 | |||
17 | tail - where new writes happen in the ring buffer. | ||
18 | |||
19 | head - where new reads happen in the ring buffer. | ||
20 | |||
21 | producer - the task that writes into the ring buffer (same as writer) | ||
22 | |||
23 | writer - same as producer | ||
24 | |||
25 | consumer - the task that reads from the buffer (same as reader) | ||
26 | |||
27 | reader - same as consumer. | ||
28 | |||
29 | reader_page - A page outside the ring buffer used solely (for the most part) | ||
30 | by the reader. | ||
31 | |||
32 | head_page - a pointer to the page that the reader will use next | ||
33 | |||
34 | tail_page - a pointer to the page that will be written to next | ||
35 | |||
36 | commit_page - a pointer to the page with the last finished non nested write. | ||
37 | |||
38 | cmpxchg - hardware assisted atomic transaction that performs the following: | ||
39 | |||
40 | A = B iff previous A == C | ||
41 | |||
42 | R = cmpxchg(A, C, B) is saying that we replace A with B if and only if | ||
43 | current A is equal to C, and we put the old (current) A into R | ||
44 | |||
45 | R gets the previous A regardless of whether A is updated with B or not. | ||
46 | |||
47 | To see if the update was successful a compare of R == C may be used. | ||
48 | |||
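As a plain C illustration of these semantics (a sketch of the behavior only,
not the kernel's actual atomic cmpxchg() implementation):

    /* Illustration only: the real cmpxchg does all of this atomically. */
    unsigned long pseudo_cmpxchg(unsigned long *A, unsigned long C,
                                 unsigned long B)
    {
            unsigned long R = *A;       /* R always gets the previous A */

            if (R == C)                 /* replace A with B only if A == C */
                    *A = B;

            return R;                   /* caller tests R == C for success */
    }
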
49 | The Generic Ring Buffer | ||
50 | ----------------------- | ||
51 | |||
52 | The ring buffer can be used in either an overwrite mode or in | ||
53 | producer/consumer mode. | ||
54 | |||
55 | Producer/consumer mode is where, if the producer were to fill up the | ||
56 | buffer before the consumer could free up anything, the producer | ||
57 | will stop writing to the buffer. This will lose the most recent events. | ||
58 | |||
59 | Overwrite mode is where, if the producer were to fill up the buffer | ||
60 | before the consumer could free up anything, the producer will | ||
61 | overwrite the older data. This will lose the oldest events. | ||
62 | |||
63 | No two writers can write at the same time (on the same per-cpu buffer), | ||
64 | but a writer may interrupt another writer; the interrupting writer must | ||
65 | finish writing before the previous writer may continue. This is very | ||
66 | important to the algorithm. The writers act like a "stack". The way | ||
67 | interrupts work enforces this behavior. | ||
68 | |||
69 | |||
70 | writer1 start | ||
71 | <preempted> writer2 start | ||
72 | <preempted> writer3 start | ||
73 | writer3 finishes | ||
74 | writer2 finishes | ||
75 | writer1 finishes | ||
76 | |||
77 | This is very much like a writer being preempted by an interrupt and | ||
78 | the interrupt doing a write as well. | ||
79 | |||
80 | Readers can happen at any time. But no two readers may run at the | ||
81 | same time, nor can a reader preempt/interrupt another reader. A reader | ||
82 | can not preempt/interrupt a writer, but it may read/consume from the | ||
83 | buffer at the same time as a writer is writing, as long as the reader | ||
84 | is on another processor. A reader may read on its own processor | ||
85 | and can be preempted by a writer. | ||
86 | |||
87 | A writer can preempt a reader, but a reader can not preempt a writer. | ||
88 | But a reader can read the buffer at the same time (on another processor) | ||
89 | as a writer. | ||
90 | |||
91 | The ring buffer is made up of a list of pages held together by a linked list. | ||
92 | |||
93 | At initialization a reader page is allocated for the reader that is not | ||
94 | part of the ring buffer. | ||
95 | |||
96 | The head_page, tail_page and commit_page are all initialized to point | ||
97 | to the same page. | ||
98 | |||
99 | The reader page is initialized to have its next pointer pointing to | ||
100 | the head page, and its previous pointer pointing to a page before | ||
101 | the head page. | ||
102 | |||
103 | The reader has its own page to use. At start up time, this page is | ||
104 | allocated but is not attached to the list. When the reader wants | ||
105 | to read from the buffer, if its page is empty (like it is on start up) | ||
106 | it will swap its page with the head_page. The old reader page will | ||
107 | become part of the ring buffer and the head_page will be removed. | ||
108 | The page after the inserted page (old reader_page) will become the | ||
109 | new head page. | ||
110 | |||
111 | Once the new page is given to the reader, the reader could do what | ||
112 | it wants with it, as long as a writer has left that page. | ||
113 | |||
114 | A sample of how the reader page is swapped: Note this does not | ||
115 | show the head page in the buffer, it is for demonstrating a swap | ||
116 | only. | ||
117 | |||
118 | +------+ | ||
119 | |reader| RING BUFFER | ||
120 | |page | | ||
121 | +------+ | ||
122 | +---+ +---+ +---+ | ||
123 | | |-->| |-->| | | ||
124 | | |<--| |<--| | | ||
125 | +---+ +---+ +---+ | ||
126 | ^ | ^ | | ||
127 | | +-------------+ | | ||
128 | +-----------------+ | ||
129 | |||
130 | |||
131 | +------+ | ||
132 | |reader| RING BUFFER | ||
133 | |page |-------------------+ | ||
134 | +------+ v | ||
135 | | +---+ +---+ +---+ | ||
136 | | | |-->| |-->| | | ||
137 | | | |<--| |<--| |<-+ | ||
138 | | +---+ +---+ +---+ | | ||
139 | | ^ | ^ | | | ||
140 | | | +-------------+ | | | ||
141 | | +-----------------+ | | ||
142 | +------------------------------------+ | ||
143 | |||
144 | +------+ | ||
145 | |reader| RING BUFFER | ||
146 | |page |-------------------+ | ||
147 | +------+ <---------------+ v | ||
148 | | ^ +---+ +---+ +---+ | ||
149 | | | | |-->| |-->| | | ||
150 | | | | | | |<--| |<-+ | ||
151 | | | +---+ +---+ +---+ | | ||
152 | | | | ^ | | | ||
153 | | | +-------------+ | | | ||
154 | | +-----------------------------+ | | ||
155 | +------------------------------------+ | ||
156 | |||
157 | +------+ | ||
158 | |buffer| RING BUFFER | ||
159 | |page |-------------------+ | ||
160 | +------+ <---------------+ v | ||
161 | | ^ +---+ +---+ +---+ | ||
162 | | | | | | |-->| | | ||
163 | | | New | | | |<--| |<-+ | ||
164 | | | Reader +---+ +---+ +---+ | | ||
165 | | | page ----^ | | | ||
166 | | | | | | ||
167 | | +-----------------------------+ | | ||
168 | +------------------------------------+ | ||
169 | |||
170 | |||
171 | |||
172 | It is possible that the page swapped is the commit page and the tail page, | ||
173 | if what is in the ring buffer is less than what is held in a buffer page. | ||
174 | |||
175 | |||
176 | reader page commit page tail page | ||
177 | | | | | ||
178 | v | | | ||
179 | +---+ | | | ||
180 | | |<----------+ | | ||
181 | | |<------------------------+ | ||
182 | | |------+ | ||
183 | +---+ | | ||
184 | | | ||
185 | v | ||
186 | +---+ +---+ +---+ +---+ | ||
187 | <---| |--->| |--->| |--->| |---> | ||
188 | --->| |<---| |<---| |<---| |<--- | ||
189 | +---+ +---+ +---+ +---+ | ||
190 | |||
191 | This case is still valid for this algorithm. | ||
192 | When the writer leaves the page, it simply goes into the ring buffer | ||
193 | since the reader page still points to the next location in the ring | ||
194 | buffer. | ||
195 | |||
196 | |||
197 | The main pointers: | ||
198 | |||
199 | reader page - The page used solely by the reader and is not part | ||
200 | of the ring buffer (may be swapped in) | ||
201 | |||
202 | head page - the next page in the ring buffer that will be swapped | ||
203 | with the reader page. | ||
204 | |||
205 | tail page - the page where the next write will take place. | ||
206 | |||
207 | commit page - the page that last finished a write. | ||
208 | |||
209 | The commit page is only updated by the outermost writer in the | ||
210 | writer stack. A writer that preempts another writer will not move the | ||
211 | commit page. | ||
212 | |||
213 | When data is written into the ring buffer, a position is reserved | ||
214 | in the ring buffer and passed back to the writer. When the writer | ||
215 | is finished writing data into that position, it commits the write. | ||
216 | |||
217 | Another write (or a read) may take place at any time during this | ||
218 | transaction. If another write happens it must finish before continuing | ||
219 | with the previous write. | ||
220 | |||
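From a writer's point of view, a reserve/commit sequence using the public
interface in include/linux/ring_buffer.h might look roughly like the sketch
below (error handling abbreviated; struct my_entry and the entry variable are
hypothetical stand-ins for whatever a tracer records):

    struct ring_buffer_event *event;
    void *body;

    /* Reserve space on the tail page; NULL means the write was rejected. */
    event = ring_buffer_lock_reserve(buffer, sizeof(struct my_entry));
    if (!event)
            return -EBUSY;

    body = ring_buffer_event_data(event);
    memcpy(body, &entry, sizeof(struct my_entry)); /* fill the reserved slot */

    /* Commit; only the outermost writer advances the commit page. */
    ring_buffer_unlock_commit(buffer, event);
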
221 | |||
222 | Write reserve: | ||
223 | |||
224 | Buffer page | ||
225 | +---------+ | ||
226 | |written | | ||
227 | +---------+ <--- given back to writer (current commit) | ||
228 | |reserved | | ||
229 | +---------+ <--- tail pointer | ||
230 | | empty | | ||
231 | +---------+ | ||
232 | |||
233 | Write commit: | ||
234 | |||
235 | Buffer page | ||
236 | +---------+ | ||
237 | |written | | ||
238 | +---------+ | ||
239 | |written | | ||
240 | +---------+ <--- next position for write (current commit) | ||
241 | | empty | | ||
242 | +---------+ | ||
243 | |||
244 | |||
245 | If a write happens after the first reserve: | ||
246 | |||
247 | Buffer page | ||
248 | +---------+ | ||
249 | |written | | ||
250 | +---------+ <-- current commit | ||
251 | |reserved | | ||
252 | +---------+ <--- given back to second writer | ||
253 | |reserved | | ||
254 | +---------+ <--- tail pointer | ||
255 | |||
256 | After second writer commits: | ||
257 | |||
258 | |||
259 | Buffer page | ||
260 | +---------+ | ||
261 | |written | | ||
262 | +---------+ <--(last full commit) | ||
263 | |reserved | | ||
264 | +---------+ | ||
265 | |pending | | ||
266 | |commit | | ||
267 | +---------+ <--- tail pointer | ||
268 | |||
269 | When the first writer commits: | ||
270 | |||
271 | Buffer page | ||
272 | +---------+ | ||
273 | |written | | ||
274 | +---------+ | ||
275 | |written | | ||
276 | +---------+ | ||
277 | |written | | ||
278 | +---------+ <--(last full commit and tail pointer) | ||
279 | |||
280 | |||
281 | The commit pointer points to the last write location that was | ||
282 | committed without preempting another write. When a write that | ||
283 | preempted another write is committed, it only becomes a pending commit | ||
284 | and will not be a full commit till all writes have been committed. | ||
285 | |||
286 | The commit page points to the page that has the last full commit. | ||
287 | The tail page points to the page with the last write (before | ||
288 | committing). | ||
289 | |||
290 | The tail page is always equal to or after the commit page. It may | ||
291 | be several pages ahead. If the tail page catches up to the commit | ||
292 | page then no more writes may take place (regardless of the mode | ||
293 | of the ring buffer: overwrite or producer/consumer). | ||
294 | |||
295 | The order of pages is: | ||
296 | |||
297 | head page | ||
298 | commit page | ||
299 | tail page | ||
300 | |||
301 | Possible scenario: | ||
302 | tail page | ||
303 | head page commit page | | ||
304 | | | | | ||
305 | v v v | ||
306 | +---+ +---+ +---+ +---+ | ||
307 | <---| |--->| |--->| |--->| |---> | ||
308 | --->| |<---| |<---| |<---| |<--- | ||
309 | +---+ +---+ +---+ +---+ | ||
310 | |||
311 | There is a special case where the head page is after the commit page | ||
312 | and possibly the tail page. That is when the commit (and tail) page has been | ||
313 | swapped with the reader page. This is because the head page is always | ||
314 | part of the ring buffer, but the reader page is not. Whenever less than | ||
315 | a full page has been committed inside the ring buffer, and a reader | ||
316 | swaps out a page, it will be swapping out the commit page. | ||
317 | |||
318 | |||
319 | reader page commit page tail page | ||
320 | | | | | ||
321 | v | | | ||
322 | +---+ | | | ||
323 | | |<----------+ | | ||
324 | | |<------------------------+ | ||
325 | | |------+ | ||
326 | +---+ | | ||
327 | | | ||
328 | v | ||
329 | +---+ +---+ +---+ +---+ | ||
330 | <---| |--->| |--->| |--->| |---> | ||
331 | --->| |<---| |<---| |<---| |<--- | ||
332 | +---+ +---+ +---+ +---+ | ||
333 | ^ | ||
334 | | | ||
335 | head page | ||
336 | |||
337 | |||
338 | In this case, the head page will not move when the tail and commit | ||
339 | move back into the ring buffer. | ||
340 | |||
341 | The reader can not swap a page into the ring buffer if the commit page | ||
342 | is still on that page. If the read meets the last commit (a real commit, | ||
343 | not a pending or reserved one), then there is nothing more to read. | ||
344 | The buffer is considered empty until another full commit finishes. | ||
345 | |||
346 | When the tail meets the head page, if the buffer is in overwrite mode, | ||
347 | the head page will be pushed ahead one. If the buffer is in producer/consumer | ||
348 | mode, the write will fail. | ||
349 | |||
350 | Overwrite mode: | ||
351 | |||
352 | tail page | ||
353 | | | ||
354 | v | ||
355 | +---+ +---+ +---+ +---+ | ||
356 | <---| |--->| |--->| |--->| |---> | ||
357 | --->| |<---| |<---| |<---| |<--- | ||
358 | +---+ +---+ +---+ +---+ | ||
359 | ^ | ||
360 | | | ||
361 | head page | ||
362 | |||
363 | |||
364 | tail page | ||
365 | | | ||
366 | v | ||
367 | +---+ +---+ +---+ +---+ | ||
368 | <---| |--->| |--->| |--->| |---> | ||
369 | --->| |<---| |<---| |<---| |<--- | ||
370 | +---+ +---+ +---+ +---+ | ||
371 | ^ | ||
372 | | | ||
373 | head page | ||
374 | |||
375 | |||
376 | tail page | ||
377 | | | ||
378 | v | ||
379 | +---+ +---+ +---+ +---+ | ||
380 | <---| |--->| |--->| |--->| |---> | ||
381 | --->| |<---| |<---| |<---| |<--- | ||
382 | +---+ +---+ +---+ +---+ | ||
383 | ^ | ||
384 | | | ||
385 | head page | ||
386 | |||
387 | Note, the reader page will still point to the previous head page. | ||
388 | But when a swap takes place, it will use the most recent head page. | ||
389 | |||
390 | |||
391 | Making the Ring Buffer Lockless: | ||
392 | -------------------------------- | ||
393 | |||
394 | The main idea behind the lockless algorithm is to combine the moving | ||
395 | of the head_page pointer with the swapping of pages with the reader. | ||
396 | State flags are placed inside the pointer to the page. To do this, | ||
397 | each page must be aligned in memory by 4 bytes. This will allow the 2 | ||
398 | least significant bits of the address to be used as flags, since | ||
399 | they will always be zero for the address. To get the address, | ||
400 | simply mask out the flags. | ||
401 | |||
402 | MASK = ~3 | ||
403 | |||
404 | address & MASK | ||
405 | |||
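In C, this masking is what the rb_list_head() helper added by this patch
does (simplified sketch; RB_FLAG_MASK is the patch's name for the two flag
bits):

    #define RB_FLAG_MASK    3UL

    /* Strip the flag bits to recover the real list pointer. */
    static struct list_head *rb_list_head(struct list_head *list)
    {
            unsigned long val = (unsigned long)list;

            return (struct list_head *)(val & ~RB_FLAG_MASK);
    }
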
406 | Two flags will be kept by these two bits: | ||
407 | |||
408 | HEADER - the page being pointed to is a head page | ||
409 | |||
410 | UPDATE - the page being pointed to is being updated by a writer | ||
411 | and was or is about to be a head page. | ||
412 | |||
413 | |||
414 | reader page | ||
415 | | | ||
416 | v | ||
417 | +---+ | ||
418 | | |------+ | ||
419 | +---+ | | ||
420 | | | ||
421 | v | ||
422 | +---+ +---+ +---+ +---+ | ||
423 | <---| |--->| |-H->| |--->| |---> | ||
424 | --->| |<---| |<---| |<---| |<--- | ||
425 | +---+ +---+ +---+ +---+ | ||
426 | |||
427 | |||
428 | The above pointer "-H->" would have the HEADER flag set. That is, | ||
429 | the page it points to is the next page to be swapped out by the reader. | ||
430 | This pointer means the next page is the head page. | ||
431 | |||
432 | When the tail page meets the head pointer, it will use cmpxchg to | ||
433 | change the pointer to the UPDATE state: | ||
434 | |||
435 | |||
436 | tail page | ||
437 | | | ||
438 | v | ||
439 | +---+ +---+ +---+ +---+ | ||
440 | <---| |--->| |-H->| |--->| |---> | ||
441 | --->| |<---| |<---| |<---| |<--- | ||
442 | +---+ +---+ +---+ +---+ | ||
443 | |||
444 | tail page | ||
445 | | | ||
446 | v | ||
447 | +---+ +---+ +---+ +---+ | ||
448 | <---| |--->| |-U->| |--->| |---> | ||
449 | --->| |<---| |<---| |<---| |<--- | ||
450 | +---+ +---+ +---+ +---+ | ||
451 | |||
452 | "-U->" represents a pointer in the UPDATE state. | ||
453 | |||
454 | Any access to the reader will need to take some sort of lock to serialize | ||
455 | the readers. But the writers will never take a lock to write to the | ||
456 | ring buffer. This means we only need to worry about a single reader, | ||
457 | and writes only preempt in "stack" formation. | ||
458 | |||
459 | When the reader tries to swap the page with the ring buffer, it | ||
460 | will also use cmpxchg. If the flag bit in the pointer to the | ||
461 | head page does not have the HEADER flag set, the compare will fail | ||
462 | and the reader will need to look for the new head page and try again. | ||
463 | Note, the flag UPDATE and HEADER are never set at the same time. | ||
464 | |||
465 | The reader swaps the reader page as follows: | ||
466 | |||
467 | +------+ | ||
468 | |reader| RING BUFFER | ||
469 | |page | | ||
470 | +------+ | ||
471 | +---+ +---+ +---+ | ||
472 | | |--->| |--->| | | ||
473 | | |<---| |<---| | | ||
474 | +---+ +---+ +---+ | ||
475 | ^ | ^ | | ||
476 | | +---------------+ | | ||
477 | +-----H-------------+ | ||
478 | |||
479 | The reader sets the reader page's next pointer, with the HEADER flag, to | ||
480 | the page after the head page. | ||
481 | |||
482 | |||
483 | +------+ | ||
484 | |reader| RING BUFFER | ||
485 | |page |-------H-----------+ | ||
486 | +------+ v | ||
487 | | +---+ +---+ +---+ | ||
488 | | | |--->| |--->| | | ||
489 | | | |<---| |<---| |<-+ | ||
490 | | +---+ +---+ +---+ | | ||
491 | | ^ | ^ | | | ||
492 | | | +---------------+ | | | ||
493 | | +-----H-------------+ | | ||
494 | +--------------------------------------+ | ||
495 | |||
496 | It does a cmpxchg with the pointer to the previous head page to make it | ||
497 | point to the reader page. Note that the new pointer does not have the HEADER | ||
498 | flag set. This action atomically moves the head page forward. | ||
499 | |||
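A simplified sketch of that atomic swap, modeled on the rb_head_page_replace()
helper added by this patch (old is the current head page, new is the reader
page being swapped in):

    static int rb_head_page_replace(struct buffer_page *old,
                                    struct buffer_page *new)
    {
            unsigned long *ptr = (unsigned long *)&old->list.prev->next;
            unsigned long val;

            /* The swap succeeds only if the HEADER flag is still set. */
            val = *ptr & ~RB_FLAG_MASK;
            val |= RB_PAGE_HEAD;

            return cmpxchg(ptr, val, (unsigned long)&new->list) == val;
    }
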
500 | +------+ | ||
501 | |reader| RING BUFFER | ||
502 | |page |-------H-----------+ | ||
503 | +------+ v | ||
504 | | ^ +---+ +---+ +---+ | ||
505 | | | | |-->| |-->| | | ||
506 | | | | |<--| |<--| |<-+ | ||
507 | | | +---+ +---+ +---+ | | ||
508 | | | | ^ | | | ||
509 | | | +-------------+ | | | ||
510 | | +-----------------------------+ | | ||
511 | +------------------------------------+ | ||
512 | |||
513 | After the new head page is set, the previous pointer of the head page is | ||
514 | updated to the reader page. | ||
515 | |||
516 | +------+ | ||
517 | |reader| RING BUFFER | ||
518 | |page |-------H-----------+ | ||
519 | +------+ <---------------+ v | ||
520 | | ^ +---+ +---+ +---+ | ||
521 | | | | |-->| |-->| | | ||
522 | | | | | | |<--| |<-+ | ||
523 | | | +---+ +---+ +---+ | | ||
524 | | | | ^ | | | ||
525 | | | +-------------+ | | | ||
526 | | +-----------------------------+ | | ||
527 | +------------------------------------+ | ||
528 | |||
529 | +------+ | ||
530 | |buffer| RING BUFFER | ||
531 | |page |-------H-----------+ <--- New head page | ||
532 | +------+ <---------------+ v | ||
533 | | ^ +---+ +---+ +---+ | ||
534 | | | | | | |-->| | | ||
535 | | | New | | | |<--| |<-+ | ||
536 | | | Reader +---+ +---+ +---+ | | ||
537 | | | page ----^ | | | ||
538 | | | | | | ||
539 | | +-----------------------------+ | | ||
540 | +------------------------------------+ | ||
541 | |||
542 | Another important point. The page that the reader page points back to | ||
543 | by its previous pointer (the one that now points to the new head page) | ||
544 | never points back to the reader page. That is because the reader page is | ||
545 | not part of the ring buffer. Traversing the ring buffer via the next pointers | ||
546 | will always stay in the ring buffer. Traversing the ring buffer via the | ||
547 | prev pointers may not. | ||
548 | |||
549 | Note, the way to determine a reader page is simply by examining the previous | ||
550 | pointer of the page. If the next pointer of the previous page does not | ||
551 | point back to the original page, then the original page is a reader page: | ||
552 | |||
553 | |||
554 | +--------+ | ||
555 | | reader | next +----+ | ||
556 | | page |-------->| |<====== (buffer page) | ||
557 | +--------+ +----+ | ||
558 | | | ^ | ||
559 | | v | next | ||
560 | prev | +----+ | ||
561 | +------------->| | | ||
562 | +----+ | ||
563 | |||
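The rb_is_reader_page() helper in this patch implements exactly this check;
a sketch:

    /*
     * A page is the reader page if the next pointer of its previous
     * page does not point back to it.
     */
    static int rb_is_reader_page(struct buffer_page *page)
    {
            struct list_head *list = page->list.prev;

            return rb_list_head(list->next) != &page->list;
    }
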
564 | The way the head page moves forward: | ||
565 | |||
566 | When the tail page meets the head page and the buffer is in overwrite mode | ||
567 | and more writes take place, the head page must be moved forward before the | ||
568 | writer may move the tail page. The way this is done is that the writer | ||
569 | performs a cmpxchg to convert the pointer to the head page from the HEADER | ||
570 | flag to have the UPDATE flag set. Once this is done, the reader will | ||
571 | not be able to swap the head page from the buffer, nor will it be able to | ||
572 | move the head page, until the writer is finished with the move. | ||
573 | |||
574 | This eliminates any races that the reader can have on the writer. The reader | ||
575 | must spin, and this is why the reader can not preempt the writer. | ||
576 | |||
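In the code below, the HEADER to UPDATE transition is performed by
rb_head_page_set_update(), which funnels into a single cmpxchg on the flagged
pointer (simplified from the patch):

    static int rb_head_page_set(struct ring_buffer_per_cpu *cpu_buffer,
                                struct buffer_page *head,
                                struct buffer_page *prev,
                                int old_flag, int new_flag)
    {
            struct list_head *list = &prev->list;
            unsigned long val = (unsigned long)&head->list & ~RB_FLAG_MASK;
            unsigned long ret;

            ret = (unsigned long)cmpxchg(&list->next,
                                         val | old_flag, val | new_flag);

            /* If the pointer no longer references head, the reader moved it. */
            if ((ret & ~RB_FLAG_MASK) != val)
                    return RB_PAGE_MOVED;

            return ret & RB_FLAG_MASK;
    }
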
577 | tail page | ||
578 | | | ||
579 | v | ||
580 | +---+ +---+ +---+ +---+ | ||
581 | <---| |--->| |-H->| |--->| |---> | ||
582 | --->| |<---| |<---| |<---| |<--- | ||
583 | +---+ +---+ +---+ +---+ | ||
584 | |||
585 | tail page | ||
586 | | | ||
587 | v | ||
588 | +---+ +---+ +---+ +---+ | ||
589 | <---| |--->| |-U->| |--->| |---> | ||
590 | --->| |<---| |<---| |<---| |<--- | ||
591 | +---+ +---+ +---+ +---+ | ||
592 | |||
593 | The following page will be made into the new head page. | ||
594 | |||
595 | tail page | ||
596 | | | ||
597 | v | ||
598 | +---+ +---+ +---+ +---+ | ||
599 | <---| |--->| |-U->| |-H->| |---> | ||
600 | --->| |<---| |<---| |<---| |<--- | ||
601 | +---+ +---+ +---+ +---+ | ||
602 | |||
603 | After the new head page has been set, we can set the old head page | ||
604 | pointer back to NORMAL. | ||
605 | |||
606 | tail page | ||
607 | | | ||
608 | v | ||
609 | +---+ +---+ +---+ +---+ | ||
610 | <---| |--->| |--->| |-H->| |---> | ||
611 | --->| |<---| |<---| |<---| |<--- | ||
612 | +---+ +---+ +---+ +---+ | ||
613 | |||
614 | After the head page has been moved, the tail page may now move forward. | ||
615 | |||
616 | tail page | ||
617 | | | ||
618 | v | ||
619 | +---+ +---+ +---+ +---+ | ||
620 | <---| |--->| |--->| |-H->| |---> | ||
621 | --->| |<---| |<---| |<---| |<--- | ||
622 | +---+ +---+ +---+ +---+ | ||
623 | |||
624 | |||
625 | The above are the trivial updates. Now for the more complex scenarios. | ||
626 | |||
627 | |||
628 | As stated before, if enough writes preempt the first write, the | ||
629 | tail page may make it all the way around the buffer and meet the commit | ||
630 | page. At this time, we must start dropping writes (usually with some kind | ||
631 | of warning to the user). But what happens if the commit was still on the | ||
632 | reader page? The commit page is not part of the ring buffer. The tail page | ||
633 | must account for this. | ||
634 | |||
635 | |||
636 | reader page commit page | ||
637 | | | | ||
638 | v | | ||
639 | +---+ | | ||
640 | | |<----------+ | ||
641 | | | | ||
642 | | |------+ | ||
643 | +---+ | | ||
644 | | | ||
645 | v | ||
646 | +---+ +---+ +---+ +---+ | ||
647 | <---| |--->| |-H->| |--->| |---> | ||
648 | --->| |<---| |<---| |<---| |<--- | ||
649 | +---+ +---+ +---+ +---+ | ||
650 | ^ | ||
651 | | | ||
652 | tail page | ||
653 | |||
654 | If the tail page were to simply push the head page forward, the commit when | ||
655 | leaving the reader page would not be pointing to the correct page. | ||
656 | |||
657 | The solution to this is to test if the commit page is on the reader page | ||
658 | before pushing the head page. If it is, then it can be assumed that the | ||
659 | tail page wrapped the buffer, and we must drop new writes. | ||
660 | |||
661 | This is not a race condition, because the commit page can only be moved | ||
662 | by the outermost writer (the writer that was preempted). | ||
663 | This means that the commit will not move while a writer is moving the | ||
664 | tail page. The reader can not swap the reader page if it is also being | ||
665 | used as the commit page. The reader can simply check that the commit | ||
666 | is off the reader page. Once the commit page leaves the reader page | ||
667 | it will never go back on it unless a reader does another swap with the | ||
668 | buffer page that is also the commit page. | ||
669 | |||
670 | |||
671 | Nested writes | ||
672 | ------------- | ||
673 | |||
674 | In the pushing forward of the tail page we must first push forward | ||
675 | the head page if the head page is the next page. If the head page | ||
676 | is not the next page, the tail page is simply updated with a cmpxchg. | ||
677 | |||
678 | Only writers move the tail page. This must be done atomically to protect | ||
679 | against nested writers. | ||
680 | |||
681 | temp_page = tail_page | ||
682 | next_page = temp_page->next | ||
683 | cmpxchg(tail_page, temp_page, next_page) | ||
684 | |||
685 | The above will update the tail page if it is still pointing to the expected | ||
686 | page. If this fails, a nested write pushed it forward, and the current write | ||
687 | does not need to push it. | ||
688 | |||
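Rendered as C, and leaving out the write/entries counter bookkeeping that the
real rb_tail_page_update() in this patch also performs, the attempt looks
roughly like:

    struct buffer_page *tail_page = cpu_buffer->tail_page;
    struct buffer_page *next_page = tail_page;

    rb_inc_page(cpu_buffer, &next_page);    /* page after the current tail */

    /*
     * Move the tail pointer only if it is still what we saw.  If a
     * nested writer already advanced it, the cmpxchg fails and this
     * writer simply retries its reserve on the new tail page.
     */
    cmpxchg(&cpu_buffer->tail_page, tail_page, next_page);
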
689 | |||
690 | temp page | ||
691 | | | ||
692 | v | ||
693 | tail page | ||
694 | | | ||
695 | v | ||
696 | +---+ +---+ +---+ +---+ | ||
697 | <---| |--->| |--->| |--->| |---> | ||
698 | --->| |<---| |<---| |<---| |<--- | ||
699 | +---+ +---+ +---+ +---+ | ||
700 | |||
701 | Nested write comes in and moves the tail page forward: | ||
702 | |||
703 | tail page (moved by nested writer) | ||
704 | temp page | | ||
705 | | | | ||
706 | v v | ||
707 | +---+ +---+ +---+ +---+ | ||
708 | <---| |--->| |--->| |--->| |---> | ||
709 | --->| |<---| |<---| |<---| |<--- | ||
710 | +---+ +---+ +---+ +---+ | ||
711 | |||
712 | The above would fail the cmpxchg, but since the tail page has already | ||
713 | been moved forward, the writer will just try again to reserve storage | ||
714 | on the new tail page. | ||
715 | |||
716 | But the moving of the head page is a bit more complex. | ||
717 | |||
718 | tail page | ||
719 | | | ||
720 | v | ||
721 | +---+ +---+ +---+ +---+ | ||
722 | <---| |--->| |-H->| |--->| |---> | ||
723 | --->| |<---| |<---| |<---| |<--- | ||
724 | +---+ +---+ +---+ +---+ | ||
725 | |||
726 | The write converts the head page pointer to UPDATE. | ||
727 | |||
728 | tail page | ||
729 | | | ||
730 | v | ||
731 | +---+ +---+ +---+ +---+ | ||
732 | <---| |--->| |-U->| |--->| |---> | ||
733 | --->| |<---| |<---| |<---| |<--- | ||
734 | +---+ +---+ +---+ +---+ | ||
735 | |||
736 | But what if a nested writer preempts here? It will see that the next | ||
737 | page is a head page, but it is also nested. It will detect that | ||
738 | it is nested and will save that information. The detection is the | ||
739 | fact that it sees the UPDATE flag instead of a HEADER or NORMAL | ||
740 | pointer. | ||
741 | |||
742 | The nested writer will set the new head page pointer. | ||
743 | |||
744 | tail page | ||
745 | | | ||
746 | v | ||
747 | +---+ +---+ +---+ +---+ | ||
748 | <---| |--->| |-U->| |-H->| |---> | ||
749 | --->| |<---| |<---| |<---| |<--- | ||
750 | +---+ +---+ +---+ +---+ | ||
751 | |||
752 | But it will not reset the update back to normal. Only the writer | ||
753 | that converted a pointer from HEAD to UPDATE will convert it back | ||
754 | to NORMAL. | ||
755 | |||
756 | tail page | ||
757 | | | ||
758 | v | ||
759 | +---+ +---+ +---+ +---+ | ||
760 | <---| |--->| |-U->| |-H->| |---> | ||
761 | --->| |<---| |<---| |<---| |<--- | ||
762 | +---+ +---+ +---+ +---+ | ||
763 | |||
764 | After the nested writer finishes, the outermost writer will convert | ||
765 | the UPDATE pointer to NORMAL. | ||
766 | |||
767 | |||
768 | tail page | ||
769 | | | ||
770 | v | ||
771 | +---+ +---+ +---+ +---+ | ||
772 | <---| |--->| |--->| |-H->| |---> | ||
773 | --->| |<---| |<---| |<---| |<--- | ||
774 | +---+ +---+ +---+ +---+ | ||
775 | |||
776 | |||
777 | It can be even more complex if several nested writes came in and moved | ||
778 | the tail page ahead several pages: | ||
779 | |||
780 | |||
781 | (first writer) | ||
782 | |||
783 | tail page | ||
784 | | | ||
785 | v | ||
786 | +---+ +---+ +---+ +---+ | ||
787 | <---| |--->| |-H->| |--->| |---> | ||
788 | --->| |<---| |<---| |<---| |<--- | ||
789 | +---+ +---+ +---+ +---+ | ||
790 | |||
791 | The write converts the head page pointer to UPDATE. | ||
792 | |||
793 | tail page | ||
794 | | | ||
795 | v | ||
796 | +---+ +---+ +---+ +---+ | ||
797 | <---| |--->| |-U->| |--->| |---> | ||
798 | --->| |<---| |<---| |<---| |<--- | ||
799 | +---+ +---+ +---+ +---+ | ||
800 | |||
801 | The next writer comes in, sees the update, and sets up the new | ||
802 | head page. | ||
803 | |||
804 | (second writer) | ||
805 | |||
806 | tail page | ||
807 | | | ||
808 | v | ||
809 | +---+ +---+ +---+ +---+ | ||
810 | <---| |--->| |-U->| |-H->| |---> | ||
811 | --->| |<---| |<---| |<---| |<--- | ||
812 | +---+ +---+ +---+ +---+ | ||
813 | |||
814 | The nested writer moves the tail page forward, but does not set the old | ||
815 | update page to NORMAL because it is not the outermost writer. | ||
816 | |||
817 | tail page | ||
818 | | | ||
819 | v | ||
820 | +---+ +---+ +---+ +---+ | ||
821 | <---| |--->| |-U->| |-H->| |---> | ||
822 | --->| |<---| |<---| |<---| |<--- | ||
823 | +---+ +---+ +---+ +---+ | ||
824 | |||
825 | Another writer preempts and sees that the page after the tail page is a head page. | ||
826 | It changes it from HEAD to UPDATE. | ||
827 | |||
828 | (third writer) | ||
829 | |||
830 | tail page | ||
831 | | | ||
832 | v | ||
833 | +---+ +---+ +---+ +---+ | ||
834 | <---| |--->| |-U->| |-U->| |---> | ||
835 | --->| |<---| |<---| |<---| |<--- | ||
836 | +---+ +---+ +---+ +---+ | ||
837 | |||
838 | The writer will move the head page forward: | ||
839 | |||
840 | |||
841 | (third writer) | ||
842 | |||
843 | tail page | ||
844 | | | ||
845 | v | ||
846 | +---+ +---+ +---+ +---+ | ||
847 | <---| |--->| |-U->| |-U->| |-H-> | ||
848 | --->| |<---| |<---| |<---| |<--- | ||
849 | +---+ +---+ +---+ +---+ | ||
850 | |||
851 | But now, since the third writer did change the HEAD flag to UPDATE, it | ||
852 | will convert it back to NORMAL: | ||
853 | |||
854 | |||
855 | (third writer) | ||
856 | |||
857 | tail page | ||
858 | | | ||
859 | v | ||
860 | +---+ +---+ +---+ +---+ | ||
861 | <---| |--->| |-U->| |--->| |-H-> | ||
862 | --->| |<---| |<---| |<---| |<--- | ||
863 | +---+ +---+ +---+ +---+ | ||
864 | |||
865 | |||
866 | Then it will move the tail page, and return back to the second writer. | ||
867 | |||
868 | |||
869 | (second writer) | ||
870 | |||
871 | tail page | ||
872 | | | ||
873 | v | ||
874 | +---+ +---+ +---+ +---+ | ||
875 | <---| |--->| |-U->| |--->| |-H-> | ||
876 | --->| |<---| |<---| |<---| |<--- | ||
877 | +---+ +---+ +---+ +---+ | ||
878 | |||
879 | |||
880 | The second writer will fail to move the tail page because it was already | ||
881 | moved, so it will try again and add its data to the new tail page. | ||
882 | It will return to the first writer. | ||
883 | |||
884 | |||
885 | (first writer) | ||
886 | |||
887 | tail page | ||
888 | | | ||
889 | v | ||
890 | +---+ +---+ +---+ +---+ | ||
891 | <---| |--->| |-U->| |--->| |-H-> | ||
892 | --->| |<---| |<---| |<---| |<--- | ||
893 | +---+ +---+ +---+ +---+ | ||
894 | |||
895 | The first writer can not atomically test if the tail page moved | ||
896 | while it updates the HEAD page. It will then update the head page to | ||
897 | what it thinks is the new head page. | ||
898 | |||
899 | |||
900 | (first writer) | ||
901 | |||
902 | tail page | ||
903 | | | ||
904 | v | ||
905 | +---+ +---+ +---+ +---+ | ||
906 | <---| |--->| |-U->| |-H->| |-H-> | ||
907 | --->| |<---| |<---| |<---| |<--- | ||
908 | +---+ +---+ +---+ +---+ | ||
909 | |||
910 | Since the cmpxchg returns the old value of the pointer the first writer | ||
911 | will see it succeeded in updating the pointer from NORMAL to HEAD. | ||
912 | But as we can see, this is not good enough. It must also check to see | ||
913 | if the tail page is either where it used to be or on the next page: | ||
914 | |||
915 | |||
916 | (first writer) | ||
917 | |||
918 | A B tail page | ||
919 | | | | | ||
920 | v v v | ||
921 | +---+ +---+ +---+ +---+ | ||
922 | <---| |--->| |-U->| |-H->| |-H-> | ||
923 | --->| |<---| |<---| |<---| |<--- | ||
924 | +---+ +---+ +---+ +---+ | ||
925 | |||
926 | If tail page != A and tail page does not equal B, then it must reset the | ||
927 | pointer back to NORMAL. Since it only needs to worry about | ||
928 | nested writers, it only needs to check this after setting the HEAD page. | ||
929 | |||
930 | |||
931 | (first writer) | ||
932 | |||
933 | A B tail page | ||
934 | | | | | ||
935 | v v v | ||
936 | +---+ +---+ +---+ +---+ | ||
937 | <---| |--->| |-U->| |--->| |-H-> | ||
938 | --->| |<---| |<---| |<---| |<--- | ||
939 | +---+ +---+ +---+ +---+ | ||
940 | |||
941 | Now the writer can update the head page. This is also why the head page must | ||
942 | remain in UPDATE and only be reset by the outermost writer. This prevents | ||
943 | the reader from seeing the incorrect head page. | ||
944 | |||
945 | |||
946 | (first writer) | ||
947 | |||
948 | A B tail page | ||
949 | | | | | ||
950 | v v v | ||
951 | +---+ +---+ +---+ +---+ | ||
952 | <---| |--->| |--->| |--->| |-H-> | ||
953 | --->| |<---| |<---| |<---| |<--- | ||
954 | +---+ +---+ +---+ +---+ | ||
955 | |||
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 29f8599e6bea..7fca71693ae7 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h | |||
@@ -170,7 +170,6 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer); | |||
170 | unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu); | 170 | unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu); |
171 | unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu); | 171 | unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu); |
172 | unsigned long ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu); | 172 | unsigned long ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu); |
173 | unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu); | ||
174 | 173 | ||
175 | u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu); | 174 | u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu); |
176 | void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer, | 175 | void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer, |
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bf27bb7a63e2..e648ba4f70e0 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
@@ -322,6 +322,14 @@ struct buffer_data_page { | |||
322 | unsigned char data[]; /* data of buffer page */ | 322 | unsigned char data[]; /* data of buffer page */ |
323 | }; | 323 | }; |
324 | 324 | ||
325 | /* | ||
326 | * Note, the buffer_page list must be first. The buffer pages | ||
327 | * are allocated in cache lines, which means that each buffer | ||
328 | * page will be at the beginning of a cache line, and thus | ||
329 | * the least significant bits will be zero. We use this to | ||
330 | * add flags in the list struct pointers, to make the ring buffer | ||
331 | * lockless. | ||
332 | */ | ||
325 | struct buffer_page { | 333 | struct buffer_page { |
326 | struct list_head list; /* list of buffer pages */ | 334 | struct list_head list; /* list of buffer pages */ |
327 | local_t write; /* index for next write */ | 335 | local_t write; /* index for next write */ |
@@ -330,6 +338,21 @@ struct buffer_page { | |||
330 | struct buffer_data_page *page; /* Actual data page */ | 338 | struct buffer_data_page *page; /* Actual data page */ |
331 | }; | 339 | }; |
332 | 340 | ||
341 | /* | ||
342 | * The buffer page counters, write and entries, must be reset | ||
343 | * atomically when crossing page boundaries. To synchronize this | ||
344 | * update, two counters are inserted into the number. One is | ||
345 | * the actual counter for the write position or count on the page. | ||
346 | * | ||
347 | * The other is a counter of updaters. Before an update happens | ||
348 | * the update partition of the counter is incremented. This will | ||
349 | * allow the updater to update the counter atomically. | ||
350 | * | ||
351 | * The counter is 20 bits, and the state data is 12. | ||
352 | */ | ||
353 | #define RB_WRITE_MASK 0xfffff | ||
354 | #define RB_WRITE_INTCNT (1 << 20) | ||
355 | |||
333 | static void rb_init_page(struct buffer_data_page *bpage) | 356 | static void rb_init_page(struct buffer_data_page *bpage) |
334 | { | 357 | { |
335 | local_set(&bpage->commit, 0); | 358 | local_set(&bpage->commit, 0); |
@@ -403,21 +426,20 @@ int ring_buffer_print_page_header(struct trace_seq *s) | |||
403 | struct ring_buffer_per_cpu { | 426 | struct ring_buffer_per_cpu { |
404 | int cpu; | 427 | int cpu; |
405 | struct ring_buffer *buffer; | 428 | struct ring_buffer *buffer; |
406 | spinlock_t reader_lock; /* serialize readers */ | 429 | spinlock_t reader_lock; /* serialize readers */ |
407 | raw_spinlock_t lock; | 430 | raw_spinlock_t lock; |
408 | struct lock_class_key lock_key; | 431 | struct lock_class_key lock_key; |
409 | struct list_head pages; | 432 | struct list_head *pages; |
410 | struct buffer_page *head_page; /* read from head */ | 433 | struct buffer_page *head_page; /* read from head */ |
411 | struct buffer_page *tail_page; /* write to tail */ | 434 | struct buffer_page *tail_page; /* write to tail */ |
412 | struct buffer_page *commit_page; /* committed pages */ | 435 | struct buffer_page *commit_page; /* committed pages */ |
413 | struct buffer_page *reader_page; | 436 | struct buffer_page *reader_page; |
414 | unsigned long nmi_dropped; | 437 | local_t commit_overrun; |
415 | unsigned long commit_overrun; | 438 | local_t overrun; |
416 | unsigned long overrun; | ||
417 | unsigned long read; | ||
418 | local_t entries; | 439 | local_t entries; |
419 | local_t committing; | 440 | local_t committing; |
420 | local_t commits; | 441 | local_t commits; |
442 | unsigned long read; | ||
421 | u64 write_stamp; | 443 | u64 write_stamp; |
422 | u64 read_stamp; | 444 | u64 read_stamp; |
423 | atomic_t record_disabled; | 445 | atomic_t record_disabled; |
@@ -489,6 +511,385 @@ void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer, | |||
489 | } | 511 | } |
490 | EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); | 512 | EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); |
491 | 513 | ||
514 | /* | ||
515 | * Making the ring buffer lockless makes things tricky. | ||
516 | * Although writes only happen on the CPU that they are on, | ||
517 | * and they only need to worry about interrupts. Reads can | ||
518 | * happen on any CPU. | ||
519 | * | ||
520 | * The reader page is always off the ring buffer, but when the | ||
521 | * reader finishes with a page, it needs to swap its page with | ||
522 | * a new one from the buffer. The reader needs to take from | ||
523 | * the head (writes go to the tail). But if a writer is in overwrite | ||
524 | * mode and wraps, it must push the head page forward. | ||
525 | * | ||
526 | * Here lies the problem. | ||
527 | * | ||
528 | * The reader must be careful to replace only the head page, and | ||
529 | * not another one. As described at the top of the file in the | ||
530 | * ASCII art, the reader sets its old page to point to the next | ||
531 | * page after head. It then sets the page after head to point to | ||
532 | * the old reader page. But if the writer moves the head page | ||
533 | * during this operation, the reader could end up with the tail. | ||
534 | * | ||
535 | * We use cmpxchg to help prevent this race. We also do something | ||
536 | * special with the page before head. We set the LSB to 1. | ||
537 | * | ||
538 | * When the writer must push the page forward, it will clear the | ||
539 | * bit that points to the head page, move the head, and then set | ||
540 | * the bit that points to the new head page. | ||
541 | * | ||
542 | * We also don't want an interrupt coming in and moving the head | ||
543 | * page on another writer. Thus we use the second LSB to catch | ||
544 | * that too. Thus: | ||
545 | * | ||
546 | * head->list->prev->next bit 1 bit 0 | ||
547 | * ------- ------- | ||
548 | * Normal page 0 0 | ||
549 | * Points to head page 0 1 | ||
550 | * New head page 1 0 | ||
551 | * | ||
552 | * Note we can not trust the prev pointer of the head page, because: | ||
553 | * | ||
554 | * +----+ +-----+ +-----+ | ||
555 | * | |------>| T |---X--->| N | | ||
556 | * | |<------| | | | | ||
557 | * +----+ +-----+ +-----+ | ||
558 | * ^ ^ | | ||
559 | * | +-----+ | | | ||
560 | * +----------| R |----------+ | | ||
561 | * | |<-----------+ | ||
562 | * +-----+ | ||
563 | * | ||
564 | * Key: ---X--> HEAD flag set in pointer | ||
565 | * T Tail page | ||
566 | * R Reader page | ||
567 | * N Next page | ||
568 | * | ||
569 | * (see __rb_reserve_next() to see where this happens) | ||
570 | * | ||
571 | * What the above shows is that the reader just swapped out | ||
572 | * the reader page with a page in the buffer, but before it | ||
573 | * could make the new header point back to the new page added | ||
574 | * it was preempted by a writer. The writer moved forward onto | ||
575 | * the new page added by the reader and is about to move forward | ||
576 | * again. | ||
577 | * | ||
578 | * You can see, it is legitimate for the previous pointer of | ||
579 | * the head (or any page) not to point back to itself. But only | ||
580 | * temporarily. | ||
581 | */ | ||
582 | |||
583 | #define RB_PAGE_NORMAL 0UL | ||
584 | #define RB_PAGE_HEAD 1UL | ||
585 | #define RB_PAGE_UPDATE 2UL | ||
586 | |||
587 | |||
588 | #define RB_FLAG_MASK 3UL | ||
589 | |||
590 | /* PAGE_MOVED is not part of the mask */ | ||
591 | #define RB_PAGE_MOVED 4UL | ||
592 | |||
593 | /* | ||
594 | * rb_list_head - remove any bit | ||
595 | */ | ||
596 | static struct list_head *rb_list_head(struct list_head *list) | ||
597 | { | ||
598 | unsigned long val = (unsigned long)list; | ||
599 | |||
600 | return (struct list_head *)(val & ~RB_FLAG_MASK); | ||
601 | } | ||
602 | |||
603 | /* | ||
604 | * rb_is_head_page - test if the given page is the head page | ||
605 | * | ||
606 | * Because the reader may move the head_page pointer, we can | ||
607 | * not trust what the head page is (it may be pointing to | ||
608 | * the reader page). But if the next page is a header page, | ||
609 | * its flags will be non zero. | ||
610 | */ | ||
611 | static int inline | ||
612 | rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer, | ||
613 | struct buffer_page *page, struct list_head *list) | ||
614 | { | ||
615 | unsigned long val; | ||
616 | |||
617 | val = (unsigned long)list->next; | ||
618 | |||
619 | if ((val & ~RB_FLAG_MASK) != (unsigned long)&page->list) | ||
620 | return RB_PAGE_MOVED; | ||
621 | |||
622 | return val & RB_FLAG_MASK; | ||
623 | } | ||
624 | |||
625 | /* | ||
626 | * rb_is_reader_page | ||
627 | * | ||
628 | * The unique thing about the reader page, is that, if the | ||
629 | * writer is ever on it, the previous pointer never points | ||
630 | * back to the reader page. | ||
631 | */ | ||
632 | static int rb_is_reader_page(struct buffer_page *page) | ||
633 | { | ||
634 | struct list_head *list = page->list.prev; | ||
635 | |||
636 | return rb_list_head(list->next) != &page->list; | ||
637 | } | ||
638 | |||
639 | /* | ||
640 | * rb_set_list_to_head - set a list_head to be pointing to head. | ||
641 | */ | ||
642 | static void rb_set_list_to_head(struct ring_buffer_per_cpu *cpu_buffer, | ||
643 | struct list_head *list) | ||
644 | { | ||
645 | unsigned long *ptr; | ||
646 | |||
647 | ptr = (unsigned long *)&list->next; | ||
648 | *ptr |= RB_PAGE_HEAD; | ||
649 | *ptr &= ~RB_PAGE_UPDATE; | ||
650 | } | ||
651 | |||
652 | /* | ||
653 | * rb_head_page_activate - sets up head page | ||
654 | */ | ||
655 | static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer) | ||
656 | { | ||
657 | struct buffer_page *head; | ||
658 | |||
659 | head = cpu_buffer->head_page; | ||
660 | if (!head) | ||
661 | return; | ||
662 | |||
663 | /* | ||
664 | * Set the previous list pointer to have the HEAD flag. | ||
665 | */ | ||
666 | rb_set_list_to_head(cpu_buffer, head->list.prev); | ||
667 | } | ||
668 | |||
669 | static void rb_list_head_clear(struct list_head *list) | ||
670 | { | ||
671 | unsigned long *ptr = (unsigned long *)&list->next; | ||
672 | |||
673 | *ptr &= ~RB_FLAG_MASK; | ||
674 | } | ||
675 | |||
676 | /* | ||
677 | * rb_head_page_deactivate - clears head page ptr (for free list) | ||
678 | */ | ||
679 | static void | ||
680 | rb_head_page_deactivate(struct ring_buffer_per_cpu *cpu_buffer) | ||
681 | { | ||
682 | struct list_head *hd; | ||
683 | |||
684 | /* Go through the whole list and clear any pointers found. */ | ||
685 | rb_list_head_clear(cpu_buffer->pages); | ||
686 | |||
687 | list_for_each(hd, cpu_buffer->pages) | ||
688 | rb_list_head_clear(hd); | ||
689 | } | ||
690 | |||
691 | static int rb_head_page_set(struct ring_buffer_per_cpu *cpu_buffer, | ||
692 | struct buffer_page *head, | ||
693 | struct buffer_page *prev, | ||
694 | int old_flag, int new_flag) | ||
695 | { | ||
696 | struct list_head *list; | ||
697 | unsigned long val = (unsigned long)&head->list; | ||
698 | unsigned long ret; | ||
699 | |||
700 | list = &prev->list; | ||
701 | |||
702 | val &= ~RB_FLAG_MASK; | ||
703 | |||
704 | ret = (unsigned long)cmpxchg(&list->next, | ||
705 | val | old_flag, val | new_flag); | ||
706 | |||
707 | /* check if the reader took the page */ | ||
708 | if ((ret & ~RB_FLAG_MASK) != val) | ||
709 | return RB_PAGE_MOVED; | ||
710 | |||
711 | return ret & RB_FLAG_MASK; | ||
712 | } | ||
713 | |||
714 | static int rb_head_page_set_update(struct ring_buffer_per_cpu *cpu_buffer, | ||
715 | struct buffer_page *head, | ||
716 | struct buffer_page *prev, | ||
717 | int old_flag) | ||
718 | { | ||
719 | return rb_head_page_set(cpu_buffer, head, prev, | ||
720 | old_flag, RB_PAGE_UPDATE); | ||
721 | } | ||
722 | |||
723 | static int rb_head_page_set_head(struct ring_buffer_per_cpu *cpu_buffer, | ||
724 | struct buffer_page *head, | ||
725 | struct buffer_page *prev, | ||
726 | int old_flag) | ||
727 | { | ||
728 | return rb_head_page_set(cpu_buffer, head, prev, | ||
729 | old_flag, RB_PAGE_HEAD); | ||
730 | } | ||
731 | |||
732 | static int rb_head_page_set_normal(struct ring_buffer_per_cpu *cpu_buffer, | ||
733 | struct buffer_page *head, | ||
734 | struct buffer_page *prev, | ||
735 | int old_flag) | ||
736 | { | ||
737 | return rb_head_page_set(cpu_buffer, head, prev, | ||
738 | old_flag, RB_PAGE_NORMAL); | ||
739 | } | ||
740 | |||
741 | static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer, | ||
742 | struct buffer_page **bpage) | ||
743 | { | ||
744 | struct list_head *p = rb_list_head((*bpage)->list.next); | ||
745 | |||
746 | *bpage = list_entry(p, struct buffer_page, list); | ||
747 | } | ||
748 | |||
749 | static struct buffer_page * | ||
750 | rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer) | ||
751 | { | ||
752 | struct buffer_page *head; | ||
753 | struct buffer_page *page; | ||
754 | struct list_head *list; | ||
755 | int i; | ||
756 | |||
757 | if (RB_WARN_ON(cpu_buffer, !cpu_buffer->head_page)) | ||
758 | return NULL; | ||
759 | |||
760 | /* sanity check */ | ||
761 | list = cpu_buffer->pages; | ||
762 | if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev->next) != list)) | ||
763 | return NULL; | ||
764 | |||
765 | page = head = cpu_buffer->head_page; | ||
766 | /* | ||
767 | * It is possible that the writer moves the header behind | ||
768 | * where we started, and we miss in one loop. | ||
769 | * A second loop should grab the header, but we'll do | ||
770 | * three loops just because I'm paranoid. | ||
771 | */ | ||
772 | for (i = 0; i < 3; i++) { | ||
773 | do { | ||
774 | if (rb_is_head_page(cpu_buffer, page, page->list.prev)) { | ||
775 | cpu_buffer->head_page = page; | ||
776 | return page; | ||
777 | } | ||
778 | rb_inc_page(cpu_buffer, &page); | ||
779 | } while (page != head); | ||
780 | } | ||
781 | |||
782 | RB_WARN_ON(cpu_buffer, 1); | ||
783 | |||
784 | return NULL; | ||
785 | } | ||
786 | |||
787 | static int rb_head_page_replace(struct buffer_page *old, | ||
788 | struct buffer_page *new) | ||
789 | { | ||
790 | unsigned long *ptr = (unsigned long *)&old->list.prev->next; | ||
791 | unsigned long val; | ||
792 | unsigned long ret; | ||
793 | |||
794 | val = *ptr & ~RB_FLAG_MASK; | ||
795 | val |= RB_PAGE_HEAD; | ||
796 | |||
797 | ret = cmpxchg(ptr, val, &new->list); | ||
798 | |||
799 | return ret == val; | ||
800 | } | ||
801 | |||
802 | /* | ||
803 | * rb_tail_page_update - move the tail page forward | ||
804 | * | ||
805 | * Returns 1 if moved tail page, 0 if someone else did. | ||
806 | */ | ||
807 | static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, | ||
808 | struct buffer_page *tail_page, | ||
809 | struct buffer_page *next_page) | ||
810 | { | ||
811 | struct buffer_page *old_tail; | ||
812 | unsigned long old_entries; | ||
813 | unsigned long old_write; | ||
814 | int ret = 0; | ||
815 | |||
816 | /* | ||
817 | * The tail page now needs to be moved forward. | ||
818 | * | ||
819 | * We need to reset the tail page, but without messing | ||
820 | * with possible erasing of data brought in by interrupts | ||
821 | * that have moved the tail page and are currently on it. | ||
822 | * | ||
823 | * We add a counter to the write field to denote this. | ||
824 | */ | ||
825 | old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write); | ||
826 | old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries); | ||
827 | |||
828 | /* | ||
829 | * Just make sure we have seen our old_write and synchronize | ||
830 | * with any interrupts that come in. | ||
831 | */ | ||
832 | barrier(); | ||
833 | |||
834 | /* | ||
835 | * If the tail page is still the same as what we think | ||
836 | * it is, then it is up to us to update the tail | ||
837 | * pointer. | ||
838 | */ | ||
839 | if (tail_page == cpu_buffer->tail_page) { | ||
840 | /* Zero the write counter */ | ||
841 | unsigned long val = old_write & ~RB_WRITE_MASK; | ||
842 | unsigned long eval = old_entries & ~RB_WRITE_MASK; | ||
843 | |||
844 | /* | ||
845 | * This will only succeed if an interrupt did | ||
846 | * not come in and change it. In which case, we | ||
847 | * do not want to modify it. | ||
848 | */ | ||
849 | local_cmpxchg(&next_page->write, old_write, val); | ||
850 | local_cmpxchg(&next_page->entries, old_entries, eval); | ||
851 | |||
852 | /* | ||
853 | * No need to worry about races with clearing out the commit. | ||
854 | * it only can increment when a commit takes place. But that | ||
855 | * only happens in the outer most nested commit. | ||
856 | */ | ||
857 | local_set(&next_page->page->commit, 0); | ||
858 | |||
859 | old_tail = cmpxchg(&cpu_buffer->tail_page, | ||
860 | tail_page, next_page); | ||
861 | |||
862 | if (old_tail == tail_page) | ||
863 | ret = 1; | ||
864 | } | ||
865 | |||
866 | return ret; | ||
867 | } | ||
868 | |||
869 | static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer, | ||
870 | struct buffer_page *bpage) | ||
871 | { | ||
872 | unsigned long val = (unsigned long)bpage; | ||
873 | |||
874 | if (RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK)) | ||
875 | return 1; | ||
876 | |||
877 | return 0; | ||
878 | } | ||
879 | |||
880 | /** | ||
881 | * rb_check_list - make sure a pointer to a list has the last bits zero | ||
882 | */ | ||
883 | static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer, | ||
884 | struct list_head *list) | ||
885 | { | ||
886 | if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev) != list->prev)) | ||
887 | return 1; | ||
888 | if (RB_WARN_ON(cpu_buffer, rb_list_head(list->next) != list->next)) | ||
889 | return 1; | ||
890 | return 0; | ||
891 | } | ||
892 | |||
492 | /** | 893 | /** |
493 | * check_pages - integrity check of buffer pages | 894 | * check_pages - integrity check of buffer pages |
494 | * @cpu_buffer: CPU buffer with pages to test | 895 | * @cpu_buffer: CPU buffer with pages to test |
@@ -498,14 +899,19 @@ EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); | |||
498 | */ | 899 | */ |
499 | static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) | 900 | static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) |
500 | { | 901 | { |
501 | struct list_head *head = &cpu_buffer->pages; | 902 | struct list_head *head = cpu_buffer->pages; |
502 | struct buffer_page *bpage, *tmp; | 903 | struct buffer_page *bpage, *tmp; |
503 | 904 | ||
905 | rb_head_page_deactivate(cpu_buffer); | ||
906 | |||
504 | if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) | 907 | if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) |
505 | return -1; | 908 | return -1; |
506 | if (RB_WARN_ON(cpu_buffer, head->prev->next != head)) | 909 | if (RB_WARN_ON(cpu_buffer, head->prev->next != head)) |
507 | return -1; | 910 | return -1; |
508 | 911 | ||
912 | if (rb_check_list(cpu_buffer, head)) | ||
913 | return -1; | ||
914 | |||
509 | list_for_each_entry_safe(bpage, tmp, head, list) { | 915 | list_for_each_entry_safe(bpage, tmp, head, list) { |
510 | if (RB_WARN_ON(cpu_buffer, | 916 | if (RB_WARN_ON(cpu_buffer, |
511 | bpage->list.next->prev != &bpage->list)) | 917 | bpage->list.next->prev != &bpage->list)) |
@@ -513,25 +919,33 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) | |||
513 | if (RB_WARN_ON(cpu_buffer, | 919 | if (RB_WARN_ON(cpu_buffer, |
514 | bpage->list.prev->next != &bpage->list)) | 920 | bpage->list.prev->next != &bpage->list)) |
515 | return -1; | 921 | return -1; |
922 | if (rb_check_list(cpu_buffer, &bpage->list)) | ||
923 | return -1; | ||
516 | } | 924 | } |
517 | 925 | ||
926 | rb_head_page_activate(cpu_buffer); | ||
927 | |||
518 | return 0; | 928 | return 0; |
519 | } | 929 | } |
520 | 930 | ||
521 | static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, | 931 | static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, |
522 | unsigned nr_pages) | 932 | unsigned nr_pages) |
523 | { | 933 | { |
524 | struct list_head *head = &cpu_buffer->pages; | ||
525 | struct buffer_page *bpage, *tmp; | 934 | struct buffer_page *bpage, *tmp; |
526 | unsigned long addr; | 935 | unsigned long addr; |
527 | LIST_HEAD(pages); | 936 | LIST_HEAD(pages); |
528 | unsigned i; | 937 | unsigned i; |
529 | 938 | ||
939 | WARN_ON(!nr_pages); | ||
940 | |||
530 | for (i = 0; i < nr_pages; i++) { | 941 | for (i = 0; i < nr_pages; i++) { |
531 | bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), | 942 | bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), |
532 | GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); | 943 | GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); |
533 | if (!bpage) | 944 | if (!bpage) |
534 | goto free_pages; | 945 | goto free_pages; |
946 | |||
947 | rb_check_bpage(cpu_buffer, bpage); | ||
948 | |||
535 | list_add(&bpage->list, &pages); | 949 | list_add(&bpage->list, &pages); |
536 | 950 | ||
537 | addr = __get_free_page(GFP_KERNEL); | 951 | addr = __get_free_page(GFP_KERNEL); |
@@ -541,7 +955,13 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, | |||
541 | rb_init_page(bpage->page); | 955 | rb_init_page(bpage->page); |
542 | } | 956 | } |
543 | 957 | ||
544 | list_splice(&pages, head); | 958 | /* |
959 | * The ring buffer page list is a circular list that does not | ||
960 | * start and end with a list head. All page list items point to | ||
961 | * other pages. | ||
962 | */ | ||
963 | cpu_buffer->pages = pages.next; | ||
964 | list_del(&pages); | ||
545 | 965 | ||
546 | rb_check_pages(cpu_buffer); | 966 | rb_check_pages(cpu_buffer); |
547 | 967 | ||
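
The hunk above turns the per-cpu page list into a fully circular list with no
embedded list head: the pages are first threaded onto a temporary LIST_HEAD,
cpu_buffer->pages is pointed at the first real page, and list_del() then
unlinks the temporary head so every node in the ring points only at other
buffer pages.  The following stand-alone sketch shows that idiom with a
hand-rolled list node instead of <linux/list.h> (names such as page_node are
illustrative only, not part of the kernel API):

	#include <stdio.h>
	#include <stdlib.h>

	/* Stand-in for struct buffer_page: a node in a doubly linked ring. */
	struct page_node {
		struct page_node *next;
		struct page_node *prev;
		int id;
	};

	/* Unlink 'node' from its ring (the effect of list_del()). */
	static void node_del(struct page_node *node)
	{
		node->prev->next = node->next;
		node->next->prev = node->prev;
	}

	/* Insert 'node' just before 'at' (the effect of list_add_tail()). */
	static void node_add_tail(struct page_node *node, struct page_node *at)
	{
		node->prev = at->prev;
		node->next = at;
		at->prev->next = node;
		at->prev = node;
	}

	int main(void)
	{
		struct page_node tmp = { &tmp, &tmp, -1 };  /* temporary "list head" */
		struct page_node *pages, *p;
		int i;

		/* Thread the real pages onto the temporary head, as the patch does. */
		for (i = 0; i < 4; i++) {
			struct page_node *n = malloc(sizeof(*n));
			n->id = i;
			node_add_tail(n, &tmp);
		}

		/* Point at the first real page and drop the temporary head, so the
		 * ring contains nothing but pages. */
		pages = tmp.next;
		node_del(&tmp);

		/* Walk the ring once: every node now points only at other pages. */
		p = pages;
		do {
			printf("page %d\n", p->id);
			p = p->next;
		} while (p != pages);

		return 0;
	}
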
@@ -573,13 +993,14 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) | |||
573 | spin_lock_init(&cpu_buffer->reader_lock); | 993 | spin_lock_init(&cpu_buffer->reader_lock); |
574 | lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); | 994 | lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); |
575 | cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; | 995 | cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; |
576 | INIT_LIST_HEAD(&cpu_buffer->pages); | ||
577 | 996 | ||
578 | bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), | 997 | bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), |
579 | GFP_KERNEL, cpu_to_node(cpu)); | 998 | GFP_KERNEL, cpu_to_node(cpu)); |
580 | if (!bpage) | 999 | if (!bpage) |
581 | goto fail_free_buffer; | 1000 | goto fail_free_buffer; |
582 | 1001 | ||
1002 | rb_check_bpage(cpu_buffer, bpage); | ||
1003 | |||
583 | cpu_buffer->reader_page = bpage; | 1004 | cpu_buffer->reader_page = bpage; |
584 | addr = __get_free_page(GFP_KERNEL); | 1005 | addr = __get_free_page(GFP_KERNEL); |
585 | if (!addr) | 1006 | if (!addr) |
@@ -594,9 +1015,11 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) | |||
594 | goto fail_free_reader; | 1015 | goto fail_free_reader; |
595 | 1016 | ||
596 | cpu_buffer->head_page | 1017 | cpu_buffer->head_page |
597 | = list_entry(cpu_buffer->pages.next, struct buffer_page, list); | 1018 | = list_entry(cpu_buffer->pages, struct buffer_page, list); |
598 | cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; | 1019 | cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; |
599 | 1020 | ||
1021 | rb_head_page_activate(cpu_buffer); | ||
1022 | |||
600 | return cpu_buffer; | 1023 | return cpu_buffer; |
601 | 1024 | ||
602 | fail_free_reader: | 1025 | fail_free_reader: |
@@ -609,15 +1032,22 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) | |||
609 | 1032 | ||
610 | static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) | 1033 | static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) |
611 | { | 1034 | { |
612 | struct list_head *head = &cpu_buffer->pages; | 1035 | struct list_head *head = cpu_buffer->pages; |
613 | struct buffer_page *bpage, *tmp; | 1036 | struct buffer_page *bpage, *tmp; |
614 | 1037 | ||
615 | free_buffer_page(cpu_buffer->reader_page); | 1038 | free_buffer_page(cpu_buffer->reader_page); |
616 | 1039 | ||
617 | list_for_each_entry_safe(bpage, tmp, head, list) { | 1040 | rb_head_page_deactivate(cpu_buffer); |
618 | list_del_init(&bpage->list); | 1041 | |
1042 | if (head) { | ||
1043 | list_for_each_entry_safe(bpage, tmp, head, list) { | ||
1044 | list_del_init(&bpage->list); | ||
1045 | free_buffer_page(bpage); | ||
1046 | } | ||
1047 | bpage = list_entry(head, struct buffer_page, list); | ||
619 | free_buffer_page(bpage); | 1048 | free_buffer_page(bpage); |
620 | } | 1049 | } |
1050 | |||
621 | kfree(cpu_buffer); | 1051 | kfree(cpu_buffer); |
622 | } | 1052 | } |
623 | 1053 | ||
@@ -759,15 +1189,17 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) | |||
759 | atomic_inc(&cpu_buffer->record_disabled); | 1189 | atomic_inc(&cpu_buffer->record_disabled); |
760 | synchronize_sched(); | 1190 | synchronize_sched(); |
761 | 1191 | ||
1192 | rb_head_page_deactivate(cpu_buffer); | ||
1193 | |||
762 | for (i = 0; i < nr_pages; i++) { | 1194 | for (i = 0; i < nr_pages; i++) { |
763 | if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages))) | 1195 | if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) |
764 | return; | 1196 | return; |
765 | p = cpu_buffer->pages.next; | 1197 | p = cpu_buffer->pages->next; |
766 | bpage = list_entry(p, struct buffer_page, list); | 1198 | bpage = list_entry(p, struct buffer_page, list); |
767 | list_del_init(&bpage->list); | 1199 | list_del_init(&bpage->list); |
768 | free_buffer_page(bpage); | 1200 | free_buffer_page(bpage); |
769 | } | 1201 | } |
770 | if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages))) | 1202 | if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) |
771 | return; | 1203 | return; |
772 | 1204 | ||
773 | rb_reset_cpu(cpu_buffer); | 1205 | rb_reset_cpu(cpu_buffer); |
@@ -789,15 +1221,19 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, | |||
789 | atomic_inc(&cpu_buffer->record_disabled); | 1221 | atomic_inc(&cpu_buffer->record_disabled); |
790 | synchronize_sched(); | 1222 | synchronize_sched(); |
791 | 1223 | ||
1224 | spin_lock_irq(&cpu_buffer->reader_lock); | ||
1225 | rb_head_page_deactivate(cpu_buffer); | ||
1226 | |||
792 | for (i = 0; i < nr_pages; i++) { | 1227 | for (i = 0; i < nr_pages; i++) { |
793 | if (RB_WARN_ON(cpu_buffer, list_empty(pages))) | 1228 | if (RB_WARN_ON(cpu_buffer, list_empty(pages))) |
794 | return; | 1229 | return; |
795 | p = pages->next; | 1230 | p = pages->next; |
796 | bpage = list_entry(p, struct buffer_page, list); | 1231 | bpage = list_entry(p, struct buffer_page, list); |
797 | list_del_init(&bpage->list); | 1232 | list_del_init(&bpage->list); |
798 | list_add_tail(&bpage->list, &cpu_buffer->pages); | 1233 | list_add_tail(&bpage->list, cpu_buffer->pages); |
799 | } | 1234 | } |
800 | rb_reset_cpu(cpu_buffer); | 1235 | rb_reset_cpu(cpu_buffer); |
1236 | spin_unlock_irq(&cpu_buffer->reader_lock); | ||
801 | 1237 | ||
802 | rb_check_pages(cpu_buffer); | 1238 | rb_check_pages(cpu_buffer); |
803 | 1239 | ||
@@ -948,21 +1384,14 @@ rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer) | |||
948 | } | 1384 | } |
949 | 1385 | ||
950 | static inline struct ring_buffer_event * | 1386 | static inline struct ring_buffer_event * |
951 | rb_head_event(struct ring_buffer_per_cpu *cpu_buffer) | ||
952 | { | ||
953 | return __rb_page_index(cpu_buffer->head_page, | ||
954 | cpu_buffer->head_page->read); | ||
955 | } | ||
956 | |||
957 | static inline struct ring_buffer_event * | ||
958 | rb_iter_head_event(struct ring_buffer_iter *iter) | 1387 | rb_iter_head_event(struct ring_buffer_iter *iter) |
959 | { | 1388 | { |
960 | return __rb_page_index(iter->head_page, iter->head); | 1389 | return __rb_page_index(iter->head_page, iter->head); |
961 | } | 1390 | } |
962 | 1391 | ||
963 | static inline unsigned rb_page_write(struct buffer_page *bpage) | 1392 | static inline unsigned long rb_page_write(struct buffer_page *bpage) |
964 | { | 1393 | { |
965 | return local_read(&bpage->write); | 1394 | return local_read(&bpage->write) & RB_WRITE_MASK; |
966 | } | 1395 | } |
967 | 1396 | ||
968 | static inline unsigned rb_page_commit(struct buffer_page *bpage) | 1397 | static inline unsigned rb_page_commit(struct buffer_page *bpage) |
@@ -970,6 +1399,11 @@ static inline unsigned rb_page_commit(struct buffer_page *bpage) | |||
970 | return local_read(&bpage->page->commit); | 1399 | return local_read(&bpage->page->commit); |
971 | } | 1400 | } |
972 | 1401 | ||
1402 | static inline unsigned long rb_page_entries(struct buffer_page *bpage) | ||
1403 | { | ||
1404 | return local_read(&bpage->entries) & RB_WRITE_MASK; | ||
1405 | } | ||
1406 | |||
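
rb_page_write() and the new rb_page_entries() now mask the page-local counters
with RB_WRITE_MASK before using them: the low bits of 'write' and 'entries'
carry the byte index and entry count, while the bits above the mask act as an
update counter that a writer bumps when it claims the page.  A minimal sketch
of that packing, assuming for the sketch a 20-bit index field (the actual
width is whatever RB_WRITE_MASK is defined to in ring_buffer.c):

	#include <stdio.h>

	#define RB_WRITE_MASK   0xfffffUL       /* low 20 bits: index / count     */
	#define RB_WRITE_INTCNT (1UL << 20)     /* everything above: update count */

	/* Extract the byte index the writer has reached on this page. */
	static unsigned long page_write_index(unsigned long write)
	{
		return write & RB_WRITE_MASK;
	}

	/* Zero the index and bump the update counter, as a writer does when it
	 * takes over a page; the counter lets a later cmpxchg notice that the
	 * page was recycled in the meantime. */
	static unsigned long page_claimed(unsigned long write)
	{
		return (write & ~RB_WRITE_MASK) + RB_WRITE_INTCNT;
	}

	int main(void)
	{
		unsigned long write = 0;

		write += 100;                   /* writer reserved 100 bytes   */
		printf("index=%lu\n", page_write_index(write));

		write = page_claimed(write);    /* page recycled by the writer */
		printf("index=%lu updates=%lu\n",
		       page_write_index(write), write / RB_WRITE_INTCNT);
		return 0;
	}
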
973 | /* Size is determined by what has been committed */ | 1407 | /* Size is determined by what has been committed */ |
974 | static inline unsigned rb_page_size(struct buffer_page *bpage) | 1408 | static inline unsigned rb_page_size(struct buffer_page *bpage) |
975 | { | 1409 | { |
@@ -982,22 +1416,6 @@ rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer) | |||
982 | return rb_page_commit(cpu_buffer->commit_page); | 1416 | return rb_page_commit(cpu_buffer->commit_page); |
983 | } | 1417 | } |
984 | 1418 | ||
985 | static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer) | ||
986 | { | ||
987 | return rb_page_commit(cpu_buffer->head_page); | ||
988 | } | ||
989 | |||
990 | static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer, | ||
991 | struct buffer_page **bpage) | ||
992 | { | ||
993 | struct list_head *p = (*bpage)->list.next; | ||
994 | |||
995 | if (p == &cpu_buffer->pages) | ||
996 | p = p->next; | ||
997 | |||
998 | *bpage = list_entry(p, struct buffer_page, list); | ||
999 | } | ||
1000 | |||
1001 | static inline unsigned | 1419 | static inline unsigned |
1002 | rb_event_index(struct ring_buffer_event *event) | 1420 | rb_event_index(struct ring_buffer_event *event) |
1003 | { | 1421 | { |
@@ -1023,6 +1441,8 @@ rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, | |||
1023 | static void | 1441 | static void |
1024 | rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) | 1442 | rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) |
1025 | { | 1443 | { |
1444 | unsigned long max_count; | ||
1445 | |||
1026 | /* | 1446 | /* |
1027 | * We only race with interrupts and NMIs on this CPU. | 1447 | * We only race with interrupts and NMIs on this CPU. |
1028 | * If we own the commit event, then we can commit | 1448 | * If we own the commit event, then we can commit |
@@ -1032,9 +1452,16 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) | |||
1032 | * assign the commit to the tail. | 1452 | * assign the commit to the tail. |
1033 | */ | 1453 | */ |
1034 | again: | 1454 | again: |
1455 | max_count = cpu_buffer->buffer->pages * 100; | ||
1456 | |||
1035 | while (cpu_buffer->commit_page != cpu_buffer->tail_page) { | 1457 | while (cpu_buffer->commit_page != cpu_buffer->tail_page) { |
1036 | cpu_buffer->commit_page->page->commit = | 1458 | if (RB_WARN_ON(cpu_buffer, !(--max_count))) |
1037 | cpu_buffer->commit_page->write; | 1459 | return; |
1460 | if (RB_WARN_ON(cpu_buffer, | ||
1461 | rb_is_reader_page(cpu_buffer->tail_page))) | ||
1462 | return; | ||
1463 | local_set(&cpu_buffer->commit_page->page->commit, | ||
1464 | rb_page_write(cpu_buffer->commit_page)); | ||
1038 | rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); | 1465 | rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); |
1039 | cpu_buffer->write_stamp = | 1466 | cpu_buffer->write_stamp = |
1040 | cpu_buffer->commit_page->page->time_stamp; | 1467 | cpu_buffer->commit_page->page->time_stamp; |
@@ -1043,8 +1470,12 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) | |||
1043 | } | 1470 | } |
1044 | while (rb_commit_index(cpu_buffer) != | 1471 | while (rb_commit_index(cpu_buffer) != |
1045 | rb_page_write(cpu_buffer->commit_page)) { | 1472 | rb_page_write(cpu_buffer->commit_page)) { |
1046 | cpu_buffer->commit_page->page->commit = | 1473 | |
1047 | cpu_buffer->commit_page->write; | 1474 | local_set(&cpu_buffer->commit_page->page->commit, |
1475 | rb_page_write(cpu_buffer->commit_page)); | ||
1476 | RB_WARN_ON(cpu_buffer, | ||
1477 | local_read(&cpu_buffer->commit_page->page->commit) & | ||
1478 | ~RB_WRITE_MASK); | ||
1048 | barrier(); | 1479 | barrier(); |
1049 | } | 1480 | } |
1050 | 1481 | ||
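
rb_set_commit_to_write() now bails out of its catch-up loop after roughly
pages * 100 iterations, and also refuses to walk onto a reader page, instead
of spinning forever if the commit page somehow never reaches the tail page.
A toy model of that safety valve, with page movement reduced to an integer
step (purely illustrative, not the kernel helper):

	#include <stdio.h>

	/* Chase 'tail' with 'commit', but give up with a warning rather than
	 * spin forever, mirroring the max_count check added above. */
	static void catch_up_commit(unsigned long *commit, unsigned long tail,
				    unsigned long nr_pages)
	{
		unsigned long max_count = nr_pages * 100;

		while (*commit != tail) {
			if (!--max_count) {
				fprintf(stderr, "commit page chase ran away\n");
				return;
			}
			(*commit)++;    /* stands in for rb_inc_page() */
		}
	}

	int main(void)
	{
		unsigned long commit = 2;

		catch_up_commit(&commit, 7, 8);
		printf("commit caught up at %lu\n", commit);
		return 0;
	}
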
@@ -1077,7 +1508,7 @@ static void rb_inc_iter(struct ring_buffer_iter *iter) | |||
1077 | * to the head page instead of next. | 1508 | * to the head page instead of next. |
1078 | */ | 1509 | */ |
1079 | if (iter->head_page == cpu_buffer->reader_page) | 1510 | if (iter->head_page == cpu_buffer->reader_page) |
1080 | iter->head_page = cpu_buffer->head_page; | 1511 | iter->head_page = rb_set_head_page(cpu_buffer); |
1081 | else | 1512 | else |
1082 | rb_inc_page(cpu_buffer, &iter->head_page); | 1513 | rb_inc_page(cpu_buffer, &iter->head_page); |
1083 | 1514 | ||
@@ -1121,6 +1552,163 @@ rb_update_event(struct ring_buffer_event *event, | |||
1121 | } | 1552 | } |
1122 | } | 1553 | } |
1123 | 1554 | ||
1555 | /* | ||
1556 | * rb_handle_head_page - writer hit the head page | ||
1557 | * | ||
1558 | * Returns: +1 to retry page | ||
1559 | * 0 to continue | ||
1560 | * -1 on error | ||
1561 | */ | ||
1562 | static int | ||
1563 | rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer, | ||
1564 | struct buffer_page *tail_page, | ||
1565 | struct buffer_page *next_page) | ||
1566 | { | ||
1567 | struct buffer_page *new_head; | ||
1568 | int entries; | ||
1569 | int type; | ||
1570 | int ret; | ||
1571 | |||
1572 | entries = rb_page_entries(next_page); | ||
1573 | |||
1574 | /* | ||
1575 | * The hard part is here. We need to move the head | ||
1576 | * forward, and protect against both readers on | ||
1577 | * other CPUs and writers coming in via interrupts. | ||
1578 | */ | ||
1579 | type = rb_head_page_set_update(cpu_buffer, next_page, tail_page, | ||
1580 | RB_PAGE_HEAD); | ||
1581 | |||
1582 | /* | ||
1583 | * type can be one of four: | ||
1584 | * NORMAL - an interrupt already moved it for us | ||
1585 | * HEAD - we are the first to get here. | ||
1586 | * UPDATE - we are the interrupt interrupting | ||
1587 | * a current move. | ||
1588 | * MOVED - a reader on another CPU moved the next | ||
1589 | * pointer to its reader page. Give up | ||
1590 | * and try again. | ||
1591 | */ | ||
1592 | |||
1593 | switch (type) { | ||
1594 | case RB_PAGE_HEAD: | ||
1595 | /* | ||
1596 | * We changed the head to UPDATE, thus | ||
1597 | * it is our responsibility to update | ||
1598 | * the counters. | ||
1599 | */ | ||
1600 | local_add(entries, &cpu_buffer->overrun); | ||
1601 | |||
1602 | /* | ||
1603 | * The entries will be zeroed out when we move the | ||
1604 | * tail page. | ||
1605 | */ | ||
1606 | |||
1607 | /* still more to do */ | ||
1608 | break; | ||
1609 | |||
1610 | case RB_PAGE_UPDATE: | ||
1611 | /* | ||
1612 | * This is an interrupt that interrupted the | ||
1613 | * previous update. Still more to do. | ||
1614 | */ | ||
1615 | break; | ||
1616 | case RB_PAGE_NORMAL: | ||
1617 | /* | ||
1618 | * An interrupt came in before the update | ||
1619 | * and processed this for us. | ||
1620 | * Nothing left to do. | ||
1621 | */ | ||
1622 | return 1; | ||
1623 | case RB_PAGE_MOVED: | ||
1624 | /* | ||
1625 | * The reader is on another CPU and just did | ||
1626 | * a swap with our next_page. | ||
1627 | * Try again. | ||
1628 | */ | ||
1629 | return 1; | ||
1630 | default: | ||
1631 | RB_WARN_ON(cpu_buffer, 1); /* WTF??? */ | ||
1632 | return -1; | ||
1633 | } | ||
1634 | |||
1635 | /* | ||
1636 | * Now that we are here, the old head pointer is | ||
1637 | * set to UPDATE. This will keep the reader from | ||
1638 | * swapping the head page with the reader page. | ||
1639 | * The reader (on another CPU) will spin till | ||
1640 | * we are finished. | ||
1641 | * | ||
1642 | * We just need to protect against interrupts | ||
1643 | * doing the job. We will set the next pointer | ||
1644 | * to HEAD. After that, we set the old pointer | ||
1645 | * to NORMAL, but only if it was HEAD before; | ||
1646 | * otherwise we are an interrupt, and only | ||
1647 | * want the outermost commit to reset it. | ||
1648 | */ | ||
1649 | new_head = next_page; | ||
1650 | rb_inc_page(cpu_buffer, &new_head); | ||
1651 | |||
1652 | ret = rb_head_page_set_head(cpu_buffer, new_head, next_page, | ||
1653 | RB_PAGE_NORMAL); | ||
1654 | |||
1655 | /* | ||
1656 | * Valid returns are: | ||
1657 | * HEAD - an interrupt came in and already set it. | ||
1658 | * NORMAL - One of two things: | ||
1659 | * 1) We really set it. | ||
1660 | * 2) A bunch of interrupts came in and moved | ||
1661 | * the page forward again. | ||
1662 | */ | ||
1663 | switch (ret) { | ||
1664 | case RB_PAGE_HEAD: | ||
1665 | case RB_PAGE_NORMAL: | ||
1666 | /* OK */ | ||
1667 | break; | ||
1668 | default: | ||
1669 | RB_WARN_ON(cpu_buffer, 1); | ||
1670 | return -1; | ||
1671 | } | ||
1672 | |||
1673 | /* | ||
1674 | * It is possible that an interrupt came in, | ||
1675 | * set the head up, then more interrupts came in | ||
1676 | * and moved it again. When we get back here, | ||
1677 | * the page would have been set to NORMAL but we | ||
1678 | * just set it back to HEAD. | ||
1679 | * | ||
1680 | * How do you detect this? Well, if that happened | ||
1681 | * the tail page would have moved. | ||
1682 | */ | ||
1683 | if (ret == RB_PAGE_NORMAL) { | ||
1684 | /* | ||
1685 | * If the tail had moved past next, then we need | ||
1686 | * to reset the pointer. | ||
1687 | */ | ||
1688 | if (cpu_buffer->tail_page != tail_page && | ||
1689 | cpu_buffer->tail_page != next_page) | ||
1690 | rb_head_page_set_normal(cpu_buffer, new_head, | ||
1691 | next_page, | ||
1692 | RB_PAGE_HEAD); | ||
1693 | } | ||
1694 | |||
1695 | /* | ||
1696 | * If this was the outermost commit (the one that | ||
1697 | * changed the original pointer from HEAD to UPDATE), | ||
1698 | * then it is up to us to reset it to NORMAL. | ||
1699 | */ | ||
1700 | if (type == RB_PAGE_HEAD) { | ||
1701 | ret = rb_head_page_set_normal(cpu_buffer, next_page, | ||
1702 | tail_page, | ||
1703 | RB_PAGE_UPDATE); | ||
1704 | if (RB_WARN_ON(cpu_buffer, | ||
1705 | ret != RB_PAGE_UPDATE)) | ||
1706 | return -1; | ||
1707 | } | ||
1708 | |||
1709 | return 0; | ||
1710 | } | ||
1711 | |||
1124 | static unsigned rb_calculate_event_length(unsigned length) | 1712 | static unsigned rb_calculate_event_length(unsigned length) |
1125 | { | 1713 | { |
1126 | struct ring_buffer_event event; /* Used only for sizeof array */ | 1714 | struct ring_buffer_event event; /* Used only for sizeof array */ |
@@ -1199,96 +1787,93 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, | |||
1199 | struct buffer_page *commit_page, | 1787 | struct buffer_page *commit_page, |
1200 | struct buffer_page *tail_page, u64 *ts) | 1788 | struct buffer_page *tail_page, u64 *ts) |
1201 | { | 1789 | { |
1202 | struct buffer_page *next_page, *head_page, *reader_page; | ||
1203 | struct ring_buffer *buffer = cpu_buffer->buffer; | 1790 | struct ring_buffer *buffer = cpu_buffer->buffer; |
1204 | bool lock_taken = false; | 1791 | struct buffer_page *next_page; |
1205 | unsigned long flags; | 1792 | int ret; |
1206 | 1793 | ||
1207 | next_page = tail_page; | 1794 | next_page = tail_page; |
1208 | 1795 | ||
1209 | local_irq_save(flags); | ||
1210 | /* | ||
1211 | * Since the write to the buffer is still not | ||
1212 | * fully lockless, we must be careful with NMIs. | ||
1213 | * The locks in the writers are taken when a write | ||
1214 | * crosses to a new page. The locks protect against | ||
1215 | * races with the readers (this will soon be fixed | ||
1216 | * with a lockless solution). | ||
1217 | * | ||
1218 | * Because we can not protect against NMIs, and we | ||
1219 | * want to keep traces reentrant, we need to manage | ||
1220 | * what happens when we are in an NMI. | ||
1221 | * | ||
1222 | * NMIs can happen after we take the lock. | ||
1223 | * If we are in an NMI, only take the lock | ||
1224 | * if it is not already taken. Otherwise | ||
1225 | * simply fail. | ||
1226 | */ | ||
1227 | if (unlikely(in_nmi())) { | ||
1228 | if (!__raw_spin_trylock(&cpu_buffer->lock)) { | ||
1229 | cpu_buffer->nmi_dropped++; | ||
1230 | goto out_reset; | ||
1231 | } | ||
1232 | } else | ||
1233 | __raw_spin_lock(&cpu_buffer->lock); | ||
1234 | |||
1235 | lock_taken = true; | ||
1236 | |||
1237 | rb_inc_page(cpu_buffer, &next_page); | 1796 | rb_inc_page(cpu_buffer, &next_page); |
1238 | 1797 | ||
1239 | head_page = cpu_buffer->head_page; | ||
1240 | reader_page = cpu_buffer->reader_page; | ||
1241 | |||
1242 | /* we grabbed the lock before incrementing */ | ||
1243 | if (RB_WARN_ON(cpu_buffer, next_page == reader_page)) | ||
1244 | goto out_reset; | ||
1245 | |||
1246 | /* | 1798 | /* |
1247 | * If for some reason, we had an interrupt storm that made | 1799 | * If for some reason, we had an interrupt storm that made |
1248 | * it all the way around the buffer, bail, and warn | 1800 | * it all the way around the buffer, bail, and warn |
1249 | * about it. | 1801 | * about it. |
1250 | */ | 1802 | */ |
1251 | if (unlikely(next_page == commit_page)) { | 1803 | if (unlikely(next_page == commit_page)) { |
1252 | cpu_buffer->commit_overrun++; | 1804 | local_inc(&cpu_buffer->commit_overrun); |
1253 | goto out_reset; | 1805 | goto out_reset; |
1254 | } | 1806 | } |
1255 | 1807 | ||
1256 | if (next_page == head_page) { | 1808 | /* |
1257 | if (!(buffer->flags & RB_FL_OVERWRITE)) | 1809 | * This is where the fun begins! |
1258 | goto out_reset; | 1810 | * |
1259 | 1811 | * We are fighting against races between a reader that | |
1260 | /* tail_page has not moved yet? */ | 1812 | * could be on another CPU trying to swap its reader |
1261 | if (tail_page == cpu_buffer->tail_page) { | 1813 | * page with the buffer head. |
1262 | /* count overflows */ | 1814 | * |
1263 | cpu_buffer->overrun += | 1815 | * We are also fighting against interrupts coming in and |
1264 | local_read(&head_page->entries); | 1816 | * moving the head or tail on us as well. |
1817 | * | ||
1818 | * If the next page is the head page then we have filled | ||
1819 | * the buffer, unless the commit page is still on the | ||
1820 | * reader page. | ||
1821 | */ | ||
1822 | if (rb_is_head_page(cpu_buffer, next_page, &tail_page->list)) { | ||
1265 | 1823 | ||
1266 | rb_inc_page(cpu_buffer, &head_page); | 1824 | /* |
1267 | cpu_buffer->head_page = head_page; | 1825 | * If the commit is not on the reader page, then |
1268 | cpu_buffer->head_page->read = 0; | 1826 | * move the header page. |
1827 | */ | ||
1828 | if (!rb_is_reader_page(cpu_buffer->commit_page)) { | ||
1829 | /* | ||
1830 | * If we are not in overwrite mode, | ||
1831 | * this is easy, just stop here. | ||
1832 | */ | ||
1833 | if (!(buffer->flags & RB_FL_OVERWRITE)) | ||
1834 | goto out_reset; | ||
1835 | |||
1836 | ret = rb_handle_head_page(cpu_buffer, | ||
1837 | tail_page, | ||
1838 | next_page); | ||
1839 | if (ret < 0) | ||
1840 | goto out_reset; | ||
1841 | if (ret) | ||
1842 | goto out_again; | ||
1843 | } else { | ||
1844 | /* | ||
1845 | * We need to be careful here too. The | ||
1846 | * commit page could still be on the reader | ||
1847 | * page. We could have a small buffer, and | ||
1848 | * have filled up the buffer with events | ||
1849 | * from interrupts and such, and wrapped. | ||
1850 | * | ||
1851 | * Note, if the tail page is also on the | ||
1852 | * reader_page, we let it move out. | ||
1853 | */ | ||
1854 | if (unlikely((cpu_buffer->commit_page != | ||
1855 | cpu_buffer->tail_page) && | ||
1856 | (cpu_buffer->commit_page == | ||
1857 | cpu_buffer->reader_page))) { | ||
1858 | local_inc(&cpu_buffer->commit_overrun); | ||
1859 | goto out_reset; | ||
1860 | } | ||
1269 | } | 1861 | } |
1270 | } | 1862 | } |
1271 | 1863 | ||
1272 | /* | 1864 | ret = rb_tail_page_update(cpu_buffer, tail_page, next_page); |
1273 | * If the tail page is still the same as what we think | 1865 | if (ret) { |
1274 | * it is, then it is up to us to update the tail | 1866 | /* |
1275 | * pointer. | 1867 | * Nested commits always have zero deltas, so |
1276 | */ | 1868 | * just reread the time stamp |
1277 | if (tail_page == cpu_buffer->tail_page) { | 1869 | */ |
1278 | local_set(&next_page->write, 0); | ||
1279 | local_set(&next_page->entries, 0); | ||
1280 | local_set(&next_page->page->commit, 0); | ||
1281 | cpu_buffer->tail_page = next_page; | ||
1282 | |||
1283 | /* reread the time stamp */ | ||
1284 | *ts = rb_time_stamp(buffer, cpu_buffer->cpu); | 1870 | *ts = rb_time_stamp(buffer, cpu_buffer->cpu); |
1285 | cpu_buffer->tail_page->page->time_stamp = *ts; | 1871 | next_page->page->time_stamp = *ts; |
1286 | } | 1872 | } |
1287 | 1873 | ||
1288 | rb_reset_tail(cpu_buffer, tail_page, tail, length); | 1874 | out_again: |
1289 | 1875 | ||
1290 | __raw_spin_unlock(&cpu_buffer->lock); | 1876 | rb_reset_tail(cpu_buffer, tail_page, tail, length); |
1291 | local_irq_restore(flags); | ||
1292 | 1877 | ||
1293 | /* fail and let the caller try again */ | 1878 | /* fail and let the caller try again */ |
1294 | return ERR_PTR(-EAGAIN); | 1879 | return ERR_PTR(-EAGAIN); |
@@ -1297,9 +1882,6 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, | |||
1297 | /* reset write */ | 1882 | /* reset write */ |
1298 | rb_reset_tail(cpu_buffer, tail_page, tail, length); | 1883 | rb_reset_tail(cpu_buffer, tail_page, tail, length); |
1299 | 1884 | ||
1300 | if (likely(lock_taken)) | ||
1301 | __raw_spin_unlock(&cpu_buffer->lock); | ||
1302 | local_irq_restore(flags); | ||
1303 | return NULL; | 1885 | return NULL; |
1304 | } | 1886 | } |
1305 | 1887 | ||
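
rb_move_tail() no longer takes cpu_buffer->lock with interrupts disabled, and
the special NMI trylock path (and its nmi_dropped counter) is gone: moving the
tail page is delegated to rb_tail_page_update(), which uses cmpxchg so that a
writer interrupted mid-move simply finds the tail already advanced and
retries.  A schematic of that "advance only if nobody beat us to it" step,
with a bare pointer standing in for the tail page (the real helper also fixes
up the write counters and time stamps):

	#include <stdio.h>

	struct page { struct page *next; int id; };

	/*
	 * Advance *tail from 'old' to 'new' only if it still points at 'old'.
	 * Returns 1 if this caller did the move, 0 if an interrupting writer
	 * already moved it and we should just retry the reserve.
	 */
	static int tail_page_update(struct page **tail, struct page *old,
				    struct page *new)
	{
		return __sync_bool_compare_and_swap(tail, old, new);
	}

	int main(void)
	{
		struct page a = { NULL, 0 }, b = { NULL, 1 }, c = { NULL, 2 };
		struct page *tail = &a;

		a.next = &b;
		b.next = &c;
		c.next = &a;

		/* First writer crosses the page boundary and moves the tail. */
		printf("moved=%d tail=%d\n",
		       tail_page_update(&tail, &a, a.next), tail->id);

		/* A writer that was interrupted mid-reserve sees it already done. */
		printf("moved=%d tail=%d\n",
		       tail_page_update(&tail, &a, a.next), tail->id);
		return 0;
	}
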
@@ -1316,6 +1898,9 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, | |||
1316 | barrier(); | 1898 | barrier(); |
1317 | tail_page = cpu_buffer->tail_page; | 1899 | tail_page = cpu_buffer->tail_page; |
1318 | write = local_add_return(length, &tail_page->write); | 1900 | write = local_add_return(length, &tail_page->write); |
1901 | |||
1902 | /* set write to only the index of the write */ | ||
1903 | write &= RB_WRITE_MASK; | ||
1319 | tail = write - length; | 1904 | tail = write - length; |
1320 | 1905 | ||
1321 | /* See if we shot past the end of this buffer page */ | 1906 | /* See if we shot past the end of this buffer page */ |
@@ -1360,12 +1945,16 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, | |||
1360 | bpage = cpu_buffer->tail_page; | 1945 | bpage = cpu_buffer->tail_page; |
1361 | 1946 | ||
1362 | if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { | 1947 | if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { |
1948 | unsigned long write_mask = | ||
1949 | local_read(&bpage->write) & ~RB_WRITE_MASK; | ||
1363 | /* | 1950 | /* |
1364 | * This is on the tail page. It is possible that | 1951 | * This is on the tail page. It is possible that |
1365 | * a write could come in and move the tail page | 1952 | * a write could come in and move the tail page |
1366 | * and write to the next page. That is fine | 1953 | * and write to the next page. That is fine |
1367 | * because we just shorten what is on this page. | 1954 | * because we just shorten what is on this page. |
1368 | */ | 1955 | */ |
1956 | old_index += write_mask; | ||
1957 | new_index += write_mask; | ||
1369 | index = local_cmpxchg(&bpage->write, old_index, new_index); | 1958 | index = local_cmpxchg(&bpage->write, old_index, new_index); |
1370 | if (index == old_index) | 1959 | if (index == old_index) |
1371 | return 1; | 1960 | return 1; |
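
rb_try_to_discard() may only pull the write index back if nothing else touched
the page in between: it folds the page's current update counter (the bits
above RB_WRITE_MASK) into both the old and the new index before the cmpxchg,
so the swap fails if either the index moved or the page was recycled.  A
compact sketch of that check, reusing the 20-bit split assumed in the earlier
example:

	#include <stdio.h>

	#define RB_WRITE_MASK 0xfffffUL

	/*
	 * Try to give back the tail event by rewinding the packed write word
	 * from old_index to new_index.  Both indexes are widened with the
	 * page's current update counter, so a recycled page (different
	 * counter) or a concurrent write makes the compare-and-swap fail.
	 */
	static int try_to_discard(unsigned long *write,
				  unsigned long old_index, unsigned long new_index)
	{
		unsigned long counter = *write & ~RB_WRITE_MASK;

		old_index += counter;
		new_index += counter;

		return __sync_bool_compare_and_swap(write, old_index, new_index);
	}

	int main(void)
	{
		unsigned long write = (3UL << 20) | 128; /* update count 3, index 128 */

		/* 32-byte event ending at index 128: discard succeeds. */
		printf("discard=%d\n", try_to_discard(&write, 128, 96));

		/* Index has since moved on (it is now 96, not 128): discard fails. */
		printf("discard=%d\n", try_to_discard(&write, 128, 64));
		return 0;
	}
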
@@ -1874,9 +2463,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_write); | |||
1874 | static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) | 2463 | static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) |
1875 | { | 2464 | { |
1876 | struct buffer_page *reader = cpu_buffer->reader_page; | 2465 | struct buffer_page *reader = cpu_buffer->reader_page; |
1877 | struct buffer_page *head = cpu_buffer->head_page; | 2466 | struct buffer_page *head = rb_set_head_page(cpu_buffer); |
1878 | struct buffer_page *commit = cpu_buffer->commit_page; | 2467 | struct buffer_page *commit = cpu_buffer->commit_page; |
1879 | 2468 | ||
2469 | /* In case of error, head will be NULL */ | ||
2470 | if (unlikely(!head)) | ||
2471 | return 1; | ||
2472 | |||
1880 | return reader->read == rb_page_commit(reader) && | 2473 | return reader->read == rb_page_commit(reader) && |
1881 | (commit == reader || | 2474 | (commit == reader || |
1882 | (commit == head && | 2475 | (commit == head && |
@@ -1967,7 +2560,7 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) | |||
1967 | return 0; | 2560 | return 0; |
1968 | 2561 | ||
1969 | cpu_buffer = buffer->buffers[cpu]; | 2562 | cpu_buffer = buffer->buffers[cpu]; |
1970 | ret = (local_read(&cpu_buffer->entries) - cpu_buffer->overrun) | 2563 | ret = (local_read(&cpu_buffer->entries) - local_read(&cpu_buffer->overrun)) |
1971 | - cpu_buffer->read; | 2564 | - cpu_buffer->read; |
1972 | 2565 | ||
1973 | return ret; | 2566 | return ret; |
@@ -1988,33 +2581,13 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu) | |||
1988 | return 0; | 2581 | return 0; |
1989 | 2582 | ||
1990 | cpu_buffer = buffer->buffers[cpu]; | 2583 | cpu_buffer = buffer->buffers[cpu]; |
1991 | ret = cpu_buffer->overrun; | 2584 | ret = local_read(&cpu_buffer->overrun); |
1992 | 2585 | ||
1993 | return ret; | 2586 | return ret; |
1994 | } | 2587 | } |
1995 | EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); | 2588 | EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); |
1996 | 2589 | ||
1997 | /** | 2590 | /** |
1998 | * ring_buffer_nmi_dropped_cpu - get the number of nmis that were dropped | ||
1999 | * @buffer: The ring buffer | ||
2000 | * @cpu: The per CPU buffer to get the number of overruns from | ||
2001 | */ | ||
2002 | unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu) | ||
2003 | { | ||
2004 | struct ring_buffer_per_cpu *cpu_buffer; | ||
2005 | unsigned long ret; | ||
2006 | |||
2007 | if (!cpumask_test_cpu(cpu, buffer->cpumask)) | ||
2008 | return 0; | ||
2009 | |||
2010 | cpu_buffer = buffer->buffers[cpu]; | ||
2011 | ret = cpu_buffer->nmi_dropped; | ||
2012 | |||
2013 | return ret; | ||
2014 | } | ||
2015 | EXPORT_SYMBOL_GPL(ring_buffer_nmi_dropped_cpu); | ||
2016 | |||
2017 | /** | ||
2018 | * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits | 2591 | * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits |
2019 | * @buffer: The ring buffer | 2592 | * @buffer: The ring buffer |
2020 | * @cpu: The per CPU buffer to get the number of overruns from | 2593 | * @cpu: The per CPU buffer to get the number of overruns from |
@@ -2029,7 +2602,7 @@ ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu) | |||
2029 | return 0; | 2602 | return 0; |
2030 | 2603 | ||
2031 | cpu_buffer = buffer->buffers[cpu]; | 2604 | cpu_buffer = buffer->buffers[cpu]; |
2032 | ret = cpu_buffer->commit_overrun; | 2605 | ret = local_read(&cpu_buffer->commit_overrun); |
2033 | 2606 | ||
2034 | return ret; | 2607 | return ret; |
2035 | } | 2608 | } |
@@ -2052,7 +2625,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer) | |||
2052 | for_each_buffer_cpu(buffer, cpu) { | 2625 | for_each_buffer_cpu(buffer, cpu) { |
2053 | cpu_buffer = buffer->buffers[cpu]; | 2626 | cpu_buffer = buffer->buffers[cpu]; |
2054 | entries += (local_read(&cpu_buffer->entries) - | 2627 | entries += (local_read(&cpu_buffer->entries) - |
2055 | cpu_buffer->overrun) - cpu_buffer->read; | 2628 | local_read(&cpu_buffer->overrun)) - cpu_buffer->read; |
2056 | } | 2629 | } |
2057 | 2630 | ||
2058 | return entries; | 2631 | return entries; |
@@ -2075,7 +2648,7 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer) | |||
2075 | /* if you care about this being correct, lock the buffer */ | 2648 | /* if you care about this being correct, lock the buffer */ |
2076 | for_each_buffer_cpu(buffer, cpu) { | 2649 | for_each_buffer_cpu(buffer, cpu) { |
2077 | cpu_buffer = buffer->buffers[cpu]; | 2650 | cpu_buffer = buffer->buffers[cpu]; |
2078 | overruns += cpu_buffer->overrun; | 2651 | overruns += local_read(&cpu_buffer->overrun); |
2079 | } | 2652 | } |
2080 | 2653 | ||
2081 | return overruns; | 2654 | return overruns; |
@@ -2088,8 +2661,10 @@ static void rb_iter_reset(struct ring_buffer_iter *iter) | |||
2088 | 2661 | ||
2089 | /* Iterator usage is expected to have record disabled */ | 2662 | /* Iterator usage is expected to have record disabled */ |
2090 | if (list_empty(&cpu_buffer->reader_page->list)) { | 2663 | if (list_empty(&cpu_buffer->reader_page->list)) { |
2091 | iter->head_page = cpu_buffer->head_page; | 2664 | iter->head_page = rb_set_head_page(cpu_buffer); |
2092 | iter->head = cpu_buffer->head_page->read; | 2665 | if (unlikely(!iter->head_page)) |
2666 | return; | ||
2667 | iter->head = iter->head_page->read; | ||
2093 | } else { | 2668 | } else { |
2094 | iter->head_page = cpu_buffer->reader_page; | 2669 | iter->head_page = cpu_buffer->reader_page; |
2095 | iter->head = cpu_buffer->reader_page->read; | 2670 | iter->head = cpu_buffer->reader_page->read; |
@@ -2206,6 +2781,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) | |||
2206 | struct buffer_page *reader = NULL; | 2781 | struct buffer_page *reader = NULL; |
2207 | unsigned long flags; | 2782 | unsigned long flags; |
2208 | int nr_loops = 0; | 2783 | int nr_loops = 0; |
2784 | int ret; | ||
2209 | 2785 | ||
2210 | local_irq_save(flags); | 2786 | local_irq_save(flags); |
2211 | __raw_spin_lock(&cpu_buffer->lock); | 2787 | __raw_spin_lock(&cpu_buffer->lock); |
@@ -2239,30 +2815,56 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) | |||
2239 | goto out; | 2815 | goto out; |
2240 | 2816 | ||
2241 | /* | 2817 | /* |
2242 | * Splice the empty reader page into the list around the head. | ||
2243 | * Reset the reader page to size zero. | 2818 | * Reset the reader page to size zero. |
2244 | */ | 2819 | */ |
2820 | local_set(&cpu_buffer->reader_page->write, 0); | ||
2821 | local_set(&cpu_buffer->reader_page->entries, 0); | ||
2822 | local_set(&cpu_buffer->reader_page->page->commit, 0); | ||
2245 | 2823 | ||
2246 | reader = cpu_buffer->head_page; | 2824 | spin: |
2825 | /* | ||
2826 | * Splice the empty reader page into the list around the head. | ||
2827 | */ | ||
2828 | reader = rb_set_head_page(cpu_buffer); | ||
2247 | cpu_buffer->reader_page->list.next = reader->list.next; | 2829 | cpu_buffer->reader_page->list.next = reader->list.next; |
2248 | cpu_buffer->reader_page->list.prev = reader->list.prev; | 2830 | cpu_buffer->reader_page->list.prev = reader->list.prev; |
2249 | 2831 | ||
2250 | local_set(&cpu_buffer->reader_page->write, 0); | 2832 | /* |
2251 | local_set(&cpu_buffer->reader_page->entries, 0); | 2833 | * cpu_buffer->pages just needs to point to the buffer, it |
2252 | local_set(&cpu_buffer->reader_page->page->commit, 0); | 2834 | * has no specific buffer page to point to. Let's move it out |
2835 | * of our way so we don't accidentally swap it. | ||
2836 | */ | ||
2837 | cpu_buffer->pages = reader->list.prev; | ||
2253 | 2838 | ||
2254 | /* Make the reader page now replace the head */ | 2839 | /* The reader page will be pointing to the new head */ |
2255 | reader->list.prev->next = &cpu_buffer->reader_page->list; | 2840 | rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list); |
2256 | reader->list.next->prev = &cpu_buffer->reader_page->list; | 2841 | |
2842 | /* | ||
2843 | * Here's the tricky part. | ||
2844 | * | ||
2845 | * We need to move the pointer past the header page. | ||
2846 | * But we can only do that if a writer is not currently | ||
2847 | * moving it. The page before the header page has the | ||
2848 | * flag bit '1' set if it is pointing to the page we want, | ||
2849 | * but if the writer is in the process of moving it | ||
2850 | * then it will be '2', or '0' if it has already moved. | ||
2851 | */ | ||
2852 | |||
2853 | ret = rb_head_page_replace(reader, cpu_buffer->reader_page); | ||
2257 | 2854 | ||
2258 | /* | 2855 | /* |
2259 | * If the tail is on the reader, then we must set the head | 2856 | * If we did not convert it, then we must try again. |
2260 | * to the inserted page, otherwise we set it one before. | ||
2261 | */ | 2857 | */ |
2262 | cpu_buffer->head_page = cpu_buffer->reader_page; | 2858 | if (!ret) |
2859 | goto spin; | ||
2263 | 2860 | ||
2264 | if (cpu_buffer->commit_page != reader) | 2861 | /* |
2265 | rb_inc_page(cpu_buffer, &cpu_buffer->head_page); | 2862 | * Yeah! We succeeded in replacing the page. |
2863 | * | ||
2864 | * Now make the new head point back to the reader page. | ||
2865 | */ | ||
2866 | reader->list.next->prev = &cpu_buffer->reader_page->list; | ||
2867 | rb_inc_page(cpu_buffer, &cpu_buffer->head_page); | ||
2266 | 2868 | ||
2267 | /* Finally update the reader page to the new head */ | 2869 | /* Finally update the reader page to the new head */ |
2268 | cpu_buffer->reader_page = reader; | 2870 | cpu_buffer->reader_page = reader; |
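
The reader no longer splices its spare page in while holding off the writer:
it links the reader page around the current head and then calls
rb_head_page_replace(), a cmpxchg on the tagged 'next' pointer of the page
before the head.  If a writer moved the head in the meantime, the cmpxchg
fails and the reader starts over at the 'spin:' label above.  A stripped-down
model of that retry loop, with the flag bit handled as in the earlier
tagged-pointer sketch (again purely illustrative names):

	#include <stdio.h>
	#include <stdint.h>

	#define PG_HEAD      1UL
	#define PG_FLAG_MASK 3UL

	struct page { int id; };

	/* Swap the reader page in for the head page: succeed only if the
	 * tagged slot still points at the head we read, i.e. no writer has
	 * moved it since. */
	static int head_page_replace(uintptr_t *slot, struct page *head,
				     struct page *reader)
	{
		uintptr_t old = (uintptr_t)head | PG_HEAD;
		uintptr_t new = (uintptr_t)reader;   /* reader page is not HEAD */

		return __sync_bool_compare_and_swap(slot, old, new);
	}

	int main(void)
	{
		struct page pg0 = { 0 }, pg1 = { 1 }, reader = { 99 };
		uintptr_t slot = (uintptr_t)&pg0 | PG_HEAD;
		struct page *head;
		int tries = 0;

		/* The reader's loop: re-read the head and retry until the swap
		 * lands, mirroring the 'goto spin' in rb_get_reader_page(). */
		do {
			head = (struct page *)(slot & ~PG_FLAG_MASK);
			tries++;
			if (tries == 1)         /* simulate a writer racing us */
				slot = (uintptr_t)&pg1 | PG_HEAD;
		} while (!head_page_replace(&slot, head, &reader));

		printf("swapped after %d tries; old head was page %d\n",
		       tries, head->id);
		return 0;
	}
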
@@ -2718,8 +3320,10 @@ EXPORT_SYMBOL_GPL(ring_buffer_size); | |||
2718 | static void | 3320 | static void |
2719 | rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) | 3321 | rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) |
2720 | { | 3322 | { |
3323 | rb_head_page_deactivate(cpu_buffer); | ||
3324 | |||
2721 | cpu_buffer->head_page | 3325 | cpu_buffer->head_page |
2722 | = list_entry(cpu_buffer->pages.next, struct buffer_page, list); | 3326 | = list_entry(cpu_buffer->pages, struct buffer_page, list); |
2723 | local_set(&cpu_buffer->head_page->write, 0); | 3327 | local_set(&cpu_buffer->head_page->write, 0); |
2724 | local_set(&cpu_buffer->head_page->entries, 0); | 3328 | local_set(&cpu_buffer->head_page->entries, 0); |
2725 | local_set(&cpu_buffer->head_page->page->commit, 0); | 3329 | local_set(&cpu_buffer->head_page->page->commit, 0); |
@@ -2735,16 +3339,17 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) | |||
2735 | local_set(&cpu_buffer->reader_page->page->commit, 0); | 3339 | local_set(&cpu_buffer->reader_page->page->commit, 0); |
2736 | cpu_buffer->reader_page->read = 0; | 3340 | cpu_buffer->reader_page->read = 0; |
2737 | 3341 | ||
2738 | cpu_buffer->nmi_dropped = 0; | 3342 | local_set(&cpu_buffer->commit_overrun, 0); |
2739 | cpu_buffer->commit_overrun = 0; | 3343 | local_set(&cpu_buffer->overrun, 0); |
2740 | cpu_buffer->overrun = 0; | ||
2741 | cpu_buffer->read = 0; | ||
2742 | local_set(&cpu_buffer->entries, 0); | 3344 | local_set(&cpu_buffer->entries, 0); |
2743 | local_set(&cpu_buffer->committing, 0); | 3345 | local_set(&cpu_buffer->committing, 0); |
2744 | local_set(&cpu_buffer->commits, 0); | 3346 | local_set(&cpu_buffer->commits, 0); |
3347 | cpu_buffer->read = 0; | ||
2745 | 3348 | ||
2746 | cpu_buffer->write_stamp = 0; | 3349 | cpu_buffer->write_stamp = 0; |
2747 | cpu_buffer->read_stamp = 0; | 3350 | cpu_buffer->read_stamp = 0; |
3351 | |||
3352 | rb_head_page_activate(cpu_buffer); | ||
2748 | } | 3353 | } |
2749 | 3354 | ||
2750 | /** | 3355 | /** |
@@ -3092,7 +3697,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, | |||
3092 | read = 0; | 3697 | read = 0; |
3093 | } else { | 3698 | } else { |
3094 | /* update the entry counter */ | 3699 | /* update the entry counter */ |
3095 | cpu_buffer->read += local_read(&reader->entries); | 3700 | cpu_buffer->read += rb_page_entries(reader); |
3096 | 3701 | ||
3097 | /* swap the pages */ | 3702 | /* swap the pages */ |
3098 | rb_init_page(bpage); | 3703 | rb_init_page(bpage); |
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bdb3afc8b306..b591f7a1bd7b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -3630,9 +3630,6 @@ tracing_stats_read(struct file *filp, char __user *ubuf, | |||
3630 | cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); | 3630 | cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); |
3631 | trace_seq_printf(s, "commit overrun: %ld\n", cnt); | 3631 | trace_seq_printf(s, "commit overrun: %ld\n", cnt); |
3632 | 3632 | ||
3633 | cnt = ring_buffer_nmi_dropped_cpu(tr->buffer, cpu); | ||
3634 | trace_seq_printf(s, "nmi dropped: %ld\n", cnt); | ||
3635 | |||
3636 | count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); | 3633 | count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); |
3637 | 3634 | ||
3638 | kfree(s); | 3635 | kfree(s); |