diff options
Diffstat (limited to 'kernel/events/ring_buffer.c')
-rw-r--r-- | kernel/events/ring_buffer.c | 380 |
1 files changed, 380 insertions, 0 deletions
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c new file mode 100644 index 000000000000..a2a29205cc0f --- /dev/null +++ b/kernel/events/ring_buffer.c | |||
@@ -0,0 +1,380 @@ | |||
1 | /* | ||
2 | * Performance events ring-buffer code: | ||
3 | * | ||
4 | * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> | ||
5 | * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar | ||
6 | * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> | ||
7 | * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> | ||
8 | * | ||
9 | * For licensing details see kernel-base/COPYING | ||
10 | */ | ||
11 | |||
12 | #include <linux/perf_event.h> | ||
13 | #include <linux/vmalloc.h> | ||
14 | #include <linux/slab.h> | ||
15 | |||
16 | #include "internal.h" | ||
17 | |||
18 | static bool perf_output_space(struct ring_buffer *rb, unsigned long tail, | ||
19 | unsigned long offset, unsigned long head) | ||
20 | { | ||
21 | unsigned long mask; | ||
22 | |||
23 | if (!rb->writable) | ||
24 | return true; | ||
25 | |||
26 | mask = perf_data_size(rb) - 1; | ||
27 | |||
28 | offset = (offset - tail) & mask; | ||
29 | head = (head - tail) & mask; | ||
30 | |||
31 | if ((int)(head - offset) < 0) | ||
32 | return false; | ||
33 | |||
34 | return true; | ||
35 | } | ||
36 | |||
37 | static void perf_output_wakeup(struct perf_output_handle *handle) | ||
38 | { | ||
39 | atomic_set(&handle->rb->poll, POLL_IN); | ||
40 | |||
41 | handle->event->pending_wakeup = 1; | ||
42 | irq_work_queue(&handle->event->pending); | ||
43 | } | ||
44 | |||
45 | /* | ||
46 | * We need to ensure a later event_id doesn't publish a head when a former | ||
47 | * event isn't done writing. However since we need to deal with NMIs we | ||
48 | * cannot fully serialize things. | ||
49 | * | ||
50 | * We only publish the head (and generate a wakeup) when the outer-most | ||
51 | * event completes. | ||
52 | */ | ||
53 | static void perf_output_get_handle(struct perf_output_handle *handle) | ||
54 | { | ||
55 | struct ring_buffer *rb = handle->rb; | ||
56 | |||
57 | preempt_disable(); | ||
58 | local_inc(&rb->nest); | ||
59 | handle->wakeup = local_read(&rb->wakeup); | ||
60 | } | ||
61 | |||
62 | static void perf_output_put_handle(struct perf_output_handle *handle) | ||
63 | { | ||
64 | struct ring_buffer *rb = handle->rb; | ||
65 | unsigned long head; | ||
66 | |||
67 | again: | ||
68 | head = local_read(&rb->head); | ||
69 | |||
70 | /* | ||
71 | * IRQ/NMI can happen here, which means we can miss a head update. | ||
72 | */ | ||
73 | |||
74 | if (!local_dec_and_test(&rb->nest)) | ||
75 | goto out; | ||
76 | |||
77 | /* | ||
78 | * Publish the known good head. Rely on the full barrier implied | ||
79 | * by atomic_dec_and_test() order the rb->head read and this | ||
80 | * write. | ||
81 | */ | ||
82 | rb->user_page->data_head = head; | ||
83 | |||
84 | /* | ||
85 | * Now check if we missed an update, rely on the (compiler) | ||
86 | * barrier in atomic_dec_and_test() to re-read rb->head. | ||
87 | */ | ||
88 | if (unlikely(head != local_read(&rb->head))) { | ||
89 | local_inc(&rb->nest); | ||
90 | goto again; | ||
91 | } | ||
92 | |||
93 | if (handle->wakeup != local_read(&rb->wakeup)) | ||
94 | perf_output_wakeup(handle); | ||
95 | |||
96 | out: | ||
97 | preempt_enable(); | ||
98 | } | ||
99 | |||
100 | int perf_output_begin(struct perf_output_handle *handle, | ||
101 | struct perf_event *event, unsigned int size) | ||
102 | { | ||
103 | struct ring_buffer *rb; | ||
104 | unsigned long tail, offset, head; | ||
105 | int have_lost; | ||
106 | struct perf_sample_data sample_data; | ||
107 | struct { | ||
108 | struct perf_event_header header; | ||
109 | u64 id; | ||
110 | u64 lost; | ||
111 | } lost_event; | ||
112 | |||
113 | rcu_read_lock(); | ||
114 | /* | ||
115 | * For inherited events we send all the output towards the parent. | ||
116 | */ | ||
117 | if (event->parent) | ||
118 | event = event->parent; | ||
119 | |||
120 | rb = rcu_dereference(event->rb); | ||
121 | if (!rb) | ||
122 | goto out; | ||
123 | |||
124 | handle->rb = rb; | ||
125 | handle->event = event; | ||
126 | |||
127 | if (!rb->nr_pages) | ||
128 | goto out; | ||
129 | |||
130 | have_lost = local_read(&rb->lost); | ||
131 | if (have_lost) { | ||
132 | lost_event.header.size = sizeof(lost_event); | ||
133 | perf_event_header__init_id(&lost_event.header, &sample_data, | ||
134 | event); | ||
135 | size += lost_event.header.size; | ||
136 | } | ||
137 | |||
138 | perf_output_get_handle(handle); | ||
139 | |||
140 | do { | ||
141 | /* | ||
142 | * Userspace could choose to issue a mb() before updating the | ||
143 | * tail pointer. So that all reads will be completed before the | ||
144 | * write is issued. | ||
145 | */ | ||
146 | tail = ACCESS_ONCE(rb->user_page->data_tail); | ||
147 | smp_rmb(); | ||
148 | offset = head = local_read(&rb->head); | ||
149 | head += size; | ||
150 | if (unlikely(!perf_output_space(rb, tail, offset, head))) | ||
151 | goto fail; | ||
152 | } while (local_cmpxchg(&rb->head, offset, head) != offset); | ||
153 | |||
154 | if (head - local_read(&rb->wakeup) > rb->watermark) | ||
155 | local_add(rb->watermark, &rb->wakeup); | ||
156 | |||
157 | handle->page = offset >> (PAGE_SHIFT + page_order(rb)); | ||
158 | handle->page &= rb->nr_pages - 1; | ||
159 | handle->size = offset & ((PAGE_SIZE << page_order(rb)) - 1); | ||
160 | handle->addr = rb->data_pages[handle->page]; | ||
161 | handle->addr += handle->size; | ||
162 | handle->size = (PAGE_SIZE << page_order(rb)) - handle->size; | ||
163 | |||
164 | if (have_lost) { | ||
165 | lost_event.header.type = PERF_RECORD_LOST; | ||
166 | lost_event.header.misc = 0; | ||
167 | lost_event.id = event->id; | ||
168 | lost_event.lost = local_xchg(&rb->lost, 0); | ||
169 | |||
170 | perf_output_put(handle, lost_event); | ||
171 | perf_event__output_id_sample(event, handle, &sample_data); | ||
172 | } | ||
173 | |||
174 | return 0; | ||
175 | |||
176 | fail: | ||
177 | local_inc(&rb->lost); | ||
178 | perf_output_put_handle(handle); | ||
179 | out: | ||
180 | rcu_read_unlock(); | ||
181 | |||
182 | return -ENOSPC; | ||
183 | } | ||
184 | |||
/*
 * Exported entry point for writing @len bytes from @buf through @handle;
 * the arguments are forwarded unchanged to __output_copy().
 */
void perf_output_copy(struct perf_output_handle *handle,
		      const void *buf, unsigned int len)
{
	__output_copy(handle, buf, len);
}
190 | |||
/*
 * Finish an output started with a successful perf_output_begin(): close
 * the nested write section, then drop the RCU read lock that
 * perf_output_begin() left held.
 */
void perf_output_end(struct perf_output_handle *handle)
{
	perf_output_put_handle(handle);
	rcu_read_unlock();
}
196 | |||
197 | static void | ||
198 | ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) | ||
199 | { | ||
200 | long max_size = perf_data_size(rb); | ||
201 | |||
202 | if (watermark) | ||
203 | rb->watermark = min(max_size, watermark); | ||
204 | |||
205 | if (!rb->watermark) | ||
206 | rb->watermark = max_size / 2; | ||
207 | |||
208 | if (flags & RING_BUFFER_WRITABLE) | ||
209 | rb->writable = 1; | ||
210 | |||
211 | atomic_set(&rb->refcount, 1); | ||
212 | } | ||
213 | |||
214 | #ifndef CONFIG_PERF_USE_VMALLOC | ||
215 | |||
216 | /* | ||
217 | * Back perf_mmap() with regular GFP_KERNEL-0 pages. | ||
218 | */ | ||
219 | |||
220 | struct page * | ||
221 | perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) | ||
222 | { | ||
223 | if (pgoff > rb->nr_pages) | ||
224 | return NULL; | ||
225 | |||
226 | if (pgoff == 0) | ||
227 | return virt_to_page(rb->user_page); | ||
228 | |||
229 | return virt_to_page(rb->data_pages[pgoff - 1]); | ||
230 | } | ||
231 | |||
232 | static void *perf_mmap_alloc_page(int cpu) | ||
233 | { | ||
234 | struct page *page; | ||
235 | int node; | ||
236 | |||
237 | node = (cpu == -1) ? cpu : cpu_to_node(cpu); | ||
238 | page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); | ||
239 | if (!page) | ||
240 | return NULL; | ||
241 | |||
242 | return page_address(page); | ||
243 | } | ||
244 | |||
245 | struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) | ||
246 | { | ||
247 | struct ring_buffer *rb; | ||
248 | unsigned long size; | ||
249 | int i; | ||
250 | |||
251 | size = sizeof(struct ring_buffer); | ||
252 | size += nr_pages * sizeof(void *); | ||
253 | |||
254 | rb = kzalloc(size, GFP_KERNEL); | ||
255 | if (!rb) | ||
256 | goto fail; | ||
257 | |||
258 | rb->user_page = perf_mmap_alloc_page(cpu); | ||
259 | if (!rb->user_page) | ||
260 | goto fail_user_page; | ||
261 | |||
262 | for (i = 0; i < nr_pages; i++) { | ||
263 | rb->data_pages[i] = perf_mmap_alloc_page(cpu); | ||
264 | if (!rb->data_pages[i]) | ||
265 | goto fail_data_pages; | ||
266 | } | ||
267 | |||
268 | rb->nr_pages = nr_pages; | ||
269 | |||
270 | ring_buffer_init(rb, watermark, flags); | ||
271 | |||
272 | return rb; | ||
273 | |||
274 | fail_data_pages: | ||
275 | for (i--; i >= 0; i--) | ||
276 | free_page((unsigned long)rb->data_pages[i]); | ||
277 | |||
278 | free_page((unsigned long)rb->user_page); | ||
279 | |||
280 | fail_user_page: | ||
281 | kfree(rb); | ||
282 | |||
283 | fail: | ||
284 | return NULL; | ||
285 | } | ||
286 | |||
287 | static void perf_mmap_free_page(unsigned long addr) | ||
288 | { | ||
289 | struct page *page = virt_to_page((void *)addr); | ||
290 | |||
291 | page->mapping = NULL; | ||
292 | __free_page(page); | ||
293 | } | ||
294 | |||
295 | void rb_free(struct ring_buffer *rb) | ||
296 | { | ||
297 | int i; | ||
298 | |||
299 | perf_mmap_free_page((unsigned long)rb->user_page); | ||
300 | for (i = 0; i < rb->nr_pages; i++) | ||
301 | perf_mmap_free_page((unsigned long)rb->data_pages[i]); | ||
302 | kfree(rb); | ||
303 | } | ||
304 | |||
305 | #else | ||
306 | |||
307 | struct page * | ||
308 | perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) | ||
309 | { | ||
310 | if (pgoff > (1UL << page_order(rb))) | ||
311 | return NULL; | ||
312 | |||
313 | return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE); | ||
314 | } | ||
315 | |||
316 | static void perf_mmap_unmark_page(void *addr) | ||
317 | { | ||
318 | struct page *page = vmalloc_to_page(addr); | ||
319 | |||
320 | page->mapping = NULL; | ||
321 | } | ||
322 | |||
323 | static void rb_free_work(struct work_struct *work) | ||
324 | { | ||
325 | struct ring_buffer *rb; | ||
326 | void *base; | ||
327 | int i, nr; | ||
328 | |||
329 | rb = container_of(work, struct ring_buffer, work); | ||
330 | nr = 1 << page_order(rb); | ||
331 | |||
332 | base = rb->user_page; | ||
333 | for (i = 0; i < nr + 1; i++) | ||
334 | perf_mmap_unmark_page(base + (i * PAGE_SIZE)); | ||
335 | |||
336 | vfree(base); | ||
337 | kfree(rb); | ||
338 | } | ||
339 | |||
340 | void rb_free(struct ring_buffer *rb) | ||
341 | { | ||
342 | schedule_work(&rb->work); | ||
343 | } | ||
344 | |||
345 | struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) | ||
346 | { | ||
347 | struct ring_buffer *rb; | ||
348 | unsigned long size; | ||
349 | void *all_buf; | ||
350 | |||
351 | size = sizeof(struct ring_buffer); | ||
352 | size += sizeof(void *); | ||
353 | |||
354 | rb = kzalloc(size, GFP_KERNEL); | ||
355 | if (!rb) | ||
356 | goto fail; | ||
357 | |||
358 | INIT_WORK(&rb->work, rb_free_work); | ||
359 | |||
360 | all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); | ||
361 | if (!all_buf) | ||
362 | goto fail_all_buf; | ||
363 | |||
364 | rb->user_page = all_buf; | ||
365 | rb->data_pages[0] = all_buf + PAGE_SIZE; | ||
366 | rb->page_order = ilog2(nr_pages); | ||
367 | rb->nr_pages = 1; | ||
368 | |||
369 | ring_buffer_init(rb, watermark, flags); | ||
370 | |||
371 | return rb; | ||
372 | |||
373 | fail_all_buf: | ||
374 | kfree(rb); | ||
375 | |||
376 | fail: | ||
377 | return NULL; | ||
378 | } | ||
379 | |||
380 | #endif | ||