Diffstat (limited to 'kernel/trace/tracedump.c')
-rw-r--r--  kernel/trace/tracedump.c  682
1 files changed, 682 insertions, 0 deletions
diff --git a/kernel/trace/tracedump.c b/kernel/trace/tracedump.c
new file mode 100644
index 00000000000..a83532bc36d
--- /dev/null
+++ b/kernel/trace/tracedump.c
@@ -0,0 +1,682 @@
/*
 * kernel/trace/tracedump.c
 *
 * Copyright (c) 2011, NVIDIA CORPORATION. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
 *
 */

#include <linux/console.h>
#include <linux/cpumask.h>
#include <linux/init.h>
#include <linux/irqflags.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/proc_fs.h>
#include <linux/ring_buffer.h>
#include <linux/sched.h>
#include <linux/smp.h>
#include <linux/string.h>
#include <linux/threads.h>
#include <linux/tracedump.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
#include <linux/zlib.h>

#include "trace.h"
#include "trace_output.h"

#define CPU_MAX (NR_CPUS-1)

#define TRYM(fn, ...) do { \
	int try_error = (fn); \
	if (try_error < 0) { \
		printk(__VA_ARGS__); \
		return try_error; \
	} \
} while (0)

#define TRY(fn) TRYM(fn, TAG "Caught error from %s in %s\n", #fn, __func__)
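
/*
 * Illustrative sketch, not part of the original file: TRY()/TRYM()
 * perform an early "return try_error" on failure, so they may only be
 * used inside functions that return a signed integer type. A
 * hypothetical caller would look like:
 *
 *	static int example_dump(void)
 *	{
 *		TRY(tracedump_init());
 *		TRYM(tracedump_all(TD_PRINT_CONSOLE), TAG "dump failed\n");
 *		return tracedump_deinit();
 *	}
 */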

/* Stolen from printk.c */
#define for_each_console(con) \
	for (con = console_drivers; con != NULL; con = con->next)

#define TAG KERN_ERR "tracedump: "

#define TD_MIN_CONSUME 2000
#define TD_COMPRESS_CHUNK 0x8000

static DEFINE_MUTEX(tracedump_proc_lock);

static const char MAGIC_NUMBER[9] = "TRACEDUMP";
static const char CPU_DELIM[7] = "CPU_END";
#define CMDLINE_DELIM "|"

/* Type of output */
static bool current_format;
static bool format_ascii;
module_param(format_ascii, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(format_ascii, "Dump ascii or raw data");

/* Max size of output */
static uint panic_size = 0x80000;
module_param(panic_size, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(panic_size, "Max dump size during kernel panic (bytes)");

static uint compress_level = 9;
module_param(compress_level, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(compress_level, "Level of compression to use. [0-9]");
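
/*
 * Note added for clarity (not in the original source): since these
 * parameters use S_IRUGO | S_IWUSR, root can change them at runtime,
 * e.g. by writing /sys/module/tracedump/parameters/format_ascii
 * (assuming KBUILD_MODNAME resolves to "tracedump" here); format_ascii
 * is latched into current_format each time the proc file is opened.
 */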

static char out_buf[TD_COMPRESS_CHUNK];
static z_stream stream;
static int compress_done;
static int flush;

static int old_trace_flags;

static struct trace_iterator iter;
static struct pager_s {
	struct trace_array *tr;
	void *spare;
	int cpu;
	int len;
	char __user *ubuf;
} pager;

static char cmdline_buf[16+TASK_COMM_LEN];

static int print_to_console(const char *buf, size_t len)
{
	struct console *con;

	/* Stolen from printk.c */
	for_each_console(con) {
		if ((con->flags & CON_ENABLED) && con->write &&
			(cpu_online(smp_processor_id()) ||
			(con->flags & CON_ANYTIME)))
			con->write(con, buf, len);
	}
	return 0;
}

static int print_to_user(const char *buf, size_t len)
{
	int size;
	size = copy_to_user(pager.ubuf, buf, len);
	if (size > 0) {
		printk(TAG "Failed to copy to user %d bytes\n", size);
		return -EINVAL;
	}
	return 0;
}

static int print(const char *buf, size_t len, int print_to)
{
	if (print_to == TD_PRINT_CONSOLE)
		TRY(print_to_console(buf, len));
	else if (print_to == TD_PRINT_USER)
		TRY(print_to_user(buf, len));
	return 0;
}

/* print_magic will print MAGIC_NUMBER using the
 * print function selected by print_to.
 */
static inline ssize_t print_magic(int print_to)
{
	print(MAGIC_NUMBER, sizeof(MAGIC_NUMBER), print_to);
	return sizeof(MAGIC_NUMBER);
}

static int iter_init(void)
{
	int cpu;

	/* Make iter point to global ring buffer used in trace. */
	trace_init_global_iter(&iter);

	/* Disable tracing */
	for_each_tracing_cpu(cpu) {
		atomic_inc(&iter.tr->data[cpu]->disabled);
	}

	/* Save flags */
	old_trace_flags = trace_flags;

	/* Don't look at memory in panic mode. */
	trace_flags &= ~TRACE_ITER_SYM_USEROBJ;

	/* Prepare ring buffer iter */
	for_each_tracing_cpu(cpu) {
		iter.buffer_iter[cpu] =
			ring_buffer_read_prepare(iter.tr->buffer, cpu);
	}
	ring_buffer_read_prepare_sync();
	for_each_tracing_cpu(cpu) {
		ring_buffer_read_start(iter.buffer_iter[cpu]);
		tracing_iter_reset(&iter, cpu);
	}
	return 0;
}

/* iter_next gets the next entry in the ring buffer, ordered by time.
 * If there are no more entries, returns 0.
 */
static ssize_t iter_next(void)
{
	/* Zero out the iterator's seq */
	memset(&iter.seq, 0,
		sizeof(struct trace_iterator) -
		offsetof(struct trace_iterator, seq));

	while (!trace_empty(&iter)) {
		if (trace_find_next_entry_inc(&iter) == NULL) {
			printk(TAG "trace_find_next_entry failed!\n");
			return -EINVAL;
		}

		/* Copy the ring buffer data to iterator's seq */
		print_trace_line(&iter);
		if (iter.seq.len != 0)
			return iter.seq.len;
	}
	return 0;
}
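
/*
 * Illustrative sketch, not part of the original file: the ascii path is
 * driven as iter_init(), then iter_next() until it returns 0, then
 * iter_deinit(). Error handling elided for brevity:
 *
 *	static int example_walk_entries(void)
 *	{
 *		ssize_t len;
 *
 *		TRY(iter_init());
 *		while ((len = iter_next()) > 0)
 *			print_to_console(iter.seq.buffer, len);
 *		TRY(iter_deinit());
 *		return 0;
 *	}
 */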

static int iter_deinit(void)
{
	int cpu;
	/* Enable tracing */
	for_each_tracing_cpu(cpu) {
		ring_buffer_read_finish(iter.buffer_iter[cpu]);
	}
	for_each_tracing_cpu(cpu) {
		atomic_dec(&iter.tr->data[cpu]->disabled);
	}

	/* Restore flags */
	trace_flags = old_trace_flags;
	return 0;
}

static int pager_init(void)
{
	int cpu;

	/* Need to do this to get a pointer to global_trace (iter.tr).
	   Lame, I know. */
	trace_init_global_iter(&iter);

	/* Turn off tracing */
	for_each_tracing_cpu(cpu) {
		atomic_inc(&iter.tr->data[cpu]->disabled);
	}

	memset(&pager, 0, sizeof(pager));
	pager.tr = iter.tr;
	pager.len = TD_COMPRESS_CHUNK;

	return 0;
}

/* pager_next_cpu moves the pager to the next cpu.
 * Returns 0 if pager is done, else 1.
 */
static ssize_t pager_next_cpu(void)
{
	if (pager.cpu <= CPU_MAX) {
		pager.cpu += 1;
		return 1;
	}

	return 0;
}

/* pager_next gets the next page of data from the ring buffer
 * of the current cpu. Returns page size or 0 if no more data.
 */
static ssize_t pager_next(void)
{
	int ret;

	if (pager.cpu > CPU_MAX)
		return 0;

	if (!pager.spare)
		pager.spare = ring_buffer_alloc_read_page(pager.tr->buffer, pager.cpu);
	if (!pager.spare) {
		printk(TAG "ring_buffer_alloc_read_page failed!\n");
		return -ENOMEM;
	}

	ret = ring_buffer_read_page(pager.tr->buffer,
			&pager.spare,
			pager.len,
			pager.cpu, 0);
	if (ret < 0)
		return 0;

	return PAGE_SIZE;
}

static int pager_deinit(void)
{
	int cpu;
	if (pager.spare != NULL)
		ring_buffer_free_read_page(pager.tr->buffer, pager.spare);

	for_each_tracing_cpu(cpu) {
		atomic_dec(&iter.tr->data[cpu]->disabled);
	}
	return 0;
}
/* cmdline_next gets the next saved cmdline from the trace and
 * puts it in cmdline_buf. Returns the size of the cmdline, or 0 when
 * there are no more cmdlines; the next call after that starts over
 * from the beginning.
 */
static ssize_t cmdline_next(void)
{
	static int pid;
	ssize_t size = 0;

	if (pid >= PID_MAX_DEFAULT)
		pid = -1;

	while (size == 0 && pid < PID_MAX_DEFAULT) {
		pid++;
		trace_find_cmdline(pid, cmdline_buf);
		if (!strncmp(cmdline_buf, "<...>", 5))
			continue;

		sprintf(&cmdline_buf[strlen(cmdline_buf)], " %d"
			CMDLINE_DELIM, pid);
		size = strlen(cmdline_buf);
	}
	return size;
}
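
/*
 * Note added for clarity (not in the original source): each record
 * produced above has the form "<comm> <pid>|", so the cmdline section
 * of a raw dump reads like (names hypothetical):
 *
 *	init 1|kthreadd 2|ksoftirqd/0 3|...
 */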

/* consume_events removes the first 'num' entries from the ring buffer. */
static int consume_events(size_t num)
{
	TRY(iter_init());
	for (; num > 0 && !trace_empty(&iter); num--) {
		trace_find_next_entry_inc(&iter);
		ring_buffer_consume(iter.tr->buffer, iter.cpu, &iter.ts,
			&iter.lost_events);
	}
	TRY(iter_deinit());
	return 0;
}

static int data_init(void)
{
	if (current_format)
		TRY(iter_init());
	else
		TRY(pager_init());
	return 0;
}

/* data_next will figure out the right 'next' function to
 * call and will select the right buffer to pass back
 * to compress_next.
 *
 * iter_next should be used to get data entry-by-entry, ordered
 * by time, which is what we need in order to convert it to ascii.
 *
 * pager_next will return a full page of raw data at a time, one
 * CPU at a time. pager_next_cpu must be called to get the next CPU.
 * cmdline_next will get the next saved cmdline.
 */
static ssize_t data_next(const char **buf)
{
	ssize_t size;

	if (current_format) {
		TRY(size = iter_next());
		*buf = iter.seq.buffer;
	} else {
		TRY(size = pager_next());
		*buf = pager.spare;
		if (size == 0) {
			if (pager_next_cpu()) {
				size = sizeof(CPU_DELIM);
				*buf = CPU_DELIM;
			} else {
				TRY(size = cmdline_next());
				*buf = cmdline_buf;
			}
		}
	}
	return size;
}
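
/*
 * Note added for clarity (not in the original source): in raw mode,
 * repeated data_next() calls yield the stream
 *
 *	[cpu 0 pages...] CPU_END [cpu 1 pages...] CPU_END ... CPU_END
 *	<comm> <pid>|<comm> <pid>|...
 *
 * i.e. a CPU_DELIM terminates each cpu's pages and the saved cmdlines
 * come last.
 */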

static int data_deinit(void)
{
	if (current_format)
		TRY(iter_deinit());
	else
		TRY(pager_deinit());
	return 0;
}

static int compress_init(void)
{
	int workspacesize, ret;

	compress_done = 0;
	flush = Z_NO_FLUSH;
	stream.data_type = current_format ? Z_ASCII : Z_BINARY;
	workspacesize = zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL);
	stream.workspace = vmalloc(workspacesize);
	if (!stream.workspace) {
		printk(TAG "Could not allocate enough memory for zlib!\n");
		return -ENOMEM;
	}
	memset(stream.workspace, 0, workspacesize);

	ret = zlib_deflateInit(&stream, compress_level);
	if (ret != Z_OK) {
		printk(TAG "%s\n", stream.msg);
		return ret;
	}
	stream.avail_in = 0;
	stream.avail_out = 0;
	TRY(data_init());
	return 0;
}

/* compress_next will compress up to min(max_out, TD_COMPRESS_CHUNK) bytes
 * of data into the output buffer. It gets the data by calling data_next.
 * It will return the most data it possibly can. If it returns 0, then
 * there is no more data.
 *
 * Because of the way zlib works, each call to zlib_deflate will possibly
 * consume up to avail_in bytes from next_in, and will fill up to
 * avail_out bytes in next_out. Once flush == Z_FINISH, it cannot take
 * any more input. It will output until it is finished, and will return
 * Z_STREAM_END.
 */
static ssize_t compress_next(size_t max_out)
{
	ssize_t ret;
	max_out = min(max_out, (size_t)TD_COMPRESS_CHUNK);
	stream.next_out = out_buf;
	stream.avail_out = max_out;
	while (stream.avail_out > 0 && !compress_done) {
		if (stream.avail_in == 0 && flush != Z_FINISH) {
			TRY(stream.avail_in =
				data_next((const char **)&stream.next_in));
			flush = (stream.avail_in == 0) ? Z_FINISH : Z_NO_FLUSH;
		}
		if (stream.next_in != NULL) {
			TRYM((ret = zlib_deflate(&stream, flush)),
				"zlib: %s\n", stream.msg);
			compress_done = (ret == Z_STREAM_END);
		}
	}
	ret = max_out - stream.avail_out;
	return ret;
}
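
/*
 * Illustrative sketch, not part of the original file: a consumer drains
 * the compressor by calling compress_next() until it returns 0, which
 * is exactly what tracedump_next()/tracedump_all() below do:
 *
 *	ssize_t n;
 *	while ((n = compress_next(TD_COMPRESS_CHUNK)) > 0)
 *		print(out_buf, n, TD_PRINT_CONSOLE);
 */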

static int compress_deinit(void)
{
	TRY(data_deinit());

	zlib_deflateEnd(&stream);
	vfree(stream.workspace);

	/* TODO: remove */
	printk(TAG "Total in: %ld\n", stream.total_in);
	printk(TAG "Total out: %ld\n", stream.total_out);
	return stream.total_out;
}

static int compress_reset(void)
{
	TRY(compress_deinit());
	TRY(compress_init());
	return 0;
}

/* tracedump_init initializes all tracedump components.
 * Call this before tracedump_next.
 */
int tracedump_init(void)
{
	TRY(compress_init());
	return 0;
}

/* tracedump_next will print up to max_out data from the tracing ring
 * buffers using the print function selected by print_to. The data is
 * compressed using zlib.
 *
 * The output type of the data is specified by the format_ascii module
 * parameter. If format_ascii == 1, human-readable data will be output.
 * Otherwise, it will output raw data from the ring buffer in cpu order,
 * followed by the saved_cmdlines data.
 */
ssize_t tracedump_next(size_t max_out, int print_to)
{
	ssize_t size;
	TRY(size = compress_next(max_out));
	print(out_buf, size, print_to);
	return size;
}

/* tracedump_all will print all data in the tracing ring buffers using
 * the print function selected by print_to. The data is compressed using
 * zlib, and is surrounded by MAGIC_NUMBER.
 *
 * The output type of the data is specified by the format_ascii module
 * parameter. If format_ascii == 1, human-readable data will be output.
 * Otherwise, it will output raw data from the ring buffer in cpu order,
 * followed by the saved_cmdlines data.
 */
ssize_t tracedump_all(int print_to)
{
	ssize_t ret, size = 0;
	TRY(size += print_magic(print_to));

	do {
		/* Here the size used doesn't really matter,
		 * since we're dumping everything. */
		TRY(ret = tracedump_next(0xFFFFFFFF, print_to));
		size += ret;
	} while (ret > 0);

	TRY(size += print_magic(print_to));

	return size;
}
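
/*
 * Note added for clarity (not in the original source): the resulting
 * stream is the 9 bytes "TRACEDUMP", a zlib deflate stream, then
 * "TRACEDUMP" again, with no NUL terminators or length fields; a reader
 * can strip the magic from both ends and inflate the remainder.
 */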

/* tracedump_deinit deinitializes all tracedump components.
 * This must be called, even on error.
 */
int tracedump_deinit(void)
{
	TRY(compress_deinit());
	return 0;
}

/* tracedump_reset reinitializes all tracedump components. */
int tracedump_reset(void)
{
	TRY(compress_reset());
	return 0;
}

/* tracedump_open opens the tracedump file for reading. */
static int tracedump_open(struct inode *inode, struct file *file)
{
	int ret;
	mutex_lock(&tracedump_proc_lock);
	current_format = format_ascii;
	ret = tracedump_init();
	if (ret < 0)
		goto err;

	ret = nonseekable_open(inode, file);
	if (ret < 0)
		goto err;
	return ret;

err:
	mutex_unlock(&tracedump_proc_lock);
	return ret;
}

/* tracedump_read reads data from tracedump_next and prints
 * it to userspace. It will surround the data with MAGIC_NUMBER.
 */
static ssize_t tracedump_read(struct file *file, char __user *buf,
		size_t len, loff_t *offset)
{
	static int done;
	ssize_t size = 0;

	pager.ubuf = buf;

	if (*offset == 0) {
		done = 0;
		TRY(size = print_magic(TD_PRINT_USER));
	} else if (!done) {
		TRY(size = tracedump_next(len, TD_PRINT_USER));
		if (size == 0) {
			TRY(size = print_magic(TD_PRINT_USER));
			done = 1;
		}
	}

	*offset += size;

	return size;
}

static int tracedump_release(struct inode *inode, struct file *file)
{
	int ret;
	ret = tracedump_deinit();
	mutex_unlock(&tracedump_proc_lock);
	return ret;
}

/* tracedump_dump dumps all tracing data from the tracing ring buffers
 * to all consoles. For details about the output format, see
 * tracedump_all.
 *
 * At most max_out bytes are dumped. To accomplish this,
 * tracedump_dump calls tracedump_all several times without writing the data,
 * each time tossing out old data until it reaches its goal.
 *
 * Note: dumping raw pages currently does NOT follow the size limit.
 */
int tracedump_dump(size_t max_out)
{
	ssize_t size;
	size_t consume;

	printk(TAG "\n");

	tracedump_init();

	if (format_ascii) {
		size = tracedump_all(TD_NO_PRINT);
		if (size < 0) {
			printk(TAG "failed to dump\n");
			goto out;
		}
		while (size > max_out) {
			TRY(tracedump_deinit());
			/* Events take more or less 60 ascii bytes each,
			 * not counting compression */
			consume = TD_MIN_CONSUME + (size - max_out) /
				(60 / (compress_level + 1));
			TRY(consume_events(consume));
			TRY(tracedump_init());
			size = tracedump_all(TD_NO_PRINT);
			if (size < 0) {
				printk(TAG "failed to dump\n");
				goto out;
			}
		}

		TRY(tracedump_reset());
	}
	size = tracedump_all(TD_PRINT_CONSOLE);
	if (size < 0) {
		printk(TAG "failed to dump\n");
		goto out;
	}

out:
	tracedump_deinit();
	printk(KERN_INFO "\n" TAG " end\n");
	return size;
}
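
/*
 * Worked example of the heuristic above (not in the original source):
 * with the default compress_level of 9, an event is assumed to cost
 * roughly 60 / (9 + 1) = 6 bytes. If a compressed dump comes out at
 * 0x100000 bytes against a 0x80000-byte limit, the retry consumes
 * TD_MIN_CONSUME + 0x80000 / 6, or about 89,400 events, before
 * dumping again.
 */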

static const struct file_operations tracedump_fops = {
	.owner = THIS_MODULE,
	.open = tracedump_open,
	.read = tracedump_read,
	.release = tracedump_release,
};

#ifdef CONFIG_TRACEDUMP_PANIC
static int tracedump_panic_handler(struct notifier_block *this,
		unsigned long event, void *unused)
{
	tracedump_dump(panic_size);
	return 0;
}

static struct notifier_block tracedump_panic_notifier = {
	.notifier_call = tracedump_panic_handler,
	.next = NULL,
	.priority = 150 /* priority: INT_MAX >= x >= 0 */
};
#endif

static int __init tracedump_initcall(void)
{
#ifdef CONFIG_TRACEDUMP_PROCFS
	struct proc_dir_entry *entry;

	/* Create a procfs file for easy dumping */
	entry = create_proc_entry("tracedump", S_IFREG | S_IRUGO, NULL);
	if (!entry)
		printk(TAG "failed to create proc entry\n");
	else
		entry->proc_fops = &tracedump_fops;
#endif

#ifdef CONFIG_TRACEDUMP_PANIC
	/* Automatically dump to console on a kernel panic */
	atomic_notifier_chain_register(&panic_notifier_list,
			&tracedump_panic_notifier);
#endif
	return 0;
}

early_initcall(tracedump_initcall);