aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2018-04-10 14:27:30 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2018-04-10 14:27:30 -0400
commit2a56bb596b2c1fb612f9988afda9655c8c872a6e (patch)
tree8f76cd7a0d4f5a46e00d45e5605e161d4e16b81e
parent9f3a0941fb5efaa4d27911e251dc595034d58baa (diff)
parentb0dc52f15e7fe2b973ecfe4f3706f1b35ce3943a (diff)
Merge tag 'trace-v4.17' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-trace
Pull tracing updates from Steven Rostedt: "New features: - Tom Zanussi's extended histogram work. This adds the synthetic events to have histograms from multiple event data Adds triggers "onmatch" and "onmax" to call the synthetic events Several updates to the histogram code from this - Allow way to nest ring buffer calls in the same context - Allow absolute time stamps in ring buffer - Rewrite of filter code parsing based on Al Viro's suggestions - Setting of trace_clock to global if TSC is unstable (on boot) - Better OOM handling when allocating large ring buffers - Added initcall tracepoints (consolidated initcall_debug code with them) And other various fixes and clean ups" * tag 'trace-v4.17' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-trace: (68 commits) init: Have initcall_debug still work without CONFIG_TRACEPOINTS init, tracing: Have printk come through the trace events for initcall_debug init, tracing: instrument security and console initcall trace events init, tracing: Add initcall trace events tracing: Add rcu dereference annotation for test func that touches filter->prog tracing: Add rcu dereference annotation for filter->prog tracing: Fixup logic inversion on setting trace_global_clock defaults tracing: Hide global trace clock from lockdep ring-buffer: Add set/clear_current_oom_origin() during allocations ring-buffer: Check if memory is available before allocation lockdep: Add print_irqtrace_events() to __warn vsprintf: Do not preprocess non-dereferenced pointers for bprintf (%px and %pK) tracing: Uninitialized variable in create_tracing_map_fields() tracing: Make sure variable string fields are NULL-terminated tracing: Add action comparisons when testing matching hist triggers tracing: Don't add flag strings when displaying variable references tracing: Fix display of hist trigger expressions containing timestamps ftrace: Drop a VLA in module_exists() tracing: Mention trace_clock=global when warning about unstable clocks tracing: 
Default to using trace_global_clock if sched_clock is unstable ...
-rw-r--r--Documentation/trace/events.rst1548
-rw-r--r--Documentation/trace/ftrace.rst24
-rw-r--r--Documentation/trace/histogram.txt1995
-rw-r--r--include/linux/ring_buffer.h17
-rw-r--r--include/linux/trace_events.h14
-rw-r--r--include/trace/events/initcall.h66
-rw-r--r--init/main.c84
-rw-r--r--kernel/panic.c2
-rw-r--r--kernel/printk/printk.c7
-rw-r--r--kernel/trace/Kconfig5
-rw-r--r--kernel/trace/ftrace.c7
-rw-r--r--kernel/trace/ring_buffer.c226
-rw-r--r--kernel/trace/trace.c116
-rw-r--r--kernel/trace/trace.h33
-rw-r--r--kernel/trace/trace_clock.c4
-rw-r--r--kernel/trace/trace_events_filter.c2367
-rw-r--r--kernel/trace/trace_events_hist.c4450
-rw-r--r--kernel/trace/trace_events_trigger.c53
-rw-r--r--kernel/trace/tracing_map.c232
-rw-r--r--kernel/trace/tracing_map.h18
-rw-r--r--lib/vsprintf.c4
-rw-r--r--security/security.c8
-rw-r--r--tools/testing/selftests/ftrace/test.d/functions7
-rw-r--r--tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-extended-error-support.tc39
-rw-r--r--tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-field-variable-support.tc54
-rw-r--r--tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-inter-event-combined-hist.tc58
-rw-r--r--tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-action-hist.tc50
-rw-r--r--tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-onmax-action-hist.tc50
-rw-r--r--tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmax-action-hist.tc48
-rw-r--r--tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-createremove.tc54
30 files changed, 8372 insertions, 3268 deletions
diff --git a/Documentation/trace/events.rst b/Documentation/trace/events.rst
index bdf1963ba6ba..a5ea2cb0082b 100644
--- a/Documentation/trace/events.rst
+++ b/Documentation/trace/events.rst
@@ -520,1550 +520,4 @@ The following commands are supported:
520 totals derived from one or more trace event format fields and/or 520 totals derived from one or more trace event format fields and/or
521 event counts (hitcount). 521 event counts (hitcount).
522 522
523 The format of a hist trigger is as follows:: 523 See Documentation/trace/histogram.txt for details and examples.
524
525 hist:keys=<field1[,field2,...]>[:values=<field1[,field2,...]>]
526 [:sort=<field1[,field2,...]>][:size=#entries][:pause][:continue]
527 [:clear][:name=histname1] [if <filter>]
528
529 When a matching event is hit, an entry is added to a hash table
530 using the key(s) and value(s) named. Keys and values correspond to
531 fields in the event's format description. Values must correspond to
532 numeric fields - on an event hit, the value(s) will be added to a
533 sum kept for that field. The special string 'hitcount' can be used
534 in place of an explicit value field - this is simply a count of
535 event hits. If 'values' isn't specified, an implicit 'hitcount'
536 value will be automatically created and used as the only value.
537 Keys can be any field, or the special string 'stacktrace', which
538 will use the event's kernel stacktrace as the key. The keywords
539 'keys' or 'key' can be used to specify keys, and the keywords
540 'values', 'vals', or 'val' can be used to specify values. Compound
541 keys consisting of up to two fields can be specified by the 'keys'
542 keyword. Hashing a compound key produces a unique entry in the
543 table for each unique combination of component keys, and can be
544 useful for providing more fine-grained summaries of event data.
545 Additionally, sort keys consisting of up to two fields can be
546 specified by the 'sort' keyword. If more than one field is
547 specified, the result will be a 'sort within a sort': the first key
548 is taken to be the primary sort key and the second the secondary
549 key. If a hist trigger is given a name using the 'name' parameter,
550 its histogram data will be shared with other triggers of the same
551 name, and trigger hits will update this common data. Only triggers
552 with 'compatible' fields can be combined in this way; triggers are
553 'compatible' if the fields named in the trigger share the same
554 number and type of fields and those fields also have the same names.
555 Note that any two events always share the compatible 'hitcount' and
556 'stacktrace' fields and can therefore be combined using those
557 fields, however pointless that may be.
558
559 'hist' triggers add a 'hist' file to each event's subdirectory.
560 Reading the 'hist' file for the event will dump the hash table in
561 its entirety to stdout. If there are multiple hist triggers
562 attached to an event, there will be a table for each trigger in the
563 output. The table displayed for a named trigger will be the same as
564 any other instance having the same name. Each printed hash table
565 entry is a simple list of the keys and values comprising the entry;
566 keys are printed first and are delineated by curly braces, and are
567 followed by the set of value fields for the entry. By default,
568 numeric fields are displayed as base-10 integers. This can be
569 modified by appending any of the following modifiers to the field
570 name:
571
572 - .hex display a number as a hex value
573 - .sym display an address as a symbol
574 - .sym-offset display an address as a symbol and offset
575 - .syscall display a syscall id as a system call name
576 - .execname display a common_pid as a program name
577
578 Note that in general the semantics of a given field aren't
579 interpreted when applying a modifier to it, but there are some
580 restrictions to be aware of in this regard:
581
582 - only the 'hex' modifier can be used for values (because values
583 are essentially sums, and the other modifiers don't make sense
584 in that context).
585 - the 'execname' modifier can only be used on a 'common_pid'. The
586 reason for this is that the execname is simply the 'comm' value
587 saved for the 'current' process when an event was triggered,
588 which is the same as the common_pid value saved by the event
589 tracing code. Trying to apply that comm value to other pid
590 values wouldn't be correct, and typically events that care save
591 pid-specific comm fields in the event itself.
592
593 A typical usage scenario would be the following to enable a hist
594 trigger, read its current contents, and then turn it off::
595
596 # echo 'hist:keys=skbaddr.hex:vals=len' > \
597 /sys/kernel/debug/tracing/events/net/netif_rx/trigger
598
599 # cat /sys/kernel/debug/tracing/events/net/netif_rx/hist
600
601 # echo '!hist:keys=skbaddr.hex:vals=len' > \
602 /sys/kernel/debug/tracing/events/net/netif_rx/trigger
603
604 The trigger file itself can be read to show the details of the
605 currently attached hist trigger. This information is also displayed
606 at the top of the 'hist' file when read.
607
608 By default, the size of the hash table is 2048 entries. The 'size'
609 parameter can be used to specify more or fewer than that. The units
610 are in terms of hashtable entries - if a run uses more entries than
611 specified, the results will show the number of 'drops', the number
612 of hits that were ignored. The size should be a power of 2 between
613 128 and 131072 (any non-power-of-2 number specified will be rounded
614 up).
615
616 The 'sort' parameter can be used to specify a value field to sort
617 on. The default if unspecified is 'hitcount' and the default sort
618 order is 'ascending'. To sort in the opposite direction, append
619 '.descending' to the sort key.
620
621 The 'pause' parameter can be used to pause an existing hist trigger
622 or to start a hist trigger but not log any events until told to do
623 so. 'continue' or 'cont' can be used to start or restart a paused
624 hist trigger.
625
626 The 'clear' parameter will clear the contents of a running hist
627 trigger and leave its current paused/active state.
628
629 Note that the 'pause', 'cont', and 'clear' parameters should be
630 applied using 'append' shell operator ('>>') if applied to an
631 existing trigger, rather than via the '>' operator, which will cause
632 the trigger to be removed through truncation.
633
634- enable_hist/disable_hist
635
636 The enable_hist and disable_hist triggers can be used to have one
637 event conditionally start and stop another event's already-attached
638 hist trigger. Any number of enable_hist and disable_hist triggers
639 can be attached to a given event, allowing that event to kick off
640 and stop aggregations on a host of other events.
641
642 The format is very similar to the enable/disable_event triggers::
643
644 enable_hist:<system>:<event>[:count]
645 disable_hist:<system>:<event>[:count]
646
647 Instead of enabling or disabling the tracing of the target event
648 into the trace buffer as the enable/disable_event triggers do, the
649 enable/disable_hist triggers enable or disable the aggregation of
650 the target event into a hash table.
651
652 A typical usage scenario for the enable_hist/disable_hist triggers
653 would be to first set up a paused hist trigger on some event,
654 followed by an enable_hist/disable_hist pair that turns the hist
655 aggregation on and off when conditions of interest are hit::
656
657 # echo 'hist:keys=skbaddr.hex:vals=len:pause' > \
658 /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
659
660 # echo 'enable_hist:net:netif_receive_skb if filename==/usr/bin/wget' > \
661 /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
662
663 # echo 'disable_hist:net:netif_receive_skb if comm==wget' > \
664 /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
665
666 The above sets up an initially paused hist trigger which is unpaused
667 and starts aggregating events when a given program is executed, and
668 which stops aggregating when the process exits and the hist trigger
669 is paused again.
670
671 The examples below provide a more concrete illustration of the
672 concepts and typical usage patterns discussed above.
673
674
6756.2 'hist' trigger examples
676---------------------------
677
678 The first set of examples creates aggregations using the kmalloc
679 event. The fields that can be used for the hist trigger are listed
680 in the kmalloc event's format file::
681
682 # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/format
683 name: kmalloc
684 ID: 374
685 format:
686 field:unsigned short common_type; offset:0; size:2; signed:0;
687 field:unsigned char common_flags; offset:2; size:1; signed:0;
688 field:unsigned char common_preempt_count; offset:3; size:1; signed:0;
689 field:int common_pid; offset:4; size:4; signed:1;
690
691 field:unsigned long call_site; offset:8; size:8; signed:0;
692 field:const void * ptr; offset:16; size:8; signed:0;
693 field:size_t bytes_req; offset:24; size:8; signed:0;
694 field:size_t bytes_alloc; offset:32; size:8; signed:0;
695 field:gfp_t gfp_flags; offset:40; size:4; signed:0;
696
697 We'll start by creating a hist trigger that generates a simple table
698 that lists the total number of bytes requested for each function in
699 the kernel that made one or more calls to kmalloc::
700
701 # echo 'hist:key=call_site:val=bytes_req' > \
702 /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
703
704 This tells the tracing system to create a 'hist' trigger using the
705 call_site field of the kmalloc event as the key for the table, which
706 just means that each unique call_site address will have an entry
707 created for it in the table. The 'val=bytes_req' parameter tells
708 the hist trigger that for each unique entry (call_site) in the
709 table, it should keep a running total of the number of bytes
710 requested by that call_site.
711
712 We'll let it run for a while and then dump the contents of the 'hist'
713 file in the kmalloc event's subdirectory (for readability, a number
714 of entries have been omitted)::
715
716 # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
717 # trigger info: hist:keys=call_site:vals=bytes_req:sort=hitcount:size=2048 [active]
718
719 { call_site: 18446744072106379007 } hitcount: 1 bytes_req: 176
720 { call_site: 18446744071579557049 } hitcount: 1 bytes_req: 1024
721 { call_site: 18446744071580608289 } hitcount: 1 bytes_req: 16384
722 { call_site: 18446744071581827654 } hitcount: 1 bytes_req: 24
723 { call_site: 18446744071580700980 } hitcount: 1 bytes_req: 8
724 { call_site: 18446744071579359876 } hitcount: 1 bytes_req: 152
725 { call_site: 18446744071580795365 } hitcount: 3 bytes_req: 144
726 { call_site: 18446744071581303129 } hitcount: 3 bytes_req: 144
727 { call_site: 18446744071580713234 } hitcount: 4 bytes_req: 2560
728 { call_site: 18446744071580933750 } hitcount: 4 bytes_req: 736
729 .
730 .
731 .
732 { call_site: 18446744072106047046 } hitcount: 69 bytes_req: 5576
733 { call_site: 18446744071582116407 } hitcount: 73 bytes_req: 2336
734 { call_site: 18446744072106054684 } hitcount: 136 bytes_req: 140504
735 { call_site: 18446744072106224230 } hitcount: 136 bytes_req: 19584
736 { call_site: 18446744072106078074 } hitcount: 153 bytes_req: 2448
737 { call_site: 18446744072106062406 } hitcount: 153 bytes_req: 36720
738 { call_site: 18446744071582507929 } hitcount: 153 bytes_req: 37088
739 { call_site: 18446744072102520590 } hitcount: 273 bytes_req: 10920
740 { call_site: 18446744071582143559 } hitcount: 358 bytes_req: 716
741 { call_site: 18446744072106465852 } hitcount: 417 bytes_req: 56712
742 { call_site: 18446744072102523378 } hitcount: 485 bytes_req: 27160
743 { call_site: 18446744072099568646 } hitcount: 1676 bytes_req: 33520
744
745 Totals:
746 Hits: 4610
747 Entries: 45
748 Dropped: 0
749
750 The output displays a line for each entry, beginning with the key
751 specified in the trigger, followed by the value(s) also specified in
752 the trigger. At the beginning of the output is a line that displays
753 the trigger info, which can also be displayed by reading the
754 'trigger' file::
755
756 # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
757 hist:keys=call_site:vals=bytes_req:sort=hitcount:size=2048 [active]
758
759 At the end of the output are a few lines that display the overall
760 totals for the run. The 'Hits' field shows the total number of
761 times the event trigger was hit, the 'Entries' field shows the total
762 number of used entries in the hash table, and the 'Dropped' field
763 shows the number of hits that were dropped because the number of
764 used entries for the run exceeded the maximum number of entries
765 allowed for the table (normally 0, but if not a hint that you may
766 want to increase the size of the table using the 'size' parameter).
767
768 Notice in the above output that there's an extra field, 'hitcount',
769 which wasn't specified in the trigger. Also notice that in the
770 trigger info output, there's a parameter, 'sort=hitcount', which
771 wasn't specified in the trigger either. The reason for that is that
772 every trigger implicitly keeps a count of the total number of hits
773 attributed to a given entry, called the 'hitcount'. That hitcount
774 information is explicitly displayed in the output, and in the
775 absence of a user-specified sort parameter, is used as the default
776 sort field.
777
778 The value 'hitcount' can be used in place of an explicit value in
779 the 'values' parameter if you don't really need to have any
780 particular field summed and are mainly interested in hit
781 frequencies.
782
783 To turn the hist trigger off, simply call up the trigger in the
784 command history and re-execute it with a '!' prepended::
785
786 # echo '!hist:key=call_site:val=bytes_req' > \
787 /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
788
789 Finally, notice that the call_site as displayed in the output above
790 isn't really very useful. It's an address, but normally addresses
791 are displayed in hex. To have a numeric field displayed as a hex
792 value, simply append '.hex' to the field name in the trigger::
793
794 # echo 'hist:key=call_site.hex:val=bytes_req' > \
795 /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
796
797 # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
798 # trigger info: hist:keys=call_site.hex:vals=bytes_req:sort=hitcount:size=2048 [active]
799
800 { call_site: ffffffffa026b291 } hitcount: 1 bytes_req: 433
801 { call_site: ffffffffa07186ff } hitcount: 1 bytes_req: 176
802 { call_site: ffffffff811ae721 } hitcount: 1 bytes_req: 16384
803 { call_site: ffffffff811c5134 } hitcount: 1 bytes_req: 8
804 { call_site: ffffffffa04a9ebb } hitcount: 1 bytes_req: 511
805 { call_site: ffffffff8122e0a6 } hitcount: 1 bytes_req: 12
806 { call_site: ffffffff8107da84 } hitcount: 1 bytes_req: 152
807 { call_site: ffffffff812d8246 } hitcount: 1 bytes_req: 24
808 { call_site: ffffffff811dc1e5 } hitcount: 3 bytes_req: 144
809 { call_site: ffffffffa02515e8 } hitcount: 3 bytes_req: 648
810 { call_site: ffffffff81258159 } hitcount: 3 bytes_req: 144
811 { call_site: ffffffff811c80f4 } hitcount: 4 bytes_req: 544
812 .
813 .
814 .
815 { call_site: ffffffffa06c7646 } hitcount: 106 bytes_req: 8024
816 { call_site: ffffffffa06cb246 } hitcount: 132 bytes_req: 31680
817 { call_site: ffffffffa06cef7a } hitcount: 132 bytes_req: 2112
818 { call_site: ffffffff8137e399 } hitcount: 132 bytes_req: 23232
819 { call_site: ffffffffa06c941c } hitcount: 185 bytes_req: 171360
820 { call_site: ffffffffa06f2a66 } hitcount: 185 bytes_req: 26640
821 { call_site: ffffffffa036a70e } hitcount: 265 bytes_req: 10600
822 { call_site: ffffffff81325447 } hitcount: 292 bytes_req: 584
823 { call_site: ffffffffa072da3c } hitcount: 446 bytes_req: 60656
824 { call_site: ffffffffa036b1f2 } hitcount: 526 bytes_req: 29456
825 { call_site: ffffffffa0099c06 } hitcount: 1780 bytes_req: 35600
826
827 Totals:
828 Hits: 4775
829 Entries: 46
830 Dropped: 0
831
832 Even that's only marginally more useful - while hex values do look
833 more like addresses, what users are typically more interested in
834 when looking at text addresses are the corresponding symbols
835 instead. To have an address displayed as symbolic value instead,
836 simply append '.sym' or '.sym-offset' to the field name in the
837 trigger::
838
839 # echo 'hist:key=call_site.sym:val=bytes_req' > \
840 /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
841
842 # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
843 # trigger info: hist:keys=call_site.sym:vals=bytes_req:sort=hitcount:size=2048 [active]
844
845 { call_site: [ffffffff810adcb9] syslog_print_all } hitcount: 1 bytes_req: 1024
846 { call_site: [ffffffff8154bc62] usb_control_msg } hitcount: 1 bytes_req: 8
847 { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid] } hitcount: 1 bytes_req: 7
848 { call_site: [ffffffff8154acbe] usb_alloc_urb } hitcount: 1 bytes_req: 192
849 { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid] } hitcount: 1 bytes_req: 7
850 { call_site: [ffffffff811e3a25] __seq_open_private } hitcount: 1 bytes_req: 40
851 { call_site: [ffffffff8109524a] alloc_fair_sched_group } hitcount: 2 bytes_req: 128
852 { call_site: [ffffffff811febd5] fsnotify_alloc_group } hitcount: 2 bytes_req: 528
853 { call_site: [ffffffff81440f58] __tty_buffer_request_room } hitcount: 2 bytes_req: 2624
854 { call_site: [ffffffff81200ba6] inotify_new_group } hitcount: 2 bytes_req: 96
855 { call_site: [ffffffffa05e19af] ieee80211_start_tx_ba_session [mac80211] } hitcount: 2 bytes_req: 464
856 { call_site: [ffffffff81672406] tcp_get_metrics } hitcount: 2 bytes_req: 304
857 { call_site: [ffffffff81097ec2] alloc_rt_sched_group } hitcount: 2 bytes_req: 128
858 { call_site: [ffffffff81089b05] sched_create_group } hitcount: 2 bytes_req: 1424
859 .
860 .
861 .
862 { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915] } hitcount: 1185 bytes_req: 123240
863 { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl [drm] } hitcount: 1185 bytes_req: 104280
864 { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915] } hitcount: 1402 bytes_req: 190672
865 { call_site: [ffffffff812891ca] ext4_find_extent } hitcount: 1518 bytes_req: 146208
866 { call_site: [ffffffffa029070e] drm_vma_node_allow [drm] } hitcount: 1746 bytes_req: 69840
867 { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 2021 bytes_req: 792312
868 { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm] } hitcount: 2592 bytes_req: 145152
869 { call_site: [ffffffffa0489a66] intel_ring_begin [i915] } hitcount: 2629 bytes_req: 378576
870 { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915] } hitcount: 2629 bytes_req: 3783248
871 { call_site: [ffffffff81325607] apparmor_file_alloc_security } hitcount: 5192 bytes_req: 10384
872 { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid] } hitcount: 5529 bytes_req: 110584
873 { call_site: [ffffffff8131ebf7] aa_alloc_task_context } hitcount: 21943 bytes_req: 702176
874 { call_site: [ffffffff8125847d] ext4_htree_store_dirent } hitcount: 55759 bytes_req: 5074265
875
876 Totals:
877 Hits: 109928
878 Entries: 71
879 Dropped: 0
880
881 Because the default sort key above is 'hitcount', the above shows
882 the list of call_sites by increasing hitcount, so that at the bottom
883 we see the functions that made the most kmalloc calls during the
884 run. If instead we wanted to see the top kmalloc callers in
885 terms of the number of bytes requested rather than the number of
886 calls, and we wanted the top caller to appear at the top, we can use
887 the 'sort' parameter, along with the 'descending' modifier::
888
889 # echo 'hist:key=call_site.sym:val=bytes_req:sort=bytes_req.descending' > \
890 /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
891
892 # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
893 # trigger info: hist:keys=call_site.sym:vals=bytes_req:sort=bytes_req.descending:size=2048 [active]
894
895 { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915] } hitcount: 2186 bytes_req: 3397464
896 { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 1790 bytes_req: 712176
897 { call_site: [ffffffff8125847d] ext4_htree_store_dirent } hitcount: 8132 bytes_req: 513135
898 { call_site: [ffffffff811e2a1b] seq_buf_alloc } hitcount: 106 bytes_req: 440128
899 { call_site: [ffffffffa0489a66] intel_ring_begin [i915] } hitcount: 2186 bytes_req: 314784
900 { call_site: [ffffffff812891ca] ext4_find_extent } hitcount: 2174 bytes_req: 208992
901 { call_site: [ffffffff811ae8e1] __kmalloc } hitcount: 8 bytes_req: 131072
902 { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915] } hitcount: 859 bytes_req: 116824
903 { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm] } hitcount: 1834 bytes_req: 102704
904 { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915] } hitcount: 972 bytes_req: 101088
905 { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl [drm] } hitcount: 972 bytes_req: 85536
906 { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid] } hitcount: 3333 bytes_req: 66664
907 { call_site: [ffffffff8137e559] sg_kmalloc } hitcount: 209 bytes_req: 61632
908 .
909 .
910 .
911 { call_site: [ffffffff81095225] alloc_fair_sched_group } hitcount: 2 bytes_req: 128
912 { call_site: [ffffffff81097ec2] alloc_rt_sched_group } hitcount: 2 bytes_req: 128
913 { call_site: [ffffffff812d8406] copy_semundo } hitcount: 2 bytes_req: 48
914 { call_site: [ffffffff81200ba6] inotify_new_group } hitcount: 1 bytes_req: 48
915 { call_site: [ffffffffa027121a] drm_getmagic [drm] } hitcount: 1 bytes_req: 48
916 { call_site: [ffffffff811e3a25] __seq_open_private } hitcount: 1 bytes_req: 40
917 { call_site: [ffffffff811c52f4] bprm_change_interp } hitcount: 2 bytes_req: 16
918 { call_site: [ffffffff8154bc62] usb_control_msg } hitcount: 1 bytes_req: 8
919 { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid] } hitcount: 1 bytes_req: 7
920 { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid] } hitcount: 1 bytes_req: 7
921
922 Totals:
923 Hits: 32133
924 Entries: 81
925 Dropped: 0
926
927 To display the offset and size information in addition to the symbol
928 name, just use 'sym-offset' instead::
929
930 # echo 'hist:key=call_site.sym-offset:val=bytes_req:sort=bytes_req.descending' > \
931 /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
932
933 # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
934 # trigger info: hist:keys=call_site.sym-offset:vals=bytes_req:sort=bytes_req.descending:size=2048 [active]
935
936 { call_site: [ffffffffa046041c] i915_gem_execbuffer2+0x6c/0x2c0 [i915] } hitcount: 4569 bytes_req: 3163720
937 { call_site: [ffffffffa0489a66] intel_ring_begin+0xc6/0x1f0 [i915] } hitcount: 4569 bytes_req: 657936
938 { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23+0x694/0x1020 [i915] } hitcount: 1519 bytes_req: 472936
939 { call_site: [ffffffffa045e646] i915_gem_do_execbuffer.isra.23+0x516/0x1020 [i915] } hitcount: 3050 bytes_req: 211832
940 { call_site: [ffffffff811e2a1b] seq_buf_alloc+0x1b/0x50 } hitcount: 34 bytes_req: 148384
941 { call_site: [ffffffffa04a580c] intel_crtc_page_flip+0xbc/0x870 [i915] } hitcount: 1385 bytes_req: 144040
942 { call_site: [ffffffff811ae8e1] __kmalloc+0x191/0x1b0 } hitcount: 8 bytes_req: 131072
943 { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl+0x282/0x360 [drm] } hitcount: 1385 bytes_req: 121880
944 { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc+0x32/0x100 [drm] } hitcount: 1848 bytes_req: 103488
945 { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state+0x2c/0xa0 [i915] } hitcount: 461 bytes_req: 62696
946 { call_site: [ffffffffa029070e] drm_vma_node_allow+0x2e/0xd0 [drm] } hitcount: 1541 bytes_req: 61640
947 { call_site: [ffffffff815f8d7b] sk_prot_alloc+0xcb/0x1b0 } hitcount: 57 bytes_req: 57456
948 .
949 .
950 .
951 { call_site: [ffffffff8109524a] alloc_fair_sched_group+0x5a/0x1a0 } hitcount: 2 bytes_req: 128
952 { call_site: [ffffffffa027b921] drm_vm_open_locked+0x31/0xa0 [drm] } hitcount: 3 bytes_req: 96
953 { call_site: [ffffffff8122e266] proc_self_follow_link+0x76/0xb0 } hitcount: 8 bytes_req: 96
954 { call_site: [ffffffff81213e80] load_elf_binary+0x240/0x1650 } hitcount: 3 bytes_req: 84
955 { call_site: [ffffffff8154bc62] usb_control_msg+0x42/0x110 } hitcount: 1 bytes_req: 8
956 { call_site: [ffffffffa00bf6fe] hidraw_send_report+0x7e/0x1a0 [hid] } hitcount: 1 bytes_req: 7
957 { call_site: [ffffffffa00bf1ca] hidraw_report_event+0x8a/0x120 [hid] } hitcount: 1 bytes_req: 7
958
959 Totals:
960 Hits: 26098
961 Entries: 64
962 Dropped: 0
963
964 We can also add multiple fields to the 'values' parameter. For
965 example, we might want to see the total number of bytes allocated
966 alongside bytes requested, and display the result sorted by bytes
967 allocated in a descending order::
968
969 # echo 'hist:keys=call_site.sym:values=bytes_req,bytes_alloc:sort=bytes_alloc.descending' > \
970 /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
971
972 # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
973 # trigger info: hist:keys=call_site.sym:vals=bytes_req,bytes_alloc:sort=bytes_alloc.descending:size=2048 [active]
974
975 { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915] } hitcount: 7403 bytes_req: 4084360 bytes_alloc: 5958016
976 { call_site: [ffffffff811e2a1b] seq_buf_alloc } hitcount: 541 bytes_req: 2213968 bytes_alloc: 2228224
977 { call_site: [ffffffffa0489a66] intel_ring_begin [i915] } hitcount: 7404 bytes_req: 1066176 bytes_alloc: 1421568
978 { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 1565 bytes_req: 557368 bytes_alloc: 1037760
979 { call_site: [ffffffff8125847d] ext4_htree_store_dirent } hitcount: 9557 bytes_req: 595778 bytes_alloc: 695744
980 { call_site: [ffffffffa045e646] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 5839 bytes_req: 430680 bytes_alloc: 470400
981 { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915] } hitcount: 2388 bytes_req: 324768 bytes_alloc: 458496
982 { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm] } hitcount: 3911 bytes_req: 219016 bytes_alloc: 250304
983 { call_site: [ffffffff815f8d7b] sk_prot_alloc } hitcount: 235 bytes_req: 236880 bytes_alloc: 240640
984 { call_site: [ffffffff8137e559] sg_kmalloc } hitcount: 557 bytes_req: 169024 bytes_alloc: 221760
985 { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid] } hitcount: 9378 bytes_req: 187548 bytes_alloc: 206312
986 { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915] } hitcount: 1519 bytes_req: 157976 bytes_alloc: 194432
987 .
988 .
989 .
990 { call_site: [ffffffff8109bd3b] sched_autogroup_create_attach } hitcount: 2 bytes_req: 144 bytes_alloc: 192
991 { call_site: [ffffffff81097ee8] alloc_rt_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
992 { call_site: [ffffffff8109524a] alloc_fair_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
993 { call_site: [ffffffff81095225] alloc_fair_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
994 { call_site: [ffffffff81097ec2] alloc_rt_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
995 { call_site: [ffffffff81213e80] load_elf_binary } hitcount: 3 bytes_req: 84 bytes_alloc: 96
996 { call_site: [ffffffff81079a2e] kthread_create_on_node } hitcount: 1 bytes_req: 56 bytes_alloc: 64
997 { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid] } hitcount: 1 bytes_req: 7 bytes_alloc: 8
998 { call_site: [ffffffff8154bc62] usb_control_msg } hitcount: 1 bytes_req: 8 bytes_alloc: 8
999 { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid] } hitcount: 1 bytes_req: 7 bytes_alloc: 8
1000
1001 Totals:
1002 Hits: 66598
1003 Entries: 65
1004 Dropped: 0
1005
1006 Finally, to finish off our kmalloc example, instead of simply having
1007 the hist trigger display symbolic call_sites, we can have the hist
1008 trigger additionally display the complete set of kernel stack traces
1009 that led to each call_site. To do that, we simply use the special
1010 value 'stacktrace' for the key parameter::
1011
1012 # echo 'hist:keys=stacktrace:values=bytes_req,bytes_alloc:sort=bytes_alloc' > \
1013 /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
1014
1015 The above trigger will use the kernel stack trace in effect when an
1016 event is triggered as the key for the hash table. This allows the
1017 enumeration of every kernel callpath that led up to a particular
1018 event, along with a running total of any of the event fields for
1019 that event. Here we tally bytes requested and bytes allocated for
1020 every callpath in the system that led up to a kmalloc (in this case
1021 every callpath to a kmalloc for a kernel compile)::
1022
1023 # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
1024 # trigger info: hist:keys=stacktrace:vals=bytes_req,bytes_alloc:sort=bytes_alloc:size=2048 [active]
1025
1026 { stacktrace:
1027 __kmalloc_track_caller+0x10b/0x1a0
1028 kmemdup+0x20/0x50
1029 hidraw_report_event+0x8a/0x120 [hid]
1030 hid_report_raw_event+0x3ea/0x440 [hid]
1031 hid_input_report+0x112/0x190 [hid]
1032 hid_irq_in+0xc2/0x260 [usbhid]
1033 __usb_hcd_giveback_urb+0x72/0x120
1034 usb_giveback_urb_bh+0x9e/0xe0
1035 tasklet_hi_action+0xf8/0x100
1036 __do_softirq+0x114/0x2c0
1037 irq_exit+0xa5/0xb0
1038 do_IRQ+0x5a/0xf0
1039 ret_from_intr+0x0/0x30
1040 cpuidle_enter+0x17/0x20
1041 cpu_startup_entry+0x315/0x3e0
1042 rest_init+0x7c/0x80
1043 } hitcount: 3 bytes_req: 21 bytes_alloc: 24
1044 { stacktrace:
1045 __kmalloc_track_caller+0x10b/0x1a0
1046 kmemdup+0x20/0x50
1047 hidraw_report_event+0x8a/0x120 [hid]
1048 hid_report_raw_event+0x3ea/0x440 [hid]
1049 hid_input_report+0x112/0x190 [hid]
1050 hid_irq_in+0xc2/0x260 [usbhid]
1051 __usb_hcd_giveback_urb+0x72/0x120
1052 usb_giveback_urb_bh+0x9e/0xe0
1053 tasklet_hi_action+0xf8/0x100
1054 __do_softirq+0x114/0x2c0
1055 irq_exit+0xa5/0xb0
1056 do_IRQ+0x5a/0xf0
1057 ret_from_intr+0x0/0x30
1058 } hitcount: 3 bytes_req: 21 bytes_alloc: 24
1059 { stacktrace:
1060 kmem_cache_alloc_trace+0xeb/0x150
1061 aa_alloc_task_context+0x27/0x40
1062 apparmor_cred_prepare+0x1f/0x50
1063 security_prepare_creds+0x16/0x20
1064 prepare_creds+0xdf/0x1a0
1065 SyS_capset+0xb5/0x200
1066 system_call_fastpath+0x12/0x6a
1067 } hitcount: 1 bytes_req: 32 bytes_alloc: 32
1068 .
1069 .
1070 .
1071 { stacktrace:
1072 __kmalloc+0x11b/0x1b0
1073 i915_gem_execbuffer2+0x6c/0x2c0 [i915]
1074 drm_ioctl+0x349/0x670 [drm]
1075 do_vfs_ioctl+0x2f0/0x4f0
1076 SyS_ioctl+0x81/0xa0
1077 system_call_fastpath+0x12/0x6a
1078 } hitcount: 17726 bytes_req: 13944120 bytes_alloc: 19593808
1079 { stacktrace:
1080 __kmalloc+0x11b/0x1b0
1081 load_elf_phdrs+0x76/0xa0
1082 load_elf_binary+0x102/0x1650
1083 search_binary_handler+0x97/0x1d0
1084 do_execveat_common.isra.34+0x551/0x6e0
1085 SyS_execve+0x3a/0x50
1086 return_from_execve+0x0/0x23
1087 } hitcount: 33348 bytes_req: 17152128 bytes_alloc: 20226048
1088 { stacktrace:
1089 kmem_cache_alloc_trace+0xeb/0x150
1090 apparmor_file_alloc_security+0x27/0x40
1091 security_file_alloc+0x16/0x20
1092 get_empty_filp+0x93/0x1c0
1093 path_openat+0x31/0x5f0
1094 do_filp_open+0x3a/0x90
1095 do_sys_open+0x128/0x220
1096 SyS_open+0x1e/0x20
1097 system_call_fastpath+0x12/0x6a
1098 } hitcount: 4766422 bytes_req: 9532844 bytes_alloc: 38131376
1099 { stacktrace:
1100 __kmalloc+0x11b/0x1b0
1101 seq_buf_alloc+0x1b/0x50
1102 seq_read+0x2cc/0x370
1103 proc_reg_read+0x3d/0x80
1104 __vfs_read+0x28/0xe0
1105 vfs_read+0x86/0x140
1106 SyS_read+0x46/0xb0
1107 system_call_fastpath+0x12/0x6a
1108 } hitcount: 19133 bytes_req: 78368768 bytes_alloc: 78368768
1109
1110 Totals:
1111 Hits: 6085872
1112 Entries: 253
1113 Dropped: 0
1114
1115 If you key a hist trigger on common_pid, in order for example to
1116 gather and display sorted totals for each process, you can use the
1117 special .execname modifier to display the executable names for the
1118 processes in the table rather than raw pids. The example below
1119 keeps a per-process sum of total bytes read::
1120
1121 # echo 'hist:key=common_pid.execname:val=count:sort=count.descending' > \
1122 /sys/kernel/debug/tracing/events/syscalls/sys_enter_read/trigger
1123
1124 # cat /sys/kernel/debug/tracing/events/syscalls/sys_enter_read/hist
1125 # trigger info: hist:keys=common_pid.execname:vals=count:sort=count.descending:size=2048 [active]
1126
1127 { common_pid: gnome-terminal [ 3196] } hitcount: 280 count: 1093512
1128 { common_pid: Xorg [ 1309] } hitcount: 525 count: 256640
1129 { common_pid: compiz [ 2889] } hitcount: 59 count: 254400
1130 { common_pid: bash [ 8710] } hitcount: 3 count: 66369
1131 { common_pid: dbus-daemon-lau [ 8703] } hitcount: 49 count: 47739
1132 { common_pid: irqbalance [ 1252] } hitcount: 27 count: 27648
1133 { common_pid: 01ifupdown [ 8705] } hitcount: 3 count: 17216
1134 { common_pid: dbus-daemon [ 772] } hitcount: 10 count: 12396
1135 { common_pid: Socket Thread [ 8342] } hitcount: 11 count: 11264
1136 { common_pid: nm-dhcp-client. [ 8701] } hitcount: 6 count: 7424
1137 { common_pid: gmain [ 1315] } hitcount: 18 count: 6336
1138 .
1139 .
1140 .
1141 { common_pid: postgres [ 1892] } hitcount: 2 count: 32
1142 { common_pid: postgres [ 1891] } hitcount: 2 count: 32
1143 { common_pid: gmain [ 8704] } hitcount: 2 count: 32
1144 { common_pid: upstart-dbus-br [ 2740] } hitcount: 21 count: 21
1145 { common_pid: nm-dispatcher.a [ 8696] } hitcount: 1 count: 16
1146 { common_pid: indicator-datet [ 2904] } hitcount: 1 count: 16
1147 { common_pid: gdbus [ 2998] } hitcount: 1 count: 16
1148 { common_pid: rtkit-daemon [ 2052] } hitcount: 1 count: 8
1149 { common_pid: init [ 1] } hitcount: 2 count: 2
1150
1151 Totals:
1152 Hits: 2116
1153 Entries: 51
1154 Dropped: 0
1155
1156 Similarly, if you key a hist trigger on syscall id, for example to
1157 gather and display a list of systemwide syscall hits, you can use
1158 the special .syscall modifier to display the syscall names rather
1159 than raw ids. The example below keeps a running total of syscall
1160 counts for the system during the run::
1161
1162 # echo 'hist:key=id.syscall:val=hitcount' > \
1163 /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
1164
1165 # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
1166 # trigger info: hist:keys=id.syscall:vals=hitcount:sort=hitcount:size=2048 [active]
1167
1168 { id: sys_fsync [ 74] } hitcount: 1
1169 { id: sys_newuname [ 63] } hitcount: 1
1170 { id: sys_prctl [157] } hitcount: 1
1171 { id: sys_statfs [137] } hitcount: 1
1172 { id: sys_symlink [ 88] } hitcount: 1
1173 { id: sys_sendmmsg [307] } hitcount: 1
1174 { id: sys_semctl [ 66] } hitcount: 1
1175 { id: sys_readlink [ 89] } hitcount: 3
1176 { id: sys_bind [ 49] } hitcount: 3
1177 { id: sys_getsockname [ 51] } hitcount: 3
1178 { id: sys_unlink [ 87] } hitcount: 3
1179 { id: sys_rename [ 82] } hitcount: 4
1180 { id: unknown_syscall [ 58] } hitcount: 4
1181 { id: sys_connect [ 42] } hitcount: 4
1182 { id: sys_getpid [ 39] } hitcount: 4
1183 .
1184 .
1185 .
1186 { id: sys_rt_sigprocmask [ 14] } hitcount: 952
1187 { id: sys_futex [202] } hitcount: 1534
1188 { id: sys_write [ 1] } hitcount: 2689
1189 { id: sys_setitimer [ 38] } hitcount: 2797
1190 { id: sys_read [ 0] } hitcount: 3202
1191 { id: sys_select [ 23] } hitcount: 3773
1192 { id: sys_writev [ 20] } hitcount: 4531
1193 { id: sys_poll [ 7] } hitcount: 8314
1194 { id: sys_recvmsg [ 47] } hitcount: 13738
1195 { id: sys_ioctl [ 16] } hitcount: 21843
1196
1197 Totals:
1198 Hits: 67612
1199 Entries: 72
1200 Dropped: 0
1201
1202 The syscall counts above provide a rough overall picture of system
1203 call activity on the system; we can see for example that the most
1204 popular system call on this system was the 'sys_ioctl' system call.
1205
1206 We can use 'compound' keys to refine that number and provide some
1207 further insight as to which processes exactly contribute to the
1208 overall ioctl count.
1209
1210 The command below keeps a hitcount for every unique combination of
1211 system call id and pid - the end result is essentially a table
1212 that keeps a per-pid sum of system call hits. The results are
1213 sorted using the system call id as the primary key, and the
1214 hitcount sum as the secondary key::
1215
1216 # echo 'hist:key=id.syscall,common_pid.execname:val=hitcount:sort=id,hitcount' > \
1217 /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
1218
1219 # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
1220 # trigger info: hist:keys=id.syscall,common_pid.execname:vals=hitcount:sort=id.syscall,hitcount:size=2048 [active]
1221
1222 { id: sys_read [ 0], common_pid: rtkit-daemon [ 1877] } hitcount: 1
1223 { id: sys_read [ 0], common_pid: gdbus [ 2976] } hitcount: 1
1224 { id: sys_read [ 0], common_pid: console-kit-dae [ 3400] } hitcount: 1
1225 { id: sys_read [ 0], common_pid: postgres [ 1865] } hitcount: 1
1226 { id: sys_read [ 0], common_pid: deja-dup-monito [ 3543] } hitcount: 2
1227 { id: sys_read [ 0], common_pid: NetworkManager [ 890] } hitcount: 2
1228 { id: sys_read [ 0], common_pid: evolution-calen [ 3048] } hitcount: 2
1229 { id: sys_read [ 0], common_pid: postgres [ 1864] } hitcount: 2
1230 { id: sys_read [ 0], common_pid: nm-applet [ 3022] } hitcount: 2
1231 { id: sys_read [ 0], common_pid: whoopsie [ 1212] } hitcount: 2
1232 .
1233 .
1234 .
1235 { id: sys_ioctl [ 16], common_pid: bash [ 8479] } hitcount: 1
1236 { id: sys_ioctl [ 16], common_pid: bash [ 3472] } hitcount: 12
1237 { id: sys_ioctl [ 16], common_pid: gnome-terminal [ 3199] } hitcount: 16
1238 { id: sys_ioctl [ 16], common_pid: Xorg [ 1267] } hitcount: 1808
1239 { id: sys_ioctl [ 16], common_pid: compiz [ 2994] } hitcount: 5580
1240 .
1241 .
1242 .
1243 { id: sys_waitid [247], common_pid: upstart-dbus-br [ 2690] } hitcount: 3
1244 { id: sys_waitid [247], common_pid: upstart-dbus-br [ 2688] } hitcount: 16
1245 { id: sys_inotify_add_watch [254], common_pid: gmain [ 975] } hitcount: 2
1246 { id: sys_inotify_add_watch [254], common_pid: gmain [ 3204] } hitcount: 4
1247 { id: sys_inotify_add_watch [254], common_pid: gmain [ 2888] } hitcount: 4
1248 { id: sys_inotify_add_watch [254], common_pid: gmain [ 3003] } hitcount: 4
1249 { id: sys_inotify_add_watch [254], common_pid: gmain [ 2873] } hitcount: 4
1250 { id: sys_inotify_add_watch [254], common_pid: gmain [ 3196] } hitcount: 6
1251 { id: sys_openat [257], common_pid: java [ 2623] } hitcount: 2
1252 { id: sys_eventfd2 [290], common_pid: ibus-ui-gtk3 [ 2760] } hitcount: 4
1253 { id: sys_eventfd2 [290], common_pid: compiz [ 2994] } hitcount: 6
1254
1255 Totals:
1256 Hits: 31536
1257 Entries: 323
1258 Dropped: 0
1259
1260 The above list does give us a breakdown of the ioctl syscall by
1261 pid, but it also gives us quite a bit more than that, which we
1262 don't really care about at the moment. Since we know the syscall
1263 id for sys_ioctl (16, displayed next to the sys_ioctl name), we
1264 can use that to filter out all the other syscalls::
1265
1266 # echo 'hist:key=id.syscall,common_pid.execname:val=hitcount:sort=id,hitcount if id == 16' > \
1267 /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
1268
1269 # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
1270 # trigger info: hist:keys=id.syscall,common_pid.execname:vals=hitcount:sort=id.syscall,hitcount:size=2048 if id == 16 [active]
1271
1272 { id: sys_ioctl [ 16], common_pid: gmain [ 2769] } hitcount: 1
1273 { id: sys_ioctl [ 16], common_pid: evolution-addre [ 8571] } hitcount: 1
1274 { id: sys_ioctl [ 16], common_pid: gmain [ 3003] } hitcount: 1
1275 { id: sys_ioctl [ 16], common_pid: gmain [ 2781] } hitcount: 1
1276 { id: sys_ioctl [ 16], common_pid: gmain [ 2829] } hitcount: 1
1277 { id: sys_ioctl [ 16], common_pid: bash [ 8726] } hitcount: 1
1278 { id: sys_ioctl [ 16], common_pid: bash [ 8508] } hitcount: 1
1279 { id: sys_ioctl [ 16], common_pid: gmain [ 2970] } hitcount: 1
1280 { id: sys_ioctl [ 16], common_pid: gmain [ 2768] } hitcount: 1
1281 .
1282 .
1283 .
1284 { id: sys_ioctl [ 16], common_pid: pool [ 8559] } hitcount: 45
1285 { id: sys_ioctl [ 16], common_pid: pool [ 8555] } hitcount: 48
1286 { id: sys_ioctl [ 16], common_pid: pool [ 8551] } hitcount: 48
1287 { id: sys_ioctl [ 16], common_pid: avahi-daemon [ 896] } hitcount: 66
1288 { id: sys_ioctl [ 16], common_pid: Xorg [ 1267] } hitcount: 26674
1289 { id: sys_ioctl [ 16], common_pid: compiz [ 2994] } hitcount: 73443
1290
1291 Totals:
1292 Hits: 101162
1293 Entries: 103
1294 Dropped: 0
1295
1296 The above output shows that 'compiz' and 'Xorg' are far and away
1297 the heaviest ioctl callers (which might lead to questions about
1298 whether they really need to be making all those calls and to
1299 possible avenues for further investigation.)
1300
1301 The compound key examples used a key and a sum value (hitcount) to
1302 sort the output, but we can just as easily use two keys instead.
1303 Here's an example where we use a compound key composed of the
1304 common_pid and size event fields. Sorting with pid as the primary
1305 key and 'size' as the secondary key allows us to display an
1306 ordered summary of the recvfrom sizes, with counts, received by
1307 each process::
1308
1309 # echo 'hist:key=common_pid.execname,size:val=hitcount:sort=common_pid,size' > \
1310 /sys/kernel/debug/tracing/events/syscalls/sys_enter_recvfrom/trigger
1311
1312 # cat /sys/kernel/debug/tracing/events/syscalls/sys_enter_recvfrom/hist
1313 # trigger info: hist:keys=common_pid.execname,size:vals=hitcount:sort=common_pid.execname,size:size=2048 [active]
1314
1315 { common_pid: smbd [ 784], size: 4 } hitcount: 1
1316 { common_pid: dnsmasq [ 1412], size: 4096 } hitcount: 672
1317 { common_pid: postgres [ 1796], size: 1000 } hitcount: 6
1318 { common_pid: postgres [ 1867], size: 1000 } hitcount: 10
1319 { common_pid: bamfdaemon [ 2787], size: 28 } hitcount: 2
1320 { common_pid: bamfdaemon [ 2787], size: 14360 } hitcount: 1
1321 { common_pid: compiz [ 2994], size: 8 } hitcount: 1
1322 { common_pid: compiz [ 2994], size: 20 } hitcount: 11
1323 { common_pid: gnome-terminal [ 3199], size: 4 } hitcount: 2
1324 { common_pid: firefox [ 8817], size: 4 } hitcount: 1
1325 { common_pid: firefox [ 8817], size: 8 } hitcount: 5
1326 { common_pid: firefox [ 8817], size: 588 } hitcount: 2
1327 { common_pid: firefox [ 8817], size: 628 } hitcount: 1
1328 { common_pid: firefox [ 8817], size: 6944 } hitcount: 1
1329 { common_pid: firefox [ 8817], size: 408880 } hitcount: 2
1330 { common_pid: firefox [ 8822], size: 8 } hitcount: 2
1331 { common_pid: firefox [ 8822], size: 160 } hitcount: 2
1332 { common_pid: firefox [ 8822], size: 320 } hitcount: 2
1333 { common_pid: firefox [ 8822], size: 352 } hitcount: 1
1334 .
1335 .
1336 .
1337 { common_pid: pool [ 8923], size: 1960 } hitcount: 10
1338 { common_pid: pool [ 8923], size: 2048 } hitcount: 10
1339 { common_pid: pool [ 8924], size: 1960 } hitcount: 10
1340 { common_pid: pool [ 8924], size: 2048 } hitcount: 10
1341 { common_pid: pool [ 8928], size: 1964 } hitcount: 4
1342 { common_pid: pool [ 8928], size: 1965 } hitcount: 2
1343 { common_pid: pool [ 8928], size: 2048 } hitcount: 6
1344 { common_pid: pool [ 8929], size: 1982 } hitcount: 1
1345 { common_pid: pool [ 8929], size: 2048 } hitcount: 1
1346
1347 Totals:
1348 Hits: 2016
1349 Entries: 224
1350 Dropped: 0
1351
1352 The above example also illustrates the fact that although a compound
1353 key is treated as a single entity for hashing purposes, the sub-keys
1354 it's composed of can be accessed independently.
1355
1356 The next example uses a string field as the hash key and
1357 demonstrates how you can manually pause and continue a hist trigger.
1358 In this example, we'll aggregate fork counts and don't expect a
1359 large number of entries in the hash table, so we'll drop it to a
1360 much smaller number, say 256::
1361
1362 # echo 'hist:key=child_comm:val=hitcount:size=256' > \
1363 /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
1364
1365 # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
1366 # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [active]
1367
1368 { child_comm: dconf worker } hitcount: 1
1369 { child_comm: ibus-daemon } hitcount: 1
1370 { child_comm: whoopsie } hitcount: 1
1371 { child_comm: smbd } hitcount: 1
1372 { child_comm: gdbus } hitcount: 1
1373 { child_comm: kthreadd } hitcount: 1
1374 { child_comm: dconf worker } hitcount: 1
1375 { child_comm: evolution-alarm } hitcount: 2
1376 { child_comm: Socket Thread } hitcount: 2
1377 { child_comm: postgres } hitcount: 2
1378 { child_comm: bash } hitcount: 3
1379 { child_comm: compiz } hitcount: 3
1380 { child_comm: evolution-sourc } hitcount: 4
1381 { child_comm: dhclient } hitcount: 4
1382 { child_comm: pool } hitcount: 5
1383 { child_comm: nm-dispatcher.a } hitcount: 8
1384 { child_comm: firefox } hitcount: 8
1385 { child_comm: dbus-daemon } hitcount: 8
1386 { child_comm: glib-pacrunner } hitcount: 10
1387 { child_comm: evolution } hitcount: 23
1388
1389 Totals:
1390 Hits: 89
1391 Entries: 20
1392 Dropped: 0
1393
1394 If we want to pause the hist trigger, we can simply append :pause to
1395 the command that started the trigger. Notice that the trigger info
1396 displays as [paused]::
1397
1398 # echo 'hist:key=child_comm:val=hitcount:size=256:pause' >> \
1399 /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
1400
1401 # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
1402 # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [paused]
1403
1404 { child_comm: dconf worker } hitcount: 1
1405 { child_comm: kthreadd } hitcount: 1
1406 { child_comm: dconf worker } hitcount: 1
1407 { child_comm: gdbus } hitcount: 1
1408 { child_comm: ibus-daemon } hitcount: 1
1409 { child_comm: Socket Thread } hitcount: 2
1410 { child_comm: evolution-alarm } hitcount: 2
1411 { child_comm: smbd } hitcount: 2
1412 { child_comm: bash } hitcount: 3
1413 { child_comm: whoopsie } hitcount: 3
1414 { child_comm: compiz } hitcount: 3
1415 { child_comm: evolution-sourc } hitcount: 4
1416 { child_comm: pool } hitcount: 5
1417 { child_comm: postgres } hitcount: 6
1418 { child_comm: firefox } hitcount: 8
1419 { child_comm: dhclient } hitcount: 10
1420 { child_comm: emacs } hitcount: 12
1421 { child_comm: dbus-daemon } hitcount: 20
1422 { child_comm: nm-dispatcher.a } hitcount: 20
1423 { child_comm: evolution } hitcount: 35
1424 { child_comm: glib-pacrunner } hitcount: 59
1425
1426 Totals:
1427 Hits: 199
1428 Entries: 21
1429 Dropped: 0
1430
1431 To manually continue having the trigger aggregate events, append
1432 :cont instead. Notice that the trigger info displays as [active]
1433 again, and the data has changed::
1434
1435 # echo 'hist:key=child_comm:val=hitcount:size=256:cont' >> \
1436 /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
1437
1438 # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
1439 # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [active]
1440
1441 { child_comm: dconf worker } hitcount: 1
1442 { child_comm: dconf worker } hitcount: 1
1443 { child_comm: kthreadd } hitcount: 1
1444 { child_comm: gdbus } hitcount: 1
1445 { child_comm: ibus-daemon } hitcount: 1
1446 { child_comm: Socket Thread } hitcount: 2
1447 { child_comm: evolution-alarm } hitcount: 2
1448 { child_comm: smbd } hitcount: 2
1449 { child_comm: whoopsie } hitcount: 3
1450 { child_comm: compiz } hitcount: 3
1451 { child_comm: evolution-sourc } hitcount: 4
1452 { child_comm: bash } hitcount: 5
1453 { child_comm: pool } hitcount: 5
1454 { child_comm: postgres } hitcount: 6
1455 { child_comm: firefox } hitcount: 8
1456 { child_comm: dhclient } hitcount: 11
1457 { child_comm: emacs } hitcount: 12
1458 { child_comm: dbus-daemon } hitcount: 22
1459 { child_comm: nm-dispatcher.a } hitcount: 22
1460 { child_comm: evolution } hitcount: 35
1461 { child_comm: glib-pacrunner } hitcount: 59
1462
1463 Totals:
1464 Hits: 206
1465 Entries: 21
1466 Dropped: 0
1467
1468 The previous example showed how to start and stop a hist trigger by
1469 appending 'pause' and 'continue' to the hist trigger command. A
1470 hist trigger can also be started in a paused state by initially
1471 starting the trigger with ':pause' appended. This allows you to
1472 start the trigger only when you're ready to start collecting data
1473 and not before. For example, you could start the trigger in a
1474 paused state, then unpause it and do something you want to measure,
1475 then pause the trigger again when done.
1476
1477 Of course, doing this manually can be difficult and error-prone, but
1478 it is possible to automatically start and stop a hist trigger based
1479 on some condition, via the enable_hist and disable_hist triggers.
1480
1481 For example, suppose we wanted to take a look at the relative
1482 weights in terms of skb length for each callpath that leads to a
1483 netif_receive_skb event when downloading a decent-sized file using
1484 wget.
1485
1486 First we set up an initially paused stacktrace trigger on the
1487 netif_receive_skb event::
1488
1489 # echo 'hist:key=stacktrace:vals=len:pause' > \
1490 /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
1491
1492 Next, we set up an 'enable_hist' trigger on the sched_process_exec
1493 event, with an 'if filename==/usr/bin/wget' filter. The effect of
1494 this new trigger is that it will 'unpause' the hist trigger we just
1495 set up on netif_receive_skb if and only if it sees a
1496 sched_process_exec event with a filename of '/usr/bin/wget'. When
1497 that happens, all netif_receive_skb events are aggregated into a
1498 hash table keyed on stacktrace::
1499
1500 # echo 'enable_hist:net:netif_receive_skb if filename==/usr/bin/wget' > \
1501 /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
1502
1503 The aggregation continues until the netif_receive_skb hist trigger is paused
1504 again, which is what the following disable_hist event does by
1505 creating a similar setup on the sched_process_exit event, using the
1506 filter 'comm==wget'::
1507
1508 # echo 'disable_hist:net:netif_receive_skb if comm==wget' > \
1509 /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
1510
1511 Whenever a process exits and the comm field of the disable_hist
1512 trigger filter matches 'comm==wget', the netif_receive_skb hist
1513 trigger is disabled.
1514
1515 The overall effect is that netif_receive_skb events are aggregated
1516 into the hash table for only the duration of the wget. Executing a
1517 wget command and then listing the 'hist' file will display the
1518 output generated by the wget command::
1519
1520 $ wget https://www.kernel.org/pub/linux/kernel/v3.x/patch-3.19.xz
1521
1522 # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist
1523 # trigger info: hist:keys=stacktrace:vals=len:sort=hitcount:size=2048 [paused]
1524
1525 { stacktrace:
1526 __netif_receive_skb_core+0x46d/0x990
1527 __netif_receive_skb+0x18/0x60
1528 netif_receive_skb_internal+0x23/0x90
1529 napi_gro_receive+0xc8/0x100
1530 ieee80211_deliver_skb+0xd6/0x270 [mac80211]
1531 ieee80211_rx_handlers+0xccf/0x22f0 [mac80211]
1532 ieee80211_prepare_and_rx_handle+0x4e7/0xc40 [mac80211]
1533 ieee80211_rx+0x31d/0x900 [mac80211]
1534 iwlagn_rx_reply_rx+0x3db/0x6f0 [iwldvm]
1535 iwl_rx_dispatch+0x8e/0xf0 [iwldvm]
1536 iwl_pcie_irq_handler+0xe3c/0x12f0 [iwlwifi]
1537 irq_thread_fn+0x20/0x50
1538 irq_thread+0x11f/0x150
1539 kthread+0xd2/0xf0
1540 ret_from_fork+0x42/0x70
1541 } hitcount: 85 len: 28884
1542 { stacktrace:
1543 __netif_receive_skb_core+0x46d/0x990
1544 __netif_receive_skb+0x18/0x60
1545 netif_receive_skb_internal+0x23/0x90
1546 napi_gro_complete+0xa4/0xe0
1547 dev_gro_receive+0x23a/0x360
1548 napi_gro_receive+0x30/0x100
1549 ieee80211_deliver_skb+0xd6/0x270 [mac80211]
1550 ieee80211_rx_handlers+0xccf/0x22f0 [mac80211]
1551 ieee80211_prepare_and_rx_handle+0x4e7/0xc40 [mac80211]
1552 ieee80211_rx+0x31d/0x900 [mac80211]
1553 iwlagn_rx_reply_rx+0x3db/0x6f0 [iwldvm]
1554 iwl_rx_dispatch+0x8e/0xf0 [iwldvm]
1555 iwl_pcie_irq_handler+0xe3c/0x12f0 [iwlwifi]
1556 irq_thread_fn+0x20/0x50
1557 irq_thread+0x11f/0x150
1558 kthread+0xd2/0xf0
1559 } hitcount: 98 len: 664329
1560 { stacktrace:
1561 __netif_receive_skb_core+0x46d/0x990
1562 __netif_receive_skb+0x18/0x60
1563 process_backlog+0xa8/0x150
1564 net_rx_action+0x15d/0x340
1565 __do_softirq+0x114/0x2c0
1566 do_softirq_own_stack+0x1c/0x30
1567 do_softirq+0x65/0x70
1568 __local_bh_enable_ip+0xb5/0xc0
1569 ip_finish_output+0x1f4/0x840
1570 ip_output+0x6b/0xc0
1571 ip_local_out_sk+0x31/0x40
1572 ip_send_skb+0x1a/0x50
1573 udp_send_skb+0x173/0x2a0
1574 udp_sendmsg+0x2bf/0x9f0
1575 inet_sendmsg+0x64/0xa0
1576 sock_sendmsg+0x3d/0x50
1577 } hitcount: 115 len: 13030
1578 { stacktrace:
1579 __netif_receive_skb_core+0x46d/0x990
1580 __netif_receive_skb+0x18/0x60
1581 netif_receive_skb_internal+0x23/0x90
1582 napi_gro_complete+0xa4/0xe0
1583 napi_gro_flush+0x6d/0x90
1584 iwl_pcie_irq_handler+0x92a/0x12f0 [iwlwifi]
1585 irq_thread_fn+0x20/0x50
1586 irq_thread+0x11f/0x150
1587 kthread+0xd2/0xf0
1588 ret_from_fork+0x42/0x70
1589 } hitcount: 934 len: 5512212
1590
1591 Totals:
1592 Hits: 1232
1593 Entries: 4
1594 Dropped: 0
1595
1596 The above shows all the netif_receive_skb callpaths and their total
1597 lengths for the duration of the wget command.
1598
1599 The 'clear' hist trigger param can be used to clear the hash table.
1600 Suppose we wanted to try another run of the previous example but
1601 this time also wanted to see the complete list of events that went
1602 into the histogram. In order to avoid having to set everything up
1603 again, we can just clear the histogram first::
1604
1605 # echo 'hist:key=stacktrace:vals=len:clear' >> \
1606 /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
1607
1608 Just to verify that it is in fact cleared, here's what we now see in
1609 the hist file::
1610
1611 # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist
1612 # trigger info: hist:keys=stacktrace:vals=len:sort=hitcount:size=2048 [paused]
1613
1614 Totals:
1615 Hits: 0
1616 Entries: 0
1617 Dropped: 0
1618
1619 Since we want to see the detailed list of every netif_receive_skb
1620 event occurring during the new run, which are in fact the same
1621 events being aggregated into the hash table, we add some additional
1622 'enable_event' events to the triggering sched_process_exec and
1623 sched_process_exit events as such::
1624
1625 # echo 'enable_event:net:netif_receive_skb if filename==/usr/bin/wget' > \
1626 /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
1627
1628 # echo 'disable_event:net:netif_receive_skb if comm==wget' > \
1629 /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
1630
1631 If you read the trigger files for the sched_process_exec and
1632 sched_process_exit triggers, you should see two triggers for each:
1633 one enabling/disabling the hist aggregation and the other
1634 enabling/disabling the logging of events::
1635
1636 # cat /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
1637 enable_event:net:netif_receive_skb:unlimited if filename==/usr/bin/wget
1638 enable_hist:net:netif_receive_skb:unlimited if filename==/usr/bin/wget
1639
1640 # cat /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
1641 enable_event:net:netif_receive_skb:unlimited if comm==wget
1642 disable_hist:net:netif_receive_skb:unlimited if comm==wget
1643
1644 In other words, whenever either of the sched_process_exec or
1645 sched_process_exit events is hit and matches 'wget', it enables or
1646 disables both the histogram and the event log, and what you end up
1647 with is a hash table and set of events just covering the specified
1648 duration. Run the wget command again::
1649
1650 $ wget https://www.kernel.org/pub/linux/kernel/v3.x/patch-3.19.xz
1651
1652 Displaying the 'hist' file should show something similar to what you
1653 saw in the last run, but this time you should also see the
1654 individual events in the trace file::
1655
1656 # cat /sys/kernel/debug/tracing/trace
1657
1658 # tracer: nop
1659 #
1660 # entries-in-buffer/entries-written: 183/1426 #P:4
1661 #
1662 # _-----=> irqs-off
1663 # / _----=> need-resched
1664 # | / _---=> hardirq/softirq
1665 # || / _--=> preempt-depth
1666 # ||| / delay
1667 # TASK-PID CPU# |||| TIMESTAMP FUNCTION
1668 # | | | |||| | |
1669 wget-15108 [000] ..s1 31769.606929: netif_receive_skb: dev=lo skbaddr=ffff88009c353100 len=60
1670 wget-15108 [000] ..s1 31769.606999: netif_receive_skb: dev=lo skbaddr=ffff88009c353200 len=60
1671 dnsmasq-1382 [000] ..s1 31769.677652: netif_receive_skb: dev=lo skbaddr=ffff88009c352b00 len=130
1672 dnsmasq-1382 [000] ..s1 31769.685917: netif_receive_skb: dev=lo skbaddr=ffff88009c352200 len=138
1673 ##### CPU 2 buffer started ####
1674 irq/29-iwlwifi-559 [002] ..s. 31772.031529: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433d00 len=2948
1675 irq/29-iwlwifi-559 [002] ..s. 31772.031572: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d432200 len=1500
1676 irq/29-iwlwifi-559 [002] ..s. 31772.032196: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433100 len=2948
1677 irq/29-iwlwifi-559 [002] ..s. 31772.032761: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433000 len=2948
1678 irq/29-iwlwifi-559 [002] ..s. 31772.033220: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d432e00 len=1500
1679 ....
1680
1681
1682 The following example demonstrates how multiple hist triggers can be
1683 attached to a given event. This capability can be useful for
1684 creating a set of different summaries derived from the same set of
1685 events, or for comparing the effects of different filters, among
1686 other things.
1687 ::
1688
1689 # echo 'hist:keys=skbaddr.hex:vals=len if len < 0' >> \
1690 /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
1691 # echo 'hist:keys=skbaddr.hex:vals=len if len > 4096' >> \
1692 /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
1693 # echo 'hist:keys=skbaddr.hex:vals=len if len == 256' >> \
1694 /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
1695 # echo 'hist:keys=skbaddr.hex:vals=len' >> \
1696 /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
1697 # echo 'hist:keys=len:vals=common_preempt_count' >> \
1698 /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
1699
1700 The above set of commands creates four triggers differing only in
1701 their filters, along with a completely different though fairly
1702 nonsensical trigger. Note that in order to append multiple hist
1703 triggers to the same file, you should use the '>>' operator to
1704 append them ('>' will also add the new hist trigger, but will remove
1705 any existing hist triggers beforehand).
1706
1707 Displaying the contents of the 'hist' file for the event shows the
1708 contents of all five histograms::
1709
1710 # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist
1711
1712 # event histogram
1713 #
1714 # trigger info: hist:keys=len:vals=hitcount,common_preempt_count:sort=hitcount:size=2048 [active]
1715 #
1716
1717 { len: 176 } hitcount: 1 common_preempt_count: 0
1718 { len: 223 } hitcount: 1 common_preempt_count: 0
1719 { len: 4854 } hitcount: 1 common_preempt_count: 0
1720 { len: 395 } hitcount: 1 common_preempt_count: 0
1721 { len: 177 } hitcount: 1 common_preempt_count: 0
1722 { len: 446 } hitcount: 1 common_preempt_count: 0
1723 { len: 1601 } hitcount: 1 common_preempt_count: 0
1724 .
1725 .
1726 .
1727 { len: 1280 } hitcount: 66 common_preempt_count: 0
1728 { len: 116 } hitcount: 81 common_preempt_count: 40
1729 { len: 708 } hitcount: 112 common_preempt_count: 0
1730 { len: 46 } hitcount: 221 common_preempt_count: 0
1731 { len: 1264 } hitcount: 458 common_preempt_count: 0
1732
1733 Totals:
1734 Hits: 1428
1735 Entries: 147
1736 Dropped: 0
1737
1738
1739 # event histogram
1740 #
1741 # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active]
1742 #
1743
1744 { skbaddr: ffff8800baee5e00 } hitcount: 1 len: 130
1745 { skbaddr: ffff88005f3d5600 } hitcount: 1 len: 1280
1746 { skbaddr: ffff88005f3d4900 } hitcount: 1 len: 1280
1747 { skbaddr: ffff88009fed6300 } hitcount: 1 len: 115
1748 { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 115
1749 { skbaddr: ffff88008cdb1900 } hitcount: 1 len: 46
1750 { skbaddr: ffff880064b5ef00 } hitcount: 1 len: 118
1751 { skbaddr: ffff880044e3c700 } hitcount: 1 len: 60
1752 { skbaddr: ffff880100065900 } hitcount: 1 len: 46
1753 { skbaddr: ffff8800d46bd500 } hitcount: 1 len: 116
1754 { skbaddr: ffff88005f3d5f00 } hitcount: 1 len: 1280
1755 { skbaddr: ffff880100064700 } hitcount: 1 len: 365
1756 { skbaddr: ffff8800badb6f00 } hitcount: 1 len: 60
1757 .
1758 .
1759 .
1760 { skbaddr: ffff88009fe0be00 } hitcount: 27 len: 24677
1761 { skbaddr: ffff88009fe0a400 } hitcount: 27 len: 23052
1762 { skbaddr: ffff88009fe0b700 } hitcount: 31 len: 25589
1763 { skbaddr: ffff88009fe0b600 } hitcount: 32 len: 27326
1764 { skbaddr: ffff88006a462800 } hitcount: 68 len: 71678
1765 { skbaddr: ffff88006a463700 } hitcount: 70 len: 72678
1766 { skbaddr: ffff88006a462b00 } hitcount: 71 len: 77589
1767 { skbaddr: ffff88006a463600 } hitcount: 73 len: 71307
1768 { skbaddr: ffff88006a462200 } hitcount: 81 len: 81032
1769
1770 Totals:
1771 Hits: 1451
1772 Entries: 318
1773 Dropped: 0
1774
1775
1776 # event histogram
1777 #
1778 # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len == 256 [active]
1779 #
1780
1781
1782 Totals:
1783 Hits: 0
1784 Entries: 0
1785 Dropped: 0
1786
1787
1788 # event histogram
1789 #
1790 # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len > 4096 [active]
1791 #
1792
1793 { skbaddr: ffff88009fd2c300 } hitcount: 1 len: 7212
1794 { skbaddr: ffff8800d2bcce00 } hitcount: 1 len: 7212
1795 { skbaddr: ffff8800d2bcd700 } hitcount: 1 len: 7212
1796 { skbaddr: ffff8800d2bcda00 } hitcount: 1 len: 21492
1797 { skbaddr: ffff8800ae2e2d00 } hitcount: 1 len: 7212
1798 { skbaddr: ffff8800d2bcdb00 } hitcount: 1 len: 7212
1799 { skbaddr: ffff88006a4df500 } hitcount: 1 len: 4854
1800 { skbaddr: ffff88008ce47b00 } hitcount: 1 len: 18636
1801 { skbaddr: ffff8800ae2e2200 } hitcount: 1 len: 12924
1802 { skbaddr: ffff88005f3e1000 } hitcount: 1 len: 4356
1803 { skbaddr: ffff8800d2bcdc00 } hitcount: 2 len: 24420
1804 { skbaddr: ffff8800d2bcc200 } hitcount: 2 len: 12996
1805
1806 Totals:
1807 Hits: 14
1808 Entries: 12
1809 Dropped: 0
1810
1811
1812 # event histogram
1813 #
1814 # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len < 0 [active]
1815 #
1816
1817
1818 Totals:
1819 Hits: 0
1820 Entries: 0
1821 Dropped: 0
1822
1823 Named triggers can be used to have triggers share a common set of
1824 histogram data. This capability is mostly useful for combining the
1825 output of events generated by tracepoints contained inside inline
1826 functions, but names can be used in a hist trigger on any event.
1827 For example, these two triggers when hit will update the same 'len'
1828 field in the shared 'foo' histogram data::
1829
1830 # echo 'hist:name=foo:keys=skbaddr.hex:vals=len' > \
1831 /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
1832 # echo 'hist:name=foo:keys=skbaddr.hex:vals=len' > \
1833 /sys/kernel/debug/tracing/events/net/netif_rx/trigger
1834
1835 You can see that they're updating common histogram data by reading
1836 each event's hist files at the same time::
1837
1838 # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist;
1839 cat /sys/kernel/debug/tracing/events/net/netif_rx/hist
1840
1841 # event histogram
1842 #
1843 # trigger info: hist:name=foo:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active]
1844 #
1845
1846 { skbaddr: ffff88000ad53500 } hitcount: 1 len: 46
1847 { skbaddr: ffff8800af5a1500 } hitcount: 1 len: 76
1848 { skbaddr: ffff8800d62a1900 } hitcount: 1 len: 46
1849 { skbaddr: ffff8800d2bccb00 } hitcount: 1 len: 468
1850 { skbaddr: ffff8800d3c69900 } hitcount: 1 len: 46
1851 { skbaddr: ffff88009ff09100 } hitcount: 1 len: 52
1852 { skbaddr: ffff88010f13ab00 } hitcount: 1 len: 168
1853 { skbaddr: ffff88006a54f400 } hitcount: 1 len: 46
1854 { skbaddr: ffff8800d2bcc500 } hitcount: 1 len: 260
1855 { skbaddr: ffff880064505000 } hitcount: 1 len: 46
1856 { skbaddr: ffff8800baf24e00 } hitcount: 1 len: 32
1857 { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 46
1858 { skbaddr: ffff8800d3edff00 } hitcount: 1 len: 44
1859 { skbaddr: ffff88009fe0b400 } hitcount: 1 len: 168
1860 { skbaddr: ffff8800a1c55a00 } hitcount: 1 len: 40
1861 { skbaddr: ffff8800d2bcd100 } hitcount: 1 len: 40
1862 { skbaddr: ffff880064505f00 } hitcount: 1 len: 174
1863 { skbaddr: ffff8800a8bff200 } hitcount: 1 len: 160
1864 { skbaddr: ffff880044e3cc00 } hitcount: 1 len: 76
1865 { skbaddr: ffff8800a8bfe700 } hitcount: 1 len: 46
1866 { skbaddr: ffff8800d2bcdc00 } hitcount: 1 len: 32
1867 { skbaddr: ffff8800a1f64800 } hitcount: 1 len: 46
1868 { skbaddr: ffff8800d2bcde00 } hitcount: 1 len: 988
1869 { skbaddr: ffff88006a5dea00 } hitcount: 1 len: 46
1870 { skbaddr: ffff88002e37a200 } hitcount: 1 len: 44
1871 { skbaddr: ffff8800a1f32c00 } hitcount: 2 len: 676
1872 { skbaddr: ffff88000ad52600 } hitcount: 2 len: 107
1873 { skbaddr: ffff8800a1f91e00 } hitcount: 2 len: 92
1874 { skbaddr: ffff8800af5a0200 } hitcount: 2 len: 142
1875 { skbaddr: ffff8800d2bcc600 } hitcount: 2 len: 220
1876 { skbaddr: ffff8800ba36f500 } hitcount: 2 len: 92
1877 { skbaddr: ffff8800d021f800 } hitcount: 2 len: 92
1878 { skbaddr: ffff8800a1f33600 } hitcount: 2 len: 675
1879 { skbaddr: ffff8800a8bfff00 } hitcount: 3 len: 138
1880 { skbaddr: ffff8800d62a1300 } hitcount: 3 len: 138
1881 { skbaddr: ffff88002e37a100 } hitcount: 4 len: 184
1882 { skbaddr: ffff880064504400 } hitcount: 4 len: 184
1883 { skbaddr: ffff8800a8bfec00 } hitcount: 4 len: 184
1884 { skbaddr: ffff88000ad53700 } hitcount: 5 len: 230
1885 { skbaddr: ffff8800d2bcdb00 } hitcount: 5 len: 196
1886 { skbaddr: ffff8800a1f90000 } hitcount: 6 len: 276
1887 { skbaddr: ffff88006a54f900 } hitcount: 6 len: 276
1888
1889 Totals:
1890 Hits: 81
1891 Entries: 42
1892 Dropped: 0
1893 # event histogram
1894 #
1895 # trigger info: hist:name=foo:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active]
1896 #
1897
1898 { skbaddr: ffff88000ad53500 } hitcount: 1 len: 46
1899 { skbaddr: ffff8800af5a1500 } hitcount: 1 len: 76
1900 { skbaddr: ffff8800d62a1900 } hitcount: 1 len: 46
1901 { skbaddr: ffff8800d2bccb00 } hitcount: 1 len: 468
1902 { skbaddr: ffff8800d3c69900 } hitcount: 1 len: 46
1903 { skbaddr: ffff88009ff09100 } hitcount: 1 len: 52
1904 { skbaddr: ffff88010f13ab00 } hitcount: 1 len: 168
1905 { skbaddr: ffff88006a54f400 } hitcount: 1 len: 46
1906 { skbaddr: ffff8800d2bcc500 } hitcount: 1 len: 260
1907 { skbaddr: ffff880064505000 } hitcount: 1 len: 46
1908 { skbaddr: ffff8800baf24e00 } hitcount: 1 len: 32
1909 { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 46
1910 { skbaddr: ffff8800d3edff00 } hitcount: 1 len: 44
1911 { skbaddr: ffff88009fe0b400 } hitcount: 1 len: 168
1912 { skbaddr: ffff8800a1c55a00 } hitcount: 1 len: 40
1913 { skbaddr: ffff8800d2bcd100 } hitcount: 1 len: 40
1914 { skbaddr: ffff880064505f00 } hitcount: 1 len: 174
1915 { skbaddr: ffff8800a8bff200 } hitcount: 1 len: 160
1916 { skbaddr: ffff880044e3cc00 } hitcount: 1 len: 76
1917 { skbaddr: ffff8800a8bfe700 } hitcount: 1 len: 46
1918 { skbaddr: ffff8800d2bcdc00 } hitcount: 1 len: 32
1919 { skbaddr: ffff8800a1f64800 } hitcount: 1 len: 46
1920 { skbaddr: ffff8800d2bcde00 } hitcount: 1 len: 988
1921 { skbaddr: ffff88006a5dea00 } hitcount: 1 len: 46
1922 { skbaddr: ffff88002e37a200 } hitcount: 1 len: 44
1923 { skbaddr: ffff8800a1f32c00 } hitcount: 2 len: 676
1924 { skbaddr: ffff88000ad52600 } hitcount: 2 len: 107
1925 { skbaddr: ffff8800a1f91e00 } hitcount: 2 len: 92
1926 { skbaddr: ffff8800af5a0200 } hitcount: 2 len: 142
1927 { skbaddr: ffff8800d2bcc600 } hitcount: 2 len: 220
1928 { skbaddr: ffff8800ba36f500 } hitcount: 2 len: 92
1929 { skbaddr: ffff8800d021f800 } hitcount: 2 len: 92
1930 { skbaddr: ffff8800a1f33600 } hitcount: 2 len: 675
1931 { skbaddr: ffff8800a8bfff00 } hitcount: 3 len: 138
1932 { skbaddr: ffff8800d62a1300 } hitcount: 3 len: 138
1933 { skbaddr: ffff88002e37a100 } hitcount: 4 len: 184
1934 { skbaddr: ffff880064504400 } hitcount: 4 len: 184
1935 { skbaddr: ffff8800a8bfec00 } hitcount: 4 len: 184
1936 { skbaddr: ffff88000ad53700 } hitcount: 5 len: 230
1937 { skbaddr: ffff8800d2bcdb00 } hitcount: 5 len: 196
1938 { skbaddr: ffff8800a1f90000 } hitcount: 6 len: 276
1939 { skbaddr: ffff88006a54f900 } hitcount: 6 len: 276
1940
1941 Totals:
1942 Hits: 81
1943 Entries: 42
1944 Dropped: 0
1945
1946 And here's an example that shows how to combine histogram data from
1947 any two events even if they don't share any 'compatible' fields
1948 other than 'hitcount' and 'stacktrace'. These commands create a
1949 couple of triggers named 'bar' using those fields::
1950
1951 # echo 'hist:name=bar:key=stacktrace:val=hitcount' > \
1952 /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
1953 # echo 'hist:name=bar:key=stacktrace:val=hitcount' > \
1954 /sys/kernel/debug/tracing/events/net/netif_rx/trigger
1955
1956 And displaying the output of either shows some interesting if
1957 somewhat confusing output::
1958
1959 # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
1960 # cat /sys/kernel/debug/tracing/events/net/netif_rx/hist
1961
1962 # event histogram
1963 #
1964 # trigger info: hist:name=bar:keys=stacktrace:vals=hitcount:sort=hitcount:size=2048 [active]
1965 #
1966
1967 { stacktrace:
1968 _do_fork+0x18e/0x330
1969 kernel_thread+0x29/0x30
1970 kthreadd+0x154/0x1b0
1971 ret_from_fork+0x3f/0x70
1972 } hitcount: 1
1973 { stacktrace:
1974 netif_rx_internal+0xb2/0xd0
1975 netif_rx_ni+0x20/0x70
1976 dev_loopback_xmit+0xaa/0xd0
1977 ip_mc_output+0x126/0x240
1978 ip_local_out_sk+0x31/0x40
1979 igmp_send_report+0x1e9/0x230
1980 igmp_timer_expire+0xe9/0x120
1981 call_timer_fn+0x39/0xf0
1982 run_timer_softirq+0x1e1/0x290
1983 __do_softirq+0xfd/0x290
1984 irq_exit+0x98/0xb0
1985 smp_apic_timer_interrupt+0x4a/0x60
1986 apic_timer_interrupt+0x6d/0x80
1987 cpuidle_enter+0x17/0x20
1988 call_cpuidle+0x3b/0x60
1989 cpu_startup_entry+0x22d/0x310
1990 } hitcount: 1
1991 { stacktrace:
1992 netif_rx_internal+0xb2/0xd0
1993 netif_rx_ni+0x20/0x70
1994 dev_loopback_xmit+0xaa/0xd0
1995 ip_mc_output+0x17f/0x240
1996 ip_local_out_sk+0x31/0x40
1997 ip_send_skb+0x1a/0x50
1998 udp_send_skb+0x13e/0x270
1999 udp_sendmsg+0x2bf/0x980
2000 inet_sendmsg+0x67/0xa0
2001 sock_sendmsg+0x38/0x50
2002 SYSC_sendto+0xef/0x170
2003 SyS_sendto+0xe/0x10
2004 entry_SYSCALL_64_fastpath+0x12/0x6a
2005 } hitcount: 2
2006 { stacktrace:
2007 netif_rx_internal+0xb2/0xd0
2008 netif_rx+0x1c/0x60
2009 loopback_xmit+0x6c/0xb0
2010 dev_hard_start_xmit+0x219/0x3a0
2011 __dev_queue_xmit+0x415/0x4f0
2012 dev_queue_xmit_sk+0x13/0x20
2013 ip_finish_output2+0x237/0x340
2014 ip_finish_output+0x113/0x1d0
2015 ip_output+0x66/0xc0
2016 ip_local_out_sk+0x31/0x40
2017 ip_send_skb+0x1a/0x50
2018 udp_send_skb+0x16d/0x270
2019 udp_sendmsg+0x2bf/0x980
2020 inet_sendmsg+0x67/0xa0
2021 sock_sendmsg+0x38/0x50
2022 ___sys_sendmsg+0x14e/0x270
2023 } hitcount: 76
2024 { stacktrace:
2025 netif_rx_internal+0xb2/0xd0
2026 netif_rx+0x1c/0x60
2027 loopback_xmit+0x6c/0xb0
2028 dev_hard_start_xmit+0x219/0x3a0
2029 __dev_queue_xmit+0x415/0x4f0
2030 dev_queue_xmit_sk+0x13/0x20
2031 ip_finish_output2+0x237/0x340
2032 ip_finish_output+0x113/0x1d0
2033 ip_output+0x66/0xc0
2034 ip_local_out_sk+0x31/0x40
2035 ip_send_skb+0x1a/0x50
2036 udp_send_skb+0x16d/0x270
2037 udp_sendmsg+0x2bf/0x980
2038 inet_sendmsg+0x67/0xa0
2039 sock_sendmsg+0x38/0x50
2040 ___sys_sendmsg+0x269/0x270
2041 } hitcount: 77
2042 { stacktrace:
2043 netif_rx_internal+0xb2/0xd0
2044 netif_rx+0x1c/0x60
2045 loopback_xmit+0x6c/0xb0
2046 dev_hard_start_xmit+0x219/0x3a0
2047 __dev_queue_xmit+0x415/0x4f0
2048 dev_queue_xmit_sk+0x13/0x20
2049 ip_finish_output2+0x237/0x340
2050 ip_finish_output+0x113/0x1d0
2051 ip_output+0x66/0xc0
2052 ip_local_out_sk+0x31/0x40
2053 ip_send_skb+0x1a/0x50
2054 udp_send_skb+0x16d/0x270
2055 udp_sendmsg+0x2bf/0x980
2056 inet_sendmsg+0x67/0xa0
2057 sock_sendmsg+0x38/0x50
2058 SYSC_sendto+0xef/0x170
2059 } hitcount: 88
2060 { stacktrace:
2061 _do_fork+0x18e/0x330
2062 SyS_clone+0x19/0x20
2063 entry_SYSCALL_64_fastpath+0x12/0x6a
2064 } hitcount: 244
2065
2066 Totals:
2067 Hits: 489
2068 Entries: 7
2069 Dropped: 0
diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst
index fdf5fb54a04c..e45f0786f3f9 100644
--- a/Documentation/trace/ftrace.rst
+++ b/Documentation/trace/ftrace.rst
@@ -543,6 +543,30 @@ of ftrace. Here is a list of some of the key files:
543 543
544 See events.txt for more information. 544 See events.txt for more information.
545 545
546 timestamp_mode:
547
548 Certain tracers may change the timestamp mode used when
549 logging trace events into the event buffer. Events with
550 different modes can coexist within a buffer but the mode in
551 effect when an event is logged determines which timestamp mode
552 is used for that event. The default timestamp mode is
553 'delta'.
554
555 Usual timestamp modes for tracing:
556
557 # cat timestamp_mode
558 [delta] absolute
559
560 The timestamp mode with the square brackets around it is the
561 one in effect.
562
563 delta: Default timestamp mode - timestamp is a delta against
564 a per-buffer timestamp.
565
566 absolute: The timestamp is a full timestamp, not a delta
567 against some other value. As such it takes up more
568 space and is less efficient.
569
546 hwlat_detector: 570 hwlat_detector:
547 571
548 Directory for the Hardware Latency Detector. 572 Directory for the Hardware Latency Detector.
diff --git a/Documentation/trace/histogram.txt b/Documentation/trace/histogram.txt
new file mode 100644
index 000000000000..6e05510afc28
--- /dev/null
+++ b/Documentation/trace/histogram.txt
@@ -0,0 +1,1995 @@
1 Event Histograms
2
3 Documentation written by Tom Zanussi
4
51. Introduction
6===============
7
8 Histogram triggers are special event triggers that can be used to
9 aggregate trace event data into histograms. For information on
10 trace events and event triggers, see Documentation/trace/events.txt.
11
12
132. Histogram Trigger Command
14============================
15
16 A histogram trigger command is an event trigger command that
17 aggregates event hits into a hash table keyed on one or more trace
18 event format fields (or stacktrace) and a set of running totals
19 derived from one or more trace event format fields and/or event
20 counts (hitcount).
21
22 The format of a hist trigger is as follows:
23
24 hist:keys=<field1[,field2,...]>[:values=<field1[,field2,...]>]
25 [:sort=<field1[,field2,...]>][:size=#entries][:pause][:continue]
26 [:clear][:name=histname1] [if <filter>]
27
28 When a matching event is hit, an entry is added to a hash table
29 using the key(s) and value(s) named. Keys and values correspond to
30 fields in the event's format description. Values must correspond to
31 numeric fields - on an event hit, the value(s) will be added to a
32 sum kept for that field. The special string 'hitcount' can be used
33 in place of an explicit value field - this is simply a count of
34 event hits. If 'values' isn't specified, an implicit 'hitcount'
35 value will be automatically created and used as the only value.
36 Keys can be any field, or the special string 'stacktrace', which
37 will use the event's kernel stacktrace as the key. The keywords
38 'keys' or 'key' can be used to specify keys, and the keywords
39 'values', 'vals', or 'val' can be used to specify values. Compound
40 keys consisting of up to two fields can be specified by the 'keys'
41 keyword. Hashing a compound key produces a unique entry in the
42 table for each unique combination of component keys, and can be
43 useful for providing more fine-grained summaries of event data.
44 Additionally, sort keys consisting of up to two fields can be
45 specified by the 'sort' keyword. If more than one field is
46 specified, the result will be a 'sort within a sort': the first key
47 is taken to be the primary sort key and the second the secondary
48 key. If a hist trigger is given a name using the 'name' parameter,
49 its histogram data will be shared with other triggers of the same
50 name, and trigger hits will update this common data. Only triggers
51 with 'compatible' fields can be combined in this way; triggers are
52 'compatible' if the fields named in the trigger share the same
53 number and type of fields and those fields also have the same names.
54 Note that any two events always share the compatible 'hitcount' and
55 'stacktrace' fields and can therefore be combined using those
56 fields, however pointless that may be.
57
58 'hist' triggers add a 'hist' file to each event's subdirectory.
59 Reading the 'hist' file for the event will dump the hash table in
60 its entirety to stdout. If there are multiple hist triggers
61 attached to an event, there will be a table for each trigger in the
62 output. The table displayed for a named trigger will be the same as
63 any other instance having the same name. Each printed hash table
64 entry is a simple list of the keys and values comprising the entry;
65 keys are printed first and are delineated by curly braces, and are
66 followed by the set of value fields for the entry. By default,
67 numeric fields are displayed as base-10 integers. This can be
68 modified by appending any of the following modifiers to the field
69 name:
70
71 .hex display a number as a hex value
72 .sym display an address as a symbol
73 .sym-offset display an address as a symbol and offset
74 .syscall display a syscall id as a system call name
75 .execname display a common_pid as a program name
76 .log2 display log2 value rather than raw number
77 .usecs display a common_timestamp in microseconds
78
79 Note that in general the semantics of a given field aren't
80 interpreted when applying a modifier to it, but there are some
81 restrictions to be aware of in this regard:
82
83 - only the 'hex' modifier can be used for values (because values
84 are essentially sums, and the other modifiers don't make sense
85 in that context).
86 - the 'execname' modifier can only be used on a 'common_pid'. The
87 reason for this is that the execname is simply the 'comm' value
88 saved for the 'current' process when an event was triggered,
89 which is the same as the common_pid value saved by the event
90 tracing code. Trying to apply that comm value to other pid
91 values wouldn't be correct, and typically events that care save
92 pid-specific comm fields in the event itself.
93
94 A typical usage scenario would be the following to enable a hist
95 trigger, read its current contents, and then turn it off:
96
97 # echo 'hist:keys=skbaddr.hex:vals=len' > \
98 /sys/kernel/debug/tracing/events/net/netif_rx/trigger
99
100 # cat /sys/kernel/debug/tracing/events/net/netif_rx/hist
101
102 # echo '!hist:keys=skbaddr.hex:vals=len' > \
103 /sys/kernel/debug/tracing/events/net/netif_rx/trigger
104
105 The trigger file itself can be read to show the details of the
106 currently attached hist trigger. This information is also displayed
107 at the top of the 'hist' file when read.
108
109 By default, the size of the hash table is 2048 entries. The 'size'
110 parameter can be used to specify more or fewer than that. The units
111 are in terms of hashtable entries - if a run uses more entries than
112 specified, the results will show the number of 'drops', the number
113 of hits that were ignored. The size should be a power of 2 between
 114	  128 and 131072 (any non-power-of-2 number specified will be rounded
115 up).
116
117 The 'sort' parameter can be used to specify a value field to sort
118 on. The default if unspecified is 'hitcount' and the default sort
119 order is 'ascending'. To sort in the opposite direction, append
 120	  '.descending' to the sort key.
121
122 The 'pause' parameter can be used to pause an existing hist trigger
123 or to start a hist trigger but not log any events until told to do
124 so. 'continue' or 'cont' can be used to start or restart a paused
125 hist trigger.
126
127 The 'clear' parameter will clear the contents of a running hist
128 trigger and leave its current paused/active state.
129
130 Note that the 'pause', 'cont', and 'clear' parameters should be
 131	  applied using the 'append' shell operator ('>>') if applied to an
132 existing trigger, rather than via the '>' operator, which will cause
133 the trigger to be removed through truncation.
134
135- enable_hist/disable_hist
136
137 The enable_hist and disable_hist triggers can be used to have one
138 event conditionally start and stop another event's already-attached
139 hist trigger. Any number of enable_hist and disable_hist triggers
140 can be attached to a given event, allowing that event to kick off
141 and stop aggregations on a host of other events.
142
143 The format is very similar to the enable/disable_event triggers:
144
145 enable_hist:<system>:<event>[:count]
146 disable_hist:<system>:<event>[:count]
147
148 Instead of enabling or disabling the tracing of the target event
149 into the trace buffer as the enable/disable_event triggers do, the
150 enable/disable_hist triggers enable or disable the aggregation of
151 the target event into a hash table.
152
153 A typical usage scenario for the enable_hist/disable_hist triggers
154 would be to first set up a paused hist trigger on some event,
155 followed by an enable_hist/disable_hist pair that turns the hist
156 aggregation on and off when conditions of interest are hit:
157
158 # echo 'hist:keys=skbaddr.hex:vals=len:pause' > \
159 /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
160
161 # echo 'enable_hist:net:netif_receive_skb if filename==/usr/bin/wget' > \
162 /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
163
164 # echo 'disable_hist:net:netif_receive_skb if comm==wget' > \
165 /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
166
167 The above sets up an initially paused hist trigger which is unpaused
168 and starts aggregating events when a given program is executed, and
169 which stops aggregating when the process exits and the hist trigger
170 is paused again.
171
172 The examples below provide a more concrete illustration of the
173 concepts and typical usage patterns discussed above.
174
175 'special' event fields
176 ------------------------
177
178 There are a number of 'special event fields' available for use as
179 keys or values in a hist trigger. These look like and behave as if
180 they were actual event fields, but aren't really part of the event's
181 field definition or format file. They are however available for any
182 event, and can be used anywhere an actual event field could be.
183 They are:
184
185 common_timestamp u64 - timestamp (from ring buffer) associated
186 with the event, in nanoseconds. May be
187 modified by .usecs to have timestamps
188 interpreted as microseconds.
189 cpu int - the cpu on which the event occurred.
190
191 Extended error information
192 --------------------------
193
194 For some error conditions encountered when invoking a hist trigger
195 command, extended error information is available via the
196 corresponding event's 'hist' file. Reading the hist file after an
197 error will display more detailed information about what went wrong,
198 if information is available. This extended error information will
199 be available until the next hist trigger command for that event.
200
201 If available for a given error condition, the extended error
202 information and usage takes the following form:
203
204 # echo xxx > /sys/kernel/debug/tracing/events/sched/sched_wakeup/trigger
205 echo: write error: Invalid argument
206
207 # cat /sys/kernel/debug/tracing/events/sched/sched_wakeup/hist
208 ERROR: Couldn't yyy: zzz
209 Last command: xxx
210
 211	2.2 'hist' trigger examples
212---------------------------
213
214 The first set of examples creates aggregations using the kmalloc
215 event. The fields that can be used for the hist trigger are listed
216 in the kmalloc event's format file:
217
218 # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/format
219 name: kmalloc
220 ID: 374
221 format:
222 field:unsigned short common_type; offset:0; size:2; signed:0;
223 field:unsigned char common_flags; offset:2; size:1; signed:0;
224 field:unsigned char common_preempt_count; offset:3; size:1; signed:0;
225 field:int common_pid; offset:4; size:4; signed:1;
226
227 field:unsigned long call_site; offset:8; size:8; signed:0;
228 field:const void * ptr; offset:16; size:8; signed:0;
229 field:size_t bytes_req; offset:24; size:8; signed:0;
230 field:size_t bytes_alloc; offset:32; size:8; signed:0;
231 field:gfp_t gfp_flags; offset:40; size:4; signed:0;
232
233 We'll start by creating a hist trigger that generates a simple table
234 that lists the total number of bytes requested for each function in
235 the kernel that made one or more calls to kmalloc:
236
237 # echo 'hist:key=call_site:val=bytes_req' > \
238 /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
239
240 This tells the tracing system to create a 'hist' trigger using the
241 call_site field of the kmalloc event as the key for the table, which
242 just means that each unique call_site address will have an entry
243 created for it in the table. The 'val=bytes_req' parameter tells
244 the hist trigger that for each unique entry (call_site) in the
245 table, it should keep a running total of the number of bytes
246 requested by that call_site.
247
248 We'll let it run for awhile and then dump the contents of the 'hist'
249 file in the kmalloc event's subdirectory (for readability, a number
250 of entries have been omitted):
251
252 # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
253 # trigger info: hist:keys=call_site:vals=bytes_req:sort=hitcount:size=2048 [active]
254
255 { call_site: 18446744072106379007 } hitcount: 1 bytes_req: 176
256 { call_site: 18446744071579557049 } hitcount: 1 bytes_req: 1024
257 { call_site: 18446744071580608289 } hitcount: 1 bytes_req: 16384
258 { call_site: 18446744071581827654 } hitcount: 1 bytes_req: 24
259 { call_site: 18446744071580700980 } hitcount: 1 bytes_req: 8
260 { call_site: 18446744071579359876 } hitcount: 1 bytes_req: 152
261 { call_site: 18446744071580795365 } hitcount: 3 bytes_req: 144
262 { call_site: 18446744071581303129 } hitcount: 3 bytes_req: 144
263 { call_site: 18446744071580713234 } hitcount: 4 bytes_req: 2560
264 { call_site: 18446744071580933750 } hitcount: 4 bytes_req: 736
265 .
266 .
267 .
268 { call_site: 18446744072106047046 } hitcount: 69 bytes_req: 5576
269 { call_site: 18446744071582116407 } hitcount: 73 bytes_req: 2336
270 { call_site: 18446744072106054684 } hitcount: 136 bytes_req: 140504
271 { call_site: 18446744072106224230 } hitcount: 136 bytes_req: 19584
272 { call_site: 18446744072106078074 } hitcount: 153 bytes_req: 2448
273 { call_site: 18446744072106062406 } hitcount: 153 bytes_req: 36720
274 { call_site: 18446744071582507929 } hitcount: 153 bytes_req: 37088
275 { call_site: 18446744072102520590 } hitcount: 273 bytes_req: 10920
276 { call_site: 18446744071582143559 } hitcount: 358 bytes_req: 716
277 { call_site: 18446744072106465852 } hitcount: 417 bytes_req: 56712
278 { call_site: 18446744072102523378 } hitcount: 485 bytes_req: 27160
279 { call_site: 18446744072099568646 } hitcount: 1676 bytes_req: 33520
280
281 Totals:
282 Hits: 4610
283 Entries: 45
284 Dropped: 0
285
286 The output displays a line for each entry, beginning with the key
287 specified in the trigger, followed by the value(s) also specified in
288 the trigger. At the beginning of the output is a line that displays
289 the trigger info, which can also be displayed by reading the
290 'trigger' file:
291
292 # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
293 hist:keys=call_site:vals=bytes_req:sort=hitcount:size=2048 [active]
294
295 At the end of the output are a few lines that display the overall
296 totals for the run. The 'Hits' field shows the total number of
297 times the event trigger was hit, the 'Entries' field shows the total
298 number of used entries in the hash table, and the 'Dropped' field
299 shows the number of hits that were dropped because the number of
300 used entries for the run exceeded the maximum number of entries
301 allowed for the table (normally 0, but if not a hint that you may
302 want to increase the size of the table using the 'size' parameter).
303
304 Notice in the above output that there's an extra field, 'hitcount',
305 which wasn't specified in the trigger. Also notice that in the
306 trigger info output, there's a parameter, 'sort=hitcount', which
307 wasn't specified in the trigger either. The reason for that is that
308 every trigger implicitly keeps a count of the total number of hits
309 attributed to a given entry, called the 'hitcount'. That hitcount
310 information is explicitly displayed in the output, and in the
311 absence of a user-specified sort parameter, is used as the default
312 sort field.
313
314 The value 'hitcount' can be used in place of an explicit value in
315 the 'values' parameter if you don't really need to have any
316 particular field summed and are mainly interested in hit
317 frequencies.
318
319 To turn the hist trigger off, simply call up the trigger in the
320 command history and re-execute it with a '!' prepended:
321
322 # echo '!hist:key=call_site:val=bytes_req' > \
323 /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
324
325 Finally, notice that the call_site as displayed in the output above
326 isn't really very useful. It's an address, but normally addresses
327 are displayed in hex. To have a numeric field displayed as a hex
328 value, simply append '.hex' to the field name in the trigger:
329
330 # echo 'hist:key=call_site.hex:val=bytes_req' > \
331 /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
332
333 # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
334 # trigger info: hist:keys=call_site.hex:vals=bytes_req:sort=hitcount:size=2048 [active]
335
336 { call_site: ffffffffa026b291 } hitcount: 1 bytes_req: 433
337 { call_site: ffffffffa07186ff } hitcount: 1 bytes_req: 176
338 { call_site: ffffffff811ae721 } hitcount: 1 bytes_req: 16384
339 { call_site: ffffffff811c5134 } hitcount: 1 bytes_req: 8
340 { call_site: ffffffffa04a9ebb } hitcount: 1 bytes_req: 511
341 { call_site: ffffffff8122e0a6 } hitcount: 1 bytes_req: 12
342 { call_site: ffffffff8107da84 } hitcount: 1 bytes_req: 152
343 { call_site: ffffffff812d8246 } hitcount: 1 bytes_req: 24
344 { call_site: ffffffff811dc1e5 } hitcount: 3 bytes_req: 144
345 { call_site: ffffffffa02515e8 } hitcount: 3 bytes_req: 648
346 { call_site: ffffffff81258159 } hitcount: 3 bytes_req: 144
347 { call_site: ffffffff811c80f4 } hitcount: 4 bytes_req: 544
348 .
349 .
350 .
351 { call_site: ffffffffa06c7646 } hitcount: 106 bytes_req: 8024
352 { call_site: ffffffffa06cb246 } hitcount: 132 bytes_req: 31680
353 { call_site: ffffffffa06cef7a } hitcount: 132 bytes_req: 2112
354 { call_site: ffffffff8137e399 } hitcount: 132 bytes_req: 23232
355 { call_site: ffffffffa06c941c } hitcount: 185 bytes_req: 171360
356 { call_site: ffffffffa06f2a66 } hitcount: 185 bytes_req: 26640
357 { call_site: ffffffffa036a70e } hitcount: 265 bytes_req: 10600
358 { call_site: ffffffff81325447 } hitcount: 292 bytes_req: 584
359 { call_site: ffffffffa072da3c } hitcount: 446 bytes_req: 60656
360 { call_site: ffffffffa036b1f2 } hitcount: 526 bytes_req: 29456
361 { call_site: ffffffffa0099c06 } hitcount: 1780 bytes_req: 35600
362
363 Totals:
364 Hits: 4775
365 Entries: 46
366 Dropped: 0
367
368 Even that's only marginally more useful - while hex values do look
369 more like addresses, what users are typically more interested in
370 when looking at text addresses are the corresponding symbols
371 instead. To have an address displayed as symbolic value instead,
372 simply append '.sym' or '.sym-offset' to the field name in the
373 trigger:
374
375 # echo 'hist:key=call_site.sym:val=bytes_req' > \
376 /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
377
378 # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
379 # trigger info: hist:keys=call_site.sym:vals=bytes_req:sort=hitcount:size=2048 [active]
380
381 { call_site: [ffffffff810adcb9] syslog_print_all } hitcount: 1 bytes_req: 1024
382 { call_site: [ffffffff8154bc62] usb_control_msg } hitcount: 1 bytes_req: 8
383 { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid] } hitcount: 1 bytes_req: 7
384 { call_site: [ffffffff8154acbe] usb_alloc_urb } hitcount: 1 bytes_req: 192
385 { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid] } hitcount: 1 bytes_req: 7
386 { call_site: [ffffffff811e3a25] __seq_open_private } hitcount: 1 bytes_req: 40
387 { call_site: [ffffffff8109524a] alloc_fair_sched_group } hitcount: 2 bytes_req: 128
388 { call_site: [ffffffff811febd5] fsnotify_alloc_group } hitcount: 2 bytes_req: 528
389 { call_site: [ffffffff81440f58] __tty_buffer_request_room } hitcount: 2 bytes_req: 2624
390 { call_site: [ffffffff81200ba6] inotify_new_group } hitcount: 2 bytes_req: 96
391 { call_site: [ffffffffa05e19af] ieee80211_start_tx_ba_session [mac80211] } hitcount: 2 bytes_req: 464
392 { call_site: [ffffffff81672406] tcp_get_metrics } hitcount: 2 bytes_req: 304
393 { call_site: [ffffffff81097ec2] alloc_rt_sched_group } hitcount: 2 bytes_req: 128
394 { call_site: [ffffffff81089b05] sched_create_group } hitcount: 2 bytes_req: 1424
395 .
396 .
397 .
398 { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915] } hitcount: 1185 bytes_req: 123240
399 { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl [drm] } hitcount: 1185 bytes_req: 104280
400 { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915] } hitcount: 1402 bytes_req: 190672
401 { call_site: [ffffffff812891ca] ext4_find_extent } hitcount: 1518 bytes_req: 146208
402 { call_site: [ffffffffa029070e] drm_vma_node_allow [drm] } hitcount: 1746 bytes_req: 69840
403 { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 2021 bytes_req: 792312
404 { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm] } hitcount: 2592 bytes_req: 145152
405 { call_site: [ffffffffa0489a66] intel_ring_begin [i915] } hitcount: 2629 bytes_req: 378576
406 { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915] } hitcount: 2629 bytes_req: 3783248
407 { call_site: [ffffffff81325607] apparmor_file_alloc_security } hitcount: 5192 bytes_req: 10384
408 { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid] } hitcount: 5529 bytes_req: 110584
409 { call_site: [ffffffff8131ebf7] aa_alloc_task_context } hitcount: 21943 bytes_req: 702176
410 { call_site: [ffffffff8125847d] ext4_htree_store_dirent } hitcount: 55759 bytes_req: 5074265
411
412 Totals:
413 Hits: 109928
414 Entries: 71
415 Dropped: 0
416
417 Because the default sort key above is 'hitcount', the above shows
418 the list of call_sites by increasing hitcount, so that at the bottom
419 we see the functions that made the most kmalloc calls during the
420 run. If instead we wanted to see the top kmalloc callers in
421 terms of the number of bytes requested rather than the number of
422 calls, and we wanted the top caller to appear at the top, we can use
423 the 'sort' parameter, along with the 'descending' modifier:
424
425 # echo 'hist:key=call_site.sym:val=bytes_req:sort=bytes_req.descending' > \
426 /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
427
428 # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
429 # trigger info: hist:keys=call_site.sym:vals=bytes_req:sort=bytes_req.descending:size=2048 [active]
430
431 { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915] } hitcount: 2186 bytes_req: 3397464
432 { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 1790 bytes_req: 712176
433 { call_site: [ffffffff8125847d] ext4_htree_store_dirent } hitcount: 8132 bytes_req: 513135
434 { call_site: [ffffffff811e2a1b] seq_buf_alloc } hitcount: 106 bytes_req: 440128
435 { call_site: [ffffffffa0489a66] intel_ring_begin [i915] } hitcount: 2186 bytes_req: 314784
436 { call_site: [ffffffff812891ca] ext4_find_extent } hitcount: 2174 bytes_req: 208992
437 { call_site: [ffffffff811ae8e1] __kmalloc } hitcount: 8 bytes_req: 131072
438 { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915] } hitcount: 859 bytes_req: 116824
439 { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm] } hitcount: 1834 bytes_req: 102704
440 { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915] } hitcount: 972 bytes_req: 101088
441 { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl [drm] } hitcount: 972 bytes_req: 85536
442 { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid] } hitcount: 3333 bytes_req: 66664
443 { call_site: [ffffffff8137e559] sg_kmalloc } hitcount: 209 bytes_req: 61632
444 .
445 .
446 .
447 { call_site: [ffffffff81095225] alloc_fair_sched_group } hitcount: 2 bytes_req: 128
448 { call_site: [ffffffff81097ec2] alloc_rt_sched_group } hitcount: 2 bytes_req: 128
449 { call_site: [ffffffff812d8406] copy_semundo } hitcount: 2 bytes_req: 48
450 { call_site: [ffffffff81200ba6] inotify_new_group } hitcount: 1 bytes_req: 48
451 { call_site: [ffffffffa027121a] drm_getmagic [drm] } hitcount: 1 bytes_req: 48
452 { call_site: [ffffffff811e3a25] __seq_open_private } hitcount: 1 bytes_req: 40
453 { call_site: [ffffffff811c52f4] bprm_change_interp } hitcount: 2 bytes_req: 16
454 { call_site: [ffffffff8154bc62] usb_control_msg } hitcount: 1 bytes_req: 8
455 { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid] } hitcount: 1 bytes_req: 7
456 { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid] } hitcount: 1 bytes_req: 7
457
458 Totals:
459 Hits: 32133
460 Entries: 81
461 Dropped: 0
462
463 To display the offset and size information in addition to the symbol
464 name, just use 'sym-offset' instead:
465
466 # echo 'hist:key=call_site.sym-offset:val=bytes_req:sort=bytes_req.descending' > \
467 /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
468
469 # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
470 # trigger info: hist:keys=call_site.sym-offset:vals=bytes_req:sort=bytes_req.descending:size=2048 [active]
471
472 { call_site: [ffffffffa046041c] i915_gem_execbuffer2+0x6c/0x2c0 [i915] } hitcount: 4569 bytes_req: 3163720
473 { call_site: [ffffffffa0489a66] intel_ring_begin+0xc6/0x1f0 [i915] } hitcount: 4569 bytes_req: 657936
474 { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23+0x694/0x1020 [i915] } hitcount: 1519 bytes_req: 472936
475 { call_site: [ffffffffa045e646] i915_gem_do_execbuffer.isra.23+0x516/0x1020 [i915] } hitcount: 3050 bytes_req: 211832
476 { call_site: [ffffffff811e2a1b] seq_buf_alloc+0x1b/0x50 } hitcount: 34 bytes_req: 148384
477 { call_site: [ffffffffa04a580c] intel_crtc_page_flip+0xbc/0x870 [i915] } hitcount: 1385 bytes_req: 144040
478 { call_site: [ffffffff811ae8e1] __kmalloc+0x191/0x1b0 } hitcount: 8 bytes_req: 131072
479 { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl+0x282/0x360 [drm] } hitcount: 1385 bytes_req: 121880
480 { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc+0x32/0x100 [drm] } hitcount: 1848 bytes_req: 103488
481 { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state+0x2c/0xa0 [i915] } hitcount: 461 bytes_req: 62696
482 { call_site: [ffffffffa029070e] drm_vma_node_allow+0x2e/0xd0 [drm] } hitcount: 1541 bytes_req: 61640
483 { call_site: [ffffffff815f8d7b] sk_prot_alloc+0xcb/0x1b0 } hitcount: 57 bytes_req: 57456
484 .
485 .
486 .
487 { call_site: [ffffffff8109524a] alloc_fair_sched_group+0x5a/0x1a0 } hitcount: 2 bytes_req: 128
488 { call_site: [ffffffffa027b921] drm_vm_open_locked+0x31/0xa0 [drm] } hitcount: 3 bytes_req: 96
489 { call_site: [ffffffff8122e266] proc_self_follow_link+0x76/0xb0 } hitcount: 8 bytes_req: 96
490 { call_site: [ffffffff81213e80] load_elf_binary+0x240/0x1650 } hitcount: 3 bytes_req: 84
491 { call_site: [ffffffff8154bc62] usb_control_msg+0x42/0x110 } hitcount: 1 bytes_req: 8
492 { call_site: [ffffffffa00bf6fe] hidraw_send_report+0x7e/0x1a0 [hid] } hitcount: 1 bytes_req: 7
493 { call_site: [ffffffffa00bf1ca] hidraw_report_event+0x8a/0x120 [hid] } hitcount: 1 bytes_req: 7
494
495 Totals:
496 Hits: 26098
497 Entries: 64
498 Dropped: 0
499
500 We can also add multiple fields to the 'values' parameter. For
501 example, we might want to see the total number of bytes allocated
502 alongside bytes requested, and display the result sorted by bytes
503 allocated in a descending order:
504
505 # echo 'hist:keys=call_site.sym:values=bytes_req,bytes_alloc:sort=bytes_alloc.descending' > \
506 /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
507
508 # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
509 # trigger info: hist:keys=call_site.sym:vals=bytes_req,bytes_alloc:sort=bytes_alloc.descending:size=2048 [active]
510
511 { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915] } hitcount: 7403 bytes_req: 4084360 bytes_alloc: 5958016
512 { call_site: [ffffffff811e2a1b] seq_buf_alloc } hitcount: 541 bytes_req: 2213968 bytes_alloc: 2228224
513 { call_site: [ffffffffa0489a66] intel_ring_begin [i915] } hitcount: 7404 bytes_req: 1066176 bytes_alloc: 1421568
514 { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 1565 bytes_req: 557368 bytes_alloc: 1037760
515 { call_site: [ffffffff8125847d] ext4_htree_store_dirent } hitcount: 9557 bytes_req: 595778 bytes_alloc: 695744
516 { call_site: [ffffffffa045e646] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 5839 bytes_req: 430680 bytes_alloc: 470400
517 { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915] } hitcount: 2388 bytes_req: 324768 bytes_alloc: 458496
518 { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm] } hitcount: 3911 bytes_req: 219016 bytes_alloc: 250304
519 { call_site: [ffffffff815f8d7b] sk_prot_alloc } hitcount: 235 bytes_req: 236880 bytes_alloc: 240640
520 { call_site: [ffffffff8137e559] sg_kmalloc } hitcount: 557 bytes_req: 169024 bytes_alloc: 221760
521 { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid] } hitcount: 9378 bytes_req: 187548 bytes_alloc: 206312
522 { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915] } hitcount: 1519 bytes_req: 157976 bytes_alloc: 194432
523 .
524 .
525 .
526 { call_site: [ffffffff8109bd3b] sched_autogroup_create_attach } hitcount: 2 bytes_req: 144 bytes_alloc: 192
527 { call_site: [ffffffff81097ee8] alloc_rt_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
528 { call_site: [ffffffff8109524a] alloc_fair_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
529 { call_site: [ffffffff81095225] alloc_fair_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
530 { call_site: [ffffffff81097ec2] alloc_rt_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
531 { call_site: [ffffffff81213e80] load_elf_binary } hitcount: 3 bytes_req: 84 bytes_alloc: 96
532 { call_site: [ffffffff81079a2e] kthread_create_on_node } hitcount: 1 bytes_req: 56 bytes_alloc: 64
533 { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid] } hitcount: 1 bytes_req: 7 bytes_alloc: 8
534 { call_site: [ffffffff8154bc62] usb_control_msg } hitcount: 1 bytes_req: 8 bytes_alloc: 8
535 { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid] } hitcount: 1 bytes_req: 7 bytes_alloc: 8
536
537 Totals:
538 Hits: 66598
539 Entries: 65
540 Dropped: 0
541
542 Finally, to finish off our kmalloc example, instead of simply having
543 the hist trigger display symbolic call_sites, we can have the hist
544 trigger additionally display the complete set of kernel stack traces
545 that led to each call_site. To do that, we simply use the special
546 value 'stacktrace' for the key parameter:
547
548 # echo 'hist:keys=stacktrace:values=bytes_req,bytes_alloc:sort=bytes_alloc' > \
549 /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
550
551 The above trigger will use the kernel stack trace in effect when an
552 event is triggered as the key for the hash table. This allows the
553 enumeration of every kernel callpath that led up to a particular
554 event, along with a running total of any of the event fields for
555 that event. Here we tally bytes requested and bytes allocated for
556 every callpath in the system that led up to a kmalloc (in this case
557 every callpath to a kmalloc for a kernel compile):
558
559 # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
560 # trigger info: hist:keys=stacktrace:vals=bytes_req,bytes_alloc:sort=bytes_alloc:size=2048 [active]
561
562 { stacktrace:
563 __kmalloc_track_caller+0x10b/0x1a0
564 kmemdup+0x20/0x50
565 hidraw_report_event+0x8a/0x120 [hid]
566 hid_report_raw_event+0x3ea/0x440 [hid]
567 hid_input_report+0x112/0x190 [hid]
568 hid_irq_in+0xc2/0x260 [usbhid]
569 __usb_hcd_giveback_urb+0x72/0x120
570 usb_giveback_urb_bh+0x9e/0xe0
571 tasklet_hi_action+0xf8/0x100
572 __do_softirq+0x114/0x2c0
573 irq_exit+0xa5/0xb0
574 do_IRQ+0x5a/0xf0
575 ret_from_intr+0x0/0x30
576 cpuidle_enter+0x17/0x20
577 cpu_startup_entry+0x315/0x3e0
578 rest_init+0x7c/0x80
579 } hitcount: 3 bytes_req: 21 bytes_alloc: 24
580 { stacktrace:
581 __kmalloc_track_caller+0x10b/0x1a0
582 kmemdup+0x20/0x50
583 hidraw_report_event+0x8a/0x120 [hid]
584 hid_report_raw_event+0x3ea/0x440 [hid]
585 hid_input_report+0x112/0x190 [hid]
586 hid_irq_in+0xc2/0x260 [usbhid]
587 __usb_hcd_giveback_urb+0x72/0x120
588 usb_giveback_urb_bh+0x9e/0xe0
589 tasklet_hi_action+0xf8/0x100
590 __do_softirq+0x114/0x2c0
591 irq_exit+0xa5/0xb0
592 do_IRQ+0x5a/0xf0
593 ret_from_intr+0x0/0x30
594 } hitcount: 3 bytes_req: 21 bytes_alloc: 24
595 { stacktrace:
596 kmem_cache_alloc_trace+0xeb/0x150
597 aa_alloc_task_context+0x27/0x40
598 apparmor_cred_prepare+0x1f/0x50
599 security_prepare_creds+0x16/0x20
600 prepare_creds+0xdf/0x1a0
601 SyS_capset+0xb5/0x200
602 system_call_fastpath+0x12/0x6a
603 } hitcount: 1 bytes_req: 32 bytes_alloc: 32
604 .
605 .
606 .
607 { stacktrace:
608 __kmalloc+0x11b/0x1b0
609 i915_gem_execbuffer2+0x6c/0x2c0 [i915]
610 drm_ioctl+0x349/0x670 [drm]
611 do_vfs_ioctl+0x2f0/0x4f0
612 SyS_ioctl+0x81/0xa0
613 system_call_fastpath+0x12/0x6a
614 } hitcount: 17726 bytes_req: 13944120 bytes_alloc: 19593808
615 { stacktrace:
616 __kmalloc+0x11b/0x1b0
617 load_elf_phdrs+0x76/0xa0
618 load_elf_binary+0x102/0x1650
619 search_binary_handler+0x97/0x1d0
620 do_execveat_common.isra.34+0x551/0x6e0
621 SyS_execve+0x3a/0x50
622 return_from_execve+0x0/0x23
623 } hitcount: 33348 bytes_req: 17152128 bytes_alloc: 20226048
624 { stacktrace:
625 kmem_cache_alloc_trace+0xeb/0x150
626 apparmor_file_alloc_security+0x27/0x40
627 security_file_alloc+0x16/0x20
628 get_empty_filp+0x93/0x1c0
629 path_openat+0x31/0x5f0
630 do_filp_open+0x3a/0x90
631 do_sys_open+0x128/0x220
632 SyS_open+0x1e/0x20
633 system_call_fastpath+0x12/0x6a
634 } hitcount: 4766422 bytes_req: 9532844 bytes_alloc: 38131376
635 { stacktrace:
636 __kmalloc+0x11b/0x1b0
637 seq_buf_alloc+0x1b/0x50
638 seq_read+0x2cc/0x370
639 proc_reg_read+0x3d/0x80
640 __vfs_read+0x28/0xe0
641 vfs_read+0x86/0x140
642 SyS_read+0x46/0xb0
643 system_call_fastpath+0x12/0x6a
644 } hitcount: 19133 bytes_req: 78368768 bytes_alloc: 78368768
645
646 Totals:
647 Hits: 6085872
648 Entries: 253
649 Dropped: 0
650
651 If you key a hist trigger on common_pid, in order for example to
652 gather and display sorted totals for each process, you can use the
653 special .execname modifier to display the executable names for the
654 processes in the table rather than raw pids. The example below
655 keeps a per-process sum of total bytes read:
656
657 # echo 'hist:key=common_pid.execname:val=count:sort=count.descending' > \
658 /sys/kernel/debug/tracing/events/syscalls/sys_enter_read/trigger
659
660 # cat /sys/kernel/debug/tracing/events/syscalls/sys_enter_read/hist
661 # trigger info: hist:keys=common_pid.execname:vals=count:sort=count.descending:size=2048 [active]
662
663 { common_pid: gnome-terminal [ 3196] } hitcount: 280 count: 1093512
664 { common_pid: Xorg [ 1309] } hitcount: 525 count: 256640
665 { common_pid: compiz [ 2889] } hitcount: 59 count: 254400
666 { common_pid: bash [ 8710] } hitcount: 3 count: 66369
667 { common_pid: dbus-daemon-lau [ 8703] } hitcount: 49 count: 47739
668 { common_pid: irqbalance [ 1252] } hitcount: 27 count: 27648
669 { common_pid: 01ifupdown [ 8705] } hitcount: 3 count: 17216
670 { common_pid: dbus-daemon [ 772] } hitcount: 10 count: 12396
671 { common_pid: Socket Thread [ 8342] } hitcount: 11 count: 11264
672 { common_pid: nm-dhcp-client. [ 8701] } hitcount: 6 count: 7424
673 { common_pid: gmain [ 1315] } hitcount: 18 count: 6336
674 .
675 .
676 .
677 { common_pid: postgres [ 1892] } hitcount: 2 count: 32
678 { common_pid: postgres [ 1891] } hitcount: 2 count: 32
679 { common_pid: gmain [ 8704] } hitcount: 2 count: 32
680 { common_pid: upstart-dbus-br [ 2740] } hitcount: 21 count: 21
681 { common_pid: nm-dispatcher.a [ 8696] } hitcount: 1 count: 16
682 { common_pid: indicator-datet [ 2904] } hitcount: 1 count: 16
683 { common_pid: gdbus [ 2998] } hitcount: 1 count: 16
684 { common_pid: rtkit-daemon [ 2052] } hitcount: 1 count: 8
685 { common_pid: init [ 1] } hitcount: 2 count: 2
686
687 Totals:
688 Hits: 2116
689 Entries: 51
690 Dropped: 0
691
692 Similarly, if you key a hist trigger on syscall id, for example to
693 gather and display a list of systemwide syscall hits, you can use
694 the special .syscall modifier to display the syscall names rather
695 than raw ids. The example below keeps a running total of syscall
696 counts for the system during the run:
697
698 # echo 'hist:key=id.syscall:val=hitcount' > \
699 /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
700
701 # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
702 # trigger info: hist:keys=id.syscall:vals=hitcount:sort=hitcount:size=2048 [active]
703
704 { id: sys_fsync [ 74] } hitcount: 1
705 { id: sys_newuname [ 63] } hitcount: 1
706 { id: sys_prctl [157] } hitcount: 1
707 { id: sys_statfs [137] } hitcount: 1
708 { id: sys_symlink [ 88] } hitcount: 1
709 { id: sys_sendmmsg [307] } hitcount: 1
710 { id: sys_semctl [ 66] } hitcount: 1
711 { id: sys_readlink [ 89] } hitcount: 3
712 { id: sys_bind [ 49] } hitcount: 3
713 { id: sys_getsockname [ 51] } hitcount: 3
714 { id: sys_unlink [ 87] } hitcount: 3
715 { id: sys_rename [ 82] } hitcount: 4
716 { id: unknown_syscall [ 58] } hitcount: 4
717 { id: sys_connect [ 42] } hitcount: 4
718 { id: sys_getpid [ 39] } hitcount: 4
719 .
720 .
721 .
722 { id: sys_rt_sigprocmask [ 14] } hitcount: 952
723 { id: sys_futex [202] } hitcount: 1534
724 { id: sys_write [ 1] } hitcount: 2689
725 { id: sys_setitimer [ 38] } hitcount: 2797
726 { id: sys_read [ 0] } hitcount: 3202
727 { id: sys_select [ 23] } hitcount: 3773
728 { id: sys_writev [ 20] } hitcount: 4531
729 { id: sys_poll [ 7] } hitcount: 8314
730 { id: sys_recvmsg [ 47] } hitcount: 13738
731 { id: sys_ioctl [ 16] } hitcount: 21843
732
733 Totals:
734 Hits: 67612
735 Entries: 72
736 Dropped: 0
737
738 The syscall counts above provide a rough overall picture of system
739 call activity on the system; we can see for example that the most
740 popular system call on this system was the 'sys_ioctl' system call.
741
742 We can use 'compound' keys to refine that number and provide some
743 further insight as to which processes exactly contribute to the
744 overall ioctl count.
745
746 The command below keeps a hitcount for every unique combination of
747 system call id and pid - the end result is essentially a table
748 that keeps a per-pid sum of system call hits. The results are
749 sorted using the system call id as the primary key, and the
750 hitcount sum as the secondary key:
751
752 # echo 'hist:key=id.syscall,common_pid.execname:val=hitcount:sort=id,hitcount' > \
753 /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
754
755 # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
756 # trigger info: hist:keys=id.syscall,common_pid.execname:vals=hitcount:sort=id.syscall,hitcount:size=2048 [active]
757
758 { id: sys_read [ 0], common_pid: rtkit-daemon [ 1877] } hitcount: 1
759 { id: sys_read [ 0], common_pid: gdbus [ 2976] } hitcount: 1
760 { id: sys_read [ 0], common_pid: console-kit-dae [ 3400] } hitcount: 1
761 { id: sys_read [ 0], common_pid: postgres [ 1865] } hitcount: 1
762 { id: sys_read [ 0], common_pid: deja-dup-monito [ 3543] } hitcount: 2
763 { id: sys_read [ 0], common_pid: NetworkManager [ 890] } hitcount: 2
764 { id: sys_read [ 0], common_pid: evolution-calen [ 3048] } hitcount: 2
765 { id: sys_read [ 0], common_pid: postgres [ 1864] } hitcount: 2
766 { id: sys_read [ 0], common_pid: nm-applet [ 3022] } hitcount: 2
767 { id: sys_read [ 0], common_pid: whoopsie [ 1212] } hitcount: 2
768 .
769 .
770 .
771 { id: sys_ioctl [ 16], common_pid: bash [ 8479] } hitcount: 1
772 { id: sys_ioctl [ 16], common_pid: bash [ 3472] } hitcount: 12
773 { id: sys_ioctl [ 16], common_pid: gnome-terminal [ 3199] } hitcount: 16
774 { id: sys_ioctl [ 16], common_pid: Xorg [ 1267] } hitcount: 1808
775 { id: sys_ioctl [ 16], common_pid: compiz [ 2994] } hitcount: 5580
776 .
777 .
778 .
779 { id: sys_waitid [247], common_pid: upstart-dbus-br [ 2690] } hitcount: 3
780 { id: sys_waitid [247], common_pid: upstart-dbus-br [ 2688] } hitcount: 16
781 { id: sys_inotify_add_watch [254], common_pid: gmain [ 975] } hitcount: 2
782 { id: sys_inotify_add_watch [254], common_pid: gmain [ 3204] } hitcount: 4
783 { id: sys_inotify_add_watch [254], common_pid: gmain [ 2888] } hitcount: 4
784 { id: sys_inotify_add_watch [254], common_pid: gmain [ 3003] } hitcount: 4
785 { id: sys_inotify_add_watch [254], common_pid: gmain [ 2873] } hitcount: 4
786 { id: sys_inotify_add_watch [254], common_pid: gmain [ 3196] } hitcount: 6
787 { id: sys_openat [257], common_pid: java [ 2623] } hitcount: 2
788 { id: sys_eventfd2 [290], common_pid: ibus-ui-gtk3 [ 2760] } hitcount: 4
789 { id: sys_eventfd2 [290], common_pid: compiz [ 2994] } hitcount: 6
790
791 Totals:
792 Hits: 31536
793 Entries: 323
794 Dropped: 0
795
796 The above list does give us a breakdown of the ioctl syscall by
797 pid, but it also gives us quite a bit more than that, which we
798 don't really care about at the moment. Since we know the syscall
799 id for sys_ioctl (16, displayed next to the sys_ioctl name), we
800 can use that to filter out all the other syscalls:
801
802 # echo 'hist:key=id.syscall,common_pid.execname:val=hitcount:sort=id,hitcount if id == 16' > \
803 /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
804
805 # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
806 # trigger info: hist:keys=id.syscall,common_pid.execname:vals=hitcount:sort=id.syscall,hitcount:size=2048 if id == 16 [active]
807
808 { id: sys_ioctl [ 16], common_pid: gmain [ 2769] } hitcount: 1
809 { id: sys_ioctl [ 16], common_pid: evolution-addre [ 8571] } hitcount: 1
810 { id: sys_ioctl [ 16], common_pid: gmain [ 3003] } hitcount: 1
811 { id: sys_ioctl [ 16], common_pid: gmain [ 2781] } hitcount: 1
812 { id: sys_ioctl [ 16], common_pid: gmain [ 2829] } hitcount: 1
813 { id: sys_ioctl [ 16], common_pid: bash [ 8726] } hitcount: 1
814 { id: sys_ioctl [ 16], common_pid: bash [ 8508] } hitcount: 1
815 { id: sys_ioctl [ 16], common_pid: gmain [ 2970] } hitcount: 1
816 { id: sys_ioctl [ 16], common_pid: gmain [ 2768] } hitcount: 1
817 .
818 .
819 .
820 { id: sys_ioctl [ 16], common_pid: pool [ 8559] } hitcount: 45
821 { id: sys_ioctl [ 16], common_pid: pool [ 8555] } hitcount: 48
822 { id: sys_ioctl [ 16], common_pid: pool [ 8551] } hitcount: 48
823 { id: sys_ioctl [ 16], common_pid: avahi-daemon [ 896] } hitcount: 66
824 { id: sys_ioctl [ 16], common_pid: Xorg [ 1267] } hitcount: 26674
825 { id: sys_ioctl [ 16], common_pid: compiz [ 2994] } hitcount: 73443
826
827 Totals:
828 Hits: 101162
829 Entries: 103
830 Dropped: 0
831
832 The above output shows that 'compiz' and 'Xorg' are far and away
833 the heaviest ioctl callers (which might lead to questions about
834 whether they really need to be making all those calls and to
835 possible avenues for further investigation.)
836
837 The compound key examples used a key and a sum value (hitcount) to
838 sort the output, but we can just as easily use two keys instead.
839 Here's an example where we use a compound key composed of the
840 common_pid and size event fields. Sorting with pid as the primary
841 key and 'size' as the secondary key allows us to display an
842 ordered summary of the recvfrom sizes, with counts, received by
843 each process:
844
845 # echo 'hist:key=common_pid.execname,size:val=hitcount:sort=common_pid,size' > \
846 /sys/kernel/debug/tracing/events/syscalls/sys_enter_recvfrom/trigger
847
848 # cat /sys/kernel/debug/tracing/events/syscalls/sys_enter_recvfrom/hist
849 # trigger info: hist:keys=common_pid.execname,size:vals=hitcount:sort=common_pid.execname,size:size=2048 [active]
850
851 { common_pid: smbd [ 784], size: 4 } hitcount: 1
852 { common_pid: dnsmasq [ 1412], size: 4096 } hitcount: 672
853 { common_pid: postgres [ 1796], size: 1000 } hitcount: 6
854 { common_pid: postgres [ 1867], size: 1000 } hitcount: 10
855 { common_pid: bamfdaemon [ 2787], size: 28 } hitcount: 2
856 { common_pid: bamfdaemon [ 2787], size: 14360 } hitcount: 1
857 { common_pid: compiz [ 2994], size: 8 } hitcount: 1
858 { common_pid: compiz [ 2994], size: 20 } hitcount: 11
859 { common_pid: gnome-terminal [ 3199], size: 4 } hitcount: 2
860 { common_pid: firefox [ 8817], size: 4 } hitcount: 1
861 { common_pid: firefox [ 8817], size: 8 } hitcount: 5
862 { common_pid: firefox [ 8817], size: 588 } hitcount: 2
863 { common_pid: firefox [ 8817], size: 628 } hitcount: 1
864 { common_pid: firefox [ 8817], size: 6944 } hitcount: 1
865 { common_pid: firefox [ 8817], size: 408880 } hitcount: 2
866 { common_pid: firefox [ 8822], size: 8 } hitcount: 2
867 { common_pid: firefox [ 8822], size: 160 } hitcount: 2
868 { common_pid: firefox [ 8822], size: 320 } hitcount: 2
869 { common_pid: firefox [ 8822], size: 352 } hitcount: 1
870 .
871 .
872 .
873 { common_pid: pool [ 8923], size: 1960 } hitcount: 10
874 { common_pid: pool [ 8923], size: 2048 } hitcount: 10
875 { common_pid: pool [ 8924], size: 1960 } hitcount: 10
876 { common_pid: pool [ 8924], size: 2048 } hitcount: 10
877 { common_pid: pool [ 8928], size: 1964 } hitcount: 4
878 { common_pid: pool [ 8928], size: 1965 } hitcount: 2
879 { common_pid: pool [ 8928], size: 2048 } hitcount: 6
880 { common_pid: pool [ 8929], size: 1982 } hitcount: 1
881 { common_pid: pool [ 8929], size: 2048 } hitcount: 1
882
883 Totals:
884 Hits: 2016
885 Entries: 224
886 Dropped: 0
887
888 The above example also illustrates the fact that although a compound
889 key is treated as a single entity for hashing purposes, the sub-keys
890 it's composed of can be accessed independently.
891
892 The next example uses a string field as the hash key and
893 demonstrates how you can manually pause and continue a hist trigger.
894 In this example, we'll aggregate fork counts and don't expect a
895 large number of entries in the hash table, so we'll drop it to a
896 much smaller number, say 256:
897
898 # echo 'hist:key=child_comm:val=hitcount:size=256' > \
899 /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
900
901 # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
902 # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [active]
903
904 { child_comm: dconf worker } hitcount: 1
905 { child_comm: ibus-daemon } hitcount: 1
906 { child_comm: whoopsie } hitcount: 1
907 { child_comm: smbd } hitcount: 1
908 { child_comm: gdbus } hitcount: 1
909 { child_comm: kthreadd } hitcount: 1
910 { child_comm: dconf worker } hitcount: 1
911 { child_comm: evolution-alarm } hitcount: 2
912 { child_comm: Socket Thread } hitcount: 2
913 { child_comm: postgres } hitcount: 2
914 { child_comm: bash } hitcount: 3
915 { child_comm: compiz } hitcount: 3
916 { child_comm: evolution-sourc } hitcount: 4
917 { child_comm: dhclient } hitcount: 4
918 { child_comm: pool } hitcount: 5
919 { child_comm: nm-dispatcher.a } hitcount: 8
920 { child_comm: firefox } hitcount: 8
921 { child_comm: dbus-daemon } hitcount: 8
922 { child_comm: glib-pacrunner } hitcount: 10
923 { child_comm: evolution } hitcount: 23
924
925 Totals:
926 Hits: 89
927 Entries: 20
928 Dropped: 0
929
930 If we want to pause the hist trigger, we can simply append :pause to
931 the command that started the trigger. Notice that the trigger info
932 displays as [paused]:
933
934 # echo 'hist:key=child_comm:val=hitcount:size=256:pause' >> \
935 /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
936
937 # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
938 # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [paused]
939
940 { child_comm: dconf worker } hitcount: 1
941 { child_comm: kthreadd } hitcount: 1
942 { child_comm: dconf worker } hitcount: 1
943 { child_comm: gdbus } hitcount: 1
944 { child_comm: ibus-daemon } hitcount: 1
945 { child_comm: Socket Thread } hitcount: 2
946 { child_comm: evolution-alarm } hitcount: 2
947 { child_comm: smbd } hitcount: 2
948 { child_comm: bash } hitcount: 3
949 { child_comm: whoopsie } hitcount: 3
950 { child_comm: compiz } hitcount: 3
951 { child_comm: evolution-sourc } hitcount: 4
952 { child_comm: pool } hitcount: 5
953 { child_comm: postgres } hitcount: 6
954 { child_comm: firefox } hitcount: 8
955 { child_comm: dhclient } hitcount: 10
956 { child_comm: emacs } hitcount: 12
957 { child_comm: dbus-daemon } hitcount: 20
958 { child_comm: nm-dispatcher.a } hitcount: 20
959 { child_comm: evolution } hitcount: 35
960 { child_comm: glib-pacrunner } hitcount: 59
961
962 Totals:
963 Hits: 199
964 Entries: 21
965 Dropped: 0
966
967 To manually continue having the trigger aggregate events, append
968 :cont instead. Notice that the trigger info displays as [active]
969 again, and the data has changed:
970
971 # echo 'hist:key=child_comm:val=hitcount:size=256:cont' >> \
972 /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
973
974 # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
975 # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [active]
976
977 { child_comm: dconf worker } hitcount: 1
978 { child_comm: dconf worker } hitcount: 1
979 { child_comm: kthreadd } hitcount: 1
980 { child_comm: gdbus } hitcount: 1
981 { child_comm: ibus-daemon } hitcount: 1
982 { child_comm: Socket Thread } hitcount: 2
983 { child_comm: evolution-alarm } hitcount: 2
984 { child_comm: smbd } hitcount: 2
985 { child_comm: whoopsie } hitcount: 3
986 { child_comm: compiz } hitcount: 3
987 { child_comm: evolution-sourc } hitcount: 4
988 { child_comm: bash } hitcount: 5
989 { child_comm: pool } hitcount: 5
990 { child_comm: postgres } hitcount: 6
991 { child_comm: firefox } hitcount: 8
992 { child_comm: dhclient } hitcount: 11
993 { child_comm: emacs } hitcount: 12
994 { child_comm: dbus-daemon } hitcount: 22
995 { child_comm: nm-dispatcher.a } hitcount: 22
996 { child_comm: evolution } hitcount: 35
997 { child_comm: glib-pacrunner } hitcount: 59
998
999 Totals:
1000 Hits: 206
1001 Entries: 21
1002 Dropped: 0
1003
1004 The previous example showed how to start and stop a hist trigger by
1005 appending 'pause' and 'continue' to the hist trigger command. A
1006 hist trigger can also be started in a paused state by initially
1007 starting the trigger with ':pause' appended. This allows you to
1008 start the trigger only when you're ready to start collecting data
1009 and not before. For example, you could start the trigger in a
1010 paused state, then unpause it and do something you want to measure,
1011 then pause the trigger again when done.
1012
1013 Of course, doing this manually can be difficult and error-prone, but
1014 it is possible to automatically start and stop a hist trigger based
1015 on some condition, via the enable_hist and disable_hist triggers.
1016
1017 For example, suppose we wanted to take a look at the relative
1018 weights in terms of skb length for each callpath that leads to a
1019 netif_receive_skb event when downloading a decent-sized file using
1020 wget.
1021
1022 First we set up an initially paused stacktrace trigger on the
1023 netif_receive_skb event:
1024
1025 # echo 'hist:key=stacktrace:vals=len:pause' > \
1026 /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
1027
1028 Next, we set up an 'enable_hist' trigger on the sched_process_exec
1029 event, with an 'if filename==/usr/bin/wget' filter. The effect of
1030 this new trigger is that it will 'unpause' the hist trigger we just
1031 set up on netif_receive_skb if and only if it sees a
1032 sched_process_exec event with a filename of '/usr/bin/wget'. When
1033 that happens, all netif_receive_skb events are aggregated into a
1034 hash table keyed on stacktrace:
1035
1036 # echo 'enable_hist:net:netif_receive_skb if filename==/usr/bin/wget' > \
1037 /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
1038
1039 The aggregation continues until the netif_receive_skb is paused
1040 again, which is what the following disable_hist event does by
1041 creating a similar setup on the sched_process_exit event, using the
1042 filter 'comm==wget':
1043
1044 # echo 'disable_hist:net:netif_receive_skb if comm==wget' > \
1045 /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
1046
1047 Whenever a process exits and the comm field of the disable_hist
1048 trigger filter matches 'comm==wget', the netif_receive_skb hist
1049 trigger is disabled.
1050
1051 The overall effect is that netif_receive_skb events are aggregated
1052 into the hash table for only the duration of the wget. Executing a
1053 wget command and then listing the 'hist' file will display the
1054 output generated by the wget command:
1055
1056 $ wget https://www.kernel.org/pub/linux/kernel/v3.x/patch-3.19.xz
1057
1058 # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist
1059 # trigger info: hist:keys=stacktrace:vals=len:sort=hitcount:size=2048 [paused]
1060
1061 { stacktrace:
1062 __netif_receive_skb_core+0x46d/0x990
1063 __netif_receive_skb+0x18/0x60
1064 netif_receive_skb_internal+0x23/0x90
1065 napi_gro_receive+0xc8/0x100
1066 ieee80211_deliver_skb+0xd6/0x270 [mac80211]
1067 ieee80211_rx_handlers+0xccf/0x22f0 [mac80211]
1068 ieee80211_prepare_and_rx_handle+0x4e7/0xc40 [mac80211]
1069 ieee80211_rx+0x31d/0x900 [mac80211]
1070 iwlagn_rx_reply_rx+0x3db/0x6f0 [iwldvm]
1071 iwl_rx_dispatch+0x8e/0xf0 [iwldvm]
1072 iwl_pcie_irq_handler+0xe3c/0x12f0 [iwlwifi]
1073 irq_thread_fn+0x20/0x50
1074 irq_thread+0x11f/0x150
1075 kthread+0xd2/0xf0
1076 ret_from_fork+0x42/0x70
1077 } hitcount: 85 len: 28884
1078 { stacktrace:
1079 __netif_receive_skb_core+0x46d/0x990
1080 __netif_receive_skb+0x18/0x60
1081 netif_receive_skb_internal+0x23/0x90
1082 napi_gro_complete+0xa4/0xe0
1083 dev_gro_receive+0x23a/0x360
1084 napi_gro_receive+0x30/0x100
1085 ieee80211_deliver_skb+0xd6/0x270 [mac80211]
1086 ieee80211_rx_handlers+0xccf/0x22f0 [mac80211]
1087 ieee80211_prepare_and_rx_handle+0x4e7/0xc40 [mac80211]
1088 ieee80211_rx+0x31d/0x900 [mac80211]
1089 iwlagn_rx_reply_rx+0x3db/0x6f0 [iwldvm]
1090 iwl_rx_dispatch+0x8e/0xf0 [iwldvm]
1091 iwl_pcie_irq_handler+0xe3c/0x12f0 [iwlwifi]
1092 irq_thread_fn+0x20/0x50
1093 irq_thread+0x11f/0x150
1094 kthread+0xd2/0xf0
1095 } hitcount: 98 len: 664329
1096 { stacktrace:
1097 __netif_receive_skb_core+0x46d/0x990
1098 __netif_receive_skb+0x18/0x60
1099 process_backlog+0xa8/0x150
1100 net_rx_action+0x15d/0x340
1101 __do_softirq+0x114/0x2c0
1102 do_softirq_own_stack+0x1c/0x30
1103 do_softirq+0x65/0x70
1104 __local_bh_enable_ip+0xb5/0xc0
1105 ip_finish_output+0x1f4/0x840
1106 ip_output+0x6b/0xc0
1107 ip_local_out_sk+0x31/0x40
1108 ip_send_skb+0x1a/0x50
1109 udp_send_skb+0x173/0x2a0
1110 udp_sendmsg+0x2bf/0x9f0
1111 inet_sendmsg+0x64/0xa0
1112 sock_sendmsg+0x3d/0x50
1113 } hitcount: 115 len: 13030
1114 { stacktrace:
1115 __netif_receive_skb_core+0x46d/0x990
1116 __netif_receive_skb+0x18/0x60
1117 netif_receive_skb_internal+0x23/0x90
1118 napi_gro_complete+0xa4/0xe0
1119 napi_gro_flush+0x6d/0x90
1120 iwl_pcie_irq_handler+0x92a/0x12f0 [iwlwifi]
1121 irq_thread_fn+0x20/0x50
1122 irq_thread+0x11f/0x150
1123 kthread+0xd2/0xf0
1124 ret_from_fork+0x42/0x70
1125 } hitcount: 934 len: 5512212
1126
1127 Totals:
1128 Hits: 1232
1129 Entries: 4
1130 Dropped: 0
1131
1132 The above shows all the netif_receive_skb callpaths and their total
1133 lengths for the duration of the wget command.
1134
1135 The 'clear' hist trigger param can be used to clear the hash table.
1136 Suppose we wanted to try another run of the previous example but
1137 this time also wanted to see the complete list of events that went
1138 into the histogram. In order to avoid having to set everything up
1139 again, we can just clear the histogram first:
1140
1141 # echo 'hist:key=stacktrace:vals=len:clear' >> \
1142 /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
1143
1144 Just to verify that it is in fact cleared, here's what we now see in
1145 the hist file:
1146
1147 # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist
1148 # trigger info: hist:keys=stacktrace:vals=len:sort=hitcount:size=2048 [paused]
1149
1150 Totals:
1151 Hits: 0
1152 Entries: 0
1153 Dropped: 0
1154
1155 Since we want to see the detailed list of every netif_receive_skb
1156 event occurring during the new run, which are in fact the same
1157 events being aggregated into the hash table, we add some additional
1158 'enable_event' events to the triggering sched_process_exec and
1159 sched_process_exit events as such:
1160
1161 # echo 'enable_event:net:netif_receive_skb if filename==/usr/bin/wget' > \
1162 /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
1163
1164 # echo 'disable_event:net:netif_receive_skb if comm==wget' > \
1165 /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
1166
1167 If you read the trigger files for the sched_process_exec and
1168 sched_process_exit triggers, you should see two triggers for each:
1169 one enabling/disabling the hist aggregation and the other
1170 enabling/disabling the logging of events:
1171
1172 # cat /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
1173 enable_event:net:netif_receive_skb:unlimited if filename==/usr/bin/wget
1174 enable_hist:net:netif_receive_skb:unlimited if filename==/usr/bin/wget
1175
1176 # cat /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger
1177 enable_event:net:netif_receive_skb:unlimited if comm==wget
1178 disable_hist:net:netif_receive_skb:unlimited if comm==wget
1179
1180 In other words, whenever either of the sched_process_exec or
1181 sched_process_exit events is hit and matches 'wget', it enables or
1182 disables both the histogram and the event log, and what you end up
1183 with is a hash table and set of events just covering the specified
1184 duration. Run the wget command again:
1185
1186 $ wget https://www.kernel.org/pub/linux/kernel/v3.x/patch-3.19.xz
1187
1188 Displaying the 'hist' file should show something similar to what you
1189 saw in the last run, but this time you should also see the
1190 individual events in the trace file:
1191
1192 # cat /sys/kernel/debug/tracing/trace
1193
1194 # tracer: nop
1195 #
1196 # entries-in-buffer/entries-written: 183/1426 #P:4
1197 #
1198 # _-----=> irqs-off
1199 # / _----=> need-resched
1200 # | / _---=> hardirq/softirq
1201 # || / _--=> preempt-depth
1202 # ||| / delay
1203 # TASK-PID CPU# |||| TIMESTAMP FUNCTION
1204 # | | | |||| | |
1205 wget-15108 [000] ..s1 31769.606929: netif_receive_skb: dev=lo skbaddr=ffff88009c353100 len=60
1206 wget-15108 [000] ..s1 31769.606999: netif_receive_skb: dev=lo skbaddr=ffff88009c353200 len=60
1207 dnsmasq-1382 [000] ..s1 31769.677652: netif_receive_skb: dev=lo skbaddr=ffff88009c352b00 len=130
1208 dnsmasq-1382 [000] ..s1 31769.685917: netif_receive_skb: dev=lo skbaddr=ffff88009c352200 len=138
1209 ##### CPU 2 buffer started ####
1210 irq/29-iwlwifi-559 [002] ..s. 31772.031529: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433d00 len=2948
1211 irq/29-iwlwifi-559 [002] ..s. 31772.031572: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d432200 len=1500
1212 irq/29-iwlwifi-559 [002] ..s. 31772.032196: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433100 len=2948
1213 irq/29-iwlwifi-559 [002] ..s. 31772.032761: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433000 len=2948
1214 irq/29-iwlwifi-559 [002] ..s. 31772.033220: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d432e00 len=1500
1215 .
1216 .
1217 .
1218
1219 The following example demonstrates how multiple hist triggers can be
1220 attached to a given event. This capability can be useful for
1221 creating a set of different summaries derived from the same set of
1222 events, or for comparing the effects of different filters, among
1223 other things.
1224
1225 # echo 'hist:keys=skbaddr.hex:vals=len if len < 0' >> \
1226 /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
1227 # echo 'hist:keys=skbaddr.hex:vals=len if len > 4096' >> \
1228 /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
1229 # echo 'hist:keys=skbaddr.hex:vals=len if len == 256' >> \
1230 /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
1231 # echo 'hist:keys=skbaddr.hex:vals=len' >> \
1232 /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
1233 # echo 'hist:keys=len:vals=common_preempt_count' >> \
1234 /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
1235
1236 The above set of commands creates four triggers differing only in
1237 their filters, along with a completely different though fairly
1238 nonsensical trigger. Note that in order to append multiple hist
1239 triggers to the same file, you should use the '>>' operator to
1240 append them ('>' will also add the new hist trigger, but will remove
1241 any existing hist triggers beforehand).
1242
1243 Displaying the contents of the 'hist' file for the event shows the
1244 contents of all five histograms:
1245
1246 # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist
1247
1248 # event histogram
1249 #
1250 # trigger info: hist:keys=len:vals=hitcount,common_preempt_count:sort=hitcount:size=2048 [active]
1251 #
1252
1253 { len: 176 } hitcount: 1 common_preempt_count: 0
1254 { len: 223 } hitcount: 1 common_preempt_count: 0
1255 { len: 4854 } hitcount: 1 common_preempt_count: 0
1256 { len: 395 } hitcount: 1 common_preempt_count: 0
1257 { len: 177 } hitcount: 1 common_preempt_count: 0
1258 { len: 446 } hitcount: 1 common_preempt_count: 0
1259 { len: 1601 } hitcount: 1 common_preempt_count: 0
1260 .
1261 .
1262 .
1263 { len: 1280 } hitcount: 66 common_preempt_count: 0
1264 { len: 116 } hitcount: 81 common_preempt_count: 40
1265 { len: 708 } hitcount: 112 common_preempt_count: 0
1266 { len: 46 } hitcount: 221 common_preempt_count: 0
1267 { len: 1264 } hitcount: 458 common_preempt_count: 0
1268
1269 Totals:
1270 Hits: 1428
1271 Entries: 147
1272 Dropped: 0
1273
1274
1275 # event histogram
1276 #
1277 # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active]
1278 #
1279
1280 { skbaddr: ffff8800baee5e00 } hitcount: 1 len: 130
1281 { skbaddr: ffff88005f3d5600 } hitcount: 1 len: 1280
1282 { skbaddr: ffff88005f3d4900 } hitcount: 1 len: 1280
1283 { skbaddr: ffff88009fed6300 } hitcount: 1 len: 115
1284 { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 115
1285 { skbaddr: ffff88008cdb1900 } hitcount: 1 len: 46
1286 { skbaddr: ffff880064b5ef00 } hitcount: 1 len: 118
1287 { skbaddr: ffff880044e3c700 } hitcount: 1 len: 60
1288 { skbaddr: ffff880100065900 } hitcount: 1 len: 46
1289 { skbaddr: ffff8800d46bd500 } hitcount: 1 len: 116
1290 { skbaddr: ffff88005f3d5f00 } hitcount: 1 len: 1280
1291 { skbaddr: ffff880100064700 } hitcount: 1 len: 365
1292 { skbaddr: ffff8800badb6f00 } hitcount: 1 len: 60
1293 .
1294 .
1295 .
1296 { skbaddr: ffff88009fe0be00 } hitcount: 27 len: 24677
1297 { skbaddr: ffff88009fe0a400 } hitcount: 27 len: 23052
1298 { skbaddr: ffff88009fe0b700 } hitcount: 31 len: 25589
1299 { skbaddr: ffff88009fe0b600 } hitcount: 32 len: 27326
1300 { skbaddr: ffff88006a462800 } hitcount: 68 len: 71678
1301 { skbaddr: ffff88006a463700 } hitcount: 70 len: 72678
1302 { skbaddr: ffff88006a462b00 } hitcount: 71 len: 77589
1303 { skbaddr: ffff88006a463600 } hitcount: 73 len: 71307
1304 { skbaddr: ffff88006a462200 } hitcount: 81 len: 81032
1305
1306 Totals:
1307 Hits: 1451
1308 Entries: 318
1309 Dropped: 0
1310
1311
1312 # event histogram
1313 #
1314 # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len == 256 [active]
1315 #
1316
1317
1318 Totals:
1319 Hits: 0
1320 Entries: 0
1321 Dropped: 0
1322
1323
1324 # event histogram
1325 #
1326 # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len > 4096 [active]
1327 #
1328
1329 { skbaddr: ffff88009fd2c300 } hitcount: 1 len: 7212
1330 { skbaddr: ffff8800d2bcce00 } hitcount: 1 len: 7212
1331 { skbaddr: ffff8800d2bcd700 } hitcount: 1 len: 7212
1332 { skbaddr: ffff8800d2bcda00 } hitcount: 1 len: 21492
1333 { skbaddr: ffff8800ae2e2d00 } hitcount: 1 len: 7212
1334 { skbaddr: ffff8800d2bcdb00 } hitcount: 1 len: 7212
1335 { skbaddr: ffff88006a4df500 } hitcount: 1 len: 4854
1336 { skbaddr: ffff88008ce47b00 } hitcount: 1 len: 18636
1337 { skbaddr: ffff8800ae2e2200 } hitcount: 1 len: 12924
1338 { skbaddr: ffff88005f3e1000 } hitcount: 1 len: 4356
1339 { skbaddr: ffff8800d2bcdc00 } hitcount: 2 len: 24420
1340 { skbaddr: ffff8800d2bcc200 } hitcount: 2 len: 12996
1341
1342 Totals:
1343 Hits: 14
1344 Entries: 12
1345 Dropped: 0
1346
1347
1348 # event histogram
1349 #
1350 # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len < 0 [active]
1351 #
1352
1353
1354 Totals:
1355 Hits: 0
1356 Entries: 0
1357 Dropped: 0
1358
1359 Named triggers can be used to have triggers share a common set of
1360 histogram data. This capability is mostly useful for combining the
1361 output of events generated by tracepoints contained inside inline
1362 functions, but names can be used in a hist trigger on any event.
1363 For example, these two triggers when hit will update the same 'len'
1364 field in the shared 'foo' histogram data:
1365
1366 # echo 'hist:name=foo:keys=skbaddr.hex:vals=len' > \
1367 /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger
1368 # echo 'hist:name=foo:keys=skbaddr.hex:vals=len' > \
1369 /sys/kernel/debug/tracing/events/net/netif_rx/trigger
1370
1371 You can see that they're updating common histogram data by reading
1372 each event's hist files at the same time:
1373
1374 # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist;
1375 cat /sys/kernel/debug/tracing/events/net/netif_rx/hist
1376
1377 # event histogram
1378 #
1379 # trigger info: hist:name=foo:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active]
1380 #
1381
1382 { skbaddr: ffff88000ad53500 } hitcount: 1 len: 46
1383 { skbaddr: ffff8800af5a1500 } hitcount: 1 len: 76
1384 { skbaddr: ffff8800d62a1900 } hitcount: 1 len: 46
1385 { skbaddr: ffff8800d2bccb00 } hitcount: 1 len: 468
1386 { skbaddr: ffff8800d3c69900 } hitcount: 1 len: 46
1387 { skbaddr: ffff88009ff09100 } hitcount: 1 len: 52
1388 { skbaddr: ffff88010f13ab00 } hitcount: 1 len: 168
1389 { skbaddr: ffff88006a54f400 } hitcount: 1 len: 46
1390 { skbaddr: ffff8800d2bcc500 } hitcount: 1 len: 260
1391 { skbaddr: ffff880064505000 } hitcount: 1 len: 46
1392 { skbaddr: ffff8800baf24e00 } hitcount: 1 len: 32
1393 { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 46
1394 { skbaddr: ffff8800d3edff00 } hitcount: 1 len: 44
1395 { skbaddr: ffff88009fe0b400 } hitcount: 1 len: 168
1396 { skbaddr: ffff8800a1c55a00 } hitcount: 1 len: 40
1397 { skbaddr: ffff8800d2bcd100 } hitcount: 1 len: 40
1398 { skbaddr: ffff880064505f00 } hitcount: 1 len: 174
1399 { skbaddr: ffff8800a8bff200 } hitcount: 1 len: 160
1400 { skbaddr: ffff880044e3cc00 } hitcount: 1 len: 76
1401 { skbaddr: ffff8800a8bfe700 } hitcount: 1 len: 46
1402 { skbaddr: ffff8800d2bcdc00 } hitcount: 1 len: 32
1403 { skbaddr: ffff8800a1f64800 } hitcount: 1 len: 46
1404 { skbaddr: ffff8800d2bcde00 } hitcount: 1 len: 988
1405 { skbaddr: ffff88006a5dea00 } hitcount: 1 len: 46
1406 { skbaddr: ffff88002e37a200 } hitcount: 1 len: 44
1407 { skbaddr: ffff8800a1f32c00 } hitcount: 2 len: 676
1408 { skbaddr: ffff88000ad52600 } hitcount: 2 len: 107
1409 { skbaddr: ffff8800a1f91e00 } hitcount: 2 len: 92
1410 { skbaddr: ffff8800af5a0200 } hitcount: 2 len: 142
1411 { skbaddr: ffff8800d2bcc600 } hitcount: 2 len: 220
1412 { skbaddr: ffff8800ba36f500 } hitcount: 2 len: 92
1413 { skbaddr: ffff8800d021f800 } hitcount: 2 len: 92
1414 { skbaddr: ffff8800a1f33600 } hitcount: 2 len: 675
1415 { skbaddr: ffff8800a8bfff00 } hitcount: 3 len: 138
1416 { skbaddr: ffff8800d62a1300 } hitcount: 3 len: 138
1417 { skbaddr: ffff88002e37a100 } hitcount: 4 len: 184
1418 { skbaddr: ffff880064504400 } hitcount: 4 len: 184
1419 { skbaddr: ffff8800a8bfec00 } hitcount: 4 len: 184
1420 { skbaddr: ffff88000ad53700 } hitcount: 5 len: 230
1421 { skbaddr: ffff8800d2bcdb00 } hitcount: 5 len: 196
1422 { skbaddr: ffff8800a1f90000 } hitcount: 6 len: 276
1423 { skbaddr: ffff88006a54f900 } hitcount: 6 len: 276
1424
1425 Totals:
1426 Hits: 81
1427 Entries: 42
1428 Dropped: 0
1429 # event histogram
1430 #
1431 # trigger info: hist:name=foo:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active]
1432 #
1433
1434 { skbaddr: ffff88000ad53500 } hitcount: 1 len: 46
1435 { skbaddr: ffff8800af5a1500 } hitcount: 1 len: 76
1436 { skbaddr: ffff8800d62a1900 } hitcount: 1 len: 46
1437 { skbaddr: ffff8800d2bccb00 } hitcount: 1 len: 468
1438 { skbaddr: ffff8800d3c69900 } hitcount: 1 len: 46
1439 { skbaddr: ffff88009ff09100 } hitcount: 1 len: 52
1440 { skbaddr: ffff88010f13ab00 } hitcount: 1 len: 168
1441 { skbaddr: ffff88006a54f400 } hitcount: 1 len: 46
1442 { skbaddr: ffff8800d2bcc500 } hitcount: 1 len: 260
1443 { skbaddr: ffff880064505000 } hitcount: 1 len: 46
1444 { skbaddr: ffff8800baf24e00 } hitcount: 1 len: 32
1445 { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 46
1446 { skbaddr: ffff8800d3edff00 } hitcount: 1 len: 44
1447 { skbaddr: ffff88009fe0b400 } hitcount: 1 len: 168
1448 { skbaddr: ffff8800a1c55a00 } hitcount: 1 len: 40
1449 { skbaddr: ffff8800d2bcd100 } hitcount: 1 len: 40
1450 { skbaddr: ffff880064505f00 } hitcount: 1 len: 174
1451 { skbaddr: ffff8800a8bff200 } hitcount: 1 len: 160
1452 { skbaddr: ffff880044e3cc00 } hitcount: 1 len: 76
1453 { skbaddr: ffff8800a8bfe700 } hitcount: 1 len: 46
1454 { skbaddr: ffff8800d2bcdc00 } hitcount: 1 len: 32
1455 { skbaddr: ffff8800a1f64800 } hitcount: 1 len: 46
1456 { skbaddr: ffff8800d2bcde00 } hitcount: 1 len: 988
1457 { skbaddr: ffff88006a5dea00 } hitcount: 1 len: 46
1458 { skbaddr: ffff88002e37a200 } hitcount: 1 len: 44
1459 { skbaddr: ffff8800a1f32c00 } hitcount: 2 len: 676
1460 { skbaddr: ffff88000ad52600 } hitcount: 2 len: 107
1461 { skbaddr: ffff8800a1f91e00 } hitcount: 2 len: 92
1462 { skbaddr: ffff8800af5a0200 } hitcount: 2 len: 142
1463 { skbaddr: ffff8800d2bcc600 } hitcount: 2 len: 220
1464 { skbaddr: ffff8800ba36f500 } hitcount: 2 len: 92
1465 { skbaddr: ffff8800d021f800 } hitcount: 2 len: 92
1466 { skbaddr: ffff8800a1f33600 } hitcount: 2 len: 675
1467 { skbaddr: ffff8800a8bfff00 } hitcount: 3 len: 138
1468 { skbaddr: ffff8800d62a1300 } hitcount: 3 len: 138
1469 { skbaddr: ffff88002e37a100 } hitcount: 4 len: 184
1470 { skbaddr: ffff880064504400 } hitcount: 4 len: 184
1471 { skbaddr: ffff8800a8bfec00 } hitcount: 4 len: 184
1472 { skbaddr: ffff88000ad53700 } hitcount: 5 len: 230
1473 { skbaddr: ffff8800d2bcdb00 } hitcount: 5 len: 196
1474 { skbaddr: ffff8800a1f90000 } hitcount: 6 len: 276
1475 { skbaddr: ffff88006a54f900 } hitcount: 6 len: 276
1476
1477 Totals:
1478 Hits: 81
1479 Entries: 42
1480 Dropped: 0
1481
1482 And here's an example that shows how to combine histogram data from
1483 any two events even if they don't share any 'compatible' fields
1484 other than 'hitcount' and 'stacktrace'. These commands create a
1485 couple of triggers named 'bar' using those fields:
1486
1487 # echo 'hist:name=bar:key=stacktrace:val=hitcount' > \
1488 /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger
1489 # echo 'hist:name=bar:key=stacktrace:val=hitcount' > \
1490 /sys/kernel/debug/tracing/events/net/netif_rx/trigger
1491
1492 And displaying the output of either shows some interesting if
1493 somewhat confusing output:
1494
1495 # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
1496 # cat /sys/kernel/debug/tracing/events/net/netif_rx/hist
1497
1498 # event histogram
1499 #
1500 # trigger info: hist:name=bar:keys=stacktrace:vals=hitcount:sort=hitcount:size=2048 [active]
1501 #
1502
1503 { stacktrace:
1504 _do_fork+0x18e/0x330
1505 kernel_thread+0x29/0x30
1506 kthreadd+0x154/0x1b0
1507 ret_from_fork+0x3f/0x70
1508 } hitcount: 1
1509 { stacktrace:
1510 netif_rx_internal+0xb2/0xd0
1511 netif_rx_ni+0x20/0x70
1512 dev_loopback_xmit+0xaa/0xd0
1513 ip_mc_output+0x126/0x240
1514 ip_local_out_sk+0x31/0x40
1515 igmp_send_report+0x1e9/0x230
1516 igmp_timer_expire+0xe9/0x120
1517 call_timer_fn+0x39/0xf0
1518 run_timer_softirq+0x1e1/0x290
1519 __do_softirq+0xfd/0x290
1520 irq_exit+0x98/0xb0
1521 smp_apic_timer_interrupt+0x4a/0x60
1522 apic_timer_interrupt+0x6d/0x80
1523 cpuidle_enter+0x17/0x20
1524 call_cpuidle+0x3b/0x60
1525 cpu_startup_entry+0x22d/0x310
1526 } hitcount: 1
1527 { stacktrace:
1528 netif_rx_internal+0xb2/0xd0
1529 netif_rx_ni+0x20/0x70
1530 dev_loopback_xmit+0xaa/0xd0
1531 ip_mc_output+0x17f/0x240
1532 ip_local_out_sk+0x31/0x40
1533 ip_send_skb+0x1a/0x50
1534 udp_send_skb+0x13e/0x270
1535 udp_sendmsg+0x2bf/0x980
1536 inet_sendmsg+0x67/0xa0
1537 sock_sendmsg+0x38/0x50
1538 SYSC_sendto+0xef/0x170
1539 SyS_sendto+0xe/0x10
1540 entry_SYSCALL_64_fastpath+0x12/0x6a
1541 } hitcount: 2
1542 { stacktrace:
1543 netif_rx_internal+0xb2/0xd0
1544 netif_rx+0x1c/0x60
1545 loopback_xmit+0x6c/0xb0
1546 dev_hard_start_xmit+0x219/0x3a0
1547 __dev_queue_xmit+0x415/0x4f0
1548 dev_queue_xmit_sk+0x13/0x20
1549 ip_finish_output2+0x237/0x340
1550 ip_finish_output+0x113/0x1d0
1551 ip_output+0x66/0xc0
1552 ip_local_out_sk+0x31/0x40
1553 ip_send_skb+0x1a/0x50
1554 udp_send_skb+0x16d/0x270
1555 udp_sendmsg+0x2bf/0x980
1556 inet_sendmsg+0x67/0xa0
1557 sock_sendmsg+0x38/0x50
1558 ___sys_sendmsg+0x14e/0x270
1559 } hitcount: 76
1560 { stacktrace:
1561 netif_rx_internal+0xb2/0xd0
1562 netif_rx+0x1c/0x60
1563 loopback_xmit+0x6c/0xb0
1564 dev_hard_start_xmit+0x219/0x3a0
1565 __dev_queue_xmit+0x415/0x4f0
1566 dev_queue_xmit_sk+0x13/0x20
1567 ip_finish_output2+0x237/0x340
1568 ip_finish_output+0x113/0x1d0
1569 ip_output+0x66/0xc0
1570 ip_local_out_sk+0x31/0x40
1571 ip_send_skb+0x1a/0x50
1572 udp_send_skb+0x16d/0x270
1573 udp_sendmsg+0x2bf/0x980
1574 inet_sendmsg+0x67/0xa0
1575 sock_sendmsg+0x38/0x50
1576 ___sys_sendmsg+0x269/0x270
1577 } hitcount: 77
1578 { stacktrace:
1579 netif_rx_internal+0xb2/0xd0
1580 netif_rx+0x1c/0x60
1581 loopback_xmit+0x6c/0xb0
1582 dev_hard_start_xmit+0x219/0x3a0
1583 __dev_queue_xmit+0x415/0x4f0
1584 dev_queue_xmit_sk+0x13/0x20
1585 ip_finish_output2+0x237/0x340
1586 ip_finish_output+0x113/0x1d0
1587 ip_output+0x66/0xc0
1588 ip_local_out_sk+0x31/0x40
1589 ip_send_skb+0x1a/0x50
1590 udp_send_skb+0x16d/0x270
1591 udp_sendmsg+0x2bf/0x980
1592 inet_sendmsg+0x67/0xa0
1593 sock_sendmsg+0x38/0x50
1594 SYSC_sendto+0xef/0x170
1595 } hitcount: 88
1596 { stacktrace:
1597 _do_fork+0x18e/0x330
1598 SyS_clone+0x19/0x20
1599 entry_SYSCALL_64_fastpath+0x12/0x6a
1600 } hitcount: 244
1601
1602 Totals:
1603 Hits: 489
1604 Entries: 7
1605 Dropped: 0
1606
1607
16082.2 Inter-event hist triggers
1609-----------------------------
1610
1611Inter-event hist triggers are hist triggers that combine values from
1612one or more other events and create a histogram using that data. Data
1613from an inter-event histogram can in turn become the source for
1614further combined histograms, thus providing a chain of related
1615histograms, which is important for some applications.
1616
1617The most important example of an inter-event quantity that can be used
1618in this manner is latency, which is simply a difference in timestamps
1619between two events. Although latency is the most important
1620inter-event quantity, note that because the support is completely
1621general across the trace event subsystem, any event field can be used
1622in an inter-event quantity.
1623
1624An example of a histogram that combines data from other histograms
1625into a useful chain would be a 'wakeupswitch latency' histogram that
1626combines a 'wakeup latency' histogram and a 'switch latency'
1627histogram.
1628
1629Normally, a hist trigger specification consists of a (possibly
1630compound) key along with one or more numeric values, which are
1631continually updated sums associated with that key. A histogram
1632specification in this case consists of individual key and value
1633specifications that refer to trace event fields associated with a
1634single event type.
1635
1636The inter-event hist trigger extension allows fields from multiple
1637events to be referenced and combined into a multi-event histogram
1638specification. In support of this overall goal, a few enabling
1639features have been added to the hist trigger support:
1640
1641 - In order to compute an inter-event quantity, a value from one
1642 event needs to be saved and then referenced from another event. This
1643 requires the introduction of support for histogram 'variables'.
1644
1645 - The computation of inter-event quantities and their combination
1646 require some minimal amount of support for applying simple
1647 expressions to variables (+ and -).
1648
1649 - A histogram consisting of inter-event quantities isn't logically a
1650 histogram on either event (so having the 'hist' file for either
1651 event host the histogram output doesn't really make sense). To
1652 address the idea that the histogram is associated with a
1653 combination of events, support is added allowing the creation of
1654 'synthetic' events that are events derived from other events.
1655 These synthetic events are full-fledged events just like any other
1656 and can be used as such, as for instance to create the
1657 'combination' histograms mentioned previously.
1658
1659 - A set of 'actions' can be associated with histogram entries -
1660 these can be used to generate the previously mentioned synthetic
1661 events, but can also be used for other purposes, such as for
1662 example saving context when a 'max' latency has been hit.
1663
1664 - Trace events don't have a 'timestamp' associated with them, but
1665 there is an implicit timestamp saved along with an event in the
1666 underlying ftrace ring buffer. This timestamp is now exposed as a
1667 synthetic field named 'common_timestamp' which can be used in
1668 histograms as if it were any other event field; it isn't an actual
1669 field in the trace format but rather is a synthesized value that
1670 nonetheless can be used as if it were an actual field. By default
1671 it is in units of nanoseconds; appending '.usecs' to a
1672 common_timestamp field changes the units to microseconds.
1673
1674A note on inter-event timestamps: If common_timestamp is used in a
1675histogram, the trace buffer is automatically switched over to using
1676absolute timestamps and the "global" trace clock, in order to avoid
1677bogus timestamp differences with other clocks that aren't coherent
1678across CPUs. This can be overridden by specifying one of the other
1679trace clocks instead, using the "clock=XXX" hist trigger attribute,
1680where XXX is any of the clocks listed in the tracing/trace_clock
1681pseudo-file.
1682
1683These features are described in more detail in the following sections.
1684
16852.2.1 Histogram Variables
1686-------------------------
1687
1688Variables are simply named locations used for saving and retrieving
1689values between matching events. A 'matching' event is defined as an
1690event that has a matching key - if a variable is saved for a histogram
1691entry corresponding to that key, any subsequent event with a matching
1692key can access that variable.
1693
1694A variable's value is normally available to any subsequent event until
1695it is set to something else by a subsequent event. The one exception
1696to that rule is that any variable used in an expression is essentially
1697'read-once' - once it's used by an expression in a subsequent event,
1698it's reset to its 'unset' state, which means it can't be used again
1699unless it's set again. This ensures not only that an event doesn't
1700use an uninitialized variable in a calculation, but that that variable
1701is used only once and not for any unrelated subsequent match.
1702
1703The basic syntax for saving a variable is to simply prefix a unique
1704variable name not corresponding to any keyword along with an '=' sign
1705to any event field.
1706
1707Either keys or values can be saved and retrieved in this way. This
1708creates a variable named 'ts0' for a histogram entry with the key
1709'next_pid':
1710
1711 # echo 'hist:keys=next_pid:vals=$ts0:ts0=common_timestamp ...' >> \
1712 event/trigger
1713
1714The ts0 variable can be accessed by any subsequent event having the
1715same pid as 'next_pid'.
1716
1717Variable references are formed by prepending the variable name with
1718the '$' sign. Thus for example, the ts0 variable above would be
1719referenced as '$ts0' in expressions.
1720
1721Because 'vals=' is used, the common_timestamp variable value above
1722will also be summed as a normal histogram value would (though for a
1723timestamp it makes little sense).
1724
1725The below shows that a key value can also be saved in the same way:
1726
1727 # echo 'hist:timer_pid=common_pid:key=timer_pid ...' >> event/trigger
1728
1729If a variable isn't a key variable or prefixed with 'vals=', the
1730associated event field will be saved in a variable but won't be summed
1731as a value:
1732
1733 # echo 'hist:keys=next_pid:ts1=common_timestamp ...' >> event/trigger
1734
1735Multiple variables can be assigned at the same time. The below would
1736result in both ts0 and b being created as variables, with both
1737common_timestamp and field1 additionally being summed as values:
1738
1739 # echo 'hist:keys=pid:vals=$ts0,$b:ts0=common_timestamp,b=field1 ...' >> \
1740 event/trigger
1741
1742Note that variable assignments can appear either preceding or
1743following their use. The command below behaves identically to the
1744command above:
1745
1746 # echo 'hist:keys=pid:ts0=common_timestamp,b=field1:vals=$ts0,$b ...' >> \
1747 event/trigger
1748
1749Any number of variables not bound to a 'vals=' prefix can also be
1750assigned by simply separating them with colons. Below is the same
1751thing but without the values being summed in the histogram:
1752
1753 # echo 'hist:keys=pid:ts0=common_timestamp:b=field1 ...' >> event/trigger
1754
1755Variables set as above can be referenced and used in expressions on
1756another event.
1757
1758For example, here's how a latency can be calculated:
1759
1760 # echo 'hist:keys=pid,prio:ts0=common_timestamp ...' >> event1/trigger
1761 # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp-$ts0 ...' >> event2/trigger
1762
1763In the first line above, the event's timestamp is saved into the
1764variable ts0. In the next line, ts0 is subtracted from the second
1765event's timestamp to produce the latency, which is then assigned into
1766yet another variable, 'wakeup_lat'. The hist trigger below in turn
1767makes use of the wakeup_lat variable to compute a combined latency
1768using the same key and variable from yet another event:
1769
1770 # echo 'hist:key=pid:wakeupswitch_lat=$wakeup_lat+$switchtime_lat ... >> event3/trigger
1771
17722.2.2 Synthetic Events
1773----------------------
1774
1775Synthetic events are user-defined events generated from hist trigger
1776variables or fields associated with one or more other events. Their
1777purpose is to provide a mechanism for displaying data spanning
1778multiple events consistent with the existing and already familiar
1779usage for normal events.
1780
1781To define a synthetic event, the user writes a simple specification
1782consisting of the name of the new event along with one or more
1783variables and their types, which can be any valid field type,
1784separated by semicolons, to the tracing/synthetic_events file.
1785
1786For instance, the following creates a new event named 'wakeup_latency'
1787with 3 fields: lat, pid, and prio. Each of those fields is simply a
1788variable reference to a variable on another event:
1789
1790 # echo 'wakeup_latency \
1791 u64 lat; \
1792 pid_t pid; \
1793 int prio' >> \
1794 /sys/kernel/debug/tracing/synthetic_events
1795
1796Reading the tracing/synthetic_events file lists all the currently
1797defined synthetic events, in this case the event defined above:
1798
1799 # cat /sys/kernel/debug/tracing/synthetic_events
1800 wakeup_latency u64 lat; pid_t pid; int prio
1801
1802An existing synthetic event definition can be removed by prepending
1803the command that defined it with a '!':
1804
1805 # echo '!wakeup_latency u64 lat; pid_t pid; int prio' >> \
1806 /sys/kernel/debug/tracing/synthetic_events
1807
1808At this point, there isn't yet an actual 'wakeup_latency' event
1809instantiated in the event subsystem - for this to happen, a 'hist
1810trigger action' needs to be instantiated and bound to actual fields
1811and variables defined on other events (see Section 2.2.3 below).
1812
1813Once that is done, an event instance is created, and a histogram can
1814be defined using it:
1815
1816 # echo 'hist:keys=pid,prio,lat.log2:sort=pid,lat' >> \
1817 /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/trigger
1818
1819The new event is created under the tracing/events/synthetic/ directory
1820and looks and behaves just like any other event:
1821
1822 # ls /sys/kernel/debug/tracing/events/synthetic/wakeup_latency
1823 enable filter format hist id trigger
1824
1825Like any other event, once a histogram is enabled for the event, the
1826output can be displayed by reading the event's 'hist' file.
1827
18282.2.3 Hist trigger 'actions'
1829----------------------------
1830
1831A hist trigger 'action' is a function that's executed whenever a
1832histogram entry is added or updated.
1833
1834The default 'action' if no special function is explicitly specified is
1835as it always has been, to simply update the set of values associated
1836with an entry. Some applications, however, may want to perform
1837additional actions at that point, such as generate another event, or
1838compare and save a maximum.
1839
1840The following additional actions are available. To specify an action
1841for a given event, simply specify the action between colons in the
1842hist trigger specification.
1843
1844 - onmatch(matching.event).<synthetic_event_name>(param list)
1845
1846 The 'onmatch(matching.event).<synthetic_event_name>(params)' hist
1847 trigger action is invoked whenever an event matches and the
1848 histogram entry would be added or updated. It causes the named
1849 synthetic event to be generated with the values given in the
1850 'param list'. The result is the generation of a synthetic event
1851 that consists of the values contained in those variables at the
1852 time the invoking event was hit.
1853
1854 The 'param list' consists of one or more parameters which may be
1855 either variables or fields defined on either the 'matching.event'
1856 or the target event. The variables or fields specified in the
1857 param list may be either fully-qualified or unqualified. If a
1858 variable is specified as unqualified, it must be unique between
1859 the two events. A field name used as a param can be unqualified
1860 if it refers to the target event, but must be fully qualified if
1861 it refers to the matching event. A fully-qualified name is of the
1862 form 'system.event_name.$var_name' or 'system.event_name.field'.
1863
1864 The 'matching.event' specification is simply the fully qualified
1865 event name of the event that matches the target event for the
1866 onmatch() functionality, in the form 'system.event_name'.
1867
1868 Finally, the number and type of variables/fields in the 'param
1869 list' must match the number and types of the fields in the
1870 synthetic event being generated.
1871
1872 As an example the below defines a simple synthetic event and uses
1873 a variable defined on the sched_wakeup_new event as a parameter
1874 when invoking the synthetic event. Here we define the synthetic
1875 event:
1876
1877 # echo 'wakeup_new_test pid_t pid' >> \
1878 /sys/kernel/debug/tracing/synthetic_events
1879
1880 # cat /sys/kernel/debug/tracing/synthetic_events
1881 wakeup_new_test pid_t pid
1882
1883 The following hist trigger both defines the missing testpid
1884 variable and specifies an onmatch() action that generates a
1885 wakeup_new_test synthetic event whenever a sched_wakeup_new event
1886 occurs, which because of the 'if comm == "cyclictest"' filter only
1887 happens when the executable is cyclictest:
1888
1889 # echo 'hist:keys=$testpid:testpid=pid:onmatch(sched.sched_wakeup_new).\
1890 wakeup_new_test($testpid) if comm=="cyclictest"' >> \
1891 /sys/kernel/debug/tracing/events/sched/sched_wakeup_new/trigger
1892
1893 Creating and displaying a histogram based on those events is now
1894 just a matter of using the fields and new synthetic event in the
1895 tracing/events/synthetic directory, as usual:
1896
1897 # echo 'hist:keys=pid:sort=pid' >> \
1898 /sys/kernel/debug/tracing/events/synthetic/wakeup_new_test/trigger
1899
1900 Running 'cyclictest' should cause wakeup_new events to generate
1901 wakeup_new_test synthetic events which should result in histogram
1902 output in the wakeup_new_test event's hist file:
1903
1904 # cat /sys/kernel/debug/tracing/events/synthetic/wakeup_new_test/hist
1905
1906 A more typical usage would be to use two events to calculate a
1907 latency. The following example uses a set of hist triggers to
1908 produce a 'wakeup_latency' histogram:
1909
1910 First, we define a 'wakeup_latency' synthetic event:
1911
1912 # echo 'wakeup_latency u64 lat; pid_t pid; int prio' >> \
1913 /sys/kernel/debug/tracing/synthetic_events
1914
1915 Next, we specify that whenever we see a sched_waking event for a
1916 cyclictest thread, save the timestamp in a 'ts0' variable:
1917
1918 # echo 'hist:keys=$saved_pid:saved_pid=pid:ts0=common_timestamp.usecs \
1919 if comm=="cyclictest"' >> \
1920 /sys/kernel/debug/tracing/events/sched/sched_waking/trigger
1921
1922 Then, when the corresponding thread is actually scheduled onto the
1923 CPU by a sched_switch event, calculate the latency and use that
1924 along with another variable and an event field to generate a
1925 wakeup_latency synthetic event:
1926
1927 # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:\
1928 onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,\
1929 $saved_pid,next_prio) if next_comm=="cyclictest"' >> \
1930 /sys/kernel/debug/tracing/events/sched/sched_switch/trigger
1931
1932 We also need to create a histogram on the wakeup_latency synthetic
1933 event in order to aggregate the generated synthetic event data:
1934
1935 # echo 'hist:keys=pid,prio,lat:sort=pid,lat' >> \
1936 /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/trigger
1937
1938 Finally, once we've run cyclictest to actually generate some
1939 events, we can see the output by looking at the wakeup_latency
1940 synthetic event's hist file:
1941
1942 # cat /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/hist
1943
1944 - onmax(var).save(field,...)
1945
1946 The 'onmax(var).save(field,...)' hist trigger action is invoked
1947 whenever the value of 'var' associated with a histogram entry
1948 exceeds the current maximum contained in that variable.
1949
1950 The end result is that the trace event fields specified as the
1951 onmax.save() params will be saved if 'var' exceeds the current
1952 maximum for that hist trigger entry. This allows context from the
1953 event that exhibited the new maximum to be saved for later
1954 reference. When the histogram is displayed, additional fields
1955 displaying the saved values will be printed.
1956
1957 As an example the below defines a couple of hist triggers, one for
1958 sched_waking and another for sched_switch, keyed on pid. Whenever
1959 a sched_waking occurs, the timestamp is saved in the entry
1960 corresponding to the current pid, and when the scheduler switches
1961 back to that pid, the timestamp difference is calculated. If the
1962 resulting latency, stored in wakeup_lat, exceeds the current
1963 maximum latency, the values specified in the save() fields are
1964 recorded:
1965
1966 # echo 'hist:keys=pid:ts0=common_timestamp.usecs \
1967 if comm=="cyclictest"' >> \
1968 /sys/kernel/debug/tracing/events/sched/sched_waking/trigger
1969
1970 # echo 'hist:keys=next_pid:\
1971 wakeup_lat=common_timestamp.usecs-$ts0:\
1972 onmax($wakeup_lat).save(next_comm,prev_pid,prev_prio,prev_comm) \
1973 if next_comm=="cyclictest"' >> \
1974 /sys/kernel/debug/tracing/events/sched/sched_switch/trigger
1975
1976 When the histogram is displayed, the max value and the saved
1977 values corresponding to the max are displayed following the rest
1978 of the fields:
1979
1980 # cat /sys/kernel/debug/tracing/events/sched/sched_switch/hist
1981 { next_pid: 2255 } hitcount: 239
1982 common_timestamp-ts0: 0
1983 max: 27
1984 next_comm: cyclictest
1985 prev_pid: 0 prev_prio: 120 prev_comm: swapper/1
1986
1987 { next_pid: 2256 } hitcount: 2355
1988 common_timestamp-ts0: 0
1989 max: 49 next_comm: cyclictest
1990 prev_pid: 0 prev_prio: 120 prev_comm: swapper/0
1991
1992 Totals:
1993 Hits: 12970
1994 Entries: 2
1995 Dropped: 0
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 7d9eb39fa76a..a0233edc0718 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -34,10 +34,12 @@ struct ring_buffer_event {
34 * array[0] = time delta (28 .. 59) 34 * array[0] = time delta (28 .. 59)
35 * size = 8 bytes 35 * size = 8 bytes
36 * 36 *
37 * @RINGBUF_TYPE_TIME_STAMP: Sync time stamp with external clock 37 * @RINGBUF_TYPE_TIME_STAMP: Absolute timestamp
38 * array[0] = tv_nsec 38 * Same format as TIME_EXTEND except that the
39 * array[1..2] = tv_sec 39 * value is an absolute timestamp, not a delta
40 * size = 16 bytes 40 * event.time_delta contains bottom 27 bits
41 * array[0] = top (28 .. 59) bits
42 * size = 8 bytes
41 * 43 *
42 * <= @RINGBUF_TYPE_DATA_TYPE_LEN_MAX: 44 * <= @RINGBUF_TYPE_DATA_TYPE_LEN_MAX:
43 * Data record 45 * Data record
@@ -54,12 +56,12 @@ enum ring_buffer_type {
54 RINGBUF_TYPE_DATA_TYPE_LEN_MAX = 28, 56 RINGBUF_TYPE_DATA_TYPE_LEN_MAX = 28,
55 RINGBUF_TYPE_PADDING, 57 RINGBUF_TYPE_PADDING,
56 RINGBUF_TYPE_TIME_EXTEND, 58 RINGBUF_TYPE_TIME_EXTEND,
57 /* FIXME: RINGBUF_TYPE_TIME_STAMP not implemented */
58 RINGBUF_TYPE_TIME_STAMP, 59 RINGBUF_TYPE_TIME_STAMP,
59}; 60};
60 61
61unsigned ring_buffer_event_length(struct ring_buffer_event *event); 62unsigned ring_buffer_event_length(struct ring_buffer_event *event);
62void *ring_buffer_event_data(struct ring_buffer_event *event); 63void *ring_buffer_event_data(struct ring_buffer_event *event);
64u64 ring_buffer_event_time_stamp(struct ring_buffer_event *event);
63 65
64/* 66/*
65 * ring_buffer_discard_commit will remove an event that has not 67 * ring_buffer_discard_commit will remove an event that has not
@@ -115,6 +117,9 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
115int ring_buffer_write(struct ring_buffer *buffer, 117int ring_buffer_write(struct ring_buffer *buffer,
116 unsigned long length, void *data); 118 unsigned long length, void *data);
117 119
120void ring_buffer_nest_start(struct ring_buffer *buffer);
121void ring_buffer_nest_end(struct ring_buffer *buffer);
122
118struct ring_buffer_event * 123struct ring_buffer_event *
119ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts, 124ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
120 unsigned long *lost_events); 125 unsigned long *lost_events);
@@ -178,6 +183,8 @@ void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer,
178 int cpu, u64 *ts); 183 int cpu, u64 *ts);
179void ring_buffer_set_clock(struct ring_buffer *buffer, 184void ring_buffer_set_clock(struct ring_buffer *buffer,
180 u64 (*clock)(void)); 185 u64 (*clock)(void));
186void ring_buffer_set_time_stamp_abs(struct ring_buffer *buffer, bool abs);
187bool ring_buffer_time_stamp_abs(struct ring_buffer *buffer);
181 188
182size_t ring_buffer_page_len(void *page); 189size_t ring_buffer_page_len(void *page);
183 190
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index e0e98000b665..2bde3eff564c 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -430,11 +430,13 @@ enum event_trigger_type {
430 430
431extern int filter_match_preds(struct event_filter *filter, void *rec); 431extern int filter_match_preds(struct event_filter *filter, void *rec);
432 432
433extern enum event_trigger_type event_triggers_call(struct trace_event_file *file, 433extern enum event_trigger_type
434 void *rec); 434event_triggers_call(struct trace_event_file *file, void *rec,
435extern void event_triggers_post_call(struct trace_event_file *file, 435 struct ring_buffer_event *event);
436 enum event_trigger_type tt, 436extern void
437 void *rec); 437event_triggers_post_call(struct trace_event_file *file,
438 enum event_trigger_type tt,
439 void *rec, struct ring_buffer_event *event);
438 440
439bool trace_event_ignore_this_pid(struct trace_event_file *trace_file); 441bool trace_event_ignore_this_pid(struct trace_event_file *trace_file);
440 442
@@ -454,7 +456,7 @@ trace_trigger_soft_disabled(struct trace_event_file *file)
454 456
455 if (!(eflags & EVENT_FILE_FL_TRIGGER_COND)) { 457 if (!(eflags & EVENT_FILE_FL_TRIGGER_COND)) {
456 if (eflags & EVENT_FILE_FL_TRIGGER_MODE) 458 if (eflags & EVENT_FILE_FL_TRIGGER_MODE)
457 event_triggers_call(file, NULL); 459 event_triggers_call(file, NULL, NULL);
458 if (eflags & EVENT_FILE_FL_SOFT_DISABLED) 460 if (eflags & EVENT_FILE_FL_SOFT_DISABLED)
459 return true; 461 return true;
460 if (eflags & EVENT_FILE_FL_PID_FILTER) 462 if (eflags & EVENT_FILE_FL_PID_FILTER)
diff --git a/include/trace/events/initcall.h b/include/trace/events/initcall.h
new file mode 100644
index 000000000000..8d6cf10d27c9
--- /dev/null
+++ b/include/trace/events/initcall.h
@@ -0,0 +1,66 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2#undef TRACE_SYSTEM
3#define TRACE_SYSTEM initcall
4
5#if !defined(_TRACE_INITCALL_H) || defined(TRACE_HEADER_MULTI_READ)
6#define _TRACE_INITCALL_H
7
8#include <linux/tracepoint.h>
9
10TRACE_EVENT(initcall_level,
11
12 TP_PROTO(const char *level),
13
14 TP_ARGS(level),
15
16 TP_STRUCT__entry(
17 __string(level, level)
18 ),
19
20 TP_fast_assign(
21 __assign_str(level, level);
22 ),
23
24 TP_printk("level=%s", __get_str(level))
25);
26
27TRACE_EVENT(initcall_start,
28
29 TP_PROTO(initcall_t func),
30
31 TP_ARGS(func),
32
33 TP_STRUCT__entry(
34 __field(initcall_t, func)
35 ),
36
37 TP_fast_assign(
38 __entry->func = func;
39 ),
40
41 TP_printk("func=%pS", __entry->func)
42);
43
44TRACE_EVENT(initcall_finish,
45
46 TP_PROTO(initcall_t func, int ret),
47
48 TP_ARGS(func, ret),
49
50 TP_STRUCT__entry(
51 __field(initcall_t, func)
52 __field(int, ret)
53 ),
54
55 TP_fast_assign(
56 __entry->func = func;
57 __entry->ret = ret;
58 ),
59
60 TP_printk("func=%pS ret=%d", __entry->func, __entry->ret)
61);
62
63#endif /* if !defined(_TRACE_GPIO_H) || defined(TRACE_HEADER_MULTI_READ) */
64
65/* This part must be outside protection */
66#include <trace/define_trace.h>
diff --git a/init/main.c b/init/main.c
index e4a3160991ea..d499f4a80e0b 100644
--- a/init/main.c
+++ b/init/main.c
@@ -97,6 +97,9 @@
97#include <asm/sections.h> 97#include <asm/sections.h>
98#include <asm/cacheflush.h> 98#include <asm/cacheflush.h>
99 99
100#define CREATE_TRACE_POINTS
101#include <trace/events/initcall.h>
102
100static int kernel_init(void *); 103static int kernel_init(void *);
101 104
102extern void init_IRQ(void); 105extern void init_IRQ(void);
@@ -491,6 +494,17 @@ void __init __weak thread_stack_cache_init(void)
491 494
492void __init __weak mem_encrypt_init(void) { } 495void __init __weak mem_encrypt_init(void) { }
493 496
497bool initcall_debug;
498core_param(initcall_debug, initcall_debug, bool, 0644);
499
500#ifdef TRACEPOINTS_ENABLED
501static void __init initcall_debug_enable(void);
502#else
503static inline void initcall_debug_enable(void)
504{
505}
506#endif
507
494/* 508/*
495 * Set up kernel memory allocators 509 * Set up kernel memory allocators
496 */ 510 */
@@ -612,6 +626,9 @@ asmlinkage __visible void __init start_kernel(void)
612 /* Trace events are available after this */ 626 /* Trace events are available after this */
613 trace_init(); 627 trace_init();
614 628
629 if (initcall_debug)
630 initcall_debug_enable();
631
615 context_tracking_init(); 632 context_tracking_init();
616 /* init some links before init_ISA_irqs() */ 633 /* init some links before init_ISA_irqs() */
617 early_irq_init(); 634 early_irq_init();
@@ -728,9 +745,6 @@ static void __init do_ctors(void)
728#endif 745#endif
729} 746}
730 747
731bool initcall_debug;
732core_param(initcall_debug, initcall_debug, bool, 0644);
733
734#ifdef CONFIG_KALLSYMS 748#ifdef CONFIG_KALLSYMS
735struct blacklist_entry { 749struct blacklist_entry {
736 struct list_head next; 750 struct list_head next;
@@ -800,37 +814,71 @@ static bool __init_or_module initcall_blacklisted(initcall_t fn)
800#endif 814#endif
801__setup("initcall_blacklist=", initcall_blacklist); 815__setup("initcall_blacklist=", initcall_blacklist);
802 816
803static int __init_or_module do_one_initcall_debug(initcall_t fn) 817static __init_or_module void
818trace_initcall_start_cb(void *data, initcall_t fn)
804{ 819{
805 ktime_t calltime, delta, rettime; 820 ktime_t *calltime = (ktime_t *)data;
806 unsigned long long duration;
807 int ret;
808 821
809 printk(KERN_DEBUG "calling %pF @ %i\n", fn, task_pid_nr(current)); 822 printk(KERN_DEBUG "calling %pF @ %i\n", fn, task_pid_nr(current));
810 calltime = ktime_get(); 823 *calltime = ktime_get();
811 ret = fn(); 824}
825
826static __init_or_module void
827trace_initcall_finish_cb(void *data, initcall_t fn, int ret)
828{
829 ktime_t *calltime = (ktime_t *)data;
830 ktime_t delta, rettime;
831 unsigned long long duration;
832
812 rettime = ktime_get(); 833 rettime = ktime_get();
813 delta = ktime_sub(rettime, calltime); 834 delta = ktime_sub(rettime, *calltime);
814 duration = (unsigned long long) ktime_to_ns(delta) >> 10; 835 duration = (unsigned long long) ktime_to_ns(delta) >> 10;
815 printk(KERN_DEBUG "initcall %pF returned %d after %lld usecs\n", 836 printk(KERN_DEBUG "initcall %pF returned %d after %lld usecs\n",
816 fn, ret, duration); 837 fn, ret, duration);
838}
817 839
818 return ret; 840static ktime_t initcall_calltime;
841
842#ifdef TRACEPOINTS_ENABLED
843static void __init initcall_debug_enable(void)
844{
845 int ret;
846
847 ret = register_trace_initcall_start(trace_initcall_start_cb,
848 &initcall_calltime);
849 ret |= register_trace_initcall_finish(trace_initcall_finish_cb,
850 &initcall_calltime);
851 WARN(ret, "Failed to register initcall tracepoints\n");
819} 852}
853# define do_trace_initcall_start trace_initcall_start
854# define do_trace_initcall_finish trace_initcall_finish
855#else
856static inline void do_trace_initcall_start(initcall_t fn)
857{
858 if (!initcall_debug)
859 return;
860 trace_initcall_start_cb(&initcall_calltime, fn);
861}
862static inline void do_trace_initcall_finish(initcall_t fn, int ret)
863{
864 if (!initcall_debug)
865 return;
866 trace_initcall_finish_cb(&initcall_calltime, fn, ret);
867}
868#endif /* !TRACEPOINTS_ENABLED */
820 869
821int __init_or_module do_one_initcall(initcall_t fn) 870int __init_or_module do_one_initcall(initcall_t fn)
822{ 871{
823 int count = preempt_count(); 872 int count = preempt_count();
824 int ret;
825 char msgbuf[64]; 873 char msgbuf[64];
874 int ret;
826 875
827 if (initcall_blacklisted(fn)) 876 if (initcall_blacklisted(fn))
828 return -EPERM; 877 return -EPERM;
829 878
830 if (initcall_debug) 879 do_trace_initcall_start(fn);
831 ret = do_one_initcall_debug(fn); 880 ret = fn();
832 else 881 do_trace_initcall_finish(fn, ret);
833 ret = fn();
834 882
835 msgbuf[0] = 0; 883 msgbuf[0] = 0;
836 884
@@ -874,7 +922,7 @@ static initcall_t *initcall_levels[] __initdata = {
874 922
875/* Keep these in sync with initcalls in include/linux/init.h */ 923/* Keep these in sync with initcalls in include/linux/init.h */
876static char *initcall_level_names[] __initdata = { 924static char *initcall_level_names[] __initdata = {
877 "early", 925 "pure",
878 "core", 926 "core",
879 "postcore", 927 "postcore",
880 "arch", 928 "arch",
@@ -895,6 +943,7 @@ static void __init do_initcall_level(int level)
895 level, level, 943 level, level,
896 NULL, &repair_env_string); 944 NULL, &repair_env_string);
897 945
946 trace_initcall_level(initcall_level_names[level]);
898 for (fn = initcall_levels[level]; fn < initcall_levels[level+1]; fn++) 947 for (fn = initcall_levels[level]; fn < initcall_levels[level+1]; fn++)
899 do_one_initcall(*fn); 948 do_one_initcall(*fn);
900} 949}
@@ -929,6 +978,7 @@ static void __init do_pre_smp_initcalls(void)
929{ 978{
930 initcall_t *fn; 979 initcall_t *fn;
931 980
981 trace_initcall_level("early");
932 for (fn = __initcall_start; fn < __initcall0_start; fn++) 982 for (fn = __initcall_start; fn < __initcall0_start; fn++)
933 do_one_initcall(*fn); 983 do_one_initcall(*fn);
934} 984}
diff --git a/kernel/panic.c b/kernel/panic.c
index 9d833d913c84..6c3b08cd1139 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -554,6 +554,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
554 else 554 else
555 dump_stack(); 555 dump_stack();
556 556
557 print_irqtrace_events(current);
558
557 print_oops_end_marker(); 559 print_oops_end_marker();
558 560
559 /* Just a warning, don't kill lockdep. */ 561 /* Just a warning, don't kill lockdep. */
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 704e55129c3a..2f4af216bd6e 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -51,6 +51,7 @@
51#include <linux/uaccess.h> 51#include <linux/uaccess.h>
52#include <asm/sections.h> 52#include <asm/sections.h>
53 53
54#include <trace/events/initcall.h>
54#define CREATE_TRACE_POINTS 55#define CREATE_TRACE_POINTS
55#include <trace/events/printk.h> 56#include <trace/events/printk.h>
56 57
@@ -2780,6 +2781,7 @@ EXPORT_SYMBOL(unregister_console);
2780 */ 2781 */
2781void __init console_init(void) 2782void __init console_init(void)
2782{ 2783{
2784 int ret;
2783 initcall_t *call; 2785 initcall_t *call;
2784 2786
2785 /* Setup the default TTY line discipline. */ 2787 /* Setup the default TTY line discipline. */
@@ -2790,8 +2792,11 @@ void __init console_init(void)
2790 * inform about problems etc.. 2792 * inform about problems etc..
2791 */ 2793 */
2792 call = __con_initcall_start; 2794 call = __con_initcall_start;
2795 trace_initcall_level("console");
2793 while (call < __con_initcall_end) { 2796 while (call < __con_initcall_end) {
2794 (*call)(); 2797 trace_initcall_start((*call));
2798 ret = (*call)();
2799 trace_initcall_finish((*call), ret);
2795 call++; 2800 call++;
2796 } 2801 }
2797} 2802}
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 0b249e2f0c3c..c4f0f2e4126e 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -606,7 +606,10 @@ config HIST_TRIGGERS
606 event activity as an initial guide for further investigation 606 event activity as an initial guide for further investigation
607 using more advanced tools. 607 using more advanced tools.
608 608
609 See Documentation/trace/events.txt. 609 Inter-event tracing of quantities such as latencies is also
610 supported using hist triggers under this option.
611
612 See Documentation/trace/histogram.txt.
610 If in doubt, say N. 613 If in doubt, say N.
611 614
612config MMIOTRACE_TEST 615config MMIOTRACE_TEST
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index eac9ce2c57a2..16bbf062018f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3902,14 +3902,13 @@ static bool module_exists(const char *module)
3902{ 3902{
3903 /* All modules have the symbol __this_module */ 3903 /* All modules have the symbol __this_module */
3904 const char this_mod[] = "__this_module"; 3904 const char this_mod[] = "__this_module";
3905 const int modname_size = MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 1; 3905 char modname[MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 2];
3906 char modname[modname_size + 1];
3907 unsigned long val; 3906 unsigned long val;
3908 int n; 3907 int n;
3909 3908
3910 n = snprintf(modname, modname_size + 1, "%s:%s", module, this_mod); 3909 n = snprintf(modname, sizeof(modname), "%s:%s", module, this_mod);
3911 3910
3912 if (n > modname_size) 3911 if (n > sizeof(modname) - 1)
3913 return false; 3912 return false;
3914 3913
3915 val = module_kallsyms_lookup_name(modname); 3914 val = module_kallsyms_lookup_name(modname);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index dcf1c4dd3efe..c9cb9767d49b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -22,6 +22,7 @@
22#include <linux/hash.h> 22#include <linux/hash.h>
23#include <linux/list.h> 23#include <linux/list.h>
24#include <linux/cpu.h> 24#include <linux/cpu.h>
25#include <linux/oom.h>
25 26
26#include <asm/local.h> 27#include <asm/local.h>
27 28
@@ -41,6 +42,8 @@ int ring_buffer_print_entry_header(struct trace_seq *s)
41 RINGBUF_TYPE_PADDING); 42 RINGBUF_TYPE_PADDING);
42 trace_seq_printf(s, "\ttime_extend : type == %d\n", 43 trace_seq_printf(s, "\ttime_extend : type == %d\n",
43 RINGBUF_TYPE_TIME_EXTEND); 44 RINGBUF_TYPE_TIME_EXTEND);
45 trace_seq_printf(s, "\ttime_stamp : type == %d\n",
46 RINGBUF_TYPE_TIME_STAMP);
44 trace_seq_printf(s, "\tdata max type_len == %d\n", 47 trace_seq_printf(s, "\tdata max type_len == %d\n",
45 RINGBUF_TYPE_DATA_TYPE_LEN_MAX); 48 RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
46 49
@@ -140,12 +143,15 @@ int ring_buffer_print_entry_header(struct trace_seq *s)
140 143
141enum { 144enum {
142 RB_LEN_TIME_EXTEND = 8, 145 RB_LEN_TIME_EXTEND = 8,
143 RB_LEN_TIME_STAMP = 16, 146 RB_LEN_TIME_STAMP = 8,
144}; 147};
145 148
146#define skip_time_extend(event) \ 149#define skip_time_extend(event) \
147 ((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND)) 150 ((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND))
148 151
152#define extended_time(event) \
153 (event->type_len >= RINGBUF_TYPE_TIME_EXTEND)
154
149static inline int rb_null_event(struct ring_buffer_event *event) 155static inline int rb_null_event(struct ring_buffer_event *event)
150{ 156{
151 return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta; 157 return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
@@ -209,7 +215,7 @@ rb_event_ts_length(struct ring_buffer_event *event)
209{ 215{
210 unsigned len = 0; 216 unsigned len = 0;
211 217
212 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { 218 if (extended_time(event)) {
213 /* time extends include the data event after it */ 219 /* time extends include the data event after it */
214 len = RB_LEN_TIME_EXTEND; 220 len = RB_LEN_TIME_EXTEND;
215 event = skip_time_extend(event); 221 event = skip_time_extend(event);
@@ -231,7 +237,7 @@ unsigned ring_buffer_event_length(struct ring_buffer_event *event)
231{ 237{
232 unsigned length; 238 unsigned length;
233 239
234 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) 240 if (extended_time(event))
235 event = skip_time_extend(event); 241 event = skip_time_extend(event);
236 242
237 length = rb_event_length(event); 243 length = rb_event_length(event);
@@ -248,7 +254,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length);
248static __always_inline void * 254static __always_inline void *
249rb_event_data(struct ring_buffer_event *event) 255rb_event_data(struct ring_buffer_event *event)
250{ 256{
251 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) 257 if (extended_time(event))
252 event = skip_time_extend(event); 258 event = skip_time_extend(event);
253 BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX); 259 BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
254 /* If length is in len field, then array[0] has the data */ 260 /* If length is in len field, then array[0] has the data */
@@ -275,6 +281,27 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
275#define TS_MASK ((1ULL << TS_SHIFT) - 1) 281#define TS_MASK ((1ULL << TS_SHIFT) - 1)
276#define TS_DELTA_TEST (~TS_MASK) 282#define TS_DELTA_TEST (~TS_MASK)
277 283
284/**
285 * ring_buffer_event_time_stamp - return the event's extended timestamp
286 * @event: the event to get the timestamp of
287 *
288 * Returns the extended timestamp associated with a data event.
289 * An extended time_stamp is a 64-bit timestamp represented
290 * internally in a special way that makes the best use of space
291 * contained within a ring buffer event. This function decodes
292 * it and maps it to a straight u64 value.
293 */
294u64 ring_buffer_event_time_stamp(struct ring_buffer_event *event)
295{
296 u64 ts;
297
298 ts = event->array[0];
299 ts <<= TS_SHIFT;
300 ts += event->time_delta;
301
302 return ts;
303}
304
278/* Flag when events were overwritten */ 305/* Flag when events were overwritten */
279#define RB_MISSED_EVENTS (1 << 31) 306#define RB_MISSED_EVENTS (1 << 31)
280/* Missed count stored at end */ 307/* Missed count stored at end */
@@ -451,6 +478,7 @@ struct ring_buffer_per_cpu {
451 struct buffer_page *reader_page; 478 struct buffer_page *reader_page;
452 unsigned long lost_events; 479 unsigned long lost_events;
453 unsigned long last_overrun; 480 unsigned long last_overrun;
481 unsigned long nest;
454 local_t entries_bytes; 482 local_t entries_bytes;
455 local_t entries; 483 local_t entries;
456 local_t overrun; 484 local_t overrun;
@@ -488,6 +516,7 @@ struct ring_buffer {
488 u64 (*clock)(void); 516 u64 (*clock)(void);
489 517
490 struct rb_irq_work irq_work; 518 struct rb_irq_work irq_work;
519 bool time_stamp_abs;
491}; 520};
492 521
493struct ring_buffer_iter { 522struct ring_buffer_iter {
@@ -1134,30 +1163,60 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
1134static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu) 1163static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu)
1135{ 1164{
1136 struct buffer_page *bpage, *tmp; 1165 struct buffer_page *bpage, *tmp;
1166 bool user_thread = current->mm != NULL;
1167 gfp_t mflags;
1137 long i; 1168 long i;
1138 1169
1170 /*
1171 * Check if the available memory is there first.
1172 * Note, si_mem_available() only gives us a rough estimate of available
1173 * memory. It may not be accurate. But we don't care, we just want
1174 * to prevent doing any allocation when it is obvious that it is
1175 * not going to succeed.
1176 */
1177 i = si_mem_available();
1178 if (i < nr_pages)
1179 return -ENOMEM;
1180
1181 /*
1182 * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails
1183 * gracefully without invoking oom-killer and the system is not
1184 * destabilized.
1185 */
1186 mflags = GFP_KERNEL | __GFP_RETRY_MAYFAIL;
1187
1188 /*
1189 * If a user thread allocates too much, and si_mem_available()
1190 * reports there's enough memory, even though there is not.
1191 * Make sure the OOM killer kills this thread. This can happen
1192 * even with RETRY_MAYFAIL because another task may be doing
1193 * an allocation after this task has taken all memory.
1194 * This is the task the OOM killer needs to take out during this
1195 * loop, even if it was triggered by an allocation somewhere else.
1196 */
1197 if (user_thread)
1198 set_current_oom_origin();
1139 for (i = 0; i < nr_pages; i++) { 1199 for (i = 0; i < nr_pages; i++) {
1140 struct page *page; 1200 struct page *page;
1141 /* 1201
1142 * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails
1143 * gracefully without invoking oom-killer and the system is not
1144 * destabilized.
1145 */
1146 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 1202 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
1147 GFP_KERNEL | __GFP_RETRY_MAYFAIL, 1203 mflags, cpu_to_node(cpu));
1148 cpu_to_node(cpu));
1149 if (!bpage) 1204 if (!bpage)
1150 goto free_pages; 1205 goto free_pages;
1151 1206
1152 list_add(&bpage->list, pages); 1207 list_add(&bpage->list, pages);
1153 1208
1154 page = alloc_pages_node(cpu_to_node(cpu), 1209 page = alloc_pages_node(cpu_to_node(cpu), mflags, 0);
1155 GFP_KERNEL | __GFP_RETRY_MAYFAIL, 0);
1156 if (!page) 1210 if (!page)
1157 goto free_pages; 1211 goto free_pages;
1158 bpage->page = page_address(page); 1212 bpage->page = page_address(page);
1159 rb_init_page(bpage->page); 1213 rb_init_page(bpage->page);
1214
1215 if (user_thread && fatal_signal_pending(current))
1216 goto free_pages;
1160 } 1217 }
1218 if (user_thread)
1219 clear_current_oom_origin();
1161 1220
1162 return 0; 1221 return 0;
1163 1222
@@ -1166,6 +1225,8 @@ free_pages:
1166 list_del_init(&bpage->list); 1225 list_del_init(&bpage->list);
1167 free_buffer_page(bpage); 1226 free_buffer_page(bpage);
1168 } 1227 }
1228 if (user_thread)
1229 clear_current_oom_origin();
1169 1230
1170 return -ENOMEM; 1231 return -ENOMEM;
1171} 1232}
@@ -1382,6 +1443,16 @@ void ring_buffer_set_clock(struct ring_buffer *buffer,
1382 buffer->clock = clock; 1443 buffer->clock = clock;
1383} 1444}
1384 1445
1446void ring_buffer_set_time_stamp_abs(struct ring_buffer *buffer, bool abs)
1447{
1448 buffer->time_stamp_abs = abs;
1449}
1450
1451bool ring_buffer_time_stamp_abs(struct ring_buffer *buffer)
1452{
1453 return buffer->time_stamp_abs;
1454}
1455
1385static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); 1456static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
1386 1457
1387static inline unsigned long rb_page_entries(struct buffer_page *bpage) 1458static inline unsigned long rb_page_entries(struct buffer_page *bpage)
@@ -2206,12 +2277,15 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
2206 2277
2207/* Slow path, do not inline */ 2278/* Slow path, do not inline */
2208static noinline struct ring_buffer_event * 2279static noinline struct ring_buffer_event *
2209rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) 2280rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs)
2210{ 2281{
2211 event->type_len = RINGBUF_TYPE_TIME_EXTEND; 2282 if (abs)
2283 event->type_len = RINGBUF_TYPE_TIME_STAMP;
2284 else
2285 event->type_len = RINGBUF_TYPE_TIME_EXTEND;
2212 2286
2213 /* Not the first event on the page? */ 2287 /* Not the first event on the page, or not delta? */
2214 if (rb_event_index(event)) { 2288 if (abs || rb_event_index(event)) {
2215 event->time_delta = delta & TS_MASK; 2289 event->time_delta = delta & TS_MASK;
2216 event->array[0] = delta >> TS_SHIFT; 2290 event->array[0] = delta >> TS_SHIFT;
2217 } else { 2291 } else {
@@ -2254,7 +2328,9 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
2254 * add it to the start of the resevered space. 2328 * add it to the start of the resevered space.
2255 */ 2329 */
2256 if (unlikely(info->add_timestamp)) { 2330 if (unlikely(info->add_timestamp)) {
2257 event = rb_add_time_stamp(event, delta); 2331 bool abs = ring_buffer_time_stamp_abs(cpu_buffer->buffer);
2332
2333 event = rb_add_time_stamp(event, info->delta, abs);
2258 length -= RB_LEN_TIME_EXTEND; 2334 length -= RB_LEN_TIME_EXTEND;
2259 delta = 0; 2335 delta = 0;
2260 } 2336 }
@@ -2442,7 +2518,7 @@ static __always_inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer
2442 2518
2443static inline void rb_event_discard(struct ring_buffer_event *event) 2519static inline void rb_event_discard(struct ring_buffer_event *event)
2444{ 2520{
2445 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) 2521 if (extended_time(event))
2446 event = skip_time_extend(event); 2522 event = skip_time_extend(event);
2447 2523
2448 /* array[0] holds the actual length for the discarded event */ 2524 /* array[0] holds the actual length for the discarded event */
@@ -2486,10 +2562,11 @@ rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
2486 cpu_buffer->write_stamp = 2562 cpu_buffer->write_stamp =
2487 cpu_buffer->commit_page->page->time_stamp; 2563 cpu_buffer->commit_page->page->time_stamp;
2488 else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { 2564 else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
2489 delta = event->array[0]; 2565 delta = ring_buffer_event_time_stamp(event);
2490 delta <<= TS_SHIFT;
2491 delta += event->time_delta;
2492 cpu_buffer->write_stamp += delta; 2566 cpu_buffer->write_stamp += delta;
2567 } else if (event->type_len == RINGBUF_TYPE_TIME_STAMP) {
2568 delta = ring_buffer_event_time_stamp(event);
2569 cpu_buffer->write_stamp = delta;
2493 } else 2570 } else
2494 cpu_buffer->write_stamp += event->time_delta; 2571 cpu_buffer->write_stamp += event->time_delta;
2495 } 2572 }
@@ -2581,10 +2658,10 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
2581 bit = pc & NMI_MASK ? RB_CTX_NMI : 2658 bit = pc & NMI_MASK ? RB_CTX_NMI :
2582 pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ; 2659 pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ;
2583 2660
2584 if (unlikely(val & (1 << bit))) 2661 if (unlikely(val & (1 << (bit + cpu_buffer->nest))))
2585 return 1; 2662 return 1;
2586 2663
2587 val |= (1 << bit); 2664 val |= (1 << (bit + cpu_buffer->nest));
2588 cpu_buffer->current_context = val; 2665 cpu_buffer->current_context = val;
2589 2666
2590 return 0; 2667 return 0;
@@ -2593,7 +2670,57 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
2593static __always_inline void 2670static __always_inline void
2594trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer) 2671trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
2595{ 2672{
2596 cpu_buffer->current_context &= cpu_buffer->current_context - 1; 2673 cpu_buffer->current_context &=
2674 cpu_buffer->current_context - (1 << cpu_buffer->nest);
2675}
2676
2677/* The recursive locking above uses 4 bits */
2678#define NESTED_BITS 4
2679
2680/**
2681 * ring_buffer_nest_start - Allow to trace while nested
2682 * @buffer: The ring buffer to modify
2683 *
2684 * The ring buffer has a safty mechanism to prevent recursion.
2685 * But there may be a case where a trace needs to be done while
2686 * tracing something else. In this case, calling this function
2687 * will allow this function to nest within a currently active
2688 * ring_buffer_lock_reserve().
2689 *
2690 * Call this function before calling another ring_buffer_lock_reserve() and
2691 * call ring_buffer_nest_end() after the nested ring_buffer_unlock_commit().
2692 */
2693void ring_buffer_nest_start(struct ring_buffer *buffer)
2694{
2695 struct ring_buffer_per_cpu *cpu_buffer;
2696 int cpu;
2697
2698 /* Enabled by ring_buffer_nest_end() */
2699 preempt_disable_notrace();
2700 cpu = raw_smp_processor_id();
2701 cpu_buffer = buffer->buffers[cpu];
2702 /* This is the shift value for the above recusive locking */
2703 cpu_buffer->nest += NESTED_BITS;
2704}
2705
2706/**
2707 * ring_buffer_nest_end - Allow to trace while nested
2708 * @buffer: The ring buffer to modify
2709 *
2710 * Must be called after ring_buffer_nest_start() and after the
2711 * ring_buffer_unlock_commit().
2712 */
2713void ring_buffer_nest_end(struct ring_buffer *buffer)
2714{
2715 struct ring_buffer_per_cpu *cpu_buffer;
2716 int cpu;
2717
2718 /* disabled by ring_buffer_nest_start() */
2719 cpu = raw_smp_processor_id();
2720 cpu_buffer = buffer->buffers[cpu];
2721 /* This is the shift value for the above recusive locking */
2722 cpu_buffer->nest -= NESTED_BITS;
2723 preempt_enable_notrace();
2597} 2724}
2598 2725
2599/** 2726/**
@@ -2637,7 +2764,8 @@ rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
2637 sched_clock_stable() ? "" : 2764 sched_clock_stable() ? "" :
2638 "If you just came from a suspend/resume,\n" 2765 "If you just came from a suspend/resume,\n"
2639 "please switch to the trace global clock:\n" 2766 "please switch to the trace global clock:\n"
2640 " echo global > /sys/kernel/debug/tracing/trace_clock\n"); 2767 " echo global > /sys/kernel/debug/tracing/trace_clock\n"
2768 "or add trace_clock=global to the kernel command line\n");
2641 info->add_timestamp = 1; 2769 info->add_timestamp = 1;
2642} 2770}
2643 2771
@@ -2669,7 +2797,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
2669 * If this is the first commit on the page, then it has the same 2797 * If this is the first commit on the page, then it has the same
2670 * timestamp as the page itself. 2798 * timestamp as the page itself.
2671 */ 2799 */
2672 if (!tail) 2800 if (!tail && !ring_buffer_time_stamp_abs(cpu_buffer->buffer))
2673 info->delta = 0; 2801 info->delta = 0;
2674 2802
2675 /* See if we shot pass the end of this buffer page */ 2803 /* See if we shot pass the end of this buffer page */
@@ -2746,8 +2874,11 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2746 /* make sure this diff is calculated here */ 2874 /* make sure this diff is calculated here */
2747 barrier(); 2875 barrier();
2748 2876
2749 /* Did the write stamp get updated already? */ 2877 if (ring_buffer_time_stamp_abs(buffer)) {
2750 if (likely(info.ts >= cpu_buffer->write_stamp)) { 2878 info.delta = info.ts;
2879 rb_handle_timestamp(cpu_buffer, &info);
2880 } else /* Did the write stamp get updated already? */
2881 if (likely(info.ts >= cpu_buffer->write_stamp)) {
2751 info.delta = diff; 2882 info.delta = diff;
2752 if (unlikely(test_time_stamp(info.delta))) 2883 if (unlikely(test_time_stamp(info.delta)))
2753 rb_handle_timestamp(cpu_buffer, &info); 2884 rb_handle_timestamp(cpu_buffer, &info);
@@ -3429,14 +3560,13 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
3429 return; 3560 return;
3430 3561
3431 case RINGBUF_TYPE_TIME_EXTEND: 3562 case RINGBUF_TYPE_TIME_EXTEND:
3432 delta = event->array[0]; 3563 delta = ring_buffer_event_time_stamp(event);
3433 delta <<= TS_SHIFT;
3434 delta += event->time_delta;
3435 cpu_buffer->read_stamp += delta; 3564 cpu_buffer->read_stamp += delta;
3436 return; 3565 return;
3437 3566
3438 case RINGBUF_TYPE_TIME_STAMP: 3567 case RINGBUF_TYPE_TIME_STAMP:
3439 /* FIXME: not implemented */ 3568 delta = ring_buffer_event_time_stamp(event);
3569 cpu_buffer->read_stamp = delta;
3440 return; 3570 return;
3441 3571
3442 case RINGBUF_TYPE_DATA: 3572 case RINGBUF_TYPE_DATA:
@@ -3460,14 +3590,13 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
3460 return; 3590 return;
3461 3591
3462 case RINGBUF_TYPE_TIME_EXTEND: 3592 case RINGBUF_TYPE_TIME_EXTEND:
3463 delta = event->array[0]; 3593 delta = ring_buffer_event_time_stamp(event);
3464 delta <<= TS_SHIFT;
3465 delta += event->time_delta;
3466 iter->read_stamp += delta; 3594 iter->read_stamp += delta;
3467 return; 3595 return;
3468 3596
3469 case RINGBUF_TYPE_TIME_STAMP: 3597 case RINGBUF_TYPE_TIME_STAMP:
3470 /* FIXME: not implemented */ 3598 delta = ring_buffer_event_time_stamp(event);
3599 iter->read_stamp = delta;
3471 return; 3600 return;
3472 3601
3473 case RINGBUF_TYPE_DATA: 3602 case RINGBUF_TYPE_DATA:
@@ -3691,6 +3820,8 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
3691 struct buffer_page *reader; 3820 struct buffer_page *reader;
3692 int nr_loops = 0; 3821 int nr_loops = 0;
3693 3822
3823 if (ts)
3824 *ts = 0;
3694 again: 3825 again:
3695 /* 3826 /*
3696 * We repeat when a time extend is encountered. 3827 * We repeat when a time extend is encountered.
@@ -3727,12 +3858,17 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
3727 goto again; 3858 goto again;
3728 3859
3729 case RINGBUF_TYPE_TIME_STAMP: 3860 case RINGBUF_TYPE_TIME_STAMP:
3730 /* FIXME: not implemented */ 3861 if (ts) {
3862 *ts = ring_buffer_event_time_stamp(event);
3863 ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
3864 cpu_buffer->cpu, ts);
3865 }
3866 /* Internal data, OK to advance */
3731 rb_advance_reader(cpu_buffer); 3867 rb_advance_reader(cpu_buffer);
3732 goto again; 3868 goto again;
3733 3869
3734 case RINGBUF_TYPE_DATA: 3870 case RINGBUF_TYPE_DATA:
3735 if (ts) { 3871 if (ts && !(*ts)) {
3736 *ts = cpu_buffer->read_stamp + event->time_delta; 3872 *ts = cpu_buffer->read_stamp + event->time_delta;
3737 ring_buffer_normalize_time_stamp(cpu_buffer->buffer, 3873 ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
3738 cpu_buffer->cpu, ts); 3874 cpu_buffer->cpu, ts);
@@ -3757,6 +3893,9 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3757 struct ring_buffer_event *event; 3893 struct ring_buffer_event *event;
3758 int nr_loops = 0; 3894 int nr_loops = 0;
3759 3895
3896 if (ts)
3897 *ts = 0;
3898
3760 cpu_buffer = iter->cpu_buffer; 3899 cpu_buffer = iter->cpu_buffer;
3761 buffer = cpu_buffer->buffer; 3900 buffer = cpu_buffer->buffer;
3762 3901
@@ -3809,12 +3948,17 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3809 goto again; 3948 goto again;
3810 3949
3811 case RINGBUF_TYPE_TIME_STAMP: 3950 case RINGBUF_TYPE_TIME_STAMP:
3812 /* FIXME: not implemented */ 3951 if (ts) {
3952 *ts = ring_buffer_event_time_stamp(event);
3953 ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
3954 cpu_buffer->cpu, ts);
3955 }
3956 /* Internal data, OK to advance */
3813 rb_advance_iter(iter); 3957 rb_advance_iter(iter);
3814 goto again; 3958 goto again;
3815 3959
3816 case RINGBUF_TYPE_DATA: 3960 case RINGBUF_TYPE_DATA:
3817 if (ts) { 3961 if (ts && !(*ts)) {
3818 *ts = iter->read_stamp + event->time_delta; 3962 *ts = iter->read_stamp + event->time_delta;
3819 ring_buffer_normalize_time_stamp(buffer, 3963 ring_buffer_normalize_time_stamp(buffer,
3820 cpu_buffer->cpu, ts); 3964 cpu_buffer->cpu, ts);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5071931eb943..dfbcf9ee1447 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -41,6 +41,7 @@
41#include <linux/nmi.h> 41#include <linux/nmi.h>
42#include <linux/fs.h> 42#include <linux/fs.h>
43#include <linux/trace.h> 43#include <linux/trace.h>
44#include <linux/sched/clock.h>
44#include <linux/sched/rt.h> 45#include <linux/sched/rt.h>
45 46
46#include "trace.h" 47#include "trace.h"
@@ -1168,6 +1169,14 @@ static struct {
1168 ARCH_TRACE_CLOCKS 1169 ARCH_TRACE_CLOCKS
1169}; 1170};
1170 1171
1172bool trace_clock_in_ns(struct trace_array *tr)
1173{
1174 if (trace_clocks[tr->clock_id].in_ns)
1175 return true;
1176
1177 return false;
1178}
1179
1171/* 1180/*
1172 * trace_parser_get_init - gets the buffer for trace parser 1181 * trace_parser_get_init - gets the buffer for trace parser
1173 */ 1182 */
@@ -2269,7 +2278,7 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
2269 2278
2270 *current_rb = trace_file->tr->trace_buffer.buffer; 2279 *current_rb = trace_file->tr->trace_buffer.buffer;
2271 2280
2272 if ((trace_file->flags & 2281 if (!ring_buffer_time_stamp_abs(*current_rb) && (trace_file->flags &
2273 (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) && 2282 (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) &&
2274 (entry = this_cpu_read(trace_buffered_event))) { 2283 (entry = this_cpu_read(trace_buffered_event))) {
2275 /* Try to use the per cpu buffer first */ 2284 /* Try to use the per cpu buffer first */
@@ -4515,6 +4524,9 @@ static const char readme_msg[] =
4515#ifdef CONFIG_X86_64 4524#ifdef CONFIG_X86_64
4516 " x86-tsc: TSC cycle counter\n" 4525 " x86-tsc: TSC cycle counter\n"
4517#endif 4526#endif
4527 "\n timestamp_mode\t-view the mode used to timestamp events\n"
4528 " delta: Delta difference against a buffer-wide timestamp\n"
4529 " absolute: Absolute (standalone) timestamp\n"
4518 "\n trace_marker\t\t- Writes into this file writes into the kernel buffer\n" 4530 "\n trace_marker\t\t- Writes into this file writes into the kernel buffer\n"
4519 "\n trace_marker_raw\t\t- Writes into this file writes binary data into the kernel buffer\n" 4531 "\n trace_marker_raw\t\t- Writes into this file writes binary data into the kernel buffer\n"
4520 " tracing_cpumask\t- Limit which CPUs to trace\n" 4532 " tracing_cpumask\t- Limit which CPUs to trace\n"
@@ -4691,8 +4703,9 @@ static const char readme_msg[] =
4691 "\t .sym display an address as a symbol\n" 4703 "\t .sym display an address as a symbol\n"
4692 "\t .sym-offset display an address as a symbol and offset\n" 4704 "\t .sym-offset display an address as a symbol and offset\n"
4693 "\t .execname display a common_pid as a program name\n" 4705 "\t .execname display a common_pid as a program name\n"
4694 "\t .syscall display a syscall id as a syscall name\n\n" 4706 "\t .syscall display a syscall id as a syscall name\n"
4695 "\t .log2 display log2 value rather than raw number\n\n" 4707 "\t .log2 display log2 value rather than raw number\n"
4708 "\t .usecs display a common_timestamp in microseconds\n\n"
4696 "\t The 'pause' parameter can be used to pause an existing hist\n" 4709 "\t The 'pause' parameter can be used to pause an existing hist\n"
4697 "\t trigger or to start a hist trigger but not log any events\n" 4710 "\t trigger or to start a hist trigger but not log any events\n"
4698 "\t until told to do so. 'continue' can be used to start or\n" 4711 "\t until told to do so. 'continue' can be used to start or\n"
@@ -6202,7 +6215,7 @@ static int tracing_clock_show(struct seq_file *m, void *v)
6202 return 0; 6215 return 0;
6203} 6216}
6204 6217
6205static int tracing_set_clock(struct trace_array *tr, const char *clockstr) 6218int tracing_set_clock(struct trace_array *tr, const char *clockstr)
6206{ 6219{
6207 int i; 6220 int i;
6208 6221
@@ -6282,6 +6295,71 @@ static int tracing_clock_open(struct inode *inode, struct file *file)
6282 return ret; 6295 return ret;
6283} 6296}
6284 6297
6298static int tracing_time_stamp_mode_show(struct seq_file *m, void *v)
6299{
6300 struct trace_array *tr = m->private;
6301
6302 mutex_lock(&trace_types_lock);
6303
6304 if (ring_buffer_time_stamp_abs(tr->trace_buffer.buffer))
6305 seq_puts(m, "delta [absolute]\n");
6306 else
6307 seq_puts(m, "[delta] absolute\n");
6308
6309 mutex_unlock(&trace_types_lock);
6310
6311 return 0;
6312}
6313
6314static int tracing_time_stamp_mode_open(struct inode *inode, struct file *file)
6315{
6316 struct trace_array *tr = inode->i_private;
6317 int ret;
6318
6319 if (tracing_disabled)
6320 return -ENODEV;
6321
6322 if (trace_array_get(tr))
6323 return -ENODEV;
6324
6325 ret = single_open(file, tracing_time_stamp_mode_show, inode->i_private);
6326 if (ret < 0)
6327 trace_array_put(tr);
6328
6329 return ret;
6330}
6331
6332int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs)
6333{
6334 int ret = 0;
6335
6336 mutex_lock(&trace_types_lock);
6337
6338 if (abs && tr->time_stamp_abs_ref++)
6339 goto out;
6340
6341 if (!abs) {
6342 if (WARN_ON_ONCE(!tr->time_stamp_abs_ref)) {
6343 ret = -EINVAL;
6344 goto out;
6345 }
6346
6347 if (--tr->time_stamp_abs_ref)
6348 goto out;
6349 }
6350
6351 ring_buffer_set_time_stamp_abs(tr->trace_buffer.buffer, abs);
6352
6353#ifdef CONFIG_TRACER_MAX_TRACE
6354 if (tr->max_buffer.buffer)
6355 ring_buffer_set_time_stamp_abs(tr->max_buffer.buffer, abs);
6356#endif
6357 out:
6358 mutex_unlock(&trace_types_lock);
6359
6360 return ret;
6361}
6362
6285struct ftrace_buffer_info { 6363struct ftrace_buffer_info {
6286 struct trace_iterator iter; 6364 struct trace_iterator iter;
6287 void *spare; 6365 void *spare;
@@ -6529,6 +6607,13 @@ static const struct file_operations trace_clock_fops = {
6529 .write = tracing_clock_write, 6607 .write = tracing_clock_write,
6530}; 6608};
6531 6609
6610static const struct file_operations trace_time_stamp_mode_fops = {
6611 .open = tracing_time_stamp_mode_open,
6612 .read = seq_read,
6613 .llseek = seq_lseek,
6614 .release = tracing_single_release_tr,
6615};
6616
6532#ifdef CONFIG_TRACER_SNAPSHOT 6617#ifdef CONFIG_TRACER_SNAPSHOT
6533static const struct file_operations snapshot_fops = { 6618static const struct file_operations snapshot_fops = {
6534 .open = tracing_snapshot_open, 6619 .open = tracing_snapshot_open,
@@ -7699,6 +7784,7 @@ static int instance_mkdir(const char *name)
7699 7784
7700 INIT_LIST_HEAD(&tr->systems); 7785 INIT_LIST_HEAD(&tr->systems);
7701 INIT_LIST_HEAD(&tr->events); 7786 INIT_LIST_HEAD(&tr->events);
7787 INIT_LIST_HEAD(&tr->hist_vars);
7702 7788
7703 if (allocate_trace_buffers(tr, trace_buf_size) < 0) 7789 if (allocate_trace_buffers(tr, trace_buf_size) < 0)
7704 goto out_free_tr; 7790 goto out_free_tr;
@@ -7851,6 +7937,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
7851 trace_create_file("tracing_on", 0644, d_tracer, 7937 trace_create_file("tracing_on", 0644, d_tracer,
7852 tr, &rb_simple_fops); 7938 tr, &rb_simple_fops);
7853 7939
7940 trace_create_file("timestamp_mode", 0444, d_tracer, tr,
7941 &trace_time_stamp_mode_fops);
7942
7854 create_trace_options_dir(tr); 7943 create_trace_options_dir(tr);
7855 7944
7856#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) 7945#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
@@ -8446,6 +8535,7 @@ __init static int tracer_alloc_buffers(void)
8446 8535
8447 INIT_LIST_HEAD(&global_trace.systems); 8536 INIT_LIST_HEAD(&global_trace.systems);
8448 INIT_LIST_HEAD(&global_trace.events); 8537 INIT_LIST_HEAD(&global_trace.events);
8538 INIT_LIST_HEAD(&global_trace.hist_vars);
8449 list_add(&global_trace.list, &ftrace_trace_arrays); 8539 list_add(&global_trace.list, &ftrace_trace_arrays);
8450 8540
8451 apply_trace_boot_options(); 8541 apply_trace_boot_options();
@@ -8507,3 +8597,21 @@ __init static int clear_boot_tracer(void)
8507 8597
8508fs_initcall(tracer_init_tracefs); 8598fs_initcall(tracer_init_tracefs);
8509late_initcall_sync(clear_boot_tracer); 8599late_initcall_sync(clear_boot_tracer);
8600
8601#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
8602__init static int tracing_set_default_clock(void)
8603{
8604 /* sched_clock_stable() is determined in late_initcall */
8605 if (!trace_boot_clock && !sched_clock_stable()) {
8606 printk(KERN_WARNING
8607 "Unstable clock detected, switching default tracing clock to \"global\"\n"
8608 "If you want to keep using the local clock, then add:\n"
8609 " \"trace_clock=local\"\n"
8610 "on the kernel command line\n");
8611 tracing_set_clock(&global_trace, "global");
8612 }
8613
8614 return 0;
8615}
8616late_initcall_sync(tracing_set_default_clock);
8617#endif
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2a6d0325a761..6fb46a06c9dc 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -273,6 +273,8 @@ struct trace_array {
273 /* function tracing enabled */ 273 /* function tracing enabled */
274 int function_enabled; 274 int function_enabled;
275#endif 275#endif
276 int time_stamp_abs_ref;
277 struct list_head hist_vars;
276}; 278};
277 279
278enum { 280enum {
@@ -286,6 +288,11 @@ extern struct mutex trace_types_lock;
286extern int trace_array_get(struct trace_array *tr); 288extern int trace_array_get(struct trace_array *tr);
287extern void trace_array_put(struct trace_array *tr); 289extern void trace_array_put(struct trace_array *tr);
288 290
291extern int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs);
292extern int tracing_set_clock(struct trace_array *tr, const char *clockstr);
293
294extern bool trace_clock_in_ns(struct trace_array *tr);
295
289/* 296/*
290 * The global tracer (top) should be the first trace array added, 297 * The global tracer (top) should be the first trace array added,
291 * but we check the flag anyway. 298 * but we check the flag anyway.
@@ -1209,12 +1216,11 @@ struct ftrace_event_field {
1209 int is_signed; 1216 int is_signed;
1210}; 1217};
1211 1218
1219struct prog_entry;
1220
1212struct event_filter { 1221struct event_filter {
1213 int n_preds; /* Number assigned */ 1222 struct prog_entry __rcu *prog;
1214 int a_preds; /* allocated */ 1223 char *filter_string;
1215 struct filter_pred __rcu *preds;
1216 struct filter_pred __rcu *root;
1217 char *filter_string;
1218}; 1224};
1219 1225
1220struct event_subsystem { 1226struct event_subsystem {
@@ -1291,7 +1297,7 @@ __event_trigger_test_discard(struct trace_event_file *file,
1291 unsigned long eflags = file->flags; 1297 unsigned long eflags = file->flags;
1292 1298
1293 if (eflags & EVENT_FILE_FL_TRIGGER_COND) 1299 if (eflags & EVENT_FILE_FL_TRIGGER_COND)
1294 *tt = event_triggers_call(file, entry); 1300 *tt = event_triggers_call(file, entry, event);
1295 1301
1296 if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) || 1302 if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) ||
1297 (unlikely(file->flags & EVENT_FILE_FL_FILTERED) && 1303 (unlikely(file->flags & EVENT_FILE_FL_FILTERED) &&
@@ -1328,7 +1334,7 @@ event_trigger_unlock_commit(struct trace_event_file *file,
1328 trace_buffer_unlock_commit(file->tr, buffer, event, irq_flags, pc); 1334 trace_buffer_unlock_commit(file->tr, buffer, event, irq_flags, pc);
1329 1335
1330 if (tt) 1336 if (tt)
1331 event_triggers_post_call(file, tt, entry); 1337 event_triggers_post_call(file, tt, entry, event);
1332} 1338}
1333 1339
1334/** 1340/**
@@ -1361,7 +1367,7 @@ event_trigger_unlock_commit_regs(struct trace_event_file *file,
1361 irq_flags, pc, regs); 1367 irq_flags, pc, regs);
1362 1368
1363 if (tt) 1369 if (tt)
1364 event_triggers_post_call(file, tt, entry); 1370 event_triggers_post_call(file, tt, entry, event);
1365} 1371}
1366 1372
1367#define FILTER_PRED_INVALID ((unsigned short)-1) 1373#define FILTER_PRED_INVALID ((unsigned short)-1)
@@ -1406,12 +1412,8 @@ struct filter_pred {
1406 unsigned short *ops; 1412 unsigned short *ops;
1407 struct ftrace_event_field *field; 1413 struct ftrace_event_field *field;
1408 int offset; 1414 int offset;
1409 int not; 1415 int not;
1410 int op; 1416 int op;
1411 unsigned short index;
1412 unsigned short parent;
1413 unsigned short left;
1414 unsigned short right;
1415}; 1417};
1416 1418
1417static inline bool is_string_field(struct ftrace_event_field *field) 1419static inline bool is_string_field(struct ftrace_event_field *field)
@@ -1543,6 +1545,8 @@ extern void pause_named_trigger(struct event_trigger_data *data);
1543extern void unpause_named_trigger(struct event_trigger_data *data); 1545extern void unpause_named_trigger(struct event_trigger_data *data);
1544extern void set_named_trigger_data(struct event_trigger_data *data, 1546extern void set_named_trigger_data(struct event_trigger_data *data,
1545 struct event_trigger_data *named_data); 1547 struct event_trigger_data *named_data);
1548extern struct event_trigger_data *
1549get_named_trigger_data(struct event_trigger_data *data);
1546extern int register_event_command(struct event_command *cmd); 1550extern int register_event_command(struct event_command *cmd);
1547extern int unregister_event_command(struct event_command *cmd); 1551extern int unregister_event_command(struct event_command *cmd);
1548extern int register_trigger_hist_enable_disable_cmds(void); 1552extern int register_trigger_hist_enable_disable_cmds(void);
@@ -1586,7 +1590,8 @@ extern int register_trigger_hist_enable_disable_cmds(void);
1586 */ 1590 */
1587struct event_trigger_ops { 1591struct event_trigger_ops {
1588 void (*func)(struct event_trigger_data *data, 1592 void (*func)(struct event_trigger_data *data,
1589 void *rec); 1593 void *rec,
1594 struct ring_buffer_event *rbe);
1590 int (*init)(struct event_trigger_ops *ops, 1595 int (*init)(struct event_trigger_ops *ops,
1591 struct event_trigger_data *data); 1596 struct event_trigger_data *data);
1592 void (*free)(struct event_trigger_ops *ops, 1597 void (*free)(struct event_trigger_ops *ops,
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 5fdc779f411d..d8a188e0418a 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -96,7 +96,7 @@ u64 notrace trace_clock_global(void)
96 int this_cpu; 96 int this_cpu;
97 u64 now; 97 u64 now;
98 98
99 local_irq_save(flags); 99 raw_local_irq_save(flags);
100 100
101 this_cpu = raw_smp_processor_id(); 101 this_cpu = raw_smp_processor_id();
102 now = sched_clock_cpu(this_cpu); 102 now = sched_clock_cpu(this_cpu);
@@ -122,7 +122,7 @@ u64 notrace trace_clock_global(void)
122 arch_spin_unlock(&trace_clock_struct.lock); 122 arch_spin_unlock(&trace_clock_struct.lock);
123 123
124 out: 124 out:
125 local_irq_restore(flags); 125 raw_local_irq_restore(flags);
126 126
127 return now; 127 return now;
128} 128}
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index a764aec3c9a1..1bda4ec95e18 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -33,163 +33,595 @@
33 "# Only events with the given fields will be affected.\n" \ 33 "# Only events with the given fields will be affected.\n" \
34 "# If no events are modified, an error message will be displayed here" 34 "# If no events are modified, an error message will be displayed here"
35 35
36enum filter_op_ids 36/* Due to token parsing '<=' must be before '<' and '>=' must be before '>' */
37{ 37#define OPS \
38 OP_OR, 38 C( OP_GLOB, "~" ), \
39 OP_AND, 39 C( OP_NE, "!=" ), \
40 OP_GLOB, 40 C( OP_EQ, "==" ), \
41 OP_NE, 41 C( OP_LE, "<=" ), \
42 OP_EQ, 42 C( OP_LT, "<" ), \
43 OP_LT, 43 C( OP_GE, ">=" ), \
44 OP_LE, 44 C( OP_GT, ">" ), \
45 OP_GT, 45 C( OP_BAND, "&" ), \
46 OP_GE, 46 C( OP_MAX, NULL )
47 OP_BAND,
48 OP_NOT,
49 OP_NONE,
50 OP_OPEN_PAREN,
51};
52 47
53struct filter_op { 48#undef C
54 int id; 49#define C(a, b) a
55 char *string;
56 int precedence;
57};
58 50
59/* Order must be the same as enum filter_op_ids above */ 51enum filter_op_ids { OPS };
60static struct filter_op filter_ops[] = {
61 { OP_OR, "||", 1 },
62 { OP_AND, "&&", 2 },
63 { OP_GLOB, "~", 4 },
64 { OP_NE, "!=", 4 },
65 { OP_EQ, "==", 4 },
66 { OP_LT, "<", 5 },
67 { OP_LE, "<=", 5 },
68 { OP_GT, ">", 5 },
69 { OP_GE, ">=", 5 },
70 { OP_BAND, "&", 6 },
71 { OP_NOT, "!", 6 },
72 { OP_NONE, "OP_NONE", 0 },
73 { OP_OPEN_PAREN, "(", 0 },
74};
75 52
76enum { 53#undef C
77 FILT_ERR_NONE, 54#define C(a, b) b
78 FILT_ERR_INVALID_OP,
79 FILT_ERR_UNBALANCED_PAREN,
80 FILT_ERR_TOO_MANY_OPERANDS,
81 FILT_ERR_OPERAND_TOO_LONG,
82 FILT_ERR_FIELD_NOT_FOUND,
83 FILT_ERR_ILLEGAL_FIELD_OP,
84 FILT_ERR_ILLEGAL_INTVAL,
85 FILT_ERR_BAD_SUBSYS_FILTER,
86 FILT_ERR_TOO_MANY_PREDS,
87 FILT_ERR_MISSING_FIELD,
88 FILT_ERR_INVALID_FILTER,
89 FILT_ERR_IP_FIELD_ONLY,
90 FILT_ERR_ILLEGAL_NOT_OP,
91};
92 55
93static char *err_text[] = { 56static const char * ops[] = { OPS };
94 "No error",
95 "Invalid operator",
96 "Unbalanced parens",
97 "Too many operands",
98 "Operand too long",
99 "Field not found",
100 "Illegal operation for field type",
101 "Illegal integer value",
102 "Couldn't find or set field in one of a subsystem's events",
103 "Too many terms in predicate expression",
104 "Missing field name and/or value",
105 "Meaningless filter expression",
106 "Only 'ip' field is supported for function trace",
107 "Illegal use of '!'",
108};
109 57
110struct opstack_op { 58/*
111 enum filter_op_ids op; 59 * pred functions are OP_LE, OP_LT, OP_GE, OP_GT, and OP_BAND
112 struct list_head list; 60 * pred_funcs_##type below must match the order of them above.
113}; 61 */
62#define PRED_FUNC_START OP_LE
63#define PRED_FUNC_MAX (OP_BAND - PRED_FUNC_START)
64
65#define ERRORS \
66 C(NONE, "No error"), \
67 C(INVALID_OP, "Invalid operator"), \
68 C(TOO_MANY_OPEN, "Too many '('"), \
69 C(TOO_MANY_CLOSE, "Too few '('"), \
70 C(MISSING_QUOTE, "Missing matching quote"), \
71 C(OPERAND_TOO_LONG, "Operand too long"), \
72 C(EXPECT_STRING, "Expecting string field"), \
73 C(EXPECT_DIGIT, "Expecting numeric field"), \
74 C(ILLEGAL_FIELD_OP, "Illegal operation for field type"), \
75 C(FIELD_NOT_FOUND, "Field not found"), \
76 C(ILLEGAL_INTVAL, "Illegal integer value"), \
77 C(BAD_SUBSYS_FILTER, "Couldn't find or set field in one of a subsystem's events"), \
78 C(TOO_MANY_PREDS, "Too many terms in predicate expression"), \
79 C(INVALID_FILTER, "Meaningless filter expression"), \
80 C(IP_FIELD_ONLY, "Only 'ip' field is supported for function trace"), \
81 C(INVALID_VALUE, "Invalid value (did you forget quotes)?"),
82
83#undef C
84#define C(a, b) FILT_ERR_##a
85
86enum { ERRORS };
87
88#undef C
89#define C(a, b) b
90
91static char *err_text[] = { ERRORS };
92
93/* Called after a '!' character but "!=" and "!~" are not "not"s */
94static bool is_not(const char *str)
95{
96 switch (str[1]) {
97 case '=':
98 case '~':
99 return false;
100 }
101 return true;
102}
114 103
115struct postfix_elt { 104/**
116 enum filter_op_ids op; 105 * prog_entry - a singe entry in the filter program
117 char *operand; 106 * @target: Index to jump to on a branch (actually one minus the index)
118 struct list_head list; 107 * @when_to_branch: The value of the result of the predicate to do a branch
108 * @pred: The predicate to execute.
109 */
110struct prog_entry {
111 int target;
112 int when_to_branch;
113 struct filter_pred *pred;
119}; 114};
120 115
121struct filter_parse_state { 116/**
122 struct filter_op *ops; 117 * update_preds- assign a program entry a label target
123 struct list_head opstack; 118 * @prog: The program array
124 struct list_head postfix; 119 * @N: The index of the current entry in @prog
120 * @when_to_branch: What to assign a program entry for its branch condition
121 *
122 * The program entry at @N has a target that points to the index of a program
123 * entry that can have its target and when_to_branch fields updated.
124 * Update the current program entry denoted by index @N target field to be
125 * that of the updated entry. This will denote the entry to update if
126 * we are processing an "||" after an "&&"
127 */
128static void update_preds(struct prog_entry *prog, int N, int invert)
129{
130 int t, s;
131
132 t = prog[N].target;
133 s = prog[t].target;
134 prog[t].when_to_branch = invert;
135 prog[t].target = N;
136 prog[N].target = s;
137}
138
139struct filter_parse_error {
125 int lasterr; 140 int lasterr;
126 int lasterr_pos; 141 int lasterr_pos;
127
128 struct {
129 char *string;
130 unsigned int cnt;
131 unsigned int tail;
132 } infix;
133
134 struct {
135 char string[MAX_FILTER_STR_VAL];
136 int pos;
137 unsigned int tail;
138 } operand;
139}; 142};
140 143
141struct pred_stack { 144static void parse_error(struct filter_parse_error *pe, int err, int pos)
142 struct filter_pred **preds; 145{
143 int index; 146 pe->lasterr = err;
147 pe->lasterr_pos = pos;
148}
149
150typedef int (*parse_pred_fn)(const char *str, void *data, int pos,
151 struct filter_parse_error *pe,
152 struct filter_pred **pred);
153
154enum {
155 INVERT = 1,
156 PROCESS_AND = 2,
157 PROCESS_OR = 4,
144}; 158};
145 159
146/* If not of not match is equal to not of not, then it is a match */ 160/*
161 * Without going into a formal proof, this explains the method that is used in
162 * parsing the logical expressions.
163 *
164 * For example, if we have: "a && !(!b || (c && g)) || d || e && !f"
165 * The first pass will convert it into the following program:
166 *
167 * n1: r=a; l1: if (!r) goto l4;
168 * n2: r=b; l2: if (!r) goto l4;
169 * n3: r=c; r=!r; l3: if (r) goto l4;
170 * n4: r=g; r=!r; l4: if (r) goto l5;
171 * n5: r=d; l5: if (r) goto T
172 * n6: r=e; l6: if (!r) goto l7;
173 * n7: r=f; r=!r; l7: if (!r) goto F
174 * T: return TRUE
175 * F: return FALSE
176 *
177 * To do this, we use a data structure to represent each of the above
178 * predicate and conditions that has:
179 *
180 * predicate, when_to_branch, invert, target
181 *
182 * The "predicate" will hold the function to determine the result "r".
183 * The "when_to_branch" denotes what "r" should be if a branch is to be taken
184 * "&&" would contain "!r" or (0) and "||" would contain "r" or (1).
185 * The "invert" holds whether the value should be reversed before testing.
186 * The "target" contains the label "l#" to jump to.
187 *
188 * A stack is created to hold values when parentheses are used.
189 *
190 * To simplify the logic, the labels will start at 0 and not 1.
191 *
192 * The possible invert values are 1 and 0. The number of "!"s that are in scope
193 * before the predicate determines the invert value, if the number is odd then
194 * the invert value is 1 and 0 otherwise. This means the invert value only
195 * needs to be toggled when a new "!" is introduced compared to what is stored
196 * on the stack, where parentheses were used.
197 *
198 * The top of the stack and "invert" are initialized to zero.
199 *
200 * ** FIRST PASS **
201 *
202 * #1 A loop through all the tokens is done:
203 *
204 * #2 If the token is an "(", the stack is push, and the current stack value
205 * gets the current invert value, and the loop continues to the next token.
206 * The top of the stack saves the "invert" value to keep track of what
207 * the current inversion is. As "!(a && !b || c)" would require all
208 * predicates being affected separately by the "!" before the parentheses.
209 * And that would end up being equivalent to "(!a || b) && !c"
210 *
211 * #3 If the token is an "!", the current "invert" value gets inverted, and
212 * the loop continues. Note, if the next token is a predicate, then
213 * this "invert" value is only valid for the current program entry,
214 * and does not affect other predicates later on.
215 *
216 * The only other acceptable token is the predicate string.
217 *
218 * #4 A new entry into the program is added saving: the predicate and the
219 * current value of "invert". The target is currently assigned to the
220 * previous program index (this will not be its final value).
221 *
222 * #5 We now enter another loop and look at the next token. The only valid
223 * tokens are ")", "&&", "||" or end of the input string "\0".
224 *
225 * #6 The invert variable is reset to the current value saved on the top of
226 * the stack.
227 *
228 * #7 The top of the stack holds not only the current invert value, but also
229 * if a "&&" or "||" needs to be processed. Note, the "&&" takes higher
230 * precedence than "||". That is "a && b || c && d" is equivalent to
231 * "(a && b) || (c && d)". Thus the first thing to do is to see if "&&" needs
232 * to be processed. This is the case if an "&&" was the last token. If it was
233 * then we call update_preds(). This takes the program, the current index in
234 * the program, and the current value of "invert". More will be described
235 * below about this function.
236 *
237 * #8 If the next token is "&&" then we set a flag in the top of the stack
238 * that denotes that "&&" needs to be processed, break out of this loop
239 * and continue with the outer loop.
240 *
241 * #9 Otherwise, if a "||" needs to be processed then update_preds() is called.
242 * This is called with the program, the current index in the program, but
243 * this time with an inverted value of "invert" (that is !invert). This is
244 * because the value taken will become the "when_to_branch" value of the
245 * program.
246 * Note, this is called when the next token is not an "&&". As stated before,
247 * "&&" takes higher precedence, and "||" should not be processed yet if the
248 * next logical operation is "&&".
249 *
250 * #10 If the next token is "||" then we set a flag in the top of the stack
251 * that denotes that "||" needs to be processed, break out of this loop
252 * and continue with the outer loop.
253 *
254 * #11 If this is the end of the input string "\0" then we break out of both
255 * loops.
256 *
257 * #12 Otherwise, the next token is ")", where we pop the stack and continue
258 * this inner loop.
259 *
260 * Now to discuss the update_pred() function, as that is key to the setting up
261 * of the program. Remember the "target" of the program is initialized to the
262 * previous index and not the "l" label. The target holds the index into the
263 * program that gets affected by the operand. Thus if we have something like
264 * "a || b && c", when we process "a" the target will be "-1" (undefined).
265 * When we process "b", its target is "0", which is the index of "a", as that's
266 * the predicate that is affected by "||". But because the next token after "b"
267 * is "&&" we don't call update_preds(). Instead continue to "c". As the
268 * next token after "c" is not "&&" but the end of input, we first process the
269 * "&&" by calling update_preds() for the "&&" then we process the "||" by
270 * callin updates_preds() with the values for processing "||".
271 *
272 * What does that mean? What update_preds() does is to first save the "target"
273 * of the program entry indexed by the current program entry's "target"
274 * (remember the "target" is initialized to previous program entry), and then
275 * sets that "target" to the current index which represents the label "l#".
276 * That entry's "when_to_branch" is set to the value passed in (the "invert"
277 * or "!invert"). Then it sets the current program entry's target to the saved
278 * "target" value (the old value of the program that had its "target" updated
279 * to the label).
280 *
281 * Looking back at "a || b && c", we have the following steps:
282 * "a" - prog[0] = { "a", X, -1 } // pred, when_to_branch, target
283 * "||" - flag that we need to process "||"; continue outer loop
284 * "b" - prog[1] = { "b", X, 0 }
285 * "&&" - flag that we need to process "&&"; continue outer loop
286 * (Notice we did not process "||")
287 * "c" - prog[2] = { "c", X, 1 }
288 * update_preds(prog, 2, 0); // invert = 0 as we are processing "&&"
289 * t = prog[2].target; // t = 1
290 * s = prog[t].target; // s = 0
291 * prog[t].target = 2; // Set target to "l2"
292 * prog[t].when_to_branch = 0;
293 * prog[2].target = s;
294 * update_preds(prog, 2, 1); // invert = 1 as we are now processing "||"
295 * t = prog[2].target; // t = 0
296 * s = prog[t].target; // s = -1
297 * prog[t].target = 2; // Set target to "l2"
298 * prog[t].when_to_branch = 1;
299 * prog[2].target = s;
300 *
301 * #13 Which brings us to the final step of the first pass, which is to set
302 * the last program entry's when_to_branch and target, which will be
303 * when_to_branch = 0; target = N; ( the label after the program entry after
304 * the last program entry processed above).
305 *
306 * If we denote "TRUE" to be the entry after the last program entry processed,
307 * and "FALSE" the program entry after that, we are now done with the first
308 * pass.
309 *
310 * Making the above "a || b && c" have a progam of:
311 * prog[0] = { "a", 1, 2 }
312 * prog[1] = { "b", 0, 2 }
313 * prog[2] = { "c", 0, 3 }
314 *
315 * Which translates into:
316 * n0: r = a; l0: if (r) goto l2;
317 * n1: r = b; l1: if (!r) goto l2;
318 * n2: r = c; l2: if (!r) goto l3; // Which is the same as "goto F;"
319 * T: return TRUE; l3:
320 * F: return FALSE
321 *
322 * Although, after the first pass, the program is correct, it is
323 * inefficient. The simple sample of "a || b && c" could be easily been
324 * converted into:
325 * n0: r = a; if (r) goto T
326 * n1: r = b; if (!r) goto F
327 * n2: r = c; if (!r) goto F
328 * T: return TRUE;
329 * F: return FALSE;
330 *
331 * The First Pass is over the input string. The next too passes are over
332 * the program itself.
333 *
334 * ** SECOND PASS **
335 *
336 * Which brings us to the second pass. If a jump to a label has the
337 * same condition as that label, it can instead jump to its target.
338 * The original example of "a && !(!b || (c && g)) || d || e && !f"
339 * where the first pass gives us:
340 *
341 * n1: r=a; l1: if (!r) goto l4;
342 * n2: r=b; l2: if (!r) goto l4;
343 * n3: r=c; r=!r; l3: if (r) goto l4;
344 * n4: r=g; r=!r; l4: if (r) goto l5;
345 * n5: r=d; l5: if (r) goto T
346 * n6: r=e; l6: if (!r) goto l7;
347 * n7: r=f; r=!r; l7: if (!r) goto F:
348 * T: return TRUE;
349 * F: return FALSE
350 *
351 * We can see that "l3: if (r) goto l4;" and at l4, we have "if (r) goto l5;".
352 * And "l5: if (r) goto T", we could optimize this by converting l3 and l4
353 * to go directly to T. To accomplish this, we start from the last
354 * entry in the program and work our way back. If the target of the entry
355 * has the same "when_to_branch" then we could use that entry's target.
356 * Doing this, the above would end up as:
357 *
358 * n1: r=a; l1: if (!r) goto l4;
359 * n2: r=b; l2: if (!r) goto l4;
360 * n3: r=c; r=!r; l3: if (r) goto T;
361 * n4: r=g; r=!r; l4: if (r) goto T;
362 * n5: r=d; l5: if (r) goto T;
363 * n6: r=e; l6: if (!r) goto F;
364 * n7: r=f; r=!r; l7: if (!r) goto F;
365 * T: return TRUE
366 * F: return FALSE
367 *
368 * In that same pass, if the "when_to_branch" doesn't match, we can simply
369 * go to the program entry after the label. That is, "l2: if (!r) goto l4;"
370 * where "l4: if (r) goto T;", then we can convert l2 to be:
371 * "l2: if (!r) goto n5;".
372 *
373 * This will have the second pass give us:
374 * n1: r=a; l1: if (!r) goto n5;
375 * n2: r=b; l2: if (!r) goto n5;
376 * n3: r=c; r=!r; l3: if (r) goto T;
377 * n4: r=g; r=!r; l4: if (r) goto T;
378 * n5: r=d; l5: if (r) goto T
379 * n6: r=e; l6: if (!r) goto F;
380 * n7: r=f; r=!r; l7: if (!r) goto F
381 * T: return TRUE
382 * F: return FALSE
383 *
384 * Notice, all the "l#" labels are no longer used, and they can now
385 * be discarded.
386 *
387 * ** THIRD PASS **
388 *
389 * For the third pass we deal with the inverts. As they simply just
390 * make the "when_to_branch" get inverted, a simple loop over the
391 * program to that does: "when_to_branch ^= invert;" will do the
392 * job, leaving us with:
393 * n1: r=a; if (!r) goto n5;
394 * n2: r=b; if (!r) goto n5;
395 * n3: r=c: if (!r) goto T;
396 * n4: r=g; if (!r) goto T;
397 * n5: r=d; if (r) goto T
398 * n6: r=e; if (!r) goto F;
399 * n7: r=f; if (r) goto F
400 * T: return TRUE
401 * F: return FALSE
402 *
403 * As "r = a; if (!r) goto n5;" is obviously the same as
404 * "if (!a) goto n5;" without doing anything we can interperate the
405 * program as:
406 * n1: if (!a) goto n5;
407 * n2: if (!b) goto n5;
408 * n3: if (!c) goto T;
409 * n4: if (!g) goto T;
410 * n5: if (d) goto T
411 * n6: if (!e) goto F;
412 * n7: if (f) goto F
413 * T: return TRUE
414 * F: return FALSE
415 *
416 * Since the inverts are discarded at the end, there's no reason to store
417 * them in the program array (and waste memory). A separate array to hold
418 * the inverts is used and freed at the end.
419 */
420static struct prog_entry *
421predicate_parse(const char *str, int nr_parens, int nr_preds,
422 parse_pred_fn parse_pred, void *data,
423 struct filter_parse_error *pe)
424{
425 struct prog_entry *prog_stack;
426 struct prog_entry *prog;
427 const char *ptr = str;
428 char *inverts = NULL;
429 int *op_stack;
430 int *top;
431 int invert = 0;
432 int ret = -ENOMEM;
433 int len;
434 int N = 0;
435 int i;
436
437 nr_preds += 2; /* For TRUE and FALSE */
438
439 op_stack = kmalloc(sizeof(*op_stack) * nr_parens, GFP_KERNEL);
440 if (!op_stack)
441 return ERR_PTR(-ENOMEM);
442 prog_stack = kmalloc(sizeof(*prog_stack) * nr_preds, GFP_KERNEL);
443 if (!prog_stack) {
444 parse_error(pe, -ENOMEM, 0);
445 goto out_free;
446 }
447 inverts = kmalloc(sizeof(*inverts) * nr_preds, GFP_KERNEL);
448 if (!inverts) {
449 parse_error(pe, -ENOMEM, 0);
450 goto out_free;
451 }
452
453 top = op_stack;
454 prog = prog_stack;
455 *top = 0;
456
457 /* First pass */
458 while (*ptr) { /* #1 */
459 const char *next = ptr++;
460
461 if (isspace(*next))
462 continue;
463
464 switch (*next) {
465 case '(': /* #2 */
466 if (top - op_stack > nr_parens)
467 return ERR_PTR(-EINVAL);
468 *(++top) = invert;
469 continue;
470 case '!': /* #3 */
471 if (!is_not(next))
472 break;
473 invert = !invert;
474 continue;
475 }
476
477 if (N >= nr_preds) {
478 parse_error(pe, FILT_ERR_TOO_MANY_PREDS, next - str);
479 goto out_free;
480 }
481
482 inverts[N] = invert; /* #4 */
483 prog[N].target = N-1;
484
485 len = parse_pred(next, data, ptr - str, pe, &prog[N].pred);
486 if (len < 0) {
487 ret = len;
488 goto out_free;
489 }
490 ptr = next + len;
491
492 N++;
493
494 ret = -1;
495 while (1) { /* #5 */
496 next = ptr++;
497 if (isspace(*next))
498 continue;
499
500 switch (*next) {
501 case ')':
502 case '\0':
503 break;
504 case '&':
505 case '|':
506 if (next[1] == next[0]) {
507 ptr++;
508 break;
509 }
510 default:
511 parse_error(pe, FILT_ERR_TOO_MANY_PREDS,
512 next - str);
513 goto out_free;
514 }
515
516 invert = *top & INVERT;
517
518 if (*top & PROCESS_AND) { /* #7 */
519 update_preds(prog, N - 1, invert);
520 *top &= ~PROCESS_AND;
521 }
522 if (*next == '&') { /* #8 */
523 *top |= PROCESS_AND;
524 break;
525 }
526 if (*top & PROCESS_OR) { /* #9 */
527 update_preds(prog, N - 1, !invert);
528 *top &= ~PROCESS_OR;
529 }
530 if (*next == '|') { /* #10 */
531 *top |= PROCESS_OR;
532 break;
533 }
534 if (!*next) /* #11 */
535 goto out;
536
537 if (top == op_stack) {
538 ret = -1;
539 /* Too few '(' */
540 parse_error(pe, FILT_ERR_TOO_MANY_CLOSE, ptr - str);
541 goto out_free;
542 }
543 top--; /* #12 */
544 }
545 }
546 out:
547 if (top != op_stack) {
548 /* Too many '(' */
549 parse_error(pe, FILT_ERR_TOO_MANY_OPEN, ptr - str);
550 goto out_free;
551 }
552
553 prog[N].pred = NULL; /* #13 */
554 prog[N].target = 1; /* TRUE */
555 prog[N+1].pred = NULL;
556 prog[N+1].target = 0; /* FALSE */
557 prog[N-1].target = N;
558 prog[N-1].when_to_branch = false;
559
560 /* Second Pass */
561 for (i = N-1 ; i--; ) {
562 int target = prog[i].target;
563 if (prog[i].when_to_branch == prog[target].when_to_branch)
564 prog[i].target = prog[target].target;
565 }
566
567 /* Third Pass */
568 for (i = 0; i < N; i++) {
569 invert = inverts[i] ^ prog[i].when_to_branch;
570 prog[i].when_to_branch = invert;
571 /* Make sure the program always moves forward */
572 if (WARN_ON(prog[i].target <= i)) {
573 ret = -EINVAL;
574 goto out_free;
575 }
576 }
577
578 return prog;
579out_free:
580 kfree(op_stack);
581 kfree(prog_stack);
582 kfree(inverts);
583 return ERR_PTR(ret);
584}
585
147#define DEFINE_COMPARISON_PRED(type) \ 586#define DEFINE_COMPARISON_PRED(type) \
148static int filter_pred_LT_##type(struct filter_pred *pred, void *event) \ 587static int filter_pred_LT_##type(struct filter_pred *pred, void *event) \
149{ \ 588{ \
150 type *addr = (type *)(event + pred->offset); \ 589 type *addr = (type *)(event + pred->offset); \
151 type val = (type)pred->val; \ 590 type val = (type)pred->val; \
152 int match = (*addr < val); \ 591 return *addr < val; \
153 return !!match == !pred->not; \
154} \ 592} \
155static int filter_pred_LE_##type(struct filter_pred *pred, void *event) \ 593static int filter_pred_LE_##type(struct filter_pred *pred, void *event) \
156{ \ 594{ \
157 type *addr = (type *)(event + pred->offset); \ 595 type *addr = (type *)(event + pred->offset); \
158 type val = (type)pred->val; \ 596 type val = (type)pred->val; \
159 int match = (*addr <= val); \ 597 return *addr <= val; \
160 return !!match == !pred->not; \
161} \ 598} \
162static int filter_pred_GT_##type(struct filter_pred *pred, void *event) \ 599static int filter_pred_GT_##type(struct filter_pred *pred, void *event) \
163{ \ 600{ \
164 type *addr = (type *)(event + pred->offset); \ 601 type *addr = (type *)(event + pred->offset); \
165 type val = (type)pred->val; \ 602 type val = (type)pred->val; \
166 int match = (*addr > val); \ 603 return *addr > val; \
167 return !!match == !pred->not; \
168} \ 604} \
169static int filter_pred_GE_##type(struct filter_pred *pred, void *event) \ 605static int filter_pred_GE_##type(struct filter_pred *pred, void *event) \
170{ \ 606{ \
171 type *addr = (type *)(event + pred->offset); \ 607 type *addr = (type *)(event + pred->offset); \
172 type val = (type)pred->val; \ 608 type val = (type)pred->val; \
173 int match = (*addr >= val); \ 609 return *addr >= val; \
174 return !!match == !pred->not; \
175} \ 610} \
176static int filter_pred_BAND_##type(struct filter_pred *pred, void *event) \ 611static int filter_pred_BAND_##type(struct filter_pred *pred, void *event) \
177{ \ 612{ \
178 type *addr = (type *)(event + pred->offset); \ 613 type *addr = (type *)(event + pred->offset); \
179 type val = (type)pred->val; \ 614 type val = (type)pred->val; \
180 int match = !!(*addr & val); \ 615 return !!(*addr & val); \
181 return match == !pred->not; \
182} \ 616} \
183static const filter_pred_fn_t pred_funcs_##type[] = { \ 617static const filter_pred_fn_t pred_funcs_##type[] = { \
184 filter_pred_LT_##type, \
185 filter_pred_LE_##type, \ 618 filter_pred_LE_##type, \
186 filter_pred_GT_##type, \ 619 filter_pred_LT_##type, \
187 filter_pred_GE_##type, \ 620 filter_pred_GE_##type, \
621 filter_pred_GT_##type, \
188 filter_pred_BAND_##type, \ 622 filter_pred_BAND_##type, \
189}; 623};
190 624
191#define PRED_FUNC_START OP_LT
192
193#define DEFINE_EQUALITY_PRED(size) \ 625#define DEFINE_EQUALITY_PRED(size) \
194static int filter_pred_##size(struct filter_pred *pred, void *event) \ 626static int filter_pred_##size(struct filter_pred *pred, void *event) \
195{ \ 627{ \
@@ -272,44 +704,36 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event)
272static int filter_pred_cpu(struct filter_pred *pred, void *event) 704static int filter_pred_cpu(struct filter_pred *pred, void *event)
273{ 705{
274 int cpu, cmp; 706 int cpu, cmp;
275 int match = 0;
276 707
277 cpu = raw_smp_processor_id(); 708 cpu = raw_smp_processor_id();
278 cmp = pred->val; 709 cmp = pred->val;
279 710
280 switch (pred->op) { 711 switch (pred->op) {
281 case OP_EQ: 712 case OP_EQ:
282 match = cpu == cmp; 713 return cpu == cmp;
283 break; 714 case OP_NE:
715 return cpu != cmp;
284 case OP_LT: 716 case OP_LT:
285 match = cpu < cmp; 717 return cpu < cmp;
286 break;
287 case OP_LE: 718 case OP_LE:
288 match = cpu <= cmp; 719 return cpu <= cmp;
289 break;
290 case OP_GT: 720 case OP_GT:
291 match = cpu > cmp; 721 return cpu > cmp;
292 break;
293 case OP_GE: 722 case OP_GE:
294 match = cpu >= cmp; 723 return cpu >= cmp;
295 break;
296 default: 724 default:
297 break; 725 return 0;
298 } 726 }
299
300 return !!match == !pred->not;
301} 727}
302 728
303/* Filter predicate for COMM. */ 729/* Filter predicate for COMM. */
304static int filter_pred_comm(struct filter_pred *pred, void *event) 730static int filter_pred_comm(struct filter_pred *pred, void *event)
305{ 731{
306 int cmp, match; 732 int cmp;
307 733
308 cmp = pred->regex.match(current->comm, &pred->regex, 734 cmp = pred->regex.match(current->comm, &pred->regex,
309 pred->regex.field_len); 735 TASK_COMM_LEN);
310 match = cmp ^ pred->not; 736 return cmp ^ pred->not;
311
312 return match;
313} 737}
314 738
315static int filter_pred_none(struct filter_pred *pred, void *event) 739static int filter_pred_none(struct filter_pred *pred, void *event)
@@ -366,6 +790,7 @@ static int regex_match_glob(char *str, struct regex *r, int len __maybe_unused)
366 return 1; 790 return 1;
367 return 0; 791 return 0;
368} 792}
793
369/** 794/**
370 * filter_parse_regex - parse a basic regex 795 * filter_parse_regex - parse a basic regex
371 * @buff: the raw regex 796 * @buff: the raw regex
@@ -426,10 +851,9 @@ static void filter_build_regex(struct filter_pred *pred)
426 struct regex *r = &pred->regex; 851 struct regex *r = &pred->regex;
427 char *search; 852 char *search;
428 enum regex_type type = MATCH_FULL; 853 enum regex_type type = MATCH_FULL;
429 int not = 0;
430 854
431 if (pred->op == OP_GLOB) { 855 if (pred->op == OP_GLOB) {
432 type = filter_parse_regex(r->pattern, r->len, &search, &not); 856 type = filter_parse_regex(r->pattern, r->len, &search, &pred->not);
433 r->len = strlen(search); 857 r->len = strlen(search);
434 memmove(r->pattern, search, r->len+1); 858 memmove(r->pattern, search, r->len+1);
435 } 859 }
@@ -451,210 +875,32 @@ static void filter_build_regex(struct filter_pred *pred)
451 r->match = regex_match_glob; 875 r->match = regex_match_glob;
452 break; 876 break;
453 } 877 }
454
455 pred->not ^= not;
456}
457
458enum move_type {
459 MOVE_DOWN,
460 MOVE_UP_FROM_LEFT,
461 MOVE_UP_FROM_RIGHT
462};
463
464static struct filter_pred *
465get_pred_parent(struct filter_pred *pred, struct filter_pred *preds,
466 int index, enum move_type *move)
467{
468 if (pred->parent & FILTER_PRED_IS_RIGHT)
469 *move = MOVE_UP_FROM_RIGHT;
470 else
471 *move = MOVE_UP_FROM_LEFT;
472 pred = &preds[pred->parent & ~FILTER_PRED_IS_RIGHT];
473
474 return pred;
475}
476
477enum walk_return {
478 WALK_PRED_ABORT,
479 WALK_PRED_PARENT,
480 WALK_PRED_DEFAULT,
481};
482
483typedef int (*filter_pred_walkcb_t) (enum move_type move,
484 struct filter_pred *pred,
485 int *err, void *data);
486
487static int walk_pred_tree(struct filter_pred *preds,
488 struct filter_pred *root,
489 filter_pred_walkcb_t cb, void *data)
490{
491 struct filter_pred *pred = root;
492 enum move_type move = MOVE_DOWN;
493 int done = 0;
494
495 if (!preds)
496 return -EINVAL;
497
498 do {
499 int err = 0, ret;
500
501 ret = cb(move, pred, &err, data);
502 if (ret == WALK_PRED_ABORT)
503 return err;
504 if (ret == WALK_PRED_PARENT)
505 goto get_parent;
506
507 switch (move) {
508 case MOVE_DOWN:
509 if (pred->left != FILTER_PRED_INVALID) {
510 pred = &preds[pred->left];
511 continue;
512 }
513 goto get_parent;
514 case MOVE_UP_FROM_LEFT:
515 pred = &preds[pred->right];
516 move = MOVE_DOWN;
517 continue;
518 case MOVE_UP_FROM_RIGHT:
519 get_parent:
520 if (pred == root)
521 break;
522 pred = get_pred_parent(pred, preds,
523 pred->parent,
524 &move);
525 continue;
526 }
527 done = 1;
528 } while (!done);
529
530 /* We are fine. */
531 return 0;
532}
533
534/*
535 * A series of AND or ORs where found together. Instead of
536 * climbing up and down the tree branches, an array of the
537 * ops were made in order of checks. We can just move across
538 * the array and short circuit if needed.
539 */
540static int process_ops(struct filter_pred *preds,
541 struct filter_pred *op, void *rec)
542{
543 struct filter_pred *pred;
544 int match = 0;
545 int type;
546 int i;
547
548 /*
549 * Micro-optimization: We set type to true if op
550 * is an OR and false otherwise (AND). Then we
551 * just need to test if the match is equal to
552 * the type, and if it is, we can short circuit the
553 * rest of the checks:
554 *
555 * if ((match && op->op == OP_OR) ||
556 * (!match && op->op == OP_AND))
557 * return match;
558 */
559 type = op->op == OP_OR;
560
561 for (i = 0; i < op->val; i++) {
562 pred = &preds[op->ops[i]];
563 if (!WARN_ON_ONCE(!pred->fn))
564 match = pred->fn(pred, rec);
565 if (!!match == type)
566 break;
567 }
568 /* If not of not match is equal to not of not, then it is a match */
569 return !!match == !op->not;
570}
571
572struct filter_match_preds_data {
573 struct filter_pred *preds;
574 int match;
575 void *rec;
576};
577
578static int filter_match_preds_cb(enum move_type move, struct filter_pred *pred,
579 int *err, void *data)
580{
581 struct filter_match_preds_data *d = data;
582
583 *err = 0;
584 switch (move) {
585 case MOVE_DOWN:
586 /* only AND and OR have children */
587 if (pred->left != FILTER_PRED_INVALID) {
588 /* If ops is set, then it was folded. */
589 if (!pred->ops)
590 return WALK_PRED_DEFAULT;
591 /* We can treat folded ops as a leaf node */
592 d->match = process_ops(d->preds, pred, d->rec);
593 } else {
594 if (!WARN_ON_ONCE(!pred->fn))
595 d->match = pred->fn(pred, d->rec);
596 }
597
598 return WALK_PRED_PARENT;
599 case MOVE_UP_FROM_LEFT:
600 /*
601 * Check for short circuits.
602 *
603 * Optimization: !!match == (pred->op == OP_OR)
604 * is the same as:
605 * if ((match && pred->op == OP_OR) ||
606 * (!match && pred->op == OP_AND))
607 */
608 if (!!d->match == (pred->op == OP_OR))
609 return WALK_PRED_PARENT;
610 break;
611 case MOVE_UP_FROM_RIGHT:
612 break;
613 }
614
615 return WALK_PRED_DEFAULT;
616} 878}
617 879
618/* return 1 if event matches, 0 otherwise (discard) */ 880/* return 1 if event matches, 0 otherwise (discard) */
619int filter_match_preds(struct event_filter *filter, void *rec) 881int filter_match_preds(struct event_filter *filter, void *rec)
620{ 882{
621 struct filter_pred *preds; 883 struct prog_entry *prog;
622 struct filter_pred *root; 884 int i;
623 struct filter_match_preds_data data = {
624 /* match is currently meaningless */
625 .match = -1,
626 .rec = rec,
627 };
628 int n_preds, ret;
629 885
630 /* no filter is considered a match */ 886 /* no filter is considered a match */
631 if (!filter) 887 if (!filter)
632 return 1; 888 return 1;
633 889
634 n_preds = filter->n_preds; 890 prog = rcu_dereference_sched(filter->prog);
635 if (!n_preds) 891 if (!prog)
636 return 1; 892 return 1;
637 893
638 /* 894 for (i = 0; prog[i].pred; i++) {
639 * n_preds, root and filter->preds are protect with preemption disabled. 895 struct filter_pred *pred = prog[i].pred;
640 */ 896 int match = pred->fn(pred, rec);
641 root = rcu_dereference_sched(filter->root); 897 if (match == prog[i].when_to_branch)
642 if (!root) 898 i = prog[i].target;
643 return 1; 899 }
644 900 return prog[i].target;
645 data.preds = preds = rcu_dereference_sched(filter->preds);
646 ret = walk_pred_tree(preds, root, filter_match_preds_cb, &data);
647 WARN_ON(ret);
648 return data.match;
649} 901}
650EXPORT_SYMBOL_GPL(filter_match_preds); 902EXPORT_SYMBOL_GPL(filter_match_preds);
651 903
652static void parse_error(struct filter_parse_state *ps, int err, int pos)
653{
654 ps->lasterr = err;
655 ps->lasterr_pos = pos;
656}
657
658static void remove_filter_string(struct event_filter *filter) 904static void remove_filter_string(struct event_filter *filter)
659{ 905{
660 if (!filter) 906 if (!filter)
@@ -664,57 +910,44 @@ static void remove_filter_string(struct event_filter *filter)
664 filter->filter_string = NULL; 910 filter->filter_string = NULL;
665} 911}
666 912
667static int replace_filter_string(struct event_filter *filter, 913static void append_filter_err(struct filter_parse_error *pe,
668 char *filter_string)
669{
670 kfree(filter->filter_string);
671 filter->filter_string = kstrdup(filter_string, GFP_KERNEL);
672 if (!filter->filter_string)
673 return -ENOMEM;
674
675 return 0;
676}
677
678static int append_filter_string(struct event_filter *filter,
679 char *string)
680{
681 int newlen;
682 char *new_filter_string;
683
684 BUG_ON(!filter->filter_string);
685 newlen = strlen(filter->filter_string) + strlen(string) + 1;
686 new_filter_string = kmalloc(newlen, GFP_KERNEL);
687 if (!new_filter_string)
688 return -ENOMEM;
689
690 strcpy(new_filter_string, filter->filter_string);
691 strcat(new_filter_string, string);
692 kfree(filter->filter_string);
693 filter->filter_string = new_filter_string;
694
695 return 0;
696}
697
698static void append_filter_err(struct filter_parse_state *ps,
699 struct event_filter *filter) 914 struct event_filter *filter)
700{ 915{
701 int pos = ps->lasterr_pos; 916 struct trace_seq *s;
702 char *buf, *pbuf; 917 int pos = pe->lasterr_pos;
918 char *buf;
919 int len;
920
921 if (WARN_ON(!filter->filter_string))
922 return;
703 923
704 buf = (char *)__get_free_page(GFP_KERNEL); 924 s = kmalloc(sizeof(*s), GFP_KERNEL);
705 if (!buf) 925 if (!s)
706 return; 926 return;
927 trace_seq_init(s);
707 928
708 append_filter_string(filter, "\n"); 929 len = strlen(filter->filter_string);
709 memset(buf, ' ', PAGE_SIZE); 930 if (pos > len)
710 if (pos > PAGE_SIZE - 128) 931 pos = len;
711 pos = 0;
712 buf[pos] = '^';
713 pbuf = &buf[pos] + 1;
714 932
715 sprintf(pbuf, "\nparse_error: %s\n", err_text[ps->lasterr]); 933 /* indexing is off by one */
716 append_filter_string(filter, buf); 934 if (pos)
717 free_page((unsigned long) buf); 935 pos++;
936
937 trace_seq_puts(s, filter->filter_string);
938 if (pe->lasterr > 0) {
939 trace_seq_printf(s, "\n%*s", pos, "^");
940 trace_seq_printf(s, "\nparse_error: %s\n", err_text[pe->lasterr]);
941 } else {
942 trace_seq_printf(s, "\nError: (%d)\n", pe->lasterr);
943 }
944 trace_seq_putc(s, 0);
945 buf = kmemdup_nul(s->buffer, s->seq.len, GFP_KERNEL);
946 if (buf) {
947 kfree(filter->filter_string);
948 filter->filter_string = buf;
949 }
950 kfree(s);
718} 951}
719 952
720static inline struct event_filter *event_filter(struct trace_event_file *file) 953static inline struct event_filter *event_filter(struct trace_event_file *file)
@@ -747,108 +980,18 @@ void print_subsystem_event_filter(struct event_subsystem *system,
747 mutex_unlock(&event_mutex); 980 mutex_unlock(&event_mutex);
748} 981}
749 982
750static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) 983static void free_prog(struct event_filter *filter)
751{
752 stack->preds = kcalloc(n_preds + 1, sizeof(*stack->preds), GFP_KERNEL);
753 if (!stack->preds)
754 return -ENOMEM;
755 stack->index = n_preds;
756 return 0;
757}
758
759static void __free_pred_stack(struct pred_stack *stack)
760{
761 kfree(stack->preds);
762 stack->index = 0;
763}
764
765static int __push_pred_stack(struct pred_stack *stack,
766 struct filter_pred *pred)
767{
768 int index = stack->index;
769
770 if (WARN_ON(index == 0))
771 return -ENOSPC;
772
773 stack->preds[--index] = pred;
774 stack->index = index;
775 return 0;
776}
777
778static struct filter_pred *
779__pop_pred_stack(struct pred_stack *stack)
780{
781 struct filter_pred *pred;
782 int index = stack->index;
783
784 pred = stack->preds[index++];
785 if (!pred)
786 return NULL;
787
788 stack->index = index;
789 return pred;
790}
791
792static int filter_set_pred(struct event_filter *filter,
793 int idx,
794 struct pred_stack *stack,
795 struct filter_pred *src)
796{
797 struct filter_pred *dest = &filter->preds[idx];
798 struct filter_pred *left;
799 struct filter_pred *right;
800
801 *dest = *src;
802 dest->index = idx;
803
804 if (dest->op == OP_OR || dest->op == OP_AND) {
805 right = __pop_pred_stack(stack);
806 left = __pop_pred_stack(stack);
807 if (!left || !right)
808 return -EINVAL;
809 /*
810 * If both children can be folded
811 * and they are the same op as this op or a leaf,
812 * then this op can be folded.
813 */
814 if (left->index & FILTER_PRED_FOLD &&
815 ((left->op == dest->op && !left->not) ||
816 left->left == FILTER_PRED_INVALID) &&
817 right->index & FILTER_PRED_FOLD &&
818 ((right->op == dest->op && !right->not) ||
819 right->left == FILTER_PRED_INVALID))
820 dest->index |= FILTER_PRED_FOLD;
821
822 dest->left = left->index & ~FILTER_PRED_FOLD;
823 dest->right = right->index & ~FILTER_PRED_FOLD;
824 left->parent = dest->index & ~FILTER_PRED_FOLD;
825 right->parent = dest->index | FILTER_PRED_IS_RIGHT;
826 } else {
827 /*
828 * Make dest->left invalid to be used as a quick
829 * way to know this is a leaf node.
830 */
831 dest->left = FILTER_PRED_INVALID;
832
833 /* All leafs allow folding the parent ops. */
834 dest->index |= FILTER_PRED_FOLD;
835 }
836
837 return __push_pred_stack(stack, dest);
838}
839
840static void __free_preds(struct event_filter *filter)
841{ 984{
985 struct prog_entry *prog;
842 int i; 986 int i;
843 987
844 if (filter->preds) { 988 prog = rcu_access_pointer(filter->prog);
845 for (i = 0; i < filter->n_preds; i++) 989 if (!prog)
846 kfree(filter->preds[i].ops); 990 return;
847 kfree(filter->preds); 991
848 filter->preds = NULL; 992 for (i = 0; prog[i].pred; i++)
849 } 993 kfree(prog[i].pred);
850 filter->a_preds = 0; 994 kfree(prog);
851 filter->n_preds = 0;
852} 995}
853 996
854static void filter_disable(struct trace_event_file *file) 997static void filter_disable(struct trace_event_file *file)
@@ -866,7 +1009,7 @@ static void __free_filter(struct event_filter *filter)
866 if (!filter) 1009 if (!filter)
867 return; 1010 return;
868 1011
869 __free_preds(filter); 1012 free_prog(filter);
870 kfree(filter->filter_string); 1013 kfree(filter->filter_string);
871 kfree(filter); 1014 kfree(filter);
872} 1015}
@@ -876,38 +1019,6 @@ void free_event_filter(struct event_filter *filter)
876 __free_filter(filter); 1019 __free_filter(filter);
877} 1020}
878 1021
879static struct event_filter *__alloc_filter(void)
880{
881 struct event_filter *filter;
882
883 filter = kzalloc(sizeof(*filter), GFP_KERNEL);
884 return filter;
885}
886
887static int __alloc_preds(struct event_filter *filter, int n_preds)
888{
889 struct filter_pred *pred;
890 int i;
891
892 if (filter->preds)
893 __free_preds(filter);
894
895 filter->preds = kcalloc(n_preds, sizeof(*filter->preds), GFP_KERNEL);
896
897 if (!filter->preds)
898 return -ENOMEM;
899
900 filter->a_preds = n_preds;
901 filter->n_preds = 0;
902
903 for (i = 0; i < n_preds; i++) {
904 pred = &filter->preds[i];
905 pred->fn = filter_pred_none;
906 }
907
908 return 0;
909}
910
911static inline void __remove_filter(struct trace_event_file *file) 1022static inline void __remove_filter(struct trace_event_file *file)
912{ 1023{
913 filter_disable(file); 1024 filter_disable(file);
@@ -944,27 +1055,6 @@ static void filter_free_subsystem_filters(struct trace_subsystem_dir *dir,
944 } 1055 }
945} 1056}
946 1057
947static int filter_add_pred(struct filter_parse_state *ps,
948 struct event_filter *filter,
949 struct filter_pred *pred,
950 struct pred_stack *stack)
951{
952 int err;
953
954 if (WARN_ON(filter->n_preds == filter->a_preds)) {
955 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
956 return -ENOSPC;
957 }
958
959 err = filter_set_pred(filter, filter->n_preds, stack, pred);
960 if (err)
961 return err;
962
963 filter->n_preds++;
964
965 return 0;
966}
967
968int filter_assign_type(const char *type) 1058int filter_assign_type(const char *type)
969{ 1059{
970 if (strstr(type, "__data_loc") && strstr(type, "char")) 1060 if (strstr(type, "__data_loc") && strstr(type, "char"))
@@ -976,761 +1066,449 @@ int filter_assign_type(const char *type)
976 return FILTER_OTHER; 1066 return FILTER_OTHER;
977} 1067}
978 1068
979static bool is_legal_op(struct ftrace_event_field *field, enum filter_op_ids op)
980{
981 if (is_string_field(field) &&
982 (op != OP_EQ && op != OP_NE && op != OP_GLOB))
983 return false;
984 if (!is_string_field(field) && op == OP_GLOB)
985 return false;
986
987 return true;
988}
989
990static filter_pred_fn_t select_comparison_fn(enum filter_op_ids op, 1069static filter_pred_fn_t select_comparison_fn(enum filter_op_ids op,
991 int field_size, int field_is_signed) 1070 int field_size, int field_is_signed)
992{ 1071{
993 filter_pred_fn_t fn = NULL; 1072 filter_pred_fn_t fn = NULL;
1073 int pred_func_index = -1;
1074
1075 switch (op) {
1076 case OP_EQ:
1077 case OP_NE:
1078 break;
1079 default:
1080 if (WARN_ON_ONCE(op < PRED_FUNC_START))
1081 return NULL;
1082 pred_func_index = op - PRED_FUNC_START;
1083 if (WARN_ON_ONCE(pred_func_index > PRED_FUNC_MAX))
1084 return NULL;
1085 }
994 1086
995 switch (field_size) { 1087 switch (field_size) {
996 case 8: 1088 case 8:
997 if (op == OP_EQ || op == OP_NE) 1089 if (pred_func_index < 0)
998 fn = filter_pred_64; 1090 fn = filter_pred_64;
999 else if (field_is_signed) 1091 else if (field_is_signed)
1000 fn = pred_funcs_s64[op - PRED_FUNC_START]; 1092 fn = pred_funcs_s64[pred_func_index];
1001 else 1093 else
1002 fn = pred_funcs_u64[op - PRED_FUNC_START]; 1094 fn = pred_funcs_u64[pred_func_index];
1003 break; 1095 break;
1004 case 4: 1096 case 4:
1005 if (op == OP_EQ || op == OP_NE) 1097 if (pred_func_index < 0)
1006 fn = filter_pred_32; 1098 fn = filter_pred_32;
1007 else if (field_is_signed) 1099 else if (field_is_signed)
1008 fn = pred_funcs_s32[op - PRED_FUNC_START]; 1100 fn = pred_funcs_s32[pred_func_index];
1009 else 1101 else
1010 fn = pred_funcs_u32[op - PRED_FUNC_START]; 1102 fn = pred_funcs_u32[pred_func_index];
1011 break; 1103 break;
1012 case 2: 1104 case 2:
1013 if (op == OP_EQ || op == OP_NE) 1105 if (pred_func_index < 0)
1014 fn = filter_pred_16; 1106 fn = filter_pred_16;
1015 else if (field_is_signed) 1107 else if (field_is_signed)
1016 fn = pred_funcs_s16[op - PRED_FUNC_START]; 1108 fn = pred_funcs_s16[pred_func_index];
1017 else 1109 else
1018 fn = pred_funcs_u16[op - PRED_FUNC_START]; 1110 fn = pred_funcs_u16[pred_func_index];
1019 break; 1111 break;
1020 case 1: 1112 case 1:
1021 if (op == OP_EQ || op == OP_NE) 1113 if (pred_func_index < 0)
1022 fn = filter_pred_8; 1114 fn = filter_pred_8;
1023 else if (field_is_signed) 1115 else if (field_is_signed)
1024 fn = pred_funcs_s8[op - PRED_FUNC_START]; 1116 fn = pred_funcs_s8[pred_func_index];
1025 else 1117 else
1026 fn = pred_funcs_u8[op - PRED_FUNC_START]; 1118 fn = pred_funcs_u8[pred_func_index];
1027 break; 1119 break;
1028 } 1120 }
1029 1121
1030 return fn; 1122 return fn;
1031} 1123}
1032 1124
1033static int init_pred(struct filter_parse_state *ps, 1125/* Called when a predicate is encountered by predicate_parse() */
1034 struct ftrace_event_field *field, 1126static int parse_pred(const char *str, void *data,
1035 struct filter_pred *pred) 1127 int pos, struct filter_parse_error *pe,
1036 1128 struct filter_pred **pred_ptr)
1037{ 1129{
1038 filter_pred_fn_t fn = filter_pred_none; 1130 struct trace_event_call *call = data;
1039 unsigned long long val; 1131 struct ftrace_event_field *field;
1132 struct filter_pred *pred = NULL;
1133 char num_buf[24]; /* Big enough to hold an address */
1134 char *field_name;
1135 char q;
1136 u64 val;
1137 int len;
1040 int ret; 1138 int ret;
1139 int op;
1140 int s;
1141 int i = 0;
1041 1142
1042 pred->offset = field->offset; 1143 /* First find the field to associate to */
1043 1144 while (isspace(str[i]))
1044 if (!is_legal_op(field, pred->op)) { 1145 i++;
1045 parse_error(ps, FILT_ERR_ILLEGAL_FIELD_OP, 0); 1146 s = i;
1046 return -EINVAL;
1047 }
1048
1049 if (field->filter_type == FILTER_COMM) {
1050 filter_build_regex(pred);
1051 fn = filter_pred_comm;
1052 pred->regex.field_len = TASK_COMM_LEN;
1053 } else if (is_string_field(field)) {
1054 filter_build_regex(pred);
1055
1056 if (field->filter_type == FILTER_STATIC_STRING) {
1057 fn = filter_pred_string;
1058 pred->regex.field_len = field->size;
1059 } else if (field->filter_type == FILTER_DYN_STRING)
1060 fn = filter_pred_strloc;
1061 else
1062 fn = filter_pred_pchar;
1063 } else if (is_function_field(field)) {
1064 if (strcmp(field->name, "ip")) {
1065 parse_error(ps, FILT_ERR_IP_FIELD_ONLY, 0);
1066 return -EINVAL;
1067 }
1068 } else {
1069 if (field->is_signed)
1070 ret = kstrtoll(pred->regex.pattern, 0, &val);
1071 else
1072 ret = kstrtoull(pred->regex.pattern, 0, &val);
1073 if (ret) {
1074 parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0);
1075 return -EINVAL;
1076 }
1077 pred->val = val;
1078
1079 if (field->filter_type == FILTER_CPU)
1080 fn = filter_pred_cpu;
1081 else
1082 fn = select_comparison_fn(pred->op, field->size,
1083 field->is_signed);
1084 if (!fn) {
1085 parse_error(ps, FILT_ERR_INVALID_OP, 0);
1086 return -EINVAL;
1087 }
1088 }
1089
1090 if (pred->op == OP_NE)
1091 pred->not ^= 1;
1092
1093 pred->fn = fn;
1094 return 0;
1095}
1096
1097static void parse_init(struct filter_parse_state *ps,
1098 struct filter_op *ops,
1099 char *infix_string)
1100{
1101 memset(ps, '\0', sizeof(*ps));
1102
1103 ps->infix.string = infix_string;
1104 ps->infix.cnt = strlen(infix_string);
1105 ps->ops = ops;
1106
1107 INIT_LIST_HEAD(&ps->opstack);
1108 INIT_LIST_HEAD(&ps->postfix);
1109}
1110
1111static char infix_next(struct filter_parse_state *ps)
1112{
1113 if (!ps->infix.cnt)
1114 return 0;
1115
1116 ps->infix.cnt--;
1117
1118 return ps->infix.string[ps->infix.tail++];
1119}
1120 1147
1121static char infix_peek(struct filter_parse_state *ps) 1148 while (isalnum(str[i]) || str[i] == '_')
1122{ 1149 i++;
1123 if (ps->infix.tail == strlen(ps->infix.string))
1124 return 0;
1125 1150
1126 return ps->infix.string[ps->infix.tail]; 1151 len = i - s;
1127}
1128 1152
1129static void infix_advance(struct filter_parse_state *ps) 1153 if (!len)
1130{ 1154 return -1;
1131 if (!ps->infix.cnt)
1132 return;
1133 1155
1134 ps->infix.cnt--; 1156 field_name = kmemdup_nul(str + s, len, GFP_KERNEL);
1135 ps->infix.tail++; 1157 if (!field_name)
1136} 1158 return -ENOMEM;
1137 1159
1138static inline int is_precedence_lower(struct filter_parse_state *ps, 1160 /* Make sure that the field exists */
1139 int a, int b)
1140{
1141 return ps->ops[a].precedence < ps->ops[b].precedence;
1142}
1143 1161
1144static inline int is_op_char(struct filter_parse_state *ps, char c) 1162 field = trace_find_event_field(call, field_name);
1145{ 1163 kfree(field_name);
1146 int i; 1164 if (!field) {
1147 1165 parse_error(pe, FILT_ERR_FIELD_NOT_FOUND, pos + i);
1148 for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) { 1166 return -EINVAL;
1149 if (ps->ops[i].string[0] == c)
1150 return 1;
1151 } 1167 }
1152 1168
1153 return 0; 1169 while (isspace(str[i]))
1154} 1170 i++;
1155 1171
1156static int infix_get_op(struct filter_parse_state *ps, char firstc) 1172 /* Make sure this op is supported */
1157{ 1173 for (op = 0; ops[op]; op++) {
1158 char nextc = infix_peek(ps); 1174 /* This is why '<=' must come before '<' in ops[] */
1159 char opstr[3]; 1175 if (strncmp(str + i, ops[op], strlen(ops[op])) == 0)
1160 int i; 1176 break;
1161
1162 opstr[0] = firstc;
1163 opstr[1] = nextc;
1164 opstr[2] = '\0';
1165
1166 for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
1167 if (!strcmp(opstr, ps->ops[i].string)) {
1168 infix_advance(ps);
1169 return ps->ops[i].id;
1170 }
1171 } 1177 }
1172 1178
1173 opstr[1] = '\0'; 1179 if (!ops[op]) {
1174 1180 parse_error(pe, FILT_ERR_INVALID_OP, pos + i);
1175 for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) { 1181 goto err_free;
1176 if (!strcmp(opstr, ps->ops[i].string))
1177 return ps->ops[i].id;
1178 } 1182 }
1179 1183
1180 return OP_NONE; 1184 i += strlen(ops[op]);
1181}
1182
1183static inline void clear_operand_string(struct filter_parse_state *ps)
1184{
1185 memset(ps->operand.string, '\0', MAX_FILTER_STR_VAL);
1186 ps->operand.tail = 0;
1187}
1188
1189static inline int append_operand_char(struct filter_parse_state *ps, char c)
1190{
1191 if (ps->operand.tail == MAX_FILTER_STR_VAL - 1)
1192 return -EINVAL;
1193
1194 ps->operand.string[ps->operand.tail++] = c;
1195 1185
1196 return 0; 1186 while (isspace(str[i]))
1197} 1187 i++;
1198 1188
1199static int filter_opstack_push(struct filter_parse_state *ps, 1189 s = i;
1200 enum filter_op_ids op)
1201{
1202 struct opstack_op *opstack_op;
1203 1190
1204 opstack_op = kmalloc(sizeof(*opstack_op), GFP_KERNEL); 1191 pred = kzalloc(sizeof(*pred), GFP_KERNEL);
1205 if (!opstack_op) 1192 if (!pred)
1206 return -ENOMEM; 1193 return -ENOMEM;
1207 1194
1208 opstack_op->op = op; 1195 pred->field = field;
1209 list_add(&opstack_op->list, &ps->opstack); 1196 pred->offset = field->offset;
1197 pred->op = op;
1210 1198
1211 return 0; 1199 if (ftrace_event_is_function(call)) {
1212} 1200 /*
1201 * Perf does things different with function events.
1202 * It only allows an "ip" field, and expects a string.
1203 * But the string does not need to be surrounded by quotes.
1204 * If it is a string, the assigned function as a nop,
1205 * (perf doesn't use it) and grab everything.
1206 */
1207 if (strcmp(field->name, "ip") != 0) {
1208 parse_error(pe, FILT_ERR_IP_FIELD_ONLY, pos + i);
1209 goto err_free;
1210 }
1211 pred->fn = filter_pred_none;
1212
1213 /*
1214 * Quotes are not required, but if they exist then we need
1215 * to read them till we hit a matching one.
1216 */
1217 if (str[i] == '\'' || str[i] == '"')
1218 q = str[i];
1219 else
1220 q = 0;
1221
1222 for (i++; str[i]; i++) {
1223 if (q && str[i] == q)
1224 break;
1225 if (!q && (str[i] == ')' || str[i] == '&' ||
1226 str[i] == '|'))
1227 break;
1228 }
1229 /* Skip quotes */
1230 if (q)
1231 s++;
1232 len = i - s;
1233 if (len >= MAX_FILTER_STR_VAL) {
1234 parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i);
1235 goto err_free;
1236 }
1213 1237
1214static int filter_opstack_empty(struct filter_parse_state *ps) 1238 pred->regex.len = len;
1215{ 1239 strncpy(pred->regex.pattern, str + s, len);
1216 return list_empty(&ps->opstack); 1240 pred->regex.pattern[len] = 0;
1217} 1241
1242 /* This is either a string, or an integer */
1243 } else if (str[i] == '\'' || str[i] == '"') {
1244 char q = str[i];
1245
1246 /* Make sure the op is OK for strings */
1247 switch (op) {
1248 case OP_NE:
1249 pred->not = 1;
1250 /* Fall through */
1251 case OP_GLOB:
1252 case OP_EQ:
1253 break;
1254 default:
1255 parse_error(pe, FILT_ERR_ILLEGAL_FIELD_OP, pos + i);
1256 goto err_free;
1257 }
1218 1258
1219static int filter_opstack_top(struct filter_parse_state *ps) 1259 /* Make sure the field is OK for strings */
1220{ 1260 if (!is_string_field(field)) {
1221 struct opstack_op *opstack_op; 1261 parse_error(pe, FILT_ERR_EXPECT_DIGIT, pos + i);
1262 goto err_free;
1263 }
1222 1264
1223 if (filter_opstack_empty(ps)) 1265 for (i++; str[i]; i++) {
1224 return OP_NONE; 1266 if (str[i] == q)
1267 break;
1268 }
1269 if (!str[i]) {
1270 parse_error(pe, FILT_ERR_MISSING_QUOTE, pos + i);
1271 goto err_free;
1272 }
1225 1273
1226 opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list); 1274 /* Skip quotes */
1275 s++;
1276 len = i - s;
1277 if (len >= MAX_FILTER_STR_VAL) {
1278 parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i);
1279 goto err_free;
1280 }
1227 1281
1228 return opstack_op->op; 1282 pred->regex.len = len;
1229} 1283 strncpy(pred->regex.pattern, str + s, len);
1284 pred->regex.pattern[len] = 0;
1230 1285
1231static int filter_opstack_pop(struct filter_parse_state *ps) 1286 filter_build_regex(pred);
1232{
1233 struct opstack_op *opstack_op;
1234 enum filter_op_ids op;
1235 1287
1236 if (filter_opstack_empty(ps)) 1288 if (field->filter_type == FILTER_COMM) {
1237 return OP_NONE; 1289 pred->fn = filter_pred_comm;
1238 1290
1239 opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list); 1291 } else if (field->filter_type == FILTER_STATIC_STRING) {
1240 op = opstack_op->op; 1292 pred->fn = filter_pred_string;
1241 list_del(&opstack_op->list); 1293 pred->regex.field_len = field->size;
1242 1294
1243 kfree(opstack_op); 1295 } else if (field->filter_type == FILTER_DYN_STRING)
1296 pred->fn = filter_pred_strloc;
1297 else
1298 pred->fn = filter_pred_pchar;
1299 /* go past the last quote */
1300 i++;
1244 1301
1245 return op; 1302 } else if (isdigit(str[i])) {
1246}
1247 1303
1248static void filter_opstack_clear(struct filter_parse_state *ps) 1304 /* Make sure the field is not a string */
1249{ 1305 if (is_string_field(field)) {
1250 while (!filter_opstack_empty(ps)) 1306 parse_error(pe, FILT_ERR_EXPECT_STRING, pos + i);
1251 filter_opstack_pop(ps); 1307 goto err_free;
1252} 1308 }
1253
1254static char *curr_operand(struct filter_parse_state *ps)
1255{
1256 return ps->operand.string;
1257}
1258 1309
1259static int postfix_append_operand(struct filter_parse_state *ps, char *operand) 1310 if (op == OP_GLOB) {
1260{ 1311 parse_error(pe, FILT_ERR_ILLEGAL_FIELD_OP, pos + i);
1261 struct postfix_elt *elt; 1312 goto err_free;
1313 }
1262 1314
1263 elt = kmalloc(sizeof(*elt), GFP_KERNEL); 1315 /* We allow 0xDEADBEEF */
1264 if (!elt) 1316 while (isalnum(str[i]))
1265 return -ENOMEM; 1317 i++;
1266 1318
1267 elt->op = OP_NONE; 1319 len = i - s;
1268 elt->operand = kstrdup(operand, GFP_KERNEL); 1320 /* 0xfeedfacedeadbeef is 18 chars max */
1269 if (!elt->operand) { 1321 if (len >= sizeof(num_buf)) {
1270 kfree(elt); 1322 parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i);
1271 return -ENOMEM; 1323 goto err_free;
1272 } 1324 }
1273 1325
1274 list_add_tail(&elt->list, &ps->postfix); 1326 strncpy(num_buf, str + s, len);
1327 num_buf[len] = 0;
1275 1328
1276 return 0; 1329 /* Make sure it is a value */
1277} 1330 if (field->is_signed)
1331 ret = kstrtoll(num_buf, 0, &val);
1332 else
1333 ret = kstrtoull(num_buf, 0, &val);
1334 if (ret) {
1335 parse_error(pe, FILT_ERR_ILLEGAL_INTVAL, pos + s);
1336 goto err_free;
1337 }
1278 1338
1279static int postfix_append_op(struct filter_parse_state *ps, enum filter_op_ids op) 1339 pred->val = val;
1280{
1281 struct postfix_elt *elt;
1282 1340
1283 elt = kmalloc(sizeof(*elt), GFP_KERNEL); 1341 if (field->filter_type == FILTER_CPU)
1284 if (!elt) 1342 pred->fn = filter_pred_cpu;
1285 return -ENOMEM; 1343 else {
1344 pred->fn = select_comparison_fn(pred->op, field->size,
1345 field->is_signed);
1346 if (pred->op == OP_NE)
1347 pred->not = 1;
1348 }
1286 1349
1287 elt->op = op; 1350 } else {
1288 elt->operand = NULL; 1351 parse_error(pe, FILT_ERR_INVALID_VALUE, pos + i);
1352 goto err_free;
1353 }
1289 1354
1290 list_add_tail(&elt->list, &ps->postfix); 1355 *pred_ptr = pred;
1356 return i;
1291 1357
1292 return 0; 1358err_free:
1359 kfree(pred);
1360 return -EINVAL;
1293} 1361}
1294 1362
1295static void postfix_clear(struct filter_parse_state *ps) 1363enum {
1296{ 1364 TOO_MANY_CLOSE = -1,
1297 struct postfix_elt *elt; 1365 TOO_MANY_OPEN = -2,
1366 MISSING_QUOTE = -3,
1367};
1298 1368
1299 while (!list_empty(&ps->postfix)) { 1369/*
1300 elt = list_first_entry(&ps->postfix, struct postfix_elt, list); 1370 * Read the filter string once to calculate the number of predicates
1301 list_del(&elt->list); 1371 * as well as how deep the parentheses go.
1302 kfree(elt->operand); 1372 *
1303 kfree(elt); 1373 * Returns:
1304 } 1374 * 0 - everything is fine (err is undefined)
1305} 1375 * -1 - too many ')'
1376 * -2 - too many '('
1377 * -3 - No matching quote
1378 */
1379static int calc_stack(const char *str, int *parens, int *preds, int *err)
1380{
1381 bool is_pred = false;
1382 int nr_preds = 0;
1383 int open = 1; /* Count the expression as "(E)" */
1384 int last_quote = 0;
1385 int max_open = 1;
1386 int quote = 0;
1387 int i;
1306 1388
1307static int filter_parse(struct filter_parse_state *ps) 1389 *err = 0;
1308{
1309 enum filter_op_ids op, top_op;
1310 int in_string = 0;
1311 char ch;
1312 1390
1313 while ((ch = infix_next(ps))) { 1391 for (i = 0; str[i]; i++) {
1314 if (ch == '"') { 1392 if (isspace(str[i]))
1315 in_string ^= 1;
1316 continue; 1393 continue;
1317 } 1394 if (quote) {
1318 1395 if (str[i] == quote)
1319 if (in_string) 1396 quote = 0;
1320 goto parse_operand;
1321
1322 if (isspace(ch))
1323 continue; 1397 continue;
1398 }
1324 1399
1325 if (is_op_char(ps, ch)) { 1400 switch (str[i]) {
1326 op = infix_get_op(ps, ch); 1401 case '\'':
1327 if (op == OP_NONE) { 1402 case '"':
1328 parse_error(ps, FILT_ERR_INVALID_OP, 0); 1403 quote = str[i];
1329 return -EINVAL; 1404 last_quote = i;
1330 } 1405 break;
1331 1406 case '|':
1332 if (strlen(curr_operand(ps))) { 1407 case '&':
1333 postfix_append_operand(ps, curr_operand(ps)); 1408 if (str[i+1] != str[i])
1334 clear_operand_string(ps);
1335 }
1336
1337 while (!filter_opstack_empty(ps)) {
1338 top_op = filter_opstack_top(ps);
1339 if (!is_precedence_lower(ps, top_op, op)) {
1340 top_op = filter_opstack_pop(ps);
1341 postfix_append_op(ps, top_op);
1342 continue;
1343 }
1344 break; 1409 break;
1345 } 1410 is_pred = false;
1346
1347 filter_opstack_push(ps, op);
1348 continue; 1411 continue;
1349 } 1412 case '(':
1350 1413 is_pred = false;
1351 if (ch == '(') { 1414 open++;
1352 filter_opstack_push(ps, OP_OPEN_PAREN); 1415 if (open > max_open)
1416 max_open = open;
1353 continue; 1417 continue;
1354 } 1418 case ')':
1355 1419 is_pred = false;
1356 if (ch == ')') { 1420 if (open == 1) {
1357 if (strlen(curr_operand(ps))) { 1421 *err = i;
1358 postfix_append_operand(ps, curr_operand(ps)); 1422 return TOO_MANY_CLOSE;
1359 clear_operand_string(ps);
1360 }
1361
1362 top_op = filter_opstack_pop(ps);
1363 while (top_op != OP_NONE) {
1364 if (top_op == OP_OPEN_PAREN)
1365 break;
1366 postfix_append_op(ps, top_op);
1367 top_op = filter_opstack_pop(ps);
1368 }
1369 if (top_op == OP_NONE) {
1370 parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0);
1371 return -EINVAL;
1372 } 1423 }
1424 open--;
1373 continue; 1425 continue;
1374 } 1426 }
1375parse_operand: 1427 if (!is_pred) {
1376 if (append_operand_char(ps, ch)) { 1428 nr_preds++;
1377 parse_error(ps, FILT_ERR_OPERAND_TOO_LONG, 0); 1429 is_pred = true;
1378 return -EINVAL;
1379 } 1430 }
1380 } 1431 }
1381 1432
1382 if (strlen(curr_operand(ps))) 1433 if (quote) {
1383 postfix_append_operand(ps, curr_operand(ps)); 1434 *err = last_quote;
1384 1435 return MISSING_QUOTE;
1385 while (!filter_opstack_empty(ps)) {
1386 top_op = filter_opstack_pop(ps);
1387 if (top_op == OP_NONE)
1388 break;
1389 if (top_op == OP_OPEN_PAREN) {
1390 parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0);
1391 return -EINVAL;
1392 }
1393 postfix_append_op(ps, top_op);
1394 } 1436 }
1395 1437
1396 return 0; 1438 if (open != 1) {
1397} 1439 int level = open;
1398
1399static struct filter_pred *create_pred(struct filter_parse_state *ps,
1400 struct trace_event_call *call,
1401 enum filter_op_ids op,
1402 char *operand1, char *operand2)
1403{
1404 struct ftrace_event_field *field;
1405 static struct filter_pred pred;
1406
1407 memset(&pred, 0, sizeof(pred));
1408 pred.op = op;
1409
1410 if (op == OP_AND || op == OP_OR)
1411 return &pred;
1412
1413 if (!operand1 || !operand2) {
1414 parse_error(ps, FILT_ERR_MISSING_FIELD, 0);
1415 return NULL;
1416 }
1417
1418 field = trace_find_event_field(call, operand1);
1419 if (!field) {
1420 parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);
1421 return NULL;
1422 }
1423
1424 strcpy(pred.regex.pattern, operand2);
1425 pred.regex.len = strlen(pred.regex.pattern);
1426 pred.field = field;
1427 return init_pred(ps, field, &pred) ? NULL : &pred;
1428}
1429
1430static int check_preds(struct filter_parse_state *ps)
1431{
1432 int n_normal_preds = 0, n_logical_preds = 0;
1433 struct postfix_elt *elt;
1434 int cnt = 0;
1435
1436 list_for_each_entry(elt, &ps->postfix, list) {
1437 if (elt->op == OP_NONE) {
1438 cnt++;
1439 continue;
1440 }
1441 1440
1442 if (elt->op == OP_AND || elt->op == OP_OR) { 1441 /* find the bad open */
1443 n_logical_preds++; 1442 for (i--; i; i--) {
1444 cnt--; 1443 if (quote) {
1445 continue; 1444 if (str[i] == quote)
1445 quote = 0;
1446 continue;
1447 }
1448 switch (str[i]) {
1449 case '(':
1450 if (level == open) {
1451 *err = i;
1452 return TOO_MANY_OPEN;
1453 }
1454 level--;
1455 break;
1456 case ')':
1457 level++;
1458 break;
1459 case '\'':
1460 case '"':
1461 quote = str[i];
1462 break;
1463 }
1446 } 1464 }
1447 if (elt->op != OP_NOT) 1465 /* First character is the '(' with missing ')' */
1448 cnt--; 1466 *err = 0;
1449 n_normal_preds++; 1467 return TOO_MANY_OPEN;
1450 /* all ops should have operands */
1451 if (cnt < 0)
1452 break;
1453 }
1454
1455 if (cnt != 1 || !n_normal_preds || n_logical_preds >= n_normal_preds) {
1456 parse_error(ps, FILT_ERR_INVALID_FILTER, 0);
1457 return -EINVAL;
1458 } 1468 }
1459 1469
1470 /* Set the size of the required stacks */
1471 *parens = max_open;
1472 *preds = nr_preds;
1460 return 0; 1473 return 0;
1461} 1474}
1462 1475
1463static int count_preds(struct filter_parse_state *ps) 1476static int process_preds(struct trace_event_call *call,
1464{ 1477 const char *filter_string,
1465 struct postfix_elt *elt;
1466 int n_preds = 0;
1467
1468 list_for_each_entry(elt, &ps->postfix, list) {
1469 if (elt->op == OP_NONE)
1470 continue;
1471 n_preds++;
1472 }
1473
1474 return n_preds;
1475}
1476
1477struct check_pred_data {
1478 int count;
1479 int max;
1480};
1481
1482static int check_pred_tree_cb(enum move_type move, struct filter_pred *pred,
1483 int *err, void *data)
1484{
1485 struct check_pred_data *d = data;
1486
1487 if (WARN_ON(d->count++ > d->max)) {
1488 *err = -EINVAL;
1489 return WALK_PRED_ABORT;
1490 }
1491 return WALK_PRED_DEFAULT;
1492}
1493
1494/*
1495 * The tree is walked at filtering of an event. If the tree is not correctly
1496 * built, it may cause an infinite loop. Check here that the tree does
1497 * indeed terminate.
1498 */
1499static int check_pred_tree(struct event_filter *filter,
1500 struct filter_pred *root)
1501{
1502 struct check_pred_data data = {
1503 /*
1504 * The max that we can hit a node is three times.
1505 * Once going down, once coming up from left, and
1506 * once coming up from right. This is more than enough
1507 * since leafs are only hit a single time.
1508 */
1509 .max = 3 * filter->n_preds,
1510 .count = 0,
1511 };
1512
1513 return walk_pred_tree(filter->preds, root,
1514 check_pred_tree_cb, &data);
1515}
1516
1517static int count_leafs_cb(enum move_type move, struct filter_pred *pred,
1518 int *err, void *data)
1519{
1520 int *count = data;
1521
1522 if ((move == MOVE_DOWN) &&
1523 (pred->left == FILTER_PRED_INVALID))
1524 (*count)++;
1525
1526 return WALK_PRED_DEFAULT;
1527}
1528
1529static int count_leafs(struct filter_pred *preds, struct filter_pred *root)
1530{
1531 int count = 0, ret;
1532
1533 ret = walk_pred_tree(preds, root, count_leafs_cb, &count);
1534 WARN_ON(ret);
1535 return count;
1536}
1537
1538struct fold_pred_data {
1539 struct filter_pred *root;
1540 int count;
1541 int children;
1542};
1543
1544static int fold_pred_cb(enum move_type move, struct filter_pred *pred,
1545 int *err, void *data)
1546{
1547 struct fold_pred_data *d = data;
1548 struct filter_pred *root = d->root;
1549
1550 if (move != MOVE_DOWN)
1551 return WALK_PRED_DEFAULT;
1552 if (pred->left != FILTER_PRED_INVALID)
1553 return WALK_PRED_DEFAULT;
1554
1555 if (WARN_ON(d->count == d->children)) {
1556 *err = -EINVAL;
1557 return WALK_PRED_ABORT;
1558 }
1559
1560 pred->index &= ~FILTER_PRED_FOLD;
1561 root->ops[d->count++] = pred->index;
1562 return WALK_PRED_DEFAULT;
1563}
1564
1565static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
1566{
1567 struct fold_pred_data data = {
1568 .root = root,
1569 .count = 0,
1570 };
1571 int children;
1572
1573 /* No need to keep the fold flag */
1574 root->index &= ~FILTER_PRED_FOLD;
1575
1576 /* If the root is a leaf then do nothing */
1577 if (root->left == FILTER_PRED_INVALID)
1578 return 0;
1579
1580 /* count the children */
1581 children = count_leafs(preds, &preds[root->left]);
1582 children += count_leafs(preds, &preds[root->right]);
1583
1584 root->ops = kcalloc(children, sizeof(*root->ops), GFP_KERNEL);
1585 if (!root->ops)
1586 return -ENOMEM;
1587
1588 root->val = children;
1589 data.children = children;
1590 return walk_pred_tree(preds, root, fold_pred_cb, &data);
1591}
1592
1593static int fold_pred_tree_cb(enum move_type move, struct filter_pred *pred,
1594 int *err, void *data)
1595{
1596 struct filter_pred *preds = data;
1597
1598 if (move != MOVE_DOWN)
1599 return WALK_PRED_DEFAULT;
1600 if (!(pred->index & FILTER_PRED_FOLD))
1601 return WALK_PRED_DEFAULT;
1602
1603 *err = fold_pred(preds, pred);
1604 if (*err)
1605 return WALK_PRED_ABORT;
1606
1607 /* eveyrhing below is folded, continue with parent */
1608 return WALK_PRED_PARENT;
1609}
1610
1611/*
1612 * To optimize the processing of the ops, if we have several "ors" or
1613 * "ands" together, we can put them in an array and process them all
1614 * together speeding up the filter logic.
1615 */
1616static int fold_pred_tree(struct event_filter *filter,
1617 struct filter_pred *root)
1618{
1619 return walk_pred_tree(filter->preds, root, fold_pred_tree_cb,
1620 filter->preds);
1621}
1622
1623static int replace_preds(struct trace_event_call *call,
1624 struct event_filter *filter, 1478 struct event_filter *filter,
1625 struct filter_parse_state *ps, 1479 struct filter_parse_error *pe)
1626 bool dry_run)
1627{ 1480{
1628 char *operand1 = NULL, *operand2 = NULL; 1481 struct prog_entry *prog;
1629 struct filter_pred *pred; 1482 int nr_parens;
1630 struct filter_pred *root; 1483 int nr_preds;
1631 struct postfix_elt *elt; 1484 int index;
1632 struct pred_stack stack = { }; /* init to NULL */ 1485 int ret;
1633 int err;
1634 int n_preds = 0;
1635
1636 n_preds = count_preds(ps);
1637 if (n_preds >= MAX_FILTER_PRED) {
1638 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
1639 return -ENOSPC;
1640 }
1641
1642 err = check_preds(ps);
1643 if (err)
1644 return err;
1645
1646 if (!dry_run) {
1647 err = __alloc_pred_stack(&stack, n_preds);
1648 if (err)
1649 return err;
1650 err = __alloc_preds(filter, n_preds);
1651 if (err)
1652 goto fail;
1653 }
1654
1655 n_preds = 0;
1656 list_for_each_entry(elt, &ps->postfix, list) {
1657 if (elt->op == OP_NONE) {
1658 if (!operand1)
1659 operand1 = elt->operand;
1660 else if (!operand2)
1661 operand2 = elt->operand;
1662 else {
1663 parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0);
1664 err = -EINVAL;
1665 goto fail;
1666 }
1667 continue;
1668 }
1669
1670 if (elt->op == OP_NOT) {
1671 if (!n_preds || operand1 || operand2) {
1672 parse_error(ps, FILT_ERR_ILLEGAL_NOT_OP, 0);
1673 err = -EINVAL;
1674 goto fail;
1675 }
1676 if (!dry_run)
1677 filter->preds[n_preds - 1].not ^= 1;
1678 continue;
1679 }
1680
1681 if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) {
1682 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
1683 err = -ENOSPC;
1684 goto fail;
1685 }
1686
1687 pred = create_pred(ps, call, elt->op, operand1, operand2);
1688 if (!pred) {
1689 err = -EINVAL;
1690 goto fail;
1691 }
1692 1486
1693 if (!dry_run) { 1487 ret = calc_stack(filter_string, &nr_parens, &nr_preds, &index);
1694 err = filter_add_pred(ps, filter, pred, &stack); 1488 if (ret < 0) {
1695 if (err) 1489 switch (ret) {
1696 goto fail; 1490 case MISSING_QUOTE:
1491 parse_error(pe, FILT_ERR_MISSING_QUOTE, index);
1492 break;
1493 case TOO_MANY_OPEN:
1494 parse_error(pe, FILT_ERR_TOO_MANY_OPEN, index);
1495 break;
1496 default:
1497 parse_error(pe, FILT_ERR_TOO_MANY_CLOSE, index);
1697 } 1498 }
1698 1499 return ret;
1699 operand1 = operand2 = NULL;
1700 } 1500 }
1701 1501
1702 if (!dry_run) { 1502 if (!nr_preds) {
1703 /* We should have one item left on the stack */ 1503 prog = NULL;
1704 pred = __pop_pred_stack(&stack); 1504 } else {
1705 if (!pred) 1505 prog = predicate_parse(filter_string, nr_parens, nr_preds,
1706 return -EINVAL; 1506 parse_pred, call, pe);
1707 /* This item is where we start from in matching */ 1507 if (IS_ERR(prog))
1708 root = pred; 1508 return PTR_ERR(prog);
1709 /* Make sure the stack is empty */
1710 pred = __pop_pred_stack(&stack);
1711 if (WARN_ON(pred)) {
1712 err = -EINVAL;
1713 filter->root = NULL;
1714 goto fail;
1715 }
1716 err = check_pred_tree(filter, root);
1717 if (err)
1718 goto fail;
1719
1720 /* Optimize the tree */
1721 err = fold_pred_tree(filter, root);
1722 if (err)
1723 goto fail;
1724
1725 /* We don't set root until we know it works */
1726 barrier();
1727 filter->root = root;
1728 } 1509 }
1729 1510 rcu_assign_pointer(filter->prog, prog);
1730 err = 0; 1511 return 0;
1731fail:
1732 __free_pred_stack(&stack);
1733 return err;
1734} 1512}
1735 1513
1736static inline void event_set_filtered_flag(struct trace_event_file *file) 1514static inline void event_set_filtered_flag(struct trace_event_file *file)
@@ -1780,72 +1558,53 @@ struct filter_list {
1780 struct event_filter *filter; 1558 struct event_filter *filter;
1781}; 1559};
1782 1560
1783static int replace_system_preds(struct trace_subsystem_dir *dir, 1561static int process_system_preds(struct trace_subsystem_dir *dir,
1784 struct trace_array *tr, 1562 struct trace_array *tr,
1785 struct filter_parse_state *ps, 1563 struct filter_parse_error *pe,
1786 char *filter_string) 1564 char *filter_string)
1787{ 1565{
1788 struct trace_event_file *file; 1566 struct trace_event_file *file;
1789 struct filter_list *filter_item; 1567 struct filter_list *filter_item;
1568 struct event_filter *filter = NULL;
1790 struct filter_list *tmp; 1569 struct filter_list *tmp;
1791 LIST_HEAD(filter_list); 1570 LIST_HEAD(filter_list);
1792 bool fail = true; 1571 bool fail = true;
1793 int err; 1572 int err;
1794 1573
1795 list_for_each_entry(file, &tr->events, list) { 1574 list_for_each_entry(file, &tr->events, list) {
1796 if (file->system != dir)
1797 continue;
1798
1799 /*
1800 * Try to see if the filter can be applied
1801 * (filter arg is ignored on dry_run)
1802 */
1803 err = replace_preds(file->event_call, NULL, ps, true);
1804 if (err)
1805 event_set_no_set_filter_flag(file);
1806 else
1807 event_clear_no_set_filter_flag(file);
1808 }
1809
1810 list_for_each_entry(file, &tr->events, list) {
1811 struct event_filter *filter;
1812 1575
1813 if (file->system != dir) 1576 if (file->system != dir)
1814 continue; 1577 continue;
1815 1578
1816 if (event_no_set_filter_flag(file)) 1579 filter = kzalloc(sizeof(*filter), GFP_KERNEL);
1817 continue; 1580 if (!filter)
1818
1819 filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL);
1820 if (!filter_item)
1821 goto fail_mem;
1822
1823 list_add_tail(&filter_item->list, &filter_list);
1824
1825 filter_item->filter = __alloc_filter();
1826 if (!filter_item->filter)
1827 goto fail_mem; 1581 goto fail_mem;
1828 filter = filter_item->filter;
1829 1582
1830 /* Can only fail on no memory */ 1583 filter->filter_string = kstrdup(filter_string, GFP_KERNEL);
1831 err = replace_filter_string(filter, filter_string); 1584 if (!filter->filter_string)
1832 if (err)
1833 goto fail_mem; 1585 goto fail_mem;
1834 1586
1835 err = replace_preds(file->event_call, filter, ps, false); 1587 err = process_preds(file->event_call, filter_string, filter, pe);
1836 if (err) { 1588 if (err) {
1837 filter_disable(file); 1589 filter_disable(file);
1838 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); 1590 parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0);
1839 append_filter_err(ps, filter); 1591 append_filter_err(pe, filter);
1840 } else 1592 } else
1841 event_set_filtered_flag(file); 1593 event_set_filtered_flag(file);
1594
1595
1596 filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL);
1597 if (!filter_item)
1598 goto fail_mem;
1599
1600 list_add_tail(&filter_item->list, &filter_list);
1842 /* 1601 /*
1843 * Regardless of if this returned an error, we still 1602 * Regardless of if this returned an error, we still
1844 * replace the filter for the call. 1603 * replace the filter for the call.
1845 */ 1604 */
1846 filter = event_filter(file); 1605 filter_item->filter = event_filter(file);
1847 event_set_filter(file, filter_item->filter); 1606 event_set_filter(file, filter);
1848 filter_item->filter = filter; 1607 filter = NULL;
1849 1608
1850 fail = false; 1609 fail = false;
1851 } 1610 }
@@ -1871,9 +1630,10 @@ static int replace_system_preds(struct trace_subsystem_dir *dir,
1871 list_del(&filter_item->list); 1630 list_del(&filter_item->list);
1872 kfree(filter_item); 1631 kfree(filter_item);
1873 } 1632 }
1874 parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); 1633 parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0);
1875 return -EINVAL; 1634 return -EINVAL;
1876 fail_mem: 1635 fail_mem:
1636 kfree(filter);
1877 /* If any call succeeded, we still need to sync */ 1637 /* If any call succeeded, we still need to sync */
1878 if (!fail) 1638 if (!fail)
1879 synchronize_sched(); 1639 synchronize_sched();
@@ -1885,47 +1645,42 @@ static int replace_system_preds(struct trace_subsystem_dir *dir,
1885 return -ENOMEM; 1645 return -ENOMEM;
1886} 1646}
1887 1647
1888static int create_filter_start(char *filter_str, bool set_str, 1648static int create_filter_start(char *filter_string, bool set_str,
1889 struct filter_parse_state **psp, 1649 struct filter_parse_error **pse,
1890 struct event_filter **filterp) 1650 struct event_filter **filterp)
1891{ 1651{
1892 struct event_filter *filter; 1652 struct event_filter *filter;
1893 struct filter_parse_state *ps = NULL; 1653 struct filter_parse_error *pe = NULL;
1894 int err = 0; 1654 int err = 0;
1895 1655
1896 WARN_ON_ONCE(*psp || *filterp); 1656 if (WARN_ON_ONCE(*pse || *filterp))
1657 return -EINVAL;
1897 1658
1898 /* allocate everything, and if any fails, free all and fail */ 1659 filter = kzalloc(sizeof(*filter), GFP_KERNEL);
1899 filter = __alloc_filter(); 1660 if (filter && set_str) {
1900 if (filter && set_str) 1661 filter->filter_string = kstrdup(filter_string, GFP_KERNEL);
1901 err = replace_filter_string(filter, filter_str); 1662 if (!filter->filter_string)
1663 err = -ENOMEM;
1664 }
1902 1665
1903 ps = kzalloc(sizeof(*ps), GFP_KERNEL); 1666 pe = kzalloc(sizeof(*pe), GFP_KERNEL);
1904 1667
1905 if (!filter || !ps || err) { 1668 if (!filter || !pe || err) {
1906 kfree(ps); 1669 kfree(pe);
1907 __free_filter(filter); 1670 __free_filter(filter);
1908 return -ENOMEM; 1671 return -ENOMEM;
1909 } 1672 }
1910 1673
1911 /* we're committed to creating a new filter */ 1674 /* we're committed to creating a new filter */
1912 *filterp = filter; 1675 *filterp = filter;
1913 *psp = ps; 1676 *pse = pe;
1914 1677
1915 parse_init(ps, filter_ops, filter_str); 1678 return 0;
1916 err = filter_parse(ps);
1917 if (err && set_str)
1918 append_filter_err(ps, filter);
1919 return err;
1920} 1679}
1921 1680
1922static void create_filter_finish(struct filter_parse_state *ps) 1681static void create_filter_finish(struct filter_parse_error *pe)
1923{ 1682{
1924 if (ps) { 1683 kfree(pe);
1925 filter_opstack_clear(ps);
1926 postfix_clear(ps);
1927 kfree(ps);
1928 }
1929} 1684}
1930 1685
1931/** 1686/**
@@ -1945,24 +1700,20 @@ static void create_filter_finish(struct filter_parse_state *ps)
1945 * freeing it. 1700 * freeing it.
1946 */ 1701 */
1947static int create_filter(struct trace_event_call *call, 1702static int create_filter(struct trace_event_call *call,
1948 char *filter_str, bool set_str, 1703 char *filter_string, bool set_str,
1949 struct event_filter **filterp) 1704 struct event_filter **filterp)
1950{ 1705{
1706 struct filter_parse_error *pe = NULL;
1951 struct event_filter *filter = NULL; 1707 struct event_filter *filter = NULL;
1952 struct filter_parse_state *ps = NULL;
1953 int err; 1708 int err;
1954 1709
1955 err = create_filter_start(filter_str, set_str, &ps, &filter); 1710 err = create_filter_start(filter_string, set_str, &pe, &filter);
1956 if (!err) { 1711 if (err)
1957 err = replace_preds(call, filter, ps, false); 1712 return err;
1958 if (err && set_str) 1713
1959 append_filter_err(ps, filter); 1714 err = process_preds(call, filter_string, filter, pe);
1960 } 1715 if (err && set_str)
1961 if (err && !set_str) { 1716 append_filter_err(pe, filter);
1962 free_event_filter(filter);
1963 filter = NULL;
1964 }
1965 create_filter_finish(ps);
1966 1717
1967 *filterp = filter; 1718 *filterp = filter;
1968 return err; 1719 return err;
@@ -1989,21 +1740,21 @@ static int create_system_filter(struct trace_subsystem_dir *dir,
1989 char *filter_str, struct event_filter **filterp) 1740 char *filter_str, struct event_filter **filterp)
1990{ 1741{
1991 struct event_filter *filter = NULL; 1742 struct event_filter *filter = NULL;
1992 struct filter_parse_state *ps = NULL; 1743 struct filter_parse_error *pe = NULL;
1993 int err; 1744 int err;
1994 1745
1995 err = create_filter_start(filter_str, true, &ps, &filter); 1746 err = create_filter_start(filter_str, true, &pe, &filter);
1996 if (!err) { 1747 if (!err) {
1997 err = replace_system_preds(dir, tr, ps, filter_str); 1748 err = process_system_preds(dir, tr, pe, filter_str);
1998 if (!err) { 1749 if (!err) {
1999 /* System filters just show a default message */ 1750 /* System filters just show a default message */
2000 kfree(filter->filter_string); 1751 kfree(filter->filter_string);
2001 filter->filter_string = NULL; 1752 filter->filter_string = NULL;
2002 } else { 1753 } else {
2003 append_filter_err(ps, filter); 1754 append_filter_err(pe, filter);
2004 } 1755 }
2005 } 1756 }
2006 create_filter_finish(ps); 1757 create_filter_finish(pe);
2007 1758
2008 *filterp = filter; 1759 *filterp = filter;
2009 return err; 1760 return err;
@@ -2186,66 +1937,80 @@ static int __ftrace_function_set_filter(int filter, char *buf, int len,
2186 return ret; 1937 return ret;
2187} 1938}
2188 1939
2189static int ftrace_function_check_pred(struct filter_pred *pred, int leaf) 1940static int ftrace_function_check_pred(struct filter_pred *pred)
2190{ 1941{
2191 struct ftrace_event_field *field = pred->field; 1942 struct ftrace_event_field *field = pred->field;
2192 1943
2193 if (leaf) { 1944 /*
2194 /* 1945 * Check the predicate for function trace, verify:
2195 * Check the leaf predicate for function trace, verify: 1946 * - only '==' and '!=' is used
2196 * - only '==' and '!=' is used 1947 * - the 'ip' field is used
2197 * - the 'ip' field is used 1948 */
2198 */ 1949 if ((pred->op != OP_EQ) && (pred->op != OP_NE))
2199 if ((pred->op != OP_EQ) && (pred->op != OP_NE)) 1950 return -EINVAL;
2200 return -EINVAL;
2201 1951
2202 if (strcmp(field->name, "ip")) 1952 if (strcmp(field->name, "ip"))
2203 return -EINVAL; 1953 return -EINVAL;
2204 } else {
2205 /*
2206 * Check the non leaf predicate for function trace, verify:
2207 * - only '||' is used
2208 */
2209 if (pred->op != OP_OR)
2210 return -EINVAL;
2211 }
2212 1954
2213 return 0; 1955 return 0;
2214} 1956}
2215 1957
2216static int ftrace_function_set_filter_cb(enum move_type move, 1958static int ftrace_function_set_filter_pred(struct filter_pred *pred,
2217 struct filter_pred *pred, 1959 struct function_filter_data *data)
2218 int *err, void *data)
2219{ 1960{
1961 int ret;
1962
2220 /* Checking the node is valid for function trace. */ 1963 /* Checking the node is valid for function trace. */
2221 if ((move != MOVE_DOWN) || 1964 ret = ftrace_function_check_pred(pred);
2222 (pred->left != FILTER_PRED_INVALID)) { 1965 if (ret)
2223 *err = ftrace_function_check_pred(pred, 0); 1966 return ret;
2224 } else {
2225 *err = ftrace_function_check_pred(pred, 1);
2226 if (*err)
2227 return WALK_PRED_ABORT;
2228
2229 *err = __ftrace_function_set_filter(pred->op == OP_EQ,
2230 pred->regex.pattern,
2231 pred->regex.len,
2232 data);
2233 }
2234 1967
2235 return (*err) ? WALK_PRED_ABORT : WALK_PRED_DEFAULT; 1968 return __ftrace_function_set_filter(pred->op == OP_EQ,
1969 pred->regex.pattern,
1970 pred->regex.len,
1971 data);
1972}
1973
1974static bool is_or(struct prog_entry *prog, int i)
1975{
1976 int target;
1977
1978 /*
1979 * Only "||" is allowed for function events, thus,
1980 * all true branches should jump to true, and any
1981 * false branch should jump to false.
1982 */
1983 target = prog[i].target + 1;
1984 /* True and false have NULL preds (all prog entries should jump to one */
1985 if (prog[target].pred)
1986 return false;
1987
1988 /* prog[target].target is 1 for TRUE, 0 for FALSE */
1989 return prog[i].when_to_branch == prog[target].target;
2236} 1990}
2237 1991
2238static int ftrace_function_set_filter(struct perf_event *event, 1992static int ftrace_function_set_filter(struct perf_event *event,
2239 struct event_filter *filter) 1993 struct event_filter *filter)
2240{ 1994{
1995 struct prog_entry *prog = rcu_dereference_protected(filter->prog,
1996 lockdep_is_held(&event_mutex));
2241 struct function_filter_data data = { 1997 struct function_filter_data data = {
2242 .first_filter = 1, 1998 .first_filter = 1,
2243 .first_notrace = 1, 1999 .first_notrace = 1,
2244 .ops = &event->ftrace_ops, 2000 .ops = &event->ftrace_ops,
2245 }; 2001 };
2002 int i;
2246 2003
2247 return walk_pred_tree(filter->preds, filter->root, 2004 for (i = 0; prog[i].pred; i++) {
2248 ftrace_function_set_filter_cb, &data); 2005 struct filter_pred *pred = prog[i].pred;
2006
2007 if (!is_or(prog, i))
2008 return -EINVAL;
2009
2010 if (ftrace_function_set_filter_pred(pred, &data) < 0)
2011 return -EINVAL;
2012 }
2013 return 0;
2249} 2014}
2250#else 2015#else
2251static int ftrace_function_set_filter(struct perf_event *event, 2016static int ftrace_function_set_filter(struct perf_event *event,
@@ -2388,26 +2153,28 @@ static int test_pred_visited_fn(struct filter_pred *pred, void *event)
2388 return 1; 2153 return 1;
2389} 2154}
2390 2155
2391static int test_walk_pred_cb(enum move_type move, struct filter_pred *pred, 2156static void update_pred_fn(struct event_filter *filter, char *fields)
2392 int *err, void *data)
2393{ 2157{
2394 char *fields = data; 2158 struct prog_entry *prog = rcu_dereference_protected(filter->prog,
2159 lockdep_is_held(&event_mutex));
2160 int i;
2395 2161
2396 if ((move == MOVE_DOWN) && 2162 for (i = 0; prog[i].pred; i++) {
2397 (pred->left == FILTER_PRED_INVALID)) { 2163 struct filter_pred *pred = prog[i].pred;
2398 struct ftrace_event_field *field = pred->field; 2164 struct ftrace_event_field *field = pred->field;
2399 2165
2166 WARN_ON_ONCE(!pred->fn);
2167
2400 if (!field) { 2168 if (!field) {
2401 WARN(1, "all leafs should have field defined"); 2169 WARN_ONCE(1, "all leafs should have field defined %d", i);
2402 return WALK_PRED_DEFAULT; 2170 continue;
2403 } 2171 }
2172
2404 if (!strchr(fields, *field->name)) 2173 if (!strchr(fields, *field->name))
2405 return WALK_PRED_DEFAULT; 2174 continue;
2406 2175
2407 WARN_ON(!pred->fn);
2408 pred->fn = test_pred_visited_fn; 2176 pred->fn = test_pred_visited_fn;
2409 } 2177 }
2410 return WALK_PRED_DEFAULT;
2411} 2178}
2412 2179
2413static __init int ftrace_test_event_filter(void) 2180static __init int ftrace_test_event_filter(void)
@@ -2431,20 +2198,22 @@ static __init int ftrace_test_event_filter(void)
2431 break; 2198 break;
2432 } 2199 }
2433 2200
2201 /* Needed to dereference filter->prog */
2202 mutex_lock(&event_mutex);
2434 /* 2203 /*
2435 * The preemption disabling is not really needed for self 2204 * The preemption disabling is not really needed for self
2436 * tests, but the rcu dereference will complain without it. 2205 * tests, but the rcu dereference will complain without it.
2437 */ 2206 */
2438 preempt_disable(); 2207 preempt_disable();
2439 if (*d->not_visited) 2208 if (*d->not_visited)
2440 walk_pred_tree(filter->preds, filter->root, 2209 update_pred_fn(filter, d->not_visited);
2441 test_walk_pred_cb,
2442 d->not_visited);
2443 2210
2444 test_pred_visited = 0; 2211 test_pred_visited = 0;
2445 err = filter_match_preds(filter, &d->rec); 2212 err = filter_match_preds(filter, &d->rec);
2446 preempt_enable(); 2213 preempt_enable();
2447 2214
2215 mutex_unlock(&event_mutex);
2216
2448 __free_filter(filter); 2217 __free_filter(filter);
2449 2218
2450 if (test_pred_visited) { 2219 if (test_pred_visited) {
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 1e1558c99d56..0d7b3ffbecc2 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -20,15 +20,39 @@
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <linux/stacktrace.h> 21#include <linux/stacktrace.h>
22#include <linux/rculist.h> 22#include <linux/rculist.h>
23#include <linux/tracefs.h>
23 24
24#include "tracing_map.h" 25#include "tracing_map.h"
25#include "trace.h" 26#include "trace.h"
26 27
28#define SYNTH_SYSTEM "synthetic"
29#define SYNTH_FIELDS_MAX 16
30
31#define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */
32
27struct hist_field; 33struct hist_field;
28 34
29typedef u64 (*hist_field_fn_t) (struct hist_field *field, void *event); 35typedef u64 (*hist_field_fn_t) (struct hist_field *field,
36 struct tracing_map_elt *elt,
37 struct ring_buffer_event *rbe,
38 void *event);
30 39
31#define HIST_FIELD_OPERANDS_MAX 2 40#define HIST_FIELD_OPERANDS_MAX 2
41#define HIST_FIELDS_MAX (TRACING_MAP_FIELDS_MAX + TRACING_MAP_VARS_MAX)
42#define HIST_ACTIONS_MAX 8
43
44enum field_op_id {
45 FIELD_OP_NONE,
46 FIELD_OP_PLUS,
47 FIELD_OP_MINUS,
48 FIELD_OP_UNARY_MINUS,
49};
50
51struct hist_var {
52 char *name;
53 struct hist_trigger_data *hist_data;
54 unsigned int idx;
55};
32 56
33struct hist_field { 57struct hist_field {
34 struct ftrace_event_field *field; 58 struct ftrace_event_field *field;
@@ -37,27 +61,49 @@ struct hist_field {
37 unsigned int size; 61 unsigned int size;
38 unsigned int offset; 62 unsigned int offset;
39 unsigned int is_signed; 63 unsigned int is_signed;
64 const char *type;
40 struct hist_field *operands[HIST_FIELD_OPERANDS_MAX]; 65 struct hist_field *operands[HIST_FIELD_OPERANDS_MAX];
66 struct hist_trigger_data *hist_data;
67 struct hist_var var;
68 enum field_op_id operator;
69 char *system;
70 char *event_name;
71 char *name;
72 unsigned int var_idx;
73 unsigned int var_ref_idx;
74 bool read_once;
41}; 75};
42 76
43static u64 hist_field_none(struct hist_field *field, void *event) 77static u64 hist_field_none(struct hist_field *field,
78 struct tracing_map_elt *elt,
79 struct ring_buffer_event *rbe,
80 void *event)
44{ 81{
45 return 0; 82 return 0;
46} 83}
47 84
48static u64 hist_field_counter(struct hist_field *field, void *event) 85static u64 hist_field_counter(struct hist_field *field,
86 struct tracing_map_elt *elt,
87 struct ring_buffer_event *rbe,
88 void *event)
49{ 89{
50 return 1; 90 return 1;
51} 91}
52 92
53static u64 hist_field_string(struct hist_field *hist_field, void *event) 93static u64 hist_field_string(struct hist_field *hist_field,
94 struct tracing_map_elt *elt,
95 struct ring_buffer_event *rbe,
96 void *event)
54{ 97{
55 char *addr = (char *)(event + hist_field->field->offset); 98 char *addr = (char *)(event + hist_field->field->offset);
56 99
57 return (u64)(unsigned long)addr; 100 return (u64)(unsigned long)addr;
58} 101}
59 102
60static u64 hist_field_dynstring(struct hist_field *hist_field, void *event) 103static u64 hist_field_dynstring(struct hist_field *hist_field,
104 struct tracing_map_elt *elt,
105 struct ring_buffer_event *rbe,
106 void *event)
61{ 107{
62 u32 str_item = *(u32 *)(event + hist_field->field->offset); 108 u32 str_item = *(u32 *)(event + hist_field->field->offset);
63 int str_loc = str_item & 0xffff; 109 int str_loc = str_item & 0xffff;
@@ -66,24 +112,74 @@ static u64 hist_field_dynstring(struct hist_field *hist_field, void *event)
66 return (u64)(unsigned long)addr; 112 return (u64)(unsigned long)addr;
67} 113}
68 114
69static u64 hist_field_pstring(struct hist_field *hist_field, void *event) 115static u64 hist_field_pstring(struct hist_field *hist_field,
116 struct tracing_map_elt *elt,
117 struct ring_buffer_event *rbe,
118 void *event)
70{ 119{
71 char **addr = (char **)(event + hist_field->field->offset); 120 char **addr = (char **)(event + hist_field->field->offset);
72 121
73 return (u64)(unsigned long)*addr; 122 return (u64)(unsigned long)*addr;
74} 123}
75 124
76static u64 hist_field_log2(struct hist_field *hist_field, void *event) 125static u64 hist_field_log2(struct hist_field *hist_field,
126 struct tracing_map_elt *elt,
127 struct ring_buffer_event *rbe,
128 void *event)
77{ 129{
78 struct hist_field *operand = hist_field->operands[0]; 130 struct hist_field *operand = hist_field->operands[0];
79 131
80 u64 val = operand->fn(operand, event); 132 u64 val = operand->fn(operand, elt, rbe, event);
81 133
82 return (u64) ilog2(roundup_pow_of_two(val)); 134 return (u64) ilog2(roundup_pow_of_two(val));
83} 135}
84 136
137static u64 hist_field_plus(struct hist_field *hist_field,
138 struct tracing_map_elt *elt,
139 struct ring_buffer_event *rbe,
140 void *event)
141{
142 struct hist_field *operand1 = hist_field->operands[0];
143 struct hist_field *operand2 = hist_field->operands[1];
144
145 u64 val1 = operand1->fn(operand1, elt, rbe, event);
146 u64 val2 = operand2->fn(operand2, elt, rbe, event);
147
148 return val1 + val2;
149}
150
151static u64 hist_field_minus(struct hist_field *hist_field,
152 struct tracing_map_elt *elt,
153 struct ring_buffer_event *rbe,
154 void *event)
155{
156 struct hist_field *operand1 = hist_field->operands[0];
157 struct hist_field *operand2 = hist_field->operands[1];
158
159 u64 val1 = operand1->fn(operand1, elt, rbe, event);
160 u64 val2 = operand2->fn(operand2, elt, rbe, event);
161
162 return val1 - val2;
163}
164
165static u64 hist_field_unary_minus(struct hist_field *hist_field,
166 struct tracing_map_elt *elt,
167 struct ring_buffer_event *rbe,
168 void *event)
169{
170 struct hist_field *operand = hist_field->operands[0];
171
172 s64 sval = (s64)operand->fn(operand, elt, rbe, event);
173 u64 val = (u64)-sval;
174
175 return val;
176}
177
85#define DEFINE_HIST_FIELD_FN(type) \ 178#define DEFINE_HIST_FIELD_FN(type) \
86static u64 hist_field_##type(struct hist_field *hist_field, void *event)\ 179 static u64 hist_field_##type(struct hist_field *hist_field, \
180 struct tracing_map_elt *elt, \
181 struct ring_buffer_event *rbe, \
182 void *event) \
87{ \ 183{ \
88 type *addr = (type *)(event + hist_field->field->offset); \ 184 type *addr = (type *)(event + hist_field->field->offset); \
89 \ 185 \
@@ -126,6 +222,19 @@ enum hist_field_flags {
126 HIST_FIELD_FL_SYSCALL = 1 << 7, 222 HIST_FIELD_FL_SYSCALL = 1 << 7,
127 HIST_FIELD_FL_STACKTRACE = 1 << 8, 223 HIST_FIELD_FL_STACKTRACE = 1 << 8,
128 HIST_FIELD_FL_LOG2 = 1 << 9, 224 HIST_FIELD_FL_LOG2 = 1 << 9,
225 HIST_FIELD_FL_TIMESTAMP = 1 << 10,
226 HIST_FIELD_FL_TIMESTAMP_USECS = 1 << 11,
227 HIST_FIELD_FL_VAR = 1 << 12,
228 HIST_FIELD_FL_EXPR = 1 << 13,
229 HIST_FIELD_FL_VAR_REF = 1 << 14,
230 HIST_FIELD_FL_CPU = 1 << 15,
231 HIST_FIELD_FL_ALIAS = 1 << 16,
232};
233
234struct var_defs {
235 unsigned int n_vars;
236 char *name[TRACING_MAP_VARS_MAX];
237 char *expr[TRACING_MAP_VARS_MAX];
129}; 238};
130 239
131struct hist_trigger_attrs { 240struct hist_trigger_attrs {
@@ -133,25 +242,1437 @@ struct hist_trigger_attrs {
133 char *vals_str; 242 char *vals_str;
134 char *sort_key_str; 243 char *sort_key_str;
135 char *name; 244 char *name;
245 char *clock;
136 bool pause; 246 bool pause;
137 bool cont; 247 bool cont;
138 bool clear; 248 bool clear;
249 bool ts_in_usecs;
139 unsigned int map_bits; 250 unsigned int map_bits;
251
252 char *assignment_str[TRACING_MAP_VARS_MAX];
253 unsigned int n_assignments;
254
255 char *action_str[HIST_ACTIONS_MAX];
256 unsigned int n_actions;
257
258 struct var_defs var_defs;
259};
260
261struct field_var {
262 struct hist_field *var;
263 struct hist_field *val;
264};
265
266struct field_var_hist {
267 struct hist_trigger_data *hist_data;
268 char *cmd;
140}; 269};
141 270
142struct hist_trigger_data { 271struct hist_trigger_data {
143 struct hist_field *fields[TRACING_MAP_FIELDS_MAX]; 272 struct hist_field *fields[HIST_FIELDS_MAX];
144 unsigned int n_vals; 273 unsigned int n_vals;
145 unsigned int n_keys; 274 unsigned int n_keys;
146 unsigned int n_fields; 275 unsigned int n_fields;
276 unsigned int n_vars;
147 unsigned int key_size; 277 unsigned int key_size;
148 struct tracing_map_sort_key sort_keys[TRACING_MAP_SORT_KEYS_MAX]; 278 struct tracing_map_sort_key sort_keys[TRACING_MAP_SORT_KEYS_MAX];
149 unsigned int n_sort_keys; 279 unsigned int n_sort_keys;
150 struct trace_event_file *event_file; 280 struct trace_event_file *event_file;
151 struct hist_trigger_attrs *attrs; 281 struct hist_trigger_attrs *attrs;
152 struct tracing_map *map; 282 struct tracing_map *map;
283 bool enable_timestamps;
284 bool remove;
285 struct hist_field *var_refs[TRACING_MAP_VARS_MAX];
286 unsigned int n_var_refs;
287
288 struct action_data *actions[HIST_ACTIONS_MAX];
289 unsigned int n_actions;
290
291 struct hist_field *synth_var_refs[SYNTH_FIELDS_MAX];
292 unsigned int n_synth_var_refs;
293 struct field_var *field_vars[SYNTH_FIELDS_MAX];
294 unsigned int n_field_vars;
295 unsigned int n_field_var_str;
296 struct field_var_hist *field_var_hists[SYNTH_FIELDS_MAX];
297 unsigned int n_field_var_hists;
298
299 struct field_var *max_vars[SYNTH_FIELDS_MAX];
300 unsigned int n_max_vars;
301 unsigned int n_max_var_str;
302};
303
304struct synth_field {
305 char *type;
306 char *name;
307 size_t size;
308 bool is_signed;
309 bool is_string;
310};
311
312struct synth_event {
313 struct list_head list;
314 int ref;
315 char *name;
316 struct synth_field **fields;
317 unsigned int n_fields;
318 unsigned int n_u64;
319 struct trace_event_class class;
320 struct trace_event_call call;
321 struct tracepoint *tp;
322};
323
324struct action_data;
325
326typedef void (*action_fn_t) (struct hist_trigger_data *hist_data,
327 struct tracing_map_elt *elt, void *rec,
328 struct ring_buffer_event *rbe,
329 struct action_data *data, u64 *var_ref_vals);
330
331struct action_data {
332 action_fn_t fn;
333 unsigned int n_params;
334 char *params[SYNTH_FIELDS_MAX];
335
336 union {
337 struct {
338 unsigned int var_ref_idx;
339 char *match_event;
340 char *match_event_system;
341 char *synth_event_name;
342 struct synth_event *synth_event;
343 } onmatch;
344
345 struct {
346 char *var_str;
347 char *fn_name;
348 unsigned int max_var_ref_idx;
349 struct hist_field *max_var;
350 struct hist_field *var;
351 } onmax;
352 };
353};
354
355
356static char last_hist_cmd[MAX_FILTER_STR_VAL];
357static char hist_err_str[MAX_FILTER_STR_VAL];
358
359static void last_cmd_set(char *str)
360{
361 if (!str)
362 return;
363
364 strncpy(last_hist_cmd, str, MAX_FILTER_STR_VAL - 1);
365}
366
367static void hist_err(char *str, char *var)
368{
369 int maxlen = MAX_FILTER_STR_VAL - 1;
370
371 if (!str)
372 return;
373
374 if (strlen(hist_err_str))
375 return;
376
377 if (!var)
378 var = "";
379
380 if (strlen(hist_err_str) + strlen(str) + strlen(var) > maxlen)
381 return;
382
383 strcat(hist_err_str, str);
384 strcat(hist_err_str, var);
385}
386
387static void hist_err_event(char *str, char *system, char *event, char *var)
388{
389 char err[MAX_FILTER_STR_VAL];
390
391 if (system && var)
392 snprintf(err, MAX_FILTER_STR_VAL, "%s.%s.%s", system, event, var);
393 else if (system)
394 snprintf(err, MAX_FILTER_STR_VAL, "%s.%s", system, event);
395 else
396 strncpy(err, var, MAX_FILTER_STR_VAL);
397
398 hist_err(str, err);
399}
400
401static void hist_err_clear(void)
402{
403 hist_err_str[0] = '\0';
404}
405
406static bool have_hist_err(void)
407{
408 if (strlen(hist_err_str))
409 return true;
410
411 return false;
412}
413
414static LIST_HEAD(synth_event_list);
415static DEFINE_MUTEX(synth_event_mutex);
416
417struct synth_trace_event {
418 struct trace_entry ent;
419 u64 fields[];
420};
421
422static int synth_event_define_fields(struct trace_event_call *call)
423{
424 struct synth_trace_event trace;
425 int offset = offsetof(typeof(trace), fields);
426 struct synth_event *event = call->data;
427 unsigned int i, size, n_u64;
428 char *name, *type;
429 bool is_signed;
430 int ret = 0;
431
432 for (i = 0, n_u64 = 0; i < event->n_fields; i++) {
433 size = event->fields[i]->size;
434 is_signed = event->fields[i]->is_signed;
435 type = event->fields[i]->type;
436 name = event->fields[i]->name;
437 ret = trace_define_field(call, type, name, offset, size,
438 is_signed, FILTER_OTHER);
439 if (ret)
440 break;
441
442 if (event->fields[i]->is_string) {
443 offset += STR_VAR_LEN_MAX;
444 n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
445 } else {
446 offset += sizeof(u64);
447 n_u64++;
448 }
449 }
450
451 event->n_u64 = n_u64;
452
453 return ret;
454}
455
/*
 * A synthetic field type is treated as unsigned iff its name starts
 * with 'u' (u8/u16/u32/u64, "unsigned ..."); everything else is
 * considered signed.
 */
static bool synth_field_signed(char *type)
{
	return type[0] != 'u';
}
463
/* A field is a string iff its type contains a "char[" array spec. */
static int synth_field_is_string(char *type)
{
	return strstr(type, "char[") != NULL;
}
471
472static int synth_field_string_size(char *type)
473{
474 char buf[4], *end, *start;
475 unsigned int len;
476 int size, err;
477
478 start = strstr(type, "char[");
479 if (start == NULL)
480 return -EINVAL;
481 start += strlen("char[");
482
483 end = strchr(type, ']');
484 if (!end || end < start)
485 return -EINVAL;
486
487 len = end - start;
488 if (len > 3)
489 return -EINVAL;
490
491 strncpy(buf, start, len);
492 buf[len] = '\0';
493
494 err = kstrtouint(buf, 0, &size);
495 if (err)
496 return err;
497
498 if (size > STR_VAR_LEN_MAX)
499 return -EINVAL;
500
501 return size;
502}
503
504static int synth_field_size(char *type)
505{
506 int size = 0;
507
508 if (strcmp(type, "s64") == 0)
509 size = sizeof(s64);
510 else if (strcmp(type, "u64") == 0)
511 size = sizeof(u64);
512 else if (strcmp(type, "s32") == 0)
513 size = sizeof(s32);
514 else if (strcmp(type, "u32") == 0)
515 size = sizeof(u32);
516 else if (strcmp(type, "s16") == 0)
517 size = sizeof(s16);
518 else if (strcmp(type, "u16") == 0)
519 size = sizeof(u16);
520 else if (strcmp(type, "s8") == 0)
521 size = sizeof(s8);
522 else if (strcmp(type, "u8") == 0)
523 size = sizeof(u8);
524 else if (strcmp(type, "char") == 0)
525 size = sizeof(char);
526 else if (strcmp(type, "unsigned char") == 0)
527 size = sizeof(unsigned char);
528 else if (strcmp(type, "int") == 0)
529 size = sizeof(int);
530 else if (strcmp(type, "unsigned int") == 0)
531 size = sizeof(unsigned int);
532 else if (strcmp(type, "long") == 0)
533 size = sizeof(long);
534 else if (strcmp(type, "unsigned long") == 0)
535 size = sizeof(unsigned long);
536 else if (strcmp(type, "pid_t") == 0)
537 size = sizeof(pid_t);
538 else if (synth_field_is_string(type))
539 size = synth_field_string_size(type);
540
541 return size;
542}
543
/*
 * Map a synthetic event field type name to the printf conversion used
 * to display its value.  Unknown scalar types default to "%llu";
 * string types use "%s".
 */
static const char *synth_field_fmt(char *type)
{
	static const struct {
		const char *name;
		const char *fmt;
	} fmt_table[] = {
		{ "s64",		"%lld"	},
		{ "u64",		"%llu"	},
		{ "s32",		"%d"	},
		{ "u32",		"%u"	},
		{ "s16",		"%d"	},
		{ "u16",		"%u"	},
		{ "s8",			"%d"	},
		{ "u8",			"%u"	},
		{ "char",		"%d"	},
		{ "unsigned char",	"%u"	},
		{ "int",		"%d"	},
		{ "unsigned int",	"%u"	},
		{ "long",		"%ld"	},
		{ "unsigned long",	"%lu"	},
		{ "pid_t",		"%d"	},
	};
	unsigned int i;

	for (i = 0; i < ARRAY_SIZE(fmt_table); i++) {
		if (strcmp(type, fmt_table[i].name) == 0)
			return fmt_table[i].fmt;
	}

	if (synth_field_is_string(type))
		return "%s";

	return "%llu";
}
583
/*
 * Trace output callback for synthetic events: render one entry as
 * "event_name: field=val field=val ...".  In verbose mode each value
 * is preceded by its printf conversion string.
 */
static enum print_line_t print_synth_event(struct trace_iterator *iter,
					   int flags,
					   struct trace_event *event)
{
	struct trace_array *tr = iter->tr;
	struct trace_seq *s = &iter->seq;
	struct synth_trace_event *entry;
	struct synth_event *se;
	unsigned int i, n_u64;
	char print_fmt[32];
	const char *fmt;

	entry = (struct synth_trace_event *)iter->ent;
	se = container_of(event, struct synth_event, call.event);

	trace_seq_printf(s, "%s: ", se->name);

	for (i = 0, n_u64 = 0; i < se->n_fields; i++) {
		if (trace_seq_has_overflowed(s))
			goto end;

		fmt = synth_field_fmt(se->fields[i]->type);

		/* parameter types */
		if (tr->trace_flags & TRACE_ITER_VERBOSE)
			trace_seq_printf(s, "%s ", fmt);

		/* build "name=<fmt><separator>" for this field */
		snprintf(print_fmt, sizeof(print_fmt), "%%s=%s%%s", fmt);

		/* parameter values */
		if (se->fields[i]->is_string) {
			/* string data is inlined in the u64 payload array */
			trace_seq_printf(s, print_fmt, se->fields[i]->name,
					 (char *)&entry->fields[n_u64],
					 i == se->n_fields - 1 ? "" : " ");
			n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
		} else {
			trace_seq_printf(s, print_fmt, se->fields[i]->name,
					 entry->fields[n_u64],
					 i == se->n_fields - 1 ? "" : " ");
			n_u64++;
		}
	}
end:
	trace_seq_putc(s, '\n');

	return trace_handle_return(s);
}
631
/* Output callbacks for synthetic events; only .trace (text) is set. */
static struct trace_event_functions synth_event_funcs = {
	.trace		= print_synth_event
};
635
/*
 * Probe invoked when a synthetic event fires: reserve a ring buffer
 * entry and copy the resolved variable values (starting at
 * var_ref_vals[var_ref_idx]) into it as the event's fields.  Strings
 * take STR_VAR_LEN_MAX bytes; other values take one u64 each.
 */
static notrace void trace_event_raw_event_synth(void *__data,
						u64 *var_ref_vals,
						unsigned int var_ref_idx)
{
	struct trace_event_file *trace_file = __data;
	struct synth_trace_event *entry;
	struct trace_event_buffer fbuffer;
	struct ring_buffer *buffer;
	struct synth_event *event;
	unsigned int i, n_u64;
	int fields_size = 0;

	event = trace_file->event_call->data;

	if (trace_trigger_soft_disabled(trace_file))
		return;

	fields_size = event->n_u64 * sizeof(u64);

	/*
	 * Avoid ring buffer recursion detection, as this event
	 * is being performed within another event.
	 */
	buffer = trace_file->tr->trace_buffer.buffer;
	ring_buffer_nest_start(buffer);

	entry = trace_event_buffer_reserve(&fbuffer, trace_file,
					   sizeof(*entry) + fields_size);
	if (!entry)
		goto out;

	for (i = 0, n_u64 = 0; i < event->n_fields; i++) {
		if (event->fields[i]->is_string) {
			/* string variable values are passed as pointers */
			char *str_val = (char *)(long)var_ref_vals[var_ref_idx + i];
			char *str_field = (char *)&entry->fields[n_u64];

			/* strscpy() always NUL-terminates the copy */
			strscpy(str_field, str_val, STR_VAR_LEN_MAX);
			n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
		} else {
			entry->fields[n_u64] = var_ref_vals[var_ref_idx + i];
			n_u64++;
		}
	}

	trace_event_buffer_commit(&fbuffer);
out:
	ring_buffer_nest_end(buffer);
}
684
685static void free_synth_event_print_fmt(struct trace_event_call *call)
686{
687 if (call) {
688 kfree(call->print_fmt);
689 call->print_fmt = NULL;
690 }
691}
692
/*
 * Render the print_fmt string for a synthetic event into @buf: a
 * quoted format ("field=%fmt, ...") followed by ", REC->field" args.
 * With len == 0 nothing is written and only the required length is
 * returned, allowing a measure-then-render two-pass use.
 */
static int __set_synth_event_print_fmt(struct synth_event *event,
				       char *buf, int len)
{
	const char *fmt;
	int pos = 0;
	int i;

	/* When len=0, we just calculate the needed length */
#define LEN_OR_ZERO (len ? len - pos : 0)

	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
	for (i = 0; i < event->n_fields; i++) {
		fmt = synth_field_fmt(event->fields[i]->type);
		pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s%s",
				event->fields[i]->name, fmt,
				i == event->n_fields - 1 ? "" : ", ");
	}
	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");

	for (i = 0; i < event->n_fields; i++) {
		pos += snprintf(buf + pos, LEN_OR_ZERO,
				", REC->%s", event->fields[i]->name);
	}

#undef LEN_OR_ZERO

	/* return the length of print_fmt */
	return pos;
}
722
723static int set_synth_event_print_fmt(struct trace_event_call *call)
724{
725 struct synth_event *event = call->data;
726 char *print_fmt;
727 int len;
728
729 /* First: called with 0 length to calculate the needed length */
730 len = __set_synth_event_print_fmt(event, NULL, 0);
731
732 print_fmt = kmalloc(len + 1, GFP_KERNEL);
733 if (!print_fmt)
734 return -ENOMEM;
735
736 /* Second: actually write the @print_fmt */
737 __set_synth_event_print_fmt(event, print_fmt, len + 1);
738 call->print_fmt = print_fmt;
739
740 return 0;
741}
742
/* Free a synth_field along with its owned type and name strings. */
static void free_synth_field(struct synth_field *field)
{
	kfree(field->type);
	kfree(field->name);
	kfree(field);
}
749
750static struct synth_field *parse_synth_field(char *field_type,
751 char *field_name)
752{
753 struct synth_field *field;
754 int len, ret = 0;
755 char *array;
756
757 if (field_type[0] == ';')
758 field_type++;
759
760 len = strlen(field_name);
761 if (field_name[len - 1] == ';')
762 field_name[len - 1] = '\0';
763
764 field = kzalloc(sizeof(*field), GFP_KERNEL);
765 if (!field)
766 return ERR_PTR(-ENOMEM);
767
768 len = strlen(field_type) + 1;
769 array = strchr(field_name, '[');
770 if (array)
771 len += strlen(array);
772 field->type = kzalloc(len, GFP_KERNEL);
773 if (!field->type) {
774 ret = -ENOMEM;
775 goto free;
776 }
777 strcat(field->type, field_type);
778 if (array) {
779 strcat(field->type, array);
780 *array = '\0';
781 }
782
783 field->size = synth_field_size(field->type);
784 if (!field->size) {
785 ret = -EINVAL;
786 goto free;
787 }
788
789 if (synth_field_is_string(field->type))
790 field->is_string = true;
791
792 field->is_signed = synth_field_signed(field->type);
793
794 field->name = kstrdup(field_name, GFP_KERNEL);
795 if (!field->name) {
796 ret = -ENOMEM;
797 goto free;
798 }
799 out:
800 return field;
801 free:
802 free_synth_field(field);
803 field = ERR_PTR(ret);
804 goto out;
805}
806
807static void free_synth_tracepoint(struct tracepoint *tp)
808{
809 if (!tp)
810 return;
811
812 kfree(tp->name);
813 kfree(tp);
814}
815
816static struct tracepoint *alloc_synth_tracepoint(char *name)
817{
818 struct tracepoint *tp;
819
820 tp = kzalloc(sizeof(*tp), GFP_KERNEL);
821 if (!tp)
822 return ERR_PTR(-ENOMEM);
823
824 tp->name = kstrdup(name, GFP_KERNEL);
825 if (!tp->name) {
826 kfree(tp);
827 return ERR_PTR(-ENOMEM);
828 }
829
830 return tp;
831}
832
833typedef void (*synth_probe_func_t) (void *__data, u64 *var_ref_vals,
834 unsigned int var_ref_idx);
835
/*
 * Manually fire a synthetic event's tracepoint, calling each
 * registered probe with the resolved variable values.  This is an
 * open-coded equivalent of what a tracepoint macro would generate;
 * the probe list is fetched with rcu_dereference_sched(), so the
 * caller is expected to be in an rcu-sched read-side section.
 */
static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals,
			       unsigned int var_ref_idx)
{
	struct tracepoint *tp = event->tp;

	/* fast path: nothing to do unless the tracepoint is enabled */
	if (unlikely(atomic_read(&tp->key.enabled) > 0)) {
		struct tracepoint_func *probe_func_ptr;
		synth_probe_func_t probe_func;
		void *__data;

		if (!(cpu_online(raw_smp_processor_id())))
			return;

		probe_func_ptr = rcu_dereference_sched((tp)->funcs);
		if (probe_func_ptr) {
			/* probe array is terminated by a NULL func */
			do {
				probe_func = probe_func_ptr->func;
				__data = probe_func_ptr->data;
				probe_func(__data, var_ref_vals, var_ref_idx);
			} while ((++probe_func_ptr)->func);
		}
	}
}
859
860static struct synth_event *find_synth_event(const char *name)
861{
862 struct synth_event *event;
863
864 list_for_each_entry(event, &synth_event_list, list) {
865 if (strcmp(event->name, name) == 0)
866 return event;
867 }
868
869 return NULL;
870}
871
/*
 * Hook a synthetic event into the trace event subsystem: set up its
 * class, allocate its tracepoint, register the trace event type and
 * add the event call.  Partial allocations left behind on the error
 * paths are released by the caller via free_synth_event().
 */
static int register_synth_event(struct synth_event *event)
{
	struct trace_event_call *call = &event->call;
	int ret = 0;

	event->call.class = &event->class;
	event->class.system = kstrdup(SYNTH_SYSTEM, GFP_KERNEL);
	if (!event->class.system) {
		ret = -ENOMEM;
		goto out;
	}

	event->tp = alloc_synth_tracepoint(event->name);
	if (IS_ERR(event->tp)) {
		ret = PTR_ERR(event->tp);
		event->tp = NULL;
		goto out;
	}

	INIT_LIST_HEAD(&call->class->fields);
	call->event.funcs = &synth_event_funcs;
	call->class->define_fields = synth_event_define_fields;

	/* register_trace_event() returns the new type id; 0 means failure */
	ret = register_trace_event(&call->event);
	if (!ret) {
		ret = -ENODEV;
		goto out;
	}
	call->flags = TRACE_EVENT_FL_TRACEPOINT;
	call->class->reg = trace_event_reg;
	call->class->probe = trace_event_raw_event_synth;
	call->data = event;
	call->tp = event->tp;

	ret = trace_add_event_call(call);
	if (ret) {
		pr_warn("Failed to register synthetic event: %s\n",
			trace_event_name(call));
		goto err;
	}

	ret = set_synth_event_print_fmt(call);
	if (ret < 0) {
		trace_remove_event_call(call);
		goto err;
	}
 out:
	return ret;
 err:
	/* undo register_trace_event() before returning the error */
	unregister_trace_event(&call->event);
	goto out;
}
924
925static int unregister_synth_event(struct synth_event *event)
926{
927 struct trace_event_call *call = &event->call;
928 int ret;
929
930 ret = trace_remove_event_call(call);
931
932 return ret;
933}
934
935static void free_synth_event(struct synth_event *event)
936{
937 unsigned int i;
938
939 if (!event)
940 return;
941
942 for (i = 0; i < event->n_fields; i++)
943 free_synth_field(event->fields[i]);
944
945 kfree(event->fields);
946 kfree(event->name);
947 kfree(event->class.system);
948 free_synth_tracepoint(event->tp);
949 free_synth_event_print_fmt(&event->call);
950 kfree(event);
951}
952
953static struct synth_event *alloc_synth_event(char *event_name, int n_fields,
954 struct synth_field **fields)
955{
956 struct synth_event *event;
957 unsigned int i;
958
959 event = kzalloc(sizeof(*event), GFP_KERNEL);
960 if (!event) {
961 event = ERR_PTR(-ENOMEM);
962 goto out;
963 }
964
965 event->name = kstrdup(event_name, GFP_KERNEL);
966 if (!event->name) {
967 kfree(event);
968 event = ERR_PTR(-ENOMEM);
969 goto out;
970 }
971
972 event->fields = kcalloc(n_fields, sizeof(*event->fields), GFP_KERNEL);
973 if (!event->fields) {
974 free_synth_event(event);
975 event = ERR_PTR(-ENOMEM);
976 goto out;
977 }
978
979 for (i = 0; i < n_fields; i++)
980 event->fields[i] = fields[i];
981
982 event->n_fields = n_fields;
983 out:
984 return event;
985}
986
/*
 * onmatch() action handler: fire the associated synthetic event with
 * the resolved variable values starting at the action's var_ref_idx.
 */
static void action_trace(struct hist_trigger_data *hist_data,
			 struct tracing_map_elt *elt, void *rec,
			 struct ring_buffer_event *rbe,
			 struct action_data *data, u64 *var_ref_vals)
{
	struct synth_event *event = data->onmatch.synth_event;

	trace_synth(event, var_ref_vals, data->onmatch.var_ref_idx);
}
996
/*
 * Entry on trace_array->hist_vars tracking one hist trigger whose
 * variables can be referenced by other triggers.
 */
struct hist_var_data {
	struct list_head list;
	struct hist_trigger_data *hist_data;
};
1001
1002static void add_or_delete_synth_event(struct synth_event *event, int delete)
1003{
1004 if (delete)
1005 free_synth_event(event);
1006 else {
1007 mutex_lock(&synth_event_mutex);
1008 if (!find_synth_event(event->name))
1009 list_add(&event->list, &synth_event_list);
1010 else
1011 free_synth_event(event);
1012 mutex_unlock(&synth_event_mutex);
1013 }
1014}
1015
/*
 * Parse and execute one 'synthetic_events' command.  The lookup and
 * list manipulation run under synth_event_mutex; the mutex is dropped
 * before register/unregister_synth_event(), which take other locks,
 * and add_or_delete_synth_event() finishes the list update afterward.
 */
static int create_synth_event(int argc, char **argv)
{
	struct synth_field *field, *fields[SYNTH_FIELDS_MAX];
	struct synth_event *event = NULL;
	bool delete_event = false;
	int i, n_fields = 0, ret = 0;
	char *name;

	mutex_lock(&synth_event_mutex);

	/*
	 * Argument syntax:
	 * - Add synthetic event: <event_name> field[;field] ...
	 * - Remove synthetic event: !<event_name> field[;field] ...
	 * where 'field' = type field_name
	 */
	if (argc < 1) {
		ret = -EINVAL;
		goto out;
	}

	name = argv[0];
	if (name[0] == '!') {
		delete_event = true;
		name++;
	}

	event = find_synth_event(name);
	if (event) {
		if (delete_event) {
			/* still referenced by a hist trigger: refuse */
			if (event->ref) {
				event = NULL;
				ret = -EBUSY;
				goto out;
			}
			list_del(&event->list);
			goto out;
		}
		event = NULL;
		ret = -EEXIST;
		goto out;
	} else if (delete_event)
		goto out;

	if (argc < 2) {
		ret = -EINVAL;
		goto out;
	}

	/* parse "type name" pairs, skipping bare ";" separators */
	for (i = 1; i < argc - 1; i++) {
		if (strcmp(argv[i], ";") == 0)
			continue;
		if (n_fields == SYNTH_FIELDS_MAX) {
			ret = -EINVAL;
			goto err;
		}

		field = parse_synth_field(argv[i], argv[i + 1]);
		if (IS_ERR(field)) {
			ret = PTR_ERR(field);
			goto err;
		}
		fields[n_fields] = field;
		i++; n_fields++;
	}

	/* a dangling type with no field name is malformed */
	if (i < argc) {
		ret = -EINVAL;
		goto err;
	}

	event = alloc_synth_event(name, n_fields, fields);
	if (IS_ERR(event)) {
		ret = PTR_ERR(event);
		event = NULL;
		goto err;
	}
 out:
	mutex_unlock(&synth_event_mutex);

	/* register/unregister outside the mutex */
	if (event) {
		if (delete_event) {
			ret = unregister_synth_event(event);
			add_or_delete_synth_event(event, !ret);
		} else {
			ret = register_synth_event(event);
			add_or_delete_synth_event(event, ret);
		}
	}

	return ret;
 err:
	mutex_unlock(&synth_event_mutex);

	for (i = 0; i < n_fields; i++)
		free_synth_field(fields[i]);
	free_synth_event(event);

	return ret;
}
1116
1117static int release_all_synth_events(void)
1118{
1119 struct list_head release_events;
1120 struct synth_event *event, *e;
1121 int ret = 0;
1122
1123 INIT_LIST_HEAD(&release_events);
1124
1125 mutex_lock(&synth_event_mutex);
1126
1127 list_for_each_entry(event, &synth_event_list, list) {
1128 if (event->ref) {
1129 mutex_unlock(&synth_event_mutex);
1130 return -EBUSY;
1131 }
1132 }
1133
1134 list_splice_init(&event->list, &release_events);
1135
1136 mutex_unlock(&synth_event_mutex);
1137
1138 list_for_each_entry_safe(event, e, &release_events, list) {
1139 list_del(&event->list);
1140
1141 ret = unregister_synth_event(event);
1142 add_or_delete_synth_event(event, !ret);
1143 }
1144
1145 return ret;
1146}
1147
1148
/* seq_file start: takes synth_event_mutex, released in ->stop(). */
static void *synth_events_seq_start(struct seq_file *m, loff_t *pos)
{
	mutex_lock(&synth_event_mutex);

	return seq_list_start(&synth_event_list, *pos);
}
1155
/* seq_file next: advance to the next registered synthetic event. */
static void *synth_events_seq_next(struct seq_file *m, void *v, loff_t *pos)
{
	return seq_list_next(v, &synth_event_list, pos);
}
1160
/* seq_file stop: drop the mutex taken in ->start(). */
static void synth_events_seq_stop(struct seq_file *m, void *v)
{
	mutex_unlock(&synth_event_mutex);
}
1165
1166static int synth_events_seq_show(struct seq_file *m, void *v)
1167{
1168 struct synth_field *field;
1169 struct synth_event *event = v;
1170 unsigned int i;
1171
1172 seq_printf(m, "%s\t", event->name);
1173
1174 for (i = 0; i < event->n_fields; i++) {
1175 field = event->fields[i];
1176
1177 /* parameter values */
1178 seq_printf(m, "%s %s%s", field->type, field->name,
1179 i == event->n_fields - 1 ? "" : "; ");
1180 }
1181
1182 seq_putc(m, '\n');
1183
1184 return 0;
1185}
1186
/* seq_file iteration over the registered synthetic events. */
static const struct seq_operations synth_events_seq_op = {
	.start  = synth_events_seq_start,
	.next   = synth_events_seq_next,
	.stop   = synth_events_seq_stop,
	.show   = synth_events_seq_show
};
1193
1194static int synth_events_open(struct inode *inode, struct file *file)
1195{
1196 int ret;
1197
1198 if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
1199 ret = release_all_synth_events();
1200 if (ret < 0)
1201 return ret;
1202 }
1203
1204 return seq_open(file, &synth_events_seq_op);
1205}
1206
/* Write handler: feed each command line to create_synth_event(). */
static ssize_t synth_events_write(struct file *file,
				  const char __user *buffer,
				  size_t count, loff_t *ppos)
{
	return trace_parse_run_command(file, buffer, count, ppos,
				       create_synth_event);
}
1214
/* File operations for the 'synthetic_events' tracefs file. */
static const struct file_operations synth_events_fops = {
	.open           = synth_events_open,
	.write		= synth_events_write,
	.read           = seq_read,
	.llseek         = seq_lseek,
	.release        = seq_release,
};
1222
1223static u64 hist_field_timestamp(struct hist_field *hist_field,
1224 struct tracing_map_elt *elt,
1225 struct ring_buffer_event *rbe,
1226 void *event)
1227{
1228 struct hist_trigger_data *hist_data = hist_field->hist_data;
1229 struct trace_array *tr = hist_data->event_file->tr;
1230
1231 u64 ts = ring_buffer_event_time_stamp(rbe);
1232
1233 if (hist_data->attrs->ts_in_usecs && trace_clock_in_ns(tr))
1234 ts = ns2usecs(ts);
1235
1236 return ts;
1237}
1238
1239static u64 hist_field_cpu(struct hist_field *hist_field,
1240 struct tracing_map_elt *elt,
1241 struct ring_buffer_event *rbe,
1242 void *event)
1243{
1244 int cpu = smp_processor_id();
1245
1246 return cpu;
1247}
1248
1249static struct hist_field *
1250check_field_for_var_ref(struct hist_field *hist_field,
1251 struct hist_trigger_data *var_data,
1252 unsigned int var_idx)
1253{
1254 struct hist_field *found = NULL;
1255
1256 if (hist_field && hist_field->flags & HIST_FIELD_FL_VAR_REF) {
1257 if (hist_field->var.idx == var_idx &&
1258 hist_field->var.hist_data == var_data) {
1259 found = hist_field;
1260 }
1261 }
1262
1263 return found;
1264}
1265
1266static struct hist_field *
1267check_field_for_var_refs(struct hist_trigger_data *hist_data,
1268 struct hist_field *hist_field,
1269 struct hist_trigger_data *var_data,
1270 unsigned int var_idx,
1271 unsigned int level)
1272{
1273 struct hist_field *found = NULL;
1274 unsigned int i;
1275
1276 if (level > 3)
1277 return found;
1278
1279 if (!hist_field)
1280 return found;
1281
1282 found = check_field_for_var_ref(hist_field, var_data, var_idx);
1283 if (found)
1284 return found;
1285
1286 for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++) {
1287 struct hist_field *operand;
1288
1289 operand = hist_field->operands[i];
1290 found = check_field_for_var_refs(hist_data, operand, var_data,
1291 var_idx, level + 1);
1292 if (found)
1293 return found;
1294 }
1295
1296 return found;
1297}
1298
1299static struct hist_field *find_var_ref(struct hist_trigger_data *hist_data,
1300 struct hist_trigger_data *var_data,
1301 unsigned int var_idx)
1302{
1303 struct hist_field *hist_field, *found = NULL;
1304 unsigned int i;
1305
1306 for_each_hist_field(i, hist_data) {
1307 hist_field = hist_data->fields[i];
1308 found = check_field_for_var_refs(hist_data, hist_field,
1309 var_data, var_idx, 0);
1310 if (found)
1311 return found;
1312 }
1313
1314 for (i = 0; i < hist_data->n_synth_var_refs; i++) {
1315 hist_field = hist_data->synth_var_refs[i];
1316 found = check_field_for_var_refs(hist_data, hist_field,
1317 var_data, var_idx, 0);
1318 if (found)
1319 return found;
1320 }
1321
1322 return found;
1323}
1324
1325static struct hist_field *find_any_var_ref(struct hist_trigger_data *hist_data,
1326 unsigned int var_idx)
1327{
1328 struct trace_array *tr = hist_data->event_file->tr;
1329 struct hist_field *found = NULL;
1330 struct hist_var_data *var_data;
1331
1332 list_for_each_entry(var_data, &tr->hist_vars, list) {
1333 if (var_data->hist_data == hist_data)
1334 continue;
1335 found = find_var_ref(var_data->hist_data, hist_data, var_idx);
1336 if (found)
1337 break;
1338 }
1339
1340 return found;
1341}
1342
1343static bool check_var_refs(struct hist_trigger_data *hist_data)
1344{
1345 struct hist_field *field;
1346 bool found = false;
1347 int i;
1348
1349 for_each_hist_field(i, hist_data) {
1350 field = hist_data->fields[i];
1351 if (field && field->flags & HIST_FIELD_FL_VAR) {
1352 if (find_any_var_ref(hist_data, field->var.idx)) {
1353 found = true;
1354 break;
1355 }
1356 }
1357 }
1358
1359 return found;
1360}
1361
1362static struct hist_var_data *find_hist_vars(struct hist_trigger_data *hist_data)
1363{
1364 struct trace_array *tr = hist_data->event_file->tr;
1365 struct hist_var_data *var_data, *found = NULL;
1366
1367 list_for_each_entry(var_data, &tr->hist_vars, list) {
1368 if (var_data->hist_data == hist_data) {
1369 found = var_data;
1370 break;
1371 }
1372 }
1373
1374 return found;
1375}
1376
1377static bool field_has_hist_vars(struct hist_field *hist_field,
1378 unsigned int level)
1379{
1380 int i;
1381
1382 if (level > 3)
1383 return false;
1384
1385 if (!hist_field)
1386 return false;
1387
1388 if (hist_field->flags & HIST_FIELD_FL_VAR ||
1389 hist_field->flags & HIST_FIELD_FL_VAR_REF)
1390 return true;
1391
1392 for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++) {
1393 struct hist_field *operand;
1394
1395 operand = hist_field->operands[i];
1396 if (field_has_hist_vars(operand, level + 1))
1397 return true;
1398 }
1399
1400 return false;
1401}
1402
1403static bool has_hist_vars(struct hist_trigger_data *hist_data)
1404{
1405 struct hist_field *hist_field;
1406 int i;
1407
1408 for_each_hist_field(i, hist_data) {
1409 hist_field = hist_data->fields[i];
1410 if (field_has_hist_vars(hist_field, 0))
1411 return true;
1412 }
1413
1414 return false;
1415}
1416
1417static int save_hist_vars(struct hist_trigger_data *hist_data)
1418{
1419 struct trace_array *tr = hist_data->event_file->tr;
1420 struct hist_var_data *var_data;
1421
1422 var_data = find_hist_vars(hist_data);
1423 if (var_data)
1424 return 0;
1425
1426 if (trace_array_get(tr) < 0)
1427 return -ENODEV;
1428
1429 var_data = kzalloc(sizeof(*var_data), GFP_KERNEL);
1430 if (!var_data) {
1431 trace_array_put(tr);
1432 return -ENOMEM;
1433 }
1434
1435 var_data->hist_data = hist_data;
1436 list_add(&var_data->list, &tr->hist_vars);
1437
1438 return 0;
1439}
1440
1441static void remove_hist_vars(struct hist_trigger_data *hist_data)
1442{
1443 struct trace_array *tr = hist_data->event_file->tr;
1444 struct hist_var_data *var_data;
1445
1446 var_data = find_hist_vars(hist_data);
1447 if (!var_data)
1448 return;
1449
1450 if (WARN_ON(check_var_refs(hist_data)))
1451 return;
1452
1453 list_del(&var_data->list);
1454
1455 kfree(var_data);
1456
1457 trace_array_put(tr);
1458}
1459
1460static struct hist_field *find_var_field(struct hist_trigger_data *hist_data,
1461 const char *var_name)
1462{
1463 struct hist_field *hist_field, *found = NULL;
1464 int i;
1465
1466 for_each_hist_field(i, hist_data) {
1467 hist_field = hist_data->fields[i];
1468 if (hist_field && hist_field->flags & HIST_FIELD_FL_VAR &&
1469 strcmp(hist_field->var.name, var_name) == 0) {
1470 found = hist_field;
1471 break;
1472 }
1473 }
1474
1475 return found;
1476}
1477
1478static struct hist_field *find_var(struct hist_trigger_data *hist_data,
1479 struct trace_event_file *file,
1480 const char *var_name)
1481{
1482 struct hist_trigger_data *test_data;
1483 struct event_trigger_data *test;
1484 struct hist_field *hist_field;
1485
1486 hist_field = find_var_field(hist_data, var_name);
1487 if (hist_field)
1488 return hist_field;
1489
1490 list_for_each_entry_rcu(test, &file->triggers, list) {
1491 if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
1492 test_data = test->private_data;
1493 hist_field = find_var_field(test_data, var_name);
1494 if (hist_field)
1495 return hist_field;
1496 }
1497 }
1498
1499 return NULL;
1500}
1501
/*
 * Find the trace event file in which variable @var_name is defined.
 * When @system is given the event file is looked up directly;
 * otherwise every trigger with hist variables is scanned, and the
 * variable name must be unique across them — ambiguity records an
 * error and returns NULL.
 */
static struct trace_event_file *find_var_file(struct trace_array *tr,
					      char *system,
					      char *event_name,
					      char *var_name)
{
	struct hist_trigger_data *var_hist_data;
	struct hist_var_data *var_data;
	struct trace_event_file *file, *found = NULL;

	if (system)
		return find_event_file(tr, system, event_name);

	list_for_each_entry(var_data, &tr->hist_vars, list) {
		var_hist_data = var_data->hist_data;
		file = var_hist_data->event_file;
		/* don't match the already-found file against itself again */
		if (file == found)
			continue;

		if (find_var_field(var_hist_data, var_name)) {
			if (found) {
				hist_err_event("Variable name not unique, need to use fully qualified name (subsys.event.var) for variable: ", system, event_name, var_name);
				return NULL;
			}

			found = file;
		}
	}

	return found;
}
1532
1533static struct hist_field *find_file_var(struct trace_event_file *file,
1534 const char *var_name)
1535{
1536 struct hist_trigger_data *test_data;
1537 struct event_trigger_data *test;
1538 struct hist_field *hist_field;
1539
1540 list_for_each_entry_rcu(test, &file->triggers, list) {
1541 if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
1542 test_data = test->private_data;
1543 hist_field = find_var_field(test_data, var_name);
1544 if (hist_field)
1545 return hist_field;
1546 }
1547 }
1548
1549 return NULL;
1550}
1551
/*
 * Search the events named by this trigger's onmatch() actions for a
 * variable called @var_name.  The name must be unambiguous: if more
 * than one match event defines it, an error is recorded and
 * ERR_PTR(-EINVAL) is returned; NULL means no match.
 */
static struct hist_field *
find_match_var(struct hist_trigger_data *hist_data, char *var_name)
{
	struct trace_array *tr = hist_data->event_file->tr;
	struct hist_field *hist_field, *found = NULL;
	struct trace_event_file *file;
	unsigned int i;

	for (i = 0; i < hist_data->n_actions; i++) {
		struct action_data *data = hist_data->actions[i];

		/* only onmatch() actions (fn == action_trace) name an event */
		if (data->fn == action_trace) {
			char *system = data->onmatch.match_event_system;
			char *event_name = data->onmatch.match_event;

			file = find_var_file(tr, system, event_name, var_name);
			if (!file)
				continue;
			hist_field = find_file_var(file, var_name);
			if (hist_field) {
				if (found) {
					hist_err_event("Variable name not unique, need to use fully qualified name (subsys.event.var) for variable: ", system, event_name, var_name);
					return ERR_PTR(-EINVAL);
				}

				found = hist_field;
			}
		}
	}
	return found;
}
1583
1584static struct hist_field *find_event_var(struct hist_trigger_data *hist_data,
1585 char *system,
1586 char *event_name,
1587 char *var_name)
1588{
1589 struct trace_array *tr = hist_data->event_file->tr;
1590 struct hist_field *hist_field = NULL;
1591 struct trace_event_file *file;
1592
1593 if (!system || !event_name) {
1594 hist_field = find_match_var(hist_data, var_name);
1595 if (IS_ERR(hist_field))
1596 return NULL;
1597 if (hist_field)
1598 return hist_field;
1599 }
1600
1601 file = find_var_file(tr, system, event_name, var_name);
1602 if (!file)
1603 return NULL;
1604
1605 hist_field = find_file_var(file, var_name);
1606
1607 return hist_field;
1608}
1609
1610struct hist_elt_data {
1611 char *comm;
1612 u64 *var_ref_vals;
1613 char *field_var_str[SYNTH_FIELDS_MAX];
153}; 1614};
154 1615
1616static u64 hist_field_var_ref(struct hist_field *hist_field,
1617 struct tracing_map_elt *elt,
1618 struct ring_buffer_event *rbe,
1619 void *event)
1620{
1621 struct hist_elt_data *elt_data;
1622 u64 var_val = 0;
1623
1624 elt_data = elt->private_data;
1625 var_val = elt_data->var_ref_vals[hist_field->var_ref_idx];
1626
1627 return var_val;
1628}
1629
/*
 * Fill var_ref_vals[] with the current values of the variables this
 * trigger references, looked up by the shared map @key.  With @self
 * set, only variables defined by @hist_data itself are resolved;
 * otherwise only variables defined by other triggers are.  Returns
 * false if any required variable is unresolved or not yet set.
 */
static bool resolve_var_refs(struct hist_trigger_data *hist_data, void *key,
			     u64 *var_ref_vals, bool self)
{
	struct hist_trigger_data *var_data;
	struct tracing_map_elt *var_elt;
	struct hist_field *hist_field;
	unsigned int i, var_idx;
	bool resolved = true;
	u64 var_val = 0;

	for (i = 0; i < hist_data->n_var_refs; i++) {
		hist_field = hist_data->var_refs[i];
		var_idx = hist_field->var.idx;
		var_data = hist_field->var.hist_data;

		/* reference was never bound to a defining trigger */
		if (var_data == NULL) {
			resolved = false;
			break;
		}

		if ((self && var_data != hist_data) ||
		    (!self && var_data == hist_data))
			continue;

		var_elt = tracing_map_lookup(var_data->map, key);
		if (!var_elt) {
			resolved = false;
			break;
		}

		if (!tracing_map_var_set(var_elt, var_idx)) {
			resolved = false;
			break;
		}

		/*
		 * NOTE(review): the _once variant presumably consumes the
		 * value when read by a different trigger — confirm against
		 * tracing_map_read_var_once().
		 */
		if (self || !hist_field->read_once)
			var_val = tracing_map_read_var(var_elt, var_idx);
		else
			var_val = tracing_map_read_var_once(var_elt, var_idx);

		var_ref_vals[i] = var_val;
	}

	return resolved;
}
1675
155static const char *hist_field_name(struct hist_field *field, 1676static const char *hist_field_name(struct hist_field *field,
156 unsigned int level) 1677 unsigned int level)
157{ 1678{
@@ -162,8 +1683,26 @@ static const char *hist_field_name(struct hist_field *field,
162 1683
163 if (field->field) 1684 if (field->field)
164 field_name = field->field->name; 1685 field_name = field->field->name;
165 else if (field->flags & HIST_FIELD_FL_LOG2) 1686 else if (field->flags & HIST_FIELD_FL_LOG2 ||
1687 field->flags & HIST_FIELD_FL_ALIAS)
166 field_name = hist_field_name(field->operands[0], ++level); 1688 field_name = hist_field_name(field->operands[0], ++level);
1689 else if (field->flags & HIST_FIELD_FL_CPU)
1690 field_name = "cpu";
1691 else if (field->flags & HIST_FIELD_FL_EXPR ||
1692 field->flags & HIST_FIELD_FL_VAR_REF) {
1693 if (field->system) {
1694 static char full_name[MAX_FILTER_STR_VAL];
1695
1696 strcat(full_name, field->system);
1697 strcat(full_name, ".");
1698 strcat(full_name, field->event_name);
1699 strcat(full_name, ".");
1700 strcat(full_name, field->name);
1701 field_name = full_name;
1702 } else
1703 field_name = field->name;
1704 } else if (field->flags & HIST_FIELD_FL_TIMESTAMP)
1705 field_name = "common_timestamp";
167 1706
168 if (field_name == NULL) 1707 if (field_name == NULL)
169 field_name = ""; 1708 field_name = "";
@@ -232,16 +1771,119 @@ static int parse_map_size(char *str)
232 1771
233static void destroy_hist_trigger_attrs(struct hist_trigger_attrs *attrs) 1772static void destroy_hist_trigger_attrs(struct hist_trigger_attrs *attrs)
234{ 1773{
1774 unsigned int i;
1775
235 if (!attrs) 1776 if (!attrs)
236 return; 1777 return;
237 1778
1779 for (i = 0; i < attrs->n_assignments; i++)
1780 kfree(attrs->assignment_str[i]);
1781
1782 for (i = 0; i < attrs->n_actions; i++)
1783 kfree(attrs->action_str[i]);
1784
238 kfree(attrs->name); 1785 kfree(attrs->name);
239 kfree(attrs->sort_key_str); 1786 kfree(attrs->sort_key_str);
240 kfree(attrs->keys_str); 1787 kfree(attrs->keys_str);
241 kfree(attrs->vals_str); 1788 kfree(attrs->vals_str);
1789 kfree(attrs->clock);
242 kfree(attrs); 1790 kfree(attrs);
243} 1791}
244 1792
1793static int parse_action(char *str, struct hist_trigger_attrs *attrs)
1794{
1795 int ret = -EINVAL;
1796
1797 if (attrs->n_actions >= HIST_ACTIONS_MAX)
1798 return ret;
1799
1800 if ((strncmp(str, "onmatch(", strlen("onmatch(")) == 0) ||
1801 (strncmp(str, "onmax(", strlen("onmax(")) == 0)) {
1802 attrs->action_str[attrs->n_actions] = kstrdup(str, GFP_KERNEL);
1803 if (!attrs->action_str[attrs->n_actions]) {
1804 ret = -ENOMEM;
1805 return ret;
1806 }
1807 attrs->n_actions++;
1808 ret = 0;
1809 }
1810
1811 return ret;
1812}
1813
1814static int parse_assignment(char *str, struct hist_trigger_attrs *attrs)
1815{
1816 int ret = 0;
1817
1818 if ((strncmp(str, "key=", strlen("key=")) == 0) ||
1819 (strncmp(str, "keys=", strlen("keys=")) == 0)) {
1820 attrs->keys_str = kstrdup(str, GFP_KERNEL);
1821 if (!attrs->keys_str) {
1822 ret = -ENOMEM;
1823 goto out;
1824 }
1825 } else if ((strncmp(str, "val=", strlen("val=")) == 0) ||
1826 (strncmp(str, "vals=", strlen("vals=")) == 0) ||
1827 (strncmp(str, "values=", strlen("values=")) == 0)) {
1828 attrs->vals_str = kstrdup(str, GFP_KERNEL);
1829 if (!attrs->vals_str) {
1830 ret = -ENOMEM;
1831 goto out;
1832 }
1833 } else if (strncmp(str, "sort=", strlen("sort=")) == 0) {
1834 attrs->sort_key_str = kstrdup(str, GFP_KERNEL);
1835 if (!attrs->sort_key_str) {
1836 ret = -ENOMEM;
1837 goto out;
1838 }
1839 } else if (strncmp(str, "name=", strlen("name=")) == 0) {
1840 attrs->name = kstrdup(str, GFP_KERNEL);
1841 if (!attrs->name) {
1842 ret = -ENOMEM;
1843 goto out;
1844 }
1845 } else if (strncmp(str, "clock=", strlen("clock=")) == 0) {
1846 strsep(&str, "=");
1847 if (!str) {
1848 ret = -EINVAL;
1849 goto out;
1850 }
1851
1852 str = strstrip(str);
1853 attrs->clock = kstrdup(str, GFP_KERNEL);
1854 if (!attrs->clock) {
1855 ret = -ENOMEM;
1856 goto out;
1857 }
1858 } else if (strncmp(str, "size=", strlen("size=")) == 0) {
1859 int map_bits = parse_map_size(str);
1860
1861 if (map_bits < 0) {
1862 ret = map_bits;
1863 goto out;
1864 }
1865 attrs->map_bits = map_bits;
1866 } else {
1867 char *assignment;
1868
1869 if (attrs->n_assignments == TRACING_MAP_VARS_MAX) {
1870 hist_err("Too many variables defined: ", str);
1871 ret = -EINVAL;
1872 goto out;
1873 }
1874
1875 assignment = kstrdup(str, GFP_KERNEL);
1876 if (!assignment) {
1877 ret = -ENOMEM;
1878 goto out;
1879 }
1880
1881 attrs->assignment_str[attrs->n_assignments++] = assignment;
1882 }
1883 out:
1884 return ret;
1885}
1886
245static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str) 1887static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str)
246{ 1888{
247 struct hist_trigger_attrs *attrs; 1889 struct hist_trigger_attrs *attrs;
@@ -254,35 +1896,21 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str)
254 while (trigger_str) { 1896 while (trigger_str) {
255 char *str = strsep(&trigger_str, ":"); 1897 char *str = strsep(&trigger_str, ":");
256 1898
257 if ((strncmp(str, "key=", strlen("key=")) == 0) || 1899 if (strchr(str, '=')) {
258 (strncmp(str, "keys=", strlen("keys=")) == 0)) 1900 ret = parse_assignment(str, attrs);
259 attrs->keys_str = kstrdup(str, GFP_KERNEL); 1901 if (ret)
260 else if ((strncmp(str, "val=", strlen("val=")) == 0) || 1902 goto free;
261 (strncmp(str, "vals=", strlen("vals=")) == 0) || 1903 } else if (strcmp(str, "pause") == 0)
262 (strncmp(str, "values=", strlen("values=")) == 0))
263 attrs->vals_str = kstrdup(str, GFP_KERNEL);
264 else if (strncmp(str, "sort=", strlen("sort=")) == 0)
265 attrs->sort_key_str = kstrdup(str, GFP_KERNEL);
266 else if (strncmp(str, "name=", strlen("name=")) == 0)
267 attrs->name = kstrdup(str, GFP_KERNEL);
268 else if (strcmp(str, "pause") == 0)
269 attrs->pause = true; 1904 attrs->pause = true;
270 else if ((strcmp(str, "cont") == 0) || 1905 else if ((strcmp(str, "cont") == 0) ||
271 (strcmp(str, "continue") == 0)) 1906 (strcmp(str, "continue") == 0))
272 attrs->cont = true; 1907 attrs->cont = true;
273 else if (strcmp(str, "clear") == 0) 1908 else if (strcmp(str, "clear") == 0)
274 attrs->clear = true; 1909 attrs->clear = true;
275 else if (strncmp(str, "size=", strlen("size=")) == 0) { 1910 else {
276 int map_bits = parse_map_size(str); 1911 ret = parse_action(str, attrs);
277 1912 if (ret)
278 if (map_bits < 0) {
279 ret = map_bits;
280 goto free; 1913 goto free;
281 }
282 attrs->map_bits = map_bits;
283 } else {
284 ret = -EINVAL;
285 goto free;
286 } 1914 }
287 } 1915 }
288 1916
@@ -291,6 +1919,14 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str)
291 goto free; 1919 goto free;
292 } 1920 }
293 1921
1922 if (!attrs->clock) {
1923 attrs->clock = kstrdup("global", GFP_KERNEL);
1924 if (!attrs->clock) {
1925 ret = -ENOMEM;
1926 goto free;
1927 }
1928 }
1929
294 return attrs; 1930 return attrs;
295 free: 1931 free:
296 destroy_hist_trigger_attrs(attrs); 1932 destroy_hist_trigger_attrs(attrs);
@@ -313,64 +1949,203 @@ static inline void save_comm(char *comm, struct task_struct *task)
313 memcpy(comm, task->comm, TASK_COMM_LEN); 1949 memcpy(comm, task->comm, TASK_COMM_LEN);
314} 1950}
315 1951
316static void hist_trigger_elt_comm_free(struct tracing_map_elt *elt) 1952static void hist_elt_data_free(struct hist_elt_data *elt_data)
317{ 1953{
318 kfree((char *)elt->private_data); 1954 unsigned int i;
1955
1956 for (i = 0; i < SYNTH_FIELDS_MAX; i++)
1957 kfree(elt_data->field_var_str[i]);
1958
1959 kfree(elt_data->comm);
1960 kfree(elt_data);
319} 1961}
320 1962
321static int hist_trigger_elt_comm_alloc(struct tracing_map_elt *elt) 1963static void hist_trigger_elt_data_free(struct tracing_map_elt *elt)
1964{
1965 struct hist_elt_data *elt_data = elt->private_data;
1966
1967 hist_elt_data_free(elt_data);
1968}
1969
1970static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt)
322{ 1971{
323 struct hist_trigger_data *hist_data = elt->map->private_data; 1972 struct hist_trigger_data *hist_data = elt->map->private_data;
1973 unsigned int size = TASK_COMM_LEN;
1974 struct hist_elt_data *elt_data;
324 struct hist_field *key_field; 1975 struct hist_field *key_field;
325 unsigned int i; 1976 unsigned int i, n_str;
1977
1978 elt_data = kzalloc(sizeof(*elt_data), GFP_KERNEL);
1979 if (!elt_data)
1980 return -ENOMEM;
326 1981
327 for_each_hist_key_field(i, hist_data) { 1982 for_each_hist_key_field(i, hist_data) {
328 key_field = hist_data->fields[i]; 1983 key_field = hist_data->fields[i];
329 1984
330 if (key_field->flags & HIST_FIELD_FL_EXECNAME) { 1985 if (key_field->flags & HIST_FIELD_FL_EXECNAME) {
331 unsigned int size = TASK_COMM_LEN + 1; 1986 elt_data->comm = kzalloc(size, GFP_KERNEL);
332 1987 if (!elt_data->comm) {
333 elt->private_data = kzalloc(size, GFP_KERNEL); 1988 kfree(elt_data);
334 if (!elt->private_data)
335 return -ENOMEM; 1989 return -ENOMEM;
1990 }
336 break; 1991 break;
337 } 1992 }
338 } 1993 }
339 1994
1995 n_str = hist_data->n_field_var_str + hist_data->n_max_var_str;
1996
1997 size = STR_VAR_LEN_MAX;
1998
1999 for (i = 0; i < n_str; i++) {
2000 elt_data->field_var_str[i] = kzalloc(size, GFP_KERNEL);
2001 if (!elt_data->field_var_str[i]) {
2002 hist_elt_data_free(elt_data);
2003 return -ENOMEM;
2004 }
2005 }
2006
2007 elt->private_data = elt_data;
2008
340 return 0; 2009 return 0;
341} 2010}
342 2011
343static void hist_trigger_elt_comm_copy(struct tracing_map_elt *to, 2012static void hist_trigger_elt_data_init(struct tracing_map_elt *elt)
344 struct tracing_map_elt *from) 2013{
2014 struct hist_elt_data *elt_data = elt->private_data;
2015
2016 if (elt_data->comm)
2017 save_comm(elt_data->comm, current);
2018}
2019
2020static const struct tracing_map_ops hist_trigger_elt_data_ops = {
2021 .elt_alloc = hist_trigger_elt_data_alloc,
2022 .elt_free = hist_trigger_elt_data_free,
2023 .elt_init = hist_trigger_elt_data_init,
2024};
2025
2026static const char *get_hist_field_flags(struct hist_field *hist_field)
2027{
2028 const char *flags_str = NULL;
2029
2030 if (hist_field->flags & HIST_FIELD_FL_HEX)
2031 flags_str = "hex";
2032 else if (hist_field->flags & HIST_FIELD_FL_SYM)
2033 flags_str = "sym";
2034 else if (hist_field->flags & HIST_FIELD_FL_SYM_OFFSET)
2035 flags_str = "sym-offset";
2036 else if (hist_field->flags & HIST_FIELD_FL_EXECNAME)
2037 flags_str = "execname";
2038 else if (hist_field->flags & HIST_FIELD_FL_SYSCALL)
2039 flags_str = "syscall";
2040 else if (hist_field->flags & HIST_FIELD_FL_LOG2)
2041 flags_str = "log2";
2042 else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP_USECS)
2043 flags_str = "usecs";
2044
2045 return flags_str;
2046}
2047
2048static void expr_field_str(struct hist_field *field, char *expr)
345{ 2049{
346 char *comm_from = from->private_data; 2050 if (field->flags & HIST_FIELD_FL_VAR_REF)
347 char *comm_to = to->private_data; 2051 strcat(expr, "$");
2052
2053 strcat(expr, hist_field_name(field, 0));
348 2054
349 if (comm_from) 2055 if (field->flags && !(field->flags & HIST_FIELD_FL_VAR_REF)) {
350 memcpy(comm_to, comm_from, TASK_COMM_LEN + 1); 2056 const char *flags_str = get_hist_field_flags(field);
2057
2058 if (flags_str) {
2059 strcat(expr, ".");
2060 strcat(expr, flags_str);
2061 }
2062 }
351} 2063}
352 2064
353static void hist_trigger_elt_comm_init(struct tracing_map_elt *elt) 2065static char *expr_str(struct hist_field *field, unsigned int level)
354{ 2066{
355 char *comm = elt->private_data; 2067 char *expr;
2068
2069 if (level > 1)
2070 return NULL;
2071
2072 expr = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL);
2073 if (!expr)
2074 return NULL;
2075
2076 if (!field->operands[0]) {
2077 expr_field_str(field, expr);
2078 return expr;
2079 }
2080
2081 if (field->operator == FIELD_OP_UNARY_MINUS) {
2082 char *subexpr;
356 2083
357 if (comm) 2084 strcat(expr, "-(");
358 save_comm(comm, current); 2085 subexpr = expr_str(field->operands[0], ++level);
2086 if (!subexpr) {
2087 kfree(expr);
2088 return NULL;
2089 }
2090 strcat(expr, subexpr);
2091 strcat(expr, ")");
2092
2093 kfree(subexpr);
2094
2095 return expr;
2096 }
2097
2098 expr_field_str(field->operands[0], expr);
2099
2100 switch (field->operator) {
2101 case FIELD_OP_MINUS:
2102 strcat(expr, "-");
2103 break;
2104 case FIELD_OP_PLUS:
2105 strcat(expr, "+");
2106 break;
2107 default:
2108 kfree(expr);
2109 return NULL;
2110 }
2111
2112 expr_field_str(field->operands[1], expr);
2113
2114 return expr;
359} 2115}
360 2116
361static const struct tracing_map_ops hist_trigger_elt_comm_ops = { 2117static int contains_operator(char *str)
362 .elt_alloc = hist_trigger_elt_comm_alloc, 2118{
363 .elt_copy = hist_trigger_elt_comm_copy, 2119 enum field_op_id field_op = FIELD_OP_NONE;
364 .elt_free = hist_trigger_elt_comm_free, 2120 char *op;
365 .elt_init = hist_trigger_elt_comm_init, 2121
366}; 2122 op = strpbrk(str, "+-");
2123 if (!op)
2124 return FIELD_OP_NONE;
2125
2126 switch (*op) {
2127 case '-':
2128 if (*str == '-')
2129 field_op = FIELD_OP_UNARY_MINUS;
2130 else
2131 field_op = FIELD_OP_MINUS;
2132 break;
2133 case '+':
2134 field_op = FIELD_OP_PLUS;
2135 break;
2136 default:
2137 break;
2138 }
2139
2140 return field_op;
2141}
367 2142
368static void destroy_hist_field(struct hist_field *hist_field, 2143static void destroy_hist_field(struct hist_field *hist_field,
369 unsigned int level) 2144 unsigned int level)
370{ 2145{
371 unsigned int i; 2146 unsigned int i;
372 2147
373 if (level > 2) 2148 if (level > 3)
374 return; 2149 return;
375 2150
376 if (!hist_field) 2151 if (!hist_field)
@@ -379,11 +2154,17 @@ static void destroy_hist_field(struct hist_field *hist_field,
379 for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++) 2154 for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++)
380 destroy_hist_field(hist_field->operands[i], level + 1); 2155 destroy_hist_field(hist_field->operands[i], level + 1);
381 2156
2157 kfree(hist_field->var.name);
2158 kfree(hist_field->name);
2159 kfree(hist_field->type);
2160
382 kfree(hist_field); 2161 kfree(hist_field);
383} 2162}
384 2163
385static struct hist_field *create_hist_field(struct ftrace_event_field *field, 2164static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
386 unsigned long flags) 2165 struct ftrace_event_field *field,
2166 unsigned long flags,
2167 char *var_name)
387{ 2168{
388 struct hist_field *hist_field; 2169 struct hist_field *hist_field;
389 2170
@@ -394,8 +2175,22 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field,
394 if (!hist_field) 2175 if (!hist_field)
395 return NULL; 2176 return NULL;
396 2177
2178 hist_field->hist_data = hist_data;
2179
2180 if (flags & HIST_FIELD_FL_EXPR || flags & HIST_FIELD_FL_ALIAS)
2181 goto out; /* caller will populate */
2182
2183 if (flags & HIST_FIELD_FL_VAR_REF) {
2184 hist_field->fn = hist_field_var_ref;
2185 goto out;
2186 }
2187
397 if (flags & HIST_FIELD_FL_HITCOUNT) { 2188 if (flags & HIST_FIELD_FL_HITCOUNT) {
398 hist_field->fn = hist_field_counter; 2189 hist_field->fn = hist_field_counter;
2190 hist_field->size = sizeof(u64);
2191 hist_field->type = kstrdup("u64", GFP_KERNEL);
2192 if (!hist_field->type)
2193 goto free;
399 goto out; 2194 goto out;
400 } 2195 }
401 2196
@@ -407,8 +2202,29 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field,
407 if (flags & HIST_FIELD_FL_LOG2) { 2202 if (flags & HIST_FIELD_FL_LOG2) {
408 unsigned long fl = flags & ~HIST_FIELD_FL_LOG2; 2203 unsigned long fl = flags & ~HIST_FIELD_FL_LOG2;
409 hist_field->fn = hist_field_log2; 2204 hist_field->fn = hist_field_log2;
410 hist_field->operands[0] = create_hist_field(field, fl); 2205 hist_field->operands[0] = create_hist_field(hist_data, field, fl, NULL);
411 hist_field->size = hist_field->operands[0]->size; 2206 hist_field->size = hist_field->operands[0]->size;
2207 hist_field->type = kstrdup(hist_field->operands[0]->type, GFP_KERNEL);
2208 if (!hist_field->type)
2209 goto free;
2210 goto out;
2211 }
2212
2213 if (flags & HIST_FIELD_FL_TIMESTAMP) {
2214 hist_field->fn = hist_field_timestamp;
2215 hist_field->size = sizeof(u64);
2216 hist_field->type = kstrdup("u64", GFP_KERNEL);
2217 if (!hist_field->type)
2218 goto free;
2219 goto out;
2220 }
2221
2222 if (flags & HIST_FIELD_FL_CPU) {
2223 hist_field->fn = hist_field_cpu;
2224 hist_field->size = sizeof(int);
2225 hist_field->type = kstrdup("unsigned int", GFP_KERNEL);
2226 if (!hist_field->type)
2227 goto free;
412 goto out; 2228 goto out;
413 } 2229 }
414 2230
@@ -418,6 +2234,11 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field,
418 if (is_string_field(field)) { 2234 if (is_string_field(field)) {
419 flags |= HIST_FIELD_FL_STRING; 2235 flags |= HIST_FIELD_FL_STRING;
420 2236
2237 hist_field->size = MAX_FILTER_STR_VAL;
2238 hist_field->type = kstrdup(field->type, GFP_KERNEL);
2239 if (!hist_field->type)
2240 goto free;
2241
421 if (field->filter_type == FILTER_STATIC_STRING) 2242 if (field->filter_type == FILTER_STATIC_STRING)
422 hist_field->fn = hist_field_string; 2243 hist_field->fn = hist_field_string;
423 else if (field->filter_type == FILTER_DYN_STRING) 2244 else if (field->filter_type == FILTER_DYN_STRING)
@@ -425,6 +2246,12 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field,
425 else 2246 else
426 hist_field->fn = hist_field_pstring; 2247 hist_field->fn = hist_field_pstring;
427 } else { 2248 } else {
2249 hist_field->size = field->size;
2250 hist_field->is_signed = field->is_signed;
2251 hist_field->type = kstrdup(field->type, GFP_KERNEL);
2252 if (!hist_field->type)
2253 goto free;
2254
428 hist_field->fn = select_value_fn(field->size, 2255 hist_field->fn = select_value_fn(field->size,
429 field->is_signed); 2256 field->is_signed);
430 if (!hist_field->fn) { 2257 if (!hist_field->fn) {
@@ -436,14 +2263,23 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field,
436 hist_field->field = field; 2263 hist_field->field = field;
437 hist_field->flags = flags; 2264 hist_field->flags = flags;
438 2265
2266 if (var_name) {
2267 hist_field->var.name = kstrdup(var_name, GFP_KERNEL);
2268 if (!hist_field->var.name)
2269 goto free;
2270 }
2271
439 return hist_field; 2272 return hist_field;
2273 free:
2274 destroy_hist_field(hist_field, 0);
2275 return NULL;
440} 2276}
441 2277
442static void destroy_hist_fields(struct hist_trigger_data *hist_data) 2278static void destroy_hist_fields(struct hist_trigger_data *hist_data)
443{ 2279{
444 unsigned int i; 2280 unsigned int i;
445 2281
446 for (i = 0; i < TRACING_MAP_FIELDS_MAX; i++) { 2282 for (i = 0; i < HIST_FIELDS_MAX; i++) {
447 if (hist_data->fields[i]) { 2283 if (hist_data->fields[i]) {
448 destroy_hist_field(hist_data->fields[i], 0); 2284 destroy_hist_field(hist_data->fields[i], 0);
449 hist_data->fields[i] = NULL; 2285 hist_data->fields[i] = NULL;
@@ -451,69 +2287,1610 @@ static void destroy_hist_fields(struct hist_trigger_data *hist_data)
451 } 2287 }
452} 2288}
453 2289
454static int create_hitcount_val(struct hist_trigger_data *hist_data) 2290static int init_var_ref(struct hist_field *ref_field,
2291 struct hist_field *var_field,
2292 char *system, char *event_name)
455{ 2293{
456 hist_data->fields[HITCOUNT_IDX] = 2294 int err = 0;
457 create_hist_field(NULL, HIST_FIELD_FL_HITCOUNT); 2295
458 if (!hist_data->fields[HITCOUNT_IDX]) 2296 ref_field->var.idx = var_field->var.idx;
459 return -ENOMEM; 2297 ref_field->var.hist_data = var_field->hist_data;
2298 ref_field->size = var_field->size;
2299 ref_field->is_signed = var_field->is_signed;
2300 ref_field->flags |= var_field->flags &
2301 (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
2302
2303 if (system) {
2304 ref_field->system = kstrdup(system, GFP_KERNEL);
2305 if (!ref_field->system)
2306 return -ENOMEM;
2307 }
460 2308
461 hist_data->n_vals++; 2309 if (event_name) {
2310 ref_field->event_name = kstrdup(event_name, GFP_KERNEL);
2311 if (!ref_field->event_name) {
2312 err = -ENOMEM;
2313 goto free;
2314 }
2315 }
462 2316
463 if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX)) 2317 if (var_field->var.name) {
2318 ref_field->name = kstrdup(var_field->var.name, GFP_KERNEL);
2319 if (!ref_field->name) {
2320 err = -ENOMEM;
2321 goto free;
2322 }
2323 } else if (var_field->name) {
2324 ref_field->name = kstrdup(var_field->name, GFP_KERNEL);
2325 if (!ref_field->name) {
2326 err = -ENOMEM;
2327 goto free;
2328 }
2329 }
2330
2331 ref_field->type = kstrdup(var_field->type, GFP_KERNEL);
2332 if (!ref_field->type) {
2333 err = -ENOMEM;
2334 goto free;
2335 }
2336 out:
2337 return err;
2338 free:
2339 kfree(ref_field->system);
2340 kfree(ref_field->event_name);
2341 kfree(ref_field->name);
2342
2343 goto out;
2344}
2345
2346static struct hist_field *create_var_ref(struct hist_field *var_field,
2347 char *system, char *event_name)
2348{
2349 unsigned long flags = HIST_FIELD_FL_VAR_REF;
2350 struct hist_field *ref_field;
2351
2352 ref_field = create_hist_field(var_field->hist_data, NULL, flags, NULL);
2353 if (ref_field) {
2354 if (init_var_ref(ref_field, var_field, system, event_name)) {
2355 destroy_hist_field(ref_field, 0);
2356 return NULL;
2357 }
2358 }
2359
2360 return ref_field;
2361}
2362
2363static bool is_var_ref(char *var_name)
2364{
2365 if (!var_name || strlen(var_name) < 2 || var_name[0] != '$')
2366 return false;
2367
2368 return true;
2369}
2370
2371static char *field_name_from_var(struct hist_trigger_data *hist_data,
2372 char *var_name)
2373{
2374 char *name, *field;
2375 unsigned int i;
2376
2377 for (i = 0; i < hist_data->attrs->var_defs.n_vars; i++) {
2378 name = hist_data->attrs->var_defs.name[i];
2379
2380 if (strcmp(var_name, name) == 0) {
2381 field = hist_data->attrs->var_defs.expr[i];
2382 if (contains_operator(field) || is_var_ref(field))
2383 continue;
2384 return field;
2385 }
2386 }
2387
2388 return NULL;
2389}
2390
2391static char *local_field_var_ref(struct hist_trigger_data *hist_data,
2392 char *system, char *event_name,
2393 char *var_name)
2394{
2395 struct trace_event_call *call;
2396
2397 if (system && event_name) {
2398 call = hist_data->event_file->event_call;
2399
2400 if (strcmp(system, call->class->system) != 0)
2401 return NULL;
2402
2403 if (strcmp(event_name, trace_event_name(call)) != 0)
2404 return NULL;
2405 }
2406
2407 if (!!system != !!event_name)
2408 return NULL;
2409
2410 if (!is_var_ref(var_name))
2411 return NULL;
2412
2413 var_name++;
2414
2415 return field_name_from_var(hist_data, var_name);
2416}
2417
2418static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data,
2419 char *system, char *event_name,
2420 char *var_name)
2421{
2422 struct hist_field *var_field = NULL, *ref_field = NULL;
2423
2424 if (!is_var_ref(var_name))
2425 return NULL;
2426
2427 var_name++;
2428
2429 var_field = find_event_var(hist_data, system, event_name, var_name);
2430 if (var_field)
2431 ref_field = create_var_ref(var_field, system, event_name);
2432
2433 if (!ref_field)
2434 hist_err_event("Couldn't find variable: $",
2435 system, event_name, var_name);
2436
2437 return ref_field;
2438}
2439
2440static struct ftrace_event_field *
2441parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
2442 char *field_str, unsigned long *flags)
2443{
2444 struct ftrace_event_field *field = NULL;
2445 char *field_name, *modifier, *str;
2446
2447 modifier = str = kstrdup(field_str, GFP_KERNEL);
2448 if (!modifier)
2449 return ERR_PTR(-ENOMEM);
2450
2451 field_name = strsep(&modifier, ".");
2452 if (modifier) {
2453 if (strcmp(modifier, "hex") == 0)
2454 *flags |= HIST_FIELD_FL_HEX;
2455 else if (strcmp(modifier, "sym") == 0)
2456 *flags |= HIST_FIELD_FL_SYM;
2457 else if (strcmp(modifier, "sym-offset") == 0)
2458 *flags |= HIST_FIELD_FL_SYM_OFFSET;
2459 else if ((strcmp(modifier, "execname") == 0) &&
2460 (strcmp(field_name, "common_pid") == 0))
2461 *flags |= HIST_FIELD_FL_EXECNAME;
2462 else if (strcmp(modifier, "syscall") == 0)
2463 *flags |= HIST_FIELD_FL_SYSCALL;
2464 else if (strcmp(modifier, "log2") == 0)
2465 *flags |= HIST_FIELD_FL_LOG2;
2466 else if (strcmp(modifier, "usecs") == 0)
2467 *flags |= HIST_FIELD_FL_TIMESTAMP_USECS;
2468 else {
2469 field = ERR_PTR(-EINVAL);
2470 goto out;
2471 }
2472 }
2473
2474 if (strcmp(field_name, "common_timestamp") == 0) {
2475 *flags |= HIST_FIELD_FL_TIMESTAMP;
2476 hist_data->enable_timestamps = true;
2477 if (*flags & HIST_FIELD_FL_TIMESTAMP_USECS)
2478 hist_data->attrs->ts_in_usecs = true;
2479 } else if (strcmp(field_name, "cpu") == 0)
2480 *flags |= HIST_FIELD_FL_CPU;
2481 else {
2482 field = trace_find_event_field(file->event_call, field_name);
2483 if (!field || !field->size) {
2484 field = ERR_PTR(-EINVAL);
2485 goto out;
2486 }
2487 }
2488 out:
2489 kfree(str);
2490
2491 return field;
2492}
2493
2494static struct hist_field *create_alias(struct hist_trigger_data *hist_data,
2495 struct hist_field *var_ref,
2496 char *var_name)
2497{
2498 struct hist_field *alias = NULL;
2499 unsigned long flags = HIST_FIELD_FL_ALIAS | HIST_FIELD_FL_VAR;
2500
2501 alias = create_hist_field(hist_data, NULL, flags, var_name);
2502 if (!alias)
2503 return NULL;
2504
2505 alias->fn = var_ref->fn;
2506 alias->operands[0] = var_ref;
2507
2508 if (init_var_ref(alias, var_ref, var_ref->system, var_ref->event_name)) {
2509 destroy_hist_field(alias, 0);
2510 return NULL;
2511 }
2512
2513 return alias;
2514}
2515
2516static struct hist_field *parse_atom(struct hist_trigger_data *hist_data,
2517 struct trace_event_file *file, char *str,
2518 unsigned long *flags, char *var_name)
2519{
2520 char *s, *ref_system = NULL, *ref_event = NULL, *ref_var = str;
2521 struct ftrace_event_field *field = NULL;
2522 struct hist_field *hist_field = NULL;
2523 int ret = 0;
2524
2525 s = strchr(str, '.');
2526 if (s) {
2527 s = strchr(++s, '.');
2528 if (s) {
2529 ref_system = strsep(&str, ".");
2530 if (!str) {
2531 ret = -EINVAL;
2532 goto out;
2533 }
2534 ref_event = strsep(&str, ".");
2535 if (!str) {
2536 ret = -EINVAL;
2537 goto out;
2538 }
2539 ref_var = str;
2540 }
2541 }
2542
2543 s = local_field_var_ref(hist_data, ref_system, ref_event, ref_var);
2544 if (!s) {
2545 hist_field = parse_var_ref(hist_data, ref_system, ref_event, ref_var);
2546 if (hist_field) {
2547 hist_data->var_refs[hist_data->n_var_refs] = hist_field;
2548 hist_field->var_ref_idx = hist_data->n_var_refs++;
2549 if (var_name) {
2550 hist_field = create_alias(hist_data, hist_field, var_name);
2551 if (!hist_field) {
2552 ret = -ENOMEM;
2553 goto out;
2554 }
2555 }
2556 return hist_field;
2557 }
2558 } else
2559 str = s;
2560
2561 field = parse_field(hist_data, file, str, flags);
2562 if (IS_ERR(field)) {
2563 ret = PTR_ERR(field);
2564 goto out;
2565 }
2566
2567 hist_field = create_hist_field(hist_data, field, *flags, var_name);
2568 if (!hist_field) {
2569 ret = -ENOMEM;
2570 goto out;
2571 }
2572
2573 return hist_field;
2574 out:
2575 return ERR_PTR(ret);
2576}
2577
2578static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
2579 struct trace_event_file *file,
2580 char *str, unsigned long flags,
2581 char *var_name, unsigned int level);
2582
2583static struct hist_field *parse_unary(struct hist_trigger_data *hist_data,
2584 struct trace_event_file *file,
2585 char *str, unsigned long flags,
2586 char *var_name, unsigned int level)
2587{
2588 struct hist_field *operand1, *expr = NULL;
2589 unsigned long operand_flags;
2590 int ret = 0;
2591 char *s;
2592
2593 /* we support only -(xxx) i.e. explicit parens required */
2594
2595 if (level > 3) {
2596 hist_err("Too many subexpressions (3 max): ", str);
2597 ret = -EINVAL;
2598 goto free;
2599 }
2600
2601 str++; /* skip leading '-' */
2602
2603 s = strchr(str, '(');
2604 if (s)
2605 str++;
2606 else {
2607 ret = -EINVAL;
2608 goto free;
2609 }
2610
2611 s = strrchr(str, ')');
2612 if (s)
2613 *s = '\0';
2614 else {
2615 ret = -EINVAL; /* no closing ')' */
2616 goto free;
2617 }
2618
2619 flags |= HIST_FIELD_FL_EXPR;
2620 expr = create_hist_field(hist_data, NULL, flags, var_name);
2621 if (!expr) {
2622 ret = -ENOMEM;
2623 goto free;
2624 }
2625
2626 operand_flags = 0;
2627 operand1 = parse_expr(hist_data, file, str, operand_flags, NULL, ++level);
2628 if (IS_ERR(operand1)) {
2629 ret = PTR_ERR(operand1);
2630 goto free;
2631 }
2632
2633 expr->flags |= operand1->flags &
2634 (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
2635 expr->fn = hist_field_unary_minus;
2636 expr->operands[0] = operand1;
2637 expr->operator = FIELD_OP_UNARY_MINUS;
2638 expr->name = expr_str(expr, 0);
2639 expr->type = kstrdup(operand1->type, GFP_KERNEL);
2640 if (!expr->type) {
2641 ret = -ENOMEM;
2642 goto free;
2643 }
2644
2645 return expr;
2646 free:
2647 destroy_hist_field(expr, 0);
2648 return ERR_PTR(ret);
2649}
2650
2651static int check_expr_operands(struct hist_field *operand1,
2652 struct hist_field *operand2)
2653{
2654 unsigned long operand1_flags = operand1->flags;
2655 unsigned long operand2_flags = operand2->flags;
2656
2657 if ((operand1_flags & HIST_FIELD_FL_VAR_REF) ||
2658 (operand1_flags & HIST_FIELD_FL_ALIAS)) {
2659 struct hist_field *var;
2660
2661 var = find_var_field(operand1->var.hist_data, operand1->name);
2662 if (!var)
2663 return -EINVAL;
2664 operand1_flags = var->flags;
2665 }
2666
2667 if ((operand2_flags & HIST_FIELD_FL_VAR_REF) ||
2668 (operand2_flags & HIST_FIELD_FL_ALIAS)) {
2669 struct hist_field *var;
2670
2671 var = find_var_field(operand2->var.hist_data, operand2->name);
2672 if (!var)
2673 return -EINVAL;
2674 operand2_flags = var->flags;
2675 }
2676
2677 if ((operand1_flags & HIST_FIELD_FL_TIMESTAMP_USECS) !=
2678 (operand2_flags & HIST_FIELD_FL_TIMESTAMP_USECS)) {
2679 hist_err("Timestamp units in expression don't match", NULL);
464 return -EINVAL; 2680 return -EINVAL;
2681 }
465 2682
466 return 0; 2683 return 0;
467} 2684}
468 2685
469static int create_val_field(struct hist_trigger_data *hist_data, 2686static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
470 unsigned int val_idx, 2687 struct trace_event_file *file,
471 struct trace_event_file *file, 2688 char *str, unsigned long flags,
472 char *field_str) 2689 char *var_name, unsigned int level)
473{ 2690{
474 struct ftrace_event_field *field = NULL; 2691 struct hist_field *operand1 = NULL, *operand2 = NULL, *expr = NULL;
475 unsigned long flags = 0; 2692 unsigned long operand_flags;
476 char *field_name; 2693 int field_op, ret = -EINVAL;
2694 char *sep, *operand1_str;
2695
2696 if (level > 3) {
2697 hist_err("Too many subexpressions (3 max): ", str);
2698 return ERR_PTR(-EINVAL);
2699 }
2700
2701 field_op = contains_operator(str);
2702
2703 if (field_op == FIELD_OP_NONE)
2704 return parse_atom(hist_data, file, str, &flags, var_name);
2705
2706 if (field_op == FIELD_OP_UNARY_MINUS)
2707 return parse_unary(hist_data, file, str, flags, var_name, ++level);
2708
2709 switch (field_op) {
2710 case FIELD_OP_MINUS:
2711 sep = "-";
2712 break;
2713 case FIELD_OP_PLUS:
2714 sep = "+";
2715 break;
2716 default:
2717 goto free;
2718 }
2719
2720 operand1_str = strsep(&str, sep);
2721 if (!operand1_str || !str)
2722 goto free;
2723
2724 operand_flags = 0;
2725 operand1 = parse_atom(hist_data, file, operand1_str,
2726 &operand_flags, NULL);
2727 if (IS_ERR(operand1)) {
2728 ret = PTR_ERR(operand1);
2729 operand1 = NULL;
2730 goto free;
2731 }
2732
2733 /* rest of string could be another expression e.g. b+c in a+b+c */
2734 operand_flags = 0;
2735 operand2 = parse_expr(hist_data, file, str, operand_flags, NULL, ++level);
2736 if (IS_ERR(operand2)) {
2737 ret = PTR_ERR(operand2);
2738 operand2 = NULL;
2739 goto free;
2740 }
2741
2742 ret = check_expr_operands(operand1, operand2);
2743 if (ret)
2744 goto free;
2745
2746 flags |= HIST_FIELD_FL_EXPR;
2747
2748 flags |= operand1->flags &
2749 (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
2750
2751 expr = create_hist_field(hist_data, NULL, flags, var_name);
2752 if (!expr) {
2753 ret = -ENOMEM;
2754 goto free;
2755 }
2756
2757 operand1->read_once = true;
2758 operand2->read_once = true;
2759
2760 expr->operands[0] = operand1;
2761 expr->operands[1] = operand2;
2762 expr->operator = field_op;
2763 expr->name = expr_str(expr, 0);
2764 expr->type = kstrdup(operand1->type, GFP_KERNEL);
2765 if (!expr->type) {
2766 ret = -ENOMEM;
2767 goto free;
2768 }
2769
2770 switch (field_op) {
2771 case FIELD_OP_MINUS:
2772 expr->fn = hist_field_minus;
2773 break;
2774 case FIELD_OP_PLUS:
2775 expr->fn = hist_field_plus;
2776 break;
2777 default:
2778 ret = -EINVAL;
2779 goto free;
2780 }
2781
2782 return expr;
2783 free:
2784 destroy_hist_field(operand1, 0);
2785 destroy_hist_field(operand2, 0);
2786 destroy_hist_field(expr, 0);
2787
2788 return ERR_PTR(ret);
2789}
2790
2791static char *find_trigger_filter(struct hist_trigger_data *hist_data,
2792 struct trace_event_file *file)
2793{
2794 struct event_trigger_data *test;
2795
2796 list_for_each_entry_rcu(test, &file->triggers, list) {
2797 if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
2798 if (test->private_data == hist_data)
2799 return test->filter_str;
2800 }
2801 }
2802
2803 return NULL;
2804}
2805
2806static struct event_command trigger_hist_cmd;
2807static int event_hist_trigger_func(struct event_command *cmd_ops,
2808 struct trace_event_file *file,
2809 char *glob, char *cmd, char *param);
2810
2811static bool compatible_keys(struct hist_trigger_data *target_hist_data,
2812 struct hist_trigger_data *hist_data,
2813 unsigned int n_keys)
2814{
2815 struct hist_field *target_hist_field, *hist_field;
2816 unsigned int n, i, j;
2817
2818 if (hist_data->n_fields - hist_data->n_vals != n_keys)
2819 return false;
2820
2821 i = hist_data->n_vals;
2822 j = target_hist_data->n_vals;
2823
2824 for (n = 0; n < n_keys; n++) {
2825 hist_field = hist_data->fields[i + n];
2826 target_hist_field = target_hist_data->fields[j + n];
2827
2828 if (strcmp(hist_field->type, target_hist_field->type) != 0)
2829 return false;
2830 if (hist_field->size != target_hist_field->size)
2831 return false;
2832 if (hist_field->is_signed != target_hist_field->is_signed)
2833 return false;
2834 }
2835
2836 return true;
2837}
2838
2839static struct hist_trigger_data *
2840find_compatible_hist(struct hist_trigger_data *target_hist_data,
2841 struct trace_event_file *file)
2842{
2843 struct hist_trigger_data *hist_data;
2844 struct event_trigger_data *test;
2845 unsigned int n_keys;
2846
2847 n_keys = target_hist_data->n_fields - target_hist_data->n_vals;
2848
2849 list_for_each_entry_rcu(test, &file->triggers, list) {
2850 if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
2851 hist_data = test->private_data;
2852
2853 if (compatible_keys(target_hist_data, hist_data, n_keys))
2854 return hist_data;
2855 }
2856 }
2857
2858 return NULL;
2859}
2860
2861static struct trace_event_file *event_file(struct trace_array *tr,
2862 char *system, char *event_name)
2863{
2864 struct trace_event_file *file;
2865
2866 file = find_event_file(tr, system, event_name);
2867 if (!file)
2868 return ERR_PTR(-EINVAL);
2869
2870 return file;
2871}
2872
/*
 * Look up the auto-created variable named "synthetic_<field_name>" on
 * the given system/event.  Such variables are created by
 * create_field_var_hist() with the "synthetic_" prefix to avoid
 * colliding with ordinary field variables of the same name.
 *
 * Returns the variable's hist_field, NULL, or an ERR_PTR.
 */
static struct hist_field *
find_synthetic_field_var(struct hist_trigger_data *target_hist_data,
			 char *system, char *event_name, char *field_name)
{
	struct hist_field *event_var;
	char *synthetic_name;

	synthetic_name = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL);
	if (!synthetic_name)
		return ERR_PTR(-ENOMEM);

	/*
	 * NOTE(review): assumes "synthetic_" + field_name fits within
	 * MAX_FILTER_STR_VAL - field_name comes from parsed trigger
	 * input, so confirm it is bounded before this point.
	 */
	strcpy(synthetic_name, "synthetic_");
	strcat(synthetic_name, field_name);

	event_var = find_event_var(target_hist_data, system, event_name, synthetic_name);

	kfree(synthetic_name);

	return event_var;
}
2893
/**
 * create_field_var_hist - Automatically create a histogram and var for a field
 * @target_hist_data: The target hist trigger
 * @subsys_name: Optional subsystem name
 * @event_name: Optional event name
 * @field_name: The name of the field (and the resulting variable)
 *
 * Hist trigger actions fetch data from variables, not directly from
 * events. However, for convenience, users are allowed to directly
 * specify an event field in an action, which will be automatically
 * converted into a variable on their behalf.
 *
 * If a user specifies a field on an event that isn't the event the
 * histogram currently being defined (the target event histogram), the
 * only way that can be accomplished is if a new hist trigger is
 * created and the field variable defined on that.
 *
 * This function creates a new histogram compatible with the target
 * event (meaning a histogram with the same key as the target
 * histogram), and creates a variable for the specified field, but
 * with 'synthetic_' prepended to the variable name in order to avoid
 * collision with normal field variables.
 *
 * Return: The variable created for the field.
 */
static struct hist_field *
create_field_var_hist(struct hist_trigger_data *target_hist_data,
		      char *subsys_name, char *event_name, char *field_name)
{
	struct trace_array *tr = target_hist_data->event_file->tr;
	struct hist_field *event_var = ERR_PTR(-EINVAL);
	struct hist_trigger_data *hist_data;
	/* NOTE(review): 'first' is used as a bool but declared unsigned int */
	unsigned int i, n, first = true;
	struct field_var_hist *var_hist;
	struct trace_event_file *file;
	struct hist_field *key_field;
	char *saved_filter;
	char *cmd;
	int ret;

	if (target_hist_data->n_field_var_hists >= SYNTH_FIELDS_MAX) {
		hist_err_event("onmatch: Too many field variables defined: ",
			       subsys_name, event_name, field_name);
		return ERR_PTR(-EINVAL);
	}

	file = event_file(tr, subsys_name, event_name);

	if (IS_ERR(file)) {
		hist_err_event("onmatch: Event file not found: ",
			       subsys_name, event_name, field_name);
		ret = PTR_ERR(file);
		return ERR_PTR(ret);
	}

	/*
	 * Look for a histogram compatible with target. We'll use the
	 * found histogram specification to create a new matching
	 * histogram with our variable on it. target_hist_data is not
	 * yet a registered histogram so we can't use that.
	 */
	hist_data = find_compatible_hist(target_hist_data, file);
	if (!hist_data) {
		hist_err_event("onmatch: Matching event histogram not found: ",
			       subsys_name, event_name, field_name);
		return ERR_PTR(-EINVAL);
	}

	/* See if a synthetic field variable has already been created */
	event_var = find_synthetic_field_var(target_hist_data, subsys_name,
					     event_name, field_name);
	if (!IS_ERR_OR_NULL(event_var))
		return event_var;

	var_hist = kzalloc(sizeof(*var_hist), GFP_KERNEL);
	if (!var_hist)
		return ERR_PTR(-ENOMEM);

	/* cmd is built up as a 'hist:keys=...:synthetic_x=x [if filter]' spec */
	cmd = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL);
	if (!cmd) {
		kfree(var_hist);
		return ERR_PTR(-ENOMEM);
	}

	/* Use the same keys as the compatible histogram */
	strcat(cmd, "keys=");

	for_each_hist_key_field(i, hist_data) {
		key_field = hist_data->fields[i];
		if (!first)
			strcat(cmd, ",");
		strcat(cmd, key_field->field->name);
		first = false;
	}

	/* Create the synthetic field variable specification */
	strcat(cmd, ":synthetic_");
	strcat(cmd, field_name);
	strcat(cmd, "=");
	strcat(cmd, field_name);

	/* Use the same filter as the compatible histogram */
	saved_filter = find_trigger_filter(hist_data, file);
	if (saved_filter) {
		strcat(cmd, " if ");
		strcat(cmd, saved_filter);
	}

	var_hist->cmd = kstrdup(cmd, GFP_KERNEL);
	if (!var_hist->cmd) {
		kfree(cmd);
		kfree(var_hist);
		return ERR_PTR(-ENOMEM);
	}

	/* Save the compatible histogram information */
	var_hist->hist_data = hist_data;

	/* Create the new histogram with our variable */
	ret = event_hist_trigger_func(&trigger_hist_cmd, file,
				      "", "hist", cmd);
	if (ret) {
		kfree(cmd);
		kfree(var_hist->cmd);
		kfree(var_hist);
		hist_err_event("onmatch: Couldn't create histogram for field: ",
			       subsys_name, event_name, field_name);
		return ERR_PTR(ret);
	}

	kfree(cmd);

	/* If we can't find the variable, something went wrong */
	event_var = find_synthetic_field_var(target_hist_data, subsys_name,
					     event_name, field_name);
	if (IS_ERR_OR_NULL(event_var)) {
		kfree(var_hist->cmd);
		kfree(var_hist);
		hist_err_event("onmatch: Couldn't find synthetic variable: ",
			       subsys_name, event_name, field_name);
		return ERR_PTR(-EINVAL);
	}

	n = target_hist_data->n_field_var_hists;
	target_hist_data->field_var_hists[n] = var_hist;
	target_hist_data->n_field_var_hists++;

	return event_var;
}
3043
3044static struct hist_field *
3045find_target_event_var(struct hist_trigger_data *hist_data,
3046 char *subsys_name, char *event_name, char *var_name)
3047{
3048 struct trace_event_file *file = hist_data->event_file;
3049 struct hist_field *hist_field = NULL;
3050
3051 if (subsys_name) {
3052 struct trace_event_call *call;
3053
3054 if (!event_name)
3055 return NULL;
3056
3057 call = file->event_call;
3058
3059 if (strcmp(subsys_name, call->class->system) != 0)
3060 return NULL;
3061
3062 if (strcmp(event_name, trace_event_name(call)) != 0)
3063 return NULL;
3064 }
3065
3066 hist_field = find_var_field(hist_data, var_name);
3067
3068 return hist_field;
3069}
3070
/*
 * Evaluate each field variable against the current event record and
 * store the result in the map element's variable slot.
 *
 * @field_var_str_start: index of the first per-element string buffer
 * this set of variables may use; string values are copied into those
 * buffers (consumed in order) so the stored u64 can safely point at
 * stable storage rather than the transient ring-buffer record.
 */
static inline void __update_field_vars(struct tracing_map_elt *elt,
				       struct ring_buffer_event *rbe,
				       void *rec,
				       struct field_var **field_vars,
				       unsigned int n_field_vars,
				       unsigned int field_var_str_start)
{
	struct hist_elt_data *elt_data = elt->private_data;
	unsigned int i, j, var_idx;
	u64 var_val;

	for (i = 0, j = field_var_str_start; i < n_field_vars; i++) {
		struct field_var *field_var = field_vars[i];
		struct hist_field *var = field_var->var;
		struct hist_field *val = field_var->val;

		/* fetch the field's current value from the record */
		var_val = val->fn(val, elt, rbe, rec);
		var_idx = var->var.idx;

		if (val->flags & HIST_FIELD_FL_STRING) {
			/* j advances only for string vars - order matters */
			char *str = elt_data->field_var_str[j++];
			char *val_str = (char *)(uintptr_t)var_val;

			strscpy(str, val_str, STR_VAR_LEN_MAX);
			var_val = (u64)(uintptr_t)str;
		}
		tracing_map_set_var(elt, var_idx, var_val);
	}
}
3100
/*
 * Update all ordinary field variables for this event; they use the
 * element's string buffers starting at index 0.
 */
static void update_field_vars(struct hist_trigger_data *hist_data,
			      struct tracing_map_elt *elt,
			      struct ring_buffer_event *rbe,
			      void *rec)
{
	__update_field_vars(elt, rbe, rec, hist_data->field_vars,
			    hist_data->n_field_vars, 0);
}
3109
/*
 * Update the onmax() save() variables; their string buffers start
 * after the ones reserved for ordinary field variables.
 */
static void update_max_vars(struct hist_trigger_data *hist_data,
			    struct tracing_map_elt *elt,
			    struct ring_buffer_event *rbe,
			    void *rec)
{
	__update_field_vars(elt, rbe, rec, hist_data->max_vars,
			    hist_data->n_max_vars, hist_data->n_field_var_str);
}
3118
3119static struct hist_field *create_var(struct hist_trigger_data *hist_data,
3120 struct trace_event_file *file,
3121 char *name, int size, const char *type)
3122{
3123 struct hist_field *var;
3124 int idx;
3125
3126 if (find_var(hist_data, file, name) && !hist_data->remove) {
3127 var = ERR_PTR(-EINVAL);
3128 goto out;
3129 }
3130
3131 var = kzalloc(sizeof(struct hist_field), GFP_KERNEL);
3132 if (!var) {
3133 var = ERR_PTR(-ENOMEM);
3134 goto out;
3135 }
3136
3137 idx = tracing_map_add_var(hist_data->map);
3138 if (idx < 0) {
3139 kfree(var);
3140 var = ERR_PTR(-EINVAL);
3141 goto out;
3142 }
3143
3144 var->flags = HIST_FIELD_FL_VAR;
3145 var->var.idx = idx;
3146 var->var.hist_data = var->hist_data = hist_data;
3147 var->size = size;
3148 var->var.name = kstrdup(name, GFP_KERNEL);
3149 var->type = kstrdup(type, GFP_KERNEL);
3150 if (!var->var.name || !var->type) {
3151 kfree(var->var.name);
3152 kfree(var->type);
3153 kfree(var);
3154 var = ERR_PTR(-ENOMEM);
3155 }
3156 out:
3157 return var;
3158}
3159
3160static struct field_var *create_field_var(struct hist_trigger_data *hist_data,
3161 struct trace_event_file *file,
3162 char *field_name)
3163{
3164 struct hist_field *val = NULL, *var = NULL;
3165 unsigned long flags = HIST_FIELD_FL_VAR;
3166 struct field_var *field_var;
477 int ret = 0; 3167 int ret = 0;
478 3168
479 if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX)) 3169 if (hist_data->n_field_vars >= SYNTH_FIELDS_MAX) {
3170 hist_err("Too many field variables defined: ", field_name);
3171 ret = -EINVAL;
3172 goto err;
3173 }
3174
3175 val = parse_atom(hist_data, file, field_name, &flags, NULL);
3176 if (IS_ERR(val)) {
3177 hist_err("Couldn't parse field variable: ", field_name);
3178 ret = PTR_ERR(val);
3179 goto err;
3180 }
3181
3182 var = create_var(hist_data, file, field_name, val->size, val->type);
3183 if (IS_ERR(var)) {
3184 hist_err("Couldn't create or find variable: ", field_name);
3185 kfree(val);
3186 ret = PTR_ERR(var);
3187 goto err;
3188 }
3189
3190 field_var = kzalloc(sizeof(struct field_var), GFP_KERNEL);
3191 if (!field_var) {
3192 kfree(val);
3193 kfree(var);
3194 ret = -ENOMEM;
3195 goto err;
3196 }
3197
3198 field_var->var = var;
3199 field_var->val = val;
3200 out:
3201 return field_var;
3202 err:
3203 field_var = ERR_PTR(ret);
3204 goto out;
3205}
3206
/**
 * create_target_field_var - Automatically create a variable for a field
 * @target_hist_data: The target hist trigger
 * @subsys_name: Optional subsystem name
 * @event_name: Optional event name
 * @var_name: The name of the field (and the resulting variable)
 *
 * Hist trigger actions fetch data from variables, not directly from
 * events. However, for convenience, users are allowed to directly
 * specify an event field in an action, which will be automatically
 * converted into a variable on their behalf.
 *
 * This function creates a field variable with the name var_name on
 * the hist trigger currently being defined on the target event. If
 * subsys_name and event_name are specified, this function simply
 * verifies that they do in fact match the target event subsystem and
 * event name.
 *
 * Return: The variable created for the field.
 */
static struct field_var *
create_target_field_var(struct hist_trigger_data *target_hist_data,
			char *subsys_name, char *event_name, char *var_name)
{
	struct trace_event_file *file = target_hist_data->event_file;

	if (subsys_name) {
		struct trace_event_call *call;

		if (!event_name)
			return NULL;

		call = file->event_call;

		/* a qualified name must match the target event exactly */
		if (strcmp(subsys_name, call->class->system) != 0)
			return NULL;

		if (strcmp(event_name, trace_event_name(call)) != 0)
			return NULL;
	}

	return create_field_var(target_hist_data, file, var_name);
}
3250
/*
 * Print the saved max and each save()d variable value for one map
 * element of an onmax() action.
 */
static void onmax_print(struct seq_file *m,
			struct hist_trigger_data *hist_data,
			struct tracing_map_elt *elt,
			struct action_data *data)
{
	unsigned int i, save_var_idx, max_idx = data->onmax.max_var->var.idx;

	seq_printf(m, "\n\tmax: %10llu", tracing_map_read_var(elt, max_idx));

	for (i = 0; i < hist_data->n_max_vars; i++) {
		struct hist_field *save_val = hist_data->max_vars[i]->val;
		struct hist_field *save_var = hist_data->max_vars[i]->var;
		u64 val;

		save_var_idx = save_var->var.idx;

		val = tracing_map_read_var(elt, save_var_idx);

		if (save_val->flags & HIST_FIELD_FL_STRING) {
			/* string vars store a pointer to the copied string */
			seq_printf(m, " %s: %-32s", save_var->var.name,
				   (char *)(uintptr_t)(val));
		} else
			seq_printf(m, " %s: %10llu", save_var->var.name, val);
	}
}
3276
3277static void onmax_save(struct hist_trigger_data *hist_data,
3278 struct tracing_map_elt *elt, void *rec,
3279 struct ring_buffer_event *rbe,
3280 struct action_data *data, u64 *var_ref_vals)
3281{
3282 unsigned int max_idx = data->onmax.max_var->var.idx;
3283 unsigned int max_var_ref_idx = data->onmax.max_var_ref_idx;
3284
3285 u64 var_val, max_val;
3286
3287 var_val = var_ref_vals[max_var_ref_idx];
3288 max_val = tracing_map_read_var(elt, max_idx);
3289
3290 if (var_val <= max_val)
3291 return;
3292
3293 tracing_map_set_var(elt, max_idx, var_val);
3294
3295 update_max_vars(hist_data, elt, rbe, rec);
3296}
3297
/* Tear down an onmax() action_data and everything it owns. */
static void onmax_destroy(struct action_data *data)
{
	unsigned int i;

	destroy_hist_field(data->onmax.max_var, 0);
	destroy_hist_field(data->onmax.var, 0);

	kfree(data->onmax.var_str);
	kfree(data->onmax.fn_name);

	for (i = 0; i < data->n_params; i++)
		kfree(data->params[i]);

	kfree(data);
}
3313
3314static int onmax_create(struct hist_trigger_data *hist_data,
3315 struct action_data *data)
3316{
3317 struct trace_event_file *file = hist_data->event_file;
3318 struct hist_field *var_field, *ref_field, *max_var;
3319 unsigned int var_ref_idx = hist_data->n_var_refs;
3320 struct field_var *field_var;
3321 char *onmax_var_str, *param;
3322 unsigned long flags;
3323 unsigned int i;
3324 int ret = 0;
3325
3326 onmax_var_str = data->onmax.var_str;
3327 if (onmax_var_str[0] != '$') {
3328 hist_err("onmax: For onmax(x), x must be a variable: ", onmax_var_str);
480 return -EINVAL; 3329 return -EINVAL;
3330 }
3331 onmax_var_str++;
481 3332
482 field_name = strsep(&field_str, "."); 3333 var_field = find_target_event_var(hist_data, NULL, NULL, onmax_var_str);
483 if (field_str) { 3334 if (!var_field) {
484 if (strcmp(field_str, "hex") == 0) 3335 hist_err("onmax: Couldn't find onmax variable: ", onmax_var_str);
485 flags |= HIST_FIELD_FL_HEX; 3336 return -EINVAL;
486 else { 3337 }
3338
3339 flags = HIST_FIELD_FL_VAR_REF;
3340 ref_field = create_hist_field(hist_data, NULL, flags, NULL);
3341 if (!ref_field)
3342 return -ENOMEM;
3343
3344 if (init_var_ref(ref_field, var_field, NULL, NULL)) {
3345 destroy_hist_field(ref_field, 0);
3346 ret = -ENOMEM;
3347 goto out;
3348 }
3349 hist_data->var_refs[hist_data->n_var_refs] = ref_field;
3350 ref_field->var_ref_idx = hist_data->n_var_refs++;
3351 data->onmax.var = ref_field;
3352
3353 data->fn = onmax_save;
3354 data->onmax.max_var_ref_idx = var_ref_idx;
3355 max_var = create_var(hist_data, file, "max", sizeof(u64), "u64");
3356 if (IS_ERR(max_var)) {
3357 hist_err("onmax: Couldn't create onmax variable: ", "max");
3358 ret = PTR_ERR(max_var);
3359 goto out;
3360 }
3361 data->onmax.max_var = max_var;
3362
3363 for (i = 0; i < data->n_params; i++) {
3364 param = kstrdup(data->params[i], GFP_KERNEL);
3365 if (!param) {
3366 ret = -ENOMEM;
3367 goto out;
3368 }
3369
3370 field_var = create_target_field_var(hist_data, NULL, NULL, param);
3371 if (IS_ERR(field_var)) {
3372 hist_err("onmax: Couldn't create field variable: ", param);
3373 ret = PTR_ERR(field_var);
3374 kfree(param);
3375 goto out;
3376 }
3377
3378 hist_data->max_vars[hist_data->n_max_vars++] = field_var;
3379 if (field_var->val->flags & HIST_FIELD_FL_STRING)
3380 hist_data->n_max_var_str++;
3381
3382 kfree(param);
3383 }
3384 out:
3385 return ret;
3386}
3387
3388static int parse_action_params(char *params, struct action_data *data)
3389{
3390 char *param, *saved_param;
3391 int ret = 0;
3392
3393 while (params) {
3394 if (data->n_params >= SYNTH_FIELDS_MAX)
3395 goto out;
3396
3397 param = strsep(&params, ",");
3398 if (!param) {
487 ret = -EINVAL; 3399 ret = -EINVAL;
488 goto out; 3400 goto out;
489 } 3401 }
3402
3403 param = strstrip(param);
3404 if (strlen(param) < 2) {
3405 hist_err("Invalid action param: ", param);
3406 ret = -EINVAL;
3407 goto out;
3408 }
3409
3410 saved_param = kstrdup(param, GFP_KERNEL);
3411 if (!saved_param) {
3412 ret = -ENOMEM;
3413 goto out;
3414 }
3415
3416 data->params[data->n_params++] = saved_param;
490 } 3417 }
3418 out:
3419 return ret;
3420}
491 3421
492 field = trace_find_event_field(file->event_call, field_name); 3422static struct action_data *onmax_parse(char *str)
493 if (!field || !field->size) { 3423{
3424 char *onmax_fn_name, *onmax_var_str;
3425 struct action_data *data;
3426 int ret = -EINVAL;
3427
3428 data = kzalloc(sizeof(*data), GFP_KERNEL);
3429 if (!data)
3430 return ERR_PTR(-ENOMEM);
3431
3432 onmax_var_str = strsep(&str, ")");
3433 if (!onmax_var_str || !str) {
494 ret = -EINVAL; 3434 ret = -EINVAL;
495 goto out; 3435 goto free;
3436 }
3437
3438 data->onmax.var_str = kstrdup(onmax_var_str, GFP_KERNEL);
3439 if (!data->onmax.var_str) {
3440 ret = -ENOMEM;
3441 goto free;
3442 }
3443
3444 strsep(&str, ".");
3445 if (!str)
3446 goto free;
3447
3448 onmax_fn_name = strsep(&str, "(");
3449 if (!onmax_fn_name || !str)
3450 goto free;
3451
3452 if (strncmp(onmax_fn_name, "save", strlen("save")) == 0) {
3453 char *params = strsep(&str, ")");
3454
3455 if (!params) {
3456 ret = -EINVAL;
3457 goto free;
3458 }
3459
3460 ret = parse_action_params(params, data);
3461 if (ret)
3462 goto free;
3463 } else
3464 goto free;
3465
3466 data->onmax.fn_name = kstrdup(onmax_fn_name, GFP_KERNEL);
3467 if (!data->onmax.fn_name) {
3468 ret = -ENOMEM;
3469 goto free;
3470 }
3471 out:
3472 return data;
3473 free:
3474 onmax_destroy(data);
3475 data = ERR_PTR(ret);
3476 goto out;
3477}
3478
/*
 * Tear down an onmatch() action_data.  Runs under synth_event_mutex
 * because it drops the reference taken on the synthetic event.
 */
static void onmatch_destroy(struct action_data *data)
{
	unsigned int i;

	mutex_lock(&synth_event_mutex);

	kfree(data->onmatch.match_event);
	kfree(data->onmatch.match_event_system);
	kfree(data->onmatch.synth_event_name);

	for (i = 0; i < data->n_params; i++)
		kfree(data->params[i]);

	/* synth_event may be NULL if parsing failed before onmatch_create() */
	if (data->onmatch.synth_event)
		data->onmatch.synth_event->ref--;

	kfree(data);

	mutex_unlock(&synth_event_mutex);
}
3499
3500static void destroy_field_var(struct field_var *field_var)
3501{
3502 if (!field_var)
3503 return;
3504
3505 destroy_hist_field(field_var->var, 0);
3506 destroy_hist_field(field_var->val, 0);
3507
3508 kfree(field_var);
3509}
3510
3511static void destroy_field_vars(struct hist_trigger_data *hist_data)
3512{
3513 unsigned int i;
3514
3515 for (i = 0; i < hist_data->n_field_vars; i++)
3516 destroy_field_var(hist_data->field_vars[i]);
3517}
3518
/*
 * Record a newly created field variable on the trigger; string-typed
 * variables also reserve a per-element string buffer slot.
 */
static void save_field_var(struct hist_trigger_data *hist_data,
			   struct field_var *field_var)
{
	hist_data->field_vars[hist_data->n_field_vars++] = field_var;

	if (field_var->val->flags & HIST_FIELD_FL_STRING)
		hist_data->n_field_var_str++;
}
3527
3528
3529static void destroy_synth_var_refs(struct hist_trigger_data *hist_data)
3530{
3531 unsigned int i;
3532
3533 for (i = 0; i < hist_data->n_synth_var_refs; i++)
3534 destroy_hist_field(hist_data->synth_var_refs[i], 0);
3535}
3536
/*
 * Record a synthetic-event parameter var ref, registering it both in
 * the synth-specific list and the trigger's general var_refs[] array
 * (which assigns its resolution index).
 */
static void save_synth_var_ref(struct hist_trigger_data *hist_data,
			       struct hist_field *var_ref)
{
	hist_data->synth_var_refs[hist_data->n_synth_var_refs++] = var_ref;

	hist_data->var_refs[hist_data->n_var_refs] = var_ref;
	var_ref->var_ref_idx = hist_data->n_var_refs++;
}
3545
3546static int check_synth_field(struct synth_event *event,
3547 struct hist_field *hist_field,
3548 unsigned int field_pos)
3549{
3550 struct synth_field *field;
3551
3552 if (field_pos >= event->n_fields)
3553 return -EINVAL;
3554
3555 field = event->fields[field_pos];
3556
3557 if (strcmp(field->type, hist_field->type) != 0)
3558 return -EINVAL;
3559
3560 return 0;
3561}
3562
/*
 * Resolve a '$var' onmatch() parameter: first look on the target
 * event's trigger, then fall back to the match event (or the given
 * system.event) via the cross-event lookup.
 */
static struct hist_field *
onmatch_find_var(struct hist_trigger_data *hist_data, struct action_data *data,
		 char *system, char *event, char *var)
{
	struct hist_field *hist_field;

	var++; /* skip '$' */

	hist_field = find_target_event_var(hist_data, system, event, var);
	if (!hist_field) {
		/* unqualified names default to the onmatch() event */
		if (!system) {
			system = data->onmatch.match_event_system;
			event = data->onmatch.match_event;
		}

		hist_field = find_event_var(hist_data, system, event, var);
	}

	if (!hist_field)
		hist_err_event("onmatch: Couldn't find onmatch param: $", system, event, var);

	return hist_field;
}
3586
/*
 * Resolve a bare field-name onmatch() parameter by creating a variable
 * for it - on the target event if possible, otherwise via a whole new
 * hist trigger on the other event.  Returns the variable's hist_field
 * or NULL on failure.
 */
static struct hist_field *
onmatch_create_field_var(struct hist_trigger_data *hist_data,
			 struct action_data *data, char *system,
			 char *event, char *var)
{
	struct hist_field *hist_field = NULL;
	struct field_var *field_var;

	/*
	 * First try to create a field var on the target event (the
	 * currently being defined). This will create a variable for
	 * unqualified fields on the target event, or if qualified,
	 * target fields that have qualified names matching the target.
	 */
	field_var = create_target_field_var(hist_data, system, event, var);

	if (field_var && !IS_ERR(field_var)) {
		save_field_var(hist_data, field_var);
		hist_field = field_var->var;
	} else {
		field_var = NULL;
		/*
		 * If no explicit system.event is specfied, default to
		 * looking for fields on the onmatch(system.event.xxx)
		 * event.
		 */
		if (!system) {
			system = data->onmatch.match_event_system;
			event = data->onmatch.match_event;
		}

		/*
		 * At this point, we're looking at a field on another
		 * event. Because we can't modify a hist trigger on
		 * another event to add a variable for a field, we need
		 * to create a new trigger on that event and create the
		 * variable at the same time.
		 */
		hist_field = create_field_var_hist(hist_data, system, event, var);
		if (IS_ERR(hist_field))
			goto free;
	}
 out:
	return hist_field;
 free:
	destroy_field_var(field_var);
	hist_field = NULL;
	goto out;
}
3636
/*
 * Wire up a parsed onmatch() action: resolve the named synthetic
 * event (taking a reference on it), then resolve or create a variable
 * reference for each parameter and type-check it against the
 * corresponding synthetic event field.  The reference is dropped on
 * any error.
 */
static int onmatch_create(struct hist_trigger_data *hist_data,
			  struct trace_event_file *file,
			  struct action_data *data)
{
	char *event_name, *param, *system = NULL;
	struct hist_field *hist_field, *var_ref;
	unsigned int i, var_ref_idx;
	unsigned int field_pos = 0;
	struct synth_event *event;
	int ret = 0;

	mutex_lock(&synth_event_mutex);
	event = find_synth_event(data->onmatch.synth_event_name);
	if (!event) {
		hist_err("onmatch: Couldn't find synthetic event: ", data->onmatch.synth_event_name);
		mutex_unlock(&synth_event_mutex);
		return -EINVAL;
	}
	/* pin the synth event so it can't be removed while in use */
	event->ref++;
	mutex_unlock(&synth_event_mutex);

	var_ref_idx = hist_data->n_var_refs;

	for (i = 0; i < data->n_params; i++) {
		char *p;

		/* work on a copy; strsep() below modifies the string */
		p = param = kstrdup(data->params[i], GFP_KERNEL);
		if (!param) {
			ret = -ENOMEM;
			goto err;
		}

		/* split optional [system.[event.]]name qualification */
		system = strsep(&param, ".");
		if (!param) {
			param = (char *)system;
			system = event_name = NULL;
		} else {
			event_name = strsep(&param, ".");
			if (!param) {
				kfree(p);
				ret = -EINVAL;
				goto err;
			}
		}

		if (param[0] == '$')
			hist_field = onmatch_find_var(hist_data, data, system,
						      event_name, param);
		else
			hist_field = onmatch_create_field_var(hist_data, data,
							      system,
							      event_name,
							      param);

		if (!hist_field) {
			kfree(p);
			ret = -EINVAL;
			goto err;
		}

		if (check_synth_field(event, hist_field, field_pos) == 0) {
			var_ref = create_var_ref(hist_field, system, event_name);
			if (!var_ref) {
				kfree(p);
				ret = -ENOMEM;
				goto err;
			}

			save_synth_var_ref(hist_data, var_ref);
			field_pos++;
			kfree(p);
			continue;
		}

		hist_err_event("onmatch: Param type doesn't match synthetic event field type: ",
			       system, event_name, param);
		kfree(p);
		ret = -EINVAL;
		goto err;
	}

	if (field_pos != event->n_fields) {
		hist_err("onmatch: Param count doesn't match synthetic event field count: ", event->name);
		ret = -EINVAL;
		goto err;
	}

	data->fn = action_trace;
	data->onmatch.synth_event = event;
	data->onmatch.var_ref_idx = var_ref_idx;
 out:
	return ret;
 err:
	mutex_lock(&synth_event_mutex);
	event->ref--;
	mutex_unlock(&synth_event_mutex);

	goto out;
}
3736
3737static struct action_data *onmatch_parse(struct trace_array *tr, char *str)
3738{
3739 char *match_event, *match_event_system;
3740 char *synth_event_name, *params;
3741 struct action_data *data;
3742 int ret = -EINVAL;
3743
3744 data = kzalloc(sizeof(*data), GFP_KERNEL);
3745 if (!data)
3746 return ERR_PTR(-ENOMEM);
3747
3748 match_event = strsep(&str, ")");
3749 if (!match_event || !str) {
3750 hist_err("onmatch: Missing closing paren: ", match_event);
3751 goto free;
3752 }
3753
3754 match_event_system = strsep(&match_event, ".");
3755 if (!match_event) {
3756 hist_err("onmatch: Missing subsystem for match event: ", match_event_system);
3757 goto free;
3758 }
3759
3760 if (IS_ERR(event_file(tr, match_event_system, match_event))) {
3761 hist_err_event("onmatch: Invalid subsystem or event name: ",
3762 match_event_system, match_event, NULL);
3763 goto free;
3764 }
3765
3766 data->onmatch.match_event = kstrdup(match_event, GFP_KERNEL);
3767 if (!data->onmatch.match_event) {
3768 ret = -ENOMEM;
3769 goto free;
3770 }
3771
3772 data->onmatch.match_event_system = kstrdup(match_event_system, GFP_KERNEL);
3773 if (!data->onmatch.match_event_system) {
3774 ret = -ENOMEM;
3775 goto free;
3776 }
3777
3778 strsep(&str, ".");
3779 if (!str) {
3780 hist_err("onmatch: Missing . after onmatch(): ", str);
3781 goto free;
3782 }
3783
3784 synth_event_name = strsep(&str, "(");
3785 if (!synth_event_name || !str) {
3786 hist_err("onmatch: Missing opening paramlist paren: ", synth_event_name);
3787 goto free;
496 } 3788 }
497 3789
498 hist_data->fields[val_idx] = create_hist_field(field, flags); 3790 data->onmatch.synth_event_name = kstrdup(synth_event_name, GFP_KERNEL);
499 if (!hist_data->fields[val_idx]) { 3791 if (!data->onmatch.synth_event_name) {
500 ret = -ENOMEM; 3792 ret = -ENOMEM;
3793 goto free;
3794 }
3795
3796 params = strsep(&str, ")");
3797 if (!params || !str || (str && strlen(str))) {
3798 hist_err("onmatch: Missing closing paramlist paren: ", params);
3799 goto free;
3800 }
3801
3802 ret = parse_action_params(params, data);
3803 if (ret)
3804 goto free;
3805 out:
3806 return data;
3807 free:
3808 onmatch_destroy(data);
3809 data = ERR_PTR(ret);
3810 goto out;
3811}
3812
3813static int create_hitcount_val(struct hist_trigger_data *hist_data)
3814{
3815 hist_data->fields[HITCOUNT_IDX] =
3816 create_hist_field(hist_data, NULL, HIST_FIELD_FL_HITCOUNT, NULL);
3817 if (!hist_data->fields[HITCOUNT_IDX])
3818 return -ENOMEM;
3819
3820 hist_data->n_vals++;
3821 hist_data->n_fields++;
3822
3823 if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX))
3824 return -EINVAL;
3825
3826 return 0;
3827}
3828
3829static int __create_val_field(struct hist_trigger_data *hist_data,
3830 unsigned int val_idx,
3831 struct trace_event_file *file,
3832 char *var_name, char *field_str,
3833 unsigned long flags)
3834{
3835 struct hist_field *hist_field;
3836 int ret = 0;
3837
3838 hist_field = parse_expr(hist_data, file, field_str, flags, var_name, 0);
3839 if (IS_ERR(hist_field)) {
3840 ret = PTR_ERR(hist_field);
501 goto out; 3841 goto out;
502 } 3842 }
503 3843
3844 hist_data->fields[val_idx] = hist_field;
3845
504 ++hist_data->n_vals; 3846 ++hist_data->n_vals;
3847 ++hist_data->n_fields;
505 3848
506 if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX)) 3849 if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX))
507 ret = -EINVAL; 3850 ret = -EINVAL;
508 out: 3851 out:
509 return ret; 3852 return ret;
510} 3853}
511 3854
/* Create an ordinary (non-variable) value field at @val_idx. */
static int create_val_field(struct hist_trigger_data *hist_data,
			    unsigned int val_idx,
			    struct trace_event_file *file,
			    char *field_str)
{
	if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX))
		return -EINVAL;

	return __create_val_field(hist_data, val_idx, file, NULL, field_str, 0);
}
3865
/*
 * Create a named variable field (var_name=expr) at @val_idx.
 * Redefinition is rejected unless the trigger is being removed.
 */
static int create_var_field(struct hist_trigger_data *hist_data,
			    unsigned int val_idx,
			    struct trace_event_file *file,
			    char *var_name, char *expr_str)
{
	unsigned long flags = 0;

	if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX))
		return -EINVAL;

	if (find_var(hist_data, file, var_name) && !hist_data->remove) {
		hist_err("Variable already defined: ", var_name);
		return -EINVAL;
	}

	flags |= HIST_FIELD_FL_VAR;
	hist_data->n_vars++;
	if (WARN_ON(hist_data->n_vars > TRACING_MAP_VARS_MAX))
		return -EINVAL;

	return __create_val_field(hist_data, val_idx, file, var_name, expr_str, flags);
}
3888
512static int create_val_fields(struct hist_trigger_data *hist_data, 3889static int create_val_fields(struct hist_trigger_data *hist_data,
513 struct trace_event_file *file) 3890 struct trace_event_file *file)
514{ 3891{
515 char *fields_str, *field_str; 3892 char *fields_str, *field_str;
516 unsigned int i, j; 3893 unsigned int i, j = 1;
517 int ret; 3894 int ret;
518 3895
519 ret = create_hitcount_val(hist_data); 3896 ret = create_hitcount_val(hist_data);
@@ -533,12 +3910,15 @@ static int create_val_fields(struct hist_trigger_data *hist_data,
533 field_str = strsep(&fields_str, ","); 3910 field_str = strsep(&fields_str, ",");
534 if (!field_str) 3911 if (!field_str)
535 break; 3912 break;
3913
536 if (strcmp(field_str, "hitcount") == 0) 3914 if (strcmp(field_str, "hitcount") == 0)
537 continue; 3915 continue;
3916
538 ret = create_val_field(hist_data, j++, file, field_str); 3917 ret = create_val_field(hist_data, j++, file, field_str);
539 if (ret) 3918 if (ret)
540 goto out; 3919 goto out;
541 } 3920 }
3921
542 if (fields_str && (strcmp(fields_str, "hitcount") != 0)) 3922 if (fields_str && (strcmp(fields_str, "hitcount") != 0))
543 ret = -EINVAL; 3923 ret = -EINVAL;
544 out: 3924 out:
@@ -551,12 +3931,13 @@ static int create_key_field(struct hist_trigger_data *hist_data,
551 struct trace_event_file *file, 3931 struct trace_event_file *file,
552 char *field_str) 3932 char *field_str)
553{ 3933{
554 struct ftrace_event_field *field = NULL; 3934 struct hist_field *hist_field = NULL;
3935
555 unsigned long flags = 0; 3936 unsigned long flags = 0;
556 unsigned int key_size; 3937 unsigned int key_size;
557 int ret = 0; 3938 int ret = 0;
558 3939
559 if (WARN_ON(key_idx >= TRACING_MAP_FIELDS_MAX)) 3940 if (WARN_ON(key_idx >= HIST_FIELDS_MAX))
560 return -EINVAL; 3941 return -EINVAL;
561 3942
562 flags |= HIST_FIELD_FL_KEY; 3943 flags |= HIST_FIELD_FL_KEY;
@@ -564,57 +3945,40 @@ static int create_key_field(struct hist_trigger_data *hist_data,
564 if (strcmp(field_str, "stacktrace") == 0) { 3945 if (strcmp(field_str, "stacktrace") == 0) {
565 flags |= HIST_FIELD_FL_STACKTRACE; 3946 flags |= HIST_FIELD_FL_STACKTRACE;
566 key_size = sizeof(unsigned long) * HIST_STACKTRACE_DEPTH; 3947 key_size = sizeof(unsigned long) * HIST_STACKTRACE_DEPTH;
3948 hist_field = create_hist_field(hist_data, NULL, flags, NULL);
567 } else { 3949 } else {
568 char *field_name = strsep(&field_str, "."); 3950 hist_field = parse_expr(hist_data, file, field_str, flags,
569 3951 NULL, 0);
570 if (field_str) { 3952 if (IS_ERR(hist_field)) {
571 if (strcmp(field_str, "hex") == 0) 3953 ret = PTR_ERR(hist_field);
572 flags |= HIST_FIELD_FL_HEX; 3954 goto out;
573 else if (strcmp(field_str, "sym") == 0)
574 flags |= HIST_FIELD_FL_SYM;
575 else if (strcmp(field_str, "sym-offset") == 0)
576 flags |= HIST_FIELD_FL_SYM_OFFSET;
577 else if ((strcmp(field_str, "execname") == 0) &&
578 (strcmp(field_name, "common_pid") == 0))
579 flags |= HIST_FIELD_FL_EXECNAME;
580 else if (strcmp(field_str, "syscall") == 0)
581 flags |= HIST_FIELD_FL_SYSCALL;
582 else if (strcmp(field_str, "log2") == 0)
583 flags |= HIST_FIELD_FL_LOG2;
584 else {
585 ret = -EINVAL;
586 goto out;
587 }
588 } 3955 }
589 3956
590 field = trace_find_event_field(file->event_call, field_name); 3957 if (hist_field->flags & HIST_FIELD_FL_VAR_REF) {
591 if (!field || !field->size) { 3958 hist_err("Using variable references as keys not supported: ", field_str);
3959 destroy_hist_field(hist_field, 0);
592 ret = -EINVAL; 3960 ret = -EINVAL;
593 goto out; 3961 goto out;
594 } 3962 }
595 3963
596 if (is_string_field(field)) 3964 key_size = hist_field->size;
597 key_size = MAX_FILTER_STR_VAL;
598 else
599 key_size = field->size;
600 } 3965 }
601 3966
602 hist_data->fields[key_idx] = create_hist_field(field, flags); 3967 hist_data->fields[key_idx] = hist_field;
603 if (!hist_data->fields[key_idx]) {
604 ret = -ENOMEM;
605 goto out;
606 }
607 3968
608 key_size = ALIGN(key_size, sizeof(u64)); 3969 key_size = ALIGN(key_size, sizeof(u64));
609 hist_data->fields[key_idx]->size = key_size; 3970 hist_data->fields[key_idx]->size = key_size;
610 hist_data->fields[key_idx]->offset = key_offset; 3971 hist_data->fields[key_idx]->offset = key_offset;
3972
611 hist_data->key_size += key_size; 3973 hist_data->key_size += key_size;
3974
612 if (hist_data->key_size > HIST_KEY_SIZE_MAX) { 3975 if (hist_data->key_size > HIST_KEY_SIZE_MAX) {
613 ret = -EINVAL; 3976 ret = -EINVAL;
614 goto out; 3977 goto out;
615 } 3978 }
616 3979
617 hist_data->n_keys++; 3980 hist_data->n_keys++;
3981 hist_data->n_fields++;
618 3982
619 if (WARN_ON(hist_data->n_keys > TRACING_MAP_KEYS_MAX)) 3983 if (WARN_ON(hist_data->n_keys > TRACING_MAP_KEYS_MAX))
620 return -EINVAL; 3984 return -EINVAL;
@@ -658,21 +4022,113 @@ static int create_key_fields(struct hist_trigger_data *hist_data,
658 return ret; 4022 return ret;
659} 4023}
660 4024
4025static int create_var_fields(struct hist_trigger_data *hist_data,
4026 struct trace_event_file *file)
4027{
4028 unsigned int i, j = hist_data->n_vals;
4029 int ret = 0;
4030
4031 unsigned int n_vars = hist_data->attrs->var_defs.n_vars;
4032
4033 for (i = 0; i < n_vars; i++) {
4034 char *var_name = hist_data->attrs->var_defs.name[i];
4035 char *expr = hist_data->attrs->var_defs.expr[i];
4036
4037 ret = create_var_field(hist_data, j++, file, var_name, expr);
4038 if (ret)
4039 goto out;
4040 }
4041 out:
4042 return ret;
4043}
4044
4045static void free_var_defs(struct hist_trigger_data *hist_data)
4046{
4047 unsigned int i;
4048
4049 for (i = 0; i < hist_data->attrs->var_defs.n_vars; i++) {
4050 kfree(hist_data->attrs->var_defs.name[i]);
4051 kfree(hist_data->attrs->var_defs.expr[i]);
4052 }
4053
4054 hist_data->attrs->var_defs.n_vars = 0;
4055}
4056
4057static int parse_var_defs(struct hist_trigger_data *hist_data)
4058{
4059 char *s, *str, *var_name, *field_str;
4060 unsigned int i, j, n_vars = 0;
4061 int ret = 0;
4062
4063 for (i = 0; i < hist_data->attrs->n_assignments; i++) {
4064 str = hist_data->attrs->assignment_str[i];
4065 for (j = 0; j < TRACING_MAP_VARS_MAX; j++) {
4066 field_str = strsep(&str, ",");
4067 if (!field_str)
4068 break;
4069
4070 var_name = strsep(&field_str, "=");
4071 if (!var_name || !field_str) {
4072 hist_err("Malformed assignment: ", var_name);
4073 ret = -EINVAL;
4074 goto free;
4075 }
4076
4077 if (n_vars == TRACING_MAP_VARS_MAX) {
4078 hist_err("Too many variables defined: ", var_name);
4079 ret = -EINVAL;
4080 goto free;
4081 }
4082
4083 s = kstrdup(var_name, GFP_KERNEL);
4084 if (!s) {
4085 ret = -ENOMEM;
4086 goto free;
4087 }
4088 hist_data->attrs->var_defs.name[n_vars] = s;
4089
4090 s = kstrdup(field_str, GFP_KERNEL);
4091 if (!s) {
4092 kfree(hist_data->attrs->var_defs.name[n_vars]);
4093 ret = -ENOMEM;
4094 goto free;
4095 }
4096 hist_data->attrs->var_defs.expr[n_vars++] = s;
4097
4098 hist_data->attrs->var_defs.n_vars = n_vars;
4099 }
4100 }
4101
4102 return ret;
4103 free:
4104 free_var_defs(hist_data);
4105
4106 return ret;
4107}
4108
661static int create_hist_fields(struct hist_trigger_data *hist_data, 4109static int create_hist_fields(struct hist_trigger_data *hist_data,
662 struct trace_event_file *file) 4110 struct trace_event_file *file)
663{ 4111{
664 int ret; 4112 int ret;
665 4113
4114 ret = parse_var_defs(hist_data);
4115 if (ret)
4116 goto out;
4117
666 ret = create_val_fields(hist_data, file); 4118 ret = create_val_fields(hist_data, file);
667 if (ret) 4119 if (ret)
668 goto out; 4120 goto out;
669 4121
670 ret = create_key_fields(hist_data, file); 4122 ret = create_var_fields(hist_data, file);
671 if (ret) 4123 if (ret)
672 goto out; 4124 goto out;
673 4125
674 hist_data->n_fields = hist_data->n_vals + hist_data->n_keys; 4126 ret = create_key_fields(hist_data, file);
4127 if (ret)
4128 goto out;
675 out: 4129 out:
4130 free_var_defs(hist_data);
4131
676 return ret; 4132 return ret;
677} 4133}
678 4134
@@ -695,7 +4151,7 @@ static int create_sort_keys(struct hist_trigger_data *hist_data)
695 char *fields_str = hist_data->attrs->sort_key_str; 4151 char *fields_str = hist_data->attrs->sort_key_str;
696 struct tracing_map_sort_key *sort_key; 4152 struct tracing_map_sort_key *sort_key;
697 int descending, ret = 0; 4153 int descending, ret = 0;
698 unsigned int i, j; 4154 unsigned int i, j, k;
699 4155
700 hist_data->n_sort_keys = 1; /* we always have at least one, hitcount */ 4156 hist_data->n_sort_keys = 1; /* we always have at least one, hitcount */
701 4157
@@ -743,12 +4199,19 @@ static int create_sort_keys(struct hist_trigger_data *hist_data)
743 continue; 4199 continue;
744 } 4200 }
745 4201
746 for (j = 1; j < hist_data->n_fields; j++) { 4202 for (j = 1, k = 1; j < hist_data->n_fields; j++) {
4203 unsigned int idx;
4204
747 hist_field = hist_data->fields[j]; 4205 hist_field = hist_data->fields[j];
4206 if (hist_field->flags & HIST_FIELD_FL_VAR)
4207 continue;
4208
4209 idx = k++;
4210
748 test_name = hist_field_name(hist_field, 0); 4211 test_name = hist_field_name(hist_field, 0);
749 4212
750 if (strcmp(field_name, test_name) == 0) { 4213 if (strcmp(field_name, test_name) == 0) {
751 sort_key->field_idx = j; 4214 sort_key->field_idx = idx;
752 descending = is_descending(field_str); 4215 descending = is_descending(field_str);
753 if (descending < 0) { 4216 if (descending < 0) {
754 ret = descending; 4217 ret = descending;
@@ -763,16 +4226,230 @@ static int create_sort_keys(struct hist_trigger_data *hist_data)
763 break; 4226 break;
764 } 4227 }
765 } 4228 }
4229
766 hist_data->n_sort_keys = i; 4230 hist_data->n_sort_keys = i;
767 out: 4231 out:
768 return ret; 4232 return ret;
769} 4233}
770 4234
4235static void destroy_actions(struct hist_trigger_data *hist_data)
4236{
4237 unsigned int i;
4238
4239 for (i = 0; i < hist_data->n_actions; i++) {
4240 struct action_data *data = hist_data->actions[i];
4241
4242 if (data->fn == action_trace)
4243 onmatch_destroy(data);
4244 else if (data->fn == onmax_save)
4245 onmax_destroy(data);
4246 else
4247 kfree(data);
4248 }
4249}
4250
4251static int parse_actions(struct hist_trigger_data *hist_data)
4252{
4253 struct trace_array *tr = hist_data->event_file->tr;
4254 struct action_data *data;
4255 unsigned int i;
4256 int ret = 0;
4257 char *str;
4258
4259 for (i = 0; i < hist_data->attrs->n_actions; i++) {
4260 str = hist_data->attrs->action_str[i];
4261
4262 if (strncmp(str, "onmatch(", strlen("onmatch(")) == 0) {
4263 char *action_str = str + strlen("onmatch(");
4264
4265 data = onmatch_parse(tr, action_str);
4266 if (IS_ERR(data)) {
4267 ret = PTR_ERR(data);
4268 break;
4269 }
4270 data->fn = action_trace;
4271 } else if (strncmp(str, "onmax(", strlen("onmax(")) == 0) {
4272 char *action_str = str + strlen("onmax(");
4273
4274 data = onmax_parse(action_str);
4275 if (IS_ERR(data)) {
4276 ret = PTR_ERR(data);
4277 break;
4278 }
4279 data->fn = onmax_save;
4280 } else {
4281 ret = -EINVAL;
4282 break;
4283 }
4284
4285 hist_data->actions[hist_data->n_actions++] = data;
4286 }
4287
4288 return ret;
4289}
4290
4291static int create_actions(struct hist_trigger_data *hist_data,
4292 struct trace_event_file *file)
4293{
4294 struct action_data *data;
4295 unsigned int i;
4296 int ret = 0;
4297
4298 for (i = 0; i < hist_data->attrs->n_actions; i++) {
4299 data = hist_data->actions[i];
4300
4301 if (data->fn == action_trace) {
4302 ret = onmatch_create(hist_data, file, data);
4303 if (ret)
4304 return ret;
4305 } else if (data->fn == onmax_save) {
4306 ret = onmax_create(hist_data, data);
4307 if (ret)
4308 return ret;
4309 }
4310 }
4311
4312 return ret;
4313}
4314
4315static void print_actions(struct seq_file *m,
4316 struct hist_trigger_data *hist_data,
4317 struct tracing_map_elt *elt)
4318{
4319 unsigned int i;
4320
4321 for (i = 0; i < hist_data->n_actions; i++) {
4322 struct action_data *data = hist_data->actions[i];
4323
4324 if (data->fn == onmax_save)
4325 onmax_print(m, hist_data, elt, data);
4326 }
4327}
4328
4329static void print_onmax_spec(struct seq_file *m,
4330 struct hist_trigger_data *hist_data,
4331 struct action_data *data)
4332{
4333 unsigned int i;
4334
4335 seq_puts(m, ":onmax(");
4336 seq_printf(m, "%s", data->onmax.var_str);
4337 seq_printf(m, ").%s(", data->onmax.fn_name);
4338
4339 for (i = 0; i < hist_data->n_max_vars; i++) {
4340 seq_printf(m, "%s", hist_data->max_vars[i]->var->var.name);
4341 if (i < hist_data->n_max_vars - 1)
4342 seq_puts(m, ",");
4343 }
4344 seq_puts(m, ")");
4345}
4346
4347static void print_onmatch_spec(struct seq_file *m,
4348 struct hist_trigger_data *hist_data,
4349 struct action_data *data)
4350{
4351 unsigned int i;
4352
4353 seq_printf(m, ":onmatch(%s.%s).", data->onmatch.match_event_system,
4354 data->onmatch.match_event);
4355
4356 seq_printf(m, "%s(", data->onmatch.synth_event->name);
4357
4358 for (i = 0; i < data->n_params; i++) {
4359 if (i)
4360 seq_puts(m, ",");
4361 seq_printf(m, "%s", data->params[i]);
4362 }
4363
4364 seq_puts(m, ")");
4365}
4366
4367static bool actions_match(struct hist_trigger_data *hist_data,
4368 struct hist_trigger_data *hist_data_test)
4369{
4370 unsigned int i, j;
4371
4372 if (hist_data->n_actions != hist_data_test->n_actions)
4373 return false;
4374
4375 for (i = 0; i < hist_data->n_actions; i++) {
4376 struct action_data *data = hist_data->actions[i];
4377 struct action_data *data_test = hist_data_test->actions[i];
4378
4379 if (data->fn != data_test->fn)
4380 return false;
4381
4382 if (data->n_params != data_test->n_params)
4383 return false;
4384
4385 for (j = 0; j < data->n_params; j++) {
4386 if (strcmp(data->params[j], data_test->params[j]) != 0)
4387 return false;
4388 }
4389
4390 if (data->fn == action_trace) {
4391 if (strcmp(data->onmatch.synth_event_name,
4392 data_test->onmatch.synth_event_name) != 0)
4393 return false;
4394 if (strcmp(data->onmatch.match_event_system,
4395 data_test->onmatch.match_event_system) != 0)
4396 return false;
4397 if (strcmp(data->onmatch.match_event,
4398 data_test->onmatch.match_event) != 0)
4399 return false;
4400 } else if (data->fn == onmax_save) {
4401 if (strcmp(data->onmax.var_str,
4402 data_test->onmax.var_str) != 0)
4403 return false;
4404 if (strcmp(data->onmax.fn_name,
4405 data_test->onmax.fn_name) != 0)
4406 return false;
4407 }
4408 }
4409
4410 return true;
4411}
4412
4413
4414static void print_actions_spec(struct seq_file *m,
4415 struct hist_trigger_data *hist_data)
4416{
4417 unsigned int i;
4418
4419 for (i = 0; i < hist_data->n_actions; i++) {
4420 struct action_data *data = hist_data->actions[i];
4421
4422 if (data->fn == action_trace)
4423 print_onmatch_spec(m, hist_data, data);
4424 else if (data->fn == onmax_save)
4425 print_onmax_spec(m, hist_data, data);
4426 }
4427}
4428
4429static void destroy_field_var_hists(struct hist_trigger_data *hist_data)
4430{
4431 unsigned int i;
4432
4433 for (i = 0; i < hist_data->n_field_var_hists; i++) {
4434 kfree(hist_data->field_var_hists[i]->cmd);
4435 kfree(hist_data->field_var_hists[i]);
4436 }
4437}
4438
771static void destroy_hist_data(struct hist_trigger_data *hist_data) 4439static void destroy_hist_data(struct hist_trigger_data *hist_data)
772{ 4440{
4441 if (!hist_data)
4442 return;
4443
773 destroy_hist_trigger_attrs(hist_data->attrs); 4444 destroy_hist_trigger_attrs(hist_data->attrs);
774 destroy_hist_fields(hist_data); 4445 destroy_hist_fields(hist_data);
775 tracing_map_destroy(hist_data->map); 4446 tracing_map_destroy(hist_data->map);
4447
4448 destroy_actions(hist_data);
4449 destroy_field_vars(hist_data);
4450 destroy_field_var_hists(hist_data);
4451 destroy_synth_var_refs(hist_data);
4452
776 kfree(hist_data); 4453 kfree(hist_data);
777} 4454}
778 4455
@@ -781,7 +4458,7 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data)
781 struct tracing_map *map = hist_data->map; 4458 struct tracing_map *map = hist_data->map;
782 struct ftrace_event_field *field; 4459 struct ftrace_event_field *field;
783 struct hist_field *hist_field; 4460 struct hist_field *hist_field;
784 int i, idx; 4461 int i, idx = 0;
785 4462
786 for_each_hist_field(i, hist_data) { 4463 for_each_hist_field(i, hist_data) {
787 hist_field = hist_data->fields[i]; 4464 hist_field = hist_data->fields[i];
@@ -792,6 +4469,9 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data)
792 4469
793 if (hist_field->flags & HIST_FIELD_FL_STACKTRACE) 4470 if (hist_field->flags & HIST_FIELD_FL_STACKTRACE)
794 cmp_fn = tracing_map_cmp_none; 4471 cmp_fn = tracing_map_cmp_none;
4472 else if (!field)
4473 cmp_fn = tracing_map_cmp_num(hist_field->size,
4474 hist_field->is_signed);
795 else if (is_string_field(field)) 4475 else if (is_string_field(field))
796 cmp_fn = tracing_map_cmp_string; 4476 cmp_fn = tracing_map_cmp_string;
797 else 4477 else
@@ -800,36 +4480,29 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data)
800 idx = tracing_map_add_key_field(map, 4480 idx = tracing_map_add_key_field(map,
801 hist_field->offset, 4481 hist_field->offset,
802 cmp_fn); 4482 cmp_fn);
803 4483 } else if (!(hist_field->flags & HIST_FIELD_FL_VAR))
804 } else
805 idx = tracing_map_add_sum_field(map); 4484 idx = tracing_map_add_sum_field(map);
806 4485
807 if (idx < 0) 4486 if (idx < 0)
808 return idx; 4487 return idx;
809 }
810
811 return 0;
812}
813
814static bool need_tracing_map_ops(struct hist_trigger_data *hist_data)
815{
816 struct hist_field *key_field;
817 unsigned int i;
818
819 for_each_hist_key_field(i, hist_data) {
820 key_field = hist_data->fields[i];
821 4488
822 if (key_field->flags & HIST_FIELD_FL_EXECNAME) 4489 if (hist_field->flags & HIST_FIELD_FL_VAR) {
823 return true; 4490 idx = tracing_map_add_var(map);
4491 if (idx < 0)
4492 return idx;
4493 hist_field->var.idx = idx;
4494 hist_field->var.hist_data = hist_data;
4495 }
824 } 4496 }
825 4497
826 return false; 4498 return 0;
827} 4499}
828 4500
829static struct hist_trigger_data * 4501static struct hist_trigger_data *
830create_hist_data(unsigned int map_bits, 4502create_hist_data(unsigned int map_bits,
831 struct hist_trigger_attrs *attrs, 4503 struct hist_trigger_attrs *attrs,
832 struct trace_event_file *file) 4504 struct trace_event_file *file,
4505 bool remove)
833{ 4506{
834 const struct tracing_map_ops *map_ops = NULL; 4507 const struct tracing_map_ops *map_ops = NULL;
835 struct hist_trigger_data *hist_data; 4508 struct hist_trigger_data *hist_data;
@@ -840,6 +4513,12 @@ create_hist_data(unsigned int map_bits,
840 return ERR_PTR(-ENOMEM); 4513 return ERR_PTR(-ENOMEM);
841 4514
842 hist_data->attrs = attrs; 4515 hist_data->attrs = attrs;
4516 hist_data->remove = remove;
4517 hist_data->event_file = file;
4518
4519 ret = parse_actions(hist_data);
4520 if (ret)
4521 goto free;
843 4522
844 ret = create_hist_fields(hist_data, file); 4523 ret = create_hist_fields(hist_data, file);
845 if (ret) 4524 if (ret)
@@ -849,8 +4528,7 @@ create_hist_data(unsigned int map_bits,
849 if (ret) 4528 if (ret)
850 goto free; 4529 goto free;
851 4530
852 if (need_tracing_map_ops(hist_data)) 4531 map_ops = &hist_trigger_elt_data_ops;
853 map_ops = &hist_trigger_elt_comm_ops;
854 4532
855 hist_data->map = tracing_map_create(map_bits, hist_data->key_size, 4533 hist_data->map = tracing_map_create(map_bits, hist_data->key_size,
856 map_ops, hist_data); 4534 map_ops, hist_data);
@@ -863,12 +4541,6 @@ create_hist_data(unsigned int map_bits,
863 ret = create_tracing_map_fields(hist_data); 4541 ret = create_tracing_map_fields(hist_data);
864 if (ret) 4542 if (ret)
865 goto free; 4543 goto free;
866
867 ret = tracing_map_init(hist_data->map);
868 if (ret)
869 goto free;
870
871 hist_data->event_file = file;
872 out: 4544 out:
873 return hist_data; 4545 return hist_data;
874 free: 4546 free:
@@ -882,18 +4554,39 @@ create_hist_data(unsigned int map_bits,
882} 4554}
883 4555
884static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, 4556static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
885 struct tracing_map_elt *elt, 4557 struct tracing_map_elt *elt, void *rec,
886 void *rec) 4558 struct ring_buffer_event *rbe,
4559 u64 *var_ref_vals)
887{ 4560{
4561 struct hist_elt_data *elt_data;
888 struct hist_field *hist_field; 4562 struct hist_field *hist_field;
889 unsigned int i; 4563 unsigned int i, var_idx;
890 u64 hist_val; 4564 u64 hist_val;
891 4565
4566 elt_data = elt->private_data;
4567 elt_data->var_ref_vals = var_ref_vals;
4568
892 for_each_hist_val_field(i, hist_data) { 4569 for_each_hist_val_field(i, hist_data) {
893 hist_field = hist_data->fields[i]; 4570 hist_field = hist_data->fields[i];
894 hist_val = hist_field->fn(hist_field, rec); 4571 hist_val = hist_field->fn(hist_field, elt, rbe, rec);
4572 if (hist_field->flags & HIST_FIELD_FL_VAR) {
4573 var_idx = hist_field->var.idx;
4574 tracing_map_set_var(elt, var_idx, hist_val);
4575 continue;
4576 }
895 tracing_map_update_sum(elt, i, hist_val); 4577 tracing_map_update_sum(elt, i, hist_val);
896 } 4578 }
4579
4580 for_each_hist_key_field(i, hist_data) {
4581 hist_field = hist_data->fields[i];
4582 if (hist_field->flags & HIST_FIELD_FL_VAR) {
4583 hist_val = hist_field->fn(hist_field, elt, rbe, rec);
4584 var_idx = hist_field->var.idx;
4585 tracing_map_set_var(elt, var_idx, hist_val);
4586 }
4587 }
4588
4589 update_field_vars(hist_data, elt, rbe, rec);
897} 4590}
898 4591
899static inline void add_to_key(char *compound_key, void *key, 4592static inline void add_to_key(char *compound_key, void *key,
@@ -920,15 +4613,31 @@ static inline void add_to_key(char *compound_key, void *key,
920 memcpy(compound_key + key_field->offset, key, size); 4613 memcpy(compound_key + key_field->offset, key, size);
921} 4614}
922 4615
923static void event_hist_trigger(struct event_trigger_data *data, void *rec) 4616static void
4617hist_trigger_actions(struct hist_trigger_data *hist_data,
4618 struct tracing_map_elt *elt, void *rec,
4619 struct ring_buffer_event *rbe, u64 *var_ref_vals)
4620{
4621 struct action_data *data;
4622 unsigned int i;
4623
4624 for (i = 0; i < hist_data->n_actions; i++) {
4625 data = hist_data->actions[i];
4626 data->fn(hist_data, elt, rec, rbe, data, var_ref_vals);
4627 }
4628}
4629
4630static void event_hist_trigger(struct event_trigger_data *data, void *rec,
4631 struct ring_buffer_event *rbe)
924{ 4632{
925 struct hist_trigger_data *hist_data = data->private_data; 4633 struct hist_trigger_data *hist_data = data->private_data;
926 bool use_compound_key = (hist_data->n_keys > 1); 4634 bool use_compound_key = (hist_data->n_keys > 1);
927 unsigned long entries[HIST_STACKTRACE_DEPTH]; 4635 unsigned long entries[HIST_STACKTRACE_DEPTH];
4636 u64 var_ref_vals[TRACING_MAP_VARS_MAX];
928 char compound_key[HIST_KEY_SIZE_MAX]; 4637 char compound_key[HIST_KEY_SIZE_MAX];
4638 struct tracing_map_elt *elt = NULL;
929 struct stack_trace stacktrace; 4639 struct stack_trace stacktrace;
930 struct hist_field *key_field; 4640 struct hist_field *key_field;
931 struct tracing_map_elt *elt;
932 u64 field_contents; 4641 u64 field_contents;
933 void *key = NULL; 4642 void *key = NULL;
934 unsigned int i; 4643 unsigned int i;
@@ -949,7 +4658,7 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec)
949 4658
950 key = entries; 4659 key = entries;
951 } else { 4660 } else {
952 field_contents = key_field->fn(key_field, rec); 4661 field_contents = key_field->fn(key_field, elt, rbe, rec);
953 if (key_field->flags & HIST_FIELD_FL_STRING) { 4662 if (key_field->flags & HIST_FIELD_FL_STRING) {
954 key = (void *)(unsigned long)field_contents; 4663 key = (void *)(unsigned long)field_contents;
955 use_compound_key = true; 4664 use_compound_key = true;
@@ -964,9 +4673,18 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec)
964 if (use_compound_key) 4673 if (use_compound_key)
965 key = compound_key; 4674 key = compound_key;
966 4675
4676 if (hist_data->n_var_refs &&
4677 !resolve_var_refs(hist_data, key, var_ref_vals, false))
4678 return;
4679
967 elt = tracing_map_insert(hist_data->map, key); 4680 elt = tracing_map_insert(hist_data->map, key);
968 if (elt) 4681 if (!elt)
969 hist_trigger_elt_update(hist_data, elt, rec); 4682 return;
4683
4684 hist_trigger_elt_update(hist_data, elt, rec, rbe, var_ref_vals);
4685
4686 if (resolve_var_refs(hist_data, key, var_ref_vals, true))
4687 hist_trigger_actions(hist_data, elt, rec, rbe, var_ref_vals);
970} 4688}
971 4689
972static void hist_trigger_stacktrace_print(struct seq_file *m, 4690static void hist_trigger_stacktrace_print(struct seq_file *m,
@@ -1023,7 +4741,13 @@ hist_trigger_entry_print(struct seq_file *m,
1023 seq_printf(m, "%s: [%llx] %-55s", field_name, 4741 seq_printf(m, "%s: [%llx] %-55s", field_name,
1024 uval, str); 4742 uval, str);
1025 } else if (key_field->flags & HIST_FIELD_FL_EXECNAME) { 4743 } else if (key_field->flags & HIST_FIELD_FL_EXECNAME) {
1026 char *comm = elt->private_data; 4744 struct hist_elt_data *elt_data = elt->private_data;
4745 char *comm;
4746
4747 if (WARN_ON_ONCE(!elt_data))
4748 return;
4749
4750 comm = elt_data->comm;
1027 4751
1028 uval = *(u64 *)(key + key_field->offset); 4752 uval = *(u64 *)(key + key_field->offset);
1029 seq_printf(m, "%s: %-16s[%10llu]", field_name, 4753 seq_printf(m, "%s: %-16s[%10llu]", field_name,
@@ -1067,6 +4791,10 @@ hist_trigger_entry_print(struct seq_file *m,
1067 for (i = 1; i < hist_data->n_vals; i++) { 4791 for (i = 1; i < hist_data->n_vals; i++) {
1068 field_name = hist_field_name(hist_data->fields[i], 0); 4792 field_name = hist_field_name(hist_data->fields[i], 0);
1069 4793
4794 if (hist_data->fields[i]->flags & HIST_FIELD_FL_VAR ||
4795 hist_data->fields[i]->flags & HIST_FIELD_FL_EXPR)
4796 continue;
4797
1070 if (hist_data->fields[i]->flags & HIST_FIELD_FL_HEX) { 4798 if (hist_data->fields[i]->flags & HIST_FIELD_FL_HEX) {
1071 seq_printf(m, " %s: %10llx", field_name, 4799 seq_printf(m, " %s: %10llx", field_name,
1072 tracing_map_read_sum(elt, i)); 4800 tracing_map_read_sum(elt, i));
@@ -1076,6 +4804,8 @@ hist_trigger_entry_print(struct seq_file *m,
1076 } 4804 }
1077 } 4805 }
1078 4806
4807 print_actions(m, hist_data, elt);
4808
1079 seq_puts(m, "\n"); 4809 seq_puts(m, "\n");
1080} 4810}
1081 4811
@@ -1144,6 +4874,11 @@ static int hist_show(struct seq_file *m, void *v)
1144 hist_trigger_show(m, data, n++); 4874 hist_trigger_show(m, data, n++);
1145 } 4875 }
1146 4876
4877 if (have_hist_err()) {
4878 seq_printf(m, "\nERROR: %s\n", hist_err_str);
4879 seq_printf(m, " Last command: %s\n", last_hist_cmd);
4880 }
4881
1147 out_unlock: 4882 out_unlock:
1148 mutex_unlock(&event_mutex); 4883 mutex_unlock(&event_mutex);
1149 4884
@@ -1162,37 +4897,22 @@ const struct file_operations event_hist_fops = {
1162 .release = single_release, 4897 .release = single_release,
1163}; 4898};
1164 4899
1165static const char *get_hist_field_flags(struct hist_field *hist_field)
1166{
1167 const char *flags_str = NULL;
1168
1169 if (hist_field->flags & HIST_FIELD_FL_HEX)
1170 flags_str = "hex";
1171 else if (hist_field->flags & HIST_FIELD_FL_SYM)
1172 flags_str = "sym";
1173 else if (hist_field->flags & HIST_FIELD_FL_SYM_OFFSET)
1174 flags_str = "sym-offset";
1175 else if (hist_field->flags & HIST_FIELD_FL_EXECNAME)
1176 flags_str = "execname";
1177 else if (hist_field->flags & HIST_FIELD_FL_SYSCALL)
1178 flags_str = "syscall";
1179 else if (hist_field->flags & HIST_FIELD_FL_LOG2)
1180 flags_str = "log2";
1181
1182 return flags_str;
1183}
1184
1185static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) 4900static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
1186{ 4901{
1187 const char *field_name = hist_field_name(hist_field, 0); 4902 const char *field_name = hist_field_name(hist_field, 0);
1188 4903
1189 seq_printf(m, "%s", field_name); 4904 if (hist_field->var.name)
1190 if (hist_field->flags) { 4905 seq_printf(m, "%s=", hist_field->var.name);
1191 const char *flags_str = get_hist_field_flags(hist_field); 4906
1192 4907 if (hist_field->flags & HIST_FIELD_FL_CPU)
1193 if (flags_str) 4908 seq_puts(m, "cpu");
1194 seq_printf(m, ".%s", flags_str); 4909 else if (field_name) {
1195 } 4910 if (hist_field->flags & HIST_FIELD_FL_VAR_REF ||
4911 hist_field->flags & HIST_FIELD_FL_ALIAS)
4912 seq_putc(m, '$');
4913 seq_printf(m, "%s", field_name);
4914 } else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP)
4915 seq_puts(m, "common_timestamp");
1196} 4916}
1197 4917
1198static int event_hist_trigger_print(struct seq_file *m, 4918static int event_hist_trigger_print(struct seq_file *m,
@@ -1200,7 +4920,8 @@ static int event_hist_trigger_print(struct seq_file *m,
1200 struct event_trigger_data *data) 4920 struct event_trigger_data *data)
1201{ 4921{
1202 struct hist_trigger_data *hist_data = data->private_data; 4922 struct hist_trigger_data *hist_data = data->private_data;
1203 struct hist_field *key_field; 4923 struct hist_field *field;
4924 bool have_var = false;
1204 unsigned int i; 4925 unsigned int i;
1205 4926
1206 seq_puts(m, "hist:"); 4927 seq_puts(m, "hist:");
@@ -1211,25 +4932,47 @@ static int event_hist_trigger_print(struct seq_file *m,
1211 seq_puts(m, "keys="); 4932 seq_puts(m, "keys=");
1212 4933
1213 for_each_hist_key_field(i, hist_data) { 4934 for_each_hist_key_field(i, hist_data) {
1214 key_field = hist_data->fields[i]; 4935 field = hist_data->fields[i];
1215 4936
1216 if (i > hist_data->n_vals) 4937 if (i > hist_data->n_vals)
1217 seq_puts(m, ","); 4938 seq_puts(m, ",");
1218 4939
1219 if (key_field->flags & HIST_FIELD_FL_STACKTRACE) 4940 if (field->flags & HIST_FIELD_FL_STACKTRACE)
1220 seq_puts(m, "stacktrace"); 4941 seq_puts(m, "stacktrace");
1221 else 4942 else
1222 hist_field_print(m, key_field); 4943 hist_field_print(m, field);
1223 } 4944 }
1224 4945
1225 seq_puts(m, ":vals="); 4946 seq_puts(m, ":vals=");
1226 4947
1227 for_each_hist_val_field(i, hist_data) { 4948 for_each_hist_val_field(i, hist_data) {
4949 field = hist_data->fields[i];
4950 if (field->flags & HIST_FIELD_FL_VAR) {
4951 have_var = true;
4952 continue;
4953 }
4954
1228 if (i == HITCOUNT_IDX) 4955 if (i == HITCOUNT_IDX)
1229 seq_puts(m, "hitcount"); 4956 seq_puts(m, "hitcount");
1230 else { 4957 else {
1231 seq_puts(m, ","); 4958 seq_puts(m, ",");
1232 hist_field_print(m, hist_data->fields[i]); 4959 hist_field_print(m, field);
4960 }
4961 }
4962
4963 if (have_var) {
4964 unsigned int n = 0;
4965
4966 seq_puts(m, ":");
4967
4968 for_each_hist_val_field(i, hist_data) {
4969 field = hist_data->fields[i];
4970
4971 if (field->flags & HIST_FIELD_FL_VAR) {
4972 if (n++)
4973 seq_puts(m, ",");
4974 hist_field_print(m, field);
4975 }
1233 } 4976 }
1234 } 4977 }
1235 4978
@@ -1237,28 +4980,36 @@ static int event_hist_trigger_print(struct seq_file *m,
1237 4980
1238 for (i = 0; i < hist_data->n_sort_keys; i++) { 4981 for (i = 0; i < hist_data->n_sort_keys; i++) {
1239 struct tracing_map_sort_key *sort_key; 4982 struct tracing_map_sort_key *sort_key;
4983 unsigned int idx, first_key_idx;
4984
4985 /* skip VAR vals */
4986 first_key_idx = hist_data->n_vals - hist_data->n_vars;
1240 4987
1241 sort_key = &hist_data->sort_keys[i]; 4988 sort_key = &hist_data->sort_keys[i];
4989 idx = sort_key->field_idx;
4990
4991 if (WARN_ON(idx >= HIST_FIELDS_MAX))
4992 return -EINVAL;
1242 4993
1243 if (i > 0) 4994 if (i > 0)
1244 seq_puts(m, ","); 4995 seq_puts(m, ",");
1245 4996
1246 if (sort_key->field_idx == HITCOUNT_IDX) 4997 if (idx == HITCOUNT_IDX)
1247 seq_puts(m, "hitcount"); 4998 seq_puts(m, "hitcount");
1248 else { 4999 else {
1249 unsigned int idx = sort_key->field_idx; 5000 if (idx >= first_key_idx)
1250 5001 idx += hist_data->n_vars;
1251 if (WARN_ON(idx >= TRACING_MAP_FIELDS_MAX))
1252 return -EINVAL;
1253
1254 hist_field_print(m, hist_data->fields[idx]); 5002 hist_field_print(m, hist_data->fields[idx]);
1255 } 5003 }
1256 5004
1257 if (sort_key->descending) 5005 if (sort_key->descending)
1258 seq_puts(m, ".descending"); 5006 seq_puts(m, ".descending");
1259 } 5007 }
1260
1261 seq_printf(m, ":size=%u", (1 << hist_data->map->map_bits)); 5008 seq_printf(m, ":size=%u", (1 << hist_data->map->map_bits));
5009 if (hist_data->enable_timestamps)
5010 seq_printf(m, ":clock=%s", hist_data->attrs->clock);
5011
5012 print_actions_spec(m, hist_data);
1262 5013
1263 if (data->filter_str) 5014 if (data->filter_str)
1264 seq_printf(m, " if %s", data->filter_str); 5015 seq_printf(m, " if %s", data->filter_str);
@@ -1286,6 +5037,21 @@ static int event_hist_trigger_init(struct event_trigger_ops *ops,
1286 return 0; 5037 return 0;
1287} 5038}
1288 5039
5040static void unregister_field_var_hists(struct hist_trigger_data *hist_data)
5041{
5042 struct trace_event_file *file;
5043 unsigned int i;
5044 char *cmd;
5045 int ret;
5046
5047 for (i = 0; i < hist_data->n_field_var_hists; i++) {
5048 file = hist_data->field_var_hists[i]->hist_data->event_file;
5049 cmd = hist_data->field_var_hists[i]->cmd;
5050 ret = event_hist_trigger_func(&trigger_hist_cmd, file,
5051 "!hist", "hist", cmd);
5052 }
5053}
5054
1289static void event_hist_trigger_free(struct event_trigger_ops *ops, 5055static void event_hist_trigger_free(struct event_trigger_ops *ops,
1290 struct event_trigger_data *data) 5056 struct event_trigger_data *data)
1291{ 5057{
@@ -1298,7 +5064,13 @@ static void event_hist_trigger_free(struct event_trigger_ops *ops,
1298 if (!data->ref) { 5064 if (!data->ref) {
1299 if (data->name) 5065 if (data->name)
1300 del_named_trigger(data); 5066 del_named_trigger(data);
5067
1301 trigger_data_free(data); 5068 trigger_data_free(data);
5069
5070 remove_hist_vars(hist_data);
5071
5072 unregister_field_var_hists(hist_data);
5073
1302 destroy_hist_data(hist_data); 5074 destroy_hist_data(hist_data);
1303 } 5075 }
1304} 5076}
@@ -1425,6 +5197,15 @@ static bool hist_trigger_match(struct event_trigger_data *data,
1425 return false; 5197 return false;
1426 if (key_field->offset != key_field_test->offset) 5198 if (key_field->offset != key_field_test->offset)
1427 return false; 5199 return false;
5200 if (key_field->size != key_field_test->size)
5201 return false;
5202 if (key_field->is_signed != key_field_test->is_signed)
5203 return false;
5204 if (!!key_field->var.name != !!key_field_test->var.name)
5205 return false;
5206 if (key_field->var.name &&
5207 strcmp(key_field->var.name, key_field_test->var.name) != 0)
5208 return false;
1428 } 5209 }
1429 5210
1430 for (i = 0; i < hist_data->n_sort_keys; i++) { 5211 for (i = 0; i < hist_data->n_sort_keys; i++) {
@@ -1440,6 +5221,9 @@ static bool hist_trigger_match(struct event_trigger_data *data,
1440 (strcmp(data->filter_str, data_test->filter_str) != 0)) 5221 (strcmp(data->filter_str, data_test->filter_str) != 0))
1441 return false; 5222 return false;
1442 5223
5224 if (!actions_match(hist_data, hist_data_test))
5225 return false;
5226
1443 return true; 5227 return true;
1444} 5228}
1445 5229
@@ -1456,6 +5240,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
1456 if (named_data) { 5240 if (named_data) {
1457 if (!hist_trigger_match(data, named_data, named_data, 5241 if (!hist_trigger_match(data, named_data, named_data,
1458 true)) { 5242 true)) {
5243 hist_err("Named hist trigger doesn't match existing named trigger (includes variables): ", hist_data->attrs->name);
1459 ret = -EINVAL; 5244 ret = -EINVAL;
1460 goto out; 5245 goto out;
1461 } 5246 }
@@ -1475,13 +5260,16 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
1475 test->paused = false; 5260 test->paused = false;
1476 else if (hist_data->attrs->clear) 5261 else if (hist_data->attrs->clear)
1477 hist_clear(test); 5262 hist_clear(test);
1478 else 5263 else {
5264 hist_err("Hist trigger already exists", NULL);
1479 ret = -EEXIST; 5265 ret = -EEXIST;
5266 }
1480 goto out; 5267 goto out;
1481 } 5268 }
1482 } 5269 }
1483 new: 5270 new:
1484 if (hist_data->attrs->cont || hist_data->attrs->clear) { 5271 if (hist_data->attrs->cont || hist_data->attrs->clear) {
5272 hist_err("Can't clear or continue a nonexistent hist trigger", NULL);
1485 ret = -ENOENT; 5273 ret = -ENOENT;
1486 goto out; 5274 goto out;
1487 } 5275 }
@@ -1490,7 +5278,6 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
1490 data->paused = true; 5278 data->paused = true;
1491 5279
1492 if (named_data) { 5280 if (named_data) {
1493 destroy_hist_data(data->private_data);
1494 data->private_data = named_data->private_data; 5281 data->private_data = named_data->private_data;
1495 set_named_trigger_data(data, named_data); 5282 set_named_trigger_data(data, named_data);
1496 data->ops = &event_hist_trigger_named_ops; 5283 data->ops = &event_hist_trigger_named_ops;
@@ -1502,8 +5289,32 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
1502 goto out; 5289 goto out;
1503 } 5290 }
1504 5291
1505 list_add_rcu(&data->list, &file->triggers); 5292 if (hist_data->enable_timestamps) {
5293 char *clock = hist_data->attrs->clock;
5294
5295 ret = tracing_set_clock(file->tr, hist_data->attrs->clock);
5296 if (ret) {
5297 hist_err("Couldn't set trace_clock: ", clock);
5298 goto out;
5299 }
5300
5301 tracing_set_time_stamp_abs(file->tr, true);
5302 }
5303
5304 if (named_data)
5305 destroy_hist_data(hist_data);
5306
1506 ret++; 5307 ret++;
5308 out:
5309 return ret;
5310}
5311
5312static int hist_trigger_enable(struct event_trigger_data *data,
5313 struct trace_event_file *file)
5314{
5315 int ret = 0;
5316
5317 list_add_tail_rcu(&data->list, &file->triggers);
1507 5318
1508 update_cond_flag(file); 5319 update_cond_flag(file);
1509 5320
@@ -1512,10 +5323,55 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops,
1512 update_cond_flag(file); 5323 update_cond_flag(file);
1513 ret--; 5324 ret--;
1514 } 5325 }
1515 out: 5326
1516 return ret; 5327 return ret;
1517} 5328}
1518 5329
5330static bool have_hist_trigger_match(struct event_trigger_data *data,
5331 struct trace_event_file *file)
5332{
5333 struct hist_trigger_data *hist_data = data->private_data;
5334 struct event_trigger_data *test, *named_data = NULL;
5335 bool match = false;
5336
5337 if (hist_data->attrs->name)
5338 named_data = find_named_trigger(hist_data->attrs->name);
5339
5340 list_for_each_entry_rcu(test, &file->triggers, list) {
5341 if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
5342 if (hist_trigger_match(data, test, named_data, false)) {
5343 match = true;
5344 break;
5345 }
5346 }
5347 }
5348
5349 return match;
5350}
5351
5352static bool hist_trigger_check_refs(struct event_trigger_data *data,
5353 struct trace_event_file *file)
5354{
5355 struct hist_trigger_data *hist_data = data->private_data;
5356 struct event_trigger_data *test, *named_data = NULL;
5357
5358 if (hist_data->attrs->name)
5359 named_data = find_named_trigger(hist_data->attrs->name);
5360
5361 list_for_each_entry_rcu(test, &file->triggers, list) {
5362 if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
5363 if (!hist_trigger_match(data, test, named_data, false))
5364 continue;
5365 hist_data = test->private_data;
5366 if (check_var_refs(hist_data))
5367 return true;
5368 break;
5369 }
5370 }
5371
5372 return false;
5373}
5374
1519static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops, 5375static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops,
1520 struct event_trigger_data *data, 5376 struct event_trigger_data *data,
1521 struct trace_event_file *file) 5377 struct trace_event_file *file)
@@ -1541,17 +5397,55 @@ static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops,
1541 5397
1542 if (unregistered && test->ops->free) 5398 if (unregistered && test->ops->free)
1543 test->ops->free(test->ops, test); 5399 test->ops->free(test->ops, test);
5400
5401 if (hist_data->enable_timestamps) {
5402 if (!hist_data->remove || unregistered)
5403 tracing_set_time_stamp_abs(file->tr, false);
5404 }
5405}
5406
5407static bool hist_file_check_refs(struct trace_event_file *file)
5408{
5409 struct hist_trigger_data *hist_data;
5410 struct event_trigger_data *test;
5411
5412 list_for_each_entry_rcu(test, &file->triggers, list) {
5413 if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
5414 hist_data = test->private_data;
5415 if (check_var_refs(hist_data))
5416 return true;
5417 }
5418 }
5419
5420 return false;
1544} 5421}
1545 5422
1546static void hist_unreg_all(struct trace_event_file *file) 5423static void hist_unreg_all(struct trace_event_file *file)
1547{ 5424{
1548 struct event_trigger_data *test, *n; 5425 struct event_trigger_data *test, *n;
5426 struct hist_trigger_data *hist_data;
5427 struct synth_event *se;
5428 const char *se_name;
5429
5430 if (hist_file_check_refs(file))
5431 return;
1549 5432
1550 list_for_each_entry_safe(test, n, &file->triggers, list) { 5433 list_for_each_entry_safe(test, n, &file->triggers, list) {
1551 if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { 5434 if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
5435 hist_data = test->private_data;
1552 list_del_rcu(&test->list); 5436 list_del_rcu(&test->list);
1553 trace_event_trigger_enable_disable(file, 0); 5437 trace_event_trigger_enable_disable(file, 0);
5438
5439 mutex_lock(&synth_event_mutex);
5440 se_name = trace_event_name(file->event_call);
5441 se = find_synth_event(se_name);
5442 if (se)
5443 se->ref--;
5444 mutex_unlock(&synth_event_mutex);
5445
1554 update_cond_flag(file); 5446 update_cond_flag(file);
5447 if (hist_data->enable_timestamps)
5448 tracing_set_time_stamp_abs(file->tr, false);
1555 if (test->ops->free) 5449 if (test->ops->free)
1556 test->ops->free(test->ops, test); 5450 test->ops->free(test->ops, test);
1557 } 5451 }
@@ -1567,16 +5461,54 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
1567 struct hist_trigger_attrs *attrs; 5461 struct hist_trigger_attrs *attrs;
1568 struct event_trigger_ops *trigger_ops; 5462 struct event_trigger_ops *trigger_ops;
1569 struct hist_trigger_data *hist_data; 5463 struct hist_trigger_data *hist_data;
1570 char *trigger; 5464 struct synth_event *se;
5465 const char *se_name;
5466 bool remove = false;
5467 char *trigger, *p;
1571 int ret = 0; 5468 int ret = 0;
1572 5469
5470 if (glob && strlen(glob)) {
5471 last_cmd_set(param);
5472 hist_err_clear();
5473 }
5474
1573 if (!param) 5475 if (!param)
1574 return -EINVAL; 5476 return -EINVAL;
1575 5477
1576 /* separate the trigger from the filter (k:v [if filter]) */ 5478 if (glob[0] == '!')
1577 trigger = strsep(&param, " \t"); 5479 remove = true;
1578 if (!trigger) 5480
1579 return -EINVAL; 5481 /*
5482 * separate the trigger from the filter (k:v [if filter])
5483 * allowing for whitespace in the trigger
5484 */
5485 p = trigger = param;
5486 do {
5487 p = strstr(p, "if");
5488 if (!p)
5489 break;
5490 if (p == param)
5491 return -EINVAL;
5492 if (*(p - 1) != ' ' && *(p - 1) != '\t') {
5493 p++;
5494 continue;
5495 }
5496 if (p >= param + strlen(param) - strlen("if") - 1)
5497 return -EINVAL;
5498 if (*(p + strlen("if")) != ' ' && *(p + strlen("if")) != '\t') {
5499 p++;
5500 continue;
5501 }
5502 break;
5503 } while (p);
5504
5505 if (!p)
5506 param = NULL;
5507 else {
5508 *(p - 1) = '\0';
5509 param = strstrip(p);
5510 trigger = strstrip(trigger);
5511 }
1580 5512
1581 attrs = parse_hist_trigger_attrs(trigger); 5513 attrs = parse_hist_trigger_attrs(trigger);
1582 if (IS_ERR(attrs)) 5514 if (IS_ERR(attrs))
@@ -1585,7 +5517,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
1585 if (attrs->map_bits) 5517 if (attrs->map_bits)
1586 hist_trigger_bits = attrs->map_bits; 5518 hist_trigger_bits = attrs->map_bits;
1587 5519
1588 hist_data = create_hist_data(hist_trigger_bits, attrs, file); 5520 hist_data = create_hist_data(hist_trigger_bits, attrs, file, remove);
1589 if (IS_ERR(hist_data)) { 5521 if (IS_ERR(hist_data)) {
1590 destroy_hist_trigger_attrs(attrs); 5522 destroy_hist_trigger_attrs(attrs);
1591 return PTR_ERR(hist_data); 5523 return PTR_ERR(hist_data);
@@ -1593,10 +5525,11 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
1593 5525
1594 trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger); 5526 trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger);
1595 5527
1596 ret = -ENOMEM;
1597 trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL); 5528 trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL);
1598 if (!trigger_data) 5529 if (!trigger_data) {
5530 ret = -ENOMEM;
1599 goto out_free; 5531 goto out_free;
5532 }
1600 5533
1601 trigger_data->count = -1; 5534 trigger_data->count = -1;
1602 trigger_data->ops = trigger_ops; 5535 trigger_data->ops = trigger_ops;
@@ -1614,8 +5547,24 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
1614 goto out_free; 5547 goto out_free;
1615 } 5548 }
1616 5549
1617 if (glob[0] == '!') { 5550 if (remove) {
5551 if (!have_hist_trigger_match(trigger_data, file))
5552 goto out_free;
5553
5554 if (hist_trigger_check_refs(trigger_data, file)) {
5555 ret = -EBUSY;
5556 goto out_free;
5557 }
5558
1618 cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); 5559 cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
5560
5561 mutex_lock(&synth_event_mutex);
5562 se_name = trace_event_name(file->event_call);
5563 se = find_synth_event(se_name);
5564 if (se)
5565 se->ref--;
5566 mutex_unlock(&synth_event_mutex);
5567
1619 ret = 0; 5568 ret = 0;
1620 goto out_free; 5569 goto out_free;
1621 } 5570 }
@@ -1632,14 +5581,47 @@ static int event_hist_trigger_func(struct event_command *cmd_ops,
1632 goto out_free; 5581 goto out_free;
1633 } else if (ret < 0) 5582 } else if (ret < 0)
1634 goto out_free; 5583 goto out_free;
5584
5585 if (get_named_trigger_data(trigger_data))
5586 goto enable;
5587
5588 if (has_hist_vars(hist_data))
5589 save_hist_vars(hist_data);
5590
5591 ret = create_actions(hist_data, file);
5592 if (ret)
5593 goto out_unreg;
5594
5595 ret = tracing_map_init(hist_data->map);
5596 if (ret)
5597 goto out_unreg;
5598enable:
5599 ret = hist_trigger_enable(trigger_data, file);
5600 if (ret)
5601 goto out_unreg;
5602
5603 mutex_lock(&synth_event_mutex);
5604 se_name = trace_event_name(file->event_call);
5605 se = find_synth_event(se_name);
5606 if (se)
5607 se->ref++;
5608 mutex_unlock(&synth_event_mutex);
5609
1635 /* Just return zero, not the number of registered triggers */ 5610 /* Just return zero, not the number of registered triggers */
1636 ret = 0; 5611 ret = 0;
1637 out: 5612 out:
5613 if (ret == 0)
5614 hist_err_clear();
5615
1638 return ret; 5616 return ret;
5617 out_unreg:
5618 cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file);
1639 out_free: 5619 out_free:
1640 if (cmd_ops->set_filter) 5620 if (cmd_ops->set_filter)
1641 cmd_ops->set_filter(NULL, trigger_data, NULL); 5621 cmd_ops->set_filter(NULL, trigger_data, NULL);
1642 5622
5623 remove_hist_vars(hist_data);
5624
1643 kfree(trigger_data); 5625 kfree(trigger_data);
1644 5626
1645 destroy_hist_data(hist_data); 5627 destroy_hist_data(hist_data);
@@ -1669,7 +5651,8 @@ __init int register_trigger_hist_cmd(void)
1669} 5651}
1670 5652
1671static void 5653static void
1672hist_enable_trigger(struct event_trigger_data *data, void *rec) 5654hist_enable_trigger(struct event_trigger_data *data, void *rec,
5655 struct ring_buffer_event *event)
1673{ 5656{
1674 struct enable_trigger_data *enable_data = data->private_data; 5657 struct enable_trigger_data *enable_data = data->private_data;
1675 struct event_trigger_data *test; 5658 struct event_trigger_data *test;
@@ -1685,7 +5668,8 @@ hist_enable_trigger(struct event_trigger_data *data, void *rec)
1685} 5668}
1686 5669
1687static void 5670static void
1688hist_enable_count_trigger(struct event_trigger_data *data, void *rec) 5671hist_enable_count_trigger(struct event_trigger_data *data, void *rec,
5672 struct ring_buffer_event *event)
1689{ 5673{
1690 if (!data->count) 5674 if (!data->count)
1691 return; 5675 return;
@@ -1693,7 +5677,7 @@ hist_enable_count_trigger(struct event_trigger_data *data, void *rec)
1693 if (data->count != -1) 5677 if (data->count != -1)
1694 (data->count)--; 5678 (data->count)--;
1695 5679
1696 hist_enable_trigger(data, rec); 5680 hist_enable_trigger(data, rec, event);
1697} 5681}
1698 5682
1699static struct event_trigger_ops hist_enable_trigger_ops = { 5683static struct event_trigger_ops hist_enable_trigger_ops = {
@@ -1798,3 +5782,31 @@ __init int register_trigger_hist_enable_disable_cmds(void)
1798 5782
1799 return ret; 5783 return ret;
1800} 5784}
5785
5786static __init int trace_events_hist_init(void)
5787{
5788 struct dentry *entry = NULL;
5789 struct dentry *d_tracer;
5790 int err = 0;
5791
5792 d_tracer = tracing_init_dentry();
5793 if (IS_ERR(d_tracer)) {
5794 err = PTR_ERR(d_tracer);
5795 goto err;
5796 }
5797
5798 entry = tracefs_create_file("synthetic_events", 0644, d_tracer,
5799 NULL, &synth_events_fops);
5800 if (!entry) {
5801 err = -ENODEV;
5802 goto err;
5803 }
5804
5805 return err;
5806 err:
5807 pr_warn("Could not create tracefs 'synthetic_events' entry\n");
5808
5809 return err;
5810}
5811
5812fs_initcall(trace_events_hist_init);
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 87411482a46f..d251cabcf69a 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -63,7 +63,8 @@ void trigger_data_free(struct event_trigger_data *data)
63 * any trigger that should be deferred, ETT_NONE if nothing to defer. 63 * any trigger that should be deferred, ETT_NONE if nothing to defer.
64 */ 64 */
65enum event_trigger_type 65enum event_trigger_type
66event_triggers_call(struct trace_event_file *file, void *rec) 66event_triggers_call(struct trace_event_file *file, void *rec,
67 struct ring_buffer_event *event)
67{ 68{
68 struct event_trigger_data *data; 69 struct event_trigger_data *data;
69 enum event_trigger_type tt = ETT_NONE; 70 enum event_trigger_type tt = ETT_NONE;
@@ -76,7 +77,7 @@ event_triggers_call(struct trace_event_file *file, void *rec)
76 if (data->paused) 77 if (data->paused)
77 continue; 78 continue;
78 if (!rec) { 79 if (!rec) {
79 data->ops->func(data, rec); 80 data->ops->func(data, rec, event);
80 continue; 81 continue;
81 } 82 }
82 filter = rcu_dereference_sched(data->filter); 83 filter = rcu_dereference_sched(data->filter);
@@ -86,7 +87,7 @@ event_triggers_call(struct trace_event_file *file, void *rec)
86 tt |= data->cmd_ops->trigger_type; 87 tt |= data->cmd_ops->trigger_type;
87 continue; 88 continue;
88 } 89 }
89 data->ops->func(data, rec); 90 data->ops->func(data, rec, event);
90 } 91 }
91 return tt; 92 return tt;
92} 93}
@@ -108,7 +109,7 @@ EXPORT_SYMBOL_GPL(event_triggers_call);
108void 109void
109event_triggers_post_call(struct trace_event_file *file, 110event_triggers_post_call(struct trace_event_file *file,
110 enum event_trigger_type tt, 111 enum event_trigger_type tt,
111 void *rec) 112 void *rec, struct ring_buffer_event *event)
112{ 113{
113 struct event_trigger_data *data; 114 struct event_trigger_data *data;
114 115
@@ -116,7 +117,7 @@ event_triggers_post_call(struct trace_event_file *file,
116 if (data->paused) 117 if (data->paused)
117 continue; 118 continue;
118 if (data->cmd_ops->trigger_type & tt) 119 if (data->cmd_ops->trigger_type & tt)
119 data->ops->func(data, rec); 120 data->ops->func(data, rec, event);
120 } 121 }
121} 122}
122EXPORT_SYMBOL_GPL(event_triggers_post_call); 123EXPORT_SYMBOL_GPL(event_triggers_post_call);
@@ -908,8 +909,15 @@ void set_named_trigger_data(struct event_trigger_data *data,
908 data->named_data = named_data; 909 data->named_data = named_data;
909} 910}
910 911
912struct event_trigger_data *
913get_named_trigger_data(struct event_trigger_data *data)
914{
915 return data->named_data;
916}
917
911static void 918static void
912traceon_trigger(struct event_trigger_data *data, void *rec) 919traceon_trigger(struct event_trigger_data *data, void *rec,
920 struct ring_buffer_event *event)
913{ 921{
914 if (tracing_is_on()) 922 if (tracing_is_on())
915 return; 923 return;
@@ -918,7 +926,8 @@ traceon_trigger(struct event_trigger_data *data, void *rec)
918} 926}
919 927
920static void 928static void
921traceon_count_trigger(struct event_trigger_data *data, void *rec) 929traceon_count_trigger(struct event_trigger_data *data, void *rec,
930 struct ring_buffer_event *event)
922{ 931{
923 if (tracing_is_on()) 932 if (tracing_is_on())
924 return; 933 return;
@@ -933,7 +942,8 @@ traceon_count_trigger(struct event_trigger_data *data, void *rec)
933} 942}
934 943
935static void 944static void
936traceoff_trigger(struct event_trigger_data *data, void *rec) 945traceoff_trigger(struct event_trigger_data *data, void *rec,
946 struct ring_buffer_event *event)
937{ 947{
938 if (!tracing_is_on()) 948 if (!tracing_is_on())
939 return; 949 return;
@@ -942,7 +952,8 @@ traceoff_trigger(struct event_trigger_data *data, void *rec)
942} 952}
943 953
944static void 954static void
945traceoff_count_trigger(struct event_trigger_data *data, void *rec) 955traceoff_count_trigger(struct event_trigger_data *data, void *rec,
956 struct ring_buffer_event *event)
946{ 957{
947 if (!tracing_is_on()) 958 if (!tracing_is_on())
948 return; 959 return;
@@ -1039,13 +1050,15 @@ static struct event_command trigger_traceoff_cmd = {
1039 1050
1040#ifdef CONFIG_TRACER_SNAPSHOT 1051#ifdef CONFIG_TRACER_SNAPSHOT
1041static void 1052static void
1042snapshot_trigger(struct event_trigger_data *data, void *rec) 1053snapshot_trigger(struct event_trigger_data *data, void *rec,
1054 struct ring_buffer_event *event)
1043{ 1055{
1044 tracing_snapshot(); 1056 tracing_snapshot();
1045} 1057}
1046 1058
1047static void 1059static void
1048snapshot_count_trigger(struct event_trigger_data *data, void *rec) 1060snapshot_count_trigger(struct event_trigger_data *data, void *rec,
1061 struct ring_buffer_event *event)
1049{ 1062{
1050 if (!data->count) 1063 if (!data->count)
1051 return; 1064 return;
@@ -1053,7 +1066,7 @@ snapshot_count_trigger(struct event_trigger_data *data, void *rec)
1053 if (data->count != -1) 1066 if (data->count != -1)
1054 (data->count)--; 1067 (data->count)--;
1055 1068
1056 snapshot_trigger(data, rec); 1069 snapshot_trigger(data, rec, event);
1057} 1070}
1058 1071
1059static int 1072static int
@@ -1141,13 +1154,15 @@ static __init int register_trigger_snapshot_cmd(void) { return 0; }
1141#endif 1154#endif
1142 1155
1143static void 1156static void
1144stacktrace_trigger(struct event_trigger_data *data, void *rec) 1157stacktrace_trigger(struct event_trigger_data *data, void *rec,
1158 struct ring_buffer_event *event)
1145{ 1159{
1146 trace_dump_stack(STACK_SKIP); 1160 trace_dump_stack(STACK_SKIP);
1147} 1161}
1148 1162
1149static void 1163static void
1150stacktrace_count_trigger(struct event_trigger_data *data, void *rec) 1164stacktrace_count_trigger(struct event_trigger_data *data, void *rec,
1165 struct ring_buffer_event *event)
1151{ 1166{
1152 if (!data->count) 1167 if (!data->count)
1153 return; 1168 return;
@@ -1155,7 +1170,7 @@ stacktrace_count_trigger(struct event_trigger_data *data, void *rec)
1155 if (data->count != -1) 1170 if (data->count != -1)
1156 (data->count)--; 1171 (data->count)--;
1157 1172
1158 stacktrace_trigger(data, rec); 1173 stacktrace_trigger(data, rec, event);
1159} 1174}
1160 1175
1161static int 1176static int
@@ -1217,7 +1232,8 @@ static __init void unregister_trigger_traceon_traceoff_cmds(void)
1217} 1232}
1218 1233
1219static void 1234static void
1220event_enable_trigger(struct event_trigger_data *data, void *rec) 1235event_enable_trigger(struct event_trigger_data *data, void *rec,
1236 struct ring_buffer_event *event)
1221{ 1237{
1222 struct enable_trigger_data *enable_data = data->private_data; 1238 struct enable_trigger_data *enable_data = data->private_data;
1223 1239
@@ -1228,7 +1244,8 @@ event_enable_trigger(struct event_trigger_data *data, void *rec)
1228} 1244}
1229 1245
1230static void 1246static void
1231event_enable_count_trigger(struct event_trigger_data *data, void *rec) 1247event_enable_count_trigger(struct event_trigger_data *data, void *rec,
1248 struct ring_buffer_event *event)
1232{ 1249{
1233 struct enable_trigger_data *enable_data = data->private_data; 1250 struct enable_trigger_data *enable_data = data->private_data;
1234 1251
@@ -1242,7 +1259,7 @@ event_enable_count_trigger(struct event_trigger_data *data, void *rec)
1242 if (data->count != -1) 1259 if (data->count != -1)
1243 (data->count)--; 1260 (data->count)--;
1244 1261
1245 event_enable_trigger(data, rec); 1262 event_enable_trigger(data, rec, event);
1246} 1263}
1247 1264
1248int event_enable_trigger_print(struct seq_file *m, 1265int event_enable_trigger_print(struct seq_file *m,
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index 07e75344725b..5cadb1b8b5fe 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -66,6 +66,73 @@ u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i)
66 return (u64)atomic64_read(&elt->fields[i].sum); 66 return (u64)atomic64_read(&elt->fields[i].sum);
67} 67}
68 68
69/**
70 * tracing_map_set_var - Assign a tracing_map_elt's variable field
71 * @elt: The tracing_map_elt
72 * @i: The index of the given variable associated with the tracing_map_elt
73 * @n: The value to assign
74 *
75 * Assign n to variable i associated with the specified tracing_map_elt
76 * instance. The index i is the index returned by the call to
77 * tracing_map_add_var() when the tracing map was set up.
78 */
79void tracing_map_set_var(struct tracing_map_elt *elt, unsigned int i, u64 n)
80{
81 atomic64_set(&elt->vars[i], n);
82 elt->var_set[i] = true;
83}
84
85/**
86 * tracing_map_var_set - Return whether or not a variable has been set
87 * @elt: The tracing_map_elt
88 * @i: The index of the given variable associated with the tracing_map_elt
89 *
90 * Return true if the variable has been set, false otherwise. The
91 * index i is the index returned by the call to tracing_map_add_var()
92 * when the tracing map was set up.
93 */
94bool tracing_map_var_set(struct tracing_map_elt *elt, unsigned int i)
95{
96 return elt->var_set[i];
97}
98
99/**
100 * tracing_map_read_var - Return the value of a tracing_map_elt's variable field
101 * @elt: The tracing_map_elt
102 * @i: The index of the given variable associated with the tracing_map_elt
103 *
104 * Retrieve the value of the variable i associated with the specified
105 * tracing_map_elt instance. The index i is the index returned by the
106 * call to tracing_map_add_var() when the tracing map was set
107 * up.
108 *
109 * Return: The variable value associated with field i for elt.
110 */
111u64 tracing_map_read_var(struct tracing_map_elt *elt, unsigned int i)
112{
113 return (u64)atomic64_read(&elt->vars[i]);
114}
115
116/**
117 * tracing_map_read_var_once - Return and reset a tracing_map_elt's variable field
118 * @elt: The tracing_map_elt
119 * @i: The index of the given variable associated with the tracing_map_elt
120 *
121 * Retrieve the value of the variable i associated with the specified
122 * tracing_map_elt instance, and reset the variable to the 'not set'
123 * state. The index i is the index returned by the call to
124 * tracing_map_add_var() when the tracing map was set up. The reset
125 * essentially makes the variable a read-once variable if it's only
126 * accessed using this function.
127 *
128 * Return: The variable value associated with field i for elt.
129 */
130u64 tracing_map_read_var_once(struct tracing_map_elt *elt, unsigned int i)
131{
132 elt->var_set[i] = false;
133 return (u64)atomic64_read(&elt->vars[i]);
134}
135
69int tracing_map_cmp_string(void *val_a, void *val_b) 136int tracing_map_cmp_string(void *val_a, void *val_b)
70{ 137{
71 char *a = val_a; 138 char *a = val_a;
@@ -171,6 +238,28 @@ int tracing_map_add_sum_field(struct tracing_map *map)
171} 238}
172 239
173/** 240/**
241 * tracing_map_add_var - Add a field describing a tracing_map var
242 * @map: The tracing_map
243 *
244 * Add a var to the map and return the index identifying it in the map
245 * and associated tracing_map_elts. This is the index used for
246 * instance to update a var for a particular tracing_map_elt using
247 * tracing_map_update_var() or reading it via tracing_map_read_var().
248 *
249 * Return: The index identifying the var in the map and associated
250 * tracing_map_elts, or -EINVAL on error.
251 */
252int tracing_map_add_var(struct tracing_map *map)
253{
254 int ret = -EINVAL;
255
256 if (map->n_vars < TRACING_MAP_VARS_MAX)
257 ret = map->n_vars++;
258
259 return ret;
260}
261
262/**
174 * tracing_map_add_key_field - Add a field describing a tracing_map key 263 * tracing_map_add_key_field - Add a field describing a tracing_map key
175 * @map: The tracing_map 264 * @map: The tracing_map
176 * @offset: The offset within the key 265 * @offset: The offset within the key
@@ -280,6 +369,11 @@ static void tracing_map_elt_clear(struct tracing_map_elt *elt)
280 if (elt->fields[i].cmp_fn == tracing_map_cmp_atomic64) 369 if (elt->fields[i].cmp_fn == tracing_map_cmp_atomic64)
281 atomic64_set(&elt->fields[i].sum, 0); 370 atomic64_set(&elt->fields[i].sum, 0);
282 371
372 for (i = 0; i < elt->map->n_vars; i++) {
373 atomic64_set(&elt->vars[i], 0);
374 elt->var_set[i] = false;
375 }
376
283 if (elt->map->ops && elt->map->ops->elt_clear) 377 if (elt->map->ops && elt->map->ops->elt_clear)
284 elt->map->ops->elt_clear(elt); 378 elt->map->ops->elt_clear(elt);
285} 379}
@@ -306,6 +400,8 @@ static void tracing_map_elt_free(struct tracing_map_elt *elt)
306 if (elt->map->ops && elt->map->ops->elt_free) 400 if (elt->map->ops && elt->map->ops->elt_free)
307 elt->map->ops->elt_free(elt); 401 elt->map->ops->elt_free(elt);
308 kfree(elt->fields); 402 kfree(elt->fields);
403 kfree(elt->vars);
404 kfree(elt->var_set);
309 kfree(elt->key); 405 kfree(elt->key);
310 kfree(elt); 406 kfree(elt);
311} 407}
@@ -333,6 +429,18 @@ static struct tracing_map_elt *tracing_map_elt_alloc(struct tracing_map *map)
333 goto free; 429 goto free;
334 } 430 }
335 431
432 elt->vars = kcalloc(map->n_vars, sizeof(*elt->vars), GFP_KERNEL);
433 if (!elt->vars) {
434 err = -ENOMEM;
435 goto free;
436 }
437
438 elt->var_set = kcalloc(map->n_vars, sizeof(*elt->var_set), GFP_KERNEL);
439 if (!elt->var_set) {
440 err = -ENOMEM;
441 goto free;
442 }
443
336 tracing_map_elt_init_fields(elt); 444 tracing_map_elt_init_fields(elt);
337 445
338 if (map->ops && map->ops->elt_alloc) { 446 if (map->ops && map->ops->elt_alloc) {
@@ -414,7 +522,9 @@ static inline struct tracing_map_elt *
414__tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only) 522__tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only)
415{ 523{
416 u32 idx, key_hash, test_key; 524 u32 idx, key_hash, test_key;
525 int dup_try = 0;
417 struct tracing_map_entry *entry; 526 struct tracing_map_entry *entry;
527 struct tracing_map_elt *val;
418 528
419 key_hash = jhash(key, map->key_size, 0); 529 key_hash = jhash(key, map->key_size, 0);
420 if (key_hash == 0) 530 if (key_hash == 0)
@@ -426,11 +536,33 @@ __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only)
426 entry = TRACING_MAP_ENTRY(map->map, idx); 536 entry = TRACING_MAP_ENTRY(map->map, idx);
427 test_key = entry->key; 537 test_key = entry->key;
428 538
429 if (test_key && test_key == key_hash && entry->val && 539 if (test_key && test_key == key_hash) {
430 keys_match(key, entry->val->key, map->key_size)) { 540 val = READ_ONCE(entry->val);
431 if (!lookup_only) 541 if (val &&
432 atomic64_inc(&map->hits); 542 keys_match(key, val->key, map->key_size)) {
433 return entry->val; 543 if (!lookup_only)
544 atomic64_inc(&map->hits);
545 return val;
546 } else if (unlikely(!val)) {
547 /*
548 * The key is present. But, val (pointer to elt
549 * struct) is still NULL. which means some other
550 * thread is in the process of inserting an
551 * element.
552 *
553 * On top of that, it's key_hash is same as the
554 * one being inserted right now. So, it's
555 * possible that the element has the same
556 * key as well.
557 */
558
559 dup_try++;
560 if (dup_try > map->map_size) {
561 atomic64_inc(&map->drops);
562 break;
563 }
564 continue;
565 }
434 } 566 }
435 567
436 if (!test_key) { 568 if (!test_key) {
@@ -452,6 +584,13 @@ __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only)
452 atomic64_inc(&map->hits); 584 atomic64_inc(&map->hits);
453 585
454 return entry->val; 586 return entry->val;
587 } else {
588 /*
589 * cmpxchg() failed. Loop around once
590 * more to check what key was inserted.
591 */
592 dup_try++;
593 continue;
455 } 594 }
456 } 595 }
457 596
@@ -816,67 +955,15 @@ create_sort_entry(void *key, struct tracing_map_elt *elt)
816 return sort_entry; 955 return sort_entry;
817} 956}
818 957
819static struct tracing_map_elt *copy_elt(struct tracing_map_elt *elt) 958static void detect_dups(struct tracing_map_sort_entry **sort_entries,
820{
821 struct tracing_map_elt *dup_elt;
822 unsigned int i;
823
824 dup_elt = tracing_map_elt_alloc(elt->map);
825 if (IS_ERR(dup_elt))
826 return NULL;
827
828 if (elt->map->ops && elt->map->ops->elt_copy)
829 elt->map->ops->elt_copy(dup_elt, elt);
830
831 dup_elt->private_data = elt->private_data;
832 memcpy(dup_elt->key, elt->key, elt->map->key_size);
833
834 for (i = 0; i < elt->map->n_fields; i++) {
835 atomic64_set(&dup_elt->fields[i].sum,
836 atomic64_read(&elt->fields[i].sum));
837 dup_elt->fields[i].cmp_fn = elt->fields[i].cmp_fn;
838 }
839
840 return dup_elt;
841}
842
843static int merge_dup(struct tracing_map_sort_entry **sort_entries,
844 unsigned int target, unsigned int dup)
845{
846 struct tracing_map_elt *target_elt, *elt;
847 bool first_dup = (target - dup) == 1;
848 int i;
849
850 if (first_dup) {
851 elt = sort_entries[target]->elt;
852 target_elt = copy_elt(elt);
853 if (!target_elt)
854 return -ENOMEM;
855 sort_entries[target]->elt = target_elt;
856 sort_entries[target]->elt_copied = true;
857 } else
858 target_elt = sort_entries[target]->elt;
859
860 elt = sort_entries[dup]->elt;
861
862 for (i = 0; i < elt->map->n_fields; i++)
863 atomic64_add(atomic64_read(&elt->fields[i].sum),
864 &target_elt->fields[i].sum);
865
866 sort_entries[dup]->dup = true;
867
868 return 0;
869}
870
871static int merge_dups(struct tracing_map_sort_entry **sort_entries,
872 int n_entries, unsigned int key_size) 959 int n_entries, unsigned int key_size)
873{ 960{
874 unsigned int dups = 0, total_dups = 0; 961 unsigned int dups = 0, total_dups = 0;
875 int err, i, j; 962 int i;
876 void *key; 963 void *key;
877 964
878 if (n_entries < 2) 965 if (n_entries < 2)
879 return total_dups; 966 return;
880 967
881 sort(sort_entries, n_entries, sizeof(struct tracing_map_sort_entry *), 968 sort(sort_entries, n_entries, sizeof(struct tracing_map_sort_entry *),
882 (int (*)(const void *, const void *))cmp_entries_dup, NULL); 969 (int (*)(const void *, const void *))cmp_entries_dup, NULL);
@@ -885,30 +972,14 @@ static int merge_dups(struct tracing_map_sort_entry **sort_entries,
885 for (i = 1; i < n_entries; i++) { 972 for (i = 1; i < n_entries; i++) {
886 if (!memcmp(sort_entries[i]->key, key, key_size)) { 973 if (!memcmp(sort_entries[i]->key, key, key_size)) {
887 dups++; total_dups++; 974 dups++; total_dups++;
888 err = merge_dup(sort_entries, i - dups, i);
889 if (err)
890 return err;
891 continue; 975 continue;
892 } 976 }
893 key = sort_entries[i]->key; 977 key = sort_entries[i]->key;
894 dups = 0; 978 dups = 0;
895 } 979 }
896 980
897 if (!total_dups) 981 WARN_ONCE(total_dups > 0,
898 return total_dups; 982 "Duplicates detected: %d\n", total_dups);
899
900 for (i = 0, j = 0; i < n_entries; i++) {
901 if (!sort_entries[i]->dup) {
902 sort_entries[j] = sort_entries[i];
903 if (j++ != i)
904 sort_entries[i] = NULL;
905 } else {
906 destroy_sort_entry(sort_entries[i]);
907 sort_entries[i] = NULL;
908 }
909 }
910
911 return total_dups;
912} 983}
913 984
914static bool is_key(struct tracing_map *map, unsigned int field_idx) 985static bool is_key(struct tracing_map *map, unsigned int field_idx)
@@ -1034,10 +1105,7 @@ int tracing_map_sort_entries(struct tracing_map *map,
1034 return 1; 1105 return 1;
1035 } 1106 }
1036 1107
1037 ret = merge_dups(entries, n_entries, map->key_size); 1108 detect_dups(entries, n_entries, map->key_size);
1038 if (ret < 0)
1039 goto free;
1040 n_entries -= ret;
1041 1109
1042 if (is_key(map, sort_keys[0].field_idx)) 1110 if (is_key(map, sort_keys[0].field_idx))
1043 cmp_entries_fn = cmp_entries_key; 1111 cmp_entries_fn = cmp_entries_key;
diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h
index 5b5bbf8ae550..053eb92b2d31 100644
--- a/kernel/trace/tracing_map.h
+++ b/kernel/trace/tracing_map.h
@@ -10,6 +10,7 @@
10#define TRACING_MAP_VALS_MAX 3 10#define TRACING_MAP_VALS_MAX 3
11#define TRACING_MAP_FIELDS_MAX (TRACING_MAP_KEYS_MAX + \ 11#define TRACING_MAP_FIELDS_MAX (TRACING_MAP_KEYS_MAX + \
12 TRACING_MAP_VALS_MAX) 12 TRACING_MAP_VALS_MAX)
13#define TRACING_MAP_VARS_MAX 16
13#define TRACING_MAP_SORT_KEYS_MAX 2 14#define TRACING_MAP_SORT_KEYS_MAX 2
14 15
15typedef int (*tracing_map_cmp_fn_t) (void *val_a, void *val_b); 16typedef int (*tracing_map_cmp_fn_t) (void *val_a, void *val_b);
@@ -137,6 +138,8 @@ struct tracing_map_field {
137struct tracing_map_elt { 138struct tracing_map_elt {
138 struct tracing_map *map; 139 struct tracing_map *map;
139 struct tracing_map_field *fields; 140 struct tracing_map_field *fields;
141 atomic64_t *vars;
142 bool *var_set;
140 void *key; 143 void *key;
141 void *private_data; 144 void *private_data;
142}; 145};
@@ -192,6 +195,7 @@ struct tracing_map {
192 int key_idx[TRACING_MAP_KEYS_MAX]; 195 int key_idx[TRACING_MAP_KEYS_MAX];
193 unsigned int n_keys; 196 unsigned int n_keys;
194 struct tracing_map_sort_key sort_key; 197 struct tracing_map_sort_key sort_key;
198 unsigned int n_vars;
195 atomic64_t hits; 199 atomic64_t hits;
196 atomic64_t drops; 200 atomic64_t drops;
197}; 201};
@@ -215,11 +219,6 @@ struct tracing_map {
215 * Element allocation occurs before tracing begins, when the 219 * Element allocation occurs before tracing begins, when the
216 * tracing_map_init() call is made by client code. 220 * tracing_map_init() call is made by client code.
217 * 221 *
218 * @elt_copy: At certain points in the lifetime of an element, it may
219 * need to be copied. The copy should include a copy of the
220 * client-allocated data, which can be copied into the 'to'
221 * element from the 'from' element.
222 *
223 * @elt_free: When a tracing_map_elt is freed, this function is called 222 * @elt_free: When a tracing_map_elt is freed, this function is called
224 * and allows client-allocated per-element data to be freed. 223 * and allows client-allocated per-element data to be freed.
225 * 224 *
@@ -233,8 +232,6 @@ struct tracing_map {
233 */ 232 */
234struct tracing_map_ops { 233struct tracing_map_ops {
235 int (*elt_alloc)(struct tracing_map_elt *elt); 234 int (*elt_alloc)(struct tracing_map_elt *elt);
236 void (*elt_copy)(struct tracing_map_elt *to,
237 struct tracing_map_elt *from);
238 void (*elt_free)(struct tracing_map_elt *elt); 235 void (*elt_free)(struct tracing_map_elt *elt);
239 void (*elt_clear)(struct tracing_map_elt *elt); 236 void (*elt_clear)(struct tracing_map_elt *elt);
240 void (*elt_init)(struct tracing_map_elt *elt); 237 void (*elt_init)(struct tracing_map_elt *elt);
@@ -248,6 +245,7 @@ tracing_map_create(unsigned int map_bits,
248extern int tracing_map_init(struct tracing_map *map); 245extern int tracing_map_init(struct tracing_map *map);
249 246
250extern int tracing_map_add_sum_field(struct tracing_map *map); 247extern int tracing_map_add_sum_field(struct tracing_map *map);
248extern int tracing_map_add_var(struct tracing_map *map);
251extern int tracing_map_add_key_field(struct tracing_map *map, 249extern int tracing_map_add_key_field(struct tracing_map *map,
252 unsigned int offset, 250 unsigned int offset,
253 tracing_map_cmp_fn_t cmp_fn); 251 tracing_map_cmp_fn_t cmp_fn);
@@ -267,7 +265,13 @@ extern int tracing_map_cmp_none(void *val_a, void *val_b);
267 265
268extern void tracing_map_update_sum(struct tracing_map_elt *elt, 266extern void tracing_map_update_sum(struct tracing_map_elt *elt,
269 unsigned int i, u64 n); 267 unsigned int i, u64 n);
268extern void tracing_map_set_var(struct tracing_map_elt *elt,
269 unsigned int i, u64 n);
270extern bool tracing_map_var_set(struct tracing_map_elt *elt, unsigned int i);
270extern u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i); 271extern u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i);
272extern u64 tracing_map_read_var(struct tracing_map_elt *elt, unsigned int i);
273extern u64 tracing_map_read_var_once(struct tracing_map_elt *elt, unsigned int i);
274
271extern void tracing_map_set_field_descr(struct tracing_map *map, 275extern void tracing_map_set_field_descr(struct tracing_map *map,
272 unsigned int i, 276 unsigned int i,
273 unsigned int key_offset, 277 unsigned int key_offset,
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index d7a708f82559..89f8a4a4b770 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -2591,6 +2591,8 @@ int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args)
2591 case 's': 2591 case 's':
2592 case 'F': 2592 case 'F':
2593 case 'f': 2593 case 'f':
2594 case 'x':
2595 case 'K':
2594 save_arg(void *); 2596 save_arg(void *);
2595 break; 2597 break;
2596 default: 2598 default:
@@ -2765,6 +2767,8 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf)
2765 case 's': 2767 case 's':
2766 case 'F': 2768 case 'F':
2767 case 'f': 2769 case 'f':
2770 case 'x':
2771 case 'K':
2768 process = true; 2772 process = true;
2769 break; 2773 break;
2770 default: 2774 default:
diff --git a/security/security.c b/security/security.c
index d2a84cda7e8d..7bc2fde023a7 100644
--- a/security/security.c
+++ b/security/security.c
@@ -30,6 +30,8 @@
30#include <linux/string.h> 30#include <linux/string.h>
31#include <net/flow.h> 31#include <net/flow.h>
32 32
33#include <trace/events/initcall.h>
34
33#define MAX_LSM_EVM_XATTR 2 35#define MAX_LSM_EVM_XATTR 2
34 36
35/* Maximum number of letters for an LSM name string */ 37/* Maximum number of letters for an LSM name string */
@@ -45,10 +47,14 @@ static __initdata char chosen_lsm[SECURITY_NAME_MAX + 1] =
45 47
46static void __init do_security_initcalls(void) 48static void __init do_security_initcalls(void)
47{ 49{
50 int ret;
48 initcall_t *call; 51 initcall_t *call;
49 call = __security_initcall_start; 52 call = __security_initcall_start;
53 trace_initcall_level("security");
50 while (call < __security_initcall_end) { 54 while (call < __security_initcall_end) {
51 (*call) (); 55 trace_initcall_start((*call));
56 ret = (*call) ();
57 trace_initcall_finish((*call), ret);
52 call++; 58 call++;
53 } 59 }
54} 60}
diff --git a/tools/testing/selftests/ftrace/test.d/functions b/tools/testing/selftests/ftrace/test.d/functions
index df3dd7fe5f9b..2a4f16fc9819 100644
--- a/tools/testing/selftests/ftrace/test.d/functions
+++ b/tools/testing/selftests/ftrace/test.d/functions
@@ -59,6 +59,13 @@ disable_events() {
59 echo 0 > events/enable 59 echo 0 > events/enable
60} 60}
61 61
62clear_synthetic_events() { # reset all current synthetic events
63 grep -v ^# synthetic_events |
64 while read line; do
65 echo "!$line" >> synthetic_events
66 done
67}
68
62initialize_ftrace() { # Reset ftrace to initial-state 69initialize_ftrace() { # Reset ftrace to initial-state
63# As the initial state, ftrace will be set to nop tracer, 70# As the initial state, ftrace will be set to nop tracer,
64# no events, no triggers, no filters, no function filters, 71# no events, no triggers, no filters, no function filters,
diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-extended-error-support.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-extended-error-support.tc
new file mode 100644
index 000000000000..786dce7e48be
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-extended-error-support.tc
@@ -0,0 +1,39 @@
1#!/bin/sh
2# description: event trigger - test extended error support
3
4
5do_reset() {
6 reset_trigger
7 echo > set_event
8 clear_trace
9}
10
11fail() { #msg
12 do_reset
13 echo $1
14 exit_fail
15}
16
17if [ ! -f set_event ]; then
18 echo "event tracing is not supported"
19 exit_unsupported
20fi
21
22if [ ! -f synthetic_events ]; then
23 echo "synthetic event is not supported"
24 exit_unsupported
25fi
26
27reset_tracer
28do_reset
29
30echo "Test extended error support"
31echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' > events/sched/sched_wakeup/trigger
32echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' >> events/sched/sched_wakeup/trigger 2> /dev/null
33if ! grep -q "ERROR:" events/sched/sched_wakeup/hist; then
34 fail "Failed to generate extended error in histogram"
35fi
36
37do_reset
38
39exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-field-variable-support.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-field-variable-support.tc
new file mode 100644
index 000000000000..7fd5b4a8f060
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-field-variable-support.tc
@@ -0,0 +1,54 @@
1#!/bin/sh
2# description: event trigger - test field variable support
3
4do_reset() {
5 reset_trigger
6 echo > set_event
7 clear_trace
8}
9
10fail() { #msg
11 do_reset
12 echo $1
13 exit_fail
14}
15
16if [ ! -f set_event ]; then
17 echo "event tracing is not supported"
18 exit_unsupported
19fi
20
21if [ ! -f synthetic_events ]; then
22 echo "synthetic event is not supported"
23 exit_unsupported
24fi
25
26clear_synthetic_events
27reset_tracer
28do_reset
29
30echo "Test field variable support"
31
32echo 'wakeup_latency u64 lat; pid_t pid; int prio; char comm[16]' > synthetic_events
33echo 'hist:keys=comm:ts0=common_timestamp.usecs if comm=="ping"' > events/sched/sched_waking/trigger
34echo 'hist:keys=next_comm:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,next_pid,sched.sched_waking.prio,next_comm) if next_comm=="ping"' > events/sched/sched_switch/trigger
35echo 'hist:keys=pid,prio,comm:vals=lat:sort=pid,prio' > events/synthetic/wakeup_latency/trigger
36
37ping localhost -c 3
38if ! grep -q "ping" events/synthetic/wakeup_latency/hist; then
39 fail "Failed to create inter-event histogram"
40fi
41
42if ! grep -q "synthetic_prio=prio" events/sched/sched_waking/hist; then
43 fail "Failed to create histogram with field variable"
44fi
45
46echo '!hist:keys=next_comm:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,next_pid,sched.sched_waking.prio,next_comm) if next_comm=="ping"' >> events/sched/sched_switch/trigger
47
48if grep -q "synthetic_prio=prio" events/sched/sched_waking/hist; then
49 fail "Failed to remove histogram with field variable"
50fi
51
52do_reset
53
54exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-inter-event-combined-hist.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-inter-event-combined-hist.tc
new file mode 100644
index 000000000000..c93dbe38b5df
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-inter-event-combined-hist.tc
@@ -0,0 +1,58 @@
1#!/bin/sh
2# description: event trigger - test inter-event combined histogram trigger
3
4do_reset() {
5 reset_trigger
6 echo > set_event
7 clear_trace
8}
9
10fail() { #msg
11 do_reset
12 echo $1
13 exit_fail
14}
15
16if [ ! -f set_event ]; then
17 echo "event tracing is not supported"
18 exit_unsupported
19fi
20
21if [ ! -f synthetic_events ]; then
22 echo "synthetic event is not supported"
23 exit_unsupported
24fi
25
26reset_tracer
27do_reset
28clear_synthetic_events
29
30echo "Test create synthetic event"
31
32echo 'waking_latency u64 lat pid_t pid' > synthetic_events
33if [ ! -d events/synthetic/waking_latency ]; then
34 fail "Failed to create waking_latency synthetic event"
35fi
36
37echo "Test combined histogram"
38
39echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' > events/sched/sched_waking/trigger
40echo 'hist:keys=pid:waking_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_waking).waking_latency($waking_lat,pid) if comm=="ping"' > events/sched/sched_wakeup/trigger
41echo 'hist:keys=pid,lat:sort=pid,lat' > events/synthetic/waking_latency/trigger
42
43echo 'wakeup_latency u64 lat pid_t pid' >> synthetic_events
44echo 'hist:keys=pid:ts1=common_timestamp.usecs if comm=="ping"' >> events/sched/sched_wakeup/trigger
45echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts1:onmatch(sched.sched_wakeup).wakeup_latency($wakeup_lat,next_pid) if next_comm=="ping"' > events/sched/sched_switch/trigger
46
47echo 'waking+wakeup_latency u64 lat; pid_t pid' >> synthetic_events
48echo 'hist:keys=pid,lat:sort=pid,lat:ww_lat=$waking_lat+$wakeup_lat:onmatch(synthetic.wakeup_latency).waking+wakeup_latency($ww_lat,pid)' >> events/synthetic/wakeup_latency/trigger
49echo 'hist:keys=pid,lat:sort=pid,lat' >> events/synthetic/waking+wakeup_latency/trigger
50
51ping localhost -c 3
52if ! grep -q "pid:" events/synthetic/waking+wakeup_latency/hist; then
53 fail "Failed to create combined histogram"
54fi
55
56do_reset
57
58exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-action-hist.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-action-hist.tc
new file mode 100644
index 000000000000..e84e7d048566
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-action-hist.tc
@@ -0,0 +1,50 @@
1#!/bin/sh
2# description: event trigger - test inter-event histogram trigger onmatch action
3
4do_reset() {
5 reset_trigger
6 echo > set_event
7 clear_trace
8}
9
10fail() { #msg
11 do_reset
12 echo $1
13 exit_fail
14}
15
16if [ ! -f set_event ]; then
17 echo "event tracing is not supported"
18 exit_unsupported
19fi
20
21if [ ! -f synthetic_events ]; then
22 echo "synthetic event is not supported"
23 exit_unsupported
24fi
25
26clear_synthetic_events
27reset_tracer
28do_reset
29
30echo "Test create synthetic event"
31
32echo 'wakeup_latency u64 lat pid_t pid char comm[16]' > synthetic_events
33if [ ! -d events/synthetic/wakeup_latency ]; then
34 fail "Failed to create wakeup_latency synthetic event"
35fi
36
37echo "Test create histogram for synthetic event"
38echo "Test histogram variables,simple expression support and onmatch action"
39
40echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' > events/sched/sched_wakeup/trigger
41echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_wakeup).wakeup_latency($wakeup_lat,next_pid,next_comm) if next_comm=="ping"' > events/sched/sched_switch/trigger
42echo 'hist:keys=comm,pid,lat:wakeup_lat=lat:sort=lat' > events/synthetic/wakeup_latency/trigger
43ping localhost -c 5
44if ! grep -q "ping" events/synthetic/wakeup_latency/hist; then
45 fail "Failed to create onmatch action inter-event histogram"
46fi
47
48do_reset
49
50exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-onmax-action-hist.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-onmax-action-hist.tc
new file mode 100644
index 000000000000..7907d8aacde3
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-onmax-action-hist.tc
@@ -0,0 +1,50 @@
1#!/bin/sh
2# description: event trigger - test inter-event histogram trigger onmatch-onmax action
3
4do_reset() {
5 reset_trigger
6 echo > set_event
7 clear_trace
8}
9
10fail() { #msg
11 do_reset
12 echo $1
13 exit_fail
14}
15
16if [ ! -f set_event ]; then
17 echo "event tracing is not supported"
18 exit_unsupported
19fi
20
21if [ ! -f synthetic_events ]; then
22 echo "synthetic event is not supported"
23 exit_unsupported
24fi
25
26clear_synthetic_events
27reset_tracer
28do_reset
29
30echo "Test create synthetic event"
31
32echo 'wakeup_latency u64 lat pid_t pid char comm[16]' > synthetic_events
33if [ ! -d events/synthetic/wakeup_latency ]; then
34 fail "Failed to create wakeup_latency synthetic event"
35fi
36
37echo "Test create histogram for synthetic event"
38echo "Test histogram variables,simple expression support and onmatch-onmax action"
39
40echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' > events/sched/sched_wakeup/trigger
41echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_wakeup).wakeup_latency($wakeup_lat,next_pid,next_comm):onmax($wakeup_lat).save(next_comm,prev_pid,prev_prio,prev_comm) if next_comm=="ping"' >> events/sched/sched_switch/trigger
42echo 'hist:keys=comm,pid,lat:wakeup_lat=lat:sort=lat' > events/synthetic/wakeup_latency/trigger
43ping localhost -c 5
44if ! grep -q "ping" events/synthetic/wakeup_latency/hist || ! grep -q "max:" events/sched/sched_switch/hist; then
45 fail "Failed to create onmatch-onmax action inter-event histogram"
46fi
47
48do_reset
49
50exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmax-action-hist.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmax-action-hist.tc
new file mode 100644
index 000000000000..38b7ed6242b2
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmax-action-hist.tc
@@ -0,0 +1,48 @@
1#!/bin/sh
2# description: event trigger - test inter-event histogram trigger onmax action
3
4do_reset() {
5 reset_trigger
6 echo > set_event
7 clear_trace
8}
9
10fail() { #msg
11 do_reset
12 echo $1
13 exit_fail
14}
15
16if [ ! -f set_event ]; then
17 echo "event tracing is not supported"
18 exit_unsupported
19fi
20
21if [ ! -f synthetic_events ]; then
22 echo "synthetic event is not supported"
23 exit_unsupported
24fi
25
26clear_synthetic_events
27reset_tracer
28do_reset
29
30echo "Test create synthetic event"
31
32echo 'wakeup_latency u64 lat pid_t pid char comm[16]' > synthetic_events
33if [ ! -d events/synthetic/wakeup_latency ]; then
34 fail "Failed to create wakeup_latency synthetic event"
35fi
36
37echo "Test onmax action"
38
39echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' >> events/sched/sched_waking/trigger
40echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmax($wakeup_lat).save(next_comm,prev_pid,prev_prio,prev_comm) if next_comm=="ping"' >> events/sched/sched_switch/trigger
41ping localhost -c 3
42if ! grep -q "max:" events/sched/sched_switch/hist; then
43 fail "Failed to create onmax action inter-event histogram"
44fi
45
46do_reset
47
48exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-createremove.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-createremove.tc
new file mode 100644
index 000000000000..cef11377dcbd
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-createremove.tc
@@ -0,0 +1,54 @@
1#!/bin/sh
2# description: event trigger - test synthetic event create remove
3do_reset() {
4 reset_trigger
5 echo > set_event
6 clear_trace
7}
8
9fail() { #msg
10 do_reset
11 echo $1
12 exit_fail
13}
14
15if [ ! -f set_event ]; then
16 echo "event tracing is not supported"
17 exit_unsupported
18fi
19
20if [ ! -f synthetic_events ]; then
21 echo "synthetic event is not supported"
22 exit_unsupported
23fi
24
25clear_synthetic_events
26reset_tracer
27do_reset
28
29echo "Test create synthetic event"
30
31echo 'wakeup_latency u64 lat pid_t pid char comm[16]' > synthetic_events
32if [ ! -d events/synthetic/wakeup_latency ]; then
33 fail "Failed to create wakeup_latency synthetic event"
34fi
35
36reset_trigger
37
38echo "Test create synthetic event with an error"
39echo 'wakeup_latency u64 lat pid_t pid char' > synthetic_events 2> /dev/null
40if [ -d events/synthetic/wakeup_latency ]; then
41 fail "Created wakeup_latency synthetic event with an invalid format"
42fi
43
44reset_trigger
45
46echo "Test remove synthetic event"
47echo '!wakeup_latency u64 lat pid_t pid char comm[16]' > synthetic_events
48if [ -d events/synthetic/wakeup_latency ]; then
49 fail "Failed to delete wakeup_latency synthetic event"
50fi
51
52do_reset
53
54exit 0