aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKan Liang <kan.liang@intel.com>2017-09-29 10:47:54 -0400
committerArnaldo Carvalho de Melo <acme@redhat.com>2017-10-03 08:27:46 -0400
commit340b47f510bbe55a76b7309107276f02ea11f117 (patch)
tree4dde66840cd57b47c01f7c350ad545347f4e2178
parentf988e71bc6220d8b404dbd43c0e0962e30305795 (diff)
perf top: Implement multithreading for perf_event__synthesize_threads
The proc files, which are sorted in alphabetical order, are evenly assigned to several synthesize threads to be processed in parallel. For 'perf top', the number of threads is hard-coded to the number of online CPUs. A following patch will introduce an option to set it. For other perf tools, the number of threads is 1, because their process functions, e.g. process_synthesized_event, are not ready for multithreading. This patch series only supports multithreaded event synthesis for 'perf top'. For other tools, it can be done separately later. With multithreading applied, the total processing time gets up to a 1.56x speedup on Knights Mill for 'perf top'. For a specific single event, the processing time could increase because of lock contention, so proc_map_timeout may need to be increased; otherwise some proc maps will be truncated. Based on my test, increasing proc_map_timeout has a small impact on the total processing time: the total processing time still gets a 1.49x speedup on Knights Mill after increasing proc_map_timeout. The patch itself doesn't increase proc_map_timeout. There is no need to implement multithreading for per-task monitoring (perf_event__synthesize_thread_map), since it doesn't have a performance issue.
Committer testing: # getconf _NPROCESSORS_ONLN 4 # perf trace --no-inherit -e clone -o /tmp/output perf top # tail -4 /tmp/bla 0.124 ( 0.041 ms): clone(flags: VM|FS|FILES|SIGHAND|THREAD|SYSVSEM|SETTLS|PARENT_SETTID|CHILD_CLEARTID, child_stack: 0x7fc3eb3a8f30, parent_tidptr: 0x7fc3eb3a99d0, child_tidptr: 0x7fc3eb3a99d0, tls: 0x7fc3eb3a9700) = 9548 (perf) 0.246 ( 0.023 ms): clone(flags: VM|FS|FILES|SIGHAND|THREAD|SYSVSEM|SETTLS|PARENT_SETTID|CHILD_CLEARTID, child_stack: 0x7fc3eaba7f30, parent_tidptr: 0x7fc3eaba89d0, child_tidptr: 0x7fc3eaba89d0, tls: 0x7fc3eaba8700) = 9549 (perf) 0.286 ( 0.019 ms): clone(flags: VM|FS|FILES|SIGHAND|THREAD|SYSVSEM|SETTLS|PARENT_SETTID|CHILD_CLEARTID, child_stack: 0x7fc3ea3a6f30, parent_tidptr: 0x7fc3ea3a79d0, child_tidptr: 0x7fc3ea3a79d0, tls: 0x7fc3ea3a7700) = 9550 (perf) 246.540 ( 0.047 ms): clone(flags: VM|FS|FILES|SIGHAND|THREAD|SYSVSEM|SETTLS|PARENT_SETTID|CHILD_CLEARTID, child_stack: 0x7fc3ea3a6f30, parent_tidptr: 0x7fc3ea3a79d0, child_tidptr: 0x7fc3ea3a79d0, tls: 0x7fc3ea3a7700) = 9551 (perf) # Signed-off-by: Kan Liang <kan.liang@intel.com> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Acked-by: Jiri Olsa <jolsa@kernel.org> Cc: Adrian Hunter <adrian.hunter@intel.com> Cc: Alexei Starovoitov <ast@kernel.org> Cc: Andi Kleen <ak@linux.intel.com> Cc: He Kuang <hekuang@huawei.com> Cc: Lukasz Odzioba <lukasz.odzioba@intel.com> Cc: Namhyung Kim <namhyung@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Wang Nan <wangnan0@huawei.com> Link: http://lkml.kernel.org/r/1506696477-146932-4-git-send-email-kan.liang@intel.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
-rw-r--r--tools/perf/builtin-kvm.c3
-rw-r--r--tools/perf/builtin-record.c2
-rw-r--r--tools/perf/builtin-top.c8
-rw-r--r--tools/perf/builtin-trace.c2
-rw-r--r--tools/perf/tests/mmap-thread-lookup.c2
-rw-r--r--tools/perf/util/event.c160
-rw-r--r--tools/perf/util/event.h3
-rw-r--r--tools/perf/util/machine.c8
-rw-r--r--tools/perf/util/machine.h9
9 files changed, 155 insertions, 42 deletions
diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c
index c747a1af49fe..721f4f91291a 100644
--- a/tools/perf/builtin-kvm.c
+++ b/tools/perf/builtin-kvm.c
@@ -1441,7 +1441,8 @@ static int kvm_events_live(struct perf_kvm_stat *kvm,
1441 perf_session__set_id_hdr_size(kvm->session); 1441 perf_session__set_id_hdr_size(kvm->session);
1442 ordered_events__set_copy_on_queue(&kvm->session->ordered_events, true); 1442 ordered_events__set_copy_on_queue(&kvm->session->ordered_events, true);
1443 machine__synthesize_threads(&kvm->session->machines.host, &kvm->opts.target, 1443 machine__synthesize_threads(&kvm->session->machines.host, &kvm->opts.target,
1444 kvm->evlist->threads, false, kvm->opts.proc_map_timeout); 1444 kvm->evlist->threads, false,
1445 kvm->opts.proc_map_timeout, 1);
1445 err = kvm_live_open_events(kvm); 1446 err = kvm_live_open_events(kvm);
1446 if (err) 1447 if (err)
1447 goto out; 1448 goto out;
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 9b379f3a3d99..234fdf4734f6 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -863,7 +863,7 @@ static int record__synthesize(struct record *rec, bool tail)
863 863
864 err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->threads, 864 err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->threads,
865 process_synthesized_event, opts->sample_address, 865 process_synthesized_event, opts->sample_address,
866 opts->proc_map_timeout); 866 opts->proc_map_timeout, 1);
867out: 867out:
868 return err; 868 return err;
869} 869}
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index ee954bde7e3e..bc31b93cc1d8 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -958,8 +958,14 @@ static int __cmd_top(struct perf_top *top)
958 if (perf_session__register_idle_thread(top->session) < 0) 958 if (perf_session__register_idle_thread(top->session) < 0)
959 goto out_delete; 959 goto out_delete;
960 960
961 perf_set_multithreaded();
962
961 machine__synthesize_threads(&top->session->machines.host, &opts->target, 963 machine__synthesize_threads(&top->session->machines.host, &opts->target,
962 top->evlist->threads, false, opts->proc_map_timeout); 964 top->evlist->threads, false,
965 opts->proc_map_timeout,
966 (unsigned int)sysconf(_SC_NPROCESSORS_ONLN));
967
968 perf_set_singlethreaded();
963 969
964 if (perf_hpp_list.socket) { 970 if (perf_hpp_list.socket) {
965 ret = perf_env__read_cpu_topology_map(&perf_env); 971 ret = perf_env__read_cpu_topology_map(&perf_env);
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 967bd351b58d..afef6fe46c45 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -1131,7 +1131,7 @@ static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1131 1131
1132 err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target, 1132 err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1133 evlist->threads, trace__tool_process, false, 1133 evlist->threads, trace__tool_process, false,
1134 trace->opts.proc_map_timeout); 1134 trace->opts.proc_map_timeout, 1);
1135 if (err) 1135 if (err)
1136 symbol__exit(); 1136 symbol__exit();
1137 1137
diff --git a/tools/perf/tests/mmap-thread-lookup.c b/tools/perf/tests/mmap-thread-lookup.c
index f94a4196e7c9..2a0068afe3bf 100644
--- a/tools/perf/tests/mmap-thread-lookup.c
+++ b/tools/perf/tests/mmap-thread-lookup.c
@@ -131,7 +131,7 @@ static int synth_all(struct machine *machine)
131{ 131{
132 return perf_event__synthesize_threads(NULL, 132 return perf_event__synthesize_threads(NULL,
133 perf_event__process, 133 perf_event__process,
134 machine, 0, 500); 134 machine, 0, 500, 1);
135} 135}
136 136
137static int synth_process(struct machine *machine) 137static int synth_process(struct machine *machine)
diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c
index 10366b87d0b5..0e678dd6bdbe 100644
--- a/tools/perf/util/event.c
+++ b/tools/perf/util/event.c
@@ -678,23 +678,21 @@ out:
678 return err; 678 return err;
679} 679}
680 680
681int perf_event__synthesize_threads(struct perf_tool *tool, 681static int __perf_event__synthesize_threads(struct perf_tool *tool,
682 perf_event__handler_t process, 682 perf_event__handler_t process,
683 struct machine *machine, 683 struct machine *machine,
684 bool mmap_data, 684 bool mmap_data,
685 unsigned int proc_map_timeout) 685 unsigned int proc_map_timeout,
686 struct dirent **dirent,
687 int start,
688 int num)
686{ 689{
687 union perf_event *comm_event, *mmap_event, *fork_event; 690 union perf_event *comm_event, *mmap_event, *fork_event;
688 union perf_event *namespaces_event; 691 union perf_event *namespaces_event;
689 char proc_path[PATH_MAX];
690 struct dirent **dirent;
691 int err = -1; 692 int err = -1;
692 char *end; 693 char *end;
693 pid_t pid; 694 pid_t pid;
694 int n, i; 695 int i;
695
696 if (machine__is_default_guest(machine))
697 return 0;
698 696
699 comm_event = malloc(sizeof(comm_event->comm) + machine->id_hdr_size); 697 comm_event = malloc(sizeof(comm_event->comm) + machine->id_hdr_size);
700 if (comm_event == NULL) 698 if (comm_event == NULL)
@@ -714,34 +712,25 @@ int perf_event__synthesize_threads(struct perf_tool *tool,
714 if (namespaces_event == NULL) 712 if (namespaces_event == NULL)
715 goto out_free_fork; 713 goto out_free_fork;
716 714
717 snprintf(proc_path, sizeof(proc_path), "%s/proc", machine->root_dir); 715 for (i = start; i < start + num; i++) {
718 n = scandir(proc_path, &dirent, 0, alphasort);
719
720 if (n < 0)
721 goto out_free_namespaces;
722
723 for (i = 0; i < n; i++) {
724 if (!isdigit(dirent[i]->d_name[0])) 716 if (!isdigit(dirent[i]->d_name[0]))
725 continue; 717 continue;
726 718
727 pid = (pid_t)strtol(dirent[i]->d_name, &end, 10); 719 pid = (pid_t)strtol(dirent[i]->d_name, &end, 10);
728 /* only interested in proper numerical dirents */ 720 /* only interested in proper numerical dirents */
729 if (!*end) { 721 if (*end)
730 /* 722 continue;
731 * We may race with exiting thread, so don't stop just because 723 /*
732 * one thread couldn't be synthesized. 724 * We may race with exiting thread, so don't stop just because
733 */ 725 * one thread couldn't be synthesized.
734 __event__synthesize_thread(comm_event, mmap_event, fork_event, 726 */
735 namespaces_event, pid, 1, process, 727 __event__synthesize_thread(comm_event, mmap_event, fork_event,
736 tool, machine, mmap_data, 728 namespaces_event, pid, 1, process,
737 proc_map_timeout); 729 tool, machine, mmap_data,
738 } 730 proc_map_timeout);
739 free(dirent[i]);
740 } 731 }
741 free(dirent);
742 err = 0; 732 err = 0;
743 733
744out_free_namespaces:
745 free(namespaces_event); 734 free(namespaces_event);
746out_free_fork: 735out_free_fork:
747 free(fork_event); 736 free(fork_event);
@@ -753,6 +742,115 @@ out:
753 return err; 742 return err;
754} 743}
755 744
745struct synthesize_threads_arg {
746 struct perf_tool *tool;
747 perf_event__handler_t process;
748 struct machine *machine;
749 bool mmap_data;
750 unsigned int proc_map_timeout;
751 struct dirent **dirent;
752 int num;
753 int start;
754};
755
756static void *synthesize_threads_worker(void *arg)
757{
758 struct synthesize_threads_arg *args = arg;
759
760 __perf_event__synthesize_threads(args->tool, args->process,
761 args->machine, args->mmap_data,
762 args->proc_map_timeout, args->dirent,
763 args->start, args->num);
764 return NULL;
765}
766
767int perf_event__synthesize_threads(struct perf_tool *tool,
768 perf_event__handler_t process,
769 struct machine *machine,
770 bool mmap_data,
771 unsigned int proc_map_timeout,
772 unsigned int nr_threads_synthesize)
773{
774 struct synthesize_threads_arg *args = NULL;
775 pthread_t *synthesize_threads = NULL;
776 char proc_path[PATH_MAX];
777 struct dirent **dirent;
778 int num_per_thread;
779 int m, n, i, j;
780 int thread_nr;
781 int base = 0;
782 int err = -1;
783
784
785 if (machine__is_default_guest(machine))
786 return 0;
787
788 snprintf(proc_path, sizeof(proc_path), "%s/proc", machine->root_dir);
789 n = scandir(proc_path, &dirent, 0, alphasort);
790 if (n < 0)
791 return err;
792
793 thread_nr = nr_threads_synthesize;
794
795 if (thread_nr <= 1) {
796 err = __perf_event__synthesize_threads(tool, process,
797 machine, mmap_data,
798 proc_map_timeout,
799 dirent, base, n);
800 goto free_dirent;
801 }
802 if (thread_nr > n)
803 thread_nr = n;
804
805 synthesize_threads = calloc(sizeof(pthread_t), thread_nr);
806 if (synthesize_threads == NULL)
807 goto free_dirent;
808
809 args = calloc(sizeof(*args), thread_nr);
810 if (args == NULL)
811 goto free_threads;
812
813 num_per_thread = n / thread_nr;
814 m = n % thread_nr;
815 for (i = 0; i < thread_nr; i++) {
816 args[i].tool = tool;
817 args[i].process = process;
818 args[i].machine = machine;
819 args[i].mmap_data = mmap_data;
820 args[i].proc_map_timeout = proc_map_timeout;
821 args[i].dirent = dirent;
822 }
823 for (i = 0; i < m; i++) {
824 args[i].num = num_per_thread + 1;
825 args[i].start = i * args[i].num;
826 }
827 if (i != 0)
828 base = args[i-1].start + args[i-1].num;
829 for (j = i; j < thread_nr; j++) {
830 args[j].num = num_per_thread;
831 args[j].start = base + (j - i) * args[i].num;
832 }
833
834 for (i = 0; i < thread_nr; i++) {
835 if (pthread_create(&synthesize_threads[i], NULL,
836 synthesize_threads_worker, &args[i]))
837 goto out_join;
838 }
839 err = 0;
840out_join:
841 for (i = 0; i < thread_nr; i++)
842 pthread_join(synthesize_threads[i], NULL);
843 free(args);
844free_threads:
845 free(synthesize_threads);
846free_dirent:
847 for (i = 0; i < n; i++)
848 free(dirent[i]);
849 free(dirent);
850
851 return err;
852}
853
756struct process_symbol_args { 854struct process_symbol_args {
757 const char *name; 855 const char *name;
758 u64 start; 856 u64 start;
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index ee7bcc898d35..d6cbb0a0d919 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -680,7 +680,8 @@ int perf_event__synthesize_cpu_map(struct perf_tool *tool,
680int perf_event__synthesize_threads(struct perf_tool *tool, 680int perf_event__synthesize_threads(struct perf_tool *tool,
681 perf_event__handler_t process, 681 perf_event__handler_t process,
682 struct machine *machine, bool mmap_data, 682 struct machine *machine, bool mmap_data,
683 unsigned int proc_map_timeout); 683 unsigned int proc_map_timeout,
684 unsigned int nr_threads_synthesize);
684int perf_event__synthesize_kernel_mmap(struct perf_tool *tool, 685int perf_event__synthesize_kernel_mmap(struct perf_tool *tool,
685 perf_event__handler_t process, 686 perf_event__handler_t process,
686 struct machine *machine); 687 struct machine *machine);
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index 585b4a3d64a4..7c3aa479201a 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -2218,12 +2218,16 @@ int machines__for_each_thread(struct machines *machines,
2218int __machine__synthesize_threads(struct machine *machine, struct perf_tool *tool, 2218int __machine__synthesize_threads(struct machine *machine, struct perf_tool *tool,
2219 struct target *target, struct thread_map *threads, 2219 struct target *target, struct thread_map *threads,
2220 perf_event__handler_t process, bool data_mmap, 2220 perf_event__handler_t process, bool data_mmap,
2221 unsigned int proc_map_timeout) 2221 unsigned int proc_map_timeout,
2222 unsigned int nr_threads_synthesize)
2222{ 2223{
2223 if (target__has_task(target)) 2224 if (target__has_task(target))
2224 return perf_event__synthesize_thread_map(tool, threads, process, machine, data_mmap, proc_map_timeout); 2225 return perf_event__synthesize_thread_map(tool, threads, process, machine, data_mmap, proc_map_timeout);
2225 else if (target__has_cpu(target)) 2226 else if (target__has_cpu(target))
2226 return perf_event__synthesize_threads(tool, process, machine, data_mmap, proc_map_timeout); 2227 return perf_event__synthesize_threads(tool, process,
2228 machine, data_mmap,
2229 proc_map_timeout,
2230 nr_threads_synthesize);
2227 /* command specified */ 2231 /* command specified */
2228 return 0; 2232 return 0;
2229} 2233}
diff --git a/tools/perf/util/machine.h b/tools/perf/util/machine.h
index b1cd516f2025..c6a299ea506c 100644
--- a/tools/perf/util/machine.h
+++ b/tools/perf/util/machine.h
@@ -257,15 +257,18 @@ int machines__for_each_thread(struct machines *machines,
257int __machine__synthesize_threads(struct machine *machine, struct perf_tool *tool, 257int __machine__synthesize_threads(struct machine *machine, struct perf_tool *tool,
258 struct target *target, struct thread_map *threads, 258 struct target *target, struct thread_map *threads,
259 perf_event__handler_t process, bool data_mmap, 259 perf_event__handler_t process, bool data_mmap,
260 unsigned int proc_map_timeout); 260 unsigned int proc_map_timeout,
261 unsigned int nr_threads_synthesize);
261static inline 262static inline
262int machine__synthesize_threads(struct machine *machine, struct target *target, 263int machine__synthesize_threads(struct machine *machine, struct target *target,
263 struct thread_map *threads, bool data_mmap, 264 struct thread_map *threads, bool data_mmap,
264 unsigned int proc_map_timeout) 265 unsigned int proc_map_timeout,
266 unsigned int nr_threads_synthesize)
265{ 267{
266 return __machine__synthesize_threads(machine, NULL, target, threads, 268 return __machine__synthesize_threads(machine, NULL, target, threads,
267 perf_event__process, data_mmap, 269 perf_event__process, data_mmap,
268 proc_map_timeout); 270 proc_map_timeout,
271 nr_threads_synthesize);
269} 272}
270 273
271pid_t machine__get_current_tid(struct machine *machine, int cpu); 274pid_t machine__get_current_tid(struct machine *machine, int cpu);