-rw-r--r--Documentation/perf_counter/.gitignore179
-rw-r--r--Documentation/perf_counter/Documentation/perf-help.txt38
-rw-r--r--Documentation/perf_counter/Documentation/perf-record.txt63
-rw-r--r--Documentation/perf_counter/Documentation/perf-stat.txt76
-rw-r--r--Documentation/perf_counter/Documentation/perf-top.txt61
-rw-r--r--Documentation/perf_counter/Makefile849
-rw-r--r--Documentation/perf_counter/builtin-help.c461
-rw-r--r--Documentation/perf_counter/builtin-record.c613
-rw-r--r--Documentation/perf_counter/builtin-stat.c568
-rw-r--r--Documentation/perf_counter/builtin-top.c1146
-rw-r--r--Documentation/perf_counter/builtin.h22
-rw-r--r--Documentation/perf_counter/command-list.txt6
-rw-r--r--Documentation/perf_counter/design.txt449
-rw-r--r--Documentation/perf_counter/perf-report.cc515
-rw-r--r--Documentation/perf_counter/perf.c414
-rw-r--r--Documentation/perf_counter/perf.h62
-rwxr-xr-xDocumentation/perf_counter/util/PERF-VERSION-GEN42
-rw-r--r--Documentation/perf_counter/util/abspath.c117
-rw-r--r--Documentation/perf_counter/util/alias.c77
-rw-r--r--Documentation/perf_counter/util/cache.h117
-rw-r--r--Documentation/perf_counter/util/config.c873
-rw-r--r--Documentation/perf_counter/util/ctype.c26
-rw-r--r--Documentation/perf_counter/util/exec_cmd.c165
-rw-r--r--Documentation/perf_counter/util/exec_cmd.h13
-rwxr-xr-xDocumentation/perf_counter/util/generate-cmdlist.sh24
-rw-r--r--Documentation/perf_counter/util/help.c366
-rw-r--r--Documentation/perf_counter/util/help.h29
-rw-r--r--Documentation/perf_counter/util/levenshtein.c84
-rw-r--r--Documentation/perf_counter/util/levenshtein.h8
-rw-r--r--Documentation/perf_counter/util/parse-options.c492
-rw-r--r--Documentation/perf_counter/util/parse-options.h172
-rw-r--r--Documentation/perf_counter/util/path.c353
-rw-r--r--Documentation/perf_counter/util/quote.c478
-rw-r--r--Documentation/perf_counter/util/quote.h68
-rw-r--r--Documentation/perf_counter/util/run-command.c395
-rw-r--r--Documentation/perf_counter/util/run-command.h93
-rw-r--r--Documentation/perf_counter/util/strbuf.c359
-rw-r--r--Documentation/perf_counter/util/strbuf.h137
-rw-r--r--Documentation/perf_counter/util/usage.c80
-rw-r--r--Documentation/perf_counter/util/util.h408
-rw-r--r--Documentation/perf_counter/util/wrapper.c206
-rw-r--r--MAINTAINERS10
-rw-r--r--arch/powerpc/include/asm/hw_irq.h39
-rw-r--r--arch/powerpc/include/asm/paca.h1
-rw-r--r--arch/powerpc/include/asm/perf_counter.h95
-rw-r--r--arch/powerpc/include/asm/reg.h2
-rw-r--r--arch/powerpc/include/asm/systbl.h2
-rw-r--r--arch/powerpc/include/asm/unistd.h1
-rw-r--r--arch/powerpc/kernel/Makefile2
-rw-r--r--arch/powerpc/kernel/asm-offsets.c1
-rw-r--r--arch/powerpc/kernel/entry_64.S9
-rw-r--r--arch/powerpc/kernel/irq.c5
-rw-r--r--arch/powerpc/kernel/perf_counter.c1165
-rw-r--r--arch/powerpc/kernel/power4-pmu.c557
-rw-r--r--arch/powerpc/kernel/power5+-pmu.c630
-rw-r--r--arch/powerpc/kernel/power5-pmu.c570
-rw-r--r--arch/powerpc/kernel/power6-pmu.c490
-rw-r--r--arch/powerpc/kernel/ppc970-pmu.c441
-rw-r--r--arch/powerpc/mm/fault.c10
-rw-r--r--arch/powerpc/platforms/Kconfig.cputype1
-rw-r--r--arch/x86/Kconfig1
-rw-r--r--arch/x86/ia32/ia32entry.S4
-rw-r--r--arch/x86/include/asm/atomic_32.h236
-rw-r--r--arch/x86/include/asm/entry_arch.h1
-rw-r--r--arch/x86/include/asm/hardirq.h2
-rw-r--r--arch/x86/include/asm/hw_irq.h3
-rw-r--r--arch/x86/include/asm/intel_arch_perfmon.h31
-rw-r--r--arch/x86/include/asm/irq_vectors.h5
-rw-r--r--arch/x86/include/asm/perf_counter.h100
-rw-r--r--arch/x86/include/asm/unistd_32.h2
-rw-r--r--arch/x86/include/asm/unistd_64.h5
-rw-r--r--arch/x86/kernel/apic/apic.c4
-rw-r--r--arch/x86/kernel/cpu/Makefile12
-rw-r--r--arch/x86/kernel/cpu/common.c2
-rw-r--r--arch/x86/kernel/cpu/perf_counter.c1242
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c4
-rw-r--r--arch/x86/kernel/entry_64.S7
-rw-r--r--arch/x86/kernel/irq.c10
-rw-r--r--arch/x86/kernel/irqinit_32.c60
-rw-r--r--arch/x86/kernel/irqinit_64.c13
-rw-r--r--arch/x86/kernel/signal.c1
-rw-r--r--arch/x86/kernel/syscall_table_32.S2
-rw-r--r--arch/x86/kernel/traps.c15
-rw-r--r--arch/x86/mm/fault.c12
-rw-r--r--arch/x86/oprofile/nmi_int.c7
-rw-r--r--arch/x86/oprofile/op_model_ppro.c10
-rw-r--r--drivers/char/sysrq.c2
-rw-r--r--fs/exec.c9
-rw-r--r--include/linux/compat.h2
-rw-r--r--include/linux/init_task.h13
-rw-r--r--include/linux/kernel_stat.h5
-rw-r--r--include/linux/mutex.h1
-rw-r--r--include/linux/perf_counter.h642
-rw-r--r--include/linux/prctl.h3
-rw-r--r--include/linux/sched.h17
-rw-r--r--include/linux/signal.h2
-rw-r--r--include/linux/syscalls.h5
-rw-r--r--init/Kconfig35
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/compat.c11
-rw-r--r--kernel/exit.c20
-rw-r--r--kernel/fork.c1
-rw-r--r--kernel/mutex.c27
-rw-r--r--kernel/perf_counter.c3526
-rw-r--r--kernel/rtmutex.c8
-rw-r--r--kernel/sched.c49
-rw-r--r--kernel/signal.c56
-rw-r--r--kernel/sys.c7
-rw-r--r--kernel/sys_ni.c3
-rw-r--r--kernel/sysctl.c19
-rw-r--r--kernel/timer.c3
-rw-r--r--mm/mmap.c10
112 files changed, 20892 insertions, 111 deletions
diff --git a/Documentation/perf_counter/.gitignore b/Documentation/perf_counter/.gitignore
new file mode 100644
index 000000000000..41c0b20a76ce
--- /dev/null
+++ b/Documentation/perf_counter/.gitignore
@@ -0,0 +1,179 @@
1GIT-BUILD-OPTIONS
2GIT-CFLAGS
3GIT-GUI-VARS
4GIT-VERSION-FILE
5git
6git-add
7git-add--interactive
8git-am
9git-annotate
10git-apply
11git-archimport
12git-archive
13git-bisect
14git-bisect--helper
15git-blame
16git-branch
17git-bundle
18git-cat-file
19git-check-attr
20git-check-ref-format
21git-checkout
22git-checkout-index
23git-cherry
24git-cherry-pick
25git-clean
26git-clone
27git-commit
28git-commit-tree
29git-config
30git-count-objects
31git-cvsexportcommit
32git-cvsimport
33git-cvsserver
34git-daemon
35git-diff
36git-diff-files
37git-diff-index
38git-diff-tree
39git-difftool
40git-difftool--helper
41git-describe
42git-fast-export
43git-fast-import
44git-fetch
45git-fetch--tool
46git-fetch-pack
47git-filter-branch
48git-fmt-merge-msg
49git-for-each-ref
50git-format-patch
51git-fsck
52git-fsck-objects
53git-gc
54git-get-tar-commit-id
55git-grep
56git-hash-object
57git-help
58git-http-fetch
59git-http-push
60git-imap-send
61git-index-pack
62git-init
63git-init-db
64git-instaweb
65git-log
66git-lost-found
67git-ls-files
68git-ls-remote
69git-ls-tree
70git-mailinfo
71git-mailsplit
72git-merge
73git-merge-base
74git-merge-index
75git-merge-file
76git-merge-tree
77git-merge-octopus
78git-merge-one-file
79git-merge-ours
80git-merge-recursive
81git-merge-resolve
82git-merge-subtree
83git-mergetool
84git-mergetool--lib
85git-mktag
86git-mktree
87git-name-rev
88git-mv
89git-pack-redundant
90git-pack-objects
91git-pack-refs
92git-parse-remote
93git-patch-id
94git-peek-remote
95git-prune
96git-prune-packed
97git-pull
98git-push
99git-quiltimport
100git-read-tree
101git-rebase
102git-rebase--interactive
103git-receive-pack
104git-reflog
105git-relink
106git-remote
107git-repack
108git-repo-config
109git-request-pull
110git-rerere
111git-reset
112git-rev-list
113git-rev-parse
114git-revert
115git-rm
116git-send-email
117git-send-pack
118git-sh-setup
119git-shell
120git-shortlog
121git-show
122git-show-branch
123git-show-index
124git-show-ref
125git-stage
126git-stash
127git-status
128git-stripspace
129git-submodule
130git-svn
131git-symbolic-ref
132git-tag
133git-tar-tree
134git-unpack-file
135git-unpack-objects
136git-update-index
137git-update-ref
138git-update-server-info
139git-upload-archive
140git-upload-pack
141git-var
142git-verify-pack
143git-verify-tag
144git-web--browse
145git-whatchanged
146git-write-tree
147git-core-*/?*
148gitk-wish
149gitweb/gitweb.cgi
150test-chmtime
151test-ctype
152test-date
153test-delta
154test-dump-cache-tree
155test-genrandom
156test-match-trees
157test-parse-options
158test-path-utils
159test-sha1
160test-sigchain
161common-cmds.h
162*.tar.gz
163*.dsc
164*.deb
165git.spec
166*.exe
167*.[aos]
168*.py[co]
169config.mak
170autom4te.cache
171config.cache
172config.log
173config.status
174config.mak.autogen
175config.mak.append
176configure
177tags
178TAGS
179cscope*
diff --git a/Documentation/perf_counter/Documentation/perf-help.txt b/Documentation/perf_counter/Documentation/perf-help.txt
new file mode 100644
index 000000000000..f85fed5a7edb
--- /dev/null
+++ b/Documentation/perf_counter/Documentation/perf-help.txt
@@ -0,0 +1,38 @@
1perf-help(1)
2============
3
4NAME
5----
6perf-help - display help information about perf
7
8SYNOPSIS
9--------
10'perf help' [-a|--all] [COMMAND]
11
12DESCRIPTION
13-----------
14
15With no options and no COMMAND given, the synopsis of the 'perf'
16command and a list of the most commonly used perf commands are printed
17on the standard output.
18
19If the option '--all' or '-a' is given, then all available commands are
20printed on the standard output.
21
22If a perf command is named, a manual page for that command is brought
23up. The 'man' program is used by default for this purpose, but this
24can be overridden by other options or configuration variables.
25
26Note that `perf --help ...` is identical to `perf help ...` because the
27former is internally converted into the latter.
28
29OPTIONS
30-------
31-a::
32--all::
33 Prints all the available commands on the standard output. This
34 option supersedes any other option.
35
36PERF
37----
38Part of the linkperf:perf[1] suite
diff --git a/Documentation/perf_counter/Documentation/perf-record.txt b/Documentation/perf_counter/Documentation/perf-record.txt
new file mode 100644
index 000000000000..d07700e35eb2
--- /dev/null
+++ b/Documentation/perf_counter/Documentation/perf-record.txt
@@ -0,0 +1,63 @@
1perf-record(1)
2==============
3
4NAME
5----
6perf-record - Run a command and record its profile into output.perf
7
8SYNOPSIS
9--------
10[verse]
11'perf record' [-e <EVENT> | --event=EVENT] [-l] [-a] <command>
12
13DESCRIPTION
14-----------
15This command runs a command and gathers a performance counter profile
16from it, into output.perf - without displaying anything.
17
18This file can then be inspected later on, using 'perf report'.
19
20
21OPTIONS
22-------
23<command>...::
24 Any command you can specify in a shell.
25
26-e::
27--event=::
28 0:0: cpu-cycles
29 0:0: cycles
30 0:1: instructions
31 0:2: cache-references
32 0:3: cache-misses
33 0:4: branch-instructions
34 0:4: branches
35 0:5: branch-misses
36 0:6: bus-cycles
37 1:0: cpu-clock
38 1:1: task-clock
39 1:2: page-faults
40 1:2: faults
41 1:5: minor-faults
42 1:6: major-faults
43 1:3: context-switches
44 1:3: cs
45 1:4: cpu-migrations
46 1:4: migrations
47 rNNN: raw PMU events (eventsel+umask)
48
49-a::
50 system-wide collection
51
52-l::
53 scale counter values
54
55Configuration
56-------------
57
58EXAMPLES
59--------
60
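A minimal session might look like this (the workload name is
illustrative):

$ perf record -e 0:0 -e 1:2 ./my-workload

This samples cpu-cycles (0:0) and page-faults (1:2) while
'./my-workload' runs and writes the profile to output.perf, which
'perf report' can then inspect.
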
61SEE ALSO
62--------
63linkperf:perf-stat[1]
diff --git a/Documentation/perf_counter/Documentation/perf-stat.txt b/Documentation/perf_counter/Documentation/perf-stat.txt
new file mode 100644
index 000000000000..7fcab271e570
--- /dev/null
+++ b/Documentation/perf_counter/Documentation/perf-stat.txt
@@ -0,0 +1,76 @@
1perf-stat(1)
2============
3
4NAME
5----
6perf-stat - Run a command and gather performance counter statistics
7
8SYNOPSIS
9--------
10[verse]
11'perf stat' [-e <EVENT> | --event=EVENT] [-l] [-a] <command>
12
13DESCRIPTION
14-----------
15This command runs a command and gathers performance counter statistics
16from it.
17
18
19OPTIONS
20-------
21<command>...::
22 Any command you can specify in a shell.
23
24-e::
25--event=::
26 0:0: cpu-cycles
27 0:0: cycles
28 0:1: instructions
29 0:2: cache-references
30 0:3: cache-misses
31 0:4: branch-instructions
32 0:4: branches
33 0:5: branch-misses
34 0:6: bus-cycles
35 1:0: cpu-clock
36 1:1: task-clock
37 1:2: page-faults
38 1:2: faults
39 1:5: minor-faults
40 1:6: major-faults
41 1:3: context-switches
42 1:3: cs
43 1:4: cpu-migrations
44 1:4: migrations
45 rNNN: raw PMU events (eventsel+umask)
46
47-a::
48 system-wide collection
49
50-l::
51 scale counter values
52
53Configuration
54-------------
55
56EXAMPLES
57--------
58
59$ perf stat sleep 1
60
61 Performance counter stats for 'sleep':
62
63 0.678356 task clock ticks (msecs)
64 7 context switches (events)
65 4 CPU migrations (events)
66 232 pagefaults (events)
67 1810403 CPU cycles (events)
68 946759 instructions (events)
69 18952 cache references (events)
70 4885 cache misses (events)
71
72 Wall-clock time elapsed: 1001.252894 msecs
73
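Individual counters can be selected with -e, using the type:id pairs
(or symbolic names) listed under OPTIONS; an illustrative invocation:

$ perf stat -e 0:1 -e 0:4 ls

counts instructions (0:1) and branch-instructions (0:4) while 'ls'
runs.
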
74SEE ALSO
75--------
76linkperf:perf-top[1]
diff --git a/Documentation/perf_counter/Documentation/perf-top.txt b/Documentation/perf_counter/Documentation/perf-top.txt
new file mode 100644
index 000000000000..057333b72534
--- /dev/null
+++ b/Documentation/perf_counter/Documentation/perf-top.txt
@@ -0,0 +1,61 @@
1perf-top(1)
2===========
3
4NAME
5----
6perf-top - Run a command and profile it
7
8SYNOPSIS
9--------
10[verse]
11'perf top' [-e <EVENT> | --event=EVENT] [-l] [-a] <command>
12
13DESCRIPTION
14-----------
15This command runs a command and gathers a performance counter profile
16from it.
17
18
19OPTIONS
20-------
21<command>...::
22 Any command you can specify in a shell.
23
24-e::
25--event=::
26 0:0: cpu-cycles
27 0:0: cycles
28 0:1: instructions
29 0:2: cache-references
30 0:3: cache-misses
31 0:4: branch-instructions
32 0:4: branches
33 0:5: branch-misses
34 0:6: bus-cycles
35 1:0: cpu-clock
36 1:1: task-clock
37 1:2: page-faults
38 1:2: faults
39 1:5: minor-faults
40 1:6: major-faults
41 1:3: context-switches
42 1:3: cs
43 1:4: cpu-migrations
44 1:4: migrations
45 rNNN: raw PMU events (eventsel+umask)
46
47-a::
48 system-wide collection
49
50-l::
51 scale counter values
52
53Configuration
54-------------
55
56EXAMPLES
57--------
58
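An illustrative invocation (the workload name is hypothetical):

$ perf top -e 0:0 ./my-workload

This shows a live cpu-cycles profile while './my-workload' runs; other
events can be chosen from the list under OPTIONS.
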
59SEE ALSO
60--------
61linkperf:perf-stat[1]
diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
new file mode 100644
index 000000000000..481e4c26cd45
--- /dev/null
+++ b/Documentation/perf_counter/Makefile
@@ -0,0 +1,849 @@
1# The default target of this Makefile is...
2all::
3
4# Define V=1 to have a more verbose compile.
5#
6# Define SNPRINTF_RETURNS_BOGUS if you are on a system where snprintf()
7# or vsnprintf() return -1 instead of the number of characters that would
8# have been written to the final string if enough space had been available.
9#
10# Define FREAD_READS_DIRECTORIES if you are on a system which succeeds
11# when attempting to read from an fopen'ed directory.
12#
13# Define NO_OPENSSL environment variable if you do not have OpenSSL.
14# This also implies MOZILLA_SHA1.
15#
16# Define CURLDIR=/foo/bar if your curl header and library files are in
17# /foo/bar/include and /foo/bar/lib directories.
18#
19# Define EXPATDIR=/foo/bar if your expat header and library files are in
20# /foo/bar/include and /foo/bar/lib directories.
21#
22# Define NO_D_INO_IN_DIRENT if you don't have d_ino in your struct dirent.
23#
24# Define NO_D_TYPE_IN_DIRENT if your platform defines DT_UNKNOWN but lacks
25# d_type in struct dirent (latest Cygwin -- will be fixed soonish).
26#
27# Define NO_C99_FORMAT if your formatted IO functions (printf/scanf et al.)
28# do not support the 'size specifiers' introduced by C99, namely ll, hh,
29# j, z, t. (representing long long int, char, intmax_t, size_t, ptrdiff_t).
30# Some C compilers supported these specifiers prior to C99 as an extension.
31#
32# Define NO_STRCASESTR if you don't have strcasestr.
33#
34# Define NO_MEMMEM if you don't have memmem.
35#
36# Define NO_STRTOUMAX if you don't have strtoumax in the C library.
37# If your compiler also does not support long long or does not have
38# strtoull, define NO_STRTOULL.
39#
40# Define NO_SETENV if you don't have setenv in the C library.
41#
42# Define NO_UNSETENV if you don't have unsetenv in the C library.
43#
44# Define NO_MKDTEMP if you don't have mkdtemp in the C library.
45#
46# Define NO_SYS_SELECT_H if you don't have sys/select.h.
47#
48# Define NO_SYMLINK_HEAD if you never want .perf/HEAD to be a symbolic link.
49# Enable it on Windows. By default, symrefs are still used.
50#
51# Define NO_SVN_TESTS if you want to skip time-consuming SVN interoperability
52# tests. These tests take up a significant amount of the total test time
53# but are not needed unless you plan to talk to SVN repos.
54#
55# Define NO_FINK if you are building on Darwin/Mac OS X, have Fink
56# installed in /sw, but don't want PERF to link against any libraries
57# installed there. If defined you may specify your own (or Fink's)
58# include directories and library directories by defining CFLAGS
59# and LDFLAGS appropriately.
60#
61# Define NO_DARWIN_PORTS if you are building on Darwin/Mac OS X,
62# have DarwinPorts installed in /opt/local, but don't want PERF to
63# link against any libraries installed there. If defined you may
64# specify your own (or DarwinPort's) include directories and
65# library directories by defining CFLAGS and LDFLAGS appropriately.
66#
67# Define PPC_SHA1 environment variable when running make to make use of
68# a bundled SHA1 routine optimized for PowerPC.
69#
70# Define ARM_SHA1 environment variable when running make to make use of
71# a bundled SHA1 routine optimized for ARM.
72#
73# Define MOZILLA_SHA1 environment variable when running make to make use of
74# a bundled SHA1 routine coming from Mozilla. It is GPL'd and should be fast
75# on non-x86 architectures (e.g. PowerPC), while the OpenSSL version (default
76# choice) has a very fast version optimized for i586.
77#
78# Define NEEDS_SSL_WITH_CRYPTO if you need -lcrypto with -lssl (Darwin).
79#
80# Define NEEDS_LIBICONV if linking with libc is not enough (Darwin).
81#
82# Define NEEDS_SOCKET if linking with libc is not enough (SunOS,
83# Patrick Mauritz).
84#
85# Define NO_MMAP if you want to avoid mmap.
86#
87# Define NO_PTHREADS if you do not have or do not want to use Pthreads.
88#
89# Define NO_PREAD if you have a problem with pread() system call (e.g.
90# cygwin.dll before v1.5.22).
91#
92# Define NO_FAST_WORKING_DIRECTORY if accessing objects in pack files is
93# generally faster on your platform than accessing the working directory.
94#
95# Define NO_TRUSTABLE_FILEMODE if your filesystem may claim to support
96# the executable mode bit, but doesn't really do so.
97#
98# Define NO_IPV6 if you lack IPv6 support and getaddrinfo().
99#
100# Define NO_SOCKADDR_STORAGE if your platform does not have struct
101# sockaddr_storage.
102#
103# Define NO_ICONV if your libc does not properly support iconv.
104#
105# Define OLD_ICONV if your library has an old iconv(), where the second
106# (input buffer pointer) parameter is declared with type (const char **).
107#
108# Define NO_DEFLATE_BOUND if your zlib does not have deflateBound.
109#
110# Define NO_R_TO_GCC_LINKER if your gcc does not like "-R/path/lib"
111# that tells runtime paths to dynamic libraries;
112# "-Wl,-rpath=/path/lib" is used instead.
113#
114# Define USE_NSEC below if you want perf to care about sub-second file mtimes
115# and ctimes. Note that you need recent glibc (at least 2.2.4) for this, and
116# it will BREAK YOUR LOCAL DIFFS! show-diff and anything using it will likely
117# randomly break unless your underlying filesystem supports those sub-second
118# times (my ext3 doesn't).
119#
120# Define USE_ST_TIMESPEC if your "struct stat" uses "st_ctimespec" instead of
121# "st_ctim"
122#
123# Define NO_NSEC if your "struct stat" does not have "st_ctim.tv_nsec"
124# available. This automatically turns USE_NSEC off.
125#
126# Define USE_STDEV below if you want perf to care about the underlying device
127# change being considered an inode change from the update-index perspective.
128#
129# Define NO_ST_BLOCKS_IN_STRUCT_STAT if your platform does not have st_blocks
130# field that counts the on-disk footprint in 512-byte blocks.
131#
132# Define ASCIIDOC8 if you want to format documentation with AsciiDoc 8
133#
134# Define DOCBOOK_XSL_172 if you want to format man pages with DocBook XSL v1.72.
135#
136# Define NO_PERL_MAKEMAKER if you cannot use Makefiles generated by perl's
137# MakeMaker (e.g. using ActiveState under Cygwin).
138#
139# Define NO_PERL if you do not want Perl scripts or libraries at all.
140#
141# Define INTERNAL_QSORT to use Git's implementation of qsort(), which
142# is a simplified version of the merge sort used in glibc. This is
143# recommended if Git triggers O(n^2) behavior in your platform's qsort().
144#
145# Define NO_EXTERNAL_GREP if you don't want "perf grep" to ever call
146# your external grep (e.g., if your system lacks grep, if its grep is
147# broken, or spawning external process is slower than built-in grep perf has).
148
149PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE
150 @$(SHELL_PATH) util/PERF-VERSION-GEN
151-include PERF-VERSION-FILE
152
153uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not')
154uname_M := $(shell sh -c 'uname -m 2>/dev/null || echo not')
155uname_O := $(shell sh -c 'uname -o 2>/dev/null || echo not')
156uname_R := $(shell sh -c 'uname -r 2>/dev/null || echo not')
157uname_P := $(shell sh -c 'uname -p 2>/dev/null || echo not')
158uname_V := $(shell sh -c 'uname -v 2>/dev/null || echo not')
159
160# CFLAGS and LDFLAGS are for the users to override from the command line.
161
162CFLAGS = -g -O2 -Wall
163LDFLAGS = -lpthread -lrt
164ALL_CFLAGS = $(CFLAGS)
165ALL_LDFLAGS = $(LDFLAGS)
166STRIP ?= strip
167
168# Among the variables below, these:
169# perfexecdir
170# template_dir
171# mandir
172# infodir
173# htmldir
174# ETC_PERFCONFIG (but not sysconfdir)
175# can be specified as a relative path some/where/else;
176# this is interpreted as relative to $(prefix) and "perf" at
177# runtime figures out where they are based on the path to the executable.
178# This can help installing the suite in a relocatable way.
179
180prefix = $(HOME)
181bindir_relative = bin
182bindir = $(prefix)/$(bindir_relative)
183mandir = share/man
184infodir = share/info
185perfexecdir = libexec/perf-core
186sharedir = $(prefix)/share
187template_dir = share/perf-core/templates
188htmldir = share/doc/perf-doc
189ifeq ($(prefix),/usr)
190sysconfdir = /etc
191ETC_PERFCONFIG = $(sysconfdir)/perfconfig
192else
193sysconfdir = $(prefix)/etc
194ETC_PERFCONFIG = etc/perfconfig
195endif
196lib = lib
197# DESTDIR=
198
199export prefix bindir sharedir sysconfdir
200
201CC = gcc
202AR = ar
203RM = rm -f
204TAR = tar
205FIND = find
206INSTALL = install
207RPMBUILD = rpmbuild
208PTHREAD_LIBS = -lpthread
209
210# sparse is architecture-neutral, which means that we need to tell it
211# explicitly what architecture to check for. Fix this up for yours..
212SPARSE_FLAGS = -D__BIG_ENDIAN__ -D__powerpc__
213
214
215
216### --- END CONFIGURATION SECTION ---
217
218# Those must not be GNU-specific; they are shared with perl/ which may
219# be built by a different compiler. (Note that this is an artifact now
220# but it still might be nice to keep that distinction.)
221BASIC_CFLAGS =
222BASIC_LDFLAGS =
223
224# Guard against environment variables
225BUILTIN_OBJS =
226BUILT_INS =
227COMPAT_CFLAGS =
228COMPAT_OBJS =
229LIB_H =
230LIB_OBJS =
231PROGRAMS = perf-report
232SCRIPT_PERL =
233SCRIPT_SH =
234TEST_PROGRAMS =
235
236#
237# No scripts right now:
238#
239
240# SCRIPT_SH += perf-am.sh
241
242#
243# No Perl scripts right now:
244#
245
246# SCRIPT_PERL += perf-add--interactive.perl
247
248SCRIPTS = $(patsubst %.sh,%,$(SCRIPT_SH)) \
249 $(patsubst %.perl,%,$(SCRIPT_PERL))
250
251# Empty...
252EXTRA_PROGRAMS =
253
254# ... and all the rest that could be moved out of bindir to perfexecdir
255PROGRAMS += $(EXTRA_PROGRAMS)
256
257#
258# Single 'perf' binary right now:
259#
260PROGRAMS += perf
261
262# List built-in command $C whose implementation cmd_$C() is not in
263# builtin-$C.o but is linked in as part of some other command.
264BUILT_INS += $(patsubst builtin-%.o,perf-%$X,$(BUILTIN_OBJS))
265
266#
267# None right now:
268#
269# BUILT_INS += perf-init $X
270
271# what 'all' will build and 'install' will install, in perfexecdir
272ALL_PROGRAMS = $(PROGRAMS) $(SCRIPTS)
273
274# what 'all' will build but not install in perfexecdir
275OTHER_PROGRAMS = perf$X
276
277# Set paths to tools early so that they can be used for version tests.
278ifndef SHELL_PATH
279 SHELL_PATH = /bin/sh
280endif
281ifndef PERL_PATH
282 PERL_PATH = /usr/bin/perl
283endif
284
285export PERL_PATH
286
287LIB_FILE=libperf.a
288
289LIB_H += ../../include/linux/perf_counter.h
290LIB_H += perf.h
291LIB_H += util/levenshtein.h
292LIB_H += util/parse-options.h
293LIB_H += util/quote.h
294LIB_H += util/util.h
295LIB_H += util/help.h
296LIB_H += util/strbuf.h
297LIB_H += util/run-command.h
298
299LIB_OBJS += util/abspath.o
300LIB_OBJS += util/alias.o
301LIB_OBJS += util/config.o
302LIB_OBJS += util/ctype.o
303LIB_OBJS += util/exec_cmd.o
304LIB_OBJS += util/help.o
305LIB_OBJS += util/levenshtein.o
306LIB_OBJS += util/parse-options.o
307LIB_OBJS += util/path.o
308LIB_OBJS += util/run-command.o
309LIB_OBJS += util/quote.o
310LIB_OBJS += util/strbuf.o
311LIB_OBJS += util/usage.o
312LIB_OBJS += util/wrapper.o
313
314BUILTIN_OBJS += builtin-help.o
315BUILTIN_OBJS += builtin-record.o
316BUILTIN_OBJS += builtin-stat.o
317BUILTIN_OBJS += builtin-top.o
318
319PERFLIBS = $(LIB_FILE)
320EXTLIBS =
321
322#
323# Platform specific tweaks
324#
325
326# We choose to avoid "if .. else if .. else .. endif endif"
327# because maintaining the nesting to match is a pain. If
328# we had "elif" things would have been much nicer...
329
330-include config.mak.autogen
331-include config.mak
332
333ifeq ($(uname_S),Darwin)
334 ifndef NO_FINK
335 ifeq ($(shell test -d /sw/lib && echo y),y)
336 BASIC_CFLAGS += -I/sw/include
337 BASIC_LDFLAGS += -L/sw/lib
338 endif
339 endif
340 ifndef NO_DARWIN_PORTS
341 ifeq ($(shell test -d /opt/local/lib && echo y),y)
342 BASIC_CFLAGS += -I/opt/local/include
343 BASIC_LDFLAGS += -L/opt/local/lib
344 endif
345 endif
346 PTHREAD_LIBS =
347endif
348
349ifndef CC_LD_DYNPATH
350 ifdef NO_R_TO_GCC_LINKER
351	# Some versions of gcc do not accept -R and pass it on to the linker
352	# to specify the runtime dynamic library path.
353 CC_LD_DYNPATH = -Wl,-rpath,
354 else
355 CC_LD_DYNPATH = -R
356 endif
357endif
358
359ifdef ZLIB_PATH
360 BASIC_CFLAGS += -I$(ZLIB_PATH)/include
361 EXTLIBS += -L$(ZLIB_PATH)/$(lib) $(CC_LD_DYNPATH)$(ZLIB_PATH)/$(lib)
362endif
363EXTLIBS += -lz
364
365ifdef NEEDS_SOCKET
366 EXTLIBS += -lsocket
367endif
368ifdef NEEDS_NSL
369 EXTLIBS += -lnsl
370endif
371ifdef NO_D_TYPE_IN_DIRENT
372 BASIC_CFLAGS += -DNO_D_TYPE_IN_DIRENT
373endif
374ifdef NO_D_INO_IN_DIRENT
375 BASIC_CFLAGS += -DNO_D_INO_IN_DIRENT
376endif
377ifdef NO_ST_BLOCKS_IN_STRUCT_STAT
378 BASIC_CFLAGS += -DNO_ST_BLOCKS_IN_STRUCT_STAT
379endif
380ifdef USE_NSEC
381 BASIC_CFLAGS += -DUSE_NSEC
382endif
383ifdef USE_ST_TIMESPEC
384 BASIC_CFLAGS += -DUSE_ST_TIMESPEC
385endif
386ifdef NO_NSEC
387 BASIC_CFLAGS += -DNO_NSEC
388endif
389ifdef NO_C99_FORMAT
390 BASIC_CFLAGS += -DNO_C99_FORMAT
391endif
392ifdef SNPRINTF_RETURNS_BOGUS
393 COMPAT_CFLAGS += -DSNPRINTF_RETURNS_BOGUS
394 COMPAT_OBJS += compat/snprintf.o
395endif
396ifdef FREAD_READS_DIRECTORIES
397 COMPAT_CFLAGS += -DFREAD_READS_DIRECTORIES
398 COMPAT_OBJS += compat/fopen.o
399endif
400ifdef NO_SYMLINK_HEAD
401 BASIC_CFLAGS += -DNO_SYMLINK_HEAD
402endif
403ifdef NO_STRCASESTR
404 COMPAT_CFLAGS += -DNO_STRCASESTR
405 COMPAT_OBJS += compat/strcasestr.o
406endif
407ifdef NO_STRTOUMAX
408 COMPAT_CFLAGS += -DNO_STRTOUMAX
409 COMPAT_OBJS += compat/strtoumax.o
410endif
411ifdef NO_STRTOULL
412 COMPAT_CFLAGS += -DNO_STRTOULL
413endif
414ifdef NO_SETENV
415 COMPAT_CFLAGS += -DNO_SETENV
416 COMPAT_OBJS += compat/setenv.o
417endif
418ifdef NO_MKDTEMP
419 COMPAT_CFLAGS += -DNO_MKDTEMP
420 COMPAT_OBJS += compat/mkdtemp.o
421endif
422ifdef NO_UNSETENV
423 COMPAT_CFLAGS += -DNO_UNSETENV
424 COMPAT_OBJS += compat/unsetenv.o
425endif
426ifdef NO_SYS_SELECT_H
427 BASIC_CFLAGS += -DNO_SYS_SELECT_H
428endif
429ifdef NO_MMAP
430 COMPAT_CFLAGS += -DNO_MMAP
431 COMPAT_OBJS += compat/mmap.o
432else
433 ifdef USE_WIN32_MMAP
434 COMPAT_CFLAGS += -DUSE_WIN32_MMAP
435 COMPAT_OBJS += compat/win32mmap.o
436 endif
437endif
438ifdef NO_PREAD
439 COMPAT_CFLAGS += -DNO_PREAD
440 COMPAT_OBJS += compat/pread.o
441endif
442ifdef NO_FAST_WORKING_DIRECTORY
443 BASIC_CFLAGS += -DNO_FAST_WORKING_DIRECTORY
444endif
445ifdef NO_TRUSTABLE_FILEMODE
446 BASIC_CFLAGS += -DNO_TRUSTABLE_FILEMODE
447endif
448ifdef NO_IPV6
449 BASIC_CFLAGS += -DNO_IPV6
450endif
451ifdef NO_UINTMAX_T
452 BASIC_CFLAGS += -Duintmax_t=uint32_t
453endif
454ifdef NO_SOCKADDR_STORAGE
455ifdef NO_IPV6
456 BASIC_CFLAGS += -Dsockaddr_storage=sockaddr_in
457else
458 BASIC_CFLAGS += -Dsockaddr_storage=sockaddr_in6
459endif
460endif
461ifdef NO_INET_NTOP
462 LIB_OBJS += compat/inet_ntop.o
463endif
464ifdef NO_INET_PTON
465 LIB_OBJS += compat/inet_pton.o
466endif
467
468ifdef NO_ICONV
469 BASIC_CFLAGS += -DNO_ICONV
470endif
471
472ifdef OLD_ICONV
473 BASIC_CFLAGS += -DOLD_ICONV
474endif
475
476ifdef NO_DEFLATE_BOUND
477 BASIC_CFLAGS += -DNO_DEFLATE_BOUND
478endif
479
480ifdef PPC_SHA1
481 SHA1_HEADER = "ppc/sha1.h"
482 LIB_OBJS += ppc/sha1.o ppc/sha1ppc.o
483else
484ifdef ARM_SHA1
485 SHA1_HEADER = "arm/sha1.h"
486 LIB_OBJS += arm/sha1.o arm/sha1_arm.o
487else
488ifdef MOZILLA_SHA1
489 SHA1_HEADER = "mozilla-sha1/sha1.h"
490 LIB_OBJS += mozilla-sha1/sha1.o
491else
492 SHA1_HEADER = <openssl/sha.h>
493 EXTLIBS += $(LIB_4_CRYPTO)
494endif
495endif
496endif
497ifdef NO_PERL_MAKEMAKER
498 export NO_PERL_MAKEMAKER
499endif
500ifdef NO_HSTRERROR
501 COMPAT_CFLAGS += -DNO_HSTRERROR
502 COMPAT_OBJS += compat/hstrerror.o
503endif
504ifdef NO_MEMMEM
505 COMPAT_CFLAGS += -DNO_MEMMEM
506 COMPAT_OBJS += compat/memmem.o
507endif
508ifdef INTERNAL_QSORT
509 COMPAT_CFLAGS += -DINTERNAL_QSORT
510 COMPAT_OBJS += compat/qsort.o
511endif
512ifdef RUNTIME_PREFIX
513 COMPAT_CFLAGS += -DRUNTIME_PREFIX
514endif
515
516ifdef DIR_HAS_BSD_GROUP_SEMANTICS
517 COMPAT_CFLAGS += -DDIR_HAS_BSD_GROUP_SEMANTICS
518endif
519ifdef NO_EXTERNAL_GREP
520 BASIC_CFLAGS += -DNO_EXTERNAL_GREP
521endif
522
523ifeq ($(PERL_PATH),)
524NO_PERL=NoThanks
525endif
526
527QUIET_SUBDIR0 = +$(MAKE) -C # space to separate -C and subdir
528QUIET_SUBDIR1 =
529
530ifneq ($(findstring $(MAKEFLAGS),w),w)
531PRINT_DIR = --no-print-directory
532else # "make -w"
533NO_SUBDIR = :
534endif
535
536ifneq ($(findstring $(MAKEFLAGS),s),s)
537ifndef V
538 QUIET_CC = @echo ' ' CC $@;
539 QUIET_AR = @echo ' ' AR $@;
540 QUIET_LINK = @echo ' ' LINK $@;
541 QUIET_BUILT_IN = @echo ' ' BUILTIN $@;
542 QUIET_GEN = @echo ' ' GEN $@;
543 QUIET_SUBDIR0 = +@subdir=
544 QUIET_SUBDIR1 = ;$(NO_SUBDIR) echo ' ' SUBDIR $$subdir; \
545 $(MAKE) $(PRINT_DIR) -C $$subdir
546 export V
547 export QUIET_GEN
548 export QUIET_BUILT_IN
549endif
550endif
551
552ifdef ASCIIDOC8
553 export ASCIIDOC8
554endif
555
556# Shell quote (do not use $(call) to accommodate ancient setups);
557
558SHA1_HEADER_SQ = $(subst ','\'',$(SHA1_HEADER))
559ETC_PERFCONFIG_SQ = $(subst ','\'',$(ETC_PERFCONFIG))
560
561DESTDIR_SQ = $(subst ','\'',$(DESTDIR))
562bindir_SQ = $(subst ','\'',$(bindir))
563bindir_relative_SQ = $(subst ','\'',$(bindir_relative))
564mandir_SQ = $(subst ','\'',$(mandir))
565infodir_SQ = $(subst ','\'',$(infodir))
566perfexecdir_SQ = $(subst ','\'',$(perfexecdir))
567template_dir_SQ = $(subst ','\'',$(template_dir))
568htmldir_SQ = $(subst ','\'',$(htmldir))
569prefix_SQ = $(subst ','\'',$(prefix))
570
571SHELL_PATH_SQ = $(subst ','\'',$(SHELL_PATH))
572PERL_PATH_SQ = $(subst ','\'',$(PERL_PATH))
573
574LIBS = $(PERFLIBS) $(EXTLIBS)
575
576BASIC_CFLAGS += -DSHA1_HEADER='$(SHA1_HEADER_SQ)' \
577 $(COMPAT_CFLAGS)
578LIB_OBJS += $(COMPAT_OBJS)
579
580ALL_CFLAGS += $(BASIC_CFLAGS)
581ALL_LDFLAGS += $(BASIC_LDFLAGS)
582
583export TAR INSTALL DESTDIR SHELL_PATH
584
585
586### Build rules
587
588SHELL = $(SHELL_PATH)
589
590all:: shell_compatibility_test $(ALL_PROGRAMS) $(BUILT_INS) $(OTHER_PROGRAMS) PERF-BUILD-OPTIONS
591ifneq (,$X)
592 $(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) perf$X)), test '$p' -ef '$p$X' || $(RM) '$p';)
593endif
594
595all::
596
597please_set_SHELL_PATH_to_a_more_modern_shell:
598 @$$(:)
599
600shell_compatibility_test: please_set_SHELL_PATH_to_a_more_modern_shell
601
602strip: $(PROGRAMS) perf$X
603 $(STRIP) $(STRIP_OPTS) $(PROGRAMS) perf$X
604
605perf.o: perf.c common-cmds.h PERF-CFLAGS
606 $(QUIET_CC)$(CC) -DPERF_VERSION='"$(PERF_VERSION)"' \
607 '-DPERF_HTML_PATH="$(htmldir_SQ)"' \
608 $(ALL_CFLAGS) -c $(filter %.c,$^)
609
610perf$X: perf.o $(BUILTIN_OBJS) $(PERFLIBS)
611 $(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ perf.o \
612 $(BUILTIN_OBJS) $(ALL_LDFLAGS) $(LIBS)
613
614builtin-help.o: builtin-help.c common-cmds.h PERF-CFLAGS
615 $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) \
616 '-DPERF_HTML_PATH="$(htmldir_SQ)"' \
617 '-DPERF_MAN_PATH="$(mandir_SQ)"' \
618 '-DPERF_INFO_PATH="$(infodir_SQ)"' $<
619
620$(BUILT_INS): perf$X
621 $(QUIET_BUILT_IN)$(RM) $@ && \
622 ln perf$X $@ 2>/dev/null || \
623 ln -s perf$X $@ 2>/dev/null || \
624 cp perf$X $@
625
626common-cmds.h: util/generate-cmdlist.sh command-list.txt
627
628common-cmds.h: $(wildcard Documentation/perf-*.txt)
629 $(QUIET_GEN)util/generate-cmdlist.sh > $@+ && mv $@+ $@
630
631$(patsubst %.sh,%,$(SCRIPT_SH)) : % : %.sh
632 $(QUIET_GEN)$(RM) $@ $@+ && \
633 sed -e '1s|#!.*/sh|#!$(SHELL_PATH_SQ)|' \
634 -e 's|@SHELL_PATH@|$(SHELL_PATH_SQ)|' \
635 -e 's|@@PERL@@|$(PERL_PATH_SQ)|g' \
636 -e 's/@@PERF_VERSION@@/$(PERF_VERSION)/g' \
637 -e 's/@@NO_CURL@@/$(NO_CURL)/g' \
638 $@.sh >$@+ && \
639 chmod +x $@+ && \
640 mv $@+ $@
641
642configure: configure.ac
643 $(QUIET_GEN)$(RM) $@ $<+ && \
644 sed -e 's/@@PERF_VERSION@@/$(PERF_VERSION)/g' \
645 $< > $<+ && \
646 autoconf -o $@ $<+ && \
647 $(RM) $<+
648
649# These can record PERF_VERSION
650perf.o perf.spec \
651 $(patsubst %.sh,%,$(SCRIPT_SH)) \
652 $(patsubst %.perl,%,$(SCRIPT_PERL)) \
653 : PERF-VERSION-FILE
654
655%.o: %.c PERF-CFLAGS
656 $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) $<
657%.s: %.c PERF-CFLAGS
658 $(QUIET_CC)$(CC) -S $(ALL_CFLAGS) $<
659%.o: %.S
660 $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) $<
661
662util/exec_cmd.o: util/exec_cmd.c PERF-CFLAGS
663 $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) \
664 '-DPERF_EXEC_PATH="$(perfexecdir_SQ)"' \
665 '-DBINDIR="$(bindir_relative_SQ)"' \
666 '-DPREFIX="$(prefix_SQ)"' \
667 $<
668
669builtin-init-db.o: builtin-init-db.c PERF-CFLAGS
670 $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DDEFAULT_PERF_TEMPLATE_DIR='"$(template_dir_SQ)"' $<
671
672util/config.o: util/config.c PERF-CFLAGS
673 $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
674
675perf-%$X: %.o $(PERFLIBS)
676 $(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) $(LIBS)
677
678$(LIB_OBJS) $(BUILTIN_OBJS): $(LIB_H)
679$(patsubst perf-%$X,%.o,$(PROGRAMS)): $(LIB_H) $(wildcard */*.h)
680builtin-revert.o wt-status.o: wt-status.h
681
682$(LIB_FILE): $(LIB_OBJS)
683 $(QUIET_AR)$(RM) $@ && $(AR) rcs $@ $(LIB_OBJS)
684
685TAGS:
686 $(RM) TAGS
687 $(FIND) . -name '*.[hcS]' -print | xargs etags -a
688
689tags:
690 $(RM) tags
691 $(FIND) . -name '*.[hcS]' -print | xargs ctags -a
692
693cscope:
694 $(RM) cscope*
695 $(FIND) . -name '*.[hcS]' -print | xargs cscope -b
696
697### Detect prefix changes
698TRACK_CFLAGS = $(subst ','\'',$(ALL_CFLAGS)):\
699 $(bindir_SQ):$(perfexecdir_SQ):$(template_dir_SQ):$(prefix_SQ)
700
701PERF-CFLAGS: .FORCE-PERF-CFLAGS
702 @FLAGS='$(TRACK_CFLAGS)'; \
703 if test x"$$FLAGS" != x"`cat PERF-CFLAGS 2>/dev/null`" ; then \
704 echo 1>&2 " * new build flags or prefix"; \
705 echo "$$FLAGS" >PERF-CFLAGS; \
706 fi
707
708# We need to apply sq twice, once to protect from the shell
709# that runs PERF-BUILD-OPTIONS, and then again to protect it
710# and the first level quoting from the shell that runs "echo".
711PERF-BUILD-OPTIONS: .FORCE-PERF-BUILD-OPTIONS
712 @echo SHELL_PATH=\''$(subst ','\'',$(SHELL_PATH_SQ))'\' >$@
713 @echo TAR=\''$(subst ','\'',$(subst ','\'',$(TAR)))'\' >>$@
714 @echo NO_CURL=\''$(subst ','\'',$(subst ','\'',$(NO_CURL)))'\' >>$@
715 @echo NO_PERL=\''$(subst ','\'',$(subst ','\'',$(NO_PERL)))'\' >>$@
716
717### Testing rules
718
719#
720# None right now:
721#
722# TEST_PROGRAMS += test-something$X
723
724all:: $(TEST_PROGRAMS)
725
726# GNU make supports exporting all variables by "export" without parameters.
727# However, the environment gets quite big, and some programs have problems
728# with that.
729
730export NO_SVN_TESTS
731
732check: common-cmds.h
733 if sparse; \
734 then \
735 for i in *.c */*.c; \
736 do \
737 sparse $(ALL_CFLAGS) $(SPARSE_FLAGS) $$i || exit; \
738 done; \
739 else \
740 echo 2>&1 "Did you mean 'make test'?"; \
741 exit 1; \
742 fi
743
744remove-dashes:
745 ./fixup-builtins $(BUILT_INS) $(PROGRAMS) $(SCRIPTS)
746
747### Installation rules
748
749ifneq ($(filter /%,$(firstword $(template_dir))),)
750template_instdir = $(template_dir)
751else
752template_instdir = $(prefix)/$(template_dir)
753endif
754export template_instdir
755
756ifneq ($(filter /%,$(firstword $(perfexecdir))),)
757perfexec_instdir = $(perfexecdir)
758else
759perfexec_instdir = $(prefix)/$(perfexecdir)
760endif
761perfexec_instdir_SQ = $(subst ','\'',$(perfexec_instdir))
762export perfexec_instdir
763
764install: all
765 $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(bindir_SQ)'
766 $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
767 $(INSTALL) $(ALL_PROGRAMS) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
768ifneq (,$X)
769 $(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) perf$X)), $(RM) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/$p';)
770endif
771
772### Maintainer's dist rules
773
774perf.spec: perf.spec.in
775 sed -e 's/@@VERSION@@/$(PERF_VERSION)/g' < $< > $@+
776 mv $@+ $@
777
778PERF_TARNAME=perf-$(PERF_VERSION)
779dist: perf.spec perf-archive$(X) configure
780 ./perf-archive --format=tar \
781 --prefix=$(PERF_TARNAME)/ HEAD^{tree} > $(PERF_TARNAME).tar
782 @mkdir -p $(PERF_TARNAME)
783 @cp perf.spec configure $(PERF_TARNAME)
784 @echo $(PERF_VERSION) > $(PERF_TARNAME)/version
785 $(TAR) rf $(PERF_TARNAME).tar \
786 $(PERF_TARNAME)/perf.spec \
787 $(PERF_TARNAME)/configure \
788 $(PERF_TARNAME)/version
789 @$(RM) -r $(PERF_TARNAME)
790 gzip -f -9 $(PERF_TARNAME).tar
791
792rpm: dist
793 $(RPMBUILD) -ta $(PERF_TARNAME).tar.gz
794
795### Cleaning rules
796
797distclean: clean
798 $(RM) configure
799
800clean:
801 $(RM) *.o */*.o $(LIB_FILE)
802 $(RM) $(ALL_PROGRAMS) $(BUILT_INS) perf$X
803 $(RM) $(TEST_PROGRAMS)
804 $(RM) *.spec *.pyc *.pyo */*.pyc */*.pyo common-cmds.h TAGS tags cscope*
805 $(RM) -r autom4te.cache
806 $(RM) config.log config.mak.autogen config.mak.append config.status config.cache
807 $(RM) -r $(PERF_TARNAME) .doc-tmp-dir
808 $(RM) $(PERF_TARNAME).tar.gz perf-core_$(PERF_VERSION)-*.tar.gz
809 $(RM) $(htmldocs).tar.gz $(manpages).tar.gz
810 $(RM) PERF-VERSION-FILE PERF-CFLAGS PERF-BUILD-OPTIONS
811
812# temporary hack:
813perf-report: perf-report.cc ../../include/linux/perf_counter.h Makefile
814 g++ -g -O2 -Wall -lrt -o $@ $<
815
816.PHONY: all install clean strip
817.PHONY: shell_compatibility_test please_set_SHELL_PATH_to_a_more_modern_shell
818.PHONY: .FORCE-PERF-VERSION-FILE TAGS tags cscope .FORCE-PERF-CFLAGS
819.PHONY: .FORCE-PERF-BUILD-OPTIONS
820
821### Make sure built-ins do not have dups and are listed in perf.c
822#
823check-builtins::
824 ./check-builtins.sh
825
826### Test suite coverage testing
827#
828.PHONY: coverage coverage-clean coverage-build coverage-report
829
830coverage:
831 $(MAKE) coverage-build
832 $(MAKE) coverage-report
833
834coverage-clean:
835 rm -f *.gcda *.gcno
836
837COVERAGE_CFLAGS = $(CFLAGS) -O0 -ftest-coverage -fprofile-arcs
838COVERAGE_LDFLAGS = $(CFLAGS) -O0 -lgcov
839
840coverage-build: coverage-clean
841 $(MAKE) CFLAGS="$(COVERAGE_CFLAGS)" LDFLAGS="$(COVERAGE_LDFLAGS)" all
842 $(MAKE) CFLAGS="$(COVERAGE_CFLAGS)" LDFLAGS="$(COVERAGE_LDFLAGS)" \
843 -j1 test
844
845coverage-report:
846 gcov -b *.c */*.c
847 grep '^function.*called 0 ' *.c.gcov */*.c.gcov \
848 | sed -e 's/\([^:]*\)\.gcov: *function \([^ ]*\) called.*/\1: \2/' \
849 | tee coverage-untested-functions
diff --git a/Documentation/perf_counter/builtin-help.c b/Documentation/perf_counter/builtin-help.c
new file mode 100644
index 000000000000..6616de0ef053
--- /dev/null
+++ b/Documentation/perf_counter/builtin-help.c
@@ -0,0 +1,461 @@
1/*
2 * builtin-help.c
3 *
4 * Builtin help command
5 */
6#include "util/cache.h"
7#include "builtin.h"
8#include "util/exec_cmd.h"
9#include "common-cmds.h"
10#include "util/parse-options.h"
11#include "util/run-command.h"
12#include "util/help.h"
13
14static struct man_viewer_list {
15 struct man_viewer_list *next;
16 char name[FLEX_ARRAY];
17} *man_viewer_list;
18
19static struct man_viewer_info_list {
20 struct man_viewer_info_list *next;
21 const char *info;
22 char name[FLEX_ARRAY];
23} *man_viewer_info_list;
24
25enum help_format {
26 HELP_FORMAT_MAN,
27 HELP_FORMAT_INFO,
28 HELP_FORMAT_WEB,
29};
30
31static int show_all = 0;
32static enum help_format help_format = HELP_FORMAT_MAN;
33static struct option builtin_help_options[] = {
34 OPT_BOOLEAN('a', "all", &show_all, "print all available commands"),
35 OPT_SET_INT('m', "man", &help_format, "show man page", HELP_FORMAT_MAN),
36 OPT_SET_INT('w', "web", &help_format, "show manual in web browser",
37 HELP_FORMAT_WEB),
38 OPT_SET_INT('i', "info", &help_format, "show info page",
39 HELP_FORMAT_INFO),
40 OPT_END(),
41};
42
43static const char * const builtin_help_usage[] = {
44 "perf help [--all] [--man|--web|--info] [command]",
45 NULL
46};
47
48static enum help_format parse_help_format(const char *format)
49{
50 if (!strcmp(format, "man"))
51 return HELP_FORMAT_MAN;
52 if (!strcmp(format, "info"))
53 return HELP_FORMAT_INFO;
54 if (!strcmp(format, "web") || !strcmp(format, "html"))
55 return HELP_FORMAT_WEB;
56 die("unrecognized help format '%s'", format);
57}
58
59static const char *get_man_viewer_info(const char *name)
60{
61 struct man_viewer_info_list *viewer;
62
63 for (viewer = man_viewer_info_list; viewer; viewer = viewer->next)
64 {
65 if (!strcasecmp(name, viewer->name))
66 return viewer->info;
67 }
68 return NULL;
69}
70
71static int check_emacsclient_version(void)
72{
73 struct strbuf buffer = STRBUF_INIT;
74 struct child_process ec_process;
75 const char *argv_ec[] = { "emacsclient", "--version", NULL };
76 int version;
77
78 /* emacsclient prints its version number on stderr */
79 memset(&ec_process, 0, sizeof(ec_process));
80 ec_process.argv = argv_ec;
81 ec_process.err = -1;
82 ec_process.stdout_to_stderr = 1;
83 if (start_command(&ec_process)) {
84 fprintf(stderr, "Failed to start emacsclient.\n");
85 return -1;
86 }
87 strbuf_read(&buffer, ec_process.err, 20);
88 close(ec_process.err);
89
90 /*
91 * Don't bother checking return value, because "emacsclient --version"
92	 * seems to always exit with code 1.
93 */
94 finish_command(&ec_process);
95
96 if (prefixcmp(buffer.buf, "emacsclient")) {
97 fprintf(stderr, "Failed to parse emacsclient version.\n");
98 strbuf_release(&buffer);
99 return -1;
100 }
101
102 strbuf_remove(&buffer, 0, strlen("emacsclient"));
103 version = atoi(buffer.buf);
104
105 if (version < 22) {
106 fprintf(stderr,
107 "emacsclient version '%d' too old (< 22).\n",
108 version);
109 strbuf_release(&buffer);
110 return -1;
111 }
112
113 strbuf_release(&buffer);
114 return 0;
115}
116
117static void exec_woman_emacs(const char* path, const char *page)
118{
119 if (!check_emacsclient_version()) {
120 /* This works only with emacsclient version >= 22. */
121 struct strbuf man_page = STRBUF_INIT;
122
123 if (!path)
124 path = "emacsclient";
125 strbuf_addf(&man_page, "(woman \"%s\")", page);
126 execlp(path, "emacsclient", "-e", man_page.buf, NULL);
127 warning("failed to exec '%s': %s", path, strerror(errno));
128 }
129}
130
131static void exec_man_konqueror(const char* path, const char *page)
132{
133 const char *display = getenv("DISPLAY");
134 if (display && *display) {
135 struct strbuf man_page = STRBUF_INIT;
136 const char *filename = "kfmclient";
137
138 /* It's simpler to launch konqueror using kfmclient. */
139 if (path) {
140 const char *file = strrchr(path, '/');
141 if (file && !strcmp(file + 1, "konqueror")) {
142 char *new = strdup(path);
143 char *dest = strrchr(new, '/');
144
145 /* strlen("konqueror") == strlen("kfmclient") */
146 strcpy(dest + 1, "kfmclient");
147 path = new;
148 }
149 if (file)
150 filename = file;
151 } else
152 path = "kfmclient";
153 strbuf_addf(&man_page, "man:%s(1)", page);
154 execlp(path, filename, "newTab", man_page.buf, NULL);
155 warning("failed to exec '%s': %s", path, strerror(errno));
156 }
157}
158
159static void exec_man_man(const char* path, const char *page)
160{
161 if (!path)
162 path = "man";
163 execlp(path, "man", page, NULL);
164 warning("failed to exec '%s': %s", path, strerror(errno));
165}
166
167static void exec_man_cmd(const char *cmd, const char *page)
168{
169 struct strbuf shell_cmd = STRBUF_INIT;
170 strbuf_addf(&shell_cmd, "%s %s", cmd, page);
171 execl("/bin/sh", "sh", "-c", shell_cmd.buf, NULL);
172 warning("failed to exec '%s': %s", cmd, strerror(errno));
173}
174
175static void add_man_viewer(const char *name)
176{
177 struct man_viewer_list **p = &man_viewer_list;
178 size_t len = strlen(name);
179
180 while (*p)
181 p = &((*p)->next);
182 *p = calloc(1, (sizeof(**p) + len + 1));
183 strncpy((*p)->name, name, len);
184}
185
186static int supported_man_viewer(const char *name, size_t len)
187{
188 return (!strncasecmp("man", name, len) ||
189 !strncasecmp("woman", name, len) ||
190 !strncasecmp("konqueror", name, len));
191}
192
193static void do_add_man_viewer_info(const char *name,
194 size_t len,
195 const char *value)
196{
197 struct man_viewer_info_list *new = calloc(1, sizeof(*new) + len + 1);
198
199 strncpy(new->name, name, len);
200 new->info = strdup(value);
201 new->next = man_viewer_info_list;
202 man_viewer_info_list = new;
203}
204
205static int add_man_viewer_path(const char *name,
206 size_t len,
207 const char *value)
208{
209 if (supported_man_viewer(name, len))
210 do_add_man_viewer_info(name, len, value);
211 else
212 warning("'%s': path for unsupported man viewer.\n"
213 "Please consider using 'man.<tool>.cmd' instead.",
214 name);
215
216 return 0;
217}
218
219static int add_man_viewer_cmd(const char *name,
220 size_t len,
221 const char *value)
222{
223 if (supported_man_viewer(name, len))
224 warning("'%s': cmd for supported man viewer.\n"
225 "Please consider using 'man.<tool>.path' instead.",
226 name);
227 else
228 do_add_man_viewer_info(name, len, value);
229
230 return 0;
231}
232
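/*
 * Config mapping sketch (the key names below are illustrative): a
 * perfconfig entry like "man.konqueror.path = /usr/bin/konqueror"
 * reaches add_man_viewer_info() as var = "man.konqueror.path"; the
 * ".path" subkey is split off and handled by add_man_viewer_path(),
 * while "man.<tool>.cmd" entries are routed to add_man_viewer_cmd().
 */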
233static int add_man_viewer_info(const char *var, const char *value)
234{
235 const char *name = var + 4;
236 const char *subkey = strrchr(name, '.');
237
238 if (!subkey)
239 return error("Config with no key for man viewer: %s", name);
240
241 if (!strcmp(subkey, ".path")) {
242 if (!value)
243 return config_error_nonbool(var);
244 return add_man_viewer_path(name, subkey - name, value);
245 }
246 if (!strcmp(subkey, ".cmd")) {
247 if (!value)
248 return config_error_nonbool(var);
249 return add_man_viewer_cmd(name, subkey - name, value);
250 }
251
252 warning("'%s': unsupported man viewer sub key.", subkey);
253 return 0;
254}
255
256static int perf_help_config(const char *var, const char *value, void *cb)
257{
258 if (!strcmp(var, "help.format")) {
259 if (!value)
260 return config_error_nonbool(var);
261 help_format = parse_help_format(value);
262 return 0;
263 }
264 if (!strcmp(var, "man.viewer")) {
265 if (!value)
266 return config_error_nonbool(var);
267 add_man_viewer(value);
268 return 0;
269 }
270 if (!prefixcmp(var, "man."))
271 return add_man_viewer_info(var, value);
272
273 return perf_default_config(var, value, cb);
274}
275
276static struct cmdnames main_cmds, other_cmds;
277
278void list_common_cmds_help(void)
279{
280 int i, longest = 0;
281
282 for (i = 0; i < ARRAY_SIZE(common_cmds); i++) {
283 if (longest < strlen(common_cmds[i].name))
284 longest = strlen(common_cmds[i].name);
285 }
286
287 puts("The most commonly used perf commands are:");
288 for (i = 0; i < ARRAY_SIZE(common_cmds); i++) {
289 printf(" %s ", common_cmds[i].name);
290 mput_char(' ', longest - strlen(common_cmds[i].name));
291 puts(common_cmds[i].help);
292 }
293}
294
295static int is_perf_command(const char *s)
296{
297 return is_in_cmdlist(&main_cmds, s) ||
298 is_in_cmdlist(&other_cmds, s);
299}
300
301static const char *prepend(const char *prefix, const char *cmd)
302{
303 size_t pre_len = strlen(prefix);
304 size_t cmd_len = strlen(cmd);
305 char *p = malloc(pre_len + cmd_len + 1);
306 memcpy(p, prefix, pre_len);
307 strcpy(p + pre_len, cmd);
308 return p;
309}
310
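/*
 * Page-name mapping sketch (examples are illustrative): "stat" is a
 * known subcommand and becomes "perf-stat"; an argument that already
 * starts with "perf" (e.g. "perf-stat") is returned unchanged; any
 * other string "foo" falls through to "perffoo".
 */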
311static const char *cmd_to_page(const char *perf_cmd)
312{
313 if (!perf_cmd)
314 return "perf";
315 else if (!prefixcmp(perf_cmd, "perf"))
316 return perf_cmd;
317 else if (is_perf_command(perf_cmd))
318 return prepend("perf-", perf_cmd);
319 else
320 return prepend("perf", perf_cmd);
321}
322
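/*
 * Illustrative effect (the paths are hypothetical): with PERF_MAN_PATH
 * resolving to /usr/local/share/perf-man and MANPATH already set to
 * /usr/share/man, the function exports
 * MANPATH=/usr/local/share/perf-man:/usr/share/man; when MANPATH is
 * unset, the trailing ':' makes man fall back to its built-in defaults.
 */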
323static void setup_man_path(void)
324{
325 struct strbuf new_path = STRBUF_INIT;
326 const char *old_path = getenv("MANPATH");
327
328 /* We should always put ':' after our path. If there is no
329	 * old_path, the ':' at the end will let 'man' try
330 * system-wide paths after ours to find the manual page. If
331 * there is old_path, we need ':' as delimiter. */
332 strbuf_addstr(&new_path, system_path(PERF_MAN_PATH));
333 strbuf_addch(&new_path, ':');
334 if (old_path)
335 strbuf_addstr(&new_path, old_path);
336
337 setenv("MANPATH", new_path.buf, 1);
338
339 strbuf_release(&new_path);
340}
341
342static void exec_viewer(const char *name, const char *page)
343{
344 const char *info = get_man_viewer_info(name);
345
346 if (!strcasecmp(name, "man"))
347 exec_man_man(info, page);
348 else if (!strcasecmp(name, "woman"))
349 exec_woman_emacs(info, page);
350 else if (!strcasecmp(name, "konqueror"))
351 exec_man_konqueror(info, page);
352 else if (info)
353 exec_man_cmd(info, page);
354 else
355 warning("'%s': unknown man viewer.", name);
356}
357
358static void show_man_page(const char *perf_cmd)
359{
360 struct man_viewer_list *viewer;
361 const char *page = cmd_to_page(perf_cmd);
362 const char *fallback = getenv("PERF_MAN_VIEWER");
363
364 setup_man_path();
365 for (viewer = man_viewer_list; viewer; viewer = viewer->next)
366 {
367 exec_viewer(viewer->name, page); /* will return when unable */
368 }
369 if (fallback)
370 exec_viewer(fallback, page);
371 exec_viewer("man", page);
372 die("no man viewer handled the request");
373}
374
375static void show_info_page(const char *perf_cmd)
376{
377 const char *page = cmd_to_page(perf_cmd);
378 setenv("INFOPATH", system_path(PERF_INFO_PATH), 1);
379 execlp("info", "info", "perfman", page, NULL);
380}
381
382static void get_html_page_path(struct strbuf *page_path, const char *page)
383{
384 struct stat st;
385 const char *html_path = system_path(PERF_HTML_PATH);
386
387 /* Check that we have a perf documentation directory. */
388 if (stat(mkpath("%s/perf.html", html_path), &st)
389 || !S_ISREG(st.st_mode))
390 die("'%s': not a documentation directory.", html_path);
391
392 strbuf_init(page_path, 0);
393 strbuf_addf(page_path, "%s/%s.html", html_path, page);
394}
395
396/*
397 * If open_html is not defined in a platform-specific way (see for
398 * example compat/mingw.h), we use the script web--browse to display
399 * HTML.
400 */
401#ifndef open_html
402void open_html(const char *path)
403{
404 execl_perf_cmd("web--browse", "-c", "help.browser", path, NULL);
405}
406#endif
407
408static void show_html_page(const char *perf_cmd)
409{
410 const char *page = cmd_to_page(perf_cmd);
411	struct strbuf page_path; /* it leaks but we exec below */
412
413 get_html_page_path(&page_path, page);
414
415 open_html(page_path.buf);
416}
417
418int cmd_help(int argc, const char **argv, const char *prefix)
419{
420 const char *alias;
421 load_command_list("perf-", &main_cmds, &other_cmds);
422
423 perf_config(perf_help_config, NULL);
424
425 argc = parse_options(argc, argv, builtin_help_options,
426 builtin_help_usage, 0);
427
428 if (show_all) {
429 printf("usage: %s\n\n", perf_usage_string);
430 list_commands("perf commands", &main_cmds, &other_cmds);
431 printf("%s\n", perf_more_info_string);
432 return 0;
433 }
434
435 if (!argv[0]) {
436 printf("usage: %s\n\n", perf_usage_string);
437 list_common_cmds_help();
438 printf("\n%s\n", perf_more_info_string);
439 return 0;
440 }
441
442 alias = alias_lookup(argv[0]);
443 if (alias && !is_perf_command(argv[0])) {
444 printf("`perf %s' is aliased to `%s'\n", argv[0], alias);
445 return 0;
446 }
447
448 switch (help_format) {
449 case HELP_FORMAT_MAN:
450 show_man_page(argv[0]);
451 break;
452 case HELP_FORMAT_INFO:
453 show_info_page(argv[0]);
454 break;
455 case HELP_FORMAT_WEB:
456 show_html_page(argv[0]);
457 break;
458 }
459
460 return 0;
461}
diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
new file mode 100644
index 000000000000..efb87595f3cb
--- /dev/null
+++ b/Documentation/perf_counter/builtin-record.c
@@ -0,0 +1,613 @@
1
2
3#include "util/util.h"
4
5#include <sys/types.h>
6#include <sys/stat.h>
7#include <sys/time.h>
8#include <unistd.h>
9#include <stdint.h>
10#include <stdlib.h>
11#include <string.h>
12#include <limits.h>
13#include <getopt.h>
14#include <assert.h>
15#include <fcntl.h>
16#include <stdio.h>
17#include <errno.h>
18#include <time.h>
19#include <sched.h>
20#include <pthread.h>
21
22#include <sys/syscall.h>
23#include <sys/ioctl.h>
24#include <sys/poll.h>
25#include <sys/prctl.h>
26#include <sys/wait.h>
27#include <sys/uio.h>
28#include <sys/mman.h>
29
30#include <linux/unistd.h>
31#include <linux/types.h>
32
33#include "../../include/linux/perf_counter.h"
34
35#include "perf.h"
36
37#define ALIGN(x,a) __ALIGN_MASK(x,(typeof(x))(a)-1)
38#define __ALIGN_MASK(x,mask) (((x)+(mask))&~(mask))
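/*
 * Round x up to the next multiple of the power-of-two 'a';
 * e.g. (illustrative) ALIGN(13, 8) == 16 and ALIGN(16, 8) == 16.
 */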
39
40static int nr_counters = 0;
41static __u64 event_id[MAX_COUNTERS] = { };
42static int default_interval = 100000;
43static int event_count[MAX_COUNTERS];
44static int fd[MAX_NR_CPUS][MAX_COUNTERS];
45static int nr_cpus = 0;
46static unsigned int page_size;
47static unsigned int mmap_pages = 16;
48static int output;
49static char *output_name = "output.perf";
50static int group = 0;
51static unsigned int realtime_prio = 0;
52static int system_wide = 0;
53static pid_t target_pid = -1;
54static int inherit = 1;
55static int nmi = 1;
56
57const unsigned int default_count[] = {
58 1000000,
59 1000000,
60 10000,
61 10000,
62 1000000,
63 10000,
64};
65
66struct event_symbol {
67 __u64 event;
68 char *symbol;
69};
70
71static struct event_symbol event_symbols[] = {
72 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cpu-cycles", },
73 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cycles", },
74 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), "instructions", },
75 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), "cache-references", },
76 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), "cache-misses", },
77 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branch-instructions", },
78 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branches", },
79 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES), "branch-misses", },
80 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES), "bus-cycles", },
81
82 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK), "cpu-clock", },
83 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), "task-clock", },
84 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "page-faults", },
85 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "faults", },
86 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN), "minor-faults", },
87 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ), "major-faults", },
88 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "context-switches", },
89 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "cs", },
90 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "cpu-migrations", },
91 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "migrations", },
92};
93
94/*
95 * Each event can have multiple symbolic names.
96 * Symbolic names are (almost) exactly matched.
97 */
98static __u64 match_event_symbols(char *str)
99{
100 __u64 config, id;
101 int type;
102 unsigned int i;
103
104 if (sscanf(str, "r%llx", &config) == 1)
105 return config | PERF_COUNTER_RAW_MASK;
106
107 if (sscanf(str, "%d:%llu", &type, &id) == 2)
108 return EID(type, id);
109
110 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
111 if (!strncmp(str, event_symbols[i].symbol,
112 strlen(event_symbols[i].symbol)))
113 return event_symbols[i].event;
114 }
115
116 return ~0ULL;
117}
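/*
 * Behaviour sketch (inputs are illustrative): "r01c2" yields 0x1c2 with
 * the raw bit set, "1:4" yields EID(1, 4) (cpu-migrations), "cycles"
 * matches the symbol table above, and an unrecognised string returns
 * ~0ULL so that parse_events() below can reject it.
 */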
118
119static int parse_events(char *str)
120{
121 __u64 config;
122
123again:
124 if (nr_counters == MAX_COUNTERS)
125 return -1;
126
127 config = match_event_symbols(str);
128 if (config == ~0ULL)
129 return -1;
130
131 event_id[nr_counters] = config;
132 nr_counters++;
133
134 str = strstr(str, ",");
135 if (str) {
136 str++;
137 goto again;
138 }
139
140 return 0;
141}
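/*
 * The -e strings accepted above are, for instance, a symbolic name such
 * as "cycles" or "cache-misses", a "<type>:<id>" pair, a raw descriptor
 * such as "r1a8" (eventsel+umask), or a comma-separated list like
 * "cycles,instructions" which allocates one counter per entry.
 */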
142
143#define __PERF_COUNTER_FIELD(config, name) \
144 ((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
145
146#define PERF_COUNTER_RAW(config) __PERF_COUNTER_FIELD(config, RAW)
147#define PERF_COUNTER_CONFIG(config) __PERF_COUNTER_FIELD(config, CONFIG)
148#define PERF_COUNTER_TYPE(config) __PERF_COUNTER_FIELD(config, TYPE)
149#define PERF_COUNTER_ID(config) __PERF_COUNTER_FIELD(config, EVENT)
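/*
 * event_id[] entries are packed with EID(type, id); the field macros
 * above undo that packing, e.g. PERF_COUNTER_TYPE() and PERF_COUNTER_ID()
 * recover PERF_TYPE_HARDWARE and PERF_COUNT_CPU_CYCLES from
 * EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES).
 */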
150
151static void display_events_help(void)
152{
153 unsigned int i;
154 __u64 e;
155
156 printf(
157 " -e EVENT --event=EVENT # symbolic-name abbreviations");
158
159 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
160 int type, id;
161
162 e = event_symbols[i].event;
163 type = PERF_COUNTER_TYPE(e);
164 id = PERF_COUNTER_ID(e);
165
166 printf("\n %d:%d: %-20s",
167 type, id, event_symbols[i].symbol);
168 }
169
170 printf("\n"
171 " rNNN: raw PMU events (eventsel+umask)\n\n");
172}
173
174static void display_help(void)
175{
176 printf(
177 "Usage: perf-record [<options>] <cmd>\n"
178 "perf-record Options (up to %d event types can be specified at once):\n\n",
179 MAX_COUNTERS);
180
181 display_events_help();
182
183 printf(
184 " -c CNT --count=CNT # event period to sample\n"
185 " -m pages --mmap_pages=<pages> # number of mmap data pages\n"
186 " -o file --output=<file> # output file\n"
187 " -p pid --pid=<pid> # record events on existing pid\n"
188 " -r prio --realtime=<prio> # use RT prio\n"
189 " -s --system # system wide profiling\n"
190 );
191
192 exit(0);
193}
194
195static void process_options(int argc, const char *argv[])
196{
197 int error = 0, counter;
198
199 for (;;) {
200 int option_index = 0;
201 /** Options for getopt */
202 static struct option long_options[] = {
203 {"count", required_argument, NULL, 'c'},
204 {"event", required_argument, NULL, 'e'},
205 {"mmap_pages", required_argument, NULL, 'm'},
206 {"output", required_argument, NULL, 'o'},
207 {"pid", required_argument, NULL, 'p'},
208 {"realtime", required_argument, NULL, 'r'},
209 {"system", no_argument, NULL, 's'},
210 {"inherit", no_argument, NULL, 'i'},
211 {"nmi", no_argument, NULL, 'n'},
212 {NULL, 0, NULL, 0 }
213 };
214 int c = getopt_long(argc, argv, "+:c:e:m:o:p:r:sin",
215 long_options, &option_index);
216 if (c == -1)
217 break;
218
219 switch (c) {
220 case 'c': default_interval = atoi(optarg); break;
221 case 'e': error = parse_events(optarg); break;
222 case 'm': mmap_pages = atoi(optarg); break;
223 case 'o': output_name = strdup(optarg); break;
224 case 'p': target_pid = atoi(optarg); break;
225 case 'r': realtime_prio = atoi(optarg); break;
226 case 's': system_wide ^= 1; break;
227 case 'i': inherit ^= 1; break;
228 case 'n': nmi ^= 1; break;
229 default: error = 1; break;
230 }
231 }
232
233 if (argc - optind == 0 && target_pid == -1)
234 error = 1;
235
236 if (error)
237 display_help();
238
239 if (!nr_counters) {
240 nr_counters = 1;
241 event_id[0] = 0;
242 }
243
244 for (counter = 0; counter < nr_counters; counter++) {
245 if (event_count[counter])
246 continue;
247
248 event_count[counter] = default_interval;
249 }
250}
251
252struct mmap_data {
253 int counter;
254 void *base;
255 unsigned int mask;
256 unsigned int prev;
257};
258
259static unsigned int mmap_read_head(struct mmap_data *md)
260{
261 struct perf_counter_mmap_page *pc = md->base;
262 int head;
263
264 head = pc->data_head;
265 rmb();
266
267 return head;
268}
269
270static long events;
271static struct timeval last_read, this_read;
272
273static void mmap_read(struct mmap_data *md)
274{
275 unsigned int head = mmap_read_head(md);
276 unsigned int old = md->prev;
277 unsigned char *data = md->base + page_size;
278 unsigned long size;
279 void *buf;
280 int diff;
281
282 gettimeofday(&this_read, NULL);
283
284 /*
285 * If we're further behind than half the buffer, there's a chance
286 * the writer will bite our tail and screw up the events under us.
287 *
288 * If we somehow ended up ahead of the head, we got messed up.
289 *
290 * In either case, truncate and restart at head.
291 */
292 diff = head - old;
293 if (diff > md->mask / 2 || diff < 0) {
294 struct timeval iv;
295 unsigned long msecs;
296
297 timersub(&this_read, &last_read, &iv);
298 msecs = iv.tv_sec*1000 + iv.tv_usec/1000;
299
300 fprintf(stderr, "WARNING: failed to keep up with mmap data."
301 " Last read %lu msecs ago.\n", msecs);
302
303 /*
304 * head points to a known good entry, start there.
305 */
306 old = head;
307 }
308
309 last_read = this_read;
310
311 if (old != head)
312 events++;
313
314 size = head - old;
315
316 if ((old & md->mask) + size != (head & md->mask)) {
317 buf = &data[old & md->mask];
318 size = md->mask + 1 - (old & md->mask);
319 old += size;
320 while (size) {
321 int ret = write(output, buf, size);
322 if (ret < 0) {
323 perror("failed to write");
324 exit(-1);
325 }
326 size -= ret;
327 buf += ret;
328 }
329 }
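/*
 * Example of the wrap-around case: with a 16-page data area on 4K pages
 * (mask 0xffff), old == 0xfff0 and head == 0x10010 first writes the
 * 0x10 bytes up to the end of the buffer above, then the loop below
 * writes the remaining 0x10 bytes from the start of the buffer.
 */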
330
331 buf = &data[old & md->mask];
332 size = head - old;
333 old += size;
334 while (size) {
335 int ret = write(output, buf, size);
336 if (ret < 0) {
337 perror("failed to write");
338 exit(-1);
339 }
340 size -= ret;
341 buf += ret;
342 }
343
344 md->prev = old;
345}
346
347static volatile int done = 0;
348
349static void sig_handler(int sig)
350{
351 done = 1;
352}
353
354static struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
355static struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
356
357static int nr_poll;
358static int nr_cpu;
359
360struct mmap_event {
361 struct perf_event_header header;
362 __u32 pid, tid;
363 __u64 start;
364 __u64 len;
365 __u64 pgoff;
366 char filename[PATH_MAX];
367};
368struct comm_event {
369 struct perf_event_header header;
370 __u32 pid,tid;
371 char comm[16];
372};
373
374static pid_t pid_synthesize_comm_event(pid_t pid)
375{
376 char filename[PATH_MAX];
377 char bf[BUFSIZ];
378 struct comm_event comm_ev;
379 size_t size;
380 int fd;
381
382 snprintf(filename, sizeof(filename), "/proc/%d/stat", pid);
383
384 fd = open(filename, O_RDONLY);
385 if (fd < 0) {
386 fprintf(stderr, "couldn't open %s\n", filename);
387 exit(EXIT_FAILURE);
388 }
389 ssize_t n = read(fd, bf, sizeof(bf) - 1);
 if (n < 0) {
390 fprintf(stderr, "couldn't read %s\n", filename);
391 exit(EXIT_FAILURE);
392 }
 bf[n] = '\0'; /* NUL-terminate before the sscanf() below */
393 close(fd);
394
395 pid_t spid, ppid;
396 char state;
397 char comm[18];
398
399 memset(&comm_ev, 0, sizeof(comm_ev));
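/*
 * /proc/<pid>/stat starts with e.g. "1234 (cat) R 1 1234 ...":
 * pid, (comm), state, ppid, pgrp -- the pgrp ends up in comm_ev.pid.
 */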
400 int nr = sscanf(bf, "%d %s %c %d %d ",
401 &spid, comm, &state, &ppid, &comm_ev.pid);
402 if (nr != 5) {
403 fprintf(stderr, "couldn't get COMM and pgid, malformed %s\n",
404 filename);
405 exit(EXIT_FAILURE);
406 }
407 comm_ev.header.type = PERF_EVENT_COMM;
408 comm_ev.tid = pid;
409 size = strlen(comm);
410 comm[--size] = '\0'; /* Remove the ')' at the end */
411 --size; /* Remove the '(' at the beginning */
412 memcpy(comm_ev.comm, comm + 1, size);
413 size = ALIGN(size, sizeof(uint64_t));
414 comm_ev.header.size = sizeof(comm_ev) - (sizeof(comm_ev.comm) - size);
415 int ret = write(output, &comm_ev, comm_ev.header.size);
416 if (ret < 0) {
417 perror("failed to write");
418 exit(-1);
419 }
420 return comm_ev.pid;
421}
422
423static void pid_synthesize_mmap_events(pid_t pid, pid_t pgid)
424{
425 char filename[PATH_MAX];
426 FILE *fp;
427
428 snprintf(filename, sizeof(filename), "/proc/%d/maps", pid);
429
430 fp = fopen(filename, "r");
431 if (fp == NULL) {
432 fprintf(stderr, "couldn't open %s\n", filename);
433 exit(EXIT_FAILURE);
434 }
435 while (1) {
436 char bf[BUFSIZ];
437 unsigned char vm_read, vm_write, vm_exec, vm_mayshare;
438 struct mmap_event mmap_ev = {
439 .header.type = PERF_EVENT_MMAP,
440 };
441 unsigned long ino;
442 int major, minor;
443 size_t size;
444 if (fgets(bf, sizeof(bf), fp) == NULL)
445 break;
446
447 /* 00400000-0040c000 r-xp 00000000 fd:01 41038 /bin/cat */
448 sscanf(bf, "%llx-%llx %c%c%c%c %llx %x:%x %lu",
449 &mmap_ev.start, &mmap_ev.len,
450 &vm_read, &vm_write, &vm_exec, &vm_mayshare,
451 &mmap_ev.pgoff, &major, &minor, &ino);
452 if (vm_exec == 'x') {
453 char *execname = strrchr(bf, ' ');
454
455 if (execname == NULL || execname[1] != '/')
456 continue;
457
458 execname += 1;
459 size = strlen(execname);
460 execname[size - 1] = '\0'; /* Remove \n */
461 memcpy(mmap_ev.filename, execname, size);
462 size = ALIGN(size, sizeof(uint64_t));
463 mmap_ev.len -= mmap_ev.start;
464 mmap_ev.header.size = (sizeof(mmap_ev) -
465 (sizeof(mmap_ev.filename) - size));
466 mmap_ev.pid = pgid;
467 mmap_ev.tid = pid;
468
469 if (write(output, &mmap_ev, mmap_ev.header.size) < 0) {
470 perror("failed to write");
471 exit(-1);
472 }
473 }
474 }
475
476 fclose(fp);
477}
478
479static void open_counters(int cpu, pid_t pid)
480{
481 struct perf_counter_hw_event hw_event;
482 int counter, group_fd;
483 int track = 1;
484
485 if (pid > 0) {
486 pid_t pgid = pid_synthesize_comm_event(pid);
487 pid_synthesize_mmap_events(pid, pgid);
488 }
489
490 group_fd = -1;
491 for (counter = 0; counter < nr_counters; counter++) {
492
493 memset(&hw_event, 0, sizeof(hw_event));
494 hw_event.config = event_id[counter];
495 hw_event.irq_period = event_count[counter];
496 hw_event.record_type = PERF_RECORD_IP | PERF_RECORD_TID;
497 hw_event.nmi = nmi;
498 hw_event.mmap = track;
499 hw_event.comm = track;
500 hw_event.inherit = (cpu < 0) && inherit;
501
502 track = 0; // only the first counter needs these
503
504 fd[nr_cpu][counter] =
505 sys_perf_counter_open(&hw_event, pid, cpu, group_fd, 0);
506
507 if (fd[nr_cpu][counter] < 0) {
508 int err = errno;
509 printf("kerneltop error: syscall returned with %d (%s)\n",
510 fd[nr_cpu][counter], strerror(err));
511 if (err == EPERM)
512 printf("Are you root?\n");
513 exit(-1);
514 }
515 assert(fd[nr_cpu][counter] >= 0);
516 fcntl(fd[nr_cpu][counter], F_SETFL, O_NONBLOCK);
517
518 /*
519 * First counter acts as the group leader:
520 */
521 if (group && group_fd == -1)
522 group_fd = fd[nr_cpu][counter];
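/*
 * I.e. with --group, counters after the first pass the leader's fd as
 * group_fd to sys_perf_counter_open(), so the whole set is scheduled
 * onto the PMU together.
 */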
523
524 event_array[nr_poll].fd = fd[nr_cpu][counter];
525 event_array[nr_poll].events = POLLIN;
526 nr_poll++;
527
528 mmap_array[nr_cpu][counter].counter = counter;
529 mmap_array[nr_cpu][counter].prev = 0;
530 mmap_array[nr_cpu][counter].mask = mmap_pages*page_size - 1;
531 mmap_array[nr_cpu][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
532 PROT_READ, MAP_SHARED, fd[nr_cpu][counter], 0);
533 if (mmap_array[nr_cpu][counter].base == MAP_FAILED) {
534 printf("kerneltop error: failed to mmap with %d (%s)\n",
535 errno, strerror(errno));
536 exit(-1);
537 }
538 }
539 nr_cpu++;
540}
541
542int cmd_record(int argc, const char **argv)
543{
544 int i, counter;
545 pid_t pid;
546 int ret;
547
548 page_size = sysconf(_SC_PAGE_SIZE);
549
550 process_options(argc, argv);
551
552 nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
553 assert(nr_cpus <= MAX_NR_CPUS);
554 assert(nr_cpus >= 0);
555
556 output = open(output_name, O_CREAT|O_RDWR, S_IRWXU);
557 if (output < 0) {
558 perror("failed to create output file");
559 exit(-1);
560 }
561
562 argc -= optind;
563 argv += optind;
564
565 if (!system_wide) {
566 open_counters(-1, target_pid != -1 ? target_pid : 0);
567 } else for (i = 0; i < nr_cpus; i++)
568 open_counters(i, target_pid);
569
570 signal(SIGCHLD, sig_handler);
571 signal(SIGINT, sig_handler);
572
573 if (target_pid == -1) {
574 pid = fork();
575 if (pid < 0)
576 perror("failed to fork");
577
578 if (!pid) {
579 if (execvp(argv[0], argv)) {
580 perror(argv[0]);
581 exit(-1);
582 }
583 }
584 }
585
586 if (realtime_prio) {
587 struct sched_param param;
588
589 param.sched_priority = realtime_prio;
590 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
591 printf("Could not set realtime priority.\n");
592 exit(-1);
593 }
594 }
595
596 /*
597 * TODO: store the current /proc/$/maps information somewhere
598 */
599
600 while (!done) {
601 int hits = events;
602
603 for (i = 0; i < nr_cpu; i++) {
604 for (counter = 0; counter < nr_counters; counter++)
605 mmap_read(&mmap_array[i][counter]);
606 }
607
608 if (hits == events)
609 ret = poll(event_array, nr_poll, 100);
610 }
611
612 return 0;
613}
diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
new file mode 100644
index 000000000000..03518d75d864
--- /dev/null
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -0,0 +1,568 @@
1/*
2 * kerneltop.c: show top kernel functions - performance counters showcase
3
4 Build with:
5
6 cc -O6 -Wall -c -o kerneltop.o kerneltop.c -lrt
7
8 Sample output:
9
10------------------------------------------------------------------------------
11 KernelTop: 2669 irqs/sec [NMI, cache-misses/cache-refs], (all, cpu: 2)
12------------------------------------------------------------------------------
13
14 weight RIP kernel function
15 ______ ________________ _______________
16
17 35.20 - ffffffff804ce74b : skb_copy_and_csum_dev
18 33.00 - ffffffff804cb740 : sock_alloc_send_skb
19 31.26 - ffffffff804ce808 : skb_push
20 22.43 - ffffffff80510004 : tcp_established_options
21 19.00 - ffffffff8027d250 : find_get_page
22 15.76 - ffffffff804e4fc9 : eth_type_trans
23 15.20 - ffffffff804d8baa : dst_release
24 14.86 - ffffffff804cf5d8 : skb_release_head_state
25 14.00 - ffffffff802217d5 : read_hpet
26 12.00 - ffffffff804ffb7f : __ip_local_out
27 11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
28 8.54 - ffffffff805001a3 : ip_queue_xmit
29 */
30
31/*
32 * perfstat: /usr/bin/time -alike performance counter statistics utility
33
34 It summarizes the counter events of all tasks (and child tasks),
35 covering all CPUs that the command (or workload) executes on.
36 It only counts the per-task events of the workload started,
37 independent of how many other tasks run on those CPUs.
38
39 Sample output:
40
41 $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
42
43 Performance counter stats for 'ls':
44
45 163516953 instructions
46 2295 cache-misses
47 2855182 branch-misses
48 */
49
50 /*
51 * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
52 *
53 * Improvements and fixes by:
54 *
55 * Arjan van de Ven <arjan@linux.intel.com>
56 * Yanmin Zhang <yanmin.zhang@intel.com>
57 * Wu Fengguang <fengguang.wu@intel.com>
58 * Mike Galbraith <efault@gmx.de>
59 * Paul Mackerras <paulus@samba.org>
60 *
61 * Released under the GPL v2. (and only v2, not any later version)
62 */
63
64#include "util/util.h"
65
66#include <getopt.h>
67#include <assert.h>
68#include <fcntl.h>
69#include <stdio.h>
70#include <errno.h>
71#include <time.h>
72#include <sched.h>
73#include <pthread.h>
74
75#include <sys/syscall.h>
76#include <sys/ioctl.h>
77#include <sys/poll.h>
78#include <sys/prctl.h>
79#include <sys/wait.h>
80#include <sys/uio.h>
81#include <sys/mman.h>
82
83#include <linux/unistd.h>
84#include <linux/types.h>
85
86#include "../../include/linux/perf_counter.h"
87
88#include "perf.h"
89
90#define EVENT_MASK_KERNEL 1
91#define EVENT_MASK_USER 2
92
93static int system_wide = 0;
94
95static int nr_counters = 0;
96static __u64 event_id[MAX_COUNTERS] = {
97 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
98 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),
99 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
100 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),
101
102 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),
103 EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),
104 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),
105 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),
106};
107static int default_interval = 100000;
108static int event_count[MAX_COUNTERS];
109static int fd[MAX_NR_CPUS][MAX_COUNTERS];
110static int event_mask[MAX_COUNTERS];
111
112static int tid = -1;
113static int profile_cpu = -1;
114static int nr_cpus = 0;
115static int nmi = 1;
116static int group = 0;
117static unsigned int page_size;
118
119static int zero;
120
121static int scale = 1;
122
123static const unsigned int default_count[] = {
124 1000000,
125 1000000,
126 10000,
127 10000,
128 1000000,
129 10000,
130};
131
132static char *hw_event_names[] = {
133 "CPU cycles",
134 "instructions",
135 "cache references",
136 "cache misses",
137 "branches",
138 "branch misses",
139 "bus cycles",
140};
141
142static char *sw_event_names[] = {
143 "cpu clock ticks",
144 "task clock ticks",
145 "pagefaults",
146 "context switches",
147 "CPU migrations",
148 "minor faults",
149 "major faults",
150};
151
152struct event_symbol {
153 __u64 event;
154 char *symbol;
155};
156
157static struct event_symbol event_symbols[] = {
158 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cpu-cycles", },
159 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cycles", },
160 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), "instructions", },
161 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), "cache-references", },
162 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), "cache-misses", },
163 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branch-instructions", },
164 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branches", },
165 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES), "branch-misses", },
166 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES), "bus-cycles", },
167
168 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK), "cpu-clock", },
169 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), "task-clock", },
170 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "page-faults", },
171 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "faults", },
172 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN), "minor-faults", },
173 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ), "major-faults", },
174 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "context-switches", },
175 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "cs", },
176 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "cpu-migrations", },
177 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "migrations", },
178};
179
180#define __PERF_COUNTER_FIELD(config, name) \
181 ((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
182
183#define PERF_COUNTER_RAW(config) __PERF_COUNTER_FIELD(config, RAW)
184#define PERF_COUNTER_CONFIG(config) __PERF_COUNTER_FIELD(config, CONFIG)
185#define PERF_COUNTER_TYPE(config) __PERF_COUNTER_FIELD(config, TYPE)
186#define PERF_COUNTER_ID(config) __PERF_COUNTER_FIELD(config, EVENT)
187
188static void display_events_help(void)
189{
190 unsigned int i;
191 __u64 e;
192
193 printf(
194 " -e EVENT --event=EVENT # symbolic-name abbreviations");
195
196 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
197 int type, id;
198
199 e = event_symbols[i].event;
200 type = PERF_COUNTER_TYPE(e);
201 id = PERF_COUNTER_ID(e);
202
203 printf("\n %d:%d: %-20s",
204 type, id, event_symbols[i].symbol);
205 }
206
207 printf("\n"
208 " rNNN: raw PMU events (eventsel+umask)\n\n");
209}
210
211static void display_help(void)
212{
213 printf(
214 "Usage: perfstat [<events...>] <cmd...>\n\n"
215 "PerfStat Options (up to %d event types can be specified):\n\n",
216 MAX_COUNTERS);
217
218 display_events_help();
219
220 printf(
221 " -l # scale counter values\n"
222 " -a # system-wide collection\n");
223 exit(0);
224}
225
226static char *event_name(int ctr)
227{
228 __u64 config = event_id[ctr];
229 int type = PERF_COUNTER_TYPE(config);
230 int id = PERF_COUNTER_ID(config);
231 static char buf[32];
232
233 if (PERF_COUNTER_RAW(config)) {
234 sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config));
235 return buf;
236 }
237
238 switch (type) {
239 case PERF_TYPE_HARDWARE:
240 if (id < PERF_HW_EVENTS_MAX)
241 return hw_event_names[id];
242 return "unknown-hardware";
243
244 case PERF_TYPE_SOFTWARE:
245 if (id < PERF_SW_EVENTS_MAX)
246 return sw_event_names[id];
247 return "unknown-software";
248
249 default:
250 break;
251 }
252
253 return "unknown";
254}
255
256/*
257 * Each event can have multiple symbolic names.
258 * Symbolic names are (almost) exactly matched.
259 */
260static __u64 match_event_symbols(char *str)
261{
262 __u64 config, id;
263 int type;
264 unsigned int i;
265 char mask_str[4];
266
267 if (sscanf(str, "r%llx", &config) == 1)
268 return config | PERF_COUNTER_RAW_MASK;
269
270 switch (sscanf(str, "%d:%llu:%2s", &type, &id, mask_str)) {
271 case 3:
272 if (strchr(mask_str, 'u'))
273 event_mask[nr_counters] |= EVENT_MASK_USER;
274 if (strchr(mask_str, 'k'))
275 event_mask[nr_counters] |= EVENT_MASK_KERNEL;
276 case 2:
277 return EID(type, id);
278
279 default:
280 break;
281 }
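/*
 * E.g. "1:0:k" yields EID(1, 0) and sets EVENT_MASK_KERNEL for this
 * counter; create_perfstat_counter() below turns these mask bits into
 * hw_event.exclude_kernel / hw_event.exclude_user.
 */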
282
283 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
284 if (!strncmp(str, event_symbols[i].symbol,
285 strlen(event_symbols[i].symbol)))
286 return event_symbols[i].event;
287 }
288
289 return ~0ULL;
290}
291
292static int parse_events(char *str)
293{
294 __u64 config;
295
296again:
297 if (nr_counters == MAX_COUNTERS)
298 return -1;
299
300 config = match_event_symbols(str);
301 if (config == ~0ULL)
302 return -1;
303
304 event_id[nr_counters] = config;
305 nr_counters++;
306
307 str = strstr(str, ",");
308 if (str) {
309 str++;
310 goto again;
311 }
312
313 return 0;
314}
315
316
317/*
318 * perfstat
319 */
320
321char fault_here[1000000];
322
323static void create_perfstat_counter(int counter)
324{
325 struct perf_counter_hw_event hw_event;
326
327 memset(&hw_event, 0, sizeof(hw_event));
328 hw_event.config = event_id[counter];
329 hw_event.record_type = 0;
330 hw_event.nmi = 0;
331 hw_event.exclude_kernel = event_mask[counter] & EVENT_MASK_KERNEL;
332 hw_event.exclude_user = event_mask[counter] & EVENT_MASK_USER;
333
334 if (scale)
335 hw_event.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
336 PERF_FORMAT_TOTAL_TIME_RUNNING;
337
338 if (system_wide) {
339 int cpu;
340 for (cpu = 0; cpu < nr_cpus; cpu ++) {
341 fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0);
342 if (fd[cpu][counter] < 0) {
343 printf("perfstat error: syscall returned with %d (%s)\n",
344 fd[cpu][counter], strerror(errno));
345 exit(-1);
346 }
347 }
348 } else {
349 hw_event.inherit = 1;
350 hw_event.disabled = 1;
351
352 fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
353 if (fd[0][counter] < 0) {
354 printf("perfstat error: syscall returned with %d (%s)\n",
355 fd[0][counter], strerror(errno));
356 exit(-1);
357 }
358 }
359}
360
361int do_perfstat(int argc, char *argv[])
362{
363 unsigned long long t0, t1;
364 int counter;
365 ssize_t res;
366 int status;
367 int pid;
368
369 if (!system_wide)
370 nr_cpus = 1;
371
372 for (counter = 0; counter < nr_counters; counter++)
373 create_perfstat_counter(counter);
374
375 argc -= optind;
376 argv += optind;
377
378 if (!argc)
379 display_help();
380
381 /*
382 * Enable counters and exec the command:
383 */
384 t0 = rdclock();
385 prctl(PR_TASK_PERF_COUNTERS_ENABLE);
386
387 if ((pid = fork()) < 0)
388 perror("failed to fork");
389 if (!pid) {
390 if (execvp(argv[0], argv)) {
391 perror(argv[0]);
392 exit(-1);
393 }
394 }
395 while (wait(&status) >= 0)
396 ;
397 prctl(PR_TASK_PERF_COUNTERS_DISABLE);
398 t1 = rdclock();
399
400 fflush(stdout);
401
402 fprintf(stderr, "\n");
403 fprintf(stderr, " Performance counter stats for \'%s\':\n",
404 argv[0]);
405 fprintf(stderr, "\n");
406
407 for (counter = 0; counter < nr_counters; counter++) {
408 int cpu, nv;
409 __u64 count[3], single_count[3];
410 int scaled;
411
412 count[0] = count[1] = count[2] = 0;
413 nv = scale ? 3 : 1;
414 for (cpu = 0; cpu < nr_cpus; cpu ++) {
415 res = read(fd[cpu][counter],
416 single_count, nv * sizeof(__u64));
417 assert(res == nv * sizeof(__u64));
418
419 count[0] += single_count[0];
420 if (scale) {
421 count[1] += single_count[1];
422 count[2] += single_count[2];
423 }
424 }
425
426 scaled = 0;
427 if (scale) {
428 if (count[2] == 0) {
429 fprintf(stderr, " %14s %-20s\n",
430 "<not counted>", event_name(counter));
431 continue;
432 }
433 if (count[2] < count[1]) {
434 scaled = 1;
435 count[0] = (unsigned long long)
436 ((double)count[0] * count[1] / count[2] + 0.5);
437 }
438 }
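/*
 * E.g. if the counter was enabled for 100ms (count[1]) but only ran
 * for 50ms (count[2]), a raw count of 1000 is reported as 2000 and
 * flagged below as "(scaled from 50.00%)".
 */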
439
440 if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK) ||
441 event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) {
442
443 double msecs = (double)count[0] / 1000000;
444
445 fprintf(stderr, " %14.6f %-20s (msecs)",
446 msecs, event_name(counter));
447 } else {
448 fprintf(stderr, " %14Ld %-20s (events)",
449 count[0], event_name(counter));
450 }
451 if (scaled)
452 fprintf(stderr, " (scaled from %.2f%%)",
453 (double) count[2] / count[1] * 100);
454 fprintf(stderr, "\n");
455 }
456 fprintf(stderr, "\n");
457 fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
458 (double)(t1-t0)/1e6);
459 fprintf(stderr, "\n");
460
461 return 0;
462}
463
464static void process_options(int argc, char **argv)
465{
466 int error = 0, counter;
467
468 for (;;) {
469 int option_index = 0;
470 /** Options for getopt */
471 static struct option long_options[] = {
472 {"count", required_argument, NULL, 'c'},
473 {"cpu", required_argument, NULL, 'C'},
474 {"delay", required_argument, NULL, 'd'},
475 {"dump_symtab", no_argument, NULL, 'D'},
476 {"event", required_argument, NULL, 'e'},
477 {"filter", required_argument, NULL, 'f'},
478 {"group", required_argument, NULL, 'g'},
479 {"help", no_argument, NULL, 'h'},
480 {"nmi", required_argument, NULL, 'n'},
481 {"munmap_info", no_argument, NULL, 'U'},
482 {"pid", required_argument, NULL, 'p'},
483 {"realtime", required_argument, NULL, 'r'},
484 {"scale", no_argument, NULL, 'l'},
485 {"symbol", required_argument, NULL, 's'},
486 {"stat", no_argument, NULL, 'S'},
487 {"vmlinux", required_argument, NULL, 'x'},
488 {"zero", no_argument, NULL, 'z'},
489 {NULL, 0, NULL, 0 }
490 };
491 int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:r:s:Sx:zMU",
492 long_options, &option_index);
493 if (c == -1)
494 break;
495
496 switch (c) {
497 case 'a': system_wide = 1; break;
498 case 'c': default_interval = atoi(optarg); break;
499 case 'C':
500 /* CPU and PID are mutually exclusive */
501 if (tid != -1) {
502 printf("WARNING: CPU switch overriding PID\n");
503 sleep(1);
504 tid = -1;
505 }
506 profile_cpu = atoi(optarg); break;
507
508 case 'e': error = parse_events(optarg); break;
509
510 case 'g': group = atoi(optarg); break;
511 case 'h': display_help(); break;
512 case 'l': scale = 1; break;
513 case 'n': nmi = atoi(optarg); break;
514 case 'p':
515 /* CPU and PID are mutually exclusive */
516 if (profile_cpu != -1) {
517 printf("WARNING: PID switch overriding CPU\n");
518 sleep(1);
519 profile_cpu = -1;
520 }
521 tid = atoi(optarg); break;
522 case 'z': zero = 1; break;
523 default: error = 1; break;
524 }
525 }
526 if (error)
527 display_help();
528
529 if (!nr_counters) {
530 nr_counters = 8;
531 }
532
533 for (counter = 0; counter < nr_counters; counter++) {
534 if (event_count[counter])
535 continue;
536
537 event_count[counter] = default_interval;
538 }
539}
540
541static void skip_signal(int signo)
542{
543}
544
545int cmd_stat(int argc, char **argv, const char *prefix)
546{
547 sigset_t blocked;
548
549 page_size = sysconf(_SC_PAGE_SIZE);
550
551 process_options(argc, argv);
552
553 nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
554 assert(nr_cpus <= MAX_NR_CPUS);
555 assert(nr_cpus >= 0);
556
557 /*
558 * We don't want to block the signals - that would cause
559 * child tasks to inherit them and Ctrl-C would not work.
560 * What we want is for Ctrl-C to work in the exec()-ed
561 * task, while being ignored by perf stat itself:
562 */
563 signal(SIGINT, skip_signal);
564 signal(SIGALRM, skip_signal);
565 signal(SIGABRT, skip_signal);
566
567 return do_perfstat(argc, argv);
568}
diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
new file mode 100644
index 000000000000..814b2e4925e3
--- /dev/null
+++ b/Documentation/perf_counter/builtin-top.c
@@ -0,0 +1,1146 @@
1/*
2 * kerneltop.c: show top kernel functions - performance counters showcase
3
4 Build with:
5
6 make -C Documentation/perf_counter/
7
8 Sample output:
9
10------------------------------------------------------------------------------
11 KernelTop: 2669 irqs/sec [NMI, cache-misses/cache-refs], (all, cpu: 2)
12------------------------------------------------------------------------------
13
14 weight RIP kernel function
15 ______ ________________ _______________
16
17 35.20 - ffffffff804ce74b : skb_copy_and_csum_dev
18 33.00 - ffffffff804cb740 : sock_alloc_send_skb
19 31.26 - ffffffff804ce808 : skb_push
20 22.43 - ffffffff80510004 : tcp_established_options
21 19.00 - ffffffff8027d250 : find_get_page
22 15.76 - ffffffff804e4fc9 : eth_type_trans
23 15.20 - ffffffff804d8baa : dst_release
24 14.86 - ffffffff804cf5d8 : skb_release_head_state
25 14.00 - ffffffff802217d5 : read_hpet
26 12.00 - ffffffff804ffb7f : __ip_local_out
27 11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
28 8.54 - ffffffff805001a3 : ip_queue_xmit
29 */
30
31 /*
32 * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
33 *
34 * Improvements and fixes by:
35 *
36 * Arjan van de Ven <arjan@linux.intel.com>
37 * Yanmin Zhang <yanmin.zhang@intel.com>
38 * Wu Fengguang <fengguang.wu@intel.com>
39 * Mike Galbraith <efault@gmx.de>
40 * Paul Mackerras <paulus@samba.org>
41 *
42 * Released under the GPL v2. (and only v2, not any later version)
43 */
44
45#include "util/util.h"
46
47#include <getopt.h>
48#include <assert.h>
49#include <fcntl.h>
50#include <stdio.h>
51#include <errno.h>
52#include <time.h>
53#include <sched.h>
54#include <pthread.h>
55
56#include <sys/syscall.h>
57#include <sys/ioctl.h>
58#include <sys/poll.h>
59#include <sys/prctl.h>
60#include <sys/wait.h>
61#include <sys/uio.h>
62#include <sys/mman.h>
63
64#include <linux/unistd.h>
65#include <linux/types.h>
66
67#include "../../include/linux/perf_counter.h"
68
69#include "perf.h"
70
71static int system_wide = 0;
72
73static int nr_counters = 0;
74static __u64 event_id[MAX_COUNTERS] = {
75 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
76 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),
77 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
78 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),
79
80 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),
81 EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),
82 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),
83 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),
84};
85static int default_interval = 100000;
86static int event_count[MAX_COUNTERS];
87static int fd[MAX_NR_CPUS][MAX_COUNTERS];
88
89static __u64 count_filter = 100;
90
91static int tid = -1;
92static int profile_cpu = -1;
93static int nr_cpus = 0;
94static int nmi = 1;
95static unsigned int realtime_prio = 0;
96static int group = 0;
97static unsigned int page_size;
98static unsigned int mmap_pages = 16;
99static int use_mmap = 0;
100static int use_munmap = 0;
101static int freq = 0;
102
103static char *vmlinux;
104
105static char *sym_filter;
106static unsigned long filter_start;
107static unsigned long filter_end;
108
109static int delay_secs = 2;
110static int zero;
111static int dump_symtab;
112
113static int scale;
114
115struct source_line {
116 uint64_t EIP;
117 unsigned long count;
118 char *line;
119 struct source_line *next;
120};
121
122static struct source_line *lines;
123static struct source_line **lines_tail;
124
125static const unsigned int default_count[] = {
126 1000000,
127 1000000,
128 10000,
129 10000,
130 1000000,
131 10000,
132};
133
134static char *hw_event_names[] = {
135 "CPU cycles",
136 "instructions",
137 "cache references",
138 "cache misses",
139 "branches",
140 "branch misses",
141 "bus cycles",
142};
143
144static char *sw_event_names[] = {
145 "cpu clock ticks",
146 "task clock ticks",
147 "pagefaults",
148 "context switches",
149 "CPU migrations",
150 "minor faults",
151 "major faults",
152};
153
154struct event_symbol {
155 __u64 event;
156 char *symbol;
157};
158
159static struct event_symbol event_symbols[] = {
160 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cpu-cycles", },
161 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cycles", },
162 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), "instructions", },
163 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), "cache-references", },
164 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), "cache-misses", },
165 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branch-instructions", },
166 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branches", },
167 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES), "branch-misses", },
168 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES), "bus-cycles", },
169
170 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK), "cpu-clock", },
171 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), "task-clock", },
172 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "page-faults", },
173 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "faults", },
174 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN), "minor-faults", },
175 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ), "major-faults", },
176 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "context-switches", },
177 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "cs", },
178 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "cpu-migrations", },
179 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "migrations", },
180};
181
182#define __PERF_COUNTER_FIELD(config, name) \
183 ((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
184
185#define PERF_COUNTER_RAW(config) __PERF_COUNTER_FIELD(config, RAW)
186#define PERF_COUNTER_CONFIG(config) __PERF_COUNTER_FIELD(config, CONFIG)
187#define PERF_COUNTER_TYPE(config) __PERF_COUNTER_FIELD(config, TYPE)
188#define PERF_COUNTER_ID(config) __PERF_COUNTER_FIELD(config, EVENT)
189
190static void display_events_help(void)
191{
192 unsigned int i;
193 __u64 e;
194
195 printf(
196 " -e EVENT --event=EVENT # symbolic-name abbreviations");
197
198 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
199 int type, id;
200
201 e = event_symbols[i].event;
202 type = PERF_COUNTER_TYPE(e);
203 id = PERF_COUNTER_ID(e);
204
205 printf("\n %d:%d: %-20s",
206 type, id, event_symbols[i].symbol);
207 }
208
209 printf("\n"
210 " rNNN: raw PMU events (eventsel+umask)\n\n");
211}
212
213static void display_help(void)
214{
215 printf(
216 "Usage: kerneltop [<options>]\n"
217 " Or: kerneltop -S [<options>] COMMAND [ARGS]\n\n"
218 "KernelTop Options (up to %d event types can be specified at once):\n\n",
219 MAX_COUNTERS);
220
221 display_events_help();
222
223 printf(
224 " -c CNT --count=CNT # event period to sample\n\n"
225 " -C CPU --cpu=CPU # CPU (-1 for all) [default: -1]\n"
226 " -p PID --pid=PID # PID of sampled task (-1 for all) [default: -1]\n\n"
227 " -l # show scale factor for RR events\n"
228 " -d delay --delay=<seconds> # sampling/display delay [default: 2]\n"
229 " -f CNT --filter=CNT # min-event-count filter [default: 100]\n\n"
230 " -r prio --realtime=<prio> # event acquisition runs with SCHED_FIFO policy\n"
231 " -s symbol --symbol=<symbol> # function to be showed annotated one-shot\n"
232 " -x path --vmlinux=<path> # the vmlinux binary, required for -s use\n"
233 " -z --zero # zero counts after display\n"
234 " -D --dump_symtab # dump symbol table to stderr on startup\n"
235 " -m pages --mmap_pages=<pages> # number of mmap data pages\n"
236 " -M --mmap_info # print mmap info stream\n"
237 " -U --munmap_info # print munmap info stream\n"
238 );
239
240 exit(0);
241}
242
243static char *event_name(int ctr)
244{
245 __u64 config = event_id[ctr];
246 int type = PERF_COUNTER_TYPE(config);
247 int id = PERF_COUNTER_ID(config);
248 static char buf[32];
249
250 if (PERF_COUNTER_RAW(config)) {
251 sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config));
252 return buf;
253 }
254
255 switch (type) {
256 case PERF_TYPE_HARDWARE:
257 if (id < PERF_HW_EVENTS_MAX)
258 return hw_event_names[id];
259 return "unknown-hardware";
260
261 case PERF_TYPE_SOFTWARE:
262 if (id < PERF_SW_EVENTS_MAX)
263 return sw_event_names[id];
264 return "unknown-software";
265
266 default:
267 break;
268 }
269
270 return "unknown";
271}
272
273/*
274 * Each event can have multiple symbolic names.
275 * Symbolic names are (almost) exactly matched.
276 */
277static __u64 match_event_symbols(char *str)
278{
279 __u64 config, id;
280 int type;
281 unsigned int i;
282
283 if (sscanf(str, "r%llx", &config) == 1)
284 return config | PERF_COUNTER_RAW_MASK;
285
286 if (sscanf(str, "%d:%llu", &type, &id) == 2)
287 return EID(type, id);
288
289 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
290 if (!strncmp(str, event_symbols[i].symbol,
291 strlen(event_symbols[i].symbol)))
292 return event_symbols[i].event;
293 }
294
295 return ~0ULL;
296}
297
298static int parse_events(char *str)
299{
300 __u64 config;
301
302again:
303 if (nr_counters == MAX_COUNTERS)
304 return -1;
305
306 config = match_event_symbols(str);
307 if (config == ~0ULL)
308 return -1;
309
310 event_id[nr_counters] = config;
311 nr_counters++;
312
313 str = strstr(str, ",");
314 if (str) {
315 str++;
316 goto again;
317 }
318
319 return 0;
320}
321
322/*
323 * Symbols
324 */
325
326static uint64_t min_ip;
327static uint64_t max_ip = -1ll;
328
329struct sym_entry {
330 unsigned long long addr;
331 char *sym;
332 unsigned long count[MAX_COUNTERS];
333 int skip;
334 struct source_line *source;
335};
336
337#define MAX_SYMS 100000
338
339static int sym_table_count;
340
341struct sym_entry *sym_filter_entry;
342
343static struct sym_entry sym_table[MAX_SYMS];
344
345static void show_details(struct sym_entry *sym);
346
347/*
348 * Ordering weight: count-1 * count-2 * ... / count-n
349 */
350static double sym_weight(const struct sym_entry *sym)
351{
352 double weight;
353 int counter;
354
355 weight = sym->count[0];
356
357 for (counter = 1; counter < nr_counters-1; counter++)
358 weight *= sym->count[counter];
359
360 weight /= (sym->count[counter] + 1);
361
362 return weight;
363}
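/*
 * E.g. with two counters and counts {90, 9} the weight is 90 / (9 + 1) = 9;
 * the "+ 1" keeps symbols with a zero last count from dividing by zero.
 */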
364
365static int compare(const void *__sym1, const void *__sym2)
366{
367 const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
368
369 return sym_weight(sym1) < sym_weight(sym2);
370}
371
372static long events;
373static long userspace_events;
374static const char CONSOLE_CLEAR[] = "";
375
376static struct sym_entry tmp[MAX_SYMS];
377
378static void print_sym_table(void)
379{
380 int i, printed;
381 int counter;
382 float events_per_sec = events/delay_secs;
383 float kevents_per_sec = (events-userspace_events)/delay_secs;
384 float sum_kevents = 0.0;
385
386 events = userspace_events = 0;
387 memcpy(tmp, sym_table, sizeof(sym_table[0])*sym_table_count);
388 qsort(tmp, sym_table_count, sizeof(tmp[0]), compare);
389
390 for (i = 0; i < sym_table_count && tmp[i].count[0]; i++)
391 sum_kevents += tmp[i].count[0];
392
393 write(1, CONSOLE_CLEAR, strlen(CONSOLE_CLEAR));
394
395 printf(
396"------------------------------------------------------------------------------\n");
397 printf( " KernelTop:%8.0f irqs/sec kernel:%4.1f%% [%s, ",
398 events_per_sec,
399 100.0 - (100.0*((events_per_sec-kevents_per_sec)/events_per_sec)),
400 nmi ? "NMI" : "IRQ");
401
402 if (nr_counters == 1)
403 printf("%d ", event_count[0]);
404
405 for (counter = 0; counter < nr_counters; counter++) {
406 if (counter)
407 printf("/");
408
409 printf("%s", event_name(counter));
410 }
411
412 printf( "], ");
413
414 if (tid != -1)
415 printf(" (tid: %d", tid);
416 else
417 printf(" (all");
418
419 if (profile_cpu != -1)
420 printf(", cpu: %d)\n", profile_cpu);
421 else {
422 if (tid != -1)
423 printf(")\n");
424 else
425 printf(", %d CPUs)\n", nr_cpus);
426 }
427
428 printf("------------------------------------------------------------------------------\n\n");
429
430 if (nr_counters == 1)
431 printf(" events pcnt");
432 else
433 printf(" weight events pcnt");
434
435 printf(" RIP kernel function\n"
436 " ______ ______ _____ ________________ _______________\n\n"
437 );
438
439 for (i = 0, printed = 0; i < sym_table_count; i++) {
440 float pcnt;
441 int count;
442
443 if (printed <= 18 && tmp[i].count[0] >= count_filter) {
444 pcnt = 100.0 - (100.0*((sum_kevents-tmp[i].count[0])/sum_kevents));
445
446 if (nr_counters == 1)
447 printf("%19.2f - %4.1f%% - %016llx : %s\n",
448 sym_weight(tmp + i),
449 pcnt, tmp[i].addr, tmp[i].sym);
450 else
451 printf("%8.1f %10ld - %4.1f%% - %016llx : %s\n",
452 sym_weight(tmp + i),
453 tmp[i].count[0],
454 pcnt, tmp[i].addr, tmp[i].sym);
455 printed++;
456 }
457 /*
458 * Add decay to the counts:
459 */
460 for (count = 0; count < nr_counters; count++)
461 sym_table[i].count[count] = zero ? 0 : sym_table[i].count[count] * 7 / 8;
462 }
463
464 if (sym_filter_entry)
465 show_details(sym_filter_entry);
466
467 {
468 struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
469
470 if (poll(&stdin_poll, 1, 0) == 1) {
471 printf("key pressed - exiting.\n");
472 exit(0);
473 }
474 }
475}
476
477static void *display_thread(void *arg)
478{
479 printf("KernelTop refresh period: %d seconds\n", delay_secs);
480
481 while (!sleep(delay_secs))
482 print_sym_table();
483
484 return NULL;
485}
486
487static int read_symbol(FILE *in, struct sym_entry *s)
488{
489 static int filter_match = 0;
490 char *sym, stype;
491 char str[500];
492 int rc, pos;
493
494 rc = fscanf(in, "%llx %c %499s", &s->addr, &stype, str);
495 if (rc == EOF)
496 return -1;
497
498 assert(rc == 3);
499
500 /* skip until end of line: */
501 pos = strlen(str);
502 do {
503 rc = fgetc(in);
504 if (rc == '\n' || rc == EOF || pos >= 499)
505 break;
506 str[pos] = rc;
507 pos++;
508 } while (1);
509 str[pos] = 0;
510
511 sym = str;
512
513 /* Filter out known duplicates and non-text symbols. */
514 if (!strcmp(sym, "_text"))
515 return 1;
516 if (!min_ip && !strcmp(sym, "_stext"))
517 return 1;
518 if (!strcmp(sym, "_etext") || !strcmp(sym, "_sinittext"))
519 return 1;
520 if (stype != 'T' && stype != 't')
521 return 1;
522 if (!strncmp("init_module", sym, 11) || !strncmp("cleanup_module", sym, 14))
523 return 1;
524 if (strstr(sym, "_text_start") || strstr(sym, "_text_end"))
525 return 1;
526
527 s->sym = malloc(strlen(str)+1);
528 assert(s->sym);
529
530 strcpy((char *)s->sym, str);
531 s->skip = 0;
532
533 /* Tag events to be skipped. */
534 if (!strcmp("default_idle", s->sym) || !strcmp("cpu_idle", s->sym))
535 s->skip = 1;
536 else if (!strcmp("enter_idle", s->sym) || !strcmp("exit_idle", s->sym))
537 s->skip = 1;
538 else if (!strcmp("mwait_idle", s->sym))
539 s->skip = 1;
540
541 if (filter_match == 1) {
542 filter_end = s->addr;
543 filter_match = -1;
544 if (filter_end - filter_start > 10000) {
545 printf("hm, too large filter symbol <%s> - skipping.\n",
546 sym_filter);
547 printf("symbol filter start: %016lx\n", filter_start);
548 printf(" end: %016lx\n", filter_end);
549 filter_end = filter_start = 0;
550 sym_filter = NULL;
551 sleep(1);
552 }
553 }
554 if (filter_match == 0 && sym_filter && !strcmp(s->sym, sym_filter)) {
555 filter_match = 1;
556 filter_start = s->addr;
557 }
558
559 return 0;
560}
561
562static int compare_addr(const void *__sym1, const void *__sym2)
563{
564 const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
565
566 return sym1->addr > sym2->addr;
567}
568
569static void sort_symbol_table(void)
570{
571 int i, dups;
572
573 do {
574 qsort(sym_table, sym_table_count, sizeof(sym_table[0]), compare_addr);
575 for (i = 0, dups = 0; i < sym_table_count - 1; i++) {
576 if (sym_table[i].addr == sym_table[i+1].addr) {
577 sym_table[i+1].addr = -1ll;
578 dups++;
579 }
580 }
581 sym_table_count -= dups;
582 } while(dups);
583}
584
585static void parse_symbols(void)
586{
587 struct sym_entry *last;
588
589 FILE *kallsyms = fopen("/proc/kallsyms", "r");
590
591 if (!kallsyms) {
592 printf("Could not open /proc/kallsyms - no CONFIG_KALLSYMS_ALL=y?\n");
593 exit(-1);
594 }
595
596 while (!feof(kallsyms)) {
597 if (read_symbol(kallsyms, &sym_table[sym_table_count]) == 0) {
598 sym_table_count++;
599 assert(sym_table_count <= MAX_SYMS);
600 }
601 }
602
603 sort_symbol_table();
604 min_ip = sym_table[0].addr;
605 max_ip = sym_table[sym_table_count-1].addr;
606 last = sym_table + sym_table_count++;
607
608 last->addr = -1ll;
609 last->sym = "<end>";
610
611 if (filter_end) {
612 int count;
613 for (count=0; count < sym_table_count; count ++) {
614 if (!strcmp(sym_table[count].sym, sym_filter)) {
615 sym_filter_entry = &sym_table[count];
616 break;
617 }
618 }
619 }
620 if (dump_symtab) {
621 int i;
622
623 for (i = 0; i < sym_table_count; i++)
624 fprintf(stderr, "%llx %s\n",
625 sym_table[i].addr, sym_table[i].sym);
626 }
627}
628
629/*
630 * Source lines
631 */
632
633static void parse_vmlinux(char *filename)
634{
635 FILE *file;
636 char command[PATH_MAX*2];
637 if (!filename)
638 return;
639
640 sprintf(command, "objdump --start-address=0x%016lx --stop-address=0x%016lx -dS %s", filter_start, filter_end, filename);
641
642 file = popen(command, "r");
643 if (!file)
644 return;
645
646 lines_tail = &lines;
647 while (!feof(file)) {
648 struct source_line *src;
649 size_t dummy = 0;
650 char *c;
651
652 src = malloc(sizeof(struct source_line));
653 assert(src != NULL);
654 memset(src, 0, sizeof(struct source_line));
655
656 if (getline(&src->line, &dummy, file) < 0)
657 break;
658 if (!src->line)
659 break;
660
661 c = strchr(src->line, '\n');
662 if (c)
663 *c = 0;
664
665 src->next = NULL;
666 *lines_tail = src;
667 lines_tail = &src->next;
668
669 if (strlen(src->line)>8 && src->line[8] == ':')
670 src->EIP = strtoull(src->line, NULL, 16);
671 if (strlen(src->line)>16 && src->line[16] == ':')
672 src->EIP = strtoull(src->line, NULL, 16);
673 }
674 pclose(file);
675}
676
677static void record_precise_ip(uint64_t ip)
678{
679 struct source_line *line;
680
681 for (line = lines; line; line = line->next) {
682 if (line->EIP == ip)
683 line->count++;
684 if (line->EIP > ip)
685 break;
686 }
687}
688
689static void lookup_sym_in_vmlinux(struct sym_entry *sym)
690{
691 struct source_line *line;
692 char pattern[PATH_MAX];
693 sprintf(pattern, "<%s>:", sym->sym);
694
695 for (line = lines; line; line = line->next) {
696 if (strstr(line->line, pattern)) {
697 sym->source = line;
698 break;
699 }
700 }
701}
702
703static void show_lines(struct source_line *line_queue, int line_queue_count)
704{
705 int i;
706 struct source_line *line;
707
708 line = line_queue;
709 for (i = 0; i < line_queue_count; i++) {
710 printf("%8li\t%s\n", line->count, line->line);
711 line = line->next;
712 }
713}
714
715#define TRACE_COUNT 3
716
717static void show_details(struct sym_entry *sym)
718{
719 struct source_line *line;
720 struct source_line *line_queue = NULL;
721 int displayed = 0;
722 int line_queue_count = 0;
723
724 if (!sym->source)
725 lookup_sym_in_vmlinux(sym);
726 if (!sym->source)
727 return;
728
729 printf("Showing details for %s\n", sym->sym);
730
731 line = sym->source;
732 while (line) {
733 if (displayed && strstr(line->line, ">:"))
734 break;
735
736 if (!line_queue_count)
737 line_queue = line;
738 line_queue_count ++;
739
740 if (line->count >= count_filter) {
741 show_lines(line_queue, line_queue_count);
742 line_queue_count = 0;
743 line_queue = NULL;
744 } else if (line_queue_count > TRACE_COUNT) {
745 line_queue = line_queue->next;
746 line_queue_count --;
747 }
748
749 line->count = 0;
750 displayed++;
751 if (displayed > 300)
752 break;
753 line = line->next;
754 }
755}
756
757/*
758 * Binary search in the histogram table and record the hit:
759 */
760static void record_ip(uint64_t ip, int counter)
761{
762 int left_idx, middle_idx, right_idx, idx;
763 unsigned long left, middle, right;
764
765 record_precise_ip(ip);
766
767 left_idx = 0;
768 right_idx = sym_table_count-1;
769 assert(ip <= max_ip && ip >= min_ip);
770
771 while (left_idx + 1 < right_idx) {
772 middle_idx = (left_idx + right_idx) / 2;
773
774 left = sym_table[ left_idx].addr;
775 middle = sym_table[middle_idx].addr;
776 right = sym_table[ right_idx].addr;
777
778 if (!(left <= middle && middle <= right)) {
779 printf("%016lx...\n%016lx...\n%016lx\n", left, middle, right);
780 printf("%d %d %d\n", left_idx, middle_idx, right_idx);
781 }
782 assert(left <= middle && middle <= right);
783 if (!(left <= ip && ip <= right)) {
784 printf(" left: %016lx\n", left);
785 printf(" ip: %016lx\n", (unsigned long)ip);
786 printf("right: %016lx\n", right);
787 }
788 assert(left <= ip && ip <= right);
789 /*
790 * [ left .... target .... middle .... right ]
791 * => right := middle
792 */
793 if (ip < middle) {
794 right_idx = middle_idx;
795 continue;
796 }
797 /*
798 * [ left .... middle ... target ... right ]
799 * => left := middle
800 */
801 left_idx = middle_idx;
802 }
803
804 idx = left_idx;
805
806 if (!sym_table[idx].skip)
807 sym_table[idx].count[counter]++;
808 else events--;
809}
810
811static void process_event(uint64_t ip, int counter)
812{
813 events++;
814
815 if (ip < min_ip || ip > max_ip) {
816 userspace_events++;
817 return;
818 }
819
820 record_ip(ip, counter);
821}
822
823static void process_options(int argc, char **argv)
824{
825 int error = 0, counter;
826
827 for (;;) {
828 int option_index = 0;
829 /** Options for getopt */
830 static struct option long_options[] = {
831 {"count", required_argument, NULL, 'c'},
832 {"cpu", required_argument, NULL, 'C'},
833 {"delay", required_argument, NULL, 'd'},
834 {"dump_symtab", no_argument, NULL, 'D'},
835 {"event", required_argument, NULL, 'e'},
836 {"filter", required_argument, NULL, 'f'},
837 {"group", required_argument, NULL, 'g'},
838 {"help", no_argument, NULL, 'h'},
839 {"nmi", required_argument, NULL, 'n'},
840 {"mmap_info", no_argument, NULL, 'M'},
841 {"mmap_pages", required_argument, NULL, 'm'},
842 {"munmap_info", no_argument, NULL, 'U'},
843 {"pid", required_argument, NULL, 'p'},
844 {"realtime", required_argument, NULL, 'r'},
845 {"scale", no_argument, NULL, 'l'},
846 {"symbol", required_argument, NULL, 's'},
847 {"stat", no_argument, NULL, 'S'},
848 {"vmlinux", required_argument, NULL, 'x'},
849 {"zero", no_argument, NULL, 'z'},
850 {"freq", required_argument, NULL, 'F'},
851 {NULL, 0, NULL, 0 }
852 };
853 int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:r:s:Sx:zMUF:",
854 long_options, &option_index);
855 if (c == -1)
856 break;
857
858 switch (c) {
859 case 'a': system_wide = 1; break;
860 case 'c': default_interval = atoi(optarg); break;
861 case 'C':
862 /* CPU and PID are mutually exclusive */
863 if (tid != -1) {
864 printf("WARNING: CPU switch overriding PID\n");
865 sleep(1);
866 tid = -1;
867 }
868 profile_cpu = atoi(optarg); break;
869 case 'd': delay_secs = atoi(optarg); break;
870 case 'D': dump_symtab = 1; break;
871
872 case 'e': error = parse_events(optarg); break;
873
874 case 'f': count_filter = atoi(optarg); break;
875 case 'g': group = atoi(optarg); break;
876 case 'h': display_help(); break;
877 case 'l': scale = 1; break;
878 case 'n': nmi = atoi(optarg); break;
879 case 'p':
880 /* CPU and PID are mutually exclusive */
881 if (profile_cpu != -1) {
882 printf("WARNING: PID switch overriding CPU\n");
883 sleep(1);
884 profile_cpu = -1;
885 }
886 tid = atoi(optarg); break;
887 case 'r': realtime_prio = atoi(optarg); break;
888 case 's': sym_filter = strdup(optarg); break;
889 case 'x': vmlinux = strdup(optarg); break;
890 case 'z': zero = 1; break;
891 case 'm': mmap_pages = atoi(optarg); break;
892 case 'M': use_mmap = 1; break;
893 case 'U': use_munmap = 1; break;
894 case 'F': freq = 1; default_interval = atoi(optarg); break;
895 default: error = 1; break;
896 }
897 }
898 if (error)
899 display_help();
900
901 if (!nr_counters) {
902 nr_counters = 1;
903 event_id[0] = 0;
904 }
905
906 for (counter = 0; counter < nr_counters; counter++) {
907 if (event_count[counter])
908 continue;
909
910 event_count[counter] = default_interval;
911 }
912}
913
914struct mmap_data {
915 int counter;
916 void *base;
917 unsigned int mask;
918 unsigned int prev;
919};
920
921static unsigned int mmap_read_head(struct mmap_data *md)
922{
923 struct perf_counter_mmap_page *pc = md->base;
924 int head;
925
926 head = pc->data_head;
927 rmb();
928
929 return head;
930}
931
932struct timeval last_read, this_read;
933
934static void mmap_read(struct mmap_data *md)
935{
936 unsigned int head = mmap_read_head(md);
937 unsigned int old = md->prev;
938 unsigned char *data = md->base + page_size;
939 int diff;
940
941 gettimeofday(&this_read, NULL);
942
943 /*
944 * If we're further behind than half the buffer, there's a chance
945 * the writer will bite our tail and screw up the events under us.
946 *
947 * If we somehow ended up ahead of the head, we got messed up.
948 *
949 * In either case, truncate and restart at head.
950 */
951 diff = head - old;
952 if (diff > md->mask / 2 || diff < 0) {
953 struct timeval iv;
954 unsigned long msecs;
955
956 timersub(&this_read, &last_read, &iv);
957 msecs = iv.tv_sec*1000 + iv.tv_usec/1000;
958
959 fprintf(stderr, "WARNING: failed to keep up with mmap data."
960 " Last read %lu msecs ago.\n", msecs);
961
962 /*
963 * head points to a known good entry, start there.
964 */
965 old = head;
966 }
967
968 last_read = this_read;
969
970 for (; old != head;) {
971 struct ip_event {
972 struct perf_event_header header;
973 __u64 ip;
974 __u32 pid, tid;
975 };
976 struct mmap_event {
977 struct perf_event_header header;
978 __u32 pid, tid;
979 __u64 start;
980 __u64 len;
981 __u64 pgoff;
982 char filename[PATH_MAX];
983 };
984
985 typedef union event_union {
986 struct perf_event_header header;
987 struct ip_event ip;
988 struct mmap_event mmap;
989 } event_t;
990
991 event_t *event = (event_t *)&data[old & md->mask];
992
993 event_t event_copy;
994
995 size_t size = event->header.size;
996
997 /*
998 * Event straddles the mmap boundary -- header should always
999 * be inside due to u64 alignment of output.
1000 */
1001 if ((old & md->mask) + size != ((old + size) & md->mask)) {
1002 unsigned int offset = old;
1003 unsigned int len = min(sizeof(*event), size), cpy;
1004 void *dst = &event_copy;
1005
1006 do {
1007 cpy = min(md->mask + 1 - (offset & md->mask), len);
1008 memcpy(dst, &data[offset & md->mask], cpy);
1009 offset += cpy;
1010 dst += cpy;
1011 len -= cpy;
1012 } while (len);
1013
1014 event = &event_copy;
1015 }
1016
1017 old += size;
1018
1019 if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) {
1020 if (event->header.type & PERF_RECORD_IP)
1021 process_event(event->ip.ip, md->counter);
1022 } else {
1023 switch (event->header.type) {
1024 case PERF_EVENT_MMAP:
1025 case PERF_EVENT_MUNMAP:
1026 printf("%s: %Lu %Lu %Lu %s\n",
1027 event->header.type == PERF_EVENT_MMAP
1028 ? "mmap" : "munmap",
1029 event->mmap.start,
1030 event->mmap.len,
1031 event->mmap.pgoff,
1032 event->mmap.filename);
1033 break;
1034 }
1035 }
1036 }
1037
1038 md->prev = old;
1039}
1040
1041int cmd_top(int argc, char **argv, const char *prefix)
1042{
1043 struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
1044 struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
1045 struct perf_counter_hw_event hw_event;
1046 pthread_t thread;
1047 int i, counter, group_fd, nr_poll = 0;
1048 unsigned int cpu;
1049 int ret;
1050
1051 page_size = sysconf(_SC_PAGE_SIZE);
1052
1053 process_options(argc, argv);
1054
1055 nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
1056 assert(nr_cpus <= MAX_NR_CPUS);
1057 assert(nr_cpus >= 0);
1058
1059 if (tid != -1 || profile_cpu != -1)
1060 nr_cpus = 1;
1061
1062 parse_symbols();
1063 if (vmlinux && sym_filter_entry)
1064 parse_vmlinux(vmlinux);
1065
1066 for (i = 0; i < nr_cpus; i++) {
1067 group_fd = -1;
1068 for (counter = 0; counter < nr_counters; counter++) {
1069
1070 cpu = profile_cpu;
1071 if (tid == -1 && profile_cpu == -1)
1072 cpu = i;
1073
1074 memset(&hw_event, 0, sizeof(hw_event));
1075 hw_event.config = event_id[counter];
1076 hw_event.irq_period = event_count[counter];
1077 hw_event.record_type = PERF_RECORD_IP | PERF_RECORD_TID;
1078 hw_event.nmi = nmi;
1079 hw_event.mmap = use_mmap;
1080 hw_event.munmap = use_munmap;
1081 hw_event.freq = freq;
1082
1083 fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0);
1084 if (fd[i][counter] < 0) {
1085 int err = errno;
1086 printf("kerneltop error: syscall returned with %d (%s)\n",
1087 fd[i][counter], strerror(err));
1088 if (err == EPERM)
1089 printf("Are you root?\n");
1090 exit(-1);
1091 }
1092 assert(fd[i][counter] >= 0);
1093 fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
1094
1095 /*
1096 * First counter acts as the group leader:
1097 */
1098 if (group && group_fd == -1)
1099 group_fd = fd[i][counter];
1100
1101 event_array[nr_poll].fd = fd[i][counter];
1102 event_array[nr_poll].events = POLLIN;
1103 nr_poll++;
1104
1105 mmap_array[i][counter].counter = counter;
1106 mmap_array[i][counter].prev = 0;
1107 mmap_array[i][counter].mask = mmap_pages*page_size - 1;
1108 mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
1109 PROT_READ, MAP_SHARED, fd[i][counter], 0);
1110 if (mmap_array[i][counter].base == MAP_FAILED) {
1111 printf("kerneltop error: failed to mmap with %d (%s)\n",
1112 errno, strerror(errno));
1113 exit(-1);
1114 }
1115 }
1116 }
1117
1118 if (pthread_create(&thread, NULL, display_thread, NULL)) {
1119 printf("Could not create display thread.\n");
1120 exit(-1);
1121 }
1122
1123 if (realtime_prio) {
1124 struct sched_param param;
1125
1126 param.sched_priority = realtime_prio;
1127 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
1128 printf("Could not set realtime priority.\n");
1129 exit(-1);
1130 }
1131 }
1132
1133 while (1) {
1134 int hits = events;
1135
1136 for (i = 0; i < nr_cpus; i++) {
1137 for (counter = 0; counter < nr_counters; counter++)
1138 mmap_read(&mmap_array[i][counter]);
1139 }
1140
1141 if (hits == events)
1142 ret = poll(event_array, nr_poll, 100);
1143 }
1144
1145 return 0;
1146}
diff --git a/Documentation/perf_counter/builtin.h b/Documentation/perf_counter/builtin.h
new file mode 100644
index 000000000000..d32318aed8cf
--- /dev/null
+++ b/Documentation/perf_counter/builtin.h
@@ -0,0 +1,22 @@
1#ifndef BUILTIN_H
2#define BUILTIN_H
3
4#include "util/util.h"
5#include "util/strbuf.h"
6
7extern const char perf_version_string[];
8extern const char perf_usage_string[];
9extern const char perf_more_info_string[];
10
11extern void list_common_cmds_help(void);
12extern const char *help_unknown_cmd(const char *cmd);
13extern void prune_packed_objects(int);
14extern int read_line_with_nul(char *buf, int size, FILE *file);
15extern int check_pager_config(const char *cmd);
16
17extern int cmd_help(int argc, const char **argv, const char *prefix);
18extern int cmd_record(int argc, const char **argv, const char *prefix);
19extern int cmd_stat(int argc, const char **argv, const char *prefix);
20extern int cmd_top(int argc, const char **argv, const char *prefix);
21extern int cmd_version(int argc, const char **argv, const char *prefix);
22#endif
diff --git a/Documentation/perf_counter/command-list.txt b/Documentation/perf_counter/command-list.txt
new file mode 100644
index 000000000000..d15210aa0cae
--- /dev/null
+++ b/Documentation/perf_counter/command-list.txt
@@ -0,0 +1,6 @@
1# List of known perf commands.
2# command name category [deprecated] [common]
3perf-record mainporcelain common
4perf-stat mainporcelain common
5perf-top mainporcelain common
6
diff --git a/Documentation/perf_counter/design.txt b/Documentation/perf_counter/design.txt
new file mode 100644
index 000000000000..9930c4bddc6f
--- /dev/null
+++ b/Documentation/perf_counter/design.txt
@@ -0,0 +1,449 @@
1
2Performance Counters for Linux
3------------------------------
4
5Performance counters are special hardware registers available on most modern
6CPUs. These registers count the number of certain types of hw events, such
7as instructions executed, cache misses suffered, or branches mis-predicted -
8without slowing down the kernel or applications. These registers can also
9trigger interrupts when a threshold number of events have passed - and can
10thus be used to profile the code that runs on that CPU.
11
12The Linux Performance Counter subsystem provides an abstraction of these
13hardware capabilities. It provides per task and per CPU counters, counter
14groups, and event capabilities on top of those. It
15provides "virtual" 64-bit counters, regardless of the width of the
16underlying hardware counters.
17
18Performance counters are accessed via special file descriptors.
19There's one file descriptor per virtual counter used.
20
21The special file descriptor is opened via the perf_counter_open()
22system call:
23
24 int sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr,
25 pid_t pid, int cpu, int group_fd,
26 unsigned long flags);
27
28The syscall returns the new fd. The fd can be used via the normal
29VFS system calls: read() can be used to read the counter, fcntl()
30can be used to set the blocking mode, etc.
31
32Multiple counters can be kept open at a time, and the counters
33can be poll()ed.
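
For illustration, a minimal sketch of counting instructions executed by the
current task (error handling omitted; PERF_COUNT_INSTRUCTIONS is one of the
generalized hardware events described below, and the variable names are
illustrative):

    struct perf_counter_hw_event hw_event;
    unsigned long long count;
    int fd;

    memset(&hw_event, 0, sizeof(hw_event));
    hw_event.config = PERF_COUNT_INSTRUCTIONS;  /* PERF_TYPE_HARDWARE event */

    fd = sys_perf_counter_open(&hw_event, 0 /* pid: current task */,
                               -1 /* cpu: any */, -1 /* no group */, 0);

    /* ... code to be measured ... */

    read(fd, &count, sizeof(count));            /* current counter value */
    close(fd);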
34
35When creating a new counter fd, 'perf_counter_hw_event' is:
36
37struct perf_counter_hw_event {
38 /*
39 * The MSB of the config word signifies if the rest contains cpu
40 * specific (raw) counter configuration data, if unset, the next
41 * 7 bits are an event type and the rest of the bits are the event
42 * identifier.
43 */
44 __u64 config;
45
46 __u64 irq_period;
47 __u32 record_type;
48 __u32 read_format;
49
50 __u64 disabled : 1, /* off by default */
51 nmi : 1, /* NMI sampling */
52 inherit : 1, /* children inherit it */
53 pinned : 1, /* must always be on PMU */
54 exclusive : 1, /* only group on PMU */
55 exclude_user : 1, /* don't count user */
56 exclude_kernel : 1, /* ditto kernel */
57 exclude_hv : 1, /* ditto hypervisor */
58 exclude_idle : 1, /* don't count when idle */
59 mmap : 1, /* include mmap data */
60 munmap : 1, /* include munmap data */
61 comm : 1, /* include comm data */
62
63 __reserved_1 : 52;
64
65 __u32 extra_config_len;
66 __u32 wakeup_events; /* wakeup every n events */
67
68 __u64 __reserved_2;
69 __u64 __reserved_3;
70};
71
72The 'config' field specifies what the counter should count. It
73is divided into 3 bit-fields:
74
75raw_type: 1 bit (most significant bit) 0x8000_0000_0000_0000
76type: 7 bits (next most significant) 0x7f00_0000_0000_0000
77event_id: 56 bits (least significant) 0x00ff_ffff_ffff_ffff
78
79If 'raw_type' is 1, then the counter will count a hardware event
80specified by the remaining 63 bits of 'config'. The encoding is
81machine-specific.
82
83If 'raw_type' is 0, then the 'type' field says what kind of counter
84this is, with the following encoding:
85
86enum perf_event_types {
87 PERF_TYPE_HARDWARE = 0,
88 PERF_TYPE_SOFTWARE = 1,
89 PERF_TYPE_TRACEPOINT = 2,
90};
91
92A counter of PERF_TYPE_HARDWARE will count the hardware event
93specified by 'event_id':
94
95/*
96 * Generalized performance counter event types, used by the hw_event.event_id
97 * parameter of the sys_perf_counter_open() syscall:
98 */
99enum hw_event_ids {
100 /*
101 * Common hardware events, generalized by the kernel:
102 */
103 PERF_COUNT_CPU_CYCLES = 0,
104 PERF_COUNT_INSTRUCTIONS = 1,
105 PERF_COUNT_CACHE_REFERENCES = 2,
106 PERF_COUNT_CACHE_MISSES = 3,
107 PERF_COUNT_BRANCH_INSTRUCTIONS = 4,
108 PERF_COUNT_BRANCH_MISSES = 5,
109 PERF_COUNT_BUS_CYCLES = 6,
110};
111
112These are standardized types of events that work relatively uniformly
113on all CPUs that implement Performance Counters support under Linux,
114although there may be variations (e.g., different CPUs might count
115cache references and misses at different levels of the cache hierarchy).
116If a CPU is not able to count the selected event, then the system call
117will return -EINVAL.
118
119More hw_event_types are supported as well, but they are CPU-specific
120and accessed as raw events. For example, to count "External bus
121cycles while bus lock signal asserted" events on Intel Core CPUs, pass
122in a 0x4064 event_id value and set hw_event.raw_type to 1.
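
One way to express this, given the 'config' layout above (the MSB acting as
the raw_type bit and the low bits carrying the machine-specific event code):

    memset(&hw_event, 0, sizeof(hw_event));
    hw_event.config = (1ULL << 63) | 0x4064;    /* raw bit + raw event code */

    fd = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);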
123
124A counter of type PERF_TYPE_SOFTWARE will count one of the available
125software events, selected by 'event_id':
126
127/*
128 * Special "software" counters provided by the kernel, even if the hardware
129 * does not support performance counters. These counters measure various
130 * physical and sw events of the kernel (and allow the profiling of them as
131 * well):
132 */
133enum sw_event_ids {
134 PERF_COUNT_CPU_CLOCK = 0,
135 PERF_COUNT_TASK_CLOCK = 1,
136 PERF_COUNT_PAGE_FAULTS = 2,
137 PERF_COUNT_CONTEXT_SWITCHES = 3,
138 PERF_COUNT_CPU_MIGRATIONS = 4,
139 PERF_COUNT_PAGE_FAULTS_MIN = 5,
140 PERF_COUNT_PAGE_FAULTS_MAJ = 6,
141};
142
143Counters of the type PERF_TYPE_TRACEPOINT are available when the ftrace event
144tracer is available, and event_id values can be obtained from
145/debug/tracing/events/*/*/id
146
147
148Counters come in two flavours: counting counters and sampling
149counters. A "counting" counter is one that is used for counting the
150number of events that occur, and is characterised by having
151irq_period = 0.
152
153
154A read() on a counter returns the current value of the counter and possible
155additional values as specified by 'read_format'; each value is a u64 (8 bytes)
156in size.
157
158/*
159 * Bits that can be set in hw_event.read_format to request that
160 * reads on the counter should return the indicated quantities,
161 * in increasing order of bit value, after the counter value.
162 */
163enum perf_counter_read_format {
164 PERF_FORMAT_TOTAL_TIME_ENABLED = 1,
165 PERF_FORMAT_TOTAL_TIME_RUNNING = 2,
166};
167
168Using these additional values one can establish the overcommit ratio for a
169particular counter, allowing one to take the round-robin scheduling effect
170into account.
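
For example, a sketch of reading a counter opened with both format bits set
and scaling the value by the time the counter was actually on the PMU
(structure and names are illustrative):

    struct {
        unsigned long long count, time_enabled, time_running;
    } res;

    read(fd, &res, sizeof(res));
    if (res.time_running)
        res.count = res.count * res.time_enabled / res.time_running;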
171
172
173A "sampling" counter is one that is set up to generate an interrupt
174every N events, where N is given by 'irq_period'. A sampling counter
175has irq_period > 0. The record_type controls what data is recorded on each
176interrupt:
177
178/*
179 * Bits that can be set in hw_event.record_type to request information
180 * in the overflow packets.
181 */
182enum perf_counter_record_format {
183 PERF_RECORD_IP = 1U << 0,
184 PERF_RECORD_TID = 1U << 1,
185 PERF_RECORD_TIME = 1U << 2,
186 PERF_RECORD_ADDR = 1U << 3,
187 PERF_RECORD_GROUP = 1U << 4,
188 PERF_RECORD_CALLCHAIN = 1U << 5,
189};
190
191Such (and other) events will be recorded in a ring-buffer, which is
192available to user-space using mmap() (see below).
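
For example, a sketch of configuring a sampling counter that records the IP
and TID of the running task every 10000 CPU cycles:

    memset(&hw_event, 0, sizeof(hw_event));
    hw_event.config      = PERF_COUNT_CPU_CYCLES;
    hw_event.irq_period  = 10000;
    hw_event.record_type = PERF_RECORD_IP | PERF_RECORD_TID;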
193
194The 'disabled' bit specifies whether the counter starts out disabled
195or enabled. If it is initially disabled, it can be enabled by ioctl
196or prctl (see below).
197
198The 'nmi' bit specifies, for hardware events, whether the counter
199should be set up to request non-maskable interrupts (NMIs) or normal
200interrupts. This bit is ignored if the user doesn't have
201CAP_SYS_ADMIN privilege (i.e. is not root) or if the CPU doesn't
202generate NMIs from hardware counters.
203
204The 'inherit' bit, if set, specifies that this counter should count
205events on descendant tasks as well as the task specified. This only
206applies to new descendants, not to any existing descendants at the
207time the counter is created (nor to any new descendants of existing
208descendants).
209
210The 'pinned' bit, if set, specifies that the counter should always be
211on the CPU if at all possible. It only applies to hardware counters
212and only to group leaders. If a pinned counter cannot be put onto the
213CPU (e.g. because there are not enough hardware counters or because of
214a conflict with some other event), then the counter goes into an
215'error' state, where reads return end-of-file (i.e. read() returns 0)
216until the counter is subsequently enabled or disabled.
217
218The 'exclusive' bit, if set, specifies that when this counter's group
219is on the CPU, it should be the only group using the CPU's counters.
220In future, this will allow sophisticated monitoring programs to supply
221extra configuration information via 'extra_config_len' to exploit
222advanced features of the CPU's Performance Monitor Unit (PMU) that are
223not otherwise accessible and that might disrupt other hardware
224counters.
225
226The 'exclude_user', 'exclude_kernel' and 'exclude_hv' bits provide a
227way to request that counting of events be restricted to times when the
228CPU is in user, kernel and/or hypervisor mode.
229
230The 'mmap' and 'munmap' bits allow recording of PROT_EXEC mmap/munmap
231operations. These can be used to relate userspace IP addresses to actual
232code, even after the mapping (or even the whole process) is gone; these
233events are recorded in the ring-buffer (see below).
234
235The 'comm' bit allows tracking of process comm data on process creation.
236This too is recorded in the ring-buffer (see below).
237
238The 'pid' parameter to the perf_counter_open() system call allows the
239counter to be specific to a task:
240
241 pid == 0: if the pid parameter is zero, the counter is attached to the
242 current task.
243
244 pid > 0: the counter is attached to a specific task (if the current task
245 has sufficient privilege to do so)
246
247 pid < 0: all tasks are counted (per cpu counters)
248
249The 'cpu' parameter allows a counter to be made specific to a CPU:
250
251 cpu >= 0: the counter is restricted to a specific CPU
252 cpu == -1: the counter counts on all CPUs
253
254(Note: the combination of 'pid == -1' and 'cpu == -1' is not valid.)
255
256A 'pid > 0' and 'cpu == -1' counter is a per task counter that counts
257events of that task and 'follows' that task to whatever CPU the task
258gets scheduled to. Per task counters can be created by any user, for
259their own tasks.
260
261A 'pid == -1' and 'cpu == x' counter is a per CPU counter that counts
262all events on CPU-x. Per CPU counters need CAP_SYS_ADMIN privilege.
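
For example, a sketch of a system-wide cycle counter on CPU 2 (requires
CAP_SYS_ADMIN):

    memset(&hw_event, 0, sizeof(hw_event));
    hw_event.config = PERF_COUNT_CPU_CYCLES;
    fd = sys_perf_counter_open(&hw_event, -1 /* all tasks */,
                               2 /* CPU 2 */, -1, 0);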
263
264The 'flags' parameter is currently unused and must be zero.
265
266The 'group_fd' parameter allows counter "groups" to be set up. A
267counter group has one counter which is the group "leader". The leader
268is created first, with group_fd = -1 in the perf_counter_open call
269that creates it. The rest of the group members are created
270subsequently, with group_fd giving the fd of the group leader.
271(A single counter on its own is created with group_fd = -1 and is
272considered to be a group with only 1 member.)
273
274A counter group is scheduled onto the CPU as a unit, that is, it will
275only be put onto the CPU if all of the counters in the group can be
276put onto the CPU. This means that the values of the member counters
277can be meaningfully compared, added, divided (to get ratios), etc.,
278with each other, since they have counted events for the same set of
279executed instructions.
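
For example, a sketch of a two-counter group on the current task, with the
cycle counter as group leader; the two values can then be combined into,
e.g., an instructions-per-cycle ratio:

    memset(&hw_event, 0, sizeof(hw_event));
    hw_event.config = PERF_COUNT_CPU_CYCLES;
    cycles_fd = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);

    hw_event.config = PERF_COUNT_INSTRUCTIONS;
    insns_fd = sys_perf_counter_open(&hw_event, 0, -1, cycles_fd, 0);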
280
281
282As stated above, asynchronous events, such as counter overflow or PROT_EXEC
283mmap tracking, are logged into a ring-buffer. This ring-buffer is created and
284accessed through mmap().
285
286The mmap size should be 1+2^n pages, where the first page is a meta-data page
287(struct perf_counter_mmap_page) that contains various bits of information such
288as where the ring-buffer head is.
289
290/*
291 * Structure of the page that can be mapped via mmap
292 */
293struct perf_counter_mmap_page {
294 __u32 version; /* version number of this structure */
295 __u32 compat_version; /* lowest version this is compat with */
296
297 /*
298 * Bits needed to read the hw counters in user-space.
299 *
300 * u32 seq;
301 * s64 count;
302 *
303 * do {
304 * seq = pc->lock;
305 *
306 * barrier()
307 * if (pc->index) {
308 * count = pmc_read(pc->index - 1);
309 * count += pc->offset;
310 * } else
311 * goto regular_read;
312 *
313 * barrier();
314 * } while (pc->lock != seq);
315 *
316 * NOTE: for obvious reason this only works on self-monitoring
317 * processes.
318 */
319 __u32 lock; /* seqlock for synchronization */
320 __u32 index; /* hardware counter identifier */
321 __s64 offset; /* add to hardware counter value */
322
323 /*
324 * Control data for the mmap() data buffer.
325 *
326 * User-space reading this value should issue an rmb(), on SMP capable
327 * platforms, after reading this value -- see perf_counter_wakeup().
328 */
329 __u32 data_head; /* head in the data section */
330};
331
332NOTE: the hw-counter userspace bits are arch specific and are currently only
333 implemented on powerpc.
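
For illustration, a sketch of mapping the meta-data page plus a 2^n page
ring-buffer and picking up the current head ('n' and 'page_size' are
illustrative; rmb() is whatever read barrier the architecture requires):

    struct perf_counter_mmap_page *pc;
    unsigned int head;

    pc = mmap(NULL, (1 + (1 << n)) * page_size,
              PROT_READ, MAP_SHARED, fd, 0);

    head = pc->data_head;
    rmb();
    /* events between the previously seen head and 'head' can now be read */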
334
335The following 2^n pages are the ring-buffer which contains events of the form:
336
337#define PERF_EVENT_MISC_KERNEL (1 << 0)
338#define PERF_EVENT_MISC_USER (1 << 1)
339#define PERF_EVENT_MISC_OVERFLOW (1 << 2)
340
341struct perf_event_header {
342 __u32 type;
343 __u16 misc;
344 __u16 size;
345};
346
347enum perf_event_type {
348
349 /*
350 * The MMAP events record the PROT_EXEC mappings so that we can
351 * correlate userspace IPs to code. They have the following structure:
352 *
353 * struct {
354 * struct perf_event_header header;
355 *
356 * u32 pid, tid;
357 * u64 addr;
358 * u64 len;
359 * u64 pgoff;
360 * char filename[];
361 * };
362 */
363 PERF_EVENT_MMAP = 1,
364 PERF_EVENT_MUNMAP = 2,
365
366 /*
367 * struct {
368 * struct perf_event_header header;
369 *
370 * u32 pid, tid;
371 * char comm[];
372 * };
373 */
374 PERF_EVENT_COMM = 3,
375
376 /*
377 * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field
378 * will be PERF_RECORD_*
379 *
380 * struct {
381 * struct perf_event_header header;
382 *
383 * { u64 ip; } && PERF_RECORD_IP
384 * { u32 pid, tid; } && PERF_RECORD_TID
385 * { u64 time; } && PERF_RECORD_TIME
386 * { u64 addr; } && PERF_RECORD_ADDR
387 *
388 * { u64 nr;
389 * { u64 event, val; } cnt[nr]; } && PERF_RECORD_GROUP
390 *
391 * { u16 nr,
392 * hv,
393 * kernel,
394 * user;
395 * u64 ips[nr]; } && PERF_RECORD_CALLCHAIN
396 * };
397 */
398};
399
400NOTE: PERF_RECORD_CALLCHAIN is arch specific and currently only implemented
401 on x86.
402
403Notification of new events is possible through poll()/select()/epoll() and
404fcntl() managing signals.
405
406Normally a notification is generated for every page filled; however, one can
407additionally set perf_counter_hw_event.wakeup_events to generate one every
408so many counter overflow events.
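
For example, a sketch of blocking until the next notification arrives:

    struct pollfd pfd = { .fd = fd, .events = POLLIN };

    poll(&pfd, 1, -1 /* no timeout */);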
409
410Future work will include a splice() interface to the ring-buffer.
411
412
413Counters can be enabled and disabled in two ways: via ioctl and via
414prctl. When a counter is disabled, it doesn't count or generate
415events but does continue to exist and maintain its count value.
416
417An individual counter or counter group can be enabled with
418
419 ioctl(fd, PERF_COUNTER_IOC_ENABLE);
420
421or disabled with
422
423 ioctl(fd, PERF_COUNTER_IOC_DISABLE);
424
425Enabling or disabling the leader of a group enables or disables the
426whole group; that is, while the group leader is disabled, none of the
427counters in the group will count. Enabling or disabling a member of a
428group other than the leader only affects that counter - disabling a
429non-leader stops that counter from counting but doesn't affect any
430other counter.
431
432Additionally, non-inherited overflow counters can use
433
434 ioctl(fd, PERF_COUNTER_IOC_REFRESH, nr);
435
436to enable a counter for 'nr' events, after which it gets disabled again.
437
438A process can enable or disable all the counter groups that are
439attached to it, using prctl:
440
441 prctl(PR_TASK_PERF_COUNTERS_ENABLE);
442
443 prctl(PR_TASK_PERF_COUNTERS_DISABLE);
444
445This applies to all counters on the current process, whether created
446by this process or by another, and doesn't affect any counters that
447this process has created on other processes. It only enables or
448disables the group leaders, not any other members in the groups.
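
For example, a sketch of confining counting to a specific code region by
creating the counter disabled and toggling it around the region:

    hw_event.disabled = 1;
    fd = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);

    ioctl(fd, PERF_COUNTER_IOC_ENABLE);
    /* ... region of interest ... */
    ioctl(fd, PERF_COUNTER_IOC_DISABLE);

    read(fd, &count, sizeof(count));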
449
diff --git a/Documentation/perf_counter/perf-report.cc b/Documentation/perf_counter/perf-report.cc
new file mode 100644
index 000000000000..8855107fe6b3
--- /dev/null
+++ b/Documentation/perf_counter/perf-report.cc
@@ -0,0 +1,515 @@
1#define _GNU_SOURCE
2#include <sys/types.h>
3#include <sys/stat.h>
4#include <sys/time.h>
5#include <unistd.h>
6#include <stdint.h>
7#include <stdlib.h>
8#include <string.h>
9#include <limits.h>
10#include <fcntl.h>
11#include <stdio.h>
12#include <errno.h>
13#include <ctype.h>
14#include <time.h>
15#include <getopt.h>
16#include <assert.h>
17
18#include <sys/ioctl.h>
19#include <sys/poll.h>
20#include <sys/prctl.h>
21#include <sys/wait.h>
22#include <sys/mman.h>
23#include <sys/types.h>
24#include <sys/stat.h>
25
26#include <linux/unistd.h>
27#include <linux/types.h>
28
29#include "../../include/linux/perf_counter.h"
30
31#include <set>
32#include <map>
33#include <string>
34
35
36#define SHOW_KERNEL 1
37#define SHOW_USER 2
38#define SHOW_HV 4
39
40static char const *input_name = "output.perf";
41static int input;
42static int show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
43
44static unsigned long page_size;
45static unsigned long mmap_window = 32;
46
47struct ip_event {
48 struct perf_event_header header;
49 __u64 ip;
50 __u32 pid, tid;
51};
52struct mmap_event {
53 struct perf_event_header header;
54 __u32 pid, tid;
55 __u64 start;
56 __u64 len;
57 __u64 pgoff;
58 char filename[PATH_MAX];
59};
60struct comm_event {
61 struct perf_event_header header;
62 __u32 pid,tid;
63 char comm[16];
64};
65
66typedef union event_union {
67 struct perf_event_header header;
68 struct ip_event ip;
69 struct mmap_event mmap;
70 struct comm_event comm;
71} event_t;
72
73struct section {
74 uint64_t start;
75 uint64_t end;
76
77 uint64_t offset;
78
79 std::string name;
80
81 section() { };
82
83 section(uint64_t stab) : end(stab) { };
84
85 section(uint64_t start, uint64_t size, uint64_t offset, std::string name) :
86 start(start), end(start + size), offset(offset), name(name)
87 { };
88
89 bool operator < (const struct section &s) const {
90 return end < s.end;
91 };
92};
93
94typedef std::set<struct section> sections_t;
95
96struct symbol {
97 uint64_t start;
98 uint64_t end;
99
100 std::string name;
101
102 symbol() { };
103
104 symbol(uint64_t ip) : start(ip) { }
105
106 symbol(uint64_t start, uint64_t len, std::string name) :
107 start(start), end(start + len), name(name)
108 { };
109
110 bool operator < (const struct symbol &s) const {
111 return start < s.start;
112 };
113};
114
115typedef std::set<struct symbol> symbols_t;
116
117struct dso {
118 sections_t sections;
119 symbols_t syms;
120};
121
122static std::map<std::string, struct dso> dsos;
123
124static void load_dso_sections(std::string dso_name)
125{
126 struct dso &dso = dsos[dso_name];
127
128 std::string cmd = "readelf -DSW " + dso_name;
129
130 FILE *file = popen(cmd.c_str(), "r");
131 if (!file) {
132 perror("failed to open pipe");
133 exit(-1);
134 }
135
136 char *line = NULL;
137 size_t n = 0;
138
139 while (!feof(file)) {
140 uint64_t addr, off, size;
141 char name[32];
142
143 if (getline(&line, &n, file) < 0)
144 break;
145 if (!line)
146 break;
147
148 if (sscanf(line, " [%*2d] %16s %*14s %Lx %Lx %Lx",
149 name, &addr, &off, &size) == 4) {
150
151 dso.sections.insert(section(addr, size, addr - off, name));
152 }
153#if 0
154 /*
155 * for reading readelf symbols (-s), however these don't seem
156 * to include nearly everything, so use nm for that.
157 */
158 if (sscanf(line, " %*4d %*3d: %Lx %5Lu %*7s %*6s %*7s %3d %s",
159 &start, &size, &section, sym) == 4) {
160
161 start -= dso.section_offsets[section];
162
163 dso.syms.insert(symbol(start, size, std::string(sym)));
164 }
165#endif
166 }
167 pclose(file);
168}
169
170static void load_dso_symbols(std::string dso_name, std::string args)
171{
172 struct dso &dso = dsos[dso_name];
173
174 std::string cmd = "nm -nSC " + args + " " + dso_name;
175
176 FILE *file = popen(cmd.c_str(), "r");
177 if (!file) {
178 perror("failed to open pipe");
179 exit(-1);
180 }
181
182 char *line = NULL;
183 size_t n = 0;
184
185 while (!feof(file)) {
186 uint64_t start, size;
187 char c;
188 char sym[1024];
189
190 if (getline(&line, &n, file) < 0)
191 break;
192 if (!line)
193 break;
194
195
196 if (sscanf(line, "%Lx %Lx %c %s", &start, &size, &c, sym) == 4) {
197 sections_t::const_iterator si =
198 dso.sections.upper_bound(section(start));
199 if (si == dso.sections.end()) {
200 printf("symbol in unknown section: %s\n", sym);
201 continue;
202 }
203
204 start -= si->offset;
205
206 dso.syms.insert(symbol(start, size, sym));
207 }
208 }
209 pclose(file);
210}
211
212static void load_dso(std::string dso_name)
213{
214 load_dso_sections(dso_name);
215 load_dso_symbols(dso_name, "-D"); /* dynamic symbols */
216 load_dso_symbols(dso_name, ""); /* regular ones */
217}
218
219void load_kallsyms(void)
220{
221 struct dso &dso = dsos["[kernel]"];
222
223 FILE *file = fopen("/proc/kallsyms", "r");
224 if (!file) {
225 perror("failed to open kallsyms");
226 exit(-1);
227 }
228
229 char *line = NULL;
230 size_t n = 0;
231
232 while (!feof(file)) {
233 uint64_t start;
234 char c;
235 char sym[1024000];
236
237 if (getline(&line, &n, file) < 0)
238 break;
239 if (!line)
240 break;
241
242 if (sscanf(line, "%Lx %c %s", &start, &c, sym) == 3)
243 dso.syms.insert(symbol(start, 0x1000000, std::string(sym)));
244 }
245 fclose(file);
246}
247
248struct map {
249 uint64_t start;
250 uint64_t end;
251 uint64_t pgoff;
252
253 std::string dso;
254
255 map() { };
256
257 map(uint64_t ip) : end(ip) { }
258
259 map(mmap_event *mmap) {
260 start = mmap->start;
261 end = mmap->start + mmap->len;
262 pgoff = mmap->pgoff;
263
264 dso = std::string(mmap->filename);
265
266 if (dsos.find(dso) == dsos.end())
267 load_dso(dso);
268 };
269
270 bool operator < (const struct map &m) const {
271 return end < m.end;
272 };
273};
274
275typedef std::set<struct map> maps_t;
276
277static std::map<int, maps_t> maps;
278
279static std::map<int, std::string> comms;
280
281static std::map<std::string, int> hist;
282static std::multimap<int, std::string> rev_hist;
283
284static std::string resolve_comm(int pid)
285{
286 std::string comm;
287
288 std::map<int, std::string>::const_iterator ci = comms.find(pid);
289 if (ci != comms.end()) {
290 comm = ci->second;
291 } else {
292 char pid_str[30];
293
294 sprintf(pid_str, ":%d", pid);
295 comm = pid_str;
296 }
297
298 return comm;
299}
300
301static std::string resolve_user_symbol(int pid, uint64_t ip)
302{
303 std::string sym = "<unknown>";
304
305 maps_t &m = maps[pid];
306 maps_t::const_iterator mi = m.upper_bound(map(ip));
307 if (mi == m.end())
308 return sym;
309
310 ip -= mi->start + mi->pgoff;
311
312 symbols_t &s = dsos[mi->dso].syms;
313 symbols_t::const_iterator si = s.upper_bound(symbol(ip));
314
315 sym = mi->dso + ": <unknown>";
316
317 if (si == s.begin())
318 return sym;
319 si--;
320
321 if (si->start <= ip && ip < si->end)
322 sym = mi->dso + ": " + si->name;
323#if 0
324 else if (si->start <= ip)
325 sym = mi->dso + ": ?" + si->name;
326#endif
327
328 return sym;
329}
330
331static std::string resolve_kernel_symbol(uint64_t ip)
332{
333 std::string sym = "<unknown>";
334
335 symbols_t &s = dsos["[kernel]"].syms;
336 symbols_t::const_iterator si = s.upper_bound(symbol(ip));
337
338 if (si == s.begin())
339 return sym;
340 si--;
341
342 if (si->start <= ip && ip < si->end)
343 sym = si->name;
344
345 return sym;
346}
347
348static void display_help(void)
349{
350 printf(
351 "Usage: perf-report [<options>]\n"
352 " -i file --input=<file> # input file\n"
353 );
354
355 exit(0);
356}
357
358static void process_options(int argc, char *argv[])
359{
360 int error = 0;
361
362 for (;;) {
363 int option_index = 0;
364 /** Options for getopt */
365 static struct option long_options[] = {
366 {"input", required_argument, NULL, 'i'},
367 {"no-user", no_argument, NULL, 'u'},
368 {"no-kernel", no_argument, NULL, 'k'},
369 {"no-hv", no_argument, NULL, 'h'},
370 {NULL, 0, NULL, 0 }
371 };
372 int c = getopt_long(argc, argv, "+:i:kuh",
373 long_options, &option_index);
374 if (c == -1)
375 break;
376
377 switch (c) {
378 case 'i': input_name = strdup(optarg); break;
379 case 'k': show_mask &= ~SHOW_KERNEL; break;
380 case 'u': show_mask &= ~SHOW_USER; break;
381 case 'h': show_mask &= ~SHOW_HV; break;
382 default: error = 1; break;
383 }
384 }
385
386 if (error)
387 display_help();
388}
389
390int main(int argc, char *argv[])
391{
392 unsigned long offset = 0;
393 unsigned long head = 0;
394 struct stat stat;
395 char *buf;
396 event_t *event;
397 int ret;
398 unsigned long total = 0;
399
400 page_size = getpagesize();
401
402 process_options(argc, argv);
403
404 input = open(input_name, O_RDONLY);
405 if (input < 0) {
406 perror("failed to open file");
407 exit(-1);
408 }
409
410 ret = fstat(input, &stat);
411 if (ret < 0) {
412 perror("failed to stat file");
413 exit(-1);
414 }
415
416 if (!stat.st_size) {
417 fprintf(stderr, "zero-sized file, nothing to do!\n");
418 exit(0);
419 }
420
421 load_kallsyms();
422
423remap:
424 buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ,
425 MAP_SHARED, input, offset);
426 if (buf == MAP_FAILED) {
427 perror("failed to mmap file");
428 exit(-1);
429 }
430
431more:
432 event = (event_t *)(buf + head);
433
434 if (head + event->header.size >= page_size * mmap_window) {
435 unsigned long shift = page_size * (head / page_size);
436 int ret;
437
438 ret = munmap(buf, page_size * mmap_window);
439 assert(ret == 0);
440
441 offset += shift;
442 head -= shift;
443 goto remap;
444 }
445
446
447 if (!event->header.size) {
448 fprintf(stderr, "zero-sized event at file offset %ld\n", offset + head);
449 fprintf(stderr, "skipping %ld bytes of events.\n", stat.st_size - offset - head);
450 goto done;
451 }
452
453 head += event->header.size;
454
455 if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) {
456 std::string comm, sym, level;
457 int show = 0;
458 char output[1024];
459
460 if (event->header.misc & PERF_EVENT_MISC_KERNEL) {
461 show |= SHOW_KERNEL;
462 level = " [k] ";
463 sym = resolve_kernel_symbol(event->ip.ip);
464 } else if (event->header.misc & PERF_EVENT_MISC_USER) {
465 show |= SHOW_USER;
466 level = " [.] ";
467 sym = resolve_user_symbol(event->ip.pid, event->ip.ip);
468 } else {
469 show |= SHOW_HV;
470 level = " [H] ";
471 }
472
473 if (show & show_mask) {
474 comm = resolve_comm(event->ip.pid);
475 snprintf(output, sizeof(output), "%16s %s %s",
476 comm.c_str(), level.c_str(), sym.c_str());
477 hist[output]++;
478 }
479
480 total++;
481
482 } else switch (event->header.type) {
483 case PERF_EVENT_MMAP:
484 maps[event->mmap.pid].insert(map(&event->mmap));
485 break;
486
487 case PERF_EVENT_COMM:
488 comms[event->comm.pid] = std::string(event->comm.comm);
489 break;
490 }
491
492 if (offset + head < stat.st_size)
493 goto more;
494
495done:
496
497 close(input);
498
499 std::map<std::string, int>::iterator hi = hist.begin();
500
501 while (hi != hist.end()) {
502 rev_hist.insert(std::pair<int, std::string>(hi->second, hi->first));
503 hist.erase(hi++);
504 }
505
506 std::multimap<int, std::string>::const_iterator ri = rev_hist.begin();
507
508 while (ri != rev_hist.end()) {
509 printf(" %5.2f %s\n", (100.0 * ri->first)/total, ri->second.c_str());
510 ri++;
511 }
512
513 return 0;
514}
515
diff --git a/Documentation/perf_counter/perf.c b/Documentation/perf_counter/perf.c
new file mode 100644
index 000000000000..594d270be390
--- /dev/null
+++ b/Documentation/perf_counter/perf.c
@@ -0,0 +1,414 @@
1#include "builtin.h"
2#include "util/exec_cmd.h"
3#include "util/cache.h"
4#include "util/quote.h"
5#include "util/run-command.h"
6
7const char perf_usage_string[] =
8 "perf [--version] [--help] COMMAND [ARGS]";
9
10const char perf_more_info_string[] =
11 "See 'perf help COMMAND' for more information on a specific command.";
12
13static int use_pager = -1;
14struct pager_config {
15 const char *cmd;
16 int val;
17};
18
19static int pager_command_config(const char *var, const char *value, void *data)
20{
21 struct pager_config *c = data;
22 if (!prefixcmp(var, "pager.") && !strcmp(var + 6, c->cmd))
23 c->val = perf_config_bool(var, value);
24 return 0;
25}
26
27/* returns 0 for "no pager", 1 for "use pager", and -1 for "not specified" */
28int check_pager_config(const char *cmd)
29{
30 struct pager_config c;
31 c.cmd = cmd;
32 c.val = -1;
33 perf_config(pager_command_config, &c);
34 return c.val;
35}
36
37static void commit_pager_choice(void) {
38 switch (use_pager) {
39 case 0:
40 setenv("PERF_PAGER", "cat", 1);
41 break;
42 case 1:
43 /* setup_pager(); */
44 break;
45 default:
46 break;
47 }
48}
49
50static int handle_options(const char*** argv, int* argc, int* envchanged)
51{
52 int handled = 0;
53
54 while (*argc > 0) {
55 const char *cmd = (*argv)[0];
56 if (cmd[0] != '-')
57 break;
58
59 /*
60 * For legacy reasons, the "version" and "help"
61 * commands can be written with "--" prepended
62 * to make them look like flags.
63 */
64 if (!strcmp(cmd, "--help") || !strcmp(cmd, "--version"))
65 break;
66
67 /*
68 * Check remaining flags.
69 */
70 if (!prefixcmp(cmd, "--exec-path")) {
71 cmd += 11;
72 if (*cmd == '=')
73 perf_set_argv_exec_path(cmd + 1);
74 else {
75 puts(perf_exec_path());
76 exit(0);
77 }
78 } else if (!strcmp(cmd, "--html-path")) {
79 puts(system_path(PERF_HTML_PATH));
80 exit(0);
81 } else if (!strcmp(cmd, "-p") || !strcmp(cmd, "--paginate")) {
82 use_pager = 1;
83 } else if (!strcmp(cmd, "--no-pager")) {
84 use_pager = 0;
85 if (envchanged)
86 *envchanged = 1;
87 } else if (!strcmp(cmd, "--perf-dir")) {
88 if (*argc < 2) {
89 fprintf(stderr, "No directory given for --perf-dir.\n" );
90 usage(perf_usage_string);
91 }
92 setenv(PERF_DIR_ENVIRONMENT, (*argv)[1], 1);
93 if (envchanged)
94 *envchanged = 1;
95 (*argv)++;
96 (*argc)--;
97 handled++;
98 } else if (!prefixcmp(cmd, "--perf-dir=")) {
99 setenv(PERF_DIR_ENVIRONMENT, cmd + 11, 1);
100 if (envchanged)
101 *envchanged = 1;
102 } else if (!strcmp(cmd, "--work-tree")) {
103 if (*argc < 2) {
104 fprintf(stderr, "No directory given for --work-tree.\n" );
105 usage(perf_usage_string);
106 }
107 setenv(PERF_WORK_TREE_ENVIRONMENT, (*argv)[1], 1);
108 if (envchanged)
109 *envchanged = 1;
110 (*argv)++;
111 (*argc)--;
112 } else if (!prefixcmp(cmd, "--work-tree=")) {
113 setenv(PERF_WORK_TREE_ENVIRONMENT, cmd + 12, 1);
114 if (envchanged)
115 *envchanged = 1;
116 } else {
117 fprintf(stderr, "Unknown option: %s\n", cmd);
118 usage(perf_usage_string);
119 }
120
121 (*argv)++;
122 (*argc)--;
123 handled++;
124 }
125 return handled;
126}
127
128static int handle_alias(int *argcp, const char ***argv)
129{
130 int envchanged = 0, ret = 0, saved_errno = errno;
131 int count, option_count;
132 const char** new_argv;
133 const char *alias_command;
134 char *alias_string;
135
136 alias_command = (*argv)[0];
137 alias_string = alias_lookup(alias_command);
138 if (alias_string) {
139 if (alias_string[0] == '!') {
140 if (*argcp > 1) {
141 struct strbuf buf;
142
143 strbuf_init(&buf, PATH_MAX);
144 strbuf_addstr(&buf, alias_string);
145 sq_quote_argv(&buf, (*argv) + 1, PATH_MAX);
146 free(alias_string);
147 alias_string = buf.buf;
148 }
149 ret = system(alias_string + 1);
150 if (ret >= 0 && WIFEXITED(ret) &&
151 WEXITSTATUS(ret) != 127)
152 exit(WEXITSTATUS(ret));
153 die("Failed to run '%s' when expanding alias '%s'",
154 alias_string + 1, alias_command);
155 }
156 count = split_cmdline(alias_string, &new_argv);
157 if (count < 0)
158 die("Bad alias.%s string", alias_command);
159 option_count = handle_options(&new_argv, &count, &envchanged);
160 if (envchanged)
161 die("alias '%s' changes environment variables\n"
162 "You can use '!perf' in the alias to do this.",
163 alias_command);
164 memmove(new_argv - option_count, new_argv,
165 count * sizeof(char *));
166 new_argv -= option_count;
167
168 if (count < 1)
169 die("empty alias for %s", alias_command);
170
171 if (!strcmp(alias_command, new_argv[0]))
172 die("recursive alias: %s", alias_command);
173
174 new_argv = realloc(new_argv, sizeof(char*) *
175 (count + *argcp + 1));
176 /* insert after command name */
177 memcpy(new_argv + count, *argv + 1, sizeof(char*) * *argcp);
178 new_argv[count+*argcp] = NULL;
179
180 *argv = new_argv;
181 *argcp += count - 1;
182
183 ret = 1;
184 }
185
186 errno = saved_errno;
187
188 return ret;
189}
190
191const char perf_version_string[] = PERF_VERSION;
192
193#define RUN_SETUP (1<<0)
194#define USE_PAGER (1<<1)
195/*
196 * require working tree to be present -- anything that uses this needs
197 * RUN_SETUP for reading from the configuration file.
198 */
199#define NEED_WORK_TREE (1<<2)
200
201struct cmd_struct {
202 const char *cmd;
203 int (*fn)(int, const char **, const char *);
204 int option;
205};
206
207static int run_builtin(struct cmd_struct *p, int argc, const char **argv)
208{
209 int status;
210 struct stat st;
211 const char *prefix;
212
213 prefix = NULL;
214 if (p->option & RUN_SETUP)
215 prefix = NULL; /* setup_perf_directory(); */
216
217 if (use_pager == -1 && p->option & RUN_SETUP)
218 use_pager = check_pager_config(p->cmd);
219 if (use_pager == -1 && p->option & USE_PAGER)
220 use_pager = 1;
221 commit_pager_choice();
222
223 if (p->option & NEED_WORK_TREE)
224 /* setup_work_tree() */;
225
226 status = p->fn(argc, argv, prefix);
227 if (status)
228 return status & 0xff;
229
230 /* Somebody closed stdout? */
231 if (fstat(fileno(stdout), &st))
232 return 0;
233 /* Ignore write errors for pipes and sockets.. */
234 if (S_ISFIFO(st.st_mode) || S_ISSOCK(st.st_mode))
235 return 0;
236
237 /* Check for ENOSPC and EIO errors.. */
238 if (fflush(stdout))
239 die("write failure on standard output: %s", strerror(errno));
240 if (ferror(stdout))
241 die("unknown write failure on standard output");
242 if (fclose(stdout))
243 die("close failed on standard output: %s", strerror(errno));
244 return 0;
245}
246
247static void handle_internal_command(int argc, const char **argv)
248{
249 const char *cmd = argv[0];
250 static struct cmd_struct commands[] = {
251 { "help", cmd_help, 0 },
252 { "record", cmd_record, 0 },
253 { "stat", cmd_stat, 0 },
254 { "top", cmd_top, 0 },
255 { "version", cmd_version, 0 },
256 };
257 int i;
258 static const char ext[] = STRIP_EXTENSION;
259
260 if (sizeof(ext) > 1) {
261 i = strlen(argv[0]) - strlen(ext);
262 if (i > 0 && !strcmp(argv[0] + i, ext)) {
263 char *argv0 = strdup(argv[0]);
264 argv[0] = cmd = argv0;
265 argv0[i] = '\0';
266 }
267 }
268
269 /* Turn "perf cmd --help" into "perf help cmd" */
270 if (argc > 1 && !strcmp(argv[1], "--help")) {
271 argv[1] = argv[0];
272 argv[0] = cmd = "help";
273 }
274
275 for (i = 0; i < ARRAY_SIZE(commands); i++) {
276 struct cmd_struct *p = commands+i;
277 if (strcmp(p->cmd, cmd))
278 continue;
279 exit(run_builtin(p, argc, argv));
280 }
281}
282
283static void execv_dashed_external(const char **argv)
284{
285 struct strbuf cmd = STRBUF_INIT;
286 const char *tmp;
287 int status;
288
289 strbuf_addf(&cmd, "perf-%s", argv[0]);
290
291 /*
292 * argv[0] must be the perf command, but the argv array
293 * belongs to the caller, and may be reused in
294 * subsequent loop iterations. Save argv[0] and
295 * restore it on error.
296 */
297 tmp = argv[0];
298 argv[0] = cmd.buf;
299
300 /*
301 * if we fail because the command is not found, it is
302 * OK to return. Otherwise, we just pass along the status code.
303 */
304 status = run_command_v_opt(argv, 0);
305 if (status != -ERR_RUN_COMMAND_EXEC) {
306 if (IS_RUN_COMMAND_ERR(status))
307 die("unable to run '%s'", argv[0]);
308 exit(-status);
309 }
310 errno = ENOENT; /* as if we called execvp */
311
312 argv[0] = tmp;
313
314 strbuf_release(&cmd);
315}
316
317static int run_argv(int *argcp, const char ***argv)
318{
319 int done_alias = 0;
320
321 while (1) {
322 /* See if it's an internal command */
323 handle_internal_command(*argcp, *argv);
324
325 /* .. then try the external ones */
326 execv_dashed_external(*argv);
327
328 /* It could be an alias -- this works around the insanity
329 * of overriding "perf log" with "perf show" by having
330 * alias.log = show
331 */
332 if (done_alias || !handle_alias(argcp, argv))
333 break;
334 done_alias = 1;
335 }
336
337 return done_alias;
338}
339
340
341int main(int argc, const char **argv)
342{
343 const char *cmd;
344
345 cmd = perf_extract_argv0_path(argv[0]);
346 if (!cmd)
347 cmd = "perf-help";
348
349 /*
350 * "perf-xxxx" is the same as "perf xxxx", but we obviously:
351 *
352 * - cannot take flags in between the "perf" and the "xxxx".
353 * - cannot execute it externally (since it would just do
354 * the same thing over again)
355 *
356 * So we just directly call the internal command handler, and
357 * die if that one cannot handle it.
358 */
359 if (!prefixcmp(cmd, "perf-")) {
360 cmd += 5;
361 argv[0] = cmd;
362 handle_internal_command(argc, argv);
363 die("cannot handle %s internally", cmd);
364 }
365
366 /* Look for flags.. */
367 argv++;
368 argc--;
369 handle_options(&argv, &argc, NULL);
370 commit_pager_choice();
371 if (argc > 0) {
372 if (!prefixcmp(argv[0], "--"))
373 argv[0] += 2;
374 } else {
375 /* The user didn't specify a command; give them help */
376 printf("usage: %s\n\n", perf_usage_string);
377 list_common_cmds_help();
378 printf("\n%s\n", perf_more_info_string);
379 exit(1);
380 }
381 cmd = argv[0];
382
383 /*
384 * We use PATH to find perf commands, but we prepend some higher
385 * precedence paths: the "--exec-path" option, the PERF_EXEC_PATH
386 * environment, and the $(perfexecdir) from the Makefile at build
387 * time.
388 */
389 setup_path();
390
391 while (1) {
392 static int done_help = 0;
393 static int was_alias = 0;
394 was_alias = run_argv(&argc, &argv);
395 if (errno != ENOENT)
396 break;
397 if (was_alias) {
398 fprintf(stderr, "Expansion of alias '%s' failed; "
399 "'%s' is not a perf-command\n",
400 cmd, argv[0]);
401 exit(1);
402 }
403 if (!done_help) {
404 cmd = argv[0] = help_unknown_cmd(cmd);
405 done_help = 1;
406 } else
407 break;
408 }
409
410 fprintf(stderr, "Failed to run command '%s': %s\n",
411 cmd, strerror(errno));
412
413 return 1;
414}
diff --git a/Documentation/perf_counter/perf.h b/Documentation/perf_counter/perf.h
new file mode 100644
index 000000000000..6fa3656399f4
--- /dev/null
+++ b/Documentation/perf_counter/perf.h
@@ -0,0 +1,62 @@
1#ifndef _PERF_PERF_H
2#define _PERF_PERF_H
3
4/*
5 * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
6 * counters in the current task.
7 */
8#define PR_TASK_PERF_COUNTERS_DISABLE 31
9#define PR_TASK_PERF_COUNTERS_ENABLE 32
10
11#ifndef NSEC_PER_SEC
12# define NSEC_PER_SEC 1000000000ULL
13#endif
14
15static inline unsigned long long rdclock(void)
16{
17 struct timespec ts;
18
19 clock_gettime(CLOCK_MONOTONIC, &ts);
20 return ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec;
21}
22
23/*
24 * Pick up some kernel type conventions:
25 */
26#define __user
27#define asmlinkage
28
29#if defined(__x86_64__) || defined(__i386__)
30#include "../../arch/x86/include/asm/unistd.h"
31#define rmb() asm volatile("lfence" ::: "memory")
32#define cpu_relax() asm volatile("rep; nop" ::: "memory");
33#endif
34
35#ifdef __powerpc__
36#include "../../arch/powerpc/include/asm/unistd.h"
37#define rmb() asm volatile ("sync" ::: "memory")
38#define cpu_relax() asm volatile ("" ::: "memory");
39#endif
40
41#define unlikely(x) __builtin_expect(!!(x), 0)
42#define min(x, y) ({ \
43 typeof(x) _min1 = (x); \
44 typeof(y) _min2 = (y); \
45 (void) (&_min1 == &_min2); \
46 _min1 < _min2 ? _min1 : _min2; })
47
48static inline int
49sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr,
50 pid_t pid, int cpu, int group_fd,
51 unsigned long flags)
52{
53 return syscall(__NR_perf_counter_open, hw_event_uptr, pid, cpu,
54 group_fd, flags);
55}
56
57#define MAX_COUNTERS 64
58#define MAX_NR_CPUS 256
59
60#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
61
62#endif
diff --git a/Documentation/perf_counter/util/PERF-VERSION-GEN b/Documentation/perf_counter/util/PERF-VERSION-GEN
new file mode 100755
index 000000000000..c561d1538c03
--- /dev/null
+++ b/Documentation/perf_counter/util/PERF-VERSION-GEN
@@ -0,0 +1,42 @@
1#!/bin/sh
2
3GVF=PERF-VERSION-FILE
4DEF_VER=v0.0.1.PERF
5
6LF='
7'
8
9# First see if there is a version file (included in release tarballs),
10# then try git-describe, then default.
11if test -f version
12then
13 VN=$(cat version) || VN="$DEF_VER"
14elif test -d .git -o -f .git &&
15 VN=$(git describe --abbrev=4 HEAD 2>/dev/null) &&
16 case "$VN" in
17 *$LF*) (exit 1) ;;
18 v[0-9]*)
19 git update-index -q --refresh
20 test -z "$(git diff-index --name-only HEAD --)" ||
21 VN="$VN-dirty" ;;
22 esac
23then
24 VN=$(echo "$VN" | sed -e 's/-/./g');
25else
26 VN="$DEF_VER"
27fi
28
29VN=$(expr "$VN" : v*'\(.*\)')
30
31if test -r $GVF
32then
33 VC=$(sed -e 's/^PERF_VERSION = //' <$GVF)
34else
35 VC=unset
36fi
37test "$VN" = "$VC" || {
38 echo >&2 "PERF_VERSION = $VN"
39 echo "PERF_VERSION = $VN" >$GVF
40}
41
42
diff --git a/Documentation/perf_counter/util/abspath.c b/Documentation/perf_counter/util/abspath.c
new file mode 100644
index 000000000000..649f34f83365
--- /dev/null
+++ b/Documentation/perf_counter/util/abspath.c
@@ -0,0 +1,117 @@
1#include "cache.h"
2
3/*
4 * Do not use this for inspecting *tracked* content. When path is a
5 * symlink to a directory, we do not want to say it is a directory when
6 * dealing with tracked content in the working tree.
7 */
8int is_directory(const char *path)
9{
10 struct stat st;
11 return (!stat(path, &st) && S_ISDIR(st.st_mode));
12}
13
14/* We allow "recursive" symbolic links. Only within reason, though. */
15#define MAXDEPTH 5
16
17const char *make_absolute_path(const char *path)
18{
19 static char bufs[2][PATH_MAX + 1], *buf = bufs[0], *next_buf = bufs[1];
20 char cwd[1024] = "";
21 int buf_index = 1, len;
22
23 int depth = MAXDEPTH;
24 char *last_elem = NULL;
25 struct stat st;
26
27 if (strlcpy(buf, path, PATH_MAX) >= PATH_MAX)
28 die ("Too long path: %.*s", 60, path);
29
30 while (depth--) {
31 if (!is_directory(buf)) {
32 char *last_slash = strrchr(buf, '/');
33 if (last_slash) {
34 *last_slash = '\0';
35 last_elem = xstrdup(last_slash + 1);
36 } else {
37 last_elem = xstrdup(buf);
38 *buf = '\0';
39 }
40 }
41
42 if (*buf) {
43 if (!*cwd && !getcwd(cwd, sizeof(cwd)))
44 die ("Could not get current working directory");
45
46 if (chdir(buf))
47 die ("Could not switch to '%s'", buf);
48 }
49 if (!getcwd(buf, PATH_MAX))
50 die ("Could not get current working directory");
51
52 if (last_elem) {
53 int len = strlen(buf);
54 if (len + strlen(last_elem) + 2 > PATH_MAX)
55 die ("Too long path name: '%s/%s'",
56 buf, last_elem);
57 buf[len] = '/';
58 strcpy(buf + len + 1, last_elem);
59 free(last_elem);
60 last_elem = NULL;
61 }
62
63 if (!lstat(buf, &st) && S_ISLNK(st.st_mode)) {
64 len = readlink(buf, next_buf, PATH_MAX);
65 if (len < 0)
66 die ("Invalid symlink: %s", buf);
67 if (PATH_MAX <= len)
68 die("symbolic link too long: %s", buf);
69 next_buf[len] = '\0';
70 buf = next_buf;
71 buf_index = 1 - buf_index;
72 next_buf = bufs[buf_index];
73 } else
74 break;
75 }
76
77 if (*cwd && chdir(cwd))
78 die ("Could not change back to '%s'", cwd);
79
80 return buf;
81}
82
83static const char *get_pwd_cwd(void)
84{
85 static char cwd[PATH_MAX + 1];
86 char *pwd;
87 struct stat cwd_stat, pwd_stat;
88 if (getcwd(cwd, PATH_MAX) == NULL)
89 return NULL;
90 pwd = getenv("PWD");
91 if (pwd && strcmp(pwd, cwd)) {
92 stat(cwd, &cwd_stat);
93 if (!stat(pwd, &pwd_stat) &&
94 pwd_stat.st_dev == cwd_stat.st_dev &&
95 pwd_stat.st_ino == cwd_stat.st_ino) {
96 strlcpy(cwd, pwd, PATH_MAX);
97 }
98 }
99 return cwd;
100}
101
102const char *make_nonrelative_path(const char *path)
103{
104 static char buf[PATH_MAX + 1];
105
106 if (is_absolute_path(path)) {
107 if (strlcpy(buf, path, PATH_MAX) >= PATH_MAX)
108 die("Too long path: %.*s", 60, path);
109 } else {
110 const char *cwd = get_pwd_cwd();
111 if (!cwd)
112 die("Cannot determine the current working directory");
113 if (snprintf(buf, PATH_MAX, "%s/%s", cwd, path) >= PATH_MAX)
114 die("Too long path: %.*s", 60, path);
115 }
116 return buf;
117}
diff --git a/Documentation/perf_counter/util/alias.c b/Documentation/perf_counter/util/alias.c
new file mode 100644
index 000000000000..9b3dd2b428df
--- /dev/null
+++ b/Documentation/perf_counter/util/alias.c
@@ -0,0 +1,77 @@
1#include "cache.h"
2
3static const char *alias_key;
4static char *alias_val;
5
6static int alias_lookup_cb(const char *k, const char *v, void *cb)
7{
8 if (!prefixcmp(k, "alias.") && !strcmp(k+6, alias_key)) {
9 if (!v)
10 return config_error_nonbool(k);
11 alias_val = strdup(v);
12 return 0;
13 }
14 return 0;
15}
16
17char *alias_lookup(const char *alias)
18{
19 alias_key = alias;
20 alias_val = NULL;
21 perf_config(alias_lookup_cb, NULL);
22 return alias_val;
23}
24
25int split_cmdline(char *cmdline, const char ***argv)
26{
27 int src, dst, count = 0, size = 16;
28 char quoted = 0;
29
30 *argv = malloc(sizeof(char*) * size);
31
32 /* split alias_string */
33 (*argv)[count++] = cmdline;
34 for (src = dst = 0; cmdline[src];) {
35 char c = cmdline[src];
36 if (!quoted && isspace(c)) {
37 cmdline[dst++] = 0;
38 while (cmdline[++src]
39 && isspace(cmdline[src]))
40 ; /* skip */
41 if (count >= size) {
42 size += 16;
43 *argv = realloc(*argv, sizeof(char*) * size);
44 }
45 (*argv)[count++] = cmdline + dst;
46 } else if (!quoted && (c == '\'' || c == '"')) {
47 quoted = c;
48 src++;
49 } else if (c == quoted) {
50 quoted = 0;
51 src++;
52 } else {
53 if (c == '\\' && quoted != '\'') {
54 src++;
55 c = cmdline[src];
56 if (!c) {
57 free(*argv);
58 *argv = NULL;
59 return error("cmdline ends with \\");
60 }
61 }
62 cmdline[dst++] = c;
63 src++;
64 }
65 }
66
67 cmdline[dst] = 0;
68
69 if (quoted) {
70 free(*argv);
71 *argv = NULL;
72 return error("unclosed quote");
73 }
74
75 return count;
76}
77
diff --git a/Documentation/perf_counter/util/cache.h b/Documentation/perf_counter/util/cache.h
new file mode 100644
index 000000000000..71080512fa86
--- /dev/null
+++ b/Documentation/perf_counter/util/cache.h
@@ -0,0 +1,117 @@
1#ifndef CACHE_H
2#define CACHE_H
3
4#include "util.h"
5#include "strbuf.h"
6
7#define PERF_DIR_ENVIRONMENT "PERF_DIR"
8#define PERF_WORK_TREE_ENVIRONMENT "PERF_WORK_TREE"
9#define DEFAULT_PERF_DIR_ENVIRONMENT ".perf"
10#define DB_ENVIRONMENT "PERF_OBJECT_DIRECTORY"
11#define INDEX_ENVIRONMENT "PERF_INDEX_FILE"
12#define GRAFT_ENVIRONMENT "PERF_GRAFT_FILE"
13#define TEMPLATE_DIR_ENVIRONMENT "PERF_TEMPLATE_DIR"
14#define CONFIG_ENVIRONMENT "PERF_CONFIG"
15#define EXEC_PATH_ENVIRONMENT "PERF_EXEC_PATH"
16#define CEILING_DIRECTORIES_ENVIRONMENT "PERF_CEILING_DIRECTORIES"
17#define PERFATTRIBUTES_FILE ".perfattributes"
18#define INFOATTRIBUTES_FILE "info/attributes"
19#define ATTRIBUTE_MACRO_PREFIX "[attr]"
20
21typedef int (*config_fn_t)(const char *, const char *, void *);
22extern int perf_default_config(const char *, const char *, void *);
23extern int perf_config_from_file(config_fn_t fn, const char *, void *);
24extern int perf_config(config_fn_t fn, void *);
25extern int perf_parse_ulong(const char *, unsigned long *);
26extern int perf_config_int(const char *, const char *);
27extern unsigned long perf_config_ulong(const char *, const char *);
28extern int perf_config_bool_or_int(const char *, const char *, int *);
29extern int perf_config_bool(const char *, const char *);
30extern int perf_config_string(const char **, const char *, const char *);
31extern int perf_config_set(const char *, const char *);
32extern int perf_config_set_multivar(const char *, const char *, const char *, int);
33extern int perf_config_rename_section(const char *, const char *);
34extern const char *perf_etc_perfconfig(void);
35extern int check_repository_format_version(const char *var, const char *value, void *cb);
36extern int perf_config_system(void);
37extern int perf_config_global(void);
38extern int config_error_nonbool(const char *);
39extern const char *config_exclusive_filename;
40
41#define MAX_PERFNAME (1000)
42extern char perf_default_email[MAX_PERFNAME];
43extern char perf_default_name[MAX_PERFNAME];
44extern int user_ident_explicitly_given;
45
46extern const char *perf_log_output_encoding;
47extern const char *perf_mailmap_file;
48
49/* IO helper functions */
50extern void maybe_flush_or_die(FILE *, const char *);
51extern int copy_fd(int ifd, int ofd);
52extern int copy_file(const char *dst, const char *src, int mode);
53extern ssize_t read_in_full(int fd, void *buf, size_t count);
54extern ssize_t write_in_full(int fd, const void *buf, size_t count);
55extern void write_or_die(int fd, const void *buf, size_t count);
56extern int write_or_whine(int fd, const void *buf, size_t count, const char *msg);
57extern int write_or_whine_pipe(int fd, const void *buf, size_t count, const char *msg);
58extern void fsync_or_die(int fd, const char *);
59
60/* pager.c */
61extern void setup_pager(void);
62extern const char *pager_program;
63extern int pager_in_use(void);
64extern int pager_use_color;
65
66extern const char *editor_program;
67extern const char *excludes_file;
68
69char *alias_lookup(const char *alias);
70int split_cmdline(char *cmdline, const char ***argv);
71
72#define alloc_nr(x) (((x)+16)*3/2)
73
74/*
75 * Realloc the buffer pointed at by variable 'x' so that it can hold
76 * at least 'nr' entries; the number of entries currently allocated
77 * is 'alloc', using the standard growing factor alloc_nr() macro.
78 *
79 * DO NOT USE any expression with side-effect for 'x' or 'alloc'.
80 */
81#define ALLOC_GROW(x, nr, alloc) \
82 do { \
83 if ((nr) > alloc) { \
84 if (alloc_nr(alloc) < (nr)) \
85 alloc = (nr); \
86 else \
87 alloc = alloc_nr(alloc); \
88 x = xrealloc((x), alloc * sizeof(*(x))); \
89 } \
90 } while(0)
91
92
93static inline int is_absolute_path(const char *path)
94{
95 return path[0] == '/';
96}
97
98const char *make_absolute_path(const char *path);
99const char *make_nonrelative_path(const char *path);
100const char *make_relative_path(const char *abs, const char *base);
101int normalize_path_copy(char *dst, const char *src);
102int longest_ancestor_length(const char *path, const char *prefix_list);
103char *strip_path_suffix(const char *path, const char *suffix);
104
105extern char *mkpath(const char *fmt, ...) __attribute__((format (printf, 1, 2)));
106extern char *perf_path(const char *fmt, ...) __attribute__((format (printf, 1, 2)));
107
108extern char *mksnpath(char *buf, size_t n, const char *fmt, ...)
109 __attribute__((format (printf, 3, 4)));
110extern char *perf_snpath(char *buf, size_t n, const char *fmt, ...)
111 __attribute__((format (printf, 3, 4)));
112extern char *perf_pathdup(const char *fmt, ...)
113 __attribute__((format (printf, 1, 2)));
114
115extern size_t strlcpy(char *dest, const char *src, size_t size);
116
117#endif /* CACHE_H */
diff --git a/Documentation/perf_counter/util/config.c b/Documentation/perf_counter/util/config.c
new file mode 100644
index 000000000000..3dd13faa6a27
--- /dev/null
+++ b/Documentation/perf_counter/util/config.c
@@ -0,0 +1,873 @@
1/*
2 * GIT - The information manager from hell
3 *
4 * Copyright (C) Linus Torvalds, 2005
5 * Copyright (C) Johannes Schindelin, 2005
6 *
7 */
8#include "util.h"
9#include "cache.h"
10#include "exec_cmd.h"
11
12#define MAXNAME (256)
13
14static FILE *config_file;
15static const char *config_file_name;
16static int config_linenr;
17static int config_file_eof;
18
19const char *config_exclusive_filename = NULL;
20
21static int get_next_char(void)
22{
23 int c;
24 FILE *f;
25
26 c = '\n';
27 if ((f = config_file) != NULL) {
28 c = fgetc(f);
29 if (c == '\r') {
30 /* DOS like systems */
31 c = fgetc(f);
32 if (c != '\n') {
33 ungetc(c, f);
34 c = '\r';
35 }
36 }
37 if (c == '\n')
38 config_linenr++;
39 if (c == EOF) {
40 config_file_eof = 1;
41 c = '\n';
42 }
43 }
44 return c;
45}
46
47static char *parse_value(void)
48{
49 static char value[1024];
50 int quote = 0, comment = 0, len = 0, space = 0;
51
52 for (;;) {
53 int c = get_next_char();
54 if (len >= sizeof(value) - 1)
55 return NULL;
56 if (c == '\n') {
57 if (quote)
58 return NULL;
59 value[len] = 0;
60 return value;
61 }
62 if (comment)
63 continue;
64 if (isspace(c) && !quote) {
65 space = 1;
66 continue;
67 }
68 if (!quote) {
69 if (c == ';' || c == '#') {
70 comment = 1;
71 continue;
72 }
73 }
74 if (space) {
75 if (len)
76 value[len++] = ' ';
77 space = 0;
78 }
79 if (c == '\\') {
80 c = get_next_char();
81 switch (c) {
82 case '\n':
83 continue;
84 case 't':
85 c = '\t';
86 break;
87 case 'b':
88 c = '\b';
89 break;
90 case 'n':
91 c = '\n';
92 break;
93 /* Some characters escape as themselves */
94 case '\\': case '"':
95 break;
96 /* Reject unknown escape sequences */
97 default:
98 return NULL;
99 }
100 value[len++] = c;
101 continue;
102 }
103 if (c == '"') {
104 quote = 1-quote;
105 continue;
106 }
107 value[len++] = c;
108 }
109}
110
111static inline int iskeychar(int c)
112{
113 return isalnum(c) || c == '-';
114}
115
116static int get_value(config_fn_t fn, void *data, char *name, unsigned int len)
117{
118 int c;
119 char *value;
120
121 /* Get the full name */
122 for (;;) {
123 c = get_next_char();
124 if (config_file_eof)
125 break;
126 if (!iskeychar(c))
127 break;
128 name[len++] = tolower(c);
129 if (len >= MAXNAME)
130 return -1;
131 }
132 name[len] = 0;
133 while (c == ' ' || c == '\t')
134 c = get_next_char();
135
136 value = NULL;
137 if (c != '\n') {
138 if (c != '=')
139 return -1;
140 value = parse_value();
141 if (!value)
142 return -1;
143 }
144 return fn(name, value, data);
145}
146
147static int get_extended_base_var(char *name, int baselen, int c)
148{
149 do {
150 if (c == '\n')
151 return -1;
152 c = get_next_char();
153 } while (isspace(c));
154
155 /* We require the format to be '[base "extension"]' */
156 if (c != '"')
157 return -1;
158 name[baselen++] = '.';
159
160 for (;;) {
161 int c = get_next_char();
162 if (c == '\n')
163 return -1;
164 if (c == '"')
165 break;
166 if (c == '\\') {
167 c = get_next_char();
168 if (c == '\n')
169 return -1;
170 }
171 name[baselen++] = c;
172 if (baselen > MAXNAME / 2)
173 return -1;
174 }
175
176 /* Final ']' */
177 if (get_next_char() != ']')
178 return -1;
179 return baselen;
180}
181
182static int get_base_var(char *name)
183{
184 int baselen = 0;
185
186 for (;;) {
187 int c = get_next_char();
188 if (config_file_eof)
189 return -1;
190 if (c == ']')
191 return baselen;
192 if (isspace(c))
193 return get_extended_base_var(name, baselen, c);
194 if (!iskeychar(c) && c != '.')
195 return -1;
196 if (baselen > MAXNAME / 2)
197 return -1;
198 name[baselen++] = tolower(c);
199 }
200}
201
202static int perf_parse_file(config_fn_t fn, void *data)
203{
204 int comment = 0;
205 int baselen = 0;
206 static char var[MAXNAME];
207
208 /* U+FEFF Byte Order Mark in UTF8 */
209 static const unsigned char *utf8_bom = (unsigned char *) "\xef\xbb\xbf";
210 const unsigned char *bomptr = utf8_bom;
211
212 for (;;) {
213 int c = get_next_char();
214 if (bomptr && *bomptr) {
215 /* We are at the file beginning; skip UTF8-encoded BOM
216 * if present. Sane editors won't put this in on their
217 * own, but e.g. Windows Notepad will do it happily. */
218 if ((unsigned char) c == *bomptr) {
219 bomptr++;
220 continue;
221 } else {
222 /* Do not tolerate partial BOM. */
223 if (bomptr != utf8_bom)
224 break;
225 /* No BOM at file beginning. Cool. */
226 bomptr = NULL;
227 }
228 }
229 if (c == '\n') {
230 if (config_file_eof)
231 return 0;
232 comment = 0;
233 continue;
234 }
235 if (comment || isspace(c))
236 continue;
237 if (c == '#' || c == ';') {
238 comment = 1;
239 continue;
240 }
241 if (c == '[') {
242 baselen = get_base_var(var);
243 if (baselen <= 0)
244 break;
245 var[baselen++] = '.';
246 var[baselen] = 0;
247 continue;
248 }
249 if (!isalpha(c))
250 break;
251 var[baselen] = tolower(c);
252 if (get_value(fn, data, var, baselen+1) < 0)
253 break;
254 }
255 die("bad config file line %d in %s", config_linenr, config_file_name);
256}
257
258static int parse_unit_factor(const char *end, unsigned long *val)
259{
260 if (!*end)
261 return 1;
262 else if (!strcasecmp(end, "k")) {
263 *val *= 1024;
264 return 1;
265 }
266 else if (!strcasecmp(end, "m")) {
267 *val *= 1024 * 1024;
268 return 1;
269 }
270 else if (!strcasecmp(end, "g")) {
271 *val *= 1024 * 1024 * 1024;
272 return 1;
273 }
274 return 0;
275}
276
277static int perf_parse_long(const char *value, long *ret)
278{
279 if (value && *value) {
280 char *end;
281 long val = strtol(value, &end, 0);
282 unsigned long factor = 1;
283 if (!parse_unit_factor(end, &factor))
284 return 0;
285 *ret = val * factor;
286 return 1;
287 }
288 return 0;
289}
290
291int perf_parse_ulong(const char *value, unsigned long *ret)
292{
293 if (value && *value) {
294 char *end;
295 unsigned long val = strtoul(value, &end, 0);
296 if (!parse_unit_factor(end, &val))
297 return 0;
298 *ret = val;
299 return 1;
300 }
301 return 0;
302}
303
304static void die_bad_config(const char *name)
305{
306 if (config_file_name)
307 die("bad config value for '%s' in %s", name, config_file_name);
308 die("bad config value for '%s'", name);
309}
310
311int perf_config_int(const char *name, const char *value)
312{
313 long ret = 0;
314 if (!perf_parse_long(value, &ret))
315 die_bad_config(name);
316 return ret;
317}
318
319unsigned long perf_config_ulong(const char *name, const char *value)
320{
321 unsigned long ret;
322 if (!perf_parse_ulong(value, &ret))
323 die_bad_config(name);
324 return ret;
325}
326
327int perf_config_bool_or_int(const char *name, const char *value, int *is_bool)
328{
329 *is_bool = 1;
330 if (!value)
331 return 1;
332 if (!*value)
333 return 0;
334 if (!strcasecmp(value, "true") || !strcasecmp(value, "yes") || !strcasecmp(value, "on"))
335 return 1;
336 if (!strcasecmp(value, "false") || !strcasecmp(value, "no") || !strcasecmp(value, "off"))
337 return 0;
338 *is_bool = 0;
339 return perf_config_int(name, value);
340}
341
342int perf_config_bool(const char *name, const char *value)
343{
344 int discard;
345 return !!perf_config_bool_or_int(name, value, &discard);
346}
347
348int perf_config_string(const char **dest, const char *var, const char *value)
349{
350 if (!value)
351 return config_error_nonbool(var);
352 *dest = strdup(value);
353 return 0;
354}
355
356static int perf_default_core_config(const char *var, const char *value)
357{
358 /* Add other config variables here and to Documentation/config.txt. */
359 return 0;
360}
361
362int perf_default_config(const char *var, const char *value, void *dummy)
363{
364 if (!prefixcmp(var, "core."))
365 return perf_default_core_config(var, value);
366
367 /* Add other config variables here and to Documentation/config.txt. */
368 return 0;
369}
370
371int perf_config_from_file(config_fn_t fn, const char *filename, void *data)
372{
373 int ret;
374 FILE *f = fopen(filename, "r");
375
376 ret = -1;
377 if (f) {
378 config_file = f;
379 config_file_name = filename;
380 config_linenr = 1;
381 config_file_eof = 0;
382 ret = perf_parse_file(fn, data);
383 fclose(f);
384 config_file_name = NULL;
385 }
386 return ret;
387}
388
389const char *perf_etc_perfconfig(void)
390{
391 static const char *system_wide;
392 if (!system_wide)
393 system_wide = system_path(ETC_PERFCONFIG);
394 return system_wide;
395}
396
397static int perf_env_bool(const char *k, int def)
398{
399 const char *v = getenv(k);
400 return v ? perf_config_bool(k, v) : def;
401}
402
403int perf_config_system(void)
404{
405 return !perf_env_bool("PERF_CONFIG_NOSYSTEM", 0);
406}
407
408int perf_config_global(void)
409{
410 return !perf_env_bool("PERF_CONFIG_NOGLOBAL", 0);
411}
412
413int perf_config(config_fn_t fn, void *data)
414{
415 int ret = 0, found = 0;
416 char *repo_config = NULL;
417 const char *home = NULL;
418
419 /* Setting $PERF_CONFIG makes perf read _only_ the given config file. */
420 if (config_exclusive_filename)
421 return perf_config_from_file(fn, config_exclusive_filename, data);
422 if (perf_config_system() && !access(perf_etc_perfconfig(), R_OK)) {
423 ret += perf_config_from_file(fn, perf_etc_perfconfig(),
424 data);
425 found += 1;
426 }
427
428 home = getenv("HOME");
429 if (perf_config_global() && home) {
430 char *user_config = strdup(mkpath("%s/.perfconfig", home));
431 if (!access(user_config, R_OK)) {
432 ret += perf_config_from_file(fn, user_config, data);
433 found += 1;
434 }
435 free(user_config);
436 }
437
438 repo_config = perf_pathdup("config");
439 if (!access(repo_config, R_OK)) {
440 ret += perf_config_from_file(fn, repo_config, data);
441 found += 1;
442 }
443 free(repo_config);
444 if (found == 0)
445 return -1;
446 return ret;
447}
448
449/*
450 * Find all the stuff for perf_config_set() below.
451 */
452
453#define MAX_MATCHES 512
454
455static struct {
456 int baselen;
457 char* key;
458 int do_not_match;
459 regex_t* value_regex;
460 int multi_replace;
461 size_t offset[MAX_MATCHES];
462 enum { START, SECTION_SEEN, SECTION_END_SEEN, KEY_SEEN } state;
463 int seen;
464} store;
465
466static int matches(const char* key, const char* value)
467{
468 return !strcmp(key, store.key) &&
469 (store.value_regex == NULL ||
470 (store.do_not_match ^
471 !regexec(store.value_regex, value, 0, NULL, 0)));
472}
473
474static int store_aux(const char* key, const char* value, void *cb)
475{
476 const char *ep;
477 size_t section_len;
478
479 switch (store.state) {
480 case KEY_SEEN:
481 if (matches(key, value)) {
482 if (store.seen == 1 && store.multi_replace == 0) {
483 warning("%s has multiple values", key);
484 } else if (store.seen >= MAX_MATCHES) {
485 error("too many matches for %s", key);
486 return 1;
487 }
488
489 store.offset[store.seen] = ftell(config_file);
490 store.seen++;
491 }
492 break;
493 case SECTION_SEEN:
494 /*
495 * What we are looking for is in store.key (both
496 * section and var), and its section part is baselen
497 * long. We found key (again, both section and var).
498 * We would want to know if this key is in the same
499 * section as what we are looking for. We already
500 * know we are in the same section as what should
501 * hold store.key.
502 */
503 ep = strrchr(key, '.');
504 section_len = ep - key;
505
506 if ((section_len != store.baselen) ||
507 memcmp(key, store.key, section_len+1)) {
508 store.state = SECTION_END_SEEN;
509 break;
510 }
511
512 /*
513 * Do not increment matches: this is no match, but we
514 * just made sure we are in the desired section.
515 */
516 store.offset[store.seen] = ftell(config_file);
517 /* fallthru */
518 case SECTION_END_SEEN:
519 case START:
520 if (matches(key, value)) {
521 store.offset[store.seen] = ftell(config_file);
522 store.state = KEY_SEEN;
523 store.seen++;
524 } else {
525 if (strrchr(key, '.') - key == store.baselen &&
526 !strncmp(key, store.key, store.baselen)) {
527 store.state = SECTION_SEEN;
528 store.offset[store.seen] = ftell(config_file);
529 }
530 }
531 }
532 return 0;
533}
534
535static int store_write_section(int fd, const char* key)
536{
537 const char *dot;
538 int i, success;
539 struct strbuf sb = STRBUF_INIT;
540
541 dot = memchr(key, '.', store.baselen);
542 if (dot) {
543 strbuf_addf(&sb, "[%.*s \"", (int)(dot - key), key);
544 for (i = dot - key + 1; i < store.baselen; i++) {
545 if (key[i] == '"' || key[i] == '\\')
546 strbuf_addch(&sb, '\\');
547 strbuf_addch(&sb, key[i]);
548 }
549 strbuf_addstr(&sb, "\"]\n");
550 } else {
551 strbuf_addf(&sb, "[%.*s]\n", store.baselen, key);
552 }
553
554 success = write_in_full(fd, sb.buf, sb.len) == sb.len;
555 strbuf_release(&sb);
556
557 return success;
558}
559
560static int store_write_pair(int fd, const char* key, const char* value)
561{
562 int i, success;
563 int length = strlen(key + store.baselen + 1);
564 const char *quote = "";
565 struct strbuf sb = STRBUF_INIT;
566
567 /*
568 * Check to see if the value needs to be surrounded with a dq pair.
569 * Note that problematic characters are always backslash-quoted; this
570 * check is about not losing leading or trailing SP and strings that
571 * follow beginning-of-comment characters (i.e. ';' and '#') by the
572 * configuration parser.
573 */
574 if (value[0] == ' ')
575 quote = "\"";
576 for (i = 0; value[i]; i++)
577 if (value[i] == ';' || value[i] == '#')
578 quote = "\"";
579 if (i && value[i - 1] == ' ')
580 quote = "\"";
581
582 strbuf_addf(&sb, "\t%.*s = %s",
583 length, key + store.baselen + 1, quote);
584
585 for (i = 0; value[i]; i++)
586 switch (value[i]) {
587 case '\n':
588 strbuf_addstr(&sb, "\\n");
589 break;
590 case '\t':
591 strbuf_addstr(&sb, "\\t");
592 break;
593 case '"':
594 case '\\':
595 strbuf_addch(&sb, '\\');
596 default:
597 strbuf_addch(&sb, value[i]);
598 break;
599 }
600 strbuf_addf(&sb, "%s\n", quote);
601
602 success = write_in_full(fd, sb.buf, sb.len) == sb.len;
603 strbuf_release(&sb);
604
605 return success;
606}
607
608static ssize_t find_beginning_of_line(const char* contents, size_t size,
609 size_t offset_, int* found_bracket)
610{
611 size_t equal_offset = size, bracket_offset = size;
612 ssize_t offset;
613
614contline:
615 for (offset = offset_-2; offset > 0
616 && contents[offset] != '\n'; offset--)
617 switch (contents[offset]) {
618 case '=': equal_offset = offset; break;
619 case ']': bracket_offset = offset; break;
620 }
621 if (offset > 0 && contents[offset-1] == '\\') {
622 offset_ = offset;
623 goto contline;
624 }
625 if (bracket_offset < equal_offset) {
626 *found_bracket = 1;
627 offset = bracket_offset+1;
628 } else
629 offset++;
630
631 return offset;
632}
633
634int perf_config_set(const char* key, const char* value)
635{
636 return perf_config_set_multivar(key, value, NULL, 0);
637}
638
639/*
640 * If value==NULL, unset in (remove from) config,
641 * if value_regex!=NULL, disregard key/value pairs where value does not match.
642 * if multi_replace==0, nothing, or only one matching key/value is replaced,
643 * else all matching key/values (regardless how many) are removed,
644 * before the new pair is written.
645 *
646 * Returns 0 on success.
647 *
648 * This function does this:
649 *
650 * - it locks the config file by creating ".perf/config.lock"
651 *
652 * - it then parses the config using store_aux() as validator to find
653 * the position on the key/value pair to replace. If it is to be unset,
654 * it must be found exactly once.
655 *
656 * - the config file is mmap()ed and the part before the match (if any) is
657 * written to the lock file, then the changed part and the rest.
658 *
659 * - the config file is removed and the lock file rename()d to it.
660 *
661 */
662int perf_config_set_multivar(const char* key, const char* value,
663 const char* value_regex, int multi_replace)
664{
665 int i, dot;
666 int fd = -1, in_fd;
667 int ret = 0;
668 char* config_filename;
669 const char* last_dot = strrchr(key, '.');
670
671 if (config_exclusive_filename)
672 config_filename = strdup(config_exclusive_filename);
673 else
674 config_filename = perf_pathdup("config");
675
676 /*
677 * Since "key" actually contains the section name and the real
678 * key name separated by a dot, we have to know where the dot is.
679 */
680
681 if (last_dot == NULL) {
682 error("key does not contain a section: %s", key);
683 ret = 2;
684 goto out_free;
685 }
686 store.baselen = last_dot - key;
687
688 store.multi_replace = multi_replace;
689
690 /*
691 * Validate the key and while at it, lower case it for matching.
692 */
693 store.key = malloc(strlen(key) + 1);
694 dot = 0;
695 for (i = 0; key[i]; i++) {
696 unsigned char c = key[i];
697 if (c == '.')
698 dot = 1;
699 /* Leave the extended basename untouched.. */
700 if (!dot || i > store.baselen) {
701 if (!iskeychar(c) || (i == store.baselen+1 && !isalpha(c))) {
702 error("invalid key: %s", key);
703 free(store.key);
704 ret = 1;
705 goto out_free;
706 }
707 c = tolower(c);
708 } else if (c == '\n') {
709 error("invalid key (newline): %s", key);
710 free(store.key);
711 ret = 1;
712 goto out_free;
713 }
714 store.key[i] = c;
715 }
716 store.key[i] = 0;
717
718 /*
719 * If .perf/config does not exist yet, write a minimal version.
720 */
721 in_fd = open(config_filename, O_RDONLY);
722 if ( in_fd < 0 ) {
723 free(store.key);
724
725 if ( ENOENT != errno ) {
726 error("opening %s: %s", config_filename,
727 strerror(errno));
728 ret = 3; /* same as "invalid config file" */
729 goto out_free;
730 }
731 /* if nothing to unset, error out */
732 if (value == NULL) {
733 ret = 5;
734 goto out_free;
735 }
736
737 store.key = (char*)key;
738 if (!store_write_section(fd, key) ||
739 !store_write_pair(fd, key, value))
740 goto write_err_out;
741 } else {
742 struct stat st;
743 char* contents;
744 size_t contents_sz, copy_begin, copy_end;
745 int i, new_line = 0;
746
747 if (value_regex == NULL)
748 store.value_regex = NULL;
749 else {
750 if (value_regex[0] == '!') {
751 store.do_not_match = 1;
752 value_regex++;
753 } else
754 store.do_not_match = 0;
755
756 store.value_regex = (regex_t*)malloc(sizeof(regex_t));
757 if (regcomp(store.value_regex, value_regex,
758 REG_EXTENDED)) {
759 error("invalid pattern: %s", value_regex);
760 free(store.value_regex);
761 ret = 6;
762 goto out_free;
763 }
764 }
765
766 store.offset[0] = 0;
767 store.state = START;
768 store.seen = 0;
769
770 /*
771 * After this, store.offset will contain the *end* offset
772 * of the last match, or remain at 0 if no match was found.
773 * As a side effect, we make sure to transform only a valid
774 * existing config file.
775 */
776 if (perf_config_from_file(store_aux, config_filename, NULL)) {
777 error("invalid config file %s", config_filename);
778 free(store.key);
779 if (store.value_regex != NULL) {
780 regfree(store.value_regex);
781 free(store.value_regex);
782 }
783 ret = 3;
784 goto out_free;
785 }
786
787 free(store.key);
788 if (store.value_regex != NULL) {
789 regfree(store.value_regex);
790 free(store.value_regex);
791 }
792
793 /* if nothing to unset, or too many matches, error out */
794 if ((store.seen == 0 && value == NULL) ||
795 (store.seen > 1 && multi_replace == 0)) {
796 ret = 5;
797 goto out_free;
798 }
799
800 fstat(in_fd, &st);
801 contents_sz = xsize_t(st.st_size);
802 contents = mmap(NULL, contents_sz, PROT_READ,
803 MAP_PRIVATE, in_fd, 0);
804 close(in_fd);
805
806 if (store.seen == 0)
807 store.seen = 1;
808
809 for (i = 0, copy_begin = 0; i < store.seen; i++) {
810 if (store.offset[i] == 0) {
811 store.offset[i] = copy_end = contents_sz;
812 } else if (store.state != KEY_SEEN) {
813 copy_end = store.offset[i];
814 } else
815 copy_end = find_beginning_of_line(
816 contents, contents_sz,
817 store.offset[i]-2, &new_line);
818
819 if (copy_end > 0 && contents[copy_end-1] != '\n')
820 new_line = 1;
821
822 /* write the first part of the config */
823 if (copy_end > copy_begin) {
824 if (write_in_full(fd, contents + copy_begin,
825 copy_end - copy_begin) <
826 copy_end - copy_begin)
827 goto write_err_out;
828 if (new_line &&
829 write_in_full(fd, "\n", 1) != 1)
830 goto write_err_out;
831 }
832 copy_begin = store.offset[i];
833 }
834
835 /* write the pair (value == NULL means unset) */
836 if (value != NULL) {
837 if (store.state == START) {
838 if (!store_write_section(fd, key))
839 goto write_err_out;
840 }
841 if (!store_write_pair(fd, key, value))
842 goto write_err_out;
843 }
844
845 /* write the rest of the config */
846 if (copy_begin < contents_sz)
847 if (write_in_full(fd, contents + copy_begin,
848 contents_sz - copy_begin) <
849 contents_sz - copy_begin)
850 goto write_err_out;
851
852 munmap(contents, contents_sz);
853 }
854
855 ret = 0;
856
857out_free:
858 free(config_filename);
859 return ret;
860
861write_err_out:
862	ret = 4;	/* report the write failure to the caller */
863	goto out_free;
864}
865
866/*
867 * Call this to report error for your variable that should not
868 * get a boolean value (i.e. "[my] var" means "true").
869 */
870int config_error_nonbool(const char *var)
871{
872 return error("Missing value for '%s'", var);
873}
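The parser above accepts an INI-like syntax: '[section]' or '[section "subsection"]' headers, 'name = value' pairs (with \t, \n, \b, \\ and \" escapes, and double quotes to protect blanks and comment characters), plus ';' or '#' comments; every key reaches the config_fn_t callback lower-cased as "section.name" or "section.subsection.name". A minimal sketch of both sides, where every key name is invented purely for illustration:

	# ~/.perfconfig, also read from the system-wide ETC_PERFCONFIG file
	# and from ./config, in that order
	[tui "colors"]
		normal = black white	; a ';' or '#' starts a comment unless quoted

	/* a matching config_fn_t consumer; "tui.colors.normal" is a made-up key */
	static const char *tui_normal_color;

	static int tui_config_cb(const char *var, const char *value, void *cb)
	{
		if (!strcmp(var, "tui.colors.normal"))
			return perf_config_string(&tui_normal_color, var, value);
		return perf_default_config(var, value, cb);
	}

	static void tui_read_config(void)
	{
		perf_config(tui_config_cb, NULL);	/* system, then $HOME, then local */
	}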
diff --git a/Documentation/perf_counter/util/ctype.c b/Documentation/perf_counter/util/ctype.c
new file mode 100644
index 000000000000..b90ec004f29c
--- /dev/null
+++ b/Documentation/perf_counter/util/ctype.c
@@ -0,0 +1,26 @@
1/*
2 * Sane locale-independent, ASCII ctype.
3 *
4 * No surprises, and works with signed and unsigned chars.
5 */
6#include "cache.h"
7
8enum {
9 S = GIT_SPACE,
10 A = GIT_ALPHA,
11 D = GIT_DIGIT,
12 G = GIT_GLOB_SPECIAL, /* *, ?, [, \\ */
13	R = GIT_REGEX_SPECIAL, /* $, (, ), +, ., ^, {, | */
14};
15
16unsigned char sane_ctype[256] = {
17 0, 0, 0, 0, 0, 0, 0, 0, 0, S, S, 0, 0, S, 0, 0, /* 0.. 15 */
18 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16.. 31 */
19 S, 0, 0, 0, R, 0, 0, 0, R, R, G, R, 0, 0, R, 0, /* 32.. 47 */
20 D, D, D, D, D, D, D, D, D, D, 0, 0, 0, 0, 0, G, /* 48.. 63 */
21 0, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, /* 64.. 79 */
22 A, A, A, A, A, A, A, A, A, A, A, G, G, 0, R, 0, /* 80.. 95 */
23 0, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, /* 96..111 */
24 A, A, A, A, A, A, A, A, A, A, A, R, R, 0, 0, 0, /* 112..127 */
25 /* Nothing in the 128.. range */
26};
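The table is indexed with the unsigned character value and tested against the class bits above. The actual test macros live in util/cache.h and may be named differently, but the idea is a one-line, locale-independent classifier along these lines:

	/* sketch only; the real helpers are defined in util/cache.h */
	static inline int sane_istest(unsigned char c, unsigned int mask)
	{
		return (sane_ctype[c] & mask) != 0;
	}

	/* e.g. sane_istest('*', GIT_GLOB_SPECIAL) is non-zero, while
	   sane_istest(0xa0, GIT_SPACE) is 0 regardless of the current locale */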
diff --git a/Documentation/perf_counter/util/exec_cmd.c b/Documentation/perf_counter/util/exec_cmd.c
new file mode 100644
index 000000000000..d39292263153
--- /dev/null
+++ b/Documentation/perf_counter/util/exec_cmd.c
@@ -0,0 +1,165 @@
1#include "cache.h"
2#include "exec_cmd.h"
3#include "quote.h"
4#define MAX_ARGS 32
5
6extern char **environ;
7static const char *argv_exec_path;
8static const char *argv0_path;
9
10const char *system_path(const char *path)
11{
12#ifdef RUNTIME_PREFIX
13 static const char *prefix;
14#else
15 static const char *prefix = PREFIX;
16#endif
17 struct strbuf d = STRBUF_INIT;
18
19 if (is_absolute_path(path))
20 return path;
21
22#ifdef RUNTIME_PREFIX
23 assert(argv0_path);
24 assert(is_absolute_path(argv0_path));
25
26 if (!prefix &&
27 !(prefix = strip_path_suffix(argv0_path, PERF_EXEC_PATH)) &&
28 !(prefix = strip_path_suffix(argv0_path, BINDIR)) &&
29 !(prefix = strip_path_suffix(argv0_path, "perf"))) {
30 prefix = PREFIX;
31 fprintf(stderr, "RUNTIME_PREFIX requested, "
32 "but prefix computation failed. "
33 "Using static fallback '%s'.\n", prefix);
34 }
35#endif
36
37 strbuf_addf(&d, "%s/%s", prefix, path);
38 path = strbuf_detach(&d, NULL);
39 return path;
40}
41
42const char *perf_extract_argv0_path(const char *argv0)
43{
44 const char *slash;
45
46 if (!argv0 || !*argv0)
47 return NULL;
48 slash = argv0 + strlen(argv0);
49
50 while (argv0 <= slash && !is_dir_sep(*slash))
51 slash--;
52
53 if (slash >= argv0) {
54 argv0_path = strndup(argv0, slash - argv0);
55 return slash + 1;
56 }
57
58 return argv0;
59}
60
61void perf_set_argv_exec_path(const char *exec_path)
62{
63 argv_exec_path = exec_path;
64 /*
65 * Propagate this setting to external programs.
66 */
67 setenv(EXEC_PATH_ENVIRONMENT, exec_path, 1);
68}
69
70
71/* Returns the highest-priority location to look for perf programs. */
72const char *perf_exec_path(void)
73{
74 const char *env;
75
76 if (argv_exec_path)
77 return argv_exec_path;
78
79 env = getenv(EXEC_PATH_ENVIRONMENT);
80 if (env && *env) {
81 return env;
82 }
83
84 return system_path(PERF_EXEC_PATH);
85}
86
87static void add_path(struct strbuf *out, const char *path)
88{
89 if (path && *path) {
90 if (is_absolute_path(path))
91 strbuf_addstr(out, path);
92 else
93 strbuf_addstr(out, make_nonrelative_path(path));
94
95 strbuf_addch(out, PATH_SEP);
96 }
97}
98
99void setup_path(void)
100{
101 const char *old_path = getenv("PATH");
102 struct strbuf new_path = STRBUF_INIT;
103
104 add_path(&new_path, perf_exec_path());
105 add_path(&new_path, argv0_path);
106
107 if (old_path)
108 strbuf_addstr(&new_path, old_path);
109 else
110 strbuf_addstr(&new_path, "/usr/local/bin:/usr/bin:/bin");
111
112 setenv("PATH", new_path.buf, 1);
113
114 strbuf_release(&new_path);
115}
116
117const char **prepare_perf_cmd(const char **argv)
118{
119 int argc;
120 const char **nargv;
121
122 for (argc = 0; argv[argc]; argc++)
123 ; /* just counting */
124 nargv = malloc(sizeof(*nargv) * (argc + 2));
125
126 nargv[0] = "perf";
127 for (argc = 0; argv[argc]; argc++)
128 nargv[argc + 1] = argv[argc];
129 nargv[argc + 1] = NULL;
130 return nargv;
131}
132
133int execv_perf_cmd(const char **argv) {
134 const char **nargv = prepare_perf_cmd(argv);
135
136 /* execvp() can only ever return if it fails */
137 execvp("perf", (char **)nargv);
138
139 free(nargv);
140 return -1;
141}
142
143
144int execl_perf_cmd(const char *cmd,...)
145{
146 int argc;
147 const char *argv[MAX_ARGS + 1];
148 const char *arg;
149 va_list param;
150
151 va_start(param, cmd);
152 argv[0] = cmd;
153 argc = 1;
154 while (argc < MAX_ARGS) {
155 arg = argv[argc++] = va_arg(param, char *);
156 if (!arg)
157 break;
158 }
159 va_end(param);
160 if (MAX_ARGS <= argc)
161 return error("too many args to run %s", cmd);
162
163 argv[argc] = NULL;
164 return execv_perf_cmd(argv);
165}
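Taken together, perf_exec_path() resolves the command directory in priority order (a value set with perf_set_argv_exec_path(), then the EXEC_PATH_ENVIRONMENT variable, then the compiled-in system path), and execl_perf_cmd() re-runs the wrapper with that setup in place. A short usage sketch, with the sub-command names chosen only as an example:

	/* roughly equivalent to running "perf help record" through the wrapper */
	if (execl_perf_cmd("help", "record", NULL) < 0)
		die("unable to exec 'perf help'");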
diff --git a/Documentation/perf_counter/util/exec_cmd.h b/Documentation/perf_counter/util/exec_cmd.h
new file mode 100644
index 000000000000..effe25eb1545
--- /dev/null
+++ b/Documentation/perf_counter/util/exec_cmd.h
@@ -0,0 +1,13 @@
1#ifndef PERF_EXEC_CMD_H
2#define PERF_EXEC_CMD_H
3
4extern void perf_set_argv_exec_path(const char *exec_path);
5extern const char *perf_extract_argv0_path(const char *path);
6extern const char *perf_exec_path(void);
7extern void setup_path(void);
8extern const char **prepare_perf_cmd(const char **argv);
9extern int execv_perf_cmd(const char **argv); /* NULL terminated */
10extern int execl_perf_cmd(const char *cmd, ...);
11extern const char *system_path(const char *path);
12
13#endif /* PERF_EXEC_CMD_H */
diff --git a/Documentation/perf_counter/util/generate-cmdlist.sh b/Documentation/perf_counter/util/generate-cmdlist.sh
new file mode 100755
index 000000000000..f06f6fd148f8
--- /dev/null
+++ b/Documentation/perf_counter/util/generate-cmdlist.sh
@@ -0,0 +1,24 @@
1#!/bin/sh
2
3echo "/* Automatically generated by $0 */
4struct cmdname_help
5{
6 char name[16];
7 char help[80];
8};
9
10static struct cmdname_help common_cmds[] = {"
11
12sed -n -e 's/^perf-\([^ ]*\)[ ].* common.*/\1/p' command-list.txt |
13sort |
14while read cmd
15do
16 sed -n '
17 /^NAME/,/perf-'"$cmd"'/H
18 ${
19 x
20 s/.*perf-'"$cmd"' - \(.*\)/ {"'"$cmd"'", "\1"},/
21 p
22 }' "Documentation/perf-$cmd.txt"
23done
24echo "};"
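The script picks every command marked 'common' in command-list.txt and lifts its one-line description from the NAME section of the matching Documentation page, so the generated header looks roughly like this (the entries and their wording are illustrative):

	/* Automatically generated by util/generate-cmdlist.sh */
	struct cmdname_help
	{
	    char name[16];
	    char help[80];
	};

	static struct cmdname_help common_cmds[] = {
	    {"record", "Run a command and record its profile"},
	    {"stat", "Run a command and gather performance counter statistics"},
	};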
diff --git a/Documentation/perf_counter/util/help.c b/Documentation/perf_counter/util/help.c
new file mode 100644
index 000000000000..edde541d238d
--- /dev/null
+++ b/Documentation/perf_counter/util/help.c
@@ -0,0 +1,366 @@
1#include "cache.h"
2#include "../builtin.h"
3#include "exec_cmd.h"
4#include "levenshtein.h"
5#include "help.h"
6
7/* most GUI terminals set COLUMNS (although some don't export it) */
8static int term_columns(void)
9{
10 char *col_string = getenv("COLUMNS");
11 int n_cols;
12
13 if (col_string && (n_cols = atoi(col_string)) > 0)
14 return n_cols;
15
16#ifdef TIOCGWINSZ
17 {
18 struct winsize ws;
19 if (!ioctl(1, TIOCGWINSZ, &ws)) {
20 if (ws.ws_col)
21 return ws.ws_col;
22 }
23 }
24#endif
25
26 return 80;
27}
28
29void add_cmdname(struct cmdnames *cmds, const char *name, int len)
30{
31 struct cmdname *ent = malloc(sizeof(*ent) + len + 1);
32
33 ent->len = len;
34 memcpy(ent->name, name, len);
35 ent->name[len] = 0;
36
37 ALLOC_GROW(cmds->names, cmds->cnt + 1, cmds->alloc);
38 cmds->names[cmds->cnt++] = ent;
39}
40
41static void clean_cmdnames(struct cmdnames *cmds)
42{
43 int i;
44 for (i = 0; i < cmds->cnt; ++i)
45 free(cmds->names[i]);
46 free(cmds->names);
47 cmds->cnt = 0;
48 cmds->alloc = 0;
49}
50
51static int cmdname_compare(const void *a_, const void *b_)
52{
53 struct cmdname *a = *(struct cmdname **)a_;
54 struct cmdname *b = *(struct cmdname **)b_;
55 return strcmp(a->name, b->name);
56}
57
58static void uniq(struct cmdnames *cmds)
59{
60 int i, j;
61
62 if (!cmds->cnt)
63 return;
64
65 for (i = j = 1; i < cmds->cnt; i++)
66 if (strcmp(cmds->names[i]->name, cmds->names[i-1]->name))
67 cmds->names[j++] = cmds->names[i];
68
69 cmds->cnt = j;
70}
71
72void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes)
73{
74 int ci, cj, ei;
75 int cmp;
76
77 ci = cj = ei = 0;
78 while (ci < cmds->cnt && ei < excludes->cnt) {
79 cmp = strcmp(cmds->names[ci]->name, excludes->names[ei]->name);
80 if (cmp < 0)
81 cmds->names[cj++] = cmds->names[ci++];
82 else if (cmp == 0)
83 ci++, ei++;
84 else if (cmp > 0)
85 ei++;
86 }
87
88 while (ci < cmds->cnt)
89 cmds->names[cj++] = cmds->names[ci++];
90
91 cmds->cnt = cj;
92}
93
94static void pretty_print_string_list(struct cmdnames *cmds, int longest)
95{
96 int cols = 1, rows;
97 int space = longest + 1; /* min 1 SP between words */
98 int max_cols = term_columns() - 1; /* don't print *on* the edge */
99 int i, j;
100
101 if (space < max_cols)
102 cols = max_cols / space;
103 rows = (cmds->cnt + cols - 1) / cols;
104
105 for (i = 0; i < rows; i++) {
106 printf(" ");
107
108 for (j = 0; j < cols; j++) {
109 int n = j * rows + i;
110 int size = space;
111 if (n >= cmds->cnt)
112 break;
113 if (j == cols-1 || n + rows >= cmds->cnt)
114 size = 1;
115 printf("%-*s", size, cmds->names[n]->name);
116 }
117 putchar('\n');
118 }
119}
120
121static int is_executable(const char *name)
122{
123 struct stat st;
124
125 if (stat(name, &st) || /* stat, not lstat */
126 !S_ISREG(st.st_mode))
127 return 0;
128
129#ifdef __MINGW32__
130 /* cannot trust the executable bit, peek into the file instead */
131 char buf[3] = { 0 };
132 int n;
133 int fd = open(name, O_RDONLY);
134 st.st_mode &= ~S_IXUSR;
135 if (fd >= 0) {
136 n = read(fd, buf, 2);
137 if (n == 2)
138 /* DOS executables start with "MZ" */
139 if (!strcmp(buf, "#!") || !strcmp(buf, "MZ"))
140 st.st_mode |= S_IXUSR;
141 close(fd);
142 }
143#endif
144 return st.st_mode & S_IXUSR;
145}
146
147static void list_commands_in_dir(struct cmdnames *cmds,
148 const char *path,
149 const char *prefix)
150{
151 int prefix_len;
152 DIR *dir = opendir(path);
153 struct dirent *de;
154 struct strbuf buf = STRBUF_INIT;
155 int len;
156
157 if (!dir)
158 return;
159 if (!prefix)
160 prefix = "perf-";
161 prefix_len = strlen(prefix);
162
163 strbuf_addf(&buf, "%s/", path);
164 len = buf.len;
165
166 while ((de = readdir(dir)) != NULL) {
167 int entlen;
168
169 if (prefixcmp(de->d_name, prefix))
170 continue;
171
172 strbuf_setlen(&buf, len);
173 strbuf_addstr(&buf, de->d_name);
174 if (!is_executable(buf.buf))
175 continue;
176
177 entlen = strlen(de->d_name) - prefix_len;
178 if (has_extension(de->d_name, ".exe"))
179 entlen -= 4;
180
181 add_cmdname(cmds, de->d_name + prefix_len, entlen);
182 }
183 closedir(dir);
184 strbuf_release(&buf);
185}
186
187void load_command_list(const char *prefix,
188 struct cmdnames *main_cmds,
189 struct cmdnames *other_cmds)
190{
191 const char *env_path = getenv("PATH");
192 const char *exec_path = perf_exec_path();
193
194 if (exec_path) {
195 list_commands_in_dir(main_cmds, exec_path, prefix);
196 qsort(main_cmds->names, main_cmds->cnt,
197 sizeof(*main_cmds->names), cmdname_compare);
198 uniq(main_cmds);
199 }
200
201 if (env_path) {
202 char *paths, *path, *colon;
203 path = paths = strdup(env_path);
204 while (1) {
205 if ((colon = strchr(path, PATH_SEP)))
206 *colon = 0;
207 if (!exec_path || strcmp(path, exec_path))
208 list_commands_in_dir(other_cmds, path, prefix);
209
210 if (!colon)
211 break;
212 path = colon + 1;
213 }
214 free(paths);
215
216 qsort(other_cmds->names, other_cmds->cnt,
217 sizeof(*other_cmds->names), cmdname_compare);
218 uniq(other_cmds);
219 }
220 exclude_cmds(other_cmds, main_cmds);
221}
222
223void list_commands(const char *title, struct cmdnames *main_cmds,
224 struct cmdnames *other_cmds)
225{
226 int i, longest = 0;
227
228 for (i = 0; i < main_cmds->cnt; i++)
229 if (longest < main_cmds->names[i]->len)
230 longest = main_cmds->names[i]->len;
231 for (i = 0; i < other_cmds->cnt; i++)
232 if (longest < other_cmds->names[i]->len)
233 longest = other_cmds->names[i]->len;
234
235 if (main_cmds->cnt) {
236 const char *exec_path = perf_exec_path();
237 printf("available %s in '%s'\n", title, exec_path);
238 printf("----------------");
239 mput_char('-', strlen(title) + strlen(exec_path));
240 putchar('\n');
241 pretty_print_string_list(main_cmds, longest);
242 putchar('\n');
243 }
244
245 if (other_cmds->cnt) {
246 printf("%s available from elsewhere on your $PATH\n", title);
247 printf("---------------------------------------");
248 mput_char('-', strlen(title));
249 putchar('\n');
250 pretty_print_string_list(other_cmds, longest);
251 putchar('\n');
252 }
253}
254
255int is_in_cmdlist(struct cmdnames *c, const char *s)
256{
257 int i;
258 for (i = 0; i < c->cnt; i++)
259 if (!strcmp(s, c->names[i]->name))
260 return 1;
261 return 0;
262}
263
264static int autocorrect;
265static struct cmdnames aliases;
266
267static int perf_unknown_cmd_config(const char *var, const char *value, void *cb)
268{
269 if (!strcmp(var, "help.autocorrect"))
270 autocorrect = perf_config_int(var,value);
271 /* Also use aliases for command lookup */
272 if (!prefixcmp(var, "alias."))
273 add_cmdname(&aliases, var + 6, strlen(var + 6));
274
275 return perf_default_config(var, value, cb);
276}
277
278static int levenshtein_compare(const void *p1, const void *p2)
279{
280 const struct cmdname *const *c1 = p1, *const *c2 = p2;
281 const char *s1 = (*c1)->name, *s2 = (*c2)->name;
282 int l1 = (*c1)->len;
283 int l2 = (*c2)->len;
284 return l1 != l2 ? l1 - l2 : strcmp(s1, s2);
285}
286
287static void add_cmd_list(struct cmdnames *cmds, struct cmdnames *old)
288{
289 int i;
290 ALLOC_GROW(cmds->names, cmds->cnt + old->cnt, cmds->alloc);
291
292 for (i = 0; i < old->cnt; i++)
293 cmds->names[cmds->cnt++] = old->names[i];
294 free(old->names);
295 old->cnt = 0;
296 old->names = NULL;
297}
298
299const char *help_unknown_cmd(const char *cmd)
300{
301 int i, n, best_similarity = 0;
302 struct cmdnames main_cmds, other_cmds;
303
304 memset(&main_cmds, 0, sizeof(main_cmds));
305	memset(&other_cmds, 0, sizeof(other_cmds));
306 memset(&aliases, 0, sizeof(aliases));
307
308 perf_config(perf_unknown_cmd_config, NULL);
309
310 load_command_list("perf-", &main_cmds, &other_cmds);
311
312 add_cmd_list(&main_cmds, &aliases);
313 add_cmd_list(&main_cmds, &other_cmds);
314 qsort(main_cmds.names, main_cmds.cnt,
315	      sizeof(*main_cmds->names), cmdname_compare);
316 uniq(&main_cmds);
317
318 /* This reuses cmdname->len for similarity index */
319 for (i = 0; i < main_cmds.cnt; ++i)
320 main_cmds.names[i]->len =
321 levenshtein(cmd, main_cmds.names[i]->name, 0, 2, 1, 4);
322
323 qsort(main_cmds.names, main_cmds.cnt,
324 sizeof(*main_cmds.names), levenshtein_compare);
325
326 if (!main_cmds.cnt)
327		die("Uh oh. Your system reports no perf commands at all.");
328
329 best_similarity = main_cmds.names[0]->len;
330 n = 1;
331 while (n < main_cmds.cnt && best_similarity == main_cmds.names[n]->len)
332 ++n;
333 if (autocorrect && n == 1) {
334 const char *assumed = main_cmds.names[0]->name;
335 main_cmds.names[0] = NULL;
336 clean_cmdnames(&main_cmds);
337		fprintf(stderr, "WARNING: You called a perf program named '%s', "
338 "which does not exist.\n"
339 "Continuing under the assumption that you meant '%s'\n",
340 cmd, assumed);
341 if (autocorrect > 0) {
342 fprintf(stderr, "in %0.1f seconds automatically...\n",
343 (float)autocorrect/10.0);
344 poll(NULL, 0, autocorrect * 100);
345 }
346 return assumed;
347 }
348
349 fprintf(stderr, "perf: '%s' is not a perf-command. See 'perf --help'.\n", cmd);
350
351 if (best_similarity < 6) {
352 fprintf(stderr, "\nDid you mean %s?\n",
353 n < 2 ? "this": "one of these");
354
355 for (i = 0; i < n; i++)
356 fprintf(stderr, "\t%s\n", main_cmds.names[i]->name);
357 }
358
359 exit(1);
360}
361
362int cmd_version(int argc, const char **argv, const char *prefix)
363{
364 printf("perf version %s\n", perf_version_string);
365 return 0;
366}
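help_unknown_cmd() consults two configuration keys via perf_config(): a positive help.autocorrect value makes it run the single best-scoring guess after value/10 seconds, and every alias.* name joins the candidate list; suggestions are only printed for candidates with a similarity score below 6. In .perfconfig terms (the values are just an example):

	[help]
		autocorrect = 10	# run the single best guess after 1.0 seconds
	[alias]
		rec = record		# "rec" is offered as a suggestion, too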
diff --git a/Documentation/perf_counter/util/help.h b/Documentation/perf_counter/util/help.h
new file mode 100644
index 000000000000..56bc15406ffc
--- /dev/null
+++ b/Documentation/perf_counter/util/help.h
@@ -0,0 +1,29 @@
1#ifndef HELP_H
2#define HELP_H
3
4struct cmdnames {
5 int alloc;
6 int cnt;
7 struct cmdname {
8 size_t len; /* also used for similarity index in help.c */
9 char name[FLEX_ARRAY];
10 } **names;
11};
12
13static inline void mput_char(char c, unsigned int num)
14{
15 while(num--)
16 putchar(c);
17}
18
19void load_command_list(const char *prefix,
20 struct cmdnames *main_cmds,
21 struct cmdnames *other_cmds);
22void add_cmdname(struct cmdnames *cmds, const char *name, int len);
23/* Here we require that excludes is a sorted list. */
24void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes);
25int is_in_cmdlist(struct cmdnames *c, const char *s);
26void list_commands(const char *title, struct cmdnames *main_cmds,
27 struct cmdnames *other_cmds);
28
29#endif /* HELP_H */
diff --git a/Documentation/perf_counter/util/levenshtein.c b/Documentation/perf_counter/util/levenshtein.c
new file mode 100644
index 000000000000..e521d1516df6
--- /dev/null
+++ b/Documentation/perf_counter/util/levenshtein.c
@@ -0,0 +1,84 @@
1#include "cache.h"
2#include "levenshtein.h"
3
4/*
5 * This function implements the Damerau-Levenshtein algorithm to
6 * calculate a distance between strings.
7 *
8 * Basically, it says how many letters need to be swapped, substituted,
9 * deleted from, or added to string1, at least, to get string2.
10 *
11 * The idea is to build a distance matrix for the substrings of both
12 * strings. To avoid a large space complexity, only the last three rows
13 * are kept in memory (if swaps had the same or higher cost as one deletion
14 * plus one insertion, only two rows would be needed).
15 *
16 * At any stage, "i + 1" denotes the length of the current substring of
17 * string1 that the distance is calculated for.
18 *
19 * row2 holds the current row, row1 the previous row (i.e. for the substring
20 * of string1 of length "i"), and row0 the row before that.
21 *
22 * In other words, at the start of the big loop, row2[j + 1] contains the
23 * Damerau-Levenshtein distance between the substring of string1 of length
24 * "i" and the substring of string2 of length "j + 1".
25 *
26 * All the big loop does is determine the partial minimum-cost paths.
27 *
28 * It does so by calculating the costs of the path ending in characters
29 * i (in string1) and j (in string2), respectively, given that the last
30 * operation is a substitution, a swap, a deletion, or an insertion.
31 *
32 * This implementation allows the costs to be weighted:
33 *
34 * - w (as in "sWap")
35 * - s (as in "Substitution")
36 * - a (for insertion, AKA "Add")
37 * - d (as in "Deletion")
38 *
39 * Note that this algorithm calculates a distance _iff_ d == a.
40 */
41int levenshtein(const char *string1, const char *string2,
42 int w, int s, int a, int d)
43{
44 int len1 = strlen(string1), len2 = strlen(string2);
45 int *row0 = malloc(sizeof(int) * (len2 + 1));
46 int *row1 = malloc(sizeof(int) * (len2 + 1));
47 int *row2 = malloc(sizeof(int) * (len2 + 1));
48 int i, j;
49
50 for (j = 0; j <= len2; j++)
51 row1[j] = j * a;
52 for (i = 0; i < len1; i++) {
53 int *dummy;
54
55 row2[0] = (i + 1) * d;
56 for (j = 0; j < len2; j++) {
57 /* substitution */
58 row2[j + 1] = row1[j] + s * (string1[i] != string2[j]);
59 /* swap */
60 if (i > 0 && j > 0 && string1[i - 1] == string2[j] &&
61 string1[i] == string2[j - 1] &&
62 row2[j + 1] > row0[j - 1] + w)
63 row2[j + 1] = row0[j - 1] + w;
64 /* deletion */
65 if (row2[j + 1] > row1[j + 1] + d)
66 row2[j + 1] = row1[j + 1] + d;
67 /* insertion */
68 if (row2[j + 1] > row2[j] + a)
69 row2[j + 1] = row2[j] + a;
70 }
71
72 dummy = row0;
73 row0 = row1;
74 row1 = row2;
75 row2 = dummy;
76 }
77
78 i = row1[len2];
79 free(row0);
80 free(row1);
81 free(row2);
82
83 return i;
84}
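help_unknown_cmd() calls this with the weights (w, s, a, d) = (0, 2, 1, 4): transpositions are free, a missing character costs 1, a wrong character 2 and an extra character 4, and only candidate names scoring below 6 get suggested. A few distances under those weights, as a self-contained check:

	#include <stdio.h>
	#include "levenshtein.h"

	/* same weights as help_unknown_cmd(): swap=0, substitute=2, insert=1, delete=4 */
	int main(void)
	{
		printf("%d\n", levenshtein("stta",  "stat", 0, 2, 1, 4)); /* 0: transposed letters */
		printf("%d\n", levenshtein("sta",   "stat", 0, 2, 1, 4)); /* 1: one letter missing */
		printf("%d\n", levenshtein("stet",  "stat", 0, 2, 1, 4)); /* 2: one wrong letter   */
		printf("%d\n", levenshtein("statt", "stat", 0, 2, 1, 4)); /* 4: one extra letter   */
		return 0;
	}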
diff --git a/Documentation/perf_counter/util/levenshtein.h b/Documentation/perf_counter/util/levenshtein.h
new file mode 100644
index 000000000000..0173abeef52c
--- /dev/null
+++ b/Documentation/perf_counter/util/levenshtein.h
@@ -0,0 +1,8 @@
1#ifndef LEVENSHTEIN_H
2#define LEVENSHTEIN_H
3
4int levenshtein(const char *string1, const char *string2,
5		int swap_penalty, int substitution_penalty,
6 int insertion_penalty, int deletion_penalty);
7
8#endif
diff --git a/Documentation/perf_counter/util/parse-options.c b/Documentation/perf_counter/util/parse-options.c
new file mode 100644
index 000000000000..28b34c1c29cf
--- /dev/null
+++ b/Documentation/perf_counter/util/parse-options.c
@@ -0,0 +1,492 @@
1#include "util.h"
2#include "parse-options.h"
3#include "cache.h"
4
5#define OPT_SHORT 1
6#define OPT_UNSET 2
7
8static int opterror(const struct option *opt, const char *reason, int flags)
9{
10 if (flags & OPT_SHORT)
11 return error("switch `%c' %s", opt->short_name, reason);
12 if (flags & OPT_UNSET)
13 return error("option `no-%s' %s", opt->long_name, reason);
14 return error("option `%s' %s", opt->long_name, reason);
15}
16
17static int get_arg(struct parse_opt_ctx_t *p, const struct option *opt,
18 int flags, const char **arg)
19{
20 if (p->opt) {
21 *arg = p->opt;
22 p->opt = NULL;
23 } else if (p->argc == 1 && (opt->flags & PARSE_OPT_LASTARG_DEFAULT)) {
24 *arg = (const char *)opt->defval;
25 } else if (p->argc > 1) {
26 p->argc--;
27 *arg = *++p->argv;
28 } else
29 return opterror(opt, "requires a value", flags);
30 return 0;
31}
32
33static int get_value(struct parse_opt_ctx_t *p,
34 const struct option *opt, int flags)
35{
36 const char *s, *arg;
37 const int unset = flags & OPT_UNSET;
38
39 if (unset && p->opt)
40 return opterror(opt, "takes no value", flags);
41 if (unset && (opt->flags & PARSE_OPT_NONEG))
42 return opterror(opt, "isn't available", flags);
43
44 if (!(flags & OPT_SHORT) && p->opt) {
45 switch (opt->type) {
46 case OPTION_CALLBACK:
47 if (!(opt->flags & PARSE_OPT_NOARG))
48 break;
49 /* FALLTHROUGH */
50 case OPTION_BOOLEAN:
51 case OPTION_BIT:
52 case OPTION_SET_INT:
53 case OPTION_SET_PTR:
54 return opterror(opt, "takes no value", flags);
55 default:
56 break;
57 }
58 }
59
60 switch (opt->type) {
61 case OPTION_BIT:
62 if (unset)
63 *(int *)opt->value &= ~opt->defval;
64 else
65 *(int *)opt->value |= opt->defval;
66 return 0;
67
68 case OPTION_BOOLEAN:
69 *(int *)opt->value = unset ? 0 : *(int *)opt->value + 1;
70 return 0;
71
72 case OPTION_SET_INT:
73 *(int *)opt->value = unset ? 0 : opt->defval;
74 return 0;
75
76 case OPTION_SET_PTR:
77 *(void **)opt->value = unset ? NULL : (void *)opt->defval;
78 return 0;
79
80 case OPTION_STRING:
81 if (unset)
82 *(const char **)opt->value = NULL;
83 else if (opt->flags & PARSE_OPT_OPTARG && !p->opt)
84 *(const char **)opt->value = (const char *)opt->defval;
85 else
86 return get_arg(p, opt, flags, (const char **)opt->value);
87 return 0;
88
89 case OPTION_CALLBACK:
90 if (unset)
91 return (*opt->callback)(opt, NULL, 1) ? (-1) : 0;
92 if (opt->flags & PARSE_OPT_NOARG)
93 return (*opt->callback)(opt, NULL, 0) ? (-1) : 0;
94 if (opt->flags & PARSE_OPT_OPTARG && !p->opt)
95 return (*opt->callback)(opt, NULL, 0) ? (-1) : 0;
96 if (get_arg(p, opt, flags, &arg))
97 return -1;
98 return (*opt->callback)(opt, arg, 0) ? (-1) : 0;
99
100 case OPTION_INTEGER:
101 if (unset) {
102 *(int *)opt->value = 0;
103 return 0;
104 }
105 if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
106 *(int *)opt->value = opt->defval;
107 return 0;
108 }
109 if (get_arg(p, opt, flags, &arg))
110 return -1;
111 *(int *)opt->value = strtol(arg, (char **)&s, 10);
112 if (*s)
113 return opterror(opt, "expects a numerical value", flags);
114 return 0;
115
116 default:
117 die("should not happen, someone must be hit on the forehead");
118 }
119}
120
121static int parse_short_opt(struct parse_opt_ctx_t *p, const struct option *options)
122{
123 for (; options->type != OPTION_END; options++) {
124 if (options->short_name == *p->opt) {
125 p->opt = p->opt[1] ? p->opt + 1 : NULL;
126 return get_value(p, options, OPT_SHORT);
127 }
128 }
129 return -2;
130}
131
132static int parse_long_opt(struct parse_opt_ctx_t *p, const char *arg,
133 const struct option *options)
134{
135 const char *arg_end = strchr(arg, '=');
136 const struct option *abbrev_option = NULL, *ambiguous_option = NULL;
137 int abbrev_flags = 0, ambiguous_flags = 0;
138
139 if (!arg_end)
140 arg_end = arg + strlen(arg);
141
142 for (; options->type != OPTION_END; options++) {
143 const char *rest;
144 int flags = 0;
145
146 if (!options->long_name)
147 continue;
148
149 rest = skip_prefix(arg, options->long_name);
150 if (options->type == OPTION_ARGUMENT) {
151 if (!rest)
152 continue;
153 if (*rest == '=')
154 return opterror(options, "takes no value", flags);
155 if (*rest)
156 continue;
157 p->out[p->cpidx++] = arg - 2;
158 return 0;
159 }
160 if (!rest) {
161 /* abbreviated? */
162 if (!strncmp(options->long_name, arg, arg_end - arg)) {
163is_abbreviated:
164 if (abbrev_option) {
165 /*
166 * If this is abbreviated, it is
167 * ambiguous. So when there is no
168 * exact match later, we need to
169 * error out.
170 */
171 ambiguous_option = abbrev_option;
172 ambiguous_flags = abbrev_flags;
173 }
174 if (!(flags & OPT_UNSET) && *arg_end)
175 p->opt = arg_end + 1;
176 abbrev_option = options;
177 abbrev_flags = flags;
178 continue;
179 }
180 /* negated and abbreviated very much? */
181 if (!prefixcmp("no-", arg)) {
182 flags |= OPT_UNSET;
183 goto is_abbreviated;
184 }
185 /* negated? */
186 if (strncmp(arg, "no-", 3))
187 continue;
188 flags |= OPT_UNSET;
189 rest = skip_prefix(arg + 3, options->long_name);
190 /* abbreviated and negated? */
191 if (!rest && !prefixcmp(options->long_name, arg + 3))
192 goto is_abbreviated;
193 if (!rest)
194 continue;
195 }
196 if (*rest) {
197 if (*rest != '=')
198 continue;
199 p->opt = rest + 1;
200 }
201 return get_value(p, options, flags);
202 }
203
204 if (ambiguous_option)
205 return error("Ambiguous option: %s "
206 "(could be --%s%s or --%s%s)",
207 arg,
208 (ambiguous_flags & OPT_UNSET) ? "no-" : "",
209 ambiguous_option->long_name,
210 (abbrev_flags & OPT_UNSET) ? "no-" : "",
211 abbrev_option->long_name);
212 if (abbrev_option)
213 return get_value(p, abbrev_option, abbrev_flags);
214 return -2;
215}
216
217static void check_typos(const char *arg, const struct option *options)
218{
219 if (strlen(arg) < 3)
220 return;
221
222 if (!prefixcmp(arg, "no-")) {
223		error("did you mean `--%s` (with two dashes?)", arg);
224 exit(129);
225 }
226
227 for (; options->type != OPTION_END; options++) {
228 if (!options->long_name)
229 continue;
230 if (!prefixcmp(options->long_name, arg)) {
231			error("did you mean `--%s` (with two dashes?)", arg);
232 exit(129);
233 }
234 }
235}
236
237void parse_options_start(struct parse_opt_ctx_t *ctx,
238 int argc, const char **argv, int flags)
239{
240 memset(ctx, 0, sizeof(*ctx));
241 ctx->argc = argc - 1;
242 ctx->argv = argv + 1;
243 ctx->out = argv;
244 ctx->cpidx = ((flags & PARSE_OPT_KEEP_ARGV0) != 0);
245 ctx->flags = flags;
246 if ((flags & PARSE_OPT_KEEP_UNKNOWN) &&
247 (flags & PARSE_OPT_STOP_AT_NON_OPTION))
248 die("STOP_AT_NON_OPTION and KEEP_UNKNOWN don't go together");
249}
250
251static int usage_with_options_internal(const char * const *,
252 const struct option *, int);
253
254int parse_options_step(struct parse_opt_ctx_t *ctx,
255 const struct option *options,
256 const char * const usagestr[])
257{
258 int internal_help = !(ctx->flags & PARSE_OPT_NO_INTERNAL_HELP);
259
260	/* we must reset ->opt; an unknown short option leaves it dangling */
261 ctx->opt = NULL;
262
263 for (; ctx->argc; ctx->argc--, ctx->argv++) {
264 const char *arg = ctx->argv[0];
265
266 if (*arg != '-' || !arg[1]) {
267 if (ctx->flags & PARSE_OPT_STOP_AT_NON_OPTION)
268 break;
269 ctx->out[ctx->cpidx++] = ctx->argv[0];
270 continue;
271 }
272
273 if (arg[1] != '-') {
274 ctx->opt = arg + 1;
275 if (internal_help && *ctx->opt == 'h')
276 return parse_options_usage(usagestr, options);
277 switch (parse_short_opt(ctx, options)) {
278 case -1:
279 return parse_options_usage(usagestr, options);
280 case -2:
281 goto unknown;
282 }
283 if (ctx->opt)
284 check_typos(arg + 1, options);
285 while (ctx->opt) {
286 if (internal_help && *ctx->opt == 'h')
287 return parse_options_usage(usagestr, options);
288 switch (parse_short_opt(ctx, options)) {
289 case -1:
290 return parse_options_usage(usagestr, options);
291 case -2:
292 /* fake a short option thing to hide the fact that we may have
293 * started to parse aggregated stuff
294 *
295 * This is leaky, too bad.
296 */
297 ctx->argv[0] = strdup(ctx->opt - 1);
298 *(char *)ctx->argv[0] = '-';
299 goto unknown;
300 }
301 }
302 continue;
303 }
304
305 if (!arg[2]) { /* "--" */
306 if (!(ctx->flags & PARSE_OPT_KEEP_DASHDASH)) {
307 ctx->argc--;
308 ctx->argv++;
309 }
310 break;
311 }
312
313 if (internal_help && !strcmp(arg + 2, "help-all"))
314 return usage_with_options_internal(usagestr, options, 1);
315 if (internal_help && !strcmp(arg + 2, "help"))
316 return parse_options_usage(usagestr, options);
317 switch (parse_long_opt(ctx, arg + 2, options)) {
318 case -1:
319 return parse_options_usage(usagestr, options);
320 case -2:
321 goto unknown;
322 }
323 continue;
324unknown:
325 if (!(ctx->flags & PARSE_OPT_KEEP_UNKNOWN))
326 return PARSE_OPT_UNKNOWN;
327 ctx->out[ctx->cpidx++] = ctx->argv[0];
328 ctx->opt = NULL;
329 }
330 return PARSE_OPT_DONE;
331}
332
333int parse_options_end(struct parse_opt_ctx_t *ctx)
334{
335 memmove(ctx->out + ctx->cpidx, ctx->argv, ctx->argc * sizeof(*ctx->out));
336 ctx->out[ctx->cpidx + ctx->argc] = NULL;
337 return ctx->cpidx + ctx->argc;
338}
339
340int parse_options(int argc, const char **argv, const struct option *options,
341 const char * const usagestr[], int flags)
342{
343 struct parse_opt_ctx_t ctx;
344
345 parse_options_start(&ctx, argc, argv, flags);
346 switch (parse_options_step(&ctx, options, usagestr)) {
347 case PARSE_OPT_HELP:
348 exit(129);
349 case PARSE_OPT_DONE:
350 break;
351 default: /* PARSE_OPT_UNKNOWN */
352 if (ctx.argv[0][1] == '-') {
353 error("unknown option `%s'", ctx.argv[0] + 2);
354 } else {
355 error("unknown switch `%c'", *ctx.opt);
356 }
357 usage_with_options(usagestr, options);
358 }
359
360 return parse_options_end(&ctx);
361}
362
363#define USAGE_OPTS_WIDTH 24
364#define USAGE_GAP 2
365
366static int usage_with_options_internal(const char * const *usagestr,
367 const struct option *opts, int full)
368{
369 if (!usagestr)
370 return PARSE_OPT_HELP;
371
372 fprintf(stderr, "usage: %s\n", *usagestr++);
373 while (*usagestr && **usagestr)
374 fprintf(stderr, " or: %s\n", *usagestr++);
375 while (*usagestr) {
376 fprintf(stderr, "%s%s\n",
377 **usagestr ? " " : "",
378 *usagestr);
379 usagestr++;
380 }
381
382 if (opts->type != OPTION_GROUP)
383 fputc('\n', stderr);
384
385 for (; opts->type != OPTION_END; opts++) {
386 size_t pos;
387 int pad;
388
389 if (opts->type == OPTION_GROUP) {
390 fputc('\n', stderr);
391 if (*opts->help)
392 fprintf(stderr, "%s\n", opts->help);
393 continue;
394 }
395 if (!full && (opts->flags & PARSE_OPT_HIDDEN))
396 continue;
397
398 pos = fprintf(stderr, " ");
399 if (opts->short_name)
400 pos += fprintf(stderr, "-%c", opts->short_name);
401 if (opts->long_name && opts->short_name)
402 pos += fprintf(stderr, ", ");
403 if (opts->long_name)
404 pos += fprintf(stderr, "--%s", opts->long_name);
405
406 switch (opts->type) {
407 case OPTION_ARGUMENT:
408 break;
409 case OPTION_INTEGER:
410		if (opts->flags & PARSE_OPT_OPTARG) {
411			if (opts->long_name)
412				pos += fprintf(stderr, "[=<n>]");
413			else
414				pos += fprintf(stderr, "[<n>]");
415		} else
416			pos += fprintf(stderr, " <n>");
417 break;
418 case OPTION_CALLBACK:
419 if (opts->flags & PARSE_OPT_NOARG)
420 break;
421 /* FALLTHROUGH */
422 case OPTION_STRING:
423 if (opts->argh) {
424			if (opts->flags & PARSE_OPT_OPTARG) {
425				if (opts->long_name)
426					pos += fprintf(stderr, "[=<%s>]", opts->argh);
427				else
428					pos += fprintf(stderr, "[<%s>]", opts->argh);
429			} else
430				pos += fprintf(stderr, " <%s>", opts->argh);
431		} else {
432			if (opts->flags & PARSE_OPT_OPTARG) {
433				if (opts->long_name)
434					pos += fprintf(stderr, "[=...]");
435				else
436					pos += fprintf(stderr, "[...]");
437			} else
438				pos += fprintf(stderr, " ...");
439 }
440 break;
441 default: /* OPTION_{BIT,BOOLEAN,SET_INT,SET_PTR} */
442 break;
443 }
444
445 if (pos <= USAGE_OPTS_WIDTH)
446 pad = USAGE_OPTS_WIDTH - pos;
447 else {
448 fputc('\n', stderr);
449 pad = USAGE_OPTS_WIDTH;
450 }
451 fprintf(stderr, "%*s%s\n", pad + USAGE_GAP, "", opts->help);
452 }
453 fputc('\n', stderr);
454
455 return PARSE_OPT_HELP;
456}
457
458void usage_with_options(const char * const *usagestr,
459 const struct option *opts)
460{
461 usage_with_options_internal(usagestr, opts, 0);
462 exit(129);
463}
464
465int parse_options_usage(const char * const *usagestr,
466 const struct option *opts)
467{
468 return usage_with_options_internal(usagestr, opts, 0);
469}
470
471
472int parse_opt_verbosity_cb(const struct option *opt, const char *arg,
473 int unset)
474{
475 int *target = opt->value;
476
477 if (unset)
478 /* --no-quiet, --no-verbose */
479 *target = 0;
480 else if (opt->short_name == 'v') {
481 if (*target >= 0)
482 (*target)++;
483 else
484 *target = 1;
485 } else {
486 if (*target <= 0)
487 (*target)--;
488 else
489 *target = -1;
490 }
491 return 0;
492}
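To make the long-option matching above concrete: for a single entry such as OPT_BOOLEAN('f', "force", &force, "force it"), and assuming no other long name shares its prefix, parse_long_opt() accepts all of these spellings:

	--force		increments force
	--no-force	resets force to 0 (the OPT_UNSET path)
	--forc, --fo	unambiguous abbreviations of --force
	--no-fo		an abbreviated negation
	--force=yes	rejected with "option `force' takes no value"

An abbreviation that could expand to more than one long name is reported as ambiguous instead of being guessed at.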
diff --git a/Documentation/perf_counter/util/parse-options.h b/Documentation/perf_counter/util/parse-options.h
new file mode 100644
index 000000000000..a81c7faff68e
--- /dev/null
+++ b/Documentation/perf_counter/util/parse-options.h
@@ -0,0 +1,172 @@
1#ifndef PARSE_OPTIONS_H
2#define PARSE_OPTIONS_H
3
4enum parse_opt_type {
5 /* special types */
6 OPTION_END,
7 OPTION_ARGUMENT,
8 OPTION_GROUP,
9 /* options with no arguments */
10 OPTION_BIT,
11 OPTION_BOOLEAN, /* _INCR would have been a better name */
12 OPTION_SET_INT,
13 OPTION_SET_PTR,
14 /* options with arguments (usually) */
15 OPTION_STRING,
16 OPTION_INTEGER,
17 OPTION_CALLBACK,
18};
19
20enum parse_opt_flags {
21 PARSE_OPT_KEEP_DASHDASH = 1,
22 PARSE_OPT_STOP_AT_NON_OPTION = 2,
23 PARSE_OPT_KEEP_ARGV0 = 4,
24 PARSE_OPT_KEEP_UNKNOWN = 8,
25 PARSE_OPT_NO_INTERNAL_HELP = 16,
26};
27
28enum parse_opt_option_flags {
29 PARSE_OPT_OPTARG = 1,
30 PARSE_OPT_NOARG = 2,
31 PARSE_OPT_NONEG = 4,
32 PARSE_OPT_HIDDEN = 8,
33 PARSE_OPT_LASTARG_DEFAULT = 16,
34};
35
36struct option;
37typedef int parse_opt_cb(const struct option *, const char *arg, int unset);
38
39/*
40 * `type`::
41 * holds the type of the option, you must have an OPTION_END last in your
42 * array.
43 *
44 * `short_name`::
45 * the character to use as a short option name, '\0' if none.
46 *
47 * `long_name`::
48 * the long option name, without the leading dashes, NULL if none.
49 *
50 * `value`::
51 * stores pointers to the values to be filled.
52 *
53 * `argh`::
54 * token to explain the kind of argument this option wants. Keep it
55 *   homogeneous across the repository.
56 *
57 * `help`::
58 * the short help associated to what the option does.
59 * Must never be NULL (except for OPTION_END).
60 * OPTION_GROUP uses this pointer to store the group header.
61 *
62 * `flags`::
63 * mask of parse_opt_option_flags.
64 *   PARSE_OPT_OPTARG: says that the argument is optional (not for BOOLEANs)
65 * PARSE_OPT_NOARG: says that this option takes no argument, for CALLBACKs
66 * PARSE_OPT_NONEG: says that this option cannot be negated
67 *   PARSE_OPT_HIDDEN: this option is skipped in the default usage and only
68 *   shown in the full (--help-all) usage.
69 *
70 * `callback`::
71 * pointer to the callback to use for OPTION_CALLBACK.
72 *
73 * `defval`::
74 * default value to fill (*->value) with for PARSE_OPT_OPTARG.
75 * OPTION_{BIT,SET_INT,SET_PTR} store the {mask,integer,pointer} to put in
76 * the value when met.
77 * CALLBACKS can use it like they want.
78 */
79struct option {
80 enum parse_opt_type type;
81 int short_name;
82 const char *long_name;
83 void *value;
84 const char *argh;
85 const char *help;
86
87 int flags;
88 parse_opt_cb *callback;
89 intptr_t defval;
90};
91
92#define OPT_END() { OPTION_END }
93#define OPT_ARGUMENT(l, h) { OPTION_ARGUMENT, 0, (l), NULL, NULL, (h) }
94#define OPT_GROUP(h) { OPTION_GROUP, 0, NULL, NULL, NULL, (h) }
95#define OPT_BIT(s, l, v, h, b) { OPTION_BIT, (s), (l), (v), NULL, (h), 0, NULL, (b) }
96#define OPT_BOOLEAN(s, l, v, h) { OPTION_BOOLEAN, (s), (l), (v), NULL, (h) }
97#define OPT_SET_INT(s, l, v, h, i) { OPTION_SET_INT, (s), (l), (v), NULL, (h), 0, NULL, (i) }
98#define OPT_SET_PTR(s, l, v, h, p) { OPTION_SET_PTR, (s), (l), (v), NULL, (h), 0, NULL, (p) }
99#define OPT_INTEGER(s, l, v, h) { OPTION_INTEGER, (s), (l), (v), NULL, (h) }
100#define OPT_STRING(s, l, v, a, h) { OPTION_STRING, (s), (l), (v), (a), (h) }
101#define OPT_DATE(s, l, v, h) \
102 { OPTION_CALLBACK, (s), (l), (v), "time",(h), 0, \
103 parse_opt_approxidate_cb }
104#define OPT_CALLBACK(s, l, v, a, h, f) \
105 { OPTION_CALLBACK, (s), (l), (v), (a), (h), 0, (f) }
106
107/* parse_options() will filter out the processed options and leave the
108 * non-option arguments in argv[].
109 * Returns the number of arguments left in argv[].
110 */
111extern int parse_options(int argc, const char **argv,
112 const struct option *options,
113 const char * const usagestr[], int flags);
114
115extern NORETURN void usage_with_options(const char * const *usagestr,
116 const struct option *options);
117
118/*----- incremental advanced APIs -----*/
119
120enum {
121 PARSE_OPT_HELP = -1,
122 PARSE_OPT_DONE,
123 PARSE_OPT_UNKNOWN,
124};
125
126/*
127 * It's okay for the caller to consume argv/argc in the usual way.
128 * Other fields of that structure are private to parse-options and should not
129 * be modified in any way.
130 */
131struct parse_opt_ctx_t {
132 const char **argv;
133 const char **out;
134 int argc, cpidx;
135 const char *opt;
136 int flags;
137};
138
139extern int parse_options_usage(const char * const *usagestr,
140 const struct option *opts);
141
142extern void parse_options_start(struct parse_opt_ctx_t *ctx,
143 int argc, const char **argv, int flags);
144
145extern int parse_options_step(struct parse_opt_ctx_t *ctx,
146 const struct option *options,
147 const char * const usagestr[]);
148
149extern int parse_options_end(struct parse_opt_ctx_t *ctx);
150
151
152/*----- some often used options -----*/
153extern int parse_opt_abbrev_cb(const struct option *, const char *, int);
154extern int parse_opt_approxidate_cb(const struct option *, const char *, int);
155extern int parse_opt_verbosity_cb(const struct option *, const char *, int);
156
157#define OPT__VERBOSE(var) OPT_BOOLEAN('v', "verbose", (var), "be verbose")
158#define OPT__QUIET(var) OPT_BOOLEAN('q', "quiet", (var), "be quiet")
159#define OPT__VERBOSITY(var) \
160 { OPTION_CALLBACK, 'v', "verbose", (var), NULL, "be more verbose", \
161 PARSE_OPT_NOARG, &parse_opt_verbosity_cb, 0 }, \
162 { OPTION_CALLBACK, 'q', "quiet", (var), NULL, "be more quiet", \
163 PARSE_OPT_NOARG, &parse_opt_verbosity_cb, 0 }
164#define OPT__DRY_RUN(var) OPT_BOOLEAN('n', "dry-run", (var), "dry run")
165#define OPT__ABBREV(var) \
166 { OPTION_CALLBACK, 0, "abbrev", (var), "n", \
167 "use <n> digits to display SHA-1s", \
168 PARSE_OPT_OPTARG, &parse_opt_abbrev_cb, 0 }
169
170extern const char *parse_options_fix_filename(const char *prefix, const char *file);
171
172#endif
diff --git a/Documentation/perf_counter/util/path.c b/Documentation/perf_counter/util/path.c
new file mode 100644
index 000000000000..a501a40dd2cb
--- /dev/null
+++ b/Documentation/perf_counter/util/path.c
@@ -0,0 +1,353 @@
1/*
2 * I'm tired of doing "vsnprintf()" etc just to open a
3 * file, so here's a "return static buffer with printf"
4 * interface for paths.
5 *
6 * It's obviously not thread-safe. Sue me. But it's quite
7 * useful for doing things like
8 *
9 * f = open(mkpath("%s/%s.perf", base, name), O_RDONLY);
10 *
11 * which is what it's designed for.
12 */
13#include "cache.h"
14
15static char bad_path[] = "/bad-path/";
16/*
17 * Two hacks:
18 */
19
20static char *get_perf_dir(void)
21{
22 return ".";
23}
24
25size_t strlcpy(char *dest, const char *src, size_t size)
26{
27 size_t ret = strlen(src);
28
29 if (size) {
30 size_t len = (ret >= size) ? size - 1 : ret;
31 memcpy(dest, src, len);
32 dest[len] = '\0';
33 }
34 return ret;
35}
36
37
38static char *get_pathname(void)
39{
40 static char pathname_array[4][PATH_MAX];
41 static int index;
42 return pathname_array[3 & ++index];
43}
44
45static char *cleanup_path(char *path)
46{
47 /* Clean it up */
48 if (!memcmp(path, "./", 2)) {
49 path += 2;
50 while (*path == '/')
51 path++;
52 }
53 return path;
54}
55
56char *mksnpath(char *buf, size_t n, const char *fmt, ...)
57{
58 va_list args;
59 unsigned len;
60
61 va_start(args, fmt);
62 len = vsnprintf(buf, n, fmt, args);
63 va_end(args);
64 if (len >= n) {
65 strlcpy(buf, bad_path, n);
66 return buf;
67 }
68 return cleanup_path(buf);
69}
70
71static char *perf_vsnpath(char *buf, size_t n, const char *fmt, va_list args)
72{
73 const char *perf_dir = get_perf_dir();
74 size_t len;
75
76 len = strlen(perf_dir);
77 if (n < len + 1)
78 goto bad;
79 memcpy(buf, perf_dir, len);
80 if (len && !is_dir_sep(perf_dir[len-1]))
81 buf[len++] = '/';
82 len += vsnprintf(buf + len, n - len, fmt, args);
83 if (len >= n)
84 goto bad;
85 return cleanup_path(buf);
86bad:
87 strlcpy(buf, bad_path, n);
88 return buf;
89}
90
91char *perf_snpath(char *buf, size_t n, const char *fmt, ...)
92{
93 va_list args;
94 va_start(args, fmt);
95 (void)perf_vsnpath(buf, n, fmt, args);
96 va_end(args);
97 return buf;
98}
99
100char *perf_pathdup(const char *fmt, ...)
101{
102 char path[PATH_MAX];
103 va_list args;
104 va_start(args, fmt);
105 (void)perf_vsnpath(path, sizeof(path), fmt, args);
106 va_end(args);
107 return xstrdup(path);
108}
109
110char *mkpath(const char *fmt, ...)
111{
112 va_list args;
113 unsigned len;
114 char *pathname = get_pathname();
115
116 va_start(args, fmt);
117 len = vsnprintf(pathname, PATH_MAX, fmt, args);
118 va_end(args);
119 if (len >= PATH_MAX)
120 return bad_path;
121 return cleanup_path(pathname);
122}
123
124char *perf_path(const char *fmt, ...)
125{
126 const char *perf_dir = get_perf_dir();
127 char *pathname = get_pathname();
128 va_list args;
129 unsigned len;
130
131 len = strlen(perf_dir);
132 if (len > PATH_MAX-100)
133 return bad_path;
134 memcpy(pathname, perf_dir, len);
135 if (len && perf_dir[len-1] != '/')
136 pathname[len++] = '/';
137 va_start(args, fmt);
138 len += vsnprintf(pathname + len, PATH_MAX - len, fmt, args);
139 va_end(args);
140 if (len >= PATH_MAX)
141 return bad_path;
142 return cleanup_path(pathname);
143}
144
145
146/* perf_mkstemp() - create tmp file honoring TMPDIR variable */
147int perf_mkstemp(char *path, size_t len, const char *template)
148{
149 const char *tmp;
150 size_t n;
151
152 tmp = getenv("TMPDIR");
153 if (!tmp)
154 tmp = "/tmp";
155 n = snprintf(path, len, "%s/%s", tmp, template);
156 if (len <= n) {
157 errno = ENAMETOOLONG;
158 return -1;
159 }
160 return mkstemp(path);
161}
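A small usage sketch for the helper above; the template string is only an example:

	char path[PATH_MAX];
	int fd = perf_mkstemp(path, sizeof(path), "perf-XXXXXX");

	if (fd < 0)
		die("cannot create temporary file: %s", strerror(errno));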
162
163
164const char *make_relative_path(const char *abs, const char *base)
165{
166 static char buf[PATH_MAX + 1];
167 int baselen;
168 if (!base)
169 return abs;
170 baselen = strlen(base);
171 if (prefixcmp(abs, base))
172 return abs;
173 if (abs[baselen] == '/')
174 baselen++;
175 else if (base[baselen - 1] != '/')
176 return abs;
177 strcpy(buf, abs + baselen);
178 return buf;
179}
180
181/*
182 * It is okay if dst == src, but they should not overlap otherwise.
183 *
184 * Performs the following normalizations on src, storing the result in dst:
185 * - Ensures that components are separated by '/' (Windows only)
186 * - Squashes sequences of '/'.
187 * - Removes "." components.
188 * - Removes ".." components, and the components that precede them.
189 * Returns failure (non-zero) if a ".." component appears as first path
190 * component anytime during the normalization. Otherwise, returns success (0).
191 *
192 * Note that this function is purely textual. It does not follow symlinks,
193 * verify the existence of the path, or make any system calls.
194 */
195int normalize_path_copy(char *dst, const char *src)
196{
197 char *dst0;
198
199 if (has_dos_drive_prefix(src)) {
200 *dst++ = *src++;
201 *dst++ = *src++;
202 }
203 dst0 = dst;
204
205 if (is_dir_sep(*src)) {
206 *dst++ = '/';
207 while (is_dir_sep(*src))
208 src++;
209 }
210
211 for (;;) {
212 char c = *src;
213
214 /*
215 * A path component that begins with . could be
216 * special:
217 * (1) "." and ends -- ignore and terminate.
218 * (2) "./" -- ignore them, eat slash and continue.
219 * (3) ".." and ends -- strip one and terminate.
220 * (4) "../" -- strip one, eat slash and continue.
221 */
222 if (c == '.') {
223 if (!src[1]) {
224 /* (1) */
225 src++;
226 } else if (is_dir_sep(src[1])) {
227 /* (2) */
228 src += 2;
229 while (is_dir_sep(*src))
230 src++;
231 continue;
232 } else if (src[1] == '.') {
233 if (!src[2]) {
234 /* (3) */
235 src += 2;
236 goto up_one;
237 } else if (is_dir_sep(src[2])) {
238 /* (4) */
239 src += 3;
240 while (is_dir_sep(*src))
241 src++;
242 goto up_one;
243 }
244 }
245 }
246
247 /* copy up to the next '/', and eat all '/' */
248 while ((c = *src++) != '\0' && !is_dir_sep(c))
249 *dst++ = c;
250 if (is_dir_sep(c)) {
251 *dst++ = '/';
252 while (is_dir_sep(c))
253 c = *src++;
254 src--;
255 } else if (!c)
256 break;
257 continue;
258
259 up_one:
260 /*
261 * dst0..dst is prefix portion, and dst[-1] is '/';
262 * go up one level.
263 */
264 dst--; /* go to trailing '/' */
265 if (dst <= dst0)
266 return -1;
267 /* Windows: dst[-1] cannot be backslash anymore */
268 while (dst0 < dst && dst[-1] != '/')
269 dst--;
270 }
271 *dst = '\0';
272 return 0;
273}
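As an illustration of the normalization rules listed above, a sketch of what a caller would see (the input path is made up):

	char dst[PATH_MAX];

	if (!normalize_path_copy(dst, "/usr//lib/./perf/../bin"))
		printf("%s\n", dst);	/* prints "/usr/lib/bin" */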
274
275/*
276 * path = Canonical absolute path
277 * prefix_list = Colon-separated list of absolute paths
278 *
279 * Determines, for each path in prefix_list, whether the "prefix" really
280 * is an ancestor directory of path. Returns the length of the longest
281 * ancestor directory, excluding any trailing slashes, or -1 if no prefix
282 * is an ancestor. (Note that this means 0 is returned if prefix_list is
283 * "/".) "/foo" is not considered an ancestor of "/foobar". Directories
284 * are not considered to be their own ancestors. path must be in a
285 * canonical form: empty components, or "." or ".." components are not
286 * allowed. prefix_list may be null, which is like "".
287 */
288int longest_ancestor_length(const char *path, const char *prefix_list)
289{
290 char buf[PATH_MAX+1];
291 const char *ceil, *colon;
292 int len, max_len = -1;
293
294 if (prefix_list == NULL || !strcmp(path, "/"))
295 return -1;
296
297 for (colon = ceil = prefix_list; *colon; ceil = colon+1) {
298 for (colon = ceil; *colon && *colon != PATH_SEP; colon++);
299 len = colon - ceil;
300 if (len == 0 || len > PATH_MAX || !is_absolute_path(ceil))
301 continue;
302 strlcpy(buf, ceil, len+1);
303 if (normalize_path_copy(buf, buf) < 0)
304 continue;
305 len = strlen(buf);
306 if (len > 0 && buf[len-1] == '/')
307 buf[--len] = '\0';
308
309 if (!strncmp(path, buf, len) &&
310 path[len] == '/' &&
311 len > max_len) {
312 max_len = len;
313 }
314 }
315
316 return max_len;
317}
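A concrete sketch of the semantics described in the comment above; the paths are illustrative:

	/*
	 * "/home/user" is the longest entry in the list that is an ancestor
	 * of the path, so its length (10) is returned:
	 */
	int len = longest_ancestor_length("/home/user/src", "/usr/local:/home/user");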
318
319/* strip arbitrary amount of directory separators at end of path */
320static inline int chomp_trailing_dir_sep(const char *path, int len)
321{
322 while (len && is_dir_sep(path[len - 1]))
323 len--;
324 return len;
325}
326
327/*
328 * If path ends with suffix (complete path components), returns the
329 * part before suffix (sans trailing directory separators).
330 * Otherwise returns NULL.
331 */
332char *strip_path_suffix(const char *path, const char *suffix)
333{
334 int path_len = strlen(path), suffix_len = strlen(suffix);
335
336 while (suffix_len) {
337 if (!path_len)
338 return NULL;
339
340 if (is_dir_sep(path[path_len - 1])) {
341 if (!is_dir_sep(suffix[suffix_len - 1]))
342 return NULL;
343 path_len = chomp_trailing_dir_sep(path, path_len);
344 suffix_len = chomp_trailing_dir_sep(suffix, suffix_len);
345 }
346 else if (path[--path_len] != suffix[--suffix_len])
347 return NULL;
348 }
349
350 if (path_len && !is_dir_sep(path[path_len - 1]))
351 return NULL;
352 return xstrndup(path, chomp_trailing_dir_sep(path, path_len));
353}
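A short sketch of strip_path_suffix() in action (paths are illustrative):

	char *base = strip_path_suffix("/usr/local/lib/perf", "lib/perf");
	/*
	 * base is a freshly xstrndup()ed "/usr/local"; the call returns NULL
	 * when the suffix does not match whole path components, e.g. for
	 * ("/usr/local/libperf", "perf").
	 */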
diff --git a/Documentation/perf_counter/util/quote.c b/Documentation/perf_counter/util/quote.c
new file mode 100644
index 000000000000..7a49fcf69671
--- /dev/null
+++ b/Documentation/perf_counter/util/quote.c
@@ -0,0 +1,478 @@
1#include "cache.h"
2#include "quote.h"
3
4int quote_path_fully = 1;
5
6/* Help to copy the thing properly quoted for the shell safety.
7 * any single quote is replaced with '\'', any exclamation point
8 * is replaced with '\!', and the whole thing is enclosed in a
9 * single quote pair.
10 * E.g.
11 * original sq_quote result
12 * name ==> name ==> 'name'
13 * a b ==> a b ==> 'a b'
14 * a'b ==> a'\''b ==> 'a'\''b'
15 * a!b ==> a'\!'b ==> 'a'\!'b'
16 */
17static inline int need_bs_quote(char c)
18{
19 return (c == '\'' || c == '!');
20}
21
22void sq_quote_buf(struct strbuf *dst, const char *src)
23{
24 char *to_free = NULL;
25
26 if (dst->buf == src)
27 to_free = strbuf_detach(dst, NULL);
28
29 strbuf_addch(dst, '\'');
30 while (*src) {
31 size_t len = strcspn(src, "'!");
32 strbuf_add(dst, src, len);
33 src += len;
34 while (need_bs_quote(*src)) {
35 strbuf_addstr(dst, "'\\");
36 strbuf_addch(dst, *src++);
37 strbuf_addch(dst, '\'');
38 }
39 }
40 strbuf_addch(dst, '\'');
41 free(to_free);
42}
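A usage sketch for sq_quote_buf(); the argument value is made up:

	const char *arg = "don't panic";
	struct strbuf cmd = STRBUF_INIT;

	strbuf_addstr(&cmd, "ssh host ");
	sq_quote_buf(&cmd, arg);
	/* cmd.buf is now: ssh host 'don'\''t panic' */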
43
44void sq_quote_print(FILE *stream, const char *src)
45{
46 char c;
47
48 fputc('\'', stream);
49 while ((c = *src++)) {
50 if (need_bs_quote(c)) {
51 fputs("'\\", stream);
52 fputc(c, stream);
53 fputc('\'', stream);
54 } else {
55 fputc(c, stream);
56 }
57 }
58 fputc('\'', stream);
59}
60
61void sq_quote_argv(struct strbuf *dst, const char** argv, size_t maxlen)
62{
63 int i;
64
65 /* Copy into destination buffer. */
66 strbuf_grow(dst, 255);
67 for (i = 0; argv[i]; ++i) {
68 strbuf_addch(dst, ' ');
69 sq_quote_buf(dst, argv[i]);
70 if (maxlen && dst->len > maxlen)
71 die("Too many or long arguments");
72 }
73}
74
75char *sq_dequote_step(char *arg, char **next)
76{
77 char *dst = arg;
78 char *src = arg;
79 char c;
80
81 if (*src != '\'')
82 return NULL;
83 for (;;) {
84 c = *++src;
85 if (!c)
86 return NULL;
87 if (c != '\'') {
88 *dst++ = c;
89 continue;
90 }
91 /* We stepped out of sq */
92 switch (*++src) {
93 case '\0':
94 *dst = 0;
95 if (next)
96 *next = NULL;
97 return arg;
98 case '\\':
99 c = *++src;
100 if (need_bs_quote(c) && *++src == '\'') {
101 *dst++ = c;
102 continue;
103 }
104 /* Fallthrough */
105 default:
106 if (!next || !isspace(*src))
107 return NULL;
108 do {
109 c = *++src;
110 } while (isspace(c));
111 *dst = 0;
112 *next = src;
113 return arg;
114 }
115 }
116}
117
118char *sq_dequote(char *arg)
119{
120 return sq_dequote_step(arg, NULL);
121}
122
123int sq_dequote_to_argv(char *arg, const char ***argv, int *nr, int *alloc)
124{
125 char *next = arg;
126
127 if (!*arg)
128 return 0;
129 do {
130 char *dequoted = sq_dequote_step(next, &next);
131 if (!dequoted)
132 return -1;
133 ALLOC_GROW(*argv, *nr + 1, *alloc);
134 (*argv)[(*nr)++] = dequoted;
135 } while (next);
136
137 return 0;
138}
139
140/* 1 means: quote as octal
141 * 0 means: quote as octal if (quote_path_fully)
142 * -1 means: never quote
143 * c: quote as "\\c"
144 */
145#define X8(x) x, x, x, x, x, x, x, x
146#define X16(x) X8(x), X8(x)
147static signed char const sq_lookup[256] = {
148 /* 0 1 2 3 4 5 6 7 */
149 /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 'a',
150 /* 0x08 */ 'b', 't', 'n', 'v', 'f', 'r', 1, 1,
151 /* 0x10 */ X16(1),
152 /* 0x20 */ -1, -1, '"', -1, -1, -1, -1, -1,
153 /* 0x28 */ X16(-1), X16(-1), X16(-1),
154 /* 0x58 */ -1, -1, -1, -1,'\\', -1, -1, -1,
155 /* 0x60 */ X16(-1), X8(-1),
156 /* 0x78 */ -1, -1, -1, -1, -1, -1, -1, 1,
157 /* 0x80 */ /* set to 0 */
158};
159
160static inline int sq_must_quote(char c)
161{
162 return sq_lookup[(unsigned char)c] + quote_path_fully > 0;
163}
164
165/* returns the longest prefix not needing a quote up to maxlen if positive.
166 This stops at the first \0 because it's marked as a character needing an
167 escape */
168static size_t next_quote_pos(const char *s, ssize_t maxlen)
169{
170 size_t len;
171 if (maxlen < 0) {
172 for (len = 0; !sq_must_quote(s[len]); len++);
173 } else {
174 for (len = 0; len < maxlen && !sq_must_quote(s[len]); len++);
175 }
176 return len;
177}
178
179/*
180 * C-style name quoting.
181 *
182 * (1) if sb and fp are both NULL, it inspects the input name and counts the
183 * number of bytes that are needed to hold the c_style quoted version of name,
184 * counting the double quotes around it but not the terminating NUL, and
185 * returns it.
186 * However, if name does not need c_style quoting, it returns 0.
187 *
188 * (2) if sb or fp is not NULL, it emits the c_style quoted version
189 * of name, enclosed with double quotes if asked and needed only.
190 * Return value is the same as in (1).
191 */
192static size_t quote_c_style_counted(const char *name, ssize_t maxlen,
193 struct strbuf *sb, FILE *fp, int no_dq)
194{
195#undef EMIT
196#define EMIT(c) \
197 do { \
198 if (sb) strbuf_addch(sb, (c)); \
199 if (fp) fputc((c), fp); \
200 count++; \
201 } while (0)
202#define EMITBUF(s, l) \
203 do { \
204 if (sb) strbuf_add(sb, (s), (l)); \
205 if (fp) fwrite((s), (l), 1, fp); \
206 count += (l); \
207 } while (0)
208
209 size_t len, count = 0;
210 const char *p = name;
211
212 for (;;) {
213 int ch;
214
215 len = next_quote_pos(p, maxlen);
216 if (len == maxlen || !p[len])
217 break;
218
219 if (!no_dq && p == name)
220 EMIT('"');
221
222 EMITBUF(p, len);
223 EMIT('\\');
224 p += len;
225 ch = (unsigned char)*p++;
226 if (sq_lookup[ch] >= ' ') {
227 EMIT(sq_lookup[ch]);
228 } else {
229 EMIT(((ch >> 6) & 03) + '0');
230 EMIT(((ch >> 3) & 07) + '0');
231 EMIT(((ch >> 0) & 07) + '0');
232 }
233 }
234
235 EMITBUF(p, len);
236 if (p == name) /* no ending quote needed */
237 return 0;
238
239 if (!no_dq)
240 EMIT('"');
241 return count;
242}
243
244size_t quote_c_style(const char *name, struct strbuf *sb, FILE *fp, int nodq)
245{
246 return quote_c_style_counted(name, -1, sb, fp, nodq);
247}
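To make the two modes described above concrete, a sketch of the emitting mode (case 2):

	struct strbuf sb = STRBUF_INIT;

	if (quote_c_style("tab\there", &sb, NULL, 0))
		printf("%s\n", sb.buf);	/* prints "tab\there", double quotes included */
	strbuf_release(&sb);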
248
249void quote_two_c_style(struct strbuf *sb, const char *prefix, const char *path, int nodq)
250{
251 if (quote_c_style(prefix, NULL, NULL, 0) ||
252 quote_c_style(path, NULL, NULL, 0)) {
253 if (!nodq)
254 strbuf_addch(sb, '"');
255 quote_c_style(prefix, sb, NULL, 1);
256 quote_c_style(path, sb, NULL, 1);
257 if (!nodq)
258 strbuf_addch(sb, '"');
259 } else {
260 strbuf_addstr(sb, prefix);
261 strbuf_addstr(sb, path);
262 }
263}
264
265void write_name_quoted(const char *name, FILE *fp, int terminator)
266{
267 if (terminator) {
268 quote_c_style(name, NULL, fp, 0);
269 } else {
270 fputs(name, fp);
271 }
272 fputc(terminator, fp);
273}
274
275extern void write_name_quotedpfx(const char *pfx, size_t pfxlen,
276 const char *name, FILE *fp, int terminator)
277{
278 int needquote = 0;
279
280 if (terminator) {
281 needquote = next_quote_pos(pfx, pfxlen) < pfxlen
282 || name[next_quote_pos(name, -1)];
283 }
284 if (needquote) {
285 fputc('"', fp);
286 quote_c_style_counted(pfx, pfxlen, NULL, fp, 1);
287 quote_c_style(name, NULL, fp, 1);
288 fputc('"', fp);
289 } else {
290 fwrite(pfx, pfxlen, 1, fp);
291 fputs(name, fp);
292 }
293 fputc(terminator, fp);
294}
295
296/* quote path as relative to the given prefix */
297char *quote_path_relative(const char *in, int len,
298 struct strbuf *out, const char *prefix)
299{
300 int needquote;
301
302 if (len < 0)
303 len = strlen(in);
304
305 /* "../" prefix itself does not need quoting, but "in" might. */
306 needquote = next_quote_pos(in, len) < len;
307 strbuf_setlen(out, 0);
308 strbuf_grow(out, len);
309
310 if (needquote)
311 strbuf_addch(out, '"');
312 if (prefix) {
313 int off = 0;
314 while (prefix[off] && off < len && prefix[off] == in[off])
315 if (prefix[off] == '/') {
316 prefix += off + 1;
317 in += off + 1;
318 len -= off + 1;
319 off = 0;
320 } else
321 off++;
322
323 for (; *prefix; prefix++)
324 if (*prefix == '/')
325 strbuf_addstr(out, "../");
326 }
327
328 quote_c_style_counted (in, len, out, NULL, 1);
329
330 if (needquote)
331 strbuf_addch(out, '"');
332 if (!out->len)
333 strbuf_addstr(out, "./");
334
335 return out->buf;
336}
337
338/*
339 * C-style name unquoting.
340 *
341 * Quoted should point at the opening double quote.
342 * + Returns 0 if it was able to unquote the string properly, and appends the
343 * result in the strbuf `sb'.
344 * + Returns -1 in case of error, and doesn't touch the strbuf. Though note
345 * that this function will allocate memory in the strbuf, so calling
346 * strbuf_release is mandatory whichever result unquote_c_style returns.
347 *
348 * Updates endp pointer to point at one past the ending double quote if given.
349 */
350int unquote_c_style(struct strbuf *sb, const char *quoted, const char **endp)
351{
352 size_t oldlen = sb->len, len;
353 int ch, ac;
354
355 if (*quoted++ != '"')
356 return -1;
357
358 for (;;) {
359 len = strcspn(quoted, "\"\\");
360 strbuf_add(sb, quoted, len);
361 quoted += len;
362
363 switch (*quoted++) {
364 case '"':
365 if (endp)
366 *endp = quoted;
367 return 0;
368 case '\\':
369 break;
370 default:
371 goto error;
372 }
373
374 switch ((ch = *quoted++)) {
375 case 'a': ch = '\a'; break;
376 case 'b': ch = '\b'; break;
377 case 'f': ch = '\f'; break;
378 case 'n': ch = '\n'; break;
379 case 'r': ch = '\r'; break;
380 case 't': ch = '\t'; break;
381 case 'v': ch = '\v'; break;
382
383 case '\\': case '"':
384 break; /* verbatim */
385
386 /* octal values with first digit over 4 overflow */
387 case '0': case '1': case '2': case '3':
388 ac = ((ch - '0') << 6);
389 if ((ch = *quoted++) < '0' || '7' < ch)
390 goto error;
391 ac |= ((ch - '0') << 3);
392 if ((ch = *quoted++) < '0' || '7' < ch)
393 goto error;
394 ac |= (ch - '0');
395 ch = ac;
396 break;
397 default:
398 goto error;
399 }
400 strbuf_addch(sb, ch);
401 }
402
403 error:
404 strbuf_setlen(sb, oldlen);
405 return -1;
406}
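A sketch of round-tripping a quoted form back through unquote_c_style(); the input literal is illustrative:

	struct strbuf sb = STRBUF_INIT;
	const char *end;

	/* turns the quoted form "tab\there" back into: tab<TAB>here */
	if (unquote_c_style(&sb, "\"tab\\there\"", &end) < 0)
		die("malformed quoting");
	strbuf_release(&sb);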
407
408/* quoting as a string literal for other languages */
409
410void perl_quote_print(FILE *stream, const char *src)
411{
412 const char sq = '\'';
413 const char bq = '\\';
414 char c;
415
416 fputc(sq, stream);
417 while ((c = *src++)) {
418 if (c == sq || c == bq)
419 fputc(bq, stream);
420 fputc(c, stream);
421 }
422 fputc(sq, stream);
423}
424
425void python_quote_print(FILE *stream, const char *src)
426{
427 const char sq = '\'';
428 const char bq = '\\';
429 const char nl = '\n';
430 char c;
431
432 fputc(sq, stream);
433 while ((c = *src++)) {
434 if (c == nl) {
435 fputc(bq, stream);
436 fputc('n', stream);
437 continue;
438 }
439 if (c == sq || c == bq)
440 fputc(bq, stream);
441 fputc(c, stream);
442 }
443 fputc(sq, stream);
444}
445
446void tcl_quote_print(FILE *stream, const char *src)
447{
448 char c;
449
450 fputc('"', stream);
451 while ((c = *src++)) {
452 switch (c) {
453 case '[': case ']':
454 case '{': case '}':
455 case '$': case '\\': case '"':
456 fputc('\\', stream);
457 default:
458 fputc(c, stream);
459 break;
460 case '\f':
461 fputs("\\f", stream);
462 break;
463 case '\r':
464 fputs("\\r", stream);
465 break;
466 case '\n':
467 fputs("\\n", stream);
468 break;
469 case '\t':
470 fputs("\\t", stream);
471 break;
472 case '\v':
473 fputs("\\v", stream);
474 break;
475 }
476 }
477 fputc('"', stream);
478}
diff --git a/Documentation/perf_counter/util/quote.h b/Documentation/perf_counter/util/quote.h
new file mode 100644
index 000000000000..5dfad89816db
--- /dev/null
+++ b/Documentation/perf_counter/util/quote.h
@@ -0,0 +1,68 @@
1#ifndef QUOTE_H
2#define QUOTE_H
3
4#include <stddef.h>
5#include <stdio.h>
6
7/* Help to copy the thing properly quoted for the shell safety.
8 * any single quote is replaced with '\'', any exclamation point
9 * is replaced with '\!', and the whole thing is enclosed in a
10 * single quote pair.
11 *
12 * For example, if you are passing the result to system() as an
13 * argument:
14 *
15 * sprintf(cmd, "foobar %s %s", sq_quote(arg0), sq_quote(arg1))
16 *
17 * would be appropriate. If the system() is going to call ssh to
18 * run the command on the other side:
19 *
20 * sprintf(cmd, "git-diff-tree %s %s", sq_quote(arg0), sq_quote(arg1));
21 * sprintf(rcmd, "ssh %s %s", sq_quote(host), sq_quote(cmd));
22 *
23 * Note that the above examples leak memory! Remember to free result from
24 * sq_quote() in a real application.
25 *
26 * sq_quote_buf() appends the quoted form of `src' to an existing
27 * strbuf, growing it as needed.
28 *
29 */
30
31extern void sq_quote_print(FILE *stream, const char *src);
32
33extern void sq_quote_buf(struct strbuf *, const char *src);
34extern void sq_quote_argv(struct strbuf *, const char **argv, size_t maxlen);
35
36/* This unwraps what sq_quote() produces in place, but returns
37 * NULL if the input does not look like what sq_quote would have
38 * produced.
39 */
40extern char *sq_dequote(char *);
41
42/*
43 * Same as the above, but can be used to unwrap many arguments in the
44 * same string separated by space. "next" is changed to point to the
45 * next argument that should be passed as first parameter. When there
46 * is no more argument to be dequoted, "next" is updated to point to NULL.
47 */
48extern char *sq_dequote_step(char *arg, char **next);
49extern int sq_dequote_to_argv(char *arg, const char ***argv, int *nr, int *alloc);
50
51extern int unquote_c_style(struct strbuf *, const char *quoted, const char **endp);
52extern size_t quote_c_style(const char *name, struct strbuf *, FILE *, int no_dq);
53extern void quote_two_c_style(struct strbuf *, const char *, const char *, int);
54
55extern void write_name_quoted(const char *name, FILE *, int terminator);
56extern void write_name_quotedpfx(const char *pfx, size_t pfxlen,
57 const char *name, FILE *, int terminator);
58
59/* quote path as relative to the given prefix */
60char *quote_path_relative(const char *in, int len,
61 struct strbuf *out, const char *prefix);
62
63/* quoting as a string literal for other languages */
64extern void perl_quote_print(FILE *stream, const char *src);
65extern void python_quote_print(FILE *stream, const char *src);
66extern void tcl_quote_print(FILE *stream, const char *src);
67
68#endif
diff --git a/Documentation/perf_counter/util/run-command.c b/Documentation/perf_counter/util/run-command.c
new file mode 100644
index 000000000000..b2f5e854f40a
--- /dev/null
+++ b/Documentation/perf_counter/util/run-command.c
@@ -0,0 +1,395 @@
1#include "cache.h"
2#include "run-command.h"
3#include "exec_cmd.h"
4
5static inline void close_pair(int fd[2])
6{
7 close(fd[0]);
8 close(fd[1]);
9}
10
11static inline void dup_devnull(int to)
12{
13 int fd = open("/dev/null", O_RDWR);
14 dup2(fd, to);
15 close(fd);
16}
17
18int start_command(struct child_process *cmd)
19{
20 int need_in, need_out, need_err;
21 int fdin[2], fdout[2], fderr[2];
22
23 /*
24 * In case of errors we must keep the promise to close FDs
25 * that have been passed in via ->in and ->out.
26 */
27
28 need_in = !cmd->no_stdin && cmd->in < 0;
29 if (need_in) {
30 if (pipe(fdin) < 0) {
31 if (cmd->out > 0)
32 close(cmd->out);
33 return -ERR_RUN_COMMAND_PIPE;
34 }
35 cmd->in = fdin[1];
36 }
37
38 need_out = !cmd->no_stdout
39 && !cmd->stdout_to_stderr
40 && cmd->out < 0;
41 if (need_out) {
42 if (pipe(fdout) < 0) {
43 if (need_in)
44 close_pair(fdin);
45 else if (cmd->in)
46 close(cmd->in);
47 return -ERR_RUN_COMMAND_PIPE;
48 }
49 cmd->out = fdout[0];
50 }
51
52 need_err = !cmd->no_stderr && cmd->err < 0;
53 if (need_err) {
54 if (pipe(fderr) < 0) {
55 if (need_in)
56 close_pair(fdin);
57 else if (cmd->in)
58 close(cmd->in);
59 if (need_out)
60 close_pair(fdout);
61 else if (cmd->out)
62 close(cmd->out);
63 return -ERR_RUN_COMMAND_PIPE;
64 }
65 cmd->err = fderr[0];
66 }
67
68#ifndef __MINGW32__
69 fflush(NULL);
70 cmd->pid = fork();
71 if (!cmd->pid) {
72 if (cmd->no_stdin)
73 dup_devnull(0);
74 else if (need_in) {
75 dup2(fdin[0], 0);
76 close_pair(fdin);
77 } else if (cmd->in) {
78 dup2(cmd->in, 0);
79 close(cmd->in);
80 }
81
82 if (cmd->no_stderr)
83 dup_devnull(2);
84 else if (need_err) {
85 dup2(fderr[1], 2);
86 close_pair(fderr);
87 }
88
89 if (cmd->no_stdout)
90 dup_devnull(1);
91 else if (cmd->stdout_to_stderr)
92 dup2(2, 1);
93 else if (need_out) {
94 dup2(fdout[1], 1);
95 close_pair(fdout);
96 } else if (cmd->out > 1) {
97 dup2(cmd->out, 1);
98 close(cmd->out);
99 }
100
101 if (cmd->dir && chdir(cmd->dir))
102 die("exec %s: cd to %s failed (%s)", cmd->argv[0],
103 cmd->dir, strerror(errno));
104 if (cmd->env) {
105 for (; *cmd->env; cmd->env++) {
106 if (strchr(*cmd->env, '='))
107 putenv((char*)*cmd->env);
108 else
109 unsetenv(*cmd->env);
110 }
111 }
112 if (cmd->preexec_cb)
113 cmd->preexec_cb();
114 if (cmd->perf_cmd) {
115 execv_perf_cmd(cmd->argv);
116 } else {
117 execvp(cmd->argv[0], (char *const*) cmd->argv);
118 }
119 exit(127);
120 }
121#else
122 int s0 = -1, s1 = -1, s2 = -1; /* backups of stdin, stdout, stderr */
123 const char **sargv = cmd->argv;
124 char **env = environ;
125
126 if (cmd->no_stdin) {
127 s0 = dup(0);
128 dup_devnull(0);
129 } else if (need_in) {
130 s0 = dup(0);
131 dup2(fdin[0], 0);
132 } else if (cmd->in) {
133 s0 = dup(0);
134 dup2(cmd->in, 0);
135 }
136
137 if (cmd->no_stderr) {
138 s2 = dup(2);
139 dup_devnull(2);
140 } else if (need_err) {
141 s2 = dup(2);
142 dup2(fderr[1], 2);
143 }
144
145 if (cmd->no_stdout) {
146 s1 = dup(1);
147 dup_devnull(1);
148 } else if (cmd->stdout_to_stderr) {
149 s1 = dup(1);
150 dup2(2, 1);
151 } else if (need_out) {
152 s1 = dup(1);
153 dup2(fdout[1], 1);
154 } else if (cmd->out > 1) {
155 s1 = dup(1);
156 dup2(cmd->out, 1);
157 }
158
159 if (cmd->dir)
160 die("chdir in start_command() not implemented");
161 if (cmd->env) {
162 env = copy_environ();
163 for (; *cmd->env; cmd->env++)
164 env = env_setenv(env, *cmd->env);
165 }
166
167 if (cmd->perf_cmd) {
168 cmd->argv = prepare_perf_cmd(cmd->argv);
169 }
170
171 cmd->pid = mingw_spawnvpe(cmd->argv[0], cmd->argv, env);
172
173 if (cmd->env)
174 free_environ(env);
175 if (cmd->perf_cmd)
176 free(cmd->argv);
177
178 cmd->argv = sargv;
179 if (s0 >= 0)
180 dup2(s0, 0), close(s0);
181 if (s1 >= 0)
182 dup2(s1, 1), close(s1);
183 if (s2 >= 0)
184 dup2(s2, 2), close(s2);
185#endif
186
187 if (cmd->pid < 0) {
188 int err = errno;
189 if (need_in)
190 close_pair(fdin);
191 else if (cmd->in)
192 close(cmd->in);
193 if (need_out)
194 close_pair(fdout);
195 else if (cmd->out)
196 close(cmd->out);
197 if (need_err)
198 close_pair(fderr);
199 return err == ENOENT ?
200 -ERR_RUN_COMMAND_EXEC :
201 -ERR_RUN_COMMAND_FORK;
202 }
203
204 if (need_in)
205 close(fdin[0]);
206 else if (cmd->in)
207 close(cmd->in);
208
209 if (need_out)
210 close(fdout[1]);
211 else if (cmd->out)
212 close(cmd->out);
213
214 if (need_err)
215 close(fderr[1]);
216
217 return 0;
218}
219
220static int wait_or_whine(pid_t pid)
221{
222 for (;;) {
223 int status, code;
224 pid_t waiting = waitpid(pid, &status, 0);
225
226 if (waiting < 0) {
227 if (errno == EINTR)
228 continue;
229 error("waitpid failed (%s)", strerror(errno));
230 return -ERR_RUN_COMMAND_WAITPID;
231 }
232 if (waiting != pid)
233 return -ERR_RUN_COMMAND_WAITPID_WRONG_PID;
234 if (WIFSIGNALED(status))
235 return -ERR_RUN_COMMAND_WAITPID_SIGNAL;
236
237 if (!WIFEXITED(status))
238 return -ERR_RUN_COMMAND_WAITPID_NOEXIT;
239 code = WEXITSTATUS(status);
240 switch (code) {
241 case 127:
242 return -ERR_RUN_COMMAND_EXEC;
243 case 0:
244 return 0;
245 default:
246 return -code;
247 }
248 }
249}
250
251int finish_command(struct child_process *cmd)
252{
253 return wait_or_whine(cmd->pid);
254}
255
256int run_command(struct child_process *cmd)
257{
258 int code = start_command(cmd);
259 if (code)
260 return code;
261 return finish_command(cmd);
262}
263
264static void prepare_run_command_v_opt(struct child_process *cmd,
265 const char **argv,
266 int opt)
267{
268 memset(cmd, 0, sizeof(*cmd));
269 cmd->argv = argv;
270 cmd->no_stdin = opt & RUN_COMMAND_NO_STDIN ? 1 : 0;
271 cmd->perf_cmd = opt & RUN_PERF_CMD ? 1 : 0;
272 cmd->stdout_to_stderr = opt & RUN_COMMAND_STDOUT_TO_STDERR ? 1 : 0;
273}
274
275int run_command_v_opt(const char **argv, int opt)
276{
277 struct child_process cmd;
278 prepare_run_command_v_opt(&cmd, argv, opt);
279 return run_command(&cmd);
280}
281
282int run_command_v_opt_cd_env(const char **argv, int opt, const char *dir, const char *const *env)
283{
284 struct child_process cmd;
285 prepare_run_command_v_opt(&cmd, argv, opt);
286 cmd.dir = dir;
287 cmd.env = env;
288 return run_command(&cmd);
289}
290
291#ifdef __MINGW32__
292static __stdcall unsigned run_thread(void *data)
293{
294 struct async *async = data;
295 return async->proc(async->fd_for_proc, async->data);
296}
297#endif
298
299int start_async(struct async *async)
300{
301 int pipe_out[2];
302
303 if (pipe(pipe_out) < 0)
304 return error("cannot create pipe: %s", strerror(errno));
305 async->out = pipe_out[0];
306
307#ifndef __MINGW32__
308 /* Flush stdio before fork() to avoid cloning buffers */
309 fflush(NULL);
310
311 async->pid = fork();
312 if (async->pid < 0) {
313 error("fork (async) failed: %s", strerror(errno));
314 close_pair(pipe_out);
315 return -1;
316 }
317 if (!async->pid) {
318 close(pipe_out[0]);
319 exit(!!async->proc(pipe_out[1], async->data));
320 }
321 close(pipe_out[1]);
322#else
323 async->fd_for_proc = pipe_out[1];
324 async->tid = (HANDLE) _beginthreadex(NULL, 0, run_thread, async, 0, NULL);
325 if (!async->tid) {
326 error("cannot create thread: %s", strerror(errno));
327 close_pair(pipe_out);
328 return -1;
329 }
330#endif
331 return 0;
332}
333
334int finish_async(struct async *async)
335{
336#ifndef __MINGW32__
337 int ret = 0;
338
339 if (wait_or_whine(async->pid))
340 ret = error("waitpid (async) failed");
341#else
342 DWORD ret = 0;
343 if (WaitForSingleObject(async->tid, INFINITE) != WAIT_OBJECT_0)
344 ret = error("waiting for thread failed: %lu", GetLastError());
345 else if (!GetExitCodeThread(async->tid, &ret))
346 ret = error("cannot get thread exit code: %lu", GetLastError());
347 CloseHandle(async->tid);
348#endif
349 return ret;
350}
351
352int run_hook(const char *index_file, const char *name, ...)
353{
354 struct child_process hook;
355 const char **argv = NULL, *env[2];
356 char index[PATH_MAX];
357 va_list args;
358 int ret;
359 size_t i = 0, alloc = 0;
360
361 if (access(perf_path("hooks/%s", name), X_OK) < 0)
362 return 0;
363
364 va_start(args, name);
365 ALLOC_GROW(argv, i + 1, alloc);
366 argv[i++] = perf_path("hooks/%s", name);
367 while (argv[i-1]) {
368 ALLOC_GROW(argv, i + 1, alloc);
369 argv[i++] = va_arg(args, const char *);
370 }
371 va_end(args);
372
373 memset(&hook, 0, sizeof(hook));
374 hook.argv = argv;
375 hook.no_stdin = 1;
376 hook.stdout_to_stderr = 1;
377 if (index_file) {
378 snprintf(index, sizeof(index), "PERF_INDEX_FILE=%s", index_file);
379 env[0] = index;
380 env[1] = NULL;
381 hook.env = env;
382 }
383
384 ret = start_command(&hook);
385 if (ret) {
386 warning("Could not spawn %s", argv[0]);
387 free(argv);
388 return ret;
389 }
390 ret = finish_command(&hook);
391 if (ret == -ERR_RUN_COMMAND_WAITPID_SIGNAL)
392 warning("%s exited due to uncaught signal", argv[0]);
393 free(argv); /* argv[0] was still needed for the warnings above */
394 return ret;
395}
diff --git a/Documentation/perf_counter/util/run-command.h b/Documentation/perf_counter/util/run-command.h
new file mode 100644
index 000000000000..328289f23669
--- /dev/null
+++ b/Documentation/perf_counter/util/run-command.h
@@ -0,0 +1,93 @@
1#ifndef RUN_COMMAND_H
2#define RUN_COMMAND_H
3
4enum {
5 ERR_RUN_COMMAND_FORK = 10000,
6 ERR_RUN_COMMAND_EXEC,
7 ERR_RUN_COMMAND_PIPE,
8 ERR_RUN_COMMAND_WAITPID,
9 ERR_RUN_COMMAND_WAITPID_WRONG_PID,
10 ERR_RUN_COMMAND_WAITPID_SIGNAL,
11 ERR_RUN_COMMAND_WAITPID_NOEXIT,
12};
13#define IS_RUN_COMMAND_ERR(x) (-(x) >= ERR_RUN_COMMAND_FORK)
14
15struct child_process {
16 const char **argv;
17 pid_t pid;
18 /*
19 * Using .in, .out, .err:
20 * - Specify 0 for no redirections (child inherits stdin, stdout,
21 * stderr from parent).
22 * - Specify -1 to have a pipe allocated as follows:
23 * .in: returns the writable pipe end; parent writes to it,
24 * the readable pipe end becomes child's stdin
25 * .out, .err: returns the readable pipe end; parent reads from
26 * it, the writable pipe end becomes child's stdout/stderr
27 * The caller of start_command() must close the returned FDs
28 * after it has completed reading from/writing to it!
29 * - Specify > 0 to set a channel to a particular FD as follows:
30 * .in: a readable FD, becomes child's stdin
31 * .out: a writable FD, becomes child's stdout/stderr
32 * .err > 0 not supported
33 * The specified FD is closed by start_command(), even in case
34 * of errors!
35 */
36 int in;
37 int out;
38 int err;
39 const char *dir;
40 const char *const *env;
41 unsigned no_stdin:1;
42 unsigned no_stdout:1;
43 unsigned no_stderr:1;
44 unsigned perf_cmd:1; /* if this is to be perf sub-command */
45 unsigned stdout_to_stderr:1;
46 void (*preexec_cb)(void);
47};
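As a sketch of the pipe convention documented above, from a caller's perspective (error handling trimmed; the command and helper name are hypothetical):

	#include "run-command.h"

	static int run_and_drain(void)
	{
		static const char *argv[] = { "perf", "--version", NULL };
		struct child_process child;
		char buf[1024];

		memset(&child, 0, sizeof(child));
		child.argv = argv;
		child.no_stdin = 1;
		child.out = -1;		/* have start_command() allocate a pipe */

		if (start_command(&child))
			return -1;
		while (read(child.out, buf, sizeof(buf)) > 0)
			;		/* parent reads the child's stdout here */
		close(child.out);	/* the caller must close the returned fd */
		return finish_command(&child);
	}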
48
49int start_command(struct child_process *);
50int finish_command(struct child_process *);
51int run_command(struct child_process *);
52
53extern int run_hook(const char *index_file, const char *name, ...);
54
55#define RUN_COMMAND_NO_STDIN 1
56#define RUN_PERF_CMD 2 /* if this is to be a perf sub-command */
57#define RUN_COMMAND_STDOUT_TO_STDERR 4
58int run_command_v_opt(const char **argv, int opt);
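For the common case of just running a command and waiting for it, a sketch (the command is arbitrary):

	static const char *argv[] = { "uname", "-a", NULL };

	if (run_command_v_opt(argv, RUN_COMMAND_NO_STDIN))
		error("uname failed");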
59
60/*
61 * env (the environment) is to be formatted like environ: "VAR=VALUE".
62 * To unset an environment variable use just "VAR".
63 */
64int run_command_v_opt_cd_env(const char **argv, int opt, const char *dir, const char *const *env);
65
66/*
67 * The purpose of the following functions is to feed a pipe by running
68 * a function asynchronously and providing output that the caller reads.
69 *
70 * It is expected that no synchronization and mutual exclusion between
71 * the caller and the feed function is necessary so that the function
72 * can run in a thread without interfering with the caller.
73 */
74struct async {
75 /*
76 * proc writes to fd and closes it;
77 * returns 0 on success, non-zero on failure
78 */
79 int (*proc)(int fd, void *data);
80 void *data;
81 int out; /* caller reads from here and closes it */
82#ifndef __MINGW32__
83 pid_t pid;
84#else
85 HANDLE tid;
86 int fd_for_proc;
87#endif
88};
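A sketch of the asynchronous interface above; the producer function and its message are made up:

	static int produce(int fd, void *data)
	{
		const char *msg = data;

		write(fd, msg, strlen(msg));
		close(fd);			/* proc writes to fd and closes it */
		return 0;
	}

	static int demo(void)
	{
		struct async as;
		char buf[64];

		memset(&as, 0, sizeof(as));
		as.proc = produce;
		as.data = (void *)"hello\n";
		if (start_async(&as))
			return -1;
		while (read(as.out, buf, sizeof(buf)) > 0)
			;			/* caller consumes what proc wrote */
		close(as.out);
		return finish_async(&as);
	}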
89
90int start_async(struct async *async);
91int finish_async(struct async *async);
92
93#endif
diff --git a/Documentation/perf_counter/util/strbuf.c b/Documentation/perf_counter/util/strbuf.c
new file mode 100644
index 000000000000..eaba09306802
--- /dev/null
+++ b/Documentation/perf_counter/util/strbuf.c
@@ -0,0 +1,359 @@
1#include "cache.h"
2
3int prefixcmp(const char *str, const char *prefix)
4{
5 for (; ; str++, prefix++)
6 if (!*prefix)
7 return 0;
8 else if (*str != *prefix)
9 return (unsigned char)*prefix - (unsigned char)*str;
10}
11
12/*
13 * Used as the default ->buf value, so that people can always assume
14 * buf is non-NULL and ->buf is NUL terminated even for a freshly
15 * initialized strbuf.
16 */
17char strbuf_slopbuf[1];
18
19void strbuf_init(struct strbuf *sb, size_t hint)
20{
21 sb->alloc = sb->len = 0;
22 sb->buf = strbuf_slopbuf;
23 if (hint)
24 strbuf_grow(sb, hint);
25}
26
27void strbuf_release(struct strbuf *sb)
28{
29 if (sb->alloc) {
30 free(sb->buf);
31 strbuf_init(sb, 0);
32 }
33}
34
35char *strbuf_detach(struct strbuf *sb, size_t *sz)
36{
37 char *res = sb->alloc ? sb->buf : NULL;
38 if (sz)
39 *sz = sb->len;
40 strbuf_init(sb, 0);
41 return res;
42}
43
44void strbuf_attach(struct strbuf *sb, void *buf, size_t len, size_t alloc)
45{
46 strbuf_release(sb);
47 sb->buf = buf;
48 sb->len = len;
49 sb->alloc = alloc;
50 strbuf_grow(sb, 0);
51 sb->buf[sb->len] = '\0';
52}
53
54void strbuf_grow(struct strbuf *sb, size_t extra)
55{
56 if (sb->len + extra + 1 <= sb->len)
57 die("you want to use way too much memory");
58 if (!sb->alloc)
59 sb->buf = NULL;
60 ALLOC_GROW(sb->buf, sb->len + extra + 1, sb->alloc);
61}
62
63void strbuf_trim(struct strbuf *sb)
64{
65 char *b = sb->buf;
66 while (sb->len > 0 && isspace((unsigned char)sb->buf[sb->len - 1]))
67 sb->len--;
68 while (sb->len > 0 && isspace(*b)) {
69 b++;
70 sb->len--;
71 }
72 memmove(sb->buf, b, sb->len);
73 sb->buf[sb->len] = '\0';
74}
75void strbuf_rtrim(struct strbuf *sb)
76{
77 while (sb->len > 0 && isspace((unsigned char)sb->buf[sb->len - 1]))
78 sb->len--;
79 sb->buf[sb->len] = '\0';
80}
81
82void strbuf_ltrim(struct strbuf *sb)
83{
84 char *b = sb->buf;
85 while (sb->len > 0 && isspace(*b)) {
86 b++;
87 sb->len--;
88 }
89 memmove(sb->buf, b, sb->len);
90 sb->buf[sb->len] = '\0';
91}
92
93void strbuf_tolower(struct strbuf *sb)
94{
95 int i;
96 for (i = 0; i < sb->len; i++)
97 sb->buf[i] = tolower(sb->buf[i]);
98}
99
100struct strbuf **strbuf_split(const struct strbuf *sb, int delim)
101{
102 int alloc = 2, pos = 0;
103 char *n, *p;
104 struct strbuf **ret;
105 struct strbuf *t;
106
107 ret = calloc(alloc, sizeof(struct strbuf *));
108 p = n = sb->buf;
109 while (n < sb->buf + sb->len) {
110 int len;
111 n = memchr(n, delim, sb->len - (n - sb->buf));
112 if (pos + 1 >= alloc) {
113 alloc = alloc * 2;
114 ret = realloc(ret, sizeof(struct strbuf *) * alloc);
115 }
116 if (!n)
117 n = sb->buf + sb->len - 1;
118 len = n - p + 1;
119 t = malloc(sizeof(struct strbuf));
120 strbuf_init(t, len);
121 strbuf_add(t, p, len);
122 ret[pos] = t;
123 ret[++pos] = NULL;
124 p = ++n;
125 }
126 return ret;
127}
128
129void strbuf_list_free(struct strbuf **sbs)
130{
131 struct strbuf **s = sbs;
132
133 while (*s) {
134 strbuf_release(*s);
135 free(*s++);
136 }
137 free(sbs);
138}
139
140int strbuf_cmp(const struct strbuf *a, const struct strbuf *b)
141{
142 int len = a->len < b->len ? a->len: b->len;
143 int cmp = memcmp(a->buf, b->buf, len);
144 if (cmp)
145 return cmp;
146 return a->len < b->len ? -1: a->len != b->len;
147}
148
149void strbuf_splice(struct strbuf *sb, size_t pos, size_t len,
150 const void *data, size_t dlen)
151{
152 if (pos + len < pos)
153 die("you want to use way too much memory");
154 if (pos > sb->len)
155 die("`pos' is too far after the end of the buffer");
156 if (pos + len > sb->len)
157 die("`pos + len' is too far after the end of the buffer");
158
159 if (dlen >= len)
160 strbuf_grow(sb, dlen - len);
161 memmove(sb->buf + pos + dlen,
162 sb->buf + pos + len,
163 sb->len - pos - len);
164 memcpy(sb->buf + pos, data, dlen);
165 strbuf_setlen(sb, sb->len + dlen - len);
166}
167
168void strbuf_insert(struct strbuf *sb, size_t pos, const void *data, size_t len)
169{
170 strbuf_splice(sb, pos, 0, data, len);
171}
172
173void strbuf_remove(struct strbuf *sb, size_t pos, size_t len)
174{
175 strbuf_splice(sb, pos, len, NULL, 0);
176}
177
178void strbuf_add(struct strbuf *sb, const void *data, size_t len)
179{
180 strbuf_grow(sb, len);
181 memcpy(sb->buf + sb->len, data, len);
182 strbuf_setlen(sb, sb->len + len);
183}
184
185void strbuf_adddup(struct strbuf *sb, size_t pos, size_t len)
186{
187 strbuf_grow(sb, len);
188 memcpy(sb->buf + sb->len, sb->buf + pos, len);
189 strbuf_setlen(sb, sb->len + len);
190}
191
192void strbuf_addf(struct strbuf *sb, const char *fmt, ...)
193{
194 int len;
195 va_list ap;
196
197 if (!strbuf_avail(sb))
198 strbuf_grow(sb, 64);
199 va_start(ap, fmt);
200 len = vsnprintf(sb->buf + sb->len, sb->alloc - sb->len, fmt, ap);
201 va_end(ap);
202 if (len < 0)
203 die("your vsnprintf is broken");
204 if (len > strbuf_avail(sb)) {
205 strbuf_grow(sb, len);
206 va_start(ap, fmt);
207 len = vsnprintf(sb->buf + sb->len, sb->alloc - sb->len, fmt, ap);
208 va_end(ap);
209 if (len > strbuf_avail(sb)) {
210 die("this should not happen, your snprintf is broken");
211 }
212 }
213 strbuf_setlen(sb, sb->len + len);
214}
215
216void strbuf_expand(struct strbuf *sb, const char *format, expand_fn_t fn,
217 void *context)
218{
219 for (;;) {
220 const char *percent;
221 size_t consumed;
222
223 percent = strchrnul(format, '%');
224 strbuf_add(sb, format, percent - format);
225 if (!*percent)
226 break;
227 format = percent + 1;
228
229 consumed = fn(sb, format, context);
230 if (consumed)
231 format += consumed;
232 else
233 strbuf_addch(sb, '%');
234 }
235}
236
237size_t strbuf_expand_dict_cb(struct strbuf *sb, const char *placeholder,
238 void *context)
239{
240 struct strbuf_expand_dict_entry *e = context;
241 size_t len;
242
243 for (; e->placeholder && (len = strlen(e->placeholder)); e++) {
244 if (!strncmp(placeholder, e->placeholder, len)) {
245 if (e->value)
246 strbuf_addstr(sb, e->value);
247 return len;
248 }
249 }
250 return 0;
251}
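A sketch of using the dictionary callback above; the placeholder and value are arbitrary:

	static struct strbuf_expand_dict_entry dict[] = {
		{ "H", "fancy-host" },
		{ NULL, NULL },
	};
	struct strbuf sb = STRBUF_INIT;

	strbuf_expand(&sb, "running on %H", strbuf_expand_dict_cb, dict);
	/* sb.buf is now "running on fancy-host" */
	strbuf_release(&sb);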
252
253size_t strbuf_fread(struct strbuf *sb, size_t size, FILE *f)
254{
255 size_t res;
256 size_t oldalloc = sb->alloc;
257
258 strbuf_grow(sb, size);
259 res = fread(sb->buf + sb->len, 1, size, f);
260 if (res > 0)
261 strbuf_setlen(sb, sb->len + res);
262 else if (res < 0 && oldalloc == 0)
263 strbuf_release(sb);
264 return res;
265}
266
267ssize_t strbuf_read(struct strbuf *sb, int fd, size_t hint)
268{
269 size_t oldlen = sb->len;
270 size_t oldalloc = sb->alloc;
271
272 strbuf_grow(sb, hint ? hint : 8192);
273 for (;;) {
274 ssize_t cnt;
275
276 cnt = read(fd, sb->buf + sb->len, sb->alloc - sb->len - 1);
277 if (cnt < 0) {
278 if (oldalloc == 0)
279 strbuf_release(sb);
280 else
281 strbuf_setlen(sb, oldlen);
282 return -1;
283 }
284 if (!cnt)
285 break;
286 sb->len += cnt;
287 strbuf_grow(sb, 8192);
288 }
289
290 sb->buf[sb->len] = '\0';
291 return sb->len - oldlen;
292}
293
294#define STRBUF_MAXLINK (2*PATH_MAX)
295
296int strbuf_readlink(struct strbuf *sb, const char *path, size_t hint)
297{
298 size_t oldalloc = sb->alloc;
299
300 if (hint < 32)
301 hint = 32;
302
303 while (hint < STRBUF_MAXLINK) {
304 int len;
305
306 strbuf_grow(sb, hint);
307 len = readlink(path, sb->buf, hint);
308 if (len < 0) {
309 if (errno != ERANGE)
310 break;
311 } else if (len < hint) {
312 strbuf_setlen(sb, len);
313 return 0;
314 }
315
316 /* .. the buffer was too small - try again */
317 hint *= 2;
318 }
319 if (oldalloc == 0)
320 strbuf_release(sb);
321 return -1;
322}
323
324int strbuf_getline(struct strbuf *sb, FILE *fp, int term)
325{
326 int ch;
327
328 strbuf_grow(sb, 0);
329 if (feof(fp))
330 return EOF;
331
332 strbuf_reset(sb);
333 while ((ch = fgetc(fp)) != EOF) {
334 if (ch == term)
335 break;
336 strbuf_grow(sb, 1);
337 sb->buf[sb->len++] = ch;
338 }
339 if (ch == EOF && sb->len == 0)
340 return EOF;
341
342 sb->buf[sb->len] = '\0';
343 return 0;
344}
345
346int strbuf_read_file(struct strbuf *sb, const char *path, size_t hint)
347{
348 int fd, len;
349
350 fd = open(path, O_RDONLY);
351 if (fd < 0)
352 return -1;
353 len = strbuf_read(sb, fd, hint);
354 close(fd);
355 if (len < 0)
356 return -1;
357
358 return len;
359}
diff --git a/Documentation/perf_counter/util/strbuf.h b/Documentation/perf_counter/util/strbuf.h
new file mode 100644
index 000000000000..9ee908a3ec5d
--- /dev/null
+++ b/Documentation/perf_counter/util/strbuf.h
@@ -0,0 +1,137 @@
1#ifndef STRBUF_H
2#define STRBUF_H
3
4/*
5 * Strbufs can be used in many ways: as a byte array, or to store
6 * arbitrarily long, overflow-safe strings.
7 *
8 * Strbufs have some invariants that are very important to keep in mind:
9 *
10 * 1. the ->buf member is always malloc-ed, hence strbuf's can be used to
11 * build complex strings/buffers whose final size isn't easily known.
12 *
13 * It is NOT legal to copy the ->buf pointer away.
14 * `strbuf_detach' is the operation that detaches a buffer from its shell
15 * while keeping the shell valid wrt its invariants.
16 *
17 * 2. the ->buf member is a byte array that has at least ->len + 1 bytes
18 * allocated. The extra byte is used to store a '\0', allowing the ->buf
19 * member to be a valid C-string. Every strbuf function ensures this
20 * invariant is preserved.
21 *
22 * Note that it is OK to "play" with the buffer directly if you work it
23 * that way:
24 *
25 * strbuf_grow(sb, SOME_SIZE);
26 * ... Here, the memory array starting at sb->buf, and of length
27 * ... strbuf_avail(sb) is all yours, and you are sure that
28 * ... strbuf_avail(sb) is at least SOME_SIZE.
29 * strbuf_setlen(sb, sb->len + SOME_OTHER_SIZE);
30 *
31 * Of course, SOME_OTHER_SIZE must be smaller or equal to strbuf_avail(sb).
32 *
33 * Doing so is safe, though if it has to be done in many places, adding the
34 * missing API to the strbuf module is the way to go.
35 *
36 * XXX: do _not_ assume that the area that is yours is of size ->alloc - 1
37 * even if it's true in the current implementation. Alloc is somehow a
38 * "private" member that should not be messed with.
39 */
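A minimal life-cycle sketch tying the invariants above to the API declared below; the values are illustrative:

	struct strbuf sb = STRBUF_INIT;

	strbuf_addstr(&sb, "perf.data");
	strbuf_addf(&sb, ".%d", 42);
	printf("%s\n", sb.buf);		/* "perf.data.42", always NUL terminated */
	strbuf_release(&sb);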
40
41#include <assert.h>
42
43extern char strbuf_slopbuf[];
44struct strbuf {
45 size_t alloc;
46 size_t len;
47 char *buf;
48};
49
50#define STRBUF_INIT { 0, 0, strbuf_slopbuf }
51
52/*----- strbuf life cycle -----*/
53extern void strbuf_init(struct strbuf *, size_t);
54extern void strbuf_release(struct strbuf *);
55extern char *strbuf_detach(struct strbuf *, size_t *);
56extern void strbuf_attach(struct strbuf *, void *, size_t, size_t);
57static inline void strbuf_swap(struct strbuf *a, struct strbuf *b) {
58 struct strbuf tmp = *a;
59 *a = *b;
60 *b = tmp;
61}
62
63/*----- strbuf size related -----*/
64static inline size_t strbuf_avail(const struct strbuf *sb) {
65 return sb->alloc ? sb->alloc - sb->len - 1 : 0;
66}
67
68extern void strbuf_grow(struct strbuf *, size_t);
69
70static inline void strbuf_setlen(struct strbuf *sb, size_t len) {
71 if (!sb->alloc)
72 strbuf_grow(sb, 0);
73 assert(len < sb->alloc);
74 sb->len = len;
75 sb->buf[len] = '\0';
76}
77#define strbuf_reset(sb) strbuf_setlen(sb, 0)
78
79/*----- content related -----*/
80extern void strbuf_trim(struct strbuf *);
81extern void strbuf_rtrim(struct strbuf *);
82extern void strbuf_ltrim(struct strbuf *);
83extern int strbuf_cmp(const struct strbuf *, const struct strbuf *);
84extern void strbuf_tolower(struct strbuf *);
85
86extern struct strbuf **strbuf_split(const struct strbuf *, int delim);
87extern void strbuf_list_free(struct strbuf **);
88
89/*----- add data in your buffer -----*/
90static inline void strbuf_addch(struct strbuf *sb, int c) {
91 strbuf_grow(sb, 1);
92 sb->buf[sb->len++] = c;
93 sb->buf[sb->len] = '\0';
94}
95
96extern void strbuf_insert(struct strbuf *, size_t pos, const void *, size_t);
97extern void strbuf_remove(struct strbuf *, size_t pos, size_t len);
98
99/* splice pos..pos+len with given data */
100extern void strbuf_splice(struct strbuf *, size_t pos, size_t len,
101 const void *, size_t);
102
103extern void strbuf_add(struct strbuf *, const void *, size_t);
104static inline void strbuf_addstr(struct strbuf *sb, const char *s) {
105 strbuf_add(sb, s, strlen(s));
106}
107static inline void strbuf_addbuf(struct strbuf *sb, const struct strbuf *sb2) {
108 strbuf_add(sb, sb2->buf, sb2->len);
109}
110extern void strbuf_adddup(struct strbuf *sb, size_t pos, size_t len);
111
112typedef size_t (*expand_fn_t) (struct strbuf *sb, const char *placeholder, void *context);
113extern void strbuf_expand(struct strbuf *sb, const char *format, expand_fn_t fn, void *context);
114struct strbuf_expand_dict_entry {
115 const char *placeholder;
116 const char *value;
117};
118extern size_t strbuf_expand_dict_cb(struct strbuf *sb, const char *placeholder, void *context);
119
120__attribute__((format(printf,2,3)))
121extern void strbuf_addf(struct strbuf *sb, const char *fmt, ...);
122
123extern size_t strbuf_fread(struct strbuf *, size_t, FILE *);
124/* XXX: if read fails, any partial read is undone */
125extern ssize_t strbuf_read(struct strbuf *, int fd, size_t hint);
126extern int strbuf_read_file(struct strbuf *sb, const char *path, size_t hint);
127extern int strbuf_readlink(struct strbuf *sb, const char *path, size_t hint);
128
129extern int strbuf_getline(struct strbuf *, FILE *, int);
130
131extern void stripspace(struct strbuf *buf, int skip_comments);
132extern int launch_editor(const char *path, struct strbuf *buffer, const char *const *env);
133
134extern int strbuf_branchname(struct strbuf *sb, const char *name);
135extern int strbuf_check_branch_ref(struct strbuf *sb, const char *name);
136
137#endif /* STRBUF_H */
diff --git a/Documentation/perf_counter/util/usage.c b/Documentation/perf_counter/util/usage.c
new file mode 100644
index 000000000000..7a10421fe6b4
--- /dev/null
+++ b/Documentation/perf_counter/util/usage.c
@@ -0,0 +1,80 @@
1/*
2 * GIT - The information manager from hell
3 *
4 * Copyright (C) Linus Torvalds, 2005
5 */
6#include "util.h"
7
8static void report(const char *prefix, const char *err, va_list params)
9{
10 char msg[1024];
11 vsnprintf(msg, sizeof(msg), err, params);
12 fprintf(stderr, "%s%s\n", prefix, msg);
13}
14
15static NORETURN void usage_builtin(const char *err)
16{
17 fprintf(stderr, "usage: %s\n", err);
18 exit(129);
19}
20
21static NORETURN void die_builtin(const char *err, va_list params)
22{
23 report("fatal: ", err, params);
24 exit(128);
25}
26
27static void error_builtin(const char *err, va_list params)
28{
29 report("error: ", err, params);
30}
31
32static void warn_builtin(const char *warn, va_list params)
33{
34 report("warning: ", warn, params);
35}
36
37/* If we are in a dlopen()ed .so, writing to a global variable would segfault
38 * (ugh), so keep things static. */
39static void (*usage_routine)(const char *err) NORETURN = usage_builtin;
40static void (*die_routine)(const char *err, va_list params) NORETURN = die_builtin;
41static void (*error_routine)(const char *err, va_list params) = error_builtin;
42static void (*warn_routine)(const char *err, va_list params) = warn_builtin;
43
44void set_die_routine(void (*routine)(const char *err, va_list params) NORETURN)
45{
46 die_routine = routine;
47}
48
49void usage(const char *err)
50{
51 usage_routine(err);
52}
53
54void die(const char *err, ...)
55{
56 va_list params;
57
58 va_start(params, err);
59 die_routine(err, params);
60 va_end(params);
61}
62
63int error(const char *err, ...)
64{
65 va_list params;
66
67 va_start(params, err);
68 error_routine(err, params);
69 va_end(params);
70 return -1;
71}
72
73void warning(const char *warn, ...)
74{
75 va_list params;
76
77 va_start(params, warn);
78 warn_routine(warn, params);
79 va_end(params);
80}
diff --git a/Documentation/perf_counter/util/util.h b/Documentation/perf_counter/util/util.h
new file mode 100644
index 000000000000..36e40c38e093
--- /dev/null
+++ b/Documentation/perf_counter/util/util.h
@@ -0,0 +1,408 @@
1#ifndef GIT_COMPAT_UTIL_H
2#define GIT_COMPAT_UTIL_H
3
4#define _FILE_OFFSET_BITS 64
5
6#ifndef FLEX_ARRAY
7/*
8 * See if our compiler is known to support flexible array members.
9 */
10#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
11# define FLEX_ARRAY /* empty */
12#elif defined(__GNUC__)
13# if (__GNUC__ >= 3)
14# define FLEX_ARRAY /* empty */
15# else
16# define FLEX_ARRAY 0 /* older GNU extension */
17# endif
18#endif
19
20/*
21 * Otherwise, default to safer but a bit wasteful traditional style
22 */
23#ifndef FLEX_ARRAY
24# define FLEX_ARRAY 1
25#endif
26#endif
27
28#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0]))
29
30#ifdef __GNUC__
31#define TYPEOF(x) (__typeof__(x))
32#else
33#define TYPEOF(x)
34#endif
35
36#define MSB(x, bits) ((x) & TYPEOF(x)(~0ULL << (sizeof(x) * 8 - (bits))))
37#define HAS_MULTI_BITS(i) ((i) & ((i) - 1)) /* checks if an integer has more than 1 bit set */
38
39/* Approximation of the length of the decimal representation of this type. */
40#define decimal_length(x) ((int)(sizeof(x) * 2.56 + 0.5) + 1)
41
42#if !defined(__APPLE__) && !defined(__FreeBSD__) && !defined(__USLC__) && !defined(_M_UNIX)
43#define _XOPEN_SOURCE 600 /* glibc2 and AIX 5.3L need 500, OpenBSD needs 600 for S_ISLNK() */
44#define _XOPEN_SOURCE_EXTENDED 1 /* AIX 5.3L needs this */
45#endif
46#define _ALL_SOURCE 1
47#define _GNU_SOURCE 1
48#define _BSD_SOURCE 1
49
50#include <unistd.h>
51#include <stdio.h>
52#include <sys/stat.h>
53#include <fcntl.h>
54#include <stddef.h>
55#include <stdlib.h>
56#include <stdarg.h>
57#include <string.h>
58#include <errno.h>
59#include <limits.h>
60#include <sys/param.h>
61#include <sys/types.h>
62#include <dirent.h>
63#include <sys/time.h>
64#include <time.h>
65#include <signal.h>
66#include <fnmatch.h>
67#include <assert.h>
68#include <regex.h>
69#include <utime.h>
70#ifndef __MINGW32__
71#include <sys/wait.h>
72#include <sys/poll.h>
73#include <sys/socket.h>
74#include <sys/ioctl.h>
75#ifndef NO_SYS_SELECT_H
76#include <sys/select.h>
77#endif
78#include <netinet/in.h>
79#include <netinet/tcp.h>
80#include <arpa/inet.h>
81#include <netdb.h>
82#include <pwd.h>
83#include <inttypes.h>
84#if defined(__CYGWIN__)
85#undef _XOPEN_SOURCE
86#include <grp.h>
87#define _XOPEN_SOURCE 600
88#include "compat/cygwin.h"
89#else
90#undef _ALL_SOURCE /* AIX 5.3L defines a struct list with _ALL_SOURCE. */
91#include <grp.h>
92#define _ALL_SOURCE 1
93#endif
94#else /* __MINGW32__ */
95/* pull in Windows compatibility stuff */
96#include "compat/mingw.h"
97#endif /* __MINGW32__ */
98
99#ifndef NO_ICONV
100#include <iconv.h>
101#endif
102
103#ifndef NO_OPENSSL
104#include <openssl/ssl.h>
105#include <openssl/err.h>
106#endif
107
108/* On most systems <limits.h> would have given us this, but
109 * not on some systems (e.g. GNU/Hurd).
110 */
111#ifndef PATH_MAX
112#define PATH_MAX 4096
113#endif
114
115#ifndef PRIuMAX
116#define PRIuMAX "llu"
117#endif
118
119#ifndef PRIu32
120#define PRIu32 "u"
121#endif
122
123#ifndef PRIx32
124#define PRIx32 "x"
125#endif
126
127#ifndef PATH_SEP
128#define PATH_SEP ':'
129#endif
130
131#ifndef STRIP_EXTENSION
132#define STRIP_EXTENSION ""
133#endif
134
135#ifndef has_dos_drive_prefix
136#define has_dos_drive_prefix(path) 0
137#endif
138
139#ifndef is_dir_sep
140#define is_dir_sep(c) ((c) == '/')
141#endif
142
143#ifdef __GNUC__
144#define NORETURN __attribute__((__noreturn__))
145#else
146#define NORETURN
147#ifndef __attribute__
148#define __attribute__(x)
149#endif
150#endif
151
152/* General helper functions */
153extern void usage(const char *err) NORETURN;
154extern void die(const char *err, ...) NORETURN __attribute__((format (printf, 1, 2)));
155extern int error(const char *err, ...) __attribute__((format (printf, 1, 2)));
156extern void warning(const char *err, ...) __attribute__((format (printf, 1, 2)));
157
158extern void set_die_routine(void (*routine)(const char *err, va_list params) NORETURN);
159
160extern int prefixcmp(const char *str, const char *prefix);
161extern time_t tm_to_time_t(const struct tm *tm);
162
163static inline const char *skip_prefix(const char *str, const char *prefix)
164{
165 size_t len = strlen(prefix);
166 return strncmp(str, prefix, len) ? NULL : str + len;
167}
168
169#if defined(NO_MMAP) || defined(USE_WIN32_MMAP)
170
171#ifndef PROT_READ
172#define PROT_READ 1
173#define PROT_WRITE 2
174#define MAP_PRIVATE 1
175#define MAP_FAILED ((void*)-1)
176#endif
177
178#define mmap git_mmap
179#define munmap git_munmap
180extern void *git_mmap(void *start, size_t length, int prot, int flags, int fd, off_t offset);
181extern int git_munmap(void *start, size_t length);
182
183#else /* NO_MMAP || USE_WIN32_MMAP */
184
185#include <sys/mman.h>
186
187#endif /* NO_MMAP || USE_WIN32_MMAP */
188
189#ifdef NO_MMAP
190
191/* This value must be multiple of (pagesize * 2) */
192#define DEFAULT_PACKED_GIT_WINDOW_SIZE (1 * 1024 * 1024)
193
194#else /* NO_MMAP */
195
196/* This value must be multiple of (pagesize * 2) */
197#define DEFAULT_PACKED_GIT_WINDOW_SIZE \
198 (sizeof(void*) >= 8 \
199 ? 1 * 1024 * 1024 * 1024 \
200 : 32 * 1024 * 1024)
201
202#endif /* NO_MMAP */
203
204#ifdef NO_ST_BLOCKS_IN_STRUCT_STAT
205#define on_disk_bytes(st) ((st).st_size)
206#else
207#define on_disk_bytes(st) ((st).st_blocks * 512)
208#endif
209
210#define DEFAULT_PACKED_GIT_LIMIT \
211 ((1024L * 1024L) * (sizeof(void*) >= 8 ? 8192 : 256))
212
213#ifdef NO_PREAD
214#define pread git_pread
215extern ssize_t git_pread(int fd, void *buf, size_t count, off_t offset);
216#endif
217/*
218 * Forward decl that will remind us if its twin in cache.h changes.
219 * This function is used in compat/pread.c. But we can't include
220 * cache.h there.
221 */
222extern ssize_t read_in_full(int fd, void *buf, size_t count);
223
224#ifdef NO_SETENV
225#define setenv gitsetenv
226extern int gitsetenv(const char *, const char *, int);
227#endif
228
229#ifdef NO_MKDTEMP
230#define mkdtemp gitmkdtemp
231extern char *gitmkdtemp(char *);
232#endif
233
234#ifdef NO_UNSETENV
235#define unsetenv gitunsetenv
236extern void gitunsetenv(const char *);
237#endif
238
239#ifdef NO_STRCASESTR
240#define strcasestr gitstrcasestr
241extern char *gitstrcasestr(const char *haystack, const char *needle);
242#endif
243
244#ifdef NO_STRLCPY
245#define strlcpy gitstrlcpy
246extern size_t gitstrlcpy(char *, const char *, size_t);
247#endif
248
249#ifdef NO_STRTOUMAX
250#define strtoumax gitstrtoumax
251extern uintmax_t gitstrtoumax(const char *, char **, int);
252#endif
253
254#ifdef NO_HSTRERROR
255#define hstrerror githstrerror
256extern const char *githstrerror(int herror);
257#endif
258
259#ifdef NO_MEMMEM
260#define memmem gitmemmem
261void *gitmemmem(const void *haystack, size_t haystacklen,
262 const void *needle, size_t needlelen);
263#endif
264
265#ifdef FREAD_READS_DIRECTORIES
266#ifdef fopen
267#undef fopen
268#endif
269#define fopen(a,b) git_fopen(a,b)
270extern FILE *git_fopen(const char*, const char*);
271#endif
272
273#ifdef SNPRINTF_RETURNS_BOGUS
274#define snprintf git_snprintf
275extern int git_snprintf(char *str, size_t maxsize,
276 const char *format, ...);
277#define vsnprintf git_vsnprintf
278extern int git_vsnprintf(char *str, size_t maxsize,
279 const char *format, va_list ap);
280#endif
281
282#ifdef __GLIBC_PREREQ
283#if __GLIBC_PREREQ(2, 1)
284#define HAVE_STRCHRNUL
285#endif
286#endif
287
288#ifndef HAVE_STRCHRNUL
289#define strchrnul gitstrchrnul
290static inline char *gitstrchrnul(const char *s, int c)
291{
292 while (*s && *s != c)
293 s++;
294 return (char *)s;
295}
296#endif
297
298/*
299 * Wrappers:
300 */
301extern char *xstrdup(const char *str);
302extern void *xmalloc(size_t size);
303extern void *xmemdupz(const void *data, size_t len);
304extern char *xstrndup(const char *str, size_t len);
305extern void *xrealloc(void *ptr, size_t size);
306extern void *xcalloc(size_t nmemb, size_t size);
307extern void *xmmap(void *start, size_t length, int prot, int flags, int fd, off_t offset);
308extern ssize_t xread(int fd, void *buf, size_t len);
309extern ssize_t xwrite(int fd, const void *buf, size_t len);
310extern int xdup(int fd);
311extern FILE *xfdopen(int fd, const char *mode);
312static inline size_t xsize_t(off_t len)
313{
314 return (size_t)len;
315}
316
317static inline int has_extension(const char *filename, const char *ext)
318{
319 size_t len = strlen(filename);
320 size_t extlen = strlen(ext);
321 return len > extlen && !memcmp(filename + len - extlen, ext, extlen);
322}
323
324/* Sane ctype - no locale, and works with signed chars */
325#undef isascii
326#undef isspace
327#undef isdigit
328#undef isalpha
329#undef isalnum
330#undef tolower
331#undef toupper
332extern unsigned char sane_ctype[256];
333#define GIT_SPACE 0x01
334#define GIT_DIGIT 0x02
335#define GIT_ALPHA 0x04
336#define GIT_GLOB_SPECIAL 0x08
337#define GIT_REGEX_SPECIAL 0x10
338#define sane_istest(x,mask) ((sane_ctype[(unsigned char)(x)] & (mask)) != 0)
339#define isascii(x) (((x) & ~0x7f) == 0)
340#define isspace(x) sane_istest(x,GIT_SPACE)
341#define isdigit(x) sane_istest(x,GIT_DIGIT)
342#define isalpha(x) sane_istest(x,GIT_ALPHA)
343#define isalnum(x) sane_istest(x,GIT_ALPHA | GIT_DIGIT)
344#define is_glob_special(x) sane_istest(x,GIT_GLOB_SPECIAL)
345#define is_regex_special(x) sane_istest(x,GIT_GLOB_SPECIAL | GIT_REGEX_SPECIAL)
346#define tolower(x) sane_case((unsigned char)(x), 0x20)
347#define toupper(x) sane_case((unsigned char)(x), 0)
348
349static inline int sane_case(int x, int high)
350{
351 if (sane_istest(x, GIT_ALPHA))
352 x = (x & ~0x20) | high;
353 return x;
354}
355
356static inline int strtoul_ui(char const *s, int base, unsigned int *result)
357{
358 unsigned long ul;
359 char *p;
360
361 errno = 0;
362 ul = strtoul(s, &p, base);
363 if (errno || *p || p == s || (unsigned int) ul != ul)
364 return -1;
365 *result = ul;
366 return 0;
367}
368
369static inline int strtol_i(char const *s, int base, int *result)
370{
371 long ul;
372 char *p;
373
374 errno = 0;
375 ul = strtol(s, &p, base);
376 if (errno || *p || p == s || (int) ul != ul)
377 return -1;
378 *result = ul;
379 return 0;
380}
381
382#ifdef INTERNAL_QSORT
383void git_qsort(void *base, size_t nmemb, size_t size,
384 int(*compar)(const void *, const void *));
385#define qsort git_qsort
386#endif
387
388#ifndef DIR_HAS_BSD_GROUP_SEMANTICS
389# define FORCE_DIR_SET_GID S_ISGID
390#else
391# define FORCE_DIR_SET_GID 0
392#endif
393
394#ifdef NO_NSEC
395#undef USE_NSEC
396#define ST_CTIME_NSEC(st) 0
397#define ST_MTIME_NSEC(st) 0
398#else
399#ifdef USE_ST_TIMESPEC
400#define ST_CTIME_NSEC(st) ((unsigned int)((st).st_ctimespec.tv_nsec))
401#define ST_MTIME_NSEC(st) ((unsigned int)((st).st_mtimespec.tv_nsec))
402#else
403#define ST_CTIME_NSEC(st) ((unsigned int)((st).st_ctim.tv_nsec))
404#define ST_MTIME_NSEC(st) ((unsigned int)((st).st_mtim.tv_nsec))
405#endif
406#endif
407
408#endif
diff --git a/Documentation/perf_counter/util/wrapper.c b/Documentation/perf_counter/util/wrapper.c
new file mode 100644
index 000000000000..6350d65f6d9e
--- /dev/null
+++ b/Documentation/perf_counter/util/wrapper.c
@@ -0,0 +1,206 @@
1/*
2 * Various trivial helper wrappers around standard functions
3 */
4#include "cache.h"
5
6/*
7 * There's no pack memory to release here, but stay close to the Git
8 * version, so wrap this away:
9 */
10static inline void release_pack_memory(size_t size, int flag)
11{
12}
13
14char *xstrdup(const char *str)
15{
16 char *ret = strdup(str);
17 if (!ret) {
18 release_pack_memory(strlen(str) + 1, -1);
19 ret = strdup(str);
20 if (!ret)
21 die("Out of memory, strdup failed");
22 }
23 return ret;
24}
25
26void *xmalloc(size_t size)
27{
28 void *ret = malloc(size);
29 if (!ret && !size)
30 ret = malloc(1);
31 if (!ret) {
32 release_pack_memory(size, -1);
33 ret = malloc(size);
34 if (!ret && !size)
35 ret = malloc(1);
36 if (!ret)
37 die("Out of memory, malloc failed");
38 }
39#ifdef XMALLOC_POISON
40 memset(ret, 0xA5, size);
41#endif
42 return ret;
43}
44
45/*
46 * xmemdupz() allocates (len + 1) bytes of memory, duplicates "len" bytes of
47 * "data" to the allocated memory, zero terminates the allocated memory,
48 * and returns a pointer to the allocated memory. If the allocation fails,
49 * the program dies.
50 */
51void *xmemdupz(const void *data, size_t len)
52{
53 char *p = xmalloc(len + 1);
54 memcpy(p, data, len);
55 p[len] = '\0';
56 return p;
57}
58
59char *xstrndup(const char *str, size_t len)
60{
61 char *p = memchr(str, '\0', len);
62 return xmemdupz(str, p ? p - str : len);
63}
64
65void *xrealloc(void *ptr, size_t size)
66{
67 void *ret = realloc(ptr, size);
68 if (!ret && !size)
69 ret = realloc(ptr, 1);
70 if (!ret) {
71 release_pack_memory(size, -1);
72 ret = realloc(ptr, size);
73 if (!ret && !size)
74 ret = realloc(ptr, 1);
75 if (!ret)
76 die("Out of memory, realloc failed");
77 }
78 return ret;
79}
80
81void *xcalloc(size_t nmemb, size_t size)
82{
83 void *ret = calloc(nmemb, size);
84 if (!ret && (!nmemb || !size))
85 ret = calloc(1, 1);
86 if (!ret) {
87 release_pack_memory(nmemb * size, -1);
88 ret = calloc(nmemb, size);
89 if (!ret && (!nmemb || !size))
90 ret = calloc(1, 1);
91 if (!ret)
92 die("Out of memory, calloc failed");
93 }
94 return ret;
95}
96
97void *xmmap(void *start, size_t length,
98 int prot, int flags, int fd, off_t offset)
99{
100 void *ret = mmap(start, length, prot, flags, fd, offset);
101 if (ret == MAP_FAILED) {
102 if (!length)
103 return NULL;
104 release_pack_memory(length, fd);
105 ret = mmap(start, length, prot, flags, fd, offset);
106 if (ret == MAP_FAILED)
107 die("Out of memory? mmap failed: %s", strerror(errno));
108 }
109 return ret;
110}
111
112/*
113 * xread() is the same a read(), but it automatically restarts read()
114 * operations with a recoverable error (EAGAIN and EINTR). xread()
115 * DOES NOT GUARANTEE that "len" bytes is read even if the data is available.
116 */
117ssize_t xread(int fd, void *buf, size_t len)
118{
119 ssize_t nr;
120 while (1) {
121 nr = read(fd, buf, len);
122 if ((nr < 0) && (errno == EAGAIN || errno == EINTR))
123 continue;
124 return nr;
125 }
126}
127
128/*
129 * xwrite() is the same a write(), but it automatically restarts write()
130 * operations with a recoverable error (EAGAIN and EINTR). xwrite() DOES NOT
131 * GUARANTEE that "len" bytes is written even if the operation is successful.
132 */
133ssize_t xwrite(int fd, const void *buf, size_t len)
134{
135 ssize_t nr;
136 while (1) {
137 nr = write(fd, buf, len);
138 if ((nr < 0) && (errno == EAGAIN || errno == EINTR))
139 continue;
140 return nr;
141 }
142}
143
144ssize_t read_in_full(int fd, void *buf, size_t count)
145{
146 char *p = buf;
147 ssize_t total = 0;
148
149 while (count > 0) {
150 ssize_t loaded = xread(fd, p, count);
151 if (loaded <= 0)
152 return total ? total : loaded;
153 count -= loaded;
154 p += loaded;
155 total += loaded;
156 }
157
158 return total;
159}
160
161ssize_t write_in_full(int fd, const void *buf, size_t count)
162{
163 const char *p = buf;
164 ssize_t total = 0;
165
166 while (count > 0) {
167 ssize_t written = xwrite(fd, p, count);
168 if (written < 0)
169 return -1;
170 if (!written) {
171 errno = ENOSPC;
172 return -1;
173 }
174 count -= written;
175 p += written;
176 total += written;
177 }
178
179 return total;
180}
181
182int xdup(int fd)
183{
184 int ret = dup(fd);
185 if (ret < 0)
186 die("dup failed: %s", strerror(errno));
187 return ret;
188}
189
190FILE *xfdopen(int fd, const char *mode)
191{
192 FILE *stream = fdopen(fd, mode);
193 if (stream == NULL)
194 die("Out of memory? fdopen failed: %s", strerror(errno));
195 return stream;
196}
197
198int xmkstemp(char *template)
199{
200 int fd;
201
202 fd = mkstemp(template);
203 if (fd < 0)
204 die("Unable to create temporary file: %s", strerror(errno));
205 return fd;
206}
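
A brief usage sketch (editorial note, not part of the patch) of how the wrappers above are meant to compose: xmalloc() dies rather than returning NULL, and read_in_full()/write_in_full() loop over xread()/xwrite() so a caller sees either the full count or an error. The copy_bytes() helper below is hypothetical and assumes the declarations above are in scope.

/* Hypothetical example built only on the wrappers defined above. */
static int copy_bytes(int from, int to, size_t len)
{
	char *buf = xmalloc(len);		/* never returns NULL; dies on failure */
	ssize_t got = read_in_full(from, buf, len);

	if (got < 0 || (size_t)got != len) {	/* read error or early EOF */
		free(buf);
		return -1;
	}
	if (write_in_full(to, buf, len) < 0) {	/* write error or ENOSPC */
		free(buf);
		return -1;
	}
	free(buf);
	return 0;
}
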
diff --git a/MAINTAINERS b/MAINTAINERS
index 2b349ba4add4..8f4a8b601a15 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4375,6 +4375,16 @@ S: Maintained
4375F: include/linux/delayacct.h 4375F: include/linux/delayacct.h
4376F: kernel/delayacct.c 4376F: kernel/delayacct.c
4377 4377
4378PERFORMANCE COUNTER SUBSYSTEM
4379P: Peter Zijlstra
4380M: a.p.zijlstra@chello.nl
4381P: Paul Mackerras
4382M: paulus@samba.org
4383P: Ingo Molnar
4384M: mingo@elte.hu
4385L: linux-kernel@vger.kernel.org
4386S: Supported
4387
4378PERSONALITY HANDLING 4388PERSONALITY HANDLING
4379P: Christoph Hellwig 4389P: Christoph Hellwig
4380M: hch@infradead.org 4390M: hch@infradead.org
diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
index b7e034b0a6dd..20a44d0c9fdd 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -131,5 +131,44 @@ static inline int irqs_disabled_flags(unsigned long flags)
131 */ 131 */
132struct irq_chip; 132struct irq_chip;
133 133
134#ifdef CONFIG_PERF_COUNTERS
135static inline unsigned long test_perf_counter_pending(void)
136{
137 unsigned long x;
138
139 asm volatile("lbz %0,%1(13)"
140 : "=r" (x)
141 : "i" (offsetof(struct paca_struct, perf_counter_pending)));
142 return x;
143}
144
145static inline void set_perf_counter_pending(void)
146{
147 asm volatile("stb %0,%1(13)" : :
148 "r" (1),
149 "i" (offsetof(struct paca_struct, perf_counter_pending)));
150}
151
152static inline void clear_perf_counter_pending(void)
153{
154 asm volatile("stb %0,%1(13)" : :
155 "r" (0),
156 "i" (offsetof(struct paca_struct, perf_counter_pending)));
157}
158
159extern void perf_counter_do_pending(void);
160
161#else
162
163static inline unsigned long test_perf_counter_pending(void)
164{
165 return 0;
166}
167
168static inline void set_perf_counter_pending(void) {}
169static inline void clear_perf_counter_pending(void) {}
170static inline void perf_counter_do_pending(void) {}
171#endif /* CONFIG_PERF_COUNTERS */
172
134#endif /* __KERNEL__ */ 173#endif /* __KERNEL__ */
135#endif /* _ASM_POWERPC_HW_IRQ_H */ 174#endif /* _ASM_POWERPC_HW_IRQ_H */
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 082b3aedf145..6ef055723019 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -99,6 +99,7 @@ struct paca_struct {
99 u8 soft_enabled; /* irq soft-enable flag */ 99 u8 soft_enabled; /* irq soft-enable flag */
100 u8 hard_enabled; /* set if irqs are enabled in MSR */ 100 u8 hard_enabled; /* set if irqs are enabled in MSR */
101 u8 io_sync; /* writel() needs spin_unlock sync */ 101 u8 io_sync; /* writel() needs spin_unlock sync */
102 u8 perf_counter_pending; /* PM interrupt while soft-disabled */
102 103
103 /* Stuff for accurate time accounting */ 104 /* Stuff for accurate time accounting */
104 u64 user_time; /* accumulated usermode TB ticks */ 105 u64 user_time; /* accumulated usermode TB ticks */
diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h
new file mode 100644
index 000000000000..1c60f0ca7920
--- /dev/null
+++ b/arch/powerpc/include/asm/perf_counter.h
@@ -0,0 +1,95 @@
1/*
2 * Performance counter support - PowerPC-specific definitions.
3 *
4 * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/types.h>
12
13#define MAX_HWCOUNTERS 8
14#define MAX_EVENT_ALTERNATIVES 8
15#define MAX_LIMITED_HWCOUNTERS 2
16
17/*
18 * This struct provides the constants and functions needed to
19 * describe the PMU on a particular POWER-family CPU.
20 */
21struct power_pmu {
22 int n_counter;
23 int max_alternatives;
24 u64 add_fields;
25 u64 test_adder;
26 int (*compute_mmcr)(u64 events[], int n_ev,
27 unsigned int hwc[], u64 mmcr[]);
28 int (*get_constraint)(u64 event, u64 *mskp, u64 *valp);
29 int (*get_alternatives)(u64 event, unsigned int flags,
30 u64 alt[]);
31 void (*disable_pmc)(unsigned int pmc, u64 mmcr[]);
32 int (*limited_pmc_event)(u64 event);
33 u32 flags;
34 int n_generic;
35 int *generic_events;
36};
37
38extern struct power_pmu *ppmu;
39
40/*
41 * Values for power_pmu.flags
42 */
43#define PPMU_LIMITED_PMC5_6 1 /* PMC5/6 have limited function */
44#define PPMU_ALT_SIPR 2 /* uses alternate posn for SIPR/HV */
45
46/*
47 * Values for flags to get_alternatives()
48 */
49#define PPMU_LIMITED_PMC_OK 1 /* can put this on a limited PMC */
50#define PPMU_LIMITED_PMC_REQD 2 /* have to put this on a limited PMC */
51#define PPMU_ONLY_COUNT_RUN 4 /* only counting in run state */
52
53struct pt_regs;
54extern unsigned long perf_misc_flags(struct pt_regs *regs);
55#define perf_misc_flags(regs) perf_misc_flags(regs)
56
57extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
58
59/*
60 * The power_pmu.get_constraint function returns a 64-bit value and
61 * a 64-bit mask that express the constraints between this event and
62 * other events.
63 *
64 * The value and mask are divided up into (non-overlapping) bitfields
65 * of three different types:
66 *
67 * Select field: this expresses the constraint that some set of bits
68 * in MMCR* needs to be set to a specific value for this event. For a
69 * select field, the mask contains 1s in every bit of the field, and
70 * the value contains a unique value for each possible setting of the
71 * MMCR* bits. The constraint checking code will ensure that two events
72 * that set the same field in their masks have the same value in their
73 * value dwords.
74 *
75 * Add field: this expresses the constraint that there can be at most
76 * N events in a particular class. A field of k bits can be used for
77 * N <= 2^(k-1) - 1. The mask has the most significant bit of the field
78 * set (and the other bits 0), and the value has only the least significant
79 * bit of the field set. In addition, the 'add_fields' and 'test_adder'
80 * in the struct power_pmu for this processor come into play. The
81 * add_fields value contains 1 in the LSB of the field, and the
82 * test_adder contains 2^(k-1) - 1 - N in the field.
83 *
84 * NAND field: this expresses the constraint that you may not have events
85 * in all of a set of classes. (For example, on PPC970, you can't select
86 * events from the FPU, ISU and IDU simultaneously, although any two are
87 * possible.) For N classes, the field is N+1 bits wide, and each class
88 * is assigned one bit from the least-significant N bits. The mask has
89 * only the most-significant bit set, and the value has only the bit
90 * for the event's class set. The test_adder has the least significant
91 * bit set in the field.
92 *
93 * If an event is not subject to the constraint expressed by a particular
94 * field, then it will have 0 in both the mask and value for that field.
95 */
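
As a worked illustration of the add-field rules above (editorial note, not part of the patch): take a hypothetical 3-bit add field at bits 0-2 that allows at most N = 2 events of some class. Each such event reports the field's MSB in its mask and its LSB in its value; the PMU description carries 1 in add_fields and 2^(k-1) - 1 - N = 4 - 1 - 2 = 1 in test_adder for that field, so a third event of the class carries into bit 2 and the checker in power_check_constraints() rejects the set. All names below are illustrative only.

/* Hypothetical get_constraint handling one 3-bit add field (bits 0-2)
 * that permits at most two events of an imaginary "FOO" class. */
static int example_get_constraint(u64 event, u64 *maskp, u64 *valp)
{
	if (!example_is_foo_event(event)) {	/* assumed classifier */
		*maskp = 0;			/* not subject to this field */
		*valp = 0;
		return 0;
	}
	*maskp = 0x4;	/* MSB of the field */
	*valp  = 0x1;	/* LSB of the field: "one more FOO event" */
	return 0;
}

/*
 * The matching struct power_pmu would then carry, for this field:
 *	.add_fields = 0x1,	(1 in the LSB of the field)
 *	.test_adder = 0x1,	(2^(k-1) - 1 - N = 4 - 1 - 2)
 * With two FOO events the accumulated count is 2, and 2 + 1 = 3 leaves
 * bit 2 clear; a third event gives 3 + 1 = 4, which sets bit 2 and is
 * caught by the mask test in power_check_constraints().
 */
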
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index e8018d540e87..fb359b0a6937 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -492,11 +492,13 @@
492#define MMCR0_FCHV 0x00000001UL /* freeze conditions in hypervisor mode */ 492#define MMCR0_FCHV 0x00000001UL /* freeze conditions in hypervisor mode */
493#define SPRN_MMCR1 798 493#define SPRN_MMCR1 798
494#define SPRN_MMCRA 0x312 494#define SPRN_MMCRA 0x312
495#define MMCRA_SDSYNC 0x80000000UL /* SDAR synced with SIAR */
495#define MMCRA_SIHV 0x10000000UL /* state of MSR HV when SIAR set */ 496#define MMCRA_SIHV 0x10000000UL /* state of MSR HV when SIAR set */
496#define MMCRA_SIPR 0x08000000UL /* state of MSR PR when SIAR set */ 497#define MMCRA_SIPR 0x08000000UL /* state of MSR PR when SIAR set */
497#define MMCRA_SLOT 0x07000000UL /* SLOT bits (37-39) */ 498#define MMCRA_SLOT 0x07000000UL /* SLOT bits (37-39) */
498#define MMCRA_SLOT_SHIFT 24 499#define MMCRA_SLOT_SHIFT 24
499#define MMCRA_SAMPLE_ENABLE 0x00000001UL /* enable sampling */ 500#define MMCRA_SAMPLE_ENABLE 0x00000001UL /* enable sampling */
501#define POWER6_MMCRA_SDSYNC 0x0000080000000000ULL /* SDAR/SIAR synced */
500#define POWER6_MMCRA_SIHV 0x0000040000000000ULL 502#define POWER6_MMCRA_SIHV 0x0000040000000000ULL
501#define POWER6_MMCRA_SIPR 0x0000020000000000ULL 503#define POWER6_MMCRA_SIPR 0x0000020000000000ULL
502#define POWER6_MMCRA_THRM 0x00000020UL 504#define POWER6_MMCRA_THRM 0x00000020UL
diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h
index d98a30dfd41c..a0b92de51c7e 100644
--- a/arch/powerpc/include/asm/systbl.h
+++ b/arch/powerpc/include/asm/systbl.h
@@ -322,6 +322,6 @@ SYSCALL_SPU(epoll_create1)
322SYSCALL_SPU(dup3) 322SYSCALL_SPU(dup3)
323SYSCALL_SPU(pipe2) 323SYSCALL_SPU(pipe2)
324SYSCALL(inotify_init1) 324SYSCALL(inotify_init1)
325SYSCALL(ni_syscall) 325SYSCALL_SPU(perf_counter_open)
326COMPAT_SYS_SPU(preadv) 326COMPAT_SYS_SPU(preadv)
327COMPAT_SYS_SPU(pwritev) 327COMPAT_SYS_SPU(pwritev)
diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h
index 3f06f8ec81c5..4badac2d11d1 100644
--- a/arch/powerpc/include/asm/unistd.h
+++ b/arch/powerpc/include/asm/unistd.h
@@ -341,6 +341,7 @@
341#define __NR_dup3 316 341#define __NR_dup3 316
342#define __NR_pipe2 317 342#define __NR_pipe2 317
343#define __NR_inotify_init1 318 343#define __NR_inotify_init1 318
344#define __NR_perf_counter_open 319
344#define __NR_preadv 320 345#define __NR_preadv 320
345#define __NR_pwritev 321 346#define __NR_pwritev 321
346 347
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 71901fbda4a5..9ba1bb731fcc 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -94,6 +94,8 @@ obj64-$(CONFIG_AUDIT) += compat_audit.o
94 94
95obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o 95obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
96obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o 96obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
97obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o power4-pmu.o ppc970-pmu.o \
98 power5-pmu.o power5+-pmu.o power6-pmu.o
97 99
98obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o 100obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o
99 101
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 1e40bc053946..e981d1ce1914 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -131,6 +131,7 @@ int main(void)
131 DEFINE(PACAKMSR, offsetof(struct paca_struct, kernel_msr)); 131 DEFINE(PACAKMSR, offsetof(struct paca_struct, kernel_msr));
132 DEFINE(PACASOFTIRQEN, offsetof(struct paca_struct, soft_enabled)); 132 DEFINE(PACASOFTIRQEN, offsetof(struct paca_struct, soft_enabled));
133 DEFINE(PACAHARDIRQEN, offsetof(struct paca_struct, hard_enabled)); 133 DEFINE(PACAHARDIRQEN, offsetof(struct paca_struct, hard_enabled));
134 DEFINE(PACAPERFPEND, offsetof(struct paca_struct, perf_counter_pending));
134 DEFINE(PACASLBCACHE, offsetof(struct paca_struct, slb_cache)); 135 DEFINE(PACASLBCACHE, offsetof(struct paca_struct, slb_cache));
135 DEFINE(PACASLBCACHEPTR, offsetof(struct paca_struct, slb_cache_ptr)); 136 DEFINE(PACASLBCACHEPTR, offsetof(struct paca_struct, slb_cache_ptr));
136 DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id)); 137 DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id));
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index abfc32330479..43e073477c34 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -526,6 +526,15 @@ ALT_FW_FTR_SECTION_END_IFCLR(FW_FEATURE_ISERIES)
5262: 5262:
527 TRACE_AND_RESTORE_IRQ(r5); 527 TRACE_AND_RESTORE_IRQ(r5);
528 528
529#ifdef CONFIG_PERF_COUNTERS
530 /* check paca->perf_counter_pending if we're enabling ints */
531 lbz r3,PACAPERFPEND(r13)
532 and. r3,r3,r5
533 beq 27f
534 bl .perf_counter_do_pending
53527:
536#endif /* CONFIG_PERF_COUNTERS */
537
529 /* extract EE bit and use it to restore paca->hard_enabled */ 538 /* extract EE bit and use it to restore paca->hard_enabled */
530 ld r3,_MSR(r1) 539 ld r3,_MSR(r1)
531 rldicl r4,r3,49,63 /* r0 = (r3 >> 15) & 1 */ 540 rldicl r4,r3,49,63 /* r0 = (r3 >> 15) & 1 */
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 8c1a4966867e..feff792ed0f9 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -135,6 +135,11 @@ notrace void raw_local_irq_restore(unsigned long en)
135 iseries_handle_interrupts(); 135 iseries_handle_interrupts();
136 } 136 }
137 137
138 if (test_perf_counter_pending()) {
139 clear_perf_counter_pending();
140 perf_counter_do_pending();
141 }
142
138 /* 143 /*
139 * if (get_paca()->hard_enabled) return; 144 * if (get_paca()->hard_enabled) return;
140 * But again we need to take care that gcc gets hard_enabled directly 145 * But again we need to take care that gcc gets hard_enabled directly
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
new file mode 100644
index 000000000000..6baae5a5c331
--- /dev/null
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -0,0 +1,1165 @@
1/*
2 * Performance counter support - powerpc architecture code
3 *
4 * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/kernel.h>
12#include <linux/sched.h>
13#include <linux/perf_counter.h>
14#include <linux/percpu.h>
15#include <linux/hardirq.h>
16#include <asm/reg.h>
17#include <asm/pmc.h>
18#include <asm/machdep.h>
19#include <asm/firmware.h>
20#include <asm/ptrace.h>
21
22struct cpu_hw_counters {
23 int n_counters;
24 int n_percpu;
25 int disabled;
26 int n_added;
27 int n_limited;
28 u8 pmcs_enabled;
29 struct perf_counter *counter[MAX_HWCOUNTERS];
30 u64 events[MAX_HWCOUNTERS];
31 unsigned int flags[MAX_HWCOUNTERS];
32 u64 mmcr[3];
33 struct perf_counter *limited_counter[MAX_LIMITED_HWCOUNTERS];
34 u8 limited_hwidx[MAX_LIMITED_HWCOUNTERS];
35};
36DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
37
38struct power_pmu *ppmu;
39
40/*
41 * Normally, to ignore kernel events we set the FCS (freeze counters
42 * in supervisor mode) bit in MMCR0, but if the kernel runs with the
43 * hypervisor bit set in the MSR, or if we are running on a processor
44 * where the hypervisor bit is forced to 1 (as on Apple G5 processors),
45 * then we need to use the FCHV bit to ignore kernel events.
46 */
47static unsigned int freeze_counters_kernel = MMCR0_FCS;
48
49static void perf_counter_interrupt(struct pt_regs *regs);
50
51void perf_counter_print_debug(void)
52{
53}
54
55/*
56 * Read one performance monitor counter (PMC).
57 */
58static unsigned long read_pmc(int idx)
59{
60 unsigned long val;
61
62 switch (idx) {
63 case 1:
64 val = mfspr(SPRN_PMC1);
65 break;
66 case 2:
67 val = mfspr(SPRN_PMC2);
68 break;
69 case 3:
70 val = mfspr(SPRN_PMC3);
71 break;
72 case 4:
73 val = mfspr(SPRN_PMC4);
74 break;
75 case 5:
76 val = mfspr(SPRN_PMC5);
77 break;
78 case 6:
79 val = mfspr(SPRN_PMC6);
80 break;
81 case 7:
82 val = mfspr(SPRN_PMC7);
83 break;
84 case 8:
85 val = mfspr(SPRN_PMC8);
86 break;
87 default:
88 printk(KERN_ERR "oops trying to read PMC%d\n", idx);
89 val = 0;
90 }
91 return val;
92}
93
94/*
95 * Write one PMC.
96 */
97static void write_pmc(int idx, unsigned long val)
98{
99 switch (idx) {
100 case 1:
101 mtspr(SPRN_PMC1, val);
102 break;
103 case 2:
104 mtspr(SPRN_PMC2, val);
105 break;
106 case 3:
107 mtspr(SPRN_PMC3, val);
108 break;
109 case 4:
110 mtspr(SPRN_PMC4, val);
111 break;
112 case 5:
113 mtspr(SPRN_PMC5, val);
114 break;
115 case 6:
116 mtspr(SPRN_PMC6, val);
117 break;
118 case 7:
119 mtspr(SPRN_PMC7, val);
120 break;
121 case 8:
122 mtspr(SPRN_PMC8, val);
123 break;
124 default:
125 printk(KERN_ERR "oops trying to write PMC%d\n", idx);
126 }
127}
128
129/*
130 * Check if a set of events can all go on the PMU at once.
131 * If they can't, this will look at alternative codes for the events
132 * and see if any combination of alternative codes is feasible.
133 * The feasible set is returned in event[].
134 */
135static int power_check_constraints(u64 event[], unsigned int cflags[],
136 int n_ev)
137{
138 u64 mask, value, nv;
139 u64 alternatives[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
140 u64 amasks[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
141 u64 avalues[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
142 u64 smasks[MAX_HWCOUNTERS], svalues[MAX_HWCOUNTERS];
143 int n_alt[MAX_HWCOUNTERS], choice[MAX_HWCOUNTERS];
144 int i, j;
145 u64 addf = ppmu->add_fields;
146 u64 tadd = ppmu->test_adder;
147
148 if (n_ev > ppmu->n_counter)
149 return -1;
150
151 /* First see if the events will go on as-is */
152 for (i = 0; i < n_ev; ++i) {
153 if ((cflags[i] & PPMU_LIMITED_PMC_REQD)
154 && !ppmu->limited_pmc_event(event[i])) {
155 ppmu->get_alternatives(event[i], cflags[i],
156 alternatives[i]);
157 event[i] = alternatives[i][0];
158 }
159 if (ppmu->get_constraint(event[i], &amasks[i][0],
160 &avalues[i][0]))
161 return -1;
162 }
163 value = mask = 0;
164 for (i = 0; i < n_ev; ++i) {
165 nv = (value | avalues[i][0]) + (value & avalues[i][0] & addf);
166 if ((((nv + tadd) ^ value) & mask) != 0 ||
167 (((nv + tadd) ^ avalues[i][0]) & amasks[i][0]) != 0)
168 break;
169 value = nv;
170 mask |= amasks[i][0];
171 }
172 if (i == n_ev)
173 return 0; /* all OK */
174
175 /* doesn't work, gather alternatives... */
176 if (!ppmu->get_alternatives)
177 return -1;
178 for (i = 0; i < n_ev; ++i) {
179 choice[i] = 0;
180 n_alt[i] = ppmu->get_alternatives(event[i], cflags[i],
181 alternatives[i]);
182 for (j = 1; j < n_alt[i]; ++j)
183 ppmu->get_constraint(alternatives[i][j],
184 &amasks[i][j], &avalues[i][j]);
185 }
186
187 /* enumerate all possibilities and see if any will work */
188 i = 0;
189 j = -1;
190 value = mask = nv = 0;
191 while (i < n_ev) {
192 if (j >= 0) {
193 /* we're backtracking, restore context */
194 value = svalues[i];
195 mask = smasks[i];
196 j = choice[i];
197 }
198 /*
199 * See if any alternative k for event i,
200 * where k > j, will satisfy the constraints.
201 */
202 while (++j < n_alt[i]) {
203 nv = (value | avalues[i][j]) +
204 (value & avalues[i][j] & addf);
205 if ((((nv + tadd) ^ value) & mask) == 0 &&
206 (((nv + tadd) ^ avalues[i][j])
207 & amasks[i][j]) == 0)
208 break;
209 }
210 if (j >= n_alt[i]) {
211 /*
212 * No feasible alternative, backtrack
213 * to event i-1 and continue enumerating its
214 * alternatives from where we got up to.
215 */
216 if (--i < 0)
217 return -1;
218 } else {
219 /*
220 * Found a feasible alternative for event i,
221 * remember where we got up to with this event,
222 * go on to the next event, and start with
223 * the first alternative for it.
224 */
225 choice[i] = j;
226 svalues[i] = value;
227 smasks[i] = mask;
228 value = nv;
229 mask |= amasks[i][j];
230 ++i;
231 j = -1;
232 }
233 }
234
235 /* OK, we have a feasible combination, tell the caller the solution */
236 for (i = 0; i < n_ev; ++i)
237 event[i] = alternatives[i][choice[i]];
238 return 0;
239}
240
241/*
242 * Check if newly-added counters have consistent settings for
243 * exclude_{user,kernel,hv} with each other and any previously
244 * added counters.
245 */
246static int check_excludes(struct perf_counter **ctrs, unsigned int cflags[],
247 int n_prev, int n_new)
248{
249 int eu = 0, ek = 0, eh = 0;
250 int i, n, first;
251 struct perf_counter *counter;
252
253 n = n_prev + n_new;
254 if (n <= 1)
255 return 0;
256
257 first = 1;
258 for (i = 0; i < n; ++i) {
259 if (cflags[i] & PPMU_LIMITED_PMC_OK) {
260 cflags[i] &= ~PPMU_LIMITED_PMC_REQD;
261 continue;
262 }
263 counter = ctrs[i];
264 if (first) {
265 eu = counter->hw_event.exclude_user;
266 ek = counter->hw_event.exclude_kernel;
267 eh = counter->hw_event.exclude_hv;
268 first = 0;
269 } else if (counter->hw_event.exclude_user != eu ||
270 counter->hw_event.exclude_kernel != ek ||
271 counter->hw_event.exclude_hv != eh) {
272 return -EAGAIN;
273 }
274 }
275
276 if (eu || ek || eh)
277 for (i = 0; i < n; ++i)
278 if (cflags[i] & PPMU_LIMITED_PMC_OK)
279 cflags[i] |= PPMU_LIMITED_PMC_REQD;
280
281 return 0;
282}
283
284static void power_pmu_read(struct perf_counter *counter)
285{
286 long val, delta, prev;
287
288 if (!counter->hw.idx)
289 return;
290 /*
291 * Performance monitor interrupts come even when interrupts
292 * are soft-disabled, as long as interrupts are hard-enabled.
293 * Therefore we treat them like NMIs.
294 */
295 do {
296 prev = atomic64_read(&counter->hw.prev_count);
297 barrier();
298 val = read_pmc(counter->hw.idx);
299 } while (atomic64_cmpxchg(&counter->hw.prev_count, prev, val) != prev);
300
301 /* The counters are only 32 bits wide */
302 delta = (val - prev) & 0xfffffffful;
303 atomic64_add(delta, &counter->count);
304 atomic64_sub(delta, &counter->hw.period_left);
305}
306
307/*
308 * On some machines, PMC5 and PMC6 can't be written, don't respect
309 * the freeze conditions, and don't generate interrupts. This tells
310 * us if `counter' is using such a PMC.
311 */
312static int is_limited_pmc(int pmcnum)
313{
314 return (ppmu->flags & PPMU_LIMITED_PMC5_6)
315 && (pmcnum == 5 || pmcnum == 6);
316}
317
318static void freeze_limited_counters(struct cpu_hw_counters *cpuhw,
319 unsigned long pmc5, unsigned long pmc6)
320{
321 struct perf_counter *counter;
322 u64 val, prev, delta;
323 int i;
324
325 for (i = 0; i < cpuhw->n_limited; ++i) {
326 counter = cpuhw->limited_counter[i];
327 if (!counter->hw.idx)
328 continue;
329 val = (counter->hw.idx == 5) ? pmc5 : pmc6;
330 prev = atomic64_read(&counter->hw.prev_count);
331 counter->hw.idx = 0;
332 delta = (val - prev) & 0xfffffffful;
333 atomic64_add(delta, &counter->count);
334 }
335}
336
337static void thaw_limited_counters(struct cpu_hw_counters *cpuhw,
338 unsigned long pmc5, unsigned long pmc6)
339{
340 struct perf_counter *counter;
341 u64 val;
342 int i;
343
344 for (i = 0; i < cpuhw->n_limited; ++i) {
345 counter = cpuhw->limited_counter[i];
346 counter->hw.idx = cpuhw->limited_hwidx[i];
347 val = (counter->hw.idx == 5) ? pmc5 : pmc6;
348 atomic64_set(&counter->hw.prev_count, val);
349 perf_counter_update_userpage(counter);
350 }
351}
352
353/*
354 * Since limited counters don't respect the freeze conditions, we
355 * have to read them immediately after freezing or unfreezing the
356 * other counters. We try to keep the values from the limited
357 * counters as consistent as possible by keeping the delay (in
358 * cycles and instructions) between freezing/unfreezing and reading
359 * the limited counters as small and consistent as possible.
360 * Therefore, if any limited counters are in use, we read them
361 * both, and always in the same order, to minimize variability,
362 * and do it inside the same asm that writes MMCR0.
363 */
364static void write_mmcr0(struct cpu_hw_counters *cpuhw, unsigned long mmcr0)
365{
366 unsigned long pmc5, pmc6;
367
368 if (!cpuhw->n_limited) {
369 mtspr(SPRN_MMCR0, mmcr0);
370 return;
371 }
372
373 /*
374 * Write MMCR0, then read PMC5 and PMC6 immediately.
375 */
376 asm volatile("mtspr %3,%2; mfspr %0,%4; mfspr %1,%5"
377 : "=&r" (pmc5), "=&r" (pmc6)
378 : "r" (mmcr0), "i" (SPRN_MMCR0),
379 "i" (SPRN_PMC5), "i" (SPRN_PMC6));
380
381 if (mmcr0 & MMCR0_FC)
382 freeze_limited_counters(cpuhw, pmc5, pmc6);
383 else
384 thaw_limited_counters(cpuhw, pmc5, pmc6);
385}
386
387/*
388 * Disable all counters to prevent PMU interrupts and to allow
389 * counters to be added or removed.
390 */
391void hw_perf_disable(void)
392{
393 struct cpu_hw_counters *cpuhw;
394 unsigned long ret;
395 unsigned long flags;
396
397 local_irq_save(flags);
398 cpuhw = &__get_cpu_var(cpu_hw_counters);
399
400 ret = cpuhw->disabled;
401 if (!ret) {
402 cpuhw->disabled = 1;
403 cpuhw->n_added = 0;
404
405 /*
406 * Check if we ever enabled the PMU on this cpu.
407 */
408 if (!cpuhw->pmcs_enabled) {
409 if (ppc_md.enable_pmcs)
410 ppc_md.enable_pmcs();
411 cpuhw->pmcs_enabled = 1;
412 }
413
414 /*
415 * Disable instruction sampling if it was enabled
416 */
417 if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
418 mtspr(SPRN_MMCRA,
419 cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
420 mb();
421 }
422
423 /*
424 * Set the 'freeze counters' bit.
425 * The barrier is to make sure the mtspr has been
426 * executed and the PMU has frozen the counters
427 * before we return.
428 */
429 write_mmcr0(cpuhw, mfspr(SPRN_MMCR0) | MMCR0_FC);
430 mb();
431 }
432 local_irq_restore(flags);
433}
434
435/*
436 * Re-enable all counters.
437 * If we were previously disabled and counters were added, then
438 * put the new config on the PMU.
439 */
440void hw_perf_enable(void)
441{
442 struct perf_counter *counter;
443 struct cpu_hw_counters *cpuhw;
444 unsigned long flags;
445 long i;
446 unsigned long val;
447 s64 left;
448 unsigned int hwc_index[MAX_HWCOUNTERS];
449 int n_lim;
450 int idx;
451
452 local_irq_save(flags);
453 cpuhw = &__get_cpu_var(cpu_hw_counters);
454 if (!cpuhw->disabled) {
455 local_irq_restore(flags);
456 return;
457 }
458
459 cpuhw->disabled = 0;
460
461 /*
462 * If we didn't change anything, or only removed counters,
463 * no need to recalculate MMCR* settings and reset the PMCs.
464 * Just reenable the PMU with the current MMCR* settings
465 * (possibly updated for removal of counters).
466 */
467 if (!cpuhw->n_added) {
468 mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
469 mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
470 if (cpuhw->n_counters == 0)
471 get_lppaca()->pmcregs_in_use = 0;
472 goto out_enable;
473 }
474
475 /*
476 * Compute MMCR* values for the new set of counters
477 */
478 if (ppmu->compute_mmcr(cpuhw->events, cpuhw->n_counters, hwc_index,
479 cpuhw->mmcr)) {
480 /* shouldn't ever get here */
481 printk(KERN_ERR "oops compute_mmcr failed\n");
482 goto out;
483 }
484
485 /*
486 * Add in MMCR0 freeze bits corresponding to the
487 * hw_event.exclude_* bits for the first counter.
488 * We have already checked that all counters have the
489 * same values for these bits as the first counter.
490 */
491 counter = cpuhw->counter[0];
492 if (counter->hw_event.exclude_user)
493 cpuhw->mmcr[0] |= MMCR0_FCP;
494 if (counter->hw_event.exclude_kernel)
495 cpuhw->mmcr[0] |= freeze_counters_kernel;
496 if (counter->hw_event.exclude_hv)
497 cpuhw->mmcr[0] |= MMCR0_FCHV;
498
499 /*
500 * Write the new configuration to MMCR* with the freeze
501 * bit set and set the hardware counters to their initial values.
502 * Then unfreeze the counters.
503 */
504 get_lppaca()->pmcregs_in_use = 1;
505 mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
506 mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
507 mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE))
508 | MMCR0_FC);
509
510 /*
511 * Read off any pre-existing counters that need to move
512 * to another PMC.
513 */
514 for (i = 0; i < cpuhw->n_counters; ++i) {
515 counter = cpuhw->counter[i];
516 if (counter->hw.idx && counter->hw.idx != hwc_index[i] + 1) {
517 power_pmu_read(counter);
518 write_pmc(counter->hw.idx, 0);
519 counter->hw.idx = 0;
520 }
521 }
522
523 /*
524 * Initialize the PMCs for all the new and moved counters.
525 */
526 cpuhw->n_limited = n_lim = 0;
527 for (i = 0; i < cpuhw->n_counters; ++i) {
528 counter = cpuhw->counter[i];
529 if (counter->hw.idx)
530 continue;
531 idx = hwc_index[i] + 1;
532 if (is_limited_pmc(idx)) {
533 cpuhw->limited_counter[n_lim] = counter;
534 cpuhw->limited_hwidx[n_lim] = idx;
535 ++n_lim;
536 continue;
537 }
538 val = 0;
539 if (counter->hw.irq_period) {
540 left = atomic64_read(&counter->hw.period_left);
541 if (left < 0x80000000L)
542 val = 0x80000000L - left;
543 }
544 atomic64_set(&counter->hw.prev_count, val);
545 counter->hw.idx = idx;
546 write_pmc(idx, val);
547 perf_counter_update_userpage(counter);
548 }
549 cpuhw->n_limited = n_lim;
550 cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;
551
552 out_enable:
553 mb();
554 write_mmcr0(cpuhw, cpuhw->mmcr[0]);
555
556 /*
557 * Enable instruction sampling if necessary
558 */
559 if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
560 mb();
561 mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
562 }
563
564 out:
565 local_irq_restore(flags);
566}
567
568static int collect_events(struct perf_counter *group, int max_count,
569 struct perf_counter *ctrs[], u64 *events,
570 unsigned int *flags)
571{
572 int n = 0;
573 struct perf_counter *counter;
574
575 if (!is_software_counter(group)) {
576 if (n >= max_count)
577 return -1;
578 ctrs[n] = group;
579 flags[n] = group->hw.counter_base;
580 events[n++] = group->hw.config;
581 }
582 list_for_each_entry(counter, &group->sibling_list, list_entry) {
583 if (!is_software_counter(counter) &&
584 counter->state != PERF_COUNTER_STATE_OFF) {
585 if (n >= max_count)
586 return -1;
587 ctrs[n] = counter;
588 flags[n] = counter->hw.counter_base;
589 events[n++] = counter->hw.config;
590 }
591 }
592 return n;
593}
594
595static void counter_sched_in(struct perf_counter *counter, int cpu)
596{
597 counter->state = PERF_COUNTER_STATE_ACTIVE;
598 counter->oncpu = cpu;
599 counter->tstamp_running += counter->ctx->time - counter->tstamp_stopped;
600 if (is_software_counter(counter))
601 counter->pmu->enable(counter);
602}
603
604/*
605 * Called to enable a whole group of counters.
606 * Returns 1 if the group was enabled, or -EAGAIN if it could not be.
607 * Assumes the caller has disabled interrupts and has
608 * frozen the PMU with hw_perf_save_disable.
609 */
610int hw_perf_group_sched_in(struct perf_counter *group_leader,
611 struct perf_cpu_context *cpuctx,
612 struct perf_counter_context *ctx, int cpu)
613{
614 struct cpu_hw_counters *cpuhw;
615 long i, n, n0;
616 struct perf_counter *sub;
617
618 cpuhw = &__get_cpu_var(cpu_hw_counters);
619 n0 = cpuhw->n_counters;
620 n = collect_events(group_leader, ppmu->n_counter - n0,
621 &cpuhw->counter[n0], &cpuhw->events[n0],
622 &cpuhw->flags[n0]);
623 if (n < 0)
624 return -EAGAIN;
625 if (check_excludes(cpuhw->counter, cpuhw->flags, n0, n))
626 return -EAGAIN;
627 i = power_check_constraints(cpuhw->events, cpuhw->flags, n + n0);
628 if (i < 0)
629 return -EAGAIN;
630 cpuhw->n_counters = n0 + n;
631 cpuhw->n_added += n;
632
633 /*
634 * OK, this group can go on; update counter states etc.,
635 * and enable any software counters
636 */
637 for (i = n0; i < n0 + n; ++i)
638 cpuhw->counter[i]->hw.config = cpuhw->events[i];
639 cpuctx->active_oncpu += n;
640 n = 1;
641 counter_sched_in(group_leader, cpu);
642 list_for_each_entry(sub, &group_leader->sibling_list, list_entry) {
643 if (sub->state != PERF_COUNTER_STATE_OFF) {
644 counter_sched_in(sub, cpu);
645 ++n;
646 }
647 }
648 ctx->nr_active += n;
649
650 return 1;
651}
652
653/*
654 * Add a counter to the PMU.
655 * If all counters are not already frozen, then we disable and
656 * re-enable the PMU in order to get hw_perf_enable to do the
657 * actual work of reconfiguring the PMU.
658 */
659static int power_pmu_enable(struct perf_counter *counter)
660{
661 struct cpu_hw_counters *cpuhw;
662 unsigned long flags;
663 int n0;
664 int ret = -EAGAIN;
665
666 local_irq_save(flags);
667 perf_disable();
668
669 /*
670 * Add the counter to the list (if there is room)
671 * and check whether the total set is still feasible.
672 */
673 cpuhw = &__get_cpu_var(cpu_hw_counters);
674 n0 = cpuhw->n_counters;
675 if (n0 >= ppmu->n_counter)
676 goto out;
677 cpuhw->counter[n0] = counter;
678 cpuhw->events[n0] = counter->hw.config;
679 cpuhw->flags[n0] = counter->hw.counter_base;
680 if (check_excludes(cpuhw->counter, cpuhw->flags, n0, 1))
681 goto out;
682 if (power_check_constraints(cpuhw->events, cpuhw->flags, n0 + 1))
683 goto out;
684
685 counter->hw.config = cpuhw->events[n0];
686 ++cpuhw->n_counters;
687 ++cpuhw->n_added;
688
689 ret = 0;
690 out:
691 perf_enable();
692 local_irq_restore(flags);
693 return ret;
694}
695
696/*
697 * Remove a counter from the PMU.
698 */
699static void power_pmu_disable(struct perf_counter *counter)
700{
701 struct cpu_hw_counters *cpuhw;
702 long i;
703 unsigned long flags;
704
705 local_irq_save(flags);
706 perf_disable();
707
708 power_pmu_read(counter);
709
710 cpuhw = &__get_cpu_var(cpu_hw_counters);
711 for (i = 0; i < cpuhw->n_counters; ++i) {
712 if (counter == cpuhw->counter[i]) {
713 while (++i < cpuhw->n_counters)
714 cpuhw->counter[i-1] = cpuhw->counter[i];
715 --cpuhw->n_counters;
716 ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr);
717 if (counter->hw.idx) {
718 write_pmc(counter->hw.idx, 0);
719 counter->hw.idx = 0;
720 }
721 perf_counter_update_userpage(counter);
722 break;
723 }
724 }
725 for (i = 0; i < cpuhw->n_limited; ++i)
726 if (counter == cpuhw->limited_counter[i])
727 break;
728 if (i < cpuhw->n_limited) {
729 while (++i < cpuhw->n_limited) {
730 cpuhw->limited_counter[i-1] = cpuhw->limited_counter[i];
731 cpuhw->limited_hwidx[i-1] = cpuhw->limited_hwidx[i];
732 }
733 --cpuhw->n_limited;
734 }
735 if (cpuhw->n_counters == 0) {
736 /* disable exceptions if no counters are running */
737 cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE);
738 }
739
740 perf_enable();
741 local_irq_restore(flags);
742}
743
744struct pmu power_pmu = {
745 .enable = power_pmu_enable,
746 .disable = power_pmu_disable,
747 .read = power_pmu_read,
748};
749
750/*
751 * Return 1 if we might be able to put counter on a limited PMC,
752 * or 0 if not.
753 * A counter can only go on a limited PMC if it counts something
754 * that a limited PMC can count, doesn't require interrupts, and
755 * doesn't exclude any processor mode.
756 */
757static int can_go_on_limited_pmc(struct perf_counter *counter, u64 ev,
758 unsigned int flags)
759{
760 int n;
761 u64 alt[MAX_EVENT_ALTERNATIVES];
762
763 if (counter->hw_event.exclude_user
764 || counter->hw_event.exclude_kernel
765 || counter->hw_event.exclude_hv
766 || counter->hw_event.irq_period)
767 return 0;
768
769 if (ppmu->limited_pmc_event(ev))
770 return 1;
771
772 /*
773 * The requested event isn't on a limited PMC already;
774 * see if any alternative code goes on a limited PMC.
775 */
776 if (!ppmu->get_alternatives)
777 return 0;
778
779 flags |= PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD;
780 n = ppmu->get_alternatives(ev, flags, alt);
781
782 return n > 0;
783}
784
785/*
786 * Find an alternative event that goes on a normal PMC, if possible,
787 * and return the event code, or 0 if there is no such alternative.
788 * (Note: event code 0 is "don't count" on all machines.)
789 */
790static u64 normal_pmc_alternative(u64 ev, unsigned long flags)
791{
792 u64 alt[MAX_EVENT_ALTERNATIVES];
793 int n;
794
795 flags &= ~(PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD);
796 n = ppmu->get_alternatives(ev, flags, alt);
797 if (!n)
798 return 0;
799 return alt[0];
800}
801
802/* Number of perf_counters counting hardware events */
803static atomic_t num_counters;
804/* Used to avoid races in calling reserve/release_pmc_hardware */
805static DEFINE_MUTEX(pmc_reserve_mutex);
806
807/*
808 * Release the PMU if this is the last perf_counter.
809 */
810static void hw_perf_counter_destroy(struct perf_counter *counter)
811{
812 if (!atomic_add_unless(&num_counters, -1, 1)) {
813 mutex_lock(&pmc_reserve_mutex);
814 if (atomic_dec_return(&num_counters) == 0)
815 release_pmc_hardware();
816 mutex_unlock(&pmc_reserve_mutex);
817 }
818}
819
820const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
821{
822 u64 ev;
823 unsigned long flags;
824 struct perf_counter *ctrs[MAX_HWCOUNTERS];
825 u64 events[MAX_HWCOUNTERS];
826 unsigned int cflags[MAX_HWCOUNTERS];
827 int n;
828 int err;
829
830 if (!ppmu)
831 return ERR_PTR(-ENXIO);
832 if (!perf_event_raw(&counter->hw_event)) {
833 ev = perf_event_id(&counter->hw_event);
834 if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
835 return ERR_PTR(-EOPNOTSUPP);
836 ev = ppmu->generic_events[ev];
837 } else {
838 ev = perf_event_config(&counter->hw_event);
839 }
840 counter->hw.config_base = ev;
841 counter->hw.idx = 0;
842
843 /*
844 * If we are not running on a hypervisor, force the
845 * exclude_hv bit to 0 so that we don't care what
846 * the user set it to.
847 */
848 if (!firmware_has_feature(FW_FEATURE_LPAR))
849 counter->hw_event.exclude_hv = 0;
850
851 /*
852 * If this is a per-task counter, then we can use
853 * PM_RUN_* events interchangeably with their non RUN_*
854 * equivalents, e.g. PM_RUN_CYC instead of PM_CYC.
855 * XXX we should check if the task is an idle task.
856 */
857 flags = 0;
858 if (counter->ctx->task)
859 flags |= PPMU_ONLY_COUNT_RUN;
860
861 /*
862 * If this machine has limited counters, check whether this
863 * event could go on a limited counter.
864 */
865 if (ppmu->flags & PPMU_LIMITED_PMC5_6) {
866 if (can_go_on_limited_pmc(counter, ev, flags)) {
867 flags |= PPMU_LIMITED_PMC_OK;
868 } else if (ppmu->limited_pmc_event(ev)) {
869 /*
870 * The requested event is on a limited PMC,
871 * but we can't use a limited PMC; see if any
872 * alternative goes on a normal PMC.
873 */
874 ev = normal_pmc_alternative(ev, flags);
875 if (!ev)
876 return ERR_PTR(-EINVAL);
877 }
878 }
879
880 /*
881 * If this is in a group, check if it can go on with all the
882 * other hardware counters in the group. We assume the counter
883 * hasn't been linked into its leader's sibling list at this point.
884 */
885 n = 0;
886 if (counter->group_leader != counter) {
887 n = collect_events(counter->group_leader, ppmu->n_counter - 1,
888 ctrs, events, cflags);
889 if (n < 0)
890 return ERR_PTR(-EINVAL);
891 }
892 events[n] = ev;
893 ctrs[n] = counter;
894 cflags[n] = flags;
895 if (check_excludes(ctrs, cflags, n, 1))
896 return ERR_PTR(-EINVAL);
897 if (power_check_constraints(events, cflags, n + 1))
898 return ERR_PTR(-EINVAL);
899
900 counter->hw.config = events[n];
901 counter->hw.counter_base = cflags[n];
902 atomic64_set(&counter->hw.period_left, counter->hw.irq_period);
903
904 /*
905 * See if we need to reserve the PMU.
906 * If no counters are currently in use, then we have to take a
907 * mutex to ensure that we don't race with another task doing
908 * reserve_pmc_hardware or release_pmc_hardware.
909 */
910 err = 0;
911 if (!atomic_inc_not_zero(&num_counters)) {
912 mutex_lock(&pmc_reserve_mutex);
913 if (atomic_read(&num_counters) == 0 &&
914 reserve_pmc_hardware(perf_counter_interrupt))
915 err = -EBUSY;
916 else
917 atomic_inc(&num_counters);
918 mutex_unlock(&pmc_reserve_mutex);
919 }
920 counter->destroy = hw_perf_counter_destroy;
921
922 if (err)
923 return ERR_PTR(err);
924 return &power_pmu;
925}
926
927/*
928 * A counter has overflowed; update its count and record
929 * things if requested. Note that interrupts are hard-disabled
930 * here so there is no possibility of being interrupted.
931 */
932static void record_and_restart(struct perf_counter *counter, long val,
933 struct pt_regs *regs, int nmi)
934{
935 u64 period = counter->hw.irq_period;
936 s64 prev, delta, left;
937 int record = 0;
938 u64 addr, mmcra, sdsync;
939
940 /* we don't have to worry about interrupts here */
941 prev = atomic64_read(&counter->hw.prev_count);
942 delta = (val - prev) & 0xfffffffful;
943 atomic64_add(delta, &counter->count);
944
945 /*
946 * See if the total period for this counter has expired,
947 * and update for the next period.
948 */
949 val = 0;
950 left = atomic64_read(&counter->hw.period_left) - delta;
951 if (period) {
952 if (left <= 0) {
953 left += period;
954 if (left <= 0)
955 left = period;
956 record = 1;
957 }
958 if (left < 0x80000000L)
959 val = 0x80000000L - left;
960 }
961 write_pmc(counter->hw.idx, val);
962 atomic64_set(&counter->hw.prev_count, val);
963 atomic64_set(&counter->hw.period_left, left);
964 perf_counter_update_userpage(counter);
965
966 /*
967 * Finally record data if requested.
968 */
969 if (record) {
970 addr = 0;
971 if (counter->hw_event.record_type & PERF_RECORD_ADDR) {
972 /*
973 * The user wants a data address recorded.
974 * If we're not doing instruction sampling,
975 * give them the SDAR (sampled data address).
976 * If we are doing instruction sampling, then only
977 * give them the SDAR if it corresponds to the
978 * instruction pointed to by SIAR; this is indicated
979 * by the [POWER6_]MMCRA_SDSYNC bit in MMCRA.
980 */
981 mmcra = regs->dsisr;
982 sdsync = (ppmu->flags & PPMU_ALT_SIPR) ?
983 POWER6_MMCRA_SDSYNC : MMCRA_SDSYNC;
984 if (!(mmcra & MMCRA_SAMPLE_ENABLE) || (mmcra & sdsync))
985 addr = mfspr(SPRN_SDAR);
986 }
987 perf_counter_overflow(counter, nmi, regs, addr);
988 }
989}
990
991/*
992 * Called from generic code to get the misc flags (i.e. processor mode)
993 * for an event.
994 */
995unsigned long perf_misc_flags(struct pt_regs *regs)
996{
997 unsigned long mmcra;
998
999 if (TRAP(regs) != 0xf00) {
1000 /* not a PMU interrupt */
1001 return user_mode(regs) ? PERF_EVENT_MISC_USER :
1002 PERF_EVENT_MISC_KERNEL;
1003 }
1004
1005 mmcra = regs->dsisr;
1006 if (ppmu->flags & PPMU_ALT_SIPR) {
1007 if (mmcra & POWER6_MMCRA_SIHV)
1008 return PERF_EVENT_MISC_HYPERVISOR;
1009 return (mmcra & POWER6_MMCRA_SIPR) ? PERF_EVENT_MISC_USER :
1010 PERF_EVENT_MISC_KERNEL;
1011 }
1012 if (mmcra & MMCRA_SIHV)
1013 return PERF_EVENT_MISC_HYPERVISOR;
1014 return (mmcra & MMCRA_SIPR) ? PERF_EVENT_MISC_USER :
1015 PERF_EVENT_MISC_KERNEL;
1016}
1017
1018/*
1019 * Called from generic code to get the instruction pointer
1020 * for an event.
1021 */
1022unsigned long perf_instruction_pointer(struct pt_regs *regs)
1023{
1024 unsigned long mmcra;
1025 unsigned long ip;
1026 unsigned long slot;
1027
1028 if (TRAP(regs) != 0xf00)
1029 return regs->nip; /* not a PMU interrupt */
1030
1031 ip = mfspr(SPRN_SIAR);
1032 mmcra = regs->dsisr;
1033 if ((mmcra & MMCRA_SAMPLE_ENABLE) && !(ppmu->flags & PPMU_ALT_SIPR)) {
1034 slot = (mmcra & MMCRA_SLOT) >> MMCRA_SLOT_SHIFT;
1035 if (slot > 1)
1036 ip += 4 * (slot - 1);
1037 }
1038 return ip;
1039}
1040
1041/*
1042 * Performance monitor interrupt stuff
1043 */
1044static void perf_counter_interrupt(struct pt_regs *regs)
1045{
1046 int i;
1047 struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
1048 struct perf_counter *counter;
1049 long val;
1050 int found = 0;
1051 int nmi;
1052
1053 if (cpuhw->n_limited)
1054 freeze_limited_counters(cpuhw, mfspr(SPRN_PMC5),
1055 mfspr(SPRN_PMC6));
1056
1057 /*
1058 * Overload regs->dsisr to store MMCRA so we only need to read it once.
1059 */
1060 regs->dsisr = mfspr(SPRN_MMCRA);
1061
1062 /*
1063 * If interrupts were soft-disabled when this PMU interrupt
1064 * occurred, treat it as an NMI.
1065 */
1066 nmi = !regs->softe;
1067 if (nmi)
1068 nmi_enter();
1069 else
1070 irq_enter();
1071
1072 for (i = 0; i < cpuhw->n_counters; ++i) {
1073 counter = cpuhw->counter[i];
1074 if (is_limited_pmc(counter->hw.idx))
1075 continue;
1076 val = read_pmc(counter->hw.idx);
1077 if ((int)val < 0) {
1078 /* counter has overflowed */
1079 found = 1;
1080 record_and_restart(counter, val, regs, nmi);
1081 }
1082 }
1083
1084 /*
1085 * In case we didn't find and reset the counter that caused
1086 * the interrupt, scan all counters and reset any that are
1087 * negative, to avoid getting continual interrupts.
1088 * Any that we processed in the previous loop will not be negative.
1089 */
1090 if (!found) {
1091 for (i = 0; i < ppmu->n_counter; ++i) {
1092 if (is_limited_pmc(i + 1))
1093 continue;
1094 val = read_pmc(i + 1);
1095 if ((int)val < 0)
1096 write_pmc(i + 1, 0);
1097 }
1098 }
1099
1100 /*
1101 * Reset MMCR0 to its normal value. This will set PMXE and
1102 * clear FC (freeze counters) and PMAO (perf mon alert occurred)
1103 * and thus allow interrupts to occur again.
1104 * XXX might want to use MSR.PM to keep the counters frozen until
1105 * we get back out of this interrupt.
1106 */
1107 write_mmcr0(cpuhw, cpuhw->mmcr[0]);
1108
1109 if (nmi)
1110 nmi_exit();
1111 else
1112 irq_exit();
1113}
1114
1115void hw_perf_counter_setup(int cpu)
1116{
1117 struct cpu_hw_counters *cpuhw = &per_cpu(cpu_hw_counters, cpu);
1118
1119 memset(cpuhw, 0, sizeof(*cpuhw));
1120 cpuhw->mmcr[0] = MMCR0_FC;
1121}
1122
1123extern struct power_pmu power4_pmu;
1124extern struct power_pmu ppc970_pmu;
1125extern struct power_pmu power5_pmu;
1126extern struct power_pmu power5p_pmu;
1127extern struct power_pmu power6_pmu;
1128
1129static int init_perf_counters(void)
1130{
1131 unsigned long pvr;
1132
1133 /* XXX should get this from cputable */
1134 pvr = mfspr(SPRN_PVR);
1135 switch (PVR_VER(pvr)) {
1136 case PV_POWER4:
1137 case PV_POWER4p:
1138 ppmu = &power4_pmu;
1139 break;
1140 case PV_970:
1141 case PV_970FX:
1142 case PV_970MP:
1143 ppmu = &ppc970_pmu;
1144 break;
1145 case PV_POWER5:
1146 ppmu = &power5_pmu;
1147 break;
1148 case PV_POWER5p:
1149 ppmu = &power5p_pmu;
1150 break;
1151 case 0x3e:
1152 ppmu = &power6_pmu;
1153 break;
1154 }
1155
1156 /*
1157 * Use FCHV to ignore kernel events if MSR.HV is set.
1158 */
1159 if (mfmsr() & MSR_HV)
1160 freeze_counters_kernel = MMCR0_FCHV;
1161
1162 return 0;
1163}
1164
1165arch_initcall(init_perf_counters);
diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c
new file mode 100644
index 000000000000..836fa118eb1e
--- /dev/null
+++ b/arch/powerpc/kernel/power4-pmu.c
@@ -0,0 +1,557 @@
1/*
2 * Performance counter support for POWER4 (GP) and POWER4+ (GQ) processors.
3 *
4 * Copyright 2009 Paul Mackerras, IBM Corporation.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/kernel.h>
12#include <linux/perf_counter.h>
13#include <asm/reg.h>
14
15/*
16 * Bits in event code for POWER4
17 */
18#define PM_PMC_SH 12 /* PMC number (1-based) for direct events */
19#define PM_PMC_MSK 0xf
20#define PM_UNIT_SH 8 /* TTMMUX number and setting - unit select */
21#define PM_UNIT_MSK 0xf
22#define PM_LOWER_SH 6
23#define PM_LOWER_MSK 1
24#define PM_LOWER_MSKS 0x40
25#define PM_BYTE_SH 4 /* Byte number of event bus to use */
26#define PM_BYTE_MSK 3
27#define PM_PMCSEL_MSK 7
28
29/*
30 * Unit code values
31 */
32#define PM_FPU 1
33#define PM_ISU1 2
34#define PM_IFU 3
35#define PM_IDU0 4
36#define PM_ISU1_ALT 6
37#define PM_ISU2 7
38#define PM_IFU_ALT 8
39#define PM_LSU0 9
40#define PM_LSU1 0xc
41#define PM_GPS 0xf
42
43/*
44 * Bits in MMCR0 for POWER4
45 */
46#define MMCR0_PMC1SEL_SH 8
47#define MMCR0_PMC2SEL_SH 1
48#define MMCR_PMCSEL_MSK 0x1f
49
50/*
51 * Bits in MMCR1 for POWER4
52 */
53#define MMCR1_TTM0SEL_SH 62
54#define MMCR1_TTC0SEL_SH 61
55#define MMCR1_TTM1SEL_SH 59
56#define MMCR1_TTC1SEL_SH 58
57#define MMCR1_TTM2SEL_SH 56
58#define MMCR1_TTC2SEL_SH 55
59#define MMCR1_TTM3SEL_SH 53
60#define MMCR1_TTC3SEL_SH 52
61#define MMCR1_TTMSEL_MSK 3
62#define MMCR1_TD_CP_DBG0SEL_SH 50
63#define MMCR1_TD_CP_DBG1SEL_SH 48
64#define MMCR1_TD_CP_DBG2SEL_SH 46
65#define MMCR1_TD_CP_DBG3SEL_SH 44
66#define MMCR1_DEBUG0SEL_SH 43
67#define MMCR1_DEBUG1SEL_SH 42
68#define MMCR1_DEBUG2SEL_SH 41
69#define MMCR1_DEBUG3SEL_SH 40
70#define MMCR1_PMC1_ADDER_SEL_SH 39
71#define MMCR1_PMC2_ADDER_SEL_SH 38
72#define MMCR1_PMC6_ADDER_SEL_SH 37
73#define MMCR1_PMC5_ADDER_SEL_SH 36
74#define MMCR1_PMC8_ADDER_SEL_SH 35
75#define MMCR1_PMC7_ADDER_SEL_SH 34
76#define MMCR1_PMC3_ADDER_SEL_SH 33
77#define MMCR1_PMC4_ADDER_SEL_SH 32
78#define MMCR1_PMC3SEL_SH 27
79#define MMCR1_PMC4SEL_SH 22
80#define MMCR1_PMC5SEL_SH 17
81#define MMCR1_PMC6SEL_SH 12
82#define MMCR1_PMC7SEL_SH 7
83#define MMCR1_PMC8SEL_SH 2 /* note bit 0 is in MMCRA for GP */
84
85static short mmcr1_adder_bits[8] = {
86 MMCR1_PMC1_ADDER_SEL_SH,
87 MMCR1_PMC2_ADDER_SEL_SH,
88 MMCR1_PMC3_ADDER_SEL_SH,
89 MMCR1_PMC4_ADDER_SEL_SH,
90 MMCR1_PMC5_ADDER_SEL_SH,
91 MMCR1_PMC6_ADDER_SEL_SH,
92 MMCR1_PMC7_ADDER_SEL_SH,
93 MMCR1_PMC8_ADDER_SEL_SH
94};
95
96/*
97 * Bits in MMCRA
98 */
99#define MMCRA_PMC8SEL0_SH 17 /* PMC8SEL bit 0 for GP */
100
101/*
102 * Layout of constraint bits:
103 * 6666555555555544444444443333333333222222222211111111110000000000
104 * 3210987654321098765432109876543210987654321098765432109876543210
105 * |[ >[ >[ >|||[ >[ >< >< >< >< ><><><><><><><><>
106 * | UC1 UC2 UC3 ||| PS1 PS2 B0 B1 B2 B3 P1P2P3P4P5P6P7P8
107 * \SMPL ||\TTC3SEL
108 * |\TTC_IFU_SEL
109 * \TTM2SEL0
110 *
111 * SMPL - SAMPLE_ENABLE constraint
112 * 56: SAMPLE_ENABLE value 0x0100_0000_0000_0000
113 *
114 * UC1 - unit constraint 1: can't have all three of FPU/ISU1/IDU0|ISU2
115 * 55: UC1 error 0x0080_0000_0000_0000
116 * 54: FPU events needed 0x0040_0000_0000_0000
117 * 53: ISU1 events needed 0x0020_0000_0000_0000
118 * 52: IDU0|ISU2 events needed 0x0010_0000_0000_0000
119 *
120 * UC2 - unit constraint 2: can't have all three of FPU/IFU/LSU0
121 * 51: UC2 error 0x0008_0000_0000_0000
122 * 50: FPU events needed 0x0004_0000_0000_0000
123 * 49: IFU events needed 0x0002_0000_0000_0000
124 * 48: LSU0 events needed 0x0001_0000_0000_0000
125 *
126 * UC3 - unit constraint 3: can't have all four of LSU0/IFU/IDU0|ISU2/ISU1
127 * 47: UC3 error 0x8000_0000_0000
128 * 46: LSU0 events needed 0x4000_0000_0000
129 * 45: IFU events needed 0x2000_0000_0000
130 * 44: IDU0|ISU2 events needed 0x1000_0000_0000
131 * 43: ISU1 events needed 0x0800_0000_0000
132 *
133 * TTM2SEL0
134 * 42: 0 = IDU0 events needed
135 * 1 = ISU2 events needed 0x0400_0000_0000
136 *
137 * TTC_IFU_SEL
138 * 41: 0 = IFU.U events needed
139 * 1 = IFU.L events needed 0x0200_0000_0000
140 *
141 * TTC3SEL
142 * 40: 0 = LSU1.U events needed
143 * 1 = LSU1.L events needed 0x0100_0000_0000
144 *
145 * PS1
146 * 39: PS1 error 0x0080_0000_0000
147 * 36-38: count of events needing PMC1/2/5/6 0x0070_0000_0000
148 *
149 * PS2
150 * 35: PS2 error 0x0008_0000_0000
151 * 32-34: count of events needing PMC3/4/7/8 0x0007_0000_0000
152 *
153 * B0
154 * 28-31: Byte 0 event source 0xf000_0000
155 * 1 = FPU
156 * 2 = ISU1
157 * 3 = IFU
158 * 4 = IDU0
159 * 7 = ISU2
160 * 9 = LSU0
161 * c = LSU1
162 * f = GPS
163 *
164 * B1, B2, B3
165 * 24-27, 20-23, 16-19: Byte 1, 2, 3 event sources
166 *
167 * P8
168 * 15: P8 error 0x8000
169 * 14-15: Count of events needing PMC8
170 *
171 * P1..P7
172 * 0-13: Count of events needing PMC1..PMC7
173 *
174 * Note: this doesn't allow events using IFU.U to be combined with events
175 * using IFU.L, though that is feasible (using TTM0 and TTM2). However
176 * there are no listed events for IFU.L (they are debug events not
177 * verified for performance monitoring) so this shouldn't cause a
178 * problem.
179 */
180
181static struct unitinfo {
182 u64 value, mask;
183 int unit;
184 int lowerbit;
185} p4_unitinfo[16] = {
186 [PM_FPU] = { 0x44000000000000ull, 0x88000000000000ull, PM_FPU, 0 },
187 [PM_ISU1] = { 0x20080000000000ull, 0x88000000000000ull, PM_ISU1, 0 },
188 [PM_ISU1_ALT] =
189 { 0x20080000000000ull, 0x88000000000000ull, PM_ISU1, 0 },
190 [PM_IFU] = { 0x02200000000000ull, 0x08820000000000ull, PM_IFU, 41 },
191 [PM_IFU_ALT] =
192 { 0x02200000000000ull, 0x08820000000000ull, PM_IFU, 41 },
193 [PM_IDU0] = { 0x10100000000000ull, 0x80840000000000ull, PM_IDU0, 1 },
194 [PM_ISU2] = { 0x10140000000000ull, 0x80840000000000ull, PM_ISU2, 0 },
195 [PM_LSU0] = { 0x01400000000000ull, 0x08800000000000ull, PM_LSU0, 0 },
196 [PM_LSU1] = { 0x00000000000000ull, 0x00010000000000ull, PM_LSU1, 40 },
197 [PM_GPS] = { 0x00000000000000ull, 0x00000000000000ull, PM_GPS, 0 }
198};
199
200static unsigned char direct_marked_event[8] = {
201 (1<<2) | (1<<3), /* PMC1: PM_MRK_GRP_DISP, PM_MRK_ST_CMPL */
202 (1<<3) | (1<<5), /* PMC2: PM_THRESH_TIMEO, PM_MRK_BRU_FIN */
203 (1<<3), /* PMC3: PM_MRK_ST_CMPL_INT */
204 (1<<4) | (1<<5), /* PMC4: PM_MRK_GRP_CMPL, PM_MRK_CRU_FIN */
205 (1<<4) | (1<<5), /* PMC5: PM_MRK_GRP_TIMEO */
206 (1<<3) | (1<<4) | (1<<5),
207 /* PMC6: PM_MRK_ST_GPS, PM_MRK_FXU_FIN, PM_MRK_GRP_ISSUED */
208 (1<<4) | (1<<5), /* PMC7: PM_MRK_FPU_FIN, PM_MRK_INST_FIN */
209 (1<<4), /* PMC8: PM_MRK_LSU_FIN */
210};
211
212/*
213 * Returns 1 if event counts things relating to marked instructions
214 * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
215 */
216static int p4_marked_instr_event(u64 event)
217{
218 int pmc, psel, unit, byte, bit;
219 unsigned int mask;
220
221 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
222 psel = event & PM_PMCSEL_MSK;
223 if (pmc) {
224 if (direct_marked_event[pmc - 1] & (1 << psel))
225 return 1;
226 if (psel == 0) /* add events */
227 bit = (pmc <= 4)? pmc - 1: 8 - pmc;
228 else if (psel == 6) /* decode events */
229 bit = 4;
230 else
231 return 0;
232 } else
233 bit = psel;
234
235 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
236 unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
237 mask = 0;
238 switch (unit) {
239 case PM_LSU1:
240 if (event & PM_LOWER_MSKS)
241 mask = 1 << 28; /* byte 7 bit 4 */
242 else
243 mask = 6 << 24; /* byte 3 bits 1 and 2 */
244 break;
245 case PM_LSU0:
246 /* byte 3, bit 3; byte 2 bits 0,2,3,4,5; byte 1 */
247 mask = 0x083dff00;
248 }
249 return (mask >> (byte * 8 + bit)) & 1;
250}
251
252static int p4_get_constraint(u64 event, u64 *maskp, u64 *valp)
253{
254 int pmc, byte, unit, lower, sh;
255 u64 mask = 0, value = 0;
256 int grp = -1;
257
258 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
259 if (pmc) {
260 if (pmc > 8)
261 return -1;
262 sh = (pmc - 1) * 2;
263 mask |= 2 << sh;
264 value |= 1 << sh;
265 grp = ((pmc - 1) >> 1) & 1;
266 }
267 unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
268 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
269 if (unit) {
270 lower = (event >> PM_LOWER_SH) & PM_LOWER_MSK;
271
272 /*
273 * Bus events on bytes 0 and 2 can be counted
274 * on PMC1/2/5/6; bytes 1 and 3 on PMC3/4/7/8.
275 */
276 if (!pmc)
277 grp = byte & 1;
278
279 if (!p4_unitinfo[unit].unit)
280 return -1;
281 mask |= p4_unitinfo[unit].mask;
282 value |= p4_unitinfo[unit].value;
283 sh = p4_unitinfo[unit].lowerbit;
284 if (sh > 1)
285 value |= (u64)lower << sh;
286 else if (lower != sh)
287 return -1;
288 unit = p4_unitinfo[unit].unit;
289
290 /* Set byte lane select field */
291 mask |= 0xfULL << (28 - 4 * byte);
292 value |= (u64)unit << (28 - 4 * byte);
293 }
294 if (grp == 0) {
295 /* increment PMC1/2/5/6 field */
296 mask |= 0x8000000000ull;
297 value |= 0x1000000000ull;
298 } else {
299 /* increment PMC3/4/7/8 field */
300 mask |= 0x800000000ull;
301 value |= 0x100000000ull;
302 }
303
304 /* Marked instruction events need sample_enable set */
305 if (p4_marked_instr_event(event)) {
306 mask |= 1ull << 56;
307 value |= 1ull << 56;
308 }
309
310 /* PMCSEL=6 decode events on byte 2 need sample_enable clear */
311 if (pmc && (event & PM_PMCSEL_MSK) == 6 && byte == 2)
312 mask |= 1ull << 56;
313
314 *maskp = mask;
315 *valp = value;
316 return 0;
317}
318
319static unsigned int ppc_inst_cmpl[] = {
320 0x1001, 0x4001, 0x6001, 0x7001, 0x8001
321};
322
323static int p4_get_alternatives(u64 event, unsigned int flags, u64 alt[])
324{
325 int i, j, na;
326
327 alt[0] = event;
328 na = 1;
329
330 /* 2 possibilities for PM_GRP_DISP_REJECT */
331 if (event == 0x8003 || event == 0x0224) {
332 alt[1] = event ^ (0x8003 ^ 0x0224);
333 return 2;
334 }
335
336 /* 2 possibilities for PM_ST_MISS_L1 */
337 if (event == 0x0c13 || event == 0x0c23) {
338 alt[1] = event ^ (0x0c13 ^ 0x0c23);
339 return 2;
340 }
341
342 /* several possibilities for PM_INST_CMPL */
343 for (i = 0; i < ARRAY_SIZE(ppc_inst_cmpl); ++i) {
344 if (event == ppc_inst_cmpl[i]) {
345 for (j = 0; j < ARRAY_SIZE(ppc_inst_cmpl); ++j)
346 if (j != i)
347 alt[na++] = ppc_inst_cmpl[j];
348 break;
349 }
350 }
351
352 return na;
353}
354
355static int p4_compute_mmcr(u64 event[], int n_ev,
356 unsigned int hwc[], u64 mmcr[])
357{
358 u64 mmcr0 = 0, mmcr1 = 0, mmcra = 0;
359 unsigned int pmc, unit, byte, psel, lower;
360 unsigned int ttm, grp;
361 unsigned int pmc_inuse = 0;
362 unsigned int pmc_grp_use[2];
363 unsigned char busbyte[4];
364 unsigned char unituse[16];
365 unsigned int unitlower = 0;
366 int i;
367
368 if (n_ev > 8)
369 return -1;
370
371 /* First pass to count resource use */
372 pmc_grp_use[0] = pmc_grp_use[1] = 0;
373 memset(busbyte, 0, sizeof(busbyte));
374 memset(unituse, 0, sizeof(unituse));
375 for (i = 0; i < n_ev; ++i) {
376 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
377 if (pmc) {
378 if (pmc_inuse & (1 << (pmc - 1)))
379 return -1;
380 pmc_inuse |= 1 << (pmc - 1);
381 /* count 1/2/5/6 vs 3/4/7/8 use */
382 ++pmc_grp_use[((pmc - 1) >> 1) & 1];
383 }
384 unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
385 byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
386 lower = (event[i] >> PM_LOWER_SH) & PM_LOWER_MSK;
387 if (unit) {
388 if (!pmc)
389 ++pmc_grp_use[byte & 1];
390 if (unit == 6 || unit == 8)
391 /* map alt ISU1/IFU codes: 6->2, 8->3 */
392 unit = (unit >> 1) - 1;
393 if (busbyte[byte] && busbyte[byte] != unit)
394 return -1;
395 busbyte[byte] = unit;
396 lower <<= unit;
397 if (unituse[unit] && lower != (unitlower & lower))
398 return -1;
399 unituse[unit] = 1;
400 unitlower |= lower;
401 }
402 }
403 if (pmc_grp_use[0] > 4 || pmc_grp_use[1] > 4)
404 return -1;
405
406 /*
407 * Assign resources and set multiplexer selects.
408 *
409 * Units 1,2,3 are on TTM0, 4,6,7 on TTM1, 8,10 on TTM2.
410 * Each TTMx can only select one unit, but since
411 * units 2 and 6 are both ISU1, and 3 and 8 are both IFU,
412 * we have some choices.
413 */
414 if (unituse[2] & (unituse[1] | (unituse[3] & unituse[9]))) {
415 unituse[6] = 1; /* Move 2 to 6 */
416 unituse[2] = 0;
417 }
418 if (unituse[3] & (unituse[1] | unituse[2])) {
419 unituse[8] = 1; /* Move 3 to 8 */
420 unituse[3] = 0;
421 unitlower = (unitlower & ~8) | ((unitlower & 8) << 5);
422 }
423 /* Check only one unit per TTMx */
424 if (unituse[1] + unituse[2] + unituse[3] > 1 ||
425 unituse[4] + unituse[6] + unituse[7] > 1 ||
426 unituse[8] + unituse[9] > 1 ||
427 (unituse[5] | unituse[10] | unituse[11] |
428 unituse[13] | unituse[14]))
429 return -1;
430
431 /* Set TTMxSEL fields. Note, units 1-3 => TTM0SEL codes 0-2 */
432 mmcr1 |= (u64)(unituse[3] * 2 + unituse[2]) << MMCR1_TTM0SEL_SH;
433 mmcr1 |= (u64)(unituse[7] * 3 + unituse[6] * 2) << MMCR1_TTM1SEL_SH;
434 mmcr1 |= (u64)unituse[9] << MMCR1_TTM2SEL_SH;
435
436 /* Set TTCxSEL fields. */
437 if (unitlower & 0xe)
438 mmcr1 |= 1ull << MMCR1_TTC0SEL_SH;
439 if (unitlower & 0xf0)
440 mmcr1 |= 1ull << MMCR1_TTC1SEL_SH;
441 if (unitlower & 0xf00)
442 mmcr1 |= 1ull << MMCR1_TTC2SEL_SH;
443 if (unitlower & 0x7000)
444 mmcr1 |= 1ull << MMCR1_TTC3SEL_SH;
445
446 /* Set byte lane select fields. */
447 for (byte = 0; byte < 4; ++byte) {
448 unit = busbyte[byte];
449 if (!unit)
450 continue;
451 if (unit == 0xf) {
452 /* special case for GPS */
453 mmcr1 |= 1ull << (MMCR1_DEBUG0SEL_SH - byte);
454 } else {
455 if (!unituse[unit])
456 ttm = unit - 1; /* 2->1, 3->2 */
457 else
458 ttm = unit >> 2;
459 mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2*byte);
460 }
461 }
462
463 /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
464 for (i = 0; i < n_ev; ++i) {
465 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
466 unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
467 byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
468 psel = event[i] & PM_PMCSEL_MSK;
469 if (!pmc) {
470 /* Bus event or 00xxx direct event (off or cycles) */
471 if (unit)
472 psel |= 0x10 | ((byte & 2) << 2);
473 for (pmc = 0; pmc < 8; ++pmc) {
474 if (pmc_inuse & (1 << pmc))
475 continue;
476 grp = (pmc >> 1) & 1;
477 if (unit) {
478 if (grp == (byte & 1))
479 break;
480 } else if (pmc_grp_use[grp] < 4) {
481 ++pmc_grp_use[grp];
482 break;
483 }
484 }
485 pmc_inuse |= 1 << pmc;
486 } else {
487 /* Direct event */
488 --pmc;
489 if (psel == 0 && (byte & 2))
490 /* add events on higher-numbered bus */
491 mmcr1 |= 1ull << mmcr1_adder_bits[pmc];
492 else if (psel == 6 && byte == 3)
493 /* seem to need to set sample_enable here */
494 mmcra |= MMCRA_SAMPLE_ENABLE;
495 psel |= 8;
496 }
497 if (pmc <= 1)
498 mmcr0 |= psel << (MMCR0_PMC1SEL_SH - 7 * pmc);
499 else
500 mmcr1 |= psel << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2));
501 if (pmc == 7) /* PMC8 */
502 mmcra |= (psel & 1) << MMCRA_PMC8SEL0_SH;
503 hwc[i] = pmc;
504 if (p4_marked_instr_event(event[i]))
505 mmcra |= MMCRA_SAMPLE_ENABLE;
506 }
507
508 if (pmc_inuse & 1)
509 mmcr0 |= MMCR0_PMC1CE;
510 if (pmc_inuse & 0xfe)
511 mmcr0 |= MMCR0_PMCjCE;
512
513 mmcra |= 0x2000; /* mark only one IOP per PPC instruction */
514
515 /* Return MMCRx values */
516 mmcr[0] = mmcr0;
517 mmcr[1] = mmcr1;
518 mmcr[2] = mmcra;
519 return 0;
520}
521
522static void p4_disable_pmc(unsigned int pmc, u64 mmcr[])
523{
524 /*
525 * Setting the PMCxSEL field to 0 disables PMC x.
526 * (Note that pmc is 0-based here, not 1-based.)
527 */
528 if (pmc <= 1) {
529 mmcr[0] &= ~(0x1fUL << (MMCR0_PMC1SEL_SH - 7 * pmc));
530 } else {
531 mmcr[1] &= ~(0x1fUL << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2)));
532 if (pmc == 7)
533 mmcr[2] &= ~(1UL << MMCRA_PMC8SEL0_SH);
534 }
535}
536
537static int p4_generic_events[] = {
538 [PERF_COUNT_CPU_CYCLES] = 7,
539 [PERF_COUNT_INSTRUCTIONS] = 0x1001,
540 [PERF_COUNT_CACHE_REFERENCES] = 0x8c10, /* PM_LD_REF_L1 */
541 [PERF_COUNT_CACHE_MISSES] = 0x3c10, /* PM_LD_MISS_L1 */
542 [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x330, /* PM_BR_ISSUED */
543 [PERF_COUNT_BRANCH_MISSES] = 0x331, /* PM_BR_MPRED_CR */
544};
545
546struct power_pmu power4_pmu = {
547 .n_counter = 8,
548 .max_alternatives = 5,
549 .add_fields = 0x0000001100005555ull,
550 .test_adder = 0x0011083300000000ull,
551 .compute_mmcr = p4_compute_mmcr,
552 .get_constraint = p4_get_constraint,
553 .get_alternatives = p4_get_alternatives,
554 .disable_pmc = p4_disable_pmc,
555 .n_generic = ARRAY_SIZE(p4_generic_events),
556 .generic_events = p4_generic_events,
557};
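
The constraint words built by p4_get_constraint() encode each scheduling resource as a small adder field: for every PMC the value word carries a 1 in the low bit of a two-bit field and the mask word carries that field's high bit, so summing the value words of two events that both demand the same PMC carries into a bit the mask flags as an error. A toy illustration of that principle (not from the patch; the actual cross-event check is part of the core powerpc perf_counter code earlier in this patch):

    /* Toy illustration of the adder-style constraint encoding documented in
     * the layout comment above. */
    #include <stdio.h>

    int main(void)
    {
            /* Each event needing PMC1 contributes value bit 0; mask bit 1 is
             * the "too many events" indicator for that two-bit field. */
            unsigned long long value_pmc1 = 0x1ull, mask_pmc1 = 0x2ull;

            unsigned long long one_event  = value_pmc1;
            unsigned long long two_events = value_pmc1 + value_pmc1;

            printf("one event needing PMC1:  conflict=%d\n",
                   (int)!!(one_event & mask_pmc1));     /* 0 */
            printf("two events needing PMC1: conflict=%d\n",
                   (int)!!(two_events & mask_pmc1));    /* 1 */
            return 0;
    }
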
diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c
new file mode 100644
index 000000000000..c6cdfc165d6e
--- /dev/null
+++ b/arch/powerpc/kernel/power5+-pmu.c
@@ -0,0 +1,630 @@
1/*
2 * Performance counter support for POWER5+/++ (not POWER5) processors.
3 *
4 * Copyright 2009 Paul Mackerras, IBM Corporation.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/kernel.h>
12#include <linux/perf_counter.h>
13#include <asm/reg.h>
14
15/*
16 * Bits in event code for POWER5+ (POWER5 GS) and POWER5++ (POWER5 GS DD3)
17 */
18#define PM_PMC_SH 20 /* PMC number (1-based) for direct events */
19#define PM_PMC_MSK 0xf
20#define PM_PMC_MSKS (PM_PMC_MSK << PM_PMC_SH)
21#define PM_UNIT_SH 16 /* TTMMUX number and setting - unit select */
22#define PM_UNIT_MSK 0xf
23#define PM_BYTE_SH 12 /* Byte number of event bus to use */
24#define PM_BYTE_MSK 7
25#define PM_GRS_SH 8 /* Storage subsystem mux select */
26#define PM_GRS_MSK 7
27#define PM_BUSEVENT_MSK 0x80 /* Set if event uses event bus */
28#define PM_PMCSEL_MSK 0x7f
29
30/* Values in PM_UNIT field */
31#define PM_FPU 0
32#define PM_ISU0 1
33#define PM_IFU 2
34#define PM_ISU1 3
35#define PM_IDU 4
36#define PM_ISU0_ALT 6
37#define PM_GRS 7
38#define PM_LSU0 8
39#define PM_LSU1 0xc
40#define PM_LASTUNIT 0xc
41
42/*
43 * Bits in MMCR1 for POWER5+
44 */
45#define MMCR1_TTM0SEL_SH 62
46#define MMCR1_TTM1SEL_SH 60
47#define MMCR1_TTM2SEL_SH 58
48#define MMCR1_TTM3SEL_SH 56
49#define MMCR1_TTMSEL_MSK 3
50#define MMCR1_TD_CP_DBG0SEL_SH 54
51#define MMCR1_TD_CP_DBG1SEL_SH 52
52#define MMCR1_TD_CP_DBG2SEL_SH 50
53#define MMCR1_TD_CP_DBG3SEL_SH 48
54#define MMCR1_GRS_L2SEL_SH 46
55#define MMCR1_GRS_L2SEL_MSK 3
56#define MMCR1_GRS_L3SEL_SH 44
57#define MMCR1_GRS_L3SEL_MSK 3
58#define MMCR1_GRS_MCSEL_SH 41
59#define MMCR1_GRS_MCSEL_MSK 7
60#define MMCR1_GRS_FABSEL_SH 39
61#define MMCR1_GRS_FABSEL_MSK 3
62#define MMCR1_PMC1_ADDER_SEL_SH 35
63#define MMCR1_PMC2_ADDER_SEL_SH 34
64#define MMCR1_PMC3_ADDER_SEL_SH 33
65#define MMCR1_PMC4_ADDER_SEL_SH 32
66#define MMCR1_PMC1SEL_SH 25
67#define MMCR1_PMC2SEL_SH 17
68#define MMCR1_PMC3SEL_SH 9
69#define MMCR1_PMC4SEL_SH 1
70#define MMCR1_PMCSEL_SH(n) (MMCR1_PMC1SEL_SH - (n) * 8)
71#define MMCR1_PMCSEL_MSK 0x7f
72
73/*
74 * Bits in MMCRA
75 */
76
77/*
78 * Layout of constraint bits:
79 * 6666555555555544444444443333333333222222222211111111110000000000
80 * 3210987654321098765432109876543210987654321098765432109876543210
81 * [ ><><>< ><> <><>[ > < >< >< >< ><><><><><><>
82 * NC G0G1G2 G3 T0T1 UC B0 B1 B2 B3 P6P5P4P3P2P1
83 *
84 * NC - number of counters
85 * 51: NC error 0x0008_0000_0000_0000
86 * 48-50: number of events needing PMC1-4 0x0007_0000_0000_0000
87 *
88 * G0..G3 - GRS mux constraints
89 * 46-47: GRS_L2SEL value
90 * 44-45: GRS_L3SEL value
91 * 41-43: GRS_MCSEL value
92 * 39-40: GRS_FABSEL value
93 * Note that these match up with their bit positions in MMCR1
94 *
95 * T0 - TTM0 constraint
96 * 36-37: TTM0SEL value (0=FPU, 2=IFU, 3=ISU1) 0x30_0000_0000
97 *
98 * T1 - TTM1 constraint
99 * 34-35: TTM1SEL value (0=IDU, 3=GRS) 0x0c_0000_0000
100 *
101 * UC - unit constraint: can't have all three of FPU|IFU|ISU1, ISU0, IDU|GRS
102 * 33: UC3 error 0x02_0000_0000
103 * 32: FPU|IFU|ISU1 events needed 0x01_0000_0000
104 * 31: ISU0 events needed 0x00_8000_0000
105 * 30: IDU|GRS events needed 0x00_4000_0000
106 *
107 * B0
108 * 24-27: Byte 0 event source 0x0f00_0000
109 * Encoding as for the event code
110 *
111 * B1, B2, B3
112 * 20-23, 16-19, 12-15: Byte 1, 2, 3 event sources
113 *
114 * P6
115 * 11: P6 error 0x800
116 * 10-11: Count of events needing PMC6
117 *
118 * P1..P5
119 * 0-9: Count of events needing PMC1..PMC5
120 */
121
122static const int grsel_shift[8] = {
123 MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH,
124 MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH,
125 MMCR1_GRS_MCSEL_SH, MMCR1_GRS_FABSEL_SH
126};
127
128/* Masks and values for using events from the various units */
129static u64 unit_cons[PM_LASTUNIT+1][2] = {
130 [PM_FPU] = { 0x3200000000ull, 0x0100000000ull },
131 [PM_ISU0] = { 0x0200000000ull, 0x0080000000ull },
132 [PM_ISU1] = { 0x3200000000ull, 0x3100000000ull },
133 [PM_IFU] = { 0x3200000000ull, 0x2100000000ull },
134 [PM_IDU] = { 0x0e00000000ull, 0x0040000000ull },
135 [PM_GRS] = { 0x0e00000000ull, 0x0c40000000ull },
136};
137
138static int power5p_get_constraint(u64 event, u64 *maskp, u64 *valp)
139{
140 int pmc, byte, unit, sh;
141 int bit, fmask;
142 u64 mask = 0, value = 0;
143
144 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
145 if (pmc) {
146 if (pmc > 6)
147 return -1;
148 sh = (pmc - 1) * 2;
149 mask |= 2 << sh;
150 value |= 1 << sh;
151 if (pmc >= 5 && !(event == 0x500009 || event == 0x600005))
152 return -1;
153 }
154 if (event & PM_BUSEVENT_MSK) {
155 unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
156 if (unit > PM_LASTUNIT)
157 return -1;
158 if (unit == PM_ISU0_ALT)
159 unit = PM_ISU0;
160 mask |= unit_cons[unit][0];
161 value |= unit_cons[unit][1];
162 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
163 if (byte >= 4) {
164 if (unit != PM_LSU1)
165 return -1;
166 /* Map LSU1 low word (bytes 4-7) to unit LSU1+1 */
167 ++unit;
168 byte &= 3;
169 }
170 if (unit == PM_GRS) {
171 bit = event & 7;
172 fmask = (bit == 6)? 7: 3;
173 sh = grsel_shift[bit];
174 mask |= (u64)fmask << sh;
175 value |= (u64)((event >> PM_GRS_SH) & fmask) << sh;
176 }
177 /* Set byte lane select field */
178 mask |= 0xfULL << (24 - 4 * byte);
179 value |= (u64)unit << (24 - 4 * byte);
180 }
181 if (pmc < 5) {
182 /* need a counter from PMC1-4 set */
183 mask |= 0x8000000000000ull;
184 value |= 0x1000000000000ull;
185 }
186 *maskp = mask;
187 *valp = value;
188 return 0;
189}
190
191static int power5p_limited_pmc_event(u64 event)
192{
193 int pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
194
195 return pmc == 5 || pmc == 6;
196}
197
198#define MAX_ALT 3 /* at most 3 alternatives for any event */
199
200static const unsigned int event_alternatives[][MAX_ALT] = {
201 { 0x100c0, 0x40001f }, /* PM_GCT_FULL_CYC */
202 { 0x120e4, 0x400002 }, /* PM_GRP_DISP_REJECT */
203 { 0x230e2, 0x323087 }, /* PM_BR_PRED_CR */
204 { 0x230e3, 0x223087, 0x3230a0 }, /* PM_BR_PRED_TA */
205 { 0x410c7, 0x441084 }, /* PM_THRD_L2MISS_BOTH_CYC */
206 { 0x800c4, 0xc20e0 }, /* PM_DTLB_MISS */
207 { 0xc50c6, 0xc60e0 }, /* PM_MRK_DTLB_MISS */
208 { 0x100005, 0x600005 }, /* PM_RUN_CYC */
209 { 0x100009, 0x200009 }, /* PM_INST_CMPL */
210 { 0x200015, 0x300015 }, /* PM_LSU_LMQ_SRQ_EMPTY_CYC */
211 { 0x300009, 0x400009 }, /* PM_INST_DISP */
212};
213
214/*
215 * Scan the alternatives table for a match and return the
216 * index into the alternatives table if found, else -1.
217 */
218static int find_alternative(unsigned int event)
219{
220 int i, j;
221
222 for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
223 if (event < event_alternatives[i][0])
224 break;
225 for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j)
226 if (event == event_alternatives[i][j])
227 return i;
228 }
229 return -1;
230}
231
232static const unsigned char bytedecode_alternatives[4][4] = {
233 /* PMC 1 */ { 0x21, 0x23, 0x25, 0x27 },
234 /* PMC 2 */ { 0x07, 0x17, 0x0e, 0x1e },
235 /* PMC 3 */ { 0x20, 0x22, 0x24, 0x26 },
236 /* PMC 4 */ { 0x07, 0x17, 0x0e, 0x1e }
237};
238
239/*
240 * Some direct events for decodes of event bus byte 3 have alternative
241 * PMCSEL values on other counters. This returns the alternative
242 * event code for those that do, or -1 otherwise. This also handles
243 * alternative PMCSEL values for add events.
244 */
245static int find_alternative_bdecode(unsigned int event)
246{
247 int pmc, altpmc, pp, j;
248
249 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
250 if (pmc == 0 || pmc > 4)
251 return -1;
252 altpmc = 5 - pmc; /* 1 <-> 4, 2 <-> 3 */
253 pp = event & PM_PMCSEL_MSK;
254 for (j = 0; j < 4; ++j) {
255 if (bytedecode_alternatives[pmc - 1][j] == pp) {
256 return (event & ~(PM_PMC_MSKS | PM_PMCSEL_MSK)) |
257 (altpmc << PM_PMC_SH) |
258 bytedecode_alternatives[altpmc - 1][j];
259 }
260 }
261
262 /* new decode alternatives for power5+ */
263 if (pmc == 1 && (pp == 0x0d || pp == 0x0e))
264 return event + (2 << PM_PMC_SH) + (0x2e - 0x0d);
265 if (pmc == 3 && (pp == 0x2e || pp == 0x2f))
266 return event - (2 << PM_PMC_SH) - (0x2e - 0x0d);
267
268 /* alternative add event encodings */
269 if (pp == 0x10 || pp == 0x28)
270 return ((event ^ (0x10 ^ 0x28)) & ~PM_PMC_MSKS) |
271 (altpmc << PM_PMC_SH);
272
273 return -1;
274}
275
276static int power5p_get_alternatives(u64 event, unsigned int flags, u64 alt[])
277{
278 int i, j, nalt = 1;
279 int nlim;
280 u64 ae;
281
282 alt[0] = event;
283 nalt = 1;
284 nlim = power5p_limited_pmc_event(event);
285 i = find_alternative(event);
286 if (i >= 0) {
287 for (j = 0; j < MAX_ALT; ++j) {
288 ae = event_alternatives[i][j];
289 if (ae && ae != event)
290 alt[nalt++] = ae;
291 nlim += power5p_limited_pmc_event(ae);
292 }
293 } else {
294 ae = find_alternative_bdecode(event);
295 if (ae > 0)
296 alt[nalt++] = ae;
297 }
298
299 if (flags & PPMU_ONLY_COUNT_RUN) {
300 /*
301 * We're only counting in RUN state,
302 * so PM_CYC is equivalent to PM_RUN_CYC
303 * and PM_INST_CMPL === PM_RUN_INST_CMPL.
304 * This doesn't include alternatives that don't provide
305 * any extra flexibility in assigning PMCs (e.g.
306 * 0x100005 for PM_RUN_CYC vs. 0xf for PM_CYC).
307 * Note that even with these additional alternatives
308 * we never end up with more than 3 alternatives for any event.
309 */
310 j = nalt;
311 for (i = 0; i < nalt; ++i) {
312 switch (alt[i]) {
313 case 0xf: /* PM_CYC */
314 alt[j++] = 0x600005; /* PM_RUN_CYC */
315 ++nlim;
316 break;
317 case 0x600005: /* PM_RUN_CYC */
318 alt[j++] = 0xf;
319 break;
320 case 0x100009: /* PM_INST_CMPL */
321 alt[j++] = 0x500009; /* PM_RUN_INST_CMPL */
322 ++nlim;
323 break;
324 case 0x500009: /* PM_RUN_INST_CMPL */
325 alt[j++] = 0x100009; /* PM_INST_CMPL */
326 alt[j++] = 0x200009;
327 break;
328 }
329 }
330 nalt = j;
331 }
332
333 if (!(flags & PPMU_LIMITED_PMC_OK) && nlim) {
334 /* remove the limited PMC events */
335 j = 0;
336 for (i = 0; i < nalt; ++i) {
337 if (!power5p_limited_pmc_event(alt[i])) {
338 alt[j] = alt[i];
339 ++j;
340 }
341 }
342 nalt = j;
343 } else if ((flags & PPMU_LIMITED_PMC_REQD) && nlim < nalt) {
344 /* remove all but the limited PMC events */
345 j = 0;
346 for (i = 0; i < nalt; ++i) {
347 if (power5p_limited_pmc_event(alt[i])) {
348 alt[j] = alt[i];
349 ++j;
350 }
351 }
352 nalt = j;
353 }
354
355 return nalt;
356}
357
358/*
359 * Map of which direct events on which PMCs are marked instruction events.
360 * Indexed by PMCSEL value, bit i (LE) set if PMC i is a marked event.
361 * Bit 0 is set if it is marked for all PMCs.
362 * The 0x80 bit indicates a byte decode PMCSEL value.
363 */
364static unsigned char direct_event_is_marked[0x28] = {
365 0, /* 00 */
366 0x1f, /* 01 PM_IOPS_CMPL */
367 0x2, /* 02 PM_MRK_GRP_DISP */
368 0xe, /* 03 PM_MRK_ST_CMPL, PM_MRK_ST_GPS, PM_MRK_ST_CMPL_INT */
369 0, /* 04 */
370 0x1c, /* 05 PM_MRK_BRU_FIN, PM_MRK_INST_FIN, PM_MRK_CRU_FIN */
371 0x80, /* 06 */
372 0x80, /* 07 */
373 0, 0, 0,/* 08 - 0a */
374 0x18, /* 0b PM_THRESH_TIMEO, PM_MRK_GRP_TIMEO */
375 0, /* 0c */
376 0x80, /* 0d */
377 0x80, /* 0e */
378 0, /* 0f */
379 0, /* 10 */
380 0x14, /* 11 PM_MRK_GRP_BR_REDIR, PM_MRK_GRP_IC_MISS */
381 0, /* 12 */
382 0x10, /* 13 PM_MRK_GRP_CMPL */
383 0x1f, /* 14 PM_GRP_MRK, PM_MRK_{FXU,FPU,LSU}_FIN */
384 0x2, /* 15 PM_MRK_GRP_ISSUED */
385 0x80, /* 16 */
386 0x80, /* 17 */
387 0, 0, 0, 0, 0,
388 0x80, /* 1d */
389 0x80, /* 1e */
390 0, /* 1f */
391 0x80, /* 20 */
392 0x80, /* 21 */
393 0x80, /* 22 */
394 0x80, /* 23 */
395 0x80, /* 24 */
396 0x80, /* 25 */
397 0x80, /* 26 */
398 0x80, /* 27 */
399};
400
401/*
402 * Returns 1 if event counts things relating to marked instructions
403 * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
404 */
405static int power5p_marked_instr_event(u64 event)
406{
407 int pmc, psel;
408 int bit, byte, unit;
409 u32 mask;
410
411 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
412 psel = event & PM_PMCSEL_MSK;
413 if (pmc >= 5)
414 return 0;
415
416 bit = -1;
417 if (psel < sizeof(direct_event_is_marked)) {
418 if (direct_event_is_marked[psel] & (1 << pmc))
419 return 1;
420 if (direct_event_is_marked[psel] & 0x80)
421 bit = 4;
422 else if (psel == 0x08)
423 bit = pmc - 1;
424 else if (psel == 0x10)
425 bit = 4 - pmc;
426 else if (psel == 0x1b && (pmc == 1 || pmc == 3))
427 bit = 4;
428 } else if ((psel & 0x48) == 0x40) {
429 bit = psel & 7;
430 } else if (psel == 0x28) {
431 bit = pmc - 1;
432 } else if (pmc == 3 && (psel == 0x2e || psel == 0x2f)) {
433 bit = 4;
434 }
435
436 if (!(event & PM_BUSEVENT_MSK) || bit == -1)
437 return 0;
438
439 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
440 unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
441 if (unit == PM_LSU0) {
442 /* byte 1 bits 0-7, byte 2 bits 0,2-4,6 */
443 mask = 0x5dff00;
444 } else if (unit == PM_LSU1 && byte >= 4) {
445 byte -= 4;
446 /* byte 5 bits 6-7, byte 6 bits 0,4, byte 7 bits 0-4,6 */
447 mask = 0x5f11c000;
448 } else
449 return 0;
450
451 return (mask >> (byte * 8 + bit)) & 1;
452}
453
454static int power5p_compute_mmcr(u64 event[], int n_ev,
455 unsigned int hwc[], u64 mmcr[])
456{
457 u64 mmcr1 = 0;
458 u64 mmcra = 0;
459 unsigned int pmc, unit, byte, psel;
460 unsigned int ttm;
461 int i, isbus, bit, grsel;
462 unsigned int pmc_inuse = 0;
463 unsigned char busbyte[4];
464 unsigned char unituse[16];
465 int ttmuse;
466
467 if (n_ev > 6)
468 return -1;
469
470 /* First pass to count resource use */
471 memset(busbyte, 0, sizeof(busbyte));
472 memset(unituse, 0, sizeof(unituse));
473 for (i = 0; i < n_ev; ++i) {
474 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
475 if (pmc) {
476 if (pmc > 6)
477 return -1;
478 if (pmc_inuse & (1 << (pmc - 1)))
479 return -1;
480 pmc_inuse |= 1 << (pmc - 1);
481 }
482 if (event[i] & PM_BUSEVENT_MSK) {
483 unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
484 byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
485 if (unit > PM_LASTUNIT)
486 return -1;
487 if (unit == PM_ISU0_ALT)
488 unit = PM_ISU0;
489 if (byte >= 4) {
490 if (unit != PM_LSU1)
491 return -1;
492 ++unit;
493 byte &= 3;
494 }
495 if (busbyte[byte] && busbyte[byte] != unit)
496 return -1;
497 busbyte[byte] = unit;
498 unituse[unit] = 1;
499 }
500 }
501
502 /*
503 * Assign resources and set multiplexer selects.
504 *
505 * PM_ISU0 can go either on TTM0 or TTM1, but that's the only
506 * choice we have to deal with.
507 */
508 if (unituse[PM_ISU0] &
509 (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_ISU1])) {
510 unituse[PM_ISU0_ALT] = 1; /* move ISU to TTM1 */
511 unituse[PM_ISU0] = 0;
512 }
513 /* Set TTM[01]SEL fields. */
514 ttmuse = 0;
515 for (i = PM_FPU; i <= PM_ISU1; ++i) {
516 if (!unituse[i])
517 continue;
518 if (ttmuse++)
519 return -1;
520 mmcr1 |= (u64)i << MMCR1_TTM0SEL_SH;
521 }
522 ttmuse = 0;
523 for (; i <= PM_GRS; ++i) {
524 if (!unituse[i])
525 continue;
526 if (ttmuse++)
527 return -1;
528 mmcr1 |= (u64)(i & 3) << MMCR1_TTM1SEL_SH;
529 }
530 if (ttmuse > 1)
531 return -1;
532
533 /* Set byte lane select fields, TTM[23]SEL and GRS_*SEL. */
534 for (byte = 0; byte < 4; ++byte) {
535 unit = busbyte[byte];
536 if (!unit)
537 continue;
538 if (unit == PM_ISU0 && unituse[PM_ISU0_ALT]) {
539 /* get ISU0 through TTM1 rather than TTM0 */
540 unit = PM_ISU0_ALT;
541 } else if (unit == PM_LSU1 + 1) {
542 /* select lower word of LSU1 for this byte */
543 mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte);
544 }
545 ttm = unit >> 2;
546 mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte);
547 }
548
549 /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
550 for (i = 0; i < n_ev; ++i) {
551 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
552 unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
553 byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
554 psel = event[i] & PM_PMCSEL_MSK;
555 isbus = event[i] & PM_BUSEVENT_MSK;
556 if (!pmc) {
557 /* Bus event or any-PMC direct event */
558 for (pmc = 0; pmc < 4; ++pmc) {
559 if (!(pmc_inuse & (1 << pmc)))
560 break;
561 }
562 if (pmc >= 4)
563 return -1;
564 pmc_inuse |= 1 << pmc;
565 } else if (pmc <= 4) {
566 /* Direct event */
567 --pmc;
568 if (isbus && (byte & 2) &&
569 (psel == 8 || psel == 0x10 || psel == 0x28))
570 /* add events on higher-numbered bus */
571 mmcr1 |= 1ull << (MMCR1_PMC1_ADDER_SEL_SH - pmc);
572 } else {
573 /* Instructions or run cycles on PMC5/6 */
574 --pmc;
575 }
576 if (isbus && unit == PM_GRS) {
577 bit = psel & 7;
578 grsel = (event[i] >> PM_GRS_SH) & PM_GRS_MSK;
579 mmcr1 |= (u64)grsel << grsel_shift[bit];
580 }
581 if (power5p_marked_instr_event(event[i]))
582 mmcra |= MMCRA_SAMPLE_ENABLE;
583 if ((psel & 0x58) == 0x40 && (byte & 1) != ((pmc >> 1) & 1))
584 /* select alternate byte lane */
585 psel |= 0x10;
586 if (pmc <= 3)
587 mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc);
588 hwc[i] = pmc;
589 }
590
591 /* Return MMCRx values */
592 mmcr[0] = 0;
593 if (pmc_inuse & 1)
594 mmcr[0] = MMCR0_PMC1CE;
595 if (pmc_inuse & 0x3e)
596 mmcr[0] |= MMCR0_PMCjCE;
597 mmcr[1] = mmcr1;
598 mmcr[2] = mmcra;
599 return 0;
600}
601
602static void power5p_disable_pmc(unsigned int pmc, u64 mmcr[])
603{
604 if (pmc <= 3)
605 mmcr[1] &= ~(0x7fUL << MMCR1_PMCSEL_SH(pmc));
606}
607
608static int power5p_generic_events[] = {
609 [PERF_COUNT_CPU_CYCLES] = 0xf,
610 [PERF_COUNT_INSTRUCTIONS] = 0x100009,
611 [PERF_COUNT_CACHE_REFERENCES] = 0x1c10a8, /* LD_REF_L1 */
612 [PERF_COUNT_CACHE_MISSES] = 0x3c1088, /* LD_MISS_L1 */
613 [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x230e4, /* BR_ISSUED */
614 [PERF_COUNT_BRANCH_MISSES] = 0x230e5, /* BR_MPRED_CR */
615};
616
617struct power_pmu power5p_pmu = {
618 .n_counter = 6,
619 .max_alternatives = MAX_ALT,
620 .add_fields = 0x7000000000055ull,
621 .test_adder = 0x3000040000000ull,
622 .compute_mmcr = power5p_compute_mmcr,
623 .get_constraint = power5p_get_constraint,
624 .get_alternatives = power5p_get_alternatives,
625 .disable_pmc = power5p_disable_pmc,
626 .n_generic = ARRAY_SIZE(power5p_generic_events),
627 .generic_events = power5p_generic_events,
628 .flags = PPMU_LIMITED_PMC5_6,
629 .limited_pmc_event = power5p_limited_pmc_event,
630};
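
The POWER5+ event codes used throughout this file pack their routing information into fixed fields (PMC number, unit, event-bus byte, PMCSEL) via the #defines at the top of the file. A small sketch (illustration only) decoding one code from the tables above, 0x600005, the PM_RUN_CYC alternative that power5p_limited_pmc_event() classifies as a limited-PMC event:

    /* Sketch: unpacking a POWER5+ event code with the field definitions
     * from the top of this file. */
    #include <stdio.h>

    #define PM_PMC_SH       20
    #define PM_PMC_MSK      0xf
    #define PM_UNIT_SH      16
    #define PM_UNIT_MSK     0xf
    #define PM_BYTE_SH      12
    #define PM_BYTE_MSK     7
    #define PM_BUSEVENT_MSK 0x80
    #define PM_PMCSEL_MSK   0x7f

    int main(void)
    {
            unsigned long long ev = 0x600005;  /* PM_RUN_CYC alternative, see tables above */

            printf("pmc=%llu unit=%llu byte=%llu psel=0x%llx bus_event=%d\n",
                   (ev >> PM_PMC_SH) & PM_PMC_MSK,
                   (ev >> PM_UNIT_SH) & PM_UNIT_MSK,
                   (ev >> PM_BYTE_SH) & PM_BYTE_MSK,
                   ev & PM_PMCSEL_MSK,
                   (int)!!(ev & PM_BUSEVENT_MSK));
            /* prints: pmc=6 unit=0 byte=0 psel=0x5 bus_event=0, i.e. a direct
             * event on limited counter PMC6. */
            return 0;
    }
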
diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c
new file mode 100644
index 000000000000..d5344968ee9c
--- /dev/null
+++ b/arch/powerpc/kernel/power5-pmu.c
@@ -0,0 +1,570 @@
1/*
2 * Performance counter support for POWER5 (not POWER5++) processors.
3 *
4 * Copyright 2009 Paul Mackerras, IBM Corporation.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/kernel.h>
12#include <linux/perf_counter.h>
13#include <asm/reg.h>
14
15/*
16 * Bits in event code for POWER5 (not POWER5++)
17 */
18#define PM_PMC_SH 20 /* PMC number (1-based) for direct events */
19#define PM_PMC_MSK 0xf
20#define PM_PMC_MSKS (PM_PMC_MSK << PM_PMC_SH)
21#define PM_UNIT_SH 16 /* TTMMUX number and setting - unit select */
22#define PM_UNIT_MSK 0xf
23#define PM_BYTE_SH 12 /* Byte number of event bus to use */
24#define PM_BYTE_MSK 7
25#define PM_GRS_SH 8 /* Storage subsystem mux select */
26#define PM_GRS_MSK 7
27#define PM_BUSEVENT_MSK 0x80 /* Set if event uses event bus */
28#define PM_PMCSEL_MSK 0x7f
29
30/* Values in PM_UNIT field */
31#define PM_FPU 0
32#define PM_ISU0 1
33#define PM_IFU 2
34#define PM_ISU1 3
35#define PM_IDU 4
36#define PM_ISU0_ALT 6
37#define PM_GRS 7
38#define PM_LSU0 8
39#define PM_LSU1 0xc
40#define PM_LASTUNIT 0xc
41
42/*
43 * Bits in MMCR1 for POWER5
44 */
45#define MMCR1_TTM0SEL_SH 62
46#define MMCR1_TTM1SEL_SH 60
47#define MMCR1_TTM2SEL_SH 58
48#define MMCR1_TTM3SEL_SH 56
49#define MMCR1_TTMSEL_MSK 3
50#define MMCR1_TD_CP_DBG0SEL_SH 54
51#define MMCR1_TD_CP_DBG1SEL_SH 52
52#define MMCR1_TD_CP_DBG2SEL_SH 50
53#define MMCR1_TD_CP_DBG3SEL_SH 48
54#define MMCR1_GRS_L2SEL_SH 46
55#define MMCR1_GRS_L2SEL_MSK 3
56#define MMCR1_GRS_L3SEL_SH 44
57#define MMCR1_GRS_L3SEL_MSK 3
58#define MMCR1_GRS_MCSEL_SH 41
59#define MMCR1_GRS_MCSEL_MSK 7
60#define MMCR1_GRS_FABSEL_SH 39
61#define MMCR1_GRS_FABSEL_MSK 3
62#define MMCR1_PMC1_ADDER_SEL_SH 35
63#define MMCR1_PMC2_ADDER_SEL_SH 34
64#define MMCR1_PMC3_ADDER_SEL_SH 33
65#define MMCR1_PMC4_ADDER_SEL_SH 32
66#define MMCR1_PMC1SEL_SH 25
67#define MMCR1_PMC2SEL_SH 17
68#define MMCR1_PMC3SEL_SH 9
69#define MMCR1_PMC4SEL_SH 1
70#define MMCR1_PMCSEL_SH(n) (MMCR1_PMC1SEL_SH - (n) * 8)
71#define MMCR1_PMCSEL_MSK 0x7f
72
73/*
74 * Bits in MMCRA
75 */
76
77/*
78 * Layout of constraint bits:
79 * 6666555555555544444444443333333333222222222211111111110000000000
80 * 3210987654321098765432109876543210987654321098765432109876543210
81 * <><>[ ><><>< ><> [ >[ >[ >< >< >< >< ><><><><><><>
82 * T0T1 NC G0G1G2 G3 UC PS1PS2 B0 B1 B2 B3 P6P5P4P3P2P1
83 *
84 * T0 - TTM0 constraint
85 * 54-55: TTM0SEL value (0=FPU, 2=IFU, 3=ISU1) 0xc0_0000_0000_0000
86 *
87 * T1 - TTM1 constraint
88 * 52-53: TTM1SEL value (0=IDU, 3=GRS) 0x30_0000_0000_0000
89 *
90 * NC - number of counters
91 * 51: NC error 0x0008_0000_0000_0000
92 * 48-50: number of events needing PMC1-4 0x0007_0000_0000_0000
93 *
94 * G0..G3 - GRS mux constraints
95 * 46-47: GRS_L2SEL value
96 * 44-45: GRS_L3SEL value
97 * 41-43: GRS_MCSEL value
98 * 39-40: GRS_FABSEL value
99 * Note that these match up with their bit positions in MMCR1
100 *
101 * UC - unit constraint: can't have all three of FPU|IFU|ISU1, ISU0, IDU|GRS
102 * 37: UC3 error 0x20_0000_0000
103 * 36: FPU|IFU|ISU1 events needed 0x10_0000_0000
104 * 35: ISU0 events needed 0x08_0000_0000
105 * 34: IDU|GRS events needed 0x04_0000_0000
106 *
107 * PS1
108 * 33: PS1 error 0x2_0000_0000
109 * 31-32: count of events needing PMC1/2 0x1_8000_0000
110 *
111 * PS2
112 * 30: PS2 error 0x4000_0000
113 * 28-29: count of events needing PMC3/4 0x3000_0000
114 *
115 * B0
116 * 24-27: Byte 0 event source 0x0f00_0000
117 * Encoding as for the event code
118 *
119 * B1, B2, B3
120 * 20-23, 16-19, 12-15: Byte 1, 2, 3 event sources
121 *
122 * P1..P6
123 * 0-11: Count of events needing PMC1..PMC6
124 */
125
126static const int grsel_shift[8] = {
127 MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH,
128 MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH,
129 MMCR1_GRS_MCSEL_SH, MMCR1_GRS_FABSEL_SH
130};
131
132/* Masks and values for using events from the various units */
133static u64 unit_cons[PM_LASTUNIT+1][2] = {
134 [PM_FPU] = { 0xc0002000000000ull, 0x00001000000000ull },
135 [PM_ISU0] = { 0x00002000000000ull, 0x00000800000000ull },
136 [PM_ISU1] = { 0xc0002000000000ull, 0xc0001000000000ull },
137 [PM_IFU] = { 0xc0002000000000ull, 0x80001000000000ull },
138 [PM_IDU] = { 0x30002000000000ull, 0x00000400000000ull },
139 [PM_GRS] = { 0x30002000000000ull, 0x30000400000000ull },
140};
141
142static int power5_get_constraint(u64 event, u64 *maskp, u64 *valp)
143{
144 int pmc, byte, unit, sh;
145 int bit, fmask;
146 u64 mask = 0, value = 0;
147 int grp = -1;
148
149 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
150 if (pmc) {
151 if (pmc > 6)
152 return -1;
153 sh = (pmc - 1) * 2;
154 mask |= 2 << sh;
155 value |= 1 << sh;
156 if (pmc <= 4)
157 grp = (pmc - 1) >> 1;
158 else if (event != 0x500009 && event != 0x600005)
159 return -1;
160 }
161 if (event & PM_BUSEVENT_MSK) {
162 unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
163 if (unit > PM_LASTUNIT)
164 return -1;
165 if (unit == PM_ISU0_ALT)
166 unit = PM_ISU0;
167 mask |= unit_cons[unit][0];
168 value |= unit_cons[unit][1];
169 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
170 if (byte >= 4) {
171 if (unit != PM_LSU1)
172 return -1;
173 /* Map LSU1 low word (bytes 4-7) to unit LSU1+1 */
174 ++unit;
175 byte &= 3;
176 }
177 if (unit == PM_GRS) {
178 bit = event & 7;
179 fmask = (bit == 6)? 7: 3;
180 sh = grsel_shift[bit];
181 mask |= (u64)fmask << sh;
182 value |= (u64)((event >> PM_GRS_SH) & fmask) << sh;
183 }
184 /*
185 * Bus events on bytes 0 and 2 can be counted
186 * on PMC1/2; bytes 1 and 3 on PMC3/4.
187 */
188 if (!pmc)
189 grp = byte & 1;
190 /* Set byte lane select field */
191 mask |= 0xfULL << (24 - 4 * byte);
192 value |= (u64)unit << (24 - 4 * byte);
193 }
194 if (grp == 0) {
195 /* increment PMC1/2 field */
196 mask |= 0x200000000ull;
197 value |= 0x080000000ull;
198 } else if (grp == 1) {
199 /* increment PMC3/4 field */
200 mask |= 0x40000000ull;
201 value |= 0x10000000ull;
202 }
203 if (pmc < 5) {
204 /* need a counter from PMC1-4 set */
205 mask |= 0x8000000000000ull;
206 value |= 0x1000000000000ull;
207 }
208 *maskp = mask;
209 *valp = value;
210 return 0;
211}
212
213#define MAX_ALT 3 /* at most 3 alternatives for any event */
214
215static const unsigned int event_alternatives[][MAX_ALT] = {
216 { 0x120e4, 0x400002 }, /* PM_GRP_DISP_REJECT */
217 { 0x410c7, 0x441084 }, /* PM_THRD_L2MISS_BOTH_CYC */
218 { 0x100005, 0x600005 }, /* PM_RUN_CYC */
219 { 0x100009, 0x200009, 0x500009 }, /* PM_INST_CMPL */
220 { 0x300009, 0x400009 }, /* PM_INST_DISP */
221};
222
223/*
224 * Scan the alternatives table for a match and return the
225 * index into the alternatives table if found, else -1.
226 */
227static int find_alternative(u64 event)
228{
229 int i, j;
230
231 for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
232 if (event < event_alternatives[i][0])
233 break;
234 for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j)
235 if (event == event_alternatives[i][j])
236 return i;
237 }
238 return -1;
239}
240
241static const unsigned char bytedecode_alternatives[4][4] = {
242 /* PMC 1 */ { 0x21, 0x23, 0x25, 0x27 },
243 /* PMC 2 */ { 0x07, 0x17, 0x0e, 0x1e },
244 /* PMC 3 */ { 0x20, 0x22, 0x24, 0x26 },
245 /* PMC 4 */ { 0x07, 0x17, 0x0e, 0x1e }
246};
247
248/*
249 * Some direct events for decodes of event bus byte 3 have alternative
250 * PMCSEL values on other counters. This returns the alternative
251 * event code for those that do, or -1 otherwise.
252 */
253static u64 find_alternative_bdecode(u64 event)
254{
255 int pmc, altpmc, pp, j;
256
257 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
258 if (pmc == 0 || pmc > 4)
259 return -1;
260 altpmc = 5 - pmc; /* 1 <-> 4, 2 <-> 3 */
261 pp = event & PM_PMCSEL_MSK;
262 for (j = 0; j < 4; ++j) {
263 if (bytedecode_alternatives[pmc - 1][j] == pp) {
264 return (event & ~(PM_PMC_MSKS | PM_PMCSEL_MSK)) |
265 (altpmc << PM_PMC_SH) |
266 bytedecode_alternatives[altpmc - 1][j];
267 }
268 }
269 return -1;
270}
271
272static int power5_get_alternatives(u64 event, unsigned int flags, u64 alt[])
273{
274 int i, j, nalt = 1;
275 u64 ae;
276
277 alt[0] = event;
278 nalt = 1;
279 i = find_alternative(event);
280 if (i >= 0) {
281 for (j = 0; j < MAX_ALT; ++j) {
282 ae = event_alternatives[i][j];
283 if (ae && ae != event)
284 alt[nalt++] = ae;
285 }
286 } else {
287 ae = find_alternative_bdecode(event);
288 if (ae > 0)
289 alt[nalt++] = ae;
290 }
291 return nalt;
292}
293
294/*
295 * Map of which direct events on which PMCs are marked instruction events.
296 * Indexed by PMCSEL value, bit i (LE) set if PMC i is a marked event.
297 * Bit 0 is set if it is marked for all PMCs.
298 * The 0x80 bit indicates a byte decode PMCSEL value.
299 */
300static unsigned char direct_event_is_marked[0x28] = {
301 0, /* 00 */
302 0x1f, /* 01 PM_IOPS_CMPL */
303 0x2, /* 02 PM_MRK_GRP_DISP */
304 0xe, /* 03 PM_MRK_ST_CMPL, PM_MRK_ST_GPS, PM_MRK_ST_CMPL_INT */
305 0, /* 04 */
306 0x1c, /* 05 PM_MRK_BRU_FIN, PM_MRK_INST_FIN, PM_MRK_CRU_FIN */
307 0x80, /* 06 */
308 0x80, /* 07 */
309 0, 0, 0,/* 08 - 0a */
310 0x18, /* 0b PM_THRESH_TIMEO, PM_MRK_GRP_TIMEO */
311 0, /* 0c */
312 0x80, /* 0d */
313 0x80, /* 0e */
314 0, /* 0f */
315 0, /* 10 */
316 0x14, /* 11 PM_MRK_GRP_BR_REDIR, PM_MRK_GRP_IC_MISS */
317 0, /* 12 */
318 0x10, /* 13 PM_MRK_GRP_CMPL */
319 0x1f, /* 14 PM_GRP_MRK, PM_MRK_{FXU,FPU,LSU}_FIN */
320 0x2, /* 15 PM_MRK_GRP_ISSUED */
321 0x80, /* 16 */
322 0x80, /* 17 */
323 0, 0, 0, 0, 0,
324 0x80, /* 1d */
325 0x80, /* 1e */
326 0, /* 1f */
327 0x80, /* 20 */
328 0x80, /* 21 */
329 0x80, /* 22 */
330 0x80, /* 23 */
331 0x80, /* 24 */
332 0x80, /* 25 */
333 0x80, /* 26 */
334 0x80, /* 27 */
335};
336
337/*
338 * Returns 1 if event counts things relating to marked instructions
339 * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
340 */
341static int power5_marked_instr_event(u64 event)
342{
343 int pmc, psel;
344 int bit, byte, unit;
345 u32 mask;
346
347 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
348 psel = event & PM_PMCSEL_MSK;
349 if (pmc >= 5)
350 return 0;
351
352 bit = -1;
353 if (psel < sizeof(direct_event_is_marked)) {
354 if (direct_event_is_marked[psel] & (1 << pmc))
355 return 1;
356 if (direct_event_is_marked[psel] & 0x80)
357 bit = 4;
358 else if (psel == 0x08)
359 bit = pmc - 1;
360 else if (psel == 0x10)
361 bit = 4 - pmc;
362 else if (psel == 0x1b && (pmc == 1 || pmc == 3))
363 bit = 4;
364 } else if ((psel & 0x58) == 0x40)
365 bit = psel & 7;
366
367 if (!(event & PM_BUSEVENT_MSK))
368 return 0;
369
370 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
371 unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
372 if (unit == PM_LSU0) {
373 /* byte 1 bits 0-7, byte 2 bits 0,2-4,6 */
374 mask = 0x5dff00;
375 } else if (unit == PM_LSU1 && byte >= 4) {
376 byte -= 4;
377 /* byte 4 bits 1,3,5,7, byte 5 bits 6-7, byte 7 bits 0-4,6 */
378 mask = 0x5f00c0aa;
379 } else
380 return 0;
381
382 return (mask >> (byte * 8 + bit)) & 1;
383}
384
385static int power5_compute_mmcr(u64 event[], int n_ev,
386 unsigned int hwc[], u64 mmcr[])
387{
388 u64 mmcr1 = 0;
389 u64 mmcra = 0;
390 unsigned int pmc, unit, byte, psel;
391 unsigned int ttm, grp;
392 int i, isbus, bit, grsel;
393 unsigned int pmc_inuse = 0;
394 unsigned int pmc_grp_use[2];
395 unsigned char busbyte[4];
396 unsigned char unituse[16];
397 int ttmuse;
398
399 if (n_ev > 6)
400 return -1;
401
402 /* First pass to count resource use */
403 pmc_grp_use[0] = pmc_grp_use[1] = 0;
404 memset(busbyte, 0, sizeof(busbyte));
405 memset(unituse, 0, sizeof(unituse));
406 for (i = 0; i < n_ev; ++i) {
407 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
408 if (pmc) {
409 if (pmc > 6)
410 return -1;
411 if (pmc_inuse & (1 << (pmc - 1)))
412 return -1;
413 pmc_inuse |= 1 << (pmc - 1);
414 /* count 1/2 vs 3/4 use */
415 if (pmc <= 4)
416 ++pmc_grp_use[(pmc - 1) >> 1];
417 }
418 if (event[i] & PM_BUSEVENT_MSK) {
419 unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
420 byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
421 if (unit > PM_LASTUNIT)
422 return -1;
423 if (unit == PM_ISU0_ALT)
424 unit = PM_ISU0;
425 if (byte >= 4) {
426 if (unit != PM_LSU1)
427 return -1;
428 ++unit;
429 byte &= 3;
430 }
431 if (!pmc)
432 ++pmc_grp_use[byte & 1];
433 if (busbyte[byte] && busbyte[byte] != unit)
434 return -1;
435 busbyte[byte] = unit;
436 unituse[unit] = 1;
437 }
438 }
439 if (pmc_grp_use[0] > 2 || pmc_grp_use[1] > 2)
440 return -1;
441
442 /*
443 * Assign resources and set multiplexer selects.
444 *
445 * PM_ISU0 can go either on TTM0 or TTM1, but that's the only
446 * choice we have to deal with.
447 */
448 if (unituse[PM_ISU0] &
449 (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_ISU1])) {
450 unituse[PM_ISU0_ALT] = 1; /* move ISU to TTM1 */
451 unituse[PM_ISU0] = 0;
452 }
453 /* Set TTM[01]SEL fields. */
454 ttmuse = 0;
455 for (i = PM_FPU; i <= PM_ISU1; ++i) {
456 if (!unituse[i])
457 continue;
458 if (ttmuse++)
459 return -1;
460 mmcr1 |= (u64)i << MMCR1_TTM0SEL_SH;
461 }
462 ttmuse = 0;
463 for (; i <= PM_GRS; ++i) {
464 if (!unituse[i])
465 continue;
466 if (ttmuse++)
467 return -1;
468 mmcr1 |= (u64)(i & 3) << MMCR1_TTM1SEL_SH;
469 }
470 if (ttmuse > 1)
471 return -1;
472
473 /* Set byte lane select fields, TTM[23]SEL and GRS_*SEL. */
474 for (byte = 0; byte < 4; ++byte) {
475 unit = busbyte[byte];
476 if (!unit)
477 continue;
478 if (unit == PM_ISU0 && unituse[PM_ISU0_ALT]) {
479 /* get ISU0 through TTM1 rather than TTM0 */
480 unit = PM_ISU0_ALT;
481 } else if (unit == PM_LSU1 + 1) {
482 /* select lower word of LSU1 for this byte */
483 mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte);
484 }
485 ttm = unit >> 2;
486 mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte);
487 }
488
489 /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
490 for (i = 0; i < n_ev; ++i) {
491 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
492 unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
493 byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
494 psel = event[i] & PM_PMCSEL_MSK;
495 isbus = event[i] & PM_BUSEVENT_MSK;
496 if (!pmc) {
497 /* Bus event or any-PMC direct event */
498 for (pmc = 0; pmc < 4; ++pmc) {
499 if (pmc_inuse & (1 << pmc))
500 continue;
501 grp = (pmc >> 1) & 1;
502 if (isbus) {
503 if (grp == (byte & 1))
504 break;
505 } else if (pmc_grp_use[grp] < 2) {
506 ++pmc_grp_use[grp];
507 break;
508 }
509 }
510 pmc_inuse |= 1 << pmc;
511 } else if (pmc <= 4) {
512 /* Direct event */
513 --pmc;
514 if ((psel == 8 || psel == 0x10) && isbus && (byte & 2))
515 /* add events on higher-numbered bus */
516 mmcr1 |= 1ull << (MMCR1_PMC1_ADDER_SEL_SH - pmc);
517 } else {
518 /* Instructions or run cycles on PMC5/6 */
519 --pmc;
520 }
521 if (isbus && unit == PM_GRS) {
522 bit = psel & 7;
523 grsel = (event[i] >> PM_GRS_SH) & PM_GRS_MSK;
524 mmcr1 |= (u64)grsel << grsel_shift[bit];
525 }
526 if (power5_marked_instr_event(event[i]))
527 mmcra |= MMCRA_SAMPLE_ENABLE;
528 if (pmc <= 3)
529 mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc);
530 hwc[i] = pmc;
531 }
532
533 /* Return MMCRx values */
534 mmcr[0] = 0;
535 if (pmc_inuse & 1)
536 mmcr[0] = MMCR0_PMC1CE;
537 if (pmc_inuse & 0x3e)
538 mmcr[0] |= MMCR0_PMCjCE;
539 mmcr[1] = mmcr1;
540 mmcr[2] = mmcra;
541 return 0;
542}
543
544static void power5_disable_pmc(unsigned int pmc, u64 mmcr[])
545{
546 if (pmc <= 3)
547 mmcr[1] &= ~(0x7fUL << MMCR1_PMCSEL_SH(pmc));
548}
549
550static int power5_generic_events[] = {
551 [PERF_COUNT_CPU_CYCLES] = 0xf,
552 [PERF_COUNT_INSTRUCTIONS] = 0x100009,
553 [PERF_COUNT_CACHE_REFERENCES] = 0x4c1090, /* LD_REF_L1 */
554 [PERF_COUNT_CACHE_MISSES] = 0x3c1088, /* LD_MISS_L1 */
555 [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x230e4, /* BR_ISSUED */
556 [PERF_COUNT_BRANCH_MISSES] = 0x230e5, /* BR_MPRED_CR */
557};
558
559struct power_pmu power5_pmu = {
560 .n_counter = 6,
561 .max_alternatives = MAX_ALT,
562 .add_fields = 0x7000090000555ull,
563 .test_adder = 0x3000490000000ull,
564 .compute_mmcr = power5_compute_mmcr,
565 .get_constraint = power5_get_constraint,
566 .get_alternatives = power5_get_alternatives,
567 .disable_pmc = power5_disable_pmc,
568 .n_generic = ARRAY_SIZE(power5_generic_events),
569 .generic_events = power5_generic_events,
570};
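
power5_compute_mmcr() and power5_disable_pmc() both rely on MMCR1_PMCSEL_SH(n) placing the 7-bit PMCSEL field for 0-based counter n at shift 25 - 8*n within the mmcr[1] word (PMC1SEL at 25, PMC2SEL at 17, PMC3SEL at 9, PMC4SEL at 1). A short sketch (illustration only; psel value chosen arbitrarily) of setting and then clearing one of those fields:

    /* Sketch of PMCSEL field placement in mmcr[1], using the shifts above. */
    #include <stdio.h>

    #define MMCR1_PMC1SEL_SH        25
    #define MMCR1_PMCSEL_SH(n)      (MMCR1_PMC1SEL_SH - (n) * 8)
    #define MMCR1_PMCSEL_MSK        0x7f

    int main(void)
    {
            unsigned long long mmcr1 = 0;
            unsigned int pmc = 2;           /* 0-based, i.e. PMC3 */
            unsigned int psel = 0x09;       /* example PMCSEL value */

            mmcr1 |= (unsigned long long)psel << MMCR1_PMCSEL_SH(pmc);
            printf("set:   mmcr1=0x%016llx\n", mmcr1);      /* 0x09 << 9 = 0x1200 */

            mmcr1 &= ~((unsigned long long)MMCR1_PMCSEL_MSK << MMCR1_PMCSEL_SH(pmc));
            printf("clear: mmcr1=0x%016llx\n", mmcr1);      /* back to 0 */
            return 0;
    }
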
diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c
new file mode 100644
index 000000000000..cd4fbe06c35d
--- /dev/null
+++ b/arch/powerpc/kernel/power6-pmu.c
@@ -0,0 +1,490 @@
1/*
2 * Performance counter support for POWER6 processors.
3 *
4 * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/kernel.h>
12#include <linux/perf_counter.h>
13#include <asm/reg.h>
14
15/*
16 * Bits in event code for POWER6
17 */
18#define PM_PMC_SH 20 /* PMC number (1-based) for direct events */
19#define PM_PMC_MSK 0x7
20#define PM_PMC_MSKS (PM_PMC_MSK << PM_PMC_SH)
21#define PM_UNIT_SH 16 /* Unit event comes from (TTMxSEL encoding) */
22#define PM_UNIT_MSK 0xf
23#define PM_UNIT_MSKS (PM_UNIT_MSK << PM_UNIT_SH)
24#define PM_LLAV 0x8000 /* Load lookahead match value */
25#define PM_LLA 0x4000 /* Load lookahead match enable */
26#define PM_BYTE_SH 12 /* Byte of event bus to use */
27#define PM_BYTE_MSK 3
28#define PM_SUBUNIT_SH 8 /* Subunit event comes from (NEST_SEL enc.) */
29#define PM_SUBUNIT_MSK 7
30#define PM_SUBUNIT_MSKS (PM_SUBUNIT_MSK << PM_SUBUNIT_SH)
31#define PM_PMCSEL_MSK 0xff /* PMCxSEL value */
32#define PM_BUSEVENT_MSK 0xf3700
33
34/*
35 * Bits in MMCR1 for POWER6
36 */
37#define MMCR1_TTM0SEL_SH 60
38#define MMCR1_TTMSEL_SH(n) (MMCR1_TTM0SEL_SH - (n) * 4)
39#define MMCR1_TTMSEL_MSK 0xf
40#define MMCR1_TTMSEL(m, n) (((m) >> MMCR1_TTMSEL_SH(n)) & MMCR1_TTMSEL_MSK)
41#define MMCR1_NESTSEL_SH 45
42#define MMCR1_NESTSEL_MSK 0x7
43#define MMCR1_NESTSEL(m) (((m) >> MMCR1_NESTSEL_SH) & MMCR1_NESTSEL_MSK)
44#define MMCR1_PMC1_LLA ((u64)1 << 44)
45#define MMCR1_PMC1_LLA_VALUE ((u64)1 << 39)
46#define MMCR1_PMC1_ADDR_SEL ((u64)1 << 35)
47#define MMCR1_PMC1SEL_SH 24
48#define MMCR1_PMCSEL_SH(n) (MMCR1_PMC1SEL_SH - (n) * 8)
49#define MMCR1_PMCSEL_MSK 0xff
50
51/*
52 * Map of which direct events on which PMCs are marked instruction events.
53 * Indexed by PMCSEL value >> 1.
54 * Bottom 4 bits are a map of which PMCs are interesting,
55 * top 4 bits say what sort of event:
56 * 0 = direct marked event,
57 * 1 = byte decode event,
58 * 4 = add/and event (PMC1 -> bits 0 & 4),
59 * 5 = add/and event (PMC1 -> bits 1 & 5),
60 * 6 = add/and event (PMC1 -> bits 2 & 6),
61 * 7 = add/and event (PMC1 -> bits 3 & 7).
62 */
63static unsigned char direct_event_is_marked[0x60 >> 1] = {
64 0, /* 00 */
65 0, /* 02 */
66 0, /* 04 */
67 0x07, /* 06 PM_MRK_ST_CMPL, PM_MRK_ST_GPS, PM_MRK_ST_CMPL_INT */
68 0x04, /* 08 PM_MRK_DFU_FIN */
69 0x06, /* 0a PM_MRK_IFU_FIN, PM_MRK_INST_FIN */
70 0, /* 0c */
71 0, /* 0e */
72 0x02, /* 10 PM_MRK_INST_DISP */
73 0x08, /* 12 PM_MRK_LSU_DERAT_MISS */
74 0, /* 14 */
75 0, /* 16 */
76 0x0c, /* 18 PM_THRESH_TIMEO, PM_MRK_INST_FIN */
77 0x0f, /* 1a PM_MRK_INST_DISP, PM_MRK_{FXU,FPU,LSU}_FIN */
78 0x01, /* 1c PM_MRK_INST_ISSUED */
79 0, /* 1e */
80 0, /* 20 */
81 0, /* 22 */
82 0, /* 24 */
83 0, /* 26 */
84 0x15, /* 28 PM_MRK_DATA_FROM_L2MISS, PM_MRK_DATA_FROM_L3MISS */
85 0, /* 2a */
86 0, /* 2c */
87 0, /* 2e */
88 0x4f, /* 30 */
89 0x7f, /* 32 */
90 0x4f, /* 34 */
91 0x5f, /* 36 */
92 0x6f, /* 38 */
93 0x4f, /* 3a */
94 0, /* 3c */
95 0x08, /* 3e PM_MRK_INST_TIMEO */
96 0x1f, /* 40 */
97 0x1f, /* 42 */
98 0x1f, /* 44 */
99 0x1f, /* 46 */
100 0x1f, /* 48 */
101 0x1f, /* 4a */
102 0x1f, /* 4c */
103 0x1f, /* 4e */
104 0, /* 50 */
105 0x05, /* 52 PM_MRK_BR_TAKEN, PM_MRK_BR_MPRED */
106 0x1c, /* 54 PM_MRK_PTEG_FROM_L3MISS, PM_MRK_PTEG_FROM_L2MISS */
107 0x02, /* 56 PM_MRK_LD_MISS_L1 */
108 0, /* 58 */
109 0, /* 5a */
110 0, /* 5c */
111 0, /* 5e */
112};
113
114/*
115 * Masks showing for each unit which bits are marked events.
116 * These masks are in LE order, i.e. 0x00000001 is byte 0, bit 0.
117 */
118static u32 marked_bus_events[16] = {
119 0x01000000, /* direct events set 1: byte 3 bit 0 */
120 0x00010000, /* direct events set 2: byte 2 bit 0 */
121 0, 0, 0, 0, /* IDU, IFU, nest: nothing */
122 0x00000088, /* VMX set 1: byte 0 bits 3, 7 */
123 0x000000c0, /* VMX set 2: byte 0 bits 6, 7 */
124 0x04010000, /* LSU set 1: byte 2 bit 0, byte 3 bit 2 */
125 0xff010000u, /* LSU set 2: byte 2 bit 0, all of byte 3 */
126 0, /* LSU set 3 */
127 0x00000010, /* VMX set 3: byte 0 bit 4 */
128 0, /* BFP set 1 */
129 0x00000022, /* BFP set 2: byte 0 bits 1, 5 */
130 0, 0
131};
132
133/*
134 * Returns 1 if event counts things relating to marked instructions
135 * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
136 */
137static int power6_marked_instr_event(u64 event)
138{
139 int pmc, psel, ptype;
140 int bit, byte, unit;
141 u32 mask;
142
143 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
144 psel = (event & PM_PMCSEL_MSK) >> 1; /* drop edge/level bit */
145 if (pmc >= 5)
146 return 0;
147
148 bit = -1;
149 if (psel < sizeof(direct_event_is_marked)) {
150 ptype = direct_event_is_marked[psel];
151 if (pmc == 0 || !(ptype & (1 << (pmc - 1))))
152 return 0;
153 ptype >>= 4;
154 if (ptype == 0)
155 return 1;
156 if (ptype == 1)
157 bit = 0;
158 else
159 bit = ptype ^ (pmc - 1);
160 } else if ((psel & 0x48) == 0x40)
161 bit = psel & 7;
162
163 if (!(event & PM_BUSEVENT_MSK) || bit == -1)
164 return 0;
165
166 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
167 unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
168 mask = marked_bus_events[unit];
169 return (mask >> (byte * 8 + bit)) & 1;
170}
171
172/*
173 * Assign PMC numbers and compute MMCR1 value for a set of events
174 */
175static int p6_compute_mmcr(u64 event[], int n_ev,
176 unsigned int hwc[], u64 mmcr[])
177{
178 u64 mmcr1 = 0;
179 u64 mmcra = 0;
180 int i;
181 unsigned int pmc, ev, b, u, s, psel;
182 unsigned int ttmset = 0;
183 unsigned int pmc_inuse = 0;
184
185 if (n_ev > 6)
186 return -1;
187 for (i = 0; i < n_ev; ++i) {
188 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
189 if (pmc) {
190 if (pmc_inuse & (1 << (pmc - 1)))
191 return -1; /* collision! */
192 pmc_inuse |= 1 << (pmc - 1);
193 }
194 }
195 for (i = 0; i < n_ev; ++i) {
196 ev = event[i];
197 pmc = (ev >> PM_PMC_SH) & PM_PMC_MSK;
198 if (pmc) {
199 --pmc;
200 } else {
201 /* can go on any PMC; find a free one */
202 for (pmc = 0; pmc < 4; ++pmc)
203 if (!(pmc_inuse & (1 << pmc)))
204 break;
205 if (pmc >= 4)
206 return -1;
207 pmc_inuse |= 1 << pmc;
208 }
209 hwc[i] = pmc;
210 psel = ev & PM_PMCSEL_MSK;
211 if (ev & PM_BUSEVENT_MSK) {
212 /* this event uses the event bus */
213 b = (ev >> PM_BYTE_SH) & PM_BYTE_MSK;
214 u = (ev >> PM_UNIT_SH) & PM_UNIT_MSK;
215 /* check for conflict on this byte of event bus */
216 if ((ttmset & (1 << b)) && MMCR1_TTMSEL(mmcr1, b) != u)
217 return -1;
218 mmcr1 |= (u64)u << MMCR1_TTMSEL_SH(b);
219 ttmset |= 1 << b;
220 if (u == 5) {
221 /* Nest events have a further mux */
222 s = (ev >> PM_SUBUNIT_SH) & PM_SUBUNIT_MSK;
223 if ((ttmset & 0x10) &&
224 MMCR1_NESTSEL(mmcr1) != s)
225 return -1;
226 ttmset |= 0x10;
227 mmcr1 |= (u64)s << MMCR1_NESTSEL_SH;
228 }
229 if (0x30 <= psel && psel <= 0x3d) {
230 /* these need the PMCx_ADDR_SEL bits */
231 if (b >= 2)
232 mmcr1 |= MMCR1_PMC1_ADDR_SEL >> pmc;
233 }
234 /* bus select values are different for PMC3/4 */
235 if (pmc >= 2 && (psel & 0x90) == 0x80)
236 psel ^= 0x20;
237 }
238 if (ev & PM_LLA) {
239 mmcr1 |= MMCR1_PMC1_LLA >> pmc;
240 if (ev & PM_LLAV)
241 mmcr1 |= MMCR1_PMC1_LLA_VALUE >> pmc;
242 }
243 if (power6_marked_instr_event(event[i]))
244 mmcra |= MMCRA_SAMPLE_ENABLE;
245 if (pmc < 4)
246 mmcr1 |= (u64)psel << MMCR1_PMCSEL_SH(pmc);
247 }
248 mmcr[0] = 0;
249 if (pmc_inuse & 1)
250 mmcr[0] = MMCR0_PMC1CE;
251 if (pmc_inuse & 0xe)
252 mmcr[0] |= MMCR0_PMCjCE;
253 mmcr[1] = mmcr1;
254 mmcr[2] = mmcra;
255 return 0;
256}
257
258/*
259 * Layout of constraint bits:
260 *
261 * 0-1 add field: number of uses of PMC1 (max 1)
262 * 2-3, 4-5, 6-7, 8-9, 10-11: ditto for PMC2, 3, 4, 5, 6
263 * 12-15 add field: number of uses of PMC1-4 (max 4)
264 * 16-19 select field: unit on byte 0 of event bus
265 * 20-23, 24-27, 28-31 ditto for bytes 1, 2, 3
266 * 32-34 select field: nest (subunit) event selector
267 */
268static int p6_get_constraint(u64 event, u64 *maskp, u64 *valp)
269{
270 int pmc, byte, sh, subunit;
271 u64 mask = 0, value = 0;
272
273 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
274 if (pmc) {
275 if (pmc > 4 && !(event == 0x500009 || event == 0x600005))
276 return -1;
277 sh = (pmc - 1) * 2;
278 mask |= 2 << sh;
279 value |= 1 << sh;
280 }
281 if (event & PM_BUSEVENT_MSK) {
282 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
283 sh = byte * 4 + (16 - PM_UNIT_SH);
284 mask |= PM_UNIT_MSKS << sh;
285 value |= (u64)(event & PM_UNIT_MSKS) << sh;
286 if ((event & PM_UNIT_MSKS) == (5 << PM_UNIT_SH)) {
287 subunit = (event >> PM_SUBUNIT_SH) & PM_SUBUNIT_MSK;
288 mask |= (u64)PM_SUBUNIT_MSK << 32;
289 value |= (u64)subunit << 32;
290 }
291 }
292 if (pmc <= 4) {
293 mask |= 0x8000; /* add field for count of PMC1-4 uses */
294 value |= 0x1000;
295 }
296 *maskp = mask;
297 *valp = value;
298 return 0;
299}
300
301static int p6_limited_pmc_event(u64 event)
302{
303 int pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
304
305 return pmc == 5 || pmc == 6;
306}
307
308#define MAX_ALT 4 /* at most 4 alternatives for any event */
309
310static const unsigned int event_alternatives[][MAX_ALT] = {
311 { 0x0130e8, 0x2000f6, 0x3000fc }, /* PM_PTEG_RELOAD_VALID */
312 { 0x080080, 0x10000d, 0x30000c, 0x4000f0 }, /* PM_LD_MISS_L1 */
313 { 0x080088, 0x200054, 0x3000f0 }, /* PM_ST_MISS_L1 */
314 { 0x10000a, 0x2000f4, 0x600005 }, /* PM_RUN_CYC */
315 { 0x10000b, 0x2000f5 }, /* PM_RUN_COUNT */
316 { 0x10000e, 0x400010 }, /* PM_PURR */
317 { 0x100010, 0x4000f8 }, /* PM_FLUSH */
318 { 0x10001a, 0x200010 }, /* PM_MRK_INST_DISP */
319 { 0x100026, 0x3000f8 }, /* PM_TB_BIT_TRANS */
320 { 0x100054, 0x2000f0 }, /* PM_ST_FIN */
321 { 0x100056, 0x2000fc }, /* PM_L1_ICACHE_MISS */
322 { 0x1000f0, 0x40000a }, /* PM_INST_IMC_MATCH_CMPL */
323 { 0x1000f8, 0x200008 }, /* PM_GCT_EMPTY_CYC */
324 { 0x1000fc, 0x400006 }, /* PM_LSU_DERAT_MISS_CYC */
325 { 0x20000e, 0x400007 }, /* PM_LSU_DERAT_MISS */
326 { 0x200012, 0x300012 }, /* PM_INST_DISP */
327 { 0x2000f2, 0x3000f2 }, /* PM_INST_DISP */
328 { 0x2000f8, 0x300010 }, /* PM_EXT_INT */
329 { 0x2000fe, 0x300056 }, /* PM_DATA_FROM_L2MISS */
330 { 0x2d0030, 0x30001a }, /* PM_MRK_FPU_FIN */
331 { 0x30000a, 0x400018 }, /* PM_MRK_INST_FIN */
332 { 0x3000f6, 0x40000e }, /* PM_L1_DCACHE_RELOAD_VALID */
333 { 0x3000fe, 0x400056 }, /* PM_DATA_FROM_L3MISS */
334};
335
336/*
337 * This could be made more efficient with a binary search on
338 * a presorted list, if necessary
339 */
340static int find_alternatives_list(u64 event)
341{
342 int i, j;
343 unsigned int alt;
344
345 for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
346 if (event < event_alternatives[i][0])
347 return -1;
348 for (j = 0; j < MAX_ALT; ++j) {
349 alt = event_alternatives[i][j];
350 if (!alt || event < alt)
351 break;
352 if (event == alt)
353 return i;
354 }
355 }
356 return -1;
357}
358
359static int p6_get_alternatives(u64 event, unsigned int flags, u64 alt[])
360{
361 int i, j, nlim;
362 unsigned int psel, pmc;
363 unsigned int nalt = 1;
364 u64 aevent;
365
366 alt[0] = event;
367 nlim = p6_limited_pmc_event(event);
368
369 /* check the alternatives table */
370 i = find_alternatives_list(event);
371 if (i >= 0) {
372 /* copy out alternatives from list */
373 for (j = 0; j < MAX_ALT; ++j) {
374 aevent = event_alternatives[i][j];
375 if (!aevent)
376 break;
377 if (aevent != event)
378 alt[nalt++] = aevent;
379 nlim += p6_limited_pmc_event(aevent);
380 }
381
382 } else {
383 /* Check for alternative ways of computing sum events */
384 /* PMCSEL 0x32 counter N == PMCSEL 0x34 counter 5-N */
385 psel = event & (PM_PMCSEL_MSK & ~1); /* ignore edge bit */
386 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
387 if (pmc && (psel == 0x32 || psel == 0x34))
388 alt[nalt++] = ((event ^ 0x6) & ~PM_PMC_MSKS) |
389 ((5 - pmc) << PM_PMC_SH);
390
391 /* PMCSEL 0x38 counter N == PMCSEL 0x3a counter N+/-2 */
392 if (pmc && (psel == 0x38 || psel == 0x3a))
393 alt[nalt++] = ((event ^ 0x2) & ~PM_PMC_MSKS) |
394 ((pmc > 2? pmc - 2: pmc + 2) << PM_PMC_SH);
395 }
396
397 if (flags & PPMU_ONLY_COUNT_RUN) {
398 /*
399 * We're only counting in RUN state,
400 * so PM_CYC is equivalent to PM_RUN_CYC,
401 * PM_INST_CMPL === PM_RUN_INST_CMPL, PM_PURR === PM_RUN_PURR.
402 * This doesn't include alternatives that don't provide
403 * any extra flexibility in assigning PMCs (e.g.
404 * 0x10000a for PM_RUN_CYC vs. 0x1e for PM_CYC).
405 * Note that even with these additional alternatives
406 * we never end up with more than 4 alternatives for any event.
407 */
408 j = nalt;
409 for (i = 0; i < nalt; ++i) {
410 switch (alt[i]) {
411 case 0x1e: /* PM_CYC */
412 alt[j++] = 0x600005; /* PM_RUN_CYC */
413 ++nlim;
414 break;
415 case 0x10000a: /* PM_RUN_CYC */
416 alt[j++] = 0x1e; /* PM_CYC */
417 break;
418 case 2: /* PM_INST_CMPL */
419 alt[j++] = 0x500009; /* PM_RUN_INST_CMPL */
420 ++nlim;
421 break;
422 case 0x500009: /* PM_RUN_INST_CMPL */
423 alt[j++] = 2; /* PM_INST_CMPL */
424 break;
425 case 0x10000e: /* PM_PURR */
426 alt[j++] = 0x4000f4; /* PM_RUN_PURR */
427 break;
428 case 0x4000f4: /* PM_RUN_PURR */
429 alt[j++] = 0x10000e; /* PM_PURR */
430 break;
431 }
432 }
433 nalt = j;
434 }
435
436 if (!(flags & PPMU_LIMITED_PMC_OK) && nlim) {
437 /* remove the limited PMC events */
438 j = 0;
439 for (i = 0; i < nalt; ++i) {
440 if (!p6_limited_pmc_event(alt[i])) {
441 alt[j] = alt[i];
442 ++j;
443 }
444 }
445 nalt = j;
446 } else if ((flags & PPMU_LIMITED_PMC_REQD) && nlim < nalt) {
447 /* remove all but the limited PMC events */
448 j = 0;
449 for (i = 0; i < nalt; ++i) {
450 if (p6_limited_pmc_event(alt[i])) {
451 alt[j] = alt[i];
452 ++j;
453 }
454 }
455 nalt = j;
456 }
457
458 return nalt;
459}
460
461static void p6_disable_pmc(unsigned int pmc, u64 mmcr[])
462{
463 /* Set PMCxSEL to 0 to disable PMCx */
464 if (pmc <= 3)
465 mmcr[1] &= ~(0xffUL << MMCR1_PMCSEL_SH(pmc));
466}
467
468static int power6_generic_events[] = {
469 [PERF_COUNT_CPU_CYCLES] = 0x1e,
470 [PERF_COUNT_INSTRUCTIONS] = 2,
471 [PERF_COUNT_CACHE_REFERENCES] = 0x280030, /* LD_REF_L1 */
472 [PERF_COUNT_CACHE_MISSES] = 0x30000c, /* LD_MISS_L1 */
473 [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x410a0, /* BR_PRED */
474 [PERF_COUNT_BRANCH_MISSES] = 0x400052, /* BR_MPRED */
475};
476
477struct power_pmu power6_pmu = {
478 .n_counter = 6,
479 .max_alternatives = MAX_ALT,
480 .add_fields = 0x1555,
481 .test_adder = 0x3000,
482 .compute_mmcr = p6_compute_mmcr,
483 .get_constraint = p6_get_constraint,
484 .get_alternatives = p6_get_alternatives,
485 .disable_pmc = p6_disable_pmc,
486 .n_generic = ARRAY_SIZE(power6_generic_events),
487 .generic_events = power6_generic_events,
488 .flags = PPMU_LIMITED_PMC5_6 | PPMU_ALT_SIPR,
489 .limited_pmc_event = p6_limited_pmc_event,
490};
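The constraint machinery above packs scheduling information for one event into a 64-bit (mask, value) pair: 2-bit "add fields" count uses of each PMC, a 4-bit field at bits 12-15 counts total PMC1-4 use, and 4-bit "select fields" record which unit drives each byte of the event bus. The following is a minimal userspace sketch of the add-field idea only; it illustrates the encoding produced by p6_get_constraint() and is not the kernel's actual event scheduler.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Two hypothetical events that both ask for PMC3. */
	int pmc = 3;
	int sh = (pmc - 1) * 2;
	uint64_t mask_a = 2ull << sh, val_a = 1ull << sh;	/* as in p6_get_constraint() */
	uint64_t mask_b = 2ull << sh, val_b = 1ull << sh;

	/* One use leaves the masked bit clear; two uses carry into it. */
	printf("single ok: %d\n", (val_a & (mask_a | mask_b)) == 0);	/* prints 1 */
	printf("conflict:  %d\n", ((val_a + val_b) & (mask_a | mask_b)) != 0);	/* prints 1 */
	return 0;
}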
diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c
new file mode 100644
index 000000000000..eed47c4523f1
--- /dev/null
+++ b/arch/powerpc/kernel/ppc970-pmu.c
@@ -0,0 +1,441 @@
1/*
2 * Performance counter support for PPC970-family processors.
3 *
4 * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/string.h>
12#include <linux/perf_counter.h>
13#include <asm/reg.h>
14
15/*
16 * Bits in event code for PPC970
17 */
18#define PM_PMC_SH 12 /* PMC number (1-based) for direct events */
19#define PM_PMC_MSK 0xf
20#define PM_UNIT_SH 8 /* TTMMUX number and setting - unit select */
21#define PM_UNIT_MSK 0xf
22#define PM_SPCSEL_SH 6
23#define PM_SPCSEL_MSK 3
24#define PM_BYTE_SH 4 /* Byte number of event bus to use */
25#define PM_BYTE_MSK 3
26#define PM_PMCSEL_MSK 0xf
27
28/* Values in PM_UNIT field */
29#define PM_NONE 0
30#define PM_FPU 1
31#define PM_VPU 2
32#define PM_ISU 3
33#define PM_IFU 4
34#define PM_IDU 5
35#define PM_STS 6
36#define PM_LSU0 7
37#define PM_LSU1U 8
38#define PM_LSU1L 9
39#define PM_LASTUNIT 9
40
41/*
42 * Bits in MMCR0 for PPC970
43 */
44#define MMCR0_PMC1SEL_SH 8
45#define MMCR0_PMC2SEL_SH 1
46#define MMCR_PMCSEL_MSK 0x1f
47
48/*
49 * Bits in MMCR1 for PPC970
50 */
51#define MMCR1_TTM0SEL_SH 62
52#define MMCR1_TTM1SEL_SH 59
53#define MMCR1_TTM3SEL_SH 53
54#define MMCR1_TTMSEL_MSK 3
55#define MMCR1_TD_CP_DBG0SEL_SH 50
56#define MMCR1_TD_CP_DBG1SEL_SH 48
57#define MMCR1_TD_CP_DBG2SEL_SH 46
58#define MMCR1_TD_CP_DBG3SEL_SH 44
59#define MMCR1_PMC1_ADDER_SEL_SH 39
60#define MMCR1_PMC2_ADDER_SEL_SH 38
61#define MMCR1_PMC6_ADDER_SEL_SH 37
62#define MMCR1_PMC5_ADDER_SEL_SH 36
63#define MMCR1_PMC8_ADDER_SEL_SH 35
64#define MMCR1_PMC7_ADDER_SEL_SH 34
65#define MMCR1_PMC3_ADDER_SEL_SH 33
66#define MMCR1_PMC4_ADDER_SEL_SH 32
67#define MMCR1_PMC3SEL_SH 27
68#define MMCR1_PMC4SEL_SH 22
69#define MMCR1_PMC5SEL_SH 17
70#define MMCR1_PMC6SEL_SH 12
71#define MMCR1_PMC7SEL_SH 7
72#define MMCR1_PMC8SEL_SH 2
73
74static short mmcr1_adder_bits[8] = {
75 MMCR1_PMC1_ADDER_SEL_SH,
76 MMCR1_PMC2_ADDER_SEL_SH,
77 MMCR1_PMC3_ADDER_SEL_SH,
78 MMCR1_PMC4_ADDER_SEL_SH,
79 MMCR1_PMC5_ADDER_SEL_SH,
80 MMCR1_PMC6_ADDER_SEL_SH,
81 MMCR1_PMC7_ADDER_SEL_SH,
82 MMCR1_PMC8_ADDER_SEL_SH
83};
84
85/*
86 * Bits in MMCRA
87 */
88
89/*
90 * Layout of constraint bits:
91 * 6666555555555544444444443333333333222222222211111111110000000000
92 * 3210987654321098765432109876543210987654321098765432109876543210
93 * <><><>[ >[ >[ >< >< >< >< ><><><><><><><><>
94 * SPT0T1 UC PS1 PS2 B0 B1 B2 B3 P1P2P3P4P5P6P7P8
95 *
96 * SP - SPCSEL constraint
97 * 48-49: SPCSEL value 0x3_0000_0000_0000
98 *
99 * T0 - TTM0 constraint
100 * 46-47: TTM0SEL value (0=FPU, 2=IFU, 3=VPU) 0xC000_0000_0000
101 *
102 * T1 - TTM1 constraint
103 * 44-45: TTM1SEL value (0=IDU, 3=STS) 0x3000_0000_0000
104 *
105 * UC - unit constraint: can't have all three of FPU|IFU|VPU, ISU, IDU|STS
106 * 43: UC3 error 0x0800_0000_0000
107 * 42: FPU|IFU|VPU events needed 0x0400_0000_0000
108 * 41: ISU events needed 0x0200_0000_0000
109 * 40: IDU|STS events needed 0x0100_0000_0000
110 *
111 * PS1
112 * 39: PS1 error 0x0080_0000_0000
113 * 36-38: count of events needing PMC1/2/5/6 0x0070_0000_0000
114 *
115 * PS2
116 * 35: PS2 error 0x0008_0000_0000
117 * 32-34: count of events needing PMC3/4/7/8 0x0007_0000_0000
118 *
119 * B0
120 * 28-31: Byte 0 event source 0xf000_0000
121 * Encoding as for the event code
122 *
123 * B1, B2, B3
124 * 24-27, 20-23, 16-19: Byte 1, 2, 3 event sources
125 *
126 * P1
127 * 15: P1 error 0x8000
128 * 14-15: Count of events needing PMC1
129 *
130 * P2..P8
131 * 0-13: Count of events needing PMC2..PMC8
132 */
133
134static unsigned char direct_marked_event[8] = {
135 (1<<2) | (1<<3), /* PMC1: PM_MRK_GRP_DISP, PM_MRK_ST_CMPL */
136 (1<<3) | (1<<5), /* PMC2: PM_THRESH_TIMEO, PM_MRK_BRU_FIN */
137 (1<<3) | (1<<5), /* PMC3: PM_MRK_ST_CMPL_INT, PM_MRK_VMX_FIN */
138 (1<<4) | (1<<5), /* PMC4: PM_MRK_GRP_CMPL, PM_MRK_CRU_FIN */
139 (1<<4) | (1<<5), /* PMC5: PM_GRP_MRK, PM_MRK_GRP_TIMEO */
140 (1<<3) | (1<<4) | (1<<5),
141 /* PMC6: PM_MRK_ST_STS, PM_MRK_FXU_FIN, PM_MRK_GRP_ISSUED */
142 (1<<4) | (1<<5), /* PMC7: PM_MRK_FPU_FIN, PM_MRK_INST_FIN */
143 (1<<4) /* PMC8: PM_MRK_LSU_FIN */
144};
145
146/*
147 * Returns 1 if event counts things relating to marked instructions
148 * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
149 */
150static int p970_marked_instr_event(u64 event)
151{
152 int pmc, psel, unit, byte, bit;
153 unsigned int mask;
154
155 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
156 psel = event & PM_PMCSEL_MSK;
157 if (pmc) {
158 if (direct_marked_event[pmc - 1] & (1 << psel))
159 return 1;
160 if (psel == 0) /* add events */
161 bit = (pmc <= 4)? pmc - 1: 8 - pmc;
162 else if (psel == 7 || psel == 13) /* decode events */
163 bit = 4;
164 else
165 return 0;
166 } else
167 bit = psel;
168
169 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
170 unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
171 mask = 0;
172 switch (unit) {
173 case PM_VPU:
174 mask = 0x4c; break; /* byte 0 bits 2,3,6 */
175 case PM_LSU0:
176 /* byte 2 bits 0,2,3,4,6; all of byte 1 */
177 mask = 0x085dff00; break;
178 case PM_LSU1L:
179 mask = 0x50 << 24; /* byte 3 bits 4,6 */
180 break;
181 }
182 return (mask >> (byte * 8 + bit)) & 1;
183}
184
185/* Masks and values for using events from the various units */
186static u64 unit_cons[PM_LASTUNIT+1][2] = {
187 [PM_FPU] = { 0xc80000000000ull, 0x040000000000ull },
188 [PM_VPU] = { 0xc80000000000ull, 0xc40000000000ull },
189 [PM_ISU] = { 0x080000000000ull, 0x020000000000ull },
190 [PM_IFU] = { 0xc80000000000ull, 0x840000000000ull },
191 [PM_IDU] = { 0x380000000000ull, 0x010000000000ull },
192 [PM_STS] = { 0x380000000000ull, 0x310000000000ull },
193};
194
195static int p970_get_constraint(u64 event, u64 *maskp, u64 *valp)
196{
197 int pmc, byte, unit, sh, spcsel;
198 u64 mask = 0, value = 0;
199 int grp = -1;
200
201 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
202 if (pmc) {
203 if (pmc > 8)
204 return -1;
205 sh = (pmc - 1) * 2;
206 mask |= 2 << sh;
207 value |= 1 << sh;
208 grp = ((pmc - 1) >> 1) & 1;
209 }
210 unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
211 if (unit) {
212 if (unit > PM_LASTUNIT)
213 return -1;
214 mask |= unit_cons[unit][0];
215 value |= unit_cons[unit][1];
216 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
217 /*
218 * Bus events on bytes 0 and 2 can be counted
219 * on PMC1/2/5/6; bytes 1 and 3 on PMC3/4/7/8.
220 */
221 if (!pmc)
222 grp = byte & 1;
223 /* Set byte lane select field */
224 mask |= 0xfULL << (28 - 4 * byte);
225 value |= (u64)unit << (28 - 4 * byte);
226 }
227 if (grp == 0) {
228 /* increment PMC1/2/5/6 field */
229 mask |= 0x8000000000ull;
230 value |= 0x1000000000ull;
231 } else if (grp == 1) {
232 /* increment PMC3/4/7/8 field */
233 mask |= 0x800000000ull;
234 value |= 0x100000000ull;
235 }
236 spcsel = (event >> PM_SPCSEL_SH) & PM_SPCSEL_MSK;
237 if (spcsel) {
238 mask |= 3ull << 48;
239 value |= (u64)spcsel << 48;
240 }
241 *maskp = mask;
242 *valp = value;
243 return 0;
244}
245
246static int p970_get_alternatives(u64 event, unsigned int flags, u64 alt[])
247{
248 alt[0] = event;
249
250 /* 2 alternatives for LSU empty */
251 if (event == 0x2002 || event == 0x3002) {
252 alt[1] = event ^ 0x1000;
253 return 2;
254 }
255
256 return 1;
257}
258
259static int p970_compute_mmcr(u64 event[], int n_ev,
260 unsigned int hwc[], u64 mmcr[])
261{
262 u64 mmcr0 = 0, mmcr1 = 0, mmcra = 0;
263 unsigned int pmc, unit, byte, psel;
264 unsigned int ttm, grp;
265 unsigned int pmc_inuse = 0;
266 unsigned int pmc_grp_use[2];
267 unsigned char busbyte[4];
268 unsigned char unituse[16];
269 unsigned char unitmap[] = { 0, 0<<3, 3<<3, 1<<3, 2<<3, 0|4, 3|4 };
270 unsigned char ttmuse[2];
271 unsigned char pmcsel[8];
272 int i;
273 int spcsel;
274
275 if (n_ev > 8)
276 return -1;
277
278 /* First pass to count resource use */
279 pmc_grp_use[0] = pmc_grp_use[1] = 0;
280 memset(busbyte, 0, sizeof(busbyte));
281 memset(unituse, 0, sizeof(unituse));
282 for (i = 0; i < n_ev; ++i) {
283 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
284 if (pmc) {
285 if (pmc_inuse & (1 << (pmc - 1)))
286 return -1;
287 pmc_inuse |= 1 << (pmc - 1);
288 /* count 1/2/5/6 vs 3/4/7/8 use */
289 ++pmc_grp_use[((pmc - 1) >> 1) & 1];
290 }
291 unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
292 byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
293 if (unit) {
294 if (unit > PM_LASTUNIT)
295 return -1;
296 if (!pmc)
297 ++pmc_grp_use[byte & 1];
298 if (busbyte[byte] && busbyte[byte] != unit)
299 return -1;
300 busbyte[byte] = unit;
301 unituse[unit] = 1;
302 }
303 }
304 if (pmc_grp_use[0] > 4 || pmc_grp_use[1] > 4)
305 return -1;
306
307 /*
308 * Assign resources and set multiplexer selects.
309 *
310 * PM_ISU can go either on TTM0 or TTM1, but that's the only
311 * choice we have to deal with.
312 */
313 if (unituse[PM_ISU] &
314 (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_VPU]))
315 unitmap[PM_ISU] = 2 | 4; /* move ISU to TTM1 */
316 /* Set TTM[01]SEL fields. */
317 ttmuse[0] = ttmuse[1] = 0;
318 for (i = PM_FPU; i <= PM_STS; ++i) {
319 if (!unituse[i])
320 continue;
321 ttm = unitmap[i];
322 ++ttmuse[(ttm >> 2) & 1];
323 mmcr1 |= (u64)(ttm & ~4) << MMCR1_TTM1SEL_SH;
324 }
325 /* Check only one unit per TTMx */
326 if (ttmuse[0] > 1 || ttmuse[1] > 1)
327 return -1;
328
329 /* Set byte lane select fields and TTM3SEL. */
330 for (byte = 0; byte < 4; ++byte) {
331 unit = busbyte[byte];
332 if (!unit)
333 continue;
334 if (unit <= PM_STS)
335 ttm = (unitmap[unit] >> 2) & 1;
336 else if (unit == PM_LSU0)
337 ttm = 2;
338 else {
339 ttm = 3;
340 if (unit == PM_LSU1L && byte >= 2)
341 mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte);
342 }
343 mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte);
344 }
345
346 /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
347 memset(pmcsel, 0x8, sizeof(pmcsel)); /* 8 means don't count */
348 for (i = 0; i < n_ev; ++i) {
349 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
350 unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
351 byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
352 psel = event[i] & PM_PMCSEL_MSK;
353 if (!pmc) {
354 /* Bus event or any-PMC direct event */
355 if (unit)
356 psel |= 0x10 | ((byte & 2) << 2);
357 else
358 psel |= 8;
359 for (pmc = 0; pmc < 8; ++pmc) {
360 if (pmc_inuse & (1 << pmc))
361 continue;
362 grp = (pmc >> 1) & 1;
363 if (unit) {
364 if (grp == (byte & 1))
365 break;
366 } else if (pmc_grp_use[grp] < 4) {
367 ++pmc_grp_use[grp];
368 break;
369 }
370 }
371 pmc_inuse |= 1 << pmc;
372 } else {
373 /* Direct event */
374 --pmc;
375 if (psel == 0 && (byte & 2))
376 /* add events on higher-numbered bus */
377 mmcr1 |= 1ull << mmcr1_adder_bits[pmc];
378 }
379 pmcsel[pmc] = psel;
380 hwc[i] = pmc;
381 spcsel = (event[i] >> PM_SPCSEL_SH) & PM_SPCSEL_MSK;
382 mmcr1 |= spcsel;
383 if (p970_marked_instr_event(event[i]))
384 mmcra |= MMCRA_SAMPLE_ENABLE;
385 }
386 for (pmc = 0; pmc < 2; ++pmc)
387 mmcr0 |= pmcsel[pmc] << (MMCR0_PMC1SEL_SH - 7 * pmc);
388 for (; pmc < 8; ++pmc)
389 mmcr1 |= (u64)pmcsel[pmc] << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2));
390 if (pmc_inuse & 1)
391 mmcr0 |= MMCR0_PMC1CE;
392 if (pmc_inuse & 0xfe)
393 mmcr0 |= MMCR0_PMCjCE;
394
395 mmcra |= 0x2000; /* mark only one IOP per PPC instruction */
396
397 /* Return MMCRx values */
398 mmcr[0] = mmcr0;
399 mmcr[1] = mmcr1;
400 mmcr[2] = mmcra;
401 return 0;
402}
403
404static void p970_disable_pmc(unsigned int pmc, u64 mmcr[])
405{
406 int shift, i;
407
408 if (pmc <= 1) {
409 shift = MMCR0_PMC1SEL_SH - 7 * pmc;
410 i = 0;
411 } else {
412 shift = MMCR1_PMC3SEL_SH - 5 * (pmc - 2);
413 i = 1;
414 }
415 /*
416 * Setting the PMCxSEL field to 0x08 disables PMC x.
417 */
418 mmcr[i] = (mmcr[i] & ~(0x1fUL << shift)) | (0x08UL << shift);
419}
420
421static int ppc970_generic_events[] = {
422 [PERF_COUNT_CPU_CYCLES] = 7,
423 [PERF_COUNT_INSTRUCTIONS] = 1,
424 [PERF_COUNT_CACHE_REFERENCES] = 0x8810, /* PM_LD_REF_L1 */
425 [PERF_COUNT_CACHE_MISSES] = 0x3810, /* PM_LD_MISS_L1 */
426 [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x431, /* PM_BR_ISSUED */
427 [PERF_COUNT_BRANCH_MISSES] = 0x327, /* PM_GRP_BR_MPRED */
428};
429
430struct power_pmu ppc970_pmu = {
431 .n_counter = 8,
432 .max_alternatives = 2,
433 .add_fields = 0x001100005555ull,
434 .test_adder = 0x013300000000ull,
435 .compute_mmcr = p970_compute_mmcr,
436 .get_constraint = p970_get_constraint,
437 .get_alternatives = p970_get_alternatives,
438 .disable_pmc = p970_disable_pmc,
439 .n_generic = ARRAY_SIZE(ppc970_generic_events),
440 .generic_events = ppc970_generic_events,
441};
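For reference, the PPC970 event codes used throughout this file are plain bit fields as laid out by the PM_*_SH/PM_*_MSK defines near the top. A small illustrative decoder follows (not kernel code; the kernel does the same shifting inline in p970_get_constraint() and p970_compute_mmcr()); the example value 0x431 is taken from the ppc970_generic_events table above.

#include <stdio.h>

#define PM_PMC_SH	12
#define PM_PMC_MSK	0xf
#define PM_UNIT_SH	8
#define PM_UNIT_MSK	0xf
#define PM_SPCSEL_SH	6
#define PM_SPCSEL_MSK	3
#define PM_BYTE_SH	4
#define PM_BYTE_MSK	3
#define PM_PMCSEL_MSK	0xf

int main(void)
{
	unsigned int ev = 0x431;	/* PM_BR_ISSUED from the generic table */

	printf("pmc=%u unit=%u spcsel=%u byte=%u psel=%u\n",
	       (ev >> PM_PMC_SH) & PM_PMC_MSK,
	       (ev >> PM_UNIT_SH) & PM_UNIT_MSK,
	       (ev >> PM_SPCSEL_SH) & PM_SPCSEL_MSK,
	       (ev >> PM_BYTE_SH) & PM_BYTE_MSK,
	       ev & PM_PMCSEL_MSK);
	return 0;
}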
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 76993941cac9..ac0e112031b2 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -29,6 +29,7 @@
29#include <linux/module.h> 29#include <linux/module.h>
30#include <linux/kprobes.h> 30#include <linux/kprobes.h>
31#include <linux/kdebug.h> 31#include <linux/kdebug.h>
32#include <linux/perf_counter.h>
32 33
33#include <asm/firmware.h> 34#include <asm/firmware.h>
34#include <asm/page.h> 35#include <asm/page.h>
@@ -170,6 +171,8 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
170 die("Weird page fault", regs, SIGSEGV); 171 die("Weird page fault", regs, SIGSEGV);
171 } 172 }
172 173
174 perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs, address);
175
173 /* When running in the kernel we expect faults to occur only to 176 /* When running in the kernel we expect faults to occur only to
174 * addresses in user space. All other faults represent errors in the 177 * addresses in user space. All other faults represent errors in the
175 * kernel and should generate an OOPS. Unfortunately, in the case of an 178 * kernel and should generate an OOPS. Unfortunately, in the case of an
@@ -309,6 +312,8 @@ good_area:
309 } 312 }
310 if (ret & VM_FAULT_MAJOR) { 313 if (ret & VM_FAULT_MAJOR) {
311 current->maj_flt++; 314 current->maj_flt++;
315 perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0,
316 regs, address);
312#ifdef CONFIG_PPC_SMLPAR 317#ifdef CONFIG_PPC_SMLPAR
313 if (firmware_has_feature(FW_FEATURE_CMO)) { 318 if (firmware_has_feature(FW_FEATURE_CMO)) {
314 preempt_disable(); 319 preempt_disable();
@@ -316,8 +321,11 @@ good_area:
316 preempt_enable(); 321 preempt_enable();
317 } 322 }
318#endif 323#endif
319 } else 324 } else {
320 current->min_flt++; 325 current->min_flt++;
326 perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0,
327 regs, address);
328 }
321 up_read(&mm->mmap_sem); 329 up_read(&mm->mmap_sem);
322 return 0; 330 return 0;
323 331
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 9da795e49337..732ee93a8e98 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -1,6 +1,7 @@
1config PPC64 1config PPC64
2 bool "64-bit kernel" 2 bool "64-bit kernel"
3 default n 3 default n
4 select HAVE_PERF_COUNTERS
4 help 5 help
5 This option selects whether a 32-bit or a 64-bit kernel 6 This option selects whether a 32-bit or a 64-bit kernel
6 will be built. 7 will be built.
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index df9e885eee14..32ada97c964d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -727,6 +727,7 @@ config X86_UP_IOAPIC
727config X86_LOCAL_APIC 727config X86_LOCAL_APIC
728 def_bool y 728 def_bool y
729 depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC 729 depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC
730 select HAVE_PERF_COUNTERS if (!M386 && !M486)
730 731
731config X86_IO_APIC 732config X86_IO_APIC
732 def_bool y 733 def_bool y
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index a505202086e8..e590261ba059 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -825,9 +825,11 @@ ia32_sys_call_table:
825 .quad compat_sys_signalfd4 825 .quad compat_sys_signalfd4
826 .quad sys_eventfd2 826 .quad sys_eventfd2
827 .quad sys_epoll_create1 827 .quad sys_epoll_create1
828 .quad sys_dup3 /* 330 */ 828 .quad sys_dup3 /* 330 */
829 .quad sys_pipe2 829 .quad sys_pipe2
830 .quad sys_inotify_init1 830 .quad sys_inotify_init1
831 .quad compat_sys_preadv 831 .quad compat_sys_preadv
832 .quad compat_sys_pwritev 832 .quad compat_sys_pwritev
833 .quad compat_sys_rt_tgsigqueueinfo /* 335 */
834 .quad sys_perf_counter_open
833ia32_syscall_end: 835ia32_syscall_end:
diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h
index 85b46fba4229..aff9f1fcdcd7 100644
--- a/arch/x86/include/asm/atomic_32.h
+++ b/arch/x86/include/asm/atomic_32.h
@@ -247,5 +247,241 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
247#define smp_mb__before_atomic_inc() barrier() 247#define smp_mb__before_atomic_inc() barrier()
248#define smp_mb__after_atomic_inc() barrier() 248#define smp_mb__after_atomic_inc() barrier()
249 249
250/* A 64-bit atomic type */
251
252typedef struct {
253 unsigned long long counter;
254} atomic64_t;
255
256#define ATOMIC64_INIT(val) { (val) }
257
258/**
259 * __atomic64_read - raw read of an atomic64 variable
260 * @ptr: pointer of type atomic64_t
261 *
262 * Reads the value of @ptr as a plain load; not guaranteed to be atomic.
263 * Doesn't imply a read memory barrier.
264 */
265#define __atomic64_read(ptr) ((ptr)->counter)
266
267static inline unsigned long long
268cmpxchg8b(unsigned long long *ptr, unsigned long long old, unsigned long long new)
269{
270 asm volatile(
271
272 LOCK_PREFIX "cmpxchg8b (%[ptr])\n"
273
274 : "=A" (old)
275
276 : [ptr] "D" (ptr),
277 "A" (old),
278 "b" (ll_low(new)),
279 "c" (ll_high(new))
280
281 : "memory");
282
283 return old;
284}
285
286static inline unsigned long long
287atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val,
288 unsigned long long new_val)
289{
290 return cmpxchg8b(&ptr->counter, old_val, new_val);
291}
292
293/**
294 * atomic64_xchg - xchg atomic64 variable
295 * @ptr: pointer to type atomic64_t
296 * @new_val: value to assign
298 *
299 * Atomically xchgs the value of @ptr to @new_val and returns
300 * the old value.
301 */
302
303static inline unsigned long long
304atomic64_xchg(atomic64_t *ptr, unsigned long long new_val)
305{
306 unsigned long long old_val;
307
308 do {
309 old_val = __atomic64_read(ptr);
310 } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
311
312 return old_val;
313}
314
315/**
316 * atomic64_set - set atomic64 variable
317 * @ptr: pointer to type atomic64_t
318 * @new_val: value to assign
319 *
320 * Atomically sets the value of @ptr to @new_val.
321 */
322static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val)
323{
324 atomic64_xchg(ptr, new_val);
325}
326
327/**
328 * atomic64_read - read atomic64 variable
329 * @ptr: pointer to type atomic64_t
330 *
331 * Atomically reads the value of @ptr and returns it.
332 */
333static inline unsigned long long atomic64_read(atomic64_t *ptr)
334{
335 unsigned long long curr_val;
336
337 do {
338 curr_val = __atomic64_read(ptr);
339 } while (atomic64_cmpxchg(ptr, curr_val, curr_val) != curr_val);
340
341 return curr_val;
342}
343
344/**
345 * atomic64_add_return - add and return
346 * @delta: integer value to add
347 * @ptr: pointer to type atomic64_t
348 *
349 * Atomically adds @delta to @ptr and returns @delta + *@ptr
350 */
351static inline unsigned long long
352atomic64_add_return(unsigned long long delta, atomic64_t *ptr)
353{
354 unsigned long long old_val, new_val;
355
356 do {
357 old_val = __atomic64_read(ptr);
358 new_val = old_val + delta;
359
360 } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
361
362 return new_val;
363}
364
365static inline unsigned long long atomic64_sub_return(unsigned long long delta, atomic64_t *ptr)
366{
367 return atomic64_add_return(-delta, ptr);
368}
369
370static inline unsigned long long atomic64_inc_return(atomic64_t *ptr)
371{
372 return atomic64_add_return(1, ptr);
373}
374
375static inline unsigned long long atomic64_dec_return(atomic64_t *ptr)
376{
377 return atomic64_sub_return(1, ptr);
378}
379
380/**
381 * atomic64_add - add integer to atomic64 variable
382 * @delta: integer value to add
383 * @ptr: pointer to type atomic64_t
384 *
385 * Atomically adds @delta to @ptr.
386 */
387static inline void atomic64_add(unsigned long long delta, atomic64_t *ptr)
388{
389 atomic64_add_return(delta, ptr);
390}
391
392/**
393 * atomic64_sub - subtract the atomic64 variable
394 * @delta: integer value to subtract
395 * @ptr: pointer to type atomic64_t
396 *
397 * Atomically subtracts @delta from @ptr.
398 */
399static inline void atomic64_sub(unsigned long long delta, atomic64_t *ptr)
400{
401 atomic64_add(-delta, ptr);
402}
403
404/**
405 * atomic64_sub_and_test - subtract value from variable and test result
406 * @delta: integer value to subtract
407 * @ptr: pointer to type atomic64_t
408 *
409 * Atomically subtracts @delta from @ptr and returns
410 * true if the result is zero, or false for all
411 * other cases.
412 */
413static inline int
414atomic64_sub_and_test(unsigned long long delta, atomic64_t *ptr)
415{
416 unsigned long long old_val = atomic64_sub_return(delta, ptr);
417
418 return old_val == 0;
419}
420
421/**
422 * atomic64_inc - increment atomic64 variable
423 * @ptr: pointer to type atomic64_t
424 *
425 * Atomically increments @ptr by 1.
426 */
427static inline void atomic64_inc(atomic64_t *ptr)
428{
429 atomic64_add(1, ptr);
430}
431
432/**
433 * atomic64_dec - decrement atomic64 variable
434 * @ptr: pointer to type atomic64_t
435 *
436 * Atomically decrements @ptr by 1.
437 */
438static inline void atomic64_dec(atomic64_t *ptr)
439{
440 atomic64_sub(1, ptr);
441}
442
443/**
444 * atomic64_dec_and_test - decrement and test
445 * @ptr: pointer to type atomic64_t
446 *
447 * Atomically decrements @ptr by 1 and
448 * returns true if the result is 0, or false for all other
449 * cases.
450 */
451static inline int atomic64_dec_and_test(atomic64_t *ptr)
452{
453 return atomic64_sub_and_test(1, ptr);
454}
455
456/**
457 * atomic64_inc_and_test - increment and test
458 * @ptr: pointer to type atomic64_t
459 *
460 * Atomically increments @ptr by 1
461 * and returns true if the result is zero, or false for all
462 * other cases.
463 */
464static inline int atomic64_inc_and_test(atomic64_t *ptr)
465{
466 return atomic64_sub_and_test(-1, ptr);
467}
468
469/**
470 * atomic64_add_negative - add and test if negative
471 * @delta: integer value to add
472 * @ptr: pointer to type atomic64_t
473 *
474 * Atomically adds @delta to @ptr and returns true
475 * if the result is negative, or false when
476 * result is greater than or equal to zero.
477 */
478static inline int
479atomic64_add_negative(unsigned long long delta, atomic64_t *ptr)
480{
481 long long old_val = atomic64_add_return(delta, ptr);
482
483 return old_val < 0;
484}
485
250#include <asm-generic/atomic.h> 486#include <asm-generic/atomic.h>
251#endif /* _ASM_X86_ATOMIC_32_H */ 487#endif /* _ASM_X86_ATOMIC_32_H */
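Everything in the atomic64 block above reduces to the same read-then-cmpxchg8b retry loop. A userspace analogue of that pattern is sketched below, using GCC's __sync_val_compare_and_swap builtin instead of the hand-rolled cmpxchg8b; it is illustrative only and assumes a GCC-compatible compiler and a CPU with a 64-bit compare-and-swap.

#include <stdio.h>

static unsigned long long my_add_return(unsigned long long delta,
					 unsigned long long *ptr)
{
	unsigned long long old_val, new_val;

	do {
		old_val = *ptr;			/* plain read as the seed */
		new_val = old_val + delta;
		/* retry if someone changed *ptr between the read and here */
	} while (__sync_val_compare_and_swap(ptr, old_val, new_val) != old_val);

	return new_val;
}

int main(void)
{
	unsigned long long v = 5;

	printf("%llu\n", my_add_return(3, &v));	/* prints 8 */
	return 0;
}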
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index c2e6bedaf258..fe24d2802490 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -50,6 +50,7 @@ BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
50 50
51#ifdef CONFIG_PERF_COUNTERS 51#ifdef CONFIG_PERF_COUNTERS
52BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR) 52BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR)
53BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR)
53#endif 54#endif
54 55
55#ifdef CONFIG_X86_MCE_P4THERMAL 56#ifdef CONFIG_X86_MCE_P4THERMAL
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 37555e52f980..9ebc5c255032 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -13,6 +13,8 @@ typedef struct {
13 unsigned int irq_spurious_count; 13 unsigned int irq_spurious_count;
14#endif 14#endif
15 unsigned int generic_irqs; /* arch dependent */ 15 unsigned int generic_irqs; /* arch dependent */
16 unsigned int apic_perf_irqs;
17 unsigned int apic_pending_irqs;
16#ifdef CONFIG_SMP 18#ifdef CONFIG_SMP
17 unsigned int irq_resched_count; 19 unsigned int irq_resched_count;
18 unsigned int irq_call_count; 20 unsigned int irq_call_count;
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index b762ea49bd70..7309c0ad6902 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -29,6 +29,9 @@
29extern void apic_timer_interrupt(void); 29extern void apic_timer_interrupt(void);
30extern void generic_interrupt(void); 30extern void generic_interrupt(void);
31extern void error_interrupt(void); 31extern void error_interrupt(void);
32extern void perf_counter_interrupt(void);
33extern void perf_pending_interrupt(void);
34
32extern void spurious_interrupt(void); 35extern void spurious_interrupt(void);
33extern void thermal_interrupt(void); 36extern void thermal_interrupt(void);
34extern void reschedule_interrupt(void); 37extern void reschedule_interrupt(void);
diff --git a/arch/x86/include/asm/intel_arch_perfmon.h b/arch/x86/include/asm/intel_arch_perfmon.h
deleted file mode 100644
index fa0fd068bc2e..000000000000
--- a/arch/x86/include/asm/intel_arch_perfmon.h
+++ /dev/null
@@ -1,31 +0,0 @@
1#ifndef _ASM_X86_INTEL_ARCH_PERFMON_H
2#define _ASM_X86_INTEL_ARCH_PERFMON_H
3
4#define MSR_ARCH_PERFMON_PERFCTR0 0xc1
5#define MSR_ARCH_PERFMON_PERFCTR1 0xc2
6
7#define MSR_ARCH_PERFMON_EVENTSEL0 0x186
8#define MSR_ARCH_PERFMON_EVENTSEL1 0x187
9
10#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22)
11#define ARCH_PERFMON_EVENTSEL_INT (1 << 20)
12#define ARCH_PERFMON_EVENTSEL_OS (1 << 17)
13#define ARCH_PERFMON_EVENTSEL_USR (1 << 16)
14
15#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL (0x3c)
16#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
17#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX (0)
18#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
19 (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
20
21union cpuid10_eax {
22 struct {
23 unsigned int version_id:8;
24 unsigned int num_counters:8;
25 unsigned int bit_width:8;
26 unsigned int mask_length:8;
27 } split;
28 unsigned int full;
29};
30
31#endif /* _ASM_X86_INTEL_ARCH_PERFMON_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 3cbd79bbb47c..545bb811ccb5 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -117,6 +117,11 @@
117#define GENERIC_INTERRUPT_VECTOR 0xed 117#define GENERIC_INTERRUPT_VECTOR 0xed
118 118
119/* 119/*
120 * Performance monitoring pending work vector:
121 */
122#define LOCAL_PENDING_VECTOR 0xec
123
124/*
120 * First APIC vector available to drivers: (vectors 0x30-0xee) we 125 * First APIC vector available to drivers: (vectors 0x30-0xee) we
121 * start at 0x31(0x41) to spread out vectors evenly between priority 126 * start at 0x31(0x41) to spread out vectors evenly between priority
122 * levels. (0x80 is the syscall vector) 127 * levels. (0x80 is the syscall vector)
diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
new file mode 100644
index 000000000000..d08dd52cb8ff
--- /dev/null
+++ b/arch/x86/include/asm/perf_counter.h
@@ -0,0 +1,100 @@
1#ifndef _ASM_X86_PERF_COUNTER_H
2#define _ASM_X86_PERF_COUNTER_H
3
4/*
5 * Performance counter hw details:
6 */
7
8#define X86_PMC_MAX_GENERIC 8
9#define X86_PMC_MAX_FIXED 3
10
11#define X86_PMC_IDX_GENERIC 0
12#define X86_PMC_IDX_FIXED 32
13#define X86_PMC_IDX_MAX 64
14
15#define MSR_ARCH_PERFMON_PERFCTR0 0xc1
16#define MSR_ARCH_PERFMON_PERFCTR1 0xc2
17
18#define MSR_ARCH_PERFMON_EVENTSEL0 0x186
19#define MSR_ARCH_PERFMON_EVENTSEL1 0x187
20
21#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22)
22#define ARCH_PERFMON_EVENTSEL_INT (1 << 20)
23#define ARCH_PERFMON_EVENTSEL_OS (1 << 17)
24#define ARCH_PERFMON_EVENTSEL_USR (1 << 16)
25
26/*
27 * Includes eventsel and unit mask as well:
28 */
29#define ARCH_PERFMON_EVENT_MASK 0xffff
30
31#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c
32#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
33#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0
34#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
35 (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
36
37#define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6
38
39/*
40 * Intel "Architectural Performance Monitoring" CPUID
41 * detection/enumeration details:
42 */
43union cpuid10_eax {
44 struct {
45 unsigned int version_id:8;
46 unsigned int num_counters:8;
47 unsigned int bit_width:8;
48 unsigned int mask_length:8;
49 } split;
50 unsigned int full;
51};
52
53union cpuid10_edx {
54 struct {
55 unsigned int num_counters_fixed:4;
56 unsigned int reserved:28;
57 } split;
58 unsigned int full;
59};
60
61
62/*
63 * Fixed-purpose performance counters:
64 */
65
66/*
67 * All 3 fixed-mode PMCs are configured via this single MSR:
68 */
69#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL 0x38d
70
71/*
72 * The counts are available in three separate MSRs:
73 */
74
75/* Instr_Retired.Any: */
76#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309
77#define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0)
78
79/* CPU_CLK_Unhalted.Core: */
80#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a
81#define X86_PMC_IDX_FIXED_CPU_CYCLES (X86_PMC_IDX_FIXED + 1)
82
83/* CPU_CLK_Unhalted.Ref: */
84#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b
85#define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2)
86
87extern void set_perf_counter_pending(void);
88
89#define clear_perf_counter_pending() do { } while (0)
90#define test_perf_counter_pending() (0)
91
92#ifdef CONFIG_PERF_COUNTERS
93extern void init_hw_perf_counters(void);
94extern void perf_counters_lapic_init(int nmi);
95#else
96static inline void init_hw_perf_counters(void) { }
97static inline void perf_counters_lapic_init(int nmi) { }
98#endif
99
100#endif /* _ASM_X86_PERF_COUNTER_H */
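The cpuid10_eax/cpuid10_edx unions above describe how the architectural PMU is enumerated from CPUID leaf 0xA. The sketch below probes that leaf from userspace with GCC's __get_cpuid() and decodes it through the same union layouts; it only mirrors what the kernel-side init code does and is not part of the patch.

#include <stdio.h>
#include <cpuid.h>

union cpuid10_eax {
	struct {
		unsigned int version_id:8;
		unsigned int num_counters:8;
		unsigned int bit_width:8;
		unsigned int mask_length:8;
	} split;
	unsigned int full;
};

union cpuid10_edx {
	struct {
		unsigned int num_counters_fixed:4;
		unsigned int reserved:28;
	} split;
	unsigned int full;
};

int main(void)
{
	unsigned int eax, ebx, ecx, edx;
	union cpuid10_eax ax;
	union cpuid10_edx dx;

	if (!__get_cpuid(0xa, &eax, &ebx, &ecx, &edx))
		return 1;		/* leaf 0xA not available */
	ax.full = eax;
	dx.full = edx;
	printf("version %u, %u generic counters (%u bits), %u fixed\n",
	       ax.split.version_id, ax.split.num_counters,
	       ax.split.bit_width, dx.split.num_counters_fixed);
	return 0;
}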
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 6e72d74cf8dc..732a30706153 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -340,6 +340,8 @@
340#define __NR_inotify_init1 332 340#define __NR_inotify_init1 332
341#define __NR_preadv 333 341#define __NR_preadv 333
342#define __NR_pwritev 334 342#define __NR_pwritev 334
343#define __NR_rt_tgsigqueueinfo 335
344#define __NR_perf_counter_open 336
343 345
344#ifdef __KERNEL__ 346#ifdef __KERNEL__
345 347
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index f81829462325..900e1617e672 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -657,7 +657,10 @@ __SYSCALL(__NR_inotify_init1, sys_inotify_init1)
657__SYSCALL(__NR_preadv, sys_preadv) 657__SYSCALL(__NR_preadv, sys_preadv)
658#define __NR_pwritev 296 658#define __NR_pwritev 296
659__SYSCALL(__NR_pwritev, sys_pwritev) 659__SYSCALL(__NR_pwritev, sys_pwritev)
660 660#define __NR_rt_tgsigqueueinfo 297
661__SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
662#define __NR_perf_counter_open 298
663__SYSCALL(__NR_perf_counter_open, sys_perf_counter_open)
661 664
662#ifndef __NO_STUBS 665#ifndef __NO_STUBS
663#define __ARCH_WANT_OLD_READDIR 666#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index f2870920f246..e9021a908020 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -34,6 +34,7 @@
34#include <linux/smp.h> 34#include <linux/smp.h>
35#include <linux/mm.h> 35#include <linux/mm.h>
36 36
37#include <asm/perf_counter.h>
37#include <asm/pgalloc.h> 38#include <asm/pgalloc.h>
38#include <asm/atomic.h> 39#include <asm/atomic.h>
39#include <asm/mpspec.h> 40#include <asm/mpspec.h>
@@ -761,6 +762,8 @@ static void local_apic_timer_interrupt(void)
761 inc_irq_stat(apic_timer_irqs); 762 inc_irq_stat(apic_timer_irqs);
762 763
763 evt->event_handler(evt); 764 evt->event_handler(evt);
765
766 perf_counter_unthrottle();
764} 767}
765 768
766/* 769/*
@@ -1133,6 +1136,7 @@ void __cpuinit setup_local_APIC(void)
1133 apic_write(APIC_ESR, 0); 1136 apic_write(APIC_ESR, 0);
1134 } 1137 }
1135#endif 1138#endif
1139 perf_counters_lapic_init(0);
1136 1140
1137 preempt_disable(); 1141 preempt_disable();
1138 1142
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 4e242f9a06e4..3efcb2b96a15 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -1,5 +1,5 @@
1# 1#
2# Makefile for x86-compatible CPU details and quirks 2# Makefile for x86-compatible CPU details, features and quirks
3# 3#
4 4
5# Don't trace early stages of a secondary CPU boot 5# Don't trace early stages of a secondary CPU boot
@@ -23,11 +23,13 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
23obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o 23obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
24obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o 24obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
25 25
26obj-$(CONFIG_X86_MCE) += mcheck/ 26obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
27obj-$(CONFIG_MTRR) += mtrr/
28obj-$(CONFIG_CPU_FREQ) += cpufreq/
29 27
30obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o 28obj-$(CONFIG_X86_MCE) += mcheck/
29obj-$(CONFIG_MTRR) += mtrr/
30obj-$(CONFIG_CPU_FREQ) += cpufreq/
31
32obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
31 33
32quiet_cmd_mkcapflags = MKCAP $@ 34quiet_cmd_mkcapflags = MKCAP $@
33 cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@ 35 cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c1caefc82e62..591012fb949f 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -13,6 +13,7 @@
13#include <linux/io.h> 13#include <linux/io.h>
14 14
15#include <asm/stackprotector.h> 15#include <asm/stackprotector.h>
16#include <asm/perf_counter.h>
16#include <asm/mmu_context.h> 17#include <asm/mmu_context.h>
17#include <asm/hypervisor.h> 18#include <asm/hypervisor.h>
18#include <asm/processor.h> 19#include <asm/processor.h>
@@ -854,6 +855,7 @@ void __init identify_boot_cpu(void)
854#else 855#else
855 vgetcpu_set_mode(); 856 vgetcpu_set_mode();
856#endif 857#endif
858 init_hw_perf_counters();
857} 859}
858 860
859void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) 861void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
new file mode 100644
index 000000000000..5bfd30ab3920
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -0,0 +1,1242 @@
1/*
2 * Performance counter x86 architecture code
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2009 Jaswinder Singh Rajput
7 * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
8 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
9 *
10 * For licencing details see kernel-base/COPYING
11 */
12
13#include <linux/perf_counter.h>
14#include <linux/capability.h>
15#include <linux/notifier.h>
16#include <linux/hardirq.h>
17#include <linux/kprobes.h>
18#include <linux/module.h>
19#include <linux/kdebug.h>
20#include <linux/sched.h>
21#include <linux/uaccess.h>
22
23#include <asm/apic.h>
24#include <asm/stacktrace.h>
25#include <asm/nmi.h>
26
27static u64 perf_counter_mask __read_mostly;
28
29struct cpu_hw_counters {
30 struct perf_counter *counters[X86_PMC_IDX_MAX];
31 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
32 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
33 unsigned long interrupts;
34 int enabled;
35};
36
37/*
38 * struct x86_pmu - generic x86 pmu
39 */
40struct x86_pmu {
41 const char *name;
42 int version;
43 int (*handle_irq)(struct pt_regs *, int);
44 void (*disable_all)(void);
45 void (*enable_all)(void);
46 void (*enable)(struct hw_perf_counter *, int);
47 void (*disable)(struct hw_perf_counter *, int);
48 unsigned eventsel;
49 unsigned perfctr;
50 u64 (*event_map)(int);
51 u64 (*raw_event)(u64);
52 int max_events;
53 int num_counters;
54 int num_counters_fixed;
55 int counter_bits;
56 u64 counter_mask;
57 u64 max_period;
58 u64 intel_ctrl;
59};
60
61static struct x86_pmu x86_pmu __read_mostly;
62
63static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
64 .enabled = 1,
65};
66
67/*
68 * Intel PerfMon v3. Used on Core2 and later.
69 */
70static const u64 intel_perfmon_event_map[] =
71{
72 [PERF_COUNT_CPU_CYCLES] = 0x003c,
73 [PERF_COUNT_INSTRUCTIONS] = 0x00c0,
74 [PERF_COUNT_CACHE_REFERENCES] = 0x4f2e,
75 [PERF_COUNT_CACHE_MISSES] = 0x412e,
76 [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x00c4,
77 [PERF_COUNT_BRANCH_MISSES] = 0x00c5,
78 [PERF_COUNT_BUS_CYCLES] = 0x013c,
79};
80
81static u64 intel_pmu_event_map(int event)
82{
83 return intel_perfmon_event_map[event];
84}
85
86static u64 intel_pmu_raw_event(u64 event)
87{
88#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
89#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
90#define CORE_EVNTSEL_COUNTER_MASK 0xFF000000ULL
91
92#define CORE_EVNTSEL_MASK \
93 (CORE_EVNTSEL_EVENT_MASK | \
94 CORE_EVNTSEL_UNIT_MASK | \
95 CORE_EVNTSEL_COUNTER_MASK)
96
97 return event & CORE_EVNTSEL_MASK;
98}
99
100/*
101 * AMD Performance Monitor K7 and later.
102 */
103static const u64 amd_perfmon_event_map[] =
104{
105 [PERF_COUNT_CPU_CYCLES] = 0x0076,
106 [PERF_COUNT_INSTRUCTIONS] = 0x00c0,
107 [PERF_COUNT_CACHE_REFERENCES] = 0x0080,
108 [PERF_COUNT_CACHE_MISSES] = 0x0081,
109 [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x00c4,
110 [PERF_COUNT_BRANCH_MISSES] = 0x00c5,
111};
112
113static u64 amd_pmu_event_map(int event)
114{
115 return amd_perfmon_event_map[event];
116}
117
118static u64 amd_pmu_raw_event(u64 event)
119{
120#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL
121#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL
122#define K7_EVNTSEL_COUNTER_MASK 0x0FF000000ULL
123
124#define K7_EVNTSEL_MASK \
125 (K7_EVNTSEL_EVENT_MASK | \
126 K7_EVNTSEL_UNIT_MASK | \
127 K7_EVNTSEL_COUNTER_MASK)
128
129 return event & K7_EVNTSEL_MASK;
130}
131
132/*
133 * Propagate counter elapsed time into the generic counter.
134 * Can only be executed on the CPU where the counter is active.
135 * Returns the delta events processed.
136 */
137static u64
138x86_perf_counter_update(struct perf_counter *counter,
139 struct hw_perf_counter *hwc, int idx)
140{
141 int shift = 64 - x86_pmu.counter_bits;
142 u64 prev_raw_count, new_raw_count;
143 s64 delta;
144
145 /*
146 * Careful: an NMI might modify the previous counter value.
147 *
148 * Our tactic to handle this is to first atomically read and
149 * exchange a new raw count - then add that new-prev delta
150 * count to the generic counter atomically:
151 */
152again:
153 prev_raw_count = atomic64_read(&hwc->prev_count);
154 rdmsrl(hwc->counter_base + idx, new_raw_count);
155
156 if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
157 new_raw_count) != prev_raw_count)
158 goto again;
159
160 /*
161 * Now we have the new raw value and have updated the prev
162 * timestamp already. We can now calculate the elapsed delta
163 * (counter-)time and add that to the generic counter.
164 *
165 * Careful, not all hw sign-extends above the physical width
166 * of the count.
167 */
168 delta = (new_raw_count << shift) - (prev_raw_count << shift);
169 delta >>= shift;
170
171 atomic64_add(delta, &counter->count);
172 atomic64_sub(delta, &hwc->period_left);
173
174 return new_raw_count;
175}
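The shift pair in x86_perf_counter_update() sign-extends raw values narrower than 64 bits so that a counter wrap still yields the right positive delta. A standalone worked example, assuming a 40-bit counter width and made-up raw values, is sketched below; it is illustrative only.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	int shift = 64 - 40;			/* 40-bit counter */
	uint64_t prev_raw = 0xfffffffff0ull;	/* just before the 2^40 wrap */
	uint64_t new_raw  = 0x0000000010ull;	/* shortly after the wrap */
	int64_t delta;

	delta = (new_raw << shift) - (prev_raw << shift);
	delta >>= shift;
	printf("delta = %lld\n", (long long)delta);	/* prints 32 */
	return 0;
}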
176
177static atomic_t active_counters;
178static DEFINE_MUTEX(pmc_reserve_mutex);
179
180static bool reserve_pmc_hardware(void)
181{
182 int i;
183
184 if (nmi_watchdog == NMI_LOCAL_APIC)
185 disable_lapic_nmi_watchdog();
186
187 for (i = 0; i < x86_pmu.num_counters; i++) {
188 if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
189 goto perfctr_fail;
190 }
191
192 for (i = 0; i < x86_pmu.num_counters; i++) {
193 if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
194 goto eventsel_fail;
195 }
196
197 return true;
198
199eventsel_fail:
200 for (i--; i >= 0; i--)
201 release_evntsel_nmi(x86_pmu.eventsel + i);
202
203 i = x86_pmu.num_counters;
204
205perfctr_fail:
206 for (i--; i >= 0; i--)
207 release_perfctr_nmi(x86_pmu.perfctr + i);
208
209 if (nmi_watchdog == NMI_LOCAL_APIC)
210 enable_lapic_nmi_watchdog();
211
212 return false;
213}
214
215static void release_pmc_hardware(void)
216{
217 int i;
218
219 for (i = 0; i < x86_pmu.num_counters; i++) {
220 release_perfctr_nmi(x86_pmu.perfctr + i);
221 release_evntsel_nmi(x86_pmu.eventsel + i);
222 }
223
224 if (nmi_watchdog == NMI_LOCAL_APIC)
225 enable_lapic_nmi_watchdog();
226}
227
228static void hw_perf_counter_destroy(struct perf_counter *counter)
229{
230 if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) {
231 release_pmc_hardware();
232 mutex_unlock(&pmc_reserve_mutex);
233 }
234}
235
236static inline int x86_pmu_initialized(void)
237{
238 return x86_pmu.handle_irq != NULL;
239}
240
241/*
242 * Setup the hardware configuration for a given hw_event_type
243 */
244static int __hw_perf_counter_init(struct perf_counter *counter)
245{
246 struct perf_counter_hw_event *hw_event = &counter->hw_event;
247 struct hw_perf_counter *hwc = &counter->hw;
248 int err;
249
250 if (!x86_pmu_initialized())
251 return -ENODEV;
252
253 err = 0;
254 if (!atomic_inc_not_zero(&active_counters)) {
255 mutex_lock(&pmc_reserve_mutex);
256 if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware())
257 err = -EBUSY;
258 else
259 atomic_inc(&active_counters);
260 mutex_unlock(&pmc_reserve_mutex);
261 }
262 if (err)
263 return err;
264
265 /*
266 * Generate PMC IRQs:
267 * (keep 'enabled' bit clear for now)
268 */
269 hwc->config = ARCH_PERFMON_EVENTSEL_INT;
270
271 /*
272 * Count user and OS events unless requested not to.
273 */
274 if (!hw_event->exclude_user)
275 hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
276 if (!hw_event->exclude_kernel)
277 hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
278
279 /*
280 * If privileged enough, allow NMI events:
281 */
282 hwc->nmi = 0;
283 if (hw_event->nmi) {
284 if (sysctl_perf_counter_priv && !capable(CAP_SYS_ADMIN))
285 return -EACCES;
286 hwc->nmi = 1;
287 }
288
289 if (!hwc->irq_period)
290 hwc->irq_period = x86_pmu.max_period;
291
292 atomic64_set(&hwc->period_left,
293 min(x86_pmu.max_period, hwc->irq_period));
294
295 /*
296 * Raw event type provides the config in the event structure
297 */
298 if (perf_event_raw(hw_event)) {
299 hwc->config |= x86_pmu.raw_event(perf_event_config(hw_event));
300 } else {
301 if (perf_event_id(hw_event) >= x86_pmu.max_events)
302 return -EINVAL;
303 /*
304 * The generic map:
305 */
306 hwc->config |= x86_pmu.event_map(perf_event_id(hw_event));
307 }
308
309 counter->destroy = hw_perf_counter_destroy;
310
311 return 0;
312}
313
314static void intel_pmu_disable_all(void)
315{
316 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
317}
318
319static void amd_pmu_disable_all(void)
320{
321 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
322 int idx;
323
324 if (!cpuc->enabled)
325 return;
326
327 cpuc->enabled = 0;
328 /*
329 * ensure we write the disable before we start disabling the
330 * counters proper, so that amd_pmu_enable_counter() does the
331 * right thing.
332 */
333 barrier();
334
335 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
336 u64 val;
337
338 if (!test_bit(idx, cpuc->active_mask))
339 continue;
340 rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
341 if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE))
342 continue;
343 val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
344 wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
345 }
346}
347
348void hw_perf_disable(void)
349{
350 if (!x86_pmu_initialized())
351 return;
352 return x86_pmu.disable_all();
353}
354
355static void intel_pmu_enable_all(void)
356{
357 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
358}
359
360static void amd_pmu_enable_all(void)
361{
362 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
363 int idx;
364
365 if (cpuc->enabled)
366 return;
367
368 cpuc->enabled = 1;
369 barrier();
370
371 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
372 u64 val;
373
374 if (!test_bit(idx, cpuc->active_mask))
375 continue;
376 rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
377 if (val & ARCH_PERFMON_EVENTSEL0_ENABLE)
378 continue;
379 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
380 wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
381 }
382}
383
384void hw_perf_enable(void)
385{
386 if (!x86_pmu_initialized())
387 return;
388 x86_pmu.enable_all();
389}
390
391static inline u64 intel_pmu_get_status(void)
392{
393 u64 status;
394
395 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
396
397 return status;
398}
399
400static inline void intel_pmu_ack_status(u64 ack)
401{
402 wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
403}
404
405static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
406{
407 int err;
408 err = checking_wrmsrl(hwc->config_base + idx,
409 hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
410}
411
412static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
413{
414 int err;
415 err = checking_wrmsrl(hwc->config_base + idx,
416 hwc->config);
417}
418
419static inline void
420intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx)
421{
422 int idx = __idx - X86_PMC_IDX_FIXED;
423 u64 ctrl_val, mask;
424 int err;
425
426 mask = 0xfULL << (idx * 4);
427
428 rdmsrl(hwc->config_base, ctrl_val);
429 ctrl_val &= ~mask;
430 err = checking_wrmsrl(hwc->config_base, ctrl_val);
431}
432
433static inline void
434intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
435{
436 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
437 intel_pmu_disable_fixed(hwc, idx);
438 return;
439 }
440
441 x86_pmu_disable_counter(hwc, idx);
442}
443
444static inline void
445amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
446{
447 x86_pmu_disable_counter(hwc, idx);
448}
449
450static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
451
452/*
453 * Set the next IRQ period, based on the hwc->period_left value.
454 * To be called with the counter disabled in hw:
455 */
456static void
457x86_perf_counter_set_period(struct perf_counter *counter,
458 struct hw_perf_counter *hwc, int idx)
459{
460 s64 left = atomic64_read(&hwc->period_left);
461 s64 period = min(x86_pmu.max_period, hwc->irq_period);
462 int err;
463
464 /*
465 * If we are way outside a reasonable range then just skip forward:
466 */
467 if (unlikely(left <= -period)) {
468 left = period;
469 atomic64_set(&hwc->period_left, left);
470 }
471
472 if (unlikely(left <= 0)) {
473 left += period;
474 atomic64_set(&hwc->period_left, left);
475 }
476 /*
477 * Quirk: certain CPUs don't like it if just 1 event is left:
478 */
479 if (unlikely(left < 2))
480 left = 2;
481
482 per_cpu(prev_left[idx], smp_processor_id()) = left;
483
484 /*
485 * The hw counter starts counting from this counter offset,
486 * mark it to be able to extract future deltas:
487 */
488 atomic64_set(&hwc->prev_count, (u64)-left);
489
490 err = checking_wrmsrl(hwc->counter_base + idx,
491 (u64)(-left) & x86_pmu.counter_mask);
492}
493
494static inline void
495intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx)
496{
497 int idx = __idx - X86_PMC_IDX_FIXED;
498 u64 ctrl_val, bits, mask;
499 int err;
500
501 /*
502 * Enable IRQ generation (0x8),
503 * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
504 * if requested:
505 */
506 bits = 0x8ULL;
507 if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
508 bits |= 0x2;
509 if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
510 bits |= 0x1;
511 bits <<= (idx * 4);
512 mask = 0xfULL << (idx * 4);
513
514 rdmsrl(hwc->config_base, ctrl_val);
515 ctrl_val &= ~mask;
516 ctrl_val |= bits;
517 err = checking_wrmsrl(hwc->config_base, ctrl_val);
518}
519
520static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
521{
522 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
523 intel_pmu_enable_fixed(hwc, idx);
524 return;
525 }
526
527 x86_pmu_enable_counter(hwc, idx);
528}
529
530static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
531{
532 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
533
534 if (cpuc->enabled)
535 x86_pmu_enable_counter(hwc, idx);
536 else
537 x86_pmu_disable_counter(hwc, idx);
538}
539
540static int
541fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
542{
543 unsigned int event;
544
545 if (!x86_pmu.num_counters_fixed)
546 return -1;
547
548 if (unlikely(hwc->nmi))
549 return -1;
550
551 event = hwc->config & ARCH_PERFMON_EVENT_MASK;
552
553 if (unlikely(event == x86_pmu.event_map(PERF_COUNT_INSTRUCTIONS)))
554 return X86_PMC_IDX_FIXED_INSTRUCTIONS;
555 if (unlikely(event == x86_pmu.event_map(PERF_COUNT_CPU_CYCLES)))
556 return X86_PMC_IDX_FIXED_CPU_CYCLES;
557 if (unlikely(event == x86_pmu.event_map(PERF_COUNT_BUS_CYCLES)))
558 return X86_PMC_IDX_FIXED_BUS_CYCLES;
559
560 return -1;
561}
562
563/*
564 * Find a PMC slot for the freshly enabled / scheduled in counter:
565 */
566static int x86_pmu_enable(struct perf_counter *counter)
567{
568 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
569 struct hw_perf_counter *hwc = &counter->hw;
570 int idx;
571
572 idx = fixed_mode_idx(counter, hwc);
573 if (idx >= 0) {
574 /*
575 * Try to get the fixed counter, if that is already taken
576 * then try to get a generic counter:
577 */
578 if (test_and_set_bit(idx, cpuc->used_mask))
579 goto try_generic;
580
581 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
582 /*
583 * We set it so that counter_base + idx in wrmsr/rdmsr maps to
584 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
585 */
586 hwc->counter_base =
587 MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
588 hwc->idx = idx;
589 } else {
590 idx = hwc->idx;
591 /* Try to get the previous generic counter again */
592 if (test_and_set_bit(idx, cpuc->used_mask)) {
593try_generic:
594 idx = find_first_zero_bit(cpuc->used_mask,
595 x86_pmu.num_counters);
596 if (idx == x86_pmu.num_counters)
597 return -EAGAIN;
598
599 set_bit(idx, cpuc->used_mask);
600 hwc->idx = idx;
601 }
602 hwc->config_base = x86_pmu.eventsel;
603 hwc->counter_base = x86_pmu.perfctr;
604 }
605
606 perf_counters_lapic_init(hwc->nmi);
607
608 x86_pmu.disable(hwc, idx);
609
610 cpuc->counters[idx] = counter;
611 set_bit(idx, cpuc->active_mask);
612
613 x86_perf_counter_set_period(counter, hwc, idx);
614 x86_pmu.enable(hwc, idx);
615
616 return 0;
617}
618
619void perf_counter_print_debug(void)
620{
621 u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
622 struct cpu_hw_counters *cpuc;
623 unsigned long flags;
624 int cpu, idx;
625
626 if (!x86_pmu.num_counters)
627 return;
628
629 local_irq_save(flags);
630
631 cpu = smp_processor_id();
632 cpuc = &per_cpu(cpu_hw_counters, cpu);
633
634 if (x86_pmu.version >= 2) {
635 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
636 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
637 rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
638 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
639
640 pr_info("\n");
641 pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl);
642 pr_info("CPU#%d: status: %016llx\n", cpu, status);
643 pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
644 pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
645 }
646 pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask);
647
648 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
649 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
650 rdmsrl(x86_pmu.perfctr + idx, pmc_count);
651
652 prev_left = per_cpu(prev_left[idx], cpu);
653
654 pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n",
655 cpu, idx, pmc_ctrl);
656 pr_info("CPU#%d: gen-PMC%d count: %016llx\n",
657 cpu, idx, pmc_count);
658 pr_info("CPU#%d: gen-PMC%d left: %016llx\n",
659 cpu, idx, prev_left);
660 }
661 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
662 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
663
664 pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
665 cpu, idx, pmc_count);
666 }
667 local_irq_restore(flags);
668}
669
670static void x86_pmu_disable(struct perf_counter *counter)
671{
672 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
673 struct hw_perf_counter *hwc = &counter->hw;
674 int idx = hwc->idx;
675
676 /*
677 * Must be done before we disable, otherwise the nmi handler
678 * could reenable again:
679 */
680 clear_bit(idx, cpuc->active_mask);
681 x86_pmu.disable(hwc, idx);
682
683 /*
684 * Make sure the cleared pointer becomes visible before we
685 * (potentially) free the counter:
686 */
687 barrier();
688
689 /*
690 * Drain the remaining delta count out of a counter
691 * that we are disabling:
692 */
693 x86_perf_counter_update(counter, hwc, idx);
694 cpuc->counters[idx] = NULL;
695 clear_bit(idx, cpuc->used_mask);
696}
697
698/*
699 * Save and restart an expired counter. Called by NMI contexts,
700 * so it has to be careful about preempting normal counter ops:
701 */
702static void intel_pmu_save_and_restart(struct perf_counter *counter)
703{
704 struct hw_perf_counter *hwc = &counter->hw;
705 int idx = hwc->idx;
706
707 x86_perf_counter_update(counter, hwc, idx);
708 x86_perf_counter_set_period(counter, hwc, idx);
709
710 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
711 intel_pmu_enable_counter(hwc, idx);
712}
713
714/*
715 * Maximum interrupt frequency of 100KHz per CPU
716 */
717#define PERFMON_MAX_INTERRUPTS (100000/HZ)
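(Worked example: assuming HZ is 1000 and that perf_counter_unthrottle() is run once per timer tick, this allows at most 100000/1000 = 100 PMU interrupts per tick before the handlers stop re-enabling the counters.)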
718
719/*
720 * This handler is triggered by the local APIC, so the APIC IRQ handling
721 * rules apply:
722 */
723static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
724{
725 struct cpu_hw_counters *cpuc;
726 struct cpu_hw_counters;
727 int bit, cpu, loops;
728 u64 ack, status;
729
730 cpu = smp_processor_id();
731 cpuc = &per_cpu(cpu_hw_counters, cpu);
732
733 perf_disable();
734 status = intel_pmu_get_status();
735 if (!status) {
736 perf_enable();
737 return 0;
738 }
739
740 loops = 0;
741again:
742 if (++loops > 100) {
743 WARN_ONCE(1, "perfcounters: irq loop stuck!\n");
744 return 1;
745 }
746
747 inc_irq_stat(apic_perf_irqs);
748 ack = status;
749 for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
750 struct perf_counter *counter = cpuc->counters[bit];
751
752 clear_bit(bit, (unsigned long *) &status);
753 if (!test_bit(bit, cpuc->active_mask))
754 continue;
755
756 intel_pmu_save_and_restart(counter);
757 if (perf_counter_overflow(counter, nmi, regs, 0))
758 intel_pmu_disable_counter(&counter->hw, bit);
759 }
760
761 intel_pmu_ack_status(ack);
762
763 /*
764 * Repeat if there is more work to be done:
765 */
766 status = intel_pmu_get_status();
767 if (status)
768 goto again;
769
770 if (++cpuc->interrupts != PERFMON_MAX_INTERRUPTS)
771 perf_enable();
772
773 return 1;
774}
775
776static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
777{
778 int cpu, idx, throttle = 0, handled = 0;
779 struct cpu_hw_counters *cpuc;
780 struct perf_counter *counter;
781 struct hw_perf_counter *hwc;
782 u64 val;
783
784 cpu = smp_processor_id();
785 cpuc = &per_cpu(cpu_hw_counters, cpu);
786
787 if (++cpuc->interrupts == PERFMON_MAX_INTERRUPTS) {
788 throttle = 1;
789 __perf_disable();
790 cpuc->enabled = 0;
791 barrier();
792 }
793
794 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
795 int disable = 0;
796
797 if (!test_bit(idx, cpuc->active_mask))
798 continue;
799
800 counter = cpuc->counters[idx];
801 hwc = &counter->hw;
802
803 if (counter->hw_event.nmi != nmi)
804 goto next;
805
806 val = x86_perf_counter_update(counter, hwc, idx);
807 if (val & (1ULL << (x86_pmu.counter_bits - 1)))
808 goto next;
809
810 /* counter overflow */
811 x86_perf_counter_set_period(counter, hwc, idx);
812 handled = 1;
813 inc_irq_stat(apic_perf_irqs);
814 disable = perf_counter_overflow(counter, nmi, regs, 0);
815
816next:
817 if (disable || throttle)
818 amd_pmu_disable_counter(hwc, idx);
819 }
820
821 return handled;
822}
823
824void perf_counter_unthrottle(void)
825{
826 struct cpu_hw_counters *cpuc;
827
828 if (!x86_pmu_initialized())
829 return;
830
831 cpuc = &__get_cpu_var(cpu_hw_counters);
832 if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) {
833 /*
834 * Clear them before re-enabling irqs/NMIs again:
835 */
836 cpuc->interrupts = 0;
837 perf_enable();
838 } else {
839 cpuc->interrupts = 0;
840 }
841}
842
843void smp_perf_counter_interrupt(struct pt_regs *regs)
844{
845 irq_enter();
846 apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
847 ack_APIC_irq();
848 x86_pmu.handle_irq(regs, 0);
849 irq_exit();
850}
851
852void smp_perf_pending_interrupt(struct pt_regs *regs)
853{
854 irq_enter();
855 ack_APIC_irq();
856 inc_irq_stat(apic_pending_irqs);
857 perf_counter_do_pending();
858 irq_exit();
859}
860
861void set_perf_counter_pending(void)
862{
863 apic->send_IPI_self(LOCAL_PENDING_VECTOR);
864}
865
866void perf_counters_lapic_init(int nmi)
867{
868 u32 apic_val;
869
870 if (!x86_pmu_initialized())
871 return;
872
873 /*
874 * Enable the performance counter vector in the APIC LVT:
875 */
876 apic_val = apic_read(APIC_LVTERR);
877
878 apic_write(APIC_LVTERR, apic_val | APIC_LVT_MASKED);
879 if (nmi)
880 apic_write(APIC_LVTPC, APIC_DM_NMI);
881 else
882 apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
883 apic_write(APIC_LVTERR, apic_val);
884}
885
886static int __kprobes
887perf_counter_nmi_handler(struct notifier_block *self,
888 unsigned long cmd, void *__args)
889{
890 struct die_args *args = __args;
891 struct pt_regs *regs;
892
893 if (!atomic_read(&active_counters))
894 return NOTIFY_DONE;
895
896 switch (cmd) {
897 case DIE_NMI:
898 case DIE_NMI_IPI:
899 break;
900
901 default:
902 return NOTIFY_DONE;
903 }
904
905 regs = args->regs;
906
907 apic_write(APIC_LVTPC, APIC_DM_NMI);
908 /*
909 * Can't rely on the handled return value to say it was our NMI, two
910 * counters could trigger 'simultaneously' raising two back-to-back NMIs.
911 *
912 * If the first NMI handles both, the latter will be empty and daze
913 * the CPU.
914 */
915 x86_pmu.handle_irq(regs, 1);
916
917 return NOTIFY_STOP;
918}
919
920static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
921 .notifier_call = perf_counter_nmi_handler,
922 .next = NULL,
923 .priority = 1
924};
925
926static struct x86_pmu intel_pmu = {
927 .name = "Intel",
928 .handle_irq = intel_pmu_handle_irq,
929 .disable_all = intel_pmu_disable_all,
930 .enable_all = intel_pmu_enable_all,
931 .enable = intel_pmu_enable_counter,
932 .disable = intel_pmu_disable_counter,
933 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
934 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
935 .event_map = intel_pmu_event_map,
936 .raw_event = intel_pmu_raw_event,
937 .max_events = ARRAY_SIZE(intel_perfmon_event_map),
938 /*
939 * Intel PMCs cannot be accessed sanely above 32 bit width,
940 * so we install an artificial 1<<31 period regardless of
941 * the generic counter period:
942 */
943 .max_period = (1ULL << 31) - 1,
944};
945
946static struct x86_pmu amd_pmu = {
947 .name = "AMD",
948 .handle_irq = amd_pmu_handle_irq,
949 .disable_all = amd_pmu_disable_all,
950 .enable_all = amd_pmu_enable_all,
951 .enable = amd_pmu_enable_counter,
952 .disable = amd_pmu_disable_counter,
953 .eventsel = MSR_K7_EVNTSEL0,
954 .perfctr = MSR_K7_PERFCTR0,
955 .event_map = amd_pmu_event_map,
956 .raw_event = amd_pmu_raw_event,
957 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
958 .num_counters = 4,
959 .counter_bits = 48,
960 .counter_mask = (1ULL << 48) - 1,
961 /* use highest bit to detect overflow */
962 .max_period = (1ULL << 47) - 1,
963};
964
965static int intel_pmu_init(void)
966{
967 union cpuid10_edx edx;
968 union cpuid10_eax eax;
969 unsigned int unused;
970 unsigned int ebx;
971 int version;
972
973 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
974 return -ENODEV;
975
976 /*
977 * Check whether the Architectural PerfMon supports
978 * Branch Misses Retired Event or not.
979 */
980 cpuid(10, &eax.full, &ebx, &unused, &edx.full);
981 if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
982 return -ENODEV;
983
984 version = eax.split.version_id;
985 if (version < 2)
986 return -ENODEV;
987
988 x86_pmu = intel_pmu;
989 x86_pmu.version = version;
990 x86_pmu.num_counters = eax.split.num_counters;
991
992 /*
993 * Quirk: v2 perfmon does not report fixed-purpose counters, so
994 * assume at least 3 counters:
995 */
996 x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
997
998 x86_pmu.counter_bits = eax.split.bit_width;
999 x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1;
1000
1001 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
1002
1003 return 0;
1004}
1005
1006static int amd_pmu_init(void)
1007{
1008 x86_pmu = amd_pmu;
1009 return 0;
1010}
1011
1012void __init init_hw_perf_counters(void)
1013{
1014 int err;
1015
1016 switch (boot_cpu_data.x86_vendor) {
1017 case X86_VENDOR_INTEL:
1018 err = intel_pmu_init();
1019 break;
1020 case X86_VENDOR_AMD:
1021 err = amd_pmu_init();
1022 break;
1023 default:
1024 return;
1025 }
1026 if (err != 0)
1027 return;
1028
1029 pr_info("%s Performance Monitoring support detected.\n", x86_pmu.name);
1030 pr_info("... version: %d\n", x86_pmu.version);
1031 pr_info("... bit width: %d\n", x86_pmu.counter_bits);
1032
1033 pr_info("... num counters: %d\n", x86_pmu.num_counters);
1034 if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
1035 x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
1036 WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
1037 x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
1038 }
1039 perf_counter_mask = (1 << x86_pmu.num_counters) - 1;
1040 perf_max_counters = x86_pmu.num_counters;
1041
1042 pr_info("... value mask: %016Lx\n", x86_pmu.counter_mask);
1043 pr_info("... max period: %016Lx\n", x86_pmu.max_period);
1044
1045 if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
1046 x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
1047 WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
1048 x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
1049 }
1050 pr_info("... fixed counters: %d\n", x86_pmu.num_counters_fixed);
1051
1052 perf_counter_mask |=
1053 ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
1054
1055 pr_info("... counter mask: %016Lx\n", perf_counter_mask);
1056
1057 perf_counters_lapic_init(0);
1058 register_die_notifier(&perf_counter_nmi_notifier);
1059}
1060
1061static inline void x86_pmu_read(struct perf_counter *counter)
1062{
1063 x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
1064}
1065
1066static const struct pmu pmu = {
1067 .enable = x86_pmu_enable,
1068 .disable = x86_pmu_disable,
1069 .read = x86_pmu_read,
1070};
1071
1072const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
1073{
1074 int err;
1075
1076 err = __hw_perf_counter_init(counter);
1077 if (err)
1078 return ERR_PTR(err);
1079
1080 return &pmu;
1081}
1082
1083/*
1084 * callchain support
1085 */
1086
1087static inline
1088void callchain_store(struct perf_callchain_entry *entry, unsigned long ip)
1089{
1090 if (entry->nr < MAX_STACK_DEPTH)
1091 entry->ip[entry->nr++] = ip;
1092}
1093
1094static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry);
1095static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry);
1096
1097
1098static void
1099backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
1100{
1101 /* Ignore warnings */
1102}
1103
1104static void backtrace_warning(void *data, char *msg)
1105{
1106 /* Ignore warnings */
1107}
1108
1109static int backtrace_stack(void *data, char *name)
1110{
1111 /* Don't bother with IRQ stacks for now */
1112 return -1;
1113}
1114
1115static void backtrace_address(void *data, unsigned long addr, int reliable)
1116{
1117 struct perf_callchain_entry *entry = data;
1118
1119 if (reliable)
1120 callchain_store(entry, addr);
1121}
1122
1123static const struct stacktrace_ops backtrace_ops = {
1124 .warning = backtrace_warning,
1125 .warning_symbol = backtrace_warning_symbol,
1126 .stack = backtrace_stack,
1127 .address = backtrace_address,
1128};
1129
1130static void
1131perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
1132{
1133 unsigned long bp;
1134 char *stack;
1135 int nr = entry->nr;
1136
1137 callchain_store(entry, instruction_pointer(regs));
1138
1139 stack = ((char *)regs + sizeof(struct pt_regs));
1140#ifdef CONFIG_FRAME_POINTER
1141 bp = frame_pointer(regs);
1142#else
1143 bp = 0;
1144#endif
1145
1146 dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry);
1147
1148 entry->kernel = entry->nr - nr;
1149}
1150
1151
1152struct stack_frame {
1153 const void __user *next_fp;
1154 unsigned long return_address;
1155};
1156
1157static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
1158{
1159 int ret;
1160
1161 if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
1162 return 0;
1163
1164 ret = 1;
1165 pagefault_disable();
1166 if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
1167 ret = 0;
1168 pagefault_enable();
1169
1170 return ret;
1171}
1172
1173static void
1174perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
1175{
1176 struct stack_frame frame;
1177 const void __user *fp;
1178 int nr = entry->nr;
1179
1180 regs = (struct pt_regs *)current->thread.sp0 - 1;
1181 fp = (void __user *)regs->bp;
1182
1183 callchain_store(entry, regs->ip);
1184
1185 while (entry->nr < MAX_STACK_DEPTH) {
1186 frame.next_fp = NULL;
1187 frame.return_address = 0;
1188
1189 if (!copy_stack_frame(fp, &frame))
1190 break;
1191
1192 if ((unsigned long)fp < user_stack_pointer(regs))
1193 break;
1194
1195 callchain_store(entry, frame.return_address);
1196 fp = frame.next_fp;
1197 }
1198
1199 entry->user = entry->nr - nr;
1200}
1201
1202static void
1203perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
1204{
1205 int is_user;
1206
1207 if (!regs)
1208 return;
1209
1210 is_user = user_mode(regs);
1211
1212 if (!current || current->pid == 0)
1213 return;
1214
1215 if (is_user && current->state != TASK_RUNNING)
1216 return;
1217
1218 if (!is_user)
1219 perf_callchain_kernel(regs, entry);
1220
1221 if (current->mm)
1222 perf_callchain_user(regs, entry);
1223}
1224
1225struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
1226{
1227 struct perf_callchain_entry *entry;
1228
1229 if (in_nmi())
1230 entry = &__get_cpu_var(nmi_entry);
1231 else
1232 entry = &__get_cpu_var(irq_entry);
1233
1234 entry->nr = 0;
1235 entry->hv = 0;
1236 entry->kernel = 0;
1237 entry->user = 0;
1238
1239 perf_do_callchain(regs, entry);
1240
1241 return entry;
1242}
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index f6c70a164e32..d6f5b9fbde32 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -19,8 +19,8 @@
19#include <linux/nmi.h> 19#include <linux/nmi.h>
20#include <linux/kprobes.h> 20#include <linux/kprobes.h>
21 21
22#include <asm/genapic.h> 22#include <asm/apic.h>
23#include <asm/intel_arch_perfmon.h> 23#include <asm/perf_counter.h>
24 24
25struct nmi_watchdog_ctlblk { 25struct nmi_watchdog_ctlblk {
26 unsigned int cccr_msr; 26 unsigned int cccr_msr;
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 38946c6e8433..891004619142 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1025,6 +1025,13 @@ apicinterrupt ERROR_APIC_VECTOR \
1025apicinterrupt SPURIOUS_APIC_VECTOR \ 1025apicinterrupt SPURIOUS_APIC_VECTOR \
1026 spurious_interrupt smp_spurious_interrupt 1026 spurious_interrupt smp_spurious_interrupt
1027 1027
1028#ifdef CONFIG_PERF_COUNTERS
1029apicinterrupt LOCAL_PERF_VECTOR \
1030 perf_counter_interrupt smp_perf_counter_interrupt
1031apicinterrupt LOCAL_PENDING_VECTOR \
1032 perf_pending_interrupt smp_perf_pending_interrupt
1033#endif
1034
1028/* 1035/*
1029 * Exception entry points. 1036 * Exception entry points.
1030 */ 1037 */
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index c3fe010d74c8..8279fb8df17f 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -63,6 +63,14 @@ static int show_other_interrupts(struct seq_file *p, int prec)
63 for_each_online_cpu(j) 63 for_each_online_cpu(j)
64 seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); 64 seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
65 seq_printf(p, " Spurious interrupts\n"); 65 seq_printf(p, " Spurious interrupts\n");
66 seq_printf(p, "%*s: ", prec, "CNT");
67 for_each_online_cpu(j)
68 seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
69 seq_printf(p, " Performance counter interrupts\n");
70 seq_printf(p, "%*s: ", prec, "PND");
71 for_each_online_cpu(j)
72 seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs);
73 seq_printf(p, " Performance pending work\n");
66#endif 74#endif
67 if (generic_interrupt_extension) { 75 if (generic_interrupt_extension) {
68 seq_printf(p, "%*s: ", prec, "PLT"); 76 seq_printf(p, "%*s: ", prec, "PLT");
@@ -166,6 +174,8 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
166#ifdef CONFIG_X86_LOCAL_APIC 174#ifdef CONFIG_X86_LOCAL_APIC
167 sum += irq_stats(cpu)->apic_timer_irqs; 175 sum += irq_stats(cpu)->apic_timer_irqs;
168 sum += irq_stats(cpu)->irq_spurious_count; 176 sum += irq_stats(cpu)->irq_spurious_count;
177 sum += irq_stats(cpu)->apic_perf_irqs;
178 sum += irq_stats(cpu)->apic_pending_irqs;
169#endif 179#endif
170 if (generic_interrupt_extension) 180 if (generic_interrupt_extension)
171 sum += irq_stats(cpu)->generic_irqs; 181 sum += irq_stats(cpu)->generic_irqs;
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 368b0a8836f9..3190a6b961e6 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -118,28 +118,8 @@ int vector_used_by_percpu_irq(unsigned int vector)
118 return 0; 118 return 0;
119} 119}
120 120
121/* Overridden in paravirt.c */ 121static void __init smp_intr_init(void)
122void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
123
124void __init native_init_IRQ(void)
125{ 122{
126 int i;
127
128 /* Execute any quirks before the call gates are initialised: */
129 x86_quirk_pre_intr_init();
130
131 /*
132 * Cover the whole vector space, no vector can escape
133 * us. (some of these will be overridden and become
134 * 'special' SMP interrupts)
135 */
136 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
137 /* SYSCALL_VECTOR was reserved in trap_init. */
138 if (i != SYSCALL_VECTOR)
139 set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
140 }
141
142
143#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP) 123#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP)
144 /* 124 /*
145 * The reschedule interrupt is a CPU-to-CPU reschedule-helper 125 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
@@ -168,6 +148,11 @@ void __init native_init_IRQ(void)
168 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); 148 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
169 set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); 149 set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
170#endif 150#endif
151}
152
153static void __init apic_intr_init(void)
154{
155 smp_intr_init();
171 156
172#ifdef CONFIG_X86_LOCAL_APIC 157#ifdef CONFIG_X86_LOCAL_APIC
173 /* self generated IPI for local APIC timer */ 158 /* self generated IPI for local APIC timer */
@@ -179,12 +164,41 @@ void __init native_init_IRQ(void)
179 /* IPI vectors for APIC spurious and error interrupts */ 164 /* IPI vectors for APIC spurious and error interrupts */
180 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 165 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
181 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); 166 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
182#endif 167# ifdef CONFIG_PERF_COUNTERS
168 alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
169 alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
170# endif
183 171
184#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) 172# ifdef CONFIG_X86_MCE_P4THERMAL
185 /* thermal monitor LVT interrupt */ 173 /* thermal monitor LVT interrupt */
186 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); 174 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
175# endif
187#endif 176#endif
177}
178
179/* Overridden in paravirt.c */
180void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
181
182void __init native_init_IRQ(void)
183{
184 int i;
185
186 /* Execute any quirks before the call gates are initialised: */
187 x86_quirk_pre_intr_init();
188
189 apic_intr_init();
190
191 /*
192 * Cover the whole vector space, no vector can escape
193 * us. (some of these will be overridden and become
194 * 'special' SMP interrupts)
195 */
196 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
197 int vector = FIRST_EXTERNAL_VECTOR + i;
198 /* SYSCALL_VECTOR was reserved in trap_init. */
199 if (!test_bit(vector, used_vectors))
200 set_intr_gate(vector, interrupt[i]);
201 }
188 202
189 if (!acpi_ioapic) 203 if (!acpi_ioapic)
190 setup_irq(2, &irq2); 204 setup_irq(2, &irq2);
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 8cd10537fd46..53ceb26f80ff 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -152,6 +152,12 @@ static void __init apic_intr_init(void)
152 /* IPI vectors for APIC spurious and error interrupts */ 152 /* IPI vectors for APIC spurious and error interrupts */
153 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 153 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
154 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); 154 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
155
156 /* Performance monitoring interrupt: */
157#ifdef CONFIG_PERF_COUNTERS
158 alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
159 alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
160#endif
155} 161}
156 162
157void __init native_init_IRQ(void) 163void __init native_init_IRQ(void)
@@ -159,6 +165,9 @@ void __init native_init_IRQ(void)
159 int i; 165 int i;
160 166
161 init_ISA_irqs(); 167 init_ISA_irqs();
168
169 apic_intr_init();
170
162 /* 171 /*
163 * Cover the whole vector space, no vector can escape 172 * Cover the whole vector space, no vector can escape
164 * us. (some of these will be overridden and become 173 * us. (some of these will be overridden and become
@@ -166,12 +175,10 @@ void __init native_init_IRQ(void)
166 */ 175 */
167 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { 176 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
168 int vector = FIRST_EXTERNAL_VECTOR + i; 177 int vector = FIRST_EXTERNAL_VECTOR + i;
169 if (vector != IA32_SYSCALL_VECTOR) 178 if (!test_bit(vector, used_vectors))
170 set_intr_gate(vector, interrupt[i]); 179 set_intr_gate(vector, interrupt[i]);
171 } 180 }
172 181
173 apic_intr_init();
174
175 if (!acpi_ioapic) 182 if (!acpi_ioapic)
176 setup_irq(2, &irq2); 183 setup_irq(2, &irq2);
177} 184}
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 14425166b8e3..0a813b17b172 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -6,7 +6,6 @@
6 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes 6 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
7 * 2000-2002 x86-64 support by Andi Kleen 7 * 2000-2002 x86-64 support by Andi Kleen
8 */ 8 */
9
10#include <linux/sched.h> 9#include <linux/sched.h>
11#include <linux/mm.h> 10#include <linux/mm.h>
12#include <linux/smp.h> 11#include <linux/smp.h>
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index ff5c8736b491..d51321ddafda 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -334,3 +334,5 @@ ENTRY(sys_call_table)
334 .long sys_inotify_init1 334 .long sys_inotify_init1
335 .long sys_preadv 335 .long sys_preadv
336 .long sys_pwritev 336 .long sys_pwritev
337 .long sys_rt_tgsigqueueinfo /* 335 */
338 .long sys_perf_counter_open
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a1d288327ff0..2cc162e09c4b 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -945,8 +945,13 @@ void __init trap_init(void)
945#endif 945#endif
946 set_intr_gate(19, &simd_coprocessor_error); 946 set_intr_gate(19, &simd_coprocessor_error);
947 947
948 /* Reserve all the builtin and the syscall vector: */
949 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
950 set_bit(i, used_vectors);
951
948#ifdef CONFIG_IA32_EMULATION 952#ifdef CONFIG_IA32_EMULATION
949 set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); 953 set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
954 set_bit(IA32_SYSCALL_VECTOR, used_vectors);
950#endif 955#endif
951 956
952#ifdef CONFIG_X86_32 957#ifdef CONFIG_X86_32
@@ -963,17 +968,9 @@ void __init trap_init(void)
963 } 968 }
964 969
965 set_system_trap_gate(SYSCALL_VECTOR, &system_call); 970 set_system_trap_gate(SYSCALL_VECTOR, &system_call);
966#endif
967
968 /* Reserve all the builtin and the syscall vector: */
969 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
970 set_bit(i, used_vectors);
971
972#ifdef CONFIG_X86_64
973 set_bit(IA32_SYSCALL_VECTOR, used_vectors);
974#else
975 set_bit(SYSCALL_VECTOR, used_vectors); 971 set_bit(SYSCALL_VECTOR, used_vectors);
976#endif 972#endif
973
977 /* 974 /*
978 * Should be a barrier for any external CPU state: 975 * Should be a barrier for any external CPU state:
979 */ 976 */
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index a03b7279efa0..6f9df2babe48 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -27,6 +27,7 @@
27#include <linux/tty.h> 27#include <linux/tty.h>
28#include <linux/smp.h> 28#include <linux/smp.h>
29#include <linux/mm.h> 29#include <linux/mm.h>
30#include <linux/perf_counter.h>
30 31
31#include <asm-generic/sections.h> 32#include <asm-generic/sections.h>
32 33
@@ -1044,6 +1045,8 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
1044 if (unlikely(error_code & PF_RSVD)) 1045 if (unlikely(error_code & PF_RSVD))
1045 pgtable_bad(regs, error_code, address); 1046 pgtable_bad(regs, error_code, address);
1046 1047
1048 perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs, address);
1049
1047 /* 1050 /*
1048 * If we're in an interrupt, have no user context or are running 1051 * If we're in an interrupt, have no user context or are running
1049 * in an atomic region then we must not take the fault: 1052 * in an atomic region then we must not take the fault:
@@ -1137,10 +1140,15 @@ good_area:
1137 return; 1140 return;
1138 } 1141 }
1139 1142
1140 if (fault & VM_FAULT_MAJOR) 1143 if (fault & VM_FAULT_MAJOR) {
1141 tsk->maj_flt++; 1144 tsk->maj_flt++;
1142 else 1145 perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0,
1146 regs, address);
1147 } else {
1143 tsk->min_flt++; 1148 tsk->min_flt++;
1149 perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0,
1150 regs, address);
1151 }
1144 1152
1145 check_v8086_mode(regs, address, tsk); 1153 check_v8086_mode(regs, address, tsk);
1146 1154
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 202864ad49a7..c638685136e1 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -40,8 +40,9 @@ static int profile_exceptions_notify(struct notifier_block *self,
40 40
41 switch (val) { 41 switch (val) {
42 case DIE_NMI: 42 case DIE_NMI:
43 if (model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu))) 43 case DIE_NMI_IPI:
44 ret = NOTIFY_STOP; 44 model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu));
45 ret = NOTIFY_STOP;
45 break; 46 break;
46 default: 47 default:
47 break; 48 break;
@@ -134,7 +135,7 @@ static void nmi_cpu_setup(void *dummy)
134static struct notifier_block profile_exceptions_nb = { 135static struct notifier_block profile_exceptions_nb = {
135 .notifier_call = profile_exceptions_notify, 136 .notifier_call = profile_exceptions_notify,
136 .next = NULL, 137 .next = NULL,
137 .priority = 0 138 .priority = 2
138}; 139};
139 140
140static int nmi_setup(void) 141static int nmi_setup(void)
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index 10131fbdaada..4da7230b3d17 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -18,7 +18,7 @@
18#include <asm/msr.h> 18#include <asm/msr.h>
19#include <asm/apic.h> 19#include <asm/apic.h>
20#include <asm/nmi.h> 20#include <asm/nmi.h>
21#include <asm/intel_arch_perfmon.h> 21#include <asm/perf_counter.h>
22 22
23#include "op_x86_model.h" 23#include "op_x86_model.h"
24#include "op_counter.h" 24#include "op_counter.h"
@@ -136,6 +136,13 @@ static int ppro_check_ctrs(struct pt_regs * const regs,
136 u64 val; 136 u64 val;
137 int i; 137 int i;
138 138
139 /*
140 * This can happen if perf counters are in use when
141 * we steal the die notifier NMI.
142 */
143 if (unlikely(!reset_value))
144 goto out;
145
139 for (i = 0 ; i < num_counters; ++i) { 146 for (i = 0 ; i < num_counters; ++i) {
140 if (!reset_value[i]) 147 if (!reset_value[i])
141 continue; 148 continue;
@@ -146,6 +153,7 @@ static int ppro_check_ctrs(struct pt_regs * const regs,
146 } 153 }
147 } 154 }
148 155
156out:
149 /* Only P6 based Pentium M need to re-unmask the apic vector but it 157 /* Only P6 based Pentium M need to re-unmask the apic vector but it
150 * doesn't hurt other P6 variant */ 158 * doesn't hurt other P6 variant */
151 apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED); 159 apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c
index d6a807f4077d..39a05b5fa9cb 100644
--- a/drivers/char/sysrq.c
+++ b/drivers/char/sysrq.c
@@ -25,6 +25,7 @@
25#include <linux/kbd_kern.h> 25#include <linux/kbd_kern.h>
26#include <linux/proc_fs.h> 26#include <linux/proc_fs.h>
27#include <linux/quotaops.h> 27#include <linux/quotaops.h>
28#include <linux/perf_counter.h>
28#include <linux/kernel.h> 29#include <linux/kernel.h>
29#include <linux/module.h> 30#include <linux/module.h>
30#include <linux/suspend.h> 31#include <linux/suspend.h>
@@ -243,6 +244,7 @@ static void sysrq_handle_showregs(int key, struct tty_struct *tty)
243 struct pt_regs *regs = get_irq_regs(); 244 struct pt_regs *regs = get_irq_regs();
244 if (regs) 245 if (regs)
245 show_regs(regs); 246 show_regs(regs);
247 perf_counter_print_debug();
246} 248}
247static struct sysrq_key_op sysrq_showregs_op = { 249static struct sysrq_key_op sysrq_showregs_op = {
248 .handler = sysrq_handle_showregs, 250 .handler = sysrq_handle_showregs,
diff --git a/fs/exec.c b/fs/exec.c
index 895823d0149d..ad4f28c2327a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -33,6 +33,7 @@
33#include <linux/string.h> 33#include <linux/string.h>
34#include <linux/init.h> 34#include <linux/init.h>
35#include <linux/pagemap.h> 35#include <linux/pagemap.h>
36#include <linux/perf_counter.h>
36#include <linux/highmem.h> 37#include <linux/highmem.h>
37#include <linux/spinlock.h> 38#include <linux/spinlock.h>
38#include <linux/key.h> 39#include <linux/key.h>
@@ -922,6 +923,7 @@ void set_task_comm(struct task_struct *tsk, char *buf)
922 task_lock(tsk); 923 task_lock(tsk);
923 strlcpy(tsk->comm, buf, sizeof(tsk->comm)); 924 strlcpy(tsk->comm, buf, sizeof(tsk->comm));
924 task_unlock(tsk); 925 task_unlock(tsk);
926 perf_counter_comm(tsk);
925} 927}
926 928
927int flush_old_exec(struct linux_binprm * bprm) 929int flush_old_exec(struct linux_binprm * bprm)
@@ -990,6 +992,13 @@ int flush_old_exec(struct linux_binprm * bprm)
990 992
991 current->personality &= ~bprm->per_clear; 993 current->personality &= ~bprm->per_clear;
992 994
995 /*
996 * Flush performance counters when crossing a
997 * security domain:
998 */
999 if (!get_dumpable(current->mm))
1000 perf_counter_exit_task(current);
1001
993 /* An exec changes our domain. We are no longer part of the thread 1002 /* An exec changes our domain. We are no longer part of the thread
994 group */ 1003 group */
995 1004
diff --git a/include/linux/compat.h b/include/linux/compat.h
index f2ded21f9a3c..af931ee43dd8 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -222,6 +222,8 @@ int copy_siginfo_from_user32(siginfo_t *to, struct compat_siginfo __user *from);
222int copy_siginfo_to_user32(struct compat_siginfo __user *to, siginfo_t *from); 222int copy_siginfo_to_user32(struct compat_siginfo __user *to, siginfo_t *from);
223int get_compat_sigevent(struct sigevent *event, 223int get_compat_sigevent(struct sigevent *event,
224 const struct compat_sigevent __user *u_event); 224 const struct compat_sigevent __user *u_event);
225long compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig,
226 struct compat_siginfo __user *uinfo);
225 227
226static inline int compat_timeval_compare(struct compat_timeval *lhs, 228static inline int compat_timeval_compare(struct compat_timeval *lhs,
227 struct compat_timeval *rhs) 229 struct compat_timeval *rhs)
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index d87247d2641f..503afaa0afa7 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -108,6 +108,18 @@ extern struct group_info init_groups;
108 108
109extern struct cred init_cred; 109extern struct cred init_cred;
110 110
111#ifdef CONFIG_PERF_COUNTERS
112# define INIT_PERF_COUNTERS(tsk) \
113 .perf_counter_ctx.counter_list = \
114 LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list), \
115 .perf_counter_ctx.event_list = \
116 LIST_HEAD_INIT(tsk.perf_counter_ctx.event_list), \
117 .perf_counter_ctx.lock = \
118 __SPIN_LOCK_UNLOCKED(tsk.perf_counter_ctx.lock),
119#else
120# define INIT_PERF_COUNTERS(tsk)
121#endif
122
111/* 123/*
112 * INIT_TASK is used to set up the first task table, touch at 124 * INIT_TASK is used to set up the first task table, touch at
113 * your own risk!. Base=0, limit=0x1fffff (=2MB) 125 * your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -171,6 +183,7 @@ extern struct cred init_cred;
171 }, \ 183 }, \
172 .dirties = INIT_PROP_LOCAL_SINGLE(dirties), \ 184 .dirties = INIT_PROP_LOCAL_SINGLE(dirties), \
173 INIT_IDS \ 185 INIT_IDS \
186 INIT_PERF_COUNTERS(tsk) \
174 INIT_TRACE_IRQFLAGS \ 187 INIT_TRACE_IRQFLAGS \
175 INIT_LOCKDEP \ 188 INIT_LOCKDEP \
176 INIT_FTRACE_GRAPH \ 189 INIT_FTRACE_GRAPH \
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 0c8b89f28a95..a77c6007dc99 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -81,7 +81,12 @@ static inline unsigned int kstat_irqs(unsigned int irq)
81 return sum; 81 return sum;
82} 82}
83 83
84
85/*
86 * Lock/unlock the current runqueue - to extract task statistics:
87 */
84extern unsigned long long task_delta_exec(struct task_struct *); 88extern unsigned long long task_delta_exec(struct task_struct *);
89
85extern void account_user_time(struct task_struct *, cputime_t, cputime_t); 90extern void account_user_time(struct task_struct *, cputime_t, cputime_t);
86extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t); 91extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t);
87extern void account_steal_time(cputime_t); 92extern void account_steal_time(cputime_t);
diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 3069ec7e0ab8..878cab4f5fcc 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -150,5 +150,6 @@ extern int __must_check mutex_lock_killable(struct mutex *lock);
150 */ 150 */
151extern int mutex_trylock(struct mutex *lock); 151extern int mutex_trylock(struct mutex *lock);
152extern void mutex_unlock(struct mutex *lock); 152extern void mutex_unlock(struct mutex *lock);
153extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
153 154
154#endif 155#endif
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
new file mode 100644
index 000000000000..c8c1dfc22c93
--- /dev/null
+++ b/include/linux/perf_counter.h
@@ -0,0 +1,642 @@
1/*
2 * Performance counters:
3 *
4 * Copyright(C) 2008, Thomas Gleixner <tglx@linutronix.de>
5 * Copyright(C) 2008, Red Hat, Inc., Ingo Molnar
6 *
7 * Data type definitions, declarations, prototypes.
8 *
9 * Started by: Thomas Gleixner and Ingo Molnar
10 *
11 * For licencing details see kernel-base/COPYING
12 */
13#ifndef _LINUX_PERF_COUNTER_H
14#define _LINUX_PERF_COUNTER_H
15
16#include <linux/types.h>
17#include <linux/ioctl.h>
18#include <asm/byteorder.h>
19
20/*
21 * User-space ABI bits:
22 */
23
24/*
25 * hw_event.type
26 */
27enum perf_event_types {
28 PERF_TYPE_HARDWARE = 0,
29 PERF_TYPE_SOFTWARE = 1,
30 PERF_TYPE_TRACEPOINT = 2,
31
32 /*
33 * available TYPE space, raw is the max value.
34 */
35
36 PERF_TYPE_RAW = 128,
37};
38
39/*
40 * Generalized performance counter event types, used by the hw_event.event_id
41 * parameter of the sys_perf_counter_open() syscall:
42 */
43enum hw_event_ids {
44 /*
45 * Common hardware events, generalized by the kernel:
46 */
47 PERF_COUNT_CPU_CYCLES = 0,
48 PERF_COUNT_INSTRUCTIONS = 1,
49 PERF_COUNT_CACHE_REFERENCES = 2,
50 PERF_COUNT_CACHE_MISSES = 3,
51 PERF_COUNT_BRANCH_INSTRUCTIONS = 4,
52 PERF_COUNT_BRANCH_MISSES = 5,
53 PERF_COUNT_BUS_CYCLES = 6,
54
55 PERF_HW_EVENTS_MAX = 7,
56};
57
58/*
59 * Special "software" counters provided by the kernel, even if the hardware
60 * does not support performance counters. These counters measure various
61 * physical and sw events of the kernel (and allow the profiling of them as
62 * well):
63 */
64enum sw_event_ids {
65 PERF_COUNT_CPU_CLOCK = 0,
66 PERF_COUNT_TASK_CLOCK = 1,
67 PERF_COUNT_PAGE_FAULTS = 2,
68 PERF_COUNT_CONTEXT_SWITCHES = 3,
69 PERF_COUNT_CPU_MIGRATIONS = 4,
70 PERF_COUNT_PAGE_FAULTS_MIN = 5,
71 PERF_COUNT_PAGE_FAULTS_MAJ = 6,
72
73 PERF_SW_EVENTS_MAX = 7,
74};
75
76#define __PERF_COUNTER_MASK(name) \
77 (((1ULL << PERF_COUNTER_##name##_BITS) - 1) << \
78 PERF_COUNTER_##name##_SHIFT)
79
80#define PERF_COUNTER_RAW_BITS 1
81#define PERF_COUNTER_RAW_SHIFT 63
82#define PERF_COUNTER_RAW_MASK __PERF_COUNTER_MASK(RAW)
83
84#define PERF_COUNTER_CONFIG_BITS 63
85#define PERF_COUNTER_CONFIG_SHIFT 0
86#define PERF_COUNTER_CONFIG_MASK __PERF_COUNTER_MASK(CONFIG)
87
88#define PERF_COUNTER_TYPE_BITS 7
89#define PERF_COUNTER_TYPE_SHIFT 56
90#define PERF_COUNTER_TYPE_MASK __PERF_COUNTER_MASK(TYPE)
91
92#define PERF_COUNTER_EVENT_BITS 56
93#define PERF_COUNTER_EVENT_SHIFT 0
94#define PERF_COUNTER_EVENT_MASK __PERF_COUNTER_MASK(EVENT)
95
96/*
97 * Bits that can be set in hw_event.record_type to request information
98 * in the overflow packets.
99 */
100enum perf_counter_record_format {
101 PERF_RECORD_IP = 1U << 0,
102 PERF_RECORD_TID = 1U << 1,
103 PERF_RECORD_TIME = 1U << 2,
104 PERF_RECORD_ADDR = 1U << 3,
105 PERF_RECORD_GROUP = 1U << 4,
106 PERF_RECORD_CALLCHAIN = 1U << 5,
107 PERF_RECORD_CONFIG = 1U << 6,
108 PERF_RECORD_CPU = 1U << 7,
109};
110
111/*
112 * Bits that can be set in hw_event.read_format to request that
113 * reads on the counter should return the indicated quantities,
114 * in increasing order of bit value, after the counter value.
115 */
116enum perf_counter_read_format {
117 PERF_FORMAT_TOTAL_TIME_ENABLED = 1,
118 PERF_FORMAT_TOTAL_TIME_RUNNING = 2,
119};
120
121/*
122 * Hardware event to monitor via a performance monitoring counter:
123 */
124struct perf_counter_hw_event {
125 /*
126 * The MSB of the config word signifies if the rest contains cpu
127	 * specific (raw) counter configuration data; if unset, the next
128 * 7 bits are an event type and the rest of the bits are the event
129 * identifier.
130 */
131 __u64 config;
132
133 union {
134 __u64 irq_period;
135 __u64 irq_freq;
136 };
137
138 __u32 record_type;
139 __u32 read_format;
140
141 __u64 disabled : 1, /* off by default */
142 nmi : 1, /* NMI sampling */
143 inherit : 1, /* children inherit it */
144 pinned : 1, /* must always be on PMU */
145 exclusive : 1, /* only group on PMU */
146 exclude_user : 1, /* don't count user */
147 exclude_kernel : 1, /* ditto kernel */
148 exclude_hv : 1, /* ditto hypervisor */
149 exclude_idle : 1, /* don't count when idle */
150 mmap : 1, /* include mmap data */
151 munmap : 1, /* include munmap data */
152 comm : 1, /* include comm data */
153 freq : 1, /* use freq, not period */
154
155 __reserved_1 : 51;
156
157 __u32 extra_config_len;
158 __u32 wakeup_events; /* wakeup every n events */
159
160 __u64 __reserved_2;
161 __u64 __reserved_3;
162};
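As a rough illustration of the config layout described above -- a user-space sketch only; make_config() is a hypothetical helper, not part of this ABI:

	static __u64 make_config(__u64 type, __u64 event_id)
	{
		/* the type goes in bits 56-62, the event id in bits 0-55 */
		return (type << PERF_COUNTER_TYPE_SHIFT) |
		       (event_id & PERF_COUNTER_EVENT_MASK);
	}

	/* e.g.: hw_event.config = make_config(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS); */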
163
164/*
165 * Ioctls that can be done on a perf counter fd:
166 */
167#define PERF_COUNTER_IOC_ENABLE _IOW('$', 0, u32)
168#define PERF_COUNTER_IOC_DISABLE _IOW('$', 1, u32)
169#define PERF_COUNTER_IOC_REFRESH _IOW('$', 2, u32)
170#define PERF_COUNTER_IOC_RESET _IOW('$', 3, u32)
171
172enum perf_counter_ioc_flags {
173 PERF_IOC_FLAG_GROUP = 1U << 0,
174};
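A hedged usage sketch for these ioctls (the fd is assumed to come from the perf counter syscall; error handling omitted):

	#include <sys/ioctl.h>

	static void counter_pause_resume(int fd)
	{
		ioctl(fd, PERF_COUNTER_IOC_DISABLE, 0);
		/* ... run the code that should not be counted ... */
		ioctl(fd, PERF_COUNTER_IOC_ENABLE, 0);
	}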
175
176/*
177 * Structure of the page that can be mapped via mmap
178 */
179struct perf_counter_mmap_page {
180 __u32 version; /* version number of this structure */
181 __u32 compat_version; /* lowest version this is compat with */
182
183 /*
184 * Bits needed to read the hw counters in user-space.
185 *
186 * u32 seq;
187 * s64 count;
188 *
189 * do {
190 * seq = pc->lock;
191 *
192 * barrier()
193 * if (pc->index) {
194 * count = pmc_read(pc->index - 1);
195 * count += pc->offset;
196 * } else
197 * goto regular_read;
198 *
199 * barrier();
200 * } while (pc->lock != seq);
201 *
202	 * NOTE: for obvious reasons this only works on self-monitoring
203 * processes.
204 */
205 __u32 lock; /* seqlock for synchronization */
206 __u32 index; /* hardware counter identifier */
207 __s64 offset; /* add to hardware counter value */
208
209 /*
210 * Control data for the mmap() data buffer.
211 *
212	 * User-space readers should issue an rmb(), on SMP-capable
213	 * platforms, after reading this value -- see perf_counter_wakeup().
214 */
215 __u32 data_head; /* head in the data section */
216};
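A minimal self-monitoring read sketch following the sequence documented above; it assumes the counter fd has been mmap()ed, that RDPMC is usable from user mode, and that gcc-style x86 inline asm is acceptable -- none of which this header itself guarantees:

	#include <stdint.h>

	static inline uint64_t read_pmc(uint32_t counter)
	{
		uint32_t low, high;

		asm volatile("rdpmc" : "=a" (low), "=d" (high) : "c" (counter));
		return low | ((uint64_t)high << 32);
	}

	static uint64_t mmap_read_self(volatile struct perf_counter_mmap_page *pc)
	{
		uint32_t seq;
		uint64_t count;

		do {
			seq = pc->lock;
			asm volatile("" ::: "memory");		/* barrier() */

			if (!pc->index)
				return 0;	/* no hw counter: fall back to read() on the fd */
			count = read_pmc(pc->index - 1) + pc->offset;

			asm volatile("" ::: "memory");
		} while (pc->lock != seq);

		return count;
	}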
217
218#define PERF_EVENT_MISC_CPUMODE_MASK (3 << 0)
219#define PERF_EVENT_MISC_CPUMODE_UNKNOWN (0 << 0)
220#define PERF_EVENT_MISC_KERNEL (1 << 0)
221#define PERF_EVENT_MISC_USER (2 << 0)
222#define PERF_EVENT_MISC_HYPERVISOR (3 << 0)
223#define PERF_EVENT_MISC_OVERFLOW (1 << 2)
224
225struct perf_event_header {
226 __u32 type;
227 __u16 misc;
228 __u16 size;
229};
230
231enum perf_event_type {
232
233 /*
234 * The MMAP events record the PROT_EXEC mappings so that we can
235 * correlate userspace IPs to code. They have the following structure:
236 *
237 * struct {
238 * struct perf_event_header header;
239 *
240 * u32 pid, tid;
241 * u64 addr;
242 * u64 len;
243 * u64 pgoff;
244 * char filename[];
245 * };
246 */
247 PERF_EVENT_MMAP = 1,
248 PERF_EVENT_MUNMAP = 2,
249
250 /*
251 * struct {
252 * struct perf_event_header header;
253 *
254 * u32 pid, tid;
255 * char comm[];
256 * };
257 */
258 PERF_EVENT_COMM = 3,
259
260 /*
261 * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field
262 * will be PERF_RECORD_*
263 *
264 * struct {
265 * struct perf_event_header header;
266 *
267 * { u64 ip; } && PERF_RECORD_IP
268 * { u32 pid, tid; } && PERF_RECORD_TID
269 * { u64 time; } && PERF_RECORD_TIME
270 * { u64 addr; } && PERF_RECORD_ADDR
271 * { u64 config; } && PERF_RECORD_CONFIG
272 * { u32 cpu, res; } && PERF_RECORD_CPU
273 *
274 * { u64 nr;
275 * { u64 event, val; } cnt[nr]; } && PERF_RECORD_GROUP
276 *
277 * { u16 nr,
278 * hv,
279 * kernel,
280 * user;
281 * u64 ips[nr]; } && PERF_RECORD_CALLCHAIN
282 * };
283 */
284};
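A rough sketch of walking the event records; assumptions: data points at the first byte after the metadata page, head is a snapshot of data_head taken after an rmb(), and ring-buffer wrap-around is ignored:

	static void walk_records(unsigned char *data, uint32_t head)
	{
		uint32_t offset = 0;

		while (offset < head) {
			struct perf_event_header *hdr = (void *)(data + offset);

			if (hdr->misc & PERF_EVENT_MISC_OVERFLOW) {
				/* hdr->type carries the PERF_RECORD_* bits that were
				 * requested in hw_event.record_type */
			} else if (hdr->type == PERF_EVENT_COMM) {
				/* u32 pid, tid; then the comm[] string follow */
			}
			offset += hdr->size;
		}
	}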
285
286#ifdef __KERNEL__
287/*
288 * Kernel-internal data types and definitions:
289 */
290
291#ifdef CONFIG_PERF_COUNTERS
292# include <asm/perf_counter.h>
293#endif
294
295#include <linux/list.h>
296#include <linux/mutex.h>
297#include <linux/rculist.h>
298#include <linux/rcupdate.h>
299#include <linux/spinlock.h>
300#include <linux/hrtimer.h>
301#include <linux/fs.h>
302#include <asm/atomic.h>
303
304struct task_struct;
305
306static inline u64 perf_event_raw(struct perf_counter_hw_event *hw_event)
307{
308 return hw_event->config & PERF_COUNTER_RAW_MASK;
309}
310
311static inline u64 perf_event_config(struct perf_counter_hw_event *hw_event)
312{
313 return hw_event->config & PERF_COUNTER_CONFIG_MASK;
314}
315
316static inline u64 perf_event_type(struct perf_counter_hw_event *hw_event)
317{
318 return (hw_event->config & PERF_COUNTER_TYPE_MASK) >>
319 PERF_COUNTER_TYPE_SHIFT;
320}
321
322static inline u64 perf_event_id(struct perf_counter_hw_event *hw_event)
323{
324 return hw_event->config & PERF_COUNTER_EVENT_MASK;
325}
326
327/**
328 * struct hw_perf_counter - performance counter hardware details:
329 */
330struct hw_perf_counter {
331#ifdef CONFIG_PERF_COUNTERS
332 union {
333 struct { /* hardware */
334 u64 config;
335 unsigned long config_base;
336 unsigned long counter_base;
337 int nmi;
338 int idx;
339 };
340 union { /* software */
341 atomic64_t count;
342 struct hrtimer hrtimer;
343 };
344 };
345 atomic64_t prev_count;
346 u64 irq_period;
347 atomic64_t period_left;
348 u64 interrupts;
349#endif
350};
351
352struct perf_counter;
353
354/**
355 * struct pmu - generic performance monitoring unit
356 */
357struct pmu {
358 int (*enable) (struct perf_counter *counter);
359 void (*disable) (struct perf_counter *counter);
360 void (*read) (struct perf_counter *counter);
361};
362
363/**
364 * enum perf_counter_active_state - the states of a counter
365 */
366enum perf_counter_active_state {
367 PERF_COUNTER_STATE_ERROR = -2,
368 PERF_COUNTER_STATE_OFF = -1,
369 PERF_COUNTER_STATE_INACTIVE = 0,
370 PERF_COUNTER_STATE_ACTIVE = 1,
371};
372
373struct file;
374
375struct perf_mmap_data {
376 struct rcu_head rcu_head;
377 int nr_pages; /* nr of data pages */
378 int nr_locked; /* nr pages mlocked */
379
380 atomic_t poll; /* POLL_ for wakeups */
381 atomic_t head; /* write position */
382 atomic_t events; /* event limit */
383
384 atomic_t done_head; /* completed head */
385 atomic_t lock; /* concurrent writes */
386
387 atomic_t wakeup; /* needs a wakeup */
388
389 struct perf_counter_mmap_page *user_page;
390 void *data_pages[0];
391};
392
393struct perf_pending_entry {
394 struct perf_pending_entry *next;
395 void (*func)(struct perf_pending_entry *);
396};
397
398/**
399 * struct perf_counter - performance counter kernel representation:
400 */
401struct perf_counter {
402#ifdef CONFIG_PERF_COUNTERS
403 struct list_head list_entry;
404 struct list_head event_entry;
405 struct list_head sibling_list;
406 int nr_siblings;
407 struct perf_counter *group_leader;
408 const struct pmu *pmu;
409
410 enum perf_counter_active_state state;
411 enum perf_counter_active_state prev_state;
412 atomic64_t count;
413
414 /*
415 * These are the total time in nanoseconds that the counter
416 * has been enabled (i.e. eligible to run, and the task has
417 * been scheduled in, if this is a per-task counter)
418 * and running (scheduled onto the CPU), respectively.
419 *
420 * They are computed from tstamp_enabled, tstamp_running and
421 * tstamp_stopped when the counter is in INACTIVE or ACTIVE state.
422 */
423 u64 total_time_enabled;
424 u64 total_time_running;
425
426 /*
427 * These are timestamps used for computing total_time_enabled
428 * and total_time_running when the counter is in INACTIVE or
429 * ACTIVE state, measured in nanoseconds from an arbitrary point
430 * in time.
431 * tstamp_enabled: the notional time when the counter was enabled
432 * tstamp_running: the notional time when the counter was scheduled on
433 * tstamp_stopped: in INACTIVE state, the notional time when the
434 * counter was scheduled off.
435 */
436 u64 tstamp_enabled;
437 u64 tstamp_running;
438 u64 tstamp_stopped;
439
440 struct perf_counter_hw_event hw_event;
441 struct hw_perf_counter hw;
442
443 struct perf_counter_context *ctx;
444 struct task_struct *task;
445 struct file *filp;
446
447 struct perf_counter *parent;
448 struct list_head child_list;
449
450 /*
451 * These accumulate total time (in nanoseconds) that children
452 * counters have been enabled and running, respectively.
453 */
454 atomic64_t child_total_time_enabled;
455 atomic64_t child_total_time_running;
456
457 /*
458 * Protect attach/detach and child_list:
459 */
460 struct mutex mutex;
461
462 int oncpu;
463 int cpu;
464
465 /* mmap bits */
466 struct mutex mmap_mutex;
467 atomic_t mmap_count;
468 struct perf_mmap_data *data;
469
470 /* poll related */
471 wait_queue_head_t waitq;
472 struct fasync_struct *fasync;
473
474 /* delayed work for NMIs and such */
475 int pending_wakeup;
476 int pending_kill;
477 int pending_disable;
478 struct perf_pending_entry pending;
479
480 atomic_t event_limit;
481
482 void (*destroy)(struct perf_counter *);
483 struct rcu_head rcu_head;
484#endif
485};
486
487/**
488 * struct perf_counter_context - counter context structure
489 *
490 * Used as a container for task counters and CPU counters as well:
491 */
492struct perf_counter_context {
493#ifdef CONFIG_PERF_COUNTERS
494 /*
495 * Protect the states of the counters in the list,
496 * nr_active, and the list:
497 */
498 spinlock_t lock;
499 /*
500 * Protect the list of counters. Locking either mutex or lock
501 * is sufficient to ensure the list doesn't change; to change
502 * the list you need to lock both the mutex and the spinlock.
503 */
504 struct mutex mutex;
505
506 struct list_head counter_list;
507 struct list_head event_list;
508 int nr_counters;
509 int nr_active;
510 int is_active;
511 struct task_struct *task;
512
513 /*
514 * Context clock, runs when context enabled.
515 */
516 u64 time;
517 u64 timestamp;
518#endif
519};
520
521/**
522 * struct perf_counter_cpu_context - per cpu counter context structure
523 */
524struct perf_cpu_context {
525 struct perf_counter_context ctx;
526 struct perf_counter_context *task_ctx;
527 int active_oncpu;
528 int max_pertask;
529 int exclusive;
530
531 /*
532 * Recursion avoidance:
533 *
534 * task, softirq, irq, nmi context
535 */
536 int recursion[4];
537};
538
539#ifdef CONFIG_PERF_COUNTERS
540
541/*
542 * Set by architecture code:
543 */
544extern int perf_max_counters;
545
546extern const struct pmu *hw_perf_counter_init(struct perf_counter *counter);
547
548extern void perf_counter_task_sched_in(struct task_struct *task, int cpu);
549extern void perf_counter_task_sched_out(struct task_struct *task, int cpu);
550extern void perf_counter_task_tick(struct task_struct *task, int cpu);
551extern void perf_counter_init_task(struct task_struct *child);
552extern void perf_counter_exit_task(struct task_struct *child);
553extern void perf_counter_do_pending(void);
554extern void perf_counter_print_debug(void);
555extern void perf_counter_unthrottle(void);
556extern void __perf_disable(void);
557extern bool __perf_enable(void);
558extern void perf_disable(void);
559extern void perf_enable(void);
560extern int perf_counter_task_disable(void);
561extern int perf_counter_task_enable(void);
562extern int hw_perf_group_sched_in(struct perf_counter *group_leader,
563 struct perf_cpu_context *cpuctx,
564 struct perf_counter_context *ctx, int cpu);
565extern void perf_counter_update_userpage(struct perf_counter *counter);
566
567extern int perf_counter_overflow(struct perf_counter *counter,
568 int nmi, struct pt_regs *regs, u64 addr);
569/*
570 * Return 1 for a software counter, 0 for a hardware counter
571 */
572static inline int is_software_counter(struct perf_counter *counter)
573{
574 return !perf_event_raw(&counter->hw_event) &&
575 perf_event_type(&counter->hw_event) != PERF_TYPE_HARDWARE;
576}
577
578extern void perf_swcounter_event(u32, u64, int, struct pt_regs *, u64);
579
580extern void perf_counter_mmap(unsigned long addr, unsigned long len,
581 unsigned long pgoff, struct file *file);
582
583extern void perf_counter_munmap(unsigned long addr, unsigned long len,
584 unsigned long pgoff, struct file *file);
585
586extern void perf_counter_comm(struct task_struct *tsk);
587
588#define MAX_STACK_DEPTH 255
589
590struct perf_callchain_entry {
591 u16 nr, hv, kernel, user;
592 u64 ip[MAX_STACK_DEPTH];
593};
594
595extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
596
597extern int sysctl_perf_counter_priv;
598extern int sysctl_perf_counter_mlock;
599
600extern void perf_counter_init(void);
601
602#ifndef perf_misc_flags
603#define perf_misc_flags(regs) (user_mode(regs) ? PERF_EVENT_MISC_USER : \
604 PERF_EVENT_MISC_KERNEL)
605#define perf_instruction_pointer(regs) instruction_pointer(regs)
606#endif
607
608#else
609static inline void
610perf_counter_task_sched_in(struct task_struct *task, int cpu) { }
611static inline void
612perf_counter_task_sched_out(struct task_struct *task, int cpu) { }
613static inline void
614perf_counter_task_tick(struct task_struct *task, int cpu) { }
615static inline void perf_counter_init_task(struct task_struct *child) { }
616static inline void perf_counter_exit_task(struct task_struct *child) { }
617static inline void perf_counter_do_pending(void) { }
618static inline void perf_counter_print_debug(void) { }
619static inline void perf_counter_unthrottle(void) { }
620static inline void perf_disable(void) { }
621static inline void perf_enable(void) { }
622static inline int perf_counter_task_disable(void) { return -EINVAL; }
623static inline int perf_counter_task_enable(void) { return -EINVAL; }
624
625static inline void
626perf_swcounter_event(u32 event, u64 nr, int nmi,
627 struct pt_regs *regs, u64 addr) { }
628
629static inline void
630perf_counter_mmap(unsigned long addr, unsigned long len,
631 unsigned long pgoff, struct file *file) { }
632
633static inline void
634perf_counter_munmap(unsigned long addr, unsigned long len,
635 unsigned long pgoff, struct file *file) { }
636
637static inline void perf_counter_comm(struct task_struct *tsk) { }
638static inline void perf_counter_init(void) { }
639#endif
640
641#endif /* __KERNEL__ */
642#endif /* _LINUX_PERF_COUNTER_H */
diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index 48d887e3c6e7..b00df4c79c63 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -85,4 +85,7 @@
85#define PR_SET_TIMERSLACK 29 85#define PR_SET_TIMERSLACK 29
86#define PR_GET_TIMERSLACK 30 86#define PR_GET_TIMERSLACK 30
87 87
88#define PR_TASK_PERF_COUNTERS_DISABLE 31
89#define PR_TASK_PERF_COUNTERS_ENABLE 32
90
88#endif /* _LINUX_PRCTL_H */ 91#endif /* _LINUX_PRCTL_H */
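A hedged usage sketch for the two new prctl commands above: they are expected to map onto perf_counter_task_disable()/perf_counter_task_enable() in the core code added later in this patch. The wrapper below is illustrative only and assumes the patched <linux/prctl.h> values are visible to userspace via <sys/prctl.h>.

#include <sys/prctl.h>

/* Run fn() with all of the calling task's counters stopped. */
static void run_uncounted(void (*fn)(void))
{
	prctl(PR_TASK_PERF_COUNTERS_DISABLE, 0, 0, 0, 0);
	fn();					/* this region is not counted */
	prctl(PR_TASK_PERF_COUNTERS_ENABLE, 0, 0, 0, 0);
}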
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b4c38bc8049c..ff59d1231519 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -71,6 +71,7 @@ struct sched_param {
71#include <linux/path.h> 71#include <linux/path.h>
72#include <linux/compiler.h> 72#include <linux/compiler.h>
73#include <linux/completion.h> 73#include <linux/completion.h>
74#include <linux/perf_counter.h>
74#include <linux/pid.h> 75#include <linux/pid.h>
75#include <linux/percpu.h> 76#include <linux/percpu.h>
76#include <linux/topology.h> 77#include <linux/topology.h>
@@ -137,6 +138,7 @@ extern unsigned long nr_running(void);
137extern unsigned long nr_uninterruptible(void); 138extern unsigned long nr_uninterruptible(void);
138extern unsigned long nr_active(void); 139extern unsigned long nr_active(void);
139extern unsigned long nr_iowait(void); 140extern unsigned long nr_iowait(void);
141extern u64 cpu_nr_migrations(int cpu);
140 142
141extern unsigned long get_parent_ip(unsigned long addr); 143extern unsigned long get_parent_ip(unsigned long addr);
142 144
@@ -672,6 +674,10 @@ struct user_struct {
672 struct work_struct work; 674 struct work_struct work;
673#endif 675#endif
674#endif 676#endif
677
678#ifdef CONFIG_PERF_COUNTERS
679 atomic_long_t locked_vm;
680#endif
675}; 681};
676 682
677extern int uids_sysfs_init(void); 683extern int uids_sysfs_init(void);
@@ -1052,9 +1058,10 @@ struct sched_entity {
1052 u64 last_wakeup; 1058 u64 last_wakeup;
1053 u64 avg_overlap; 1059 u64 avg_overlap;
1054 1060
1061 u64 nr_migrations;
1062
1055 u64 start_runtime; 1063 u64 start_runtime;
1056 u64 avg_wakeup; 1064 u64 avg_wakeup;
1057 u64 nr_migrations;
1058 1065
1059#ifdef CONFIG_SCHEDSTATS 1066#ifdef CONFIG_SCHEDSTATS
1060 u64 wait_start; 1067 u64 wait_start;
@@ -1380,6 +1387,7 @@ struct task_struct {
1380 struct list_head pi_state_list; 1387 struct list_head pi_state_list;
1381 struct futex_pi_state *pi_state_cache; 1388 struct futex_pi_state *pi_state_cache;
1382#endif 1389#endif
1390 struct perf_counter_context perf_counter_ctx;
1383#ifdef CONFIG_NUMA 1391#ifdef CONFIG_NUMA
1384 struct mempolicy *mempolicy; 1392 struct mempolicy *mempolicy;
1385 short il_next; 1393 short il_next;
@@ -2388,6 +2396,13 @@ static inline void inc_syscw(struct task_struct *tsk)
2388#define TASK_SIZE_OF(tsk) TASK_SIZE 2396#define TASK_SIZE_OF(tsk) TASK_SIZE
2389#endif 2397#endif
2390 2398
2399/*
2400 * Call the function if the target task is executing on a CPU right now:
2401 */
2402extern void task_oncpu_function_call(struct task_struct *p,
2403 void (*func) (void *info), void *info);
2404
2405
2391#ifdef CONFIG_MM_OWNER 2406#ifdef CONFIG_MM_OWNER
2392extern void mm_update_next_owner(struct mm_struct *mm); 2407extern void mm_update_next_owner(struct mm_struct *mm);
2393extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p); 2408extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p);
diff --git a/include/linux/signal.h b/include/linux/signal.h
index 84f997f8aa53..c7552836bd95 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -235,6 +235,8 @@ static inline int valid_signal(unsigned long sig)
235extern int next_signal(struct sigpending *pending, sigset_t *mask); 235extern int next_signal(struct sigpending *pending, sigset_t *mask);
236extern int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p); 236extern int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p);
237extern int __group_send_sig_info(int, struct siginfo *, struct task_struct *); 237extern int __group_send_sig_info(int, struct siginfo *, struct task_struct *);
238extern long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig,
239 siginfo_t *info);
238extern long do_sigpending(void __user *, unsigned long); 240extern long do_sigpending(void __user *, unsigned long);
239extern int sigprocmask(int, sigset_t *, sigset_t *); 241extern int sigprocmask(int, sigset_t *, sigset_t *);
240extern int show_unhandled_signals; 242extern int show_unhandled_signals;
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 30520844b8da..79faae950e2e 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -55,6 +55,7 @@ struct compat_timeval;
55struct robust_list_head; 55struct robust_list_head;
56struct getcpu_cache; 56struct getcpu_cache;
57struct old_linux_dirent; 57struct old_linux_dirent;
58struct perf_counter_hw_event;
58 59
59#include <linux/types.h> 60#include <linux/types.h>
60#include <linux/aio_abi.h> 61#include <linux/aio_abi.h>
@@ -755,4 +756,8 @@ asmlinkage long sys_pipe(int __user *);
755 756
756int kernel_execve(const char *filename, char *const argv[], char *const envp[]); 757int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
757 758
759
760asmlinkage long sys_perf_counter_open(
761 const struct perf_counter_hw_event __user *hw_event_uptr,
762 pid_t pid, int cpu, int group_fd, unsigned long flags);
758#endif 763#endif
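A hedged userspace sketch of the syscall declared above. The syscall number and the struct layout come from other parts of this patch (the architecture unistd.h and <linux/perf_counter.h>); which hw_event fields to fill in is deliberately left open, since the field names are not shown in this hunk, and the group_fd == -1 "no group" convention is an assumption.

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_counter.h>		/* struct perf_counter_hw_event, from this patch */

/* Open a counter for the calling task (pid 0), on any CPU (cpu -1),
 * as its own group leader (group_fd -1, assumed convention), flags 0. */
static int open_self_counter(void)
{
	struct perf_counter_hw_event hw_event;

	memset(&hw_event, 0, sizeof(hw_event));
	/* ... select the event to count here (fields omitted on purpose) ... */

	return syscall(__NR_perf_counter_open, &hw_event, 0, -1, -1, 0);
}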
diff --git a/init/Kconfig b/init/Kconfig
index 7be4d3836745..8158f1f44694 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -933,6 +933,41 @@ config AIO
933 by some high performance threaded applications. Disabling 933 by some high performance threaded applications. Disabling
934 this option saves about 7k. 934 this option saves about 7k.
935 935
936config HAVE_PERF_COUNTERS
937 bool
938
939menu "Performance Counters"
940
941config PERF_COUNTERS
942 bool "Kernel Performance Counters"
943 depends on HAVE_PERF_COUNTERS
944 default y
945 select ANON_INODES
946 help
947 Enable kernel support for performance counter hardware.
948
949 Performance counters are special hardware registers available
950 on most modern CPUs. These registers count the number of certain
951 types of hw events, such as instructions executed, cache misses
952 suffered, or branches mispredicted - without slowing down the
953 kernel or applications. These registers can also trigger interrupts
954 when a threshold number of events has passed - and can thus be
955 used to profile the code that runs on that CPU.
956
957 The Linux Performance Counter subsystem provides an abstraction of
958 these hardware capabilities, available via a system call. It
959 provides per task and per CPU counters, and it provides event
960 capabilities on top of those.
961
962 Say Y if unsure.
963
964config EVENT_PROFILE
965 bool "Tracepoint profile sources"
966 depends on PERF_COUNTERS && EVENT_TRACER
967 default y
968
969endmenu
970
936config VM_EVENT_COUNTERS 971config VM_EVENT_COUNTERS
937 default y 972 default y
938 bool "Enable VM event counters for /proc/vmstat" if EMBEDDED 973 bool "Enable VM event counters for /proc/vmstat" if EMBEDDED
diff --git a/kernel/Makefile b/kernel/Makefile
index 42423665660a..e914ca992d70 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -95,6 +95,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/
95obj-$(CONFIG_TRACING) += trace/ 95obj-$(CONFIG_TRACING) += trace/
96obj-$(CONFIG_SMP) += sched_cpupri.o 96obj-$(CONFIG_SMP) += sched_cpupri.o
97obj-$(CONFIG_SLOW_WORK) += slow-work.o 97obj-$(CONFIG_SLOW_WORK) += slow-work.o
98obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
98 99
99ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 100ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
100# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 101# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/compat.c b/kernel/compat.c
index 42d56544460f..f6c204f07ea6 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -882,6 +882,17 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
882 882
883} 883}
884 884
885asmlinkage long
886compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig,
887 struct compat_siginfo __user *uinfo)
888{
889 siginfo_t info;
890
891 if (copy_siginfo_from_user32(&info, uinfo))
892 return -EFAULT;
893 return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
894}
895
885#ifdef __ARCH_WANT_COMPAT_SYS_TIME 896#ifdef __ARCH_WANT_COMPAT_SYS_TIME
886 897
887/* compat_time_t is a 32 bit "long" and needs to get converted. */ 898/* compat_time_t is a 32 bit "long" and needs to get converted. */
diff --git a/kernel/exit.c b/kernel/exit.c
index abf9cf3b95c6..73affd35e76d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -158,6 +158,9 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
158{ 158{
159 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); 159 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
160 160
161#ifdef CONFIG_PERF_COUNTERS
162 WARN_ON_ONCE(!list_empty(&tsk->perf_counter_ctx.counter_list));
163#endif
161 trace_sched_process_free(tsk); 164 trace_sched_process_free(tsk);
162 put_task_struct(tsk); 165 put_task_struct(tsk);
163} 166}
@@ -174,6 +177,13 @@ repeat:
174 atomic_dec(&__task_cred(p)->user->processes); 177 atomic_dec(&__task_cred(p)->user->processes);
175 178
176 proc_flush_task(p); 179 proc_flush_task(p);
180
181 /*
182 * Flush inherited counters to the parent - before the parent
183 * gets woken up by child-exit notifications.
184 */
185 perf_counter_exit_task(p);
186
177 write_lock_irq(&tasklist_lock); 187 write_lock_irq(&tasklist_lock);
178 tracehook_finish_release_task(p); 188 tracehook_finish_release_task(p);
179 __exit_signal(p); 189 __exit_signal(p);
@@ -981,10 +991,6 @@ NORET_TYPE void do_exit(long code)
981 tsk->mempolicy = NULL; 991 tsk->mempolicy = NULL;
982#endif 992#endif
983#ifdef CONFIG_FUTEX 993#ifdef CONFIG_FUTEX
984 /*
985 * This must happen late, after the PID is not
986 * hashed anymore:
987 */
988 if (unlikely(!list_empty(&tsk->pi_state_list))) 994 if (unlikely(!list_empty(&tsk->pi_state_list)))
989 exit_pi_state_list(tsk); 995 exit_pi_state_list(tsk);
990 if (unlikely(current->pi_state_cache)) 996 if (unlikely(current->pi_state_cache))
@@ -1251,6 +1257,12 @@ static int wait_task_zombie(struct task_struct *p, int options,
1251 */ 1257 */
1252 read_unlock(&tasklist_lock); 1258 read_unlock(&tasklist_lock);
1253 1259
1260 /*
1261 * Flush inherited counters to the parent - before the parent
1262 * gets woken up by child-exit notifications.
1263 */
1264 perf_counter_exit_task(p);
1265
1254 retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; 1266 retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
1255 status = (p->signal->flags & SIGNAL_GROUP_EXIT) 1267 status = (p->signal->flags & SIGNAL_GROUP_EXIT)
1256 ? p->signal->group_exit_code : p->exit_code; 1268 ? p->signal->group_exit_code : p->exit_code;
diff --git a/kernel/fork.c b/kernel/fork.c
index b9e2edd00726..d32fef4d38e5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -983,6 +983,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
983 goto fork_out; 983 goto fork_out;
984 984
985 rt_mutex_init_task(p); 985 rt_mutex_init_task(p);
986 perf_counter_init_task(p);
986 987
987#ifdef CONFIG_PROVE_LOCKING 988#ifdef CONFIG_PROVE_LOCKING
988 DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); 989 DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 507cf2b5e9f1..f788a5ace24b 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -89,7 +89,7 @@ __mutex_lock_slowpath(atomic_t *lock_count);
89 * 89 *
90 * This function is similar to (but not equivalent to) down(). 90 * This function is similar to (but not equivalent to) down().
91 */ 91 */
92void inline __sched mutex_lock(struct mutex *lock) 92void __sched mutex_lock(struct mutex *lock)
93{ 93{
94 might_sleep(); 94 might_sleep();
95 /* 95 /*
@@ -471,5 +471,28 @@ int __sched mutex_trylock(struct mutex *lock)
471 471
472 return ret; 472 return ret;
473} 473}
474
475EXPORT_SYMBOL(mutex_trylock); 474EXPORT_SYMBOL(mutex_trylock);
475
476/**
477 * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
478 * @cnt: the atomic which we are to dec
479 * @lock: the mutex to return holding if we dec to 0
480 *
481 * return true and hold lock if we dec to 0, return false otherwise
482 */
483int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
484{
485 /* dec if we can't possibly hit 0 */
486 if (atomic_add_unless(cnt, -1, 1))
487 return 0;
488 /* we might hit 0, so take the lock */
489 mutex_lock(lock);
490 if (!atomic_dec_and_test(cnt)) {
491 /* when we actually did the dec, we didn't hit 0 */
492 mutex_unlock(lock);
493 return 0;
494 }
495 /* we hit 0, and we hold the lock */
496 return 1;
497}
498EXPORT_SYMBOL(atomic_dec_and_mutex_lock);
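A minimal sketch of the intended use, assuming a refcounted object whose teardown is serialized by a mutex (this mirrors how perf_mmap_close() uses the helper later in this patch); the struct and function names are illustrative only.

#include <linux/mutex.h>
#include <asm/atomic.h>

struct refobj {
	atomic_t	refcount;
	struct mutex	lock;		/* serializes teardown of shared state */
};

static void refobj_put(struct refobj *obj)
{
	if (!atomic_dec_and_mutex_lock(&obj->refcount, &obj->lock))
		return;			/* not the last reference */

	/* Last reference dropped: we hold obj->lock, tear down shared state. */
	mutex_unlock(&obj->lock);
}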
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
new file mode 100644
index 000000000000..59a926d04baf
--- /dev/null
+++ b/kernel/perf_counter.c
@@ -0,0 +1,3526 @@
1/*
2 * Performance counter core code
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 *
9 * For licensing details see kernel-base/COPYING
10 */
11
12#include <linux/fs.h>
13#include <linux/mm.h>
14#include <linux/cpu.h>
15#include <linux/smp.h>
16#include <linux/file.h>
17#include <linux/poll.h>
18#include <linux/sysfs.h>
19#include <linux/ptrace.h>
20#include <linux/percpu.h>
21#include <linux/vmstat.h>
22#include <linux/hardirq.h>
23#include <linux/rculist.h>
24#include <linux/uaccess.h>
25#include <linux/syscalls.h>
26#include <linux/anon_inodes.h>
27#include <linux/kernel_stat.h>
28#include <linux/perf_counter.h>
29#include <linux/dcache.h>
30
31#include <asm/irq_regs.h>
32
33/*
34 * Each CPU has a list of per CPU counters:
35 */
36DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
37
38int perf_max_counters __read_mostly = 1;
39static int perf_reserved_percpu __read_mostly;
40static int perf_overcommit __read_mostly = 1;
41
42static atomic_t nr_counters __read_mostly;
43static atomic_t nr_mmap_tracking __read_mostly;
44static atomic_t nr_munmap_tracking __read_mostly;
45static atomic_t nr_comm_tracking __read_mostly;
46
47int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */
48int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */
49
50/*
51 * Lock for (sysadmin-configurable) counter reservations:
52 */
53static DEFINE_SPINLOCK(perf_resource_lock);
54
55/*
56 * Architecture provided APIs - weak aliases:
57 */
58extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
59{
60 return NULL;
61}
62
63void __weak hw_perf_disable(void) { barrier(); }
64void __weak hw_perf_enable(void) { barrier(); }
65
66void __weak hw_perf_counter_setup(int cpu) { barrier(); }
67int __weak hw_perf_group_sched_in(struct perf_counter *group_leader,
68 struct perf_cpu_context *cpuctx,
69 struct perf_counter_context *ctx, int cpu)
70{
71 return 0;
72}
73
74void __weak perf_counter_print_debug(void) { }
75
76static DEFINE_PER_CPU(int, disable_count);
77
78void __perf_disable(void)
79{
80 __get_cpu_var(disable_count)++;
81}
82
83bool __perf_enable(void)
84{
85 return !--__get_cpu_var(disable_count);
86}
87
88void perf_disable(void)
89{
90 __perf_disable();
91 hw_perf_disable();
92}
93
94void perf_enable(void)
95{
96 if (__perf_enable())
97 hw_perf_enable();
98}
99
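The per-CPU disable_count above makes these calls nestable; the hardware is only re-enabled when the outermost perf_enable() brings the count back to zero. A short illustration, not part of the patch:

static void example_nested_pmu_update(void)	/* illustrative only */
{
	perf_disable();		/* disable_count 0 -> 1, hw_perf_disable()            */
	perf_disable();		/* disable_count 1 -> 2, hw_perf_disable() again      */

	/* ... counter list updates done here, matching the perf_disable()/
	 *     perf_enable() pairs used throughout this file ... */

	perf_enable();		/* disable_count 2 -> 1, PMU stays disabled           */
	perf_enable();		/* disable_count 1 -> 0, hw_perf_enable()             */
}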
100static void
101list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
102{
103 struct perf_counter *group_leader = counter->group_leader;
104
105 /*
106 * Depending on whether it is a standalone or sibling counter,
107 * add it straight to the context's counter list, or to the group
108 * leader's sibling list:
109 */
110 if (group_leader == counter)
111 list_add_tail(&counter->list_entry, &ctx->counter_list);
112 else {
113 list_add_tail(&counter->list_entry, &group_leader->sibling_list);
114 group_leader->nr_siblings++;
115 }
116
117 list_add_rcu(&counter->event_entry, &ctx->event_list);
118 ctx->nr_counters++;
119}
120
121static void
122list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
123{
124 struct perf_counter *sibling, *tmp;
125
126 ctx->nr_counters--;
127
128 list_del_init(&counter->list_entry);
129 list_del_rcu(&counter->event_entry);
130
131 if (counter->group_leader != counter)
132 counter->group_leader->nr_siblings--;
133
134 /*
135 * If this was a group counter with sibling counters then
136 * upgrade the siblings to singleton counters by adding them
137 * to the context list directly:
138 */
139 list_for_each_entry_safe(sibling, tmp,
140 &counter->sibling_list, list_entry) {
141
142 list_move_tail(&sibling->list_entry, &ctx->counter_list);
143 sibling->group_leader = sibling;
144 }
145}
146
147static void
148counter_sched_out(struct perf_counter *counter,
149 struct perf_cpu_context *cpuctx,
150 struct perf_counter_context *ctx)
151{
152 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
153 return;
154
155 counter->state = PERF_COUNTER_STATE_INACTIVE;
156 counter->tstamp_stopped = ctx->time;
157 counter->pmu->disable(counter);
158 counter->oncpu = -1;
159
160 if (!is_software_counter(counter))
161 cpuctx->active_oncpu--;
162 ctx->nr_active--;
163 if (counter->hw_event.exclusive || !cpuctx->active_oncpu)
164 cpuctx->exclusive = 0;
165}
166
167static void
168group_sched_out(struct perf_counter *group_counter,
169 struct perf_cpu_context *cpuctx,
170 struct perf_counter_context *ctx)
171{
172 struct perf_counter *counter;
173
174 if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
175 return;
176
177 counter_sched_out(group_counter, cpuctx, ctx);
178
179 /*
180 * Schedule out siblings (if any):
181 */
182 list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
183 counter_sched_out(counter, cpuctx, ctx);
184
185 if (group_counter->hw_event.exclusive)
186 cpuctx->exclusive = 0;
187}
188
189/*
190 * Cross CPU call to remove a performance counter
191 *
192 * We disable the counter on the hardware level first. After that we
193 * remove it from the context list.
194 */
195static void __perf_counter_remove_from_context(void *info)
196{
197 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
198 struct perf_counter *counter = info;
199 struct perf_counter_context *ctx = counter->ctx;
200 unsigned long flags;
201
202 /*
203 * If this is a task context, we need to check whether it is
204 * the current task context of this cpu. If not it has been
205 * scheduled out before the smp call arrived.
206 */
207 if (ctx->task && cpuctx->task_ctx != ctx)
208 return;
209
210 spin_lock_irqsave(&ctx->lock, flags);
211
212 counter_sched_out(counter, cpuctx, ctx);
213
214 counter->task = NULL;
215
216 /*
217 * Protect the list operation against NMI by disabling the
218 * counters on a global level. NOP for non NMI based counters.
219 */
220 perf_disable();
221 list_del_counter(counter, ctx);
222 perf_enable();
223
224 if (!ctx->task) {
225 /*
226 * Allow more per task counters with respect to the
227 * reservation:
228 */
229 cpuctx->max_pertask =
230 min(perf_max_counters - ctx->nr_counters,
231 perf_max_counters - perf_reserved_percpu);
232 }
233
234 spin_unlock_irqrestore(&ctx->lock, flags);
235}
236
237
238/*
239 * Remove the counter from a task's (or a CPU's) list of counters.
240 *
241 * Must be called with counter->mutex and ctx->mutex held.
242 *
243 * CPU counters are removed with a smp call. For task counters we only
244 * call when the task is on a CPU.
245 */
246static void perf_counter_remove_from_context(struct perf_counter *counter)
247{
248 struct perf_counter_context *ctx = counter->ctx;
249 struct task_struct *task = ctx->task;
250
251 if (!task) {
252 /*
253 * Per cpu counters are removed via an smp call and
254 * the removal is always successful.
255 */
256 smp_call_function_single(counter->cpu,
257 __perf_counter_remove_from_context,
258 counter, 1);
259 return;
260 }
261
262retry:
263 task_oncpu_function_call(task, __perf_counter_remove_from_context,
264 counter);
265
266 spin_lock_irq(&ctx->lock);
267 /*
268 * If the context is active we need to retry the smp call.
269 */
270 if (ctx->nr_active && !list_empty(&counter->list_entry)) {
271 spin_unlock_irq(&ctx->lock);
272 goto retry;
273 }
274
275 /*
276 * The lock prevents this context from being scheduled in, so we
277 * can remove the counter safely if the call above did not
278 * succeed.
279 */
280 if (!list_empty(&counter->list_entry)) {
281 list_del_counter(counter, ctx);
282 counter->task = NULL;
283 }
284 spin_unlock_irq(&ctx->lock);
285}
286
287static inline u64 perf_clock(void)
288{
289 return cpu_clock(smp_processor_id());
290}
291
292/*
293 * Update the record of the current time in a context.
294 */
295static void update_context_time(struct perf_counter_context *ctx)
296{
297 u64 now = perf_clock();
298
299 ctx->time += now - ctx->timestamp;
300 ctx->timestamp = now;
301}
302
303/*
304 * Update the total_time_enabled and total_time_running fields for a counter.
305 */
306static void update_counter_times(struct perf_counter *counter)
307{
308 struct perf_counter_context *ctx = counter->ctx;
309 u64 run_end;
310
311 if (counter->state < PERF_COUNTER_STATE_INACTIVE)
312 return;
313
314 counter->total_time_enabled = ctx->time - counter->tstamp_enabled;
315
316 if (counter->state == PERF_COUNTER_STATE_INACTIVE)
317 run_end = counter->tstamp_stopped;
318 else
319 run_end = ctx->time;
320
321 counter->total_time_running = run_end - counter->tstamp_running;
322}
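A worked illustration of the accounting above (numbers are made up): suppose a counter is added at ctx->time = 100, so tstamp_enabled = tstamp_running = tstamp_stopped = 100. It is scheduled in at 120 (tstamp_running becomes 120, see counter_sched_in() below) and scheduled out at 180 (tstamp_stopped = 180). If the context clock now reads 250 and the counter is still INACTIVE, update_counter_times() yields total_time_enabled = 250 - 100 = 150 and total_time_running = 180 - 120 = 60.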
323
324/*
325 * Update total_time_enabled and total_time_running for all counters in a group.
326 */
327static void update_group_times(struct perf_counter *leader)
328{
329 struct perf_counter *counter;
330
331 update_counter_times(leader);
332 list_for_each_entry(counter, &leader->sibling_list, list_entry)
333 update_counter_times(counter);
334}
335
336/*
337 * Cross CPU call to disable a performance counter
338 */
339static void __perf_counter_disable(void *info)
340{
341 struct perf_counter *counter = info;
342 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
343 struct perf_counter_context *ctx = counter->ctx;
344 unsigned long flags;
345
346 /*
347 * If this is a per-task counter, need to check whether this
348 * counter's task is the current task on this cpu.
349 */
350 if (ctx->task && cpuctx->task_ctx != ctx)
351 return;
352
353 spin_lock_irqsave(&ctx->lock, flags);
354
355 /*
356 * If the counter is on, turn it off.
357 * If it is in error state, leave it in error state.
358 */
359 if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
360 update_context_time(ctx);
361 update_counter_times(counter);
362 if (counter == counter->group_leader)
363 group_sched_out(counter, cpuctx, ctx);
364 else
365 counter_sched_out(counter, cpuctx, ctx);
366 counter->state = PERF_COUNTER_STATE_OFF;
367 }
368
369 spin_unlock_irqrestore(&ctx->lock, flags);
370}
371
372/*
373 * Disable a counter.
374 */
375static void perf_counter_disable(struct perf_counter *counter)
376{
377 struct perf_counter_context *ctx = counter->ctx;
378 struct task_struct *task = ctx->task;
379
380 if (!task) {
381 /*
382 * Disable the counter on the cpu that it's on
383 */
384 smp_call_function_single(counter->cpu, __perf_counter_disable,
385 counter, 1);
386 return;
387 }
388
389 retry:
390 task_oncpu_function_call(task, __perf_counter_disable, counter);
391
392 spin_lock_irq(&ctx->lock);
393 /*
394 * If the counter is still active, we need to retry the cross-call.
395 */
396 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
397 spin_unlock_irq(&ctx->lock);
398 goto retry;
399 }
400
401 /*
402 * Since we have the lock this context can't be scheduled
403 * in, so we can change the state safely.
404 */
405 if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
406 update_counter_times(counter);
407 counter->state = PERF_COUNTER_STATE_OFF;
408 }
409
410 spin_unlock_irq(&ctx->lock);
411}
412
413static int
414counter_sched_in(struct perf_counter *counter,
415 struct perf_cpu_context *cpuctx,
416 struct perf_counter_context *ctx,
417 int cpu)
418{
419 if (counter->state <= PERF_COUNTER_STATE_OFF)
420 return 0;
421
422 counter->state = PERF_COUNTER_STATE_ACTIVE;
423 counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
424 /*
425 * The new state must be visible before we turn it on in the hardware:
426 */
427 smp_wmb();
428
429 if (counter->pmu->enable(counter)) {
430 counter->state = PERF_COUNTER_STATE_INACTIVE;
431 counter->oncpu = -1;
432 return -EAGAIN;
433 }
434
435 counter->tstamp_running += ctx->time - counter->tstamp_stopped;
436
437 if (!is_software_counter(counter))
438 cpuctx->active_oncpu++;
439 ctx->nr_active++;
440
441 if (counter->hw_event.exclusive)
442 cpuctx->exclusive = 1;
443
444 return 0;
445}
446
447static int
448group_sched_in(struct perf_counter *group_counter,
449 struct perf_cpu_context *cpuctx,
450 struct perf_counter_context *ctx,
451 int cpu)
452{
453 struct perf_counter *counter, *partial_group;
454 int ret;
455
456 if (group_counter->state == PERF_COUNTER_STATE_OFF)
457 return 0;
458
459 ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
460 if (ret)
461 return ret < 0 ? ret : 0;
462
463 group_counter->prev_state = group_counter->state;
464 if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
465 return -EAGAIN;
466
467 /*
468 * Schedule in siblings as one group (if any):
469 */
470 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
471 counter->prev_state = counter->state;
472 if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
473 partial_group = counter;
474 goto group_error;
475 }
476 }
477
478 return 0;
479
480group_error:
481 /*
482 * Groups can be scheduled in as one unit only, so undo any
483 * partial group before returning:
484 */
485 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
486 if (counter == partial_group)
487 break;
488 counter_sched_out(counter, cpuctx, ctx);
489 }
490 counter_sched_out(group_counter, cpuctx, ctx);
491
492 return -EAGAIN;
493}
494
495/*
496 * Return 1 for a group consisting entirely of software counters,
497 * 0 if the group contains any hardware counters.
498 */
499static int is_software_only_group(struct perf_counter *leader)
500{
501 struct perf_counter *counter;
502
503 if (!is_software_counter(leader))
504 return 0;
505
506 list_for_each_entry(counter, &leader->sibling_list, list_entry)
507 if (!is_software_counter(counter))
508 return 0;
509
510 return 1;
511}
512
513/*
514 * Work out whether we can put this counter group on the CPU now.
515 */
516static int group_can_go_on(struct perf_counter *counter,
517 struct perf_cpu_context *cpuctx,
518 int can_add_hw)
519{
520 /*
521 * Groups consisting entirely of software counters can always go on.
522 */
523 if (is_software_only_group(counter))
524 return 1;
525 /*
526 * If an exclusive group is already on, no other hardware
527 * counters can go on.
528 */
529 if (cpuctx->exclusive)
530 return 0;
531 /*
532 * If this group is exclusive and there are already
533 * counters on the CPU, it can't go on.
534 */
535 if (counter->hw_event.exclusive && cpuctx->active_oncpu)
536 return 0;
537 /*
538 * Otherwise, try to add it if all previous groups were able
539 * to go on.
540 */
541 return can_add_hw;
542}
543
544static void add_counter_to_ctx(struct perf_counter *counter,
545 struct perf_counter_context *ctx)
546{
547 list_add_counter(counter, ctx);
548 counter->prev_state = PERF_COUNTER_STATE_OFF;
549 counter->tstamp_enabled = ctx->time;
550 counter->tstamp_running = ctx->time;
551 counter->tstamp_stopped = ctx->time;
552}
553
554/*
555 * Cross CPU call to install and enable a performance counter
556 */
557static void __perf_install_in_context(void *info)
558{
559 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
560 struct perf_counter *counter = info;
561 struct perf_counter_context *ctx = counter->ctx;
562 struct perf_counter *leader = counter->group_leader;
563 int cpu = smp_processor_id();
564 unsigned long flags;
565 int err;
566
567 /*
568 * If this is a task context, we need to check whether it is
569 * the current task context of this cpu. If not it has been
570 * scheduled out before the smp call arrived.
571 */
572 if (ctx->task && cpuctx->task_ctx != ctx)
573 return;
574
575 spin_lock_irqsave(&ctx->lock, flags);
576 update_context_time(ctx);
577
578 /*
579 * Protect the list operation against NMI by disabling the
580 * counters on a global level. NOP for non NMI based counters.
581 */
582 perf_disable();
583
584 add_counter_to_ctx(counter, ctx);
585
586 /*
587 * Don't put the counter on if it is disabled or if
588 * it is in a group and the group isn't on.
589 */
590 if (counter->state != PERF_COUNTER_STATE_INACTIVE ||
591 (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE))
592 goto unlock;
593
594 /*
595 * An exclusive counter can't go on if there are already active
596 * hardware counters, and no hardware counter can go on if there
597 * is already an exclusive counter on.
598 */
599 if (!group_can_go_on(counter, cpuctx, 1))
600 err = -EEXIST;
601 else
602 err = counter_sched_in(counter, cpuctx, ctx, cpu);
603
604 if (err) {
605 /*
606 * This counter couldn't go on. If it is in a group
607 * then we have to pull the whole group off.
608 * If the counter group is pinned then put it in error state.
609 */
610 if (leader != counter)
611 group_sched_out(leader, cpuctx, ctx);
612 if (leader->hw_event.pinned) {
613 update_group_times(leader);
614 leader->state = PERF_COUNTER_STATE_ERROR;
615 }
616 }
617
618 if (!err && !ctx->task && cpuctx->max_pertask)
619 cpuctx->max_pertask--;
620
621 unlock:
622 perf_enable();
623
624 spin_unlock_irqrestore(&ctx->lock, flags);
625}
626
627/*
628 * Attach a performance counter to a context
629 *
630 * First we add the counter to the list with the hardware enable bit
631 * in counter->hw_config cleared.
632 *
633 * If the counter is attached to a task which is on a CPU we use a smp
634 * call to enable it in the task context. The task might have been
635 * scheduled away, but we check this in the smp call again.
636 *
637 * Must be called with ctx->mutex held.
638 */
639static void
640perf_install_in_context(struct perf_counter_context *ctx,
641 struct perf_counter *counter,
642 int cpu)
643{
644 struct task_struct *task = ctx->task;
645
646 if (!task) {
647 /*
648 * Per cpu counters are installed via an smp call and
650 * the install is always successful.
650 */
651 smp_call_function_single(cpu, __perf_install_in_context,
652 counter, 1);
653 return;
654 }
655
656 counter->task = task;
657retry:
658 task_oncpu_function_call(task, __perf_install_in_context,
659 counter);
660
661 spin_lock_irq(&ctx->lock);
662 /*
663 * If the context is active we need to retry the smp call.
664 */
665 if (ctx->is_active && list_empty(&counter->list_entry)) {
666 spin_unlock_irq(&ctx->lock);
667 goto retry;
668 }
669
670 /*
671 * The lock prevents this context from being scheduled in, so we
672 * can add the counter safely if the call above did not
673 * succeed.
674 */
675 if (list_empty(&counter->list_entry))
676 add_counter_to_ctx(counter, ctx);
677 spin_unlock_irq(&ctx->lock);
678}
679
680/*
681 * Cross CPU call to enable a performance counter
682 */
683static void __perf_counter_enable(void *info)
684{
685 struct perf_counter *counter = info;
686 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
687 struct perf_counter_context *ctx = counter->ctx;
688 struct perf_counter *leader = counter->group_leader;
689 unsigned long flags;
690 int err;
691
692 /*
693 * If this is a per-task counter, need to check whether this
694 * counter's task is the current task on this cpu.
695 */
696 if (ctx->task && cpuctx->task_ctx != ctx)
697 return;
698
699 spin_lock_irqsave(&ctx->lock, flags);
700 update_context_time(ctx);
701
702 counter->prev_state = counter->state;
703 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
704 goto unlock;
705 counter->state = PERF_COUNTER_STATE_INACTIVE;
706 counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
707
708 /*
709 * If the counter is in a group and isn't the group leader,
710 * then don't put it on unless the group is on.
711 */
712 if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
713 goto unlock;
714
715 if (!group_can_go_on(counter, cpuctx, 1)) {
716 err = -EEXIST;
717 } else {
718 perf_disable();
719 if (counter == leader)
720 err = group_sched_in(counter, cpuctx, ctx,
721 smp_processor_id());
722 else
723 err = counter_sched_in(counter, cpuctx, ctx,
724 smp_processor_id());
725 perf_enable();
726 }
727
728 if (err) {
729 /*
730 * If this counter can't go on and it's part of a
731 * group, then the whole group has to come off.
732 */
733 if (leader != counter)
734 group_sched_out(leader, cpuctx, ctx);
735 if (leader->hw_event.pinned) {
736 update_group_times(leader);
737 leader->state = PERF_COUNTER_STATE_ERROR;
738 }
739 }
740
741 unlock:
742 spin_unlock_irqrestore(&ctx->lock, flags);
743}
744
745/*
746 * Enable a counter.
747 */
748static void perf_counter_enable(struct perf_counter *counter)
749{
750 struct perf_counter_context *ctx = counter->ctx;
751 struct task_struct *task = ctx->task;
752
753 if (!task) {
754 /*
755 * Enable the counter on the cpu that it's on
756 */
757 smp_call_function_single(counter->cpu, __perf_counter_enable,
758 counter, 1);
759 return;
760 }
761
762 spin_lock_irq(&ctx->lock);
763 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
764 goto out;
765
766 /*
767 * If the counter is in error state, clear that first.
768 * That way, if we see the counter in error state below, we
769 * know that it has gone back into error state, as distinct
770 * from the task having been scheduled away before the
771 * cross-call arrived.
772 */
773 if (counter->state == PERF_COUNTER_STATE_ERROR)
774 counter->state = PERF_COUNTER_STATE_OFF;
775
776 retry:
777 spin_unlock_irq(&ctx->lock);
778 task_oncpu_function_call(task, __perf_counter_enable, counter);
779
780 spin_lock_irq(&ctx->lock);
781
782 /*
783 * If the context is active and the counter is still off,
784 * we need to retry the cross-call.
785 */
786 if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF)
787 goto retry;
788
789 /*
790 * Since we have the lock this context can't be scheduled
791 * in, so we can change the state safely.
792 */
793 if (counter->state == PERF_COUNTER_STATE_OFF) {
794 counter->state = PERF_COUNTER_STATE_INACTIVE;
795 counter->tstamp_enabled =
796 ctx->time - counter->total_time_enabled;
797 }
798 out:
799 spin_unlock_irq(&ctx->lock);
800}
801
802static int perf_counter_refresh(struct perf_counter *counter, int refresh)
803{
804 /*
805 * not supported on inherited counters
806 */
807 if (counter->hw_event.inherit)
808 return -EINVAL;
809
810 atomic_add(refresh, &counter->event_limit);
811 perf_counter_enable(counter);
812
813 return 0;
814}
815
816void __perf_counter_sched_out(struct perf_counter_context *ctx,
817 struct perf_cpu_context *cpuctx)
818{
819 struct perf_counter *counter;
820
821 spin_lock(&ctx->lock);
822 ctx->is_active = 0;
823 if (likely(!ctx->nr_counters))
824 goto out;
825 update_context_time(ctx);
826
827 perf_disable();
828 if (ctx->nr_active) {
829 list_for_each_entry(counter, &ctx->counter_list, list_entry)
830 group_sched_out(counter, cpuctx, ctx);
831 }
832 perf_enable();
833 out:
834 spin_unlock(&ctx->lock);
835}
836
837/*
838 * Called from scheduler to remove the counters of the current task,
839 * with interrupts disabled.
840 *
841 * We stop each counter and update the counter value in counter->count.
842 *
843 * This does not protect us against NMI, but disable()
844 * sets the disabled bit in the control field of counter _before_
845 * accessing the counter control register. If a NMI hits, then it will
846 * not restart the counter.
847 */
848void perf_counter_task_sched_out(struct task_struct *task, int cpu)
849{
850 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
851 struct perf_counter_context *ctx = &task->perf_counter_ctx;
852 struct pt_regs *regs;
853
854 if (likely(!cpuctx->task_ctx))
855 return;
856
857 update_context_time(ctx);
858
859 regs = task_pt_regs(task);
860 perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0);
861 __perf_counter_sched_out(ctx, cpuctx);
862
863 cpuctx->task_ctx = NULL;
864}
865
866static void __perf_counter_task_sched_out(struct perf_counter_context *ctx)
867{
868 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
869
870 __perf_counter_sched_out(ctx, cpuctx);
871 cpuctx->task_ctx = NULL;
872}
873
874static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
875{
876 __perf_counter_sched_out(&cpuctx->ctx, cpuctx);
877}
878
879static void
880__perf_counter_sched_in(struct perf_counter_context *ctx,
881 struct perf_cpu_context *cpuctx, int cpu)
882{
883 struct perf_counter *counter;
884 int can_add_hw = 1;
885
886 spin_lock(&ctx->lock);
887 ctx->is_active = 1;
888 if (likely(!ctx->nr_counters))
889 goto out;
890
891 ctx->timestamp = perf_clock();
892
893 perf_disable();
894
895 /*
896 * First go through the list and put on any pinned groups
897 * in order to give them the best chance of going on.
898 */
899 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
900 if (counter->state <= PERF_COUNTER_STATE_OFF ||
901 !counter->hw_event.pinned)
902 continue;
903 if (counter->cpu != -1 && counter->cpu != cpu)
904 continue;
905
906 if (group_can_go_on(counter, cpuctx, 1))
907 group_sched_in(counter, cpuctx, ctx, cpu);
908
909 /*
910 * If this pinned group hasn't been scheduled,
911 * put it in error state.
912 */
913 if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
914 update_group_times(counter);
915 counter->state = PERF_COUNTER_STATE_ERROR;
916 }
917 }
918
919 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
920 /*
921 * Ignore counters in OFF or ERROR state, and
922 * ignore pinned counters since we did them already.
923 */
924 if (counter->state <= PERF_COUNTER_STATE_OFF ||
925 counter->hw_event.pinned)
926 continue;
927
928 /*
929 * Listen to the 'cpu' scheduling filter constraint
930 * of counters:
931 */
932 if (counter->cpu != -1 && counter->cpu != cpu)
933 continue;
934
935 if (group_can_go_on(counter, cpuctx, can_add_hw)) {
936 if (group_sched_in(counter, cpuctx, ctx, cpu))
937 can_add_hw = 0;
938 }
939 }
940 perf_enable();
941 out:
942 spin_unlock(&ctx->lock);
943}
944
945/*
946 * Called from scheduler to add the counters of the current task
947 * with interrupts disabled.
948 *
949 * We restore the counter value and then enable it.
950 *
951 * This does not protect us against NMI, but enable()
952 * sets the enabled bit in the control field of counter _before_
953 * accessing the counter control register. If a NMI hits, then it will
954 * keep the counter running.
955 */
956void perf_counter_task_sched_in(struct task_struct *task, int cpu)
957{
958 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
959 struct perf_counter_context *ctx = &task->perf_counter_ctx;
960
961 __perf_counter_sched_in(ctx, cpuctx, cpu);
962 cpuctx->task_ctx = ctx;
963}
964
965static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
966{
967 struct perf_counter_context *ctx = &cpuctx->ctx;
968
969 __perf_counter_sched_in(ctx, cpuctx, cpu);
970}
971
972int perf_counter_task_disable(void)
973{
974 struct task_struct *curr = current;
975 struct perf_counter_context *ctx = &curr->perf_counter_ctx;
976 struct perf_counter *counter;
977 unsigned long flags;
978
979 if (likely(!ctx->nr_counters))
980 return 0;
981
982 local_irq_save(flags);
983
984 __perf_counter_task_sched_out(ctx);
985
986 spin_lock(&ctx->lock);
987
988 /*
989 * Disable all the counters:
990 */
991 perf_disable();
992
993 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
994 if (counter->state != PERF_COUNTER_STATE_ERROR) {
995 update_group_times(counter);
996 counter->state = PERF_COUNTER_STATE_OFF;
997 }
998 }
999
1000 perf_enable();
1001
1002 spin_unlock_irqrestore(&ctx->lock, flags);
1003
1004 return 0;
1005}
1006
1007int perf_counter_task_enable(void)
1008{
1009 struct task_struct *curr = current;
1010 struct perf_counter_context *ctx = &curr->perf_counter_ctx;
1011 struct perf_counter *counter;
1012 unsigned long flags;
1013 int cpu;
1014
1015 if (likely(!ctx->nr_counters))
1016 return 0;
1017
1018 local_irq_save(flags);
1019 cpu = smp_processor_id();
1020
1021 __perf_counter_task_sched_out(ctx);
1022
1023 spin_lock(&ctx->lock);
1024
1025 /*
1026 * Disable the PMU while we re-enable the counters:
1027 */
1028 perf_disable();
1029
1030 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1031 if (counter->state > PERF_COUNTER_STATE_OFF)
1032 continue;
1033 counter->state = PERF_COUNTER_STATE_INACTIVE;
1034 counter->tstamp_enabled =
1035 ctx->time - counter->total_time_enabled;
1036 counter->hw_event.disabled = 0;
1037 }
1038 perf_enable();
1039
1040 spin_unlock(&ctx->lock);
1041
1042 perf_counter_task_sched_in(curr, cpu);
1043
1044 local_irq_restore(flags);
1045
1046 return 0;
1047}
1048
1049void perf_adjust_freq(struct perf_counter_context *ctx)
1050{
1051 struct perf_counter *counter;
1052 u64 irq_period;
1053 u64 events, period;
1054 s64 delta;
1055
1056 spin_lock(&ctx->lock);
1057 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1058 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
1059 continue;
1060
1061 if (!counter->hw_event.freq || !counter->hw_event.irq_freq)
1062 continue;
1063
1064 events = HZ * counter->hw.interrupts * counter->hw.irq_period;
1065 period = div64_u64(events, counter->hw_event.irq_freq);
1066
1067 delta = (s64)(1 + period - counter->hw.irq_period);
1068 delta >>= 1;
1069
1070 irq_period = counter->hw.irq_period + delta;
1071
1072 if (!irq_period)
1073 irq_period = 1;
1074
1075 counter->hw.irq_period = irq_period;
1076 counter->hw.interrupts = 0;
1077 }
1078 spin_unlock(&ctx->lock);
1079}
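A worked illustration of the adjustment above (numbers are made up): with HZ = 1000, a counter that took hw.interrupts = 2 overflows during the last tick at hw.irq_period = 10000 is estimated to run at 1000 * 2 * 10000 = 20,000,000 events/sec. For hw_event.irq_freq = 1000 the target period is 20,000,000 / 1000 = 20,000, so delta = (1 + 20000 - 10000) >> 1 = 5000 and the new irq_period becomes 15000 - i.e. the period moves roughly halfway toward the target on every tick.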
1080
1081/*
1082 * Round-robin a context's counters:
1083 */
1084static void rotate_ctx(struct perf_counter_context *ctx)
1085{
1086 struct perf_counter *counter;
1087
1088 if (!ctx->nr_counters)
1089 return;
1090
1091 spin_lock(&ctx->lock);
1092 /*
1093 * Rotate the first entry last (works just fine for group counters too):
1094 */
1095 perf_disable();
1096 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1097 list_move_tail(&counter->list_entry, &ctx->counter_list);
1098 break;
1099 }
1100 perf_enable();
1101
1102 spin_unlock(&ctx->lock);
1103}
1104
1105void perf_counter_task_tick(struct task_struct *curr, int cpu)
1106{
1107 struct perf_cpu_context *cpuctx;
1108 struct perf_counter_context *ctx;
1109
1110 if (!atomic_read(&nr_counters))
1111 return;
1112
1113 cpuctx = &per_cpu(perf_cpu_context, cpu);
1114 ctx = &curr->perf_counter_ctx;
1115
1116 perf_adjust_freq(&cpuctx->ctx);
1117 perf_adjust_freq(ctx);
1118
1119 perf_counter_cpu_sched_out(cpuctx);
1120 __perf_counter_task_sched_out(ctx);
1121
1122 rotate_ctx(&cpuctx->ctx);
1123 rotate_ctx(ctx);
1124
1125 perf_counter_cpu_sched_in(cpuctx, cpu);
1126 perf_counter_task_sched_in(curr, cpu);
1127}
1128
1129/*
1130 * Cross CPU call to read the hardware counter
1131 */
1132static void __read(void *info)
1133{
1134 struct perf_counter *counter = info;
1135 struct perf_counter_context *ctx = counter->ctx;
1136 unsigned long flags;
1137
1138 local_irq_save(flags);
1139 if (ctx->is_active)
1140 update_context_time(ctx);
1141 counter->pmu->read(counter);
1142 update_counter_times(counter);
1143 local_irq_restore(flags);
1144}
1145
1146static u64 perf_counter_read(struct perf_counter *counter)
1147{
1148 /*
1149 * If counter is enabled and currently active on a CPU, update the
1150 * value in the counter structure:
1151 */
1152 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
1153 smp_call_function_single(counter->oncpu,
1154 __read, counter, 1);
1155 } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
1156 update_counter_times(counter);
1157 }
1158
1159 return atomic64_read(&counter->count);
1160}
1161
1162static void put_context(struct perf_counter_context *ctx)
1163{
1164 if (ctx->task)
1165 put_task_struct(ctx->task);
1166}
1167
1168static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1169{
1170 struct perf_cpu_context *cpuctx;
1171 struct perf_counter_context *ctx;
1172 struct task_struct *task;
1173
1174 /*
1175 * If cpu is not a wildcard then this is a percpu counter:
1176 */
1177 if (cpu != -1) {
1178 /* Must be root to operate on a CPU counter: */
1179 if (sysctl_perf_counter_priv && !capable(CAP_SYS_ADMIN))
1180 return ERR_PTR(-EACCES);
1181
1182 if (cpu < 0 || cpu > num_possible_cpus())
1183 return ERR_PTR(-EINVAL);
1184
1185 /*
1186 * We could be clever and allow to attach a counter to an
1187 * offline CPU and activate it when the CPU comes up, but
1188 * that's for later.
1189 */
1190 if (!cpu_isset(cpu, cpu_online_map))
1191 return ERR_PTR(-ENODEV);
1192
1193 cpuctx = &per_cpu(perf_cpu_context, cpu);
1194 ctx = &cpuctx->ctx;
1195
1196 return ctx;
1197 }
1198
1199 rcu_read_lock();
1200 if (!pid)
1201 task = current;
1202 else
1203 task = find_task_by_vpid(pid);
1204 if (task)
1205 get_task_struct(task);
1206 rcu_read_unlock();
1207
1208 if (!task)
1209 return ERR_PTR(-ESRCH);
1210
1211 ctx = &task->perf_counter_ctx;
1212 ctx->task = task;
1213
1214 /* Reuse ptrace permission checks for now. */
1215 if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
1216 put_context(ctx);
1217 return ERR_PTR(-EACCES);
1218 }
1219
1220 return ctx;
1221}
1222
1223static void free_counter_rcu(struct rcu_head *head)
1224{
1225 struct perf_counter *counter;
1226
1227 counter = container_of(head, struct perf_counter, rcu_head);
1228 kfree(counter);
1229}
1230
1231static void perf_pending_sync(struct perf_counter *counter);
1232
1233static void free_counter(struct perf_counter *counter)
1234{
1235 perf_pending_sync(counter);
1236
1237 atomic_dec(&nr_counters);
1238 if (counter->hw_event.mmap)
1239 atomic_dec(&nr_mmap_tracking);
1240 if (counter->hw_event.munmap)
1241 atomic_dec(&nr_munmap_tracking);
1242 if (counter->hw_event.comm)
1243 atomic_dec(&nr_comm_tracking);
1244
1245 if (counter->destroy)
1246 counter->destroy(counter);
1247
1248 call_rcu(&counter->rcu_head, free_counter_rcu);
1249}
1250
1251/*
1252 * Called when the last reference to the file is gone.
1253 */
1254static int perf_release(struct inode *inode, struct file *file)
1255{
1256 struct perf_counter *counter = file->private_data;
1257 struct perf_counter_context *ctx = counter->ctx;
1258
1259 file->private_data = NULL;
1260
1261 mutex_lock(&ctx->mutex);
1262 mutex_lock(&counter->mutex);
1263
1264 perf_counter_remove_from_context(counter);
1265
1266 mutex_unlock(&counter->mutex);
1267 mutex_unlock(&ctx->mutex);
1268
1269 free_counter(counter);
1270 put_context(ctx);
1271
1272 return 0;
1273}
1274
1275/*
1276 * Read the performance counter - simple non blocking version for now
1277 */
1278static ssize_t
1279perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1280{
1281 u64 values[3];
1282 int n;
1283
1284 /*
1285 * Return end-of-file for a read on a counter that is in
1286 * error state (i.e. because it was pinned but it couldn't be
1287 * scheduled on to the CPU at some point).
1288 */
1289 if (counter->state == PERF_COUNTER_STATE_ERROR)
1290 return 0;
1291
1292 mutex_lock(&counter->mutex);
1293 values[0] = perf_counter_read(counter);
1294 n = 1;
1295 if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1296 values[n++] = counter->total_time_enabled +
1297 atomic64_read(&counter->child_total_time_enabled);
1298 if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1299 values[n++] = counter->total_time_running +
1300 atomic64_read(&counter->child_total_time_running);
1301 mutex_unlock(&counter->mutex);
1302
1303 if (count < n * sizeof(u64))
1304 return -EINVAL;
1305 count = n * sizeof(u64);
1306
1307 if (copy_to_user(buf, values, count))
1308 return -EFAULT;
1309
1310 return count;
1311}
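A hedged userspace sketch of consuming the layout written above; it assumes the counter was opened with both PERF_FORMAT_TOTAL_TIME_ENABLED and PERF_FORMAT_TOTAL_TIME_RUNNING set in read_format, so all three u64 values are present in that order.

#include <stdint.h>
#include <unistd.h>

static int read_counter(int fd, uint64_t *count,
			uint64_t *enabled, uint64_t *running)
{
	uint64_t values[3];
	ssize_t n = read(fd, values, sizeof(values));

	if (n <= 0)			/* 0 = counter is in error state (EOF) */
		return -1;
	*count   = values[0];
	*enabled = n >= 2 * (ssize_t)sizeof(uint64_t) ? values[1] : 0;
	*running = n >= 3 * (ssize_t)sizeof(uint64_t) ? values[2] : 0;
	return 0;
}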
1312
1313static ssize_t
1314perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
1315{
1316 struct perf_counter *counter = file->private_data;
1317
1318 return perf_read_hw(counter, buf, count);
1319}
1320
1321static unsigned int perf_poll(struct file *file, poll_table *wait)
1322{
1323 struct perf_counter *counter = file->private_data;
1324 struct perf_mmap_data *data;
1325 unsigned int events = POLLHUP;
1326
1327 rcu_read_lock();
1328 data = rcu_dereference(counter->data);
1329 if (data)
1330 events = atomic_xchg(&data->poll, 0);
1331 rcu_read_unlock();
1332
1333 poll_wait(file, &counter->waitq, wait);
1334
1335 return events;
1336}
1337
1338static void perf_counter_reset(struct perf_counter *counter)
1339{
1340 (void)perf_counter_read(counter);
1341 atomic64_set(&counter->count, 0);
1342 perf_counter_update_userpage(counter);
1343}
1344
1345static void perf_counter_for_each_sibling(struct perf_counter *counter,
1346 void (*func)(struct perf_counter *))
1347{
1348 struct perf_counter_context *ctx = counter->ctx;
1349 struct perf_counter *sibling;
1350
1351 spin_lock_irq(&ctx->lock);
1352 counter = counter->group_leader;
1353
1354 func(counter);
1355 list_for_each_entry(sibling, &counter->sibling_list, list_entry)
1356 func(sibling);
1357 spin_unlock_irq(&ctx->lock);
1358}
1359
1360static void perf_counter_for_each_child(struct perf_counter *counter,
1361 void (*func)(struct perf_counter *))
1362{
1363 struct perf_counter *child;
1364
1365 mutex_lock(&counter->mutex);
1366 func(counter);
1367 list_for_each_entry(child, &counter->child_list, child_list)
1368 func(child);
1369 mutex_unlock(&counter->mutex);
1370}
1371
1372static void perf_counter_for_each(struct perf_counter *counter,
1373 void (*func)(struct perf_counter *))
1374{
1375 struct perf_counter *child;
1376
1377 mutex_lock(&counter->mutex);
1378 perf_counter_for_each_sibling(counter, func);
1379 list_for_each_entry(child, &counter->child_list, child_list)
1380 perf_counter_for_each_sibling(child, func);
1381 mutex_unlock(&counter->mutex);
1382}
1383
1384static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1385{
1386 struct perf_counter *counter = file->private_data;
1387 void (*func)(struct perf_counter *);
1388 u32 flags = arg;
1389
1390 switch (cmd) {
1391 case PERF_COUNTER_IOC_ENABLE:
1392 func = perf_counter_enable;
1393 break;
1394 case PERF_COUNTER_IOC_DISABLE:
1395 func = perf_counter_disable;
1396 break;
1397 case PERF_COUNTER_IOC_RESET:
1398 func = perf_counter_reset;
1399 break;
1400
1401 case PERF_COUNTER_IOC_REFRESH:
1402 return perf_counter_refresh(counter, arg);
1403 default:
1404 return -ENOTTY;
1405 }
1406
1407 if (flags & PERF_IOC_FLAG_GROUP)
1408 perf_counter_for_each(counter, func);
1409 else
1410 perf_counter_for_each_child(counter, func);
1411
1412 return 0;
1413}
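A hedged userspace sketch of driving the ioctls above; the PERF_COUNTER_IOC_* and PERF_IOC_FLAG_GROUP definitions come from this patch's <linux/perf_counter.h>.

#include <sys/ioctl.h>
#include <linux/perf_counter.h>

/* Zero a whole counter group, then arm the leader for 'n' more overflow
 * notifications (REFRESH takes a count, not a flags word). */
static void reset_and_refresh_group(int group_leader_fd, int n)
{
	ioctl(group_leader_fd, PERF_COUNTER_IOC_RESET, PERF_IOC_FLAG_GROUP);
	ioctl(group_leader_fd, PERF_COUNTER_IOC_REFRESH, n);
}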
1414
1415/*
1416 * Callers need to ensure there can be no nesting of this function, otherwise
1417 * the seqlock logic goes bad. We can not serialize this because the arch
1418 * code calls this from NMI context.
1419 */
1420void perf_counter_update_userpage(struct perf_counter *counter)
1421{
1422 struct perf_mmap_data *data;
1423 struct perf_counter_mmap_page *userpg;
1424
1425 rcu_read_lock();
1426 data = rcu_dereference(counter->data);
1427 if (!data)
1428 goto unlock;
1429
1430 userpg = data->user_page;
1431
1432 /*
1433 * Disable preemption so as to not let the corresponding user-space
1434 * spin too long if we get preempted.
1435 */
1436 preempt_disable();
1437 ++userpg->lock;
1438 barrier();
1439 userpg->index = counter->hw.idx;
1440 userpg->offset = atomic64_read(&counter->count);
1441 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
1442 userpg->offset -= atomic64_read(&counter->hw.prev_count);
1443
1444 barrier();
1445 ++userpg->lock;
1446 preempt_enable();
1447unlock:
1448 rcu_read_unlock();
1449}
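The ++lock/barrier pairs above behave like the write side of a seqlock: the count is odd while an update is in flight and even when the page is consistent. A hedged userspace reader sketch follows; the perf_counter_mmap_page field types are taken on trust from this patch's <linux/perf_counter.h>, and __sync_synchronize() stands in for a proper read barrier.

#include <stdint.h>
#include <linux/perf_counter.h>

static int64_t read_user_offset(volatile struct perf_counter_mmap_page *pg)
{
	uint32_t seq;
	int64_t offset;

	do {
		seq = pg->lock;
		__sync_synchronize();
		offset = pg->offset;	/* pg->index would also be read here for rdpmc use */
		__sync_synchronize();
	} while (pg->lock != seq || (seq & 1));	/* retry if an update was in flight */

	return offset;
}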
1450
1451static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1452{
1453 struct perf_counter *counter = vma->vm_file->private_data;
1454 struct perf_mmap_data *data;
1455 int ret = VM_FAULT_SIGBUS;
1456
1457 rcu_read_lock();
1458 data = rcu_dereference(counter->data);
1459 if (!data)
1460 goto unlock;
1461
1462 if (vmf->pgoff == 0) {
1463 vmf->page = virt_to_page(data->user_page);
1464 } else {
1465 int nr = vmf->pgoff - 1;
1466
1467 if ((unsigned)nr > data->nr_pages)
1468 goto unlock;
1469
1470 vmf->page = virt_to_page(data->data_pages[nr]);
1471 }
1472 get_page(vmf->page);
1473 ret = 0;
1474unlock:
1475 rcu_read_unlock();
1476
1477 return ret;
1478}
1479
1480static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
1481{
1482 struct perf_mmap_data *data;
1483 unsigned long size;
1484 int i;
1485
1486 WARN_ON(atomic_read(&counter->mmap_count));
1487
1488 size = sizeof(struct perf_mmap_data);
1489 size += nr_pages * sizeof(void *);
1490
1491 data = kzalloc(size, GFP_KERNEL);
1492 if (!data)
1493 goto fail;
1494
1495 data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
1496 if (!data->user_page)
1497 goto fail_user_page;
1498
1499 for (i = 0; i < nr_pages; i++) {
1500 data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
1501 if (!data->data_pages[i])
1502 goto fail_data_pages;
1503 }
1504
1505 data->nr_pages = nr_pages;
1506 atomic_set(&data->lock, -1);
1507
1508 rcu_assign_pointer(counter->data, data);
1509
1510 return 0;
1511
1512fail_data_pages:
1513 for (i--; i >= 0; i--)
1514 free_page((unsigned long)data->data_pages[i]);
1515
1516 free_page((unsigned long)data->user_page);
1517
1518fail_user_page:
1519 kfree(data);
1520
1521fail:
1522 return -ENOMEM;
1523}
1524
1525static void __perf_mmap_data_free(struct rcu_head *rcu_head)
1526{
1527 struct perf_mmap_data *data = container_of(rcu_head,
1528 struct perf_mmap_data, rcu_head);
1529 int i;
1530
1531 free_page((unsigned long)data->user_page);
1532 for (i = 0; i < data->nr_pages; i++)
1533 free_page((unsigned long)data->data_pages[i]);
1534 kfree(data);
1535}
1536
1537static void perf_mmap_data_free(struct perf_counter *counter)
1538{
1539 struct perf_mmap_data *data = counter->data;
1540
1541 WARN_ON(atomic_read(&counter->mmap_count));
1542
1543 rcu_assign_pointer(counter->data, NULL);
1544 call_rcu(&data->rcu_head, __perf_mmap_data_free);
1545}
1546
1547static void perf_mmap_open(struct vm_area_struct *vma)
1548{
1549 struct perf_counter *counter = vma->vm_file->private_data;
1550
1551 atomic_inc(&counter->mmap_count);
1552}
1553
1554static void perf_mmap_close(struct vm_area_struct *vma)
1555{
1556 struct perf_counter *counter = vma->vm_file->private_data;
1557
1558 if (atomic_dec_and_mutex_lock(&counter->mmap_count,
1559 &counter->mmap_mutex)) {
1560 struct user_struct *user = current_user();
1561
1562 atomic_long_sub(counter->data->nr_pages + 1, &user->locked_vm);
1563 vma->vm_mm->locked_vm -= counter->data->nr_locked;
1564 perf_mmap_data_free(counter);
1565 mutex_unlock(&counter->mmap_mutex);
1566 }
1567}
1568
1569static struct vm_operations_struct perf_mmap_vmops = {
1570 .open = perf_mmap_open,
1571 .close = perf_mmap_close,
1572 .fault = perf_mmap_fault,
1573};
1574
1575static int perf_mmap(struct file *file, struct vm_area_struct *vma)
1576{
1577 struct perf_counter *counter = file->private_data;
1578 struct user_struct *user = current_user();
1579 unsigned long vma_size;
1580 unsigned long nr_pages;
1581 unsigned long user_locked, user_lock_limit;
1582 unsigned long locked, lock_limit;
1583 long user_extra, extra;
1584 int ret = 0;
1585
1586 if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE))
1587 return -EINVAL;
1588
1589 vma_size = vma->vm_end - vma->vm_start;
1590 nr_pages = (vma_size / PAGE_SIZE) - 1;
1591
1592 /*
1593 * If we have data pages ensure they're a power-of-two number, so we
1594 * can do bitmasks instead of modulo.
1595 */
1596 if (nr_pages != 0 && !is_power_of_2(nr_pages))
1597 return -EINVAL;
1598
1599 if (vma_size != PAGE_SIZE * (1 + nr_pages))
1600 return -EINVAL;
1601
1602 if (vma->vm_pgoff != 0)
1603 return -EINVAL;
1604
1605 mutex_lock(&counter->mmap_mutex);
1606 if (atomic_inc_not_zero(&counter->mmap_count)) {
1607 if (nr_pages != counter->data->nr_pages)
1608 ret = -EINVAL;
1609 goto unlock;
1610 }
1611
1612 user_extra = nr_pages + 1;
1613 user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10);
1614 user_locked = atomic_long_read(&user->locked_vm) + user_extra;
1615
1616 extra = 0;
1617 if (user_locked > user_lock_limit)
1618 extra = user_locked - user_lock_limit;
1619
1620 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
1621 lock_limit >>= PAGE_SHIFT;
1622 locked = vma->vm_mm->locked_vm + extra;
1623
1624 if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
1625 ret = -EPERM;
1626 goto unlock;
1627 }
1628
1629 WARN_ON(counter->data);
1630 ret = perf_mmap_data_alloc(counter, nr_pages);
1631 if (ret)
1632 goto unlock;
1633
1634 atomic_set(&counter->mmap_count, 1);
1635 atomic_long_add(user_extra, &user->locked_vm);
1636 vma->vm_mm->locked_vm += extra;
1637 counter->data->nr_locked = extra;
1638unlock:
1639 mutex_unlock(&counter->mmap_mutex);
1640
1641 vma->vm_flags &= ~VM_MAYWRITE;
1642 vma->vm_flags |= VM_RESERVED;
1643 vma->vm_ops = &perf_mmap_vmops;
1644
1645 return ret;
1646}
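/*
 * Illustrative user-space sketch (not part of this file): perf_mmap()
 * above only accepts a shared, read-only mapping of 1 + 2^n pages at
 * file offset 0 -- one metadata page followed by a power-of-two number
 * of data pages.  counter_fd is assumed to come from
 * sys_perf_counter_open(); everything else below is plain libc.
 */
#if 0
#include <err.h>
#include <sys/mman.h>
#include <unistd.h>

static void *map_counter(int counter_fd, int nr_data_pages)
{
	size_t page_size = sysconf(_SC_PAGESIZE);
	size_t len = (1 + nr_data_pages) * page_size;	/* +1 metadata page */
	void *base;

	/* nr_data_pages must be 0 or a power of two, see perf_mmap() above */
	base = mmap(NULL, len, PROT_READ, MAP_SHARED, counter_fd, 0);
	if (base == MAP_FAILED)
		err(1, "mmap");

	return base;	/* base: metadata page; base + page_size: data pages */
}
#endif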
1647
1648static int perf_fasync(int fd, struct file *filp, int on)
1649{
1650 struct perf_counter *counter = filp->private_data;
1651 struct inode *inode = filp->f_path.dentry->d_inode;
1652 int retval;
1653
1654 mutex_lock(&inode->i_mutex);
1655 retval = fasync_helper(fd, filp, on, &counter->fasync);
1656 mutex_unlock(&inode->i_mutex);
1657
1658 if (retval < 0)
1659 return retval;
1660
1661 return 0;
1662}
1663
1664static const struct file_operations perf_fops = {
1665 .release = perf_release,
1666 .read = perf_read,
1667 .poll = perf_poll,
1668 .unlocked_ioctl = perf_ioctl,
1669 .compat_ioctl = perf_ioctl,
1670 .mmap = perf_mmap,
1671 .fasync = perf_fasync,
1672};
1673
1674/*
1675 * Perf counter wakeup
1676 *
1677 * If there's data, ensure we set the poll() state and publish everything
1678 * to user-space before waking everybody up.
1679 */
1680
1681void perf_counter_wakeup(struct perf_counter *counter)
1682{
1683 wake_up_all(&counter->waitq);
1684
1685 if (counter->pending_kill) {
1686 kill_fasync(&counter->fasync, SIGIO, counter->pending_kill);
1687 counter->pending_kill = 0;
1688 }
1689}
1690
1691/*
1692 * Pending wakeups
1693 *
1694 * Handle the case where we need to wake up from NMI (or rq->lock) context.
1695 *
1696 * The NMI bit means we cannot possibly take locks. Therefore, maintain a
1697 * singly-linked list and use cmpxchg() to add entries locklessly.
1698 */
1699
1700static void perf_pending_counter(struct perf_pending_entry *entry)
1701{
1702 struct perf_counter *counter = container_of(entry,
1703 struct perf_counter, pending);
1704
1705 if (counter->pending_disable) {
1706 counter->pending_disable = 0;
1707 perf_counter_disable(counter);
1708 }
1709
1710 if (counter->pending_wakeup) {
1711 counter->pending_wakeup = 0;
1712 perf_counter_wakeup(counter);
1713 }
1714}
1715
1716#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
1717
1718static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
1719 PENDING_TAIL,
1720};
1721
1722static void perf_pending_queue(struct perf_pending_entry *entry,
1723 void (*func)(struct perf_pending_entry *))
1724{
1725 struct perf_pending_entry **head;
1726
1727 if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
1728 return;
1729
1730 entry->func = func;
1731
1732 head = &get_cpu_var(perf_pending_head);
1733
1734 do {
1735 entry->next = *head;
1736 } while (cmpxchg(head, entry->next, entry) != entry->next);
1737
1738 set_perf_counter_pending();
1739
1740 put_cpu_var(perf_pending_head);
1741}
1742
1743static int __perf_pending_run(void)
1744{
1745 struct perf_pending_entry *list;
1746 int nr = 0;
1747
1748 list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
1749 while (list != PENDING_TAIL) {
1750 void (*func)(struct perf_pending_entry *);
1751 struct perf_pending_entry *entry = list;
1752
1753 list = list->next;
1754
1755 func = entry->func;
1756 entry->next = NULL;
1757 /*
1758 * Ensure we observe the unqueue before we issue the wakeup,
1759 * so that we won't be waiting forever.
1760 * -- see perf_not_pending().
1761 */
1762 smp_wmb();
1763
1764 func(entry);
1765 nr++;
1766 }
1767
1768 return nr;
1769}
1770
1771static inline int perf_not_pending(struct perf_counter *counter)
1772{
1773 /*
1774 * If we flush the queue on whichever CPU we happen to be running on,
1775 * there is a chance we don't need to wait at all.
1776 */
1777 get_cpu();
1778 __perf_pending_run();
1779 put_cpu();
1780
1781 /*
1782 * Ensure we see the proper queue state before going to sleep
1783 * so that we do not miss the wakeup. -- see __perf_pending_run()
1784 */
1785 smp_rmb();
1786 return counter->pending.next == NULL;
1787}
1788
1789static void perf_pending_sync(struct perf_counter *counter)
1790{
1791 wait_event(counter->waitq, perf_not_pending(counter));
1792}
1793
1794void perf_counter_do_pending(void)
1795{
1796 __perf_pending_run();
1797}
1798
1799/*
1800 * Callchain support -- arch specific
1801 */
1802
1803__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
1804{
1805 return NULL;
1806}
1807
1808/*
1809 * Output
1810 */
1811
1812struct perf_output_handle {
1813 struct perf_counter *counter;
1814 struct perf_mmap_data *data;
1815 unsigned int offset;
1816 unsigned int head;
1817 int nmi;
1818 int overflow;
1819 int locked;
1820 unsigned long flags;
1821};
1822
1823static void perf_output_wakeup(struct perf_output_handle *handle)
1824{
1825 atomic_set(&handle->data->poll, POLL_IN);
1826
1827 if (handle->nmi) {
1828 handle->counter->pending_wakeup = 1;
1829 perf_pending_queue(&handle->counter->pending,
1830 perf_pending_counter);
1831 } else
1832 perf_counter_wakeup(handle->counter);
1833}
1834
1835/*
1836 * Curious locking construct.
1837 *
1838 * We need to ensure a later event doesn't publish a head when a former
1839 * event isn't done writing. However, since we need to deal with NMIs we
1840 * cannot fully serialize things.
1841 *
1842 * What we do is serialize between CPUs so we only have to deal with NMI
1843 * nesting on a single CPU.
1844 *
1845 * We only publish the head (and generate a wakeup) when the outer-most
1846 * event completes.
1847 */
1848static void perf_output_lock(struct perf_output_handle *handle)
1849{
1850 struct perf_mmap_data *data = handle->data;
1851 int cpu;
1852
1853 handle->locked = 0;
1854
1855 local_irq_save(handle->flags);
1856 cpu = smp_processor_id();
1857
1858 if (in_nmi() && atomic_read(&data->lock) == cpu)
1859 return;
1860
1861 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
1862 cpu_relax();
1863
1864 handle->locked = 1;
1865}
1866
1867static void perf_output_unlock(struct perf_output_handle *handle)
1868{
1869 struct perf_mmap_data *data = handle->data;
1870 int head, cpu;
1871
1872 data->done_head = data->head;
1873
1874 if (!handle->locked)
1875 goto out;
1876
1877again:
1878 /*
1879 * The xchg implies a full barrier that ensures all writes are done
1880 * before we publish the new head, matched by a rmb() in userspace when
1881 * reading this position.
1882 */
1883 while ((head = atomic_xchg(&data->done_head, 0)))
1884 data->user_page->data_head = head;
1885
1886 /*
1887 * NMI can happen here, which means we can miss a done_head update.
1888 */
1889
1890 cpu = atomic_xchg(&data->lock, -1);
1891 WARN_ON_ONCE(cpu != smp_processor_id());
1892
1893 /*
1894 * Therefore we have to check that we did not miss a done_head update.
1895 */
1896 if (unlikely(atomic_read(&data->done_head))) {
1897 /*
1898 * Since we had it locked, we can lock it again.
1899 */
1900 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
1901 cpu_relax();
1902
1903 goto again;
1904 }
1905
1906 if (atomic_xchg(&data->wakeup, 0))
1907 perf_output_wakeup(handle);
1908out:
1909 local_irq_restore(handle->flags);
1910}
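/*
 * Illustrative user-space sketch (not part of this file): the store to
 * user_page->data_head above is ordered by the barrier implied by
 * atomic_xchg(); the reader side is expected to pair it with a read
 * barrier before consuming the newly published data.  The
 * struct perf_counter_mmap_page name and the rmb() macro are assumed
 * here -- they come from the mmap ABI headers, not from this file.
 */
#if 0
static unsigned int read_data_head(const volatile struct perf_counter_mmap_page *pc)
{
	unsigned int head = pc->data_head;

	rmb();		/* pairs with the xchg() barrier noted above */
	return head;	/* data up to 'head' may now be read */
}
#endif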
1911
1912static int perf_output_begin(struct perf_output_handle *handle,
1913 struct perf_counter *counter, unsigned int size,
1914 int nmi, int overflow)
1915{
1916 struct perf_mmap_data *data;
1917 unsigned int offset, head;
1918
1919 /*
1920 * For inherited counters we send all the output towards the parent.
1921 */
1922 if (counter->parent)
1923 counter = counter->parent;
1924
1925 rcu_read_lock();
1926 data = rcu_dereference(counter->data);
1927 if (!data)
1928 goto out;
1929
1930 handle->data = data;
1931 handle->counter = counter;
1932 handle->nmi = nmi;
1933 handle->overflow = overflow;
1934
1935 if (!data->nr_pages)
1936 goto fail;
1937
1938 perf_output_lock(handle);
1939
1940 do {
1941 offset = head = atomic_read(&data->head);
1942 head += size;
1943 } while (atomic_cmpxchg(&data->head, offset, head) != offset);
1944
1945 handle->offset = offset;
1946 handle->head = head;
1947
1948 if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
1949 atomic_set(&data->wakeup, 1);
1950
1951 return 0;
1952
1953fail:
1954 perf_output_wakeup(handle);
1955out:
1956 rcu_read_unlock();
1957
1958 return -ENOSPC;
1959}
1960
1961static void perf_output_copy(struct perf_output_handle *handle,
1962 void *buf, unsigned int len)
1963{
1964 unsigned int pages_mask;
1965 unsigned int offset;
1966 unsigned int size;
1967 void **pages;
1968
1969 offset = handle->offset;
1970 pages_mask = handle->data->nr_pages - 1;
1971 pages = handle->data->data_pages;
1972
1973 do {
1974 unsigned int page_offset;
1975 int nr;
1976
1977 nr = (offset >> PAGE_SHIFT) & pages_mask;
1978 page_offset = offset & (PAGE_SIZE - 1);
1979 size = min_t(unsigned int, PAGE_SIZE - page_offset, len);
1980
1981 memcpy(pages[nr] + page_offset, buf, size);
1982
1983 len -= size;
1984 buf += size;
1985 offset += size;
1986 } while (len);
1987
1988 handle->offset = offset;
1989
1990 /*
1991 * Check we didn't copy past our reservation window, taking the
1992 * possible unsigned int wrap into account.
1993 */
1994 WARN_ON_ONCE(((int)(handle->head - handle->offset)) < 0);
1995}
1996
1997#define perf_output_put(handle, x) \
1998 perf_output_copy((handle), &(x), sizeof(x))
1999
2000static void perf_output_end(struct perf_output_handle *handle)
2001{
2002 struct perf_counter *counter = handle->counter;
2003 struct perf_mmap_data *data = handle->data;
2004
2005 int wakeup_events = counter->hw_event.wakeup_events;
2006
2007 if (handle->overflow && wakeup_events) {
2008 int events = atomic_inc_return(&data->events);
2009 if (events >= wakeup_events) {
2010 atomic_sub(wakeup_events, &data->events);
2011 atomic_set(&data->wakeup, 1);
2012 }
2013 }
2014
2015 perf_output_unlock(handle);
2016 rcu_read_unlock();
2017}
2018
2019static void perf_counter_output(struct perf_counter *counter,
2020 int nmi, struct pt_regs *regs, u64 addr)
2021{
2022 int ret;
2023 u64 record_type = counter->hw_event.record_type;
2024 struct perf_output_handle handle;
2025 struct perf_event_header header;
2026 u64 ip;
2027 struct {
2028 u32 pid, tid;
2029 } tid_entry;
2030 struct {
2031 u64 event;
2032 u64 counter;
2033 } group_entry;
2034 struct perf_callchain_entry *callchain = NULL;
2035 int callchain_size = 0;
2036 u64 time;
2037 struct {
2038 u32 cpu, reserved;
2039 } cpu_entry;
2040
2041 header.type = 0;
2042 header.size = sizeof(header);
2043
2044 header.misc = PERF_EVENT_MISC_OVERFLOW;
2045 header.misc |= perf_misc_flags(regs);
2046
2047 if (record_type & PERF_RECORD_IP) {
2048 ip = perf_instruction_pointer(regs);
2049 header.type |= PERF_RECORD_IP;
2050 header.size += sizeof(ip);
2051 }
2052
2053 if (record_type & PERF_RECORD_TID) {
2054 /* namespace issues */
2055 tid_entry.pid = current->group_leader->pid;
2056 tid_entry.tid = current->pid;
2057
2058 header.type |= PERF_RECORD_TID;
2059 header.size += sizeof(tid_entry);
2060 }
2061
2062 if (record_type & PERF_RECORD_TIME) {
2063 /*
2064 * Maybe do better on x86 and provide cpu_clock_nmi()
2065 */
2066 time = sched_clock();
2067
2068 header.type |= PERF_RECORD_TIME;
2069 header.size += sizeof(u64);
2070 }
2071
2072 if (record_type & PERF_RECORD_ADDR) {
2073 header.type |= PERF_RECORD_ADDR;
2074 header.size += sizeof(u64);
2075 }
2076
2077 if (record_type & PERF_RECORD_CONFIG) {
2078 header.type |= PERF_RECORD_CONFIG;
2079 header.size += sizeof(u64);
2080 }
2081
2082 if (record_type & PERF_RECORD_CPU) {
2083 header.type |= PERF_RECORD_CPU;
2084 header.size += sizeof(cpu_entry);
2085
2086 cpu_entry.cpu = raw_smp_processor_id();
2087 }
2088
2089 if (record_type & PERF_RECORD_GROUP) {
2090 header.type |= PERF_RECORD_GROUP;
2091 header.size += sizeof(u64) +
2092 counter->nr_siblings * sizeof(group_entry);
2093 }
2094
2095 if (record_type & PERF_RECORD_CALLCHAIN) {
2096 callchain = perf_callchain(regs);
2097
2098 if (callchain) {
2099 callchain_size = (1 + callchain->nr) * sizeof(u64);
2100
2101 header.type |= PERF_RECORD_CALLCHAIN;
2102 header.size += callchain_size;
2103 }
2104 }
2105
2106 ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
2107 if (ret)
2108 return;
2109
2110 perf_output_put(&handle, header);
2111
2112 if (record_type & PERF_RECORD_IP)
2113 perf_output_put(&handle, ip);
2114
2115 if (record_type & PERF_RECORD_TID)
2116 perf_output_put(&handle, tid_entry);
2117
2118 if (record_type & PERF_RECORD_TIME)
2119 perf_output_put(&handle, time);
2120
2121 if (record_type & PERF_RECORD_ADDR)
2122 perf_output_put(&handle, addr);
2123
2124 if (record_type & PERF_RECORD_CONFIG)
2125 perf_output_put(&handle, counter->hw_event.config);
2126
2127 if (record_type & PERF_RECORD_CPU)
2128 perf_output_put(&handle, cpu_entry);
2129
2130 /*
2131 * XXX PERF_RECORD_GROUP vs inherited counters seems difficult.
2132 */
2133 if (record_type & PERF_RECORD_GROUP) {
2134 struct perf_counter *leader, *sub;
2135 u64 nr = counter->nr_siblings;
2136
2137 perf_output_put(&handle, nr);
2138
2139 leader = counter->group_leader;
2140 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
2141 if (sub != counter)
2142 sub->pmu->read(sub);
2143
2144 group_entry.event = sub->hw_event.config;
2145 group_entry.counter = atomic64_read(&sub->count);
2146
2147 perf_output_put(&handle, group_entry);
2148 }
2149 }
2150
2151 if (callchain)
2152 perf_output_copy(&handle, callchain, callchain_size);
2153
2154 perf_output_end(&handle);
2155}
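/*
 * For reference, the variable-size record emitted above is laid out in
 * the order the bits are tested: the header, then the ip (u64), the
 * {pid, tid} pair, the timestamp (u64), the address (u64), the config
 * (u64), the {cpu, reserved} pair, the sibling-group array (a u64
 * count followed by {event, counter} pairs) and finally the callchain
 * ((1 + nr) u64 entries) -- each field present only when the matching
 * PERF_RECORD_* bit is set in hw_event.record_type.
 */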
2156
2157/*
2158 * comm tracking
2159 */
2160
2161struct perf_comm_event {
2162 struct task_struct *task;
2163 char *comm;
2164 int comm_size;
2165
2166 struct {
2167 struct perf_event_header header;
2168
2169 u32 pid;
2170 u32 tid;
2171 } event;
2172};
2173
2174static void perf_counter_comm_output(struct perf_counter *counter,
2175 struct perf_comm_event *comm_event)
2176{
2177 struct perf_output_handle handle;
2178 int size = comm_event->event.header.size;
2179 int ret = perf_output_begin(&handle, counter, size, 0, 0);
2180
2181 if (ret)
2182 return;
2183
2184 perf_output_put(&handle, comm_event->event);
2185 perf_output_copy(&handle, comm_event->comm,
2186 comm_event->comm_size);
2187 perf_output_end(&handle);
2188}
2189
2190static int perf_counter_comm_match(struct perf_counter *counter,
2191 struct perf_comm_event *comm_event)
2192{
2193 if (counter->hw_event.comm &&
2194 comm_event->event.header.type == PERF_EVENT_COMM)
2195 return 1;
2196
2197 return 0;
2198}
2199
2200static void perf_counter_comm_ctx(struct perf_counter_context *ctx,
2201 struct perf_comm_event *comm_event)
2202{
2203 struct perf_counter *counter;
2204
2205 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
2206 return;
2207
2208 rcu_read_lock();
2209 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
2210 if (perf_counter_comm_match(counter, comm_event))
2211 perf_counter_comm_output(counter, comm_event);
2212 }
2213 rcu_read_unlock();
2214}
2215
2216static void perf_counter_comm_event(struct perf_comm_event *comm_event)
2217{
2218 struct perf_cpu_context *cpuctx;
2219 unsigned int size;
2220 char *comm = comm_event->task->comm;
2221
2222 size = ALIGN(strlen(comm)+1, sizeof(u64));
2223
2224 comm_event->comm = comm;
2225 comm_event->comm_size = size;
2226
2227 comm_event->event.header.size = sizeof(comm_event->event) + size;
2228
2229 cpuctx = &get_cpu_var(perf_cpu_context);
2230 perf_counter_comm_ctx(&cpuctx->ctx, comm_event);
2231 put_cpu_var(perf_cpu_context);
2232
2233 perf_counter_comm_ctx(&current->perf_counter_ctx, comm_event);
2234}
2235
2236void perf_counter_comm(struct task_struct *task)
2237{
2238 struct perf_comm_event comm_event;
2239
2240 if (!atomic_read(&nr_comm_tracking))
2241 return;
2242
2243 comm_event = (struct perf_comm_event){
2244 .task = task,
2245 .event = {
2246 .header = { .type = PERF_EVENT_COMM, },
2247 .pid = task->group_leader->pid,
2248 .tid = task->pid,
2249 },
2250 };
2251
2252 perf_counter_comm_event(&comm_event);
2253}
2254
2255/*
2256 * mmap tracking
2257 */
2258
2259struct perf_mmap_event {
2260 struct file *file;
2261 char *file_name;
2262 int file_size;
2263
2264 struct {
2265 struct perf_event_header header;
2266
2267 u32 pid;
2268 u32 tid;
2269 u64 start;
2270 u64 len;
2271 u64 pgoff;
2272 } event;
2273};
2274
2275static void perf_counter_mmap_output(struct perf_counter *counter,
2276 struct perf_mmap_event *mmap_event)
2277{
2278 struct perf_output_handle handle;
2279 int size = mmap_event->event.header.size;
2280 int ret = perf_output_begin(&handle, counter, size, 0, 0);
2281
2282 if (ret)
2283 return;
2284
2285 perf_output_put(&handle, mmap_event->event);
2286 perf_output_copy(&handle, mmap_event->file_name,
2287 mmap_event->file_size);
2288 perf_output_end(&handle);
2289}
2290
2291static int perf_counter_mmap_match(struct perf_counter *counter,
2292 struct perf_mmap_event *mmap_event)
2293{
2294 if (counter->hw_event.mmap &&
2295 mmap_event->event.header.type == PERF_EVENT_MMAP)
2296 return 1;
2297
2298 if (counter->hw_event.munmap &&
2299 mmap_event->event.header.type == PERF_EVENT_MUNMAP)
2300 return 1;
2301
2302 return 0;
2303}
2304
2305static void perf_counter_mmap_ctx(struct perf_counter_context *ctx,
2306 struct perf_mmap_event *mmap_event)
2307{
2308 struct perf_counter *counter;
2309
2310 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
2311 return;
2312
2313 rcu_read_lock();
2314 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
2315 if (perf_counter_mmap_match(counter, mmap_event))
2316 perf_counter_mmap_output(counter, mmap_event);
2317 }
2318 rcu_read_unlock();
2319}
2320
2321static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
2322{
2323 struct perf_cpu_context *cpuctx;
2324 struct file *file = mmap_event->file;
2325 unsigned int size;
2326 char tmp[16];
2327 char *buf = NULL;
2328 char *name;
2329
2330 if (file) {
2331 buf = kzalloc(PATH_MAX, GFP_KERNEL);
2332 if (!buf) {
2333 name = strncpy(tmp, "//enomem", sizeof(tmp));
2334 goto got_name;
2335 }
2336 name = d_path(&file->f_path, buf, PATH_MAX);
2337 if (IS_ERR(name)) {
2338 name = strncpy(tmp, "//toolong", sizeof(tmp));
2339 goto got_name;
2340 }
2341 } else {
2342 name = strncpy(tmp, "//anon", sizeof(tmp));
2343 goto got_name;
2344 }
2345
2346got_name:
2347 size = ALIGN(strlen(name)+1, sizeof(u64));
2348
2349 mmap_event->file_name = name;
2350 mmap_event->file_size = size;
2351
2352 mmap_event->event.header.size = sizeof(mmap_event->event) + size;
2353
2354 cpuctx = &get_cpu_var(perf_cpu_context);
2355 perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event);
2356 put_cpu_var(perf_cpu_context);
2357
2358 perf_counter_mmap_ctx(&current->perf_counter_ctx, mmap_event);
2359
2360 kfree(buf);
2361}
2362
2363void perf_counter_mmap(unsigned long addr, unsigned long len,
2364 unsigned long pgoff, struct file *file)
2365{
2366 struct perf_mmap_event mmap_event;
2367
2368 if (!atomic_read(&nr_mmap_tracking))
2369 return;
2370
2371 mmap_event = (struct perf_mmap_event){
2372 .file = file,
2373 .event = {
2374 .header = { .type = PERF_EVENT_MMAP, },
2375 .pid = current->group_leader->pid,
2376 .tid = current->pid,
2377 .start = addr,
2378 .len = len,
2379 .pgoff = pgoff,
2380 },
2381 };
2382
2383 perf_counter_mmap_event(&mmap_event);
2384}
2385
2386void perf_counter_munmap(unsigned long addr, unsigned long len,
2387 unsigned long pgoff, struct file *file)
2388{
2389 struct perf_mmap_event mmap_event;
2390
2391 if (!atomic_read(&nr_munmap_tracking))
2392 return;
2393
2394 mmap_event = (struct perf_mmap_event){
2395 .file = file,
2396 .event = {
2397 .header = { .type = PERF_EVENT_MUNMAP, },
2398 .pid = current->group_leader->pid,
2399 .tid = current->pid,
2400 .start = addr,
2401 .len = len,
2402 .pgoff = pgoff,
2403 },
2404 };
2405
2406 perf_counter_mmap_event(&mmap_event);
2407}
2408
2409/*
2410 * Generic counter overflow handling.
2411 */
2412
2413int perf_counter_overflow(struct perf_counter *counter,
2414 int nmi, struct pt_regs *regs, u64 addr)
2415{
2416 int events = atomic_read(&counter->event_limit);
2417 int ret = 0;
2418
2419 counter->hw.interrupts++;
2420
2421 /*
2422 * XXX event_limit might not quite work as expected on inherited
2423 * counters
2424 */
2425
2426 counter->pending_kill = POLL_IN;
2427 if (events && atomic_dec_and_test(&counter->event_limit)) {
2428 ret = 1;
2429 counter->pending_kill = POLL_HUP;
2430 if (nmi) {
2431 counter->pending_disable = 1;
2432 perf_pending_queue(&counter->pending,
2433 perf_pending_counter);
2434 } else
2435 perf_counter_disable(counter);
2436 }
2437
2438 perf_counter_output(counter, nmi, regs, addr);
2439 return ret;
2440}
2441
2442/*
2443 * Generic software counter infrastructure
2444 */
2445
2446static void perf_swcounter_update(struct perf_counter *counter)
2447{
2448 struct hw_perf_counter *hwc = &counter->hw;
2449 u64 prev, now;
2450 s64 delta;
2451
2452again:
2453 prev = atomic64_read(&hwc->prev_count);
2454 now = atomic64_read(&hwc->count);
2455 if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev)
2456 goto again;
2457
2458 delta = now - prev;
2459
2460 atomic64_add(delta, &counter->count);
2461 atomic64_sub(delta, &hwc->period_left);
2462}
2463
2464static void perf_swcounter_set_period(struct perf_counter *counter)
2465{
2466 struct hw_perf_counter *hwc = &counter->hw;
2467 s64 left = atomic64_read(&hwc->period_left);
2468 s64 period = hwc->irq_period;
2469
2470 if (unlikely(left <= -period)) {
2471 left = period;
2472 atomic64_set(&hwc->period_left, left);
2473 }
2474
2475 if (unlikely(left <= 0)) {
2476 left += period;
2477 atomic64_add(period, &hwc->period_left);
2478 }
2479
2480 atomic64_set(&hwc->prev_count, -left);
2481 atomic64_set(&hwc->count, -left);
2482}
2483
2484static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
2485{
2486 enum hrtimer_restart ret = HRTIMER_RESTART;
2487 struct perf_counter *counter;
2488 struct pt_regs *regs;
2489 u64 period;
2490
2491 counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
2492 counter->pmu->read(counter);
2493
2494 regs = get_irq_regs();
2495 /*
2496 * In case we exclude kernel IPs or are somehow not in interrupt
2497 * context, provide the next best thing, the user IP.
2498 */
2499 if ((counter->hw_event.exclude_kernel || !regs) &&
2500 !counter->hw_event.exclude_user)
2501 regs = task_pt_regs(current);
2502
2503 if (regs) {
2504 if (perf_counter_overflow(counter, 0, regs, 0))
2505 ret = HRTIMER_NORESTART;
2506 }
2507
2508 period = max_t(u64, 10000, counter->hw.irq_period);
2509 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
2510
2511 return ret;
2512}
2513
2514static void perf_swcounter_overflow(struct perf_counter *counter,
2515 int nmi, struct pt_regs *regs, u64 addr)
2516{
2517 perf_swcounter_update(counter);
2518 perf_swcounter_set_period(counter);
2519 if (perf_counter_overflow(counter, nmi, regs, addr))
2520 /* soft-disable the counter */
2521 ;
2522
2523}
2524
2525static int perf_swcounter_match(struct perf_counter *counter,
2526 enum perf_event_types type,
2527 u32 event, struct pt_regs *regs)
2528{
2529 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
2530 return 0;
2531
2532 if (perf_event_raw(&counter->hw_event))
2533 return 0;
2534
2535 if (perf_event_type(&counter->hw_event) != type)
2536 return 0;
2537
2538 if (perf_event_id(&counter->hw_event) != event)
2539 return 0;
2540
2541 if (counter->hw_event.exclude_user && user_mode(regs))
2542 return 0;
2543
2544 if (counter->hw_event.exclude_kernel && !user_mode(regs))
2545 return 0;
2546
2547 return 1;
2548}
2549
2550static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
2551 int nmi, struct pt_regs *regs, u64 addr)
2552{
2553 int neg = atomic64_add_negative(nr, &counter->hw.count);
2554 if (counter->hw.irq_period && !neg)
2555 perf_swcounter_overflow(counter, nmi, regs, addr);
2556}
2557
2558static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
2559 enum perf_event_types type, u32 event,
2560 u64 nr, int nmi, struct pt_regs *regs,
2561 u64 addr)
2562{
2563 struct perf_counter *counter;
2564
2565 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
2566 return;
2567
2568 rcu_read_lock();
2569 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
2570 if (perf_swcounter_match(counter, type, event, regs))
2571 perf_swcounter_add(counter, nr, nmi, regs, addr);
2572 }
2573 rcu_read_unlock();
2574}
2575
2576static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
2577{
2578 if (in_nmi())
2579 return &cpuctx->recursion[3];
2580
2581 if (in_irq())
2582 return &cpuctx->recursion[2];
2583
2584 if (in_softirq())
2585 return &cpuctx->recursion[1];
2586
2587 return &cpuctx->recursion[0];
2588}
2589
2590static void __perf_swcounter_event(enum perf_event_types type, u32 event,
2591 u64 nr, int nmi, struct pt_regs *regs,
2592 u64 addr)
2593{
2594 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
2595 int *recursion = perf_swcounter_recursion_context(cpuctx);
2596
2597 if (*recursion)
2598 goto out;
2599
2600 (*recursion)++;
2601 barrier();
2602
2603 perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
2604 nr, nmi, regs, addr);
2605 if (cpuctx->task_ctx) {
2606 perf_swcounter_ctx_event(cpuctx->task_ctx, type, event,
2607 nr, nmi, regs, addr);
2608 }
2609
2610 barrier();
2611 (*recursion)--;
2612
2613out:
2614 put_cpu_var(perf_cpu_context);
2615}
2616
2617void
2618perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
2619{
2620 __perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs, addr);
2621}
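/*
 * Hypothetical call-site sketch (not from this patch): a kernel
 * subsystem counts a software event by calling the helper above with
 * nr = 1; the nmi flag and the fault address (0 when there is none)
 * are passed straight through to the overflow path.
 */
#if 0
static void count_context_switch_sketch(void)
{
	perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 0,
			     task_pt_regs(current), 0);
}
#endif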
2622
2623static void perf_swcounter_read(struct perf_counter *counter)
2624{
2625 perf_swcounter_update(counter);
2626}
2627
2628static int perf_swcounter_enable(struct perf_counter *counter)
2629{
2630 perf_swcounter_set_period(counter);
2631 return 0;
2632}
2633
2634static void perf_swcounter_disable(struct perf_counter *counter)
2635{
2636 perf_swcounter_update(counter);
2637}
2638
2639static const struct pmu perf_ops_generic = {
2640 .enable = perf_swcounter_enable,
2641 .disable = perf_swcounter_disable,
2642 .read = perf_swcounter_read,
2643};
2644
2645/*
2646 * Software counter: cpu wall time clock
2647 */
2648
2649static void cpu_clock_perf_counter_update(struct perf_counter *counter)
2650{
2651 int cpu = raw_smp_processor_id();
2652 s64 prev;
2653 u64 now;
2654
2655 now = cpu_clock(cpu);
2656 prev = atomic64_read(&counter->hw.prev_count);
2657 atomic64_set(&counter->hw.prev_count, now);
2658 atomic64_add(now - prev, &counter->count);
2659}
2660
2661static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
2662{
2663 struct hw_perf_counter *hwc = &counter->hw;
2664 int cpu = raw_smp_processor_id();
2665
2666 atomic64_set(&hwc->prev_count, cpu_clock(cpu));
2667 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2668 hwc->hrtimer.function = perf_swcounter_hrtimer;
2669 if (hwc->irq_period) {
2670 u64 period = max_t(u64, 10000, hwc->irq_period);
2671 __hrtimer_start_range_ns(&hwc->hrtimer,
2672 ns_to_ktime(period), 0,
2673 HRTIMER_MODE_REL, 0);
2674 }
2675
2676 return 0;
2677}
2678
2679static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
2680{
2681 hrtimer_cancel(&counter->hw.hrtimer);
2682 cpu_clock_perf_counter_update(counter);
2683}
2684
2685static void cpu_clock_perf_counter_read(struct perf_counter *counter)
2686{
2687 cpu_clock_perf_counter_update(counter);
2688}
2689
2690static const struct pmu perf_ops_cpu_clock = {
2691 .enable = cpu_clock_perf_counter_enable,
2692 .disable = cpu_clock_perf_counter_disable,
2693 .read = cpu_clock_perf_counter_read,
2694};
2695
2696/*
2697 * Software counter: task time clock
2698 */
2699
2700static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
2701{
2702 u64 prev;
2703 s64 delta;
2704
2705 prev = atomic64_xchg(&counter->hw.prev_count, now);
2706 delta = now - prev;
2707 atomic64_add(delta, &counter->count);
2708}
2709
2710static int task_clock_perf_counter_enable(struct perf_counter *counter)
2711{
2712 struct hw_perf_counter *hwc = &counter->hw;
2713 u64 now;
2714
2715 now = counter->ctx->time;
2716
2717 atomic64_set(&hwc->prev_count, now);
2718 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2719 hwc->hrtimer.function = perf_swcounter_hrtimer;
2720 if (hwc->irq_period) {
2721 u64 period = max_t(u64, 10000, hwc->irq_period);
2722 __hrtimer_start_range_ns(&hwc->hrtimer,
2723 ns_to_ktime(period), 0,
2724 HRTIMER_MODE_REL, 0);
2725 }
2726
2727 return 0;
2728}
2729
2730static void task_clock_perf_counter_disable(struct perf_counter *counter)
2731{
2732 hrtimer_cancel(&counter->hw.hrtimer);
2733 task_clock_perf_counter_update(counter, counter->ctx->time);
2734
2735}
2736
2737static void task_clock_perf_counter_read(struct perf_counter *counter)
2738{
2739 u64 time;
2740
2741 if (!in_nmi()) {
2742 update_context_time(counter->ctx);
2743 time = counter->ctx->time;
2744 } else {
2745 u64 now = perf_clock();
2746 u64 delta = now - counter->ctx->timestamp;
2747 time = counter->ctx->time + delta;
2748 }
2749
2750 task_clock_perf_counter_update(counter, time);
2751}
2752
2753static const struct pmu perf_ops_task_clock = {
2754 .enable = task_clock_perf_counter_enable,
2755 .disable = task_clock_perf_counter_disable,
2756 .read = task_clock_perf_counter_read,
2757};
2758
2759/*
2760 * Software counter: cpu migrations
2761 */
2762
2763static inline u64 get_cpu_migrations(struct perf_counter *counter)
2764{
2765 struct task_struct *curr = counter->ctx->task;
2766
2767 if (curr)
2768 return curr->se.nr_migrations;
2769 return cpu_nr_migrations(smp_processor_id());
2770}
2771
2772static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
2773{
2774 u64 prev, now;
2775 s64 delta;
2776
2777 prev = atomic64_read(&counter->hw.prev_count);
2778 now = get_cpu_migrations(counter);
2779
2780 atomic64_set(&counter->hw.prev_count, now);
2781
2782 delta = now - prev;
2783
2784 atomic64_add(delta, &counter->count);
2785}
2786
2787static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
2788{
2789 cpu_migrations_perf_counter_update(counter);
2790}
2791
2792static int cpu_migrations_perf_counter_enable(struct perf_counter *counter)
2793{
2794 if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
2795 atomic64_set(&counter->hw.prev_count,
2796 get_cpu_migrations(counter));
2797 return 0;
2798}
2799
2800static void cpu_migrations_perf_counter_disable(struct perf_counter *counter)
2801{
2802 cpu_migrations_perf_counter_update(counter);
2803}
2804
2805static const struct pmu perf_ops_cpu_migrations = {
2806 .enable = cpu_migrations_perf_counter_enable,
2807 .disable = cpu_migrations_perf_counter_disable,
2808 .read = cpu_migrations_perf_counter_read,
2809};
2810
2811#ifdef CONFIG_EVENT_PROFILE
2812void perf_tpcounter_event(int event_id)
2813{
2814 struct pt_regs *regs = get_irq_regs();
2815
2816 if (!regs)
2817 regs = task_pt_regs(current);
2818
2819 __perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs, 0);
2820}
2821EXPORT_SYMBOL_GPL(perf_tpcounter_event);
2822
2823extern int ftrace_profile_enable(int);
2824extern void ftrace_profile_disable(int);
2825
2826static void tp_perf_counter_destroy(struct perf_counter *counter)
2827{
2828 ftrace_profile_disable(perf_event_id(&counter->hw_event));
2829}
2830
2831static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
2832{
2833 int event_id = perf_event_id(&counter->hw_event);
2834 int ret;
2835
2836 ret = ftrace_profile_enable(event_id);
2837 if (ret)
2838 return NULL;
2839
2840 counter->destroy = tp_perf_counter_destroy;
2841 counter->hw.irq_period = counter->hw_event.irq_period;
2842
2843 return &perf_ops_generic;
2844}
2845#else
2846static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
2847{
2848 return NULL;
2849}
2850#endif
2851
2852static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
2853{
2854 const struct pmu *pmu = NULL;
2855
2856 /*
2857 * Software counters (currently) can't in general distinguish
2858 * between user, kernel and hypervisor events.
2859 * However, context switches and cpu migrations are considered
2860 * to be kernel events, and page faults are never hypervisor
2861 * events.
2862 */
2863 switch (perf_event_id(&counter->hw_event)) {
2864 case PERF_COUNT_CPU_CLOCK:
2865 pmu = &perf_ops_cpu_clock;
2866
2867 break;
2868 case PERF_COUNT_TASK_CLOCK:
2869 /*
2870 * If the user instantiates this as a per-cpu counter,
2871 * use the cpu_clock counter instead.
2872 */
2873 if (counter->ctx->task)
2874 pmu = &perf_ops_task_clock;
2875 else
2876 pmu = &perf_ops_cpu_clock;
2877
2878 break;
2879 case PERF_COUNT_PAGE_FAULTS:
2880 case PERF_COUNT_PAGE_FAULTS_MIN:
2881 case PERF_COUNT_PAGE_FAULTS_MAJ:
2882 case PERF_COUNT_CONTEXT_SWITCHES:
2883 pmu = &perf_ops_generic;
2884 break;
2885 case PERF_COUNT_CPU_MIGRATIONS:
2886 if (!counter->hw_event.exclude_kernel)
2887 pmu = &perf_ops_cpu_migrations;
2888 break;
2889 }
2890
2891 return pmu;
2892}
2893
2894/*
2895 * Allocate and initialize a counter structure
2896 */
2897static struct perf_counter *
2898perf_counter_alloc(struct perf_counter_hw_event *hw_event,
2899 int cpu,
2900 struct perf_counter_context *ctx,
2901 struct perf_counter *group_leader,
2902 gfp_t gfpflags)
2903{
2904 const struct pmu *pmu;
2905 struct perf_counter *counter;
2906 struct hw_perf_counter *hwc;
2907 long err;
2908
2909 counter = kzalloc(sizeof(*counter), gfpflags);
2910 if (!counter)
2911 return ERR_PTR(-ENOMEM);
2912
2913 /*
2914 * Single counters are their own group leaders, with an
2915 * empty sibling list:
2916 */
2917 if (!group_leader)
2918 group_leader = counter;
2919
2920 mutex_init(&counter->mutex);
2921 INIT_LIST_HEAD(&counter->list_entry);
2922 INIT_LIST_HEAD(&counter->event_entry);
2923 INIT_LIST_HEAD(&counter->sibling_list);
2924 init_waitqueue_head(&counter->waitq);
2925
2926 mutex_init(&counter->mmap_mutex);
2927
2928 INIT_LIST_HEAD(&counter->child_list);
2929
2930 counter->cpu = cpu;
2931 counter->hw_event = *hw_event;
2932 counter->group_leader = group_leader;
2933 counter->pmu = NULL;
2934 counter->ctx = ctx;
2935
2936 counter->state = PERF_COUNTER_STATE_INACTIVE;
2937 if (hw_event->disabled)
2938 counter->state = PERF_COUNTER_STATE_OFF;
2939
2940 pmu = NULL;
2941
2942 hwc = &counter->hw;
2943 if (hw_event->freq && hw_event->irq_freq)
2944 hwc->irq_period = div64_u64(TICK_NSEC, hw_event->irq_freq);
2945 else
2946 hwc->irq_period = hw_event->irq_period;
2947
2948 /*
2949 * we currently do not support PERF_RECORD_GROUP on inherited counters
2950 */
2951 if (hw_event->inherit && (hw_event->record_type & PERF_RECORD_GROUP))
2952 goto done;
2953
2954 if (perf_event_raw(hw_event)) {
2955 pmu = hw_perf_counter_init(counter);
2956 goto done;
2957 }
2958
2959 switch (perf_event_type(hw_event)) {
2960 case PERF_TYPE_HARDWARE:
2961 pmu = hw_perf_counter_init(counter);
2962 break;
2963
2964 case PERF_TYPE_SOFTWARE:
2965 pmu = sw_perf_counter_init(counter);
2966 break;
2967
2968 case PERF_TYPE_TRACEPOINT:
2969 pmu = tp_perf_counter_init(counter);
2970 break;
2971 }
2972done:
2973 err = 0;
2974 if (!pmu)
2975 err = -EINVAL;
2976 else if (IS_ERR(pmu))
2977 err = PTR_ERR(pmu);
2978
2979 if (err) {
2980 kfree(counter);
2981 return ERR_PTR(err);
2982 }
2983
2984 counter->pmu = pmu;
2985
2986 atomic_inc(&nr_counters);
2987 if (counter->hw_event.mmap)
2988 atomic_inc(&nr_mmap_tracking);
2989 if (counter->hw_event.munmap)
2990 atomic_inc(&nr_munmap_tracking);
2991 if (counter->hw_event.comm)
2992 atomic_inc(&nr_comm_tracking);
2993
2994 return counter;
2995}
2996
2997/**
2998 * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
2999 *
3000 * @hw_event_uptr: event type attributes for monitoring/sampling
3001 * @pid: target pid
3002 * @cpu: target cpu
3003 * @group_fd: group leader counter fd
3004 */
3005SYSCALL_DEFINE5(perf_counter_open,
3006 const struct perf_counter_hw_event __user *, hw_event_uptr,
3007 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
3008{
3009 struct perf_counter *counter, *group_leader;
3010 struct perf_counter_hw_event hw_event;
3011 struct perf_counter_context *ctx;
3012 struct file *counter_file = NULL;
3013 struct file *group_file = NULL;
3014 int fput_needed = 0;
3015 int fput_needed2 = 0;
3016 int ret;
3017
3018 /* for future expandability... */
3019 if (flags)
3020 return -EINVAL;
3021
3022 if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
3023 return -EFAULT;
3024
3025 /*
3026 * Get the target context (task or percpu):
3027 */
3028 ctx = find_get_context(pid, cpu);
3029 if (IS_ERR(ctx))
3030 return PTR_ERR(ctx);
3031
3032 /*
3033 * Look up the group leader (we will attach this counter to it):
3034 */
3035 group_leader = NULL;
3036 if (group_fd != -1) {
3037 ret = -EINVAL;
3038 group_file = fget_light(group_fd, &fput_needed);
3039 if (!group_file)
3040 goto err_put_context;
3041 if (group_file->f_op != &perf_fops)
3042 goto err_put_context;
3043
3044 group_leader = group_file->private_data;
3045 /*
3046 * Do not allow a recursive hierarchy (this new sibling
3047 * becoming part of another group-sibling):
3048 */
3049 if (group_leader->group_leader != group_leader)
3050 goto err_put_context;
3051 /*
3052 * Do not allow attaching to a group in a different
3053 * task or CPU context:
3054 */
3055 if (group_leader->ctx != ctx)
3056 goto err_put_context;
3057 /*
3058 * Only a group leader can be exclusive or pinned
3059 */
3060 if (hw_event.exclusive || hw_event.pinned)
3061 goto err_put_context;
3062 }
3063
3064 counter = perf_counter_alloc(&hw_event, cpu, ctx, group_leader,
3065 GFP_KERNEL);
3066 ret = PTR_ERR(counter);
3067 if (IS_ERR(counter))
3068 goto err_put_context;
3069
3070 ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
3071 if (ret < 0)
3072 goto err_free_put_context;
3073
3074 counter_file = fget_light(ret, &fput_needed2);
3075 if (!counter_file)
3076 goto err_free_put_context;
3077
3078 counter->filp = counter_file;
3079 mutex_lock(&ctx->mutex);
3080 perf_install_in_context(ctx, counter, cpu);
3081 mutex_unlock(&ctx->mutex);
3082
3083 fput_light(counter_file, fput_needed2);
3084
3085out_fput:
3086 fput_light(group_file, fput_needed);
3087
3088 return ret;
3089
3090err_free_put_context:
3091 kfree(counter);
3092
3093err_put_context:
3094 put_context(ctx);
3095
3096 goto out_fput;
3097}
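/*
 * Illustrative user-space sketch (not part of this file): opening a
 * counter for the current task on any CPU, with no group leader and
 * flags == 0 as required above.  The syscall number and the full
 * struct perf_counter_hw_event layout live in headers outside this
 * listing, so both are assumptions here; only fields used above
 * (irq_period, disabled) are touched.
 */
#if 0
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

static int open_counter(void)
{
	struct perf_counter_hw_event hw_event;

	memset(&hw_event, 0, sizeof(hw_event));
	hw_event.irq_period = 100000;	/* overflow every 100000 events */
	hw_event.disabled   = 1;	/* start in PERF_COUNTER_STATE_OFF */

	/* pid = 0: current task, cpu = -1: any CPU, group_fd = -1, flags = 0 */
	return syscall(__NR_perf_counter_open, &hw_event, 0, -1, -1, 0);
}
#endif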
3098
3099/*
3100 * Initialize the perf_counter context in a task_struct:
3101 */
3102static void
3103__perf_counter_init_context(struct perf_counter_context *ctx,
3104 struct task_struct *task)
3105{
3106 memset(ctx, 0, sizeof(*ctx));
3107 spin_lock_init(&ctx->lock);
3108 mutex_init(&ctx->mutex);
3109 INIT_LIST_HEAD(&ctx->counter_list);
3110 INIT_LIST_HEAD(&ctx->event_list);
3111 ctx->task = task;
3112}
3113
3114/*
3115 * inherit a counter from parent task to child task:
3116 */
3117static struct perf_counter *
3118inherit_counter(struct perf_counter *parent_counter,
3119 struct task_struct *parent,
3120 struct perf_counter_context *parent_ctx,
3121 struct task_struct *child,
3122 struct perf_counter *group_leader,
3123 struct perf_counter_context *child_ctx)
3124{
3125 struct perf_counter *child_counter;
3126
3127 /*
3128 * Instead of creating recursive hierarchies of counters,
3129 * we link inherited counters back to the original parent,
3130 * which is guaranteed to have a filp that we use as the reference
3131 * count:
3132 */
3133 if (parent_counter->parent)
3134 parent_counter = parent_counter->parent;
3135
3136 child_counter = perf_counter_alloc(&parent_counter->hw_event,
3137 parent_counter->cpu, child_ctx,
3138 group_leader, GFP_KERNEL);
3139 if (IS_ERR(child_counter))
3140 return child_counter;
3141
3142 /*
3143 * Link it up in the child's context:
3144 */
3145 child_counter->task = child;
3146 add_counter_to_ctx(child_counter, child_ctx);
3147
3148 child_counter->parent = parent_counter;
3149 /*
3150 * inherit into child's child as well:
3151 */
3152 child_counter->hw_event.inherit = 1;
3153
3154 /*
3155 * Get a reference to the parent filp - we will fput it
3156 * when the child counter exits. This is safe to do because
3157 * we are in the parent and we know that the filp still
3158 * exists and has a nonzero count:
3159 */
3160 atomic_long_inc(&parent_counter->filp->f_count);
3161
3162 /*
3163 * Link this into the parent counter's child list
3164 */
3165 mutex_lock(&parent_counter->mutex);
3166 list_add_tail(&child_counter->child_list, &parent_counter->child_list);
3167
3168 /*
3169 * Make the child state follow the state of the parent counter,
3170 * not its hw_event.disabled bit. We hold the parent's mutex,
3171 * so we won't race with perf_counter_{en,dis}able_family.
3172 */
3173 if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
3174 child_counter->state = PERF_COUNTER_STATE_INACTIVE;
3175 else
3176 child_counter->state = PERF_COUNTER_STATE_OFF;
3177
3178 mutex_unlock(&parent_counter->mutex);
3179
3180 return child_counter;
3181}
3182
3183static int inherit_group(struct perf_counter *parent_counter,
3184 struct task_struct *parent,
3185 struct perf_counter_context *parent_ctx,
3186 struct task_struct *child,
3187 struct perf_counter_context *child_ctx)
3188{
3189 struct perf_counter *leader;
3190 struct perf_counter *sub;
3191 struct perf_counter *child_ctr;
3192
3193 leader = inherit_counter(parent_counter, parent, parent_ctx,
3194 child, NULL, child_ctx);
3195 if (IS_ERR(leader))
3196 return PTR_ERR(leader);
3197 list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
3198 child_ctr = inherit_counter(sub, parent, parent_ctx,
3199 child, leader, child_ctx);
3200 if (IS_ERR(child_ctr))
3201 return PTR_ERR(child_ctr);
3202 }
3203 return 0;
3204}
3205
3206static void sync_child_counter(struct perf_counter *child_counter,
3207 struct perf_counter *parent_counter)
3208{
3209 u64 child_val;
3210
3211 child_val = atomic64_read(&child_counter->count);
3212
3213 /*
3214 * Add back the child's count to the parent's count:
3215 */
3216 atomic64_add(child_val, &parent_counter->count);
3217 atomic64_add(child_counter->total_time_enabled,
3218 &parent_counter->child_total_time_enabled);
3219 atomic64_add(child_counter->total_time_running,
3220 &parent_counter->child_total_time_running);
3221
3222 /*
3223 * Remove this counter from the parent's list
3224 */
3225 mutex_lock(&parent_counter->mutex);
3226 list_del_init(&child_counter->child_list);
3227 mutex_unlock(&parent_counter->mutex);
3228
3229 /*
3230 * Release the parent counter, if this was the last
3231 * reference to it.
3232 */
3233 fput(parent_counter->filp);
3234}
3235
3236static void
3237__perf_counter_exit_task(struct task_struct *child,
3238 struct perf_counter *child_counter,
3239 struct perf_counter_context *child_ctx)
3240{
3241 struct perf_counter *parent_counter;
3242
3243 /*
3244 * If we do not self-reap then we have to wait for the
3245 * child task to unschedule (it will happen for sure),
3246 * so that its counter is at its final count. (This
3247 * condition triggers rarely - child tasks usually get
3248 * off their CPU before the parent has a chance to
3249 * get this far into the reaping action)
3250 */
3251 if (child != current) {
3252 wait_task_inactive(child, 0);
3253 update_counter_times(child_counter);
3254 list_del_counter(child_counter, child_ctx);
3255 } else {
3256 struct perf_cpu_context *cpuctx;
3257 unsigned long flags;
3258
3259 /*
3260 * Disable and unlink this counter.
3261 *
3262 * Be careful about zapping the list - IRQ/NMI context
3263 * could still be processing it:
3264 */
3265 local_irq_save(flags);
3266 perf_disable();
3267
3268 cpuctx = &__get_cpu_var(perf_cpu_context);
3269
3270 group_sched_out(child_counter, cpuctx, child_ctx);
3271 update_counter_times(child_counter);
3272
3273 list_del_counter(child_counter, child_ctx);
3274
3275 perf_enable();
3276 local_irq_restore(flags);
3277 }
3278
3279 parent_counter = child_counter->parent;
3280 /*
3281 * It can happen that parent exits first, and has counters
3282 * that are still around due to the child reference. These
3283 * counters need to be zapped - but otherwise linger.
3284 */
3285 if (parent_counter) {
3286 sync_child_counter(child_counter, parent_counter);
3287 free_counter(child_counter);
3288 }
3289}
3290
3291/*
3292 * When a child task exits, feed back counter values to parent counters.
3293 *
3294 * Note: we may be running in child context, but the PID is not hashed
3295 * anymore so new counters will not be added.
3296 */
3297void perf_counter_exit_task(struct task_struct *child)
3298{
3299 struct perf_counter *child_counter, *tmp;
3300 struct perf_counter_context *child_ctx;
3301
3302 child_ctx = &child->perf_counter_ctx;
3303
3304 if (likely(!child_ctx->nr_counters))
3305 return;
3306
3307again:
3308 list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
3309 list_entry)
3310 __perf_counter_exit_task(child, child_counter, child_ctx);
3311
3312 /*
3313 * If the last counter was a group counter, it will have appended all
3314 * its siblings to the list, but we obtained 'tmp' before that which
3315 * will still point to the list head terminating the iteration.
3316 */
3317 if (!list_empty(&child_ctx->counter_list))
3318 goto again;
3319}
3320
3321/*
3322 * Initialize the perf_counter context in task_struct
3323 */
3324void perf_counter_init_task(struct task_struct *child)
3325{
3326 struct perf_counter_context *child_ctx, *parent_ctx;
3327 struct perf_counter *counter;
3328 struct task_struct *parent = current;
3329
3330 child_ctx = &child->perf_counter_ctx;
3331 parent_ctx = &parent->perf_counter_ctx;
3332
3333 __perf_counter_init_context(child_ctx, child);
3334
3335 /*
3336 * This is executed from the parent task context, so inherit
3337 * counters that have been marked for cloning:
3338 */
3339
3340 if (likely(!parent_ctx->nr_counters))
3341 return;
3342
3343 /*
3344 * Lock the parent list. No need to lock the child - not PID
3345 * hashed yet and not running, so nobody can access it.
3346 */
3347 mutex_lock(&parent_ctx->mutex);
3348
3349 /*
3350 * We don't have to disable NMIs - we are only looking at
3351 * the list, not manipulating it:
3352 */
3353 list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) {
3354 if (!counter->hw_event.inherit)
3355 continue;
3356
3357 if (inherit_group(counter, parent,
3358 parent_ctx, child, child_ctx))
3359 break;
3360 }
3361
3362 mutex_unlock(&parent_ctx->mutex);
3363}
3364
3365static void __cpuinit perf_counter_init_cpu(int cpu)
3366{
3367 struct perf_cpu_context *cpuctx;
3368
3369 cpuctx = &per_cpu(perf_cpu_context, cpu);
3370 __perf_counter_init_context(&cpuctx->ctx, NULL);
3371
3372 spin_lock(&perf_resource_lock);
3373 cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
3374 spin_unlock(&perf_resource_lock);
3375
3376 hw_perf_counter_setup(cpu);
3377}
3378
3379#ifdef CONFIG_HOTPLUG_CPU
3380static void __perf_counter_exit_cpu(void *info)
3381{
3382 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
3383 struct perf_counter_context *ctx = &cpuctx->ctx;
3384 struct perf_counter *counter, *tmp;
3385
3386 list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
3387 __perf_counter_remove_from_context(counter);
3388}
3389static void perf_counter_exit_cpu(int cpu)
3390{
3391 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
3392 struct perf_counter_context *ctx = &cpuctx->ctx;
3393
3394 mutex_lock(&ctx->mutex);
3395 smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
3396 mutex_unlock(&ctx->mutex);
3397}
3398#else
3399static inline void perf_counter_exit_cpu(int cpu) { }
3400#endif
3401
3402static int __cpuinit
3403perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
3404{
3405 unsigned int cpu = (long)hcpu;
3406
3407 switch (action) {
3408
3409 case CPU_UP_PREPARE:
3410 case CPU_UP_PREPARE_FROZEN:
3411 perf_counter_init_cpu(cpu);
3412 break;
3413
3414 case CPU_DOWN_PREPARE:
3415 case CPU_DOWN_PREPARE_FROZEN:
3416 perf_counter_exit_cpu(cpu);
3417 break;
3418
3419 default:
3420 break;
3421 }
3422
3423 return NOTIFY_OK;
3424}
3425
3426static struct notifier_block __cpuinitdata perf_cpu_nb = {
3427 .notifier_call = perf_cpu_notify,
3428};
3429
3430void __init perf_counter_init(void)
3431{
3432 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
3433 (void *)(long)smp_processor_id());
3434 register_cpu_notifier(&perf_cpu_nb);
3435}
3436
3437static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
3438{
3439 return sprintf(buf, "%d\n", perf_reserved_percpu);
3440}
3441
3442static ssize_t
3443perf_set_reserve_percpu(struct sysdev_class *class,
3444 const char *buf,
3445 size_t count)
3446{
3447 struct perf_cpu_context *cpuctx;
3448 unsigned long val;
3449 int err, cpu, mpt;
3450
3451 err = strict_strtoul(buf, 10, &val);
3452 if (err)
3453 return err;
3454 if (val > perf_max_counters)
3455 return -EINVAL;
3456
3457 spin_lock(&perf_resource_lock);
3458 perf_reserved_percpu = val;
3459 for_each_online_cpu(cpu) {
3460 cpuctx = &per_cpu(perf_cpu_context, cpu);
3461 spin_lock_irq(&cpuctx->ctx.lock);
3462 mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
3463 perf_max_counters - perf_reserved_percpu);
3464 cpuctx->max_pertask = mpt;
3465 spin_unlock_irq(&cpuctx->ctx.lock);
3466 }
3467 spin_unlock(&perf_resource_lock);
3468
3469 return count;
3470}
3471
3472static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
3473{
3474 return sprintf(buf, "%d\n", perf_overcommit);
3475}
3476
3477static ssize_t
3478perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
3479{
3480 unsigned long val;
3481 int err;
3482
3483 err = strict_strtoul(buf, 10, &val);
3484 if (err)
3485 return err;
3486 if (val > 1)
3487 return -EINVAL;
3488
3489 spin_lock(&perf_resource_lock);
3490 perf_overcommit = val;
3491 spin_unlock(&perf_resource_lock);
3492
3493 return count;
3494}
3495
3496static SYSDEV_CLASS_ATTR(
3497 reserve_percpu,
3498 0644,
3499 perf_show_reserve_percpu,
3500 perf_set_reserve_percpu
3501 );
3502
3503static SYSDEV_CLASS_ATTR(
3504 overcommit,
3505 0644,
3506 perf_show_overcommit,
3507 perf_set_overcommit
3508 );
3509
3510static struct attribute *perfclass_attrs[] = {
3511 &attr_reserve_percpu.attr,
3512 &attr_overcommit.attr,
3513 NULL
3514};
3515
3516static struct attribute_group perfclass_attr_group = {
3517 .attrs = perfclass_attrs,
3518 .name = "perf_counters",
3519};
3520
3521static int __init perf_counter_sysfs_init(void)
3522{
3523 return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
3524 &perfclass_attr_group);
3525}
3526device_initcall(perf_counter_sysfs_init);
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 69d9cb921ffa..013882e83497 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -864,9 +864,9 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock,
864EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); 864EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
865 865
866/** 866/**
867 * rt_mutex_lock_interruptible_ktime - lock a rt_mutex interruptible 867 * rt_mutex_timed_lock - lock a rt_mutex interruptible
868 * the timeout structure is provided 868 * the timeout structure is provided
869 * by the caller 869 * by the caller
870 * 870 *
871 * @lock: the rt_mutex to be locked 871 * @lock: the rt_mutex to be locked
872 * @timeout: timeout structure or NULL (no timeout) 872 * @timeout: timeout structure or NULL (no timeout)
@@ -913,7 +913,7 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock)
913} 913}
914EXPORT_SYMBOL_GPL(rt_mutex_unlock); 914EXPORT_SYMBOL_GPL(rt_mutex_unlock);
915 915
916/*** 916/**
917 * rt_mutex_destroy - mark a mutex unusable 917 * rt_mutex_destroy - mark a mutex unusable
918 * @lock: the mutex to be destroyed 918 * @lock: the mutex to be destroyed
919 * 919 *
diff --git a/kernel/sched.c b/kernel/sched.c
index 26efa475bdc1..419a39d0988f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -39,6 +39,7 @@
39#include <linux/completion.h> 39#include <linux/completion.h>
40#include <linux/kernel_stat.h> 40#include <linux/kernel_stat.h>
41#include <linux/debug_locks.h> 41#include <linux/debug_locks.h>
42#include <linux/perf_counter.h>
42#include <linux/security.h> 43#include <linux/security.h>
43#include <linux/notifier.h> 44#include <linux/notifier.h>
44#include <linux/profile.h> 45#include <linux/profile.h>
@@ -584,6 +585,7 @@ struct rq {
584 struct load_weight load; 585 struct load_weight load;
585 unsigned long nr_load_updates; 586 unsigned long nr_load_updates;
586 u64 nr_switches; 587 u64 nr_switches;
588 u64 nr_migrations_in;
587 589
588 struct cfs_rq cfs; 590 struct cfs_rq cfs;
589 struct rt_rq rt; 591 struct rt_rq rt;
@@ -692,7 +694,7 @@ static inline int cpu_of(struct rq *rq)
692#define task_rq(p) cpu_rq(task_cpu(p)) 694#define task_rq(p) cpu_rq(task_cpu(p))
693#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 695#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
694 696
695static inline void update_rq_clock(struct rq *rq) 697inline void update_rq_clock(struct rq *rq)
696{ 698{
697 rq->clock = sched_clock_cpu(cpu_of(rq)); 699 rq->clock = sched_clock_cpu(cpu_of(rq));
698} 700}
@@ -1967,12 +1969,15 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1967 p->se.sleep_start -= clock_offset; 1969 p->se.sleep_start -= clock_offset;
1968 if (p->se.block_start) 1970 if (p->se.block_start)
1969 p->se.block_start -= clock_offset; 1971 p->se.block_start -= clock_offset;
1972#endif
1970 if (old_cpu != new_cpu) { 1973 if (old_cpu != new_cpu) {
1971 schedstat_inc(p, se.nr_migrations); 1974 p->se.nr_migrations++;
1975 new_rq->nr_migrations_in++;
1976#ifdef CONFIG_SCHEDSTATS
1972 if (task_hot(p, old_rq->clock, NULL)) 1977 if (task_hot(p, old_rq->clock, NULL))
1973 schedstat_inc(p, se.nr_forced2_migrations); 1978 schedstat_inc(p, se.nr_forced2_migrations);
1974 }
1975#endif 1979#endif
1980 }
1976 p->se.vruntime -= old_cfsrq->min_vruntime - 1981 p->se.vruntime -= old_cfsrq->min_vruntime -
1977 new_cfsrq->min_vruntime; 1982 new_cfsrq->min_vruntime;
1978 1983
@@ -2324,6 +2329,27 @@ static int sched_balance_self(int cpu, int flag)
2324 2329
2325#endif /* CONFIG_SMP */ 2330#endif /* CONFIG_SMP */
2326 2331
2332/**
2333 * task_oncpu_function_call - call a function on the cpu on which a task runs
2334 * @p: the task to evaluate
2335 * @func: the function to be called
2336 * @info: the function call argument
2337 *
2338 * Calls the function @func when the task is currently running. This might
2339 * be on the current CPU, which just calls the function directly
2340 */
2341void task_oncpu_function_call(struct task_struct *p,
2342 void (*func) (void *info), void *info)
2343{
2344 int cpu;
2345
2346 preempt_disable();
2347 cpu = task_cpu(p);
2348 if (task_curr(p))
2349 smp_call_function_single(cpu, func, info, 1);
2350 preempt_enable();
2351}
2352
2327/*** 2353/***
2328 * try_to_wake_up - wake up a thread 2354 * try_to_wake_up - wake up a thread
2329 * @p: the to-be-woken-up thread 2355 * @p: the to-be-woken-up thread
@@ -2480,6 +2506,7 @@ static void __sched_fork(struct task_struct *p)
2480 p->se.exec_start = 0; 2506 p->se.exec_start = 0;
2481 p->se.sum_exec_runtime = 0; 2507 p->se.sum_exec_runtime = 0;
2482 p->se.prev_sum_exec_runtime = 0; 2508 p->se.prev_sum_exec_runtime = 0;
2509 p->se.nr_migrations = 0;
2483 p->se.last_wakeup = 0; 2510 p->se.last_wakeup = 0;
2484 p->se.avg_overlap = 0; 2511 p->se.avg_overlap = 0;
2485 p->se.start_runtime = 0; 2512 p->se.start_runtime = 0;
@@ -2710,6 +2737,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2710 */ 2737 */
2711 prev_state = prev->state; 2738 prev_state = prev->state;
2712 finish_arch_switch(prev); 2739 finish_arch_switch(prev);
2740 perf_counter_task_sched_in(current, cpu_of(rq));
2713 finish_lock_switch(rq, prev); 2741 finish_lock_switch(rq, prev);
2714#ifdef CONFIG_SMP 2742#ifdef CONFIG_SMP
2715 if (post_schedule) 2743 if (post_schedule)
@@ -2872,6 +2900,15 @@ unsigned long nr_active(void)
 }
 
 /*
+ * Externally visible per-cpu scheduler statistics:
+ * cpu_nr_migrations(cpu) - number of migrations into that cpu
+ */
+u64 cpu_nr_migrations(int cpu)
+{
+	return cpu_rq(cpu)->nr_migrations_in;
+}
+
+/*
  * Update rq->cpu_load[] statistics. This function is usually called every
  * scheduler tick (TICK_NSEC).
  */
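nr_migrations_in only ever grows, so cpu_nr_migrations() returns a monotonically increasing per-cpu count; a consumer would normally sample it and work with deltas. A hedged sketch with illustrative names, not taken from this patch:

/* Illustrative only: report how many tasks migrated into @cpu since the
 * previous call, by differencing the monotonic counter. */
static DEFINE_PER_CPU(u64, prev_migrations);

static u64 migrations_since_last_sample(int cpu)
{
	u64 now = cpu_nr_migrations(cpu);
	u64 delta = now - per_cpu(prev_migrations, cpu);

	per_cpu(prev_migrations, cpu) = now;
	return delta;
}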
@@ -4838,6 +4875,7 @@ void scheduler_tick(void)
 	update_rq_clock(rq);
 	update_cpu_load(rq);
 	curr->sched_class->task_tick(rq, curr, 0);
+	perf_counter_task_tick(curr, cpu);
 	spin_unlock(&rq->lock);
 
 #ifdef CONFIG_SMP
@@ -5053,6 +5091,7 @@ need_resched_nonpreemptible:
 
 	if (likely(prev != next)) {
 		sched_info_switch(prev, next);
+		perf_counter_task_sched_out(prev, cpu);
 
 		rq->nr_switches++;
 		rq->curr = next;
@@ -8958,7 +8997,7 @@ void __init sched_init(void)
 	 * 1024) and two child groups A0 and A1 (of weight 1024 each),
 	 * then A0's share of the cpu resource is:
 	 *
-	 *	A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
+	 *	A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
 	 *
 	 * We achieve this by letting init_task_group's tasks sit
 	 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
@@ -9059,6 +9098,8 @@ void __init sched_init(void)
 	alloc_bootmem_cpumask_var(&cpu_isolated_map);
 #endif /* SMP */
 
+	perf_counter_init();
+
 	scheduler_running = 1;
 }
 
diff --git a/kernel/signal.c b/kernel/signal.c
index d8034737db4c..f79b3b9f8375 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2278,24 +2278,17 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
 	return kill_something_info(sig, &info, pid);
 }
 
-static int do_tkill(pid_t tgid, pid_t pid, int sig)
+static int
+do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
 {
-	int error;
-	struct siginfo info;
 	struct task_struct *p;
 	unsigned long flags;
-
-	error = -ESRCH;
-	info.si_signo = sig;
-	info.si_errno = 0;
-	info.si_code = SI_TKILL;
-	info.si_pid = task_tgid_vnr(current);
-	info.si_uid = current_uid();
+	int error = -ESRCH;
 
 	rcu_read_lock();
 	p = find_task_by_vpid(pid);
 	if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) {
-		error = check_kill_permission(sig, &info, p);
+		error = check_kill_permission(sig, info, p);
 		/*
 		 * The null signal is a permissions and process existence
 		 * probe. No signal is actually delivered.
@@ -2305,7 +2298,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig)
 		 * signal is private anyway.
 		 */
 		if (!error && sig && lock_task_sighand(p, &flags)) {
-			error = specific_send_sig_info(sig, &info, p);
+			error = specific_send_sig_info(sig, info, p);
 			unlock_task_sighand(p, &flags);
 		}
 	}
@@ -2314,6 +2307,19 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig)
 	return error;
 }
 
+static int do_tkill(pid_t tgid, pid_t pid, int sig)
+{
+	struct siginfo info;
+
+	info.si_signo = sig;
+	info.si_errno = 0;
+	info.si_code = SI_TKILL;
+	info.si_pid = task_tgid_vnr(current);
+	info.si_uid = current_uid();
+
+	return do_send_specific(tgid, pid, sig, &info);
+}
+
 /**
  * sys_tgkill - send signal to one specific thread
  * @tgid: the thread group ID of the thread
@@ -2363,6 +2369,32 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
 	return kill_proc_info(sig, &info, pid);
 }
 
+long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
+{
+	/* This is only valid for single tasks */
+	if (pid <= 0 || tgid <= 0)
+		return -EINVAL;
+
+	/* Not even root can pretend to send signals from the kernel.
+	   Nor can they impersonate a kill(), which adds source info. */
+	if (info->si_code >= 0)
+		return -EPERM;
+	info->si_signo = sig;
+
+	return do_send_specific(tgid, pid, sig, info);
+}
+
+SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig,
+		siginfo_t __user *, uinfo)
+{
+	siginfo_t info;
+
+	if (copy_from_user(&info, uinfo, sizeof(siginfo_t)))
+		return -EFAULT;
+
+	return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
+}
+
 int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
 {
 	struct task_struct *t = current;
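do_rt_tgsigqueueinfo() is the thread-directed counterpart of rt_sigqueueinfo(): the caller supplies the whole siginfo, but si_code must be negative so userspace cannot forge kernel-generated signals. A hedged userspace sketch that invokes the new syscall through syscall(2); it assumes the installed kernel headers define __NR_rt_tgsigqueueinfo for the architecture (glibc had no wrapper at this point), and the wrapper name is illustrative:

/* Illustrative userspace sketch -- assumes the installed kernel headers
 * define __NR_rt_tgsigqueueinfo for this architecture. */
#include <signal.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int my_rt_tgsigqueueinfo(pid_t tgid, pid_t tid, int sig, siginfo_t *info)
{
	return syscall(__NR_rt_tgsigqueueinfo, tgid, tid, sig, info);
}

int main(void)
{
	siginfo_t info;

	memset(&info, 0, sizeof(info));
	info.si_code = SI_QUEUE;	/* must be negative: SI_QUEUE == -1 */
	info.si_pid = getpid();
	info.si_uid = getuid();
	info.si_value.sival_int = 42;

	/* Queue SIGUSR1 to this process's main thread (tid == tgid here). */
	return my_rt_tgsigqueueinfo(getpid(), getpid(), SIGUSR1, &info);
}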
diff --git a/kernel/sys.c b/kernel/sys.c
index e7998cf31498..438d99a38c87 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -14,6 +14,7 @@
 #include <linux/prctl.h>
 #include <linux/highuid.h>
 #include <linux/fs.h>
+#include <linux/perf_counter.h>
 #include <linux/resource.h>
 #include <linux/kernel.h>
 #include <linux/kexec.h>
@@ -1793,6 +1794,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 	case PR_SET_TSC:
 		error = SET_TSC_CTL(arg2);
 		break;
+	case PR_TASK_PERF_COUNTERS_DISABLE:
+		error = perf_counter_task_disable();
+		break;
+	case PR_TASK_PERF_COUNTERS_ENABLE:
+		error = perf_counter_task_enable();
+		break;
 	case PR_GET_TIMERSLACK:
 		error = current->timer_slack_ns;
 		break;
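The two new prctl options disable and re-enable every counter attached to the calling task, so a program can exclude a region of its own execution from measurement. A minimal userspace sketch, assuming <sys/prctl.h> picks up the PR_TASK_PERF_COUNTERS_* constants from the kernel headers added by this series:

/* Illustrative only: stop counting across a region we do not want measured. */
#include <stdio.h>
#include <sys/prctl.h>

int main(void)
{
	if (prctl(PR_TASK_PERF_COUNTERS_DISABLE, 0, 0, 0, 0))
		perror("prctl(PR_TASK_PERF_COUNTERS_DISABLE)");

	/* ... work that should not be counted ... */

	if (prctl(PR_TASK_PERF_COUNTERS_ENABLE, 0, 0, 0, 0))
		perror("prctl(PR_TASK_PERF_COUNTERS_ENABLE)");

	return 0;
}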
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 27dad2967387..68320f6b07b5 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -175,3 +175,6 @@ cond_syscall(compat_sys_timerfd_settime);
 cond_syscall(compat_sys_timerfd_gettime);
 cond_syscall(sys_eventfd);
 cond_syscall(sys_eventfd2);
+
+/* performance counters: */
+cond_syscall(sys_perf_counter_open);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b2970d56fb76..3cb1849f5989 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -49,6 +49,7 @@
 #include <linux/reboot.h>
 #include <linux/ftrace.h>
 #include <linux/slow-work.h>
+#include <linux/perf_counter.h>
 
 #include <asm/uaccess.h>
 #include <asm/processor.h>
@@ -912,6 +913,24 @@ static struct ctl_table kern_table[] = {
 		.child		= slow_work_sysctls,
 	},
 #endif
+#ifdef CONFIG_PERF_COUNTERS
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "perf_counter_privileged",
+		.data		= &sysctl_perf_counter_priv,
+		.maxlen		= sizeof(sysctl_perf_counter_priv),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "perf_counter_mlock_kb",
+		.data		= &sysctl_perf_counter_mlock,
+		.maxlen		= sizeof(sysctl_perf_counter_mlock),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
 /*
  * NOTE: do not add new entries to this table unless you have read
  * Documentation/sysctl/ctl_unnumbered.txt
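Both entries use CTL_UNNUMBERED and proc_dointvec, so they surface only as integer files under /proc/sys/kernel/ and have no binary sysctl number. A small userspace sketch reading them back; the file names follow the procname fields above:

/* Illustrative only: read the two perf_counter sysctl knobs. */
#include <stdio.h>

static int read_knob(const char *path, int *val)
{
	FILE *f = fopen(path, "r");

	if (!f)
		return -1;
	if (fscanf(f, "%d", val) != 1)
		*val = -1;
	fclose(f);
	return 0;
}

int main(void)
{
	int priv = -1, mlock_kb = -1;

	read_knob("/proc/sys/kernel/perf_counter_privileged", &priv);
	read_knob("/proc/sys/kernel/perf_counter_mlock_kb", &mlock_kb);
	printf("perf_counter_privileged=%d perf_counter_mlock_kb=%d\n",
	       priv, mlock_kb);
	return 0;
}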
diff --git a/kernel/timer.c b/kernel/timer.c
index cffffad01c31..fed53be44fd9 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -37,6 +37,7 @@
 #include <linux/delay.h>
 #include <linux/tick.h>
 #include <linux/kallsyms.h>
+#include <linux/perf_counter.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -1170,6 +1171,8 @@ static void run_timer_softirq(struct softirq_action *h)
 {
 	struct tvec_base *base = __get_cpu_var(tvec_bases);
 
+	perf_counter_do_pending();
+
 	hrtimer_run_pending();
 
 	if (time_after_eq(jiffies, base->timer_jiffies))
diff --git a/mm/mmap.c b/mm/mmap.c
index 6b7b1a95944b..2c1c2cb0e2e1 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -28,6 +28,7 @@
 #include <linux/mempolicy.h>
 #include <linux/rmap.h>
 #include <linux/mmu_notifier.h>
+#include <linux/perf_counter.h>
 
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -1219,6 +1220,9 @@ munmap_back:
 	if (correct_wcount)
 		atomic_inc(&inode->i_writecount);
 out:
+	if (vm_flags & VM_EXEC)
+		perf_counter_mmap(addr, len, pgoff, file);
+
 	mm->total_vm += len >> PAGE_SHIFT;
 	vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
 	if (vm_flags & VM_LOCKED) {
@@ -1752,6 +1756,12 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
 	do {
 		long nrpages = vma_pages(vma);
 
+		if (vma->vm_flags & VM_EXEC) {
+			perf_counter_munmap(vma->vm_start,
+					nrpages << PAGE_SHIFT,
+					vma->vm_pgoff, vma->vm_file);
+		}
+
 		mm->total_vm -= nrpages;
 		vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
 		vma = remove_vma(vma);