-rw-r--r-- Documentation/perf_counter/.gitignore | 179
-rw-r--r-- Documentation/perf_counter/Documentation/perf-help.txt | 38
-rw-r--r-- Documentation/perf_counter/Documentation/perf-record.txt | 63
-rw-r--r-- Documentation/perf_counter/Documentation/perf-stat.txt | 76
-rw-r--r-- Documentation/perf_counter/Documentation/perf-top.txt | 61
-rw-r--r-- Documentation/perf_counter/Makefile | 844
-rw-r--r-- Documentation/perf_counter/PERF-BUILD-OPTIONS | 4
-rw-r--r-- Documentation/perf_counter/PERF-CFLAGS | 1
-rw-r--r-- Documentation/perf_counter/PERF-VERSION-FILE | 1
-rw-r--r-- Documentation/perf_counter/builtin-help.c | 461
-rw-r--r-- Documentation/perf_counter/builtin-record.c | 506
-rw-r--r-- Documentation/perf_counter/builtin-stat.c | 591
-rw-r--r-- Documentation/perf_counter/builtin-top.c | 1203
-rw-r--r-- Documentation/perf_counter/builtin.h | 22
-rw-r--r-- Documentation/perf_counter/command-list.txt | 6
-rw-r--r-- Documentation/perf_counter/design.txt | 283
-rw-r--r-- Documentation/perf_counter/perf-report.cc | 479
-rw-r--r-- Documentation/perf_counter/perf.c | 414
-rwxr-xr-x Documentation/perf_counter/util/PERF-VERSION-GEN | 42
-rw-r--r-- Documentation/perf_counter/util/abspath.c | 117
-rw-r--r-- Documentation/perf_counter/util/alias.c | 77
-rw-r--r-- Documentation/perf_counter/util/cache.h | 117
-rw-r--r-- Documentation/perf_counter/util/config.c | 873
-rw-r--r-- Documentation/perf_counter/util/ctype.c | 26
-rw-r--r-- Documentation/perf_counter/util/exec_cmd.c | 165
-rw-r--r-- Documentation/perf_counter/util/exec_cmd.h | 13
-rwxr-xr-x Documentation/perf_counter/util/generate-cmdlist.sh | 24
-rw-r--r-- Documentation/perf_counter/util/help.c | 366
-rw-r--r-- Documentation/perf_counter/util/help.h | 29
-rw-r--r-- Documentation/perf_counter/util/levenshtein.c | 84
-rw-r--r-- Documentation/perf_counter/util/levenshtein.h | 8
-rw-r--r-- Documentation/perf_counter/util/parse-options.c | 492
-rw-r--r-- Documentation/perf_counter/util/parse-options.h | 172
-rw-r--r-- Documentation/perf_counter/util/path.c | 353
-rw-r--r-- Documentation/perf_counter/util/quote.c | 478
-rw-r--r-- Documentation/perf_counter/util/quote.h | 68
-rw-r--r-- Documentation/perf_counter/util/run-command.c | 395
-rw-r--r-- Documentation/perf_counter/util/run-command.h | 93
-rw-r--r-- Documentation/perf_counter/util/strbuf.c | 359
-rw-r--r-- Documentation/perf_counter/util/strbuf.h | 137
-rw-r--r-- Documentation/perf_counter/util/usage.c | 80
-rw-r--r-- Documentation/perf_counter/util/util.h | 408
-rw-r--r-- Documentation/perf_counter/util/wrapper.c | 206
-rw-r--r-- MAINTAINERS | 10
-rw-r--r-- arch/powerpc/include/asm/hw_irq.h | 39
-rw-r--r-- arch/powerpc/include/asm/paca.h | 1
-rw-r--r-- arch/powerpc/include/asm/perf_counter.h | 72
-rw-r--r-- arch/powerpc/include/asm/systbl.h | 2
-rw-r--r-- arch/powerpc/include/asm/unistd.h | 1
-rw-r--r-- arch/powerpc/kernel/Makefile | 2
-rw-r--r-- arch/powerpc/kernel/asm-offsets.c | 1
-rw-r--r-- arch/powerpc/kernel/entry_64.S | 9
-rw-r--r-- arch/powerpc/kernel/irq.c | 5
-rw-r--r-- arch/powerpc/kernel/perf_counter.c | 866
-rw-r--r-- arch/powerpc/kernel/power4-pmu.c | 557
-rw-r--r-- arch/powerpc/kernel/power5+-pmu.c | 551
-rw-r--r-- arch/powerpc/kernel/power5-pmu.c | 569
-rw-r--r-- arch/powerpc/kernel/power6-pmu.c | 407
-rw-r--r-- arch/powerpc/kernel/ppc970-pmu.c | 441
-rw-r--r-- arch/powerpc/mm/fault.c | 10
-rw-r--r-- arch/powerpc/platforms/Kconfig.cputype | 1
-rw-r--r-- arch/x86/Kconfig | 1
-rw-r--r-- arch/x86/ia32/ia32entry.S | 3
-rw-r--r-- arch/x86/include/asm/atomic_32.h | 236
-rw-r--r-- arch/x86/include/asm/entry_arch.h | 1
-rw-r--r-- arch/x86/include/asm/hardirq.h | 2
-rw-r--r-- arch/x86/include/asm/hw_irq.h | 3
-rw-r--r-- arch/x86/include/asm/intel_arch_perfmon.h | 31
-rw-r--r-- arch/x86/include/asm/irq_vectors.h | 5
-rw-r--r-- arch/x86/include/asm/perf_counter.h | 100
-rw-r--r-- arch/x86/include/asm/unistd_32.h | 1
-rw-r--r-- arch/x86/include/asm/unistd_64.h | 3
-rw-r--r-- arch/x86/kernel/apic/apic.c | 4
-rw-r--r-- arch/x86/kernel/cpu/Makefile | 12
-rw-r--r-- arch/x86/kernel/cpu/amd.c | 4
-rw-r--r-- arch/x86/kernel/cpu/common.c | 2
-rw-r--r-- arch/x86/kernel/cpu/perf_counter.c | 1213
-rw-r--r-- arch/x86/kernel/cpu/perfctr-watchdog.c | 4
-rw-r--r-- arch/x86/kernel/entry_64.S | 7
-rw-r--r-- arch/x86/kernel/irq.c | 10
-rw-r--r-- arch/x86/kernel/irqinit_32.c | 60
-rw-r--r-- arch/x86/kernel/irqinit_64.c | 13
-rw-r--r-- arch/x86/kernel/signal.c | 1
-rw-r--r-- arch/x86/kernel/syscall_table_32.S | 1
-rw-r--r-- arch/x86/kernel/traps.c | 15
-rw-r--r-- arch/x86/mm/fault.c | 12
-rw-r--r-- arch/x86/oprofile/nmi_int.c | 7
-rw-r--r-- arch/x86/oprofile/op_model_ppro.c | 10
-rw-r--r-- drivers/acpi/processor_idle.c | 4
-rw-r--r-- drivers/char/sysrq.c | 2
-rw-r--r-- fs/exec.c | 9
-rw-r--r-- include/linux/init_task.h | 13
-rw-r--r-- include/linux/kernel_stat.h | 5
-rw-r--r-- include/linux/mutex.h | 23
-rw-r--r-- include/linux/perf_counter.h | 606
-rw-r--r-- include/linux/prctl.h | 3
-rw-r--r-- include/linux/sched.h | 13
-rw-r--r-- include/linux/syscalls.h | 5
-rw-r--r-- init/Kconfig | 35
-rw-r--r-- kernel/Makefile | 1
-rw-r--r-- kernel/exit.c | 13
-rw-r--r-- kernel/fork.c | 1
-rw-r--r-- kernel/mutex.c | 2
-rw-r--r-- kernel/perf_counter.c | 3302
-rw-r--r-- kernel/sched.c | 44
-rw-r--r-- kernel/sys.c | 7
-rw-r--r-- kernel/sys_ni.c | 3
-rw-r--r-- kernel/sysctl.c | 11
-rw-r--r-- kernel/timer.c | 3
-rw-r--r-- mm/mmap.c | 10
110 files changed, 19741 insertions, 93 deletions
diff --git a/Documentation/perf_counter/.gitignore b/Documentation/perf_counter/.gitignore
new file mode 100644
index 000000000000..41c0b20a76ce
--- /dev/null
+++ b/Documentation/perf_counter/.gitignore
@@ -0,0 +1,179 @@
1GIT-BUILD-OPTIONS
2GIT-CFLAGS
3GIT-GUI-VARS
4GIT-VERSION-FILE
5git
6git-add
7git-add--interactive
8git-am
9git-annotate
10git-apply
11git-archimport
12git-archive
13git-bisect
14git-bisect--helper
15git-blame
16git-branch
17git-bundle
18git-cat-file
19git-check-attr
20git-check-ref-format
21git-checkout
22git-checkout-index
23git-cherry
24git-cherry-pick
25git-clean
26git-clone
27git-commit
28git-commit-tree
29git-config
30git-count-objects
31git-cvsexportcommit
32git-cvsimport
33git-cvsserver
34git-daemon
35git-diff
36git-diff-files
37git-diff-index
38git-diff-tree
39git-difftool
40git-difftool--helper
41git-describe
42git-fast-export
43git-fast-import
44git-fetch
45git-fetch--tool
46git-fetch-pack
47git-filter-branch
48git-fmt-merge-msg
49git-for-each-ref
50git-format-patch
51git-fsck
52git-fsck-objects
53git-gc
54git-get-tar-commit-id
55git-grep
56git-hash-object
57git-help
58git-http-fetch
59git-http-push
60git-imap-send
61git-index-pack
62git-init
63git-init-db
64git-instaweb
65git-log
66git-lost-found
67git-ls-files
68git-ls-remote
69git-ls-tree
70git-mailinfo
71git-mailsplit
72git-merge
73git-merge-base
74git-merge-index
75git-merge-file
76git-merge-tree
77git-merge-octopus
78git-merge-one-file
79git-merge-ours
80git-merge-recursive
81git-merge-resolve
82git-merge-subtree
83git-mergetool
84git-mergetool--lib
85git-mktag
86git-mktree
87git-name-rev
88git-mv
89git-pack-redundant
90git-pack-objects
91git-pack-refs
92git-parse-remote
93git-patch-id
94git-peek-remote
95git-prune
96git-prune-packed
97git-pull
98git-push
99git-quiltimport
100git-read-tree
101git-rebase
102git-rebase--interactive
103git-receive-pack
104git-reflog
105git-relink
106git-remote
107git-repack
108git-repo-config
109git-request-pull
110git-rerere
111git-reset
112git-rev-list
113git-rev-parse
114git-revert
115git-rm
116git-send-email
117git-send-pack
118git-sh-setup
119git-shell
120git-shortlog
121git-show
122git-show-branch
123git-show-index
124git-show-ref
125git-stage
126git-stash
127git-status
128git-stripspace
129git-submodule
130git-svn
131git-symbolic-ref
132git-tag
133git-tar-tree
134git-unpack-file
135git-unpack-objects
136git-update-index
137git-update-ref
138git-update-server-info
139git-upload-archive
140git-upload-pack
141git-var
142git-verify-pack
143git-verify-tag
144git-web--browse
145git-whatchanged
146git-write-tree
147git-core-*/?*
148gitk-wish
149gitweb/gitweb.cgi
150test-chmtime
151test-ctype
152test-date
153test-delta
154test-dump-cache-tree
155test-genrandom
156test-match-trees
157test-parse-options
158test-path-utils
159test-sha1
160test-sigchain
161common-cmds.h
162*.tar.gz
163*.dsc
164*.deb
165git.spec
166*.exe
167*.[aos]
168*.py[co]
169config.mak
170autom4te.cache
171config.cache
172config.log
173config.status
174config.mak.autogen
175config.mak.append
176configure
177tags
178TAGS
179cscope*
diff --git a/Documentation/perf_counter/Documentation/perf-help.txt b/Documentation/perf_counter/Documentation/perf-help.txt
new file mode 100644
index 000000000000..f85fed5a7edb
--- /dev/null
+++ b/Documentation/perf_counter/Documentation/perf-help.txt
@@ -0,0 +1,38 @@
1perf-help(1)
2===========
3
4NAME
5----
6perf-help - display help information about perf
7
8SYNOPSIS
9--------
10'perf help' [-a|--all] [COMMAND]
11
12DESCRIPTION
13-----------
14
15With no options and no COMMAND given, the synopsis of the 'perf'
16command and a list of the most commonly used perf commands are printed
17on the standard output.
18
19If the option '--all' or '-a' is given, then all available commands are
20printed on the standard output.
21
22If a perf command is named, a manual page for that command is brought
23up. The 'man' program is used by default for this purpose, but this
24can be overridden by other options or configuration variables.
25
26Note that `perf --help ...` is identical to `perf help ...` because the
27former is internally converted into the latter.
28
29OPTIONS
30-------
31-a::
32--all::
33 Prints all the available commands on the standard output. This
34 option supersedes any other option.
35
36PERF
37----
38Part of the linkperf:perf[1] suite
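The conversion of `perf --help ...` into `perf help ...` mentioned above happens before command dispatch, presumably in the top-level perf.c wrapper (not shown in this hunk). A minimal, purely illustrative sketch of such an argv rewrite, not necessarily how perf.c actually implements it:

    /* hypothetical: rewrite "perf --help stat" into "perf help stat"
     * so that the normal built-in dispatch ends up in cmd_help()    */
    if (argc > 1 && !strcmp(argv[1], "--help"))
            argv[1] = "help";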
diff --git a/Documentation/perf_counter/Documentation/perf-record.txt b/Documentation/perf_counter/Documentation/perf-record.txt
new file mode 100644
index 000000000000..d07700e35eb2
--- /dev/null
+++ b/Documentation/perf_counter/Documentation/perf-record.txt
@@ -0,0 +1,63 @@
1perf-record(1)
2==============
3
4NAME
5----
6perf-record - Run a command and record its profile into output.perf
7
8SYNOPSIS
9--------
10[verse]
11'perf record' [-e <EVENT> | --event=EVENT] [-l] [-a] <command>
12
13DESCRIPTION
14-----------
15This command runs a command and gathers a performance counter profile
16from it into output.perf, without displaying anything.
17
18This file can then be inspected later on, using 'perf report'.
19
20
21OPTIONS
22-------
23<command>...::
24 Any command you can specify in a shell.
25
26-e::
27--event=::
28 0:0: cpu-cycles
29 0:0: cycles
30 0:1: instructions
31 0:2: cache-references
32 0:3: cache-misses
33 0:4: branch-instructions
34 0:4: branches
35 0:5: branch-misses
36 0:6: bus-cycles
37 1:0: cpu-clock
38 1:1: task-clock
39 1:2: page-faults
40 1:2: faults
41 1:5: minor-faults
42 1:6: major-faults
43 1:3: context-switches
44 1:3: cs
45 1:4: cpu-migrations
46 1:4: migrations
47 rNNN: raw PMU events (eventsel+umask)
48
49-a::
50 system-wide collection
51
52-l::
53 scale counter values
54
55Configuration
56-------------
57
58EXAMPLES
59--------
60
61SEE ALSO
62--------
63linkperf:perf-stat[1]
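The <type>:<id> pairs and rNNN raw events listed under -e/--event above map onto a single 64-bit counter configuration word: a counter type (0 for hardware events, 1 for software events, as in the table) combined with an event id, or a raw eventsel+umask value. A minimal sketch of decoding such a spec, assuming the PERF_COUNTER_TYPE_SHIFT and PERF_COUNTER_RAW_MASK definitions from <linux/perf_counter.h>; the helper name is hypothetical and the real parsing lives in the builtin-* tools:

    static __u64 parse_event_spec(const char *str)
    {
    	__u64 config, id;
    	int type;

    	if (sscanf(str, "r%llx", &config) == 1)		/* rNNN: raw eventsel+umask */
    		return config | PERF_COUNTER_RAW_MASK;
    	if (sscanf(str, "%d:%llu", &type, &id) == 2)	/* e.g. "1:4" == cpu-migrations */
    		return ((__u64)type << PERF_COUNTER_TYPE_SHIFT) | id;
    	return ~0ULL;					/* unrecognized spec */
    }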
diff --git a/Documentation/perf_counter/Documentation/perf-stat.txt b/Documentation/perf_counter/Documentation/perf-stat.txt
new file mode 100644
index 000000000000..7fcab271e570
--- /dev/null
+++ b/Documentation/perf_counter/Documentation/perf-stat.txt
@@ -0,0 +1,76 @@
1perf-stat(1)
2==========
3
4NAME
5----
6perf-stat - Run a command and gather performance counter statistics
7
8SYNOPSIS
9--------
10[verse]
11'perf stat' [-e <EVENT> | --event=EVENT] [-l] [-a] <command>
12
13DESCRIPTION
14-----------
15This command runs a command and gathers performance counter statistics
16from it.
17
18
19OPTIONS
20-------
21<command>...::
22 Any command you can specify in a shell.
23
24-e::
25--event=::
26 0:0: cpu-cycles
27 0:0: cycles
28 0:1: instructions
29 0:2: cache-references
30 0:3: cache-misses
31 0:4: branch-instructions
32 0:4: branches
33 0:5: branch-misses
34 0:6: bus-cycles
35 1:0: cpu-clock
36 1:1: task-clock
37 1:2: page-faults
38 1:2: faults
39 1:5: minor-faults
40 1:6: major-faults
41 1:3: context-switches
42 1:3: cs
43 1:4: cpu-migrations
44 1:4: migrations
45 rNNN: raw PMU events (eventsel+umask)
46
47-a::
48 system-wide collection
49
50-l::
51 scale counter values
52
53Configuration
54-------------
55
56EXAMPLES
57--------
58
59$ perf stat sleep 1
60
61 Performance counter stats for 'sleep':
62
63 0.678356 task clock ticks (msecs)
64 7 context switches (events)
65 4 CPU migrations (events)
66 232 pagefaults (events)
67 1810403 CPU cycles (events)
68 946759 instructions (events)
69 18952 cache references (events)
70 4885 cache misses (events)
71
72 Wall-clock time elapsed: 1001.252894 msecs
73
74SEE ALSO
75--------
76linkperf:perf-top[1]
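Each line of the example output above is one counting (non-sampling) counter that perf stat opens for the measured task and reads once when the command finishes. A minimal sketch of that pattern, assuming the sys_perf_counter_open() wrapper and struct perf_counter_hw_event from <linux/perf_counter.h>, with the event selection and error handling elided:

    struct perf_counter_hw_event hw_event;
    __u64 count;
    int fd;

    memset(&hw_event, 0, sizeof(hw_event));
    /* ... fill in the event configuration, e.g. the 0:0 (cpu-cycles) encoding above ... */

    fd = sys_perf_counter_open(&hw_event, pid, -1 /* any cpu */, -1 /* no group */, 0);
    /* ... fork/exec the measured command and wait for it ... */
    read(fd, &count, sizeof(count));	/* final counter value */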
diff --git a/Documentation/perf_counter/Documentation/perf-top.txt b/Documentation/perf_counter/Documentation/perf-top.txt
new file mode 100644
index 000000000000..057333b72534
--- /dev/null
+++ b/Documentation/perf_counter/Documentation/perf-top.txt
@@ -0,0 +1,61 @@
1perf-top(1)
2==========
3
4NAME
5----
6perf-top - Run a command and profile it
7
8SYNOPSIS
9--------
10[verse]
11'perf top' [-e <EVENT> | --event=EVENT] [-l] [-a] <command>
12
13DESCRIPTION
14-----------
15This command runs a command and gathers a performance counter profile
16from it.
17
18
19OPTIONS
20-------
21<command>...::
22 Any command you can specify in a shell.
23
24-e::
25--event=::
26 0:0: cpu-cycles
27 0:0: cycles
28 0:1: instructions
29 0:2: cache-references
30 0:3: cache-misses
31 0:4: branch-instructions
32 0:4: branches
33 0:5: branch-misses
34 0:6: bus-cycles
35 1:0: cpu-clock
36 1:1: task-clock
37 1:2: page-faults
38 1:2: faults
39 1:5: minor-faults
40 1:6: major-faults
41 1:3: context-switches
42 1:3: cs
43 1:4: cpu-migrations
44 1:4: migrations
45 rNNN: raw PMU events (eventsel+umask)
46
47-a::
48 system-wide collection
49
50-l::
51 scale counter values
52
53Configuration
54-------------
55
56EXAMPLES
57--------
58
59SEE ALSO
60--------
61linkperf:perf-stat[1]
diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
new file mode 100644
index 000000000000..543ccf28ac4a
--- /dev/null
+++ b/Documentation/perf_counter/Makefile
@@ -0,0 +1,844 @@
1# The default target of this Makefile is...
2all::
3
4# Define V=1 to have a more verbose compile.
5#
6# Define SNPRINTF_RETURNS_BOGUS if you are on a system where snprintf()
7# or vsnprintf() return -1 instead of the number of characters that would
8# have been written to the final string if enough space had been available.
9#
10# Define FREAD_READS_DIRECTORIES if you are on a system where attempting
11# to read from an fopen'ed directory succeeds.
12#
13# Define NO_OPENSSL environment variable if you do not have OpenSSL.
14# This also implies MOZILLA_SHA1.
15#
16# Define CURLDIR=/foo/bar if your curl header and library files are in
17# /foo/bar/include and /foo/bar/lib directories.
18#
19# Define EXPATDIR=/foo/bar if your expat header and library files are in
20# /foo/bar/include and /foo/bar/lib directories.
21#
22# Define NO_D_INO_IN_DIRENT if you don't have d_ino in your struct dirent.
23#
24# Define NO_D_TYPE_IN_DIRENT if your platform defines DT_UNKNOWN but lacks
25# d_type in struct dirent (latest Cygwin -- will be fixed soonish).
26#
27# Define NO_C99_FORMAT if your formatted IO functions (printf/scanf et al.)
28# do not support the 'size specifiers' introduced by C99, namely ll, hh,
29# j, z, t. (representing long long int, char, intmax_t, size_t, ptrdiff_t).
30# Some C compilers supported these specifiers prior to C99 as an extension.
31#
32# Define NO_STRCASESTR if you don't have strcasestr.
33#
34# Define NO_MEMMEM if you don't have memmem.
35#
36# Define NO_STRTOUMAX if you don't have strtoumax in the C library.
37# If your compiler also does not support long long or does not have
38# strtoull, define NO_STRTOULL.
39#
40# Define NO_SETENV if you don't have setenv in the C library.
41#
42# Define NO_UNSETENV if you don't have unsetenv in the C library.
43#
44# Define NO_MKDTEMP if you don't have mkdtemp in the C library.
45#
46# Define NO_SYS_SELECT_H if you don't have sys/select.h.
47#
48# Define NO_SYMLINK_HEAD if you never want .perf/HEAD to be a symbolic link.
49# Enable it on Windows. By default, symrefs are still used.
50#
51# Define NO_SVN_TESTS if you want to skip time-consuming SVN interoperability
52# tests. These tests take up a significant amount of the total test time
53# but are not needed unless you plan to talk to SVN repos.
54#
55# Define NO_FINK if you are building on Darwin/Mac OS X, have Fink
56# installed in /sw, but don't want PERF to link against any libraries
57# installed there. If defined you may specify your own (or Fink's)
58# include directories and library directories by defining CFLAGS
59# and LDFLAGS appropriately.
60#
61# Define NO_DARWIN_PORTS if you are building on Darwin/Mac OS X,
62# have DarwinPorts installed in /opt/local, but don't want PERF to
63# link against any libraries installed there. If defined you may
64# specify your own (or DarwinPort's) include directories and
65# library directories by defining CFLAGS and LDFLAGS appropriately.
66#
67# Define PPC_SHA1 environment variable when running make to make use of
68# a bundled SHA1 routine optimized for PowerPC.
69#
70# Define ARM_SHA1 environment variable when running make to make use of
71# a bundled SHA1 routine optimized for ARM.
72#
73# Define MOZILLA_SHA1 environment variable when running make to make use of
74# a bundled SHA1 routine coming from Mozilla. It is GPL'd and should be fast
75# on non-x86 architectures (e.g. PowerPC), while the OpenSSL version (default
76# choice) has a very fast version optimized for i586.
77#
78# Define NEEDS_SSL_WITH_CRYPTO if you need -lcrypto with -lssl (Darwin).
79#
80# Define NEEDS_LIBICONV if linking with libc is not enough (Darwin).
81#
82# Define NEEDS_SOCKET if linking with libc is not enough (SunOS,
83# Patrick Mauritz).
84#
85# Define NO_MMAP if you want to avoid mmap.
86#
87# Define NO_PTHREADS if you do not have or do not want to use Pthreads.
88#
89# Define NO_PREAD if you have a problem with pread() system call (e.g.
90# cygwin.dll before v1.5.22).
91#
92# Define NO_FAST_WORKING_DIRECTORY if accessing objects in pack files is
93# generally faster on your platform than accessing the working directory.
94#
95# Define NO_TRUSTABLE_FILEMODE if your filesystem may claim to support
96# the executable mode bit, but doesn't really do so.
97#
98# Define NO_IPV6 if you lack IPv6 support and getaddrinfo().
99#
100# Define NO_SOCKADDR_STORAGE if your platform does not have struct
101# sockaddr_storage.
102#
103# Define NO_ICONV if your libc does not properly support iconv.
104#
105# Define OLD_ICONV if your library has an old iconv(), where the second
106# (input buffer pointer) parameter is declared with type (const char **).
107#
108# Define NO_DEFLATE_BOUND if your zlib does not have deflateBound.
109#
110# Define NO_R_TO_GCC_LINKER if your gcc does not like "-R/path/lib"
111# that tells runtime paths to dynamic libraries;
112# "-Wl,-rpath=/path/lib" is used instead.
113#
114# Define USE_NSEC below if you want perf to care about sub-second file mtimes
115# and ctimes. Note that you need recent glibc (at least 2.2.4) for this, and
116# it will BREAK YOUR LOCAL DIFFS! show-diff and anything using it will likely
117# randomly break unless your underlying filesystem supports those sub-second
118# times (my ext3 doesn't).
119#
120# Define USE_ST_TIMESPEC if your "struct stat" uses "st_ctimespec" instead of
121# "st_ctim"
122#
123# Define NO_NSEC if your "struct stat" does not have "st_ctim.tv_nsec"
124# available. This automatically turns USE_NSEC off.
125#
126# Define USE_STDEV below if you want perf to care about the underlying device
127# change being considered an inode change from the update-index perspective.
128#
129# Define NO_ST_BLOCKS_IN_STRUCT_STAT if your platform does not have the st_blocks
130# field that counts the on-disk footprint in 512-byte blocks.
131#
132# Define ASCIIDOC8 if you want to format documentation with AsciiDoc 8
133#
134# Define DOCBOOK_XSL_172 if you want to format man pages with DocBook XSL v1.72.
135#
136# Define NO_PERL_MAKEMAKER if you cannot use Makefiles generated by perl's
137# MakeMaker (e.g. using ActiveState under Cygwin).
138#
139# Define NO_PERL if you do not want Perl scripts or libraries at all.
140#
141# Define INTERNAL_QSORT to use Git's implementation of qsort(), which
142# is a simplified version of the merge sort used in glibc. This is
143# recommended if Git triggers O(n^2) behavior in your platform's qsort().
144#
145# Define NO_EXTERNAL_GREP if you don't want "perf grep" to ever call
146# your external grep (e.g., if your system lacks grep, if its grep is
147# broken, or spawning an external process is slower than the built-in grep perf has).
148
149PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE
150 @$(SHELL_PATH) util/PERF-VERSION-GEN
151-include PERF-VERSION-FILE
152
153uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not')
154uname_M := $(shell sh -c 'uname -m 2>/dev/null || echo not')
155uname_O := $(shell sh -c 'uname -o 2>/dev/null || echo not')
156uname_R := $(shell sh -c 'uname -r 2>/dev/null || echo not')
157uname_P := $(shell sh -c 'uname -p 2>/dev/null || echo not')
158uname_V := $(shell sh -c 'uname -v 2>/dev/null || echo not')
159
160# CFLAGS and LDFLAGS are for the users to override from the command line.
161
162CFLAGS = -g -O2 -Wall
163LDFLAGS = -lpthread -lrt
164ALL_CFLAGS = $(CFLAGS)
165ALL_LDFLAGS = $(LDFLAGS)
166STRIP ?= strip
167
168# Among the variables below, these:
169# perfexecdir
170# template_dir
171# mandir
172# infodir
173# htmldir
174# ETC_PERFCONFIG (but not sysconfdir)
175# can be specified as a relative path some/where/else;
176# this is interpreted as relative to $(prefix) and "perf" at
177# runtime figures out where they are based on the path to the executable.
178# This can help installing the suite in a relocatable way.
179
180prefix = $(HOME)
181bindir_relative = bin
182bindir = $(prefix)/$(bindir_relative)
183mandir = share/man
184infodir = share/info
185perfexecdir = libexec/perf-core
186sharedir = $(prefix)/share
187template_dir = share/perf-core/templates
188htmldir = share/doc/perf-doc
189ifeq ($(prefix),/usr)
190sysconfdir = /etc
191ETC_PERFCONFIG = $(sysconfdir)/perfconfig
192else
193sysconfdir = $(prefix)/etc
194ETC_PERFCONFIG = etc/perfconfig
195endif
196lib = lib
197# DESTDIR=
198
199export prefix bindir sharedir sysconfdir
200
201CC = gcc
202AR = ar
203RM = rm -f
204TAR = tar
205FIND = find
206INSTALL = install
207RPMBUILD = rpmbuild
208PTHREAD_LIBS = -lpthread
209
210# sparse is architecture-neutral, which means that we need to tell it
211# explicitly what architecture to check for. Fix this up for yours..
212SPARSE_FLAGS = -D__BIG_ENDIAN__ -D__powerpc__
213
214
215
216### --- END CONFIGURATION SECTION ---
217
218# Those must not be GNU-specific; they are shared with perl/ which may
219# be built by a different compiler. (Note that this is an artifact now
220# but it still might be nice to keep that distinction.)
221BASIC_CFLAGS =
222BASIC_LDFLAGS =
223
224# Guard against environment variables
225BUILTIN_OBJS =
226BUILT_INS =
227COMPAT_CFLAGS =
228COMPAT_OBJS =
229LIB_H =
230LIB_OBJS =
231PROGRAMS =
232SCRIPT_PERL =
233SCRIPT_SH =
234TEST_PROGRAMS =
235
236#
237# No scripts right now:
238#
239
240# SCRIPT_SH += perf-am.sh
241
242#
243# No Perl scripts right now:
244#
245
246# SCRIPT_PERL += perf-add--interactive.perl
247
248SCRIPTS = $(patsubst %.sh,%,$(SCRIPT_SH)) \
249 $(patsubst %.perl,%,$(SCRIPT_PERL))
250
251# Empty...
252EXTRA_PROGRAMS =
253
254# ... and all the rest that could be moved out of bindir to perfexecdir
255PROGRAMS += $(EXTRA_PROGRAMS)
256
257#
258# Single 'perf' binary right now:
259#
260PROGRAMS += perf
261
262# List built-in command $C whose implementation cmd_$C() is not in
263# builtin-$C.o but is linked in as part of some other command.
264BUILT_INS += $(patsubst builtin-%.o,perf-%$X,$(BUILTIN_OBJS))
265
266#
267# None right now:
268#
269# BUILT_INS += perf-init $X
270
271# what 'all' will build and 'install' will install, in perfexecdir
272ALL_PROGRAMS = $(PROGRAMS) $(SCRIPTS)
273
274# what 'all' will build but not install in perfexecdir
275OTHER_PROGRAMS = perf$X
276
277# Set paths to tools early so that they can be used for version tests.
278ifndef SHELL_PATH
279 SHELL_PATH = /bin/sh
280endif
281ifndef PERL_PATH
282 PERL_PATH = /usr/bin/perl
283endif
284
285export PERL_PATH
286
287LIB_FILE=libperf.a
288
289LIB_H += ../../include/linux/perf_counter.h
290LIB_H += util/levenshtein.h
291LIB_H += util/parse-options.h
292LIB_H += util/quote.h
293LIB_H += util/util.h
294LIB_H += util/help.h
295LIB_H += util/strbuf.h
296LIB_H += util/run-command.h
297
298LIB_OBJS += util/abspath.o
299LIB_OBJS += util/alias.o
300LIB_OBJS += util/config.o
301LIB_OBJS += util/ctype.o
302LIB_OBJS += util/exec_cmd.o
303LIB_OBJS += util/help.o
304LIB_OBJS += util/levenshtein.o
305LIB_OBJS += util/parse-options.o
306LIB_OBJS += util/path.o
307LIB_OBJS += util/run-command.o
308LIB_OBJS += util/quote.o
309LIB_OBJS += util/strbuf.o
310LIB_OBJS += util/usage.o
311LIB_OBJS += util/wrapper.o
312
313BUILTIN_OBJS += builtin-help.o
314BUILTIN_OBJS += builtin-record.o
315BUILTIN_OBJS += builtin-stat.o
316BUILTIN_OBJS += builtin-top.o
317
318PERFLIBS = $(LIB_FILE)
319EXTLIBS =
320
321#
322# Platform specific tweaks
323#
324
325# We choose to avoid "if .. else if .. else .. endif endif"
326# because maintaining the nesting to match is a pain. If
327# we had "elif" things would have been much nicer...
328
329-include config.mak.autogen
330-include config.mak
331
332ifeq ($(uname_S),Darwin)
333 ifndef NO_FINK
334 ifeq ($(shell test -d /sw/lib && echo y),y)
335 BASIC_CFLAGS += -I/sw/include
336 BASIC_LDFLAGS += -L/sw/lib
337 endif
338 endif
339 ifndef NO_DARWIN_PORTS
340 ifeq ($(shell test -d /opt/local/lib && echo y),y)
341 BASIC_CFLAGS += -I/opt/local/include
342 BASIC_LDFLAGS += -L/opt/local/lib
343 endif
344 endif
345 PTHREAD_LIBS =
346endif
347
348ifndef CC_LD_DYNPATH
349 ifdef NO_R_TO_GCC_LINKER
350	# Some versions of gcc do not accept -R and pass it to the linker to specify
351 # the runtime dynamic library path.
352 CC_LD_DYNPATH = -Wl,-rpath,
353 else
354 CC_LD_DYNPATH = -R
355 endif
356endif
357
358ifdef ZLIB_PATH
359 BASIC_CFLAGS += -I$(ZLIB_PATH)/include
360 EXTLIBS += -L$(ZLIB_PATH)/$(lib) $(CC_LD_DYNPATH)$(ZLIB_PATH)/$(lib)
361endif
362EXTLIBS += -lz
363
364ifdef NEEDS_SOCKET
365 EXTLIBS += -lsocket
366endif
367ifdef NEEDS_NSL
368 EXTLIBS += -lnsl
369endif
370ifdef NO_D_TYPE_IN_DIRENT
371 BASIC_CFLAGS += -DNO_D_TYPE_IN_DIRENT
372endif
373ifdef NO_D_INO_IN_DIRENT
374 BASIC_CFLAGS += -DNO_D_INO_IN_DIRENT
375endif
376ifdef NO_ST_BLOCKS_IN_STRUCT_STAT
377 BASIC_CFLAGS += -DNO_ST_BLOCKS_IN_STRUCT_STAT
378endif
379ifdef USE_NSEC
380 BASIC_CFLAGS += -DUSE_NSEC
381endif
382ifdef USE_ST_TIMESPEC
383 BASIC_CFLAGS += -DUSE_ST_TIMESPEC
384endif
385ifdef NO_NSEC
386 BASIC_CFLAGS += -DNO_NSEC
387endif
388ifdef NO_C99_FORMAT
389 BASIC_CFLAGS += -DNO_C99_FORMAT
390endif
391ifdef SNPRINTF_RETURNS_BOGUS
392 COMPAT_CFLAGS += -DSNPRINTF_RETURNS_BOGUS
393 COMPAT_OBJS += compat/snprintf.o
394endif
395ifdef FREAD_READS_DIRECTORIES
396 COMPAT_CFLAGS += -DFREAD_READS_DIRECTORIES
397 COMPAT_OBJS += compat/fopen.o
398endif
399ifdef NO_SYMLINK_HEAD
400 BASIC_CFLAGS += -DNO_SYMLINK_HEAD
401endif
402ifdef NO_STRCASESTR
403 COMPAT_CFLAGS += -DNO_STRCASESTR
404 COMPAT_OBJS += compat/strcasestr.o
405endif
406ifdef NO_STRTOUMAX
407 COMPAT_CFLAGS += -DNO_STRTOUMAX
408 COMPAT_OBJS += compat/strtoumax.o
409endif
410ifdef NO_STRTOULL
411 COMPAT_CFLAGS += -DNO_STRTOULL
412endif
413ifdef NO_SETENV
414 COMPAT_CFLAGS += -DNO_SETENV
415 COMPAT_OBJS += compat/setenv.o
416endif
417ifdef NO_MKDTEMP
418 COMPAT_CFLAGS += -DNO_MKDTEMP
419 COMPAT_OBJS += compat/mkdtemp.o
420endif
421ifdef NO_UNSETENV
422 COMPAT_CFLAGS += -DNO_UNSETENV
423 COMPAT_OBJS += compat/unsetenv.o
424endif
425ifdef NO_SYS_SELECT_H
426 BASIC_CFLAGS += -DNO_SYS_SELECT_H
427endif
428ifdef NO_MMAP
429 COMPAT_CFLAGS += -DNO_MMAP
430 COMPAT_OBJS += compat/mmap.o
431else
432 ifdef USE_WIN32_MMAP
433 COMPAT_CFLAGS += -DUSE_WIN32_MMAP
434 COMPAT_OBJS += compat/win32mmap.o
435 endif
436endif
437ifdef NO_PREAD
438 COMPAT_CFLAGS += -DNO_PREAD
439 COMPAT_OBJS += compat/pread.o
440endif
441ifdef NO_FAST_WORKING_DIRECTORY
442 BASIC_CFLAGS += -DNO_FAST_WORKING_DIRECTORY
443endif
444ifdef NO_TRUSTABLE_FILEMODE
445 BASIC_CFLAGS += -DNO_TRUSTABLE_FILEMODE
446endif
447ifdef NO_IPV6
448 BASIC_CFLAGS += -DNO_IPV6
449endif
450ifdef NO_UINTMAX_T
451 BASIC_CFLAGS += -Duintmax_t=uint32_t
452endif
453ifdef NO_SOCKADDR_STORAGE
454ifdef NO_IPV6
455 BASIC_CFLAGS += -Dsockaddr_storage=sockaddr_in
456else
457 BASIC_CFLAGS += -Dsockaddr_storage=sockaddr_in6
458endif
459endif
460ifdef NO_INET_NTOP
461 LIB_OBJS += compat/inet_ntop.o
462endif
463ifdef NO_INET_PTON
464 LIB_OBJS += compat/inet_pton.o
465endif
466
467ifdef NO_ICONV
468 BASIC_CFLAGS += -DNO_ICONV
469endif
470
471ifdef OLD_ICONV
472 BASIC_CFLAGS += -DOLD_ICONV
473endif
474
475ifdef NO_DEFLATE_BOUND
476 BASIC_CFLAGS += -DNO_DEFLATE_BOUND
477endif
478
479ifdef PPC_SHA1
480 SHA1_HEADER = "ppc/sha1.h"
481 LIB_OBJS += ppc/sha1.o ppc/sha1ppc.o
482else
483ifdef ARM_SHA1
484 SHA1_HEADER = "arm/sha1.h"
485 LIB_OBJS += arm/sha1.o arm/sha1_arm.o
486else
487ifdef MOZILLA_SHA1
488 SHA1_HEADER = "mozilla-sha1/sha1.h"
489 LIB_OBJS += mozilla-sha1/sha1.o
490else
491 SHA1_HEADER = <openssl/sha.h>
492 EXTLIBS += $(LIB_4_CRYPTO)
493endif
494endif
495endif
496ifdef NO_PERL_MAKEMAKER
497 export NO_PERL_MAKEMAKER
498endif
499ifdef NO_HSTRERROR
500 COMPAT_CFLAGS += -DNO_HSTRERROR
501 COMPAT_OBJS += compat/hstrerror.o
502endif
503ifdef NO_MEMMEM
504 COMPAT_CFLAGS += -DNO_MEMMEM
505 COMPAT_OBJS += compat/memmem.o
506endif
507ifdef INTERNAL_QSORT
508 COMPAT_CFLAGS += -DINTERNAL_QSORT
509 COMPAT_OBJS += compat/qsort.o
510endif
511ifdef RUNTIME_PREFIX
512 COMPAT_CFLAGS += -DRUNTIME_PREFIX
513endif
514
515ifdef DIR_HAS_BSD_GROUP_SEMANTICS
516 COMPAT_CFLAGS += -DDIR_HAS_BSD_GROUP_SEMANTICS
517endif
518ifdef NO_EXTERNAL_GREP
519 BASIC_CFLAGS += -DNO_EXTERNAL_GREP
520endif
521
522ifeq ($(PERL_PATH),)
523NO_PERL=NoThanks
524endif
525
526QUIET_SUBDIR0 = +$(MAKE) -C # space to separate -C and subdir
527QUIET_SUBDIR1 =
528
529ifneq ($(findstring $(MAKEFLAGS),w),w)
530PRINT_DIR = --no-print-directory
531else # "make -w"
532NO_SUBDIR = :
533endif
534
535ifneq ($(findstring $(MAKEFLAGS),s),s)
536ifndef V
537 QUIET_CC = @echo ' ' CC $@;
538 QUIET_AR = @echo ' ' AR $@;
539 QUIET_LINK = @echo ' ' LINK $@;
540 QUIET_BUILT_IN = @echo ' ' BUILTIN $@;
541 QUIET_GEN = @echo ' ' GEN $@;
542 QUIET_SUBDIR0 = +@subdir=
543 QUIET_SUBDIR1 = ;$(NO_SUBDIR) echo ' ' SUBDIR $$subdir; \
544 $(MAKE) $(PRINT_DIR) -C $$subdir
545 export V
546 export QUIET_GEN
547 export QUIET_BUILT_IN
548endif
549endif
550
551ifdef ASCIIDOC8
552 export ASCIIDOC8
553endif
554
555# Shell quote (do not use $(call) to accommodate ancient setups);
556
557SHA1_HEADER_SQ = $(subst ','\'',$(SHA1_HEADER))
558ETC_PERFCONFIG_SQ = $(subst ','\'',$(ETC_PERFCONFIG))
559
560DESTDIR_SQ = $(subst ','\'',$(DESTDIR))
561bindir_SQ = $(subst ','\'',$(bindir))
562bindir_relative_SQ = $(subst ','\'',$(bindir_relative))
563mandir_SQ = $(subst ','\'',$(mandir))
564infodir_SQ = $(subst ','\'',$(infodir))
565perfexecdir_SQ = $(subst ','\'',$(perfexecdir))
566template_dir_SQ = $(subst ','\'',$(template_dir))
567htmldir_SQ = $(subst ','\'',$(htmldir))
568prefix_SQ = $(subst ','\'',$(prefix))
569
570SHELL_PATH_SQ = $(subst ','\'',$(SHELL_PATH))
571PERL_PATH_SQ = $(subst ','\'',$(PERL_PATH))
572
573LIBS = $(PERFLIBS) $(EXTLIBS)
574
575BASIC_CFLAGS += -DSHA1_HEADER='$(SHA1_HEADER_SQ)' \
576 $(COMPAT_CFLAGS)
577LIB_OBJS += $(COMPAT_OBJS)
578
579ALL_CFLAGS += $(BASIC_CFLAGS)
580ALL_LDFLAGS += $(BASIC_LDFLAGS)
581
582export TAR INSTALL DESTDIR SHELL_PATH
583
584
585### Build rules
586
587SHELL = $(SHELL_PATH)
588
589all:: shell_compatibility_test $(ALL_PROGRAMS) $(BUILT_INS) $(OTHER_PROGRAMS) PERF-BUILD-OPTIONS
590ifneq (,$X)
591 $(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) perf$X)), test '$p' -ef '$p$X' || $(RM) '$p';)
592endif
593
594all::
595
596please_set_SHELL_PATH_to_a_more_modern_shell:
597 @$$(:)
598
599shell_compatibility_test: please_set_SHELL_PATH_to_a_more_modern_shell
600
601strip: $(PROGRAMS) perf$X
602 $(STRIP) $(STRIP_OPTS) $(PROGRAMS) perf$X
603
604perf.o: perf.c common-cmds.h PERF-CFLAGS
605 $(QUIET_CC)$(CC) -DPERF_VERSION='"$(PERF_VERSION)"' \
606 '-DPERF_HTML_PATH="$(htmldir_SQ)"' \
607 $(ALL_CFLAGS) -c $(filter %.c,$^)
608
609perf$X: perf.o $(BUILTIN_OBJS) $(PERFLIBS)
610 $(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ perf.o \
611 $(BUILTIN_OBJS) $(ALL_LDFLAGS) $(LIBS)
612
613builtin-help.o: builtin-help.c common-cmds.h PERF-CFLAGS
614 $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) \
615 '-DPERF_HTML_PATH="$(htmldir_SQ)"' \
616 '-DPERF_MAN_PATH="$(mandir_SQ)"' \
617 '-DPERF_INFO_PATH="$(infodir_SQ)"' $<
618
619$(BUILT_INS): perf$X
620 $(QUIET_BUILT_IN)$(RM) $@ && \
621 ln perf$X $@ 2>/dev/null || \
622 ln -s perf$X $@ 2>/dev/null || \
623 cp perf$X $@
624
625common-cmds.h: util/generate-cmdlist.sh command-list.txt
626
627common-cmds.h: $(wildcard Documentation/perf-*.txt)
628 $(QUIET_GEN)util/generate-cmdlist.sh > $@+ && mv $@+ $@
629
630$(patsubst %.sh,%,$(SCRIPT_SH)) : % : %.sh
631 $(QUIET_GEN)$(RM) $@ $@+ && \
632 sed -e '1s|#!.*/sh|#!$(SHELL_PATH_SQ)|' \
633 -e 's|@SHELL_PATH@|$(SHELL_PATH_SQ)|' \
634 -e 's|@@PERL@@|$(PERL_PATH_SQ)|g' \
635 -e 's/@@PERF_VERSION@@/$(PERF_VERSION)/g' \
636 -e 's/@@NO_CURL@@/$(NO_CURL)/g' \
637 $@.sh >$@+ && \
638 chmod +x $@+ && \
639 mv $@+ $@
640
641configure: configure.ac
642 $(QUIET_GEN)$(RM) $@ $<+ && \
643 sed -e 's/@@PERF_VERSION@@/$(PERF_VERSION)/g' \
644 $< > $<+ && \
645 autoconf -o $@ $<+ && \
646 $(RM) $<+
647
648# These can record PERF_VERSION
649perf.o perf.spec \
650 $(patsubst %.sh,%,$(SCRIPT_SH)) \
651 $(patsubst %.perl,%,$(SCRIPT_PERL)) \
652 : PERF-VERSION-FILE
653
654%.o: %.c PERF-CFLAGS
655 $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) $<
656%.s: %.c PERF-CFLAGS
657 $(QUIET_CC)$(CC) -S $(ALL_CFLAGS) $<
658%.o: %.S
659 $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) $<
660
661util/exec_cmd.o: util/exec_cmd.c PERF-CFLAGS
662 $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) \
663 '-DPERF_EXEC_PATH="$(perfexecdir_SQ)"' \
664 '-DBINDIR="$(bindir_relative_SQ)"' \
665 '-DPREFIX="$(prefix_SQ)"' \
666 $<
667
668builtin-init-db.o: builtin-init-db.c PERF-CFLAGS
669 $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DDEFAULT_PERF_TEMPLATE_DIR='"$(template_dir_SQ)"' $<
670
671util/config.o: util/config.c PERF-CFLAGS
672 $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
673
674perf-%$X: %.o $(PERFLIBS)
675 $(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) $(LIBS)
676
677$(LIB_OBJS) $(BUILTIN_OBJS): $(LIB_H)
678$(patsubst perf-%$X,%.o,$(PROGRAMS)): $(LIB_H) $(wildcard */*.h)
679builtin-revert.o wt-status.o: wt-status.h
680
681$(LIB_FILE): $(LIB_OBJS)
682 $(QUIET_AR)$(RM) $@ && $(AR) rcs $@ $(LIB_OBJS)
683
684TAGS:
685 $(RM) TAGS
686 $(FIND) . -name '*.[hcS]' -print | xargs etags -a
687
688tags:
689 $(RM) tags
690 $(FIND) . -name '*.[hcS]' -print | xargs ctags -a
691
692cscope:
693 $(RM) cscope*
694 $(FIND) . -name '*.[hcS]' -print | xargs cscope -b
695
696### Detect prefix changes
697TRACK_CFLAGS = $(subst ','\'',$(ALL_CFLAGS)):\
698 $(bindir_SQ):$(perfexecdir_SQ):$(template_dir_SQ):$(prefix_SQ)
699
700PERF-CFLAGS: .FORCE-PERF-CFLAGS
701 @FLAGS='$(TRACK_CFLAGS)'; \
702 if test x"$$FLAGS" != x"`cat PERF-CFLAGS 2>/dev/null`" ; then \
703 echo 1>&2 " * new build flags or prefix"; \
704 echo "$$FLAGS" >PERF-CFLAGS; \
705 fi
706
707# We need to apply sq twice, once to protect from the shell
708# that runs PERF-BUILD-OPTIONS, and then again to protect it
709# and the first level quoting from the shell that runs "echo".
710PERF-BUILD-OPTIONS: .FORCE-PERF-BUILD-OPTIONS
711 @echo SHELL_PATH=\''$(subst ','\'',$(SHELL_PATH_SQ))'\' >$@
712 @echo TAR=\''$(subst ','\'',$(subst ','\'',$(TAR)))'\' >>$@
713 @echo NO_CURL=\''$(subst ','\'',$(subst ','\'',$(NO_CURL)))'\' >>$@
714 @echo NO_PERL=\''$(subst ','\'',$(subst ','\'',$(NO_PERL)))'\' >>$@
715
716### Testing rules
717
718#
719# None right now:
720#
721# TEST_PROGRAMS += test-something$X
722
723all:: $(TEST_PROGRAMS)
724
725# GNU make supports exporting all variables by "export" without parameters.
726# However, the environment gets quite big, and some programs have problems
727# with that.
728
729export NO_SVN_TESTS
730
731check: common-cmds.h
732 if sparse; \
733 then \
734 for i in *.c */*.c; \
735 do \
736 sparse $(ALL_CFLAGS) $(SPARSE_FLAGS) $$i || exit; \
737 done; \
738 else \
739		echo >&2 "Did you mean 'make test'?"; \
740 exit 1; \
741 fi
742
743remove-dashes:
744 ./fixup-builtins $(BUILT_INS) $(PROGRAMS) $(SCRIPTS)
745
746### Installation rules
747
748ifneq ($(filter /%,$(firstword $(template_dir))),)
749template_instdir = $(template_dir)
750else
751template_instdir = $(prefix)/$(template_dir)
752endif
753export template_instdir
754
755ifneq ($(filter /%,$(firstword $(perfexecdir))),)
756perfexec_instdir = $(perfexecdir)
757else
758perfexec_instdir = $(prefix)/$(perfexecdir)
759endif
760perfexec_instdir_SQ = $(subst ','\'',$(perfexec_instdir))
761export perfexec_instdir
762
763install: all
764 $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(bindir_SQ)'
765 $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
766 $(INSTALL) $(ALL_PROGRAMS) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
767ifneq (,$X)
768 $(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) perf$X)), $(RM) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/$p';)
769endif
770
771### Maintainer's dist rules
772
773perf.spec: perf.spec.in
774 sed -e 's/@@VERSION@@/$(PERF_VERSION)/g' < $< > $@+
775 mv $@+ $@
776
777PERF_TARNAME=perf-$(PERF_VERSION)
778dist: perf.spec perf-archive$(X) configure
779 ./perf-archive --format=tar \
780 --prefix=$(PERF_TARNAME)/ HEAD^{tree} > $(PERF_TARNAME).tar
781 @mkdir -p $(PERF_TARNAME)
782 @cp perf.spec configure $(PERF_TARNAME)
783 @echo $(PERF_VERSION) > $(PERF_TARNAME)/version
784 $(TAR) rf $(PERF_TARNAME).tar \
785 $(PERF_TARNAME)/perf.spec \
786 $(PERF_TARNAME)/configure \
787 $(PERF_TARNAME)/version
788 @$(RM) -r $(PERF_TARNAME)
789 gzip -f -9 $(PERF_TARNAME).tar
790
791rpm: dist
792 $(RPMBUILD) -ta $(PERF_TARNAME).tar.gz
793
794### Cleaning rules
795
796distclean: clean
797 $(RM) configure
798
799clean:
800 $(RM) *.o */*.o $(LIB_FILE)
801 $(RM) $(ALL_PROGRAMS) $(BUILT_INS) perf$X
802 $(RM) $(TEST_PROGRAMS)
803 $(RM) *.spec *.pyc *.pyo */*.pyc */*.pyo common-cmds.h TAGS tags cscope*
804 $(RM) -r autom4te.cache
805 $(RM) config.log config.mak.autogen config.mak.append config.status config.cache
806 $(RM) -r $(PERF_TARNAME) .doc-tmp-dir
807 $(RM) $(PERF_TARNAME).tar.gz perf-core_$(PERF_VERSION)-*.tar.gz
808 $(RM) $(htmldocs).tar.gz $(manpages).tar.gz
809 $(RM) PERF-VERSION-FILE PERF-CFLAGS PERF-BUILD-OPTIONS
810
811.PHONY: all install clean strip
812.PHONY: shell_compatibility_test please_set_SHELL_PATH_to_a_more_modern_shell
813.PHONY: .FORCE-PERF-VERSION-FILE TAGS tags cscope .FORCE-PERF-CFLAGS
814.PHONY: .FORCE-PERF-BUILD-OPTIONS
815
816### Make sure built-ins do not have dups and are listed in perf.c
817#
818check-builtins::
819 ./check-builtins.sh
820
821### Test suite coverage testing
822#
823.PHONY: coverage coverage-clean coverage-build coverage-report
824
825coverage:
826 $(MAKE) coverage-build
827 $(MAKE) coverage-report
828
829coverage-clean:
830 rm -f *.gcda *.gcno
831
832COVERAGE_CFLAGS = $(CFLAGS) -O0 -ftest-coverage -fprofile-arcs
833COVERAGE_LDFLAGS = $(CFLAGS) -O0 -lgcov
834
835coverage-build: coverage-clean
836 $(MAKE) CFLAGS="$(COVERAGE_CFLAGS)" LDFLAGS="$(COVERAGE_LDFLAGS)" all
837 $(MAKE) CFLAGS="$(COVERAGE_CFLAGS)" LDFLAGS="$(COVERAGE_LDFLAGS)" \
838 -j1 test
839
840coverage-report:
841 gcov -b *.c */*.c
842 grep '^function.*called 0 ' *.c.gcov */*.c.gcov \
843 | sed -e 's/\([^:]*\)\.gcov: *function \([^ ]*\) called.*/\1: \2/' \
844 | tee coverage-untested-functions
diff --git a/Documentation/perf_counter/PERF-BUILD-OPTIONS b/Documentation/perf_counter/PERF-BUILD-OPTIONS
new file mode 100644
index 000000000000..46d8d6ceb2f4
--- /dev/null
+++ b/Documentation/perf_counter/PERF-BUILD-OPTIONS
@@ -0,0 +1,4 @@
1SHELL_PATH='/bin/sh'
2TAR='tar'
3NO_CURL=''
4NO_PERL=''
diff --git a/Documentation/perf_counter/PERF-CFLAGS b/Documentation/perf_counter/PERF-CFLAGS
new file mode 100644
index 000000000000..f24906ca688d
--- /dev/null
+++ b/Documentation/perf_counter/PERF-CFLAGS
@@ -0,0 +1 @@
-g -O2 -Wall -DSHA1_HEADER='<openssl/sha.h>' : /home/mingo/bin:libexec/perf-core:share/perf-core/templates:/home/mingo
diff --git a/Documentation/perf_counter/PERF-VERSION-FILE b/Documentation/perf_counter/PERF-VERSION-FILE
new file mode 100644
index 000000000000..328e244c0c81
--- /dev/null
+++ b/Documentation/perf_counter/PERF-VERSION-FILE
@@ -0,0 +1 @@
PERF_VERSION = 0.0.1.PERF
diff --git a/Documentation/perf_counter/builtin-help.c b/Documentation/perf_counter/builtin-help.c
new file mode 100644
index 000000000000..6616de0ef053
--- /dev/null
+++ b/Documentation/perf_counter/builtin-help.c
@@ -0,0 +1,461 @@
1/*
2 * builtin-help.c
3 *
4 * Builtin help command
5 */
6#include "util/cache.h"
7#include "builtin.h"
8#include "util/exec_cmd.h"
9#include "common-cmds.h"
10#include "util/parse-options.h"
11#include "util/run-command.h"
12#include "util/help.h"
13
14static struct man_viewer_list {
15 struct man_viewer_list *next;
16 char name[FLEX_ARRAY];
17} *man_viewer_list;
18
19static struct man_viewer_info_list {
20 struct man_viewer_info_list *next;
21 const char *info;
22 char name[FLEX_ARRAY];
23} *man_viewer_info_list;
24
25enum help_format {
26 HELP_FORMAT_MAN,
27 HELP_FORMAT_INFO,
28 HELP_FORMAT_WEB,
29};
30
31static int show_all = 0;
32static enum help_format help_format = HELP_FORMAT_MAN;
33static struct option builtin_help_options[] = {
34 OPT_BOOLEAN('a', "all", &show_all, "print all available commands"),
35 OPT_SET_INT('m', "man", &help_format, "show man page", HELP_FORMAT_MAN),
36 OPT_SET_INT('w', "web", &help_format, "show manual in web browser",
37 HELP_FORMAT_WEB),
38 OPT_SET_INT('i', "info", &help_format, "show info page",
39 HELP_FORMAT_INFO),
40 OPT_END(),
41};
42
43static const char * const builtin_help_usage[] = {
44 "perf help [--all] [--man|--web|--info] [command]",
45 NULL
46};
47
48static enum help_format parse_help_format(const char *format)
49{
50 if (!strcmp(format, "man"))
51 return HELP_FORMAT_MAN;
52 if (!strcmp(format, "info"))
53 return HELP_FORMAT_INFO;
54 if (!strcmp(format, "web") || !strcmp(format, "html"))
55 return HELP_FORMAT_WEB;
56 die("unrecognized help format '%s'", format);
57}
58
59static const char *get_man_viewer_info(const char *name)
60{
61 struct man_viewer_info_list *viewer;
62
63 for (viewer = man_viewer_info_list; viewer; viewer = viewer->next)
64 {
65 if (!strcasecmp(name, viewer->name))
66 return viewer->info;
67 }
68 return NULL;
69}
70
71static int check_emacsclient_version(void)
72{
73 struct strbuf buffer = STRBUF_INIT;
74 struct child_process ec_process;
75 const char *argv_ec[] = { "emacsclient", "--version", NULL };
76 int version;
77
78 /* emacsclient prints its version number on stderr */
79 memset(&ec_process, 0, sizeof(ec_process));
80 ec_process.argv = argv_ec;
81 ec_process.err = -1;
82 ec_process.stdout_to_stderr = 1;
83 if (start_command(&ec_process)) {
84 fprintf(stderr, "Failed to start emacsclient.\n");
85 return -1;
86 }
87 strbuf_read(&buffer, ec_process.err, 20);
88 close(ec_process.err);
89
90 /*
91 * Don't bother checking return value, because "emacsclient --version"
92	 * seems to always exit with code 1.
93 */
94 finish_command(&ec_process);
95
96 if (prefixcmp(buffer.buf, "emacsclient")) {
97 fprintf(stderr, "Failed to parse emacsclient version.\n");
98 strbuf_release(&buffer);
99 return -1;
100 }
101
102 strbuf_remove(&buffer, 0, strlen("emacsclient"));
103 version = atoi(buffer.buf);
104
105 if (version < 22) {
106 fprintf(stderr,
107 "emacsclient version '%d' too old (< 22).\n",
108 version);
109 strbuf_release(&buffer);
110 return -1;
111 }
112
113 strbuf_release(&buffer);
114 return 0;
115}
116
117static void exec_woman_emacs(const char* path, const char *page)
118{
119 if (!check_emacsclient_version()) {
120 /* This works only with emacsclient version >= 22. */
121 struct strbuf man_page = STRBUF_INIT;
122
123 if (!path)
124 path = "emacsclient";
125 strbuf_addf(&man_page, "(woman \"%s\")", page);
126 execlp(path, "emacsclient", "-e", man_page.buf, NULL);
127 warning("failed to exec '%s': %s", path, strerror(errno));
128 }
129}
130
131static void exec_man_konqueror(const char* path, const char *page)
132{
133 const char *display = getenv("DISPLAY");
134 if (display && *display) {
135 struct strbuf man_page = STRBUF_INIT;
136 const char *filename = "kfmclient";
137
138 /* It's simpler to launch konqueror using kfmclient. */
139 if (path) {
140 const char *file = strrchr(path, '/');
141 if (file && !strcmp(file + 1, "konqueror")) {
142 char *new = strdup(path);
143 char *dest = strrchr(new, '/');
144
145 /* strlen("konqueror") == strlen("kfmclient") */
146 strcpy(dest + 1, "kfmclient");
147 path = new;
148 }
149 if (file)
150 filename = file;
151 } else
152 path = "kfmclient";
153 strbuf_addf(&man_page, "man:%s(1)", page);
154 execlp(path, filename, "newTab", man_page.buf, NULL);
155 warning("failed to exec '%s': %s", path, strerror(errno));
156 }
157}
158
159static void exec_man_man(const char* path, const char *page)
160{
161 if (!path)
162 path = "man";
163 execlp(path, "man", page, NULL);
164 warning("failed to exec '%s': %s", path, strerror(errno));
165}
166
167static void exec_man_cmd(const char *cmd, const char *page)
168{
169 struct strbuf shell_cmd = STRBUF_INIT;
170 strbuf_addf(&shell_cmd, "%s %s", cmd, page);
171 execl("/bin/sh", "sh", "-c", shell_cmd.buf, NULL);
172 warning("failed to exec '%s': %s", cmd, strerror(errno));
173}
174
175static void add_man_viewer(const char *name)
176{
177 struct man_viewer_list **p = &man_viewer_list;
178 size_t len = strlen(name);
179
180 while (*p)
181 p = &((*p)->next);
182 *p = calloc(1, (sizeof(**p) + len + 1));
183 strncpy((*p)->name, name, len);
184}
185
186static int supported_man_viewer(const char *name, size_t len)
187{
188 return (!strncasecmp("man", name, len) ||
189 !strncasecmp("woman", name, len) ||
190 !strncasecmp("konqueror", name, len));
191}
192
193static void do_add_man_viewer_info(const char *name,
194 size_t len,
195 const char *value)
196{
197 struct man_viewer_info_list *new = calloc(1, sizeof(*new) + len + 1);
198
199 strncpy(new->name, name, len);
200 new->info = strdup(value);
201 new->next = man_viewer_info_list;
202 man_viewer_info_list = new;
203}
204
205static int add_man_viewer_path(const char *name,
206 size_t len,
207 const char *value)
208{
209 if (supported_man_viewer(name, len))
210 do_add_man_viewer_info(name, len, value);
211 else
212 warning("'%s': path for unsupported man viewer.\n"
213 "Please consider using 'man.<tool>.cmd' instead.",
214 name);
215
216 return 0;
217}
218
219static int add_man_viewer_cmd(const char *name,
220 size_t len,
221 const char *value)
222{
223 if (supported_man_viewer(name, len))
224 warning("'%s': cmd for supported man viewer.\n"
225 "Please consider using 'man.<tool>.path' instead.",
226 name);
227 else
228 do_add_man_viewer_info(name, len, value);
229
230 return 0;
231}
232
233static int add_man_viewer_info(const char *var, const char *value)
234{
235 const char *name = var + 4;
236 const char *subkey = strrchr(name, '.');
237
238 if (!subkey)
239 return error("Config with no key for man viewer: %s", name);
240
241 if (!strcmp(subkey, ".path")) {
242 if (!value)
243 return config_error_nonbool(var);
244 return add_man_viewer_path(name, subkey - name, value);
245 }
246 if (!strcmp(subkey, ".cmd")) {
247 if (!value)
248 return config_error_nonbool(var);
249 return add_man_viewer_cmd(name, subkey - name, value);
250 }
251
252 warning("'%s': unsupported man viewer sub key.", subkey);
253 return 0;
254}
255
256static int perf_help_config(const char *var, const char *value, void *cb)
257{
258 if (!strcmp(var, "help.format")) {
259 if (!value)
260 return config_error_nonbool(var);
261 help_format = parse_help_format(value);
262 return 0;
263 }
264 if (!strcmp(var, "man.viewer")) {
265 if (!value)
266 return config_error_nonbool(var);
267 add_man_viewer(value);
268 return 0;
269 }
270 if (!prefixcmp(var, "man."))
271 return add_man_viewer_info(var, value);
272
273 return perf_default_config(var, value, cb);
274}
275
276static struct cmdnames main_cmds, other_cmds;
277
278void list_common_cmds_help(void)
279{
280 int i, longest = 0;
281
282 for (i = 0; i < ARRAY_SIZE(common_cmds); i++) {
283 if (longest < strlen(common_cmds[i].name))
284 longest = strlen(common_cmds[i].name);
285 }
286
287 puts("The most commonly used perf commands are:");
288 for (i = 0; i < ARRAY_SIZE(common_cmds); i++) {
289 printf(" %s ", common_cmds[i].name);
290 mput_char(' ', longest - strlen(common_cmds[i].name));
291 puts(common_cmds[i].help);
292 }
293}
294
295static int is_perf_command(const char *s)
296{
297 return is_in_cmdlist(&main_cmds, s) ||
298 is_in_cmdlist(&other_cmds, s);
299}
300
301static const char *prepend(const char *prefix, const char *cmd)
302{
303 size_t pre_len = strlen(prefix);
304 size_t cmd_len = strlen(cmd);
305 char *p = malloc(pre_len + cmd_len + 1);
306 memcpy(p, prefix, pre_len);
307 strcpy(p + pre_len, cmd);
308 return p;
309}
310
311static const char *cmd_to_page(const char *perf_cmd)
312{
313 if (!perf_cmd)
314 return "perf";
315 else if (!prefixcmp(perf_cmd, "perf"))
316 return perf_cmd;
317 else if (is_perf_command(perf_cmd))
318 return prepend("perf-", perf_cmd);
319 else
320 return prepend("perf", perf_cmd);
321}
322
323static void setup_man_path(void)
324{
325 struct strbuf new_path = STRBUF_INIT;
326 const char *old_path = getenv("MANPATH");
327
328 /* We should always put ':' after our path. If there is no
329	 * old_path, the ':' at the end will let 'man' try
330 * system-wide paths after ours to find the manual page. If
331 * there is old_path, we need ':' as delimiter. */
332 strbuf_addstr(&new_path, system_path(PERF_MAN_PATH));
333 strbuf_addch(&new_path, ':');
334 if (old_path)
335 strbuf_addstr(&new_path, old_path);
336
337 setenv("MANPATH", new_path.buf, 1);
338
339 strbuf_release(&new_path);
340}
341
342static void exec_viewer(const char *name, const char *page)
343{
344 const char *info = get_man_viewer_info(name);
345
346 if (!strcasecmp(name, "man"))
347 exec_man_man(info, page);
348 else if (!strcasecmp(name, "woman"))
349 exec_woman_emacs(info, page);
350 else if (!strcasecmp(name, "konqueror"))
351 exec_man_konqueror(info, page);
352 else if (info)
353 exec_man_cmd(info, page);
354 else
355 warning("'%s': unknown man viewer.", name);
356}
357
358static void show_man_page(const char *perf_cmd)
359{
360 struct man_viewer_list *viewer;
361 const char *page = cmd_to_page(perf_cmd);
362 const char *fallback = getenv("PERF_MAN_VIEWER");
363
364 setup_man_path();
365 for (viewer = man_viewer_list; viewer; viewer = viewer->next)
366 {
367 exec_viewer(viewer->name, page); /* will return when unable */
368 }
369 if (fallback)
370 exec_viewer(fallback, page);
371 exec_viewer("man", page);
372 die("no man viewer handled the request");
373}
374
375static void show_info_page(const char *perf_cmd)
376{
377 const char *page = cmd_to_page(perf_cmd);
378 setenv("INFOPATH", system_path(PERF_INFO_PATH), 1);
379 execlp("info", "info", "perfman", page, NULL);
380}
381
382static void get_html_page_path(struct strbuf *page_path, const char *page)
383{
384 struct stat st;
385 const char *html_path = system_path(PERF_HTML_PATH);
386
387 /* Check that we have a perf documentation directory. */
388 if (stat(mkpath("%s/perf.html", html_path), &st)
389 || !S_ISREG(st.st_mode))
390 die("'%s': not a documentation directory.", html_path);
391
392 strbuf_init(page_path, 0);
393 strbuf_addf(page_path, "%s/%s.html", html_path, page);
394}
395
396/*
397 * If open_html is not defined in a platform-specific way (see for
398 * example compat/mingw.h), we use the script web--browse to display
399 * HTML.
400 */
401#ifndef open_html
402void open_html(const char *path)
403{
404 execl_perf_cmd("web--browse", "-c", "help.browser", path, NULL);
405}
406#endif
407
408static void show_html_page(const char *perf_cmd)
409{
410 const char *page = cmd_to_page(perf_cmd);
411	struct strbuf page_path; /* it leaks but we exec below */
412
413 get_html_page_path(&page_path, page);
414
415 open_html(page_path.buf);
416}
417
418int cmd_help(int argc, const char **argv, const char *prefix)
419{
420 const char *alias;
421 load_command_list("perf-", &main_cmds, &other_cmds);
422
423 perf_config(perf_help_config, NULL);
424
425 argc = parse_options(argc, argv, builtin_help_options,
426 builtin_help_usage, 0);
427
428 if (show_all) {
429 printf("usage: %s\n\n", perf_usage_string);
430 list_commands("perf commands", &main_cmds, &other_cmds);
431 printf("%s\n", perf_more_info_string);
432 return 0;
433 }
434
435 if (!argv[0]) {
436 printf("usage: %s\n\n", perf_usage_string);
437 list_common_cmds_help();
438 printf("\n%s\n", perf_more_info_string);
439 return 0;
440 }
441
442 alias = alias_lookup(argv[0]);
443 if (alias && !is_perf_command(argv[0])) {
444 printf("`perf %s' is aliased to `%s'\n", argv[0], alias);
445 return 0;
446 }
447
448 switch (help_format) {
449 case HELP_FORMAT_MAN:
450 show_man_page(argv[0]);
451 break;
452 case HELP_FORMAT_INFO:
453 show_info_page(argv[0]);
454 break;
455 case HELP_FORMAT_WEB:
456 show_html_page(argv[0]);
457 break;
458 }
459
460 return 0;
461}
diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
new file mode 100644
index 000000000000..4a50abf843ee
--- /dev/null
+++ b/Documentation/perf_counter/builtin-record.c
@@ -0,0 +1,506 @@
1
2
3#define _GNU_SOURCE
4#include <sys/types.h>
5#include <sys/stat.h>
6#include <sys/time.h>
7#include <unistd.h>
8#include <stdint.h>
9#include <stdlib.h>
10#include <string.h>
11#include <limits.h>
12#include <getopt.h>
13#include <assert.h>
14#include <fcntl.h>
15#include <stdio.h>
16#include <errno.h>
17#include <ctype.h>
18#include <time.h>
19#include <sched.h>
20#include <pthread.h>
21
22#include <sys/syscall.h>
23#include <sys/ioctl.h>
24#include <sys/poll.h>
25#include <sys/prctl.h>
26#include <sys/wait.h>
27#include <sys/uio.h>
28#include <sys/mman.h>
29
30#include <linux/unistd.h>
31#include <linux/types.h>
32
33#include "../../include/linux/perf_counter.h"
34
35
36/*
37 * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
38 * counters in the current task.
39 */
40#define PR_TASK_PERF_COUNTERS_DISABLE 31
41#define PR_TASK_PERF_COUNTERS_ENABLE 32
42
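/*
 * Illustrative use of the prctl knobs above (not part of this tool):
 * all counters attached to the current task can be cheaply paused
 * around a region that should not be measured, e.g.:
 *
 *	prctl(PR_TASK_PERF_COUNTERS_DISABLE);
 *	do_unmeasured_setup();
 *	prctl(PR_TASK_PERF_COUNTERS_ENABLE);
 */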
43#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
44
45#define rdclock() \
46({ \
47 struct timespec ts; \
48 \
49 clock_gettime(CLOCK_MONOTONIC, &ts); \
50 ts.tv_sec * 1000000000ULL + ts.tv_nsec; \
51})
52
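/*
 * rdclock() returns CLOCK_MONOTONIC time in nanoseconds; a typical
 * (illustrative) use is to time a run and report milliseconds:
 *
 *	__u64 t0 = rdclock();
 *	...
 *	printf("%.3f msecs\n", (double)(rdclock() - t0) / 1e6);
 */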
53/*
54 * Pick up some kernel type conventions:
55 */
56#define __user
57#define asmlinkage
58
59#ifdef __x86_64__
60#define __NR_perf_counter_open 295
61#define rmb() asm volatile("lfence" ::: "memory")
62#define cpu_relax() asm volatile("rep; nop" ::: "memory");
63#endif
64
65#ifdef __i386__
66#define __NR_perf_counter_open 333
67#define rmb() asm volatile("lfence" ::: "memory")
68#define cpu_relax() asm volatile("rep; nop" ::: "memory");
69#endif
70
71#ifdef __powerpc__
72#define __NR_perf_counter_open 319
73#define rmb() asm volatile ("sync" ::: "memory")
74#define cpu_relax() asm volatile ("" ::: "memory");
75#endif
76
77#define unlikely(x) __builtin_expect(!!(x), 0)
78#define min(x, y) ({ \
79 typeof(x) _min1 = (x); \
80 typeof(y) _min2 = (y); \
81 (void) (&_min1 == &_min2); \
82 _min1 < _min2 ? _min1 : _min2; })
83
84extern asmlinkage int sys_perf_counter_open(
85 struct perf_counter_hw_event *hw_event_uptr __user,
86 pid_t pid,
87 int cpu,
88 int group_fd,
89 unsigned long flags);
90
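/*
 * A minimal sketch of how the declaration above can be satisfied,
 * assuming a plain syscall(2) wrapper and the per-arch
 * __NR_perf_counter_open numbers defined earlier (the real definition
 * is provided elsewhere in the tools, not in this file):
 *
 *	int sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr,
 *				  pid_t pid, int cpu, int group_fd,
 *				  unsigned long flags)
 *	{
 *		return syscall(__NR_perf_counter_open, hw_event_uptr,
 *			       pid, cpu, group_fd, flags);
 *	}
 */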
91#define MAX_COUNTERS 64
92#define MAX_NR_CPUS 256
93
94#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
95
96static int nr_counters = 0;
97static __u64 event_id[MAX_COUNTERS] = { };
98static int default_interval = 100000;
99static int event_count[MAX_COUNTERS];
100static int fd[MAX_NR_CPUS][MAX_COUNTERS];
101static int nr_cpus = 0;
102static unsigned int page_size;
103static unsigned int mmap_pages = 16;
104static int output;
105static char *output_name = "output.perf";
106static int group = 0;
107static unsigned int realtime_prio = 0;
108
109const unsigned int default_count[] = {
110 1000000,
111 1000000,
112 10000,
113 10000,
114 1000000,
115 10000,
116};
117
118struct event_symbol {
119 __u64 event;
120 char *symbol;
121};
122
123static struct event_symbol event_symbols[] = {
124 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cpu-cycles", },
125 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cycles", },
126 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), "instructions", },
127 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), "cache-references", },
128 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), "cache-misses", },
129 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branch-instructions", },
130 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branches", },
131 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES), "branch-misses", },
132 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES), "bus-cycles", },
133
134 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK), "cpu-clock", },
135 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), "task-clock", },
136 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "page-faults", },
137 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "faults", },
138 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN), "minor-faults", },
139 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ), "major-faults", },
140 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "context-switches", },
141 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "cs", },
142 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "cpu-migrations", },
143 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "migrations", },
144};
145
146/*
147 * Each event can have multiple symbolic names.
148 * Symbolic names are (almost) exactly matched.
149 */
150static __u64 match_event_symbols(char *str)
151{
152 __u64 config, id;
153 int type;
154 unsigned int i;
155
156 if (sscanf(str, "r%llx", &config) == 1)
157 return config | PERF_COUNTER_RAW_MASK;
158
159 if (sscanf(str, "%d:%llu", &type, &id) == 2)
160 return EID(type, id);
161
162 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
163 if (!strncmp(str, event_symbols[i].symbol,
164 strlen(event_symbols[i].symbol)))
165 return event_symbols[i].event;
166 }
167
168 return ~0ULL;
169}
170
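/*
 * Parse a comma-separated list of event specifiers, appending each
 * resolved config to event_id[]; fails if a name is unknown or
 * MAX_COUNTERS would be exceeded.
 */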
171static int parse_events(char *str)
172{
173 __u64 config;
174
175again:
176 if (nr_counters == MAX_COUNTERS)
177 return -1;
178
179 config = match_event_symbols(str);
180 if (config == ~0ULL)
181 return -1;
182
183 event_id[nr_counters] = config;
184 nr_counters++;
185
186 str = strstr(str, ",");
187 if (str) {
188 str++;
189 goto again;
190 }
191
192 return 0;
193}
194
195#define __PERF_COUNTER_FIELD(config, name) \
196 ((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
197
198#define PERF_COUNTER_RAW(config) __PERF_COUNTER_FIELD(config, RAW)
199#define PERF_COUNTER_CONFIG(config) __PERF_COUNTER_FIELD(config, CONFIG)
200#define PERF_COUNTER_TYPE(config) __PERF_COUNTER_FIELD(config, TYPE)
201#define PERF_COUNTER_ID(config) __PERF_COUNTER_FIELD(config, EVENT)
202
203static void display_events_help(void)
204{
205 unsigned int i;
206 __u64 e;
207
208 printf(
209 " -e EVENT --event=EVENT # symbolic-name abbreviations");
210
211 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
212 int type, id;
213
214 e = event_symbols[i].event;
215 type = PERF_COUNTER_TYPE(e);
216 id = PERF_COUNTER_ID(e);
217
218 printf("\n %d:%d: %-20s",
219 type, id, event_symbols[i].symbol);
220 }
221
222 printf("\n"
223 " rNNN: raw PMU events (eventsel+umask)\n\n");
224}
225
226static void display_help(void)
227{
228 printf(
229 "Usage: perf-record [<options>]\n"
230 "perf-record Options (up to %d event types can be specified at once):\n\n",
231 MAX_COUNTERS);
232
233 display_events_help();
234
235 printf(
236 " -c CNT --count=CNT # event period to sample\n"
237 " -m pages --mmap_pages=<pages> # number of mmap data pages\n"
238 " -o file --output=<file> # output file\n"
239 " -r prio --realtime=<prio> # use RT prio\n"
240 );
241
242 exit(0);
243}
244
245static void process_options(int argc, char *argv[])
246{
247 int error = 0, counter;
248
249 for (;;) {
250 int option_index = 0;
251 /** Options for getopt */
252 static struct option long_options[] = {
253 {"count", required_argument, NULL, 'c'},
254 {"event", required_argument, NULL, 'e'},
255 {"mmap_pages", required_argument, NULL, 'm'},
256 {"output", required_argument, NULL, 'o'},
257 {"realtime", required_argument, NULL, 'r'},
258 {NULL, 0, NULL, 0 }
259 };
260 int c = getopt_long(argc, argv, "+:c:e:m:o:r:",
261 long_options, &option_index);
262 if (c == -1)
263 break;
264
265 switch (c) {
266 case 'c': default_interval = atoi(optarg); break;
267 case 'e': error = parse_events(optarg); break;
268 case 'm': mmap_pages = atoi(optarg); break;
269 case 'o': output_name = strdup(optarg); break;
270 case 'r': realtime_prio = atoi(optarg); break;
271 default: error = 1; break;
272 }
273 }
274 if (error)
275 display_help();
276
277 if (!nr_counters) {
278 nr_counters = 1;
279 event_id[0] = 0;
280 }
281
282 for (counter = 0; counter < nr_counters; counter++) {
283 if (event_count[counter])
284 continue;
285
286 event_count[counter] = default_interval;
287 }
288}
289
290struct mmap_data {
291 int counter;
292 void *base;
293 unsigned int mask;
294 unsigned int prev;
295};
296
297static unsigned int mmap_read_head(struct mmap_data *md)
298{
299 struct perf_counter_mmap_page *pc = md->base;
300 int head;
301
302 head = pc->data_head;
303 rmb();
304
305 return head;
306}
307
308static long events;
309static struct timeval last_read, this_read;
310
311static void mmap_read(struct mmap_data *md)
312{
313 unsigned int head = mmap_read_head(md);
314 unsigned int old = md->prev;
315 unsigned char *data = md->base + page_size;
316 unsigned long size;
317 void *buf;
318 int diff;
319
320 gettimeofday(&this_read, NULL);
321
322 /*
323 * If we're further behind than half the buffer, there's a chance
324 * the writer will bite our tail and screw up the events under us.
325 *
326 * If we somehow ended up ahead of the head, we got messed up.
327 *
328 * In either case, truncate and restart at head.
329 */
330 diff = head - old;
331 if (diff > md->mask / 2 || diff < 0) {
332 struct timeval iv;
333 unsigned long msecs;
334
335 timersub(&this_read, &last_read, &iv);
336 msecs = iv.tv_sec*1000 + iv.tv_usec/1000;
337
338 fprintf(stderr, "WARNING: failed to keep up with mmap data."
339 " Last read %lu msecs ago.\n", msecs);
340
341 /*
342 * head points to a known good entry, start there.
343 */
344 old = head;
345 }
346
347 last_read = this_read;
348
349 if (old != head)
350 events++;
351
352 size = head - old;
353
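	/*
	 * If the new data wraps around the end of the ring buffer, flush
	 * the chunk up to the end of the buffer first; the write below
	 * then picks up the remainder from the start of the buffer.
	 */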
354 if ((old & md->mask) + size != (head & md->mask)) {
355 buf = &data[old & md->mask];
356 size = md->mask + 1 - (old & md->mask);
357 old += size;
358 while (size) {
359 int ret = write(output, buf, size);
360 if (ret < 0) {
361 perror("failed to write");
362 exit(-1);
363 }
364 size -= ret;
365 buf += ret;
366 }
367 }
368
369 buf = &data[old & md->mask];
370 size = head - old;
371 old += size;
372 while (size) {
373 int ret = write(output, buf, size);
374 if (ret < 0) {
375 perror("failed to write");
376 exit(-1);
377 }
378 size -= ret;
379 buf += ret;
380 }
381
382 md->prev = old;
383}
384
385static volatile int done = 0;
386
387static void sigchld_handler(int sig)
388{
389 if (sig == SIGCHLD)
390 done = 1;
391}
392
393int cmd_record(int argc, const char **argv)
394{
395 struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
396 struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
397 struct perf_counter_hw_event hw_event;
398 int i, counter, group_fd, nr_poll = 0;
399 pid_t pid;
400 int ret;
401
402 page_size = sysconf(_SC_PAGE_SIZE);
403
404 process_options(argc, argv);
405
406 nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
407 assert(nr_cpus <= MAX_NR_CPUS);
408 assert(nr_cpus >= 0);
409
410 output = open(output_name, O_CREAT|O_RDWR, S_IRWXU);
411 if (output < 0) {
412 perror("failed to create output file");
413 exit(-1);
414 }
415
416 argc -= optind;
417 argv += optind;
418
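	/*
	 * Open one sampling counter per (CPU, event) pair: each counter
	 * samples IP and TID on overflow (via NMI), emits mmap and comm
	 * events for later symbol resolution, and is both mmap()ed for
	 * its sample stream and added to the poll() set.
	 */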
419 for (i = 0; i < nr_cpus; i++) {
420 group_fd = -1;
421 for (counter = 0; counter < nr_counters; counter++) {
422
423 memset(&hw_event, 0, sizeof(hw_event));
424 hw_event.config = event_id[counter];
425 hw_event.irq_period = event_count[counter];
426 hw_event.record_type = PERF_RECORD_IP | PERF_RECORD_TID;
427 hw_event.nmi = 1;
428 hw_event.mmap = 1;
429 hw_event.comm = 1;
430
431 fd[i][counter] = sys_perf_counter_open(&hw_event, -1, i, group_fd, 0);
432 if (fd[i][counter] < 0) {
433 int err = errno;
434				printf("perf-record error: syscall returned with %d (%s)\n",
435 fd[i][counter], strerror(err));
436 if (err == EPERM)
437 printf("Are you root?\n");
438 exit(-1);
439 }
440 assert(fd[i][counter] >= 0);
441 fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
442
443 /*
444 * First counter acts as the group leader:
445 */
446 if (group && group_fd == -1)
447 group_fd = fd[i][counter];
448
449 event_array[nr_poll].fd = fd[i][counter];
450 event_array[nr_poll].events = POLLIN;
451 nr_poll++;
452
453 mmap_array[i][counter].counter = counter;
454 mmap_array[i][counter].prev = 0;
455 mmap_array[i][counter].mask = mmap_pages*page_size - 1;
456 mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
457 PROT_READ, MAP_SHARED, fd[i][counter], 0);
458 if (mmap_array[i][counter].base == MAP_FAILED) {
459				printf("perf-record error: failed to mmap with %d (%s)\n",
460 errno, strerror(errno));
461 exit(-1);
462 }
463 }
464 }
465
466 signal(SIGCHLD, sigchld_handler);
467
468 pid = fork();
469 if (pid < 0)
470 perror("failed to fork");
471
472 if (!pid) {
473 if (execvp(argv[0], argv)) {
474 perror(argv[0]);
475 exit(-1);
476 }
477 }
478
479 if (realtime_prio) {
480 struct sched_param param;
481
482 param.sched_priority = realtime_prio;
483 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
484 printf("Could not set realtime priority.\n");
485 exit(-1);
486 }
487 }
488
489 /*
490 * TODO: store the current /proc/$/maps information somewhere
491 */
492
493 while (!done) {
494 int hits = events;
495
496 for (i = 0; i < nr_cpus; i++) {
497 for (counter = 0; counter < nr_counters; counter++)
498 mmap_read(&mmap_array[i][counter]);
499 }
500
501 if (hits == events)
502 ret = poll(event_array, nr_poll, 100);
503 }
504
505 return 0;
506}
diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
new file mode 100644
index 000000000000..112b94ed3298
--- /dev/null
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -0,0 +1,591 @@
1/*
2 * kerneltop.c: show top kernel functions - performance counters showcase
3
4 Build with:
5
6 cc -O6 -Wall -c -o kerneltop.o kerneltop.c -lrt
7
8 Sample output:
9
10------------------------------------------------------------------------------
11 KernelTop: 2669 irqs/sec [NMI, cache-misses/cache-refs], (all, cpu: 2)
12------------------------------------------------------------------------------
13
14 weight RIP kernel function
15 ______ ________________ _______________
16
17 35.20 - ffffffff804ce74b : skb_copy_and_csum_dev
18 33.00 - ffffffff804cb740 : sock_alloc_send_skb
19 31.26 - ffffffff804ce808 : skb_push
20 22.43 - ffffffff80510004 : tcp_established_options
21 19.00 - ffffffff8027d250 : find_get_page
22 15.76 - ffffffff804e4fc9 : eth_type_trans
23 15.20 - ffffffff804d8baa : dst_release
24 14.86 - ffffffff804cf5d8 : skb_release_head_state
25 14.00 - ffffffff802217d5 : read_hpet
26 12.00 - ffffffff804ffb7f : __ip_local_out
27 11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
28 8.54 - ffffffff805001a3 : ip_queue_xmit
29 */
30
31/*
32 * perfstat: /usr/bin/time -alike performance counter statistics utility
33
34 It summarizes the counter events of all tasks (and child tasks),
35 covering all CPUs that the command (or workload) executes on.
36 It only counts the per-task events of the workload started,
37 independent of how many other tasks run on those CPUs.
38
39 Sample output:
40
41 $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
42
43 Performance counter stats for 'ls':
44
45 163516953 instructions
46 2295 cache-misses
47 2855182 branch-misses
48 */
49
50 /*
51 * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
52 *
53 * Improvements and fixes by:
54 *
55 * Arjan van de Ven <arjan@linux.intel.com>
56 * Yanmin Zhang <yanmin.zhang@intel.com>
57 * Wu Fengguang <fengguang.wu@intel.com>
58 * Mike Galbraith <efault@gmx.de>
59 * Paul Mackerras <paulus@samba.org>
60 *
61 * Released under the GPL v2. (and only v2, not any later version)
62 */
63
64#include "util/util.h"
65
66#include <getopt.h>
67#include <assert.h>
68#include <fcntl.h>
69#include <stdio.h>
70#include <errno.h>
71#include <time.h>
72#include <sched.h>
73#include <pthread.h>
74
75#include <sys/syscall.h>
76#include <sys/ioctl.h>
77#include <sys/poll.h>
78#include <sys/prctl.h>
79#include <sys/wait.h>
80#include <sys/uio.h>
81#include <sys/mman.h>
82
83#include <linux/unistd.h>
84#include <linux/types.h>
85
86#include "../../include/linux/perf_counter.h"
87
88
89/*
90 * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
91 * counters in the current task.
92 */
93#define PR_TASK_PERF_COUNTERS_DISABLE 31
94#define PR_TASK_PERF_COUNTERS_ENABLE 32
95
96#define rdclock() \
97({ \
98 struct timespec ts; \
99 \
100 clock_gettime(CLOCK_MONOTONIC, &ts); \
101 ts.tv_sec * 1000000000ULL + ts.tv_nsec; \
102})
103
104/*
105 * Pick up some kernel type conventions:
106 */
107#define __user
108#define asmlinkage
109
110#ifdef __x86_64__
111#define __NR_perf_counter_open 295
112#define rmb() asm volatile("lfence" ::: "memory")
113#define cpu_relax() asm volatile("rep; nop" ::: "memory");
114#endif
115
116#ifdef __i386__
117#define __NR_perf_counter_open 333
118#define rmb() asm volatile("lfence" ::: "memory")
119#define cpu_relax() asm volatile("rep; nop" ::: "memory");
120#endif
121
122#ifdef __powerpc__
123#define __NR_perf_counter_open 319
124#define rmb() asm volatile ("sync" ::: "memory")
125#define cpu_relax() asm volatile ("" ::: "memory");
126#endif
127
128#define unlikely(x) __builtin_expect(!!(x), 0)
129#define min(x, y) ({ \
130 typeof(x) _min1 = (x); \
131 typeof(y) _min2 = (y); \
132 (void) (&_min1 == &_min2); \
133 _min1 < _min2 ? _min1 : _min2; })
134
135extern asmlinkage int sys_perf_counter_open(
136 struct perf_counter_hw_event *hw_event_uptr __user,
137 pid_t pid,
138 int cpu,
139 int group_fd,
140 unsigned long flags);
141
142#define MAX_COUNTERS 64
143#define MAX_NR_CPUS 256
144
145#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
146
147static int system_wide = 0;
148
149static int nr_counters = 0;
150static __u64 event_id[MAX_COUNTERS] = {
151 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
152 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),
153 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
154 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),
155
156 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),
157 EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),
158 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),
159 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),
160};
161static int default_interval = 100000;
162static int event_count[MAX_COUNTERS];
163static int fd[MAX_NR_CPUS][MAX_COUNTERS];
164
165static int tid = -1;
166static int profile_cpu = -1;
167static int nr_cpus = 0;
168static int nmi = 1;
169static int group = 0;
170static unsigned int page_size;
171
172static int zero;
173
174static int scale;
175
176static const unsigned int default_count[] = {
177 1000000,
178 1000000,
179 10000,
180 10000,
181 1000000,
182 10000,
183};
184
185static char *hw_event_names[] = {
186 "CPU cycles",
187 "instructions",
188 "cache references",
189 "cache misses",
190 "branches",
191 "branch misses",
192 "bus cycles",
193};
194
195static char *sw_event_names[] = {
196 "cpu clock ticks",
197 "task clock ticks",
198 "pagefaults",
199 "context switches",
200 "CPU migrations",
201 "minor faults",
202 "major faults",
203};
204
205struct event_symbol {
206 __u64 event;
207 char *symbol;
208};
209
210static struct event_symbol event_symbols[] = {
211 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cpu-cycles", },
212 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cycles", },
213 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), "instructions", },
214 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), "cache-references", },
215 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), "cache-misses", },
216 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branch-instructions", },
217 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branches", },
218 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES), "branch-misses", },
219 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES), "bus-cycles", },
220
221 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK), "cpu-clock", },
222 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), "task-clock", },
223 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "page-faults", },
224 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "faults", },
225 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN), "minor-faults", },
226 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ), "major-faults", },
227 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "context-switches", },
228 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "cs", },
229 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "cpu-migrations", },
230 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "migrations", },
231};
232
233#define __PERF_COUNTER_FIELD(config, name) \
234 ((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
235
236#define PERF_COUNTER_RAW(config) __PERF_COUNTER_FIELD(config, RAW)
237#define PERF_COUNTER_CONFIG(config) __PERF_COUNTER_FIELD(config, CONFIG)
238#define PERF_COUNTER_TYPE(config) __PERF_COUNTER_FIELD(config, TYPE)
239#define PERF_COUNTER_ID(config) __PERF_COUNTER_FIELD(config, EVENT)
240
241static void display_events_help(void)
242{
243 unsigned int i;
244 __u64 e;
245
246 printf(
247 " -e EVENT --event=EVENT # symbolic-name abbreviations");
248
249 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
250 int type, id;
251
252 e = event_symbols[i].event;
253 type = PERF_COUNTER_TYPE(e);
254 id = PERF_COUNTER_ID(e);
255
256 printf("\n %d:%d: %-20s",
257 type, id, event_symbols[i].symbol);
258 }
259
260 printf("\n"
261 " rNNN: raw PMU events (eventsel+umask)\n\n");
262}
263
264static void display_help(void)
265{
266 printf(
267 "Usage: perfstat [<events...>] <cmd...>\n\n"
268 "PerfStat Options (up to %d event types can be specified):\n\n",
269 MAX_COUNTERS);
270
271 display_events_help();
272
273 printf(
274 " -l # scale counter values\n"
275 " -a # system-wide collection\n");
276 exit(0);
277}
278
279static char *event_name(int ctr)
280{
281 __u64 config = event_id[ctr];
282 int type = PERF_COUNTER_TYPE(config);
283 int id = PERF_COUNTER_ID(config);
284 static char buf[32];
285
286 if (PERF_COUNTER_RAW(config)) {
287 sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config));
288 return buf;
289 }
290
291 switch (type) {
292 case PERF_TYPE_HARDWARE:
293 if (id < PERF_HW_EVENTS_MAX)
294 return hw_event_names[id];
295 return "unknown-hardware";
296
297 case PERF_TYPE_SOFTWARE:
298 if (id < PERF_SW_EVENTS_MAX)
299 return sw_event_names[id];
300 return "unknown-software";
301
302 default:
303 break;
304 }
305
306 return "unknown";
307}
308
309/*
310 * Each event can have multiple symbolic names.
311 * Symbolic names are (almost) exactly matched.
312 */
313static __u64 match_event_symbols(char *str)
314{
315 __u64 config, id;
316 int type;
317 unsigned int i;
318
319 if (sscanf(str, "r%llx", &config) == 1)
320 return config | PERF_COUNTER_RAW_MASK;
321
322 if (sscanf(str, "%d:%llu", &type, &id) == 2)
323 return EID(type, id);
324
325 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
326 if (!strncmp(str, event_symbols[i].symbol,
327 strlen(event_symbols[i].symbol)))
328 return event_symbols[i].event;
329 }
330
331 return ~0ULL;
332}
333
334static int parse_events(char *str)
335{
336 __u64 config;
337
338again:
339 if (nr_counters == MAX_COUNTERS)
340 return -1;
341
342 config = match_event_symbols(str);
343 if (config == ~0ULL)
344 return -1;
345
346 event_id[nr_counters] = config;
347 nr_counters++;
348
349 str = strstr(str, ",");
350 if (str) {
351 str++;
352 goto again;
353 }
354
355 return 0;
356}
357
358
359/*
360 * perfstat
361 */
362
363char fault_here[1000000];
364
365static void create_perfstat_counter(int counter)
366{
367 struct perf_counter_hw_event hw_event;
368
369 memset(&hw_event, 0, sizeof(hw_event));
370 hw_event.config = event_id[counter];
371 hw_event.record_type = 0;
372 hw_event.nmi = 0;
373 if (scale)
374 hw_event.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
375 PERF_FORMAT_TOTAL_TIME_RUNNING;
376
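	/*
	 * System-wide mode opens one counter per online CPU (pid == -1);
	 * otherwise a single inherited counter is attached to the current
	 * task (pid == 0), created disabled so that counting only starts
	 * once do_perfstat() enables it via prctl().
	 */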
377 if (system_wide) {
378 int cpu;
379 for (cpu = 0; cpu < nr_cpus; cpu ++) {
380 fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0);
381 if (fd[cpu][counter] < 0) {
382 printf("perfstat error: syscall returned with %d (%s)\n",
383 fd[cpu][counter], strerror(errno));
384 exit(-1);
385 }
386 }
387 } else {
388 hw_event.inherit = 1;
389 hw_event.disabled = 1;
390
391 fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
392 if (fd[0][counter] < 0) {
393 printf("perfstat error: syscall returned with %d (%s)\n",
394 fd[0][counter], strerror(errno));
395 exit(-1);
396 }
397 }
398}
399
400int do_perfstat(int argc, char *argv[])
401{
402 unsigned long long t0, t1;
403 int counter;
404 ssize_t res;
405 int status;
406 int pid;
407
408 if (!system_wide)
409 nr_cpus = 1;
410
411 for (counter = 0; counter < nr_counters; counter++)
412 create_perfstat_counter(counter);
413
414 argc -= optind;
415 argv += optind;
416
417 if (!argc)
418 display_help();
419
420 /*
421 * Enable counters and exec the command:
422 */
423 t0 = rdclock();
424 prctl(PR_TASK_PERF_COUNTERS_ENABLE);
425
426 if ((pid = fork()) < 0)
427 perror("failed to fork");
428 if (!pid) {
429 if (execvp(argv[0], argv)) {
430 perror(argv[0]);
431 exit(-1);
432 }
433 }
434 while (wait(&status) >= 0)
435 ;
436 prctl(PR_TASK_PERF_COUNTERS_DISABLE);
437 t1 = rdclock();
438
439 fflush(stdout);
440
441 fprintf(stderr, "\n");
442 fprintf(stderr, " Performance counter stats for \'%s\':\n",
443 argv[0]);
444 fprintf(stderr, "\n");
445
446 for (counter = 0; counter < nr_counters; counter++) {
447 int cpu, nv;
448 __u64 count[3], single_count[3];
449 int scaled;
450
451 count[0] = count[1] = count[2] = 0;
452 nv = scale ? 3 : 1;
453 for (cpu = 0; cpu < nr_cpus; cpu ++) {
454 res = read(fd[cpu][counter],
455 single_count, nv * sizeof(__u64));
456 assert(res == nv * sizeof(__u64));
457
458 count[0] += single_count[0];
459 if (scale) {
460 count[1] += single_count[1];
461 count[2] += single_count[2];
462 }
463 }
464
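		/*
		 * With -l/--scale each read returns three values: the raw
		 * count, time_enabled and time_running (the
		 * PERF_FORMAT_TOTAL_TIME_* flags set above).  If the counter
		 * was only scheduled on the PMU for part of the run,
		 * extrapolate the raw count accordingly.
		 */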
465 scaled = 0;
466 if (scale) {
467 if (count[2] == 0) {
468 fprintf(stderr, " %14s %-20s\n",
469 "<not counted>", event_name(counter));
470 continue;
471 }
472 if (count[2] < count[1]) {
473 scaled = 1;
474 count[0] = (unsigned long long)
475 ((double)count[0] * count[1] / count[2] + 0.5);
476 }
477 }
478
479 if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK) ||
480 event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) {
481
482 double msecs = (double)count[0] / 1000000;
483
484 fprintf(stderr, " %14.6f %-20s (msecs)",
485 msecs, event_name(counter));
486 } else {
487 fprintf(stderr, " %14Ld %-20s (events)",
488 count[0], event_name(counter));
489 }
490 if (scaled)
491 fprintf(stderr, " (scaled from %.2f%%)",
492 (double) count[2] / count[1] * 100);
493 fprintf(stderr, "\n");
494 }
495 fprintf(stderr, "\n");
496 fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
497 (double)(t1-t0)/1e6);
498 fprintf(stderr, "\n");
499
500 return 0;
501}
502
503static void process_options(int argc, char **argv)
504{
505 int error = 0, counter;
506
507 for (;;) {
508 int option_index = 0;
509 /** Options for getopt */
510 static struct option long_options[] = {
511 {"count", required_argument, NULL, 'c'},
512 {"cpu", required_argument, NULL, 'C'},
513 {"delay", required_argument, NULL, 'd'},
514 {"dump_symtab", no_argument, NULL, 'D'},
515 {"event", required_argument, NULL, 'e'},
516 {"filter", required_argument, NULL, 'f'},
517 {"group", required_argument, NULL, 'g'},
518 {"help", no_argument, NULL, 'h'},
519 {"nmi", required_argument, NULL, 'n'},
520 {"munmap_info", no_argument, NULL, 'U'},
521 {"pid", required_argument, NULL, 'p'},
522 {"realtime", required_argument, NULL, 'r'},
523 {"scale", no_argument, NULL, 'l'},
524 {"symbol", required_argument, NULL, 's'},
525 {"stat", no_argument, NULL, 'S'},
526 {"vmlinux", required_argument, NULL, 'x'},
527 {"zero", no_argument, NULL, 'z'},
528 {NULL, 0, NULL, 0 }
529 };
530 int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:r:s:Sx:zMU",
531 long_options, &option_index);
532 if (c == -1)
533 break;
534
535 switch (c) {
536 case 'a': system_wide = 1; break;
537 case 'c': default_interval = atoi(optarg); break;
538 case 'C':
539 /* CPU and PID are mutually exclusive */
540 if (tid != -1) {
541 printf("WARNING: CPU switch overriding PID\n");
542 sleep(1);
543 tid = -1;
544 }
545 profile_cpu = atoi(optarg); break;
546
547 case 'e': error = parse_events(optarg); break;
548
549 case 'g': group = atoi(optarg); break;
550 case 'h': display_help(); break;
551 case 'l': scale = 1; break;
552 case 'n': nmi = atoi(optarg); break;
553 case 'p':
554 /* CPU and PID are mutually exclusive */
555 if (profile_cpu != -1) {
556 printf("WARNING: PID switch overriding CPU\n");
557 sleep(1);
558 profile_cpu = -1;
559 }
560 tid = atoi(optarg); break;
561 case 'z': zero = 1; break;
562 default: error = 1; break;
563 }
564 }
565 if (error)
566 display_help();
567
568 if (!nr_counters) {
569 nr_counters = 8;
570 }
571
572 for (counter = 0; counter < nr_counters; counter++) {
573 if (event_count[counter])
574 continue;
575
576 event_count[counter] = default_interval;
577 }
578}
579
580int cmd_stat(int argc, char **argv, const char *prefix)
581{
582 page_size = sysconf(_SC_PAGE_SIZE);
583
584 process_options(argc, argv);
585
586 nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
587 assert(nr_cpus <= MAX_NR_CPUS);
588 assert(nr_cpus >= 0);
589
590 return do_perfstat(argc, argv);
591}
diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
new file mode 100644
index 000000000000..6a276d2b2bbc
--- /dev/null
+++ b/Documentation/perf_counter/builtin-top.c
@@ -0,0 +1,1203 @@
1/*
2 * kerneltop.c: show top kernel functions - performance counters showcase
3
4 Build with:
5
6 cc -O6 -Wall -c -o kerneltop.o kerneltop.c -lrt
7
8 Sample output:
9
10------------------------------------------------------------------------------
11 KernelTop: 2669 irqs/sec [NMI, cache-misses/cache-refs], (all, cpu: 2)
12------------------------------------------------------------------------------
13
14 weight RIP kernel function
15 ______ ________________ _______________
16
17 35.20 - ffffffff804ce74b : skb_copy_and_csum_dev
18 33.00 - ffffffff804cb740 : sock_alloc_send_skb
19 31.26 - ffffffff804ce808 : skb_push
20 22.43 - ffffffff80510004 : tcp_established_options
21 19.00 - ffffffff8027d250 : find_get_page
22 15.76 - ffffffff804e4fc9 : eth_type_trans
23 15.20 - ffffffff804d8baa : dst_release
24 14.86 - ffffffff804cf5d8 : skb_release_head_state
25 14.00 - ffffffff802217d5 : read_hpet
26 12.00 - ffffffff804ffb7f : __ip_local_out
27 11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
28 8.54 - ffffffff805001a3 : ip_queue_xmit
29 */
30
31 /*
32 * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
33 *
34 * Improvements and fixes by:
35 *
36 * Arjan van de Ven <arjan@linux.intel.com>
37 * Yanmin Zhang <yanmin.zhang@intel.com>
38 * Wu Fengguang <fengguang.wu@intel.com>
39 * Mike Galbraith <efault@gmx.de>
40 * Paul Mackerras <paulus@samba.org>
41 *
42 * Released under the GPL v2. (and only v2, not any later version)
43 */
44
45#include "util/util.h"
46
47#include <getopt.h>
48#include <assert.h>
49#include <fcntl.h>
50#include <stdio.h>
51#include <errno.h>
52#include <time.h>
53#include <sched.h>
54#include <pthread.h>
55
56#include <sys/syscall.h>
57#include <sys/ioctl.h>
58#include <sys/poll.h>
59#include <sys/prctl.h>
60#include <sys/wait.h>
61#include <sys/uio.h>
62#include <sys/mman.h>
63
64#include <linux/unistd.h>
65#include <linux/types.h>
66
67#include "../../include/linux/perf_counter.h"
68
69
70/*
71 * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
72 * counters in the current task.
73 */
74#define PR_TASK_PERF_COUNTERS_DISABLE 31
75#define PR_TASK_PERF_COUNTERS_ENABLE 32
76
77#define rdclock() \
78({ \
79 struct timespec ts; \
80 \
81 clock_gettime(CLOCK_MONOTONIC, &ts); \
82 ts.tv_sec * 1000000000ULL + ts.tv_nsec; \
83})
84
85/*
86 * Pick up some kernel type conventions:
87 */
88#define __user
89#define asmlinkage
90
91#ifdef __x86_64__
92#define __NR_perf_counter_open 295
93#define rmb() asm volatile("lfence" ::: "memory")
94#define cpu_relax() asm volatile("rep; nop" ::: "memory");
95#endif
96
97#ifdef __i386__
98#define __NR_perf_counter_open 333
99#define rmb() asm volatile("lfence" ::: "memory")
100#define cpu_relax() asm volatile("rep; nop" ::: "memory");
101#endif
102
103#ifdef __powerpc__
104#define __NR_perf_counter_open 319
105#define rmb() asm volatile ("sync" ::: "memory")
106#define cpu_relax() asm volatile ("" ::: "memory");
107#endif
108
109#define unlikely(x) __builtin_expect(!!(x), 0)
110#define min(x, y) ({ \
111 typeof(x) _min1 = (x); \
112 typeof(y) _min2 = (y); \
113 (void) (&_min1 == &_min2); \
114 _min1 < _min2 ? _min1 : _min2; })
115
116asmlinkage int sys_perf_counter_open(
117 struct perf_counter_hw_event *hw_event_uptr __user,
118 pid_t pid,
119 int cpu,
120 int group_fd,
121 unsigned long flags)
122{
123 return syscall(
124 __NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags);
125}
126
127#define MAX_COUNTERS 64
128#define MAX_NR_CPUS 256
129
130#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
131
132static int system_wide = 0;
133
134static int nr_counters = 0;
135static __u64 event_id[MAX_COUNTERS] = {
136 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
137 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),
138 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
139 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),
140
141 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),
142 EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),
143 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),
144 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),
145};
146static int default_interval = 100000;
147static int event_count[MAX_COUNTERS];
148static int fd[MAX_NR_CPUS][MAX_COUNTERS];
149
150static __u64 count_filter = 100;
151
152static int tid = -1;
153static int profile_cpu = -1;
154static int nr_cpus = 0;
155static int nmi = 1;
156static unsigned int realtime_prio = 0;
157static int group = 0;
158static unsigned int page_size;
159static unsigned int mmap_pages = 16;
160static int use_mmap = 0;
161static int use_munmap = 0;
162
163static char *vmlinux;
164
165static char *sym_filter;
166static unsigned long filter_start;
167static unsigned long filter_end;
168
169static int delay_secs = 2;
170static int zero;
171static int dump_symtab;
172
173static int scale;
174
175struct source_line {
176 uint64_t EIP;
177 unsigned long count;
178 char *line;
179 struct source_line *next;
180};
181
182static struct source_line *lines;
183static struct source_line **lines_tail;
184
185static const unsigned int default_count[] = {
186 1000000,
187 1000000,
188 10000,
189 10000,
190 1000000,
191 10000,
192};
193
194static char *hw_event_names[] = {
195 "CPU cycles",
196 "instructions",
197 "cache references",
198 "cache misses",
199 "branches",
200 "branch misses",
201 "bus cycles",
202};
203
204static char *sw_event_names[] = {
205 "cpu clock ticks",
206 "task clock ticks",
207 "pagefaults",
208 "context switches",
209 "CPU migrations",
210 "minor faults",
211 "major faults",
212};
213
214struct event_symbol {
215 __u64 event;
216 char *symbol;
217};
218
219static struct event_symbol event_symbols[] = {
220 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cpu-cycles", },
221 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cycles", },
222 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), "instructions", },
223 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), "cache-references", },
224 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), "cache-misses", },
225 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branch-instructions", },
226 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branches", },
227 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES), "branch-misses", },
228 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES), "bus-cycles", },
229
230 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK), "cpu-clock", },
231 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), "task-clock", },
232 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "page-faults", },
233 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "faults", },
234 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN), "minor-faults", },
235 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ), "major-faults", },
236 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "context-switches", },
237 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "cs", },
238 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "cpu-migrations", },
239 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "migrations", },
240};
241
242#define __PERF_COUNTER_FIELD(config, name) \
243 ((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
244
245#define PERF_COUNTER_RAW(config) __PERF_COUNTER_FIELD(config, RAW)
246#define PERF_COUNTER_CONFIG(config) __PERF_COUNTER_FIELD(config, CONFIG)
247#define PERF_COUNTER_TYPE(config) __PERF_COUNTER_FIELD(config, TYPE)
248#define PERF_COUNTER_ID(config) __PERF_COUNTER_FIELD(config, EVENT)
249
250static void display_events_help(void)
251{
252 unsigned int i;
253 __u64 e;
254
255 printf(
256 " -e EVENT --event=EVENT # symbolic-name abbreviations");
257
258 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
259 int type, id;
260
261 e = event_symbols[i].event;
262 type = PERF_COUNTER_TYPE(e);
263 id = PERF_COUNTER_ID(e);
264
265 printf("\n %d:%d: %-20s",
266 type, id, event_symbols[i].symbol);
267 }
268
269 printf("\n"
270 " rNNN: raw PMU events (eventsel+umask)\n\n");
271}
272
273static void display_help(void)
274{
275 printf(
276 "Usage: kerneltop [<options>]\n"
277 " Or: kerneltop -S [<options>] COMMAND [ARGS]\n\n"
278 "KernelTop Options (up to %d event types can be specified at once):\n\n",
279 MAX_COUNTERS);
280
281 display_events_help();
282
283 printf(
284 " -c CNT --count=CNT # event period to sample\n\n"
285 " -C CPU --cpu=CPU # CPU (-1 for all) [default: -1]\n"
286 " -p PID --pid=PID # PID of sampled task (-1 for all) [default: -1]\n\n"
287 " -l # show scale factor for RR events\n"
288 " -d delay --delay=<seconds> # sampling/display delay [default: 2]\n"
289 " -f CNT --filter=CNT # min-event-count filter [default: 100]\n\n"
290 " -r prio --realtime=<prio> # event acquisition runs with SCHED_FIFO policy\n"
291 " -s symbol --symbol=<symbol> # function to be showed annotated one-shot\n"
292 " -x path --vmlinux=<path> # the vmlinux binary, required for -s use\n"
293 " -z --zero # zero counts after display\n"
294 " -D --dump_symtab # dump symbol table to stderr on startup\n"
295 " -m pages --mmap_pages=<pages> # number of mmap data pages\n"
296 " -M --mmap_info # print mmap info stream\n"
297 " -U --munmap_info # print munmap info stream\n"
298 );
299
300 exit(0);
301}
302
303static char *event_name(int ctr)
304{
305 __u64 config = event_id[ctr];
306 int type = PERF_COUNTER_TYPE(config);
307 int id = PERF_COUNTER_ID(config);
308 static char buf[32];
309
310 if (PERF_COUNTER_RAW(config)) {
311 sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config));
312 return buf;
313 }
314
315 switch (type) {
316 case PERF_TYPE_HARDWARE:
317 if (id < PERF_HW_EVENTS_MAX)
318 return hw_event_names[id];
319 return "unknown-hardware";
320
321 case PERF_TYPE_SOFTWARE:
322 if (id < PERF_SW_EVENTS_MAX)
323 return sw_event_names[id];
324 return "unknown-software";
325
326 default:
327 break;
328 }
329
330 return "unknown";
331}
332
333/*
334 * Each event can have multiple symbolic names.
335 * Symbolic names are (almost) exactly matched.
336 */
337static __u64 match_event_symbols(char *str)
338{
339 __u64 config, id;
340 int type;
341 unsigned int i;
342
343 if (sscanf(str, "r%llx", &config) == 1)
344 return config | PERF_COUNTER_RAW_MASK;
345
346 if (sscanf(str, "%d:%llu", &type, &id) == 2)
347 return EID(type, id);
348
349 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
350 if (!strncmp(str, event_symbols[i].symbol,
351 strlen(event_symbols[i].symbol)))
352 return event_symbols[i].event;
353 }
354
355 return ~0ULL;
356}
357
358static int parse_events(char *str)
359{
360 __u64 config;
361
362again:
363 if (nr_counters == MAX_COUNTERS)
364 return -1;
365
366 config = match_event_symbols(str);
367 if (config == ~0ULL)
368 return -1;
369
370 event_id[nr_counters] = config;
371 nr_counters++;
372
373 str = strstr(str, ",");
374 if (str) {
375 str++;
376 goto again;
377 }
378
379 return 0;
380}
381
382/*
383 * Symbols
384 */
385
386static uint64_t min_ip;
387static uint64_t max_ip = -1ll;
388
389struct sym_entry {
390 unsigned long long addr;
391 char *sym;
392 unsigned long count[MAX_COUNTERS];
393 int skip;
394 struct source_line *source;
395};
396
397#define MAX_SYMS 100000
398
399static int sym_table_count;
400
401struct sym_entry *sym_filter_entry;
402
403static struct sym_entry sym_table[MAX_SYMS];
404
405static void show_details(struct sym_entry *sym);
406
407/*
408 * Ordering weight: count-1 * count-2 * ... / count-n
409 */
410static double sym_weight(const struct sym_entry *sym)
411{
412 double weight;
413 int counter;
414
415 weight = sym->count[0];
416
417 for (counter = 1; counter < nr_counters-1; counter++)
418 weight *= sym->count[counter];
419
420 weight /= (sym->count[counter] + 1);
421
422 return weight;
423}
424
425static int compare(const void *__sym1, const void *__sym2)
426{
427 const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
428
429 return sym_weight(sym1) < sym_weight(sym2);
430}
431
432static long events;
433static long userspace_events;
434static const char CONSOLE_CLEAR[] = "";
435
436static struct sym_entry tmp[MAX_SYMS];
437
438static void print_sym_table(void)
439{
440 int i, printed;
441 int counter;
442 float events_per_sec = events/delay_secs;
443 float kevents_per_sec = (events-userspace_events)/delay_secs;
444 float sum_kevents = 0.0;
445
446 events = userspace_events = 0;
447 memcpy(tmp, sym_table, sizeof(sym_table[0])*sym_table_count);
448 qsort(tmp, sym_table_count, sizeof(tmp[0]), compare);
449
450 for (i = 0; i < sym_table_count && tmp[i].count[0]; i++)
451 sum_kevents += tmp[i].count[0];
452
453 write(1, CONSOLE_CLEAR, strlen(CONSOLE_CLEAR));
454
455 printf(
456"------------------------------------------------------------------------------\n");
457 printf( " KernelTop:%8.0f irqs/sec kernel:%4.1f%% [%s, ",
458 events_per_sec,
459 100.0 - (100.0*((events_per_sec-kevents_per_sec)/events_per_sec)),
460 nmi ? "NMI" : "IRQ");
461
462 if (nr_counters == 1)
463 printf("%d ", event_count[0]);
464
465 for (counter = 0; counter < nr_counters; counter++) {
466 if (counter)
467 printf("/");
468
469 printf("%s", event_name(counter));
470 }
471
472 printf( "], ");
473
474 if (tid != -1)
475 printf(" (tid: %d", tid);
476 else
477 printf(" (all");
478
479 if (profile_cpu != -1)
480 printf(", cpu: %d)\n", profile_cpu);
481 else {
482 if (tid != -1)
483 printf(")\n");
484 else
485 printf(", %d CPUs)\n", nr_cpus);
486 }
487
488 printf("------------------------------------------------------------------------------\n\n");
489
490 if (nr_counters == 1)
491 printf(" events pcnt");
492 else
493 printf(" weight events pcnt");
494
495 printf(" RIP kernel function\n"
496 " ______ ______ _____ ________________ _______________\n\n"
497 );
498
499 for (i = 0, printed = 0; i < sym_table_count; i++) {
500 float pcnt;
501 int count;
502
503 if (printed <= 18 && tmp[i].count[0] >= count_filter) {
504 pcnt = 100.0 - (100.0*((sum_kevents-tmp[i].count[0])/sum_kevents));
505
506 if (nr_counters == 1)
507 printf("%19.2f - %4.1f%% - %016llx : %s\n",
508 sym_weight(tmp + i),
509 pcnt, tmp[i].addr, tmp[i].sym);
510 else
511 printf("%8.1f %10ld - %4.1f%% - %016llx : %s\n",
512 sym_weight(tmp + i),
513 tmp[i].count[0],
514 pcnt, tmp[i].addr, tmp[i].sym);
515 printed++;
516 }
517 /*
518 * Add decay to the counts:
519 */
520 for (count = 0; count < nr_counters; count++)
521 sym_table[i].count[count] = zero ? 0 : sym_table[i].count[count] * 7 / 8;
522 }
523
524 if (sym_filter_entry)
525 show_details(sym_filter_entry);
526
527 {
528 struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
529
530 if (poll(&stdin_poll, 1, 0) == 1) {
531 printf("key pressed - exiting.\n");
532 exit(0);
533 }
534 }
535}
536
537static void *display_thread(void *arg)
538{
539 printf("KernelTop refresh period: %d seconds\n", delay_secs);
540
541 while (!sleep(delay_secs))
542 print_sym_table();
543
544 return NULL;
545}
546
547static int read_symbol(FILE *in, struct sym_entry *s)
548{
549 static int filter_match = 0;
550 char *sym, stype;
551 char str[500];
552 int rc, pos;
553
554 rc = fscanf(in, "%llx %c %499s", &s->addr, &stype, str);
555 if (rc == EOF)
556 return -1;
557
558 assert(rc == 3);
559
560 /* skip until end of line: */
561 pos = strlen(str);
562 do {
563 rc = fgetc(in);
564 if (rc == '\n' || rc == EOF || pos >= 499)
565 break;
566 str[pos] = rc;
567 pos++;
568 } while (1);
569 str[pos] = 0;
570
571 sym = str;
572
573 /* Filter out known duplicates and non-text symbols. */
574 if (!strcmp(sym, "_text"))
575 return 1;
576 if (!min_ip && !strcmp(sym, "_stext"))
577 return 1;
578 if (!strcmp(sym, "_etext") || !strcmp(sym, "_sinittext"))
579 return 1;
580 if (stype != 'T' && stype != 't')
581 return 1;
582 if (!strncmp("init_module", sym, 11) || !strncmp("cleanup_module", sym, 14))
583 return 1;
584 if (strstr(sym, "_text_start") || strstr(sym, "_text_end"))
585 return 1;
586
587	s->sym = malloc(strlen(str) + 1);
588 assert(s->sym);
589
590 strcpy((char *)s->sym, str);
591 s->skip = 0;
592
593 /* Tag events to be skipped. */
594 if (!strcmp("default_idle", s->sym) || !strcmp("cpu_idle", s->sym))
595 s->skip = 1;
596 else if (!strcmp("enter_idle", s->sym) || !strcmp("exit_idle", s->sym))
597 s->skip = 1;
598 else if (!strcmp("mwait_idle", s->sym))
599 s->skip = 1;
600
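	/*
	 * Track the address range of the --symbol filter: filter_start is
	 * set when the requested symbol is read, filter_end when the next
	 * symbol comes in, so [filter_start, filter_end) brackets the
	 * function to be annotated.
	 */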
601 if (filter_match == 1) {
602 filter_end = s->addr;
603 filter_match = -1;
604 if (filter_end - filter_start > 10000) {
605 printf("hm, too large filter symbol <%s> - skipping.\n",
606 sym_filter);
607 printf("symbol filter start: %016lx\n", filter_start);
608 printf(" end: %016lx\n", filter_end);
609 filter_end = filter_start = 0;
610 sym_filter = NULL;
611 sleep(1);
612 }
613 }
614 if (filter_match == 0 && sym_filter && !strcmp(s->sym, sym_filter)) {
615 filter_match = 1;
616 filter_start = s->addr;
617 }
618
619 return 0;
620}
621
622static int compare_addr(const void *__sym1, const void *__sym2)
623{
624 const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
625
626 return sym1->addr > sym2->addr;
627}
628
629static void sort_symbol_table(void)
630{
631 int i, dups;
632
633 do {
634 qsort(sym_table, sym_table_count, sizeof(sym_table[0]), compare_addr);
635		for (i = 0, dups = 0; i < sym_table_count - 1; i++) {
636 if (sym_table[i].addr == sym_table[i+1].addr) {
637 sym_table[i+1].addr = -1ll;
638 dups++;
639 }
640 }
641 sym_table_count -= dups;
642 } while(dups);
643}
644
645static void parse_symbols(void)
646{
647 struct sym_entry *last;
648
649 FILE *kallsyms = fopen("/proc/kallsyms", "r");
650
651 if (!kallsyms) {
652 printf("Could not open /proc/kallsyms - no CONFIG_KALLSYMS_ALL=y?\n");
653 exit(-1);
654 }
655
656 while (!feof(kallsyms)) {
657 if (read_symbol(kallsyms, &sym_table[sym_table_count]) == 0) {
658 sym_table_count++;
659 assert(sym_table_count <= MAX_SYMS);
660 }
661 }
662
663 sort_symbol_table();
664 min_ip = sym_table[0].addr;
665 max_ip = sym_table[sym_table_count-1].addr;
666 last = sym_table + sym_table_count++;
667
668 last->addr = -1ll;
669 last->sym = "<end>";
670
671 if (filter_end) {
672 int count;
673 for (count=0; count < sym_table_count; count ++) {
674 if (!strcmp(sym_table[count].sym, sym_filter)) {
675 sym_filter_entry = &sym_table[count];
676 break;
677 }
678 }
679 }
680 if (dump_symtab) {
681 int i;
682
683 for (i = 0; i < sym_table_count; i++)
684 fprintf(stderr, "%llx %s\n",
685 sym_table[i].addr, sym_table[i].sym);
686 }
687}
688
689/*
690 * Source lines
691 */
692
693static void parse_vmlinux(char *filename)
694{
695 FILE *file;
696 char command[PATH_MAX*2];
697 if (!filename)
698 return;
699
700 sprintf(command, "objdump --start-address=0x%016lx --stop-address=0x%016lx -dS %s", filter_start, filter_end, filename);
701
702 file = popen(command, "r");
703 if (!file)
704 return;
705
706 lines_tail = &lines;
707 while (!feof(file)) {
708 struct source_line *src;
709 size_t dummy = 0;
710 char *c;
711
712 src = malloc(sizeof(struct source_line));
713 assert(src != NULL);
714 memset(src, 0, sizeof(struct source_line));
715
716 if (getline(&src->line, &dummy, file) < 0)
717 break;
718 if (!src->line)
719 break;
720
721 c = strchr(src->line, '\n');
722 if (c)
723 *c = 0;
724
725 src->next = NULL;
726 *lines_tail = src;
727 lines_tail = &src->next;
728
729 if (strlen(src->line)>8 && src->line[8] == ':')
730 src->EIP = strtoull(src->line, NULL, 16);
731		if (strlen(src->line)>16 && src->line[16] == ':')
732 src->EIP = strtoull(src->line, NULL, 16);
733 }
734 pclose(file);
735}
736
737static void record_precise_ip(uint64_t ip)
738{
739 struct source_line *line;
740
741 for (line = lines; line; line = line->next) {
742 if (line->EIP == ip)
743 line->count++;
744 if (line->EIP > ip)
745 break;
746 }
747}
748
749static void lookup_sym_in_vmlinux(struct sym_entry *sym)
750{
751 struct source_line *line;
752 char pattern[PATH_MAX];
753 sprintf(pattern, "<%s>:", sym->sym);
754
755 for (line = lines; line; line = line->next) {
756 if (strstr(line->line, pattern)) {
757 sym->source = line;
758 break;
759 }
760 }
761}
762
763static void show_lines(struct source_line *line_queue, int line_queue_count)
764{
765 int i;
766 struct source_line *line;
767
768 line = line_queue;
769 for (i = 0; i < line_queue_count; i++) {
770 printf("%8li\t%s\n", line->count, line->line);
771 line = line->next;
772 }
773}
774
775#define TRACE_COUNT 3
776
777static void show_details(struct sym_entry *sym)
778{
779 struct source_line *line;
780 struct source_line *line_queue = NULL;
781 int displayed = 0;
782 int line_queue_count = 0;
783
784 if (!sym->source)
785 lookup_sym_in_vmlinux(sym);
786 if (!sym->source)
787 return;
788
789 printf("Showing details for %s\n", sym->sym);
790
791 line = sym->source;
792 while (line) {
793 if (displayed && strstr(line->line, ">:"))
794 break;
795
796 if (!line_queue_count)
797 line_queue = line;
798 line_queue_count ++;
799
800 if (line->count >= count_filter) {
801 show_lines(line_queue, line_queue_count);
802 line_queue_count = 0;
803 line_queue = NULL;
804 } else if (line_queue_count > TRACE_COUNT) {
805 line_queue = line_queue->next;
806 line_queue_count --;
807 }
808
809 line->count = 0;
810 displayed++;
811 if (displayed > 300)
812 break;
813 line = line->next;
814 }
815}
816
817/*
818 * Binary search in the histogram table and record the hit:
819 */
820static void record_ip(uint64_t ip, int counter)
821{
822 int left_idx, middle_idx, right_idx, idx;
823 unsigned long left, middle, right;
824
825 record_precise_ip(ip);
826
827 left_idx = 0;
828 right_idx = sym_table_count-1;
829 assert(ip <= max_ip && ip >= min_ip);
830
831 while (left_idx + 1 < right_idx) {
832 middle_idx = (left_idx + right_idx) / 2;
833
834 left = sym_table[ left_idx].addr;
835 middle = sym_table[middle_idx].addr;
836 right = sym_table[ right_idx].addr;
837
838 if (!(left <= middle && middle <= right)) {
839 printf("%016lx...\n%016lx...\n%016lx\n", left, middle, right);
840 printf("%d %d %d\n", left_idx, middle_idx, right_idx);
841 }
842 assert(left <= middle && middle <= right);
843 if (!(left <= ip && ip <= right)) {
844 printf(" left: %016lx\n", left);
845 printf(" ip: %016lx\n", (unsigned long)ip);
846 printf("right: %016lx\n", right);
847 }
848 assert(left <= ip && ip <= right);
849 /*
850 * [ left .... target .... middle .... right ]
851 * => right := middle
852 */
853 if (ip < middle) {
854 right_idx = middle_idx;
855 continue;
856 }
857 /*
858 * [ left .... middle ... target ... right ]
859 * => left := middle
860 */
861 left_idx = middle_idx;
862 }
863
864 idx = left_idx;
865
866 if (!sym_table[idx].skip)
867 sym_table[idx].count[counter]++;
868 else events--;
869}
870
871static void process_event(uint64_t ip, int counter)
872{
873 events++;
874
875 if (ip < min_ip || ip > max_ip) {
876 userspace_events++;
877 return;
878 }
879
880 record_ip(ip, counter);
881}
882
883static void process_options(int argc, char **argv)
884{
885 int error = 0, counter;
886
887 for (;;) {
888 int option_index = 0;
889 /** Options for getopt */
890 static struct option long_options[] = {
891 {"count", required_argument, NULL, 'c'},
892 {"cpu", required_argument, NULL, 'C'},
893 {"delay", required_argument, NULL, 'd'},
894 {"dump_symtab", no_argument, NULL, 'D'},
895 {"event", required_argument, NULL, 'e'},
896 {"filter", required_argument, NULL, 'f'},
897 {"group", required_argument, NULL, 'g'},
898 {"help", no_argument, NULL, 'h'},
899 {"nmi", required_argument, NULL, 'n'},
900 {"mmap_info", no_argument, NULL, 'M'},
901 {"mmap_pages", required_argument, NULL, 'm'},
902 {"munmap_info", no_argument, NULL, 'U'},
903 {"pid", required_argument, NULL, 'p'},
904 {"realtime", required_argument, NULL, 'r'},
905 {"scale", no_argument, NULL, 'l'},
906 {"symbol", required_argument, NULL, 's'},
907 {"stat", no_argument, NULL, 'S'},
908 {"vmlinux", required_argument, NULL, 'x'},
909 {"zero", no_argument, NULL, 'z'},
910 {NULL, 0, NULL, 0 }
911 };
912 int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:r:s:Sx:zMU",
913 long_options, &option_index);
914 if (c == -1)
915 break;
916
917 switch (c) {
918 case 'a': system_wide = 1; break;
919 case 'c': default_interval = atoi(optarg); break;
920 case 'C':
921 /* CPU and PID are mutually exclusive */
922 if (tid != -1) {
923 printf("WARNING: CPU switch overriding PID\n");
924 sleep(1);
925 tid = -1;
926 }
927 profile_cpu = atoi(optarg); break;
928 case 'd': delay_secs = atoi(optarg); break;
929 case 'D': dump_symtab = 1; break;
930
931 case 'e': error = parse_events(optarg); break;
932
933 case 'f': count_filter = atoi(optarg); break;
934 case 'g': group = atoi(optarg); break;
935 case 'h': display_help(); break;
936 case 'l': scale = 1; break;
937 case 'n': nmi = atoi(optarg); break;
938 case 'p':
939 /* CPU and PID are mutually exclusive */
940 if (profile_cpu != -1) {
941 printf("WARNING: PID switch overriding CPU\n");
942 sleep(1);
943 profile_cpu = -1;
944 }
945 tid = atoi(optarg); break;
946 case 'r': realtime_prio = atoi(optarg); break;
947 case 's': sym_filter = strdup(optarg); break;
948 case 'x': vmlinux = strdup(optarg); break;
949 case 'z': zero = 1; break;
950 case 'm': mmap_pages = atoi(optarg); break;
951 case 'M': use_mmap = 1; break;
952 case 'U': use_munmap = 1; break;
953 default: error = 1; break;
954 }
955 }
956 if (error)
957 display_help();
958
959 if (!nr_counters) {
960 nr_counters = 1;
961 event_id[0] = 0;
962 }
963
964 for (counter = 0; counter < nr_counters; counter++) {
965 if (event_count[counter])
966 continue;
967
968 event_count[counter] = default_interval;
969 }
970}
971
972struct mmap_data {
973 int counter;
974 void *base;
975 unsigned int mask;
976 unsigned int prev;
977};
978
979static unsigned int mmap_read_head(struct mmap_data *md)
980{
981 struct perf_counter_mmap_page *pc = md->base;
982 int head;
983
984 head = pc->data_head;
985 rmb();
986
987 return head;
988}
989
990struct timeval last_read, this_read;
991
992static void mmap_read(struct mmap_data *md)
993{
994 unsigned int head = mmap_read_head(md);
995 unsigned int old = md->prev;
996 unsigned char *data = md->base + page_size;
997 int diff;
998
999 gettimeofday(&this_read, NULL);
1000
1001 /*
1002 * If we're further behind than half the buffer, there's a chance
1003 * the writer will bite our tail and screw up the events under us.
1004 *
1005 * If we somehow ended up ahead of the head, we got messed up.
1006 *
1007 * In either case, truncate and restart at head.
1008 */
1009 diff = head - old;
1010 if (diff > md->mask / 2 || diff < 0) {
1011 struct timeval iv;
1012 unsigned long msecs;
1013
1014 timersub(&this_read, &last_read, &iv);
1015 msecs = iv.tv_sec*1000 + iv.tv_usec/1000;
1016
1017 fprintf(stderr, "WARNING: failed to keep up with mmap data."
1018 " Last read %lu msecs ago.\n", msecs);
1019
1020 /*
1021 * head points to a known good entry, start there.
1022 */
1023 old = head;
1024 }
1025
1026 last_read = this_read;
1027
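	/*
	 * Walk the newly arrived records one at a time: each starts with a
	 * perf_event_header carrying its type, misc flags and total size,
	 * followed by the type-specific payload.
	 */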
1028 for (; old != head;) {
1029 struct ip_event {
1030 struct perf_event_header header;
1031 __u64 ip;
1032 __u32 pid, tid;
1033 };
1034 struct mmap_event {
1035 struct perf_event_header header;
1036 __u32 pid, tid;
1037 __u64 start;
1038 __u64 len;
1039 __u64 pgoff;
1040 char filename[PATH_MAX];
1041 };
1042
1043 typedef union event_union {
1044 struct perf_event_header header;
1045 struct ip_event ip;
1046 struct mmap_event mmap;
1047 } event_t;
1048
1049 event_t *event = (event_t *)&data[old & md->mask];
1050
1051 event_t event_copy;
1052
1053 size_t size = event->header.size;
1054
1055 /*
1056 * Event straddles the mmap boundary -- header should always
1057 * be inside due to u64 alignment of output.
1058 */
1059 if ((old & md->mask) + size != ((old + size) & md->mask)) {
1060 unsigned int offset = old;
1061 unsigned int len = min(sizeof(*event), size), cpy;
1062 void *dst = &event_copy;
1063
1064 do {
1065 cpy = min(md->mask + 1 - (offset & md->mask), len);
1066 memcpy(dst, &data[offset & md->mask], cpy);
1067 offset += cpy;
1068 dst += cpy;
1069 len -= cpy;
1070 } while (len);
1071
1072 event = &event_copy;
1073 }
1074
1075 old += size;
1076
1077 if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) {
1078 if (event->header.type & PERF_RECORD_IP)
1079 process_event(event->ip.ip, md->counter);
1080 } else {
1081 switch (event->header.type) {
1082 case PERF_EVENT_MMAP:
1083 case PERF_EVENT_MUNMAP:
1084 printf("%s: %Lu %Lu %Lu %s\n",
1085 event->header.type == PERF_EVENT_MMAP
1086 ? "mmap" : "munmap",
1087 event->mmap.start,
1088 event->mmap.len,
1089 event->mmap.pgoff,
1090 event->mmap.filename);
1091 break;
1092 }
1093 }
1094 }
1095
1096 md->prev = old;
1097}
1098
1099int cmd_top(int argc, char **argv, const char *prefix)
1100{
1101 struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
1102 struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
1103 struct perf_counter_hw_event hw_event;
1104 pthread_t thread;
1105 int i, counter, group_fd, nr_poll = 0;
1106 unsigned int cpu;
1107 int ret;
1108
1109 page_size = sysconf(_SC_PAGE_SIZE);
1110
1111 process_options(argc, argv);
1112
1113 nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
1114 assert(nr_cpus <= MAX_NR_CPUS);
1115 assert(nr_cpus >= 0);
1116
1117 if (tid != -1 || profile_cpu != -1)
1118 nr_cpus = 1;
1119
1120 parse_symbols();
1121 if (vmlinux && sym_filter_entry)
1122 parse_vmlinux(vmlinux);
1123
1124 for (i = 0; i < nr_cpus; i++) {
1125 group_fd = -1;
1126 for (counter = 0; counter < nr_counters; counter++) {
1127
1128 cpu = profile_cpu;
1129 if (tid == -1 && profile_cpu == -1)
1130 cpu = i;
1131
1132 memset(&hw_event, 0, sizeof(hw_event));
1133 hw_event.config = event_id[counter];
1134 hw_event.irq_period = event_count[counter];
1135 hw_event.record_type = PERF_RECORD_IP | PERF_RECORD_TID;
1136 hw_event.nmi = nmi;
1137 hw_event.mmap = use_mmap;
1138 hw_event.munmap = use_munmap;
1139
1140 fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0);
1141 if (fd[i][counter] < 0) {
1142 int err = errno;
1143 printf("kerneltop error: syscall returned with %d (%s)\n",
1144 fd[i][counter], strerror(err));
1145 if (err == EPERM)
1146 printf("Are you root?\n");
1147 exit(-1);
1148 }
1149 assert(fd[i][counter] >= 0);
1150 fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
1151
1152 /*
1153 * First counter acts as the group leader:
1154 */
1155 if (group && group_fd == -1)
1156 group_fd = fd[i][counter];
1157
1158 event_array[nr_poll].fd = fd[i][counter];
1159 event_array[nr_poll].events = POLLIN;
1160 nr_poll++;
1161
1162 mmap_array[i][counter].counter = counter;
1163 mmap_array[i][counter].prev = 0;
1164 mmap_array[i][counter].mask = mmap_pages*page_size - 1;
1165 mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
1166 PROT_READ, MAP_SHARED, fd[i][counter], 0);
1167 if (mmap_array[i][counter].base == MAP_FAILED) {
1168 printf("kerneltop error: failed to mmap with %d (%s)\n",
1169 errno, strerror(errno));
1170 exit(-1);
1171 }
1172 }
1173 }
1174
1175 if (pthread_create(&thread, NULL, display_thread, NULL)) {
1176 printf("Could not create display thread.\n");
1177 exit(-1);
1178 }
1179
1180 if (realtime_prio) {
1181 struct sched_param param;
1182
1183 param.sched_priority = realtime_prio;
1184 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
1185 printf("Could not set realtime priority.\n");
1186 exit(-1);
1187 }
1188 }
1189
1190 while (1) {
1191 int hits = events;
1192
1193 for (i = 0; i < nr_cpus; i++) {
1194 for (counter = 0; counter < nr_counters; counter++)
1195 mmap_read(&mmap_array[i][counter]);
1196 }
1197
1198 if (hits == events)
1199 ret = poll(event_array, nr_poll, 100);
1200 }
1201
1202 return 0;
1203}
diff --git a/Documentation/perf_counter/builtin.h b/Documentation/perf_counter/builtin.h
new file mode 100644
index 000000000000..d32318aed8cf
--- /dev/null
+++ b/Documentation/perf_counter/builtin.h
@@ -0,0 +1,22 @@
1#ifndef BUILTIN_H
2#define BUILTIN_H
3
4#include "util/util.h"
5#include "util/strbuf.h"
6
7extern const char perf_version_string[];
8extern const char perf_usage_string[];
9extern const char perf_more_info_string[];
10
11extern void list_common_cmds_help(void);
12extern const char *help_unknown_cmd(const char *cmd);
13extern void prune_packed_objects(int);
14extern int read_line_with_nul(char *buf, int size, FILE *file);
15extern int check_pager_config(const char *cmd);
16
17extern int cmd_help(int argc, const char **argv, const char *prefix);
18extern int cmd_record(int argc, const char **argv, const char *prefix);
19extern int cmd_stat(int argc, const char **argv, const char *prefix);
20extern int cmd_top(int argc, const char **argv, const char *prefix);
21extern int cmd_version(int argc, const char **argv, const char *prefix);
22#endif
diff --git a/Documentation/perf_counter/command-list.txt b/Documentation/perf_counter/command-list.txt
new file mode 100644
index 000000000000..d15210aa0cae
--- /dev/null
+++ b/Documentation/perf_counter/command-list.txt
@@ -0,0 +1,6 @@
1# List of known perf commands.
2# command name category [deprecated] [common]
3perf-record mainporcelain common
4perf-stat mainporcelain common
5perf-top mainporcelain common
6
diff --git a/Documentation/perf_counter/design.txt b/Documentation/perf_counter/design.txt
new file mode 100644
index 000000000000..aaf105c02fba
--- /dev/null
+++ b/Documentation/perf_counter/design.txt
@@ -0,0 +1,283 @@
1
2Performance Counters for Linux
3------------------------------
4
5Performance counters are special hardware registers available on most modern
   6CPUs. These registers count the number of certain types of hardware events,
   7such as instructions executed, cache misses suffered, or branches
   8mis-predicted, without slowing down the kernel or applications. These
   9registers can also trigger interrupts when a threshold number of events has
  10passed, and can thus be used to profile the code that runs on that CPU.
11
12The Linux Performance Counter subsystem provides an abstraction of these
13hardware capabilities. It provides per task and per CPU counters, counter
14groups, and it provides event capabilities on top of those. It
15provides "virtual" 64-bit counters, regardless of the width of the
16underlying hardware counters.
17
18Performance counters are accessed via special file descriptors.
19There's one file descriptor per virtual counter used.
20
21The special file descriptor is opened via the perf_counter_open()
22system call:
23
24 int sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr,
25 pid_t pid, int cpu, int group_fd,
26 unsigned long flags);
27
28The syscall returns the new fd. The fd can be used via the normal
29VFS system calls: read() can be used to read the counter, fcntl()
30can be used to set the blocking mode, etc.
31
32Multiple counters can be kept open at a time, and the counters
33can be poll()ed.
34
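A minimal sketch of issuing the call directly via syscall(2), assuming
__NR_perf_counter_open is defined for the target architecture (there is no
dedicated C library wrapper for this call):

#include <unistd.h>
#include <sys/syscall.h>

/* sketch of a raw syscall wrapper */
static int sys_perf_counter_open(struct perf_counter_hw_event *hw_event,
				 pid_t pid, int cpu, int group_fd,
				 unsigned long flags)
{
	return syscall(__NR_perf_counter_open, hw_event, pid, cpu,
		       group_fd, flags);
}
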
35When creating a new counter fd, 'perf_counter_hw_event' is:
36
37/*
38 * Event to monitor via a performance monitoring counter:
39 */
40struct perf_counter_hw_event {
41 __u64 event_config;
42
43 __u64 irq_period;
44 __u64 record_type;
45 __u64 read_format;
46
47 __u64 disabled : 1, /* off by default */
48 nmi : 1, /* NMI sampling */
49 inherit : 1, /* children inherit it */
50 pinned : 1, /* must always be on PMU */
51 exclusive : 1, /* only group on PMU */
52 exclude_user : 1, /* don't count user */
53 exclude_kernel : 1, /* ditto kernel */
54 exclude_hv : 1, /* ditto hypervisor */
55 exclude_idle : 1, /* don't count when idle */
56
57 __reserved_1 : 55;
58
59 __u32 extra_config_len;
60
61 __u32 __reserved_4;
62 __u64 __reserved_2;
63 __u64 __reserved_3;
64};
65
66The 'event_config' field specifies what the counter should count. It
67is divided into 3 bit-fields:
68
69raw_type: 1 bit (most significant bit) 0x8000_0000_0000_0000
70type: 7 bits (next most significant) 0x7f00_0000_0000_0000
  71event_id: 56 bits (least significant)   0x00ff_ffff_ffff_ffff
72
73If 'raw_type' is 1, then the counter will count a hardware event
74specified by the remaining 63 bits of event_config. The encoding is
75machine-specific.
76
77If 'raw_type' is 0, then the 'type' field says what kind of counter
78this is, with the following encoding:
79
80enum perf_event_types {
81 PERF_TYPE_HARDWARE = 0,
82 PERF_TYPE_SOFTWARE = 1,
83 PERF_TYPE_TRACEPOINT = 2,
84};
85
86A counter of PERF_TYPE_HARDWARE will count the hardware event
87specified by 'event_id':
88
89/*
90 * Generalized performance counter event types, used by the hw_event.event_id
91 * parameter of the sys_perf_counter_open() syscall:
92 */
93enum hw_event_ids {
94 /*
95 * Common hardware events, generalized by the kernel:
96 */
97 PERF_COUNT_CPU_CYCLES = 0,
98 PERF_COUNT_INSTRUCTIONS = 1,
99 PERF_COUNT_CACHE_REFERENCES = 2,
100 PERF_COUNT_CACHE_MISSES = 3,
101 PERF_COUNT_BRANCH_INSTRUCTIONS = 4,
102 PERF_COUNT_BRANCH_MISSES = 5,
103 PERF_COUNT_BUS_CYCLES = 6,
104};
105
106These are standardized types of events that work relatively uniformly
107on all CPUs that implement Performance Counters support under Linux,
108although there may be variations (e.g., different CPUs might count
109cache references and misses at different levels of the cache hierarchy).
110If a CPU is not able to count the selected event, then the system call
111will return -EINVAL.
112
113More hw_event_types are supported as well, but they are CPU-specific
114and accessed as raw events. For example, to count "External bus
115cycles while bus lock signal asserted" events on Intel Core CPUs, pass
116in a 0x4064 event_id value and set hw_event.raw_type to 1.
117
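As a sketch, such a raw counter could be encoded like this (assuming the raw
bit is the most significant bit of event_config, as laid out above):

	struct perf_counter_hw_event hw_event;

	memset(&hw_event, 0, sizeof(hw_event));
	/* raw_type = 1 in the MSB, machine-specific code in the low bits */
	hw_event.event_config = (1ULL << 63) | 0x4064;
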
118A counter of type PERF_TYPE_SOFTWARE will count one of the available
119software events, selected by 'event_id':
120
121/*
122 * Special "software" counters provided by the kernel, even if the hardware
123 * does not support performance counters. These counters measure various
124 * physical and sw events of the kernel (and allow the profiling of them as
125 * well):
126 */
127enum sw_event_ids {
128 PERF_COUNT_CPU_CLOCK = 0,
129 PERF_COUNT_TASK_CLOCK = 1,
130 PERF_COUNT_PAGE_FAULTS = 2,
131 PERF_COUNT_CONTEXT_SWITCHES = 3,
132 PERF_COUNT_CPU_MIGRATIONS = 4,
133 PERF_COUNT_PAGE_FAULTS_MIN = 5,
134 PERF_COUNT_PAGE_FAULTS_MAJ = 6,
135};
136
137Counters come in two flavours: counting counters and sampling
138counters. A "counting" counter is one that is used for counting the
139number of events that occur, and is characterised by having
140irq_period = 0 and record_type = PERF_RECORD_SIMPLE. A read() on a
141counting counter simply returns the current value of the counter as
142an 8-byte number.
143
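For example, a counting counter of instructions executed by the current task
might be used as follows (a sketch only; error handling omitted, the usual
headers plus <linux/perf_counter.h> assumed, and sys_perf_counter_open() is
the syscall wrapper sketched above):

	struct perf_counter_hw_event hw_event;
	unsigned long long count;
	int fd;

	memset(&hw_event, 0, sizeof(hw_event));
	hw_event.event_config = PERF_COUNT_INSTRUCTIONS;	/* PERF_TYPE_HARDWARE */
	hw_event.irq_period   = 0;				/* counting counter */
	hw_event.record_type  = PERF_RECORD_SIMPLE;

	fd = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);	/* current task, any CPU */
	/* ... run the code to be measured ... */
	read(fd, &count, sizeof(count));			/* current value, 8 bytes */
	close(fd);
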
144A "sampling" counter is one that is set up to generate an interrupt
145every N events, where N is given by 'irq_period'. A sampling counter
146has irq_period > 0 and record_type != PERF_RECORD_SIMPLE. The
147record_type controls what data is recorded on each interrupt, and the
148available values are currently:
149
150/*
151 * IRQ-notification data record type:
152 */
153enum perf_counter_record_type {
154 PERF_RECORD_SIMPLE = 0,
155 PERF_RECORD_IRQ = 1,
156 PERF_RECORD_GROUP = 2,
157};
158
159A record_type value of PERF_RECORD_IRQ will record the instruction
160pointer (IP) at which the interrupt occurred. A record_type value of
161PERF_RECORD_GROUP will record the event_config and counter value of
162all of the other counters in the group, and should only be used on a
163group leader (see below). Currently these two values are mutually
164exclusive, but record_type will become a bit-mask in future and
165support other values.
166
167A sampling counter has an event queue, into which an event is placed
168on each interrupt. A read() on a sampling counter will read the next
169event from the event queue. If the queue is empty, the read() will
170either block or return an EAGAIN error, depending on whether the fd
171has been set to non-blocking mode or not.
172
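A sampling counter could be set up along these lines (a sketch, continuing
the example above):

	memset(&hw_event, 0, sizeof(hw_event));
	hw_event.event_config = PERF_COUNT_CPU_CYCLES;
	hw_event.irq_period   = 100000;			/* one sample per 100000 events */
	hw_event.record_type  = PERF_RECORD_IRQ;	/* record the IP on each sample */
	hw_event.nmi          = 1;			/* ask for NMI-based sampling */

	fd = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
	fcntl(fd, F_SETFL, O_NONBLOCK);	/* make read() fail with EAGAIN instead of blocking */
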
173The 'disabled' bit specifies whether the counter starts out disabled
174or enabled. If it is initially disabled, it can be enabled by ioctl
175or prctl (see below).
176
177The 'nmi' bit specifies, for hardware events, whether the counter
178should be set up to request non-maskable interrupts (NMIs) or normal
179interrupts. This bit is ignored if the user doesn't have
180CAP_SYS_ADMIN privilege (i.e. is not root) or if the CPU doesn't
181generate NMIs from hardware counters.
182
183The 'inherit' bit, if set, specifies that this counter should count
184events on descendant tasks as well as the task specified. This only
 185applies to new descendants, not to any existing descendants at the
 186time the counter is created (nor to any new descendants of existing
 187descendants).
188
189The 'pinned' bit, if set, specifies that the counter should always be
190on the CPU if at all possible. It only applies to hardware counters
191and only to group leaders. If a pinned counter cannot be put onto the
192CPU (e.g. because there are not enough hardware counters or because of
193a conflict with some other event), then the counter goes into an
194'error' state, where reads return end-of-file (i.e. read() returns 0)
195until the counter is subsequently enabled or disabled.
196
197The 'exclusive' bit, if set, specifies that when this counter's group
198is on the CPU, it should be the only group using the CPU's counters.
199In future, this will allow sophisticated monitoring programs to supply
200extra configuration information via 'extra_config_len' to exploit
201advanced features of the CPU's Performance Monitor Unit (PMU) that are
202not otherwise accessible and that might disrupt other hardware
203counters.
204
205The 'exclude_user', 'exclude_kernel' and 'exclude_hv' bits provide a
206way to request that counting of events be restricted to times when the
207CPU is in user, kernel and/or hypervisor mode.
208
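For instance, a cycle counter restricted to user mode would set (sketch):

	hw_event.event_config   = PERF_COUNT_CPU_CYCLES;
	hw_event.exclude_kernel = 1;	/* don't count while in the kernel */
	hw_event.exclude_hv     = 1;	/* don't count while in the hypervisor */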
209
210The 'pid' parameter to the perf_counter_open() system call allows the
211counter to be specific to a task:
212
213 pid == 0: if the pid parameter is zero, the counter is attached to the
214 current task.
215
216 pid > 0: the counter is attached to a specific task (if the current task
217 has sufficient privilege to do so)
218
219 pid < 0: all tasks are counted (per cpu counters)
220
221The 'cpu' parameter allows a counter to be made specific to a CPU:
222
223 cpu >= 0: the counter is restricted to a specific CPU
224 cpu == -1: the counter counts on all CPUs
225
226(Note: the combination of 'pid == -1' and 'cpu == -1' is not valid.)
227
228A 'pid > 0' and 'cpu == -1' counter is a per task counter that counts
229events of that task and 'follows' that task to whatever CPU the task
 230gets scheduled to. Per task counters can be created by any user, for
231their own tasks.
232
233A 'pid == -1' and 'cpu == x' counter is a per CPU counter that counts
234all events on CPU-x. Per CPU counters need CAP_SYS_ADMIN privilege.
235
236The 'flags' parameter is currently unused and must be zero.
237
238The 'group_fd' parameter allows counter "groups" to be set up. A
239counter group has one counter which is the group "leader". The leader
240is created first, with group_fd = -1 in the perf_counter_open call
241that creates it. The rest of the group members are created
242subsequently, with group_fd giving the fd of the group leader.
243(A single counter on its own is created with group_fd = -1 and is
244considered to be a group with only 1 member.)
245
246A counter group is scheduled onto the CPU as a unit, that is, it will
247only be put onto the CPU if all of the counters in the group can be
248put onto the CPU. This means that the values of the member counters
249can be meaningfully compared, added, divided (to get ratios), etc.,
250with each other, since they have counted events for the same set of
251executed instructions.
252
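A sketch of creating a two-counter group on the current task, e.g. to derive
instructions per cycle: the first counter becomes the leader, and its fd is
passed as group_fd when opening the second:

	int leader_fd, member_fd;

	memset(&hw_event, 0, sizeof(hw_event));
	hw_event.event_config = PERF_COUNT_CPU_CYCLES;
	leader_fd = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);	/* group leader */

	hw_event.event_config = PERF_COUNT_INSTRUCTIONS;
	member_fd = sys_perf_counter_open(&hw_event, 0, -1, leader_fd, 0); /* group member */
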
253Counters can be enabled and disabled in two ways: via ioctl and via
254prctl. When a counter is disabled, it doesn't count or generate
255events but does continue to exist and maintain its count value.
256
257An individual counter or counter group can be enabled with
258
259 ioctl(fd, PERF_COUNTER_IOC_ENABLE);
260
261or disabled with
262
263 ioctl(fd, PERF_COUNTER_IOC_DISABLE);
264
265Enabling or disabling the leader of a group enables or disables the
266whole group; that is, while the group leader is disabled, none of the
267counters in the group will count. Enabling or disabling a member of a
 268group other than the leader only affects that counter - disabling a
269non-leader stops that counter from counting but doesn't affect any
270other counter.
271
272A process can enable or disable all the counter groups that are
273attached to it, using prctl:
274
275 prctl(PR_TASK_PERF_COUNTERS_ENABLE);
276
277 prctl(PR_TASK_PERF_COUNTERS_DISABLE);
278
279This applies to all counters on the current process, whether created
280by this process or by another, and doesn't affect any counters that
281this process has created on other processes. It only enables or
282disables the group leaders, not any other members in the groups.
283
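Putting these pieces together, a counter created with the 'disabled' bit set
can be enabled just around a region of interest (sketch, continuing the
counting example above):

	hw_event.disabled = 1;				/* start out disabled */
	fd = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);

	ioctl(fd, PERF_COUNTER_IOC_ENABLE);		/* start counting */
	/* ... region of interest ... */
	ioctl(fd, PERF_COUNTER_IOC_DISABLE);		/* stop counting */
	read(fd, &count, sizeof(count));
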
diff --git a/Documentation/perf_counter/perf-report.cc b/Documentation/perf_counter/perf-report.cc
new file mode 100644
index 000000000000..1727317352bf
--- /dev/null
+++ b/Documentation/perf_counter/perf-report.cc
@@ -0,0 +1,479 @@
1#define _GNU_SOURCE
2#include <sys/types.h>
3#include <sys/stat.h>
4#include <sys/time.h>
5#include <unistd.h>
6#include <stdint.h>
7#include <stdlib.h>
8#include <string.h>
9#include <limits.h>
10#include <fcntl.h>
11#include <stdio.h>
12#include <errno.h>
13#include <ctype.h>
14#include <time.h>
15#include <getopt.h>
16
17#include <sys/ioctl.h>
18#include <sys/poll.h>
19#include <sys/prctl.h>
20#include <sys/wait.h>
21#include <sys/mman.h>
22#include <sys/types.h>
23#include <sys/stat.h>
24
25#include <linux/unistd.h>
26#include <linux/types.h>
27
28#include "../../include/linux/perf_counter.h"
29
30#include <set>
31#include <map>
32#include <string>
33
34
35static char const *input_name = "output.perf";
36static int input;
37
38static unsigned long page_size;
39static unsigned long mmap_window = 32;
40
41struct ip_event {
42 struct perf_event_header header;
43 __u64 ip;
44 __u32 pid, tid;
45};
46struct mmap_event {
47 struct perf_event_header header;
48 __u32 pid, tid;
49 __u64 start;
50 __u64 len;
51 __u64 pgoff;
52 char filename[PATH_MAX];
53};
54struct comm_event {
55 struct perf_event_header header;
  56	__u32 pid, tid;
57 char comm[16];
58};
59
60typedef union event_union {
61 struct perf_event_header header;
62 struct ip_event ip;
63 struct mmap_event mmap;
64 struct comm_event comm;
65} event_t;
66
67struct section {
68 uint64_t start;
69 uint64_t end;
70
71 uint64_t offset;
72
73 std::string name;
74
75 section() { };
76
77 section(uint64_t stab) : end(stab) { };
78
79 section(uint64_t start, uint64_t size, uint64_t offset, std::string name) :
80 start(start), end(start + size), offset(offset), name(name)
81 { };
82
83 bool operator < (const struct section &s) const {
84 return end < s.end;
85 };
86};
87
88typedef std::set<struct section> sections_t;
89
90struct symbol {
91 uint64_t start;
92 uint64_t end;
93
94 std::string name;
95
96 symbol() { };
97
98 symbol(uint64_t ip) : start(ip) { }
99
100 symbol(uint64_t start, uint64_t len, std::string name) :
101 start(start), end(start + len), name(name)
102 { };
103
104 bool operator < (const struct symbol &s) const {
105 return start < s.start;
106 };
107};
108
109typedef std::set<struct symbol> symbols_t;
110
111struct dso {
112 sections_t sections;
113 symbols_t syms;
114};
115
116static std::map<std::string, struct dso> dsos;
117
118static void load_dso_sections(std::string dso_name)
119{
120 struct dso &dso = dsos[dso_name];
121
122 std::string cmd = "readelf -DSW " + dso_name;
123
124 FILE *file = popen(cmd.c_str(), "r");
125 if (!file) {
126 perror("failed to open pipe");
127 exit(-1);
128 }
129
130 char *line = NULL;
131 size_t n = 0;
132
133 while (!feof(file)) {
134 uint64_t addr, off, size;
135 char name[32];
136
137 if (getline(&line, &n, file) < 0)
138 break;
139 if (!line)
140 break;
141
142 if (sscanf(line, " [%*2d] %16s %*14s %Lx %Lx %Lx",
143 name, &addr, &off, &size) == 4) {
144
145 dso.sections.insert(section(addr, size, addr - off, name));
146 }
147#if 0
148 /*
149 * for reading readelf symbols (-s), however these don't seem
150 * to include nearly everything, so use nm for that.
151 */
152 if (sscanf(line, " %*4d %*3d: %Lx %5Lu %*7s %*6s %*7s %3d %s",
153 &start, &size, &section, sym) == 4) {
154
155 start -= dso.section_offsets[section];
156
157 dso.syms.insert(symbol(start, size, std::string(sym)));
158 }
159#endif
160 }
161 pclose(file);
162}
163
164static void load_dso_symbols(std::string dso_name, std::string args)
165{
166 struct dso &dso = dsos[dso_name];
167
168 std::string cmd = "nm -nSC " + args + " " + dso_name;
169
170 FILE *file = popen(cmd.c_str(), "r");
171 if (!file) {
172 perror("failed to open pipe");
173 exit(-1);
174 }
175
176 char *line = NULL;
177 size_t n = 0;
178
179 while (!feof(file)) {
180 uint64_t start, size;
181 char c;
182 char sym[1024];
183
184 if (getline(&line, &n, file) < 0)
185 break;
186 if (!line)
187 break;
188
189
190 if (sscanf(line, "%Lx %Lx %c %s", &start, &size, &c, sym) == 4) {
191 sections_t::const_iterator si =
192 dso.sections.upper_bound(section(start));
193 if (si == dso.sections.end()) {
194 printf("symbol in unknown section: %s\n", sym);
195 continue;
196 }
197
198 start -= si->offset;
199
200 dso.syms.insert(symbol(start, size, sym));
201 }
202 }
203 pclose(file);
204}
205
206static void load_dso(std::string dso_name)
207{
208 load_dso_sections(dso_name);
209 load_dso_symbols(dso_name, "-D"); /* dynamic symbols */
210 load_dso_symbols(dso_name, ""); /* regular ones */
211}
212
213void load_kallsyms(void)
214{
215 struct dso &dso = dsos["[kernel]"];
216
217 FILE *file = fopen("/proc/kallsyms", "r");
218 if (!file) {
219 perror("failed to open kallsyms");
220 exit(-1);
221 }
222
 223	char *line = NULL;
 224	size_t n = 0;
225
226 while (!feof(file)) {
227 uint64_t start;
228 char c;
229 char sym[1024];
230
231 if (getline(&line, &n, file) < 0)
232 break;
233 if (!line)
234 break;
235
236 if (sscanf(line, "%Lx %c %s", &start, &c, sym) == 3)
237 dso.syms.insert(symbol(start, 0x1000000, std::string(sym)));
238 }
239 fclose(file);
240}
241
242struct map {
243 uint64_t start;
244 uint64_t end;
245 uint64_t pgoff;
246
247 std::string dso;
248
249 map() { };
250
251 map(uint64_t ip) : end(ip) { }
252
253 map(mmap_event *mmap) {
254 start = mmap->start;
255 end = mmap->start + mmap->len;
256 pgoff = mmap->pgoff;
257
258 dso = std::string(mmap->filename);
259
260 if (dsos.find(dso) == dsos.end())
261 load_dso(dso);
262 };
263
264 bool operator < (const struct map &m) const {
265 return end < m.end;
266 };
267};
268
269typedef std::set<struct map> maps_t;
270
271static std::map<int, maps_t> maps;
272
273static std::map<int, std::string> comms;
274
275static std::map<std::string, int> hist;
276static std::multimap<int, std::string> rev_hist;
277
278static std::string resolve_comm(int pid)
279{
280 std::string comm;
281
282 std::map<int, std::string>::const_iterator ci = comms.find(pid);
283 if (ci != comms.end()) {
284 comm = ci->second;
285 } else {
286 char pid_str[30];
287
288 sprintf(pid_str, ":%d", pid);
289 comm = pid_str;
290 }
291
292 return comm;
293}
294
295static std::string resolve_user_symbol(int pid, uint64_t ip)
296{
297 std::string sym = "<unknown>";
298
299 maps_t &m = maps[pid];
300 maps_t::const_iterator mi = m.upper_bound(map(ip));
301 if (mi == m.end())
302 return sym;
303
304 ip -= mi->start + mi->pgoff;
305
306 symbols_t &s = dsos[mi->dso].syms;
307 symbols_t::const_iterator si = s.upper_bound(symbol(ip));
308
309 sym = mi->dso + ": <unknown>";
310
311 if (si == s.begin())
312 return sym;
313 si--;
314
315 if (si->start <= ip && ip < si->end)
316 sym = mi->dso + ": " + si->name;
317#if 0
318 else if (si->start <= ip)
319 sym = mi->dso + ": ?" + si->name;
320#endif
321
322 return sym;
323}
324
325static std::string resolve_kernel_symbol(uint64_t ip)
326{
327 std::string sym = "<unknown>";
328
329 symbols_t &s = dsos["[kernel]"].syms;
330 symbols_t::const_iterator si = s.upper_bound(symbol(ip));
331
332 if (si == s.begin())
333 return sym;
334 si--;
335
336 if (si->start <= ip && ip < si->end)
337 sym = si->name;
338
339 return sym;
340}
341
342static void display_help(void)
343{
344 printf(
345 "Usage: perf-report [<options>]\n"
346 " -i file --input=<file> # input file\n"
347 );
348
349 exit(0);
350}
351
352static void process_options(int argc, char *argv[])
353{
354 int error = 0;
355
356 for (;;) {
357 int option_index = 0;
358 /** Options for getopt */
359 static struct option long_options[] = {
360 {"input", required_argument, NULL, 'i'},
361 {NULL, 0, NULL, 0 }
362 };
363 int c = getopt_long(argc, argv, "+:i:",
364 long_options, &option_index);
365 if (c == -1)
366 break;
367
368 switch (c) {
369 case 'i': input_name = strdup(optarg); break;
370 default: error = 1; break;
371 }
372 }
373
374 if (error)
375 display_help();
376}
377
378int main(int argc, char *argv[])
379{
380 unsigned long offset = 0;
381 unsigned long head = 0;
382 struct stat stat;
383 char *buf;
384 event_t *event;
385 int ret;
386 unsigned long total = 0;
387
388 page_size = getpagesize();
389
390 process_options(argc, argv);
391
392 input = open(input_name, O_RDONLY);
393 if (input < 0) {
394 perror("failed to open file");
395 exit(-1);
396 }
397
398 ret = fstat(input, &stat);
399 if (ret < 0) {
400 perror("failed to stat file");
401 exit(-1);
402 }
403
404 load_kallsyms();
405
406remap:
407 buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ,
408 MAP_SHARED, input, offset);
409 if (buf == MAP_FAILED) {
410 perror("failed to mmap file");
411 exit(-1);
412 }
413
414more:
415 event = (event_t *)(buf + head);
416
417 if (head + event->header.size >= page_size * mmap_window) {
418 unsigned long shift = page_size * (head / page_size);
419
420 munmap(buf, page_size * mmap_window);
421 offset += shift;
422 head -= shift;
423 goto remap;
424 }
425 head += event->header.size;
426
427 if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) {
428 std::string comm, sym, level;
429 char output[1024];
430
431 if (event->header.misc & PERF_EVENT_MISC_KERNEL) {
432 level = " [k] ";
433 sym = resolve_kernel_symbol(event->ip.ip);
434 } else if (event->header.misc & PERF_EVENT_MISC_USER) {
435 level = " [.] ";
436 sym = resolve_user_symbol(event->ip.pid, event->ip.ip);
437 } else {
438 level = " [H] ";
439 }
440 comm = resolve_comm(event->ip.pid);
441
442 snprintf(output, sizeof(output), "%16s %s %s",
443 comm.c_str(), level.c_str(), sym.c_str());
444 hist[output]++;
445
446 total++;
447
448 } else switch (event->header.type) {
449 case PERF_EVENT_MMAP:
450 maps[event->mmap.pid].insert(map(&event->mmap));
451 break;
452
453 case PERF_EVENT_COMM:
454 comms[event->comm.pid] = std::string(event->comm.comm);
455 break;
456 }
457
458 if (offset + head < stat.st_size)
459 goto more;
460
461 close(input);
462
463 std::map<std::string, int>::iterator hi = hist.begin();
464
465 while (hi != hist.end()) {
466 rev_hist.insert(std::pair<int, std::string>(hi->second, hi->first));
467 hist.erase(hi++);
468 }
469
470 std::multimap<int, std::string>::const_iterator ri = rev_hist.begin();
471
472 while (ri != rev_hist.end()) {
473 printf(" %5.2f %s\n", (100.0 * ri->first)/total, ri->second.c_str());
474 ri++;
475 }
476
477 return 0;
478}
479
diff --git a/Documentation/perf_counter/perf.c b/Documentation/perf_counter/perf.c
new file mode 100644
index 000000000000..594d270be390
--- /dev/null
+++ b/Documentation/perf_counter/perf.c
@@ -0,0 +1,414 @@
1#include "builtin.h"
2#include "util/exec_cmd.h"
3#include "util/cache.h"
4#include "util/quote.h"
5#include "util/run-command.h"
6
7const char perf_usage_string[] =
8 "perf [--version] [--help] COMMAND [ARGS]";
9
10const char perf_more_info_string[] =
11 "See 'perf help COMMAND' for more information on a specific command.";
12
13static int use_pager = -1;
14struct pager_config {
15 const char *cmd;
16 int val;
17};
18
19static int pager_command_config(const char *var, const char *value, void *data)
20{
21 struct pager_config *c = data;
22 if (!prefixcmp(var, "pager.") && !strcmp(var + 6, c->cmd))
23 c->val = perf_config_bool(var, value);
24 return 0;
25}
26
27/* returns 0 for "no pager", 1 for "use pager", and -1 for "not specified" */
28int check_pager_config(const char *cmd)
29{
30 struct pager_config c;
31 c.cmd = cmd;
32 c.val = -1;
33 perf_config(pager_command_config, &c);
34 return c.val;
35}
36
37static void commit_pager_choice(void) {
38 switch (use_pager) {
39 case 0:
40 setenv("PERF_PAGER", "cat", 1);
41 break;
42 case 1:
43 /* setup_pager(); */
44 break;
45 default:
46 break;
47 }
48}
49
50static int handle_options(const char*** argv, int* argc, int* envchanged)
51{
52 int handled = 0;
53
54 while (*argc > 0) {
55 const char *cmd = (*argv)[0];
56 if (cmd[0] != '-')
57 break;
58
59 /*
60 * For legacy reasons, the "version" and "help"
61 * commands can be written with "--" prepended
62 * to make them look like flags.
63 */
64 if (!strcmp(cmd, "--help") || !strcmp(cmd, "--version"))
65 break;
66
67 /*
68 * Check remaining flags.
69 */
70 if (!prefixcmp(cmd, "--exec-path")) {
71 cmd += 11;
72 if (*cmd == '=')
73 perf_set_argv_exec_path(cmd + 1);
74 else {
75 puts(perf_exec_path());
76 exit(0);
77 }
78 } else if (!strcmp(cmd, "--html-path")) {
79 puts(system_path(PERF_HTML_PATH));
80 exit(0);
81 } else if (!strcmp(cmd, "-p") || !strcmp(cmd, "--paginate")) {
82 use_pager = 1;
83 } else if (!strcmp(cmd, "--no-pager")) {
84 use_pager = 0;
85 if (envchanged)
86 *envchanged = 1;
87 } else if (!strcmp(cmd, "--perf-dir")) {
88 if (*argc < 2) {
89 fprintf(stderr, "No directory given for --perf-dir.\n" );
90 usage(perf_usage_string);
91 }
92 setenv(PERF_DIR_ENVIRONMENT, (*argv)[1], 1);
93 if (envchanged)
94 *envchanged = 1;
95 (*argv)++;
96 (*argc)--;
97 handled++;
98 } else if (!prefixcmp(cmd, "--perf-dir=")) {
  99			setenv(PERF_DIR_ENVIRONMENT, cmd + 11, 1);
100 if (envchanged)
101 *envchanged = 1;
102 } else if (!strcmp(cmd, "--work-tree")) {
103 if (*argc < 2) {
104 fprintf(stderr, "No directory given for --work-tree.\n" );
105 usage(perf_usage_string);
106 }
107 setenv(PERF_WORK_TREE_ENVIRONMENT, (*argv)[1], 1);
108 if (envchanged)
109 *envchanged = 1;
110 (*argv)++;
111 (*argc)--;
112 } else if (!prefixcmp(cmd, "--work-tree=")) {
113 setenv(PERF_WORK_TREE_ENVIRONMENT, cmd + 12, 1);
114 if (envchanged)
115 *envchanged = 1;
116 } else {
117 fprintf(stderr, "Unknown option: %s\n", cmd);
118 usage(perf_usage_string);
119 }
120
121 (*argv)++;
122 (*argc)--;
123 handled++;
124 }
125 return handled;
126}
127
128static int handle_alias(int *argcp, const char ***argv)
129{
130 int envchanged = 0, ret = 0, saved_errno = errno;
131 int count, option_count;
132 const char** new_argv;
133 const char *alias_command;
134 char *alias_string;
135
136 alias_command = (*argv)[0];
137 alias_string = alias_lookup(alias_command);
138 if (alias_string) {
139 if (alias_string[0] == '!') {
140 if (*argcp > 1) {
141 struct strbuf buf;
142
143 strbuf_init(&buf, PATH_MAX);
144 strbuf_addstr(&buf, alias_string);
145 sq_quote_argv(&buf, (*argv) + 1, PATH_MAX);
146 free(alias_string);
147 alias_string = buf.buf;
148 }
149 ret = system(alias_string + 1);
150 if (ret >= 0 && WIFEXITED(ret) &&
151 WEXITSTATUS(ret) != 127)
152 exit(WEXITSTATUS(ret));
153 die("Failed to run '%s' when expanding alias '%s'",
154 alias_string + 1, alias_command);
155 }
156 count = split_cmdline(alias_string, &new_argv);
157 if (count < 0)
158 die("Bad alias.%s string", alias_command);
159 option_count = handle_options(&new_argv, &count, &envchanged);
160 if (envchanged)
161 die("alias '%s' changes environment variables\n"
162 "You can use '!perf' in the alias to do this.",
163 alias_command);
164 memmove(new_argv - option_count, new_argv,
165 count * sizeof(char *));
166 new_argv -= option_count;
167
168 if (count < 1)
169 die("empty alias for %s", alias_command);
170
171 if (!strcmp(alias_command, new_argv[0]))
172 die("recursive alias: %s", alias_command);
173
174 new_argv = realloc(new_argv, sizeof(char*) *
175 (count + *argcp + 1));
176 /* insert after command name */
177 memcpy(new_argv + count, *argv + 1, sizeof(char*) * *argcp);
178 new_argv[count+*argcp] = NULL;
179
180 *argv = new_argv;
181 *argcp += count - 1;
182
183 ret = 1;
184 }
185
186 errno = saved_errno;
187
188 return ret;
189}
190
191const char perf_version_string[] = PERF_VERSION;
192
193#define RUN_SETUP (1<<0)
194#define USE_PAGER (1<<1)
195/*
 196 * require working tree to be present -- anything that uses this needs
197 * RUN_SETUP for reading from the configuration file.
198 */
199#define NEED_WORK_TREE (1<<2)
200
201struct cmd_struct {
202 const char *cmd;
203 int (*fn)(int, const char **, const char *);
204 int option;
205};
206
207static int run_builtin(struct cmd_struct *p, int argc, const char **argv)
208{
209 int status;
210 struct stat st;
211 const char *prefix;
212
213 prefix = NULL;
214 if (p->option & RUN_SETUP)
215 prefix = NULL; /* setup_perf_directory(); */
216
217 if (use_pager == -1 && p->option & RUN_SETUP)
218 use_pager = check_pager_config(p->cmd);
219 if (use_pager == -1 && p->option & USE_PAGER)
220 use_pager = 1;
221 commit_pager_choice();
222
223 if (p->option & NEED_WORK_TREE)
224 /* setup_work_tree() */;
225
226 status = p->fn(argc, argv, prefix);
227 if (status)
228 return status & 0xff;
229
230 /* Somebody closed stdout? */
231 if (fstat(fileno(stdout), &st))
232 return 0;
233 /* Ignore write errors for pipes and sockets.. */
234 if (S_ISFIFO(st.st_mode) || S_ISSOCK(st.st_mode))
235 return 0;
236
237 /* Check for ENOSPC and EIO errors.. */
238 if (fflush(stdout))
239 die("write failure on standard output: %s", strerror(errno));
240 if (ferror(stdout))
241 die("unknown write failure on standard output");
242 if (fclose(stdout))
243 die("close failed on standard output: %s", strerror(errno));
244 return 0;
245}
246
247static void handle_internal_command(int argc, const char **argv)
248{
249 const char *cmd = argv[0];
250 static struct cmd_struct commands[] = {
251 { "help", cmd_help, 0 },
252 { "record", cmd_record, 0 },
253 { "stat", cmd_stat, 0 },
254 { "top", cmd_top, 0 },
255 { "version", cmd_version, 0 },
256 };
257 int i;
258 static const char ext[] = STRIP_EXTENSION;
259
260 if (sizeof(ext) > 1) {
261 i = strlen(argv[0]) - strlen(ext);
262 if (i > 0 && !strcmp(argv[0] + i, ext)) {
263 char *argv0 = strdup(argv[0]);
264 argv[0] = cmd = argv0;
265 argv0[i] = '\0';
266 }
267 }
268
269 /* Turn "perf cmd --help" into "perf help cmd" */
270 if (argc > 1 && !strcmp(argv[1], "--help")) {
271 argv[1] = argv[0];
272 argv[0] = cmd = "help";
273 }
274
275 for (i = 0; i < ARRAY_SIZE(commands); i++) {
276 struct cmd_struct *p = commands+i;
277 if (strcmp(p->cmd, cmd))
278 continue;
279 exit(run_builtin(p, argc, argv));
280 }
281}
282
283static void execv_dashed_external(const char **argv)
284{
285 struct strbuf cmd = STRBUF_INIT;
286 const char *tmp;
287 int status;
288
289 strbuf_addf(&cmd, "perf-%s", argv[0]);
290
291 /*
292 * argv[0] must be the perf command, but the argv array
293 * belongs to the caller, and may be reused in
294 * subsequent loop iterations. Save argv[0] and
295 * restore it on error.
296 */
297 tmp = argv[0];
298 argv[0] = cmd.buf;
299
300 /*
301 * if we fail because the command is not found, it is
302 * OK to return. Otherwise, we just pass along the status code.
303 */
304 status = run_command_v_opt(argv, 0);
305 if (status != -ERR_RUN_COMMAND_EXEC) {
306 if (IS_RUN_COMMAND_ERR(status))
307 die("unable to run '%s'", argv[0]);
308 exit(-status);
309 }
310 errno = ENOENT; /* as if we called execvp */
311
312 argv[0] = tmp;
313
314 strbuf_release(&cmd);
315}
316
317static int run_argv(int *argcp, const char ***argv)
318{
319 int done_alias = 0;
320
321 while (1) {
322 /* See if it's an internal command */
323 handle_internal_command(*argcp, *argv);
324
325 /* .. then try the external ones */
326 execv_dashed_external(*argv);
327
328 /* It could be an alias -- this works around the insanity
329 * of overriding "perf log" with "perf show" by having
330 * alias.log = show
331 */
332 if (done_alias || !handle_alias(argcp, argv))
333 break;
334 done_alias = 1;
335 }
336
337 return done_alias;
338}
339
340
341int main(int argc, const char **argv)
342{
343 const char *cmd;
344
345 cmd = perf_extract_argv0_path(argv[0]);
346 if (!cmd)
347 cmd = "perf-help";
348
349 /*
350 * "perf-xxxx" is the same as "perf xxxx", but we obviously:
351 *
352 * - cannot take flags in between the "perf" and the "xxxx".
353 * - cannot execute it externally (since it would just do
354 * the same thing over again)
355 *
356 * So we just directly call the internal command handler, and
357 * die if that one cannot handle it.
358 */
359 if (!prefixcmp(cmd, "perf-")) {
 360		cmd += 5;
361 argv[0] = cmd;
362 handle_internal_command(argc, argv);
363 die("cannot handle %s internally", cmd);
364 }
365
366 /* Look for flags.. */
367 argv++;
368 argc--;
369 handle_options(&argv, &argc, NULL);
370 commit_pager_choice();
371 if (argc > 0) {
372 if (!prefixcmp(argv[0], "--"))
373 argv[0] += 2;
374 } else {
375 /* The user didn't specify a command; give them help */
376 printf("usage: %s\n\n", perf_usage_string);
377 list_common_cmds_help();
378 printf("\n%s\n", perf_more_info_string);
379 exit(1);
380 }
381 cmd = argv[0];
382
383 /*
384 * We use PATH to find perf commands, but we prepend some higher
 385	 * precedence paths: the "--exec-path" option, the PERF_EXEC_PATH
386 * environment, and the $(perfexecdir) from the Makefile at build
387 * time.
388 */
389 setup_path();
390
391 while (1) {
392 static int done_help = 0;
393 static int was_alias = 0;
394 was_alias = run_argv(&argc, &argv);
395 if (errno != ENOENT)
396 break;
397 if (was_alias) {
398 fprintf(stderr, "Expansion of alias '%s' failed; "
399 "'%s' is not a perf-command\n",
400 cmd, argv[0]);
401 exit(1);
402 }
403 if (!done_help) {
404 cmd = argv[0] = help_unknown_cmd(cmd);
405 done_help = 1;
406 } else
407 break;
408 }
409
410 fprintf(stderr, "Failed to run command '%s': %s\n",
411 cmd, strerror(errno));
412
413 return 1;
414}
diff --git a/Documentation/perf_counter/util/PERF-VERSION-GEN b/Documentation/perf_counter/util/PERF-VERSION-GEN
new file mode 100755
index 000000000000..c561d1538c03
--- /dev/null
+++ b/Documentation/perf_counter/util/PERF-VERSION-GEN
@@ -0,0 +1,42 @@
1#!/bin/sh
2
3GVF=PERF-VERSION-FILE
4DEF_VER=v0.0.1.PERF
5
6LF='
7'
8
9# First see if there is a version file (included in release tarballs),
10# then try git-describe, then default.
11if test -f version
12then
13 VN=$(cat version) || VN="$DEF_VER"
14elif test -d .git -o -f .git &&
15 VN=$(git describe --abbrev=4 HEAD 2>/dev/null) &&
16 case "$VN" in
17 *$LF*) (exit 1) ;;
18 v[0-9]*)
19 git update-index -q --refresh
20 test -z "$(git diff-index --name-only HEAD --)" ||
21 VN="$VN-dirty" ;;
22 esac
23then
24 VN=$(echo "$VN" | sed -e 's/-/./g');
25else
26 VN="$DEF_VER"
27fi
28
29VN=$(expr "$VN" : v*'\(.*\)')
30
31if test -r $GVF
32then
33 VC=$(sed -e 's/^PERF_VERSION = //' <$GVF)
34else
35 VC=unset
36fi
37test "$VN" = "$VC" || {
38 echo >&2 "PERF_VERSION = $VN"
39 echo "PERF_VERSION = $VN" >$GVF
40}
41
42
diff --git a/Documentation/perf_counter/util/abspath.c b/Documentation/perf_counter/util/abspath.c
new file mode 100644
index 000000000000..649f34f83365
--- /dev/null
+++ b/Documentation/perf_counter/util/abspath.c
@@ -0,0 +1,117 @@
1#include "cache.h"
2
3/*
4 * Do not use this for inspecting *tracked* content. When path is a
5 * symlink to a directory, we do not want to say it is a directory when
6 * dealing with tracked content in the working tree.
7 */
8int is_directory(const char *path)
9{
10 struct stat st;
11 return (!stat(path, &st) && S_ISDIR(st.st_mode));
12}
13
14/* We allow "recursive" symbolic links. Only within reason, though. */
15#define MAXDEPTH 5
16
17const char *make_absolute_path(const char *path)
18{
19 static char bufs[2][PATH_MAX + 1], *buf = bufs[0], *next_buf = bufs[1];
20 char cwd[1024] = "";
21 int buf_index = 1, len;
22
23 int depth = MAXDEPTH;
24 char *last_elem = NULL;
25 struct stat st;
26
27 if (strlcpy(buf, path, PATH_MAX) >= PATH_MAX)
28 die ("Too long path: %.*s", 60, path);
29
30 while (depth--) {
31 if (!is_directory(buf)) {
32 char *last_slash = strrchr(buf, '/');
33 if (last_slash) {
34 *last_slash = '\0';
35 last_elem = xstrdup(last_slash + 1);
36 } else {
37 last_elem = xstrdup(buf);
38 *buf = '\0';
39 }
40 }
41
42 if (*buf) {
43 if (!*cwd && !getcwd(cwd, sizeof(cwd)))
44 die ("Could not get current working directory");
45
46 if (chdir(buf))
47 die ("Could not switch to '%s'", buf);
48 }
49 if (!getcwd(buf, PATH_MAX))
50 die ("Could not get current working directory");
51
52 if (last_elem) {
53 int len = strlen(buf);
54 if (len + strlen(last_elem) + 2 > PATH_MAX)
55 die ("Too long path name: '%s/%s'",
56 buf, last_elem);
57 buf[len] = '/';
58 strcpy(buf + len + 1, last_elem);
59 free(last_elem);
60 last_elem = NULL;
61 }
62
63 if (!lstat(buf, &st) && S_ISLNK(st.st_mode)) {
64 len = readlink(buf, next_buf, PATH_MAX);
65 if (len < 0)
66 die ("Invalid symlink: %s", buf);
67 if (PATH_MAX <= len)
68 die("symbolic link too long: %s", buf);
69 next_buf[len] = '\0';
70 buf = next_buf;
71 buf_index = 1 - buf_index;
72 next_buf = bufs[buf_index];
73 } else
74 break;
75 }
76
77 if (*cwd && chdir(cwd))
78 die ("Could not change back to '%s'", cwd);
79
80 return buf;
81}
82
83static const char *get_pwd_cwd(void)
84{
85 static char cwd[PATH_MAX + 1];
86 char *pwd;
87 struct stat cwd_stat, pwd_stat;
88 if (getcwd(cwd, PATH_MAX) == NULL)
89 return NULL;
90 pwd = getenv("PWD");
91 if (pwd && strcmp(pwd, cwd)) {
92 stat(cwd, &cwd_stat);
93 if (!stat(pwd, &pwd_stat) &&
94 pwd_stat.st_dev == cwd_stat.st_dev &&
95 pwd_stat.st_ino == cwd_stat.st_ino) {
96 strlcpy(cwd, pwd, PATH_MAX);
97 }
98 }
99 return cwd;
100}
101
102const char *make_nonrelative_path(const char *path)
103{
104 static char buf[PATH_MAX + 1];
105
106 if (is_absolute_path(path)) {
107 if (strlcpy(buf, path, PATH_MAX) >= PATH_MAX)
108 die("Too long path: %.*s", 60, path);
109 } else {
110 const char *cwd = get_pwd_cwd();
111 if (!cwd)
112 die("Cannot determine the current working directory");
113 if (snprintf(buf, PATH_MAX, "%s/%s", cwd, path) >= PATH_MAX)
114 die("Too long path: %.*s", 60, path);
115 }
116 return buf;
117}
diff --git a/Documentation/perf_counter/util/alias.c b/Documentation/perf_counter/util/alias.c
new file mode 100644
index 000000000000..9b3dd2b428df
--- /dev/null
+++ b/Documentation/perf_counter/util/alias.c
@@ -0,0 +1,77 @@
1#include "cache.h"
2
3static const char *alias_key;
4static char *alias_val;
5
6static int alias_lookup_cb(const char *k, const char *v, void *cb)
7{
8 if (!prefixcmp(k, "alias.") && !strcmp(k+6, alias_key)) {
9 if (!v)
10 return config_error_nonbool(k);
11 alias_val = strdup(v);
12 return 0;
13 }
14 return 0;
15}
16
17char *alias_lookup(const char *alias)
18{
19 alias_key = alias;
20 alias_val = NULL;
21 perf_config(alias_lookup_cb, NULL);
22 return alias_val;
23}
24
25int split_cmdline(char *cmdline, const char ***argv)
26{
27 int src, dst, count = 0, size = 16;
28 char quoted = 0;
29
30 *argv = malloc(sizeof(char*) * size);
31
32 /* split alias_string */
33 (*argv)[count++] = cmdline;
34 for (src = dst = 0; cmdline[src];) {
35 char c = cmdline[src];
36 if (!quoted && isspace(c)) {
37 cmdline[dst++] = 0;
38 while (cmdline[++src]
39 && isspace(cmdline[src]))
40 ; /* skip */
41 if (count >= size) {
42 size += 16;
43 *argv = realloc(*argv, sizeof(char*) * size);
44 }
45 (*argv)[count++] = cmdline + dst;
46 } else if (!quoted && (c == '\'' || c == '"')) {
47 quoted = c;
48 src++;
49 } else if (c == quoted) {
50 quoted = 0;
51 src++;
52 } else {
53 if (c == '\\' && quoted != '\'') {
54 src++;
55 c = cmdline[src];
56 if (!c) {
57 free(*argv);
58 *argv = NULL;
59 return error("cmdline ends with \\");
60 }
61 }
62 cmdline[dst++] = c;
63 src++;
64 }
65 }
66
67 cmdline[dst] = 0;
68
69 if (quoted) {
70 free(*argv);
71 *argv = NULL;
72 return error("unclosed quote");
73 }
74
75 return count;
76}
77
diff --git a/Documentation/perf_counter/util/cache.h b/Documentation/perf_counter/util/cache.h
new file mode 100644
index 000000000000..71080512fa86
--- /dev/null
+++ b/Documentation/perf_counter/util/cache.h
@@ -0,0 +1,117 @@
1#ifndef CACHE_H
2#define CACHE_H
3
4#include "util.h"
5#include "strbuf.h"
6
7#define PERF_DIR_ENVIRONMENT "PERF_DIR"
8#define PERF_WORK_TREE_ENVIRONMENT "PERF_WORK_TREE"
9#define DEFAULT_PERF_DIR_ENVIRONMENT ".perf"
10#define DB_ENVIRONMENT "PERF_OBJECT_DIRECTORY"
11#define INDEX_ENVIRONMENT "PERF_INDEX_FILE"
12#define GRAFT_ENVIRONMENT "PERF_GRAFT_FILE"
13#define TEMPLATE_DIR_ENVIRONMENT "PERF_TEMPLATE_DIR"
14#define CONFIG_ENVIRONMENT "PERF_CONFIG"
15#define EXEC_PATH_ENVIRONMENT "PERF_EXEC_PATH"
16#define CEILING_DIRECTORIES_ENVIRONMENT "PERF_CEILING_DIRECTORIES"
17#define PERFATTRIBUTES_FILE ".perfattributes"
18#define INFOATTRIBUTES_FILE "info/attributes"
19#define ATTRIBUTE_MACRO_PREFIX "[attr]"
20
21typedef int (*config_fn_t)(const char *, const char *, void *);
22extern int perf_default_config(const char *, const char *, void *);
23extern int perf_config_from_file(config_fn_t fn, const char *, void *);
24extern int perf_config(config_fn_t fn, void *);
25extern int perf_parse_ulong(const char *, unsigned long *);
26extern int perf_config_int(const char *, const char *);
27extern unsigned long perf_config_ulong(const char *, const char *);
28extern int perf_config_bool_or_int(const char *, const char *, int *);
29extern int perf_config_bool(const char *, const char *);
30extern int perf_config_string(const char **, const char *, const char *);
31extern int perf_config_set(const char *, const char *);
32extern int perf_config_set_multivar(const char *, const char *, const char *, int);
33extern int perf_config_rename_section(const char *, const char *);
34extern const char *perf_etc_perfconfig(void);
35extern int check_repository_format_version(const char *var, const char *value, void *cb);
36extern int perf_config_system(void);
37extern int perf_config_global(void);
38extern int config_error_nonbool(const char *);
39extern const char *config_exclusive_filename;
40
41#define MAX_PERFNAME (1000)
42extern char perf_default_email[MAX_PERFNAME];
43extern char perf_default_name[MAX_PERFNAME];
44extern int user_ident_explicitly_given;
45
46extern const char *perf_log_output_encoding;
47extern const char *perf_mailmap_file;
48
49/* IO helper functions */
50extern void maybe_flush_or_die(FILE *, const char *);
51extern int copy_fd(int ifd, int ofd);
52extern int copy_file(const char *dst, const char *src, int mode);
53extern ssize_t read_in_full(int fd, void *buf, size_t count);
54extern ssize_t write_in_full(int fd, const void *buf, size_t count);
55extern void write_or_die(int fd, const void *buf, size_t count);
56extern int write_or_whine(int fd, const void *buf, size_t count, const char *msg);
57extern int write_or_whine_pipe(int fd, const void *buf, size_t count, const char *msg);
58extern void fsync_or_die(int fd, const char *);
59
60/* pager.c */
61extern void setup_pager(void);
62extern const char *pager_program;
63extern int pager_in_use(void);
64extern int pager_use_color;
65
66extern const char *editor_program;
67extern const char *excludes_file;
68
69char *alias_lookup(const char *alias);
70int split_cmdline(char *cmdline, const char ***argv);
71
72#define alloc_nr(x) (((x)+16)*3/2)
73
74/*
75 * Realloc the buffer pointed at by variable 'x' so that it can hold
76 * at least 'nr' entries; the number of entries currently allocated
77 * is 'alloc', using the standard growing factor alloc_nr() macro.
78 *
79 * DO NOT USE any expression with side-effect for 'x' or 'alloc'.
80 */
81#define ALLOC_GROW(x, nr, alloc) \
82 do { \
83 if ((nr) > alloc) { \
84 if (alloc_nr(alloc) < (nr)) \
85 alloc = (nr); \
86 else \
87 alloc = alloc_nr(alloc); \
88 x = xrealloc((x), alloc * sizeof(*(x))); \
89 } \
90 } while(0)
91
92
93static inline int is_absolute_path(const char *path)
94{
95 return path[0] == '/';
96}
97
98const char *make_absolute_path(const char *path);
99const char *make_nonrelative_path(const char *path);
100const char *make_relative_path(const char *abs, const char *base);
101int normalize_path_copy(char *dst, const char *src);
102int longest_ancestor_length(const char *path, const char *prefix_list);
103char *strip_path_suffix(const char *path, const char *suffix);
104
105extern char *mkpath(const char *fmt, ...) __attribute__((format (printf, 1, 2)));
106extern char *perf_path(const char *fmt, ...) __attribute__((format (printf, 1, 2)));
107
108extern char *mksnpath(char *buf, size_t n, const char *fmt, ...)
109 __attribute__((format (printf, 3, 4)));
110extern char *perf_snpath(char *buf, size_t n, const char *fmt, ...)
111 __attribute__((format (printf, 3, 4)));
112extern char *perf_pathdup(const char *fmt, ...)
113 __attribute__((format (printf, 1, 2)));
114
115extern size_t strlcpy(char *dest, const char *src, size_t size);
116
117#endif /* CACHE_H */
diff --git a/Documentation/perf_counter/util/config.c b/Documentation/perf_counter/util/config.c
new file mode 100644
index 000000000000..3dd13faa6a27
--- /dev/null
+++ b/Documentation/perf_counter/util/config.c
@@ -0,0 +1,873 @@
1/*
2 * GIT - The information manager from hell
3 *
4 * Copyright (C) Linus Torvalds, 2005
5 * Copyright (C) Johannes Schindelin, 2005
6 *
7 */
8#include "util.h"
9#include "cache.h"
10#include "exec_cmd.h"
11
12#define MAXNAME (256)
13
14static FILE *config_file;
15static const char *config_file_name;
16static int config_linenr;
17static int config_file_eof;
18
19const char *config_exclusive_filename = NULL;
20
21static int get_next_char(void)
22{
23 int c;
24 FILE *f;
25
26 c = '\n';
27 if ((f = config_file) != NULL) {
28 c = fgetc(f);
29 if (c == '\r') {
30 /* DOS like systems */
31 c = fgetc(f);
32 if (c != '\n') {
33 ungetc(c, f);
34 c = '\r';
35 }
36 }
37 if (c == '\n')
38 config_linenr++;
39 if (c == EOF) {
40 config_file_eof = 1;
41 c = '\n';
42 }
43 }
44 return c;
45}
46
47static char *parse_value(void)
48{
49 static char value[1024];
50 int quote = 0, comment = 0, len = 0, space = 0;
51
52 for (;;) {
53 int c = get_next_char();
54 if (len >= sizeof(value) - 1)
55 return NULL;
56 if (c == '\n') {
57 if (quote)
58 return NULL;
59 value[len] = 0;
60 return value;
61 }
62 if (comment)
63 continue;
64 if (isspace(c) && !quote) {
65 space = 1;
66 continue;
67 }
68 if (!quote) {
69 if (c == ';' || c == '#') {
70 comment = 1;
71 continue;
72 }
73 }
74 if (space) {
75 if (len)
76 value[len++] = ' ';
77 space = 0;
78 }
79 if (c == '\\') {
80 c = get_next_char();
81 switch (c) {
82 case '\n':
83 continue;
84 case 't':
85 c = '\t';
86 break;
87 case 'b':
88 c = '\b';
89 break;
90 case 'n':
91 c = '\n';
92 break;
93 /* Some characters escape as themselves */
94 case '\\': case '"':
95 break;
96 /* Reject unknown escape sequences */
97 default:
98 return NULL;
99 }
100 value[len++] = c;
101 continue;
102 }
103 if (c == '"') {
104 quote = 1-quote;
105 continue;
106 }
107 value[len++] = c;
108 }
109}
110
111static inline int iskeychar(int c)
112{
113 return isalnum(c) || c == '-';
114}
115
116static int get_value(config_fn_t fn, void *data, char *name, unsigned int len)
117{
118 int c;
119 char *value;
120
121 /* Get the full name */
122 for (;;) {
123 c = get_next_char();
124 if (config_file_eof)
125 break;
126 if (!iskeychar(c))
127 break;
128 name[len++] = tolower(c);
129 if (len >= MAXNAME)
130 return -1;
131 }
132 name[len] = 0;
133 while (c == ' ' || c == '\t')
134 c = get_next_char();
135
136 value = NULL;
137 if (c != '\n') {
138 if (c != '=')
139 return -1;
140 value = parse_value();
141 if (!value)
142 return -1;
143 }
144 return fn(name, value, data);
145}
146
147static int get_extended_base_var(char *name, int baselen, int c)
148{
149 do {
150 if (c == '\n')
151 return -1;
152 c = get_next_char();
153 } while (isspace(c));
154
155 /* We require the format to be '[base "extension"]' */
156 if (c != '"')
157 return -1;
158 name[baselen++] = '.';
159
160 for (;;) {
161 int c = get_next_char();
162 if (c == '\n')
163 return -1;
164 if (c == '"')
165 break;
166 if (c == '\\') {
167 c = get_next_char();
168 if (c == '\n')
169 return -1;
170 }
171 name[baselen++] = c;
172 if (baselen > MAXNAME / 2)
173 return -1;
174 }
175
176 /* Final ']' */
177 if (get_next_char() != ']')
178 return -1;
179 return baselen;
180}
181
182static int get_base_var(char *name)
183{
184 int baselen = 0;
185
186 for (;;) {
187 int c = get_next_char();
188 if (config_file_eof)
189 return -1;
190 if (c == ']')
191 return baselen;
192 if (isspace(c))
193 return get_extended_base_var(name, baselen, c);
194 if (!iskeychar(c) && c != '.')
195 return -1;
196 if (baselen > MAXNAME / 2)
197 return -1;
198 name[baselen++] = tolower(c);
199 }
200}
201
202static int perf_parse_file(config_fn_t fn, void *data)
203{
204 int comment = 0;
205 int baselen = 0;
206 static char var[MAXNAME];
207
208 /* U+FEFF Byte Order Mark in UTF8 */
209 static const unsigned char *utf8_bom = (unsigned char *) "\xef\xbb\xbf";
210 const unsigned char *bomptr = utf8_bom;
211
212 for (;;) {
213 int c = get_next_char();
214 if (bomptr && *bomptr) {
215 /* We are at the file beginning; skip UTF8-encoded BOM
216 * if present. Sane editors won't put this in on their
217 * own, but e.g. Windows Notepad will do it happily. */
218 if ((unsigned char) c == *bomptr) {
219 bomptr++;
220 continue;
221 } else {
222 /* Do not tolerate partial BOM. */
223 if (bomptr != utf8_bom)
224 break;
225 /* No BOM at file beginning. Cool. */
226 bomptr = NULL;
227 }
228 }
229 if (c == '\n') {
230 if (config_file_eof)
231 return 0;
232 comment = 0;
233 continue;
234 }
235 if (comment || isspace(c))
236 continue;
237 if (c == '#' || c == ';') {
238 comment = 1;
239 continue;
240 }
241 if (c == '[') {
242 baselen = get_base_var(var);
243 if (baselen <= 0)
244 break;
245 var[baselen++] = '.';
246 var[baselen] = 0;
247 continue;
248 }
249 if (!isalpha(c))
250 break;
251 var[baselen] = tolower(c);
252 if (get_value(fn, data, var, baselen+1) < 0)
253 break;
254 }
255 die("bad config file line %d in %s", config_linenr, config_file_name);
256}
257
258static int parse_unit_factor(const char *end, unsigned long *val)
259{
260 if (!*end)
261 return 1;
262 else if (!strcasecmp(end, "k")) {
263 *val *= 1024;
264 return 1;
265 }
266 else if (!strcasecmp(end, "m")) {
267 *val *= 1024 * 1024;
268 return 1;
269 }
270 else if (!strcasecmp(end, "g")) {
271 *val *= 1024 * 1024 * 1024;
272 return 1;
273 }
274 return 0;
275}
276
277static int perf_parse_long(const char *value, long *ret)
278{
279 if (value && *value) {
280 char *end;
281 long val = strtol(value, &end, 0);
282 unsigned long factor = 1;
283 if (!parse_unit_factor(end, &factor))
284 return 0;
285 *ret = val * factor;
286 return 1;
287 }
288 return 0;
289}
290
291int perf_parse_ulong(const char *value, unsigned long *ret)
292{
293 if (value && *value) {
294 char *end;
295 unsigned long val = strtoul(value, &end, 0);
296 if (!parse_unit_factor(end, &val))
297 return 0;
298 *ret = val;
299 return 1;
300 }
301 return 0;
302}
303
304static void die_bad_config(const char *name)
305{
306 if (config_file_name)
307 die("bad config value for '%s' in %s", name, config_file_name);
308 die("bad config value for '%s'", name);
309}
310
311int perf_config_int(const char *name, const char *value)
312{
313 long ret = 0;
314 if (!perf_parse_long(value, &ret))
315 die_bad_config(name);
316 return ret;
317}
318
319unsigned long perf_config_ulong(const char *name, const char *value)
320{
321 unsigned long ret;
322 if (!perf_parse_ulong(value, &ret))
323 die_bad_config(name);
324 return ret;
325}
326
327int perf_config_bool_or_int(const char *name, const char *value, int *is_bool)
328{
329 *is_bool = 1;
330 if (!value)
331 return 1;
332 if (!*value)
333 return 0;
334 if (!strcasecmp(value, "true") || !strcasecmp(value, "yes") || !strcasecmp(value, "on"))
335 return 1;
336 if (!strcasecmp(value, "false") || !strcasecmp(value, "no") || !strcasecmp(value, "off"))
337 return 0;
338 *is_bool = 0;
339 return perf_config_int(name, value);
340}
341
342int perf_config_bool(const char *name, const char *value)
343{
344 int discard;
345 return !!perf_config_bool_or_int(name, value, &discard);
346}
347
348int perf_config_string(const char **dest, const char *var, const char *value)
349{
350 if (!value)
351 return config_error_nonbool(var);
352 *dest = strdup(value);
353 return 0;
354}
355
356static int perf_default_core_config(const char *var, const char *value)
357{
358 /* Add other config variables here and to Documentation/config.txt. */
359 return 0;
360}
361
362int perf_default_config(const char *var, const char *value, void *dummy)
363{
364 if (!prefixcmp(var, "core."))
365 return perf_default_core_config(var, value);
366
367 /* Add other config variables here and to Documentation/config.txt. */
368 return 0;
369}
370
371int perf_config_from_file(config_fn_t fn, const char *filename, void *data)
372{
373 int ret;
374 FILE *f = fopen(filename, "r");
375
376 ret = -1;
377 if (f) {
378 config_file = f;
379 config_file_name = filename;
380 config_linenr = 1;
381 config_file_eof = 0;
382 ret = perf_parse_file(fn, data);
383 fclose(f);
384 config_file_name = NULL;
385 }
386 return ret;
387}
388
389const char *perf_etc_perfconfig(void)
390{
391 static const char *system_wide;
392 if (!system_wide)
393 system_wide = system_path(ETC_PERFCONFIG);
394 return system_wide;
395}
396
397static int perf_env_bool(const char *k, int def)
398{
399 const char *v = getenv(k);
400 return v ? perf_config_bool(k, v) : def;
401}
402
403int perf_config_system(void)
404{
405 return !perf_env_bool("PERF_CONFIG_NOSYSTEM", 0);
406}
407
408int perf_config_global(void)
409{
410 return !perf_env_bool("PERF_CONFIG_NOGLOBAL", 0);
411}
412
413int perf_config(config_fn_t fn, void *data)
414{
415 int ret = 0, found = 0;
416 char *repo_config = NULL;
417 const char *home = NULL;
418
419 /* Setting $PERF_CONFIG makes perf read _only_ the given config file. */
420 if (config_exclusive_filename)
421 return perf_config_from_file(fn, config_exclusive_filename, data);
422 if (perf_config_system() && !access(perf_etc_perfconfig(), R_OK)) {
423 ret += perf_config_from_file(fn, perf_etc_perfconfig(),
424 data);
425 found += 1;
426 }
427
428 home = getenv("HOME");
429 if (perf_config_global() && home) {
430 char *user_config = strdup(mkpath("%s/.perfconfig", home));
431 if (!access(user_config, R_OK)) {
432 ret += perf_config_from_file(fn, user_config, data);
433 found += 1;
434 }
435 free(user_config);
436 }
437
438 repo_config = perf_pathdup("config");
439 if (!access(repo_config, R_OK)) {
440 ret += perf_config_from_file(fn, repo_config, data);
441 found += 1;
442 }
443 free(repo_config);
444 if (found == 0)
445 return -1;
446 return ret;
447}
448
449/*
450 * Find all the stuff for perf_config_set() below.
451 */
452
453#define MAX_MATCHES 512
454
455static struct {
456 int baselen;
457 char* key;
458 int do_not_match;
459 regex_t* value_regex;
460 int multi_replace;
461 size_t offset[MAX_MATCHES];
462 enum { START, SECTION_SEEN, SECTION_END_SEEN, KEY_SEEN } state;
463 int seen;
464} store;
465
466static int matches(const char* key, const char* value)
467{
468 return !strcmp(key, store.key) &&
469 (store.value_regex == NULL ||
470 (store.do_not_match ^
471 !regexec(store.value_regex, value, 0, NULL, 0)));
472}
473
474static int store_aux(const char* key, const char* value, void *cb)
475{
476 const char *ep;
477 size_t section_len;
478
479 switch (store.state) {
480 case KEY_SEEN:
481 if (matches(key, value)) {
482 if (store.seen == 1 && store.multi_replace == 0) {
483 warning("%s has multiple values", key);
484 } else if (store.seen >= MAX_MATCHES) {
485 error("too many matches for %s", key);
486 return 1;
487 }
488
489 store.offset[store.seen] = ftell(config_file);
490 store.seen++;
491 }
492 break;
493 case SECTION_SEEN:
494 /*
495 * What we are looking for is in store.key (both
496 * section and var), and its section part is baselen
497 * long. We found key (again, both section and var).
498 * We would want to know if this key is in the same
499 * section as what we are looking for. We already
500 * know we are in the same section as what should
501 * hold store.key.
502 */
503 ep = strrchr(key, '.');
504 section_len = ep - key;
505
506 if ((section_len != store.baselen) ||
507 memcmp(key, store.key, section_len+1)) {
508 store.state = SECTION_END_SEEN;
509 break;
510 }
511
512 /*
513 * Do not increment matches: this is no match, but we
514 * just made sure we are in the desired section.
515 */
516 store.offset[store.seen] = ftell(config_file);
517 /* fallthru */
518 case SECTION_END_SEEN:
519 case START:
520 if (matches(key, value)) {
521 store.offset[store.seen] = ftell(config_file);
522 store.state = KEY_SEEN;
523 store.seen++;
524 } else {
525 if (strrchr(key, '.') - key == store.baselen &&
526 !strncmp(key, store.key, store.baselen)) {
527 store.state = SECTION_SEEN;
528 store.offset[store.seen] = ftell(config_file);
529 }
530 }
531 }
532 return 0;
533}
534
535static int store_write_section(int fd, const char* key)
536{
537 const char *dot;
538 int i, success;
539 struct strbuf sb = STRBUF_INIT;
540
541 dot = memchr(key, '.', store.baselen);
542 if (dot) {
543 strbuf_addf(&sb, "[%.*s \"", (int)(dot - key), key);
544 for (i = dot - key + 1; i < store.baselen; i++) {
545 if (key[i] == '"' || key[i] == '\\')
546 strbuf_addch(&sb, '\\');
547 strbuf_addch(&sb, key[i]);
548 }
549 strbuf_addstr(&sb, "\"]\n");
550 } else {
551 strbuf_addf(&sb, "[%.*s]\n", store.baselen, key);
552 }
553
554 success = write_in_full(fd, sb.buf, sb.len) == sb.len;
555 strbuf_release(&sb);
556
557 return success;
558}
559
560static int store_write_pair(int fd, const char* key, const char* value)
561{
562 int i, success;
563 int length = strlen(key + store.baselen + 1);
564 const char *quote = "";
565 struct strbuf sb = STRBUF_INIT;
566
567 /*
568 * Check to see if the value needs to be surrounded with a dq pair.
569	 * Note that problematic characters are always backslash-quoted; this
570	 * check is about not letting the configuration parser drop leading or
571	 * trailing spaces, or anything following a comment character (';' or
572	 * '#'), when the value is read back.
573 */
574 if (value[0] == ' ')
575 quote = "\"";
576 for (i = 0; value[i]; i++)
577 if (value[i] == ';' || value[i] == '#')
578 quote = "\"";
579 if (i && value[i - 1] == ' ')
580 quote = "\"";
581
582 strbuf_addf(&sb, "\t%.*s = %s",
583 length, key + store.baselen + 1, quote);
584
585 for (i = 0; value[i]; i++)
586 switch (value[i]) {
587 case '\n':
588 strbuf_addstr(&sb, "\\n");
589 break;
590 case '\t':
591 strbuf_addstr(&sb, "\\t");
592 break;
593 case '"':
594 case '\\':
595 strbuf_addch(&sb, '\\');
596 default:
597 strbuf_addch(&sb, value[i]);
598 break;
599 }
600 strbuf_addf(&sb, "%s\n", quote);
601
602 success = write_in_full(fd, sb.buf, sb.len) == sb.len;
603 strbuf_release(&sb);
604
605 return success;
606}
607
608static ssize_t find_beginning_of_line(const char* contents, size_t size,
609 size_t offset_, int* found_bracket)
610{
611 size_t equal_offset = size, bracket_offset = size;
612 ssize_t offset;
613
614contline:
615 for (offset = offset_-2; offset > 0
616 && contents[offset] != '\n'; offset--)
617 switch (contents[offset]) {
618 case '=': equal_offset = offset; break;
619 case ']': bracket_offset = offset; break;
620 }
621 if (offset > 0 && contents[offset-1] == '\\') {
622 offset_ = offset;
623 goto contline;
624 }
625 if (bracket_offset < equal_offset) {
626 *found_bracket = 1;
627 offset = bracket_offset+1;
628 } else
629 offset++;
630
631 return offset;
632}
633
634int perf_config_set(const char* key, const char* value)
635{
636 return perf_config_set_multivar(key, value, NULL, 0);
637}
638
639/*
640 * If value==NULL, unset in (remove from) config,
641 * if value_regex!=NULL, disregard key/value pairs where value does not match.
642 * if multi_replace==0, nothing, or only one matching key/value is replaced,
643 * else all matching key/values (regardless how many) are removed,
644 * before the new pair is written.
645 *
646 * Returns 0 on success.
647 *
648 * This function does this:
649 *
650 * - it locks the config file by creating ".perf/config.lock"
651 *
652 * - it then parses the config using store_aux() as validator to find
653 * the position on the key/value pair to replace. If it is to be unset,
654 * it must be found exactly once.
655 *
656 * - the config file is mmap()ed and the part before the match (if any) is
657 * written to the lock file, then the changed part and the rest.
658 *
659 * - the config file is removed and the lock file rename()d to it.
660 *
661 */
662int perf_config_set_multivar(const char* key, const char* value,
663 const char* value_regex, int multi_replace)
664{
665 int i, dot;
666 int fd = -1, in_fd;
667 int ret = 0;
668 char* config_filename;
669 const char* last_dot = strrchr(key, '.');
670
671 if (config_exclusive_filename)
672 config_filename = strdup(config_exclusive_filename);
673 else
674 config_filename = perf_pathdup("config");
675
676 /*
677 * Since "key" actually contains the section name and the real
678 * key name separated by a dot, we have to know where the dot is.
679 */
680
681 if (last_dot == NULL) {
682 error("key does not contain a section: %s", key);
683 ret = 2;
684 goto out_free;
685 }
686 store.baselen = last_dot - key;
687
688 store.multi_replace = multi_replace;
689
690 /*
691 * Validate the key and while at it, lower case it for matching.
692 */
693 store.key = malloc(strlen(key) + 1);
694 dot = 0;
695 for (i = 0; key[i]; i++) {
696 unsigned char c = key[i];
697 if (c == '.')
698 dot = 1;
699 /* Leave the extended basename untouched.. */
700 if (!dot || i > store.baselen) {
701 if (!iskeychar(c) || (i == store.baselen+1 && !isalpha(c))) {
702 error("invalid key: %s", key);
703 free(store.key);
704 ret = 1;
705 goto out_free;
706 }
707 c = tolower(c);
708 } else if (c == '\n') {
709 error("invalid key (newline): %s", key);
710 free(store.key);
711 ret = 1;
712 goto out_free;
713 }
714 store.key[i] = c;
715 }
716 store.key[i] = 0;
717
718 /*
719 * If .perf/config does not exist yet, write a minimal version.
720 */
721 in_fd = open(config_filename, O_RDONLY);
722	if (in_fd < 0) {
723 free(store.key);
724
725		if (errno != ENOENT) {
726 error("opening %s: %s", config_filename,
727 strerror(errno));
728 ret = 3; /* same as "invalid config file" */
729 goto out_free;
730 }
731 /* if nothing to unset, error out */
732 if (value == NULL) {
733 ret = 5;
734 goto out_free;
735 }
736
737 store.key = (char*)key;
738 if (!store_write_section(fd, key) ||
739 !store_write_pair(fd, key, value))
740 goto write_err_out;
741 } else {
742 struct stat st;
743 char* contents;
744 size_t contents_sz, copy_begin, copy_end;
745 int i, new_line = 0;
746
747 if (value_regex == NULL)
748 store.value_regex = NULL;
749 else {
750 if (value_regex[0] == '!') {
751 store.do_not_match = 1;
752 value_regex++;
753 } else
754 store.do_not_match = 0;
755
756 store.value_regex = (regex_t*)malloc(sizeof(regex_t));
757 if (regcomp(store.value_regex, value_regex,
758 REG_EXTENDED)) {
759 error("invalid pattern: %s", value_regex);
760 free(store.value_regex);
761 ret = 6;
762 goto out_free;
763 }
764 }
765
766 store.offset[0] = 0;
767 store.state = START;
768 store.seen = 0;
769
770 /*
771 * After this, store.offset will contain the *end* offset
772 * of the last match, or remain at 0 if no match was found.
773 * As a side effect, we make sure to transform only a valid
774 * existing config file.
775 */
776 if (perf_config_from_file(store_aux, config_filename, NULL)) {
777 error("invalid config file %s", config_filename);
778 free(store.key);
779 if (store.value_regex != NULL) {
780 regfree(store.value_regex);
781 free(store.value_regex);
782 }
783 ret = 3;
784 goto out_free;
785 }
786
787 free(store.key);
788 if (store.value_regex != NULL) {
789 regfree(store.value_regex);
790 free(store.value_regex);
791 }
792
793 /* if nothing to unset, or too many matches, error out */
794 if ((store.seen == 0 && value == NULL) ||
795 (store.seen > 1 && multi_replace == 0)) {
796 ret = 5;
797 goto out_free;
798 }
799
800 fstat(in_fd, &st);
801 contents_sz = xsize_t(st.st_size);
802 contents = mmap(NULL, contents_sz, PROT_READ,
803 MAP_PRIVATE, in_fd, 0);
804 close(in_fd);
805
806 if (store.seen == 0)
807 store.seen = 1;
808
809 for (i = 0, copy_begin = 0; i < store.seen; i++) {
810 if (store.offset[i] == 0) {
811 store.offset[i] = copy_end = contents_sz;
812 } else if (store.state != KEY_SEEN) {
813 copy_end = store.offset[i];
814 } else
815 copy_end = find_beginning_of_line(
816 contents, contents_sz,
817 store.offset[i]-2, &new_line);
818
819 if (copy_end > 0 && contents[copy_end-1] != '\n')
820 new_line = 1;
821
822 /* write the first part of the config */
823 if (copy_end > copy_begin) {
824 if (write_in_full(fd, contents + copy_begin,
825 copy_end - copy_begin) <
826 copy_end - copy_begin)
827 goto write_err_out;
828 if (new_line &&
829 write_in_full(fd, "\n", 1) != 1)
830 goto write_err_out;
831 }
832 copy_begin = store.offset[i];
833 }
834
835 /* write the pair (value == NULL means unset) */
836 if (value != NULL) {
837 if (store.state == START) {
838 if (!store_write_section(fd, key))
839 goto write_err_out;
840 }
841 if (!store_write_pair(fd, key, value))
842 goto write_err_out;
843 }
844
845 /* write the rest of the config */
846 if (copy_begin < contents_sz)
847 if (write_in_full(fd, contents + copy_begin,
848 contents_sz - copy_begin) <
849 contents_sz - copy_begin)
850 goto write_err_out;
851
852 munmap(contents, contents_sz);
853 }
854
855 ret = 0;
856
857out_free:
858 free(config_filename);
859 return ret;
860
861write_err_out:
862	ret = 4;	/* a failed write must not be reported as success */
	goto out_free;
863
864}
865
866/*
867 * Call this to report error for your variable that should not
868 * get a boolean value (i.e. "[my] var" means "true").
869 */
870int config_error_nonbool(const char *var)
871{
872 return error("Missing value for '%s'", var);
873}
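
To make the flow above concrete, here is a minimal sketch of a consumer of this API (callback and key names are made up; real callers typically fall back to perf_default_config(), as util/help.c does). perf_config() reads $PERF_CONFIG exclusively if set, otherwise the system file, ~/.perfconfig and the local config in that order, while perf_config_set() rewrites the file through the store_*() machinery shown above.

/* Illustrative sketch only -- not part of the patch. */
static int dump_config(const char *var, const char *value, void *unused)
{
	/* a bare "[section] var" entry is passed with value == NULL ("true") */
	printf("%s = %s\n", var, value ? value : "true");
	return 0;
}

static void config_demo(void)
{
	if (perf_config(dump_config, NULL) < 0)
		fprintf(stderr, "no config file found\n");

	perf_config_set("help.autocorrect", "10");
}
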
diff --git a/Documentation/perf_counter/util/ctype.c b/Documentation/perf_counter/util/ctype.c
new file mode 100644
index 000000000000..b90ec004f29c
--- /dev/null
+++ b/Documentation/perf_counter/util/ctype.c
@@ -0,0 +1,26 @@
1/*
2 * Sane locale-independent, ASCII ctype.
3 *
4 * No surprises, and works with signed and unsigned chars.
5 */
6#include "cache.h"
7
8enum {
9 S = GIT_SPACE,
10 A = GIT_ALPHA,
11 D = GIT_DIGIT,
12 G = GIT_GLOB_SPECIAL, /* *, ?, [, \\ */
13	R = GIT_REGEX_SPECIAL,	/* $, (, ), +, ., ^, {, | */
14};
15
16unsigned char sane_ctype[256] = {
17 0, 0, 0, 0, 0, 0, 0, 0, 0, S, S, 0, 0, S, 0, 0, /* 0.. 15 */
18 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16.. 31 */
19 S, 0, 0, 0, R, 0, 0, 0, R, R, G, R, 0, 0, R, 0, /* 32.. 47 */
20 D, D, D, D, D, D, D, D, D, D, 0, 0, 0, 0, 0, G, /* 48.. 63 */
21 0, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, /* 64.. 79 */
22 A, A, A, A, A, A, A, A, A, A, A, G, G, 0, R, 0, /* 80.. 95 */
23 0, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, /* 96..111 */
24 A, A, A, A, A, A, A, A, A, A, A, R, R, 0, 0, 0, /* 112..127 */
25 /* Nothing in the 128.. range */
26};
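
The table is meant to be consumed through small test macros in the shared headers. A sketch of the usual pattern (macro names assumed here, following the git-compat-util.h layout rather than quoted from this patch):

#undef isspace
#undef isdigit
#undef isalpha
#undef isalnum
#define sane_istest(x, mask) ((sane_ctype[(unsigned char)(x)] & (mask)) != 0)
#define isspace(x)	sane_istest(x, GIT_SPACE)
#define isdigit(x)	sane_istest(x, GIT_DIGIT)
#define isalpha(x)	sane_istest(x, GIT_ALPHA)
#define isalnum(x)	sane_istest(x, GIT_ALPHA | GIT_DIGIT)
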
diff --git a/Documentation/perf_counter/util/exec_cmd.c b/Documentation/perf_counter/util/exec_cmd.c
new file mode 100644
index 000000000000..d39292263153
--- /dev/null
+++ b/Documentation/perf_counter/util/exec_cmd.c
@@ -0,0 +1,165 @@
1#include "cache.h"
2#include "exec_cmd.h"
3#include "quote.h"
4#define MAX_ARGS 32
5
6extern char **environ;
7static const char *argv_exec_path;
8static const char *argv0_path;
9
10const char *system_path(const char *path)
11{
12#ifdef RUNTIME_PREFIX
13 static const char *prefix;
14#else
15 static const char *prefix = PREFIX;
16#endif
17 struct strbuf d = STRBUF_INIT;
18
19 if (is_absolute_path(path))
20 return path;
21
22#ifdef RUNTIME_PREFIX
23 assert(argv0_path);
24 assert(is_absolute_path(argv0_path));
25
26 if (!prefix &&
27 !(prefix = strip_path_suffix(argv0_path, PERF_EXEC_PATH)) &&
28 !(prefix = strip_path_suffix(argv0_path, BINDIR)) &&
29 !(prefix = strip_path_suffix(argv0_path, "perf"))) {
30 prefix = PREFIX;
31 fprintf(stderr, "RUNTIME_PREFIX requested, "
32 "but prefix computation failed. "
33 "Using static fallback '%s'.\n", prefix);
34 }
35#endif
36
37 strbuf_addf(&d, "%s/%s", prefix, path);
38 path = strbuf_detach(&d, NULL);
39 return path;
40}
41
42const char *perf_extract_argv0_path(const char *argv0)
43{
44 const char *slash;
45
46 if (!argv0 || !*argv0)
47 return NULL;
48 slash = argv0 + strlen(argv0);
49
50 while (argv0 <= slash && !is_dir_sep(*slash))
51 slash--;
52
53 if (slash >= argv0) {
54 argv0_path = strndup(argv0, slash - argv0);
55 return slash + 1;
56 }
57
58 return argv0;
59}
60
61void perf_set_argv_exec_path(const char *exec_path)
62{
63 argv_exec_path = exec_path;
64 /*
65 * Propagate this setting to external programs.
66 */
67 setenv(EXEC_PATH_ENVIRONMENT, exec_path, 1);
68}
69
70
71	/* Returns the highest-priority location in which to look for perf programs. */
72const char *perf_exec_path(void)
73{
74 const char *env;
75
76 if (argv_exec_path)
77 return argv_exec_path;
78
79 env = getenv(EXEC_PATH_ENVIRONMENT);
80 if (env && *env) {
81 return env;
82 }
83
84 return system_path(PERF_EXEC_PATH);
85}
86
87static void add_path(struct strbuf *out, const char *path)
88{
89 if (path && *path) {
90 if (is_absolute_path(path))
91 strbuf_addstr(out, path);
92 else
93 strbuf_addstr(out, make_nonrelative_path(path));
94
95 strbuf_addch(out, PATH_SEP);
96 }
97}
98
99void setup_path(void)
100{
101 const char *old_path = getenv("PATH");
102 struct strbuf new_path = STRBUF_INIT;
103
104 add_path(&new_path, perf_exec_path());
105 add_path(&new_path, argv0_path);
106
107 if (old_path)
108 strbuf_addstr(&new_path, old_path);
109 else
110 strbuf_addstr(&new_path, "/usr/local/bin:/usr/bin:/bin");
111
112 setenv("PATH", new_path.buf, 1);
113
114 strbuf_release(&new_path);
115}
116
117const char **prepare_perf_cmd(const char **argv)
118{
119 int argc;
120 const char **nargv;
121
122 for (argc = 0; argv[argc]; argc++)
123 ; /* just counting */
124 nargv = malloc(sizeof(*nargv) * (argc + 2));
125
126 nargv[0] = "perf";
127 for (argc = 0; argv[argc]; argc++)
128 nargv[argc + 1] = argv[argc];
129 nargv[argc + 1] = NULL;
130 return nargv;
131}
132
133int execv_perf_cmd(const char **argv) {
134 const char **nargv = prepare_perf_cmd(argv);
135
136 /* execvp() can only ever return if it fails */
137 execvp("perf", (char **)nargv);
138
139 free(nargv);
140 return -1;
141}
142
143
144int execl_perf_cmd(const char *cmd,...)
145{
146 int argc;
147 const char *argv[MAX_ARGS + 1];
148 const char *arg;
149 va_list param;
150
151 va_start(param, cmd);
152 argv[0] = cmd;
153 argc = 1;
154 while (argc < MAX_ARGS) {
155 arg = argv[argc++] = va_arg(param, char *);
156 if (!arg)
157 break;
158 }
159 va_end(param);
160 if (MAX_ARGS <= argc)
161 return error("too many args to run %s", cmd);
162
163 argv[argc] = NULL;
164 return execv_perf_cmd(argv);
165}
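
A rough usage sketch (argument values made up) of how a front-end is expected to use these helpers: record where the binary was started from, prepend the exec path to $PATH, then hand control to a sub-command.

/* Illustrative only -- not part of the patch. */
int main(int argc, const char **argv)
{
	perf_extract_argv0_path(argv[0]);	/* remember the directory we ran from */
	setup_path();				/* put the perf exec path in front of $PATH */

	/* re-executes as "perf help record"; only returns on failure */
	return execl_perf_cmd("help", "record", NULL) ? 1 : 0;
}
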
diff --git a/Documentation/perf_counter/util/exec_cmd.h b/Documentation/perf_counter/util/exec_cmd.h
new file mode 100644
index 000000000000..effe25eb1545
--- /dev/null
+++ b/Documentation/perf_counter/util/exec_cmd.h
@@ -0,0 +1,13 @@
1#ifndef PERF_EXEC_CMD_H
2#define PERF_EXEC_CMD_H
3
4extern void perf_set_argv_exec_path(const char *exec_path);
5extern const char *perf_extract_argv0_path(const char *path);
6extern const char *perf_exec_path(void);
7extern void setup_path(void);
8extern const char **prepare_perf_cmd(const char **argv);
9extern int execv_perf_cmd(const char **argv); /* NULL terminated */
10extern int execl_perf_cmd(const char *cmd, ...);
11extern const char *system_path(const char *path);
12
13#endif /* PERF_EXEC_CMD_H */
diff --git a/Documentation/perf_counter/util/generate-cmdlist.sh b/Documentation/perf_counter/util/generate-cmdlist.sh
new file mode 100755
index 000000000000..f06f6fd148f8
--- /dev/null
+++ b/Documentation/perf_counter/util/generate-cmdlist.sh
@@ -0,0 +1,24 @@
1#!/bin/sh
2
3echo "/* Automatically generated by $0 */
4struct cmdname_help
5{
6 char name[16];
7 char help[80];
8};
9
10static struct cmdname_help common_cmds[] = {"
11
12sed -n -e 's/^perf-\([^ ]*\)[ ].* common.*/\1/p' command-list.txt |
13sort |
14while read cmd
15do
16 sed -n '
17 /^NAME/,/perf-'"$cmd"'/H
18 ${
19 x
20 s/.*perf-'"$cmd"' - \(.*\)/ {"'"$cmd"'", "\1"},/
21 p
22 }' "Documentation/perf-$cmd.txt"
23done
24echo "};"
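
For a hypothetical command-list.txt entry such as "perf-stat   mainporcelain common", with a matching NAME line "perf-stat - run a command and count events" in Documentation/perf-stat.txt, the script would emit roughly:

/* Automatically generated by util/generate-cmdlist.sh */
struct cmdname_help
{
	char name[16];
	char help[80];
};

static struct cmdname_help common_cmds[] = {
	{"stat", "run a command and count events"},
};
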
diff --git a/Documentation/perf_counter/util/help.c b/Documentation/perf_counter/util/help.c
new file mode 100644
index 000000000000..edde541d238d
--- /dev/null
+++ b/Documentation/perf_counter/util/help.c
@@ -0,0 +1,366 @@
1#include "cache.h"
2#include "../builtin.h"
3#include "exec_cmd.h"
4#include "levenshtein.h"
5#include "help.h"
6
7/* most GUI terminals set COLUMNS (although some don't export it) */
8static int term_columns(void)
9{
10 char *col_string = getenv("COLUMNS");
11 int n_cols;
12
13 if (col_string && (n_cols = atoi(col_string)) > 0)
14 return n_cols;
15
16#ifdef TIOCGWINSZ
17 {
18 struct winsize ws;
19 if (!ioctl(1, TIOCGWINSZ, &ws)) {
20 if (ws.ws_col)
21 return ws.ws_col;
22 }
23 }
24#endif
25
26 return 80;
27}
28
29void add_cmdname(struct cmdnames *cmds, const char *name, int len)
30{
31 struct cmdname *ent = malloc(sizeof(*ent) + len + 1);
32
33 ent->len = len;
34 memcpy(ent->name, name, len);
35 ent->name[len] = 0;
36
37 ALLOC_GROW(cmds->names, cmds->cnt + 1, cmds->alloc);
38 cmds->names[cmds->cnt++] = ent;
39}
40
41static void clean_cmdnames(struct cmdnames *cmds)
42{
43 int i;
44 for (i = 0; i < cmds->cnt; ++i)
45 free(cmds->names[i]);
46 free(cmds->names);
47 cmds->cnt = 0;
48 cmds->alloc = 0;
49}
50
51static int cmdname_compare(const void *a_, const void *b_)
52{
53 struct cmdname *a = *(struct cmdname **)a_;
54 struct cmdname *b = *(struct cmdname **)b_;
55 return strcmp(a->name, b->name);
56}
57
58static void uniq(struct cmdnames *cmds)
59{
60 int i, j;
61
62 if (!cmds->cnt)
63 return;
64
65 for (i = j = 1; i < cmds->cnt; i++)
66 if (strcmp(cmds->names[i]->name, cmds->names[i-1]->name))
67 cmds->names[j++] = cmds->names[i];
68
69 cmds->cnt = j;
70}
71
72void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes)
73{
74 int ci, cj, ei;
75 int cmp;
76
77 ci = cj = ei = 0;
78 while (ci < cmds->cnt && ei < excludes->cnt) {
79 cmp = strcmp(cmds->names[ci]->name, excludes->names[ei]->name);
80 if (cmp < 0)
81 cmds->names[cj++] = cmds->names[ci++];
82 else if (cmp == 0)
83 ci++, ei++;
84 else if (cmp > 0)
85 ei++;
86 }
87
88 while (ci < cmds->cnt)
89 cmds->names[cj++] = cmds->names[ci++];
90
91 cmds->cnt = cj;
92}
93
94static void pretty_print_string_list(struct cmdnames *cmds, int longest)
95{
96 int cols = 1, rows;
97 int space = longest + 1; /* min 1 SP between words */
98 int max_cols = term_columns() - 1; /* don't print *on* the edge */
99 int i, j;
100
101 if (space < max_cols)
102 cols = max_cols / space;
103 rows = (cmds->cnt + cols - 1) / cols;
104
105 for (i = 0; i < rows; i++) {
106 printf(" ");
107
108 for (j = 0; j < cols; j++) {
109 int n = j * rows + i;
110 int size = space;
111 if (n >= cmds->cnt)
112 break;
113 if (j == cols-1 || n + rows >= cmds->cnt)
114 size = 1;
115 printf("%-*s", size, cmds->names[n]->name);
116 }
117 putchar('\n');
118 }
119}
120
121static int is_executable(const char *name)
122{
123 struct stat st;
124
125 if (stat(name, &st) || /* stat, not lstat */
126 !S_ISREG(st.st_mode))
127 return 0;
128
129#ifdef __MINGW32__
130 /* cannot trust the executable bit, peek into the file instead */
131 char buf[3] = { 0 };
132 int n;
133 int fd = open(name, O_RDONLY);
134 st.st_mode &= ~S_IXUSR;
135 if (fd >= 0) {
136 n = read(fd, buf, 2);
137 if (n == 2)
138 /* DOS executables start with "MZ" */
139 if (!strcmp(buf, "#!") || !strcmp(buf, "MZ"))
140 st.st_mode |= S_IXUSR;
141 close(fd);
142 }
143#endif
144 return st.st_mode & S_IXUSR;
145}
146
147static void list_commands_in_dir(struct cmdnames *cmds,
148 const char *path,
149 const char *prefix)
150{
151 int prefix_len;
152 DIR *dir = opendir(path);
153 struct dirent *de;
154 struct strbuf buf = STRBUF_INIT;
155 int len;
156
157 if (!dir)
158 return;
159 if (!prefix)
160 prefix = "perf-";
161 prefix_len = strlen(prefix);
162
163 strbuf_addf(&buf, "%s/", path);
164 len = buf.len;
165
166 while ((de = readdir(dir)) != NULL) {
167 int entlen;
168
169 if (prefixcmp(de->d_name, prefix))
170 continue;
171
172 strbuf_setlen(&buf, len);
173 strbuf_addstr(&buf, de->d_name);
174 if (!is_executable(buf.buf))
175 continue;
176
177 entlen = strlen(de->d_name) - prefix_len;
178 if (has_extension(de->d_name, ".exe"))
179 entlen -= 4;
180
181 add_cmdname(cmds, de->d_name + prefix_len, entlen);
182 }
183 closedir(dir);
184 strbuf_release(&buf);
185}
186
187void load_command_list(const char *prefix,
188 struct cmdnames *main_cmds,
189 struct cmdnames *other_cmds)
190{
191 const char *env_path = getenv("PATH");
192 const char *exec_path = perf_exec_path();
193
194 if (exec_path) {
195 list_commands_in_dir(main_cmds, exec_path, prefix);
196 qsort(main_cmds->names, main_cmds->cnt,
197 sizeof(*main_cmds->names), cmdname_compare);
198 uniq(main_cmds);
199 }
200
201 if (env_path) {
202 char *paths, *path, *colon;
203 path = paths = strdup(env_path);
204 while (1) {
205 if ((colon = strchr(path, PATH_SEP)))
206 *colon = 0;
207 if (!exec_path || strcmp(path, exec_path))
208 list_commands_in_dir(other_cmds, path, prefix);
209
210 if (!colon)
211 break;
212 path = colon + 1;
213 }
214 free(paths);
215
216 qsort(other_cmds->names, other_cmds->cnt,
217 sizeof(*other_cmds->names), cmdname_compare);
218 uniq(other_cmds);
219 }
220 exclude_cmds(other_cmds, main_cmds);
221}
222
223void list_commands(const char *title, struct cmdnames *main_cmds,
224 struct cmdnames *other_cmds)
225{
226 int i, longest = 0;
227
228 for (i = 0; i < main_cmds->cnt; i++)
229 if (longest < main_cmds->names[i]->len)
230 longest = main_cmds->names[i]->len;
231 for (i = 0; i < other_cmds->cnt; i++)
232 if (longest < other_cmds->names[i]->len)
233 longest = other_cmds->names[i]->len;
234
235 if (main_cmds->cnt) {
236 const char *exec_path = perf_exec_path();
237 printf("available %s in '%s'\n", title, exec_path);
238 printf("----------------");
239 mput_char('-', strlen(title) + strlen(exec_path));
240 putchar('\n');
241 pretty_print_string_list(main_cmds, longest);
242 putchar('\n');
243 }
244
245 if (other_cmds->cnt) {
246 printf("%s available from elsewhere on your $PATH\n", title);
247 printf("---------------------------------------");
248 mput_char('-', strlen(title));
249 putchar('\n');
250 pretty_print_string_list(other_cmds, longest);
251 putchar('\n');
252 }
253}
254
255int is_in_cmdlist(struct cmdnames *c, const char *s)
256{
257 int i;
258 for (i = 0; i < c->cnt; i++)
259 if (!strcmp(s, c->names[i]->name))
260 return 1;
261 return 0;
262}
263
264static int autocorrect;
265static struct cmdnames aliases;
266
267static int perf_unknown_cmd_config(const char *var, const char *value, void *cb)
268{
269 if (!strcmp(var, "help.autocorrect"))
270 autocorrect = perf_config_int(var,value);
271 /* Also use aliases for command lookup */
272 if (!prefixcmp(var, "alias."))
273 add_cmdname(&aliases, var + 6, strlen(var + 6));
274
275 return perf_default_config(var, value, cb);
276}
277
278static int levenshtein_compare(const void *p1, const void *p2)
279{
280 const struct cmdname *const *c1 = p1, *const *c2 = p2;
281 const char *s1 = (*c1)->name, *s2 = (*c2)->name;
282 int l1 = (*c1)->len;
283 int l2 = (*c2)->len;
284 return l1 != l2 ? l1 - l2 : strcmp(s1, s2);
285}
286
287static void add_cmd_list(struct cmdnames *cmds, struct cmdnames *old)
288{
289 int i;
290 ALLOC_GROW(cmds->names, cmds->cnt + old->cnt, cmds->alloc);
291
292 for (i = 0; i < old->cnt; i++)
293 cmds->names[cmds->cnt++] = old->names[i];
294 free(old->names);
295 old->cnt = 0;
296 old->names = NULL;
297}
298
299const char *help_unknown_cmd(const char *cmd)
300{
301 int i, n, best_similarity = 0;
302 struct cmdnames main_cmds, other_cmds;
303
304 memset(&main_cmds, 0, sizeof(main_cmds));
305	memset(&other_cmds, 0, sizeof(other_cmds));
306 memset(&aliases, 0, sizeof(aliases));
307
308 perf_config(perf_unknown_cmd_config, NULL);
309
310 load_command_list("perf-", &main_cmds, &other_cmds);
311
312 add_cmd_list(&main_cmds, &aliases);
313 add_cmd_list(&main_cmds, &other_cmds);
314 qsort(main_cmds.names, main_cmds.cnt,
315	      sizeof(*main_cmds.names), cmdname_compare);
316 uniq(&main_cmds);
317
318 /* This reuses cmdname->len for similarity index */
319 for (i = 0; i < main_cmds.cnt; ++i)
320 main_cmds.names[i]->len =
321 levenshtein(cmd, main_cmds.names[i]->name, 0, 2, 1, 4);
322
323 qsort(main_cmds.names, main_cmds.cnt,
324 sizeof(*main_cmds.names), levenshtein_compare);
325
326 if (!main_cmds.cnt)
327 die ("Uh oh. Your system reports no Git commands at all.");
328
329 best_similarity = main_cmds.names[0]->len;
330 n = 1;
331 while (n < main_cmds.cnt && best_similarity == main_cmds.names[n]->len)
332 ++n;
333 if (autocorrect && n == 1) {
334 const char *assumed = main_cmds.names[0]->name;
335 main_cmds.names[0] = NULL;
336 clean_cmdnames(&main_cmds);
337		fprintf(stderr, "WARNING: You called a perf program named '%s', "
338 "which does not exist.\n"
339 "Continuing under the assumption that you meant '%s'\n",
340 cmd, assumed);
341 if (autocorrect > 0) {
342 fprintf(stderr, "in %0.1f seconds automatically...\n",
343 (float)autocorrect/10.0);
344 poll(NULL, 0, autocorrect * 100);
345 }
346 return assumed;
347 }
348
349 fprintf(stderr, "perf: '%s' is not a perf-command. See 'perf --help'.\n", cmd);
350
351 if (best_similarity < 6) {
352 fprintf(stderr, "\nDid you mean %s?\n",
353 n < 2 ? "this": "one of these");
354
355 for (i = 0; i < n; i++)
356 fprintf(stderr, "\t%s\n", main_cmds.names[i]->name);
357 }
358
359 exit(1);
360}
361
362int cmd_version(int argc, const char **argv, const char *prefix)
363{
364 printf("perf version %s\n", perf_version_string);
365 return 0;
366}
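
perf_unknown_cmd_config() above recognizes two families of variables. A hedged example of what a user's ~/.perfconfig could contain to exercise them (values made up; alias entries are only used here as extra suggestion candidates):

[help]
	autocorrect = 10	; run the single closest match after 1.0 seconds
[alias]
	rec = record		; offered alongside the real commands in suggestions
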
diff --git a/Documentation/perf_counter/util/help.h b/Documentation/perf_counter/util/help.h
new file mode 100644
index 000000000000..56bc15406ffc
--- /dev/null
+++ b/Documentation/perf_counter/util/help.h
@@ -0,0 +1,29 @@
1#ifndef HELP_H
2#define HELP_H
3
4struct cmdnames {
5 int alloc;
6 int cnt;
7 struct cmdname {
8 size_t len; /* also used for similarity index in help.c */
9 char name[FLEX_ARRAY];
10 } **names;
11};
12
13static inline void mput_char(char c, unsigned int num)
14{
15 while(num--)
16 putchar(c);
17}
18
19void load_command_list(const char *prefix,
20 struct cmdnames *main_cmds,
21 struct cmdnames *other_cmds);
22void add_cmdname(struct cmdnames *cmds, const char *name, int len);
23/* Here we require that excludes is a sorted list. */
24void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes);
25int is_in_cmdlist(struct cmdnames *c, const char *s);
26void list_commands(const char *title, struct cmdnames *main_cmds,
27 struct cmdnames *other_cmds);
28
29#endif /* HELP_H */
diff --git a/Documentation/perf_counter/util/levenshtein.c b/Documentation/perf_counter/util/levenshtein.c
new file mode 100644
index 000000000000..e521d1516df6
--- /dev/null
+++ b/Documentation/perf_counter/util/levenshtein.c
@@ -0,0 +1,84 @@
1#include "cache.h"
2#include "levenshtein.h"
3
4/*
5 * This function implements the Damerau-Levenshtein algorithm to
6 * calculate a distance between strings.
7 *
8 * Basically, it says how many letters need to be swapped, substituted,
9 * deleted from, or added to string1, at least, to get string2.
10 *
11 * The idea is to build a distance matrix for the substrings of both
12 * strings. To avoid a large space complexity, only the last three rows
13 * are kept in memory (if swaps cost at least as much as one deletion
14 * plus one insertion, only two rows would be needed).
15 *
16 * At any stage, "i + 1" denotes the length of the current substring of
17 * string1 that the distance is calculated for.
18 *
19 * row2 holds the current row, row1 the previous row (i.e. for the substring
20 * of string1 of length "i"), and row0 the row before that.
21 *
22 * In other words, at the start of the big loop, row2[j + 1] contains the
23 * Damerau-Levenshtein distance between the substring of string1 of length
24 * "i" and the substring of string2 of length "j + 1".
25 *
26 * All the big loop does is determine the partial minimum-cost paths.
27 *
28 * It does so by calculating the costs of the path ending in characters
29 * i (in string1) and j (in string2), respectively, given that the last
30 * operation is a substitution, a swap, a deletion, or an insertion.
31 *
32 * This implementation allows the costs to be weighted:
33 *
34 * - w (as in "sWap")
35 * - s (as in "Substitution")
36 * - a (for insertion, AKA "Add")
37 * - d (as in "Deletion")
38 *
39 * Note that this algorithm calculates a distance _iff_ d == a.
40 */
41int levenshtein(const char *string1, const char *string2,
42 int w, int s, int a, int d)
43{
44 int len1 = strlen(string1), len2 = strlen(string2);
45 int *row0 = malloc(sizeof(int) * (len2 + 1));
46 int *row1 = malloc(sizeof(int) * (len2 + 1));
47 int *row2 = malloc(sizeof(int) * (len2 + 1));
48 int i, j;
49
50 for (j = 0; j <= len2; j++)
51 row1[j] = j * a;
52 for (i = 0; i < len1; i++) {
53 int *dummy;
54
55 row2[0] = (i + 1) * d;
56 for (j = 0; j < len2; j++) {
57 /* substitution */
58 row2[j + 1] = row1[j] + s * (string1[i] != string2[j]);
59 /* swap */
60 if (i > 0 && j > 0 && string1[i - 1] == string2[j] &&
61 string1[i] == string2[j - 1] &&
62 row2[j + 1] > row0[j - 1] + w)
63 row2[j + 1] = row0[j - 1] + w;
64 /* deletion */
65 if (row2[j + 1] > row1[j + 1] + d)
66 row2[j + 1] = row1[j + 1] + d;
67 /* insertion */
68 if (row2[j + 1] > row2[j] + a)
69 row2[j + 1] = row2[j] + a;
70 }
71
72 dummy = row0;
73 row0 = row1;
74 row1 = row2;
75 row2 = dummy;
76 }
77
78 i = row1[len2];
79 free(row0);
80 free(row1);
81 free(row2);
82
83 return i;
84}
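
With the weights help_unknown_cmd() passes in (swap 0, substitution 2, insertion 1, deletion 4), a few spot checks show why the suggestion ranking behaves the way it does. This is an illustrative sketch only, not part of the patch:

#include <stdio.h>
#include "levenshtein.h"

int main(void)
{
	/* an adjacent transposition is free, so a swapped-letter typo still scores 0 */
	printf("%d\n", levenshtein("pref", "perf", 0, 2, 1, 4));	/* 0 */
	/* a letter missing from the typed command costs one insertion */
	printf("%d\n", levenshtein("per", "perf", 0, 2, 1, 4));	/* 1 */
	/* an extra typed letter costs one (expensive) deletion */
	printf("%d\n", levenshtein("perff", "perf", 0, 2, 1, 4));	/* 4 */
	return 0;
}
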
diff --git a/Documentation/perf_counter/util/levenshtein.h b/Documentation/perf_counter/util/levenshtein.h
new file mode 100644
index 000000000000..0173abeef52c
--- /dev/null
+++ b/Documentation/perf_counter/util/levenshtein.h
@@ -0,0 +1,8 @@
1#ifndef LEVENSHTEIN_H
2#define LEVENSHTEIN_H
3
4int levenshtein(const char *string1, const char *string2,
5		int swap_penalty, int substitution_penalty,
6 int insertion_penalty, int deletion_penalty);
7
8#endif
diff --git a/Documentation/perf_counter/util/parse-options.c b/Documentation/perf_counter/util/parse-options.c
new file mode 100644
index 000000000000..28b34c1c29cf
--- /dev/null
+++ b/Documentation/perf_counter/util/parse-options.c
@@ -0,0 +1,492 @@
1#include "util.h"
2#include "parse-options.h"
3#include "cache.h"
4
5#define OPT_SHORT 1
6#define OPT_UNSET 2
7
8static int opterror(const struct option *opt, const char *reason, int flags)
9{
10 if (flags & OPT_SHORT)
11 return error("switch `%c' %s", opt->short_name, reason);
12 if (flags & OPT_UNSET)
13 return error("option `no-%s' %s", opt->long_name, reason);
14 return error("option `%s' %s", opt->long_name, reason);
15}
16
17static int get_arg(struct parse_opt_ctx_t *p, const struct option *opt,
18 int flags, const char **arg)
19{
20 if (p->opt) {
21 *arg = p->opt;
22 p->opt = NULL;
23 } else if (p->argc == 1 && (opt->flags & PARSE_OPT_LASTARG_DEFAULT)) {
24 *arg = (const char *)opt->defval;
25 } else if (p->argc > 1) {
26 p->argc--;
27 *arg = *++p->argv;
28 } else
29 return opterror(opt, "requires a value", flags);
30 return 0;
31}
32
33static int get_value(struct parse_opt_ctx_t *p,
34 const struct option *opt, int flags)
35{
36 const char *s, *arg;
37 const int unset = flags & OPT_UNSET;
38
39 if (unset && p->opt)
40 return opterror(opt, "takes no value", flags);
41 if (unset && (opt->flags & PARSE_OPT_NONEG))
42 return opterror(opt, "isn't available", flags);
43
44 if (!(flags & OPT_SHORT) && p->opt) {
45 switch (opt->type) {
46 case OPTION_CALLBACK:
47 if (!(opt->flags & PARSE_OPT_NOARG))
48 break;
49 /* FALLTHROUGH */
50 case OPTION_BOOLEAN:
51 case OPTION_BIT:
52 case OPTION_SET_INT:
53 case OPTION_SET_PTR:
54 return opterror(opt, "takes no value", flags);
55 default:
56 break;
57 }
58 }
59
60 switch (opt->type) {
61 case OPTION_BIT:
62 if (unset)
63 *(int *)opt->value &= ~opt->defval;
64 else
65 *(int *)opt->value |= opt->defval;
66 return 0;
67
68 case OPTION_BOOLEAN:
69 *(int *)opt->value = unset ? 0 : *(int *)opt->value + 1;
70 return 0;
71
72 case OPTION_SET_INT:
73 *(int *)opt->value = unset ? 0 : opt->defval;
74 return 0;
75
76 case OPTION_SET_PTR:
77 *(void **)opt->value = unset ? NULL : (void *)opt->defval;
78 return 0;
79
80 case OPTION_STRING:
81 if (unset)
82 *(const char **)opt->value = NULL;
83 else if (opt->flags & PARSE_OPT_OPTARG && !p->opt)
84 *(const char **)opt->value = (const char *)opt->defval;
85 else
86 return get_arg(p, opt, flags, (const char **)opt->value);
87 return 0;
88
89 case OPTION_CALLBACK:
90 if (unset)
91 return (*opt->callback)(opt, NULL, 1) ? (-1) : 0;
92 if (opt->flags & PARSE_OPT_NOARG)
93 return (*opt->callback)(opt, NULL, 0) ? (-1) : 0;
94 if (opt->flags & PARSE_OPT_OPTARG && !p->opt)
95 return (*opt->callback)(opt, NULL, 0) ? (-1) : 0;
96 if (get_arg(p, opt, flags, &arg))
97 return -1;
98 return (*opt->callback)(opt, arg, 0) ? (-1) : 0;
99
100 case OPTION_INTEGER:
101 if (unset) {
102 *(int *)opt->value = 0;
103 return 0;
104 }
105 if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
106 *(int *)opt->value = opt->defval;
107 return 0;
108 }
109 if (get_arg(p, opt, flags, &arg))
110 return -1;
111 *(int *)opt->value = strtol(arg, (char **)&s, 10);
112 if (*s)
113 return opterror(opt, "expects a numerical value", flags);
114 return 0;
115
116 default:
117 die("should not happen, someone must be hit on the forehead");
118 }
119}
120
121static int parse_short_opt(struct parse_opt_ctx_t *p, const struct option *options)
122{
123 for (; options->type != OPTION_END; options++) {
124 if (options->short_name == *p->opt) {
125 p->opt = p->opt[1] ? p->opt + 1 : NULL;
126 return get_value(p, options, OPT_SHORT);
127 }
128 }
129 return -2;
130}
131
132static int parse_long_opt(struct parse_opt_ctx_t *p, const char *arg,
133 const struct option *options)
134{
135 const char *arg_end = strchr(arg, '=');
136 const struct option *abbrev_option = NULL, *ambiguous_option = NULL;
137 int abbrev_flags = 0, ambiguous_flags = 0;
138
139 if (!arg_end)
140 arg_end = arg + strlen(arg);
141
142 for (; options->type != OPTION_END; options++) {
143 const char *rest;
144 int flags = 0;
145
146 if (!options->long_name)
147 continue;
148
149 rest = skip_prefix(arg, options->long_name);
150 if (options->type == OPTION_ARGUMENT) {
151 if (!rest)
152 continue;
153 if (*rest == '=')
154 return opterror(options, "takes no value", flags);
155 if (*rest)
156 continue;
157 p->out[p->cpidx++] = arg - 2;
158 return 0;
159 }
160 if (!rest) {
161 /* abbreviated? */
162 if (!strncmp(options->long_name, arg, arg_end - arg)) {
163is_abbreviated:
164 if (abbrev_option) {
165 /*
166 * If this is abbreviated, it is
167 * ambiguous. So when there is no
168 * exact match later, we need to
169 * error out.
170 */
171 ambiguous_option = abbrev_option;
172 ambiguous_flags = abbrev_flags;
173 }
174 if (!(flags & OPT_UNSET) && *arg_end)
175 p->opt = arg_end + 1;
176 abbrev_option = options;
177 abbrev_flags = flags;
178 continue;
179 }
180 /* negated and abbreviated very much? */
181 if (!prefixcmp("no-", arg)) {
182 flags |= OPT_UNSET;
183 goto is_abbreviated;
184 }
185 /* negated? */
186 if (strncmp(arg, "no-", 3))
187 continue;
188 flags |= OPT_UNSET;
189 rest = skip_prefix(arg + 3, options->long_name);
190 /* abbreviated and negated? */
191 if (!rest && !prefixcmp(options->long_name, arg + 3))
192 goto is_abbreviated;
193 if (!rest)
194 continue;
195 }
196 if (*rest) {
197 if (*rest != '=')
198 continue;
199 p->opt = rest + 1;
200 }
201 return get_value(p, options, flags);
202 }
203
204 if (ambiguous_option)
205 return error("Ambiguous option: %s "
206 "(could be --%s%s or --%s%s)",
207 arg,
208 (ambiguous_flags & OPT_UNSET) ? "no-" : "",
209 ambiguous_option->long_name,
210 (abbrev_flags & OPT_UNSET) ? "no-" : "",
211 abbrev_option->long_name);
212 if (abbrev_option)
213 return get_value(p, abbrev_option, abbrev_flags);
214 return -2;
215}
216
217static void check_typos(const char *arg, const struct option *options)
218{
219 if (strlen(arg) < 3)
220 return;
221
222 if (!prefixcmp(arg, "no-")) {
223		error("did you mean `--%s` (with two dashes?)", arg);
224 exit(129);
225 }
226
227 for (; options->type != OPTION_END; options++) {
228 if (!options->long_name)
229 continue;
230 if (!prefixcmp(options->long_name, arg)) {
231			error("did you mean `--%s` (with two dashes?)", arg);
232 exit(129);
233 }
234 }
235}
236
237void parse_options_start(struct parse_opt_ctx_t *ctx,
238 int argc, const char **argv, int flags)
239{
240 memset(ctx, 0, sizeof(*ctx));
241 ctx->argc = argc - 1;
242 ctx->argv = argv + 1;
243 ctx->out = argv;
244 ctx->cpidx = ((flags & PARSE_OPT_KEEP_ARGV0) != 0);
245 ctx->flags = flags;
246 if ((flags & PARSE_OPT_KEEP_UNKNOWN) &&
247 (flags & PARSE_OPT_STOP_AT_NON_OPTION))
248 die("STOP_AT_NON_OPTION and KEEP_UNKNOWN don't go together");
249}
250
251static int usage_with_options_internal(const char * const *,
252 const struct option *, int);
253
254int parse_options_step(struct parse_opt_ctx_t *ctx,
255 const struct option *options,
256 const char * const usagestr[])
257{
258 int internal_help = !(ctx->flags & PARSE_OPT_NO_INTERNAL_HELP);
259
260	/* we must reset ->opt; an unknown short option would leave it dangling */
261 ctx->opt = NULL;
262
263 for (; ctx->argc; ctx->argc--, ctx->argv++) {
264 const char *arg = ctx->argv[0];
265
266 if (*arg != '-' || !arg[1]) {
267 if (ctx->flags & PARSE_OPT_STOP_AT_NON_OPTION)
268 break;
269 ctx->out[ctx->cpidx++] = ctx->argv[0];
270 continue;
271 }
272
273 if (arg[1] != '-') {
274 ctx->opt = arg + 1;
275 if (internal_help && *ctx->opt == 'h')
276 return parse_options_usage(usagestr, options);
277 switch (parse_short_opt(ctx, options)) {
278 case -1:
279 return parse_options_usage(usagestr, options);
280 case -2:
281 goto unknown;
282 }
283 if (ctx->opt)
284 check_typos(arg + 1, options);
285 while (ctx->opt) {
286 if (internal_help && *ctx->opt == 'h')
287 return parse_options_usage(usagestr, options);
288 switch (parse_short_opt(ctx, options)) {
289 case -1:
290 return parse_options_usage(usagestr, options);
291 case -2:
292 /* fake a short option thing to hide the fact that we may have
293 * started to parse aggregated stuff
294 *
295 * This is leaky, too bad.
296 */
297 ctx->argv[0] = strdup(ctx->opt - 1);
298 *(char *)ctx->argv[0] = '-';
299 goto unknown;
300 }
301 }
302 continue;
303 }
304
305 if (!arg[2]) { /* "--" */
306 if (!(ctx->flags & PARSE_OPT_KEEP_DASHDASH)) {
307 ctx->argc--;
308 ctx->argv++;
309 }
310 break;
311 }
312
313 if (internal_help && !strcmp(arg + 2, "help-all"))
314 return usage_with_options_internal(usagestr, options, 1);
315 if (internal_help && !strcmp(arg + 2, "help"))
316 return parse_options_usage(usagestr, options);
317 switch (parse_long_opt(ctx, arg + 2, options)) {
318 case -1:
319 return parse_options_usage(usagestr, options);
320 case -2:
321 goto unknown;
322 }
323 continue;
324unknown:
325 if (!(ctx->flags & PARSE_OPT_KEEP_UNKNOWN))
326 return PARSE_OPT_UNKNOWN;
327 ctx->out[ctx->cpidx++] = ctx->argv[0];
328 ctx->opt = NULL;
329 }
330 return PARSE_OPT_DONE;
331}
332
333int parse_options_end(struct parse_opt_ctx_t *ctx)
334{
335 memmove(ctx->out + ctx->cpidx, ctx->argv, ctx->argc * sizeof(*ctx->out));
336 ctx->out[ctx->cpidx + ctx->argc] = NULL;
337 return ctx->cpidx + ctx->argc;
338}
339
340int parse_options(int argc, const char **argv, const struct option *options,
341 const char * const usagestr[], int flags)
342{
343 struct parse_opt_ctx_t ctx;
344
345 parse_options_start(&ctx, argc, argv, flags);
346 switch (parse_options_step(&ctx, options, usagestr)) {
347 case PARSE_OPT_HELP:
348 exit(129);
349 case PARSE_OPT_DONE:
350 break;
351 default: /* PARSE_OPT_UNKNOWN */
352 if (ctx.argv[0][1] == '-') {
353 error("unknown option `%s'", ctx.argv[0] + 2);
354 } else {
355 error("unknown switch `%c'", *ctx.opt);
356 }
357 usage_with_options(usagestr, options);
358 }
359
360 return parse_options_end(&ctx);
361}
362
363#define USAGE_OPTS_WIDTH 24
364#define USAGE_GAP 2
365
366int usage_with_options_internal(const char * const *usagestr,
367 const struct option *opts, int full)
368{
369 if (!usagestr)
370 return PARSE_OPT_HELP;
371
372 fprintf(stderr, "usage: %s\n", *usagestr++);
373 while (*usagestr && **usagestr)
374 fprintf(stderr, " or: %s\n", *usagestr++);
375 while (*usagestr) {
376 fprintf(stderr, "%s%s\n",
377 **usagestr ? " " : "",
378 *usagestr);
379 usagestr++;
380 }
381
382 if (opts->type != OPTION_GROUP)
383 fputc('\n', stderr);
384
385 for (; opts->type != OPTION_END; opts++) {
386 size_t pos;
387 int pad;
388
389 if (opts->type == OPTION_GROUP) {
390 fputc('\n', stderr);
391 if (*opts->help)
392 fprintf(stderr, "%s\n", opts->help);
393 continue;
394 }
395 if (!full && (opts->flags & PARSE_OPT_HIDDEN))
396 continue;
397
398 pos = fprintf(stderr, " ");
399 if (opts->short_name)
400 pos += fprintf(stderr, "-%c", opts->short_name);
401 if (opts->long_name && opts->short_name)
402 pos += fprintf(stderr, ", ");
403 if (opts->long_name)
404 pos += fprintf(stderr, "--%s", opts->long_name);
405
406 switch (opts->type) {
407 case OPTION_ARGUMENT:
408 break;
409 case OPTION_INTEGER:
410 if (opts->flags & PARSE_OPT_OPTARG)
411 if (opts->long_name)
412 pos += fprintf(stderr, "[=<n>]");
413 else
414 pos += fprintf(stderr, "[<n>]");
415 else
416 pos += fprintf(stderr, " <n>");
417 break;
418 case OPTION_CALLBACK:
419 if (opts->flags & PARSE_OPT_NOARG)
420 break;
421 /* FALLTHROUGH */
422 case OPTION_STRING:
423 if (opts->argh) {
424 if (opts->flags & PARSE_OPT_OPTARG)
425 if (opts->long_name)
426 pos += fprintf(stderr, "[=<%s>]", opts->argh);
427 else
428 pos += fprintf(stderr, "[<%s>]", opts->argh);
429 else
430 pos += fprintf(stderr, " <%s>", opts->argh);
431 } else {
432 if (opts->flags & PARSE_OPT_OPTARG)
433 if (opts->long_name)
434 pos += fprintf(stderr, "[=...]");
435 else
436 pos += fprintf(stderr, "[...]");
437 else
438 pos += fprintf(stderr, " ...");
439 }
440 break;
441 default: /* OPTION_{BIT,BOOLEAN,SET_INT,SET_PTR} */
442 break;
443 }
444
445 if (pos <= USAGE_OPTS_WIDTH)
446 pad = USAGE_OPTS_WIDTH - pos;
447 else {
448 fputc('\n', stderr);
449 pad = USAGE_OPTS_WIDTH;
450 }
451 fprintf(stderr, "%*s%s\n", pad + USAGE_GAP, "", opts->help);
452 }
453 fputc('\n', stderr);
454
455 return PARSE_OPT_HELP;
456}
457
458void usage_with_options(const char * const *usagestr,
459 const struct option *opts)
460{
461 usage_with_options_internal(usagestr, opts, 0);
462 exit(129);
463}
464
465int parse_options_usage(const char * const *usagestr,
466 const struct option *opts)
467{
468 return usage_with_options_internal(usagestr, opts, 0);
469}
470
471
472int parse_opt_verbosity_cb(const struct option *opt, const char *arg,
473 int unset)
474{
475 int *target = opt->value;
476
477 if (unset)
478 /* --no-quiet, --no-verbose */
479 *target = 0;
480 else if (opt->short_name == 'v') {
481 if (*target >= 0)
482 (*target)++;
483 else
484 *target = 1;
485 } else {
486 if (*target <= 0)
487 (*target)--;
488 else
489 *target = -1;
490 }
491 return 0;
492}
diff --git a/Documentation/perf_counter/util/parse-options.h b/Documentation/perf_counter/util/parse-options.h
new file mode 100644
index 000000000000..a81c7faff68e
--- /dev/null
+++ b/Documentation/perf_counter/util/parse-options.h
@@ -0,0 +1,172 @@
1#ifndef PARSE_OPTIONS_H
2#define PARSE_OPTIONS_H
3
4enum parse_opt_type {
5 /* special types */
6 OPTION_END,
7 OPTION_ARGUMENT,
8 OPTION_GROUP,
9 /* options with no arguments */
10 OPTION_BIT,
11 OPTION_BOOLEAN, /* _INCR would have been a better name */
12 OPTION_SET_INT,
13 OPTION_SET_PTR,
14 /* options with arguments (usually) */
15 OPTION_STRING,
16 OPTION_INTEGER,
17 OPTION_CALLBACK,
18};
19
20enum parse_opt_flags {
21 PARSE_OPT_KEEP_DASHDASH = 1,
22 PARSE_OPT_STOP_AT_NON_OPTION = 2,
23 PARSE_OPT_KEEP_ARGV0 = 4,
24 PARSE_OPT_KEEP_UNKNOWN = 8,
25 PARSE_OPT_NO_INTERNAL_HELP = 16,
26};
27
28enum parse_opt_option_flags {
29 PARSE_OPT_OPTARG = 1,
30 PARSE_OPT_NOARG = 2,
31 PARSE_OPT_NONEG = 4,
32 PARSE_OPT_HIDDEN = 8,
33 PARSE_OPT_LASTARG_DEFAULT = 16,
34};
35
36struct option;
37typedef int parse_opt_cb(const struct option *, const char *arg, int unset);
38
39/*
40 * `type`::
41 *   holds the type of the option; you must have an OPTION_END last in your
42 * array.
43 *
44 * `short_name`::
45 * the character to use as a short option name, '\0' if none.
46 *
47 * `long_name`::
48 * the long option name, without the leading dashes, NULL if none.
49 *
50 * `value`::
51 * stores pointers to the values to be filled.
52 *
53 * `argh`::
54 * token to explain the kind of argument this option wants. Keep it
55 *   homogeneous across the repository.
56 *
57 * `help`::
58 * the short help associated to what the option does.
59 * Must never be NULL (except for OPTION_END).
60 * OPTION_GROUP uses this pointer to store the group header.
61 *
62 * `flags`::
63 * mask of parse_opt_option_flags.
64 *   PARSE_OPT_OPTARG: says that the argument is optional (not for BOOLEANs)
65 * PARSE_OPT_NOARG: says that this option takes no argument, for CALLBACKs
66 * PARSE_OPT_NONEG: says that this option cannot be negated
67 *   PARSE_OPT_HIDDEN: this option is skipped in the default usage and shown
68 *   only in the full (--help-all) listing.
69 *
70 * `callback`::
71 * pointer to the callback to use for OPTION_CALLBACK.
72 *
73 * `defval`::
74 * default value to fill (*->value) with for PARSE_OPT_OPTARG.
75 * OPTION_{BIT,SET_INT,SET_PTR} store the {mask,integer,pointer} to put in
76 * the value when met.
77 * CALLBACKS can use it like they want.
78 */
79struct option {
80 enum parse_opt_type type;
81 int short_name;
82 const char *long_name;
83 void *value;
84 const char *argh;
85 const char *help;
86
87 int flags;
88 parse_opt_cb *callback;
89 intptr_t defval;
90};
91
92#define OPT_END() { OPTION_END }
93#define OPT_ARGUMENT(l, h) { OPTION_ARGUMENT, 0, (l), NULL, NULL, (h) }
94#define OPT_GROUP(h) { OPTION_GROUP, 0, NULL, NULL, NULL, (h) }
95#define OPT_BIT(s, l, v, h, b) { OPTION_BIT, (s), (l), (v), NULL, (h), 0, NULL, (b) }
96#define OPT_BOOLEAN(s, l, v, h) { OPTION_BOOLEAN, (s), (l), (v), NULL, (h) }
97#define OPT_SET_INT(s, l, v, h, i) { OPTION_SET_INT, (s), (l), (v), NULL, (h), 0, NULL, (i) }
98#define OPT_SET_PTR(s, l, v, h, p) { OPTION_SET_PTR, (s), (l), (v), NULL, (h), 0, NULL, (p) }
99#define OPT_INTEGER(s, l, v, h) { OPTION_INTEGER, (s), (l), (v), NULL, (h) }
100#define OPT_STRING(s, l, v, a, h) { OPTION_STRING, (s), (l), (v), (a), (h) }
101#define OPT_DATE(s, l, v, h) \
102 { OPTION_CALLBACK, (s), (l), (v), "time",(h), 0, \
103 parse_opt_approxidate_cb }
104#define OPT_CALLBACK(s, l, v, a, h, f) \
105 { OPTION_CALLBACK, (s), (l), (v), (a), (h), 0, (f) }
106
107/* parse_options() will filter out the processed options and leave the
108 * non-option arguments in argv[].
109 * Returns the number of arguments left in argv[].
110 */
111extern int parse_options(int argc, const char **argv,
112 const struct option *options,
113 const char * const usagestr[], int flags);
114
115extern NORETURN void usage_with_options(const char * const *usagestr,
116 const struct option *options);
117
118/*----- incremental advanced APIs -----*/
119
120enum {
121 PARSE_OPT_HELP = -1,
122 PARSE_OPT_DONE,
123 PARSE_OPT_UNKNOWN,
124};
125
126/*
127 * It's okay for the caller to consume argv/argc in the usual way.
128 * Other fields of that structure are private to parse-options and should not
129 * be modified in any way.
130 */
131struct parse_opt_ctx_t {
132 const char **argv;
133 const char **out;
134 int argc, cpidx;
135 const char *opt;
136 int flags;
137};
138
139extern int parse_options_usage(const char * const *usagestr,
140 const struct option *opts);
141
142extern void parse_options_start(struct parse_opt_ctx_t *ctx,
143 int argc, const char **argv, int flags);
144
145extern int parse_options_step(struct parse_opt_ctx_t *ctx,
146 const struct option *options,
147 const char * const usagestr[]);
148
149extern int parse_options_end(struct parse_opt_ctx_t *ctx);
150
151
152/*----- some often used options -----*/
153extern int parse_opt_abbrev_cb(const struct option *, const char *, int);
154extern int parse_opt_approxidate_cb(const struct option *, const char *, int);
155extern int parse_opt_verbosity_cb(const struct option *, const char *, int);
156
157#define OPT__VERBOSE(var) OPT_BOOLEAN('v', "verbose", (var), "be verbose")
158#define OPT__QUIET(var) OPT_BOOLEAN('q', "quiet", (var), "be quiet")
159#define OPT__VERBOSITY(var) \
160 { OPTION_CALLBACK, 'v', "verbose", (var), NULL, "be more verbose", \
161 PARSE_OPT_NOARG, &parse_opt_verbosity_cb, 0 }, \
162 { OPTION_CALLBACK, 'q', "quiet", (var), NULL, "be more quiet", \
163 PARSE_OPT_NOARG, &parse_opt_verbosity_cb, 0 }
164#define OPT__DRY_RUN(var) OPT_BOOLEAN('n', "dry-run", (var), "dry run")
165#define OPT__ABBREV(var) \
166 { OPTION_CALLBACK, 0, "abbrev", (var), "n", \
167 "use <n> digits to display SHA-1s", \
168 PARSE_OPT_OPTARG, &parse_opt_abbrev_cb, 0 }
169
170extern const char *parse_options_fix_filename(const char *prefix, const char *file);
171
172#endif
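
A minimal, hypothetical consumer showing the intended flow; the command, options and variable names below are made up for illustration:

#include "parse-options.h"

static int verbose;
static const char *output_name;

static const char * const demo_usage[] = {
	"perf demo [<options>]",
	NULL
};

static const struct option demo_options[] = {
	OPT_BOOLEAN('v', "verbose", &verbose, "be more verbose"),
	OPT_STRING('o', "output", &output_name, "file", "output file name"),
	OPT_END()
};

int cmd_demo(int argc, const char **argv, const char *prefix)
{
	/* parse_options() returns the number of remaining non-option arguments */
	argc = parse_options(argc, argv, demo_options, demo_usage, 0);
	if (argc)
		usage_with_options(demo_usage, demo_options);
	return 0;
}
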
diff --git a/Documentation/perf_counter/util/path.c b/Documentation/perf_counter/util/path.c
new file mode 100644
index 000000000000..a501a40dd2cb
--- /dev/null
+++ b/Documentation/perf_counter/util/path.c
@@ -0,0 +1,353 @@
1/*
2 * I'm tired of doing "vsnprintf()" etc just to open a
3 * file, so here's a "return static buffer with printf"
4 * interface for paths.
5 *
6 * It's obviously not thread-safe. Sue me. But it's quite
7 * useful for doing things like
8 *
9 * f = open(mkpath("%s/%s.perf", base, name), O_RDONLY);
10 *
11 * which is what it's designed for.
12 */
13#include "cache.h"
14
15static char bad_path[] = "/bad-path/";
16/*
17 * One hack: the perf directory is simply the current directory for now.
18 */
19
20static char *get_perf_dir(void)
21{
22 return ".";
23}
24
25size_t strlcpy(char *dest, const char *src, size_t size)
26{
27 size_t ret = strlen(src);
28
29 if (size) {
30 size_t len = (ret >= size) ? size - 1 : ret;
31 memcpy(dest, src, len);
32 dest[len] = '\0';
33 }
34 return ret;
35}
36
37
38static char *get_pathname(void)
39{
40 static char pathname_array[4][PATH_MAX];
41 static int index;
42 return pathname_array[3 & ++index];
43}
44
45static char *cleanup_path(char *path)
46{
47 /* Clean it up */
48 if (!memcmp(path, "./", 2)) {
49 path += 2;
50 while (*path == '/')
51 path++;
52 }
53 return path;
54}
55
56char *mksnpath(char *buf, size_t n, const char *fmt, ...)
57{
58 va_list args;
59 unsigned len;
60
61 va_start(args, fmt);
62 len = vsnprintf(buf, n, fmt, args);
63 va_end(args);
64 if (len >= n) {
65 strlcpy(buf, bad_path, n);
66 return buf;
67 }
68 return cleanup_path(buf);
69}
70
71static char *perf_vsnpath(char *buf, size_t n, const char *fmt, va_list args)
72{
73 const char *perf_dir = get_perf_dir();
74 size_t len;
75
76 len = strlen(perf_dir);
77 if (n < len + 1)
78 goto bad;
79 memcpy(buf, perf_dir, len);
80 if (len && !is_dir_sep(perf_dir[len-1]))
81 buf[len++] = '/';
82 len += vsnprintf(buf + len, n - len, fmt, args);
83 if (len >= n)
84 goto bad;
85 return cleanup_path(buf);
86bad:
87 strlcpy(buf, bad_path, n);
88 return buf;
89}
90
91char *perf_snpath(char *buf, size_t n, const char *fmt, ...)
92{
93 va_list args;
94 va_start(args, fmt);
95 (void)perf_vsnpath(buf, n, fmt, args);
96 va_end(args);
97 return buf;
98}
99
100char *perf_pathdup(const char *fmt, ...)
101{
102 char path[PATH_MAX];
103 va_list args;
104 va_start(args, fmt);
105 (void)perf_vsnpath(path, sizeof(path), fmt, args);
106 va_end(args);
107 return xstrdup(path);
108}
109
110char *mkpath(const char *fmt, ...)
111{
112 va_list args;
113 unsigned len;
114 char *pathname = get_pathname();
115
116 va_start(args, fmt);
117 len = vsnprintf(pathname, PATH_MAX, fmt, args);
118 va_end(args);
119 if (len >= PATH_MAX)
120 return bad_path;
121 return cleanup_path(pathname);
122}
123
124char *perf_path(const char *fmt, ...)
125{
126 const char *perf_dir = get_perf_dir();
127 char *pathname = get_pathname();
128 va_list args;
129 unsigned len;
130
131 len = strlen(perf_dir);
132 if (len > PATH_MAX-100)
133 return bad_path;
134 memcpy(pathname, perf_dir, len);
135 if (len && perf_dir[len-1] != '/')
136 pathname[len++] = '/';
137 va_start(args, fmt);
138 len += vsnprintf(pathname + len, PATH_MAX - len, fmt, args);
139 va_end(args);
140 if (len >= PATH_MAX)
141 return bad_path;
142 return cleanup_path(pathname);
143}
144
145
146/* perf_mkstemp() - create tmp file honoring TMPDIR variable */
147int perf_mkstemp(char *path, size_t len, const char *template)
148{
149 const char *tmp;
150 size_t n;
151
152 tmp = getenv("TMPDIR");
153 if (!tmp)
154 tmp = "/tmp";
155 n = snprintf(path, len, "%s/%s", tmp, template);
156 if (len <= n) {
157 errno = ENAMETOOLONG;
158 return -1;
159 }
160 return mkstemp(path);
161}
162
163
164const char *make_relative_path(const char *abs, const char *base)
165{
166 static char buf[PATH_MAX + 1];
167 int baselen;
168 if (!base)
169 return abs;
170 baselen = strlen(base);
171 if (prefixcmp(abs, base))
172 return abs;
173 if (abs[baselen] == '/')
174 baselen++;
175 else if (base[baselen - 1] != '/')
176 return abs;
177 strcpy(buf, abs + baselen);
178 return buf;
179}
180
181/*
182 * It is okay if dst == src, but they should not overlap otherwise.
183 *
184 * Performs the following normalizations on src, storing the result in dst:
185 * - Ensures that components are separated by '/' (Windows only)
186 * - Squashes sequences of '/'.
187 * - Removes "." components.
188 * - Removes ".." components, and the components that precede them.
189 * Returns failure (non-zero) if a ".." component appears as first path
190 * component anytime during the normalization. Otherwise, returns success (0).
191 *
192 * Note that this function is purely textual. It does not follow symlinks,
193 * verify the existence of the path, or make any system calls.
194 */
195int normalize_path_copy(char *dst, const char *src)
196{
197 char *dst0;
198
199 if (has_dos_drive_prefix(src)) {
200 *dst++ = *src++;
201 *dst++ = *src++;
202 }
203 dst0 = dst;
204
205 if (is_dir_sep(*src)) {
206 *dst++ = '/';
207 while (is_dir_sep(*src))
208 src++;
209 }
210
211 for (;;) {
212 char c = *src;
213
214 /*
215 * A path component that begins with . could be
216 * special:
217 * (1) "." and ends -- ignore and terminate.
218 * (2) "./" -- ignore them, eat slash and continue.
219 * (3) ".." and ends -- strip one and terminate.
220 * (4) "../" -- strip one, eat slash and continue.
221 */
222 if (c == '.') {
223 if (!src[1]) {
224 /* (1) */
225 src++;
226 } else if (is_dir_sep(src[1])) {
227 /* (2) */
228 src += 2;
229 while (is_dir_sep(*src))
230 src++;
231 continue;
232 } else if (src[1] == '.') {
233 if (!src[2]) {
234 /* (3) */
235 src += 2;
236 goto up_one;
237 } else if (is_dir_sep(src[2])) {
238 /* (4) */
239 src += 3;
240 while (is_dir_sep(*src))
241 src++;
242 goto up_one;
243 }
244 }
245 }
246
247 /* copy up to the next '/', and eat all '/' */
248 while ((c = *src++) != '\0' && !is_dir_sep(c))
249 *dst++ = c;
250 if (is_dir_sep(c)) {
251 *dst++ = '/';
252 while (is_dir_sep(c))
253 c = *src++;
254 src--;
255 } else if (!c)
256 break;
257 continue;
258
259 up_one:
260 /*
261 * dst0..dst is prefix portion, and dst[-1] is '/';
262 * go up one level.
263 */
264 dst--; /* go to trailing '/' */
265 if (dst <= dst0)
266 return -1;
267 /* Windows: dst[-1] cannot be backslash anymore */
268 while (dst0 < dst && dst[-1] != '/')
269 dst--;
270 }
271 *dst = '\0';
272 return 0;
273}
274
275/*
276 * path = Canonical absolute path
277 * prefix_list = Colon-separated list of absolute paths
278 *
279 * Determines, for each path in prefix_list, whether the "prefix" really
280 * is an ancestor directory of path. Returns the length of the longest
281 * ancestor directory, excluding any trailing slashes, or -1 if no prefix
282 * is an ancestor. (Note that this means 0 is returned if prefix_list is
283 * "/".) "/foo" is not considered an ancestor of "/foobar". Directories
284 * are not considered to be their own ancestors. path must be in a
285 * canonical form: empty components, or "." or ".." components are not
286 * allowed. prefix_list may be null, which is like "".
287 */
288int longest_ancestor_length(const char *path, const char *prefix_list)
289{
290 char buf[PATH_MAX+1];
291 const char *ceil, *colon;
292 int len, max_len = -1;
293
294 if (prefix_list == NULL || !strcmp(path, "/"))
295 return -1;
296
297 for (colon = ceil = prefix_list; *colon; ceil = colon+1) {
298 for (colon = ceil; *colon && *colon != PATH_SEP; colon++);
299 len = colon - ceil;
300 if (len == 0 || len > PATH_MAX || !is_absolute_path(ceil))
301 continue;
302 strlcpy(buf, ceil, len+1);
303 if (normalize_path_copy(buf, buf) < 0)
304 continue;
305 len = strlen(buf);
306 if (len > 0 && buf[len-1] == '/')
307 buf[--len] = '\0';
308
309 if (!strncmp(path, buf, len) &&
310 path[len] == '/' &&
311 len > max_len) {
312 max_len = len;
313 }
314 }
315
316 return max_len;
317}
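As a sketch of the semantics, with a made-up prefix list:

/* "/usr/local" is the longest ancestor of the path: returns 10 */
int longest = longest_ancestor_length("/usr/local/bin", "/usr:/usr/local");

/* "/foo" is not an ancestor of "/foobar": returns -1 */
int none = longest_ancestor_length("/foobar", "/foo");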
318
319/* strip arbitrary amount of directory separators at end of path */
320static inline int chomp_trailing_dir_sep(const char *path, int len)
321{
322 while (len && is_dir_sep(path[len - 1]))
323 len--;
324 return len;
325}
326
327/*
328 * If path ends with suffix (complete path components), returns the
329 * part before suffix (sans trailing directory separators).
330 * Otherwise returns NULL.
331 */
332char *strip_path_suffix(const char *path, const char *suffix)
333{
334 int path_len = strlen(path), suffix_len = strlen(suffix);
335
336 while (suffix_len) {
337 if (!path_len)
338 return NULL;
339
340 if (is_dir_sep(path[path_len - 1])) {
341 if (!is_dir_sep(suffix[suffix_len - 1]))
342 return NULL;
343 path_len = chomp_trailing_dir_sep(path, path_len);
344 suffix_len = chomp_trailing_dir_sep(suffix, suffix_len);
345 }
346 else if (path[--path_len] != suffix[--suffix_len])
347 return NULL;
348 }
349
350 if (path_len && !is_dir_sep(path[path_len - 1]))
351 return NULL;
352 return xstrndup(path, chomp_trailing_dir_sep(path, path_len));
353}
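For example, with a hypothetical install path:

/* the suffix matches complete components: returns a copy of "/usr/local" */
char *base = strip_path_suffix("/usr/local/libexec/perf-core", "libexec/perf-core");

/* "core" does not end on a component boundary: returns NULL */
char *none = strip_path_suffix("/usr/local/libexec/perf-core", "core");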
diff --git a/Documentation/perf_counter/util/quote.c b/Documentation/perf_counter/util/quote.c
new file mode 100644
index 000000000000..7a49fcf69671
--- /dev/null
+++ b/Documentation/perf_counter/util/quote.c
@@ -0,0 +1,478 @@
1#include "cache.h"
2#include "quote.h"
3
4int quote_path_fully = 1;
5
6/* Help to copy the thing properly quoted for shell safety:
7 * any single quote is replaced with '\'', any exclamation point
8 * is replaced with '\!', and the whole thing is enclosed in a
9 * single quote pair.
10 * E.g.
11 * original sq_quote result
12 * name ==> name ==> 'name'
13 * a b ==> a b ==> 'a b'
14 * a'b ==> a'\''b ==> 'a'\''b'
15 * a!b ==> a'\!'b ==> 'a'\!'b'
16 */
17static inline int need_bs_quote(char c)
18{
19 return (c == '\'' || c == '!');
20}
21
22void sq_quote_buf(struct strbuf *dst, const char *src)
23{
24 char *to_free = NULL;
25
26 if (dst->buf == src)
27 to_free = strbuf_detach(dst, NULL);
28
29 strbuf_addch(dst, '\'');
30 while (*src) {
31 size_t len = strcspn(src, "'!");
32 strbuf_add(dst, src, len);
33 src += len;
34 while (need_bs_quote(*src)) {
35 strbuf_addstr(dst, "'\\");
36 strbuf_addch(dst, *src++);
37 strbuf_addch(dst, '\'');
38 }
39 }
40 strbuf_addch(dst, '\'');
41 free(to_free);
42}
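A small usage sketch (the command string is made up):

struct strbuf cmd = STRBUF_INIT;

strbuf_addstr(&cmd, "perf record ");
sq_quote_buf(&cmd, "don't panic!");
/* cmd.buf is now: perf record 'don'\''t panic'\!'' */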
43
44void sq_quote_print(FILE *stream, const char *src)
45{
46 char c;
47
48 fputc('\'', stream);
49 while ((c = *src++)) {
50 if (need_bs_quote(c)) {
51 fputs("'\\", stream);
52 fputc(c, stream);
53 fputc('\'', stream);
54 } else {
55 fputc(c, stream);
56 }
57 }
58 fputc('\'', stream);
59}
60
61void sq_quote_argv(struct strbuf *dst, const char** argv, size_t maxlen)
62{
63 int i;
64
65 /* Copy into destination buffer. */
66 strbuf_grow(dst, 255);
67 for (i = 0; argv[i]; ++i) {
68 strbuf_addch(dst, ' ');
69 sq_quote_buf(dst, argv[i]);
70 if (maxlen && dst->len > maxlen)
71 die("Too many or long arguments");
72 }
73}
74
75char *sq_dequote_step(char *arg, char **next)
76{
77 char *dst = arg;
78 char *src = arg;
79 char c;
80
81 if (*src != '\'')
82 return NULL;
83 for (;;) {
84 c = *++src;
85 if (!c)
86 return NULL;
87 if (c != '\'') {
88 *dst++ = c;
89 continue;
90 }
91 /* We stepped out of sq */
92 switch (*++src) {
93 case '\0':
94 *dst = 0;
95 if (next)
96 *next = NULL;
97 return arg;
98 case '\\':
99 c = *++src;
100 if (need_bs_quote(c) && *++src == '\'') {
101 *dst++ = c;
102 continue;
103 }
104 /* Fallthrough */
105 default:
106 if (!next || !isspace(*src))
107 return NULL;
108 do {
109 c = *++src;
110 } while (isspace(c));
111 *dst = 0;
112 *next = src;
113 return arg;
114 }
115 }
116}
117
118char *sq_dequote(char *arg)
119{
120 return sq_dequote_step(arg, NULL);
121}
122
123int sq_dequote_to_argv(char *arg, const char ***argv, int *nr, int *alloc)
124{
125 char *next = arg;
126
127 if (!*arg)
128 return 0;
129 do {
130 char *dequoted = sq_dequote_step(next, &next);
131 if (!dequoted)
132 return -1;
133 ALLOC_GROW(*argv, *nr + 1, *alloc);
134 (*argv)[(*nr)++] = dequoted;
135 } while (next);
136
137 return 0;
138}
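Dequoting works in place and can split several quoted words, e.g.:

char line[] = "'--output' 'perf.data'";
const char **argv = NULL;
int nr = 0, alloc = 0;

if (!sq_dequote_to_argv(line, &argv, &nr, &alloc)) {
	/* nr == 2, argv[0] == "--output", argv[1] == "perf.data";
	 * both strings point back into the (rewritten) line[] buffer */
}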
139
140/* 1 means: quote as octal
141 * 0 means: quote as octal if (quote_path_fully)
142 * -1 means: never quote
143 * c: quote as "\\c"
144 */
145#define X8(x) x, x, x, x, x, x, x, x
146#define X16(x) X8(x), X8(x)
147static signed char const sq_lookup[256] = {
148 /* 0 1 2 3 4 5 6 7 */
149 /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 'a',
150 /* 0x08 */ 'b', 't', 'n', 'v', 'f', 'r', 1, 1,
151 /* 0x10 */ X16(1),
152 /* 0x20 */ -1, -1, '"', -1, -1, -1, -1, -1,
153 /* 0x28 */ X16(-1), X16(-1), X16(-1),
154 /* 0x58 */ -1, -1, -1, -1,'\\', -1, -1, -1,
155 /* 0x60 */ X16(-1), X8(-1),
156 /* 0x78 */ -1, -1, -1, -1, -1, -1, -1, 1,
157 /* 0x80 */ /* set to 0 */
158};
159
160static inline int sq_must_quote(char c)
161{
162 return sq_lookup[(unsigned char)c] + quote_path_fully > 0;
163}
164
165/* returns the longest prefix not needing a quote up to maxlen if positive.
166 This stops at the first \0 because it's marked as a character needing an
167 escape */
168static size_t next_quote_pos(const char *s, ssize_t maxlen)
169{
170 size_t len;
171 if (maxlen < 0) {
172 for (len = 0; !sq_must_quote(s[len]); len++);
173 } else {
174 for (len = 0; len < maxlen && !sq_must_quote(s[len]); len++);
175 }
176 return len;
177}
178
179/*
180 * C-style name quoting.
181 *
182 * (1) if sb and fp are both NULL, inspects the input name and counts the
183 * number of bytes that are needed to hold the c_style quoted version of name,
184 * counting the double quotes around it but not the terminating NUL, and
185 * returns it.
186 * However, if name does not need c_style quoting, it returns 0.
187 *
188 * (2) if sb or fp are not NULL, it emits the c_style quoted version
189 * of name, enclosed with double quotes if asked and needed only.
190 * Return value is the same as in (1).
191 */
192static size_t quote_c_style_counted(const char *name, ssize_t maxlen,
193 struct strbuf *sb, FILE *fp, int no_dq)
194{
195#undef EMIT
196#define EMIT(c) \
197 do { \
198 if (sb) strbuf_addch(sb, (c)); \
199 if (fp) fputc((c), fp); \
200 count++; \
201 } while (0)
202#define EMITBUF(s, l) \
203 do { \
204 if (sb) strbuf_add(sb, (s), (l)); \
205 if (fp) fwrite((s), (l), 1, fp); \
206 count += (l); \
207 } while (0)
208
209 size_t len, count = 0;
210 const char *p = name;
211
212 for (;;) {
213 int ch;
214
215 len = next_quote_pos(p, maxlen);
216 if (len == maxlen || !p[len])
217 break;
218
219 if (!no_dq && p == name)
220 EMIT('"');
221
222 EMITBUF(p, len);
223 EMIT('\\');
224 p += len;
225 ch = (unsigned char)*p++;
226 if (sq_lookup[ch] >= ' ') {
227 EMIT(sq_lookup[ch]);
228 } else {
229 EMIT(((ch >> 6) & 03) + '0');
230 EMIT(((ch >> 3) & 07) + '0');
231 EMIT(((ch >> 0) & 07) + '0');
232 }
233 }
234
235 EMITBUF(p, len);
236 if (p == name) /* no ending quote needed */
237 return 0;
238
239 if (!no_dq)
240 EMIT('"');
241 return count;
242}
243
244size_t quote_c_style(const char *name, struct strbuf *sb, FILE *fp, int nodq)
245{
246 return quote_c_style_counted(name, -1, sb, fp, nodq);
247}
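For instance, a name containing a real TAB is emitted as the two characters \t inside double quotes (sketch):

struct strbuf sb = STRBUF_INIT;

if (quote_c_style("tab\there", &sb, NULL, 0))
	printf("%s\n", sb.buf);		/* prints "tab\there", quotes included */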
248
249void quote_two_c_style(struct strbuf *sb, const char *prefix, const char *path, int nodq)
250{
251 if (quote_c_style(prefix, NULL, NULL, 0) ||
252 quote_c_style(path, NULL, NULL, 0)) {
253 if (!nodq)
254 strbuf_addch(sb, '"');
255 quote_c_style(prefix, sb, NULL, 1);
256 quote_c_style(path, sb, NULL, 1);
257 if (!nodq)
258 strbuf_addch(sb, '"');
259 } else {
260 strbuf_addstr(sb, prefix);
261 strbuf_addstr(sb, path);
262 }
263}
264
265void write_name_quoted(const char *name, FILE *fp, int terminator)
266{
267 if (terminator) {
268 quote_c_style(name, NULL, fp, 0);
269 } else {
270 fputs(name, fp);
271 }
272 fputc(terminator, fp);
273}
274
275extern void write_name_quotedpfx(const char *pfx, size_t pfxlen,
276 const char *name, FILE *fp, int terminator)
277{
278 int needquote = 0;
279
280 if (terminator) {
281 needquote = next_quote_pos(pfx, pfxlen) < pfxlen
282 || name[next_quote_pos(name, -1)];
283 }
284 if (needquote) {
285 fputc('"', fp);
286 quote_c_style_counted(pfx, pfxlen, NULL, fp, 1);
287 quote_c_style(name, NULL, fp, 1);
288 fputc('"', fp);
289 } else {
290 fwrite(pfx, pfxlen, 1, fp);
291 fputs(name, fp);
292 }
293 fputc(terminator, fp);
294}
295
296/* quote path as relative to the given prefix */
297char *quote_path_relative(const char *in, int len,
298 struct strbuf *out, const char *prefix)
299{
300 int needquote;
301
302 if (len < 0)
303 len = strlen(in);
304
305 /* "../" prefix itself does not need quoting, but "in" might. */
306 needquote = next_quote_pos(in, len) < len;
307 strbuf_setlen(out, 0);
308 strbuf_grow(out, len);
309
310 if (needquote)
311 strbuf_addch(out, '"');
312 if (prefix) {
313 int off = 0;
314 while (prefix[off] && off < len && prefix[off] == in[off])
315 if (prefix[off] == '/') {
316 prefix += off + 1;
317 in += off + 1;
318 len -= off + 1;
319 off = 0;
320 } else
321 off++;
322
323 for (; *prefix; prefix++)
324 if (*prefix == '/')
325 strbuf_addstr(out, "../");
326 }
327
328 quote_c_style_counted (in, len, out, NULL, 1);
329
330 if (needquote)
331 strbuf_addch(out, '"');
332 if (!out->len)
333 strbuf_addstr(out, "./");
334
335 return out->buf;
336}
337
338/*
339 * C-style name unquoting.
340 *
341 * Quoted should point at the opening double quote.
342 * + Returns 0 if it was able to unquote the string properly, and appends the
343 * result in the strbuf `sb'.
344 * + Returns -1 in case of error, and doesn't touch the strbuf. Though note
345 * that this function will allocate memory in the strbuf, so calling
346 * strbuf_release is mandatory whichever result unquote_c_style returns.
347 *
348 * Updates endp pointer to point at one past the ending double quote if given.
349 */
350int unquote_c_style(struct strbuf *sb, const char *quoted, const char **endp)
351{
352 size_t oldlen = sb->len, len;
353 int ch, ac;
354
355 if (*quoted++ != '"')
356 return -1;
357
358 for (;;) {
359 len = strcspn(quoted, "\"\\");
360 strbuf_add(sb, quoted, len);
361 quoted += len;
362
363 switch (*quoted++) {
364 case '"':
365 if (endp)
366 *endp = quoted;
367 return 0;
368 case '\\':
369 break;
370 default:
371 goto error;
372 }
373
374 switch ((ch = *quoted++)) {
375 case 'a': ch = '\a'; break;
376 case 'b': ch = '\b'; break;
377 case 'f': ch = '\f'; break;
378 case 'n': ch = '\n'; break;
379 case 'r': ch = '\r'; break;
380 case 't': ch = '\t'; break;
381 case 'v': ch = '\v'; break;
382
383 case '\\': case '"':
384 break; /* verbatim */
385
386 /* octal values with first digit over 4 overflow */
387 case '0': case '1': case '2': case '3':
388 ac = ((ch - '0') << 6);
389 if ((ch = *quoted++) < '0' || '7' < ch)
390 goto error;
391 ac |= ((ch - '0') << 3);
392 if ((ch = *quoted++) < '0' || '7' < ch)
393 goto error;
394 ac |= (ch - '0');
395 ch = ac;
396 break;
397 default:
398 goto error;
399 }
400 strbuf_addch(sb, ch);
401 }
402
403 error:
404 strbuf_setlen(sb, oldlen);
405 return -1;
406}
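And the inverse direction, on a quoted literal such as the one above (sketch):

struct strbuf out = STRBUF_INIT;
const char *end;

/* the C spelling "\"a\\tb\\041\"" is the 10-byte input "a\tb\041" */
if (!unquote_c_style(&out, "\"a\\tb\\041\"", &end))
	printf("%s\n", out.buf);	/* a<TAB>b! -- \041 is octal for '!' */
strbuf_release(&out);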
407
408/* quoting as a string literal for other languages */
409
410void perl_quote_print(FILE *stream, const char *src)
411{
412 const char sq = '\'';
413 const char bq = '\\';
414 char c;
415
416 fputc(sq, stream);
417 while ((c = *src++)) {
418 if (c == sq || c == bq)
419 fputc(bq, stream);
420 fputc(c, stream);
421 }
422 fputc(sq, stream);
423}
424
425void python_quote_print(FILE *stream, const char *src)
426{
427 const char sq = '\'';
428 const char bq = '\\';
429 const char nl = '\n';
430 char c;
431
432 fputc(sq, stream);
433 while ((c = *src++)) {
434 if (c == nl) {
435 fputc(bq, stream);
436 fputc('n', stream);
437 continue;
438 }
439 if (c == sq || c == bq)
440 fputc(bq, stream);
441 fputc(c, stream);
442 }
443 fputc(sq, stream);
444}
445
446void tcl_quote_print(FILE *stream, const char *src)
447{
448 char c;
449
450 fputc('"', stream);
451 while ((c = *src++)) {
452 switch (c) {
453 case '[': case ']':
454 case '{': case '}':
455 case '$': case '\\': case '"':
456 fputc('\\', stream);
457 default:
458 fputc(c, stream);
459 break;
460 case '\f':
461 fputs("\\f", stream);
462 break;
463 case '\r':
464 fputs("\\r", stream);
465 break;
466 case '\n':
467 fputs("\\n", stream);
468 break;
469 case '\t':
470 fputs("\\t", stream);
471 break;
472 case '\v':
473 fputs("\\v", stream);
474 break;
475 }
476 }
477 fputc('"', stream);
478}
diff --git a/Documentation/perf_counter/util/quote.h b/Documentation/perf_counter/util/quote.h
new file mode 100644
index 000000000000..5dfad89816db
--- /dev/null
+++ b/Documentation/perf_counter/util/quote.h
@@ -0,0 +1,68 @@
1#ifndef QUOTE_H
2#define QUOTE_H
3
4#include <stddef.h>
5#include <stdio.h>
6
7/* Help to copy the thing properly quoted for the shell safety.
8 * any single quote is replaced with '\'', any exclamation point
9 * is replaced with '\!', and the whole thing is enclosed in a
10 * single quote pair.
11 *
12 * For example, if you are passing the result to system() as an
13 * argument:
14 *
15 * sprintf(cmd, "foobar %s %s", sq_quote(arg0), sq_quote(arg1))
16 *
17 * would be appropriate. If the system() is going to call ssh to
18 * run the command on the other side:
19 *
20 * sprintf(cmd, "git-diff-tree %s %s", sq_quote(arg0), sq_quote(arg1));
21 * sprintf(rcmd, "ssh %s %s", sq_quote(host), sq_quote(cmd));
22 *
23 * Note that the above examples leak memory! Remember to free result from
24 * sq_quote() in a real application.
25 *
26 * sq_quote_buf() appends the quoted form of src to an existing strbuf,
27 * growing it as needed; it is safe to pass dst->buf itself as src.
29 */
30
31extern void sq_quote_print(FILE *stream, const char *src);
32
33extern void sq_quote_buf(struct strbuf *, const char *src);
34extern void sq_quote_argv(struct strbuf *, const char **argv, size_t maxlen);
35
36/* This unwraps what sq_quote() produces in place, but returns
37 * NULL if the input does not look like what sq_quote would have
38 * produced.
39 */
40extern char *sq_dequote(char *);
41
42/*
43 * Same as the above, but can be used to unwrap many arguments in the
44 * same string separated by space. "next" is changed to point to the
45 * next argument that should be passed as first parameter. When there
46 * is no more argument to be dequoted, "next" is updated to point to NULL.
47 */
48extern char *sq_dequote_step(char *arg, char **next);
49extern int sq_dequote_to_argv(char *arg, const char ***argv, int *nr, int *alloc);
50
51extern int unquote_c_style(struct strbuf *, const char *quoted, const char **endp);
52extern size_t quote_c_style(const char *name, struct strbuf *, FILE *, int no_dq);
53extern void quote_two_c_style(struct strbuf *, const char *, const char *, int);
54
55extern void write_name_quoted(const char *name, FILE *, int terminator);
56extern void write_name_quotedpfx(const char *pfx, size_t pfxlen,
57 const char *name, FILE *, int terminator);
58
59/* quote path as relative to the given prefix */
60char *quote_path_relative(const char *in, int len,
61 struct strbuf *out, const char *prefix);
62
63/* quoting as a string literal for other languages */
64extern void perl_quote_print(FILE *stream, const char *src);
65extern void python_quote_print(FILE *stream, const char *src);
66extern void tcl_quote_print(FILE *stream, const char *src);
67
68#endif
diff --git a/Documentation/perf_counter/util/run-command.c b/Documentation/perf_counter/util/run-command.c
new file mode 100644
index 000000000000..b2f5e854f40a
--- /dev/null
+++ b/Documentation/perf_counter/util/run-command.c
@@ -0,0 +1,395 @@
1#include "cache.h"
2#include "run-command.h"
3#include "exec_cmd.h"
4
5static inline void close_pair(int fd[2])
6{
7 close(fd[0]);
8 close(fd[1]);
9}
10
11static inline void dup_devnull(int to)
12{
13 int fd = open("/dev/null", O_RDWR);
14 dup2(fd, to);
15 close(fd);
16}
17
18int start_command(struct child_process *cmd)
19{
20 int need_in, need_out, need_err;
21 int fdin[2], fdout[2], fderr[2];
22
23 /*
24 * In case of errors we must keep the promise to close FDs
25 * that have been passed in via ->in and ->out.
26 */
27
28 need_in = !cmd->no_stdin && cmd->in < 0;
29 if (need_in) {
30 if (pipe(fdin) < 0) {
31 if (cmd->out > 0)
32 close(cmd->out);
33 return -ERR_RUN_COMMAND_PIPE;
34 }
35 cmd->in = fdin[1];
36 }
37
38 need_out = !cmd->no_stdout
39 && !cmd->stdout_to_stderr
40 && cmd->out < 0;
41 if (need_out) {
42 if (pipe(fdout) < 0) {
43 if (need_in)
44 close_pair(fdin);
45 else if (cmd->in)
46 close(cmd->in);
47 return -ERR_RUN_COMMAND_PIPE;
48 }
49 cmd->out = fdout[0];
50 }
51
52 need_err = !cmd->no_stderr && cmd->err < 0;
53 if (need_err) {
54 if (pipe(fderr) < 0) {
55 if (need_in)
56 close_pair(fdin);
57 else if (cmd->in)
58 close(cmd->in);
59 if (need_out)
60 close_pair(fdout);
61 else if (cmd->out)
62 close(cmd->out);
63 return -ERR_RUN_COMMAND_PIPE;
64 }
65 cmd->err = fderr[0];
66 }
67
68#ifndef __MINGW32__
69 fflush(NULL);
70 cmd->pid = fork();
71 if (!cmd->pid) {
72 if (cmd->no_stdin)
73 dup_devnull(0);
74 else if (need_in) {
75 dup2(fdin[0], 0);
76 close_pair(fdin);
77 } else if (cmd->in) {
78 dup2(cmd->in, 0);
79 close(cmd->in);
80 }
81
82 if (cmd->no_stderr)
83 dup_devnull(2);
84 else if (need_err) {
85 dup2(fderr[1], 2);
86 close_pair(fderr);
87 }
88
89 if (cmd->no_stdout)
90 dup_devnull(1);
91 else if (cmd->stdout_to_stderr)
92 dup2(2, 1);
93 else if (need_out) {
94 dup2(fdout[1], 1);
95 close_pair(fdout);
96 } else if (cmd->out > 1) {
97 dup2(cmd->out, 1);
98 close(cmd->out);
99 }
100
101 if (cmd->dir && chdir(cmd->dir))
102 die("exec %s: cd to %s failed (%s)", cmd->argv[0],
103 cmd->dir, strerror(errno));
104 if (cmd->env) {
105 for (; *cmd->env; cmd->env++) {
106 if (strchr(*cmd->env, '='))
107 putenv((char*)*cmd->env);
108 else
109 unsetenv(*cmd->env);
110 }
111 }
112 if (cmd->preexec_cb)
113 cmd->preexec_cb();
114 if (cmd->perf_cmd) {
115 execv_perf_cmd(cmd->argv);
116 } else {
117 execvp(cmd->argv[0], (char *const*) cmd->argv);
118 }
119 exit(127);
120 }
121#else
122 int s0 = -1, s1 = -1, s2 = -1; /* backups of stdin, stdout, stderr */
123 const char **sargv = cmd->argv;
124 char **env = environ;
125
126 if (cmd->no_stdin) {
127 s0 = dup(0);
128 dup_devnull(0);
129 } else if (need_in) {
130 s0 = dup(0);
131 dup2(fdin[0], 0);
132 } else if (cmd->in) {
133 s0 = dup(0);
134 dup2(cmd->in, 0);
135 }
136
137 if (cmd->no_stderr) {
138 s2 = dup(2);
139 dup_devnull(2);
140 } else if (need_err) {
141 s2 = dup(2);
142 dup2(fderr[1], 2);
143 }
144
145 if (cmd->no_stdout) {
146 s1 = dup(1);
147 dup_devnull(1);
148 } else if (cmd->stdout_to_stderr) {
149 s1 = dup(1);
150 dup2(2, 1);
151 } else if (need_out) {
152 s1 = dup(1);
153 dup2(fdout[1], 1);
154 } else if (cmd->out > 1) {
155 s1 = dup(1);
156 dup2(cmd->out, 1);
157 }
158
159 if (cmd->dir)
160 die("chdir in start_command() not implemented");
161 if (cmd->env) {
162 env = copy_environ();
163 for (; *cmd->env; cmd->env++)
164 env = env_setenv(env, *cmd->env);
165 }
166
167 if (cmd->perf_cmd) {
168 cmd->argv = prepare_perf_cmd(cmd->argv);
169 }
170
171 cmd->pid = mingw_spawnvpe(cmd->argv[0], cmd->argv, env);
172
173 if (cmd->env)
174 free_environ(env);
175 if (cmd->perf_cmd)
176 free(cmd->argv);
177
178 cmd->argv = sargv;
179 if (s0 >= 0)
180 dup2(s0, 0), close(s0);
181 if (s1 >= 0)
182 dup2(s1, 1), close(s1);
183 if (s2 >= 0)
184 dup2(s2, 2), close(s2);
185#endif
186
187 if (cmd->pid < 0) {
188 int err = errno;
189 if (need_in)
190 close_pair(fdin);
191 else if (cmd->in)
192 close(cmd->in);
193 if (need_out)
194 close_pair(fdout);
195 else if (cmd->out)
196 close(cmd->out);
197 if (need_err)
198 close_pair(fderr);
199 return err == ENOENT ?
200 -ERR_RUN_COMMAND_EXEC :
201 -ERR_RUN_COMMAND_FORK;
202 }
203
204 if (need_in)
205 close(fdin[0]);
206 else if (cmd->in)
207 close(cmd->in);
208
209 if (need_out)
210 close(fdout[1]);
211 else if (cmd->out)
212 close(cmd->out);
213
214 if (need_err)
215 close(fderr[1]);
216
217 return 0;
218}
219
220static int wait_or_whine(pid_t pid)
221{
222 for (;;) {
223 int status, code;
224 pid_t waiting = waitpid(pid, &status, 0);
225
226 if (waiting < 0) {
227 if (errno == EINTR)
228 continue;
229 error("waitpid failed (%s)", strerror(errno));
230 return -ERR_RUN_COMMAND_WAITPID;
231 }
232 if (waiting != pid)
233 return -ERR_RUN_COMMAND_WAITPID_WRONG_PID;
234 if (WIFSIGNALED(status))
235 return -ERR_RUN_COMMAND_WAITPID_SIGNAL;
236
237 if (!WIFEXITED(status))
238 return -ERR_RUN_COMMAND_WAITPID_NOEXIT;
239 code = WEXITSTATUS(status);
240 switch (code) {
241 case 127:
242 return -ERR_RUN_COMMAND_EXEC;
243 case 0:
244 return 0;
245 default:
246 return -code;
247 }
248 }
249}
250
251int finish_command(struct child_process *cmd)
252{
253 return wait_or_whine(cmd->pid);
254}
255
256int run_command(struct child_process *cmd)
257{
258 int code = start_command(cmd);
259 if (code)
260 return code;
261 return finish_command(cmd);
262}
263
264static void prepare_run_command_v_opt(struct child_process *cmd,
265 const char **argv,
266 int opt)
267{
268 memset(cmd, 0, sizeof(*cmd));
269 cmd->argv = argv;
270 cmd->no_stdin = opt & RUN_COMMAND_NO_STDIN ? 1 : 0;
271 cmd->perf_cmd = opt & RUN_PERF_CMD ? 1 : 0;
272 cmd->stdout_to_stderr = opt & RUN_COMMAND_STDOUT_TO_STDERR ? 1 : 0;
273}
274
275int run_command_v_opt(const char **argv, int opt)
276{
277 struct child_process cmd;
278 prepare_run_command_v_opt(&cmd, argv, opt);
279 return run_command(&cmd);
280}
281
282int run_command_v_opt_cd_env(const char **argv, int opt, const char *dir, const char *const *env)
283{
284 struct child_process cmd;
285 prepare_run_command_v_opt(&cmd, argv, opt);
286 cmd.dir = dir;
287 cmd.env = env;
288 return run_command(&cmd);
289}
290
291#ifdef __MINGW32__
292static __stdcall unsigned run_thread(void *data)
293{
294 struct async *async = data;
295 return async->proc(async->fd_for_proc, async->data);
296}
297#endif
298
299int start_async(struct async *async)
300{
301 int pipe_out[2];
302
303 if (pipe(pipe_out) < 0)
304 return error("cannot create pipe: %s", strerror(errno));
305 async->out = pipe_out[0];
306
307#ifndef __MINGW32__
308 /* Flush stdio before fork() to avoid cloning buffers */
309 fflush(NULL);
310
311 async->pid = fork();
312 if (async->pid < 0) {
313 error("fork (async) failed: %s", strerror(errno));
314 close_pair(pipe_out);
315 return -1;
316 }
317 if (!async->pid) {
318 close(pipe_out[0]);
319 exit(!!async->proc(pipe_out[1], async->data));
320 }
321 close(pipe_out[1]);
322#else
323 async->fd_for_proc = pipe_out[1];
324 async->tid = (HANDLE) _beginthreadex(NULL, 0, run_thread, async, 0, NULL);
325 if (!async->tid) {
326 error("cannot create thread: %s", strerror(errno));
327 close_pair(pipe_out);
328 return -1;
329 }
330#endif
331 return 0;
332}
333
334int finish_async(struct async *async)
335{
336#ifndef __MINGW32__
337 int ret = 0;
338
339 if (wait_or_whine(async->pid))
340 ret = error("waitpid (async) failed");
341#else
342 DWORD ret = 0;
343 if (WaitForSingleObject(async->tid, INFINITE) != WAIT_OBJECT_0)
344 ret = error("waiting for thread failed: %lu", GetLastError());
345 else if (!GetExitCodeThread(async->tid, &ret))
346 ret = error("cannot get thread exit code: %lu", GetLastError());
347 CloseHandle(async->tid);
348#endif
349 return ret;
350}
351
352int run_hook(const char *index_file, const char *name, ...)
353{
354 struct child_process hook;
355 const char **argv = NULL, *env[2];
356 char index[PATH_MAX];
357 va_list args;
358 int ret;
359 size_t i = 0, alloc = 0;
360
361 if (access(perf_path("hooks/%s", name), X_OK) < 0)
362 return 0;
363
364 va_start(args, name);
365 ALLOC_GROW(argv, i + 1, alloc);
366 argv[i++] = perf_path("hooks/%s", name);
367 while (argv[i-1]) {
368 ALLOC_GROW(argv, i + 1, alloc);
369 argv[i++] = va_arg(args, const char *);
370 }
371 va_end(args);
372
373 memset(&hook, 0, sizeof(hook));
374 hook.argv = argv;
375 hook.no_stdin = 1;
376 hook.stdout_to_stderr = 1;
377 if (index_file) {
378 snprintf(index, sizeof(index), "PERF_INDEX_FILE=%s", index_file);
379 env[0] = index;
380 env[1] = NULL;
381 hook.env = env;
382 }
383
384 ret = start_command(&hook);
385 if (ret) {
386 warning("Could not spawn %s", argv[0]);
387 free(argv);
388 return ret;
389 }
390 ret = finish_command(&hook);
391 if (ret == -ERR_RUN_COMMAND_WAITPID_SIGNAL)
392 warning("%s exited due to uncaught signal", argv[0]);
393 free(argv);
394 return ret;
395}
diff --git a/Documentation/perf_counter/util/run-command.h b/Documentation/perf_counter/util/run-command.h
new file mode 100644
index 000000000000..328289f23669
--- /dev/null
+++ b/Documentation/perf_counter/util/run-command.h
@@ -0,0 +1,93 @@
1#ifndef RUN_COMMAND_H
2#define RUN_COMMAND_H
3
4enum {
5 ERR_RUN_COMMAND_FORK = 10000,
6 ERR_RUN_COMMAND_EXEC,
7 ERR_RUN_COMMAND_PIPE,
8 ERR_RUN_COMMAND_WAITPID,
9 ERR_RUN_COMMAND_WAITPID_WRONG_PID,
10 ERR_RUN_COMMAND_WAITPID_SIGNAL,
11 ERR_RUN_COMMAND_WAITPID_NOEXIT,
12};
13#define IS_RUN_COMMAND_ERR(x) (-(x) >= ERR_RUN_COMMAND_FORK)
14
15struct child_process {
16 const char **argv;
17 pid_t pid;
18 /*
19 * Using .in, .out, .err:
20 * - Specify 0 for no redirections (child inherits stdin, stdout,
21 * stderr from parent).
22 * - Specify -1 to have a pipe allocated as follows:
23 * .in: returns the writable pipe end; parent writes to it,
24 * the readable pipe end becomes child's stdin
25 * .out, .err: returns the readable pipe end; parent reads from
26 * it, the writable pipe end becomes child's stdout/stderr
27 * The caller of start_command() must close the returned FDs
28 * after it has completed reading from/writing to it!
29 * - Specify > 0 to set a channel to a particular FD as follows:
30 * .in: a readable FD, becomes child's stdin
31 * .out: a writable FD, becomes child's stdout/stderr
32 * .err > 0 not supported
33 * The specified FD is closed by start_command(), even in case
34 * of errors!
35 */
36 int in;
37 int out;
38 int err;
39 const char *dir;
40 const char *const *env;
41 unsigned no_stdin:1;
42 unsigned no_stdout:1;
43 unsigned no_stderr:1;
44 unsigned perf_cmd:1; /* if this is to be perf sub-command */
45 unsigned stdout_to_stderr:1;
46 void (*preexec_cb)(void);
47};
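A sketch of the .out = -1 convention described above, reading a child's stdout through the allocated pipe (the command is arbitrary):

struct child_process cp;
struct strbuf output = STRBUF_INIT;
const char *argv[] = { "uname", "-r", NULL };

memset(&cp, 0, sizeof(cp));
cp.argv = argv;
cp.no_stdin = 1;
cp.out = -1;			/* ask start_command() to allocate a pipe */
if (!start_command(&cp)) {
	strbuf_read(&output, cp.out, 0);
	close(cp.out);		/* the caller must close the returned FD */
	finish_command(&cp);
}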
48
49int start_command(struct child_process *);
50int finish_command(struct child_process *);
51int run_command(struct child_process *);
52
53extern int run_hook(const char *index_file, const char *name, ...);
54
55#define RUN_COMMAND_NO_STDIN 1
56#define RUN_PERF_CMD 2 /* if this is to be a perf sub-command */
57#define RUN_COMMAND_STDOUT_TO_STDERR 4
58int run_command_v_opt(const char **argv, int opt);
59
60/*
61 * env (the environment) is to be formatted like environ: "VAR=VALUE".
62 * To unset an environment variable use just "VAR".
63 */
64int run_command_v_opt_cd_env(const char **argv, int opt, const char *dir, const char *const *env);
65
66/*
67 * The purpose of the following functions is to feed a pipe by running
68 * a function asynchronously and providing output that the caller reads.
69 *
70 * It is expected that no synchronization and mutual exclusion between
71 * the caller and the feed function is necessary so that the function
72 * can run in a thread without interfering with the caller.
73 */
74struct async {
75 /*
76 * proc writes to fd and closes it;
77 * returns 0 on success, non-zero on failure
78 */
79 int (*proc)(int fd, void *data);
80 void *data;
81 int out; /* caller reads from here and closes it */
82#ifndef __MINGW32__
83 pid_t pid;
84#else
85 HANDLE tid;
86 int fd_for_proc;
87#endif
88};
89
90int start_async(struct async *async);
91int finish_async(struct async *async);
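A minimal sketch of the async pipe-feeding pattern (the feed function and its message are made up):

static int feed(int fd, void *data)
{
	write_in_full(fd, data, strlen(data));
	close(fd);
	return 0;
}

/* ... in the caller ... */
struct async as = { .proc = feed, .data = "hello\n" };
struct strbuf sb = STRBUF_INIT;

if (!start_async(&as)) {
	strbuf_read(&sb, as.out, 0);
	close(as.out);			/* caller reads from as.out, then closes it */
	finish_async(&as);
}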
92
93#endif
diff --git a/Documentation/perf_counter/util/strbuf.c b/Documentation/perf_counter/util/strbuf.c
new file mode 100644
index 000000000000..eaba09306802
--- /dev/null
+++ b/Documentation/perf_counter/util/strbuf.c
@@ -0,0 +1,359 @@
1#include "cache.h"
2
3int prefixcmp(const char *str, const char *prefix)
4{
5 for (; ; str++, prefix++)
6 if (!*prefix)
7 return 0;
8 else if (*str != *prefix)
9 return (unsigned char)*prefix - (unsigned char)*str;
10}
11
12/*
13 * Used as the default ->buf value, so that people can always assume
14 * buf is non-NULL and ->buf is NUL terminated even for a freshly
15 * initialized strbuf.
16 */
17char strbuf_slopbuf[1];
18
19void strbuf_init(struct strbuf *sb, size_t hint)
20{
21 sb->alloc = sb->len = 0;
22 sb->buf = strbuf_slopbuf;
23 if (hint)
24 strbuf_grow(sb, hint);
25}
26
27void strbuf_release(struct strbuf *sb)
28{
29 if (sb->alloc) {
30 free(sb->buf);
31 strbuf_init(sb, 0);
32 }
33}
34
35char *strbuf_detach(struct strbuf *sb, size_t *sz)
36{
37 char *res = sb->alloc ? sb->buf : NULL;
38 if (sz)
39 *sz = sb->len;
40 strbuf_init(sb, 0);
41 return res;
42}
43
44void strbuf_attach(struct strbuf *sb, void *buf, size_t len, size_t alloc)
45{
46 strbuf_release(sb);
47 sb->buf = buf;
48 sb->len = len;
49 sb->alloc = alloc;
50 strbuf_grow(sb, 0);
51 sb->buf[sb->len] = '\0';
52}
53
54void strbuf_grow(struct strbuf *sb, size_t extra)
55{
56 if (sb->len + extra + 1 <= sb->len)
57 die("you want to use way too much memory");
58 if (!sb->alloc)
59 sb->buf = NULL;
60 ALLOC_GROW(sb->buf, sb->len + extra + 1, sb->alloc);
61}
62
63void strbuf_trim(struct strbuf *sb)
64{
65 char *b = sb->buf;
66 while (sb->len > 0 && isspace((unsigned char)sb->buf[sb->len - 1]))
67 sb->len--;
68 while (sb->len > 0 && isspace(*b)) {
69 b++;
70 sb->len--;
71 }
72 memmove(sb->buf, b, sb->len);
73 sb->buf[sb->len] = '\0';
74}
75void strbuf_rtrim(struct strbuf *sb)
76{
77 while (sb->len > 0 && isspace((unsigned char)sb->buf[sb->len - 1]))
78 sb->len--;
79 sb->buf[sb->len] = '\0';
80}
81
82void strbuf_ltrim(struct strbuf *sb)
83{
84 char *b = sb->buf;
85 while (sb->len > 0 && isspace(*b)) {
86 b++;
87 sb->len--;
88 }
89 memmove(sb->buf, b, sb->len);
90 sb->buf[sb->len] = '\0';
91}
92
93void strbuf_tolower(struct strbuf *sb)
94{
95 int i;
96 for (i = 0; i < sb->len; i++)
97 sb->buf[i] = tolower(sb->buf[i]);
98}
99
100struct strbuf **strbuf_split(const struct strbuf *sb, int delim)
101{
102 int alloc = 2, pos = 0;
103 char *n, *p;
104 struct strbuf **ret;
105 struct strbuf *t;
106
107 ret = calloc(alloc, sizeof(struct strbuf *));
108 p = n = sb->buf;
109 while (n < sb->buf + sb->len) {
110 int len;
111 n = memchr(n, delim, sb->len - (n - sb->buf));
112 if (pos + 1 >= alloc) {
113 alloc = alloc * 2;
114 ret = realloc(ret, sizeof(struct strbuf *) * alloc);
115 }
116 if (!n)
117 n = sb->buf + sb->len - 1;
118 len = n - p + 1;
119 t = malloc(sizeof(struct strbuf));
120 strbuf_init(t, len);
121 strbuf_add(t, p, len);
122 ret[pos] = t;
123 ret[++pos] = NULL;
124 p = ++n;
125 }
126 return ret;
127}
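Each returned piece keeps its trailing delimiter; for example (sketch):

struct strbuf sb = STRBUF_INIT;
struct strbuf **parts;

strbuf_addstr(&sb, "cpu-cycles,instructions,cache-misses");
parts = strbuf_split(&sb, ',');
/* parts[0]->buf == "cpu-cycles,", parts[1]->buf == "instructions,",
 * parts[2]->buf == "cache-misses", parts[3] == NULL */
strbuf_list_free(parts);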
128
129void strbuf_list_free(struct strbuf **sbs)
130{
131 struct strbuf **s = sbs;
132
133 while (*s) {
134 strbuf_release(*s);
135 free(*s++);
136 }
137 free(sbs);
138}
139
140int strbuf_cmp(const struct strbuf *a, const struct strbuf *b)
141{
142 int len = a->len < b->len ? a->len: b->len;
143 int cmp = memcmp(a->buf, b->buf, len);
144 if (cmp)
145 return cmp;
146 return a->len < b->len ? -1: a->len != b->len;
147}
148
149void strbuf_splice(struct strbuf *sb, size_t pos, size_t len,
150 const void *data, size_t dlen)
151{
152 if (pos + len < pos)
153 die("you want to use way too much memory");
154 if (pos > sb->len)
155 die("`pos' is too far after the end of the buffer");
156 if (pos + len > sb->len)
157 die("`pos + len' is too far after the end of the buffer");
158
159 if (dlen >= len)
160 strbuf_grow(sb, dlen - len);
161 memmove(sb->buf + pos + dlen,
162 sb->buf + pos + len,
163 sb->len - pos - len);
164 memcpy(sb->buf + pos, data, dlen);
165 strbuf_setlen(sb, sb->len + dlen - len);
166}
167
168void strbuf_insert(struct strbuf *sb, size_t pos, const void *data, size_t len)
169{
170 strbuf_splice(sb, pos, 0, data, len);
171}
172
173void strbuf_remove(struct strbuf *sb, size_t pos, size_t len)
174{
175 strbuf_splice(sb, pos, len, NULL, 0);
176}
177
178void strbuf_add(struct strbuf *sb, const void *data, size_t len)
179{
180 strbuf_grow(sb, len);
181 memcpy(sb->buf + sb->len, data, len);
182 strbuf_setlen(sb, sb->len + len);
183}
184
185void strbuf_adddup(struct strbuf *sb, size_t pos, size_t len)
186{
187 strbuf_grow(sb, len);
188 memcpy(sb->buf + sb->len, sb->buf + pos, len);
189 strbuf_setlen(sb, sb->len + len);
190}
191
192void strbuf_addf(struct strbuf *sb, const char *fmt, ...)
193{
194 int len;
195 va_list ap;
196
197 if (!strbuf_avail(sb))
198 strbuf_grow(sb, 64);
199 va_start(ap, fmt);
200 len = vsnprintf(sb->buf + sb->len, sb->alloc - sb->len, fmt, ap);
201 va_end(ap);
202 if (len < 0)
203 die("your vsnprintf is broken");
204 if (len > strbuf_avail(sb)) {
205 strbuf_grow(sb, len);
206 va_start(ap, fmt);
207 len = vsnprintf(sb->buf + sb->len, sb->alloc - sb->len, fmt, ap);
208 va_end(ap);
209 if (len > strbuf_avail(sb)) {
210 die("this should not happen, your snprintf is broken");
211 }
212 }
213 strbuf_setlen(sb, sb->len + len);
214}
215
216void strbuf_expand(struct strbuf *sb, const char *format, expand_fn_t fn,
217 void *context)
218{
219 for (;;) {
220 const char *percent;
221 size_t consumed;
222
223 percent = strchrnul(format, '%');
224 strbuf_add(sb, format, percent - format);
225 if (!*percent)
226 break;
227 format = percent + 1;
228
229 consumed = fn(sb, format, context);
230 if (consumed)
231 format += consumed;
232 else
233 strbuf_addch(sb, '%');
234 }
235}
236
237size_t strbuf_expand_dict_cb(struct strbuf *sb, const char *placeholder,
238 void *context)
239{
240 struct strbuf_expand_dict_entry *e = context;
241 size_t len;
242
243 for (; e->placeholder && (len = strlen(e->placeholder)); e++) {
244 if (!strncmp(placeholder, e->placeholder, len)) {
245 if (e->value)
246 strbuf_addstr(sb, e->value);
247 return len;
248 }
249 }
250 return 0;
251}
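strbuf_expand() with the dictionary callback gives simple placeholder substitution; a sketch with a made-up format string:

struct strbuf_expand_dict_entry dict[] = {
	{ "event", "cpu-cycles" },
	{ NULL, NULL },
};
struct strbuf sb = STRBUF_INIT;

strbuf_expand(&sb, "counting %event on all CPUs", strbuf_expand_dict_cb, dict);
/* sb.buf == "counting cpu-cycles on all CPUs"; an unknown %x stays as "%x" */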
252
253size_t strbuf_fread(struct strbuf *sb, size_t size, FILE *f)
254{
255 size_t res;
256 size_t oldalloc = sb->alloc;
257
258 strbuf_grow(sb, size);
259 res = fread(sb->buf + sb->len, 1, size, f);
260 if (res > 0)
261 strbuf_setlen(sb, sb->len + res);
262 else if (res < 0 && oldalloc == 0)
263 strbuf_release(sb);
264 return res;
265}
266
267ssize_t strbuf_read(struct strbuf *sb, int fd, size_t hint)
268{
269 size_t oldlen = sb->len;
270 size_t oldalloc = sb->alloc;
271
272 strbuf_grow(sb, hint ? hint : 8192);
273 for (;;) {
274 ssize_t cnt;
275
276 cnt = read(fd, sb->buf + sb->len, sb->alloc - sb->len - 1);
277 if (cnt < 0) {
278 if (oldalloc == 0)
279 strbuf_release(sb);
280 else
281 strbuf_setlen(sb, oldlen);
282 return -1;
283 }
284 if (!cnt)
285 break;
286 sb->len += cnt;
287 strbuf_grow(sb, 8192);
288 }
289
290 sb->buf[sb->len] = '\0';
291 return sb->len - oldlen;
292}
293
294#define STRBUF_MAXLINK (2*PATH_MAX)
295
296int strbuf_readlink(struct strbuf *sb, const char *path, size_t hint)
297{
298 size_t oldalloc = sb->alloc;
299
300 if (hint < 32)
301 hint = 32;
302
303 while (hint < STRBUF_MAXLINK) {
304 int len;
305
306 strbuf_grow(sb, hint);
307 len = readlink(path, sb->buf, hint);
308 if (len < 0) {
309 if (errno != ERANGE)
310 break;
311 } else if (len < hint) {
312 strbuf_setlen(sb, len);
313 return 0;
314 }
315
316 /* .. the buffer was too small - try again */
317 hint *= 2;
318 }
319 if (oldalloc == 0)
320 strbuf_release(sb);
321 return -1;
322}
323
324int strbuf_getline(struct strbuf *sb, FILE *fp, int term)
325{
326 int ch;
327
328 strbuf_grow(sb, 0);
329 if (feof(fp))
330 return EOF;
331
332 strbuf_reset(sb);
333 while ((ch = fgetc(fp)) != EOF) {
334 if (ch == term)
335 break;
336 strbuf_grow(sb, 1);
337 sb->buf[sb->len++] = ch;
338 }
339 if (ch == EOF && sb->len == 0)
340 return EOF;
341
342 sb->buf[sb->len] = '\0';
343 return 0;
344}
345
346int strbuf_read_file(struct strbuf *sb, const char *path, size_t hint)
347{
348 int fd, len;
349
350 fd = open(path, O_RDONLY);
351 if (fd < 0)
352 return -1;
353 len = strbuf_read(sb, fd, hint);
354 close(fd);
355 if (len < 0)
356 return -1;
357
358 return len;
359}
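Typical file-slurping usage, assuming /proc/version exists on the system:

struct strbuf sb = STRBUF_INIT;

if (strbuf_read_file(&sb, "/proc/version", 0) < 0)
	die("cannot read /proc/version: %s", strerror(errno));
strbuf_rtrim(&sb);		/* drop the trailing newline */
printf("kernel: %s\n", sb.buf);
strbuf_release(&sb);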
diff --git a/Documentation/perf_counter/util/strbuf.h b/Documentation/perf_counter/util/strbuf.h
new file mode 100644
index 000000000000..9ee908a3ec5d
--- /dev/null
+++ b/Documentation/perf_counter/util/strbuf.h
@@ -0,0 +1,137 @@
1#ifndef STRBUF_H
2#define STRBUF_H
3
4/*
5 * Strbufs can be used in many ways: as a byte array, or to store arbitrarily
6 * long, overflow-safe strings.
7 *
8 * Strbufs have some invariants that are very important to keep in mind:
9 *
10 * 1. the ->buf member is always malloc-ed, hence strbufs can be used to
11 * build complex strings/buffers whose final size isn't easily known.
12 *
13 * It is NOT legal to copy the ->buf pointer away.
14 * `strbuf_detach' is the operation that detaches a buffer from its shell
15 * while keeping the shell valid wrt its invariants.
16 *
17 * 2. the ->buf member is a byte array that has at least ->len + 1 bytes
18 * allocated. The extra byte is used to store a '\0', allowing the ->buf
19 * member to be a valid C-string. Every strbuf function ensures this
20 * invariant is preserved.
21 *
22 * Note that it is OK to "play" with the buffer directly if you work it
23 * that way:
24 *
25 * strbuf_grow(sb, SOME_SIZE);
26 * ... Here, the memory array starting at sb->buf, and of length
27 * ... strbuf_avail(sb) is all yours, and you are sure that
28 * ... strbuf_avail(sb) is at least SOME_SIZE.
29 * strbuf_setlen(sb, sb->len + SOME_OTHER_SIZE);
30 *
31 * Of course, SOME_OTHER_SIZE must be smaller or equal to strbuf_avail(sb).
32 *
33 * Doing so is safe, though if it has to be done in many places, adding the
34 * missing API to the strbuf module is the way to go.
35 *
36 * XXX: do _not_ assume that the area that is yours is of size ->alloc - 1
37 * even if it's true in the current implementation. Alloc is somehow a
38 * "private" member that should not be messed with.
39 */
40
41#include <assert.h>
42
43extern char strbuf_slopbuf[];
44struct strbuf {
45 size_t alloc;
46 size_t len;
47 char *buf;
48};
49
50#define STRBUF_INIT { 0, 0, strbuf_slopbuf }
51
52/*----- strbuf life cycle -----*/
53extern void strbuf_init(struct strbuf *, size_t);
54extern void strbuf_release(struct strbuf *);
55extern char *strbuf_detach(struct strbuf *, size_t *);
56extern void strbuf_attach(struct strbuf *, void *, size_t, size_t);
57static inline void strbuf_swap(struct strbuf *a, struct strbuf *b) {
58 struct strbuf tmp = *a;
59 *a = *b;
60 *b = tmp;
61}
62
63/*----- strbuf size related -----*/
64static inline size_t strbuf_avail(const struct strbuf *sb) {
65 return sb->alloc ? sb->alloc - sb->len - 1 : 0;
66}
67
68extern void strbuf_grow(struct strbuf *, size_t);
69
70static inline void strbuf_setlen(struct strbuf *sb, size_t len) {
71 if (!sb->alloc)
72 strbuf_grow(sb, 0);
73 assert(len < sb->alloc);
74 sb->len = len;
75 sb->buf[len] = '\0';
76}
77#define strbuf_reset(sb) strbuf_setlen(sb, 0)
78
79/*----- content related -----*/
80extern void strbuf_trim(struct strbuf *);
81extern void strbuf_rtrim(struct strbuf *);
82extern void strbuf_ltrim(struct strbuf *);
83extern int strbuf_cmp(const struct strbuf *, const struct strbuf *);
84extern void strbuf_tolower(struct strbuf *);
85
86extern struct strbuf **strbuf_split(const struct strbuf *, int delim);
87extern void strbuf_list_free(struct strbuf **);
88
89/*----- add data in your buffer -----*/
90static inline void strbuf_addch(struct strbuf *sb, int c) {
91 strbuf_grow(sb, 1);
92 sb->buf[sb->len++] = c;
93 sb->buf[sb->len] = '\0';
94}
95
96extern void strbuf_insert(struct strbuf *, size_t pos, const void *, size_t);
97extern void strbuf_remove(struct strbuf *, size_t pos, size_t len);
98
99/* splice pos..pos+len with given data */
100extern void strbuf_splice(struct strbuf *, size_t pos, size_t len,
101 const void *, size_t);
102
103extern void strbuf_add(struct strbuf *, const void *, size_t);
104static inline void strbuf_addstr(struct strbuf *sb, const char *s) {
105 strbuf_add(sb, s, strlen(s));
106}
107static inline void strbuf_addbuf(struct strbuf *sb, const struct strbuf *sb2) {
108 strbuf_add(sb, sb2->buf, sb2->len);
109}
110extern void strbuf_adddup(struct strbuf *sb, size_t pos, size_t len);
111
112typedef size_t (*expand_fn_t) (struct strbuf *sb, const char *placeholder, void *context);
113extern void strbuf_expand(struct strbuf *sb, const char *format, expand_fn_t fn, void *context);
114struct strbuf_expand_dict_entry {
115 const char *placeholder;
116 const char *value;
117};
118extern size_t strbuf_expand_dict_cb(struct strbuf *sb, const char *placeholder, void *context);
119
120__attribute__((format(printf,2,3)))
121extern void strbuf_addf(struct strbuf *sb, const char *fmt, ...);
122
123extern size_t strbuf_fread(struct strbuf *, size_t, FILE *);
124/* XXX: if read fails, any partial read is undone */
125extern ssize_t strbuf_read(struct strbuf *, int fd, size_t hint);
126extern int strbuf_read_file(struct strbuf *sb, const char *path, size_t hint);
127extern int strbuf_readlink(struct strbuf *sb, const char *path, size_t hint);
128
129extern int strbuf_getline(struct strbuf *, FILE *, int);
130
131extern void stripspace(struct strbuf *buf, int skip_comments);
132extern int launch_editor(const char *path, struct strbuf *buffer, const char *const *env);
133
134extern int strbuf_branchname(struct strbuf *sb, const char *name);
135extern int strbuf_check_branch_ref(struct strbuf *sb, const char *name);
136
137#endif /* STRBUF_H */
diff --git a/Documentation/perf_counter/util/usage.c b/Documentation/perf_counter/util/usage.c
new file mode 100644
index 000000000000..7a10421fe6b4
--- /dev/null
+++ b/Documentation/perf_counter/util/usage.c
@@ -0,0 +1,80 @@
1/*
2 * GIT - The information manager from hell
3 *
4 * Copyright (C) Linus Torvalds, 2005
5 */
6#include "util.h"
7
8static void report(const char *prefix, const char *err, va_list params)
9{
10 char msg[1024];
11 vsnprintf(msg, sizeof(msg), err, params);
12 fprintf(stderr, "%s%s\n", prefix, msg);
13}
14
15static NORETURN void usage_builtin(const char *err)
16{
17 fprintf(stderr, "usage: %s\n", err);
18 exit(129);
19}
20
21static NORETURN void die_builtin(const char *err, va_list params)
22{
23 report("fatal: ", err, params);
24 exit(128);
25}
26
27static void error_builtin(const char *err, va_list params)
28{
29 report("error: ", err, params);
30}
31
32static void warn_builtin(const char *warn, va_list params)
33{
34 report("warning: ", warn, params);
35}
36
37/* If we are in a dlopen()ed .so, a write to a global variable would segfault
38 * (ugh), so keep things static. */
39static void (*usage_routine)(const char *err) NORETURN = usage_builtin;
40static void (*die_routine)(const char *err, va_list params) NORETURN = die_builtin;
41static void (*error_routine)(const char *err, va_list params) = error_builtin;
42static void (*warn_routine)(const char *err, va_list params) = warn_builtin;
43
44void set_die_routine(void (*routine)(const char *err, va_list params) NORETURN)
45{
46 die_routine = routine;
47}
48
49void usage(const char *err)
50{
51 usage_routine(err);
52}
53
54void die(const char *err, ...)
55{
56 va_list params;
57
58 va_start(params, err);
59 die_routine(err, params);
60 va_end(params);
61}
62
63int error(const char *err, ...)
64{
65 va_list params;
66
67 va_start(params, err);
68 error_routine(err, params);
69 va_end(params);
70 return -1;
71}
72
73void warning(const char *warn, ...)
74{
75 va_list params;
76
77 va_start(params, warn);
78 warn_routine(warn, params);
79 va_end(params);
80}
diff --git a/Documentation/perf_counter/util/util.h b/Documentation/perf_counter/util/util.h
new file mode 100644
index 000000000000..36e40c38e093
--- /dev/null
+++ b/Documentation/perf_counter/util/util.h
@@ -0,0 +1,408 @@
1#ifndef GIT_COMPAT_UTIL_H
2#define GIT_COMPAT_UTIL_H
3
4#define _FILE_OFFSET_BITS 64
5
6#ifndef FLEX_ARRAY
7/*
8 * See if our compiler is known to support flexible array members.
9 */
10#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
11# define FLEX_ARRAY /* empty */
12#elif defined(__GNUC__)
13# if (__GNUC__ >= 3)
14# define FLEX_ARRAY /* empty */
15# else
16# define FLEX_ARRAY 0 /* older GNU extension */
17# endif
18#endif
19
20/*
21 * Otherwise, default to safer but a bit wasteful traditional style
22 */
23#ifndef FLEX_ARRAY
24# define FLEX_ARRAY 1
25#endif
26#endif
27
28#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0]))
29
30#ifdef __GNUC__
31#define TYPEOF(x) (__typeof__(x))
32#else
33#define TYPEOF(x)
34#endif
35
36#define MSB(x, bits) ((x) & TYPEOF(x)(~0ULL << (sizeof(x) * 8 - (bits))))
37#define HAS_MULTI_BITS(i) ((i) & ((i) - 1)) /* checks if an integer has more than 1 bit set */
38
39/* Approximation of the length of the decimal representation of this type. */
40#define decimal_length(x) ((int)(sizeof(x) * 2.56 + 0.5) + 1)
41
42#if !defined(__APPLE__) && !defined(__FreeBSD__) && !defined(__USLC__) && !defined(_M_UNIX)
43#define _XOPEN_SOURCE 600 /* glibc2 and AIX 5.3L need 500, OpenBSD needs 600 for S_ISLNK() */
44#define _XOPEN_SOURCE_EXTENDED 1 /* AIX 5.3L needs this */
45#endif
46#define _ALL_SOURCE 1
47#define _GNU_SOURCE 1
48#define _BSD_SOURCE 1
49
50#include <unistd.h>
51#include <stdio.h>
52#include <sys/stat.h>
53#include <fcntl.h>
54#include <stddef.h>
55#include <stdlib.h>
56#include <stdarg.h>
57#include <string.h>
58#include <errno.h>
59#include <limits.h>
60#include <sys/param.h>
61#include <sys/types.h>
62#include <dirent.h>
63#include <sys/time.h>
64#include <time.h>
65#include <signal.h>
66#include <fnmatch.h>
67#include <assert.h>
68#include <regex.h>
69#include <utime.h>
70#ifndef __MINGW32__
71#include <sys/wait.h>
72#include <sys/poll.h>
73#include <sys/socket.h>
74#include <sys/ioctl.h>
75#ifndef NO_SYS_SELECT_H
76#include <sys/select.h>
77#endif
78#include <netinet/in.h>
79#include <netinet/tcp.h>
80#include <arpa/inet.h>
81#include <netdb.h>
82#include <pwd.h>
83#include <inttypes.h>
84#if defined(__CYGWIN__)
85#undef _XOPEN_SOURCE
86#include <grp.h>
87#define _XOPEN_SOURCE 600
88#include "compat/cygwin.h"
89#else
90#undef _ALL_SOURCE /* AIX 5.3L defines a struct list with _ALL_SOURCE. */
91#include <grp.h>
92#define _ALL_SOURCE 1
93#endif
94#else /* __MINGW32__ */
95/* pull in Windows compatibility stuff */
96#include "compat/mingw.h"
97#endif /* __MINGW32__ */
98
99#ifndef NO_ICONV
100#include <iconv.h>
101#endif
102
103#ifndef NO_OPENSSL
104#include <openssl/ssl.h>
105#include <openssl/err.h>
106#endif
107
108/* On most systems <limits.h> would have given us this, but
109 * not on some systems (e.g. GNU/Hurd).
110 */
111#ifndef PATH_MAX
112#define PATH_MAX 4096
113#endif
114
115#ifndef PRIuMAX
116#define PRIuMAX "llu"
117#endif
118
119#ifndef PRIu32
120#define PRIu32 "u"
121#endif
122
123#ifndef PRIx32
124#define PRIx32 "x"
125#endif
126
127#ifndef PATH_SEP
128#define PATH_SEP ':'
129#endif
130
131#ifndef STRIP_EXTENSION
132#define STRIP_EXTENSION ""
133#endif
134
135#ifndef has_dos_drive_prefix
136#define has_dos_drive_prefix(path) 0
137#endif
138
139#ifndef is_dir_sep
140#define is_dir_sep(c) ((c) == '/')
141#endif
142
143#ifdef __GNUC__
144#define NORETURN __attribute__((__noreturn__))
145#else
146#define NORETURN
147#ifndef __attribute__
148#define __attribute__(x)
149#endif
150#endif
151
152/* General helper functions */
153extern void usage(const char *err) NORETURN;
154extern void die(const char *err, ...) NORETURN __attribute__((format (printf, 1, 2)));
155extern int error(const char *err, ...) __attribute__((format (printf, 1, 2)));
156extern void warning(const char *err, ...) __attribute__((format (printf, 1, 2)));
157
158extern void set_die_routine(void (*routine)(const char *err, va_list params) NORETURN);
159
160extern int prefixcmp(const char *str, const char *prefix);
161extern time_t tm_to_time_t(const struct tm *tm);
162
163static inline const char *skip_prefix(const char *str, const char *prefix)
164{
165 size_t len = strlen(prefix);
166 return strncmp(str, prefix, len) ? NULL : str + len;
167}
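For example, a hypothetical command dispatcher could peel off a prefix like this:

const char *rest = skip_prefix("perf-record", "perf-");	/* "record" */
const char *none = skip_prefix("ls", "perf-");		/* NULL */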
168
169#if defined(NO_MMAP) || defined(USE_WIN32_MMAP)
170
171#ifndef PROT_READ
172#define PROT_READ 1
173#define PROT_WRITE 2
174#define MAP_PRIVATE 1
175#define MAP_FAILED ((void*)-1)
176#endif
177
178#define mmap git_mmap
179#define munmap git_munmap
180extern void *git_mmap(void *start, size_t length, int prot, int flags, int fd, off_t offset);
181extern int git_munmap(void *start, size_t length);
182
183#else /* NO_MMAP || USE_WIN32_MMAP */
184
185#include <sys/mman.h>
186
187#endif /* NO_MMAP || USE_WIN32_MMAP */
188
189#ifdef NO_MMAP
190
191/* This value must be multiple of (pagesize * 2) */
192#define DEFAULT_PACKED_GIT_WINDOW_SIZE (1 * 1024 * 1024)
193
194#else /* NO_MMAP */
195
196/* This value must be multiple of (pagesize * 2) */
197#define DEFAULT_PACKED_GIT_WINDOW_SIZE \
198 (sizeof(void*) >= 8 \
199 ? 1 * 1024 * 1024 * 1024 \
200 : 32 * 1024 * 1024)
201
202#endif /* NO_MMAP */
203
204#ifdef NO_ST_BLOCKS_IN_STRUCT_STAT
205#define on_disk_bytes(st) ((st).st_size)
206#else
207#define on_disk_bytes(st) ((st).st_blocks * 512)
208#endif
209
210#define DEFAULT_PACKED_GIT_LIMIT \
211 ((1024L * 1024L) * (sizeof(void*) >= 8 ? 8192 : 256))
212
213#ifdef NO_PREAD
214#define pread git_pread
215extern ssize_t git_pread(int fd, void *buf, size_t count, off_t offset);
216#endif
217/*
218 * Forward decl that will remind us if its twin in cache.h changes.
219 * This function is used in compat/pread.c. But we can't include
220 * cache.h there.
221 */
222extern ssize_t read_in_full(int fd, void *buf, size_t count);
223
224#ifdef NO_SETENV
225#define setenv gitsetenv
226extern int gitsetenv(const char *, const char *, int);
227#endif
228
229#ifdef NO_MKDTEMP
230#define mkdtemp gitmkdtemp
231extern char *gitmkdtemp(char *);
232#endif
233
234#ifdef NO_UNSETENV
235#define unsetenv gitunsetenv
236extern void gitunsetenv(const char *);
237#endif
238
239#ifdef NO_STRCASESTR
240#define strcasestr gitstrcasestr
241extern char *gitstrcasestr(const char *haystack, const char *needle);
242#endif
243
244#ifdef NO_STRLCPY
245#define strlcpy gitstrlcpy
246extern size_t gitstrlcpy(char *, const char *, size_t);
247#endif
248
249#ifdef NO_STRTOUMAX
250#define strtoumax gitstrtoumax
251extern uintmax_t gitstrtoumax(const char *, char **, int);
252#endif
253
254#ifdef NO_HSTRERROR
255#define hstrerror githstrerror
256extern const char *githstrerror(int herror);
257#endif
258
259#ifdef NO_MEMMEM
260#define memmem gitmemmem
261void *gitmemmem(const void *haystack, size_t haystacklen,
262 const void *needle, size_t needlelen);
263#endif
264
265#ifdef FREAD_READS_DIRECTORIES
266#ifdef fopen
267#undef fopen
268#endif
269#define fopen(a,b) git_fopen(a,b)
270extern FILE *git_fopen(const char*, const char*);
271#endif
272
273#ifdef SNPRINTF_RETURNS_BOGUS
274#define snprintf git_snprintf
275extern int git_snprintf(char *str, size_t maxsize,
276 const char *format, ...);
277#define vsnprintf git_vsnprintf
278extern int git_vsnprintf(char *str, size_t maxsize,
279 const char *format, va_list ap);
280#endif
281
282#ifdef __GLIBC_PREREQ
283#if __GLIBC_PREREQ(2, 1)
284#define HAVE_STRCHRNUL
285#endif
286#endif
287
288#ifndef HAVE_STRCHRNUL
289#define strchrnul gitstrchrnul
290static inline char *gitstrchrnul(const char *s, int c)
291{
292 while (*s && *s != c)
293 s++;
294 return (char *)s;
295}
296#endif
297
298/*
299 * Wrappers:
300 */
301extern char *xstrdup(const char *str);
302extern void *xmalloc(size_t size);
303extern void *xmemdupz(const void *data, size_t len);
304extern char *xstrndup(const char *str, size_t len);
305extern void *xrealloc(void *ptr, size_t size);
306extern void *xcalloc(size_t nmemb, size_t size);
307extern void *xmmap(void *start, size_t length, int prot, int flags, int fd, off_t offset);
308extern ssize_t xread(int fd, void *buf, size_t len);
309extern ssize_t xwrite(int fd, const void *buf, size_t len);
310extern int xdup(int fd);
311extern FILE *xfdopen(int fd, const char *mode);
312static inline size_t xsize_t(off_t len)
313{
314 return (size_t)len;
315}
316
317static inline int has_extension(const char *filename, const char *ext)
318{
319 size_t len = strlen(filename);
320 size_t extlen = strlen(ext);
321 return len > extlen && !memcmp(filename + len - extlen, ext, extlen);
322}
323
324/* Sane ctype - no locale, and works with signed chars */
325#undef isascii
326#undef isspace
327#undef isdigit
328#undef isalpha
329#undef isalnum
330#undef tolower
331#undef toupper
332extern unsigned char sane_ctype[256];
333#define GIT_SPACE 0x01
334#define GIT_DIGIT 0x02
335#define GIT_ALPHA 0x04
336#define GIT_GLOB_SPECIAL 0x08
337#define GIT_REGEX_SPECIAL 0x10
338#define sane_istest(x,mask) ((sane_ctype[(unsigned char)(x)] & (mask)) != 0)
339#define isascii(x) (((x) & ~0x7f) == 0)
340#define isspace(x) sane_istest(x,GIT_SPACE)
341#define isdigit(x) sane_istest(x,GIT_DIGIT)
342#define isalpha(x) sane_istest(x,GIT_ALPHA)
343#define isalnum(x) sane_istest(x,GIT_ALPHA | GIT_DIGIT)
344#define is_glob_special(x) sane_istest(x,GIT_GLOB_SPECIAL)
345#define is_regex_special(x) sane_istest(x,GIT_GLOB_SPECIAL | GIT_REGEX_SPECIAL)
346#define tolower(x) sane_case((unsigned char)(x), 0x20)
347#define toupper(x) sane_case((unsigned char)(x), 0)
348
349static inline int sane_case(int x, int high)
350{
351 if (sane_istest(x, GIT_ALPHA))
352 x = (x & ~0x20) | high;
353 return x;
354}
355
356static inline int strtoul_ui(char const *s, int base, unsigned int *result)
357{
358 unsigned long ul;
359 char *p;
360
361 errno = 0;
362 ul = strtoul(s, &p, base);
363 if (errno || *p || p == s || (unsigned int) ul != ul)
364 return -1;
365 *result = ul;
366 return 0;
367}
368
369static inline int strtol_i(char const *s, int base, int *result)
370{
371 long ul;
372 char *p;
373
374 errno = 0;
375 ul = strtol(s, &p, base);
376 if (errno || *p || p == s || (int) ul != ul)
377 return -1;
378 *result = ul;
379 return 0;
380}
381
382#ifdef INTERNAL_QSORT
383void git_qsort(void *base, size_t nmemb, size_t size,
384 int(*compar)(const void *, const void *));
385#define qsort git_qsort
386#endif
387
388#ifndef DIR_HAS_BSD_GROUP_SEMANTICS
389# define FORCE_DIR_SET_GID S_ISGID
390#else
391# define FORCE_DIR_SET_GID 0
392#endif
393
394#ifdef NO_NSEC
395#undef USE_NSEC
396#define ST_CTIME_NSEC(st) 0
397#define ST_MTIME_NSEC(st) 0
398#else
399#ifdef USE_ST_TIMESPEC
400#define ST_CTIME_NSEC(st) ((unsigned int)((st).st_ctimespec.tv_nsec))
401#define ST_MTIME_NSEC(st) ((unsigned int)((st).st_mtimespec.tv_nsec))
402#else
403#define ST_CTIME_NSEC(st) ((unsigned int)((st).st_ctim.tv_nsec))
404#define ST_MTIME_NSEC(st) ((unsigned int)((st).st_mtim.tv_nsec))
405#endif
406#endif
407
408#endif
diff --git a/Documentation/perf_counter/util/wrapper.c b/Documentation/perf_counter/util/wrapper.c
new file mode 100644
index 000000000000..6350d65f6d9e
--- /dev/null
+++ b/Documentation/perf_counter/util/wrapper.c
@@ -0,0 +1,206 @@
1/*
2 * Various trivial helper wrappers around standard functions
3 */
4#include "cache.h"
5
6/*
7 * There's no pack memory to release - but stay close to the Git
8 * version so wrap this away:
9 */
10static inline void release_pack_memory(size_t size, int flag)
11{
12}
13
14char *xstrdup(const char *str)
15{
16 char *ret = strdup(str);
17 if (!ret) {
18 release_pack_memory(strlen(str) + 1, -1);
19 ret = strdup(str);
20 if (!ret)
21 die("Out of memory, strdup failed");
22 }
23 return ret;
24}
25
26void *xmalloc(size_t size)
27{
28 void *ret = malloc(size);
29 if (!ret && !size)
30 ret = malloc(1);
31 if (!ret) {
32 release_pack_memory(size, -1);
33 ret = malloc(size);
34 if (!ret && !size)
35 ret = malloc(1);
36 if (!ret)
37 die("Out of memory, malloc failed");
38 }
39#ifdef XMALLOC_POISON
40 memset(ret, 0xA5, size);
41#endif
42 return ret;
43}
44
45/*
46 * xmemdupz() allocates (len + 1) bytes of memory, duplicates "len" bytes of
47 * "data" to the allocated memory, zero terminates the allocated memory,
48 * and returns a pointer to the allocated memory. If the allocation fails,
49 * the program dies.
50 */
51void *xmemdupz(const void *data, size_t len)
52{
53 char *p = xmalloc(len + 1);
54 memcpy(p, data, len);
55 p[len] = '\0';
56 return p;
57}
58
59char *xstrndup(const char *str, size_t len)
60{
61 char *p = memchr(str, '\0', len);
62 return xmemdupz(str, p ? p - str : len);
63}
64
65void *xrealloc(void *ptr, size_t size)
66{
67 void *ret = realloc(ptr, size);
68 if (!ret && !size)
69 ret = realloc(ptr, 1);
70 if (!ret) {
71 release_pack_memory(size, -1);
72 ret = realloc(ptr, size);
73 if (!ret && !size)
74 ret = realloc(ptr, 1);
75 if (!ret)
76 die("Out of memory, realloc failed");
77 }
78 return ret;
79}
80
81void *xcalloc(size_t nmemb, size_t size)
82{
83 void *ret = calloc(nmemb, size);
84 if (!ret && (!nmemb || !size))
85 ret = calloc(1, 1);
86 if (!ret) {
87 release_pack_memory(nmemb * size, -1);
88 ret = calloc(nmemb, size);
89 if (!ret && (!nmemb || !size))
90 ret = calloc(1, 1);
91 if (!ret)
92 die("Out of memory, calloc failed");
93 }
94 return ret;
95}
96
97void *xmmap(void *start, size_t length,
98 int prot, int flags, int fd, off_t offset)
99{
100 void *ret = mmap(start, length, prot, flags, fd, offset);
101 if (ret == MAP_FAILED) {
102 if (!length)
103 return NULL;
104 release_pack_memory(length, fd);
105 ret = mmap(start, length, prot, flags, fd, offset);
106 if (ret == MAP_FAILED)
107 die("Out of memory? mmap failed: %s", strerror(errno));
108 }
109 return ret;
110}
111
112/*
113 * xread() is the same as read(), but it automatically restarts read()
114 * operations with a recoverable error (EAGAIN and EINTR). xread()
115 * DOES NOT GUARANTEE that "len" bytes are read even if the data is available.
116 */
117ssize_t xread(int fd, void *buf, size_t len)
118{
119 ssize_t nr;
120 while (1) {
121 nr = read(fd, buf, len);
122 if ((nr < 0) && (errno == EAGAIN || errno == EINTR))
123 continue;
124 return nr;
125 }
126}
127
128/*
129 * xwrite() is the same as write(), but it automatically restarts write()
130 * operations with a recoverable error (EAGAIN and EINTR). xwrite() DOES NOT
131 * GUARANTEE that "len" bytes are written even if the operation is successful.
132 */
133ssize_t xwrite(int fd, const void *buf, size_t len)
134{
135 ssize_t nr;
136 while (1) {
137 nr = write(fd, buf, len);
138 if ((nr < 0) && (errno == EAGAIN || errno == EINTR))
139 continue;
140 return nr;
141 }
142}
143
144ssize_t read_in_full(int fd, void *buf, size_t count)
145{
146 char *p = buf;
147 ssize_t total = 0;
148
149 while (count > 0) {
150 ssize_t loaded = xread(fd, p, count);
151 if (loaded <= 0)
152 return total ? total : loaded;
153 count -= loaded;
154 p += loaded;
155 total += loaded;
156 }
157
158 return total;
159}
160
161ssize_t write_in_full(int fd, const void *buf, size_t count)
162{
163 const char *p = buf;
164 ssize_t total = 0;
165
166 while (count > 0) {
167 ssize_t written = xwrite(fd, p, count);
168 if (written < 0)
169 return -1;
170 if (!written) {
171 errno = ENOSPC;
172 return -1;
173 }
174 count -= written;
175 p += written;
176 total += written;
177 }
178
179 return total;
180}
181
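(Aside, not part of the patch: because xread() and xwrite() may legitimately return short counts, a caller that needs the whole buffer transferred should go through read_in_full()/write_in_full(). A hypothetical caller sketch, assuming only declarations already available in this file's headers -- write_in_full() above and die() from util/usage.c:)

static void save_blob(int fd, const void *buf, size_t len)
{
	/* write_in_full() loops over xwrite() until every byte is written */
	if (write_in_full(fd, buf, len) < 0)
		die("write failed: %s", strerror(errno));
}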
182int xdup(int fd)
183{
184 int ret = dup(fd);
185 if (ret < 0)
186 die("dup failed: %s", strerror(errno));
187 return ret;
188}
189
190FILE *xfdopen(int fd, const char *mode)
191{
192 FILE *stream = fdopen(fd, mode);
193 if (stream == NULL)
194 die("Out of memory? fdopen failed: %s", strerror(errno));
195 return stream;
196}
197
198int xmkstemp(char *template)
199{
200 int fd;
201
202 fd = mkstemp(template);
203 if (fd < 0)
204 die("Unable to create temporary file: %s", strerror(errno));
205 return fd;
206}
diff --git a/MAINTAINERS b/MAINTAINERS
index c547f4a2bb62..5114b5341df4 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4375,6 +4375,16 @@ S: Maintained
4375F: include/linux/delayacct.h 4375F: include/linux/delayacct.h
4376F: kernel/delayacct.c 4376F: kernel/delayacct.c
4377 4377
4378PERFORMANCE COUNTER SUBSYSTEM
4379P: Peter Zijlstra
4380M: a.p.zijlstra@chello.nl
4381P: Paul Mackerras
4382M: paulus@samba.org
4383P: Ingo Molnar
4384M: mingo@elte.hu
4385L: linux-kernel@vger.kernel.org
4386S: Supported
4387
4378PERSONALITY HANDLING 4388PERSONALITY HANDLING
4379P: Christoph Hellwig 4389P: Christoph Hellwig
4380M: hch@infradead.org 4390M: hch@infradead.org
diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
index b7e034b0a6dd..20a44d0c9fdd 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -131,5 +131,44 @@ static inline int irqs_disabled_flags(unsigned long flags)
131 */ 131 */
132struct irq_chip; 132struct irq_chip;
133 133
134#ifdef CONFIG_PERF_COUNTERS
135static inline unsigned long test_perf_counter_pending(void)
136{
137 unsigned long x;
138
139 asm volatile("lbz %0,%1(13)"
140 : "=r" (x)
141 : "i" (offsetof(struct paca_struct, perf_counter_pending)));
142 return x;
143}
144
145static inline void set_perf_counter_pending(void)
146{
147 asm volatile("stb %0,%1(13)" : :
148 "r" (1),
149 "i" (offsetof(struct paca_struct, perf_counter_pending)));
150}
151
152static inline void clear_perf_counter_pending(void)
153{
154 asm volatile("stb %0,%1(13)" : :
155 "r" (0),
156 "i" (offsetof(struct paca_struct, perf_counter_pending)));
157}
158
159extern void perf_counter_do_pending(void);
160
161#else
162
163static inline unsigned long test_perf_counter_pending(void)
164{
165 return 0;
166}
167
168static inline void set_perf_counter_pending(void) {}
169static inline void clear_perf_counter_pending(void) {}
170static inline void perf_counter_do_pending(void) {}
171#endif /* CONFIG_PERF_COUNTERS */
172
134#endif /* __KERNEL__ */ 173#endif /* __KERNEL__ */
135#endif /* _ASM_POWERPC_HW_IRQ_H */ 174#endif /* _ASM_POWERPC_HW_IRQ_H */
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 082b3aedf145..6ef055723019 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -99,6 +99,7 @@ struct paca_struct {
99 u8 soft_enabled; /* irq soft-enable flag */ 99 u8 soft_enabled; /* irq soft-enable flag */
100 u8 hard_enabled; /* set if irqs are enabled in MSR */ 100 u8 hard_enabled; /* set if irqs are enabled in MSR */
101 u8 io_sync; /* writel() needs spin_unlock sync */ 101 u8 io_sync; /* writel() needs spin_unlock sync */
102 u8 perf_counter_pending; /* PM interrupt while soft-disabled */
102 103
103 /* Stuff for accurate time accounting */ 104 /* Stuff for accurate time accounting */
104 u64 user_time; /* accumulated usermode TB ticks */ 105 u64 user_time; /* accumulated usermode TB ticks */
diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h
new file mode 100644
index 000000000000..9d7ff6d7fb56
--- /dev/null
+++ b/arch/powerpc/include/asm/perf_counter.h
@@ -0,0 +1,72 @@
1/*
2 * Performance counter support - PowerPC-specific definitions.
3 *
4 * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/types.h>
12
13#define MAX_HWCOUNTERS 8
14#define MAX_EVENT_ALTERNATIVES 8
15
16/*
17 * This struct provides the constants and functions needed to
18 * describe the PMU on a particular POWER-family CPU.
19 */
20struct power_pmu {
21 int n_counter;
22 int max_alternatives;
23 u64 add_fields;
24 u64 test_adder;
25 int (*compute_mmcr)(unsigned int events[], int n_ev,
26 unsigned int hwc[], u64 mmcr[]);
27 int (*get_constraint)(unsigned int event, u64 *mskp, u64 *valp);
28 int (*get_alternatives)(unsigned int event, unsigned int alt[]);
29 void (*disable_pmc)(unsigned int pmc, u64 mmcr[]);
30 int n_generic;
31 int *generic_events;
32};
33
34extern struct power_pmu *ppmu;
35
36/*
37 * The power_pmu.get_constraint function returns a 64-bit value and
38 * a 64-bit mask that express the constraints between this event and
39 * other events.
40 *
41 * The value and mask are divided up into (non-overlapping) bitfields
42 * of three different types:
43 *
44 * Select field: this expresses the constraint that some set of bits
45 * in MMCR* needs to be set to a specific value for this event. For a
46 * select field, the mask contains 1s in every bit of the field, and
47 * the value contains a unique value for each possible setting of the
48 * MMCR* bits. The constraint checking code will ensure that two events
49 * that set the same field in their masks have the same value in their
50 * value dwords.
51 *
52 * Add field: this expresses the constraint that there can be at most
53 * N events in a particular class. A field of k bits can be used for
54 * N <= 2^(k-1) - 1. The mask has the most significant bit of the field
55 * set (and the other bits 0), and the value has only the least significant
56 * bit of the field set. In addition, the 'add_fields' and 'test_adder'
57 * in the struct power_pmu for this processor come into play. The
58 * add_fields value contains 1 in the LSB of the field, and the
59 * test_adder contains 2^(k-1) - 1 - N in the field.
60 *
61 * NAND field: this expresses the constraint that you may not have events
62 * in all of a set of classes. (For example, on PPC970, you can't select
63 * events from the FPU, ISU and IDU simultaneously, although any two are
64 * possible.) For N classes, the field is N+1 bits wide, and each class
65 * is assigned one bit from the least-significant N bits. The mask has
66 * only the most-significant bit set, and the value has only the bit
67 * for the event's class set. The test_adder has the least significant
68 * bit set in the field.
69 *
70 * If an event is not subject to the constraint expressed by a particular
71 * field, then it will have 0 in both the mask and value for that field.
72 */
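(Aside, not part of the patch: a minimal sketch of how a get_constraint()-style routine encodes the select and add fields described above, for a made-up PMU. The bit positions, the 0x100 "class" flag in the event code, and the class itself are invented for illustration only.)

#include <stdint.h>

/*
 * Toy layout: constraint bits 0-1 form a select field (all field bits in
 * the mask, the required MMCR setting in the value); constraint bits 4-6
 * form a 3-bit add field (MSB of the field in the mask, LSB in the value).
 */
static int toy_get_constraint(unsigned int event, uint64_t *maskp, uint64_t *valp)
{
	uint64_t mask = 0, value = 0;

	/* select field: this event needs the MMCR bits set to (event & 3) */
	mask  |= 0x3;
	value |= event & 3;

	/* add field: the event belongs to the size-limited class */
	if (event & 0x100) {
		mask  |= 0x40;
		value |= 0x10;
	}

	*maskp = mask;
	*valp = value;
	return 0;
}

Two events with the same select value can coexist, two with different values collide on the select field, and the add field counts class members exactly as the constraint-checking code (power_check_constraints() in perf_counter.c below) expects.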
diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h
index d98a30dfd41c..a0b92de51c7e 100644
--- a/arch/powerpc/include/asm/systbl.h
+++ b/arch/powerpc/include/asm/systbl.h
@@ -322,6 +322,6 @@ SYSCALL_SPU(epoll_create1)
322SYSCALL_SPU(dup3) 322SYSCALL_SPU(dup3)
323SYSCALL_SPU(pipe2) 323SYSCALL_SPU(pipe2)
324SYSCALL(inotify_init1) 324SYSCALL(inotify_init1)
325SYSCALL(ni_syscall) 325SYSCALL_SPU(perf_counter_open)
326COMPAT_SYS_SPU(preadv) 326COMPAT_SYS_SPU(preadv)
327COMPAT_SYS_SPU(pwritev) 327COMPAT_SYS_SPU(pwritev)
diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h
index 3f06f8ec81c5..4badac2d11d1 100644
--- a/arch/powerpc/include/asm/unistd.h
+++ b/arch/powerpc/include/asm/unistd.h
@@ -341,6 +341,7 @@
341#define __NR_dup3 316 341#define __NR_dup3 316
342#define __NR_pipe2 317 342#define __NR_pipe2 317
343#define __NR_inotify_init1 318 343#define __NR_inotify_init1 318
344#define __NR_perf_counter_open 319
344#define __NR_preadv 320 345#define __NR_preadv 320
345#define __NR_pwritev 321 346#define __NR_pwritev 321
346 347
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 71901fbda4a5..9ba1bb731fcc 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -94,6 +94,8 @@ obj64-$(CONFIG_AUDIT) += compat_audit.o
94 94
95obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o 95obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
96obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o 96obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
97obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o power4-pmu.o ppc970-pmu.o \
98 power5-pmu.o power5+-pmu.o power6-pmu.o
97 99
98obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o 100obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o
99 101
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 1e40bc053946..e981d1ce1914 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -131,6 +131,7 @@ int main(void)
131 DEFINE(PACAKMSR, offsetof(struct paca_struct, kernel_msr)); 131 DEFINE(PACAKMSR, offsetof(struct paca_struct, kernel_msr));
132 DEFINE(PACASOFTIRQEN, offsetof(struct paca_struct, soft_enabled)); 132 DEFINE(PACASOFTIRQEN, offsetof(struct paca_struct, soft_enabled));
133 DEFINE(PACAHARDIRQEN, offsetof(struct paca_struct, hard_enabled)); 133 DEFINE(PACAHARDIRQEN, offsetof(struct paca_struct, hard_enabled));
134 DEFINE(PACAPERFPEND, offsetof(struct paca_struct, perf_counter_pending));
134 DEFINE(PACASLBCACHE, offsetof(struct paca_struct, slb_cache)); 135 DEFINE(PACASLBCACHE, offsetof(struct paca_struct, slb_cache));
135 DEFINE(PACASLBCACHEPTR, offsetof(struct paca_struct, slb_cache_ptr)); 136 DEFINE(PACASLBCACHEPTR, offsetof(struct paca_struct, slb_cache_ptr));
136 DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id)); 137 DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id));
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index abfc32330479..43e073477c34 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -526,6 +526,15 @@ ALT_FW_FTR_SECTION_END_IFCLR(FW_FEATURE_ISERIES)
5262: 5262:
527 TRACE_AND_RESTORE_IRQ(r5); 527 TRACE_AND_RESTORE_IRQ(r5);
528 528
529#ifdef CONFIG_PERF_COUNTERS
530 /* check paca->perf_counter_pending if we're enabling ints */
531 lbz r3,PACAPERFPEND(r13)
532 and. r3,r3,r5
533 beq 27f
534 bl .perf_counter_do_pending
53527:
536#endif /* CONFIG_PERF_COUNTERS */
537
529 /* extract EE bit and use it to restore paca->hard_enabled */ 538 /* extract EE bit and use it to restore paca->hard_enabled */
530 ld r3,_MSR(r1) 539 ld r3,_MSR(r1)
531 rldicl r4,r3,49,63 /* r0 = (r3 >> 15) & 1 */ 540 rldicl r4,r3,49,63 /* r0 = (r3 >> 15) & 1 */
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 8c1a4966867e..feff792ed0f9 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -135,6 +135,11 @@ notrace void raw_local_irq_restore(unsigned long en)
135 iseries_handle_interrupts(); 135 iseries_handle_interrupts();
136 } 136 }
137 137
138 if (test_perf_counter_pending()) {
139 clear_perf_counter_pending();
140 perf_counter_do_pending();
141 }
142
138 /* 143 /*
139 * if (get_paca()->hard_enabled) return; 144 * if (get_paca()->hard_enabled) return;
140 * But again we need to take care that gcc gets hard_enabled directly 145 * But again we need to take care that gcc gets hard_enabled directly
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
new file mode 100644
index 000000000000..bd76d0fa2c35
--- /dev/null
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -0,0 +1,866 @@
1/*
2 * Performance counter support - powerpc architecture code
3 *
4 * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/kernel.h>
12#include <linux/sched.h>
13#include <linux/perf_counter.h>
14#include <linux/percpu.h>
15#include <linux/hardirq.h>
16#include <asm/reg.h>
17#include <asm/pmc.h>
18#include <asm/machdep.h>
19#include <asm/firmware.h>
20
21struct cpu_hw_counters {
22 int n_counters;
23 int n_percpu;
24 int disabled;
25 int n_added;
26 struct perf_counter *counter[MAX_HWCOUNTERS];
27 unsigned int events[MAX_HWCOUNTERS];
28 u64 mmcr[3];
29 u8 pmcs_enabled;
30};
31DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
32
33struct power_pmu *ppmu;
34
35/*
36 * Normally, to ignore kernel events we set the FCS (freeze counters
37 * in supervisor mode) bit in MMCR0, but if the kernel runs with the
38 * hypervisor bit set in the MSR, or if we are running on a processor
39 * where the hypervisor bit is forced to 1 (as on Apple G5 processors),
40 * then we need to use the FCHV bit to ignore kernel events.
41 */
42static unsigned int freeze_counters_kernel = MMCR0_FCS;
43
44static void perf_counter_interrupt(struct pt_regs *regs);
45
46void perf_counter_print_debug(void)
47{
48}
49
50/*
51 * Read one performance monitor counter (PMC).
52 */
53static unsigned long read_pmc(int idx)
54{
55 unsigned long val;
56
57 switch (idx) {
58 case 1:
59 val = mfspr(SPRN_PMC1);
60 break;
61 case 2:
62 val = mfspr(SPRN_PMC2);
63 break;
64 case 3:
65 val = mfspr(SPRN_PMC3);
66 break;
67 case 4:
68 val = mfspr(SPRN_PMC4);
69 break;
70 case 5:
71 val = mfspr(SPRN_PMC5);
72 break;
73 case 6:
74 val = mfspr(SPRN_PMC6);
75 break;
76 case 7:
77 val = mfspr(SPRN_PMC7);
78 break;
79 case 8:
80 val = mfspr(SPRN_PMC8);
81 break;
82 default:
83 printk(KERN_ERR "oops trying to read PMC%d\n", idx);
84 val = 0;
85 }
86 return val;
87}
88
89/*
90 * Write one PMC.
91 */
92static void write_pmc(int idx, unsigned long val)
93{
94 switch (idx) {
95 case 1:
96 mtspr(SPRN_PMC1, val);
97 break;
98 case 2:
99 mtspr(SPRN_PMC2, val);
100 break;
101 case 3:
102 mtspr(SPRN_PMC3, val);
103 break;
104 case 4:
105 mtspr(SPRN_PMC4, val);
106 break;
107 case 5:
108 mtspr(SPRN_PMC5, val);
109 break;
110 case 6:
111 mtspr(SPRN_PMC6, val);
112 break;
113 case 7:
114 mtspr(SPRN_PMC7, val);
115 break;
116 case 8:
117 mtspr(SPRN_PMC8, val);
118 break;
119 default:
120 printk(KERN_ERR "oops trying to write PMC%d\n", idx);
121 }
122}
123
124/*
125 * Check if a set of events can all go on the PMU at once.
126 * If they can't, this will look at alternative codes for the events
127 * and see if any combination of alternative codes is feasible.
128 * The feasible set is returned in event[].
129 */
130static int power_check_constraints(unsigned int event[], int n_ev)
131{
132 u64 mask, value, nv;
133 unsigned int alternatives[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
134 u64 amasks[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
135 u64 avalues[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
136 u64 smasks[MAX_HWCOUNTERS], svalues[MAX_HWCOUNTERS];
137 int n_alt[MAX_HWCOUNTERS], choice[MAX_HWCOUNTERS];
138 int i, j;
139 u64 addf = ppmu->add_fields;
140 u64 tadd = ppmu->test_adder;
141
142 if (n_ev > ppmu->n_counter)
143 return -1;
144
145 /* First see if the events will go on as-is */
146 for (i = 0; i < n_ev; ++i) {
147 alternatives[i][0] = event[i];
148 if (ppmu->get_constraint(event[i], &amasks[i][0],
149 &avalues[i][0]))
150 return -1;
151 choice[i] = 0;
152 }
153 value = mask = 0;
154 for (i = 0; i < n_ev; ++i) {
155 nv = (value | avalues[i][0]) + (value & avalues[i][0] & addf);
156 if ((((nv + tadd) ^ value) & mask) != 0 ||
157 (((nv + tadd) ^ avalues[i][0]) & amasks[i][0]) != 0)
158 break;
159 value = nv;
160 mask |= amasks[i][0];
161 }
162 if (i == n_ev)
163 return 0; /* all OK */
164
165 /* doesn't work, gather alternatives... */
166 if (!ppmu->get_alternatives)
167 return -1;
168 for (i = 0; i < n_ev; ++i) {
169 n_alt[i] = ppmu->get_alternatives(event[i], alternatives[i]);
170 for (j = 1; j < n_alt[i]; ++j)
171 ppmu->get_constraint(alternatives[i][j],
172 &amasks[i][j], &avalues[i][j]);
173 }
174
175 /* enumerate all possibilities and see if any will work */
176 i = 0;
177 j = -1;
178 value = mask = nv = 0;
179 while (i < n_ev) {
180 if (j >= 0) {
181 /* we're backtracking, restore context */
182 value = svalues[i];
183 mask = smasks[i];
184 j = choice[i];
185 }
186 /*
187 * See if any alternative k for event i,
188 * where k > j, will satisfy the constraints.
189 */
190 while (++j < n_alt[i]) {
191 nv = (value | avalues[i][j]) +
192 (value & avalues[i][j] & addf);
193 if ((((nv + tadd) ^ value) & mask) == 0 &&
194 (((nv + tadd) ^ avalues[i][j])
195 & amasks[i][j]) == 0)
196 break;
197 }
198 if (j >= n_alt[i]) {
199 /*
200 * No feasible alternative, backtrack
201 * to event i-1 and continue enumerating its
202 * alternatives from where we got up to.
203 */
204 if (--i < 0)
205 return -1;
206 } else {
207 /*
208 * Found a feasible alternative for event i,
209 * remember where we got up to with this event,
210 * go on to the next event, and start with
211 * the first alternative for it.
212 */
213 choice[i] = j;
214 svalues[i] = value;
215 smasks[i] = mask;
216 value = nv;
217 mask |= amasks[i][j];
218 ++i;
219 j = -1;
220 }
221 }
222
223 /* OK, we have a feasible combination, tell the caller the solution */
224 for (i = 0; i < n_ev; ++i)
225 event[i] = alternatives[i][choice[i]];
226 return 0;
227}
228
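(Aside, not part of the patch: a worked example of the adder arithmetic above, with invented numbers. Take one 3-bit add field at constraint bits 4-6 and a class limit of two events, so add_fields carries the LSB of the field (0x10) and test_adder carries 2^(k-1) - 1 - N = 4 - 1 - 2 = 1 in the field (also 0x10); every event in the class contributes mask 0x40 (the field's MSB) and value 0x10 (its LSB). The standalone userspace sketch below runs the same feasibility check as the first loop above and rejects the third event:)

#include <stdint.h>
#include <stdio.h>

#define ADDF	0x10ull		/* add_fields: 1 in the LSB of the field        */
#define TADD	0x10ull		/* test_adder: 2^(k-1) - 1 - N = 1 in the field */

int main(void)
{
	uint64_t value = 0, mask = 0;
	uint64_t m = 0x40, v = 0x10;	/* constraint mask/value of each event */
	int i;

	for (i = 1; i <= 3; ++i) {
		uint64_t nv = (value | v) + (value & v & ADDF);

		if ((((nv + TADD) ^ value) & mask) != 0 ||
		    (((nv + TADD) ^ v) & m) != 0) {
			printf("event %d: over the class limit\n", i);
			break;
		}
		value = nv;
		mask |= m;
		printf("event %d: ok, field now %#llx\n", i,
		       (unsigned long long)(value & 0x70));
	}
	return 0;
}

The first two events count the field up to 0x20; for the third, nv becomes 0x30 and nv + test_adder carries into bit 6, which is in the accumulated mask, so the check fails.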
229/*
230 * Check if newly-added counters have consistent settings for
231 * exclude_{user,kernel,hv} with each other and any previously
232 * added counters.
233 */
234static int check_excludes(struct perf_counter **ctrs, int n_prev, int n_new)
235{
236 int eu, ek, eh;
237 int i, n;
238 struct perf_counter *counter;
239
240 n = n_prev + n_new;
241 if (n <= 1)
242 return 0;
243
244 eu = ctrs[0]->hw_event.exclude_user;
245 ek = ctrs[0]->hw_event.exclude_kernel;
246 eh = ctrs[0]->hw_event.exclude_hv;
247 if (n_prev == 0)
248 n_prev = 1;
249 for (i = n_prev; i < n; ++i) {
250 counter = ctrs[i];
251 if (counter->hw_event.exclude_user != eu ||
252 counter->hw_event.exclude_kernel != ek ||
253 counter->hw_event.exclude_hv != eh)
254 return -EAGAIN;
255 }
256 return 0;
257}
258
259static void power_perf_read(struct perf_counter *counter)
260{
261 long val, delta, prev;
262
263 if (!counter->hw.idx)
264 return;
265 /*
266 * Performance monitor interrupts come even when interrupts
267 * are soft-disabled, as long as interrupts are hard-enabled.
268 * Therefore we treat them like NMIs.
269 */
270 do {
271 prev = atomic64_read(&counter->hw.prev_count);
272 barrier();
273 val = read_pmc(counter->hw.idx);
274 } while (atomic64_cmpxchg(&counter->hw.prev_count, prev, val) != prev);
275
276 /* The counters are only 32 bits wide */
277 delta = (val - prev) & 0xfffffffful;
278 atomic64_add(delta, &counter->count);
279 atomic64_sub(delta, &counter->hw.period_left);
280}
281
282/*
283 * Disable all counters to prevent PMU interrupts and to allow
284 * counters to be added or removed.
285 */
286u64 hw_perf_save_disable(void)
287{
288 struct cpu_hw_counters *cpuhw;
289 unsigned long ret;
290 unsigned long flags;
291
292 local_irq_save(flags);
293 cpuhw = &__get_cpu_var(cpu_hw_counters);
294
295 ret = cpuhw->disabled;
296 if (!ret) {
297 cpuhw->disabled = 1;
298 cpuhw->n_added = 0;
299
300 /*
301 * Check if we ever enabled the PMU on this cpu.
302 */
303 if (!cpuhw->pmcs_enabled) {
304 if (ppc_md.enable_pmcs)
305 ppc_md.enable_pmcs();
306 cpuhw->pmcs_enabled = 1;
307 }
308
309 /*
310 * Disable instruction sampling if it was enabled
311 */
312 if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
313 mtspr(SPRN_MMCRA,
314 cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
315 mb();
316 }
317
318 /*
319 * Set the 'freeze counters' bit.
320 * The barrier is to make sure the mtspr has been
321 * executed and the PMU has frozen the counters
322 * before we return.
323 */
324 mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) | MMCR0_FC);
325 mb();
326 }
327 local_irq_restore(flags);
328 return ret;
329}
330
331/*
332 * Re-enable all counters if disable == 0.
333 * If we were previously disabled and counters were added, then
334 * put the new config on the PMU.
335 */
336void hw_perf_restore(u64 disable)
337{
338 struct perf_counter *counter;
339 struct cpu_hw_counters *cpuhw;
340 unsigned long flags;
341 long i;
342 unsigned long val;
343 s64 left;
344 unsigned int hwc_index[MAX_HWCOUNTERS];
345
346 if (disable)
347 return;
348 local_irq_save(flags);
349 cpuhw = &__get_cpu_var(cpu_hw_counters);
350 cpuhw->disabled = 0;
351
352 /*
353 * If we didn't change anything, or only removed counters,
354 * no need to recalculate MMCR* settings and reset the PMCs.
355 * Just reenable the PMU with the current MMCR* settings
356 * (possibly updated for removal of counters).
357 */
358 if (!cpuhw->n_added) {
359 mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
360 mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
361 if (cpuhw->n_counters == 0)
362 get_lppaca()->pmcregs_in_use = 0;
363 goto out_enable;
364 }
365
366 /*
367 * Compute MMCR* values for the new set of counters
368 */
369 if (ppmu->compute_mmcr(cpuhw->events, cpuhw->n_counters, hwc_index,
370 cpuhw->mmcr)) {
371 /* shouldn't ever get here */
372 printk(KERN_ERR "oops compute_mmcr failed\n");
373 goto out;
374 }
375
376 /*
377 * Add in MMCR0 freeze bits corresponding to the
378 * hw_event.exclude_* bits for the first counter.
379 * We have already checked that all counters have the
380 * same values for these bits as the first counter.
381 */
382 counter = cpuhw->counter[0];
383 if (counter->hw_event.exclude_user)
384 cpuhw->mmcr[0] |= MMCR0_FCP;
385 if (counter->hw_event.exclude_kernel)
386 cpuhw->mmcr[0] |= freeze_counters_kernel;
387 if (counter->hw_event.exclude_hv)
388 cpuhw->mmcr[0] |= MMCR0_FCHV;
389
390 /*
391 * Write the new configuration to MMCR* with the freeze
392 * bit set and set the hardware counters to their initial values.
393 * Then unfreeze the counters.
394 */
395 get_lppaca()->pmcregs_in_use = 1;
396 mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
397 mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
398 mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE))
399 | MMCR0_FC);
400
401 /*
402 * Read off any pre-existing counters that need to move
403 * to another PMC.
404 */
405 for (i = 0; i < cpuhw->n_counters; ++i) {
406 counter = cpuhw->counter[i];
407 if (counter->hw.idx && counter->hw.idx != hwc_index[i] + 1) {
408 power_perf_read(counter);
409 write_pmc(counter->hw.idx, 0);
410 counter->hw.idx = 0;
411 }
412 }
413
414 /*
415 * Initialize the PMCs for all the new and moved counters.
416 */
417 for (i = 0; i < cpuhw->n_counters; ++i) {
418 counter = cpuhw->counter[i];
419 if (counter->hw.idx)
420 continue;
421 val = 0;
422 if (counter->hw_event.irq_period) {
423 left = atomic64_read(&counter->hw.period_left);
424 if (left < 0x80000000L)
425 val = 0x80000000L - left;
426 }
427 atomic64_set(&counter->hw.prev_count, val);
428 counter->hw.idx = hwc_index[i] + 1;
429 write_pmc(counter->hw.idx, val);
430 perf_counter_update_userpage(counter);
431 }
432 cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;
433
434 out_enable:
435 mb();
436 mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
437
438 /*
439 * Enable instruction sampling if necessary
440 */
441 if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
442 mb();
443 mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
444 }
445
446 out:
447 local_irq_restore(flags);
448}
449
450static int collect_events(struct perf_counter *group, int max_count,
451 struct perf_counter *ctrs[], unsigned int *events)
452{
453 int n = 0;
454 struct perf_counter *counter;
455
456 if (!is_software_counter(group)) {
457 if (n >= max_count)
458 return -1;
459 ctrs[n] = group;
460 events[n++] = group->hw.config;
461 }
462 list_for_each_entry(counter, &group->sibling_list, list_entry) {
463 if (!is_software_counter(counter) &&
464 counter->state != PERF_COUNTER_STATE_OFF) {
465 if (n >= max_count)
466 return -1;
467 ctrs[n] = counter;
468 events[n++] = counter->hw.config;
469 }
470 }
471 return n;
472}
473
474static void counter_sched_in(struct perf_counter *counter, int cpu)
475{
476 counter->state = PERF_COUNTER_STATE_ACTIVE;
477 counter->oncpu = cpu;
478 counter->tstamp_running += counter->ctx->time - counter->tstamp_stopped;
479 if (is_software_counter(counter))
480 counter->hw_ops->enable(counter);
481}
482
483/*
484 * Called to enable a whole group of counters.
485 * Returns 1 if the group was enabled, or -EAGAIN if it could not be.
486 * Assumes the caller has disabled interrupts and has
487 * frozen the PMU with hw_perf_save_disable.
488 */
489int hw_perf_group_sched_in(struct perf_counter *group_leader,
490 struct perf_cpu_context *cpuctx,
491 struct perf_counter_context *ctx, int cpu)
492{
493 struct cpu_hw_counters *cpuhw;
494 long i, n, n0;
495 struct perf_counter *sub;
496
497 cpuhw = &__get_cpu_var(cpu_hw_counters);
498 n0 = cpuhw->n_counters;
499 n = collect_events(group_leader, ppmu->n_counter - n0,
500 &cpuhw->counter[n0], &cpuhw->events[n0]);
501 if (n < 0)
502 return -EAGAIN;
503 if (check_excludes(cpuhw->counter, n0, n))
504 return -EAGAIN;
505 if (power_check_constraints(cpuhw->events, n + n0))
506 return -EAGAIN;
507 cpuhw->n_counters = n0 + n;
508 cpuhw->n_added += n;
509
510 /*
511 * OK, this group can go on; update counter states etc.,
512 * and enable any software counters
513 */
514 for (i = n0; i < n0 + n; ++i)
515 cpuhw->counter[i]->hw.config = cpuhw->events[i];
516 cpuctx->active_oncpu += n;
517 n = 1;
518 counter_sched_in(group_leader, cpu);
519 list_for_each_entry(sub, &group_leader->sibling_list, list_entry) {
520 if (sub->state != PERF_COUNTER_STATE_OFF) {
521 counter_sched_in(sub, cpu);
522 ++n;
523 }
524 }
525 ctx->nr_active += n;
526
527 return 1;
528}
529
530/*
531 * Add a counter to the PMU.
532 * If all counters are not already frozen, then we disable and
533 * re-enable the PMU in order to get hw_perf_restore to do the
534 * actual work of reconfiguring the PMU.
535 */
536static int power_perf_enable(struct perf_counter *counter)
537{
538 struct cpu_hw_counters *cpuhw;
539 unsigned long flags;
540 u64 pmudis;
541 int n0;
542 int ret = -EAGAIN;
543
544 local_irq_save(flags);
545 pmudis = hw_perf_save_disable();
546
547 /*
548 * Add the counter to the list (if there is room)
549 * and check whether the total set is still feasible.
550 */
551 cpuhw = &__get_cpu_var(cpu_hw_counters);
552 n0 = cpuhw->n_counters;
553 if (n0 >= ppmu->n_counter)
554 goto out;
555 cpuhw->counter[n0] = counter;
556 cpuhw->events[n0] = counter->hw.config;
557 if (check_excludes(cpuhw->counter, n0, 1))
558 goto out;
559 if (power_check_constraints(cpuhw->events, n0 + 1))
560 goto out;
561
562 counter->hw.config = cpuhw->events[n0];
563 ++cpuhw->n_counters;
564 ++cpuhw->n_added;
565
566 ret = 0;
567 out:
568 hw_perf_restore(pmudis);
569 local_irq_restore(flags);
570 return ret;
571}
572
573/*
574 * Remove a counter from the PMU.
575 */
576static void power_perf_disable(struct perf_counter *counter)
577{
578 struct cpu_hw_counters *cpuhw;
579 long i;
580 u64 pmudis;
581 unsigned long flags;
582
583 local_irq_save(flags);
584 pmudis = hw_perf_save_disable();
585
586 power_perf_read(counter);
587
588 cpuhw = &__get_cpu_var(cpu_hw_counters);
589 for (i = 0; i < cpuhw->n_counters; ++i) {
590 if (counter == cpuhw->counter[i]) {
591 while (++i < cpuhw->n_counters)
592 cpuhw->counter[i-1] = cpuhw->counter[i];
593 --cpuhw->n_counters;
594 ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr);
595 write_pmc(counter->hw.idx, 0);
596 counter->hw.idx = 0;
597 perf_counter_update_userpage(counter);
598 break;
599 }
600 }
601 if (cpuhw->n_counters == 0) {
602 /* disable exceptions if no counters are running */
603 cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE);
604 }
605
606 hw_perf_restore(pmudis);
607 local_irq_restore(flags);
608}
609
610struct hw_perf_counter_ops power_perf_ops = {
611 .enable = power_perf_enable,
612 .disable = power_perf_disable,
613 .read = power_perf_read
614};
615
616/* Number of perf_counters counting hardware events */
617static atomic_t num_counters;
618/* Used to avoid races in calling reserve/release_pmc_hardware */
619static DEFINE_MUTEX(pmc_reserve_mutex);
620
621/*
622 * Release the PMU if this is the last perf_counter.
623 */
624static void hw_perf_counter_destroy(struct perf_counter *counter)
625{
626 if (!atomic_add_unless(&num_counters, -1, 1)) {
627 mutex_lock(&pmc_reserve_mutex);
628 if (atomic_dec_return(&num_counters) == 0)
629 release_pmc_hardware();
630 mutex_unlock(&pmc_reserve_mutex);
631 }
632}
633
634const struct hw_perf_counter_ops *
635hw_perf_counter_init(struct perf_counter *counter)
636{
637 unsigned long ev;
638 struct perf_counter *ctrs[MAX_HWCOUNTERS];
639 unsigned int events[MAX_HWCOUNTERS];
640 int n;
641 int err;
642
643 if (!ppmu)
644 return ERR_PTR(-ENXIO);
645 if ((s64)counter->hw_event.irq_period < 0)
646 return ERR_PTR(-EINVAL);
647 if (!perf_event_raw(&counter->hw_event)) {
648 ev = perf_event_id(&counter->hw_event);
649 if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
650 return ERR_PTR(-EOPNOTSUPP);
651 ev = ppmu->generic_events[ev];
652 } else {
653 ev = perf_event_config(&counter->hw_event);
654 }
655 counter->hw.config_base = ev;
656 counter->hw.idx = 0;
657
658 /*
659 * If we are not running on a hypervisor, force the
660 * exclude_hv bit to 0 so that we don't care what
661 * the user set it to.
662 */
663 if (!firmware_has_feature(FW_FEATURE_LPAR))
664 counter->hw_event.exclude_hv = 0;
665
666 /*
667 * If this is in a group, check if it can go on with all the
668 * other hardware counters in the group. We assume the counter
669 * hasn't been linked into its leader's sibling list at this point.
670 */
671 n = 0;
672 if (counter->group_leader != counter) {
673 n = collect_events(counter->group_leader, ppmu->n_counter - 1,
674 ctrs, events);
675 if (n < 0)
676 return ERR_PTR(-EINVAL);
677 }
678 events[n] = ev;
679 ctrs[n] = counter;
680 if (check_excludes(ctrs, n, 1))
681 return ERR_PTR(-EINVAL);
682 if (power_check_constraints(events, n + 1))
683 return ERR_PTR(-EINVAL);
684
685 counter->hw.config = events[n];
686 atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period);
687
688 /*
689 * See if we need to reserve the PMU.
690 * If no counters are currently in use, then we have to take a
691 * mutex to ensure that we don't race with another task doing
692 * reserve_pmc_hardware or release_pmc_hardware.
693 */
694 err = 0;
695 if (!atomic_inc_not_zero(&num_counters)) {
696 mutex_lock(&pmc_reserve_mutex);
697 if (atomic_read(&num_counters) == 0 &&
698 reserve_pmc_hardware(perf_counter_interrupt))
699 err = -EBUSY;
700 else
701 atomic_inc(&num_counters);
702 mutex_unlock(&pmc_reserve_mutex);
703 }
704 counter->destroy = hw_perf_counter_destroy;
705
706 if (err)
707 return ERR_PTR(err);
708 return &power_perf_ops;
709}
710
711/*
712 * A counter has overflowed; update its count and record
713 * things if requested. Note that interrupts are hard-disabled
714 * here so there is no possibility of being interrupted.
715 */
716static void record_and_restart(struct perf_counter *counter, long val,
717 struct pt_regs *regs, int nmi)
718{
719 s64 prev, delta, left;
720 int record = 0;
721
722 /* we don't have to worry about interrupts here */
723 prev = atomic64_read(&counter->hw.prev_count);
724 delta = (val - prev) & 0xfffffffful;
725 atomic64_add(delta, &counter->count);
726
727 /*
728 * See if the total period for this counter has expired,
729 * and update for the next period.
730 */
731 val = 0;
732 left = atomic64_read(&counter->hw.period_left) - delta;
733 if (counter->hw_event.irq_period) {
734 if (left <= 0) {
735 left += counter->hw_event.irq_period;
736 if (left <= 0)
737 left = counter->hw_event.irq_period;
738 record = 1;
739 }
740 if (left < 0x80000000L)
741 val = 0x80000000L - left;
742 }
743 write_pmc(counter->hw.idx, val);
744 atomic64_set(&counter->hw.prev_count, val);
745 atomic64_set(&counter->hw.period_left, left);
746 perf_counter_update_userpage(counter);
747
748 /*
749 * Finally record data if requested.
750 */
751 if (record)
752 perf_counter_overflow(counter, nmi, regs, 0);
753}
754
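(Aside, not part of the patch: the reload arithmetic above with a concrete, invented number. Writing a 32-bit PMC with 0x80000000 - left means its sign bit sets, and the next PMU exception is raised, after exactly `left` further counted events.)

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int64_t left = 100000;	/* events until the next sample (invented) */
	uint32_t val = 0;

	if (left < 0x80000000LL)
		val = (uint32_t)(0x80000000LL - left);

	/* the PMC counts up from val; its sign bit sets after `left` events */
	printf("reload value %#x, sign bit after %lld more events\n",
	       (unsigned int)val, (long long)(0x80000000LL - val));
	return 0;
}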
755/*
756 * Performance monitor interrupt stuff
757 */
758static void perf_counter_interrupt(struct pt_regs *regs)
759{
760 int i;
761 struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
762 struct perf_counter *counter;
763 long val;
764 int found = 0;
765 int nmi;
766
767 /*
768 * If interrupts were soft-disabled when this PMU interrupt
769 * occurred, treat it as an NMI.
770 */
771 nmi = !regs->softe;
772 if (nmi)
773 nmi_enter();
774 else
775 irq_enter();
776
777 for (i = 0; i < cpuhw->n_counters; ++i) {
778 counter = cpuhw->counter[i];
779 val = read_pmc(counter->hw.idx);
780 if ((int)val < 0) {
781 /* counter has overflowed */
782 found = 1;
783 record_and_restart(counter, val, regs, nmi);
784 }
785 }
786
787 /*
788 * In case we didn't find and reset the counter that caused
789 * the interrupt, scan all counters and reset any that are
790 * negative, to avoid getting continual interrupts.
791 * Any that we processed in the previous loop will not be negative.
792 */
793 if (!found) {
794 for (i = 0; i < ppmu->n_counter; ++i) {
795 val = read_pmc(i + 1);
796 if ((int)val < 0)
797 write_pmc(i + 1, 0);
798 }
799 }
800
801 /*
802 * Reset MMCR0 to its normal value. This will set PMXE and
803 * clear FC (freeze counters) and PMAO (perf mon alert occurred)
804 * and thus allow interrupts to occur again.
805 * XXX might want to use MSR.PM to keep the counters frozen until
806 * we get back out of this interrupt.
807 */
808 mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
809
810 if (nmi)
811 nmi_exit();
812 else
813 irq_exit();
814}
815
816void hw_perf_counter_setup(int cpu)
817{
818 struct cpu_hw_counters *cpuhw = &per_cpu(cpu_hw_counters, cpu);
819
820 memset(cpuhw, 0, sizeof(*cpuhw));
821 cpuhw->mmcr[0] = MMCR0_FC;
822}
823
824extern struct power_pmu power4_pmu;
825extern struct power_pmu ppc970_pmu;
826extern struct power_pmu power5_pmu;
827extern struct power_pmu power5p_pmu;
828extern struct power_pmu power6_pmu;
829
830static int init_perf_counters(void)
831{
832 unsigned long pvr;
833
834 /* XXX should get this from cputable */
835 pvr = mfspr(SPRN_PVR);
836 switch (PVR_VER(pvr)) {
837 case PV_POWER4:
838 case PV_POWER4p:
839 ppmu = &power4_pmu;
840 break;
841 case PV_970:
842 case PV_970FX:
843 case PV_970MP:
844 ppmu = &ppc970_pmu;
845 break;
846 case PV_POWER5:
847 ppmu = &power5_pmu;
848 break;
849 case PV_POWER5p:
850 ppmu = &power5p_pmu;
851 break;
852 case 0x3e:
853 ppmu = &power6_pmu;
854 break;
855 }
856
857 /*
858 * Use FCHV to ignore kernel events if MSR.HV is set.
859 */
860 if (mfmsr() & MSR_HV)
861 freeze_counters_kernel = MMCR0_FCHV;
862
863 return 0;
864}
865
866arch_initcall(init_perf_counters);
diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c
new file mode 100644
index 000000000000..1407b19ab619
--- /dev/null
+++ b/arch/powerpc/kernel/power4-pmu.c
@@ -0,0 +1,557 @@
1/*
2 * Performance counter support for POWER4 (GP) and POWER4+ (GQ) processors.
3 *
4 * Copyright 2009 Paul Mackerras, IBM Corporation.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/kernel.h>
12#include <linux/perf_counter.h>
13#include <asm/reg.h>
14
15/*
16 * Bits in event code for POWER4
17 */
18#define PM_PMC_SH 12 /* PMC number (1-based) for direct events */
19#define PM_PMC_MSK 0xf
20#define PM_UNIT_SH 8 /* TTMMUX number and setting - unit select */
21#define PM_UNIT_MSK 0xf
22#define PM_LOWER_SH 6
23#define PM_LOWER_MSK 1
24#define PM_LOWER_MSKS 0x40
25#define PM_BYTE_SH 4 /* Byte number of event bus to use */
26#define PM_BYTE_MSK 3
27#define PM_PMCSEL_MSK 7
28
29/*
30 * Unit code values
31 */
32#define PM_FPU 1
33#define PM_ISU1 2
34#define PM_IFU 3
35#define PM_IDU0 4
36#define PM_ISU1_ALT 6
37#define PM_ISU2 7
38#define PM_IFU_ALT 8
39#define PM_LSU0 9
40#define PM_LSU1 0xc
41#define PM_GPS 0xf
42
43/*
44 * Bits in MMCR0 for POWER4
45 */
46#define MMCR0_PMC1SEL_SH 8
47#define MMCR0_PMC2SEL_SH 1
48#define MMCR_PMCSEL_MSK 0x1f
49
50/*
51 * Bits in MMCR1 for POWER4
52 */
53#define MMCR1_TTM0SEL_SH 62
54#define MMCR1_TTC0SEL_SH 61
55#define MMCR1_TTM1SEL_SH 59
56#define MMCR1_TTC1SEL_SH 58
57#define MMCR1_TTM2SEL_SH 56
58#define MMCR1_TTC2SEL_SH 55
59#define MMCR1_TTM3SEL_SH 53
60#define MMCR1_TTC3SEL_SH 52
61#define MMCR1_TTMSEL_MSK 3
62#define MMCR1_TD_CP_DBG0SEL_SH 50
63#define MMCR1_TD_CP_DBG1SEL_SH 48
64#define MMCR1_TD_CP_DBG2SEL_SH 46
65#define MMCR1_TD_CP_DBG3SEL_SH 44
66#define MMCR1_DEBUG0SEL_SH 43
67#define MMCR1_DEBUG1SEL_SH 42
68#define MMCR1_DEBUG2SEL_SH 41
69#define MMCR1_DEBUG3SEL_SH 40
70#define MMCR1_PMC1_ADDER_SEL_SH 39
71#define MMCR1_PMC2_ADDER_SEL_SH 38
72#define MMCR1_PMC6_ADDER_SEL_SH 37
73#define MMCR1_PMC5_ADDER_SEL_SH 36
74#define MMCR1_PMC8_ADDER_SEL_SH 35
75#define MMCR1_PMC7_ADDER_SEL_SH 34
76#define MMCR1_PMC3_ADDER_SEL_SH 33
77#define MMCR1_PMC4_ADDER_SEL_SH 32
78#define MMCR1_PMC3SEL_SH 27
79#define MMCR1_PMC4SEL_SH 22
80#define MMCR1_PMC5SEL_SH 17
81#define MMCR1_PMC6SEL_SH 12
82#define MMCR1_PMC7SEL_SH 7
83#define MMCR1_PMC8SEL_SH 2 /* note bit 0 is in MMCRA for GP */
84
85static short mmcr1_adder_bits[8] = {
86 MMCR1_PMC1_ADDER_SEL_SH,
87 MMCR1_PMC2_ADDER_SEL_SH,
88 MMCR1_PMC3_ADDER_SEL_SH,
89 MMCR1_PMC4_ADDER_SEL_SH,
90 MMCR1_PMC5_ADDER_SEL_SH,
91 MMCR1_PMC6_ADDER_SEL_SH,
92 MMCR1_PMC7_ADDER_SEL_SH,
93 MMCR1_PMC8_ADDER_SEL_SH
94};
95
96/*
97 * Bits in MMCRA
98 */
99#define MMCRA_PMC8SEL0_SH 17 /* PMC8SEL bit 0 for GP */
100
101/*
102 * Layout of constraint bits:
103 * 6666555555555544444444443333333333222222222211111111110000000000
104 * 3210987654321098765432109876543210987654321098765432109876543210
105 * |[ >[ >[ >|||[ >[ >< >< >< >< ><><><><><><><><>
106 * | UC1 UC2 UC3 ||| PS1 PS2 B0 B1 B2 B3 P1P2P3P4P5P6P7P8
107 * \SMPL ||\TTC3SEL
108 * |\TTC_IFU_SEL
109 * \TTM2SEL0
110 *
111 * SMPL - SAMPLE_ENABLE constraint
112 * 56: SAMPLE_ENABLE value 0x0100_0000_0000_0000
113 *
114 * UC1 - unit constraint 1: can't have all three of FPU/ISU1/IDU0|ISU2
115 * 55: UC1 error 0x0080_0000_0000_0000
116 * 54: FPU events needed 0x0040_0000_0000_0000
117 * 53: ISU1 events needed 0x0020_0000_0000_0000
118 * 52: IDU0|ISU2 events needed 0x0010_0000_0000_0000
119 *
120 * UC2 - unit constraint 2: can't have all three of FPU/IFU/LSU0
121 * 51: UC2 error 0x0008_0000_0000_0000
122 * 50: FPU events needed 0x0004_0000_0000_0000
123 * 49: IFU events needed 0x0002_0000_0000_0000
124 * 48: LSU0 events needed 0x0001_0000_0000_0000
125 *
126 * UC3 - unit constraint 3: can't have all four of LSU0/IFU/IDU0|ISU2/ISU1
127 * 47: UC3 error 0x8000_0000_0000
128 * 46: LSU0 events needed 0x4000_0000_0000
129 * 45: IFU events needed 0x2000_0000_0000
130 * 44: IDU0|ISU2 events needed 0x1000_0000_0000
131 * 43: ISU1 events needed 0x0800_0000_0000
132 *
133 * TTM2SEL0
134 * 42: 0 = IDU0 events needed
135 * 1 = ISU2 events needed 0x0400_0000_0000
136 *
137 * TTC_IFU_SEL
138 * 41: 0 = IFU.U events needed
139 * 1 = IFU.L events needed 0x0200_0000_0000
140 *
141 * TTC3SEL
142 * 40: 0 = LSU1.U events needed
143 * 1 = LSU1.L events needed 0x0100_0000_0000
144 *
145 * PS1
146 * 39: PS1 error 0x0080_0000_0000
147 * 36-38: count of events needing PMC1/2/5/6 0x0070_0000_0000
148 *
149 * PS2
150 * 35: PS2 error 0x0008_0000_0000
151 * 32-34: count of events needing PMC3/4/7/8 0x0007_0000_0000
152 *
153 * B0
154 * 28-31: Byte 0 event source 0xf000_0000
155 * 1 = FPU
156 * 2 = ISU1
157 * 3 = IFU
158 * 4 = IDU0
159 * 7 = ISU2
160 * 9 = LSU0
161 * c = LSU1
162 * f = GPS
163 *
164 * B1, B2, B3
165 * 24-27, 20-23, 16-19: Byte 1, 2, 3 event sources
166 *
167 * P8
168 * 15: P8 error 0x8000
169 * 14-15: Count of events needing PMC8
170 *
171 * P1..P7
172 * 0-13: Count of events needing PMC1..PMC7
173 *
174 * Note: this doesn't allow events using IFU.U to be combined with events
175 * using IFU.L, though that is feasible (using TTM0 and TTM2). However
176 * there are no listed events for IFU.L (they are debug events not
177 * verified for performance monitoring) so this shouldn't cause a
178 * problem.
179 */
180
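(Aside, not part of the patch: the event-code fields described above can be pulled apart with the PM_* masks from the top of this file. The standalone sketch below decodes 0x330, the PM_BR_ISSUED code that appears in p4_generic_events further down; it prints pmc=0 unit=3 byte=3 lower=0 psel=0, which p4_compute_mmcr() treats as a bus event sourced from the IFU on byte 3.)

#include <stdio.h>

#define PM_PMC_SH	12
#define PM_PMC_MSK	0xf
#define PM_UNIT_SH	8
#define PM_UNIT_MSK	0xf
#define PM_BYTE_SH	4
#define PM_BYTE_MSK	3
#define PM_LOWER_SH	6
#define PM_LOWER_MSK	1
#define PM_PMCSEL_MSK	7

int main(void)
{
	unsigned int event = 0x330;	/* PM_BR_ISSUED */

	printf("pmc=%u unit=%u byte=%u lower=%u psel=%u\n",
	       (event >> PM_PMC_SH) & PM_PMC_MSK,
	       (event >> PM_UNIT_SH) & PM_UNIT_MSK,
	       (event >> PM_BYTE_SH) & PM_BYTE_MSK,
	       (event >> PM_LOWER_SH) & PM_LOWER_MSK,
	       event & PM_PMCSEL_MSK);
	return 0;
}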
181static struct unitinfo {
182 u64 value, mask;
183 int unit;
184 int lowerbit;
185} p4_unitinfo[16] = {
186 [PM_FPU] = { 0x44000000000000ull, 0x88000000000000ull, PM_FPU, 0 },
187 [PM_ISU1] = { 0x20080000000000ull, 0x88000000000000ull, PM_ISU1, 0 },
188 [PM_ISU1_ALT] =
189 { 0x20080000000000ull, 0x88000000000000ull, PM_ISU1, 0 },
190 [PM_IFU] = { 0x02200000000000ull, 0x08820000000000ull, PM_IFU, 41 },
191 [PM_IFU_ALT] =
192 { 0x02200000000000ull, 0x08820000000000ull, PM_IFU, 41 },
193 [PM_IDU0] = { 0x10100000000000ull, 0x80840000000000ull, PM_IDU0, 1 },
194 [PM_ISU2] = { 0x10140000000000ull, 0x80840000000000ull, PM_ISU2, 0 },
195 [PM_LSU0] = { 0x01400000000000ull, 0x08800000000000ull, PM_LSU0, 0 },
196 [PM_LSU1] = { 0x00000000000000ull, 0x00010000000000ull, PM_LSU1, 40 },
197 [PM_GPS] = { 0x00000000000000ull, 0x00000000000000ull, PM_GPS, 0 }
198};
199
200static unsigned char direct_marked_event[8] = {
201 (1<<2) | (1<<3), /* PMC1: PM_MRK_GRP_DISP, PM_MRK_ST_CMPL */
202 (1<<3) | (1<<5), /* PMC2: PM_THRESH_TIMEO, PM_MRK_BRU_FIN */
203 (1<<3), /* PMC3: PM_MRK_ST_CMPL_INT */
204 (1<<4) | (1<<5), /* PMC4: PM_MRK_GRP_CMPL, PM_MRK_CRU_FIN */
205 (1<<4) | (1<<5), /* PMC5: PM_MRK_GRP_TIMEO */
206 (1<<3) | (1<<4) | (1<<5),
207 /* PMC6: PM_MRK_ST_GPS, PM_MRK_FXU_FIN, PM_MRK_GRP_ISSUED */
208 (1<<4) | (1<<5), /* PMC7: PM_MRK_FPU_FIN, PM_MRK_INST_FIN */
209 (1<<4), /* PMC8: PM_MRK_LSU_FIN */
210};
211
212/*
213 * Returns 1 if event counts things relating to marked instructions
214 * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
215 */
216static int p4_marked_instr_event(unsigned int event)
217{
218 int pmc, psel, unit, byte, bit;
219 unsigned int mask;
220
221 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
222 psel = event & PM_PMCSEL_MSK;
223 if (pmc) {
224 if (direct_marked_event[pmc - 1] & (1 << psel))
225 return 1;
226 if (psel == 0) /* add events */
227 bit = (pmc <= 4)? pmc - 1: 8 - pmc;
228 else if (psel == 6) /* decode events */
229 bit = 4;
230 else
231 return 0;
232 } else
233 bit = psel;
234
235 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
236 unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
237 mask = 0;
238 switch (unit) {
239 case PM_LSU1:
240 if (event & PM_LOWER_MSKS)
241 mask = 1 << 28; /* byte 7 bit 4 */
242 else
243 mask = 6 << 24; /* byte 3 bits 1 and 2 */
244 break;
245 case PM_LSU0:
246 /* byte 3, bit 3; byte 2 bits 0,2,3,4,5; byte 1 */
247 mask = 0x083dff00;
248 }
249 return (mask >> (byte * 8 + bit)) & 1;
250}
251
252static int p4_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
253{
254 int pmc, byte, unit, lower, sh;
255 u64 mask = 0, value = 0;
256 int grp = -1;
257
258 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
259 if (pmc) {
260 if (pmc > 8)
261 return -1;
262 sh = (pmc - 1) * 2;
263 mask |= 2 << sh;
264 value |= 1 << sh;
265 grp = ((pmc - 1) >> 1) & 1;
266 }
267 unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
268 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
269 if (unit) {
270 lower = (event >> PM_LOWER_SH) & PM_LOWER_MSK;
271
272 /*
273 * Bus events on bytes 0 and 2 can be counted
274 * on PMC1/2/5/6; bytes 1 and 3 on PMC3/4/7/8.
275 */
276 if (!pmc)
277 grp = byte & 1;
278
279 if (!p4_unitinfo[unit].unit)
280 return -1;
281 mask |= p4_unitinfo[unit].mask;
282 value |= p4_unitinfo[unit].value;
283 sh = p4_unitinfo[unit].lowerbit;
284 if (sh > 1)
285 value |= (u64)lower << sh;
286 else if (lower != sh)
287 return -1;
288 unit = p4_unitinfo[unit].unit;
289
290 /* Set byte lane select field */
291 mask |= 0xfULL << (28 - 4 * byte);
292 value |= (u64)unit << (28 - 4 * byte);
293 }
294 if (grp == 0) {
295 /* increment PMC1/2/5/6 field */
296 mask |= 0x8000000000ull;
297 value |= 0x1000000000ull;
298 } else {
299 /* increment PMC3/4/7/8 field */
300 mask |= 0x800000000ull;
301 value |= 0x100000000ull;
302 }
303
304 /* Marked instruction events need sample_enable set */
305 if (p4_marked_instr_event(event)) {
306 mask |= 1ull << 56;
307 value |= 1ull << 56;
308 }
309
310 /* PMCSEL=6 decode events on byte 2 need sample_enable clear */
311 if (pmc && (event & PM_PMCSEL_MSK) == 6 && byte == 2)
312 mask |= 1ull << 56;
313
314 *maskp = mask;
315 *valp = value;
316 return 0;
317}
318
319static unsigned int ppc_inst_cmpl[] = {
320 0x1001, 0x4001, 0x6001, 0x7001, 0x8001
321};
322
323static int p4_get_alternatives(unsigned int event, unsigned int alt[])
324{
325 int i, j, na;
326
327 alt[0] = event;
328 na = 1;
329
330 /* 2 possibilities for PM_GRP_DISP_REJECT */
331 if (event == 0x8003 || event == 0x0224) {
332 alt[1] = event ^ (0x8003 ^ 0x0224);
333 return 2;
334 }
335
336 /* 2 possibilities for PM_ST_MISS_L1 */
337 if (event == 0x0c13 || event == 0x0c23) {
338 alt[1] = event ^ (0x0c13 ^ 0x0c23);
339 return 2;
340 }
341
342 /* several possibilities for PM_INST_CMPL */
343 for (i = 0; i < ARRAY_SIZE(ppc_inst_cmpl); ++i) {
344 if (event == ppc_inst_cmpl[i]) {
345 for (j = 0; j < ARRAY_SIZE(ppc_inst_cmpl); ++j)
346 if (j != i)
347 alt[na++] = ppc_inst_cmpl[j];
348 break;
349 }
350 }
351
352 return na;
353}
354
355static int p4_compute_mmcr(unsigned int event[], int n_ev,
356 unsigned int hwc[], u64 mmcr[])
357{
358 u64 mmcr0 = 0, mmcr1 = 0, mmcra = 0;
359 unsigned int pmc, unit, byte, psel, lower;
360 unsigned int ttm, grp;
361 unsigned int pmc_inuse = 0;
362 unsigned int pmc_grp_use[2];
363 unsigned char busbyte[4];
364 unsigned char unituse[16];
365 unsigned int unitlower = 0;
366 int i;
367
368 if (n_ev > 8)
369 return -1;
370
371 /* First pass to count resource use */
372 pmc_grp_use[0] = pmc_grp_use[1] = 0;
373 memset(busbyte, 0, sizeof(busbyte));
374 memset(unituse, 0, sizeof(unituse));
375 for (i = 0; i < n_ev; ++i) {
376 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
377 if (pmc) {
378 if (pmc_inuse & (1 << (pmc - 1)))
379 return -1;
380 pmc_inuse |= 1 << (pmc - 1);
381 /* count 1/2/5/6 vs 3/4/7/8 use */
382 ++pmc_grp_use[((pmc - 1) >> 1) & 1];
383 }
384 unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
385 byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
386 lower = (event[i] >> PM_LOWER_SH) & PM_LOWER_MSK;
387 if (unit) {
388 if (!pmc)
389 ++pmc_grp_use[byte & 1];
390 if (unit == 6 || unit == 8)
391 /* map alt ISU1/IFU codes: 6->2, 8->3 */
392 unit = (unit >> 1) - 1;
393 if (busbyte[byte] && busbyte[byte] != unit)
394 return -1;
395 busbyte[byte] = unit;
396 lower <<= unit;
397 if (unituse[unit] && lower != (unitlower & lower))
398 return -1;
399 unituse[unit] = 1;
400 unitlower |= lower;
401 }
402 }
403 if (pmc_grp_use[0] > 4 || pmc_grp_use[1] > 4)
404 return -1;
405
406 /*
407 * Assign resources and set multiplexer selects.
408 *
409 * Units 1,2,3 are on TTM0, 4,6,7 on TTM1, 8,10 on TTM2.
410 * Each TTMx can only select one unit, but since
411 * units 2 and 6 are both ISU1, and 3 and 8 are both IFU,
412 * we have some choices.
413 */
414 if (unituse[2] & (unituse[1] | (unituse[3] & unituse[9]))) {
415 unituse[6] = 1; /* Move 2 to 6 */
416 unituse[2] = 0;
417 }
418 if (unituse[3] & (unituse[1] | unituse[2])) {
419 unituse[8] = 1; /* Move 3 to 8 */
420 unituse[3] = 0;
421 unitlower = (unitlower & ~8) | ((unitlower & 8) << 5);
422 }
423 /* Check only one unit per TTMx */
424 if (unituse[1] + unituse[2] + unituse[3] > 1 ||
425 unituse[4] + unituse[6] + unituse[7] > 1 ||
426 unituse[8] + unituse[9] > 1 ||
427 (unituse[5] | unituse[10] | unituse[11] |
428 unituse[13] | unituse[14]))
429 return -1;
430
431 /* Set TTMxSEL fields. Note, units 1-3 => TTM0SEL codes 0-2 */
432 mmcr1 |= (u64)(unituse[3] * 2 + unituse[2]) << MMCR1_TTM0SEL_SH;
433 mmcr1 |= (u64)(unituse[7] * 3 + unituse[6] * 2) << MMCR1_TTM1SEL_SH;
434 mmcr1 |= (u64)unituse[9] << MMCR1_TTM2SEL_SH;
435
436 /* Set TTCxSEL fields. */
437 if (unitlower & 0xe)
438 mmcr1 |= 1ull << MMCR1_TTC0SEL_SH;
439 if (unitlower & 0xf0)
440 mmcr1 |= 1ull << MMCR1_TTC1SEL_SH;
441 if (unitlower & 0xf00)
442 mmcr1 |= 1ull << MMCR1_TTC2SEL_SH;
443 if (unitlower & 0x7000)
444 mmcr1 |= 1ull << MMCR1_TTC3SEL_SH;
445
446 /* Set byte lane select fields. */
447 for (byte = 0; byte < 4; ++byte) {
448 unit = busbyte[byte];
449 if (!unit)
450 continue;
451 if (unit == 0xf) {
452 /* special case for GPS */
453 mmcr1 |= 1ull << (MMCR1_DEBUG0SEL_SH - byte);
454 } else {
455 if (!unituse[unit])
456 ttm = unit - 1; /* 2->1, 3->2 */
457 else
458 ttm = unit >> 2;
459 mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2*byte);
460 }
461 }
462
463 /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
464 for (i = 0; i < n_ev; ++i) {
465 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
466 unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
467 byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
468 psel = event[i] & PM_PMCSEL_MSK;
469 if (!pmc) {
470 /* Bus event or 00xxx direct event (off or cycles) */
471 if (unit)
472 psel |= 0x10 | ((byte & 2) << 2);
473 for (pmc = 0; pmc < 8; ++pmc) {
474 if (pmc_inuse & (1 << pmc))
475 continue;
476 grp = (pmc >> 1) & 1;
477 if (unit) {
478 if (grp == (byte & 1))
479 break;
480 } else if (pmc_grp_use[grp] < 4) {
481 ++pmc_grp_use[grp];
482 break;
483 }
484 }
485 pmc_inuse |= 1 << pmc;
486 } else {
487 /* Direct event */
488 --pmc;
489 if (psel == 0 && (byte & 2))
490 /* add events on higher-numbered bus */
491 mmcr1 |= 1ull << mmcr1_adder_bits[pmc];
492 else if (psel == 6 && byte == 3)
493 /* seem to need to set sample_enable here */
494 mmcra |= MMCRA_SAMPLE_ENABLE;
495 psel |= 8;
496 }
497 if (pmc <= 1)
498 mmcr0 |= psel << (MMCR0_PMC1SEL_SH - 7 * pmc);
499 else
500 mmcr1 |= psel << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2));
501 if (pmc == 7) /* PMC8 */
502 mmcra |= (psel & 1) << MMCRA_PMC8SEL0_SH;
503 hwc[i] = pmc;
504 if (p4_marked_instr_event(event[i]))
505 mmcra |= MMCRA_SAMPLE_ENABLE;
506 }
507
508 if (pmc_inuse & 1)
509 mmcr0 |= MMCR0_PMC1CE;
510 if (pmc_inuse & 0xfe)
511 mmcr0 |= MMCR0_PMCjCE;
512
513 mmcra |= 0x2000; /* mark only one IOP per PPC instruction */
514
515 /* Return MMCRx values */
516 mmcr[0] = mmcr0;
517 mmcr[1] = mmcr1;
518 mmcr[2] = mmcra;
519 return 0;
520}
521
522static void p4_disable_pmc(unsigned int pmc, u64 mmcr[])
523{
524 /*
525 * Setting the PMCxSEL field to 0 disables PMC x.
526 * (Note that pmc is 0-based here, not 1-based.)
527 */
528 if (pmc <= 1) {
529 mmcr[0] &= ~(0x1fUL << (MMCR0_PMC1SEL_SH - 7 * pmc));
530 } else {
531 mmcr[1] &= ~(0x1fUL << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2)));
532 if (pmc == 7)
533 mmcr[2] &= ~(1UL << MMCRA_PMC8SEL0_SH);
534 }
535}
536
537static int p4_generic_events[] = {
538 [PERF_COUNT_CPU_CYCLES] = 7,
539 [PERF_COUNT_INSTRUCTIONS] = 0x1001,
540 [PERF_COUNT_CACHE_REFERENCES] = 0x8c10, /* PM_LD_REF_L1 */
541 [PERF_COUNT_CACHE_MISSES] = 0x3c10, /* PM_LD_MISS_L1 */
542 [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x330, /* PM_BR_ISSUED */
543 [PERF_COUNT_BRANCH_MISSES] = 0x331, /* PM_BR_MPRED_CR */
544};
545
546struct power_pmu power4_pmu = {
547 .n_counter = 8,
548 .max_alternatives = 5,
549 .add_fields = 0x0000001100005555ull,
550 .test_adder = 0x0011083300000000ull,
551 .compute_mmcr = p4_compute_mmcr,
552 .get_constraint = p4_get_constraint,
553 .get_alternatives = p4_get_alternatives,
554 .disable_pmc = p4_disable_pmc,
555 .n_generic = ARRAY_SIZE(p4_generic_events),
556 .generic_events = p4_generic_events,
557};
diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c
new file mode 100644
index 000000000000..1222c8ea3c26
--- /dev/null
+++ b/arch/powerpc/kernel/power5+-pmu.c
@@ -0,0 +1,551 @@
1/*
2 * Performance counter support for POWER5+/++ (not POWER5) processors.
3 *
4 * Copyright 2009 Paul Mackerras, IBM Corporation.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/kernel.h>
12#include <linux/perf_counter.h>
13#include <asm/reg.h>
14
15/*
16 * Bits in event code for POWER5+ (POWER5 GS) and POWER5++ (POWER5 GS DD3)
17 */
18#define PM_PMC_SH 20 /* PMC number (1-based) for direct events */
19#define PM_PMC_MSK 0xf
20#define PM_PMC_MSKS (PM_PMC_MSK << PM_PMC_SH)
21#define PM_UNIT_SH 16 /* TTMMUX number and setting - unit select */
22#define PM_UNIT_MSK 0xf
23#define PM_BYTE_SH 12 /* Byte number of event bus to use */
24#define PM_BYTE_MSK 7
25#define PM_GRS_SH 8 /* Storage subsystem mux select */
26#define PM_GRS_MSK 7
27#define PM_BUSEVENT_MSK 0x80 /* Set if event uses event bus */
28#define PM_PMCSEL_MSK 0x7f
29
30/* Values in PM_UNIT field */
31#define PM_FPU 0
32#define PM_ISU0 1
33#define PM_IFU 2
34#define PM_ISU1 3
35#define PM_IDU 4
36#define PM_ISU0_ALT 6
37#define PM_GRS 7
38#define PM_LSU0 8
39#define PM_LSU1 0xc
40#define PM_LASTUNIT 0xc
41
42/*
43 * Bits in MMCR1 for POWER5+
44 */
45#define MMCR1_TTM0SEL_SH 62
46#define MMCR1_TTM1SEL_SH 60
47#define MMCR1_TTM2SEL_SH 58
48#define MMCR1_TTM3SEL_SH 56
49#define MMCR1_TTMSEL_MSK 3
50#define MMCR1_TD_CP_DBG0SEL_SH 54
51#define MMCR1_TD_CP_DBG1SEL_SH 52
52#define MMCR1_TD_CP_DBG2SEL_SH 50
53#define MMCR1_TD_CP_DBG3SEL_SH 48
54#define MMCR1_GRS_L2SEL_SH 46
55#define MMCR1_GRS_L2SEL_MSK 3
56#define MMCR1_GRS_L3SEL_SH 44
57#define MMCR1_GRS_L3SEL_MSK 3
58#define MMCR1_GRS_MCSEL_SH 41
59#define MMCR1_GRS_MCSEL_MSK 7
60#define MMCR1_GRS_FABSEL_SH 39
61#define MMCR1_GRS_FABSEL_MSK 3
62#define MMCR1_PMC1_ADDER_SEL_SH 35
63#define MMCR1_PMC2_ADDER_SEL_SH 34
64#define MMCR1_PMC3_ADDER_SEL_SH 33
65#define MMCR1_PMC4_ADDER_SEL_SH 32
66#define MMCR1_PMC1SEL_SH 25
67#define MMCR1_PMC2SEL_SH 17
68#define MMCR1_PMC3SEL_SH 9
69#define MMCR1_PMC4SEL_SH 1
70#define MMCR1_PMCSEL_SH(n) (MMCR1_PMC1SEL_SH - (n) * 8)
71#define MMCR1_PMCSEL_MSK 0x7f
72
73/*
74 * Bits in MMCRA
75 */
76
77/*
78 * Layout of constraint bits:
79 * 6666555555555544444444443333333333222222222211111111110000000000
80 * 3210987654321098765432109876543210987654321098765432109876543210
81 * [ ><><>< ><> <><>[ > < >< >< >< ><><><><>
82 * NC G0G1G2 G3 T0T1 UC B0 B1 B2 B3 P4P3P2P1
83 *
84 * NC - number of counters
85 * 51: NC error 0x0008_0000_0000_0000
86 * 48-50: number of events needing PMC1-4 0x0007_0000_0000_0000
87 *
88 * G0..G3 - GRS mux constraints
89 * 46-47: GRS_L2SEL value
90 * 44-45: GRS_L3SEL value
91 * 41-43: GRS_MCSEL value
92 * 39-40: GRS_FABSEL value
93 * Note that these match up with their bit positions in MMCR1
94 *
95 * T0 - TTM0 constraint
96 * 36-37: TTM0SEL value (0=FPU, 2=IFU, 3=ISU1) 0x30_0000_0000
97 *
98 * T1 - TTM1 constraint
99 * 34-35: TTM1SEL value (0=IDU, 3=GRS) 0x0c_0000_0000
100 *
101 * UC - unit constraint: can't have all three of FPU|IFU|ISU1, ISU0, IDU|GRS
102 * 33: UC3 error 0x02_0000_0000
103 * 32: FPU|IFU|ISU1 events needed 0x01_0000_0000
104 * 31: ISU0 events needed 0x00_8000_0000
105 * 30: IDU|GRS events needed 0x00_4000_0000
106 *
107 * B0
108 * 20-23: Byte 0 event source 0x00f0_0000
109 * Encoding as for the event code
110 *
111 * B1, B2, B3
112 * 16-19, 12-15, 8-11: Byte 1, 2, 3 event sources
113 *
114 * P4
115 * 7: P4 error 0x80
116 * 6-7: Count of events needing PMC4
117 *
118 * P1..P3
119 * 0-5: Count of events needing PMC1..PMC3
120 */
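Editorial aside (not part of the patch): as a worked example of the layout above, a direct event pinned to PMC2 makes power5p_get_constraint() below set sh = (2 - 1) * 2 = 2, so the mask gets bit 3 and the value gets bit 2 (the P2 field); because every PMC1-4 event also bumps the NC field, the mask further gets bit 51 (0x0008_0000_0000_0000) and the value bit 48 (0x0001_0000_0000_0000). Two events both pinned to PMC2 then add their value bits and the carry lands in the masked bit 3, so the pair is rejected; the tiny sketch below just checks that arithmetic. The full feasibility test in the core code also folds in add_fields/test_adder.

	/* Editorial sketch: the arithmetic behind the P2 and NC fields above. */
	#include <stdio.h>

	int main(void)
	{
		unsigned long long mask  = (2ULL << 2) | (1ULL << 51);	/* P2 error bit + NC error bit */
		unsigned long long value = (1ULL << 2) | (1ULL << 48);	/* one PMC2 user, one PMC1-4 user */

		/* Two identical events: adding the values overflows into a masked bit. */
		printf("clash=%d\n", ((value + value) & mask) != 0);	/* prints clash=1 */
		return 0;
	}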
121
122static const int grsel_shift[8] = {
123 MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH,
124 MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH,
125 MMCR1_GRS_MCSEL_SH, MMCR1_GRS_FABSEL_SH
126};
127
128/* Masks and values for using events from the various units */
129static u64 unit_cons[PM_LASTUNIT+1][2] = {
130 [PM_FPU] = { 0x3200000000ull, 0x0100000000ull },
131 [PM_ISU0] = { 0x0200000000ull, 0x0080000000ull },
132 [PM_ISU1] = { 0x3200000000ull, 0x3100000000ull },
133 [PM_IFU] = { 0x3200000000ull, 0x2100000000ull },
134 [PM_IDU] = { 0x0e00000000ull, 0x0040000000ull },
135 [PM_GRS] = { 0x0e00000000ull, 0x0c40000000ull },
136};
137
138static int power5p_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
139{
140 int pmc, byte, unit, sh;
141 int bit, fmask;
142 u64 mask = 0, value = 0;
143
144 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
145 if (pmc) {
146 if (pmc > 4)
147 return -1;
148 sh = (pmc - 1) * 2;
149 mask |= 2 << sh;
150 value |= 1 << sh;
151 }
152 if (event & PM_BUSEVENT_MSK) {
153 unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
154 if (unit > PM_LASTUNIT)
155 return -1;
156 if (unit == PM_ISU0_ALT)
157 unit = PM_ISU0;
158 mask |= unit_cons[unit][0];
159 value |= unit_cons[unit][1];
160 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
161 if (byte >= 4) {
162 if (unit != PM_LSU1)
163 return -1;
164 /* Map LSU1 low word (bytes 4-7) to unit LSU1+1 */
165 ++unit;
166 byte &= 3;
167 }
168 if (unit == PM_GRS) {
169 bit = event & 7;
170 fmask = (bit == 6)? 7: 3;
171 sh = grsel_shift[bit];
172 mask |= (u64)fmask << sh;
173 value |= (u64)((event >> PM_GRS_SH) & fmask) << sh;
174 }
175 /* Set byte lane select field */
176 mask |= 0xfULL << (20 - 4 * byte);
177 value |= (u64)unit << (20 - 4 * byte);
178 }
179 mask |= 0x8000000000000ull;
180 value |= 0x1000000000000ull;
181 *maskp = mask;
182 *valp = value;
183 return 0;
184}
185
186#define MAX_ALT 3 /* at most 3 alternatives for any event */
187
188static const unsigned int event_alternatives[][MAX_ALT] = {
189 { 0x100c0, 0x40001f }, /* PM_GCT_FULL_CYC */
190 { 0x120e4, 0x400002 }, /* PM_GRP_DISP_REJECT */
191 { 0x230e2, 0x323087 }, /* PM_BR_PRED_CR */
192 { 0x230e3, 0x223087, 0x3230a0 }, /* PM_BR_PRED_TA */
193 { 0x410c7, 0x441084 }, /* PM_THRD_L2MISS_BOTH_CYC */
194 { 0x800c4, 0xc20e0 }, /* PM_DTLB_MISS */
195 { 0xc50c6, 0xc60e0 }, /* PM_MRK_DTLB_MISS */
196 { 0x100009, 0x200009 }, /* PM_INST_CMPL */
197 { 0x200015, 0x300015 }, /* PM_LSU_LMQ_SRQ_EMPTY_CYC */
198 { 0x300009, 0x400009 }, /* PM_INST_DISP */
199};
200
201/*
202 * Scan the alternatives table for a match and return the
203 * index into the alternatives table if found, else -1.
204 */
205static int find_alternative(unsigned int event)
206{
207 int i, j;
208
209 for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
210 if (event < event_alternatives[i][0])
211 break;
212 for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j)
213 if (event == event_alternatives[i][j])
214 return i;
215 }
216 return -1;
217}
218
219static const unsigned char bytedecode_alternatives[4][4] = {
220 /* PMC 1 */ { 0x21, 0x23, 0x25, 0x27 },
221 /* PMC 2 */ { 0x07, 0x17, 0x0e, 0x1e },
222 /* PMC 3 */ { 0x20, 0x22, 0x24, 0x26 },
223 /* PMC 4 */ { 0x07, 0x17, 0x0e, 0x1e }
224};
225
226/*
227 * Some direct events for decodes of event bus byte 3 have alternative
228 * PMCSEL values on other counters. This returns the alternative
229 * event code for those that do, or -1 otherwise. This also handles
230 * alternative PMCSEL values for add events.
231 */
232static int find_alternative_bdecode(unsigned int event)
233{
234 int pmc, altpmc, pp, j;
235
236 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
237 if (pmc == 0 || pmc > 4)
238 return -1;
239 altpmc = 5 - pmc; /* 1 <-> 4, 2 <-> 3 */
240 pp = event & PM_PMCSEL_MSK;
241 for (j = 0; j < 4; ++j) {
242 if (bytedecode_alternatives[pmc - 1][j] == pp) {
243 return (event & ~(PM_PMC_MSKS | PM_PMCSEL_MSK)) |
244 (altpmc << PM_PMC_SH) |
245 bytedecode_alternatives[altpmc - 1][j];
246 }
247 }
248
249 /* new decode alternatives for power5+ */
250 if (pmc == 1 && (pp == 0x0d || pp == 0x0e))
251 return event + (2 << PM_PMC_SH) + (0x2e - 0x0d);
252 if (pmc == 3 && (pp == 0x2e || pp == 0x2f))
253 return event - (2 << PM_PMC_SH) - (0x2e - 0x0d);
254
255 /* alternative add event encodings */
256 if (pp == 0x10 || pp == 0x28)
257 return ((event ^ (0x10 ^ 0x28)) & ~PM_PMC_MSKS) |
258 (altpmc << PM_PMC_SH);
259
260 return -1;
261}
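Editorial aside (not part of the patch): for instance, an event coded for PMC1 with PMCSEL 0x23 -- say 0x100023 -- matches bytedecode_alternatives[0][1], so the function returns the same event moved to PMC4 with PMCSEL bytedecode_alternatives[3][1] = 0x17, i.e. 0x400017; the POWER5+-only branches pair PMC1 PMCSEL 0x0d/0x0e with PMC3 PMCSEL 0x2e/0x2f in the same spirit. The sketch below reproduces the first case with the masks spelled out numerically.

	/* Editorial sketch of the PMC1 <-> PMC4 byte-decode pairing described above. */
	#include <stdio.h>

	int main(void)
	{
		unsigned int event = 0x100023;				/* PMC1, PMCSEL 0x23 */
		unsigned int alt   = (event & ~(0xf00000u | 0x7f))	/* clear PMC and PMCSEL fields */
				   | (4 << 20)				/* altpmc = 5 - 1 */
				   | 0x17;				/* paired decode on PMC4 */

		printf("0x%x -> 0x%x\n", event, alt);	/* prints 0x100023 -> 0x400017 */
		return 0;
	}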
262
263static int power5p_get_alternatives(unsigned int event, unsigned int alt[])
264{
265 int i, j, ae, nalt = 1;
266
267 alt[0] = event;
268 nalt = 1;
269 i = find_alternative(event);
270 if (i >= 0) {
271 for (j = 0; j < MAX_ALT; ++j) {
272 ae = event_alternatives[i][j];
273 if (ae && ae != event)
274 alt[nalt++] = ae;
275 }
276 } else {
277 ae = find_alternative_bdecode(event);
278 if (ae > 0)
279 alt[nalt++] = ae;
280 }
281 return nalt;
282}
283
284/*
285 * Map of which direct events on which PMCs are marked instruction events.
286 * Indexed by PMCSEL value, bit i (LE) set if PMC i is a marked event.
287 * Bit 0 is set if it is marked for all PMCs.
288 * The 0x80 bit indicates a byte decode PMCSEL value.
289 */
290static unsigned char direct_event_is_marked[0x28] = {
291 0, /* 00 */
292 0x1f, /* 01 PM_IOPS_CMPL */
293 0x2, /* 02 PM_MRK_GRP_DISP */
294 0xe, /* 03 PM_MRK_ST_CMPL, PM_MRK_ST_GPS, PM_MRK_ST_CMPL_INT */
295 0, /* 04 */
296 0x1c, /* 05 PM_MRK_BRU_FIN, PM_MRK_INST_FIN, PM_MRK_CRU_FIN */
297 0x80, /* 06 */
298 0x80, /* 07 */
299 0, 0, 0,/* 08 - 0a */
300 0x18, /* 0b PM_THRESH_TIMEO, PM_MRK_GRP_TIMEO */
301 0, /* 0c */
302 0x80, /* 0d */
303 0x80, /* 0e */
304 0, /* 0f */
305 0, /* 10 */
306 0x14, /* 11 PM_MRK_GRP_BR_REDIR, PM_MRK_GRP_IC_MISS */
307 0, /* 12 */
308 0x10, /* 13 PM_MRK_GRP_CMPL */
309 0x1f, /* 14 PM_GRP_MRK, PM_MRK_{FXU,FPU,LSU}_FIN */
310 0x2, /* 15 PM_MRK_GRP_ISSUED */
311 0x80, /* 16 */
312 0x80, /* 17 */
313 0, 0, 0, 0, 0, /* 18 - 1c */
314 0x80, /* 1d */
315 0x80, /* 1e */
316 0, /* 1f */
317 0x80, /* 20 */
318 0x80, /* 21 */
319 0x80, /* 22 */
320 0x80, /* 23 */
321 0x80, /* 24 */
322 0x80, /* 25 */
323 0x80, /* 26 */
324 0x80, /* 27 */
325};
326
327/*
328 * Returns 1 if event counts things relating to marked instructions
329 * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
330 */
331static int power5p_marked_instr_event(unsigned int event)
332{
333 int pmc, psel;
334 int bit, byte, unit;
335 u32 mask;
336
337 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
338 psel = event & PM_PMCSEL_MSK;
339 if (pmc >= 5)
340 return 0;
341
342 bit = -1;
343 if (psel < sizeof(direct_event_is_marked)) {
344 if (direct_event_is_marked[psel] & (1 << pmc))
345 return 1;
346 if (direct_event_is_marked[psel] & 0x80)
347 bit = 4;
348 else if (psel == 0x08)
349 bit = pmc - 1;
350 else if (psel == 0x10)
351 bit = 4 - pmc;
352 else if (psel == 0x1b && (pmc == 1 || pmc == 3))
353 bit = 4;
354 } else if ((psel & 0x48) == 0x40) {
355 bit = psel & 7;
356 } else if (psel == 0x28) {
357 bit = pmc - 1;
358 } else if (pmc == 3 && (psel == 0x2e || psel == 0x2f)) {
359 bit = 4;
360 }
361
362 if (!(event & PM_BUSEVENT_MSK) || bit == -1)
363 return 0;
364
365 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
366 unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
367 if (unit == PM_LSU0) {
368 /* byte 1 bits 0-7, byte 2 bits 0,2-4,6 */
369 mask = 0x5dff00;
370 } else if (unit == PM_LSU1 && byte >= 4) {
371 byte -= 4;
372 /* byte 5 bits 6-7, byte 6 bits 0,4, byte 7 bits 0-4,6 */
373 mask = 0x5f11c000;
374 } else
375 return 0;
376
377 return (mask >> (byte * 8 + bit)) & 1;
378}
379
380static int power5p_compute_mmcr(unsigned int event[], int n_ev,
381 unsigned int hwc[], u64 mmcr[])
382{
383 u64 mmcr1 = 0;
384 u64 mmcra = 0;
385 unsigned int pmc, unit, byte, psel;
386 unsigned int ttm;
387 int i, isbus, bit, grsel;
388 unsigned int pmc_inuse = 0;
389 unsigned char busbyte[4];
390 unsigned char unituse[16];
391 int ttmuse;
392
393 if (n_ev > 4)
394 return -1;
395
396 /* First pass to count resource use */
397 memset(busbyte, 0, sizeof(busbyte));
398 memset(unituse, 0, sizeof(unituse));
399 for (i = 0; i < n_ev; ++i) {
400 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
401 if (pmc) {
402 if (pmc > 4)
403 return -1;
404 if (pmc_inuse & (1 << (pmc - 1)))
405 return -1;
406 pmc_inuse |= 1 << (pmc - 1);
407 }
408 if (event[i] & PM_BUSEVENT_MSK) {
409 unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
410 byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
411 if (unit > PM_LASTUNIT)
412 return -1;
413 if (unit == PM_ISU0_ALT)
414 unit = PM_ISU0;
415 if (byte >= 4) {
416 if (unit != PM_LSU1)
417 return -1;
418 ++unit;
419 byte &= 3;
420 }
421 if (busbyte[byte] && busbyte[byte] != unit)
422 return -1;
423 busbyte[byte] = unit;
424 unituse[unit] = 1;
425 }
426 }
427
428 /*
429 * Assign resources and set multiplexer selects.
430 *
431 * PM_ISU0 can go either on TTM0 or TTM1, but that's the only
432 * choice we have to deal with.
433 */
434 if (unituse[PM_ISU0] &
435 (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_ISU1])) {
436 unituse[PM_ISU0_ALT] = 1; /* move ISU to TTM1 */
437 unituse[PM_ISU0] = 0;
438 }
439 /* Set TTM[01]SEL fields. */
440 ttmuse = 0;
441 for (i = PM_FPU; i <= PM_ISU1; ++i) {
442 if (!unituse[i])
443 continue;
444 if (ttmuse++)
445 return -1;
446 mmcr1 |= (u64)i << MMCR1_TTM0SEL_SH;
447 }
448 ttmuse = 0;
449 for (; i <= PM_GRS; ++i) {
450 if (!unituse[i])
451 continue;
452 if (ttmuse++)
453 return -1;
454 mmcr1 |= (u64)(i & 3) << MMCR1_TTM1SEL_SH;
455 }
456 if (ttmuse > 1)
457 return -1;
458
459 /* Set byte lane select fields, TTM[23]SEL and GRS_*SEL. */
460 for (byte = 0; byte < 4; ++byte) {
461 unit = busbyte[byte];
462 if (!unit)
463 continue;
464 if (unit == PM_ISU0 && unituse[PM_ISU0_ALT]) {
465 /* get ISU0 through TTM1 rather than TTM0 */
466 unit = PM_ISU0_ALT;
467 } else if (unit == PM_LSU1 + 1) {
468 /* select lower word of LSU1 for this byte */
469 mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte);
470 }
471 ttm = unit >> 2;
472 mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte);
473 }
474
475 /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
476 for (i = 0; i < n_ev; ++i) {
477 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
478 unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
479 byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
480 psel = event[i] & PM_PMCSEL_MSK;
481 isbus = event[i] & PM_BUSEVENT_MSK;
482 if (!pmc) {
483 /* Bus event or any-PMC direct event */
484 for (pmc = 0; pmc < 4; ++pmc) {
485 if (!(pmc_inuse & (1 << pmc)))
486 break;
487 }
488 if (pmc >= 4)
489 return -1;
490 pmc_inuse |= 1 << pmc;
491 } else {
492 /* Direct event */
493 --pmc;
494 if (isbus && (byte & 2) &&
495 (psel == 8 || psel == 0x10 || psel == 0x28))
496 /* add events on higher-numbered bus */
497 mmcr1 |= 1ull << (MMCR1_PMC1_ADDER_SEL_SH - pmc);
498 }
499 if (isbus && unit == PM_GRS) {
500 bit = psel & 7;
501 grsel = (event[i] >> PM_GRS_SH) & PM_GRS_MSK;
502 mmcr1 |= (u64)grsel << grsel_shift[bit];
503 }
504 if (power5p_marked_instr_event(event[i]))
505 mmcra |= MMCRA_SAMPLE_ENABLE;
506 if ((psel & 0x58) == 0x40 && (byte & 1) != ((pmc >> 1) & 1))
507 /* select alternate byte lane */
508 psel |= 0x10;
509 if (pmc <= 3)
510 mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc);
511 hwc[i] = pmc;
512 }
513
514 /* Return MMCRx values */
515 mmcr[0] = 0;
516 if (pmc_inuse & 1)
517 mmcr[0] = MMCR0_PMC1CE;
518 if (pmc_inuse & 0x3e)
519 mmcr[0] |= MMCR0_PMCjCE;
520 mmcr[1] = mmcr1;
521 mmcr[2] = mmcra;
522 return 0;
523}
524
525static void power5p_disable_pmc(unsigned int pmc, u64 mmcr[])
526{
527 if (pmc <= 3)
528 mmcr[1] &= ~(0x7fUL << MMCR1_PMCSEL_SH(pmc));
529}
530
531static int power5p_generic_events[] = {
532 [PERF_COUNT_CPU_CYCLES] = 0xf,
533 [PERF_COUNT_INSTRUCTIONS] = 0x100009,
534 [PERF_COUNT_CACHE_REFERENCES] = 0x1c10a8, /* LD_REF_L1 */
535 [PERF_COUNT_CACHE_MISSES] = 0x3c1088, /* LD_MISS_L1 */
536 [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x230e4, /* BR_ISSUED */
537 [PERF_COUNT_BRANCH_MISSES] = 0x230e5, /* BR_MPRED_CR */
538};
539
540struct power_pmu power5p_pmu = {
541 .n_counter = 4,
542 .max_alternatives = MAX_ALT,
543 .add_fields = 0x7000000000055ull,
544 .test_adder = 0x3000040000000ull,
545 .compute_mmcr = power5p_compute_mmcr,
546 .get_constraint = power5p_get_constraint,
547 .get_alternatives = power5p_get_alternatives,
548 .disable_pmc = power5p_disable_pmc,
549 .n_generic = ARRAY_SIZE(power5p_generic_events),
550 .generic_events = power5p_generic_events,
551};
diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c
new file mode 100644
index 000000000000..116c4bb1809e
--- /dev/null
+++ b/arch/powerpc/kernel/power5-pmu.c
@@ -0,0 +1,569 @@
1/*
2 * Performance counter support for POWER5 (not POWER5++) processors.
3 *
4 * Copyright 2009 Paul Mackerras, IBM Corporation.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/kernel.h>
12#include <linux/perf_counter.h>
13#include <asm/reg.h>
14
15/*
16 * Bits in event code for POWER5 (not POWER5++)
17 */
18#define PM_PMC_SH 20 /* PMC number (1-based) for direct events */
19#define PM_PMC_MSK 0xf
20#define PM_PMC_MSKS (PM_PMC_MSK << PM_PMC_SH)
21#define PM_UNIT_SH 16 /* TTMMUX number and setting - unit select */
22#define PM_UNIT_MSK 0xf
23#define PM_BYTE_SH 12 /* Byte number of event bus to use */
24#define PM_BYTE_MSK 7
25#define PM_GRS_SH 8 /* Storage subsystem mux select */
26#define PM_GRS_MSK 7
27#define PM_BUSEVENT_MSK 0x80 /* Set if event uses event bus */
28#define PM_PMCSEL_MSK 0x7f
29
30/* Values in PM_UNIT field */
31#define PM_FPU 0
32#define PM_ISU0 1
33#define PM_IFU 2
34#define PM_ISU1 3
35#define PM_IDU 4
36#define PM_ISU0_ALT 6
37#define PM_GRS 7
38#define PM_LSU0 8
39#define PM_LSU1 0xc
40#define PM_LASTUNIT 0xc
41
42/*
43 * Bits in MMCR1 for POWER5
44 */
45#define MMCR1_TTM0SEL_SH 62
46#define MMCR1_TTM1SEL_SH 60
47#define MMCR1_TTM2SEL_SH 58
48#define MMCR1_TTM3SEL_SH 56
49#define MMCR1_TTMSEL_MSK 3
50#define MMCR1_TD_CP_DBG0SEL_SH 54
51#define MMCR1_TD_CP_DBG1SEL_SH 52
52#define MMCR1_TD_CP_DBG2SEL_SH 50
53#define MMCR1_TD_CP_DBG3SEL_SH 48
54#define MMCR1_GRS_L2SEL_SH 46
55#define MMCR1_GRS_L2SEL_MSK 3
56#define MMCR1_GRS_L3SEL_SH 44
57#define MMCR1_GRS_L3SEL_MSK 3
58#define MMCR1_GRS_MCSEL_SH 41
59#define MMCR1_GRS_MCSEL_MSK 7
60#define MMCR1_GRS_FABSEL_SH 39
61#define MMCR1_GRS_FABSEL_MSK 3
62#define MMCR1_PMC1_ADDER_SEL_SH 35
63#define MMCR1_PMC2_ADDER_SEL_SH 34
64#define MMCR1_PMC3_ADDER_SEL_SH 33
65#define MMCR1_PMC4_ADDER_SEL_SH 32
66#define MMCR1_PMC1SEL_SH 25
67#define MMCR1_PMC2SEL_SH 17
68#define MMCR1_PMC3SEL_SH 9
69#define MMCR1_PMC4SEL_SH 1
70#define MMCR1_PMCSEL_SH(n) (MMCR1_PMC1SEL_SH - (n) * 8)
71#define MMCR1_PMCSEL_MSK 0x7f
72
73/*
74 * Bits in MMCRA
75 */
76
77/*
78 * Layout of constraint bits:
79 * 6666555555555544444444443333333333222222222211111111110000000000
80 * 3210987654321098765432109876543210987654321098765432109876543210
81 * <><>[ ><><>< ><> [ >[ >[ >< >< >< >< ><><><><><><>
82 * T0T1 NC G0G1G2 G3 UC PS1PS2 B0 B1 B2 B3 P6P5P4P3P2P1
83 *
84 * T0 - TTM0 constraint
85 * 54-55: TTM0SEL value (0=FPU, 2=IFU, 3=ISU1) 0xc0_0000_0000_0000
86 *
87 * T1 - TTM1 constraint
88 * 52-53: TTM1SEL value (0=IDU, 3=GRS) 0x30_0000_0000_0000
89 *
90 * NC - number of counters
91 * 51: NC error 0x0008_0000_0000_0000
92 * 48-50: number of events needing PMC1-4 0x0007_0000_0000_0000
93 *
94 * G0..G3 - GRS mux constraints
95 * 46-47: GRS_L2SEL value
96 * 44-45: GRS_L3SEL value
97 * 41-43: GRS_MCSEL value
98 * 39-40: GRS_FABSEL value
99 * Note that these match up with their bit positions in MMCR1
100 *
101 * UC - unit constraint: can't have all three of FPU|IFU|ISU1, ISU0, IDU|GRS
102 * 37: UC3 error 0x20_0000_0000
103 * 36: FPU|IFU|ISU1 events needed 0x10_0000_0000
104 * 35: ISU0 events needed 0x08_0000_0000
105 * 34: IDU|GRS events needed 0x04_0000_0000
106 *
107 * PS1
108 * 33: PS1 error 0x2_0000_0000
109 * 31-32: count of events needing PMC1/2 0x1_8000_0000
110 *
111 * PS2
112 * 30: PS2 error 0x4000_0000
113 * 28-29: count of events needing PMC3/4 0x3000_0000
114 *
115 * B0
116 * 24-27: Byte 0 event source 0x0f00_0000
117 * Encoding as for the event code
118 *
119 * B1, B2, B3
120 * 20-23, 16-19, 12-15: Byte 1, 2, 3 event sources
121 *
122 * P1..P6
123 * 0-11: Count of events needing PMC1..PMC6
124 */
125
126static const int grsel_shift[8] = {
127 MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH,
128 MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH,
129 MMCR1_GRS_MCSEL_SH, MMCR1_GRS_FABSEL_SH
130};
131
132/* Masks and values for using events from the various units */
133static u64 unit_cons[PM_LASTUNIT+1][2] = {
134 [PM_FPU] = { 0xc0002000000000ull, 0x00001000000000ull },
135 [PM_ISU0] = { 0x00002000000000ull, 0x00000800000000ull },
136 [PM_ISU1] = { 0xc0002000000000ull, 0xc0001000000000ull },
137 [PM_IFU] = { 0xc0002000000000ull, 0x80001000000000ull },
138 [PM_IDU] = { 0x30002000000000ull, 0x00000400000000ull },
139 [PM_GRS] = { 0x30002000000000ull, 0x30000400000000ull },
140};
141
142static int power5_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
143{
144 int pmc, byte, unit, sh;
145 int bit, fmask;
146 u64 mask = 0, value = 0;
147 int grp = -1;
148
149 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
150 if (pmc) {
151 if (pmc > 6)
152 return -1;
153 sh = (pmc - 1) * 2;
154 mask |= 2 << sh;
155 value |= 1 << sh;
156 if (pmc <= 4)
157 grp = (pmc - 1) >> 1;
158 else if (event != 0x500009 && event != 0x600005)
159 return -1;
160 }
161 if (event & PM_BUSEVENT_MSK) {
162 unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
163 if (unit > PM_LASTUNIT)
164 return -1;
165 if (unit == PM_ISU0_ALT)
166 unit = PM_ISU0;
167 mask |= unit_cons[unit][0];
168 value |= unit_cons[unit][1];
169 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
170 if (byte >= 4) {
171 if (unit != PM_LSU1)
172 return -1;
173 /* Map LSU1 low word (bytes 4-7) to unit LSU1+1 */
174 ++unit;
175 byte &= 3;
176 }
177 if (unit == PM_GRS) {
178 bit = event & 7;
179 fmask = (bit == 6)? 7: 3;
180 sh = grsel_shift[bit];
181 mask |= (u64)fmask << sh;
182 value |= (u64)((event >> PM_GRS_SH) & fmask) << sh;
183 }
184 /*
185 * Bus events on bytes 0 and 2 can be counted
186 * on PMC1/2; bytes 1 and 3 on PMC3/4.
187 */
188 if (!pmc)
189 grp = byte & 1;
190 /* Set byte lane select field */
191 mask |= 0xfULL << (24 - 4 * byte);
192 value |= (u64)unit << (24 - 4 * byte);
193 }
194 if (grp == 0) {
195 /* increment PMC1/2 field */
196 mask |= 0x200000000ull;
197 value |= 0x080000000ull;
198 } else if (grp == 1) {
199 /* increment PMC3/4 field */
200 mask |= 0x40000000ull;
201 value |= 0x10000000ull;
202 }
203 if (pmc < 5) {
204 /* need a counter from PMC1-4 set */
205 mask |= 0x8000000000000ull;
206 value |= 0x1000000000000ull;
207 }
208 *maskp = mask;
209 *valp = value;
210 return 0;
211}
212
213#define MAX_ALT 3 /* at most 3 alternatives for any event */
214
215static const unsigned int event_alternatives[][MAX_ALT] = {
216 { 0x120e4, 0x400002 }, /* PM_GRP_DISP_REJECT */
217 { 0x410c7, 0x441084 }, /* PM_THRD_L2MISS_BOTH_CYC */
218 { 0x100005, 0x600005 }, /* PM_RUN_CYC */
219 { 0x100009, 0x200009, 0x500009 }, /* PM_INST_CMPL */
220 { 0x300009, 0x400009 }, /* PM_INST_DISP */
221};
222
223/*
224 * Scan the alternatives table for a match and return the
225 * index into the alternatives table if found, else -1.
226 */
227static int find_alternative(unsigned int event)
228{
229 int i, j;
230
231 for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
232 if (event < event_alternatives[i][0])
233 break;
234 for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j)
235 if (event == event_alternatives[i][j])
236 return i;
237 }
238 return -1;
239}
240
241static const unsigned char bytedecode_alternatives[4][4] = {
242 /* PMC 1 */ { 0x21, 0x23, 0x25, 0x27 },
243 /* PMC 2 */ { 0x07, 0x17, 0x0e, 0x1e },
244 /* PMC 3 */ { 0x20, 0x22, 0x24, 0x26 },
245 /* PMC 4 */ { 0x07, 0x17, 0x0e, 0x1e }
246};
247
248/*
249 * Some direct events for decodes of event bus byte 3 have alternative
250 * PMCSEL values on other counters. This returns the alternative
251 * event code for those that do, or -1 otherwise.
252 */
253static int find_alternative_bdecode(unsigned int event)
254{
255 int pmc, altpmc, pp, j;
256
257 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
258 if (pmc == 0 || pmc > 4)
259 return -1;
260 altpmc = 5 - pmc; /* 1 <-> 4, 2 <-> 3 */
261 pp = event & PM_PMCSEL_MSK;
262 for (j = 0; j < 4; ++j) {
263 if (bytedecode_alternatives[pmc - 1][j] == pp) {
264 return (event & ~(PM_PMC_MSKS | PM_PMCSEL_MSK)) |
265 (altpmc << PM_PMC_SH) |
266 bytedecode_alternatives[altpmc - 1][j];
267 }
268 }
269 return -1;
270}
271
272static int power5_get_alternatives(unsigned int event, unsigned int alt[])
273{
274 int i, j, ae, nalt = 1;
275
276 alt[0] = event;
277 nalt = 1;
278 i = find_alternative(event);
279 if (i >= 0) {
280 for (j = 0; j < MAX_ALT; ++j) {
281 ae = event_alternatives[i][j];
282 if (ae && ae != event)
283 alt[nalt++] = ae;
284 }
285 } else {
286 ae = find_alternative_bdecode(event);
287 if (ae > 0)
288 alt[nalt++] = ae;
289 }
290 return nalt;
291}
292
293/*
294 * Map of which direct events on which PMCs are marked instruction events.
295 * Indexed by PMCSEL value, bit i (LE) set if PMC i is a marked event.
296 * Bit 0 is set if it is marked for all PMCs.
297 * The 0x80 bit indicates a byte decode PMCSEL value.
298 */
299static unsigned char direct_event_is_marked[0x28] = {
300 0, /* 00 */
301 0x1f, /* 01 PM_IOPS_CMPL */
302 0x2, /* 02 PM_MRK_GRP_DISP */
303 0xe, /* 03 PM_MRK_ST_CMPL, PM_MRK_ST_GPS, PM_MRK_ST_CMPL_INT */
304 0, /* 04 */
305 0x1c, /* 05 PM_MRK_BRU_FIN, PM_MRK_INST_FIN, PM_MRK_CRU_FIN */
306 0x80, /* 06 */
307 0x80, /* 07 */
308 0, 0, 0,/* 08 - 0a */
309 0x18, /* 0b PM_THRESH_TIMEO, PM_MRK_GRP_TIMEO */
310 0, /* 0c */
311 0x80, /* 0d */
312 0x80, /* 0e */
313 0, /* 0f */
314 0, /* 10 */
315 0x14, /* 11 PM_MRK_GRP_BR_REDIR, PM_MRK_GRP_IC_MISS */
316 0, /* 12 */
317 0x10, /* 13 PM_MRK_GRP_CMPL */
318 0x1f, /* 14 PM_GRP_MRK, PM_MRK_{FXU,FPU,LSU}_FIN */
319 0x2, /* 15 PM_MRK_GRP_ISSUED */
320 0x80, /* 16 */
321 0x80, /* 17 */
322 0, 0, 0, 0, 0, /* 18 - 1c */
323 0x80, /* 1d */
324 0x80, /* 1e */
325 0, /* 1f */
326 0x80, /* 20 */
327 0x80, /* 21 */
328 0x80, /* 22 */
329 0x80, /* 23 */
330 0x80, /* 24 */
331 0x80, /* 25 */
332 0x80, /* 26 */
333 0x80, /* 27 */
334};
335
336/*
337 * Returns 1 if event counts things relating to marked instructions
338 * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
339 */
340static int power5_marked_instr_event(unsigned int event)
341{
342 int pmc, psel;
343 int bit, byte, unit;
344 u32 mask;
345
346 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
347 psel = event & PM_PMCSEL_MSK;
348 if (pmc >= 5)
349 return 0;
350
351 bit = -1;
352 if (psel < sizeof(direct_event_is_marked)) {
353 if (direct_event_is_marked[psel] & (1 << pmc))
354 return 1;
355 if (direct_event_is_marked[psel] & 0x80)
356 bit = 4;
357 else if (psel == 0x08)
358 bit = pmc - 1;
359 else if (psel == 0x10)
360 bit = 4 - pmc;
361 else if (psel == 0x1b && (pmc == 1 || pmc == 3))
362 bit = 4;
363 } else if ((psel & 0x58) == 0x40)
364 bit = psel & 7;
365
366 if (!(event & PM_BUSEVENT_MSK))
367 return 0;
368
369 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
370 unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
371 if (unit == PM_LSU0) {
372 /* byte 1 bits 0-7, byte 2 bits 0,2-4,6 */
373 mask = 0x5dff00;
374 } else if (unit == PM_LSU1 && byte >= 4) {
375 byte -= 4;
376 /* byte 4 bits 1,3,5,7, byte 5 bits 6-7, byte 7 bits 0-4,6 */
377 mask = 0x5f00c0aa;
378 } else
379 return 0;
380
381 return (mask >> (byte * 8 + bit)) & 1;
382}
383
384static int power5_compute_mmcr(unsigned int event[], int n_ev,
385 unsigned int hwc[], u64 mmcr[])
386{
387 u64 mmcr1 = 0;
388 u64 mmcra = 0;
389 unsigned int pmc, unit, byte, psel;
390 unsigned int ttm, grp;
391 int i, isbus, bit, grsel;
392 unsigned int pmc_inuse = 0;
393 unsigned int pmc_grp_use[2];
394 unsigned char busbyte[4];
395 unsigned char unituse[16];
396 int ttmuse;
397
398 if (n_ev > 6)
399 return -1;
400
401 /* First pass to count resource use */
402 pmc_grp_use[0] = pmc_grp_use[1] = 0;
403 memset(busbyte, 0, sizeof(busbyte));
404 memset(unituse, 0, sizeof(unituse));
405 for (i = 0; i < n_ev; ++i) {
406 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
407 if (pmc) {
408 if (pmc > 6)
409 return -1;
410 if (pmc_inuse & (1 << (pmc - 1)))
411 return -1;
412 pmc_inuse |= 1 << (pmc - 1);
413 /* count 1/2 vs 3/4 use */
414 if (pmc <= 4)
415 ++pmc_grp_use[(pmc - 1) >> 1];
416 }
417 if (event[i] & PM_BUSEVENT_MSK) {
418 unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
419 byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
420 if (unit > PM_LASTUNIT)
421 return -1;
422 if (unit == PM_ISU0_ALT)
423 unit = PM_ISU0;
424 if (byte >= 4) {
425 if (unit != PM_LSU1)
426 return -1;
427 ++unit;
428 byte &= 3;
429 }
430 if (!pmc)
431 ++pmc_grp_use[byte & 1];
432 if (busbyte[byte] && busbyte[byte] != unit)
433 return -1;
434 busbyte[byte] = unit;
435 unituse[unit] = 1;
436 }
437 }
438 if (pmc_grp_use[0] > 2 || pmc_grp_use[1] > 2)
439 return -1;
440
441 /*
442 * Assign resources and set multiplexer selects.
443 *
444 * PM_ISU0 can go either on TTM0 or TTM1, but that's the only
445 * choice we have to deal with.
446 */
447 if (unituse[PM_ISU0] &
448 (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_ISU1])) {
449 unituse[PM_ISU0_ALT] = 1; /* move ISU to TTM1 */
450 unituse[PM_ISU0] = 0;
451 }
452 /* Set TTM[01]SEL fields. */
453 ttmuse = 0;
454 for (i = PM_FPU; i <= PM_ISU1; ++i) {
455 if (!unituse[i])
456 continue;
457 if (ttmuse++)
458 return -1;
459 mmcr1 |= (u64)i << MMCR1_TTM0SEL_SH;
460 }
461 ttmuse = 0;
462 for (; i <= PM_GRS; ++i) {
463 if (!unituse[i])
464 continue;
465 if (ttmuse++)
466 return -1;
467 mmcr1 |= (u64)(i & 3) << MMCR1_TTM1SEL_SH;
468 }
469 if (ttmuse > 1)
470 return -1;
471
472 /* Set byte lane select fields, TTM[23]SEL and GRS_*SEL. */
473 for (byte = 0; byte < 4; ++byte) {
474 unit = busbyte[byte];
475 if (!unit)
476 continue;
477 if (unit == PM_ISU0 && unituse[PM_ISU0_ALT]) {
478 /* get ISU0 through TTM1 rather than TTM0 */
479 unit = PM_ISU0_ALT;
480 } else if (unit == PM_LSU1 + 1) {
481 /* select lower word of LSU1 for this byte */
482 mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte);
483 }
484 ttm = unit >> 2;
485 mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte);
486 }
487
488 /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
489 for (i = 0; i < n_ev; ++i) {
490 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
491 unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
492 byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
493 psel = event[i] & PM_PMCSEL_MSK;
494 isbus = event[i] & PM_BUSEVENT_MSK;
495 if (!pmc) {
496 /* Bus event or any-PMC direct event */
497 for (pmc = 0; pmc < 4; ++pmc) {
498 if (pmc_inuse & (1 << pmc))
499 continue;
500 grp = (pmc >> 1) & 1;
501 if (isbus) {
502 if (grp == (byte & 1))
503 break;
504 } else if (pmc_grp_use[grp] < 2) {
505 ++pmc_grp_use[grp];
506 break;
507 }
508 }
509 pmc_inuse |= 1 << pmc;
510 } else if (pmc <= 4) {
511 /* Direct event */
512 --pmc;
513 if ((psel == 8 || psel == 0x10) && isbus && (byte & 2))
514 /* add events on higher-numbered bus */
515 mmcr1 |= 1ull << (MMCR1_PMC1_ADDER_SEL_SH - pmc);
516 } else {
517 /* Instructions or run cycles on PMC5/6 */
518 --pmc;
519 }
520 if (isbus && unit == PM_GRS) {
521 bit = psel & 7;
522 grsel = (event[i] >> PM_GRS_SH) & PM_GRS_MSK;
523 mmcr1 |= (u64)grsel << grsel_shift[bit];
524 }
525 if (power5_marked_instr_event(event[i]))
526 mmcra |= MMCRA_SAMPLE_ENABLE;
527 if (pmc <= 3)
528 mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc);
529 hwc[i] = pmc;
530 }
531
532 /* Return MMCRx values */
533 mmcr[0] = 0;
534 if (pmc_inuse & 1)
535 mmcr[0] = MMCR0_PMC1CE;
536 if (pmc_inuse & 0x3e)
537 mmcr[0] |= MMCR0_PMCjCE;
538 mmcr[1] = mmcr1;
539 mmcr[2] = mmcra;
540 return 0;
541}
542
543static void power5_disable_pmc(unsigned int pmc, u64 mmcr[])
544{
545 if (pmc <= 3)
546 mmcr[1] &= ~(0x7fUL << MMCR1_PMCSEL_SH(pmc));
547}
548
549static int power5_generic_events[] = {
550 [PERF_COUNT_CPU_CYCLES] = 0xf,
551 [PERF_COUNT_INSTRUCTIONS] = 0x100009,
552 [PERF_COUNT_CACHE_REFERENCES] = 0x4c1090, /* LD_REF_L1 */
553 [PERF_COUNT_CACHE_MISSES] = 0x3c1088, /* LD_MISS_L1 */
554 [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x230e4, /* BR_ISSUED */
555 [PERF_COUNT_BRANCH_MISSES] = 0x230e5, /* BR_MPRED_CR */
556};
557
558struct power_pmu power5_pmu = {
559 .n_counter = 6,
560 .max_alternatives = MAX_ALT,
561 .add_fields = 0x7000090000555ull,
562 .test_adder = 0x3000490000000ull,
563 .compute_mmcr = power5_compute_mmcr,
564 .get_constraint = power5_get_constraint,
565 .get_alternatives = power5_get_alternatives,
566 .disable_pmc = power5_disable_pmc,
567 .n_generic = ARRAY_SIZE(power5_generic_events),
568 .generic_events = power5_generic_events,
569};
diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c
new file mode 100644
index 000000000000..fce1fc290a1d
--- /dev/null
+++ b/arch/powerpc/kernel/power6-pmu.c
@@ -0,0 +1,407 @@
1/*
2 * Performance counter support for POWER6 processors.
3 *
4 * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/kernel.h>
12#include <linux/perf_counter.h>
13#include <asm/reg.h>
14
15/*
16 * Bits in event code for POWER6
17 */
18#define PM_PMC_SH 20 /* PMC number (1-based) for direct events */
19#define PM_PMC_MSK 0x7
20#define PM_PMC_MSKS (PM_PMC_MSK << PM_PMC_SH)
21#define PM_UNIT_SH 16 /* Unit event comes from (TTMxSEL encoding) */
22#define PM_UNIT_MSK 0xf
23#define PM_UNIT_MSKS (PM_UNIT_MSK << PM_UNIT_SH)
24#define PM_LLAV 0x8000 /* Load lookahead match value */
25#define PM_LLA 0x4000 /* Load lookahead match enable */
26#define PM_BYTE_SH 12 /* Byte of event bus to use */
27#define PM_BYTE_MSK 3
28#define PM_SUBUNIT_SH 8 /* Subunit event comes from (NEST_SEL enc.) */
29#define PM_SUBUNIT_MSK 7
30#define PM_SUBUNIT_MSKS (PM_SUBUNIT_MSK << PM_SUBUNIT_SH)
31#define PM_PMCSEL_MSK 0xff /* PMCxSEL value */
32#define PM_BUSEVENT_MSK 0xf3700
33
34/*
35 * Bits in MMCR1 for POWER6
36 */
37#define MMCR1_TTM0SEL_SH 60
38#define MMCR1_TTMSEL_SH(n) (MMCR1_TTM0SEL_SH - (n) * 4)
39#define MMCR1_TTMSEL_MSK 0xf
40#define MMCR1_TTMSEL(m, n) (((m) >> MMCR1_TTMSEL_SH(n)) & MMCR1_TTMSEL_MSK)
41#define MMCR1_NESTSEL_SH 45
42#define MMCR1_NESTSEL_MSK 0x7
43#define MMCR1_NESTSEL(m) (((m) >> MMCR1_NESTSEL_SH) & MMCR1_NESTSEL_MSK)
44#define MMCR1_PMC1_LLA ((u64)1 << 44)
45#define MMCR1_PMC1_LLA_VALUE ((u64)1 << 39)
46#define MMCR1_PMC1_ADDR_SEL ((u64)1 << 35)
47#define MMCR1_PMC1SEL_SH 24
48#define MMCR1_PMCSEL_SH(n) (MMCR1_PMC1SEL_SH - (n) * 8)
49#define MMCR1_PMCSEL_MSK 0xff
50
51/*
52 * Map of which direct events on which PMCs are marked instruction events.
53 * Indexed by PMCSEL value >> 1.
54 * Bottom 4 bits are a map of which PMCs are interesting,
55 * top 4 bits say what sort of event:
56 * 0 = direct marked event,
57 * 1 = byte decode event,
58 * 4 = add/and event (PMC1 -> bits 0 & 4),
59 * 5 = add/and event (PMC1 -> bits 1 & 5),
60 * 6 = add/and event (PMC1 -> bits 2 & 6),
61 * 7 = add/and event (PMC1 -> bits 3 & 7).
62 */
63static unsigned char direct_event_is_marked[0x60 >> 1] = {
64 0, /* 00 */
65 0, /* 02 */
66 0, /* 04 */
67 0x07, /* 06 PM_MRK_ST_CMPL, PM_MRK_ST_GPS, PM_MRK_ST_CMPL_INT */
68 0x04, /* 08 PM_MRK_DFU_FIN */
69 0x06, /* 0a PM_MRK_IFU_FIN, PM_MRK_INST_FIN */
70 0, /* 0c */
71 0, /* 0e */
72 0x02, /* 10 PM_MRK_INST_DISP */
73 0x08, /* 12 PM_MRK_LSU_DERAT_MISS */
74 0, /* 14 */
75 0, /* 16 */
76 0x0c, /* 18 PM_THRESH_TIMEO, PM_MRK_INST_FIN */
77 0x0f, /* 1a PM_MRK_INST_DISP, PM_MRK_{FXU,FPU,LSU}_FIN */
78 0x01, /* 1c PM_MRK_INST_ISSUED */
79 0, /* 1e */
80 0, /* 20 */
81 0, /* 22 */
82 0, /* 24 */
83 0, /* 26 */
84 0x15, /* 28 PM_MRK_DATA_FROM_L2MISS, PM_MRK_DATA_FROM_L3MISS */
85 0, /* 2a */
86 0, /* 2c */
87 0, /* 2e */
88 0x4f, /* 30 */
89 0x7f, /* 32 */
90 0x4f, /* 34 */
91 0x5f, /* 36 */
92 0x6f, /* 38 */
93 0x4f, /* 3a */
94 0, /* 3c */
95 0x08, /* 3e PM_MRK_INST_TIMEO */
96 0x1f, /* 40 */
97 0x1f, /* 42 */
98 0x1f, /* 44 */
99 0x1f, /* 46 */
100 0x1f, /* 48 */
101 0x1f, /* 4a */
102 0x1f, /* 4c */
103 0x1f, /* 4e */
104 0, /* 50 */
105 0x05, /* 52 PM_MRK_BR_TAKEN, PM_MRK_BR_MPRED */
106 0x1c, /* 54 PM_MRK_PTEG_FROM_L3MISS, PM_MRK_PTEG_FROM_L2MISS */
107 0x02, /* 56 PM_MRK_LD_MISS_L1 */
108 0, /* 58 */
109 0, /* 5a */
110 0, /* 5c */
111 0, /* 5e */
112};
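Editorial aside (not part of the patch): to illustrate the encoding spelled out in the comment above, entry 0x15 at index 0x28 -- reached for PMCSEL 0x50 or 0x51, since the index is PMCSEL >> 1 -- has bottom nibble 0x5, meaning only PMC1 and PMC3 are of interest, and top nibble 1, meaning it is a byte-decode event, so power6_marked_instr_event() below goes on to consult marked_bus_events[] with bit 0.

	/* Editorial sketch: unpacking one direct_event_is_marked[] entry for POWER6. */
	#include <stdio.h>

	int main(void)
	{
		unsigned char entry = 0x15;		/* index 0x28, i.e. PMCSEL 0x50/0x51 */
		unsigned int  pmcs  = entry & 0xf;	/* 0x5: PMC1 and PMC3 */
		unsigned int  type  = entry >> 4;	/* 1: byte-decode event */

		printf("pmcs=0x%x type=%u\n", pmcs, type);
		return 0;
	}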
113
114/*
115 * Masks showing for each unit which bits are marked events.
116 * These masks are in LE order, i.e. 0x00000001 is byte 0, bit 0.
117 */
118static u32 marked_bus_events[16] = {
119 0x01000000, /* direct events set 1: byte 3 bit 0 */
120 0x00010000, /* direct events set 2: byte 2 bit 0 */
121 0, 0, 0, 0, /* IDU, IFU, nest: nothing */
122 0x00000088, /* VMX set 1: byte 0 bits 3, 7 */
123 0x000000c0, /* VMX set 2: byte 0 bits 6, 7 */
124 0x04010000, /* LSU set 1: byte 2 bit 0, byte 3 bit 2 */
125 0xff010000u, /* LSU set 2: byte 2 bit 0, all of byte 3 */
126 0, /* LSU set 3 */
127 0x00000010, /* VMX set 3: byte 0 bit 4 */
128 0, /* BFP set 1 */
129 0x00000022, /* BFP set 2: byte 0 bits 1, 5 */
130 0, 0
131};
132
133/*
134 * Returns 1 if event counts things relating to marked instructions
135 * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
136 */
137static int power6_marked_instr_event(unsigned int event)
138{
139 int pmc, psel, ptype;
140 int bit, byte, unit;
141 u32 mask;
142
143 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
144 psel = (event & PM_PMCSEL_MSK) >> 1; /* drop edge/level bit */
145 if (pmc >= 5)
146 return 0;
147
148 bit = -1;
149 if (psel < sizeof(direct_event_is_marked)) {
150 ptype = direct_event_is_marked[psel];
151 if (pmc == 0 || !(ptype & (1 << (pmc - 1))))
152 return 0;
153 ptype >>= 4;
154 if (ptype == 0)
155 return 1;
156 if (ptype == 1)
157 bit = 0;
158 else
159 bit = ptype ^ (pmc - 1);
160 } else if ((psel & 0x48) == 0x40)
161 bit = psel & 7;
162
163 if (!(event & PM_BUSEVENT_MSK) || bit == -1)
164 return 0;
165
166 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
167 unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
168 mask = marked_bus_events[unit];
169 return (mask >> (byte * 8 + bit)) & 1;
170}
171
172/*
173 * Assign PMC numbers and compute MMCR1 value for a set of events
174 */
175static int p6_compute_mmcr(unsigned int event[], int n_ev,
176 unsigned int hwc[], u64 mmcr[])
177{
178 u64 mmcr1 = 0;
179 u64 mmcra = 0;
180 int i;
181 unsigned int pmc, ev, b, u, s, psel;
182 unsigned int ttmset = 0;
183 unsigned int pmc_inuse = 0;
184
185 if (n_ev > 4)
186 return -1;
187 for (i = 0; i < n_ev; ++i) {
188 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
189 if (pmc) {
190 if (pmc_inuse & (1 << (pmc - 1)))
191 return -1; /* collision! */
192 pmc_inuse |= 1 << (pmc - 1);
193 }
194 }
195 for (i = 0; i < n_ev; ++i) {
196 ev = event[i];
197 pmc = (ev >> PM_PMC_SH) & PM_PMC_MSK;
198 if (pmc) {
199 --pmc;
200 } else {
201 /* can go on any PMC; find a free one */
202 for (pmc = 0; pmc < 4; ++pmc)
203 if (!(pmc_inuse & (1 << pmc)))
204 break;
205 pmc_inuse |= 1 << pmc;
206 }
207 hwc[i] = pmc;
208 psel = ev & PM_PMCSEL_MSK;
209 if (ev & PM_BUSEVENT_MSK) {
210 /* this event uses the event bus */
211 b = (ev >> PM_BYTE_SH) & PM_BYTE_MSK;
212 u = (ev >> PM_UNIT_SH) & PM_UNIT_MSK;
213 /* check for conflict on this byte of event bus */
214 if ((ttmset & (1 << b)) && MMCR1_TTMSEL(mmcr1, b) != u)
215 return -1;
216 mmcr1 |= (u64)u << MMCR1_TTMSEL_SH(b);
217 ttmset |= 1 << b;
218 if (u == 5) {
219 /* Nest events have a further mux */
220 s = (ev >> PM_SUBUNIT_SH) & PM_SUBUNIT_MSK;
221 if ((ttmset & 0x10) &&
222 MMCR1_NESTSEL(mmcr1) != s)
223 return -1;
224 ttmset |= 0x10;
225 mmcr1 |= (u64)s << MMCR1_NESTSEL_SH;
226 }
227 if (0x30 <= psel && psel <= 0x3d) {
228 /* these need the PMCx_ADDR_SEL bits */
229 if (b >= 2)
230 mmcr1 |= MMCR1_PMC1_ADDR_SEL >> pmc;
231 }
232 /* bus select values are different for PMC3/4 */
233 if (pmc >= 2 && (psel & 0x90) == 0x80)
234 psel ^= 0x20;
235 }
236 if (ev & PM_LLA) {
237 mmcr1 |= MMCR1_PMC1_LLA >> pmc;
238 if (ev & PM_LLAV)
239 mmcr1 |= MMCR1_PMC1_LLA_VALUE >> pmc;
240 }
241 if (power6_marked_instr_event(event[i]))
242 mmcra |= MMCRA_SAMPLE_ENABLE;
243 mmcr1 |= (u64)psel << MMCR1_PMCSEL_SH(pmc);
244 }
245 mmcr[0] = 0;
246 if (pmc_inuse & 1)
247 mmcr[0] = MMCR0_PMC1CE;
248 if (pmc_inuse & 0xe)
249 mmcr[0] |= MMCR0_PMCjCE;
250 mmcr[1] = mmcr1;
251 mmcr[2] = mmcra;
252 return 0;
253}
254
255/*
256 * Layout of constraint bits:
257 *
258 * 0-1 add field: number of uses of PMC1 (max 1)
259 * 2-3, 4-5, 6-7: ditto for PMC2, 3, 4
260 * 8-10 select field: nest (subunit) event selector
261 * 16-19 select field: unit on byte 0 of event bus
262 * 20-23, 24-27, 28-31 ditto for bytes 1, 2, 3
263 */
264static int p6_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
265{
266 int pmc, byte, sh;
267 unsigned int mask = 0, value = 0;
268
269 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
270 if (pmc) {
271 if (pmc > 4)
272 return -1;
273 sh = (pmc - 1) * 2;
274 mask |= 2 << sh;
275 value |= 1 << sh;
276 }
277 if (event & PM_BUSEVENT_MSK) {
278 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
279 sh = byte * 4;
280 mask |= PM_UNIT_MSKS << sh;
281 value |= (event & PM_UNIT_MSKS) << sh;
282 if ((event & PM_UNIT_MSKS) == (5 << PM_UNIT_SH)) {
283 mask |= PM_SUBUNIT_MSKS;
284 value |= event & PM_SUBUNIT_MSKS;
285 }
286 }
287 *maskp = mask;
288 *valp = value;
289 return 0;
290}
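Editorial aside (not part of the patch): as a concrete instance of the layout documented above, a hypothetical event pinned to PMC3 that pulls byte 2 of the event bus from unit 4 gets mask bit 5 and value bit 4 for the PMC3 add field, plus mask 0xf000000 and value 0x4000000 for the byte-2 unit select field, so a second byte-2 event is only compatible if it also names unit 4. The sketch below just evaluates those two pairs.

	/* Editorial sketch: constraint bits for a hypothetical PMC3 event, byte 2, unit 4. */
	#include <stdio.h>

	int main(void)
	{
		unsigned int event = (3 << 20) | (4 << 16) | (2 << 12);	/* hypothetical code */
		unsigned int mask  = (2 << 4) | (0xf0000 << 8);
		unsigned int value = (1 << 4) | ((event & 0xf0000) << 8);

		printf("mask=0x%x value=0x%x\n", mask, value);	/* mask=0xf000020 value=0x4000010 */
		return 0;
	}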
291
292#define MAX_ALT 4 /* at most 4 alternatives for any event */
293
294static const unsigned int event_alternatives[][MAX_ALT] = {
295 { 0x0130e8, 0x2000f6, 0x3000fc }, /* PM_PTEG_RELOAD_VALID */
296 { 0x080080, 0x10000d, 0x30000c, 0x4000f0 }, /* PM_LD_MISS_L1 */
297 { 0x080088, 0x200054, 0x3000f0 }, /* PM_ST_MISS_L1 */
298 { 0x10000a, 0x2000f4 }, /* PM_RUN_CYC */
299 { 0x10000b, 0x2000f5 }, /* PM_RUN_COUNT */
300 { 0x10000e, 0x400010 }, /* PM_PURR */
301 { 0x100010, 0x4000f8 }, /* PM_FLUSH */
302 { 0x10001a, 0x200010 }, /* PM_MRK_INST_DISP */
303 { 0x100026, 0x3000f8 }, /* PM_TB_BIT_TRANS */
304 { 0x100054, 0x2000f0 }, /* PM_ST_FIN */
305 { 0x100056, 0x2000fc }, /* PM_L1_ICACHE_MISS */
306 { 0x1000f0, 0x40000a }, /* PM_INST_IMC_MATCH_CMPL */
307 { 0x1000f8, 0x200008 }, /* PM_GCT_EMPTY_CYC */
308 { 0x1000fc, 0x400006 }, /* PM_LSU_DERAT_MISS_CYC */
309 { 0x20000e, 0x400007 }, /* PM_LSU_DERAT_MISS */
310 { 0x200012, 0x300012 }, /* PM_INST_DISP */
311 { 0x2000f2, 0x3000f2 }, /* PM_INST_DISP */
312 { 0x2000f8, 0x300010 }, /* PM_EXT_INT */
313 { 0x2000fe, 0x300056 }, /* PM_DATA_FROM_L2MISS */
314 { 0x2d0030, 0x30001a }, /* PM_MRK_FPU_FIN */
315 { 0x30000a, 0x400018 }, /* PM_MRK_INST_FIN */
316 { 0x3000f6, 0x40000e }, /* PM_L1_DCACHE_RELOAD_VALID */
317 { 0x3000fe, 0x400056 }, /* PM_DATA_FROM_L3MISS */
318};
319
320/*
321 * This could be made more efficient with a binary search on
322 * a presorted list, if necessary
323 */
324static int find_alternatives_list(unsigned int event)
325{
326 int i, j;
327 unsigned int alt;
328
329 for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
330 if (event < event_alternatives[i][0])
331 return -1;
332 for (j = 0; j < MAX_ALT; ++j) {
333 alt = event_alternatives[i][j];
334 if (!alt || event < alt)
335 break;
336 if (event == alt)
337 return i;
338 }
339 }
340 return -1;
341}
342
343static int p6_get_alternatives(unsigned int event, unsigned int alt[])
344{
345 int i, j;
346 unsigned int aevent, psel, pmc;
347 unsigned int nalt = 1;
348
349 alt[0] = event;
350
351 /* check the alternatives table */
352 i = find_alternatives_list(event);
353 if (i >= 0) {
354 /* copy out alternatives from list */
355 for (j = 0; j < MAX_ALT; ++j) {
356 aevent = event_alternatives[i][j];
357 if (!aevent)
358 break;
359 if (aevent != event)
360 alt[nalt++] = aevent;
361 }
362
363 } else {
364 /* Check for alternative ways of computing sum events */
365 /* PMCSEL 0x32 counter N == PMCSEL 0x34 counter 5-N */
366 psel = event & (PM_PMCSEL_MSK & ~1); /* ignore edge bit */
367 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
368 if (pmc && (psel == 0x32 || psel == 0x34))
369 alt[nalt++] = ((event ^ 0x6) & ~PM_PMC_MSKS) |
370 ((5 - pmc) << PM_PMC_SH);
371
372 /* PMCSEL 0x38 counter N == PMCSEL 0x3a counter N+/-2 */
373 if (pmc && (psel == 0x38 || psel == 0x3a))
374 alt[nalt++] = ((event ^ 0x2) & ~PM_PMC_MSKS) |
375 ((pmc > 2? pmc - 2: pmc + 2) << PM_PMC_SH);
376 }
377
378 return nalt;
379}
380
381static void p6_disable_pmc(unsigned int pmc, u64 mmcr[])
382{
383 /* Set PMCxSEL to 0 to disable PMCx */
384 mmcr[1] &= ~(0xffUL << MMCR1_PMCSEL_SH(pmc));
385}
386
387static int power6_generic_events[] = {
388 [PERF_COUNT_CPU_CYCLES] = 0x1e,
389 [PERF_COUNT_INSTRUCTIONS] = 2,
390 [PERF_COUNT_CACHE_REFERENCES] = 0x280030, /* LD_REF_L1 */
391 [PERF_COUNT_CACHE_MISSES] = 0x30000c, /* LD_MISS_L1 */
392 [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x410a0, /* BR_PRED */
393 [PERF_COUNT_BRANCH_MISSES] = 0x400052, /* BR_MPRED */
394};
395
396struct power_pmu power6_pmu = {
397 .n_counter = 4,
398 .max_alternatives = MAX_ALT,
399 .add_fields = 0x55,
400 .test_adder = 0,
401 .compute_mmcr = p6_compute_mmcr,
402 .get_constraint = p6_get_constraint,
403 .get_alternatives = p6_get_alternatives,
404 .disable_pmc = p6_disable_pmc,
405 .n_generic = ARRAY_SIZE(power6_generic_events),
406 .generic_events = power6_generic_events,
407};
diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c
new file mode 100644
index 000000000000..aed8ccd7c077
--- /dev/null
+++ b/arch/powerpc/kernel/ppc970-pmu.c
@@ -0,0 +1,441 @@
1/*
2 * Performance counter support for PPC970-family processors.
3 *
4 * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/string.h>
12#include <linux/perf_counter.h>
13#include <asm/reg.h>
14
15/*
16 * Bits in event code for PPC970
17 */
18#define PM_PMC_SH 12 /* PMC number (1-based) for direct events */
19#define PM_PMC_MSK 0xf
20#define PM_UNIT_SH 8 /* TTMMUX number and setting - unit select */
21#define PM_UNIT_MSK 0xf
22#define PM_SPCSEL_SH 6
23#define PM_SPCSEL_MSK 3
24#define PM_BYTE_SH 4 /* Byte number of event bus to use */
25#define PM_BYTE_MSK 3
26#define PM_PMCSEL_MSK 0xf
27
28/* Values in PM_UNIT field */
29#define PM_NONE 0
30#define PM_FPU 1
31#define PM_VPU 2
32#define PM_ISU 3
33#define PM_IFU 4
34#define PM_IDU 5
35#define PM_STS 6
36#define PM_LSU0 7
37#define PM_LSU1U 8
38#define PM_LSU1L 9
39#define PM_LASTUNIT 9
40
41/*
42 * Bits in MMCR0 for PPC970
43 */
44#define MMCR0_PMC1SEL_SH 8
45#define MMCR0_PMC2SEL_SH 1
46#define MMCR_PMCSEL_MSK 0x1f
47
48/*
49 * Bits in MMCR1 for PPC970
50 */
51#define MMCR1_TTM0SEL_SH 62
52#define MMCR1_TTM1SEL_SH 59
53#define MMCR1_TTM3SEL_SH 53
54#define MMCR1_TTMSEL_MSK 3
55#define MMCR1_TD_CP_DBG0SEL_SH 50
56#define MMCR1_TD_CP_DBG1SEL_SH 48
57#define MMCR1_TD_CP_DBG2SEL_SH 46
58#define MMCR1_TD_CP_DBG3SEL_SH 44
59#define MMCR1_PMC1_ADDER_SEL_SH 39
60#define MMCR1_PMC2_ADDER_SEL_SH 38
61#define MMCR1_PMC6_ADDER_SEL_SH 37
62#define MMCR1_PMC5_ADDER_SEL_SH 36
63#define MMCR1_PMC8_ADDER_SEL_SH 35
64#define MMCR1_PMC7_ADDER_SEL_SH 34
65#define MMCR1_PMC3_ADDER_SEL_SH 33
66#define MMCR1_PMC4_ADDER_SEL_SH 32
67#define MMCR1_PMC3SEL_SH 27
68#define MMCR1_PMC4SEL_SH 22
69#define MMCR1_PMC5SEL_SH 17
70#define MMCR1_PMC6SEL_SH 12
71#define MMCR1_PMC7SEL_SH 7
72#define MMCR1_PMC8SEL_SH 2
73
74static short mmcr1_adder_bits[8] = {
75 MMCR1_PMC1_ADDER_SEL_SH,
76 MMCR1_PMC2_ADDER_SEL_SH,
77 MMCR1_PMC3_ADDER_SEL_SH,
78 MMCR1_PMC4_ADDER_SEL_SH,
79 MMCR1_PMC5_ADDER_SEL_SH,
80 MMCR1_PMC6_ADDER_SEL_SH,
81 MMCR1_PMC7_ADDER_SEL_SH,
82 MMCR1_PMC8_ADDER_SEL_SH
83};
84
85/*
86 * Bits in MMCRA
87 */
88
89/*
90 * Layout of constraint bits:
91 * 6666555555555544444444443333333333222222222211111111110000000000
92 * 3210987654321098765432109876543210987654321098765432109876543210
93 * <><><>[ >[ >[ >< >< >< >< ><><><><><><><><>
94 * SPT0T1 UC PS1 PS2 B0 B1 B2 B3 P8P7P6P5P4P3P2P1
95 *
96 * SP - SPCSEL constraint
97 * 48-49: SPCSEL value 0x3_0000_0000_0000
98 *
99 * T0 - TTM0 constraint
100 * 46-47: TTM0SEL value (0=FPU, 2=IFU, 3=VPU) 0xC000_0000_0000
101 *
102 * T1 - TTM1 constraint
103 * 44-45: TTM1SEL value (0=IDU, 3=STS) 0x3000_0000_0000
104 *
105 * UC - unit constraint: can't have all three of FPU|IFU|VPU, ISU, IDU|STS
106 * 43: UC3 error 0x0800_0000_0000
107 * 42: FPU|IFU|VPU events needed 0x0400_0000_0000
108 * 41: ISU events needed 0x0200_0000_0000
109 * 40: IDU|STS events needed 0x0100_0000_0000
110 *
111 * PS1
112 * 39: PS1 error 0x0080_0000_0000
113 * 36-38: count of events needing PMC1/2/5/6 0x0070_0000_0000
114 *
115 * PS2
116 * 35: PS2 error 0x0008_0000_0000
117 * 32-34: count of events needing PMC3/4/7/8 0x0007_0000_0000
118 *
119 * B0
120 * 28-31: Byte 0 event source 0xf000_0000
121 * Encoding as for the event code
122 *
123 * B1, B2, B3
124 * 24-27, 20-23, 16-19: Byte 1, 2, 3 event sources
125 *
126 * P8
127 * 15: P8 error 0x8000
128 * 14-15: Count of events needing PMC8
129 *
130 * P1..P7
131 * 0-13: Count of events needing PMC1..PMC7
132 */
133
134static unsigned char direct_marked_event[8] = {
135 (1<<2) | (1<<3), /* PMC1: PM_MRK_GRP_DISP, PM_MRK_ST_CMPL */
136 (1<<3) | (1<<5), /* PMC2: PM_THRESH_TIMEO, PM_MRK_BRU_FIN */
137 (1<<3) | (1<<5), /* PMC3: PM_MRK_ST_CMPL_INT, PM_MRK_VMX_FIN */
138 (1<<4) | (1<<5), /* PMC4: PM_MRK_GRP_CMPL, PM_MRK_CRU_FIN */
139 (1<<4) | (1<<5), /* PMC5: PM_GRP_MRK, PM_MRK_GRP_TIMEO */
140 (1<<3) | (1<<4) | (1<<5),
141 /* PMC6: PM_MRK_ST_STS, PM_MRK_FXU_FIN, PM_MRK_GRP_ISSUED */
142 (1<<4) | (1<<5), /* PMC7: PM_MRK_FPU_FIN, PM_MRK_INST_FIN */
143 (1<<4) /* PMC8: PM_MRK_LSU_FIN */
144};
145
146/*
147 * Returns 1 if event counts things relating to marked instructions
148 * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
149 */
150static int p970_marked_instr_event(unsigned int event)
151{
152 int pmc, psel, unit, byte, bit;
153 unsigned int mask;
154
155 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
156 psel = event & PM_PMCSEL_MSK;
157 if (pmc) {
158 if (direct_marked_event[pmc - 1] & (1 << psel))
159 return 1;
160 if (psel == 0) /* add events */
161 bit = (pmc <= 4)? pmc - 1: 8 - pmc;
162 else if (psel == 7 || psel == 13) /* decode events */
163 bit = 4;
164 else
165 return 0;
166 } else
167 bit = psel;
168
169 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
170 unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
171 mask = 0;
172 switch (unit) {
173 case PM_VPU:
174 mask = 0x4c; break; /* byte 0 bits 2,3,6 */
175 case PM_LSU0:
176 /* byte 2 bits 0,2,3,4,6; all of byte 1 */
177 mask = 0x085dff00; break;
178 case PM_LSU1L:
179 mask = 0x50 << 24; /* byte 3 bits 4,6 */
180 break;
181 }
182 return (mask >> (byte * 8 + bit)) & 1;
183}
184
185/* Masks and values for using events from the various units */
186static u64 unit_cons[PM_LASTUNIT+1][2] = {
187 [PM_FPU] = { 0xc80000000000ull, 0x040000000000ull },
188 [PM_VPU] = { 0xc80000000000ull, 0xc40000000000ull },
189 [PM_ISU] = { 0x080000000000ull, 0x020000000000ull },
190 [PM_IFU] = { 0xc80000000000ull, 0x840000000000ull },
191 [PM_IDU] = { 0x380000000000ull, 0x010000000000ull },
192 [PM_STS] = { 0x380000000000ull, 0x310000000000ull },
193};
194
195static int p970_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
196{
197 int pmc, byte, unit, sh, spcsel;
198 u64 mask = 0, value = 0;
199 int grp = -1;
200
201 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
202 if (pmc) {
203 if (pmc > 8)
204 return -1;
205 sh = (pmc - 1) * 2;
206 mask |= 2 << sh;
207 value |= 1 << sh;
208 grp = ((pmc - 1) >> 1) & 1;
209 }
210 unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
211 if (unit) {
212 if (unit > PM_LASTUNIT)
213 return -1;
214 mask |= unit_cons[unit][0];
215 value |= unit_cons[unit][1];
216 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
217 /*
218 * Bus events on bytes 0 and 2 can be counted
219 * on PMC1/2/5/6; bytes 1 and 3 on PMC3/4/7/8.
220 */
221 if (!pmc)
222 grp = byte & 1;
223 /* Set byte lane select field */
224 mask |= 0xfULL << (28 - 4 * byte);
225 value |= (u64)unit << (28 - 4 * byte);
226 }
227 if (grp == 0) {
228 /* increment PMC1/2/5/6 field */
229 mask |= 0x8000000000ull;
230 value |= 0x1000000000ull;
231 } else if (grp == 1) {
232 /* increment PMC3/4/7/8 field */
233 mask |= 0x800000000ull;
234 value |= 0x100000000ull;
235 }
236 spcsel = (event >> PM_SPCSEL_SH) & PM_SPCSEL_MSK;
237 if (spcsel) {
238 mask |= 3ull << 48;
239 value |= (u64)spcsel << 48;
240 }
241 *maskp = mask;
242 *valp = value;
243 return 0;
244}
245
246static int p970_get_alternatives(unsigned int event, unsigned int alt[])
247{
248 alt[0] = event;
249
250 /* 2 alternatives for LSU empty */
251 if (event == 0x2002 || event == 0x3002) {
252 alt[1] = event ^ 0x1000;
253 return 2;
254 }
255
256 return 1;
257}
258
259static int p970_compute_mmcr(unsigned int event[], int n_ev,
260 unsigned int hwc[], u64 mmcr[])
261{
262 u64 mmcr0 = 0, mmcr1 = 0, mmcra = 0;
263 unsigned int pmc, unit, byte, psel;
264 unsigned int ttm, grp;
265 unsigned int pmc_inuse = 0;
266 unsigned int pmc_grp_use[2];
267 unsigned char busbyte[4];
268 unsigned char unituse[16];
269 unsigned char unitmap[] = { 0, 0<<3, 3<<3, 1<<3, 2<<3, 0|4, 3|4 };
270 unsigned char ttmuse[2];
271 unsigned char pmcsel[8];
272 int i;
273 int spcsel;
274
275 if (n_ev > 8)
276 return -1;
277
278 /* First pass to count resource use */
279 pmc_grp_use[0] = pmc_grp_use[1] = 0;
280 memset(busbyte, 0, sizeof(busbyte));
281 memset(unituse, 0, sizeof(unituse));
282 for (i = 0; i < n_ev; ++i) {
283 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
284 if (pmc) {
285 if (pmc_inuse & (1 << (pmc - 1)))
286 return -1;
287 pmc_inuse |= 1 << (pmc - 1);
288 /* count 1/2/5/6 vs 3/4/7/8 use */
289 ++pmc_grp_use[((pmc - 1) >> 1) & 1];
290 }
291 unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
292 byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
293 if (unit) {
294 if (unit > PM_LASTUNIT)
295 return -1;
296 if (!pmc)
297 ++pmc_grp_use[byte & 1];
298 if (busbyte[byte] && busbyte[byte] != unit)
299 return -1;
300 busbyte[byte] = unit;
301 unituse[unit] = 1;
302 }
303 }
304 if (pmc_grp_use[0] > 4 || pmc_grp_use[1] > 4)
305 return -1;
306
307 /*
308 * Assign resources and set multiplexer selects.
309 *
310 * PM_ISU can go either on TTM0 or TTM1, but that's the only
311 * choice we have to deal with.
312 */
313 if (unituse[PM_ISU] &
314 (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_VPU]))
315 unitmap[PM_ISU] = 2 | 4; /* move ISU to TTM1 */
316 /* Set TTM[01]SEL fields. */
317 ttmuse[0] = ttmuse[1] = 0;
318 for (i = PM_FPU; i <= PM_STS; ++i) {
319 if (!unituse[i])
320 continue;
321 ttm = unitmap[i];
322 ++ttmuse[(ttm >> 2) & 1];
323 mmcr1 |= (u64)(ttm & ~4) << MMCR1_TTM1SEL_SH;
324 }
325 /* Check only one unit per TTMx */
326 if (ttmuse[0] > 1 || ttmuse[1] > 1)
327 return -1;
328
329 /* Set byte lane select fields and TTM3SEL. */
330 for (byte = 0; byte < 4; ++byte) {
331 unit = busbyte[byte];
332 if (!unit)
333 continue;
334 if (unit <= PM_STS)
335 ttm = (unitmap[unit] >> 2) & 1;
336 else if (unit == PM_LSU0)
337 ttm = 2;
338 else {
339 ttm = 3;
340 if (unit == PM_LSU1L && byte >= 2)
341 mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte);
342 }
343 mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte);
344 }
345
346 /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
347 memset(pmcsel, 0x8, sizeof(pmcsel)); /* 8 means don't count */
348 for (i = 0; i < n_ev; ++i) {
349 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
350 unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
351 byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
352 psel = event[i] & PM_PMCSEL_MSK;
353 if (!pmc) {
354 /* Bus event or any-PMC direct event */
355 if (unit)
356 psel |= 0x10 | ((byte & 2) << 2);
357 else
358 psel |= 8;
359 for (pmc = 0; pmc < 8; ++pmc) {
360 if (pmc_inuse & (1 << pmc))
361 continue;
362 grp = (pmc >> 1) & 1;
363 if (unit) {
364 if (grp == (byte & 1))
365 break;
366 } else if (pmc_grp_use[grp] < 4) {
367 ++pmc_grp_use[grp];
368 break;
369 }
370 }
371 pmc_inuse |= 1 << pmc;
372 } else {
373 /* Direct event */
374 --pmc;
375 if (psel == 0 && (byte & 2))
376 /* add events on higher-numbered bus */
377 mmcr1 |= 1ull << mmcr1_adder_bits[pmc];
378 }
379 pmcsel[pmc] = psel;
380 hwc[i] = pmc;
381 spcsel = (event[i] >> PM_SPCSEL_SH) & PM_SPCSEL_MSK;
382 mmcr1 |= spcsel;
383 if (p970_marked_instr_event(event[i]))
384 mmcra |= MMCRA_SAMPLE_ENABLE;
385 }
386 for (pmc = 0; pmc < 2; ++pmc)
387 mmcr0 |= pmcsel[pmc] << (MMCR0_PMC1SEL_SH - 7 * pmc);
388 for (; pmc < 8; ++pmc)
389 mmcr1 |= (u64)pmcsel[pmc] << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2));
390 if (pmc_inuse & 1)
391 mmcr0 |= MMCR0_PMC1CE;
392 if (pmc_inuse & 0xfe)
393 mmcr0 |= MMCR0_PMCjCE;
394
395 mmcra |= 0x2000; /* mark only one IOP per PPC instruction */
396
397 /* Return MMCRx values */
398 mmcr[0] = mmcr0;
399 mmcr[1] = mmcr1;
400 mmcr[2] = mmcra;
401 return 0;
402}
403
404static void p970_disable_pmc(unsigned int pmc, u64 mmcr[])
405{
406 int shift, i;
407
408 if (pmc <= 1) {
409 shift = MMCR0_PMC1SEL_SH - 7 * pmc;
410 i = 0;
411 } else {
412 shift = MMCR1_PMC3SEL_SH - 5 * (pmc - 2);
413 i = 1;
414 }
415 /*
416 * Setting the PMCxSEL field to 0x08 disables PMC x.
417 */
418 mmcr[i] = (mmcr[i] & ~(0x1fUL << shift)) | (0x08UL << shift);
419}
420
421static int ppc970_generic_events[] = {
422 [PERF_COUNT_CPU_CYCLES] = 7,
423 [PERF_COUNT_INSTRUCTIONS] = 1,
424 [PERF_COUNT_CACHE_REFERENCES] = 0x8810, /* PM_LD_REF_L1 */
425 [PERF_COUNT_CACHE_MISSES] = 0x3810, /* PM_LD_MISS_L1 */
426 [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x431, /* PM_BR_ISSUED */
427 [PERF_COUNT_BRANCH_MISSES] = 0x327, /* PM_GRP_BR_MPRED */
428};
429
430struct power_pmu ppc970_pmu = {
431 .n_counter = 8,
432 .max_alternatives = 2,
433 .add_fields = 0x001100005555ull,
434 .test_adder = 0x013300000000ull,
435 .compute_mmcr = p970_compute_mmcr,
436 .get_constraint = p970_get_constraint,
437 .get_alternatives = p970_get_alternatives,
438 .disable_pmc = p970_disable_pmc,
439 .n_generic = ARRAY_SIZE(ppc970_generic_events),
440 .generic_events = ppc970_generic_events,
441};
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 76993941cac9..ac0e112031b2 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -29,6 +29,7 @@
 #include <linux/module.h>
 #include <linux/kprobes.h>
 #include <linux/kdebug.h>
+#include <linux/perf_counter.h>
 
 #include <asm/firmware.h>
 #include <asm/page.h>
@@ -170,6 +171,8 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 		die("Weird page fault", regs, SIGSEGV);
 	}
 
+	perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs, address);
+
 	/* When running in the kernel we expect faults to occur only to
 	 * addresses in user space. All other faults represent errors in the
 	 * kernel and should generate an OOPS. Unfortunately, in the case of an
@@ -309,6 +312,8 @@ good_area:
 	}
 	if (ret & VM_FAULT_MAJOR) {
 		current->maj_flt++;
+		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0,
+				     regs, address);
#ifdef CONFIG_PPC_SMLPAR
 		if (firmware_has_feature(FW_FEATURE_CMO)) {
 			preempt_disable();
@@ -316,8 +321,11 @@ good_area:
 			preempt_enable();
 		}
 #endif
-	} else
+	} else {
 		current->min_flt++;
+		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0,
+				     regs, address);
+	}
 	up_read(&mm->mmap_sem);
 	return 0;
 
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 9da795e49337..732ee93a8e98 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -1,6 +1,7 @@
1config PPC64 1config PPC64
2 bool "64-bit kernel" 2 bool "64-bit kernel"
3 default n 3 default n
4 select HAVE_PERF_COUNTERS
4 help 5 help
5 This option selects whether a 32-bit or a 64-bit kernel 6 This option selects whether a 32-bit or a 64-bit kernel
6 will be built. 7 will be built.
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index df9e885eee14..32ada97c964d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -727,6 +727,7 @@ config X86_UP_IOAPIC
727config X86_LOCAL_APIC 727config X86_LOCAL_APIC
728 def_bool y 728 def_bool y
729 depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC 729 depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC
730 select HAVE_PERF_COUNTERS if (!M386 && !M486)
730 731
731config X86_IO_APIC 732config X86_IO_APIC
732 def_bool y 733 def_bool y
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index a505202086e8..19c61ef6ab57 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -825,9 +825,10 @@ ia32_sys_call_table:
825 .quad compat_sys_signalfd4 825 .quad compat_sys_signalfd4
826 .quad sys_eventfd2 826 .quad sys_eventfd2
827 .quad sys_epoll_create1 827 .quad sys_epoll_create1
828 .quad sys_dup3 /* 330 */ 828 .quad sys_dup3 /* 330 */
829 .quad sys_pipe2 829 .quad sys_pipe2
830 .quad sys_inotify_init1 830 .quad sys_inotify_init1
831 .quad compat_sys_preadv 831 .quad compat_sys_preadv
832 .quad compat_sys_pwritev 832 .quad compat_sys_pwritev
833 .quad sys_perf_counter_open
833ia32_syscall_end: 834ia32_syscall_end:
diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h
index 85b46fba4229..aff9f1fcdcd7 100644
--- a/arch/x86/include/asm/atomic_32.h
+++ b/arch/x86/include/asm/atomic_32.h
@@ -247,5 +247,241 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
247#define smp_mb__before_atomic_inc() barrier() 247#define smp_mb__before_atomic_inc() barrier()
248#define smp_mb__after_atomic_inc() barrier() 248#define smp_mb__after_atomic_inc() barrier()
249 249
 250/* A 64-bit atomic type */
251
252typedef struct {
253 unsigned long long counter;
254} atomic64_t;
255
256#define ATOMIC64_INIT(val) { (val) }
257
258/**
259 * atomic64_read - read atomic64 variable
260 * @v: pointer of type atomic64_t
261 *
262 * Atomically reads the value of @v.
263 * Doesn't imply a read memory barrier.
264 */
265#define __atomic64_read(ptr) ((ptr)->counter)
266
267static inline unsigned long long
268cmpxchg8b(unsigned long long *ptr, unsigned long long old, unsigned long long new)
269{
270 asm volatile(
271
272 LOCK_PREFIX "cmpxchg8b (%[ptr])\n"
273
274 : "=A" (old)
275
276 : [ptr] "D" (ptr),
277 "A" (old),
278 "b" (ll_low(new)),
279 "c" (ll_high(new))
280
281 : "memory");
282
283 return old;
284}
285
286static inline unsigned long long
287atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val,
288 unsigned long long new_val)
289{
290 return cmpxchg8b(&ptr->counter, old_val, new_val);
291}
292
293/**
294 * atomic64_xchg - xchg atomic64 variable
295 * @ptr: pointer to type atomic64_t
296 * @new_val: value to assign
297 * @old_val: old value that was there
298 *
299 * Atomically xchgs the value of @ptr to @new_val and returns
300 * the old value.
301 */
302
303static inline unsigned long long
304atomic64_xchg(atomic64_t *ptr, unsigned long long new_val)
305{
306 unsigned long long old_val;
307
308 do {
309 old_val = atomic_read(ptr);
310 } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
311
312 return old_val;
313}
314
315/**
316 * atomic64_set - set atomic64 variable
317 * @ptr: pointer to type atomic64_t
318 * @new_val: value to assign
319 *
320 * Atomically sets the value of @ptr to @new_val.
321 */
322static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val)
323{
324 atomic64_xchg(ptr, new_val);
325}
326
327/**
328 * atomic64_read - read atomic64 variable
329 * @ptr: pointer to type atomic64_t
330 *
331 * Atomically reads the value of @ptr and returns it.
332 */
333static inline unsigned long long atomic64_read(atomic64_t *ptr)
334{
335 unsigned long long curr_val;
336
337 do {
338 curr_val = __atomic64_read(ptr);
339 } while (atomic64_cmpxchg(ptr, curr_val, curr_val) != curr_val);
340
341 return curr_val;
342}
343
344/**
345 * atomic64_add_return - add and return
346 * @delta: integer value to add
347 * @ptr: pointer to type atomic64_t
348 *
349 * Atomically adds @delta to @ptr and returns @delta + *@ptr
350 */
351static inline unsigned long long
352atomic64_add_return(unsigned long long delta, atomic64_t *ptr)
353{
354 unsigned long long old_val, new_val;
355
356 do {
357 old_val = atomic_read(ptr);
358 new_val = old_val + delta;
359
360 } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
361
362 return new_val;
363}
364
365static inline long atomic64_sub_return(unsigned long long delta, atomic64_t *ptr)
366{
367 return atomic64_add_return(-delta, ptr);
368}
369
370static inline long atomic64_inc_return(atomic64_t *ptr)
371{
372 return atomic64_add_return(1, ptr);
373}
374
375static inline long atomic64_dec_return(atomic64_t *ptr)
376{
377 return atomic64_sub_return(1, ptr);
378}
379
380/**
381 * atomic64_add - add integer to atomic64 variable
382 * @delta: integer value to add
383 * @ptr: pointer to type atomic64_t
384 *
385 * Atomically adds @delta to @ptr.
386 */
387static inline void atomic64_add(unsigned long long delta, atomic64_t *ptr)
388{
389 atomic64_add_return(delta, ptr);
390}
391
392/**
393 * atomic64_sub - subtract the atomic64 variable
394 * @delta: integer value to subtract
395 * @ptr: pointer to type atomic64_t
396 *
397 * Atomically subtracts @delta from @ptr.
398 */
399static inline void atomic64_sub(unsigned long long delta, atomic64_t *ptr)
400{
401 atomic64_add(-delta, ptr);
402}
403
404/**
405 * atomic64_sub_and_test - subtract value from variable and test result
406 * @delta: integer value to subtract
407 * @ptr: pointer to type atomic64_t
408 *
409 * Atomically subtracts @delta from @ptr and returns
410 * true if the result is zero, or false for all
411 * other cases.
412 */
413static inline int
414atomic64_sub_and_test(unsigned long long delta, atomic64_t *ptr)
415{
416 unsigned long long old_val = atomic64_sub_return(delta, ptr);
417
418 return old_val == 0;
419}
420
421/**
422 * atomic64_inc - increment atomic64 variable
423 * @ptr: pointer to type atomic64_t
424 *
425 * Atomically increments @ptr by 1.
426 */
427static inline void atomic64_inc(atomic64_t *ptr)
428{
429 atomic64_add(1, ptr);
430}
431
432/**
433 * atomic64_dec - decrement atomic64 variable
434 * @ptr: pointer to type atomic64_t
435 *
436 * Atomically decrements @ptr by 1.
437 */
438static inline void atomic64_dec(atomic64_t *ptr)
439{
440 atomic64_sub(1, ptr);
441}
442
443/**
444 * atomic64_dec_and_test - decrement and test
445 * @ptr: pointer to type atomic64_t
446 *
447 * Atomically decrements @ptr by 1 and
448 * returns true if the result is 0, or false for all other
449 * cases.
450 */
451static inline int atomic64_dec_and_test(atomic64_t *ptr)
452{
453 return atomic64_sub_and_test(1, ptr);
454}
455
456/**
457 * atomic64_inc_and_test - increment and test
458 * @ptr: pointer to type atomic64_t
459 *
460 * Atomically increments @ptr by 1
461 * and returns true if the result is zero, or false for all
462 * other cases.
463 */
464static inline int atomic64_inc_and_test(atomic64_t *ptr)
465{
466 return atomic64_sub_and_test(-1, ptr);
467}
468
469/**
470 * atomic64_add_negative - add and test if negative
471 * @delta: integer value to add
472 * @ptr: pointer to type atomic64_t
473 *
474 * Atomically adds @delta to @ptr and returns true
475 * if the result is negative, or false when
476 * result is greater than or equal to zero.
477 */
478static inline int
479atomic64_add_negative(unsigned long long delta, atomic64_t *ptr)
480{
481 long long old_val = atomic64_add_return(delta, ptr);
482
483 return old_val < 0;
484}
485
250#include <asm-generic/atomic.h> 486#include <asm-generic/atomic.h>
251#endif /* _ASM_X86_ATOMIC_32_H */ 487#endif /* _ASM_X86_ATOMIC_32_H */
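Every atomic64_* helper added above reduces to the same compare-and-exchange retry loop built on cmpxchg8b: read the current value, compute the new one, and retry if another CPU raced in between. A rough user-space analogue of that loop (the demo_add_return name and the use of GCC's __sync_val_compare_and_swap builtin are assumptions for illustration; the kernel code drives cmpxchg8b directly):

#include <stdio.h>

static unsigned long long demo_add_return(unsigned long long *ptr,
					  unsigned long long delta)
{
	unsigned long long old_val, new_val;

	do {
		/* the initial read need not be atomic; the cmpxchg validates it */
		old_val = *(volatile unsigned long long *)ptr;
		new_val = old_val + delta;
	} while (__sync_val_compare_and_swap(ptr, old_val, new_val) != old_val);

	return new_val;
}

int main(void)
{
	unsigned long long v = 40;

	printf("%llu\n", demo_add_return(&v, 2));	/* prints 42 */
	return 0;
}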
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index c2e6bedaf258..fe24d2802490 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -50,6 +50,7 @@ BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
50 50
51#ifdef CONFIG_PERF_COUNTERS 51#ifdef CONFIG_PERF_COUNTERS
52BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR) 52BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR)
53BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR)
53#endif 54#endif
54 55
55#ifdef CONFIG_X86_MCE_P4THERMAL 56#ifdef CONFIG_X86_MCE_P4THERMAL
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 37555e52f980..9ebc5c255032 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -13,6 +13,8 @@ typedef struct {
13 unsigned int irq_spurious_count; 13 unsigned int irq_spurious_count;
14#endif 14#endif
15 unsigned int generic_irqs; /* arch dependent */ 15 unsigned int generic_irqs; /* arch dependent */
16 unsigned int apic_perf_irqs;
17 unsigned int apic_pending_irqs;
16#ifdef CONFIG_SMP 18#ifdef CONFIG_SMP
17 unsigned int irq_resched_count; 19 unsigned int irq_resched_count;
18 unsigned int irq_call_count; 20 unsigned int irq_call_count;
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index b762ea49bd70..7309c0ad6902 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -29,6 +29,9 @@
29extern void apic_timer_interrupt(void); 29extern void apic_timer_interrupt(void);
30extern void generic_interrupt(void); 30extern void generic_interrupt(void);
31extern void error_interrupt(void); 31extern void error_interrupt(void);
32extern void perf_counter_interrupt(void);
33extern void perf_pending_interrupt(void);
34
32extern void spurious_interrupt(void); 35extern void spurious_interrupt(void);
33extern void thermal_interrupt(void); 36extern void thermal_interrupt(void);
34extern void reschedule_interrupt(void); 37extern void reschedule_interrupt(void);
diff --git a/arch/x86/include/asm/intel_arch_perfmon.h b/arch/x86/include/asm/intel_arch_perfmon.h
deleted file mode 100644
index fa0fd068bc2e..000000000000
--- a/arch/x86/include/asm/intel_arch_perfmon.h
+++ /dev/null
@@ -1,31 +0,0 @@
1#ifndef _ASM_X86_INTEL_ARCH_PERFMON_H
2#define _ASM_X86_INTEL_ARCH_PERFMON_H
3
4#define MSR_ARCH_PERFMON_PERFCTR0 0xc1
5#define MSR_ARCH_PERFMON_PERFCTR1 0xc2
6
7#define MSR_ARCH_PERFMON_EVENTSEL0 0x186
8#define MSR_ARCH_PERFMON_EVENTSEL1 0x187
9
10#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22)
11#define ARCH_PERFMON_EVENTSEL_INT (1 << 20)
12#define ARCH_PERFMON_EVENTSEL_OS (1 << 17)
13#define ARCH_PERFMON_EVENTSEL_USR (1 << 16)
14
15#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL (0x3c)
16#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
17#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX (0)
18#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
19 (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
20
21union cpuid10_eax {
22 struct {
23 unsigned int version_id:8;
24 unsigned int num_counters:8;
25 unsigned int bit_width:8;
26 unsigned int mask_length:8;
27 } split;
28 unsigned int full;
29};
30
31#endif /* _ASM_X86_INTEL_ARCH_PERFMON_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 3cbd79bbb47c..545bb811ccb5 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -117,6 +117,11 @@
117#define GENERIC_INTERRUPT_VECTOR 0xed 117#define GENERIC_INTERRUPT_VECTOR 0xed
118 118
119/* 119/*
120 * Performance monitoring pending work vector:
121 */
122#define LOCAL_PENDING_VECTOR 0xec
123
124/*
120 * First APIC vector available to drivers: (vectors 0x30-0xee) we 125 * First APIC vector available to drivers: (vectors 0x30-0xee) we
121 * start at 0x31(0x41) to spread out vectors evenly between priority 126 * start at 0x31(0x41) to spread out vectors evenly between priority
122 * levels. (0x80 is the syscall vector) 127 * levels. (0x80 is the syscall vector)
diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
new file mode 100644
index 000000000000..d08dd52cb8ff
--- /dev/null
+++ b/arch/x86/include/asm/perf_counter.h
@@ -0,0 +1,100 @@
1#ifndef _ASM_X86_PERF_COUNTER_H
2#define _ASM_X86_PERF_COUNTER_H
3
4/*
5 * Performance counter hw details:
6 */
7
8#define X86_PMC_MAX_GENERIC 8
9#define X86_PMC_MAX_FIXED 3
10
11#define X86_PMC_IDX_GENERIC 0
12#define X86_PMC_IDX_FIXED 32
13#define X86_PMC_IDX_MAX 64
14
15#define MSR_ARCH_PERFMON_PERFCTR0 0xc1
16#define MSR_ARCH_PERFMON_PERFCTR1 0xc2
17
18#define MSR_ARCH_PERFMON_EVENTSEL0 0x186
19#define MSR_ARCH_PERFMON_EVENTSEL1 0x187
20
21#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22)
22#define ARCH_PERFMON_EVENTSEL_INT (1 << 20)
23#define ARCH_PERFMON_EVENTSEL_OS (1 << 17)
24#define ARCH_PERFMON_EVENTSEL_USR (1 << 16)
25
26/*
27 * Includes eventsel and unit mask as well:
28 */
29#define ARCH_PERFMON_EVENT_MASK 0xffff
30
31#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c
32#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
33#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0
34#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
35 (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
36
37#define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6
38
39/*
40 * Intel "Architectural Performance Monitoring" CPUID
41 * detection/enumeration details:
42 */
43union cpuid10_eax {
44 struct {
45 unsigned int version_id:8;
46 unsigned int num_counters:8;
47 unsigned int bit_width:8;
48 unsigned int mask_length:8;
49 } split;
50 unsigned int full;
51};
52
53union cpuid10_edx {
54 struct {
55 unsigned int num_counters_fixed:4;
56 unsigned int reserved:28;
57 } split;
58 unsigned int full;
59};
60
61
62/*
63 * Fixed-purpose performance counters:
64 */
65
66/*
67 * All 3 fixed-mode PMCs are configured via this single MSR:
68 */
69#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL 0x38d
70
71/*
72 * The counts are available in three separate MSRs:
73 */
74
75/* Instr_Retired.Any: */
76#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309
77#define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0)
78
79/* CPU_CLK_Unhalted.Core: */
80#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a
81#define X86_PMC_IDX_FIXED_CPU_CYCLES (X86_PMC_IDX_FIXED + 1)
82
83/* CPU_CLK_Unhalted.Ref: */
84#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b
85#define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2)
86
87extern void set_perf_counter_pending(void);
88
89#define clear_perf_counter_pending() do { } while (0)
90#define test_perf_counter_pending() (0)
91
92#ifdef CONFIG_PERF_COUNTERS
93extern void init_hw_perf_counters(void);
94extern void perf_counters_lapic_init(int nmi);
95#else
96static inline void init_hw_perf_counters(void) { }
97static inline void perf_counters_lapic_init(int nmi) { }
98#endif
99
100#endif /* _ASM_X86_PERF_COUNTER_H */
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 6e72d74cf8dc..0b4d8c2b157d 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -340,6 +340,7 @@
340#define __NR_inotify_init1 332 340#define __NR_inotify_init1 332
341#define __NR_preadv 333 341#define __NR_preadv 333
342#define __NR_pwritev 334 342#define __NR_pwritev 334
343#define __NR_perf_counter_open 333
343 344
344#ifdef __KERNEL__ 345#ifdef __KERNEL__
345 346
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index f81829462325..d9aad876ad76 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -657,7 +657,8 @@ __SYSCALL(__NR_inotify_init1, sys_inotify_init1)
657__SYSCALL(__NR_preadv, sys_preadv) 657__SYSCALL(__NR_preadv, sys_preadv)
658#define __NR_pwritev 296 658#define __NR_pwritev 296
659__SYSCALL(__NR_pwritev, sys_pwritev) 659__SYSCALL(__NR_pwritev, sys_pwritev)
660 660#define __NR_perf_counter_open 295
661__SYSCALL(__NR_perf_counter_open, sys_perf_counter_open)
661 662
662#ifndef __NO_STUBS 663#ifndef __NO_STUBS
663#define __ARCH_WANT_OLD_READDIR 664#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index f2870920f246..e9021a908020 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -34,6 +34,7 @@
34#include <linux/smp.h> 34#include <linux/smp.h>
35#include <linux/mm.h> 35#include <linux/mm.h>
36 36
37#include <asm/perf_counter.h>
37#include <asm/pgalloc.h> 38#include <asm/pgalloc.h>
38#include <asm/atomic.h> 39#include <asm/atomic.h>
39#include <asm/mpspec.h> 40#include <asm/mpspec.h>
@@ -761,6 +762,8 @@ static void local_apic_timer_interrupt(void)
761 inc_irq_stat(apic_timer_irqs); 762 inc_irq_stat(apic_timer_irqs);
762 763
763 evt->event_handler(evt); 764 evt->event_handler(evt);
765
766 perf_counter_unthrottle();
764} 767}
765 768
766/* 769/*
@@ -1133,6 +1136,7 @@ void __cpuinit setup_local_APIC(void)
1133 apic_write(APIC_ESR, 0); 1136 apic_write(APIC_ESR, 0);
1134 } 1137 }
1135#endif 1138#endif
1139 perf_counters_lapic_init(0);
1136 1140
1137 preempt_disable(); 1141 preempt_disable();
1138 1142
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 4e242f9a06e4..3efcb2b96a15 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -1,5 +1,5 @@
1# 1#
2# Makefile for x86-compatible CPU details and quirks 2# Makefile for x86-compatible CPU details, features and quirks
3# 3#
4 4
5# Don't trace early stages of a secondary CPU boot 5# Don't trace early stages of a secondary CPU boot
@@ -23,11 +23,13 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
23obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o 23obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
24obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o 24obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
25 25
26obj-$(CONFIG_X86_MCE) += mcheck/ 26obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
27obj-$(CONFIG_MTRR) += mtrr/
28obj-$(CONFIG_CPU_FREQ) += cpufreq/
29 27
30obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o 28obj-$(CONFIG_X86_MCE) += mcheck/
29obj-$(CONFIG_MTRR) += mtrr/
30obj-$(CONFIG_CPU_FREQ) += cpufreq/
31
32obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
31 33
32quiet_cmd_mkcapflags = MKCAP $@ 34quiet_cmd_mkcapflags = MKCAP $@
33 cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@ 35 cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 7e4a459daa64..fd69c514ca2a 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -420,6 +420,10 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
420 if (c->x86 >= 6) 420 if (c->x86 >= 6)
421 set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); 421 set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
422 422
423 /* Enable Performance counter for K7 and later */
424 if (c->x86 > 6 && c->x86 <= 0x11)
425 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
426
423 if (!c->x86_model_id[0]) { 427 if (!c->x86_model_id[0]) {
424 switch (c->x86) { 428 switch (c->x86) {
425 case 0xf: 429 case 0xf:
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c4f667896c28..a86769efe0df 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -13,6 +13,7 @@
13#include <linux/io.h> 13#include <linux/io.h>
14 14
15#include <asm/stackprotector.h> 15#include <asm/stackprotector.h>
16#include <asm/perf_counter.h>
16#include <asm/mmu_context.h> 17#include <asm/mmu_context.h>
17#include <asm/hypervisor.h> 18#include <asm/hypervisor.h>
18#include <asm/processor.h> 19#include <asm/processor.h>
@@ -854,6 +855,7 @@ void __init identify_boot_cpu(void)
854#else 855#else
855 vgetcpu_set_mode(); 856 vgetcpu_set_mode();
856#endif 857#endif
858 init_hw_perf_counters();
857} 859}
858 860
859void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) 861void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
new file mode 100644
index 000000000000..0fcbaab83f9b
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -0,0 +1,1213 @@
1/*
2 * Performance counter x86 architecture code
3 *
4 * Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
6 * Copyright(C) 2009 Jaswinder Singh Rajput
7 *
8 * For licencing details see kernel-base/COPYING
9 */
10
11#include <linux/perf_counter.h>
12#include <linux/capability.h>
13#include <linux/notifier.h>
14#include <linux/hardirq.h>
15#include <linux/kprobes.h>
16#include <linux/module.h>
17#include <linux/kdebug.h>
18#include <linux/sched.h>
19#include <linux/uaccess.h>
20
21#include <asm/apic.h>
22#include <asm/stacktrace.h>
23#include <asm/nmi.h>
24
25static bool perf_counters_initialized __read_mostly;
26
27/*
28 * Number of (generic) HW counters:
29 */
30static int nr_counters_generic __read_mostly;
31static u64 perf_counter_mask __read_mostly;
32static u64 counter_value_mask __read_mostly;
33static int counter_value_bits __read_mostly;
34
35static int nr_counters_fixed __read_mostly;
36
37struct cpu_hw_counters {
38 struct perf_counter *counters[X86_PMC_IDX_MAX];
39 unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
40 unsigned long interrupts;
41 u64 throttle_ctrl;
42 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
43 int enabled;
44};
45
46/*
47 * struct pmc_x86_ops - performance counter x86 ops
48 */
49struct pmc_x86_ops {
50 u64 (*save_disable_all)(void);
51 void (*restore_all)(u64);
52 u64 (*get_status)(u64);
53 void (*ack_status)(u64);
54 void (*enable)(int, u64);
55 void (*disable)(int, u64);
56 unsigned eventsel;
57 unsigned perfctr;
58 u64 (*event_map)(int);
59 u64 (*raw_event)(u64);
60 int max_events;
61};
62
63static struct pmc_x86_ops *pmc_ops __read_mostly;
64
65static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
66 .enabled = 1,
67};
68
69static __read_mostly int intel_perfmon_version;
70
71/*
72 * Intel PerfMon v3. Used on Core2 and later.
73 */
74static const u64 intel_perfmon_event_map[] =
75{
76 [PERF_COUNT_CPU_CYCLES] = 0x003c,
77 [PERF_COUNT_INSTRUCTIONS] = 0x00c0,
78 [PERF_COUNT_CACHE_REFERENCES] = 0x4f2e,
79 [PERF_COUNT_CACHE_MISSES] = 0x412e,
80 [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x00c4,
81 [PERF_COUNT_BRANCH_MISSES] = 0x00c5,
82 [PERF_COUNT_BUS_CYCLES] = 0x013c,
83};
84
85static u64 pmc_intel_event_map(int event)
86{
87 return intel_perfmon_event_map[event];
88}
89
90static u64 pmc_intel_raw_event(u64 event)
91{
92#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
93#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
94#define CORE_EVNTSEL_COUNTER_MASK 0xFF000000ULL
95
96#define CORE_EVNTSEL_MASK \
97 (CORE_EVNTSEL_EVENT_MASK | \
98 CORE_EVNTSEL_UNIT_MASK | \
99 CORE_EVNTSEL_COUNTER_MASK)
100
101 return event & CORE_EVNTSEL_MASK;
102}
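/*
 * A standalone worked example of the eventsel/umask packing that
 * CORE_EVNTSEL_MASK preserves: LLC references are architectural event
 * 0x2e with unit mask 0x4f, which is where the 0x4f2e entry for
 * PERF_COUNT_CACHE_REFERENCES in intel_perfmon_event_map[] above comes
 * from.
 */
#include <stdio.h>

int main(void)
{
	unsigned int event_select = 0x2e;	/* LONGEST_LAT_CACHE.REFERENCE */
	unsigned int unit_mask	  = 0x4f;
	unsigned int config	  = (unit_mask << 8) | event_select;

	printf("%#x\n", config);		/* prints 0x4f2e */
	return 0;
}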
103
104/*
105 * AMD Performance Monitor K7 and later.
106 */
107static const u64 amd_perfmon_event_map[] =
108{
109 [PERF_COUNT_CPU_CYCLES] = 0x0076,
110 [PERF_COUNT_INSTRUCTIONS] = 0x00c0,
111 [PERF_COUNT_CACHE_REFERENCES] = 0x0080,
112 [PERF_COUNT_CACHE_MISSES] = 0x0081,
113 [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x00c4,
114 [PERF_COUNT_BRANCH_MISSES] = 0x00c5,
115};
116
117static u64 pmc_amd_event_map(int event)
118{
119 return amd_perfmon_event_map[event];
120}
121
122static u64 pmc_amd_raw_event(u64 event)
123{
124#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL
125#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL
126#define K7_EVNTSEL_COUNTER_MASK 0x0FF000000ULL
127
128#define K7_EVNTSEL_MASK \
129 (K7_EVNTSEL_EVENT_MASK | \
130 K7_EVNTSEL_UNIT_MASK | \
131 K7_EVNTSEL_COUNTER_MASK)
132
133 return event & K7_EVNTSEL_MASK;
134}
135
136/*
137 * Propagate counter elapsed time into the generic counter.
138 * Can only be executed on the CPU where the counter is active.
139 * Returns the delta events processed.
140 */
141static void
142x86_perf_counter_update(struct perf_counter *counter,
143 struct hw_perf_counter *hwc, int idx)
144{
145 u64 prev_raw_count, new_raw_count, delta;
146
147 /*
148 * Careful: an NMI might modify the previous counter value.
149 *
150 * Our tactic to handle this is to first atomically read and
151 * exchange a new raw count - then add that new-prev delta
152 * count to the generic counter atomically:
153 */
154again:
155 prev_raw_count = atomic64_read(&hwc->prev_count);
156 rdmsrl(hwc->counter_base + idx, new_raw_count);
157
158 if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
159 new_raw_count) != prev_raw_count)
160 goto again;
161
162 /*
163 * Now we have the new raw value and have updated the prev
164 * timestamp already. We can now calculate the elapsed delta
165 * (counter-)time and add that to the generic counter.
166 *
167 * Careful, not all hw sign-extends above the physical width
168 * of the count, so we do that by clipping the delta to 32 bits:
169 */
170 delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count);
171
172 atomic64_add(delta, &counter->count);
173 atomic64_sub(delta, &hwc->period_left);
174}
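/*
 * A standalone worked example of the 32-bit delta clipping above: even
 * when the raw counter value wraps past 32 bits, the (s32) subtraction
 * followed by the (u32) cast still yields the number of events that
 * elapsed.  The sample values are made up, and two's-complement
 * conversion is assumed, as it is in the kernel code itself.
 */
#include <stdio.h>

int main(void)
{
	unsigned long long prev_raw_count = 0xfffffff0ULL;	/* just below the 32-bit wrap */
	unsigned long long new_raw_count  = 0x00000010ULL;	/* wrapped around */
	unsigned long long delta;

	delta = (unsigned long long)(unsigned int)
		((int)new_raw_count - (int)prev_raw_count);

	printf("%llu\n", delta);	/* prints 32: the events that really occurred */
	return 0;
}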
175
176static atomic_t num_counters;
177static DEFINE_MUTEX(pmc_reserve_mutex);
178
179static bool reserve_pmc_hardware(void)
180{
181 int i;
182
183 if (nmi_watchdog == NMI_LOCAL_APIC)
184 disable_lapic_nmi_watchdog();
185
186 for (i = 0; i < nr_counters_generic; i++) {
187 if (!reserve_perfctr_nmi(pmc_ops->perfctr + i))
188 goto perfctr_fail;
189 }
190
191 for (i = 0; i < nr_counters_generic; i++) {
192 if (!reserve_evntsel_nmi(pmc_ops->eventsel + i))
193 goto eventsel_fail;
194 }
195
196 return true;
197
198eventsel_fail:
199 for (i--; i >= 0; i--)
200 release_evntsel_nmi(pmc_ops->eventsel + i);
201
202 i = nr_counters_generic;
203
204perfctr_fail:
205 for (i--; i >= 0; i--)
206 release_perfctr_nmi(pmc_ops->perfctr + i);
207
208 if (nmi_watchdog == NMI_LOCAL_APIC)
209 enable_lapic_nmi_watchdog();
210
211 return false;
212}
213
214static void release_pmc_hardware(void)
215{
216 int i;
217
218 for (i = 0; i < nr_counters_generic; i++) {
219 release_perfctr_nmi(pmc_ops->perfctr + i);
220 release_evntsel_nmi(pmc_ops->eventsel + i);
221 }
222
223 if (nmi_watchdog == NMI_LOCAL_APIC)
224 enable_lapic_nmi_watchdog();
225}
226
227static void hw_perf_counter_destroy(struct perf_counter *counter)
228{
229 if (atomic_dec_and_mutex_lock(&num_counters, &pmc_reserve_mutex)) {
230 release_pmc_hardware();
231 mutex_unlock(&pmc_reserve_mutex);
232 }
233}
234
235/*
236 * Setup the hardware configuration for a given hw_event_type
237 */
238static int __hw_perf_counter_init(struct perf_counter *counter)
239{
240 struct perf_counter_hw_event *hw_event = &counter->hw_event;
241 struct hw_perf_counter *hwc = &counter->hw;
242 int err;
243
244 if (unlikely(!perf_counters_initialized))
245 return -EINVAL;
246
247 err = 0;
248 if (atomic_inc_not_zero(&num_counters)) {
249 mutex_lock(&pmc_reserve_mutex);
250 if (atomic_read(&num_counters) == 0 && !reserve_pmc_hardware())
251 err = -EBUSY;
252 else
253 atomic_inc(&num_counters);
254 mutex_unlock(&pmc_reserve_mutex);
255 }
256 if (err)
257 return err;
258
259 /*
260 * Generate PMC IRQs:
261 * (keep 'enabled' bit clear for now)
262 */
263 hwc->config = ARCH_PERFMON_EVENTSEL_INT;
264
265 /*
266 * Count user and OS events unless requested not to.
267 */
268 if (!hw_event->exclude_user)
269 hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
270 if (!hw_event->exclude_kernel)
271 hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
272
273 /*
274 * If privileged enough, allow NMI events:
275 */
276 hwc->nmi = 0;
277 if (capable(CAP_SYS_ADMIN) && hw_event->nmi)
278 hwc->nmi = 1;
279
280 hwc->irq_period = hw_event->irq_period;
281 /*
282 * Intel PMCs cannot be accessed sanely above 32 bit width,
283 * so we install an artificial 1<<31 period regardless of
284 * the generic counter period:
285 */
286 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
287 if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF)
288 hwc->irq_period = 0x7FFFFFFF;
289
290 atomic64_set(&hwc->period_left, hwc->irq_period);
291
292 /*
293 * Raw event type provide the config in the event structure
294 */
295 if (perf_event_raw(hw_event)) {
296 hwc->config |= pmc_ops->raw_event(perf_event_config(hw_event));
297 } else {
298 if (perf_event_id(hw_event) >= pmc_ops->max_events)
299 return -EINVAL;
300 /*
301 * The generic map:
302 */
303 hwc->config |= pmc_ops->event_map(perf_event_id(hw_event));
304 }
305
306 counter->destroy = hw_perf_counter_destroy;
307
308 return 0;
309}
310
311static u64 pmc_intel_save_disable_all(void)
312{
313 u64 ctrl;
314
315 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
316 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
317
318 return ctrl;
319}
320
321static u64 pmc_amd_save_disable_all(void)
322{
323 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
324 int enabled, idx;
325
326 enabled = cpuc->enabled;
327 cpuc->enabled = 0;
328 /*
329 * ensure we write the disable before we start disabling the
 330 * counters proper, so that pmc_amd_enable() does the right thing.
331 */
332 barrier();
333
334 for (idx = 0; idx < nr_counters_generic; idx++) {
335 u64 val;
336
337 rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
338 if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) {
339 val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
340 wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
341 }
342 }
343
344 return enabled;
345}
346
347u64 hw_perf_save_disable(void)
348{
349 if (unlikely(!perf_counters_initialized))
350 return 0;
351
352 return pmc_ops->save_disable_all();
353}
354/*
355 * Exported because of ACPI idle
356 */
357EXPORT_SYMBOL_GPL(hw_perf_save_disable);
358
359static void pmc_intel_restore_all(u64 ctrl)
360{
361 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
362}
363
364static void pmc_amd_restore_all(u64 ctrl)
365{
366 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
367 int idx;
368
369 cpuc->enabled = ctrl;
370 barrier();
371 if (!ctrl)
372 return;
373
374 for (idx = 0; idx < nr_counters_generic; idx++) {
375 if (test_bit(idx, cpuc->active_mask)) {
376 u64 val;
377
378 rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
379 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
380 wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
381 }
382 }
383}
384
385void hw_perf_restore(u64 ctrl)
386{
387 if (unlikely(!perf_counters_initialized))
388 return;
389
390 pmc_ops->restore_all(ctrl);
391}
392/*
393 * Exported because of ACPI idle
394 */
395EXPORT_SYMBOL_GPL(hw_perf_restore);
396
397static u64 pmc_intel_get_status(u64 mask)
398{
399 u64 status;
400
401 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
402
403 return status;
404}
405
406static u64 pmc_amd_get_status(u64 mask)
407{
408 u64 status = 0;
409 int idx;
410
411 for (idx = 0; idx < nr_counters_generic; idx++) {
412 s64 val;
413
414 if (!(mask & (1 << idx)))
415 continue;
416
417 rdmsrl(MSR_K7_PERFCTR0 + idx, val);
418 val <<= (64 - counter_value_bits);
419 if (val >= 0)
420 status |= (1 << idx);
421 }
422
423 return status;
424}
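/*
 * A standalone worked example of the overflow test above: the K7 counters
 * are 48 bits wide, so shifting the value left by (64 - counter_value_bits)
 * moves bit 47 into the sign position.  A counter preloaded with a negative
 * start value keeps bit 47 set until it wraps; once it reads back small and
 * non-negative, pmc_amd_get_status() marks it as overflowed.  The sample
 * values are made up; two's-complement conversion is assumed.
 */
#include <stdio.h>

int main(void)
{
	int counter_value_bits = 48;
	unsigned long long raw;
	long long val;

	raw = 0x800000000000ULL;	/* bit 47 set: still counting toward the wrap */
	val = (long long)(raw << (64 - counter_value_bits));
	printf("%d\n", val >= 0);	/* prints 0: no overflow recorded */

	raw = 0x000000000005ULL;	/* small value again: the counter wrapped */
	val = (long long)(raw << (64 - counter_value_bits));
	printf("%d\n", val >= 0);	/* prints 1: this counter's status bit is set */
	return 0;
}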
425
426static u64 hw_perf_get_status(u64 mask)
427{
428 if (unlikely(!perf_counters_initialized))
429 return 0;
430
431 return pmc_ops->get_status(mask);
432}
433
434static void pmc_intel_ack_status(u64 ack)
435{
436 wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
437}
438
439static void pmc_amd_ack_status(u64 ack)
440{
441}
442
443static void hw_perf_ack_status(u64 ack)
444{
445 if (unlikely(!perf_counters_initialized))
446 return;
447
448 pmc_ops->ack_status(ack);
449}
450
451static void pmc_intel_enable(int idx, u64 config)
452{
453 wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx,
454 config | ARCH_PERFMON_EVENTSEL0_ENABLE);
455}
456
457static void pmc_amd_enable(int idx, u64 config)
458{
459 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
460
461 set_bit(idx, cpuc->active_mask);
462 if (cpuc->enabled)
463 config |= ARCH_PERFMON_EVENTSEL0_ENABLE;
464
465 wrmsrl(MSR_K7_EVNTSEL0 + idx, config);
466}
467
468static void hw_perf_enable(int idx, u64 config)
469{
470 if (unlikely(!perf_counters_initialized))
471 return;
472
473 pmc_ops->enable(idx, config);
474}
475
476static void pmc_intel_disable(int idx, u64 config)
477{
478 wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, config);
479}
480
481static void pmc_amd_disable(int idx, u64 config)
482{
483 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
484
485 clear_bit(idx, cpuc->active_mask);
486 wrmsrl(MSR_K7_EVNTSEL0 + idx, config);
487
488}
489
490static void hw_perf_disable(int idx, u64 config)
491{
492 if (unlikely(!perf_counters_initialized))
493 return;
494
495 pmc_ops->disable(idx, config);
496}
497
498static inline void
499__pmc_fixed_disable(struct perf_counter *counter,
500 struct hw_perf_counter *hwc, unsigned int __idx)
501{
502 int idx = __idx - X86_PMC_IDX_FIXED;
503 u64 ctrl_val, mask;
504 int err;
505
506 mask = 0xfULL << (idx * 4);
507
508 rdmsrl(hwc->config_base, ctrl_val);
509 ctrl_val &= ~mask;
510 err = checking_wrmsrl(hwc->config_base, ctrl_val);
511}
512
513static inline void
514__pmc_generic_disable(struct perf_counter *counter,
515 struct hw_perf_counter *hwc, unsigned int idx)
516{
517 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
518 __pmc_fixed_disable(counter, hwc, idx);
519 else
520 hw_perf_disable(idx, hwc->config);
521}
522
523static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
524
525/*
526 * Set the next IRQ period, based on the hwc->period_left value.
527 * To be called with the counter disabled in hw:
528 */
529static void
530__hw_perf_counter_set_period(struct perf_counter *counter,
531 struct hw_perf_counter *hwc, int idx)
532{
533 s64 left = atomic64_read(&hwc->period_left);
534 s64 period = hwc->irq_period;
535 int err;
536
537 /*
 538 * If we are way outside a reasonable range then just skip forward:
539 */
540 if (unlikely(left <= -period)) {
541 left = period;
542 atomic64_set(&hwc->period_left, left);
543 }
544
545 if (unlikely(left <= 0)) {
546 left += period;
547 atomic64_set(&hwc->period_left, left);
548 }
549
550 per_cpu(prev_left[idx], smp_processor_id()) = left;
551
552 /*
553 * The hw counter starts counting from this counter offset,
 554 * mark it to be able to extract future deltas:
555 */
556 atomic64_set(&hwc->prev_count, (u64)-left);
557
558 err = checking_wrmsrl(hwc->counter_base + idx,
559 (u64)(-left) & counter_value_mask);
560}
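/*
 * A standalone worked example of the (u64)-left preload above: starting a
 * 48-bit counter at -left (masked to the counter width) makes it overflow
 * after exactly `left` increments.  The 100000-event period is an
 * arbitrary sample value.
 */
#include <stdio.h>

int main(void)
{
	unsigned long long counter_value_mask = (1ULL << 48) - 1;
	long long left = 100000;		/* events until the next PMI */
	unsigned long long start = (unsigned long long)(-left) & counter_value_mask;

	printf("start        = %#llx\n", start);		/* prints 0xfffffffe7960 */
	printf("events left  = %llu\n",
	       (counter_value_mask + 1) - start);		/* prints 100000 */
	return 0;
}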
561
562static inline void
563__pmc_fixed_enable(struct perf_counter *counter,
564 struct hw_perf_counter *hwc, unsigned int __idx)
565{
566 int idx = __idx - X86_PMC_IDX_FIXED;
567 u64 ctrl_val, bits, mask;
568 int err;
569
570 /*
571 * Enable IRQ generation (0x8),
572 * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
573 * if requested:
574 */
575 bits = 0x8ULL;
576 if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
577 bits |= 0x2;
578 if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
579 bits |= 0x1;
580 bits <<= (idx * 4);
581 mask = 0xfULL << (idx * 4);
582
583 rdmsrl(hwc->config_base, ctrl_val);
584 ctrl_val &= ~mask;
585 ctrl_val |= bits;
586 err = checking_wrmsrl(hwc->config_base, ctrl_val);
587}
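/*
 * A standalone worked example of the per-counter nibble layout that
 * __pmc_fixed_enable() writes into MSR_ARCH_PERFMON_FIXED_CTR_CTRL: each
 * fixed counter owns 4 bits, where 0x1 counts ring 0, 0x2 counts ring 3
 * and 0x8 raises a PMI on overflow.
 */
#include <stdio.h>

int main(void)
{
	int idx = 1;					/* fixed counter 1: CPU_CLK_Unhalted.Core */
	unsigned long long bits = 0x8ULL | 0x2ULL | 0x1ULL;	/* PMI + ring 3 + ring 0 */
	unsigned long long mask = 0xfULL << (idx * 4);
	unsigned long long ctrl_val = 0;		/* pretend the MSR read back as zero */

	ctrl_val = (ctrl_val & ~mask) | (bits << (idx * 4));

	printf("%#llx\n", ctrl_val);			/* prints 0xb0 */
	return 0;
}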
588
589static void
590__pmc_generic_enable(struct perf_counter *counter,
591 struct hw_perf_counter *hwc, int idx)
592{
593 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
594 __pmc_fixed_enable(counter, hwc, idx);
595 else
596 hw_perf_enable(idx, hwc->config);
597}
598
599static int
600fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
601{
602 unsigned int event;
603
604 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
605 return -1;
606
607 if (unlikely(hwc->nmi))
608 return -1;
609
610 event = hwc->config & ARCH_PERFMON_EVENT_MASK;
611
612 if (unlikely(event == pmc_ops->event_map(PERF_COUNT_INSTRUCTIONS)))
613 return X86_PMC_IDX_FIXED_INSTRUCTIONS;
614 if (unlikely(event == pmc_ops->event_map(PERF_COUNT_CPU_CYCLES)))
615 return X86_PMC_IDX_FIXED_CPU_CYCLES;
616 if (unlikely(event == pmc_ops->event_map(PERF_COUNT_BUS_CYCLES)))
617 return X86_PMC_IDX_FIXED_BUS_CYCLES;
618
619 return -1;
620}
621
622/*
623 * Find a PMC slot for the freshly enabled / scheduled in counter:
624 */
625static int pmc_generic_enable(struct perf_counter *counter)
626{
627 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
628 struct hw_perf_counter *hwc = &counter->hw;
629 int idx;
630
631 idx = fixed_mode_idx(counter, hwc);
632 if (idx >= 0) {
633 /*
634 * Try to get the fixed counter, if that is already taken
635 * then try to get a generic counter:
636 */
637 if (test_and_set_bit(idx, cpuc->used))
638 goto try_generic;
639
640 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
641 /*
642 * We set it so that counter_base + idx in wrmsr/rdmsr maps to
643 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
644 */
645 hwc->counter_base =
646 MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
647 hwc->idx = idx;
648 } else {
649 idx = hwc->idx;
650 /* Try to get the previous generic counter again */
651 if (test_and_set_bit(idx, cpuc->used)) {
652try_generic:
653 idx = find_first_zero_bit(cpuc->used, nr_counters_generic);
654 if (idx == nr_counters_generic)
655 return -EAGAIN;
656
657 set_bit(idx, cpuc->used);
658 hwc->idx = idx;
659 }
660 hwc->config_base = pmc_ops->eventsel;
661 hwc->counter_base = pmc_ops->perfctr;
662 }
663
664 perf_counters_lapic_init(hwc->nmi);
665
666 __pmc_generic_disable(counter, hwc, idx);
667
668 cpuc->counters[idx] = counter;
669 /*
670 * Make it visible before enabling the hw:
671 */
672 smp_wmb();
673
674 __hw_perf_counter_set_period(counter, hwc, idx);
675 __pmc_generic_enable(counter, hwc, idx);
676
677 return 0;
678}
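/*
 * A tiny sketch of the slot allocation pmc_generic_enable() performs on
 * the cpuc->used bitmap: try the preferred index first, then fall back to
 * the first free generic slot.  The claim_slot helper and the small "used"
 * word are illustrative stand-ins for test_and_set_bit() and
 * find_first_zero_bit().
 */
#include <stdio.h>

static int claim_slot(unsigned int *used, int preferred, int nr_slots)
{
	int idx;

	if (!(*used & (1u << preferred))) {		/* preferred slot still free? */
		*used |= 1u << preferred;
		return preferred;
	}
	for (idx = 0; idx < nr_slots; idx++) {		/* first free generic slot */
		if (!(*used & (1u << idx))) {
			*used |= 1u << idx;
			return idx;
		}
	}
	return -1;					/* -EAGAIN in the kernel code */
}

int main(void)
{
	unsigned int used = 0x03;			/* slots 0 and 1 already taken */

	printf("%d\n", claim_slot(&used, 1, 4));	/* prints 2 */
	printf("%d\n", claim_slot(&used, 3, 4));	/* prints 3 */
	printf("%d\n", claim_slot(&used, 0, 2));	/* hypothetical 2-slot PMU: prints -1 */
	return 0;
}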
679
680void perf_counter_print_debug(void)
681{
682 u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
683 struct cpu_hw_counters *cpuc;
684 int cpu, idx;
685
686 if (!nr_counters_generic)
687 return;
688
689 local_irq_disable();
690
691 cpu = smp_processor_id();
692 cpuc = &per_cpu(cpu_hw_counters, cpu);
693
694 if (intel_perfmon_version >= 2) {
695 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
696 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
697 rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
698 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
699
700 pr_info("\n");
701 pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl);
702 pr_info("CPU#%d: status: %016llx\n", cpu, status);
703 pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
704 pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
705 }
706 pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used);
707
708 for (idx = 0; idx < nr_counters_generic; idx++) {
709 rdmsrl(pmc_ops->eventsel + idx, pmc_ctrl);
710 rdmsrl(pmc_ops->perfctr + idx, pmc_count);
711
712 prev_left = per_cpu(prev_left[idx], cpu);
713
714 pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n",
715 cpu, idx, pmc_ctrl);
716 pr_info("CPU#%d: gen-PMC%d count: %016llx\n",
717 cpu, idx, pmc_count);
718 pr_info("CPU#%d: gen-PMC%d left: %016llx\n",
719 cpu, idx, prev_left);
720 }
721 for (idx = 0; idx < nr_counters_fixed; idx++) {
722 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
723
724 pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
725 cpu, idx, pmc_count);
726 }
727 local_irq_enable();
728}
729
730static void pmc_generic_disable(struct perf_counter *counter)
731{
732 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
733 struct hw_perf_counter *hwc = &counter->hw;
734 unsigned int idx = hwc->idx;
735
736 __pmc_generic_disable(counter, hwc, idx);
737
738 clear_bit(idx, cpuc->used);
739 cpuc->counters[idx] = NULL;
740 /*
741 * Make sure the cleared pointer becomes visible before we
742 * (potentially) free the counter:
743 */
744 smp_wmb();
745
746 /*
747 * Drain the remaining delta count out of a counter
748 * that we are disabling:
749 */
750 x86_perf_counter_update(counter, hwc, idx);
751}
752
753/*
754 * Save and restart an expired counter. Called by NMI contexts,
755 * so it has to be careful about preempting normal counter ops:
756 */
757static void perf_save_and_restart(struct perf_counter *counter)
758{
759 struct hw_perf_counter *hwc = &counter->hw;
760 int idx = hwc->idx;
761
762 x86_perf_counter_update(counter, hwc, idx);
763 __hw_perf_counter_set_period(counter, hwc, idx);
764
765 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
766 __pmc_generic_enable(counter, hwc, idx);
767}
768
769/*
770 * Maximum interrupt frequency of 100KHz per CPU
771 */
772#define PERFMON_MAX_INTERRUPTS (100000/HZ)
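/*
 * Quick check of that budget: the interrupt count is reset from the local
 * APIC timer tick (see the perf_counter_unthrottle() call added to
 * local_apic_timer_interrupt() earlier in this patch), so 100000/HZ
 * interrupts per tick works out to roughly 100 kHz per CPU regardless of
 * the configured HZ.  Standalone arithmetic sketch:
 */
#include <stdio.h>

int main(void)
{
	int hz_values[] = { 100, 250, 1000 };
	int i;

	for (i = 0; i < 3; i++) {
		int per_tick = 100000 / hz_values[i];

		printf("HZ=%4d: %4d PMIs/tick, %d PMIs/sec\n",
		       hz_values[i], per_tick, per_tick * hz_values[i]);
	}
	return 0;
}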
773
774/*
775 * This handler is triggered by the local APIC, so the APIC IRQ handling
776 * rules apply:
777 */
778static int __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
779{
780 int bit, cpu = smp_processor_id();
781 u64 ack, status;
782 struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu);
783 int ret = 0;
784
785 cpuc->throttle_ctrl = hw_perf_save_disable();
786
787 status = hw_perf_get_status(cpuc->throttle_ctrl);
788 if (!status)
789 goto out;
790
791 ret = 1;
792again:
793 inc_irq_stat(apic_perf_irqs);
794 ack = status;
795 for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
796 struct perf_counter *counter = cpuc->counters[bit];
797
798 clear_bit(bit, (unsigned long *) &status);
799 if (!counter)
800 continue;
801
802 perf_save_and_restart(counter);
803 if (perf_counter_overflow(counter, nmi, regs, 0))
804 __pmc_generic_disable(counter, &counter->hw, bit);
805 }
806
807 hw_perf_ack_status(ack);
808
809 /*
810 * Repeat if there is more work to be done:
811 */
812 status = hw_perf_get_status(cpuc->throttle_ctrl);
813 if (status)
814 goto again;
815out:
816 /*
817 * Restore - do not reenable when global enable is off or throttled:
818 */
819 if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS)
820 hw_perf_restore(cpuc->throttle_ctrl);
821
822 return ret;
823}
824
825void perf_counter_unthrottle(void)
826{
827 struct cpu_hw_counters *cpuc;
828
829 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
830 return;
831
832 if (unlikely(!perf_counters_initialized))
833 return;
834
835 cpuc = &__get_cpu_var(cpu_hw_counters);
836 if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) {
837 if (printk_ratelimit())
838 printk(KERN_WARNING "PERFMON: max interrupts exceeded!\n");
839 hw_perf_restore(cpuc->throttle_ctrl);
840 }
841 cpuc->interrupts = 0;
842}
843
844void smp_perf_counter_interrupt(struct pt_regs *regs)
845{
846 irq_enter();
847 apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
848 ack_APIC_irq();
849 __smp_perf_counter_interrupt(regs, 0);
850 irq_exit();
851}
852
853void smp_perf_pending_interrupt(struct pt_regs *regs)
854{
855 irq_enter();
856 ack_APIC_irq();
857 inc_irq_stat(apic_pending_irqs);
858 perf_counter_do_pending();
859 irq_exit();
860}
861
862void set_perf_counter_pending(void)
863{
864 apic->send_IPI_self(LOCAL_PENDING_VECTOR);
865}
866
867void perf_counters_lapic_init(int nmi)
868{
869 u32 apic_val;
870
871 if (!perf_counters_initialized)
872 return;
873 /*
874 * Enable the performance counter vector in the APIC LVT:
875 */
876 apic_val = apic_read(APIC_LVTERR);
877
878 apic_write(APIC_LVTERR, apic_val | APIC_LVT_MASKED);
879 if (nmi)
880 apic_write(APIC_LVTPC, APIC_DM_NMI);
881 else
882 apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
883 apic_write(APIC_LVTERR, apic_val);
884}
885
886static int __kprobes
887perf_counter_nmi_handler(struct notifier_block *self,
888 unsigned long cmd, void *__args)
889{
890 struct die_args *args = __args;
891 struct pt_regs *regs;
892 int ret;
893
894 switch (cmd) {
895 case DIE_NMI:
896 case DIE_NMI_IPI:
897 break;
898
899 default:
900 return NOTIFY_DONE;
901 }
902
903 regs = args->regs;
904
905 apic_write(APIC_LVTPC, APIC_DM_NMI);
906 ret = __smp_perf_counter_interrupt(regs, 1);
907
908 return ret ? NOTIFY_STOP : NOTIFY_OK;
909}
910
911static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
912 .notifier_call = perf_counter_nmi_handler,
913 .next = NULL,
914 .priority = 1
915};
916
917static struct pmc_x86_ops pmc_intel_ops = {
918 .save_disable_all = pmc_intel_save_disable_all,
919 .restore_all = pmc_intel_restore_all,
920 .get_status = pmc_intel_get_status,
921 .ack_status = pmc_intel_ack_status,
922 .enable = pmc_intel_enable,
923 .disable = pmc_intel_disable,
924 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
925 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
926 .event_map = pmc_intel_event_map,
927 .raw_event = pmc_intel_raw_event,
928 .max_events = ARRAY_SIZE(intel_perfmon_event_map),
929};
930
931static struct pmc_x86_ops pmc_amd_ops = {
932 .save_disable_all = pmc_amd_save_disable_all,
933 .restore_all = pmc_amd_restore_all,
934 .get_status = pmc_amd_get_status,
935 .ack_status = pmc_amd_ack_status,
936 .enable = pmc_amd_enable,
937 .disable = pmc_amd_disable,
938 .eventsel = MSR_K7_EVNTSEL0,
939 .perfctr = MSR_K7_PERFCTR0,
940 .event_map = pmc_amd_event_map,
941 .raw_event = pmc_amd_raw_event,
942 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
943};
944
945static struct pmc_x86_ops *pmc_intel_init(void)
946{
947 union cpuid10_edx edx;
948 union cpuid10_eax eax;
949 unsigned int unused;
950 unsigned int ebx;
951
952 /*
953 * Check whether the Architectural PerfMon supports
954 * Branch Misses Retired Event or not.
955 */
956 cpuid(10, &eax.full, &ebx, &unused, &edx.full);
957 if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
958 return NULL;
959
960 intel_perfmon_version = eax.split.version_id;
961 if (intel_perfmon_version < 2)
962 return NULL;
963
964 pr_info("Intel Performance Monitoring support detected.\n");
965 pr_info("... version: %d\n", intel_perfmon_version);
966 pr_info("... bit width: %d\n", eax.split.bit_width);
967 pr_info("... mask length: %d\n", eax.split.mask_length);
968
969 nr_counters_generic = eax.split.num_counters;
970 nr_counters_fixed = edx.split.num_counters_fixed;
971 counter_value_mask = (1ULL << eax.split.bit_width) - 1;
972
973 return &pmc_intel_ops;
974}
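/*
 * A user-space sketch of the same CPUID leaf 0xA enumeration, decoding the
 * fields by hand instead of through the cpuid10_eax/cpuid10_edx unions.
 * It assumes a GCC/Clang toolchain providing <cpuid.h> and an Intel CPU;
 * the 4-bit fixed-counter mask mirrors the num_counters_fixed bitfield
 * defined in perf_counter.h above.
 */
#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(0x0a, &eax, &ebx, &ecx, &edx))
		return 1;			/* no architectural perfmon leaf */

	printf("version:        %u\n", eax & 0xff);
	printf("num counters:   %u\n", (eax >> 8) & 0xff);
	printf("bit width:      %u\n", (eax >> 16) & 0xff);
	printf("fixed counters: %u\n", edx & 0xf);
	return 0;
}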
975
976static struct pmc_x86_ops *pmc_amd_init(void)
977{
978 nr_counters_generic = 4;
979 nr_counters_fixed = 0;
980 counter_value_mask = 0x0000FFFFFFFFFFFFULL;
981 counter_value_bits = 48;
982
983 pr_info("AMD Performance Monitoring support detected.\n");
984
985 return &pmc_amd_ops;
986}
987
988void __init init_hw_perf_counters(void)
989{
990 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
991 return;
992
993 switch (boot_cpu_data.x86_vendor) {
994 case X86_VENDOR_INTEL:
995 pmc_ops = pmc_intel_init();
996 break;
997 case X86_VENDOR_AMD:
998 pmc_ops = pmc_amd_init();
999 break;
1000 }
1001 if (!pmc_ops)
1002 return;
1003
1004 pr_info("... num counters: %d\n", nr_counters_generic);
1005 if (nr_counters_generic > X86_PMC_MAX_GENERIC) {
1006 nr_counters_generic = X86_PMC_MAX_GENERIC;
1007 WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
1008 nr_counters_generic, X86_PMC_MAX_GENERIC);
1009 }
1010 perf_counter_mask = (1 << nr_counters_generic) - 1;
1011 perf_max_counters = nr_counters_generic;
1012
1013 pr_info("... value mask: %016Lx\n", counter_value_mask);
1014
1015 if (nr_counters_fixed > X86_PMC_MAX_FIXED) {
1016 nr_counters_fixed = X86_PMC_MAX_FIXED;
1017 WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
1018 nr_counters_fixed, X86_PMC_MAX_FIXED);
1019 }
1020 pr_info("... fixed counters: %d\n", nr_counters_fixed);
1021
1022 perf_counter_mask |= ((1LL << nr_counters_fixed)-1) << X86_PMC_IDX_FIXED;
1023
1024 pr_info("... counter mask: %016Lx\n", perf_counter_mask);
1025 perf_counters_initialized = true;
1026
1027 perf_counters_lapic_init(0);
1028 register_die_notifier(&perf_counter_nmi_notifier);
1029}
1030
1031static void pmc_generic_read(struct perf_counter *counter)
1032{
1033 x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
1034}
1035
1036static const struct hw_perf_counter_ops x86_perf_counter_ops = {
1037 .enable = pmc_generic_enable,
1038 .disable = pmc_generic_disable,
1039 .read = pmc_generic_read,
1040};
1041
1042const struct hw_perf_counter_ops *
1043hw_perf_counter_init(struct perf_counter *counter)
1044{
1045 int err;
1046
1047 err = __hw_perf_counter_init(counter);
1048 if (err)
1049 return ERR_PTR(err);
1050
1051 return &x86_perf_counter_ops;
1052}
1053
1054/*
1055 * callchain support
1056 */
1057
1058static inline
1059void callchain_store(struct perf_callchain_entry *entry, unsigned long ip)
1060{
1061 if (entry->nr < MAX_STACK_DEPTH)
1062 entry->ip[entry->nr++] = ip;
1063}
1064
1065static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry);
1066static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry);
1067
1068
1069static void
1070backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
1071{
1072 /* Ignore warnings */
1073}
1074
1075static void backtrace_warning(void *data, char *msg)
1076{
1077 /* Ignore warnings */
1078}
1079
1080static int backtrace_stack(void *data, char *name)
1081{
1082 /* Don't bother with IRQ stacks for now */
1083 return -1;
1084}
1085
1086static void backtrace_address(void *data, unsigned long addr, int reliable)
1087{
1088 struct perf_callchain_entry *entry = data;
1089
1090 if (reliable)
1091 callchain_store(entry, addr);
1092}
1093
1094static const struct stacktrace_ops backtrace_ops = {
1095 .warning = backtrace_warning,
1096 .warning_symbol = backtrace_warning_symbol,
1097 .stack = backtrace_stack,
1098 .address = backtrace_address,
1099};
1100
1101static void
1102perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
1103{
1104 unsigned long bp;
1105 char *stack;
1106 int nr = entry->nr;
1107
1108 callchain_store(entry, instruction_pointer(regs));
1109
1110 stack = ((char *)regs + sizeof(struct pt_regs));
1111#ifdef CONFIG_FRAME_POINTER
1112 bp = frame_pointer(regs);
1113#else
1114 bp = 0;
1115#endif
1116
1117 dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry);
1118
1119 entry->kernel = entry->nr - nr;
1120}
1121
1122
1123struct stack_frame {
1124 const void __user *next_fp;
1125 unsigned long return_address;
1126};
1127
1128static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
1129{
1130 int ret;
1131
1132 if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
1133 return 0;
1134
1135 ret = 1;
1136 pagefault_disable();
1137 if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
1138 ret = 0;
1139 pagefault_enable();
1140
1141 return ret;
1142}
1143
1144static void
1145perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
1146{
1147 struct stack_frame frame;
1148 const void __user *fp;
1149 int nr = entry->nr;
1150
1151 regs = (struct pt_regs *)current->thread.sp0 - 1;
1152 fp = (void __user *)regs->bp;
1153
1154 callchain_store(entry, regs->ip);
1155
1156 while (entry->nr < MAX_STACK_DEPTH) {
1157 frame.next_fp = NULL;
1158 frame.return_address = 0;
1159
1160 if (!copy_stack_frame(fp, &frame))
1161 break;
1162
1163 if ((unsigned long)fp < user_stack_pointer(regs))
1164 break;
1165
1166 callchain_store(entry, frame.return_address);
1167 fp = frame.next_fp;
1168 }
1169
1170 entry->user = entry->nr - nr;
1171}
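/*
 * A user-space sketch of the same frame-pointer walk perf_callchain_user()
 * performs: each frame is a (next_fp, return_address) pair, and the chain
 * is followed until it stops making sense.  The show_callchain/level1/level2
 * names are illustrative; build with -fno-omit-frame-pointer (or -O0),
 * otherwise the frame chain this relies on will not exist.
 */
#include <stdio.h>

struct stack_frame {
	const struct stack_frame *next_fp;
	unsigned long return_address;
};

static void __attribute__((noinline)) show_callchain(void)
{
	const struct stack_frame *fp = __builtin_frame_address(0);
	int depth = 0;

	while (fp && depth < 16) {
		printf("#%d  fp=%p  ret=%#lx\n", depth, (void *)fp,
		       fp->return_address);
		/* stacks grow down, so the caller's frame must sit higher */
		if ((unsigned long)fp->next_fp <= (unsigned long)fp)
			break;
		fp = fp->next_fp;
		depth++;
	}
}

static void __attribute__((noinline)) level2(void) { show_callchain(); }
static void __attribute__((noinline)) level1(void) { level2(); }

int main(void)
{
	level1();
	return 0;
}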
1172
1173static void
1174perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
1175{
1176 int is_user;
1177
1178 if (!regs)
1179 return;
1180
1181 is_user = user_mode(regs);
1182
1183 if (!current || current->pid == 0)
1184 return;
1185
1186 if (is_user && current->state != TASK_RUNNING)
1187 return;
1188
1189 if (!is_user)
1190 perf_callchain_kernel(regs, entry);
1191
1192 if (current->mm)
1193 perf_callchain_user(regs, entry);
1194}
1195
1196struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
1197{
1198 struct perf_callchain_entry *entry;
1199
1200 if (in_nmi())
1201 entry = &__get_cpu_var(nmi_entry);
1202 else
1203 entry = &__get_cpu_var(irq_entry);
1204
1205 entry->nr = 0;
1206 entry->hv = 0;
1207 entry->kernel = 0;
1208 entry->user = 0;
1209
1210 perf_do_callchain(regs, entry);
1211
1212 return entry;
1213}
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index f6c70a164e32..d6f5b9fbde32 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -19,8 +19,8 @@
19#include <linux/nmi.h> 19#include <linux/nmi.h>
20#include <linux/kprobes.h> 20#include <linux/kprobes.h>
21 21
22#include <asm/genapic.h> 22#include <asm/apic.h>
23#include <asm/intel_arch_perfmon.h> 23#include <asm/perf_counter.h>
24 24
25struct nmi_watchdog_ctlblk { 25struct nmi_watchdog_ctlblk {
26 unsigned int cccr_msr; 26 unsigned int cccr_msr;
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 38946c6e8433..891004619142 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1025,6 +1025,13 @@ apicinterrupt ERROR_APIC_VECTOR \
1025apicinterrupt SPURIOUS_APIC_VECTOR \ 1025apicinterrupt SPURIOUS_APIC_VECTOR \
1026 spurious_interrupt smp_spurious_interrupt 1026 spurious_interrupt smp_spurious_interrupt
1027 1027
1028#ifdef CONFIG_PERF_COUNTERS
1029apicinterrupt LOCAL_PERF_VECTOR \
1030 perf_counter_interrupt smp_perf_counter_interrupt
1031apicinterrupt LOCAL_PENDING_VECTOR \
1032 perf_pending_interrupt smp_perf_pending_interrupt
1033#endif
1034
1028/* 1035/*
1029 * Exception entry points. 1036 * Exception entry points.
1030 */ 1037 */
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index c3fe010d74c8..8279fb8df17f 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -63,6 +63,14 @@ static int show_other_interrupts(struct seq_file *p, int prec)
63 for_each_online_cpu(j) 63 for_each_online_cpu(j)
64 seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); 64 seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
65 seq_printf(p, " Spurious interrupts\n"); 65 seq_printf(p, " Spurious interrupts\n");
66 seq_printf(p, "%*s: ", prec, "CNT");
67 for_each_online_cpu(j)
68 seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
69 seq_printf(p, " Performance counter interrupts\n");
70 seq_printf(p, "%*s: ", prec, "PND");
71 for_each_online_cpu(j)
72 seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs);
73 seq_printf(p, " Performance pending work\n");
66#endif 74#endif
67 if (generic_interrupt_extension) { 75 if (generic_interrupt_extension) {
68 seq_printf(p, "%*s: ", prec, "PLT"); 76 seq_printf(p, "%*s: ", prec, "PLT");
@@ -166,6 +174,8 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
166#ifdef CONFIG_X86_LOCAL_APIC 174#ifdef CONFIG_X86_LOCAL_APIC
167 sum += irq_stats(cpu)->apic_timer_irqs; 175 sum += irq_stats(cpu)->apic_timer_irqs;
168 sum += irq_stats(cpu)->irq_spurious_count; 176 sum += irq_stats(cpu)->irq_spurious_count;
177 sum += irq_stats(cpu)->apic_perf_irqs;
178 sum += irq_stats(cpu)->apic_pending_irqs;
169#endif 179#endif
170 if (generic_interrupt_extension) 180 if (generic_interrupt_extension)
171 sum += irq_stats(cpu)->generic_irqs; 181 sum += irq_stats(cpu)->generic_irqs;
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 368b0a8836f9..3190a6b961e6 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -118,28 +118,8 @@ int vector_used_by_percpu_irq(unsigned int vector)
118 return 0; 118 return 0;
119} 119}
120 120
121/* Overridden in paravirt.c */ 121static void __init smp_intr_init(void)
122void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
123
124void __init native_init_IRQ(void)
125{ 122{
126 int i;
127
128 /* Execute any quirks before the call gates are initialised: */
129 x86_quirk_pre_intr_init();
130
131 /*
132 * Cover the whole vector space, no vector can escape
133 * us. (some of these will be overridden and become
134 * 'special' SMP interrupts)
135 */
136 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
137 /* SYSCALL_VECTOR was reserved in trap_init. */
138 if (i != SYSCALL_VECTOR)
139 set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
140 }
141
142
143#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP) 123#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP)
144 /* 124 /*
145 * The reschedule interrupt is a CPU-to-CPU reschedule-helper 125 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
@@ -168,6 +148,11 @@ void __init native_init_IRQ(void)
168 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); 148 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
169 set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); 149 set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
170#endif 150#endif
151}
152
153static void __init apic_intr_init(void)
154{
155 smp_intr_init();
171 156
172#ifdef CONFIG_X86_LOCAL_APIC 157#ifdef CONFIG_X86_LOCAL_APIC
173 /* self generated IPI for local APIC timer */ 158 /* self generated IPI for local APIC timer */
@@ -179,12 +164,41 @@ void __init native_init_IRQ(void)
179 /* IPI vectors for APIC spurious and error interrupts */ 164 /* IPI vectors for APIC spurious and error interrupts */
180 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 165 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
181 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); 166 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
182#endif 167# ifdef CONFIG_PERF_COUNTERS
168 alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
169 alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
170# endif
183 171
184#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) 172# ifdef CONFIG_X86_MCE_P4THERMAL
185 /* thermal monitor LVT interrupt */ 173 /* thermal monitor LVT interrupt */
186 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); 174 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
175# endif
187#endif 176#endif
177}
178
179/* Overridden in paravirt.c */
180void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
181
182void __init native_init_IRQ(void)
183{
184 int i;
185
186 /* Execute any quirks before the call gates are initialised: */
187 x86_quirk_pre_intr_init();
188
189 apic_intr_init();
190
191 /*
192 * Cover the whole vector space, no vector can escape
193 * us. (some of these will be overridden and become
194 * 'special' SMP interrupts)
195 */
196 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
197 int vector = FIRST_EXTERNAL_VECTOR + i;
198 /* SYSCALL_VECTOR was reserved in trap_init. */
199 if (!test_bit(vector, used_vectors))
200 set_intr_gate(vector, interrupt[i]);
201 }
188 202
189 if (!acpi_ioapic) 203 if (!acpi_ioapic)
190 setup_irq(2, &irq2); 204 setup_irq(2, &irq2);
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 8cd10537fd46..53ceb26f80ff 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -152,6 +152,12 @@ static void __init apic_intr_init(void)
152 /* IPI vectors for APIC spurious and error interrupts */ 152 /* IPI vectors for APIC spurious and error interrupts */
153 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 153 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
154 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); 154 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
155
156 /* Performance monitoring interrupt: */
157#ifdef CONFIG_PERF_COUNTERS
158 alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
159 alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
160#endif
155} 161}
156 162
157void __init native_init_IRQ(void) 163void __init native_init_IRQ(void)
@@ -159,6 +165,9 @@ void __init native_init_IRQ(void)
159 int i; 165 int i;
160 166
161 init_ISA_irqs(); 167 init_ISA_irqs();
168
169 apic_intr_init();
170
162 /* 171 /*
163 * Cover the whole vector space, no vector can escape 172 * Cover the whole vector space, no vector can escape
164 * us. (some of these will be overridden and become 173 * us. (some of these will be overridden and become
@@ -166,12 +175,10 @@ void __init native_init_IRQ(void)
166 */ 175 */
167 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { 176 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
168 int vector = FIRST_EXTERNAL_VECTOR + i; 177 int vector = FIRST_EXTERNAL_VECTOR + i;
169 if (vector != IA32_SYSCALL_VECTOR) 178 if (!test_bit(vector, used_vectors))
170 set_intr_gate(vector, interrupt[i]); 179 set_intr_gate(vector, interrupt[i]);
171 } 180 }
172 181
173 apic_intr_init();
174
175 if (!acpi_ioapic) 182 if (!acpi_ioapic)
176 setup_irq(2, &irq2); 183 setup_irq(2, &irq2);
177} 184}
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 14425166b8e3..0a813b17b172 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -6,7 +6,6 @@
6 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes 6 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
7 * 2000-2002 x86-64 support by Andi Kleen 7 * 2000-2002 x86-64 support by Andi Kleen
8 */ 8 */
9
10#include <linux/sched.h> 9#include <linux/sched.h>
11#include <linux/mm.h> 10#include <linux/mm.h>
12#include <linux/smp.h> 11#include <linux/smp.h>
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index ff5c8736b491..c3ebbb901379 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -332,5 +332,6 @@ ENTRY(sys_call_table)
332 .long sys_dup3 /* 330 */ 332 .long sys_dup3 /* 330 */
333 .long sys_pipe2 333 .long sys_pipe2
334 .long sys_inotify_init1 334 .long sys_inotify_init1
335 .long sys_perf_counter_open
335 .long sys_preadv 336 .long sys_preadv
336 .long sys_pwritev 337 .long sys_pwritev
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a1d288327ff0..2cc162e09c4b 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -945,8 +945,13 @@ void __init trap_init(void)
945#endif 945#endif
946 set_intr_gate(19, &simd_coprocessor_error); 946 set_intr_gate(19, &simd_coprocessor_error);
947 947
948 /* Reserve all the builtin and the syscall vector: */
949 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
950 set_bit(i, used_vectors);
951
948#ifdef CONFIG_IA32_EMULATION 952#ifdef CONFIG_IA32_EMULATION
949 set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); 953 set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
954 set_bit(IA32_SYSCALL_VECTOR, used_vectors);
950#endif 955#endif
951 956
952#ifdef CONFIG_X86_32 957#ifdef CONFIG_X86_32
@@ -963,17 +968,9 @@ void __init trap_init(void)
963 } 968 }
964 969
965 set_system_trap_gate(SYSCALL_VECTOR, &system_call); 970 set_system_trap_gate(SYSCALL_VECTOR, &system_call);
966#endif
967
968 /* Reserve all the builtin and the syscall vector: */
969 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
970 set_bit(i, used_vectors);
971
972#ifdef CONFIG_X86_64
973 set_bit(IA32_SYSCALL_VECTOR, used_vectors);
974#else
975 set_bit(SYSCALL_VECTOR, used_vectors); 971 set_bit(SYSCALL_VECTOR, used_vectors);
976#endif 972#endif
973
977 /* 974 /*
978 * Should be a barrier for any external CPU state: 975 * Should be a barrier for any external CPU state:
979 */ 976 */
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index a03b7279efa0..6f9df2babe48 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -27,6 +27,7 @@
27#include <linux/tty.h> 27#include <linux/tty.h>
28#include <linux/smp.h> 28#include <linux/smp.h>
29#include <linux/mm.h> 29#include <linux/mm.h>
30#include <linux/perf_counter.h>
30 31
31#include <asm-generic/sections.h> 32#include <asm-generic/sections.h>
32 33
@@ -1044,6 +1045,8 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
1044 if (unlikely(error_code & PF_RSVD)) 1045 if (unlikely(error_code & PF_RSVD))
1045 pgtable_bad(regs, error_code, address); 1046 pgtable_bad(regs, error_code, address);
1046 1047
1048 perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs, address);
1049
1047 /* 1050 /*
1048 * If we're in an interrupt, have no user context or are running 1051 * If we're in an interrupt, have no user context or are running
1049 * in an atomic region then we must not take the fault: 1052 * in an atomic region then we must not take the fault:
@@ -1137,10 +1140,15 @@ good_area:
1137 return; 1140 return;
1138 } 1141 }
1139 1142
1140 if (fault & VM_FAULT_MAJOR) 1143 if (fault & VM_FAULT_MAJOR) {
1141 tsk->maj_flt++; 1144 tsk->maj_flt++;
1142 else 1145 perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0,
1146 regs, address);
1147 } else {
1143 tsk->min_flt++; 1148 tsk->min_flt++;
1149 perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0,
1150 regs, address);
1151 }
1144 1152
1145 check_v8086_mode(regs, address, tsk); 1153 check_v8086_mode(regs, address, tsk);
1146 1154
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 202864ad49a7..c638685136e1 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -40,8 +40,9 @@ static int profile_exceptions_notify(struct notifier_block *self,
40 40
41 switch (val) { 41 switch (val) {
42 case DIE_NMI: 42 case DIE_NMI:
43 if (model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu))) 43 case DIE_NMI_IPI:
44 ret = NOTIFY_STOP; 44 model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu));
45 ret = NOTIFY_STOP;
45 break; 46 break;
46 default: 47 default:
47 break; 48 break;
@@ -134,7 +135,7 @@ static void nmi_cpu_setup(void *dummy)
134static struct notifier_block profile_exceptions_nb = { 135static struct notifier_block profile_exceptions_nb = {
135 .notifier_call = profile_exceptions_notify, 136 .notifier_call = profile_exceptions_notify,
136 .next = NULL, 137 .next = NULL,
137 .priority = 0 138 .priority = 2
138}; 139};
139 140
140static int nmi_setup(void) 141static int nmi_setup(void)
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index 10131fbdaada..4da7230b3d17 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -18,7 +18,7 @@
18#include <asm/msr.h> 18#include <asm/msr.h>
19#include <asm/apic.h> 19#include <asm/apic.h>
20#include <asm/nmi.h> 20#include <asm/nmi.h>
21#include <asm/intel_arch_perfmon.h> 21#include <asm/perf_counter.h>
22 22
23#include "op_x86_model.h" 23#include "op_x86_model.h"
24#include "op_counter.h" 24#include "op_counter.h"
@@ -136,6 +136,13 @@ static int ppro_check_ctrs(struct pt_regs * const regs,
136 u64 val; 136 u64 val;
137 int i; 137 int i;
138 138
139 /*
140 * This can happen if perf counters are in use when
141 * we steal the die notifier NMI.
142 */
143 if (unlikely(!reset_value))
144 goto out;
145
139 for (i = 0 ; i < num_counters; ++i) { 146 for (i = 0 ; i < num_counters; ++i) {
140 if (!reset_value[i]) 147 if (!reset_value[i])
141 continue; 148 continue;
@@ -146,6 +153,7 @@ static int ppro_check_ctrs(struct pt_regs * const regs,
146 } 153 }
147 } 154 }
148 155
156out:
149 /* Only P6 based Pentium M need to re-unmask the apic vector but it 157 /* Only P6 based Pentium M need to re-unmask the apic vector but it
150 * doesn't hurt other P6 variant */ 158 * doesn't hurt other P6 variant */
151 apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED); 159 apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index f7ca8c55956b..d2830f39d46b 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -763,8 +763,11 @@ static int acpi_idle_bm_check(void)
763 */ 763 */
764static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx) 764static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx)
765{ 765{
766 u64 perf_flags;
767
766 /* Don't trace irqs off for idle */ 768 /* Don't trace irqs off for idle */
767 stop_critical_timings(); 769 stop_critical_timings();
770 perf_flags = hw_perf_save_disable();
768 if (cx->entry_method == ACPI_CSTATE_FFH) { 771 if (cx->entry_method == ACPI_CSTATE_FFH) {
769 /* Call into architectural FFH based C-state */ 772 /* Call into architectural FFH based C-state */
770 acpi_processor_ffh_cstate_enter(cx); 773 acpi_processor_ffh_cstate_enter(cx);
@@ -779,6 +782,7 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx)
779 gets asserted in time to freeze execution properly. */ 782 gets asserted in time to freeze execution properly. */
780 unused = inl(acpi_gbl_FADT.xpm_timer_block.address); 783 unused = inl(acpi_gbl_FADT.xpm_timer_block.address);
781 } 784 }
785 hw_perf_restore(perf_flags);
782 start_critical_timings(); 786 start_critical_timings();
783} 787}
784 788
diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c
index b0a6a3e51924..aed2b2936ecf 100644
--- a/drivers/char/sysrq.c
+++ b/drivers/char/sysrq.c
@@ -25,6 +25,7 @@
25#include <linux/kbd_kern.h> 25#include <linux/kbd_kern.h>
26#include <linux/proc_fs.h> 26#include <linux/proc_fs.h>
27#include <linux/quotaops.h> 27#include <linux/quotaops.h>
28#include <linux/perf_counter.h>
28#include <linux/kernel.h> 29#include <linux/kernel.h>
29#include <linux/module.h> 30#include <linux/module.h>
30#include <linux/suspend.h> 31#include <linux/suspend.h>
@@ -243,6 +244,7 @@ static void sysrq_handle_showregs(int key, struct tty_struct *tty)
243 struct pt_regs *regs = get_irq_regs(); 244 struct pt_regs *regs = get_irq_regs();
244 if (regs) 245 if (regs)
245 show_regs(regs); 246 show_regs(regs);
247 perf_counter_print_debug();
246} 248}
247static struct sysrq_key_op sysrq_showregs_op = { 249static struct sysrq_key_op sysrq_showregs_op = {
248 .handler = sysrq_handle_showregs, 250 .handler = sysrq_handle_showregs,
diff --git a/fs/exec.c b/fs/exec.c
index a3a8ce83940f..fe75dcff023a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -33,6 +33,7 @@
33#include <linux/string.h> 33#include <linux/string.h>
34#include <linux/init.h> 34#include <linux/init.h>
35#include <linux/pagemap.h> 35#include <linux/pagemap.h>
36#include <linux/perf_counter.h>
36#include <linux/highmem.h> 37#include <linux/highmem.h>
37#include <linux/spinlock.h> 38#include <linux/spinlock.h>
38#include <linux/key.h> 39#include <linux/key.h>
@@ -950,6 +951,7 @@ void set_task_comm(struct task_struct *tsk, char *buf)
950 task_lock(tsk); 951 task_lock(tsk);
951 strlcpy(tsk->comm, buf, sizeof(tsk->comm)); 952 strlcpy(tsk->comm, buf, sizeof(tsk->comm));
952 task_unlock(tsk); 953 task_unlock(tsk);
954 perf_counter_comm(tsk);
953} 955}
954 956
955int flush_old_exec(struct linux_binprm * bprm) 957int flush_old_exec(struct linux_binprm * bprm)
@@ -1018,6 +1020,13 @@ int flush_old_exec(struct linux_binprm * bprm)
1018 1020
1019 current->personality &= ~bprm->per_clear; 1021 current->personality &= ~bprm->per_clear;
1020 1022
1023 /*
1024 * Flush performance counters when crossing a
1025 * security domain:
1026 */
1027 if (!get_dumpable(current->mm))
1028 perf_counter_exit_task(current);
1029
1021 /* An exec changes our domain. We are no longer part of the thread 1030 /* An exec changes our domain. We are no longer part of the thread
1022 group */ 1031 group */
1023 1032
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index d87247d2641f..503afaa0afa7 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -108,6 +108,18 @@ extern struct group_info init_groups;
108 108
109extern struct cred init_cred; 109extern struct cred init_cred;
110 110
111#ifdef CONFIG_PERF_COUNTERS
112# define INIT_PERF_COUNTERS(tsk) \
113 .perf_counter_ctx.counter_list = \
114 LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list), \
115 .perf_counter_ctx.event_list = \
116 LIST_HEAD_INIT(tsk.perf_counter_ctx.event_list), \
117 .perf_counter_ctx.lock = \
118 __SPIN_LOCK_UNLOCKED(tsk.perf_counter_ctx.lock),
119#else
120# define INIT_PERF_COUNTERS(tsk)
121#endif
122
111/* 123/*
112 * INIT_TASK is used to set up the first task table, touch at 124 * INIT_TASK is used to set up the first task table, touch at
113 * your own risk!. Base=0, limit=0x1fffff (=2MB) 125 * your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -171,6 +183,7 @@ extern struct cred init_cred;
171 }, \ 183 }, \
172 .dirties = INIT_PROP_LOCAL_SINGLE(dirties), \ 184 .dirties = INIT_PROP_LOCAL_SINGLE(dirties), \
173 INIT_IDS \ 185 INIT_IDS \
186 INIT_PERF_COUNTERS(tsk) \
174 INIT_TRACE_IRQFLAGS \ 187 INIT_TRACE_IRQFLAGS \
175 INIT_LOCKDEP \ 188 INIT_LOCKDEP \
176 INIT_FTRACE_GRAPH \ 189 INIT_FTRACE_GRAPH \
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 0c8b89f28a95..a77c6007dc99 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -81,7 +81,12 @@ static inline unsigned int kstat_irqs(unsigned int irq)
81 return sum; 81 return sum;
82} 82}
83 83
84
85/*
86 * Lock/unlock the current runqueue - to extract task statistics:
87 */
84extern unsigned long long task_delta_exec(struct task_struct *); 88extern unsigned long long task_delta_exec(struct task_struct *);
89
85extern void account_user_time(struct task_struct *, cputime_t, cputime_t); 90extern void account_user_time(struct task_struct *, cputime_t, cputime_t);
86extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t); 91extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t);
87extern void account_steal_time(cputime_t); 92extern void account_steal_time(cputime_t);
diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 3069ec7e0ab8..93054fc3635c 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -151,4 +151,27 @@ extern int __must_check mutex_lock_killable(struct mutex *lock);
151extern int mutex_trylock(struct mutex *lock); 151extern int mutex_trylock(struct mutex *lock);
152extern void mutex_unlock(struct mutex *lock); 152extern void mutex_unlock(struct mutex *lock);
153 153
154/**
155 * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
156 * @cnt: the atomic which we are to dec
157 * @lock: the mutex to return holding if we dec to 0
158 *
159 * return true and hold lock if we dec to 0, return false otherwise
160 */
161static inline int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
162{
163 /* dec if we can't possibly hit 0 */
164 if (atomic_add_unless(cnt, -1, 1))
165 return 0;
166 /* we might hit 0, so take the lock */
167 mutex_lock(lock);
168 if (!atomic_dec_and_test(cnt)) {
169 /* when we actually did the dec, we didn't hit 0 */
170 mutex_unlock(lock);
171 return 0;
172 }
173 /* we hit 0, and we hold the lock */
174 return 1;
175}
176
154#endif 177#endif
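
The atomic_dec_and_mutex_lock() helper added above is meant for refcounted teardown where only the final reference drop needs the lock. A minimal sketch of that usage pattern, assuming a hypothetical refcounted object and teardown function that are not part of this patch:

#include <linux/mutex.h>
#include <linux/slab.h>
#include <asm/atomic.h>

struct widget {
	atomic_t	refcount;
	struct mutex	lock;
	/* ... payload ... */
};

static void widget_teardown(struct widget *w);	/* hypothetical */

static void widget_put(struct widget *w)
{
	/* Fast path: the count stays above zero, no lock is taken. */
	if (!atomic_dec_and_mutex_lock(&w->refcount, &w->lock))
		return;

	/* We dropped the last reference and now hold w->lock. */
	widget_teardown(w);
	mutex_unlock(&w->lock);
	kfree(w);
}

The point of the helper is that the mutex is only touched when the count might actually reach zero, so the common put path stays lock-free.
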
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
new file mode 100644
index 000000000000..981432885301
--- /dev/null
+++ b/include/linux/perf_counter.h
@@ -0,0 +1,606 @@
1/*
2 * Performance counters:
3 *
4 * Copyright(C) 2008, Thomas Gleixner <tglx@linutronix.de>
5 * Copyright(C) 2008, Red Hat, Inc., Ingo Molnar
6 *
7 * Data type definitions, declarations, prototypes.
8 *
9 * Started by: Thomas Gleixner and Ingo Molnar
10 *
 11 * For licensing details see kernel-base/COPYING
12 */
13#ifndef _LINUX_PERF_COUNTER_H
14#define _LINUX_PERF_COUNTER_H
15
16#include <linux/types.h>
17#include <linux/ioctl.h>
18#include <asm/byteorder.h>
19
20/*
21 * User-space ABI bits:
22 */
23
24/*
25 * hw_event.type
26 */
27enum perf_event_types {
28 PERF_TYPE_HARDWARE = 0,
29 PERF_TYPE_SOFTWARE = 1,
30 PERF_TYPE_TRACEPOINT = 2,
31
32 /*
33 * available TYPE space, raw is the max value.
34 */
35
36 PERF_TYPE_RAW = 128,
37};
38
39/*
40 * Generalized performance counter event types, used by the hw_event.event_id
41 * parameter of the sys_perf_counter_open() syscall:
42 */
43enum hw_event_ids {
44 /*
45 * Common hardware events, generalized by the kernel:
46 */
47 PERF_COUNT_CPU_CYCLES = 0,
48 PERF_COUNT_INSTRUCTIONS = 1,
49 PERF_COUNT_CACHE_REFERENCES = 2,
50 PERF_COUNT_CACHE_MISSES = 3,
51 PERF_COUNT_BRANCH_INSTRUCTIONS = 4,
52 PERF_COUNT_BRANCH_MISSES = 5,
53 PERF_COUNT_BUS_CYCLES = 6,
54
55 PERF_HW_EVENTS_MAX = 7,
56};
57
58/*
59 * Special "software" counters provided by the kernel, even if the hardware
60 * does not support performance counters. These counters measure various
61 * physical and sw events of the kernel (and allow the profiling of them as
62 * well):
63 */
64enum sw_event_ids {
65 PERF_COUNT_CPU_CLOCK = 0,
66 PERF_COUNT_TASK_CLOCK = 1,
67 PERF_COUNT_PAGE_FAULTS = 2,
68 PERF_COUNT_CONTEXT_SWITCHES = 3,
69 PERF_COUNT_CPU_MIGRATIONS = 4,
70 PERF_COUNT_PAGE_FAULTS_MIN = 5,
71 PERF_COUNT_PAGE_FAULTS_MAJ = 6,
72
73 PERF_SW_EVENTS_MAX = 7,
74};
75
76#define __PERF_COUNTER_MASK(name) \
77 (((1ULL << PERF_COUNTER_##name##_BITS) - 1) << \
78 PERF_COUNTER_##name##_SHIFT)
79
80#define PERF_COUNTER_RAW_BITS 1
81#define PERF_COUNTER_RAW_SHIFT 63
82#define PERF_COUNTER_RAW_MASK __PERF_COUNTER_MASK(RAW)
83
84#define PERF_COUNTER_CONFIG_BITS 63
85#define PERF_COUNTER_CONFIG_SHIFT 0
86#define PERF_COUNTER_CONFIG_MASK __PERF_COUNTER_MASK(CONFIG)
87
88#define PERF_COUNTER_TYPE_BITS 7
89#define PERF_COUNTER_TYPE_SHIFT 56
90#define PERF_COUNTER_TYPE_MASK __PERF_COUNTER_MASK(TYPE)
91
92#define PERF_COUNTER_EVENT_BITS 56
93#define PERF_COUNTER_EVENT_SHIFT 0
94#define PERF_COUNTER_EVENT_MASK __PERF_COUNTER_MASK(EVENT)
95
96/*
97 * Bits that can be set in hw_event.record_type to request information
98 * in the overflow packets.
99 */
100enum perf_counter_record_format {
101 PERF_RECORD_IP = 1U << 0,
102 PERF_RECORD_TID = 1U << 1,
103 PERF_RECORD_TIME = 1U << 2,
104 PERF_RECORD_ADDR = 1U << 3,
105 PERF_RECORD_GROUP = 1U << 4,
106 PERF_RECORD_CALLCHAIN = 1U << 5,
107};
108
109/*
110 * Bits that can be set in hw_event.read_format to request that
111 * reads on the counter should return the indicated quantities,
112 * in increasing order of bit value, after the counter value.
113 */
114enum perf_counter_read_format {
115 PERF_FORMAT_TOTAL_TIME_ENABLED = 1,
116 PERF_FORMAT_TOTAL_TIME_RUNNING = 2,
117};
118
119/*
120 * Hardware event to monitor via a performance monitoring counter:
121 */
122struct perf_counter_hw_event {
123 /*
124 * The MSB of the config word signifies if the rest contains cpu
 125 * specific (raw) counter configuration data; if unset, the next
126 * 7 bits are an event type and the rest of the bits are the event
127 * identifier.
128 */
129 __u64 config;
130
131 __u64 irq_period;
132 __u32 record_type;
133 __u32 read_format;
134
135 __u64 disabled : 1, /* off by default */
136 nmi : 1, /* NMI sampling */
137 inherit : 1, /* children inherit it */
138 pinned : 1, /* must always be on PMU */
139 exclusive : 1, /* only group on PMU */
140 exclude_user : 1, /* don't count user */
141 exclude_kernel : 1, /* ditto kernel */
142 exclude_hv : 1, /* ditto hypervisor */
143 exclude_idle : 1, /* don't count when idle */
144 mmap : 1, /* include mmap data */
145 munmap : 1, /* include munmap data */
146 comm : 1, /* include comm data */
147
148 __reserved_1 : 52;
149
150 __u32 extra_config_len;
151 __u32 wakeup_events; /* wakeup every n events */
152
153 __u64 __reserved_2;
154 __u64 __reserved_3;
155};
156
157/*
158 * Ioctls that can be done on a perf counter fd:
159 */
160#define PERF_COUNTER_IOC_ENABLE _IO ('$', 0)
161#define PERF_COUNTER_IOC_DISABLE _IO ('$', 1)
162#define PERF_COUNTER_IOC_REFRESH _IOW('$', 2, u32)
163
164/*
165 * Structure of the page that can be mapped via mmap
166 */
167struct perf_counter_mmap_page {
168 __u32 version; /* version number of this structure */
169 __u32 compat_version; /* lowest version this is compat with */
170
171 /*
172 * Bits needed to read the hw counters in user-space.
173 *
174 * u32 seq;
175 * s64 count;
176 *
177 * do {
178 * seq = pc->lock;
179 *
180 * barrier()
181 * if (pc->index) {
182 * count = pmc_read(pc->index - 1);
183 * count += pc->offset;
184 * } else
185 * goto regular_read;
186 *
187 * barrier();
188 * } while (pc->lock != seq);
189 *
 190 * NOTE: for obvious reasons this only works on self-monitoring
191 * processes.
192 */
193 __u32 lock; /* seqlock for synchronization */
194 __u32 index; /* hardware counter identifier */
195 __s64 offset; /* add to hardware counter value */
196
197 /*
198 * Control data for the mmap() data buffer.
199 *
200 * User-space reading this value should issue an rmb(), on SMP capable
201 * platforms, after reading this value -- see perf_counter_wakeup().
202 */
203 __u32 data_head; /* head in the data section */
204};
205
206#define PERF_EVENT_MISC_KERNEL (1 << 0)
207#define PERF_EVENT_MISC_USER (1 << 1)
208#define PERF_EVENT_MISC_OVERFLOW (1 << 2)
209
210struct perf_event_header {
211 __u32 type;
212 __u16 misc;
213 __u16 size;
214};
215
216enum perf_event_type {
217
218 /*
219 * The MMAP events record the PROT_EXEC mappings so that we can
220 * correlate userspace IPs to code. They have the following structure:
221 *
222 * struct {
223 * struct perf_event_header header;
224 *
225 * u32 pid, tid;
226 * u64 addr;
227 * u64 len;
228 * u64 pgoff;
229 * char filename[];
230 * };
231 */
232 PERF_EVENT_MMAP = 1,
233 PERF_EVENT_MUNMAP = 2,
234
235 /*
236 * struct {
237 * struct perf_event_header header;
238 *
239 * u32 pid, tid;
240 * char comm[];
241 * };
242 */
243 PERF_EVENT_COMM = 3,
244
245 /*
246 * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field
247 * will be PERF_RECORD_*
248 *
249 * struct {
250 * struct perf_event_header header;
251 *
252 * { u64 ip; } && PERF_RECORD_IP
253 * { u32 pid, tid; } && PERF_RECORD_TID
254 * { u64 time; } && PERF_RECORD_TIME
255 * { u64 addr; } && PERF_RECORD_ADDR
256 *
257 * { u64 nr;
258 * { u64 event, val; } cnt[nr]; } && PERF_RECORD_GROUP
259 *
260 * { u16 nr,
261 * hv,
262 * kernel,
263 * user;
264 * u64 ips[nr]; } && PERF_RECORD_CALLCHAIN
265 * };
266 */
267};
268
269#ifdef __KERNEL__
270/*
271 * Kernel-internal data types and definitions:
272 */
273
274#ifdef CONFIG_PERF_COUNTERS
275# include <asm/perf_counter.h>
276#endif
277
278#include <linux/list.h>
279#include <linux/mutex.h>
280#include <linux/rculist.h>
281#include <linux/rcupdate.h>
282#include <linux/spinlock.h>
283#include <linux/hrtimer.h>
284#include <linux/fs.h>
285#include <asm/atomic.h>
286
287struct task_struct;
288
289static inline u64 perf_event_raw(struct perf_counter_hw_event *hw_event)
290{
291 return hw_event->config & PERF_COUNTER_RAW_MASK;
292}
293
294static inline u64 perf_event_config(struct perf_counter_hw_event *hw_event)
295{
296 return hw_event->config & PERF_COUNTER_CONFIG_MASK;
297}
298
299static inline u64 perf_event_type(struct perf_counter_hw_event *hw_event)
300{
301 return (hw_event->config & PERF_COUNTER_TYPE_MASK) >>
302 PERF_COUNTER_TYPE_SHIFT;
303}
304
305static inline u64 perf_event_id(struct perf_counter_hw_event *hw_event)
306{
307 return hw_event->config & PERF_COUNTER_EVENT_MASK;
308}
309
310/**
311 * struct hw_perf_counter - performance counter hardware details:
312 */
313struct hw_perf_counter {
314#ifdef CONFIG_PERF_COUNTERS
315 union {
316 struct { /* hardware */
317 u64 config;
318 unsigned long config_base;
319 unsigned long counter_base;
320 int nmi;
321 unsigned int idx;
322 };
323 union { /* software */
324 atomic64_t count;
325 struct hrtimer hrtimer;
326 };
327 };
328 atomic64_t prev_count;
329 u64 irq_period;
330 atomic64_t period_left;
331#endif
332};
333
334struct perf_counter;
335
336/**
337 * struct hw_perf_counter_ops - performance counter hw ops
338 */
339struct hw_perf_counter_ops {
340 int (*enable) (struct perf_counter *counter);
341 void (*disable) (struct perf_counter *counter);
342 void (*read) (struct perf_counter *counter);
343};
344
345/**
346 * enum perf_counter_active_state - the states of a counter
347 */
348enum perf_counter_active_state {
349 PERF_COUNTER_STATE_ERROR = -2,
350 PERF_COUNTER_STATE_OFF = -1,
351 PERF_COUNTER_STATE_INACTIVE = 0,
352 PERF_COUNTER_STATE_ACTIVE = 1,
353};
354
355struct file;
356
357struct perf_mmap_data {
358 struct rcu_head rcu_head;
359 int nr_pages; /* nr of data pages */
360
361 atomic_t wakeup; /* POLL_ for wakeups */
362 atomic_t head; /* write position */
363 atomic_t events; /* event limit */
364
365 struct perf_counter_mmap_page *user_page;
366 void *data_pages[0];
367};
368
369struct perf_pending_entry {
370 struct perf_pending_entry *next;
371 void (*func)(struct perf_pending_entry *);
372};
373
374/**
375 * struct perf_counter - performance counter kernel representation:
376 */
377struct perf_counter {
378#ifdef CONFIG_PERF_COUNTERS
379 struct list_head list_entry;
380 struct list_head event_entry;
381 struct list_head sibling_list;
382 int nr_siblings;
383 struct perf_counter *group_leader;
384 const struct hw_perf_counter_ops *hw_ops;
385
386 enum perf_counter_active_state state;
387 enum perf_counter_active_state prev_state;
388 atomic64_t count;
389
390 /*
391 * These are the total time in nanoseconds that the counter
392 * has been enabled (i.e. eligible to run, and the task has
393 * been scheduled in, if this is a per-task counter)
394 * and running (scheduled onto the CPU), respectively.
395 *
396 * They are computed from tstamp_enabled, tstamp_running and
397 * tstamp_stopped when the counter is in INACTIVE or ACTIVE state.
398 */
399 u64 total_time_enabled;
400 u64 total_time_running;
401
402 /*
403 * These are timestamps used for computing total_time_enabled
404 * and total_time_running when the counter is in INACTIVE or
405 * ACTIVE state, measured in nanoseconds from an arbitrary point
406 * in time.
407 * tstamp_enabled: the notional time when the counter was enabled
408 * tstamp_running: the notional time when the counter was scheduled on
409 * tstamp_stopped: in INACTIVE state, the notional time when the
410 * counter was scheduled off.
411 */
412 u64 tstamp_enabled;
413 u64 tstamp_running;
414 u64 tstamp_stopped;
415
416 struct perf_counter_hw_event hw_event;
417 struct hw_perf_counter hw;
418
419 struct perf_counter_context *ctx;
420 struct task_struct *task;
421 struct file *filp;
422
423 struct perf_counter *parent;
424 struct list_head child_list;
425
426 /*
427 * These accumulate total time (in nanoseconds) that children
428 * counters have been enabled and running, respectively.
429 */
430 atomic64_t child_total_time_enabled;
431 atomic64_t child_total_time_running;
432
433 /*
434 * Protect attach/detach and child_list:
435 */
436 struct mutex mutex;
437
438 int oncpu;
439 int cpu;
440
441 /* mmap bits */
442 struct mutex mmap_mutex;
443 atomic_t mmap_count;
444 struct perf_mmap_data *data;
445
446 /* poll related */
447 wait_queue_head_t waitq;
448 struct fasync_struct *fasync;
449
450 /* delayed work for NMIs and such */
451 int pending_wakeup;
452 int pending_kill;
453 int pending_disable;
454 struct perf_pending_entry pending;
455
456 atomic_t event_limit;
457
458 void (*destroy)(struct perf_counter *);
459 struct rcu_head rcu_head;
460#endif
461};
462
463/**
464 * struct perf_counter_context - counter context structure
465 *
466 * Used as a container for task counters and CPU counters as well:
467 */
468struct perf_counter_context {
469#ifdef CONFIG_PERF_COUNTERS
470 /*
471 * Protect the states of the counters in the list,
472 * nr_active, and the list:
473 */
474 spinlock_t lock;
475 /*
476 * Protect the list of counters. Locking either mutex or lock
477 * is sufficient to ensure the list doesn't change; to change
478 * the list you need to lock both the mutex and the spinlock.
479 */
480 struct mutex mutex;
481
482 struct list_head counter_list;
483 struct list_head event_list;
484 int nr_counters;
485 int nr_active;
486 int is_active;
487 struct task_struct *task;
488
489 /*
490 * Context clock, runs when context enabled.
491 */
492 u64 time;
493 u64 timestamp;
494#endif
495};
496
497/**
498 * struct perf_counter_cpu_context - per cpu counter context structure
499 */
500struct perf_cpu_context {
501 struct perf_counter_context ctx;
502 struct perf_counter_context *task_ctx;
503 int active_oncpu;
504 int max_pertask;
505 int exclusive;
506
507 /*
508 * Recursion avoidance:
509 *
510 * task, softirq, irq, nmi context
511 */
512 int recursion[4];
513};
514
515/*
516 * Set by architecture code:
517 */
518extern int perf_max_counters;
519
520#ifdef CONFIG_PERF_COUNTERS
521extern const struct hw_perf_counter_ops *
522hw_perf_counter_init(struct perf_counter *counter);
523
524extern void perf_counter_task_sched_in(struct task_struct *task, int cpu);
525extern void perf_counter_task_sched_out(struct task_struct *task, int cpu);
526extern void perf_counter_task_tick(struct task_struct *task, int cpu);
527extern void perf_counter_init_task(struct task_struct *child);
528extern void perf_counter_exit_task(struct task_struct *child);
529extern void perf_counter_do_pending(void);
530extern void perf_counter_print_debug(void);
531extern void perf_counter_unthrottle(void);
532extern u64 hw_perf_save_disable(void);
533extern void hw_perf_restore(u64 ctrl);
534extern int perf_counter_task_disable(void);
535extern int perf_counter_task_enable(void);
536extern int hw_perf_group_sched_in(struct perf_counter *group_leader,
537 struct perf_cpu_context *cpuctx,
538 struct perf_counter_context *ctx, int cpu);
539extern void perf_counter_update_userpage(struct perf_counter *counter);
540
541extern int perf_counter_overflow(struct perf_counter *counter,
542 int nmi, struct pt_regs *regs, u64 addr);
543/*
544 * Return 1 for a software counter, 0 for a hardware counter
545 */
546static inline int is_software_counter(struct perf_counter *counter)
547{
548 return !perf_event_raw(&counter->hw_event) &&
549 perf_event_type(&counter->hw_event) != PERF_TYPE_HARDWARE;
550}
551
552extern void perf_swcounter_event(u32, u64, int, struct pt_regs *, u64);
553
554extern void perf_counter_mmap(unsigned long addr, unsigned long len,
555 unsigned long pgoff, struct file *file);
556
557extern void perf_counter_munmap(unsigned long addr, unsigned long len,
558 unsigned long pgoff, struct file *file);
559
560extern void perf_counter_comm(struct task_struct *tsk);
561
562#define MAX_STACK_DEPTH 255
563
564struct perf_callchain_entry {
565 u16 nr, hv, kernel, user;
566 u64 ip[MAX_STACK_DEPTH];
567};
568
569extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
570
571extern int sysctl_perf_counter_priv;
572
573#else
574static inline void
575perf_counter_task_sched_in(struct task_struct *task, int cpu) { }
576static inline void
577perf_counter_task_sched_out(struct task_struct *task, int cpu) { }
578static inline void
579perf_counter_task_tick(struct task_struct *task, int cpu) { }
580static inline void perf_counter_init_task(struct task_struct *child) { }
581static inline void perf_counter_exit_task(struct task_struct *child) { }
582static inline void perf_counter_do_pending(void) { }
583static inline void perf_counter_print_debug(void) { }
584static inline void perf_counter_unthrottle(void) { }
585static inline void hw_perf_restore(u64 ctrl) { }
586static inline u64 hw_perf_save_disable(void) { return 0; }
587static inline int perf_counter_task_disable(void) { return -EINVAL; }
588static inline int perf_counter_task_enable(void) { return -EINVAL; }
589
590static inline void
591perf_swcounter_event(u32 event, u64 nr, int nmi,
592 struct pt_regs *regs, u64 addr) { }
593
594static inline void
595perf_counter_mmap(unsigned long addr, unsigned long len,
596 unsigned long pgoff, struct file *file) { }
597
598static inline void
599perf_counter_munmap(unsigned long addr, unsigned long len,
600 unsigned long pgoff, struct file *file) { }
601
602static inline void perf_counter_comm(struct task_struct *tsk) { }
603#endif
604
605#endif /* __KERNEL__ */
606#endif /* _LINUX_PERF_COUNTER_H */
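
The comment in struct perf_counter_mmap_page above sketches a seqlock protocol that lets a self-monitoring task read its own counter without a syscall. A hedged user-space rendering of that loop, assuming include/linux/perf_counter.h from this patch is visible to user space and that pmc_read() is an architecture-specific helper (for example an RDPMC wrapper on x86); the read() branch is the "regular_read" fallback for when the counter is not currently on the PMU:

#include <stdint.h>
#include <unistd.h>
#include <linux/perf_counter.h>

extern uint64_t pmc_read(int index);	/* assumed arch helper, e.g. RDPMC */

static uint64_t read_self_counter(int fd,
				  volatile struct perf_counter_mmap_page *pc)
{
	uint64_t count;
	uint32_t seq;

	do {
		seq = pc->lock;
		__sync_synchronize();		/* barrier() */
		if (pc->index) {
			count = pmc_read(pc->index - 1);
			count += pc->offset;
		} else {
			/* regular_read: counter not on the PMU right now */
			read(fd, &count, sizeof(count));
			break;
		}
		__sync_synchronize();
	} while (pc->lock != seq);

	return count;
}

As the header comment notes, this only works for a task reading its own counters, since pc->index refers to the PMU of the CPU the task is currently running on.
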
diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index 48d887e3c6e7..b00df4c79c63 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -85,4 +85,7 @@
85#define PR_SET_TIMERSLACK 29 85#define PR_SET_TIMERSLACK 29
86#define PR_GET_TIMERSLACK 30 86#define PR_GET_TIMERSLACK 30
87 87
88#define PR_TASK_PERF_COUNTERS_DISABLE 31
89#define PR_TASK_PERF_COUNTERS_ENABLE 32
90
88#endif /* _LINUX_PRCTL_H */ 91#endif /* _LINUX_PRCTL_H */
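
The two new prctl commands give a task a cheap way to pause and resume all counters attached to it, for instance to exclude a region from profiling. A minimal user-space sketch, assuming the sys_prctl() plumbing elsewhere in this series routes the commands to perf_counter_task_disable()/perf_counter_task_enable():

#include <sys/prctl.h>

#ifndef PR_TASK_PERF_COUNTERS_DISABLE
# define PR_TASK_PERF_COUNTERS_DISABLE	31
# define PR_TASK_PERF_COUNTERS_ENABLE	32
#endif

int main(void)
{
	prctl(PR_TASK_PERF_COUNTERS_DISABLE, 0, 0, 0, 0);
	/* ... work that should not be counted ... */
	prctl(PR_TASK_PERF_COUNTERS_ENABLE, 0, 0, 0, 0);
	return 0;
}
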
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b4c38bc8049c..d1857580a132 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -71,6 +71,7 @@ struct sched_param {
71#include <linux/path.h> 71#include <linux/path.h>
72#include <linux/compiler.h> 72#include <linux/compiler.h>
73#include <linux/completion.h> 73#include <linux/completion.h>
74#include <linux/perf_counter.h>
74#include <linux/pid.h> 75#include <linux/pid.h>
75#include <linux/percpu.h> 76#include <linux/percpu.h>
76#include <linux/topology.h> 77#include <linux/topology.h>
@@ -137,6 +138,7 @@ extern unsigned long nr_running(void);
137extern unsigned long nr_uninterruptible(void); 138extern unsigned long nr_uninterruptible(void);
138extern unsigned long nr_active(void); 139extern unsigned long nr_active(void);
139extern unsigned long nr_iowait(void); 140extern unsigned long nr_iowait(void);
141extern u64 cpu_nr_migrations(int cpu);
140 142
141extern unsigned long get_parent_ip(unsigned long addr); 143extern unsigned long get_parent_ip(unsigned long addr);
142 144
@@ -1052,9 +1054,10 @@ struct sched_entity {
1052 u64 last_wakeup; 1054 u64 last_wakeup;
1053 u64 avg_overlap; 1055 u64 avg_overlap;
1054 1056
1057 u64 nr_migrations;
1058
1055 u64 start_runtime; 1059 u64 start_runtime;
1056 u64 avg_wakeup; 1060 u64 avg_wakeup;
1057 u64 nr_migrations;
1058 1061
1059#ifdef CONFIG_SCHEDSTATS 1062#ifdef CONFIG_SCHEDSTATS
1060 u64 wait_start; 1063 u64 wait_start;
@@ -1380,6 +1383,7 @@ struct task_struct {
1380 struct list_head pi_state_list; 1383 struct list_head pi_state_list;
1381 struct futex_pi_state *pi_state_cache; 1384 struct futex_pi_state *pi_state_cache;
1382#endif 1385#endif
1386 struct perf_counter_context perf_counter_ctx;
1383#ifdef CONFIG_NUMA 1387#ifdef CONFIG_NUMA
1384 struct mempolicy *mempolicy; 1388 struct mempolicy *mempolicy;
1385 short il_next; 1389 short il_next;
@@ -2388,6 +2392,13 @@ static inline void inc_syscw(struct task_struct *tsk)
2388#define TASK_SIZE_OF(tsk) TASK_SIZE 2392#define TASK_SIZE_OF(tsk) TASK_SIZE
2389#endif 2393#endif
2390 2394
2395/*
2396 * Call the function if the target task is executing on a CPU right now:
2397 */
2398extern void task_oncpu_function_call(struct task_struct *p,
2399 void (*func) (void *info), void *info);
2400
2401
2391#ifdef CONFIG_MM_OWNER 2402#ifdef CONFIG_MM_OWNER
2392extern void mm_update_next_owner(struct mm_struct *mm); 2403extern void mm_update_next_owner(struct mm_struct *mm);
2393extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p); 2404extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 40617c1d8976..677d159fe5f4 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -55,6 +55,7 @@ struct compat_timeval;
55struct robust_list_head; 55struct robust_list_head;
56struct getcpu_cache; 56struct getcpu_cache;
57struct old_linux_dirent; 57struct old_linux_dirent;
58struct perf_counter_hw_event;
58 59
59#include <linux/types.h> 60#include <linux/types.h>
60#include <linux/aio_abi.h> 61#include <linux/aio_abi.h>
@@ -754,4 +755,8 @@ asmlinkage long sys_pipe(int __user *);
754 755
755int kernel_execve(const char *filename, char *const argv[], char *const envp[]); 756int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
756 757
758
759asmlinkage long sys_perf_counter_open(
760 const struct perf_counter_hw_event __user *hw_event_uptr,
761 pid_t pid, int cpu, int group_fd, unsigned long flags);
757#endif 762#endif
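
Tying the ABI pieces together, a hedged sketch of opening a per-task hardware counter from user space via the new syscall. The syscall number is an assumption based on the i386 table slot added above, raw syscall(2) is used because no libc wrapper exists yet, and the sketch assumes include/linux/perf_counter.h is on the include path:

#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_counter.h>

#ifndef __NR_perf_counter_open
# define __NR_perf_counter_open 333	/* assumed: slot added to the i386 table above */
#endif

int main(void)
{
	struct perf_counter_hw_event hw_event;
	uint64_t count = 0;
	int fd;

	memset(&hw_event, 0, sizeof(hw_event));
	/* config: type in bits 62..56, event id in bits 55..0, MSB clear = not raw */
	hw_event.config   = ((uint64_t)PERF_TYPE_HARDWARE << PERF_COUNTER_TYPE_SHIFT) |
			    PERF_COUNT_INSTRUCTIONS;
	hw_event.disabled = 1;		/* create disabled, enable via ioctl */

	/* pid 0 = current task, cpu -1 = any CPU, no group leader, no flags */
	fd = syscall(__NR_perf_counter_open, &hw_event, 0, -1, -1, 0);
	if (fd < 0)
		return 1;

	ioctl(fd, PERF_COUNTER_IOC_ENABLE);
	/* ... workload to be measured ... */
	ioctl(fd, PERF_COUNTER_IOC_DISABLE);

	read(fd, &count, sizeof(count));
	close(fd);
	return 0;
}

Creating the counter disabled and bracketing the workload with PERF_COUNTER_IOC_ENABLE/DISABLE confines the measurement to exactly the region of interest.
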
diff --git a/init/Kconfig b/init/Kconfig
index 7be4d3836745..8158f1f44694 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -933,6 +933,41 @@ config AIO
933 by some high performance threaded applications. Disabling 933 by some high performance threaded applications. Disabling
934 this option saves about 7k. 934 this option saves about 7k.
935 935
936config HAVE_PERF_COUNTERS
937 bool
938
939menu "Performance Counters"
940
941config PERF_COUNTERS
942 bool "Kernel Performance Counters"
943 depends on HAVE_PERF_COUNTERS
944 default y
945 select ANON_INODES
946 help
947 Enable kernel support for performance counter hardware.
948
949 Performance counters are special hardware registers available
950 on most modern CPUs. These registers count the number of certain
 951 types of hw events, such as instructions executed, cache misses
952 suffered, or branches mis-predicted - without slowing down the
953 kernel or applications. These registers can also trigger interrupts
954 when a threshold number of events have passed - and can thus be
955 used to profile the code that runs on that CPU.
956
957 The Linux Performance Counter subsystem provides an abstraction of
958 these hardware capabilities, available via a system call. It
959 provides per task and per CPU counters, and it provides event
960 capabilities on top of those.
961
962 Say Y if unsure.
963
964config EVENT_PROFILE
965 bool "Tracepoint profile sources"
966 depends on PERF_COUNTERS && EVENT_TRACER
967 default y
968
969endmenu
970
936config VM_EVENT_COUNTERS 971config VM_EVENT_COUNTERS
937 default y 972 default y
938 bool "Enable VM event counters for /proc/vmstat" if EMBEDDED 973 bool "Enable VM event counters for /proc/vmstat" if EMBEDDED
diff --git a/kernel/Makefile b/kernel/Makefile
index 42423665660a..e914ca992d70 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -95,6 +95,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/
95obj-$(CONFIG_TRACING) += trace/ 95obj-$(CONFIG_TRACING) += trace/
96obj-$(CONFIG_SMP) += sched_cpupri.o 96obj-$(CONFIG_SMP) += sched_cpupri.o
97obj-$(CONFIG_SLOW_WORK) += slow-work.o 97obj-$(CONFIG_SLOW_WORK) += slow-work.o
98obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
98 99
99ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 100ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
100# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 101# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/exit.c b/kernel/exit.c
index abf9cf3b95c6..4741376c8dec 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -158,6 +158,9 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
158{ 158{
159 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); 159 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
160 160
161#ifdef CONFIG_PERF_COUNTERS
162 WARN_ON_ONCE(!list_empty(&tsk->perf_counter_ctx.counter_list));
163#endif
161 trace_sched_process_free(tsk); 164 trace_sched_process_free(tsk);
162 put_task_struct(tsk); 165 put_task_struct(tsk);
163} 166}
@@ -981,10 +984,6 @@ NORET_TYPE void do_exit(long code)
981 tsk->mempolicy = NULL; 984 tsk->mempolicy = NULL;
982#endif 985#endif
983#ifdef CONFIG_FUTEX 986#ifdef CONFIG_FUTEX
984 /*
985 * This must happen late, after the PID is not
986 * hashed anymore:
987 */
988 if (unlikely(!list_empty(&tsk->pi_state_list))) 987 if (unlikely(!list_empty(&tsk->pi_state_list)))
989 exit_pi_state_list(tsk); 988 exit_pi_state_list(tsk);
990 if (unlikely(current->pi_state_cache)) 989 if (unlikely(current->pi_state_cache))
@@ -1251,6 +1250,12 @@ static int wait_task_zombie(struct task_struct *p, int options,
1251 */ 1250 */
1252 read_unlock(&tasklist_lock); 1251 read_unlock(&tasklist_lock);
1253 1252
1253 /*
1254 * Flush inherited counters to the parent - before the parent
1255 * gets woken up by child-exit notifications.
1256 */
1257 perf_counter_exit_task(p);
1258
1254 retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; 1259 retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
1255 status = (p->signal->flags & SIGNAL_GROUP_EXIT) 1260 status = (p->signal->flags & SIGNAL_GROUP_EXIT)
1256 ? p->signal->group_exit_code : p->exit_code; 1261 ? p->signal->group_exit_code : p->exit_code;
diff --git a/kernel/fork.c b/kernel/fork.c
index b9e2edd00726..d32fef4d38e5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -983,6 +983,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
983 goto fork_out; 983 goto fork_out;
984 984
985 rt_mutex_init_task(p); 985 rt_mutex_init_task(p);
986 perf_counter_init_task(p);
986 987
987#ifdef CONFIG_PROVE_LOCKING 988#ifdef CONFIG_PROVE_LOCKING
988 DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); 989 DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 507cf2b5e9f1..f415e80a9119 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -89,7 +89,7 @@ __mutex_lock_slowpath(atomic_t *lock_count);
89 * 89 *
90 * This function is similar to (but not equivalent to) down(). 90 * This function is similar to (but not equivalent to) down().
91 */ 91 */
92void inline __sched mutex_lock(struct mutex *lock) 92void __sched mutex_lock(struct mutex *lock)
93{ 93{
94 might_sleep(); 94 might_sleep();
95 /* 95 /*
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
new file mode 100644
index 000000000000..09396098dd0d
--- /dev/null
+++ b/kernel/perf_counter.c
@@ -0,0 +1,3302 @@
1/*
2 * Performance counter core code
3 *
4 * Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
6 *
7 *
8 * For licensing details see kernel-base/COPYING
9 */
10
11#include <linux/fs.h>
12#include <linux/mm.h>
13#include <linux/cpu.h>
14#include <linux/smp.h>
15#include <linux/file.h>
16#include <linux/poll.h>
17#include <linux/sysfs.h>
18#include <linux/ptrace.h>
19#include <linux/percpu.h>
20#include <linux/vmstat.h>
21#include <linux/hardirq.h>
22#include <linux/rculist.h>
23#include <linux/uaccess.h>
24#include <linux/syscalls.h>
25#include <linux/anon_inodes.h>
26#include <linux/kernel_stat.h>
27#include <linux/perf_counter.h>
28#include <linux/dcache.h>
29
30#include <asm/irq_regs.h>
31
32/*
33 * Each CPU has a list of per CPU counters:
34 */
35DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
36
37int perf_max_counters __read_mostly = 1;
38static int perf_reserved_percpu __read_mostly;
39static int perf_overcommit __read_mostly = 1;
40
41static atomic_t nr_mmap_tracking __read_mostly;
42static atomic_t nr_munmap_tracking __read_mostly;
43static atomic_t nr_comm_tracking __read_mostly;
44
45int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */
46
47/*
48 * Mutex for (sysadmin-configurable) counter reservations:
49 */
50static DEFINE_MUTEX(perf_resource_mutex);
51
52/*
53 * Architecture provided APIs - weak aliases:
54 */
55extern __weak const struct hw_perf_counter_ops *
56hw_perf_counter_init(struct perf_counter *counter)
57{
58 return NULL;
59}
60
61u64 __weak hw_perf_save_disable(void) { return 0; }
62void __weak hw_perf_restore(u64 ctrl) { barrier(); }
63void __weak hw_perf_counter_setup(int cpu) { barrier(); }
64int __weak hw_perf_group_sched_in(struct perf_counter *group_leader,
65 struct perf_cpu_context *cpuctx,
66 struct perf_counter_context *ctx, int cpu)
67{
68 return 0;
69}
70
71void __weak perf_counter_print_debug(void) { }
72
73static void
74list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
75{
76 struct perf_counter *group_leader = counter->group_leader;
77
78 /*
79 * Depending on whether it is a standalone or sibling counter,
80 * add it straight to the context's counter list, or to the group
81 * leader's sibling list:
82 */
83 if (counter->group_leader == counter)
84 list_add_tail(&counter->list_entry, &ctx->counter_list);
85 else {
86 list_add_tail(&counter->list_entry, &group_leader->sibling_list);
87 group_leader->nr_siblings++;
88 }
89
90 list_add_rcu(&counter->event_entry, &ctx->event_list);
91}
92
93static void
94list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
95{
96 struct perf_counter *sibling, *tmp;
97
98 list_del_init(&counter->list_entry);
99 list_del_rcu(&counter->event_entry);
100
101 if (counter->group_leader != counter)
102 counter->group_leader->nr_siblings--;
103
104 /*
105 * If this was a group counter with sibling counters then
106 * upgrade the siblings to singleton counters by adding them
107 * to the context list directly:
108 */
109 list_for_each_entry_safe(sibling, tmp,
110 &counter->sibling_list, list_entry) {
111
112 list_move_tail(&sibling->list_entry, &ctx->counter_list);
113 sibling->group_leader = sibling;
114 }
115}
116
117static void
118counter_sched_out(struct perf_counter *counter,
119 struct perf_cpu_context *cpuctx,
120 struct perf_counter_context *ctx)
121{
122 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
123 return;
124
125 counter->state = PERF_COUNTER_STATE_INACTIVE;
126 counter->tstamp_stopped = ctx->time;
127 counter->hw_ops->disable(counter);
128 counter->oncpu = -1;
129
130 if (!is_software_counter(counter))
131 cpuctx->active_oncpu--;
132 ctx->nr_active--;
133 if (counter->hw_event.exclusive || !cpuctx->active_oncpu)
134 cpuctx->exclusive = 0;
135}
136
137static void
138group_sched_out(struct perf_counter *group_counter,
139 struct perf_cpu_context *cpuctx,
140 struct perf_counter_context *ctx)
141{
142 struct perf_counter *counter;
143
144 if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
145 return;
146
147 counter_sched_out(group_counter, cpuctx, ctx);
148
149 /*
150 * Schedule out siblings (if any):
151 */
152 list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
153 counter_sched_out(counter, cpuctx, ctx);
154
155 if (group_counter->hw_event.exclusive)
156 cpuctx->exclusive = 0;
157}
158
159/*
160 * Cross CPU call to remove a performance counter
161 *
162 * We disable the counter on the hardware level first. After that we
163 * remove it from the context list.
164 */
165static void __perf_counter_remove_from_context(void *info)
166{
167 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
168 struct perf_counter *counter = info;
169 struct perf_counter_context *ctx = counter->ctx;
170 unsigned long flags;
171 u64 perf_flags;
172
173 /*
174 * If this is a task context, we need to check whether it is
175 * the current task context of this cpu. If not it has been
176 * scheduled out before the smp call arrived.
177 */
178 if (ctx->task && cpuctx->task_ctx != ctx)
179 return;
180
181 spin_lock_irqsave(&ctx->lock, flags);
182
183 counter_sched_out(counter, cpuctx, ctx);
184
185 counter->task = NULL;
186 ctx->nr_counters--;
187
188 /*
189 * Protect the list operation against NMI by disabling the
190 * counters on a global level. NOP for non NMI based counters.
191 */
192 perf_flags = hw_perf_save_disable();
193 list_del_counter(counter, ctx);
194 hw_perf_restore(perf_flags);
195
196 if (!ctx->task) {
197 /*
198 * Allow more per task counters with respect to the
199 * reservation:
200 */
201 cpuctx->max_pertask =
202 min(perf_max_counters - ctx->nr_counters,
203 perf_max_counters - perf_reserved_percpu);
204 }
205
206 spin_unlock_irqrestore(&ctx->lock, flags);
207}
208
209
210/*
211 * Remove the counter from a task's (or a CPU's) list of counters.
212 *
213 * Must be called with counter->mutex and ctx->mutex held.
214 *
215 * CPU counters are removed with a smp call. For task counters we only
216 * call when the task is on a CPU.
217 */
218static void perf_counter_remove_from_context(struct perf_counter *counter)
219{
220 struct perf_counter_context *ctx = counter->ctx;
221 struct task_struct *task = ctx->task;
222
223 if (!task) {
224 /*
225 * Per cpu counters are removed via an smp call and
 226 * the removal is always successful.
227 */
228 smp_call_function_single(counter->cpu,
229 __perf_counter_remove_from_context,
230 counter, 1);
231 return;
232 }
233
234retry:
235 task_oncpu_function_call(task, __perf_counter_remove_from_context,
236 counter);
237
238 spin_lock_irq(&ctx->lock);
239 /*
240 * If the context is active we need to retry the smp call.
241 */
242 if (ctx->nr_active && !list_empty(&counter->list_entry)) {
243 spin_unlock_irq(&ctx->lock);
244 goto retry;
245 }
246
247 /*
248 * The lock prevents that this context is scheduled in so we
249 * can remove the counter safely, if the call above did not
250 * succeed.
251 */
252 if (!list_empty(&counter->list_entry)) {
253 ctx->nr_counters--;
254 list_del_counter(counter, ctx);
255 counter->task = NULL;
256 }
257 spin_unlock_irq(&ctx->lock);
258}
259
260static inline u64 perf_clock(void)
261{
262 return cpu_clock(smp_processor_id());
263}
264
265/*
266 * Update the record of the current time in a context.
267 */
268static void update_context_time(struct perf_counter_context *ctx)
269{
270 u64 now = perf_clock();
271
272 ctx->time += now - ctx->timestamp;
273 ctx->timestamp = now;
274}
275
276/*
277 * Update the total_time_enabled and total_time_running fields for a counter.
278 */
279static void update_counter_times(struct perf_counter *counter)
280{
281 struct perf_counter_context *ctx = counter->ctx;
282 u64 run_end;
283
284 if (counter->state < PERF_COUNTER_STATE_INACTIVE)
285 return;
286
287 counter->total_time_enabled = ctx->time - counter->tstamp_enabled;
288
289 if (counter->state == PERF_COUNTER_STATE_INACTIVE)
290 run_end = counter->tstamp_stopped;
291 else
292 run_end = ctx->time;
293
294 counter->total_time_running = run_end - counter->tstamp_running;
295}
296
297/*
298 * Update total_time_enabled and total_time_running for all counters in a group.
299 */
300static void update_group_times(struct perf_counter *leader)
301{
302 struct perf_counter *counter;
303
304 update_counter_times(leader);
305 list_for_each_entry(counter, &leader->sibling_list, list_entry)
306 update_counter_times(counter);
307}
308
309/*
310 * Cross CPU call to disable a performance counter
311 */
312static void __perf_counter_disable(void *info)
313{
314 struct perf_counter *counter = info;
315 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
316 struct perf_counter_context *ctx = counter->ctx;
317 unsigned long flags;
318
319 /*
320 * If this is a per-task counter, need to check whether this
321 * counter's task is the current task on this cpu.
322 */
323 if (ctx->task && cpuctx->task_ctx != ctx)
324 return;
325
326 spin_lock_irqsave(&ctx->lock, flags);
327
328 /*
329 * If the counter is on, turn it off.
330 * If it is in error state, leave it in error state.
331 */
332 if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
333 update_context_time(ctx);
334 update_counter_times(counter);
335 if (counter == counter->group_leader)
336 group_sched_out(counter, cpuctx, ctx);
337 else
338 counter_sched_out(counter, cpuctx, ctx);
339 counter->state = PERF_COUNTER_STATE_OFF;
340 }
341
342 spin_unlock_irqrestore(&ctx->lock, flags);
343}
344
345/*
346 * Disable a counter.
347 */
348static void perf_counter_disable(struct perf_counter *counter)
349{
350 struct perf_counter_context *ctx = counter->ctx;
351 struct task_struct *task = ctx->task;
352
353 if (!task) {
354 /*
355 * Disable the counter on the cpu that it's on
356 */
357 smp_call_function_single(counter->cpu, __perf_counter_disable,
358 counter, 1);
359 return;
360 }
361
362 retry:
363 task_oncpu_function_call(task, __perf_counter_disable, counter);
364
365 spin_lock_irq(&ctx->lock);
366 /*
367 * If the counter is still active, we need to retry the cross-call.
368 */
369 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
370 spin_unlock_irq(&ctx->lock);
371 goto retry;
372 }
373
374 /*
375 * Since we have the lock this context can't be scheduled
376 * in, so we can change the state safely.
377 */
378 if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
379 update_counter_times(counter);
380 counter->state = PERF_COUNTER_STATE_OFF;
381 }
382
383 spin_unlock_irq(&ctx->lock);
384}
385
386/*
387 * Disable a counter and all its children.
388 */
389static void perf_counter_disable_family(struct perf_counter *counter)
390{
391 struct perf_counter *child;
392
393 perf_counter_disable(counter);
394
395 /*
396 * Lock the mutex to protect the list of children
397 */
398 mutex_lock(&counter->mutex);
399 list_for_each_entry(child, &counter->child_list, child_list)
400 perf_counter_disable(child);
401 mutex_unlock(&counter->mutex);
402}
403
404static int
405counter_sched_in(struct perf_counter *counter,
406 struct perf_cpu_context *cpuctx,
407 struct perf_counter_context *ctx,
408 int cpu)
409{
410 if (counter->state <= PERF_COUNTER_STATE_OFF)
411 return 0;
412
413 counter->state = PERF_COUNTER_STATE_ACTIVE;
414 counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
415 /*
416 * The new state must be visible before we turn it on in the hardware:
417 */
418 smp_wmb();
419
420 if (counter->hw_ops->enable(counter)) {
421 counter->state = PERF_COUNTER_STATE_INACTIVE;
422 counter->oncpu = -1;
423 return -EAGAIN;
424 }
425
426 counter->tstamp_running += ctx->time - counter->tstamp_stopped;
427
428 if (!is_software_counter(counter))
429 cpuctx->active_oncpu++;
430 ctx->nr_active++;
431
432 if (counter->hw_event.exclusive)
433 cpuctx->exclusive = 1;
434
435 return 0;
436}
437
438/*
439 * Return 1 for a group consisting entirely of software counters,
440 * 0 if the group contains any hardware counters.
441 */
442static int is_software_only_group(struct perf_counter *leader)
443{
444 struct perf_counter *counter;
445
446 if (!is_software_counter(leader))
447 return 0;
448
449 list_for_each_entry(counter, &leader->sibling_list, list_entry)
450 if (!is_software_counter(counter))
451 return 0;
452
453 return 1;
454}
455
456/*
457 * Work out whether we can put this counter group on the CPU now.
458 */
459static int group_can_go_on(struct perf_counter *counter,
460 struct perf_cpu_context *cpuctx,
461 int can_add_hw)
462{
463 /*
464 * Groups consisting entirely of software counters can always go on.
465 */
466 if (is_software_only_group(counter))
467 return 1;
468 /*
469 * If an exclusive group is already on, no other hardware
470 * counters can go on.
471 */
472 if (cpuctx->exclusive)
473 return 0;
474 /*
475 * If this group is exclusive and there are already
476 * counters on the CPU, it can't go on.
477 */
478 if (counter->hw_event.exclusive && cpuctx->active_oncpu)
479 return 0;
480 /*
481 * Otherwise, try to add it if all previous groups were able
482 * to go on.
483 */
484 return can_add_hw;
485}
486
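/*
 * Link a counter into its context and initialise its timestamps
 * from the context's current time.
 */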
487static void add_counter_to_ctx(struct perf_counter *counter,
488 struct perf_counter_context *ctx)
489{
490 list_add_counter(counter, ctx);
491 ctx->nr_counters++;
492 counter->prev_state = PERF_COUNTER_STATE_OFF;
493 counter->tstamp_enabled = ctx->time;
494 counter->tstamp_running = ctx->time;
495 counter->tstamp_stopped = ctx->time;
496}
497
498/*
499 * Cross CPU call to install and enable a performance counter
500 */
501static void __perf_install_in_context(void *info)
502{
503 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
504 struct perf_counter *counter = info;
505 struct perf_counter_context *ctx = counter->ctx;
506 struct perf_counter *leader = counter->group_leader;
507 int cpu = smp_processor_id();
508 unsigned long flags;
509 u64 perf_flags;
510 int err;
511
512 /*
513 * If this is a task context, we need to check whether it is
 514 * the current task context of this cpu. If not, it has been
515 * scheduled out before the smp call arrived.
516 */
517 if (ctx->task && cpuctx->task_ctx != ctx)
518 return;
519
520 spin_lock_irqsave(&ctx->lock, flags);
521 update_context_time(ctx);
522
523 /*
524 * Protect the list operation against NMI by disabling the
 525 * counters on a global level. NOP for non-NMI based counters.
526 */
527 perf_flags = hw_perf_save_disable();
528
529 add_counter_to_ctx(counter, ctx);
530
531 /*
532 * Don't put the counter on if it is disabled or if
533 * it is in a group and the group isn't on.
534 */
535 if (counter->state != PERF_COUNTER_STATE_INACTIVE ||
536 (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE))
537 goto unlock;
538
539 /*
540 * An exclusive counter can't go on if there are already active
541 * hardware counters, and no hardware counter can go on if there
542 * is already an exclusive counter on.
543 */
544 if (!group_can_go_on(counter, cpuctx, 1))
545 err = -EEXIST;
546 else
547 err = counter_sched_in(counter, cpuctx, ctx, cpu);
548
549 if (err) {
550 /*
551 * This counter couldn't go on. If it is in a group
552 * then we have to pull the whole group off.
553 * If the counter group is pinned then put it in error state.
554 */
555 if (leader != counter)
556 group_sched_out(leader, cpuctx, ctx);
557 if (leader->hw_event.pinned) {
558 update_group_times(leader);
559 leader->state = PERF_COUNTER_STATE_ERROR;
560 }
561 }
562
563 if (!err && !ctx->task && cpuctx->max_pertask)
564 cpuctx->max_pertask--;
565
566 unlock:
567 hw_perf_restore(perf_flags);
568
569 spin_unlock_irqrestore(&ctx->lock, flags);
570}
571
572/*
573 * Attach a performance counter to a context
574 *
575 * First we add the counter to the list with the hardware enable bit
576 * in counter->hw_config cleared.
577 *
 578 * If the counter is attached to a task which is on a CPU, we use an smp
579 * call to enable it in the task context. The task might have been
580 * scheduled away, but we check this in the smp call again.
581 *
582 * Must be called with ctx->mutex held.
583 */
584static void
585perf_install_in_context(struct perf_counter_context *ctx,
586 struct perf_counter *counter,
587 int cpu)
588{
589 struct task_struct *task = ctx->task;
590
591 if (!task) {
592 /*
 593 * Per-cpu counters are installed via an smp call and
 594 * the install is always successful.
595 */
596 smp_call_function_single(cpu, __perf_install_in_context,
597 counter, 1);
598 return;
599 }
600
601 counter->task = task;
602retry:
603 task_oncpu_function_call(task, __perf_install_in_context,
604 counter);
605
606 spin_lock_irq(&ctx->lock);
607 /*
 608 * If the counter is still not attached, we need to retry the smp call.
609 */
610 if (ctx->is_active && list_empty(&counter->list_entry)) {
611 spin_unlock_irq(&ctx->lock);
612 goto retry;
613 }
614
615 /*
 616 * The lock prevents this context from being scheduled in, so
 617 * we can add the counter safely if the call above did not
 618 * succeed.
619 */
620 if (list_empty(&counter->list_entry))
621 add_counter_to_ctx(counter, ctx);
622 spin_unlock_irq(&ctx->lock);
623}
624
625/*
626 * Cross CPU call to enable a performance counter
627 */
628static void __perf_counter_enable(void *info)
629{
630 struct perf_counter *counter = info;
631 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
632 struct perf_counter_context *ctx = counter->ctx;
633 struct perf_counter *leader = counter->group_leader;
634 unsigned long flags;
635 int err;
636
637 /*
 638 * If this is a per-task counter, we need to check whether this
639 * counter's task is the current task on this cpu.
640 */
641 if (ctx->task && cpuctx->task_ctx != ctx)
642 return;
643
644 spin_lock_irqsave(&ctx->lock, flags);
645 update_context_time(ctx);
646
647 counter->prev_state = counter->state;
648 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
649 goto unlock;
650 counter->state = PERF_COUNTER_STATE_INACTIVE;
651 counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
652
653 /*
654 * If the counter is in a group and isn't the group leader,
655 * then don't put it on unless the group is on.
656 */
657 if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
658 goto unlock;
659
660 if (!group_can_go_on(counter, cpuctx, 1))
661 err = -EEXIST;
662 else
663 err = counter_sched_in(counter, cpuctx, ctx,
664 smp_processor_id());
665
666 if (err) {
667 /*
668 * If this counter can't go on and it's part of a
669 * group, then the whole group has to come off.
670 */
671 if (leader != counter)
672 group_sched_out(leader, cpuctx, ctx);
673 if (leader->hw_event.pinned) {
674 update_group_times(leader);
675 leader->state = PERF_COUNTER_STATE_ERROR;
676 }
677 }
678
679 unlock:
680 spin_unlock_irqrestore(&ctx->lock, flags);
681}
682
683/*
684 * Enable a counter.
685 */
686static void perf_counter_enable(struct perf_counter *counter)
687{
688 struct perf_counter_context *ctx = counter->ctx;
689 struct task_struct *task = ctx->task;
690
691 if (!task) {
692 /*
693 * Enable the counter on the cpu that it's on
694 */
695 smp_call_function_single(counter->cpu, __perf_counter_enable,
696 counter, 1);
697 return;
698 }
699
700 spin_lock_irq(&ctx->lock);
701 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
702 goto out;
703
704 /*
705 * If the counter is in error state, clear that first.
706 * That way, if we see the counter in error state below, we
707 * know that it has gone back into error state, as distinct
708 * from the task having been scheduled away before the
709 * cross-call arrived.
710 */
711 if (counter->state == PERF_COUNTER_STATE_ERROR)
712 counter->state = PERF_COUNTER_STATE_OFF;
713
714 retry:
715 spin_unlock_irq(&ctx->lock);
716 task_oncpu_function_call(task, __perf_counter_enable, counter);
717
718 spin_lock_irq(&ctx->lock);
719
720 /*
721 * If the context is active and the counter is still off,
722 * we need to retry the cross-call.
723 */
724 if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF)
725 goto retry;
726
727 /*
728 * Since we have the lock this context can't be scheduled
729 * in, so we can change the state safely.
730 */
731 if (counter->state == PERF_COUNTER_STATE_OFF) {
732 counter->state = PERF_COUNTER_STATE_INACTIVE;
733 counter->tstamp_enabled =
734 ctx->time - counter->total_time_enabled;
735 }
736 out:
737 spin_unlock_irq(&ctx->lock);
738}
739
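/*
 * Allow 'refresh' further overflow events before the overflow handler
 * disables the counter again, then re-enable it.
 */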
740static void perf_counter_refresh(struct perf_counter *counter, int refresh)
741{
742 atomic_add(refresh, &counter->event_limit);
743 perf_counter_enable(counter);
744}
745
746/*
747 * Enable a counter and all its children.
748 */
749static void perf_counter_enable_family(struct perf_counter *counter)
750{
751 struct perf_counter *child;
752
753 perf_counter_enable(counter);
754
755 /*
756 * Lock the mutex to protect the list of children
757 */
758 mutex_lock(&counter->mutex);
759 list_for_each_entry(child, &counter->child_list, child_list)
760 perf_counter_enable(child);
761 mutex_unlock(&counter->mutex);
762}
763
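/*
 * Take all counter groups in a context off the PMU and mark the
 * context inactive.
 */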
764void __perf_counter_sched_out(struct perf_counter_context *ctx,
765 struct perf_cpu_context *cpuctx)
766{
767 struct perf_counter *counter;
768 u64 flags;
769
770 spin_lock(&ctx->lock);
771 ctx->is_active = 0;
772 if (likely(!ctx->nr_counters))
773 goto out;
774 update_context_time(ctx);
775
776 flags = hw_perf_save_disable();
777 if (ctx->nr_active) {
778 list_for_each_entry(counter, &ctx->counter_list, list_entry)
779 group_sched_out(counter, cpuctx, ctx);
780 }
781 hw_perf_restore(flags);
782 out:
783 spin_unlock(&ctx->lock);
784}
785
786/*
787 * Called from scheduler to remove the counters of the current task,
788 * with interrupts disabled.
789 *
790 * We stop each counter and update the counter value in counter->count.
791 *
792 * This does not protect us against NMI, but disable()
793 * sets the disabled bit in the control field of counter _before_
 794 * accessing the counter control register. If an NMI hits, then it will
795 * not restart the counter.
796 */
797void perf_counter_task_sched_out(struct task_struct *task, int cpu)
798{
799 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
800 struct perf_counter_context *ctx = &task->perf_counter_ctx;
801 struct pt_regs *regs;
802
803 if (likely(!cpuctx->task_ctx))
804 return;
805
806 update_context_time(ctx);
807
808 regs = task_pt_regs(task);
809 perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0);
810 __perf_counter_sched_out(ctx, cpuctx);
811
812 cpuctx->task_ctx = NULL;
813}
814
815static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
816{
817 __perf_counter_sched_out(&cpuctx->ctx, cpuctx);
818}
819
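/*
 * Schedule in a counter group as a single unit: if any member fails
 * to go on, every member already scheduled is taken back off and
 * -EAGAIN is returned.
 */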
820static int
821group_sched_in(struct perf_counter *group_counter,
822 struct perf_cpu_context *cpuctx,
823 struct perf_counter_context *ctx,
824 int cpu)
825{
826 struct perf_counter *counter, *partial_group;
827 int ret;
828
829 if (group_counter->state == PERF_COUNTER_STATE_OFF)
830 return 0;
831
832 ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
833 if (ret)
834 return ret < 0 ? ret : 0;
835
836 group_counter->prev_state = group_counter->state;
837 if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
838 return -EAGAIN;
839
840 /*
841 * Schedule in siblings as one group (if any):
842 */
843 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
844 counter->prev_state = counter->state;
845 if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
846 partial_group = counter;
847 goto group_error;
848 }
849 }
850
851 return 0;
852
853group_error:
854 /*
855 * Groups can be scheduled in as one unit only, so undo any
856 * partial group before returning:
857 */
858 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
859 if (counter == partial_group)
860 break;
861 counter_sched_out(counter, cpuctx, ctx);
862 }
863 counter_sched_out(group_counter, cpuctx, ctx);
864
865 return -EAGAIN;
866}
867
868static void
869__perf_counter_sched_in(struct perf_counter_context *ctx,
870 struct perf_cpu_context *cpuctx, int cpu)
871{
872 struct perf_counter *counter;
873 u64 flags;
874 int can_add_hw = 1;
875
876 spin_lock(&ctx->lock);
877 ctx->is_active = 1;
878 if (likely(!ctx->nr_counters))
879 goto out;
880
881 ctx->timestamp = perf_clock();
882
883 flags = hw_perf_save_disable();
884
885 /*
886 * First go through the list and put on any pinned groups
887 * in order to give them the best chance of going on.
888 */
889 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
890 if (counter->state <= PERF_COUNTER_STATE_OFF ||
891 !counter->hw_event.pinned)
892 continue;
893 if (counter->cpu != -1 && counter->cpu != cpu)
894 continue;
895
896 if (group_can_go_on(counter, cpuctx, 1))
897 group_sched_in(counter, cpuctx, ctx, cpu);
898
899 /*
900 * If this pinned group hasn't been scheduled,
901 * put it in error state.
902 */
903 if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
904 update_group_times(counter);
905 counter->state = PERF_COUNTER_STATE_ERROR;
906 }
907 }
908
909 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
910 /*
911 * Ignore counters in OFF or ERROR state, and
912 * ignore pinned counters since we did them already.
913 */
914 if (counter->state <= PERF_COUNTER_STATE_OFF ||
915 counter->hw_event.pinned)
916 continue;
917
918 /*
919 * Listen to the 'cpu' scheduling filter constraint
920 * of counters:
921 */
922 if (counter->cpu != -1 && counter->cpu != cpu)
923 continue;
924
925 if (group_can_go_on(counter, cpuctx, can_add_hw)) {
926 if (group_sched_in(counter, cpuctx, ctx, cpu))
927 can_add_hw = 0;
928 }
929 }
930 hw_perf_restore(flags);
931 out:
932 spin_unlock(&ctx->lock);
933}
934
935/*
936 * Called from scheduler to add the counters of the current task
937 * with interrupts disabled.
938 *
939 * We restore the counter value and then enable it.
940 *
941 * This does not protect us against NMI, but enable()
942 * sets the enabled bit in the control field of counter _before_
 943 * accessing the counter control register. If an NMI hits, then it will
944 * keep the counter running.
945 */
946void perf_counter_task_sched_in(struct task_struct *task, int cpu)
947{
948 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
949 struct perf_counter_context *ctx = &task->perf_counter_ctx;
950
951 __perf_counter_sched_in(ctx, cpuctx, cpu);
952 cpuctx->task_ctx = ctx;
953}
954
955static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
956{
957 struct perf_counter_context *ctx = &cpuctx->ctx;
958
959 __perf_counter_sched_in(ctx, cpuctx, cpu);
960}
961
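/*
 * Switch out the current task's counters and turn them all off
 * (counters already in ERROR state are left as they are).
 */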
962int perf_counter_task_disable(void)
963{
964 struct task_struct *curr = current;
965 struct perf_counter_context *ctx = &curr->perf_counter_ctx;
966 struct perf_counter *counter;
967 unsigned long flags;
968 u64 perf_flags;
969 int cpu;
970
971 if (likely(!ctx->nr_counters))
972 return 0;
973
974 local_irq_save(flags);
975 cpu = smp_processor_id();
976
977 perf_counter_task_sched_out(curr, cpu);
978
979 spin_lock(&ctx->lock);
980
981 /*
982 * Disable all the counters:
983 */
984 perf_flags = hw_perf_save_disable();
985
986 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
987 if (counter->state != PERF_COUNTER_STATE_ERROR) {
988 update_group_times(counter);
989 counter->state = PERF_COUNTER_STATE_OFF;
990 }
991 }
992
993 hw_perf_restore(perf_flags);
994
995 spin_unlock_irqrestore(&ctx->lock, flags);
996
997 return 0;
998}
999
1000int perf_counter_task_enable(void)
1001{
1002 struct task_struct *curr = current;
1003 struct perf_counter_context *ctx = &curr->perf_counter_ctx;
1004 struct perf_counter *counter;
1005 unsigned long flags;
1006 u64 perf_flags;
1007 int cpu;
1008
1009 if (likely(!ctx->nr_counters))
1010 return 0;
1011
1012 local_irq_save(flags);
1013 cpu = smp_processor_id();
1014
1015 perf_counter_task_sched_out(curr, cpu);
1016
1017 spin_lock(&ctx->lock);
1018
1019 /*
1020 * Temporarily disable the hardware while we enable the counters:
1021 */
1022 perf_flags = hw_perf_save_disable();
1023
1024 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1025 if (counter->state > PERF_COUNTER_STATE_OFF)
1026 continue;
1027 counter->state = PERF_COUNTER_STATE_INACTIVE;
1028 counter->tstamp_enabled =
1029 ctx->time - counter->total_time_enabled;
1030 counter->hw_event.disabled = 0;
1031 }
1032 hw_perf_restore(perf_flags);
1033
1034 spin_unlock(&ctx->lock);
1035
1036 perf_counter_task_sched_in(curr, cpu);
1037
1038 local_irq_restore(flags);
1039
1040 return 0;
1041}
1042
1043/*
1044 * Round-robin a context's counters:
1045 */
1046static void rotate_ctx(struct perf_counter_context *ctx)
1047{
1048 struct perf_counter *counter;
1049 u64 perf_flags;
1050
1051 if (!ctx->nr_counters)
1052 return;
1053
1054 spin_lock(&ctx->lock);
1055 /*
1056 * Rotate the first entry last (works just fine for group counters too):
1057 */
1058 perf_flags = hw_perf_save_disable();
1059 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1060 list_move_tail(&counter->list_entry, &ctx->counter_list);
1061 break;
1062 }
1063 hw_perf_restore(perf_flags);
1064
1065 spin_unlock(&ctx->lock);
1066}
1067
1068void perf_counter_task_tick(struct task_struct *curr, int cpu)
1069{
1070 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1071 struct perf_counter_context *ctx = &curr->perf_counter_ctx;
1072 const int rotate_percpu = 0;
1073
1074 if (rotate_percpu)
1075 perf_counter_cpu_sched_out(cpuctx);
1076 perf_counter_task_sched_out(curr, cpu);
1077
1078 if (rotate_percpu)
1079 rotate_ctx(&cpuctx->ctx);
1080 rotate_ctx(ctx);
1081
1082 if (rotate_percpu)
1083 perf_counter_cpu_sched_in(cpuctx, cpu);
1084 perf_counter_task_sched_in(curr, cpu);
1085}
1086
1087/*
1088 * Cross CPU call to read the hardware counter
1089 */
1090static void __read(void *info)
1091{
1092 struct perf_counter *counter = info;
1093 struct perf_counter_context *ctx = counter->ctx;
1094 unsigned long flags;
1095
1096 local_irq_save(flags);
1097 if (ctx->is_active)
1098 update_context_time(ctx);
1099 counter->hw_ops->read(counter);
1100 update_counter_times(counter);
1101 local_irq_restore(flags);
1102}
1103
1104static u64 perf_counter_read(struct perf_counter *counter)
1105{
1106 /*
1107 * If the counter is enabled and currently active on a CPU, update the
1108 * value in the counter structure:
1109 */
1110 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
1111 smp_call_function_single(counter->oncpu,
1112 __read, counter, 1);
1113 } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
1114 update_counter_times(counter);
1115 }
1116
1117 return atomic64_read(&counter->count);
1118}
1119
1120static void put_context(struct perf_counter_context *ctx)
1121{
1122 if (ctx->task)
1123 put_task_struct(ctx->task);
1124}
1125
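/*
 * Find the counter context for a (pid, cpu) pair: cpu != -1 selects
 * the per-cpu context (which may require CAP_SYS_ADMIN), otherwise
 * the context of the given task, or of current when pid is 0.
 */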
1126static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1127{
1128 struct perf_cpu_context *cpuctx;
1129 struct perf_counter_context *ctx;
1130 struct task_struct *task;
1131
1132 /*
1133 * If cpu is not a wildcard then this is a percpu counter:
1134 */
1135 if (cpu != -1) {
1136 /* Must be root to operate on a CPU counter: */
1137 if (sysctl_perf_counter_priv && !capable(CAP_SYS_ADMIN))
1138 return ERR_PTR(-EACCES);
1139
1140 if (cpu < 0 || cpu > num_possible_cpus())
1141 return ERR_PTR(-EINVAL);
1142
1143 /*
1144 * We could be clever and allow to attach a counter to an
1145 * offline CPU and activate it when the CPU comes up, but
1146 * that's for later.
1147 */
1148 if (!cpu_isset(cpu, cpu_online_map))
1149 return ERR_PTR(-ENODEV);
1150
1151 cpuctx = &per_cpu(perf_cpu_context, cpu);
1152 ctx = &cpuctx->ctx;
1153
1154 return ctx;
1155 }
1156
1157 rcu_read_lock();
1158 if (!pid)
1159 task = current;
1160 else
1161 task = find_task_by_vpid(pid);
1162 if (task)
1163 get_task_struct(task);
1164 rcu_read_unlock();
1165
1166 if (!task)
1167 return ERR_PTR(-ESRCH);
1168
1169 ctx = &task->perf_counter_ctx;
1170 ctx->task = task;
1171
1172 /* Reuse ptrace permission checks for now. */
1173 if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
1174 put_context(ctx);
1175 return ERR_PTR(-EACCES);
1176 }
1177
1178 return ctx;
1179}
1180
1181static void free_counter_rcu(struct rcu_head *head)
1182{
1183 struct perf_counter *counter;
1184
1185 counter = container_of(head, struct perf_counter, rcu_head);
1186 kfree(counter);
1187}
1188
1189static void perf_pending_sync(struct perf_counter *counter);
1190
1191static void free_counter(struct perf_counter *counter)
1192{
1193 perf_pending_sync(counter);
1194
1195 if (counter->hw_event.mmap)
1196 atomic_dec(&nr_mmap_tracking);
1197 if (counter->hw_event.munmap)
1198 atomic_dec(&nr_munmap_tracking);
1199 if (counter->hw_event.comm)
1200 atomic_dec(&nr_comm_tracking);
1201
1202 if (counter->destroy)
1203 counter->destroy(counter);
1204
1205 call_rcu(&counter->rcu_head, free_counter_rcu);
1206}
1207
1208/*
1209 * Called when the last reference to the file is gone.
1210 */
1211static int perf_release(struct inode *inode, struct file *file)
1212{
1213 struct perf_counter *counter = file->private_data;
1214 struct perf_counter_context *ctx = counter->ctx;
1215
1216 file->private_data = NULL;
1217
1218 mutex_lock(&ctx->mutex);
1219 mutex_lock(&counter->mutex);
1220
1221 perf_counter_remove_from_context(counter);
1222
1223 mutex_unlock(&counter->mutex);
1224 mutex_unlock(&ctx->mutex);
1225
1226 free_counter(counter);
1227 put_context(ctx);
1228
1229 return 0;
1230}
1231
1232/*
1233 * Read the performance counter - simple non-blocking version for now
1234 */
1235static ssize_t
1236perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1237{
1238 u64 values[3];
1239 int n;
1240
1241 /*
1242 * Return end-of-file for a read on a counter that is in
1243 * error state (i.e. because it was pinned but it couldn't be
1244 * scheduled on to the CPU at some point).
1245 */
1246 if (counter->state == PERF_COUNTER_STATE_ERROR)
1247 return 0;
1248
1249 mutex_lock(&counter->mutex);
1250 values[0] = perf_counter_read(counter);
1251 n = 1;
1252 if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1253 values[n++] = counter->total_time_enabled +
1254 atomic64_read(&counter->child_total_time_enabled);
1255 if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1256 values[n++] = counter->total_time_running +
1257 atomic64_read(&counter->child_total_time_running);
1258 mutex_unlock(&counter->mutex);
1259
1260 if (count < n * sizeof(u64))
1261 return -EINVAL;
1262 count = n * sizeof(u64);
1263
1264 if (copy_to_user(buf, values, count))
1265 return -EFAULT;
1266
1267 return count;
1268}
1269
1270static ssize_t
1271perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
1272{
1273 struct perf_counter *counter = file->private_data;
1274
1275 return perf_read_hw(counter, buf, count);
1276}
1277
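/*
 * Report the poll state published by perf_counter_wakeup() and
 * register on the counter's wait queue.
 */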
1278static unsigned int perf_poll(struct file *file, poll_table *wait)
1279{
1280 struct perf_counter *counter = file->private_data;
1281 struct perf_mmap_data *data;
1282 unsigned int events;
1283
1284 rcu_read_lock();
1285 data = rcu_dereference(counter->data);
1286 if (data)
1287 events = atomic_xchg(&data->wakeup, 0);
1288 else
1289 events = POLLHUP;
1290 rcu_read_unlock();
1291
1292 poll_wait(file, &counter->waitq, wait);
1293
1294 return events;
1295}
1296
1297static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1298{
1299 struct perf_counter *counter = file->private_data;
1300 int err = 0;
1301
1302 switch (cmd) {
1303 case PERF_COUNTER_IOC_ENABLE:
1304 perf_counter_enable_family(counter);
1305 break;
1306 case PERF_COUNTER_IOC_DISABLE:
1307 perf_counter_disable_family(counter);
1308 break;
1309 case PERF_COUNTER_IOC_REFRESH:
1310 perf_counter_refresh(counter, arg);
1311 break;
1312 default:
1313 err = -ENOTTY;
1314 }
1315 return err;
1316}
1317
1318/*
1319 * Callers need to ensure there can be no nesting of this function, otherwise
1321 * the seqlock logic goes bad. We cannot serialize this because the arch
1321 * code calls this from NMI context.
1322 */
1323void perf_counter_update_userpage(struct perf_counter *counter)
1324{
1325 struct perf_mmap_data *data;
1326 struct perf_counter_mmap_page *userpg;
1327
1328 rcu_read_lock();
1329 data = rcu_dereference(counter->data);
1330 if (!data)
1331 goto unlock;
1332
1333 userpg = data->user_page;
1334
1335 /*
1336 * Disable preemption so as not to let the corresponding user-space
1337 * spin too long if we get preempted.
1338 */
1339 preempt_disable();
1340 ++userpg->lock;
1341 barrier();
1342 userpg->index = counter->hw.idx;
1343 userpg->offset = atomic64_read(&counter->count);
1344 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
1345 userpg->offset -= atomic64_read(&counter->hw.prev_count);
1346
1347 barrier();
1348 ++userpg->lock;
1349 preempt_enable();
1350unlock:
1351 rcu_read_unlock();
1352}
1353
1354static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1355{
1356 struct perf_counter *counter = vma->vm_file->private_data;
1357 struct perf_mmap_data *data;
1358 int ret = VM_FAULT_SIGBUS;
1359
1360 rcu_read_lock();
1361 data = rcu_dereference(counter->data);
1362 if (!data)
1363 goto unlock;
1364
1365 if (vmf->pgoff == 0) {
1366 vmf->page = virt_to_page(data->user_page);
1367 } else {
1368 int nr = vmf->pgoff - 1;
1369
1370 if ((unsigned)nr > data->nr_pages)
1371 goto unlock;
1372
1373 vmf->page = virt_to_page(data->data_pages[nr]);
1374 }
1375 get_page(vmf->page);
1376 ret = 0;
1377unlock:
1378 rcu_read_unlock();
1379
1380 return ret;
1381}
1382
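/*
 * Allocate the mmap buffer: one zeroed control page (the
 * perf_counter_mmap_page) plus nr_pages zeroed data pages.
 */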
1383static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
1384{
1385 struct perf_mmap_data *data;
1386 unsigned long size;
1387 int i;
1388
1389 WARN_ON(atomic_read(&counter->mmap_count));
1390
1391 size = sizeof(struct perf_mmap_data);
1392 size += nr_pages * sizeof(void *);
1393
1394 data = kzalloc(size, GFP_KERNEL);
1395 if (!data)
1396 goto fail;
1397
1398 data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
1399 if (!data->user_page)
1400 goto fail_user_page;
1401
1402 for (i = 0; i < nr_pages; i++) {
1403 data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
1404 if (!data->data_pages[i])
1405 goto fail_data_pages;
1406 }
1407
1408 data->nr_pages = nr_pages;
1409
1410 rcu_assign_pointer(counter->data, data);
1411
1412 return 0;
1413
1414fail_data_pages:
1415 for (i--; i >= 0; i--)
1416 free_page((unsigned long)data->data_pages[i]);
1417
1418 free_page((unsigned long)data->user_page);
1419
1420fail_user_page:
1421 kfree(data);
1422
1423fail:
1424 return -ENOMEM;
1425}
1426
1427static void __perf_mmap_data_free(struct rcu_head *rcu_head)
1428{
1429 struct perf_mmap_data *data = container_of(rcu_head,
1430 struct perf_mmap_data, rcu_head);
1431 int i;
1432
1433 free_page((unsigned long)data->user_page);
1434 for (i = 0; i < data->nr_pages; i++)
1435 free_page((unsigned long)data->data_pages[i]);
1436 kfree(data);
1437}
1438
1439static void perf_mmap_data_free(struct perf_counter *counter)
1440{
1441 struct perf_mmap_data *data = counter->data;
1442
1443 WARN_ON(atomic_read(&counter->mmap_count));
1444
1445 rcu_assign_pointer(counter->data, NULL);
1446 call_rcu(&data->rcu_head, __perf_mmap_data_free);
1447}
1448
1449static void perf_mmap_open(struct vm_area_struct *vma)
1450{
1451 struct perf_counter *counter = vma->vm_file->private_data;
1452
1453 atomic_inc(&counter->mmap_count);
1454}
1455
1456static void perf_mmap_close(struct vm_area_struct *vma)
1457{
1458 struct perf_counter *counter = vma->vm_file->private_data;
1459
1460 if (atomic_dec_and_mutex_lock(&counter->mmap_count,
1461 &counter->mmap_mutex)) {
1462 vma->vm_mm->locked_vm -= counter->data->nr_pages + 1;
1463 perf_mmap_data_free(counter);
1464 mutex_unlock(&counter->mmap_mutex);
1465 }
1466}
1467
1468static struct vm_operations_struct perf_mmap_vmops = {
1469 .open = perf_mmap_open,
1470 .close = perf_mmap_close,
1471 .fault = perf_mmap_fault,
1472};
1473
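/*
 * Map the counter's ring buffer. The mapping must be shared and
 * read-only, cover one control page plus a power-of-two number of
 * data pages, and stay within RLIMIT_MEMLOCK unless the caller has
 * CAP_IPC_LOCK.
 */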
1474static int perf_mmap(struct file *file, struct vm_area_struct *vma)
1475{
1476 struct perf_counter *counter = file->private_data;
1477 unsigned long vma_size;
1478 unsigned long nr_pages;
1479 unsigned long locked, lock_limit;
1480 int ret = 0;
1481
1482 if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE))
1483 return -EINVAL;
1484
1485 vma_size = vma->vm_end - vma->vm_start;
1486 nr_pages = (vma_size / PAGE_SIZE) - 1;
1487
1488 /*
1489 * If we have data pages, ensure they're a power-of-two number, so we
1490 * can do bitmasks instead of modulo.
1491 */
1492 if (nr_pages != 0 && !is_power_of_2(nr_pages))
1493 return -EINVAL;
1494
1495 if (vma_size != PAGE_SIZE * (1 + nr_pages))
1496 return -EINVAL;
1497
1498 if (vma->vm_pgoff != 0)
1499 return -EINVAL;
1500
1501 mutex_lock(&counter->mmap_mutex);
1502 if (atomic_inc_not_zero(&counter->mmap_count)) {
1503 if (nr_pages != counter->data->nr_pages)
1504 ret = -EINVAL;
1505 goto unlock;
1506 }
1507
1508 locked = vma->vm_mm->locked_vm;
1509 locked += nr_pages + 1;
1510
1511 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
1512 lock_limit >>= PAGE_SHIFT;
1513
1514 if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
1515 ret = -EPERM;
1516 goto unlock;
1517 }
1518
1519 WARN_ON(counter->data);
1520 ret = perf_mmap_data_alloc(counter, nr_pages);
1521 if (ret)
1522 goto unlock;
1523
1524 atomic_set(&counter->mmap_count, 1);
1525 vma->vm_mm->locked_vm += nr_pages + 1;
1526unlock:
1527 mutex_unlock(&counter->mmap_mutex);
1528
1529 vma->vm_flags &= ~VM_MAYWRITE;
1530 vma->vm_flags |= VM_RESERVED;
1531 vma->vm_ops = &perf_mmap_vmops;
1532
1533 return ret;
1534}
1535
1536static int perf_fasync(int fd, struct file *filp, int on)
1537{
1538 struct perf_counter *counter = filp->private_data;
1539 struct inode *inode = filp->f_path.dentry->d_inode;
1540 int retval;
1541
1542 mutex_lock(&inode->i_mutex);
1543 retval = fasync_helper(fd, filp, on, &counter->fasync);
1544 mutex_unlock(&inode->i_mutex);
1545
1546 if (retval < 0)
1547 return retval;
1548
1549 return 0;
1550}
1551
1552static const struct file_operations perf_fops = {
1553 .release = perf_release,
1554 .read = perf_read,
1555 .poll = perf_poll,
1556 .unlocked_ioctl = perf_ioctl,
1557 .compat_ioctl = perf_ioctl,
1558 .mmap = perf_mmap,
1559 .fasync = perf_fasync,
1560};
1561
1562/*
1563 * Perf counter wakeup
1564 *
1565 * If there's data, ensure we set the poll() state and publish everything
1566 * to user-space before waking everybody up.
1567 */
1568
1569void perf_counter_wakeup(struct perf_counter *counter)
1570{
1571 struct perf_mmap_data *data;
1572
1573 rcu_read_lock();
1574 data = rcu_dereference(counter->data);
1575 if (data) {
1576 atomic_set(&data->wakeup, POLL_IN);
1577 /*
1578 * Ensure all data writes are issued before updating the
1579 * user-space data head information. The matching rmb()
1580 * will be in userspace after reading this value.
1581 */
1582 smp_wmb();
1583 data->user_page->data_head = atomic_read(&data->head);
1584 }
1585 rcu_read_unlock();
1586
1587 wake_up_all(&counter->waitq);
1588
1589 if (counter->pending_kill) {
1590 kill_fasync(&counter->fasync, SIGIO, counter->pending_kill);
1591 counter->pending_kill = 0;
1592 }
1593}
1594
1595/*
1596 * Pending wakeups
1597 *
1598 * Handle the case where we need to wake up from NMI (or rq->lock) context.
1599 *
1600 * The NMI bit means we cannot possibly take locks. Therefore, maintain a
1601 * single linked list and use cmpxchg() to add entries lockless.
1602 */
1603
1604static void perf_pending_counter(struct perf_pending_entry *entry)
1605{
1606 struct perf_counter *counter = container_of(entry,
1607 struct perf_counter, pending);
1608
1609 if (counter->pending_disable) {
1610 counter->pending_disable = 0;
1611 perf_counter_disable(counter);
1612 }
1613
1614 if (counter->pending_wakeup) {
1615 counter->pending_wakeup = 0;
1616 perf_counter_wakeup(counter);
1617 }
1618}
1619
1620#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
1621
1622static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
1623 PENDING_TAIL,
1624};
1625
1626static void perf_pending_queue(struct perf_pending_entry *entry,
1627 void (*func)(struct perf_pending_entry *))
1628{
1629 struct perf_pending_entry **head;
1630
1631 if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
1632 return;
1633
1634 entry->func = func;
1635
1636 head = &get_cpu_var(perf_pending_head);
1637
1638 do {
1639 entry->next = *head;
1640 } while (cmpxchg(head, entry->next, entry) != entry->next);
1641
1642 set_perf_counter_pending();
1643
1644 put_cpu_var(perf_pending_head);
1645}
1646
1647static int __perf_pending_run(void)
1648{
1649 struct perf_pending_entry *list;
1650 int nr = 0;
1651
1652 list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
1653 while (list != PENDING_TAIL) {
1654 void (*func)(struct perf_pending_entry *);
1655 struct perf_pending_entry *entry = list;
1656
1657 list = list->next;
1658
1659 func = entry->func;
1660 entry->next = NULL;
1661 /*
1662 * Ensure we observe the unqueue before we issue the wakeup,
1663 * so that we won't be waiting forever.
1664 * -- see perf_not_pending().
1665 */
1666 smp_wmb();
1667
1668 func(entry);
1669 nr++;
1670 }
1671
1672 return nr;
1673}
1674
1675static inline int perf_not_pending(struct perf_counter *counter)
1676{
1677 /*
1678 * If we flush on whatever cpu we are currently running on, there is
1679 * a chance we don't need to wait.
1680 */
1681 get_cpu();
1682 __perf_pending_run();
1683 put_cpu();
1684
1685 /*
1686 * Ensure we see the proper queue state before going to sleep
1687 * so that we do not miss the wakeup. -- see __perf_pending_run()
1688 */
1689 smp_rmb();
1690 return counter->pending.next == NULL;
1691}
1692
1693static void perf_pending_sync(struct perf_counter *counter)
1694{
1695 wait_event(counter->waitq, perf_not_pending(counter));
1696}
1697
1698void perf_counter_do_pending(void)
1699{
1700 __perf_pending_run();
1701}
1702
1703/*
1704 * Callchain support -- arch specific
1705 */
1706
1707__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
1708{
1709 return NULL;
1710}
1711
1712/*
1713 * Output
1714 */
1715
1716struct perf_output_handle {
1717 struct perf_counter *counter;
1718 struct perf_mmap_data *data;
1719 unsigned int offset;
1720 unsigned int head;
1721 int wakeup;
1722 int nmi;
1723 int overflow;
1724};
1725
1726static inline void __perf_output_wakeup(struct perf_output_handle *handle)
1727{
1728 if (handle->nmi) {
1729 handle->counter->pending_wakeup = 1;
1730 perf_pending_queue(&handle->counter->pending,
1731 perf_pending_counter);
1732 } else
1733 perf_counter_wakeup(handle->counter);
1734}
1735
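/*
 * Reserve 'size' bytes in the counter's mmap buffer with a lock-free
 * cmpxchg loop on data->head; fails with -ENOSPC if no buffer (or no
 * data pages) is mapped.
 */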
1736static int perf_output_begin(struct perf_output_handle *handle,
1737 struct perf_counter *counter, unsigned int size,
1738 int nmi, int overflow)
1739{
1740 struct perf_mmap_data *data;
1741 unsigned int offset, head;
1742
1743 rcu_read_lock();
1744 data = rcu_dereference(counter->data);
1745 if (!data)
1746 goto out;
1747
1748 handle->counter = counter;
1749 handle->nmi = nmi;
1750 handle->overflow = overflow;
1751
1752 if (!data->nr_pages)
1753 goto fail;
1754
1755 do {
1756 offset = head = atomic_read(&data->head);
1757 head += size;
1758 } while (atomic_cmpxchg(&data->head, offset, head) != offset);
1759
1760 handle->data = data;
1761 handle->offset = offset;
1762 handle->head = head;
1763 handle->wakeup = (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT);
1764
1765 return 0;
1766
1767fail:
1768 __perf_output_wakeup(handle);
1769out:
1770 rcu_read_unlock();
1771
1772 return -ENOSPC;
1773}
1774
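/*
 * Copy the record into the reserved slot, wrapping across data pages;
 * relies on nr_pages being a power of two so the page index can be
 * computed with a mask.
 */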
1775static void perf_output_copy(struct perf_output_handle *handle,
1776 void *buf, unsigned int len)
1777{
1778 unsigned int pages_mask;
1779 unsigned int offset;
1780 unsigned int size;
1781 void **pages;
1782
1783 offset = handle->offset;
1784 pages_mask = handle->data->nr_pages - 1;
1785 pages = handle->data->data_pages;
1786
1787 do {
1788 unsigned int page_offset;
1789 int nr;
1790
1791 nr = (offset >> PAGE_SHIFT) & pages_mask;
1792 page_offset = offset & (PAGE_SIZE - 1);
1793 size = min_t(unsigned int, PAGE_SIZE - page_offset, len);
1794
1795 memcpy(pages[nr] + page_offset, buf, size);
1796
1797 len -= size;
1798 buf += size;
1799 offset += size;
1800 } while (len);
1801
1802 handle->offset = offset;
1803
1804 WARN_ON_ONCE(handle->offset > handle->head);
1805}
1806
1807#define perf_output_put(handle, x) \
1808 perf_output_copy((handle), &(x), sizeof(x))
1809
1810static void perf_output_end(struct perf_output_handle *handle)
1811{
1812 int wakeup_events = handle->counter->hw_event.wakeup_events;
1813
1814 if (handle->overflow && wakeup_events) {
1815 int events = atomic_inc_return(&handle->data->events);
1816 if (events >= wakeup_events) {
1817 atomic_sub(wakeup_events, &handle->data->events);
1818 __perf_output_wakeup(handle);
1819 }
1820 } else if (handle->wakeup)
1821 __perf_output_wakeup(handle);
1822 rcu_read_unlock();
1823}
1824
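/*
 * Assemble an overflow sample: a header plus whichever fields
 * hw_event.record_type selects (IP, TID, time, address, group
 * readout, callchain), and write it to the mmap buffer.
 */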
1825static void perf_counter_output(struct perf_counter *counter,
1826 int nmi, struct pt_regs *regs, u64 addr)
1827{
1828 int ret;
1829 u64 record_type = counter->hw_event.record_type;
1830 struct perf_output_handle handle;
1831 struct perf_event_header header;
1832 u64 ip;
1833 struct {
1834 u32 pid, tid;
1835 } tid_entry;
1836 struct {
1837 u64 event;
1838 u64 counter;
1839 } group_entry;
1840 struct perf_callchain_entry *callchain = NULL;
1841 int callchain_size = 0;
1842 u64 time;
1843
1844 header.type = 0;
1845 header.size = sizeof(header);
1846
1847 header.misc = PERF_EVENT_MISC_OVERFLOW;
1848 header.misc |= user_mode(regs) ?
1849 PERF_EVENT_MISC_USER : PERF_EVENT_MISC_KERNEL;
1850
1851 if (record_type & PERF_RECORD_IP) {
1852 ip = instruction_pointer(regs);
1853 header.type |= PERF_RECORD_IP;
1854 header.size += sizeof(ip);
1855 }
1856
1857 if (record_type & PERF_RECORD_TID) {
1858 /* namespace issues */
1859 tid_entry.pid = current->group_leader->pid;
1860 tid_entry.tid = current->pid;
1861
1862 header.type |= PERF_RECORD_TID;
1863 header.size += sizeof(tid_entry);
1864 }
1865
1866 if (record_type & PERF_RECORD_TIME) {
1867 /*
1868 * Maybe do better on x86 and provide cpu_clock_nmi()
1869 */
1870 time = sched_clock();
1871
1872 header.type |= PERF_RECORD_TIME;
1873 header.size += sizeof(u64);
1874 }
1875
1876 if (record_type & PERF_RECORD_ADDR) {
1877 header.type |= PERF_RECORD_ADDR;
1878 header.size += sizeof(u64);
1879 }
1880
1881 if (record_type & PERF_RECORD_GROUP) {
1882 header.type |= PERF_RECORD_GROUP;
1883 header.size += sizeof(u64) +
1884 counter->nr_siblings * sizeof(group_entry);
1885 }
1886
1887 if (record_type & PERF_RECORD_CALLCHAIN) {
1888 callchain = perf_callchain(regs);
1889
1890 if (callchain) {
1891 callchain_size = (1 + callchain->nr) * sizeof(u64);
1892
1893 header.type |= PERF_RECORD_CALLCHAIN;
1894 header.size += callchain_size;
1895 }
1896 }
1897
1898 ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
1899 if (ret)
1900 return;
1901
1902 perf_output_put(&handle, header);
1903
1904 if (record_type & PERF_RECORD_IP)
1905 perf_output_put(&handle, ip);
1906
1907 if (record_type & PERF_RECORD_TID)
1908 perf_output_put(&handle, tid_entry);
1909
1910 if (record_type & PERF_RECORD_TIME)
1911 perf_output_put(&handle, time);
1912
1913 if (record_type & PERF_RECORD_ADDR)
1914 perf_output_put(&handle, addr);
1915
1916 if (record_type & PERF_RECORD_GROUP) {
1917 struct perf_counter *leader, *sub;
1918 u64 nr = counter->nr_siblings;
1919
1920 perf_output_put(&handle, nr);
1921
1922 leader = counter->group_leader;
1923 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
1924 if (sub != counter)
1925 sub->hw_ops->read(sub);
1926
1927 group_entry.event = sub->hw_event.config;
1928 group_entry.counter = atomic64_read(&sub->count);
1929
1930 perf_output_put(&handle, group_entry);
1931 }
1932 }
1933
1934 if (callchain)
1935 perf_output_copy(&handle, callchain, callchain_size);
1936
1937 perf_output_end(&handle);
1938}
1939
1940/*
1941 * comm tracking
1942 */
1943
1944struct perf_comm_event {
1945 struct task_struct *task;
1946 char *comm;
1947 int comm_size;
1948
1949 struct {
1950 struct perf_event_header header;
1951
1952 u32 pid;
1953 u32 tid;
1954 } event;
1955};
1956
1957static void perf_counter_comm_output(struct perf_counter *counter,
1958 struct perf_comm_event *comm_event)
1959{
1960 struct perf_output_handle handle;
1961 int size = comm_event->event.header.size;
1962 int ret = perf_output_begin(&handle, counter, size, 0, 0);
1963
1964 if (ret)
1965 return;
1966
1967 perf_output_put(&handle, comm_event->event);
1968 perf_output_copy(&handle, comm_event->comm,
1969 comm_event->comm_size);
1970 perf_output_end(&handle);
1971}
1972
1973static int perf_counter_comm_match(struct perf_counter *counter,
1974 struct perf_comm_event *comm_event)
1975{
1976 if (counter->hw_event.comm &&
1977 comm_event->event.header.type == PERF_EVENT_COMM)
1978 return 1;
1979
1980 return 0;
1981}
1982
1983static void perf_counter_comm_ctx(struct perf_counter_context *ctx,
1984 struct perf_comm_event *comm_event)
1985{
1986 struct perf_counter *counter;
1987
1988 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
1989 return;
1990
1991 rcu_read_lock();
1992 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
1993 if (perf_counter_comm_match(counter, comm_event))
1994 perf_counter_comm_output(counter, comm_event);
1995 }
1996 rcu_read_unlock();
1997}
1998
1999static void perf_counter_comm_event(struct perf_comm_event *comm_event)
2000{
2001 struct perf_cpu_context *cpuctx;
2002 unsigned int size;
2003 char *comm = comm_event->task->comm;
2004
2005 size = ALIGN(strlen(comm)+1, sizeof(u64));
2006
2007 comm_event->comm = comm;
2008 comm_event->comm_size = size;
2009
2010 comm_event->event.header.size = sizeof(comm_event->event) + size;
2011
2012 cpuctx = &get_cpu_var(perf_cpu_context);
2013 perf_counter_comm_ctx(&cpuctx->ctx, comm_event);
2014 put_cpu_var(perf_cpu_context);
2015
2016 perf_counter_comm_ctx(&current->perf_counter_ctx, comm_event);
2017}
2018
2019void perf_counter_comm(struct task_struct *task)
2020{
2021 struct perf_comm_event comm_event;
2022
2023 if (!atomic_read(&nr_comm_tracking))
2024 return;
2025
2026 comm_event = (struct perf_comm_event){
2027 .task = task,
2028 .event = {
2029 .header = { .type = PERF_EVENT_COMM, },
2030 .pid = task->group_leader->pid,
2031 .tid = task->pid,
2032 },
2033 };
2034
2035 perf_counter_comm_event(&comm_event);
2036}
2037
2038/*
2039 * mmap tracking
2040 */
2041
2042struct perf_mmap_event {
2043 struct file *file;
2044 char *file_name;
2045 int file_size;
2046
2047 struct {
2048 struct perf_event_header header;
2049
2050 u32 pid;
2051 u32 tid;
2052 u64 start;
2053 u64 len;
2054 u64 pgoff;
2055 } event;
2056};
2057
2058static void perf_counter_mmap_output(struct perf_counter *counter,
2059 struct perf_mmap_event *mmap_event)
2060{
2061 struct perf_output_handle handle;
2062 int size = mmap_event->event.header.size;
2063 int ret = perf_output_begin(&handle, counter, size, 0, 0);
2064
2065 if (ret)
2066 return;
2067
2068 perf_output_put(&handle, mmap_event->event);
2069 perf_output_copy(&handle, mmap_event->file_name,
2070 mmap_event->file_size);
2071 perf_output_end(&handle);
2072}
2073
2074static int perf_counter_mmap_match(struct perf_counter *counter,
2075 struct perf_mmap_event *mmap_event)
2076{
2077 if (counter->hw_event.mmap &&
2078 mmap_event->event.header.type == PERF_EVENT_MMAP)
2079 return 1;
2080
2081 if (counter->hw_event.munmap &&
2082 mmap_event->event.header.type == PERF_EVENT_MUNMAP)
2083 return 1;
2084
2085 return 0;
2086}
2087
2088static void perf_counter_mmap_ctx(struct perf_counter_context *ctx,
2089 struct perf_mmap_event *mmap_event)
2090{
2091 struct perf_counter *counter;
2092
2093 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
2094 return;
2095
2096 rcu_read_lock();
2097 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
2098 if (perf_counter_mmap_match(counter, mmap_event))
2099 perf_counter_mmap_output(counter, mmap_event);
2100 }
2101 rcu_read_unlock();
2102}
2103
2104static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
2105{
2106 struct perf_cpu_context *cpuctx;
2107 struct file *file = mmap_event->file;
2108 unsigned int size;
2109 char tmp[16];
2110 char *buf = NULL;
2111 char *name;
2112
2113 if (file) {
2114 buf = kzalloc(PATH_MAX, GFP_KERNEL);
2115 if (!buf) {
2116 name = strncpy(tmp, "//enomem", sizeof(tmp));
2117 goto got_name;
2118 }
2119 name = d_path(&file->f_path, buf, PATH_MAX);
2120 if (IS_ERR(name)) {
2121 name = strncpy(tmp, "//toolong", sizeof(tmp));
2122 goto got_name;
2123 }
2124 } else {
2125 name = strncpy(tmp, "//anon", sizeof(tmp));
2126 goto got_name;
2127 }
2128
2129got_name:
2130 size = ALIGN(strlen(name)+1, sizeof(u64));
2131
2132 mmap_event->file_name = name;
2133 mmap_event->file_size = size;
2134
2135 mmap_event->event.header.size = sizeof(mmap_event->event) + size;
2136
2137 cpuctx = &get_cpu_var(perf_cpu_context);
2138 perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event);
2139 put_cpu_var(perf_cpu_context);
2140
2141 perf_counter_mmap_ctx(&current->perf_counter_ctx, mmap_event);
2142
2143 kfree(buf);
2144}
2145
2146void perf_counter_mmap(unsigned long addr, unsigned long len,
2147 unsigned long pgoff, struct file *file)
2148{
2149 struct perf_mmap_event mmap_event;
2150
2151 if (!atomic_read(&nr_mmap_tracking))
2152 return;
2153
2154 mmap_event = (struct perf_mmap_event){
2155 .file = file,
2156 .event = {
2157 .header = { .type = PERF_EVENT_MMAP, },
2158 .pid = current->group_leader->pid,
2159 .tid = current->pid,
2160 .start = addr,
2161 .len = len,
2162 .pgoff = pgoff,
2163 },
2164 };
2165
2166 perf_counter_mmap_event(&mmap_event);
2167}
2168
2169void perf_counter_munmap(unsigned long addr, unsigned long len,
2170 unsigned long pgoff, struct file *file)
2171{
2172 struct perf_mmap_event mmap_event;
2173
2174 if (!atomic_read(&nr_munmap_tracking))
2175 return;
2176
2177 mmap_event = (struct perf_mmap_event){
2178 .file = file,
2179 .event = {
2180 .header = { .type = PERF_EVENT_MUNMAP, },
2181 .pid = current->group_leader->pid,
2182 .tid = current->pid,
2183 .start = addr,
2184 .len = len,
2185 .pgoff = pgoff,
2186 },
2187 };
2188
2189 perf_counter_mmap_event(&mmap_event);
2190}
2191
2192/*
2193 * Generic counter overflow handling.
2194 */
2195
2196int perf_counter_overflow(struct perf_counter *counter,
2197 int nmi, struct pt_regs *regs, u64 addr)
2198{
2199 int events = atomic_read(&counter->event_limit);
2200 int ret = 0;
2201
2202 counter->pending_kill = POLL_IN;
2203 if (events && atomic_dec_and_test(&counter->event_limit)) {
2204 ret = 1;
2205 counter->pending_kill = POLL_HUP;
2206 if (nmi) {
2207 counter->pending_disable = 1;
2208 perf_pending_queue(&counter->pending,
2209 perf_pending_counter);
2210 } else
2211 perf_counter_disable(counter);
2212 }
2213
2214 perf_counter_output(counter, nmi, regs, addr);
2215 return ret;
2216}
2217
2218/*
2219 * Generic software counter infrastructure
2220 */
2221
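/*
 * Fold the delta between hw.count and hw.prev_count into
 * counter->count and take it off the remaining sample period.
 */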
2222static void perf_swcounter_update(struct perf_counter *counter)
2223{
2224 struct hw_perf_counter *hwc = &counter->hw;
2225 u64 prev, now;
2226 s64 delta;
2227
2228again:
2229 prev = atomic64_read(&hwc->prev_count);
2230 now = atomic64_read(&hwc->count);
2231 if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev)
2232 goto again;
2233
2234 delta = now - prev;
2235
2236 atomic64_add(delta, &counter->count);
2237 atomic64_sub(delta, &hwc->period_left);
2238}
2239
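/*
 * Re-arm the counter: start hw.count at -left so that it turns
 * non-negative (and triggers an overflow) once the remaining period
 * has elapsed.
 */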
2240static void perf_swcounter_set_period(struct perf_counter *counter)
2241{
2242 struct hw_perf_counter *hwc = &counter->hw;
2243 s64 left = atomic64_read(&hwc->period_left);
2244 s64 period = hwc->irq_period;
2245
2246 if (unlikely(left <= -period)) {
2247 left = period;
2248 atomic64_set(&hwc->period_left, left);
2249 }
2250
2251 if (unlikely(left <= 0)) {
2252 left += period;
2253 atomic64_add(period, &hwc->period_left);
2254 }
2255
2256 atomic64_set(&hwc->prev_count, -left);
2257 atomic64_set(&hwc->count, -left);
2258}
2259
2260static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
2261{
2262 enum hrtimer_restart ret = HRTIMER_RESTART;
2263 struct perf_counter *counter;
2264 struct pt_regs *regs;
2265
2266 counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
2267 counter->hw_ops->read(counter);
2268
2269 regs = get_irq_regs();
2270 /*
2271 * In case we exclude kernel IPs or are somehow not in interrupt
2272 * context, provide the next best thing, the user IP.
2273 */
2274 if ((counter->hw_event.exclude_kernel || !regs) &&
2275 !counter->hw_event.exclude_user)
2276 regs = task_pt_regs(current);
2277
2278 if (regs) {
2279 if (perf_counter_overflow(counter, 0, regs, 0))
2280 ret = HRTIMER_NORESTART;
2281 }
2282
2283 hrtimer_forward_now(hrtimer, ns_to_ktime(counter->hw.irq_period));
2284
2285 return ret;
2286}
2287
2288static void perf_swcounter_overflow(struct perf_counter *counter,
2289 int nmi, struct pt_regs *regs, u64 addr)
2290{
2291 perf_swcounter_update(counter);
2292 perf_swcounter_set_period(counter);
2293 if (perf_counter_overflow(counter, nmi, regs, addr))
2294 /* soft-disable the counter */
2295 ;
2296
2297}
2298
2299static int perf_swcounter_match(struct perf_counter *counter,
2300 enum perf_event_types type,
2301 u32 event, struct pt_regs *regs)
2302{
2303 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
2304 return 0;
2305
2306 if (perf_event_raw(&counter->hw_event))
2307 return 0;
2308
2309 if (perf_event_type(&counter->hw_event) != type)
2310 return 0;
2311
2312 if (perf_event_id(&counter->hw_event) != event)
2313 return 0;
2314
2315 if (counter->hw_event.exclude_user && user_mode(regs))
2316 return 0;
2317
2318 if (counter->hw_event.exclude_kernel && !user_mode(regs))
2319 return 0;
2320
2321 return 1;
2322}
2323
2324static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
2325 int nmi, struct pt_regs *regs, u64 addr)
2326{
2327 int neg = atomic64_add_negative(nr, &counter->hw.count);
2328 if (counter->hw.irq_period && !neg)
2329 perf_swcounter_overflow(counter, nmi, regs, addr);
2330}
2331
2332static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
2333 enum perf_event_types type, u32 event,
2334 u64 nr, int nmi, struct pt_regs *regs,
2335 u64 addr)
2336{
2337 struct perf_counter *counter;
2338
2339 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
2340 return;
2341
2342 rcu_read_lock();
2343 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
2344 if (perf_swcounter_match(counter, type, event, regs))
2345 perf_swcounter_add(counter, nr, nmi, regs, addr);
2346 }
2347 rcu_read_unlock();
2348}
2349
2350static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
2351{
2352 if (in_nmi())
2353 return &cpuctx->recursion[3];
2354
2355 if (in_irq())
2356 return &cpuctx->recursion[2];
2357
2358 if (in_softirq())
2359 return &cpuctx->recursion[1];
2360
2361 return &cpuctx->recursion[0];
2362}
2363
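/*
 * Count a software event on this cpu and in the current task context.
 * The per-cpu recursion counters (one per context: task, softirq,
 * hardirq, NMI) keep an event raised from within this path from being
 * counted recursively.
 */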
2364static void __perf_swcounter_event(enum perf_event_types type, u32 event,
2365 u64 nr, int nmi, struct pt_regs *regs,
2366 u64 addr)
2367{
2368 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
2369 int *recursion = perf_swcounter_recursion_context(cpuctx);
2370
2371 if (*recursion)
2372 goto out;
2373
2374 (*recursion)++;
2375 barrier();
2376
2377 perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
2378 nr, nmi, regs, addr);
2379 if (cpuctx->task_ctx) {
2380 perf_swcounter_ctx_event(cpuctx->task_ctx, type, event,
2381 nr, nmi, regs, addr);
2382 }
2383
2384 barrier();
2385 (*recursion)--;
2386
2387out:
2388 put_cpu_var(perf_cpu_context);
2389}
2390
2391void
2392perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
2393{
2394 __perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs, addr);
2395}
2396
2397static void perf_swcounter_read(struct perf_counter *counter)
2398{
2399 perf_swcounter_update(counter);
2400}
2401
2402static int perf_swcounter_enable(struct perf_counter *counter)
2403{
2404 perf_swcounter_set_period(counter);
2405 return 0;
2406}
2407
2408static void perf_swcounter_disable(struct perf_counter *counter)
2409{
2410 perf_swcounter_update(counter);
2411}
2412
2413static const struct hw_perf_counter_ops perf_ops_generic = {
2414 .enable = perf_swcounter_enable,
2415 .disable = perf_swcounter_disable,
2416 .read = perf_swcounter_read,
2417};
2418
2419/*
2420 * Software counter: cpu wall time clock
2421 */
2422
2423static void cpu_clock_perf_counter_update(struct perf_counter *counter)
2424{
2425 int cpu = raw_smp_processor_id();
2426 s64 prev;
2427 u64 now;
2428
2429 now = cpu_clock(cpu);
2430 prev = atomic64_read(&counter->hw.prev_count);
2431 atomic64_set(&counter->hw.prev_count, now);
2432 atomic64_add(now - prev, &counter->count);
2433}
2434
2435static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
2436{
2437 struct hw_perf_counter *hwc = &counter->hw;
2438 int cpu = raw_smp_processor_id();
2439
2440 atomic64_set(&hwc->prev_count, cpu_clock(cpu));
2441 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2442 hwc->hrtimer.function = perf_swcounter_hrtimer;
2443 if (hwc->irq_period) {
2444 __hrtimer_start_range_ns(&hwc->hrtimer,
2445 ns_to_ktime(hwc->irq_period), 0,
2446 HRTIMER_MODE_REL, 0);
2447 }
2448
2449 return 0;
2450}
2451
2452static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
2453{
2454 hrtimer_cancel(&counter->hw.hrtimer);
2455 cpu_clock_perf_counter_update(counter);
2456}
2457
2458static void cpu_clock_perf_counter_read(struct perf_counter *counter)
2459{
2460 cpu_clock_perf_counter_update(counter);
2461}
2462
2463static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
2464 .enable = cpu_clock_perf_counter_enable,
2465 .disable = cpu_clock_perf_counter_disable,
2466 .read = cpu_clock_perf_counter_read,
2467};
2468
2469/*
2470 * Software counter: task time clock
2471 */
2472
2473static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
2474{
2475 u64 prev;
2476 s64 delta;
2477
2478 prev = atomic64_xchg(&counter->hw.prev_count, now);
2479 delta = now - prev;
2480 atomic64_add(delta, &counter->count);
2481}
2482
2483static int task_clock_perf_counter_enable(struct perf_counter *counter)
2484{
2485 struct hw_perf_counter *hwc = &counter->hw;
2486 u64 now;
2487
2488 now = counter->ctx->time;
2489
2490 atomic64_set(&hwc->prev_count, now);
2491 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2492 hwc->hrtimer.function = perf_swcounter_hrtimer;
2493 if (hwc->irq_period) {
2494 __hrtimer_start_range_ns(&hwc->hrtimer,
2495 ns_to_ktime(hwc->irq_period), 0,
2496 HRTIMER_MODE_REL, 0);
2497 }
2498
2499 return 0;
2500}
2501
2502static void task_clock_perf_counter_disable(struct perf_counter *counter)
2503{
2504 hrtimer_cancel(&counter->hw.hrtimer);
2505 task_clock_perf_counter_update(counter, counter->ctx->time);
2506
2507}
2508
2509static void task_clock_perf_counter_read(struct perf_counter *counter)
2510{
2511 u64 time;
2512
2513 if (!in_nmi()) {
2514 update_context_time(counter->ctx);
2515 time = counter->ctx->time;
2516 } else {
2517 u64 now = perf_clock();
2518 u64 delta = now - counter->ctx->timestamp;
2519 time = counter->ctx->time + delta;
2520 }
2521
2522 task_clock_perf_counter_update(counter, time);
2523}
2524
2525static const struct hw_perf_counter_ops perf_ops_task_clock = {
2526 .enable = task_clock_perf_counter_enable,
2527 .disable = task_clock_perf_counter_disable,
2528 .read = task_clock_perf_counter_read,
2529};
2530
2531/*
2532 * Software counter: cpu migrations
2533 */
2534
2535static inline u64 get_cpu_migrations(struct perf_counter *counter)
2536{
2537 struct task_struct *curr = counter->ctx->task;
2538
2539 if (curr)
2540 return curr->se.nr_migrations;
2541 return cpu_nr_migrations(smp_processor_id());
2542}
2543
2544static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
2545{
2546 u64 prev, now;
2547 s64 delta;
2548
2549 prev = atomic64_read(&counter->hw.prev_count);
2550 now = get_cpu_migrations(counter);
2551
2552 atomic64_set(&counter->hw.prev_count, now);
2553
2554 delta = now - prev;
2555
2556 atomic64_add(delta, &counter->count);
2557}
2558
2559static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
2560{
2561 cpu_migrations_perf_counter_update(counter);
2562}
2563
2564static int cpu_migrations_perf_counter_enable(struct perf_counter *counter)
2565{
2566 if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
2567 atomic64_set(&counter->hw.prev_count,
2568 get_cpu_migrations(counter));
2569 return 0;
2570}
2571
2572static void cpu_migrations_perf_counter_disable(struct perf_counter *counter)
2573{
2574 cpu_migrations_perf_counter_update(counter);
2575}
2576
2577static const struct hw_perf_counter_ops perf_ops_cpu_migrations = {
2578 .enable = cpu_migrations_perf_counter_enable,
2579 .disable = cpu_migrations_perf_counter_disable,
2580 .read = cpu_migrations_perf_counter_read,
2581};
2582
2583#ifdef CONFIG_EVENT_PROFILE
2584void perf_tpcounter_event(int event_id)
2585{
2586 struct pt_regs *regs = get_irq_regs();
2587
2588 if (!regs)
2589 regs = task_pt_regs(current);
2590
2591 __perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs, 0);
2592}
2593EXPORT_SYMBOL_GPL(perf_tpcounter_event);
2594
2595extern int ftrace_profile_enable(int);
2596extern void ftrace_profile_disable(int);
2597
2598static void tp_perf_counter_destroy(struct perf_counter *counter)
2599{
2600 ftrace_profile_disable(perf_event_id(&counter->hw_event));
2601}
2602
2603static const struct hw_perf_counter_ops *
2604tp_perf_counter_init(struct perf_counter *counter)
2605{
2606 int event_id = perf_event_id(&counter->hw_event);
2607 int ret;
2608
2609 ret = ftrace_profile_enable(event_id);
2610 if (ret)
2611 return NULL;
2612
2613 counter->destroy = tp_perf_counter_destroy;
2614 counter->hw.irq_period = counter->hw_event.irq_period;
2615
2616 return &perf_ops_generic;
2617}
2618#else
2619static const struct hw_perf_counter_ops *
2620tp_perf_counter_init(struct perf_counter *counter)
2621{
2622 return NULL;
2623}
2624#endif
2625
2626static const struct hw_perf_counter_ops *
2627sw_perf_counter_init(struct perf_counter *counter)
2628{
2629 struct perf_counter_hw_event *hw_event = &counter->hw_event;
2630 const struct hw_perf_counter_ops *hw_ops = NULL;
2631 struct hw_perf_counter *hwc = &counter->hw;
2632
2633 /*
2634 * Software counters (currently) can't in general distinguish
2635 * between user, kernel and hypervisor events.
2636 * However, context switches and cpu migrations are considered
2637 * to be kernel events, and page faults are never hypervisor
2638 * events.
2639 */
2640 switch (perf_event_id(&counter->hw_event)) {
2641 case PERF_COUNT_CPU_CLOCK:
2642 hw_ops = &perf_ops_cpu_clock;
2643
2644 if (hw_event->irq_period && hw_event->irq_period < 10000)
2645 hw_event->irq_period = 10000;
2646 break;
2647 case PERF_COUNT_TASK_CLOCK:
2648 /*
2649 * If the user instantiates this as a per-cpu counter,
2650 * use the cpu_clock counter instead.
2651 */
2652 if (counter->ctx->task)
2653 hw_ops = &perf_ops_task_clock;
2654 else
2655 hw_ops = &perf_ops_cpu_clock;
2656
2657 if (hw_event->irq_period && hw_event->irq_period < 10000)
2658 hw_event->irq_period = 10000;
2659 break;
2660 case PERF_COUNT_PAGE_FAULTS:
2661 case PERF_COUNT_PAGE_FAULTS_MIN:
2662 case PERF_COUNT_PAGE_FAULTS_MAJ:
2663 case PERF_COUNT_CONTEXT_SWITCHES:
2664 hw_ops = &perf_ops_generic;
2665 break;
2666 case PERF_COUNT_CPU_MIGRATIONS:
2667 if (!counter->hw_event.exclude_kernel)
2668 hw_ops = &perf_ops_cpu_migrations;
2669 break;
2670 }
2671
2672 if (hw_ops)
2673 hwc->irq_period = hw_event->irq_period;
2674
2675 return hw_ops;
2676}
2677
2678/*
2679 * Allocate and initialize a counter structure
2680 */
2681static struct perf_counter *
2682perf_counter_alloc(struct perf_counter_hw_event *hw_event,
2683 int cpu,
2684 struct perf_counter_context *ctx,
2685 struct perf_counter *group_leader,
2686 gfp_t gfpflags)
2687{
2688 const struct hw_perf_counter_ops *hw_ops;
2689 struct perf_counter *counter;
2690 long err;
2691
2692 counter = kzalloc(sizeof(*counter), gfpflags);
2693 if (!counter)
2694 return ERR_PTR(-ENOMEM);
2695
2696 /*
2697 * Single counters are their own group leaders, with an
2698 * empty sibling list:
2699 */
2700 if (!group_leader)
2701 group_leader = counter;
2702
2703 mutex_init(&counter->mutex);
2704 INIT_LIST_HEAD(&counter->list_entry);
2705 INIT_LIST_HEAD(&counter->event_entry);
2706 INIT_LIST_HEAD(&counter->sibling_list);
2707 init_waitqueue_head(&counter->waitq);
2708
2709 mutex_init(&counter->mmap_mutex);
2710
2711 INIT_LIST_HEAD(&counter->child_list);
2712
2713 counter->cpu = cpu;
2714 counter->hw_event = *hw_event;
2715 counter->group_leader = group_leader;
2716 counter->hw_ops = NULL;
2717 counter->ctx = ctx;
2718
2719 counter->state = PERF_COUNTER_STATE_INACTIVE;
2720 if (hw_event->disabled)
2721 counter->state = PERF_COUNTER_STATE_OFF;
2722
2723 hw_ops = NULL;
2724
2725 if (perf_event_raw(hw_event)) {
2726 hw_ops = hw_perf_counter_init(counter);
2727 goto done;
2728 }
2729
2730 switch (perf_event_type(hw_event)) {
2731 case PERF_TYPE_HARDWARE:
2732 hw_ops = hw_perf_counter_init(counter);
2733 break;
2734
2735 case PERF_TYPE_SOFTWARE:
2736 hw_ops = sw_perf_counter_init(counter);
2737 break;
2738
2739 case PERF_TYPE_TRACEPOINT:
2740 hw_ops = tp_perf_counter_init(counter);
2741 break;
2742 }
2743done:
2744 err = 0;
2745 if (!hw_ops)
2746 err = -EINVAL;
2747 else if (IS_ERR(hw_ops))
2748 err = PTR_ERR(hw_ops);
2749
2750 if (err) {
2751 kfree(counter);
2752 return ERR_PTR(err);
2753 }
2754
2755 counter->hw_ops = hw_ops;
2756
2757 if (counter->hw_event.mmap)
2758 atomic_inc(&nr_mmap_tracking);
2759 if (counter->hw_event.munmap)
2760 atomic_inc(&nr_munmap_tracking);
2761 if (counter->hw_event.comm)
2762 atomic_inc(&nr_comm_tracking);
2763
2764 return counter;
2765}
2766
2767/**
2768 * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
2769 *
2770 * @hw_event_uptr: event type attributes for monitoring/sampling
2771 * @pid: target pid
2772 * @cpu: target cpu
2773 * @group_fd: group leader counter fd
2774 */
2775SYSCALL_DEFINE5(perf_counter_open,
2776 const struct perf_counter_hw_event __user *, hw_event_uptr,
2777 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
2778{
2779 struct perf_counter *counter, *group_leader;
2780 struct perf_counter_hw_event hw_event;
2781 struct perf_counter_context *ctx;
2782 struct file *counter_file = NULL;
2783 struct file *group_file = NULL;
2784 int fput_needed = 0;
2785 int fput_needed2 = 0;
2786 int ret;
2787
2788 /* for future expandability... */
2789 if (flags)
2790 return -EINVAL;
2791
2792 if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
2793 return -EFAULT;
2794
2795 /*
2796 * Get the target context (task or percpu):
2797 */
2798 ctx = find_get_context(pid, cpu);
2799 if (IS_ERR(ctx))
2800 return PTR_ERR(ctx);
2801
2802 /*
2803 * Look up the group leader (we will attach this counter to it):
2804 */
2805 group_leader = NULL;
2806 if (group_fd != -1) {
2807 ret = -EINVAL;
2808 group_file = fget_light(group_fd, &fput_needed);
2809 if (!group_file)
2810 goto err_put_context;
2811 if (group_file->f_op != &perf_fops)
2812 goto err_put_context;
2813
2814 group_leader = group_file->private_data;
2815 /*
2816 * Do not allow a recursive hierarchy (this new sibling
2817 * becoming part of another group-sibling):
2818 */
2819 if (group_leader->group_leader != group_leader)
2820 goto err_put_context;
2821 /*
2822 * Do not allow to attach to a group in a different
2823 * task or CPU context:
2824 */
2825 if (group_leader->ctx != ctx)
2826 goto err_put_context;
2827 /*
2828 * Only a group leader can be exclusive or pinned
2829 */
2830 if (hw_event.exclusive || hw_event.pinned)
2831 goto err_put_context;
2832 }
2833
2834 counter = perf_counter_alloc(&hw_event, cpu, ctx, group_leader,
2835 GFP_KERNEL);
2836 ret = PTR_ERR(counter);
2837 if (IS_ERR(counter))
2838 goto err_put_context;
2839
2840 ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
2841 if (ret < 0)
2842 goto err_free_put_context;
2843
2844 counter_file = fget_light(ret, &fput_needed2);
2845 if (!counter_file)
2846 goto err_free_put_context;
2847
2848 counter->filp = counter_file;
2849 mutex_lock(&ctx->mutex);
2850 perf_install_in_context(ctx, counter, cpu);
2851 mutex_unlock(&ctx->mutex);
2852
2853 fput_light(counter_file, fput_needed2);
2854
2855out_fput:
2856 fput_light(group_file, fput_needed);
2857
2858 return ret;
2859
2860err_free_put_context:
2861 kfree(counter);
2862
2863err_put_context:
2864 put_context(ctx);
2865
2866 goto out_fput;
2867}
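/*
 * Illustrative sketch, not part of this file: one plausible way for a
 * userspace program to drive sys_perf_counter_open() above.  It assumes
 * an architecture that has wired up __NR_perf_counter_open and uses the
 * perf_counter_hw_event layout from <linux/perf_counter.h>; the
 * event-selection fields are left zeroed here because their exact
 * encoding lives in that header.
 *
 *	#include <string.h>
 *	#include <unistd.h>
 *	#include <sys/syscall.h>
 *	#include <linux/perf_counter.h>
 *
 *	int open_counter_for_self(void)
 *	{
 *		struct perf_counter_hw_event hw_event;
 *		unsigned long long count;
 *		int fd;
 *
 *		memset(&hw_event, 0, sizeof(hw_event));
 *
 *		fd = syscall(__NR_perf_counter_open, &hw_event,
 *			     0,		// pid 0: the calling task
 *			     -1,	// cpu -1: follow the task on any CPU
 *			     -1,	// group_fd -1: no group leader
 *			     0);	// flags must be 0 (checked above)
 *		if (fd >= 0)
 *			read(fd, &count, sizeof(count));	// current value
 *		return fd;
 *	}
 */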
2868
2869/*
2870 * Initialize the perf_counter context in a task_struct:
2871 */
2872static void
2873__perf_counter_init_context(struct perf_counter_context *ctx,
2874 struct task_struct *task)
2875{
2876 memset(ctx, 0, sizeof(*ctx));
2877 spin_lock_init(&ctx->lock);
2878 mutex_init(&ctx->mutex);
2879 INIT_LIST_HEAD(&ctx->counter_list);
2880 INIT_LIST_HEAD(&ctx->event_list);
2881 ctx->task = task;
2882}
2883
2884/*
2885 * inherit a counter from parent task to child task:
2886 */
2887static struct perf_counter *
2888inherit_counter(struct perf_counter *parent_counter,
2889 struct task_struct *parent,
2890 struct perf_counter_context *parent_ctx,
2891 struct task_struct *child,
2892 struct perf_counter *group_leader,
2893 struct perf_counter_context *child_ctx)
2894{
2895 struct perf_counter *child_counter;
2896
2897 /*
2898 * Instead of creating recursive hierarchies of counters,
2899 * we link inherited counters back to the original parent,
2900 * which has a filp for sure, which we use as the reference
2901 * count:
2902 */
2903 if (parent_counter->parent)
2904 parent_counter = parent_counter->parent;
2905
2906 child_counter = perf_counter_alloc(&parent_counter->hw_event,
2907 parent_counter->cpu, child_ctx,
2908 group_leader, GFP_KERNEL);
2909 if (IS_ERR(child_counter))
2910 return child_counter;
2911
2912 /*
2913 * Link it up in the child's context:
2914 */
2915 child_counter->task = child;
2916 add_counter_to_ctx(child_counter, child_ctx);
2917
2918 child_counter->parent = parent_counter;
2919 /*
2920 * inherit into child's child as well:
2921 */
2922 child_counter->hw_event.inherit = 1;
2923
2924 /*
2925 * Get a reference to the parent filp - we will fput it
2926 * when the child counter exits. This is safe to do because
2927 * we are in the parent and we know that the filp still
2928 * exists and has a nonzero count:
2929 */
2930 atomic_long_inc(&parent_counter->filp->f_count);
2931
2932 /*
2933 * Link this into the parent counter's child list
2934 */
2935 mutex_lock(&parent_counter->mutex);
2936 list_add_tail(&child_counter->child_list, &parent_counter->child_list);
2937
2938 /*
2939 * Make the child state follow the state of the parent counter,
2940 * not its hw_event.disabled bit. We hold the parent's mutex,
2941 * so we won't race with perf_counter_{en,dis}able_family.
2942 */
2943 if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
2944 child_counter->state = PERF_COUNTER_STATE_INACTIVE;
2945 else
2946 child_counter->state = PERF_COUNTER_STATE_OFF;
2947
2948 mutex_unlock(&parent_counter->mutex);
2949
2950 return child_counter;
2951}
2952
2953static int inherit_group(struct perf_counter *parent_counter,
2954 struct task_struct *parent,
2955 struct perf_counter_context *parent_ctx,
2956 struct task_struct *child,
2957 struct perf_counter_context *child_ctx)
2958{
2959 struct perf_counter *leader;
2960 struct perf_counter *sub;
2961 struct perf_counter *child_ctr;
2962
2963 leader = inherit_counter(parent_counter, parent, parent_ctx,
2964 child, NULL, child_ctx);
2965 if (IS_ERR(leader))
2966 return PTR_ERR(leader);
2967 list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
2968 child_ctr = inherit_counter(sub, parent, parent_ctx,
2969 child, leader, child_ctx);
2970 if (IS_ERR(child_ctr))
2971 return PTR_ERR(child_ctr);
2972 }
2973 return 0;
2974}
2975
2976static void sync_child_counter(struct perf_counter *child_counter,
2977 struct perf_counter *parent_counter)
2978{
2979 u64 parent_val, child_val;
2980
2981 parent_val = atomic64_read(&parent_counter->count);
2982 child_val = atomic64_read(&child_counter->count);
2983
2984 /*
2985 * Add back the child's count to the parent's count:
2986 */
2987 atomic64_add(child_val, &parent_counter->count);
2988 atomic64_add(child_counter->total_time_enabled,
2989 &parent_counter->child_total_time_enabled);
2990 atomic64_add(child_counter->total_time_running,
2991 &parent_counter->child_total_time_running);
2992
2993 /*
2994 * Remove this counter from the parent's list
2995 */
2996 mutex_lock(&parent_counter->mutex);
2997 list_del_init(&child_counter->child_list);
2998 mutex_unlock(&parent_counter->mutex);
2999
3000 /*
3001 * Release the parent counter, if this was the last
3002 * reference to it.
3003 */
3004 fput(parent_counter->filp);
3005}
3006
3007static void
3008__perf_counter_exit_task(struct task_struct *child,
3009 struct perf_counter *child_counter,
3010 struct perf_counter_context *child_ctx)
3011{
3012 struct perf_counter *parent_counter;
3013 struct perf_counter *sub, *tmp;
3014
3015 /*
3016 * If we do not self-reap then we have to wait for the
3017 * child task to unschedule (it will happen for sure),
3018 * so that its counter is at its final count. (This
3019 * condition triggers rarely - child tasks usually get
3020 * off their CPU before the parent has a chance to
3021 * get this far into the reaping action)
3022 */
3023 if (child != current) {
3024 wait_task_inactive(child, 0);
3025 list_del_init(&child_counter->list_entry);
3026 update_counter_times(child_counter);
3027 } else {
3028 struct perf_cpu_context *cpuctx;
3029 unsigned long flags;
3030 u64 perf_flags;
3031
3032 /*
3033 * Disable and unlink this counter.
3034 *
3035 * Be careful about zapping the list - IRQ/NMI context
3036 * could still be processing it:
3037 */
3038 local_irq_save(flags);
3039 perf_flags = hw_perf_save_disable();
3040
3041 cpuctx = &__get_cpu_var(perf_cpu_context);
3042
3043 group_sched_out(child_counter, cpuctx, child_ctx);
3044 update_counter_times(child_counter);
3045
3046 list_del_init(&child_counter->list_entry);
3047
3048 child_ctx->nr_counters--;
3049
3050 hw_perf_restore(perf_flags);
3051 local_irq_restore(flags);
3052 }
3053
3054 parent_counter = child_counter->parent;
3055 /*
3056 * It can happen that parent exits first, and has counters
3057 * that are still around due to the child reference. These
3058 * counters need to be zapped - but otherwise linger.
3059 */
3060 if (parent_counter) {
3061 sync_child_counter(child_counter, parent_counter);
3062 list_for_each_entry_safe(sub, tmp, &child_counter->sibling_list,
3063 list_entry) {
3064 if (sub->parent) {
3065 sync_child_counter(sub, sub->parent);
3066 free_counter(sub);
3067 }
3068 }
3069 free_counter(child_counter);
3070 }
3071}
3072
3073/*
3074 * When a child task exits, feed back counter values to parent counters.
3075 *
3076 * Note: we may be running in child context, but the PID is not hashed
3077 * anymore so new counters will not be added.
3078 */
3079void perf_counter_exit_task(struct task_struct *child)
3080{
3081 struct perf_counter *child_counter, *tmp;
3082 struct perf_counter_context *child_ctx;
3083
3084 child_ctx = &child->perf_counter_ctx;
3085
3086 if (likely(!child_ctx->nr_counters))
3087 return;
3088
3089 list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
3090 list_entry)
3091 __perf_counter_exit_task(child, child_counter, child_ctx);
3092}
3093
3094/*
3095 * Initialize the perf_counter context in task_struct
3096 */
3097void perf_counter_init_task(struct task_struct *child)
3098{
3099 struct perf_counter_context *child_ctx, *parent_ctx;
3100 struct perf_counter *counter;
3101 struct task_struct *parent = current;
3102
3103 child_ctx = &child->perf_counter_ctx;
3104 parent_ctx = &parent->perf_counter_ctx;
3105
3106 __perf_counter_init_context(child_ctx, child);
3107
3108 /*
3109 * This is executed from the parent task context, so inherit
3110 * counters that have been marked for cloning:
3111 */
3112
3113 if (likely(!parent_ctx->nr_counters))
3114 return;
3115
3116 /*
3117 * Lock the parent list. No need to lock the child - not PID
3118 * hashed yet and not running, so nobody can access it.
3119 */
3120 mutex_lock(&parent_ctx->mutex);
3121
3122 /*
3123	 * We don't have to disable NMIs - we are only looking at
3124 * the list, not manipulating it:
3125 */
3126 list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) {
3127 if (!counter->hw_event.inherit)
3128 continue;
3129
3130 if (inherit_group(counter, parent,
3131 parent_ctx, child, child_ctx))
3132 break;
3133 }
3134
3135 mutex_unlock(&parent_ctx->mutex);
3136}
3137
3138static void __cpuinit perf_counter_init_cpu(int cpu)
3139{
3140 struct perf_cpu_context *cpuctx;
3141
3142 cpuctx = &per_cpu(perf_cpu_context, cpu);
3143 __perf_counter_init_context(&cpuctx->ctx, NULL);
3144
3145 mutex_lock(&perf_resource_mutex);
3146 cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
3147 mutex_unlock(&perf_resource_mutex);
3148
3149 hw_perf_counter_setup(cpu);
3150}
3151
3152#ifdef CONFIG_HOTPLUG_CPU
3153static void __perf_counter_exit_cpu(void *info)
3154{
3155 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
3156 struct perf_counter_context *ctx = &cpuctx->ctx;
3157 struct perf_counter *counter, *tmp;
3158
3159 list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
3160 __perf_counter_remove_from_context(counter);
3161}
3162static void perf_counter_exit_cpu(int cpu)
3163{
3164 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
3165 struct perf_counter_context *ctx = &cpuctx->ctx;
3166
3167 mutex_lock(&ctx->mutex);
3168 smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
3169 mutex_unlock(&ctx->mutex);
3170}
3171#else
3172static inline void perf_counter_exit_cpu(int cpu) { }
3173#endif
3174
3175static int __cpuinit
3176perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
3177{
3178 unsigned int cpu = (long)hcpu;
3179
3180 switch (action) {
3181
3182 case CPU_UP_PREPARE:
3183 case CPU_UP_PREPARE_FROZEN:
3184 perf_counter_init_cpu(cpu);
3185 break;
3186
3187 case CPU_DOWN_PREPARE:
3188 case CPU_DOWN_PREPARE_FROZEN:
3189 perf_counter_exit_cpu(cpu);
3190 break;
3191
3192 default:
3193 break;
3194 }
3195
3196 return NOTIFY_OK;
3197}
3198
3199static struct notifier_block __cpuinitdata perf_cpu_nb = {
3200 .notifier_call = perf_cpu_notify,
3201};
3202
3203static int __init perf_counter_init(void)
3204{
3205 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
3206 (void *)(long)smp_processor_id());
3207 register_cpu_notifier(&perf_cpu_nb);
3208
3209 return 0;
3210}
3211early_initcall(perf_counter_init);
3212
3213static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
3214{
3215 return sprintf(buf, "%d\n", perf_reserved_percpu);
3216}
3217
3218static ssize_t
3219perf_set_reserve_percpu(struct sysdev_class *class,
3220 const char *buf,
3221 size_t count)
3222{
3223 struct perf_cpu_context *cpuctx;
3224 unsigned long val;
3225 int err, cpu, mpt;
3226
3227 err = strict_strtoul(buf, 10, &val);
3228 if (err)
3229 return err;
3230 if (val > perf_max_counters)
3231 return -EINVAL;
3232
3233 mutex_lock(&perf_resource_mutex);
3234 perf_reserved_percpu = val;
3235 for_each_online_cpu(cpu) {
3236 cpuctx = &per_cpu(perf_cpu_context, cpu);
3237 spin_lock_irq(&cpuctx->ctx.lock);
3238 mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
3239 perf_max_counters - perf_reserved_percpu);
3240 cpuctx->max_pertask = mpt;
3241 spin_unlock_irq(&cpuctx->ctx.lock);
3242 }
3243 mutex_unlock(&perf_resource_mutex);
3244
3245 return count;
3246}
3247
3248static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
3249{
3250 return sprintf(buf, "%d\n", perf_overcommit);
3251}
3252
3253static ssize_t
3254perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
3255{
3256 unsigned long val;
3257 int err;
3258
3259 err = strict_strtoul(buf, 10, &val);
3260 if (err)
3261 return err;
3262 if (val > 1)
3263 return -EINVAL;
3264
3265 mutex_lock(&perf_resource_mutex);
3266 perf_overcommit = val;
3267 mutex_unlock(&perf_resource_mutex);
3268
3269 return count;
3270}
3271
3272static SYSDEV_CLASS_ATTR(
3273 reserve_percpu,
3274 0644,
3275 perf_show_reserve_percpu,
3276 perf_set_reserve_percpu
3277 );
3278
3279static SYSDEV_CLASS_ATTR(
3280 overcommit,
3281 0644,
3282 perf_show_overcommit,
3283 perf_set_overcommit
3284 );
3285
3286static struct attribute *perfclass_attrs[] = {
3287 &attr_reserve_percpu.attr,
3288 &attr_overcommit.attr,
3289 NULL
3290};
3291
3292static struct attribute_group perfclass_attr_group = {
3293 .attrs = perfclass_attrs,
3294 .name = "perf_counters",
3295};
3296
3297static int __init perf_counter_sysfs_init(void)
3298{
3299 return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
3300 &perfclass_attr_group);
3301}
3302device_initcall(perf_counter_sysfs_init);
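The two sysdev class attributes registered above surface as writable files under the cpu sysdev class; assuming the usual sysfs mount, that would be /sys/devices/system/cpu/perf_counters/reserve_percpu and .../overcommit (the path is inferred from the attribute group name above, not verified here). A minimal userspace sketch, with set_perf_sysfs() being a hypothetical helper:

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	/* Write a small integer to one of the perf_counters sysfs
	 * attributes created by perf_counter_sysfs_init() above. */
	static int set_perf_sysfs(const char *attr, int val)
	{
		char path[128], buf[16];
		int fd;

		snprintf(path, sizeof(path),
			 "/sys/devices/system/cpu/perf_counters/%s", attr);
		snprintf(buf, sizeof(buf), "%d\n", val);

		fd = open(path, O_WRONLY);
		if (fd < 0)
			return -1;
		if (write(fd, buf, strlen(buf)) < 0) {
			close(fd);
			return -1;
		}
		close(fd);
		return 0;
	}

For example, set_perf_sysfs("reserve_percpu", 2) sets the per-CPU reservation (values above perf_max_counters are rejected by perf_set_reserve_percpu()), and set_perf_sysfs("overcommit", 1) sets the overcommit flag (only 0 or 1 is accepted).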
diff --git a/kernel/sched.c b/kernel/sched.c
index b902e587a3a0..2f600e30dcf0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -584,6 +584,7 @@ struct rq {
584 struct load_weight load; 584 struct load_weight load;
585 unsigned long nr_load_updates; 585 unsigned long nr_load_updates;
586 u64 nr_switches; 586 u64 nr_switches;
587 u64 nr_migrations_in;
587 588
588 struct cfs_rq cfs; 589 struct cfs_rq cfs;
589 struct rt_rq rt; 590 struct rt_rq rt;
@@ -692,7 +693,7 @@ static inline int cpu_of(struct rq *rq)
692#define task_rq(p) cpu_rq(task_cpu(p)) 693#define task_rq(p) cpu_rq(task_cpu(p))
693#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 694#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
694 695
695static inline void update_rq_clock(struct rq *rq) 696inline void update_rq_clock(struct rq *rq)
696{ 697{
697 rq->clock = sched_clock_cpu(cpu_of(rq)); 698 rq->clock = sched_clock_cpu(cpu_of(rq));
698} 699}
@@ -1967,12 +1968,15 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1967 p->se.sleep_start -= clock_offset; 1968 p->se.sleep_start -= clock_offset;
1968 if (p->se.block_start) 1969 if (p->se.block_start)
1969 p->se.block_start -= clock_offset; 1970 p->se.block_start -= clock_offset;
1971#endif
1970 if (old_cpu != new_cpu) { 1972 if (old_cpu != new_cpu) {
1971 schedstat_inc(p, se.nr_migrations); 1973 p->se.nr_migrations++;
1974 new_rq->nr_migrations_in++;
1975#ifdef CONFIG_SCHEDSTATS
1972 if (task_hot(p, old_rq->clock, NULL)) 1976 if (task_hot(p, old_rq->clock, NULL))
1973 schedstat_inc(p, se.nr_forced2_migrations); 1977 schedstat_inc(p, se.nr_forced2_migrations);
1974 }
1975#endif 1978#endif
1979 }
1976 p->se.vruntime -= old_cfsrq->min_vruntime - 1980 p->se.vruntime -= old_cfsrq->min_vruntime -
1977 new_cfsrq->min_vruntime; 1981 new_cfsrq->min_vruntime;
1978 1982
@@ -2324,6 +2328,27 @@ static int sched_balance_self(int cpu, int flag)
2324 2328
2325#endif /* CONFIG_SMP */ 2329#endif /* CONFIG_SMP */
2326 2330
2331/**
2332 * task_oncpu_function_call - call a function on the cpu on which a task runs
2333 * @p: the task to evaluate
2334 * @func: the function to be called
2335 * @info: the function call argument
2336 *
2337 * Calls the function @func when the task is currently running. This might
2338 * be on the current CPU, which just calls the function directly
2339 */
2340void task_oncpu_function_call(struct task_struct *p,
2341 void (*func) (void *info), void *info)
2342{
2343 int cpu;
2344
2345 preempt_disable();
2346 cpu = task_cpu(p);
2347 if (task_curr(p))
2348 smp_call_function_single(cpu, func, info, 1);
2349 preempt_enable();
2350}
2351
2327/*** 2352/***
2328 * try_to_wake_up - wake up a thread 2353 * try_to_wake_up - wake up a thread
2329 * @p: the to-be-woken-up thread 2354 * @p: the to-be-woken-up thread
@@ -2480,6 +2505,7 @@ static void __sched_fork(struct task_struct *p)
2480 p->se.exec_start = 0; 2505 p->se.exec_start = 0;
2481 p->se.sum_exec_runtime = 0; 2506 p->se.sum_exec_runtime = 0;
2482 p->se.prev_sum_exec_runtime = 0; 2507 p->se.prev_sum_exec_runtime = 0;
2508 p->se.nr_migrations = 0;
2483 p->se.last_wakeup = 0; 2509 p->se.last_wakeup = 0;
2484 p->se.avg_overlap = 0; 2510 p->se.avg_overlap = 0;
2485 p->se.start_runtime = 0; 2511 p->se.start_runtime = 0;
@@ -2710,6 +2736,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2710 */ 2736 */
2711 prev_state = prev->state; 2737 prev_state = prev->state;
2712 finish_arch_switch(prev); 2738 finish_arch_switch(prev);
2739 perf_counter_task_sched_in(current, cpu_of(rq));
2713 finish_lock_switch(rq, prev); 2740 finish_lock_switch(rq, prev);
2714#ifdef CONFIG_SMP 2741#ifdef CONFIG_SMP
2715 if (post_schedule) 2742 if (post_schedule)
@@ -2872,6 +2899,15 @@ unsigned long nr_active(void)
2872} 2899}
2873 2900
2874/* 2901/*
2902 * Externally visible per-cpu scheduler statistics:
2903 * cpu_nr_migrations(cpu) - number of migrations into that cpu
2904 */
2905u64 cpu_nr_migrations(int cpu)
2906{
2907 return cpu_rq(cpu)->nr_migrations_in;
2908}
2909
2910/*
2875 * Update rq->cpu_load[] statistics. This function is usually called every 2911 * Update rq->cpu_load[] statistics. This function is usually called every
2876 * scheduler tick (TICK_NSEC). 2912 * scheduler tick (TICK_NSEC).
2877 */ 2913 */
@@ -4838,6 +4874,7 @@ void scheduler_tick(void)
4838 update_rq_clock(rq); 4874 update_rq_clock(rq);
4839 update_cpu_load(rq); 4875 update_cpu_load(rq);
4840 curr->sched_class->task_tick(rq, curr, 0); 4876 curr->sched_class->task_tick(rq, curr, 0);
4877 perf_counter_task_tick(curr, cpu);
4841 spin_unlock(&rq->lock); 4878 spin_unlock(&rq->lock);
4842 4879
4843#ifdef CONFIG_SMP 4880#ifdef CONFIG_SMP
@@ -5053,6 +5090,7 @@ need_resched_nonpreemptible:
5053 5090
5054 if (likely(prev != next)) { 5091 if (likely(prev != next)) {
5055 sched_info_switch(prev, next); 5092 sched_info_switch(prev, next);
5093 perf_counter_task_sched_out(prev, cpu);
5056 5094
5057 rq->nr_switches++; 5095 rq->nr_switches++;
5058 rq->curr = next; 5096 rq->curr = next;
diff --git a/kernel/sys.c b/kernel/sys.c
index e7998cf31498..438d99a38c87 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -14,6 +14,7 @@
14#include <linux/prctl.h> 14#include <linux/prctl.h>
15#include <linux/highuid.h> 15#include <linux/highuid.h>
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/perf_counter.h>
17#include <linux/resource.h> 18#include <linux/resource.h>
18#include <linux/kernel.h> 19#include <linux/kernel.h>
19#include <linux/kexec.h> 20#include <linux/kexec.h>
@@ -1793,6 +1794,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1793 case PR_SET_TSC: 1794 case PR_SET_TSC:
1794 error = SET_TSC_CTL(arg2); 1795 error = SET_TSC_CTL(arg2);
1795 break; 1796 break;
1797 case PR_TASK_PERF_COUNTERS_DISABLE:
1798 error = perf_counter_task_disable();
1799 break;
1800 case PR_TASK_PERF_COUNTERS_ENABLE:
1801 error = perf_counter_task_enable();
1802 break;
1796 case PR_GET_TIMERSLACK: 1803 case PR_GET_TIMERSLACK:
1797 error = current->timer_slack_ns; 1804 error = current->timer_slack_ns;
1798 break; 1805 break;
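The two new prctl options give a task a cheap way to pause and resume all of its own counters without touching the individual counter fds. A minimal sketch, assuming the PR_TASK_PERF_COUNTERS_* constants are visible through the updated <linux/prctl.h>:

	#include <sys/prctl.h>
	#include <linux/prctl.h>

	/* Keep a region of code out of this task's counters. */
	static void run_unmeasured(void (*fn)(void))
	{
		prctl(PR_TASK_PERF_COUNTERS_DISABLE);
		fn();
		prctl(PR_TASK_PERF_COUNTERS_ENABLE);
	}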
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 27dad2967387..68320f6b07b5 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -175,3 +175,6 @@ cond_syscall(compat_sys_timerfd_settime);
175cond_syscall(compat_sys_timerfd_gettime); 175cond_syscall(compat_sys_timerfd_gettime);
176cond_syscall(sys_eventfd); 176cond_syscall(sys_eventfd);
177cond_syscall(sys_eventfd2); 177cond_syscall(sys_eventfd2);
178
179/* performance counters: */
180cond_syscall(sys_perf_counter_open);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e3d2c7dd59b9..8203d70928d5 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -49,6 +49,7 @@
49#include <linux/reboot.h> 49#include <linux/reboot.h>
50#include <linux/ftrace.h> 50#include <linux/ftrace.h>
51#include <linux/slow-work.h> 51#include <linux/slow-work.h>
52#include <linux/perf_counter.h>
52 53
53#include <asm/uaccess.h> 54#include <asm/uaccess.h>
54#include <asm/processor.h> 55#include <asm/processor.h>
@@ -910,6 +911,16 @@ static struct ctl_table kern_table[] = {
910 .child = slow_work_sysctls, 911 .child = slow_work_sysctls,
911 }, 912 },
912#endif 913#endif
914#ifdef CONFIG_PERF_COUNTERS
915 {
916 .ctl_name = CTL_UNNUMBERED,
917 .procname = "perf_counter_privileged",
918 .data = &sysctl_perf_counter_priv,
919 .maxlen = sizeof(sysctl_perf_counter_priv),
920 .mode = 0644,
921 .proc_handler = &proc_dointvec,
922 },
923#endif
913/* 924/*
914 * NOTE: do not add new entries to this table unless you have read 925 * NOTE: do not add new entries to this table unless you have read
915 * Documentation/sysctl/ctl_unnumbered.txt 926 * Documentation/sysctl/ctl_unnumbered.txt
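The new table entry should also surface through procfs; assuming the conventional mapping of kern_table, that is /proc/sys/kernel/perf_counter_privileged (the path is inferred from the procname above, not verified here). A small sketch that reads the current setting, with perf_counter_privileged() being a hypothetical helper:

	#include <stdio.h>

	/* Return the perf_counter_privileged sysctl value, or -1 on error. */
	static int perf_counter_privileged(void)
	{
		FILE *f = fopen("/proc/sys/kernel/perf_counter_privileged", "r");
		int val = -1;

		if (f) {
			if (fscanf(f, "%d", &val) != 1)
				val = -1;
			fclose(f);
		}
		return val;
	}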
diff --git a/kernel/timer.c b/kernel/timer.c
index cffffad01c31..fed53be44fd9 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -37,6 +37,7 @@
37#include <linux/delay.h> 37#include <linux/delay.h>
38#include <linux/tick.h> 38#include <linux/tick.h>
39#include <linux/kallsyms.h> 39#include <linux/kallsyms.h>
40#include <linux/perf_counter.h>
40 41
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
42#include <asm/unistd.h> 43#include <asm/unistd.h>
@@ -1170,6 +1171,8 @@ static void run_timer_softirq(struct softirq_action *h)
1170{ 1171{
1171 struct tvec_base *base = __get_cpu_var(tvec_bases); 1172 struct tvec_base *base = __get_cpu_var(tvec_bases);
1172 1173
1174 perf_counter_do_pending();
1175
1173 hrtimer_run_pending(); 1176 hrtimer_run_pending();
1174 1177
1175 if (time_after_eq(jiffies, base->timer_jiffies)) 1178 if (time_after_eq(jiffies, base->timer_jiffies))
diff --git a/mm/mmap.c b/mm/mmap.c
index 3303d1ba8e87..8a49df4c7363 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -28,6 +28,7 @@
28#include <linux/mempolicy.h> 28#include <linux/mempolicy.h>
29#include <linux/rmap.h> 29#include <linux/rmap.h>
30#include <linux/mmu_notifier.h> 30#include <linux/mmu_notifier.h>
31#include <linux/perf_counter.h>
31 32
32#include <asm/uaccess.h> 33#include <asm/uaccess.h>
33#include <asm/cacheflush.h> 34#include <asm/cacheflush.h>
@@ -1223,6 +1224,9 @@ munmap_back:
1223 if (correct_wcount) 1224 if (correct_wcount)
1224 atomic_inc(&inode->i_writecount); 1225 atomic_inc(&inode->i_writecount);
1225out: 1226out:
1227 if (vm_flags & VM_EXEC)
1228 perf_counter_mmap(addr, len, pgoff, file);
1229
1226 mm->total_vm += len >> PAGE_SHIFT; 1230 mm->total_vm += len >> PAGE_SHIFT;
1227 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); 1231 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
1228 if (vm_flags & VM_LOCKED) { 1232 if (vm_flags & VM_LOCKED) {
@@ -1756,6 +1760,12 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
1756 do { 1760 do {
1757 long nrpages = vma_pages(vma); 1761 long nrpages = vma_pages(vma);
1758 1762
1763 if (vma->vm_flags & VM_EXEC) {
1764 perf_counter_munmap(vma->vm_start,
1765 nrpages << PAGE_SHIFT,
1766 vma->vm_pgoff, vma->vm_file);
1767 }
1768
1759 mm->total_vm -= nrpages; 1769 mm->total_vm -= nrpages;
1760 vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); 1770 vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
1761 vma = remove_vma(vma); 1771 vma = remove_vma(vma);