-rw-r--r--  Documentation/isdn/00-INDEX | 17
-rw-r--r--  Documentation/isdn/INTERFACE.CAPI | 213
-rw-r--r--  Documentation/perf_counter/.gitignore | 179
-rw-r--r--  Documentation/perf_counter/Documentation/perf-help.txt | 38
-rw-r--r--  Documentation/perf_counter/Documentation/perf-record.txt | 63
-rw-r--r--  Documentation/perf_counter/Documentation/perf-stat.txt | 76
-rw-r--r--  Documentation/perf_counter/Documentation/perf-top.txt | 61
-rw-r--r--  Documentation/perf_counter/Makefile | 849
-rw-r--r--  Documentation/perf_counter/builtin-help.c | 461
-rw-r--r--  Documentation/perf_counter/builtin-record.c | 484
-rw-r--r--  Documentation/perf_counter/builtin-stat.c | 554
-rw-r--r--  Documentation/perf_counter/builtin-top.c | 1142
-rw-r--r--  Documentation/perf_counter/builtin.h | 22
-rw-r--r--  Documentation/perf_counter/command-list.txt | 6
-rw-r--r--  Documentation/perf_counter/design.txt | 449
-rw-r--r--  Documentation/perf_counter/perf-report.cc | 515
-rw-r--r--  Documentation/perf_counter/perf.c | 414
-rw-r--r--  Documentation/perf_counter/perf.h | 62
-rwxr-xr-x  Documentation/perf_counter/util/PERF-VERSION-GEN | 42
-rw-r--r--  Documentation/perf_counter/util/abspath.c | 117
-rw-r--r--  Documentation/perf_counter/util/alias.c | 77
-rw-r--r--  Documentation/perf_counter/util/cache.h | 117
-rw-r--r--  Documentation/perf_counter/util/config.c | 873
-rw-r--r--  Documentation/perf_counter/util/ctype.c | 26
-rw-r--r--  Documentation/perf_counter/util/exec_cmd.c | 165
-rw-r--r--  Documentation/perf_counter/util/exec_cmd.h | 13
-rwxr-xr-x  Documentation/perf_counter/util/generate-cmdlist.sh | 24
-rw-r--r--  Documentation/perf_counter/util/help.c | 366
-rw-r--r--  Documentation/perf_counter/util/help.h | 29
-rw-r--r--  Documentation/perf_counter/util/levenshtein.c | 84
-rw-r--r--  Documentation/perf_counter/util/levenshtein.h | 8
-rw-r--r--  Documentation/perf_counter/util/parse-options.c | 492
-rw-r--r--  Documentation/perf_counter/util/parse-options.h | 172
-rw-r--r--  Documentation/perf_counter/util/path.c | 353
-rw-r--r--  Documentation/perf_counter/util/quote.c | 478
-rw-r--r--  Documentation/perf_counter/util/quote.h | 68
-rw-r--r--  Documentation/perf_counter/util/run-command.c | 395
-rw-r--r--  Documentation/perf_counter/util/run-command.h | 93
-rw-r--r--  Documentation/perf_counter/util/strbuf.c | 359
-rw-r--r--  Documentation/perf_counter/util/strbuf.h | 137
-rw-r--r--  Documentation/perf_counter/util/usage.c | 80
-rw-r--r--  Documentation/perf_counter/util/util.h | 408
-rw-r--r--  Documentation/perf_counter/util/wrapper.c | 206
-rw-r--r--  MAINTAINERS | 10
-rw-r--r--  Makefile | 4
-rw-r--r--  arch/powerpc/include/asm/hw_irq.h | 39
-rw-r--r--  arch/powerpc/include/asm/paca.h | 1
-rw-r--r--  arch/powerpc/include/asm/perf_counter.h | 83
-rw-r--r--  arch/powerpc/include/asm/systbl.h | 2
-rw-r--r--  arch/powerpc/include/asm/unistd.h | 1
-rw-r--r--  arch/powerpc/kernel/Makefile | 2
-rw-r--r--  arch/powerpc/kernel/asm-offsets.c | 1
-rw-r--r--  arch/powerpc/kernel/entry_64.S | 9
-rw-r--r--  arch/powerpc/kernel/irq.c | 5
-rw-r--r--  arch/powerpc/kernel/perf_counter.c | 1092
-rw-r--r--  arch/powerpc/kernel/power4-pmu.c | 558
-rw-r--r--  arch/powerpc/kernel/power5+-pmu.c | 630
-rw-r--r--  arch/powerpc/kernel/power5-pmu.c | 570
-rw-r--r--  arch/powerpc/kernel/power6-pmu.c | 490
-rw-r--r--  arch/powerpc/kernel/ppc970-pmu.c | 442
-rw-r--r--  arch/powerpc/mm/fault.c | 10
-rw-r--r--  arch/powerpc/platforms/Kconfig.cputype | 1
-rw-r--r--  arch/x86/Kconfig | 1
-rw-r--r--  arch/x86/ia32/ia32entry.S | 4
-rw-r--r--  arch/x86/include/asm/atomic_32.h | 236
-rw-r--r--  arch/x86/include/asm/entry_arch.h | 1
-rw-r--r--  arch/x86/include/asm/hardirq.h | 2
-rw-r--r--  arch/x86/include/asm/hw_irq.h | 3
-rw-r--r--  arch/x86/include/asm/intel_arch_perfmon.h | 31
-rw-r--r--  arch/x86/include/asm/irq_vectors.h | 5
-rw-r--r--  arch/x86/include/asm/perf_counter.h | 100
-rw-r--r--  arch/x86/include/asm/unistd_32.h | 2
-rw-r--r--  arch/x86/include/asm/unistd_64.h | 5
-rw-r--r--  arch/x86/kernel/apic/apic.c | 4
-rw-r--r--  arch/x86/kernel/cpu/Makefile | 12
-rw-r--r--  arch/x86/kernel/cpu/common.c | 2
-rw-r--r--  arch/x86/kernel/cpu/perf_counter.c | 1214
-rw-r--r--  arch/x86/kernel/cpu/perfctr-watchdog.c | 4
-rw-r--r--  arch/x86/kernel/entry_64.S | 7
-rw-r--r--  arch/x86/kernel/irq.c | 10
-rw-r--r--  arch/x86/kernel/irqinit_32.c | 60
-rw-r--r--  arch/x86/kernel/irqinit_64.c | 13
-rw-r--r--  arch/x86/kernel/signal.c | 1
-rw-r--r--  arch/x86/kernel/syscall_table_32.S | 2
-rw-r--r--  arch/x86/kernel/traps.c | 15
-rw-r--r--  arch/x86/mm/fault.c | 12
-rw-r--r--  arch/x86/oprofile/nmi_int.c | 7
-rw-r--r--  arch/x86/oprofile/op_model_ppro.c | 10
-rw-r--r--  drivers/Makefile | 4
-rw-r--r--  drivers/acpi/processor_idle.c | 4
-rw-r--r--  drivers/char/sysrq.c | 2
-rw-r--r--  drivers/isdn/capi/kcapi.c | 171
-rw-r--r--  drivers/media/video/au0828/au0828-core.c | 6
-rw-r--r--  drivers/media/video/cx18/cx18-audio.c | 2
-rw-r--r--  drivers/media/video/cx18/cx18-i2c.c | 16
-rw-r--r--  drivers/media/video/cx231xx/Kconfig | 44
-rw-r--r--  drivers/media/video/cx23885/cx23885-cards.c | 4
-rw-r--r--  drivers/media/video/cx23885/cx23885-dvb.c | 1
-rw-r--r--  drivers/media/video/mx3_camera.c | 4
-rw-r--r--  drivers/media/video/s2255drv.c | 2
-rw-r--r--  drivers/media/video/saa5246a.c | 3
-rw-r--r--  drivers/media/video/saa5249.c | 4
-rw-r--r--  drivers/net/e100.c | 30
-rw-r--r--  drivers/net/forcedeth.c | 31
-rw-r--r--  drivers/net/ixgbe/ixgbe_common.c | 51
-rw-r--r--  drivers/net/ixgbe/ixgbe_main.c | 10
-rw-r--r--  drivers/net/mlx4/en_netdev.c | 2
-rw-r--r--  drivers/net/mlx4/en_rx.c | 4
-rw-r--r--  drivers/net/veth.c | 41
-rw-r--r--  fs/ecryptfs/inode.c | 4
-rw-r--r--  fs/exec.c | 9
-rw-r--r--  include/linux/compat.h | 2
-rw-r--r--  include/linux/init_task.h | 13
-rw-r--r--  include/linux/kernel_stat.h | 5
-rw-r--r--  include/linux/netdevice.h | 4
-rw-r--r--  include/linux/netfilter/nfnetlink_conntrack.h | 1
-rw-r--r--  include/linux/netfilter/x_tables.h | 73
-rw-r--r--  include/linux/perf_counter.h | 617
-rw-r--r--  include/linux/prctl.h | 3
-rw-r--r--  include/linux/sched.h | 13
-rw-r--r--  include/linux/signal.h | 2
-rw-r--r--  include/linux/syscalls.h | 5
-rw-r--r--  include/linux/wait.h | 6
-rw-r--r--  include/net/bluetooth/hci.h | 1
-rw-r--r--  include/net/bluetooth/hci_core.h | 8
-rw-r--r--  init/Kconfig | 35
-rw-r--r--  kernel/Makefile | 1
-rw-r--r--  kernel/compat.c | 11
-rw-r--r--  kernel/exit.c | 13
-rw-r--r--  kernel/fork.c | 1
-rw-r--r--  kernel/mutex.c | 2
-rw-r--r--  kernel/perf_counter.c | 3406
-rw-r--r--  kernel/sched.c | 49
-rw-r--r--  kernel/signal.c | 56
-rw-r--r--  kernel/sys.c | 7
-rw-r--r--  kernel/sys_ni.c | 3
-rw-r--r--  kernel/sysctl.c | 19
-rw-r--r--  kernel/timer.c | 3
-rw-r--r--  mm/mmap.c | 10
-rw-r--r--  net/8021q/vlan.c | 2
-rw-r--r--  net/8021q/vlan_dev.c | 5
-rw-r--r--  net/bluetooth/hci_conn.c | 10
-rw-r--r--  net/bluetooth/hci_event.c | 36
-rw-r--r--  net/bluetooth/hci_sysfs.c | 37
-rw-r--r--  net/bridge/br_netfilter.c | 10
-rw-r--r--  net/core/datagram.c | 14
-rw-r--r--  net/ipv4/netfilter/arp_tables.c | 125
-rw-r--r--  net/ipv4/netfilter/ip_tables.c | 126
-rw-r--r--  net/ipv4/route.c | 2
-rw-r--r--  net/ipv6/netfilter/ip6_tables.c | 123
-rw-r--r--  net/netfilter/Kconfig | 4
-rw-r--r--  net/netfilter/nf_conntrack_proto_dccp.c | 16
-rw-r--r--  net/netfilter/nf_conntrack_proto_udplite.c | 1
-rw-r--r--  net/netfilter/x_tables.c | 53
-rw-r--r--  net/netfilter/xt_recent.c | 9
-rw-r--r--  net/xfrm/xfrm_state.c | 6
156 files changed, 21299 insertions, 598 deletions
diff --git a/Documentation/isdn/00-INDEX b/Documentation/isdn/00-INDEX
index 9fee5f2e5c62..5a2d69989a8c 100644
--- a/Documentation/isdn/00-INDEX
+++ b/Documentation/isdn/00-INDEX
@@ -2,8 +2,14 @@
  - this file (info on ISDN implementation for Linux)
 CREDITS
  - list of the kind folks that brought you this stuff.
+HiSax.cert
+ - information about the ITU approval certification of the HiSax driver.
 INTERFACE
- - description of Linklevel and Hardwarelevel ISDN interface.
+ - description of isdn4linux Link Level and Hardware Level interfaces.
+INTERFACE.fax
+ - description of the fax subinterface of isdn4linux.
+INTERFACE.CAPI
+ - description of kernel CAPI Link Level to Hardware Level interface.
 README
  - general info on what you need and what to do for Linux ISDN.
 README.FAQ
@@ -12,6 +18,8 @@ README.audio
  - info for running audio over ISDN.
 README.fax
  - info for using Fax over ISDN.
+README.gigaset
+ - info on the drivers for Siemens Gigaset ISDN adapters.
 README.icn
  - info on the ICN-ISDN-card and its driver.
 README.HiSax
@@ -37,7 +45,8 @@ README.diversion
 README.sc
  - info on driver for Spellcaster cards.
 README.x25
- _ info for running X.25 over ISDN.
+ - info for running X.25 over ISDN.
 README.hysdn
  - info on driver for Hypercope active HYSDN cards
-
+README.mISDN
+ - info on the Modular ISDN subsystem (mISDN).
diff --git a/Documentation/isdn/INTERFACE.CAPI b/Documentation/isdn/INTERFACE.CAPI
new file mode 100644
index 000000000000..786d619b36e5
--- /dev/null
+++ b/Documentation/isdn/INTERFACE.CAPI
@@ -0,0 +1,213 @@
1Kernel CAPI Interface to Hardware Drivers
2-----------------------------------------
3
41. Overview
5
6From the CAPI 2.0 specification:
7COMMON-ISDN-API (CAPI) is an application programming interface standard used
8to access ISDN equipment connected to basic rate interfaces (BRI) and primary
9rate interfaces (PRI).
10
11Kernel CAPI operates as a dispatching layer between CAPI applications and CAPI
12hardware drivers. Hardware drivers register ISDN devices (controllers, in CAPI
13lingo) with Kernel CAPI to indicate their readiness to provide their service
14to CAPI applications. CAPI applications also register with Kernel CAPI,
15requesting association with a CAPI device. Kernel CAPI then dispatches the
16application registration to an available device, forwarding it to the
17corresponding hardware driver. Kernel CAPI then forwards CAPI messages in both
18directions between the application and the hardware driver.
19
20Format and semantics of CAPI messages are specified in the CAPI 2.0 standard.
21This standard is freely available from http://www.capi.org.
22
23
242. Driver and Device Registration
25
26CAPI drivers optionally register themselves with Kernel CAPI by calling the
27Kernel CAPI function register_capi_driver() with a pointer to a struct
28capi_driver. This structure must be filled with the name and revision of the
29driver, and optionally a pointer to a callback function, add_card(). The
30registration can be revoked by calling the function unregister_capi_driver()
31with a pointer to the same struct capi_driver.
32
33CAPI drivers must register each of the ISDN devices they control with Kernel
34CAPI by calling the Kernel CAPI function attach_capi_ctr() with a pointer to a
35struct capi_ctr before they can be used. This structure must be filled with
36the names of the driver and controller, and a number of callback function
37pointers which are subsequently used by Kernel CAPI for communicating with the
38driver. The registration can be revoked by calling the function
39detach_capi_ctr() with a pointer to the same struct capi_ctr.
40
41Before the device can be actually used, the driver must fill in the device
42information fields 'manu', 'version', 'profile' and 'serial' in the capi_ctr
43structure of the device, and signal its readiness by calling capi_ctr_ready().
44From then on, Kernel CAPI may call the registered callback functions for the
45device.
46
47If the device becomes unusable for any reason (shutdown, disconnect ...), the
48driver has to call capi_ctr_reseted(). This will prevent further calls to the
49callback functions by Kernel CAPI.
50
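A minimal sketch of this registration sequence follows (the foo_* names,
the empty callback bodies and the error handling are illustrative
assumptions, not something prescribed by the interface):

    /* illustrative skeleton only, not a real driver */
    #include <linux/module.h>
    #include <linux/skbuff.h>
    #include <linux/isdn/capilli.h>

    static void foo_register_appl(struct capi_ctr *ctrlr, u16 applid,
                                  capi_register_params *rparam) { }
    static void foo_release_appl(struct capi_ctr *ctrlr, u16 applid) { }
    static u16 foo_send_message(struct capi_ctr *ctrlr, struct sk_buff *skb)
    {
            return 0;       /* CAPI info value 0x0000: no error */
    }

    static struct capi_driver foo_driver = {
            .name     = "foo",
            .revision = "0.1",
    };

    static struct capi_ctr foo_ctr = {
            .owner         = THIS_MODULE,
            .name          = "foo-controller",
            .driver_name   = "foo",
            .register_appl = foo_register_appl,
            .release_appl  = foo_release_appl,
            .send_message  = foo_send_message,
    };

    static int __init foo_init(void)
    {
            int ret;

            register_capi_driver(&foo_driver);      /* optional */

            ret = attach_capi_ctr(&foo_ctr);
            if (ret < 0)
                    return ret;

            /* fill in manu, version, profile and serial, then: */
            capi_ctr_ready(&foo_ctr);
            return 0;
    }
    module_init(foo_init);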
51
523. Application Registration and Communication
53
54Kernel CAPI forwards registration requests from applications (calls to CAPI
55operation CAPI_REGISTER) to an appropriate hardware driver by calling its
56register_appl() callback function. A unique Application ID (ApplID, u16) is
57allocated by Kernel CAPI and passed to register_appl() along with the
58parameter structure provided by the application. This is analogous to the
59open() operation on regular files or character devices.
60
61After a successful return from register_appl(), CAPI messages from the
62application may be passed to the driver for the device via calls to the
63send_message() callback function. The CAPI message to send is stored in the
64data portion of an skb. Conversely, the driver may call Kernel CAPI's
65capi_ctr_handle_message() function to pass a received CAPI message to Kernel
66CAPI for forwarding to an application, specifying its ApplID.
67
68Deregistration requests (CAPI operation CAPI_RELEASE) from applications are
69forwarded as calls to the release_appl() callback function, passing the same
70ApplID as with register_appl(). After return from release_appl(), no CAPI
71messages for that application may be passed to or from the device anymore.
72
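As an illustration of the receive path (the foo_rx_capi_msg helper and
its callers are hypothetical): the driver wraps a CAPI message received
from the hardware in an skb and hands it to Kernel CAPI, which forwards
it to the application identified by applid:

    static void foo_rx_capi_msg(struct capi_ctr *ctrlr, u16 applid,
                                const void *msg, size_t len)
    {
            struct sk_buff *skb = alloc_skb(len, GFP_ATOMIC);

            if (!skb)
                    return;         /* dropped; a real driver would account for this */
            memcpy(skb_put(skb, len), msg, len);    /* CAPI message in skb data */
            capi_ctr_handle_message(ctrlr, applid, skb);
    }

The transmit direction is symmetrical: Kernel CAPI calls the driver's
send_message() callback with the outgoing CAPI message in the data
portion of an skb.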
73
744. Data Structures
75
764.1 struct capi_driver
77
78This structure describes a Kernel CAPI driver itself. It is used in the
79register_capi_driver() and unregister_capi_driver() functions, and contains
80the following non-private fields, all to be set by the driver before calling
81register_capi_driver():
82
83char name[32]
84 the name of the driver, as a zero-terminated ASCII string
85char revision[32]
86 the revision number of the driver, as a zero-terminated ASCII string
87int (*add_card)(struct capi_driver *driver, capicardparams *data)
88 a callback function pointer (may be NULL)
89
90
914.2 struct capi_ctr
92
93This structure describes an ISDN device (controller) handled by a Kernel CAPI
94driver. After registration via the attach_capi_ctr() function it is passed to
95all controller specific lower layer interface and callback functions to
96identify the controller to operate on.
97
98It contains the following non-private fields:
99
100- to be set by the driver before calling attach_capi_ctr():
101
102struct module *owner
103 pointer to the driver module owning the device
104
105void *driverdata
106 an opaque pointer to driver specific data, not touched by Kernel CAPI
107
108char name[32]
109 the name of the controller, as a zero-terminated ASCII string
110
111char *driver_name
112 the name of the driver, as a zero-terminated ASCII string
113
114int (*load_firmware)(struct capi_ctr *ctrlr, capiloaddata *ldata)
115 (optional) pointer to a callback function for sending firmware and
116 configuration data to the device
117
118void (*reset_ctr)(struct capi_ctr *ctrlr)
119 pointer to a callback function for performing a reset on the device,
120 releasing all registered applications
121
122void (*register_appl)(struct capi_ctr *ctrlr, u16 applid,
123 capi_register_params *rparam)
124void (*release_appl)(struct capi_ctr *ctrlr, u16 applid)
125 pointers to callback functions for registration and deregistration of
126 applications with the device
127
128u16 (*send_message)(struct capi_ctr *ctrlr, struct sk_buff *skb)
129 pointer to a callback function for sending a CAPI message to the
130 device
131
132char *(*procinfo)(struct capi_ctr *ctrlr)
133 pointer to a callback function returning the entry for the device in
134 the CAPI controller info table, /proc/capi/controller
135
136read_proc_t *ctr_read_proc
137 pointer to the read_proc callback function for the device's proc file
138 system entry, /proc/capi/controllers/<n>; will be called with a
139 pointer to the device's capi_ctr structure as the last (data) argument
140
141- to be filled in before calling capi_ctr_ready():
142
143u8 manu[CAPI_MANUFACTURER_LEN]
144 value to return for CAPI_GET_MANUFACTURER
145
146capi_version version
147 value to return for CAPI_GET_VERSION
148
149capi_profile profile
150 value to return for CAPI_GET_PROFILE
151
152u8 serial[CAPI_SERIAL_LEN]
153 value to return for CAPI_GET_SERIAL
154
155
1565. Lower Layer Interface Functions
157
158(declared in <linux/isdn/capilli.h>)
159
160void register_capi_driver(struct capi_driver *drvr)
161void unregister_capi_driver(struct capi_driver *drvr)
162 register/unregister a driver with Kernel CAPI
163
164int attach_capi_ctr(struct capi_ctr *ctrlr)
165int detach_capi_ctr(struct capi_ctr *ctrlr)
166 register/unregister a device (controller) with Kernel CAPI
167
168void capi_ctr_ready(struct capi_ctr *ctrlr)
169void capi_ctr_reseted(struct capi_ctr *ctrlr)
170 signal controller ready/not ready
171
172void capi_ctr_suspend_output(struct capi_ctr *ctrlr)
173void capi_ctr_resume_output(struct capi_ctr *ctrlr)
174 signal suspend/resume
175
176void capi_ctr_handle_message(struct capi_ctr * ctrlr, u16 applid,
177 struct sk_buff *skb)
178 pass a received CAPI message to Kernel CAPI
179 for forwarding to the specified application
180
181
1826. Helper Functions and Macros
183
184Library functions (from <linux/isdn/capilli.h>):
185
186void capilib_new_ncci(struct list_head *head, u16 applid,
187 u32 ncci, u32 winsize)
188void capilib_free_ncci(struct list_head *head, u16 applid, u32 ncci)
189void capilib_release_appl(struct list_head *head, u16 applid)
190void capilib_release(struct list_head *head)
191void capilib_data_b3_conf(struct list_head *head, u16 applid,
192 u32 ncci, u16 msgid)
193u16 capilib_data_b3_req(struct list_head *head, u16 applid,
194 u32 ncci, u16 msgid)
195
196
197Macros to extract/set element values from/in a CAPI message header
198(from <linux/isdn/capiutil.h>):
199
200Get Macro Set Macro Element (Type)
201
202CAPIMSG_LEN(m) CAPIMSG_SETLEN(m, len) Total Length (u16)
203CAPIMSG_APPID(m) CAPIMSG_SETAPPID(m, applid) ApplID (u16)
204CAPIMSG_COMMAND(m) CAPIMSG_SETCOMMAND(m,cmd) Command (u8)
205CAPIMSG_SUBCOMMAND(m) CAPIMSG_SETSUBCOMMAND(m, cmd) Subcommand (u8)
206CAPIMSG_CMD(m) - Command*256
207 + Subcommand (u16)
208CAPIMSG_MSGID(m) CAPIMSG_SETMSGID(m, msgid) Message Number (u16)
209
210CAPIMSG_CONTROL(m) CAPIMSG_SETCONTROL(m, contr) Controller/PLCI/NCCI
211 (u32)
212CAPIMSG_DATALEN(m) CAPIMSG_SETDATALEN(m, len) Data Length (u16)
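
A small illustration of the get macros (the function name and the use of
printk are assumptions for the example; <linux/isdn/capiutil.h> is
assumed to be included):

    static void foo_dump_capi_header(const unsigned char *m)
    {
            /* decode the fixed CAPI message header in place */
            printk(KERN_DEBUG
                   "CAPI msg: len=%u applid=%u cmd=0x%02x subcmd=0x%02x msgid=%u\n",
                   CAPIMSG_LEN(m), CAPIMSG_APPID(m), CAPIMSG_COMMAND(m),
                   CAPIMSG_SUBCOMMAND(m), CAPIMSG_MSGID(m));
    }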
213
diff --git a/Documentation/perf_counter/.gitignore b/Documentation/perf_counter/.gitignore
new file mode 100644
index 000000000000..41c0b20a76ce
--- /dev/null
+++ b/Documentation/perf_counter/.gitignore
@@ -0,0 +1,179 @@
1GIT-BUILD-OPTIONS
2GIT-CFLAGS
3GIT-GUI-VARS
4GIT-VERSION-FILE
5git
6git-add
7git-add--interactive
8git-am
9git-annotate
10git-apply
11git-archimport
12git-archive
13git-bisect
14git-bisect--helper
15git-blame
16git-branch
17git-bundle
18git-cat-file
19git-check-attr
20git-check-ref-format
21git-checkout
22git-checkout-index
23git-cherry
24git-cherry-pick
25git-clean
26git-clone
27git-commit
28git-commit-tree
29git-config
30git-count-objects
31git-cvsexportcommit
32git-cvsimport
33git-cvsserver
34git-daemon
35git-diff
36git-diff-files
37git-diff-index
38git-diff-tree
39git-difftool
40git-difftool--helper
41git-describe
42git-fast-export
43git-fast-import
44git-fetch
45git-fetch--tool
46git-fetch-pack
47git-filter-branch
48git-fmt-merge-msg
49git-for-each-ref
50git-format-patch
51git-fsck
52git-fsck-objects
53git-gc
54git-get-tar-commit-id
55git-grep
56git-hash-object
57git-help
58git-http-fetch
59git-http-push
60git-imap-send
61git-index-pack
62git-init
63git-init-db
64git-instaweb
65git-log
66git-lost-found
67git-ls-files
68git-ls-remote
69git-ls-tree
70git-mailinfo
71git-mailsplit
72git-merge
73git-merge-base
74git-merge-index
75git-merge-file
76git-merge-tree
77git-merge-octopus
78git-merge-one-file
79git-merge-ours
80git-merge-recursive
81git-merge-resolve
82git-merge-subtree
83git-mergetool
84git-mergetool--lib
85git-mktag
86git-mktree
87git-name-rev
88git-mv
89git-pack-redundant
90git-pack-objects
91git-pack-refs
92git-parse-remote
93git-patch-id
94git-peek-remote
95git-prune
96git-prune-packed
97git-pull
98git-push
99git-quiltimport
100git-read-tree
101git-rebase
102git-rebase--interactive
103git-receive-pack
104git-reflog
105git-relink
106git-remote
107git-repack
108git-repo-config
109git-request-pull
110git-rerere
111git-reset
112git-rev-list
113git-rev-parse
114git-revert
115git-rm
116git-send-email
117git-send-pack
118git-sh-setup
119git-shell
120git-shortlog
121git-show
122git-show-branch
123git-show-index
124git-show-ref
125git-stage
126git-stash
127git-status
128git-stripspace
129git-submodule
130git-svn
131git-symbolic-ref
132git-tag
133git-tar-tree
134git-unpack-file
135git-unpack-objects
136git-update-index
137git-update-ref
138git-update-server-info
139git-upload-archive
140git-upload-pack
141git-var
142git-verify-pack
143git-verify-tag
144git-web--browse
145git-whatchanged
146git-write-tree
147git-core-*/?*
148gitk-wish
149gitweb/gitweb.cgi
150test-chmtime
151test-ctype
152test-date
153test-delta
154test-dump-cache-tree
155test-genrandom
156test-match-trees
157test-parse-options
158test-path-utils
159test-sha1
160test-sigchain
161common-cmds.h
162*.tar.gz
163*.dsc
164*.deb
165git.spec
166*.exe
167*.[aos]
168*.py[co]
169config.mak
170autom4te.cache
171config.cache
172config.log
173config.status
174config.mak.autogen
175config.mak.append
176configure
177tags
178TAGS
179cscope*
diff --git a/Documentation/perf_counter/Documentation/perf-help.txt b/Documentation/perf_counter/Documentation/perf-help.txt
new file mode 100644
index 000000000000..f85fed5a7edb
--- /dev/null
+++ b/Documentation/perf_counter/Documentation/perf-help.txt
@@ -0,0 +1,38 @@
1perf-help(1)
2===========
3
4NAME
5----
6perf-help - display help information about perf
7
8SYNOPSIS
9--------
10'perf help' [-a|--all] [COMMAND]
11
12DESCRIPTION
13-----------
14
15With no options and no COMMAND given, the synopsis of the 'perf'
16command and a list of the most commonly used perf commands are printed
17on the standard output.
18
19If the option '--all' or '-a' is given, then all available commands are
20printed on the standard output.
21
22If a perf command is named, a manual page for that command is brought
23up. The 'man' program is used by default for this purpose, but this
24can be overridden by other options or configuration variables.
25
26Note that `perf --help ...` is identical to `perf help ...` because the
27former is internally converted into the latter.
28
29OPTIONS
30-------
31-a::
32--all::
33 Prints all the available commands on the standard output. This
34 option supersedes any other option.
35
36PERF
37----
38Part of the linkperf:perf[1] suite
diff --git a/Documentation/perf_counter/Documentation/perf-record.txt b/Documentation/perf_counter/Documentation/perf-record.txt
new file mode 100644
index 000000000000..d07700e35eb2
--- /dev/null
+++ b/Documentation/perf_counter/Documentation/perf-record.txt
@@ -0,0 +1,63 @@
1perf-record(1)
2==========
3
4NAME
5----
6perf-record - Run a command and record its profile into output.perf
7
8SYNOPSIS
9--------
10[verse]
11'perf record' [-e <EVENT> | --event=EVENT] [-l] [-a] <command>
12
13DESCRIPTION
14-----------
15This command runs a command and gathers a performance counter profile
16from it, into output.perf - without displaying anything.
17
18This file can then be inspected later on, using 'perf report'.
19
20
21OPTIONS
22-------
23<command>...::
24 Any command you can specify in a shell.
25
26-e::
27--event=::
28 0:0: cpu-cycles
29 0:0: cycles
30 0:1: instructions
31 0:2: cache-references
32 0:3: cache-misses
33 0:4: branch-instructions
34 0:4: branches
35 0:5: branch-misses
36 0:6: bus-cycles
37 1:0: cpu-clock
38 1:1: task-clock
39 1:2: page-faults
40 1:2: faults
41 1:5: minor-faults
42 1:6: major-faults
43 1:3: context-switches
44 1:3: cs
45 1:4: cpu-migrations
46 1:4: migrations
47 rNNN: raw PMU events (eventsel+umask)
48
49-a::
50 system-wide collection
51
52-l::
53 scale counter values
54
55Configuration
56-------------
57
58EXAMPLES
59--------
60
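A hypothetical session (the workload name is illustrative): record a
profile into output.perf, then inspect it with 'perf report':

$ perf record ./my-workload
$ perf report
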
61SEE ALSO
62--------
63linkperf:perf-stat[1]
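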
diff --git a/Documentation/perf_counter/Documentation/perf-stat.txt b/Documentation/perf_counter/Documentation/perf-stat.txt
new file mode 100644
index 000000000000..7fcab271e570
--- /dev/null
+++ b/Documentation/perf_counter/Documentation/perf-stat.txt
@@ -0,0 +1,76 @@
1perf-stat(1)
2==========
3
4NAME
5----
6perf-stat - Run a command and gather performance counter statistics
7
8SYNOPSIS
9--------
10[verse]
11'perf stat' [-e <EVENT> | --event=EVENT] [-l] [-a] <command>
12
13DESCRIPTION
14-----------
15This command runs a command and gathers performance counter statistics
16from it.
17
18
19OPTIONS
20-------
21<command>...::
22 Any command you can specify in a shell.
23
24-e::
25--event=::
26 0:0: cpu-cycles
27 0:0: cycles
28 0:1: instructions
29 0:2: cache-references
30 0:3: cache-misses
31 0:4: branch-instructions
32 0:4: branches
33 0:5: branch-misses
34 0:6: bus-cycles
35 1:0: cpu-clock
36 1:1: task-clock
37 1:2: page-faults
38 1:2: faults
39 1:5: minor-faults
40 1:6: major-faults
41 1:3: context-switches
42 1:3: cs
43 1:4: cpu-migrations
44 1:4: migrations
45 rNNN: raw PMU events (eventsel+umask)
46
47-a::
48 system-wide collection
49
50-l::
51 scale counter values
52
53Configuration
54-------------
55
56EXAMPLES
57--------
58
59$ perf stat sleep 1
60
61 Performance counter stats for 'sleep':
62
63 0.678356 task clock ticks (msecs)
64 7 context switches (events)
65 4 CPU migrations (events)
66 232 pagefaults (events)
67 1810403 CPU cycles (events)
68 946759 instructions (events)
69 18952 cache references (events)
70 4885 cache misses (events)
71
72 Wall-clock time elapsed: 1001.252894 msecs
73
74SEE ALSO
75--------
76linkperf:perf-top[1]
diff --git a/Documentation/perf_counter/Documentation/perf-top.txt b/Documentation/perf_counter/Documentation/perf-top.txt
new file mode 100644
index 000000000000..057333b72534
--- /dev/null
+++ b/Documentation/perf_counter/Documentation/perf-top.txt
@@ -0,0 +1,61 @@
1perf-top(1)
2==========
3
4NAME
5----
6perf-top - Run a command and profile it
7
8SYNOPSIS
9--------
10[verse]
11'perf top' [-e <EVENT> | --event=EVENT] [-l] [-a] <command>
12
13DESCRIPTION
14-----------
15This command runs a command and gathers a performance counter profile
16from it.
17
18
19OPTIONS
20-------
21<command>...::
22 Any command you can specify in a shell.
23
24-e::
25--event=::
26 0:0: cpu-cycles
27 0:0: cycles
28 0:1: instructions
29 0:2: cache-references
30 0:3: cache-misses
31 0:4: branch-instructions
32 0:4: branches
33 0:5: branch-misses
34 0:6: bus-cycles
35 1:0: cpu-clock
36 1:1: task-clock
37 1:2: page-faults
38 1:2: faults
39 1:5: minor-faults
40 1:6: major-faults
41 1:3: context-switches
42 1:3: cs
43 1:4: cpu-migrations
44 1:4: migrations
45 rNNN: raw PMU events (eventsel+umask)
46
47-a::
48 system-wide collection
49
50-l::
51 scale counter values
52
53Configuration
54-------------
55
56EXAMPLES
57--------
58
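A hypothetical invocation (the workload name is illustrative): run a
command and watch its profile live:

$ perf top ./my-workload
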
59SEE ALSO
60--------
61linkperf:perf-stat[1]
diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
new file mode 100644
index 000000000000..481e4c26cd45
--- /dev/null
+++ b/Documentation/perf_counter/Makefile
@@ -0,0 +1,849 @@
1# The default target of this Makefile is...
2all::
3
4# Define V=1 to have a more verbose compile.
5#
6# Define SNPRINTF_RETURNS_BOGUS if you are on a system where snprintf()
7# or vsnprintf() return -1 instead of the number of characters which would
8# have been written to the final string if enough space had been available.
9#
10# Define FREAD_READS_DIRECTORIES if you are on a system which succeeds
11# when attempting to read from an fopen'ed directory.
12#
13# Define NO_OPENSSL environment variable if you do not have OpenSSL.
14# This also implies MOZILLA_SHA1.
15#
16# Define CURLDIR=/foo/bar if your curl header and library files are in
17# /foo/bar/include and /foo/bar/lib directories.
18#
19# Define EXPATDIR=/foo/bar if your expat header and library files are in
20# /foo/bar/include and /foo/bar/lib directories.
21#
22# Define NO_D_INO_IN_DIRENT if you don't have d_ino in your struct dirent.
23#
24# Define NO_D_TYPE_IN_DIRENT if your platform defines DT_UNKNOWN but lacks
25# d_type in struct dirent (latest Cygwin -- will be fixed soonish).
26#
27# Define NO_C99_FORMAT if your formatted IO functions (printf/scanf et.al.)
28# do not support the 'size specifiers' introduced by C99, namely ll, hh,
29# j, z, t. (representing long long int, char, intmax_t, size_t, ptrdiff_t).
30# Some C compilers supported these specifiers prior to C99 as an extension.
31#
32# Define NO_STRCASESTR if you don't have strcasestr.
33#
34# Define NO_MEMMEM if you don't have memmem.
35#
36# Define NO_STRTOUMAX if you don't have strtoumax in the C library.
37# If your compiler also does not support long long or does not have
38# strtoull, define NO_STRTOULL.
39#
40# Define NO_SETENV if you don't have setenv in the C library.
41#
42# Define NO_UNSETENV if you don't have unsetenv in the C library.
43#
44# Define NO_MKDTEMP if you don't have mkdtemp in the C library.
45#
46# Define NO_SYS_SELECT_H if you don't have sys/select.h.
47#
48# Define NO_SYMLINK_HEAD if you never want .perf/HEAD to be a symbolic link.
49# Enable it on Windows. By default, symrefs are still used.
50#
51# Define NO_SVN_TESTS if you want to skip time-consuming SVN interoperability
52# tests. These tests take up a significant amount of the total test time
53# but are not needed unless you plan to talk to SVN repos.
54#
55# Define NO_FINK if you are building on Darwin/Mac OS X, have Fink
56# installed in /sw, but don't want PERF to link against any libraries
57# installed there. If defined you may specify your own (or Fink's)
58# include directories and library directories by defining CFLAGS
59# and LDFLAGS appropriately.
60#
61# Define NO_DARWIN_PORTS if you are building on Darwin/Mac OS X,
62# have DarwinPorts installed in /opt/local, but don't want PERF to
63# link against any libraries installed there. If defined you may
64# specify your own (or DarwinPort's) include directories and
65# library directories by defining CFLAGS and LDFLAGS appropriately.
66#
67# Define PPC_SHA1 environment variable when running make to make use of
68# a bundled SHA1 routine optimized for PowerPC.
69#
70# Define ARM_SHA1 environment variable when running make to make use of
71# a bundled SHA1 routine optimized for ARM.
72#
73# Define MOZILLA_SHA1 environment variable when running make to make use of
74# a bundled SHA1 routine coming from Mozilla. It is GPL'd and should be fast
75# on non-x86 architectures (e.g. PowerPC), while the OpenSSL version (default
76# choice) has a very fast version optimized for i586.
77#
78# Define NEEDS_SSL_WITH_CRYPTO if you need -lcrypto with -lssl (Darwin).
79#
80# Define NEEDS_LIBICONV if linking with libc is not enough (Darwin).
81#
82# Define NEEDS_SOCKET if linking with libc is not enough (SunOS,
83# Patrick Mauritz).
84#
85# Define NO_MMAP if you want to avoid mmap.
86#
87# Define NO_PTHREADS if you do not have or do not want to use Pthreads.
88#
89# Define NO_PREAD if you have a problem with pread() system call (e.g.
90# cygwin.dll before v1.5.22).
91#
92# Define NO_FAST_WORKING_DIRECTORY if accessing objects in pack files is
93# generally faster on your platform than accessing the working directory.
94#
95# Define NO_TRUSTABLE_FILEMODE if your filesystem may claim to support
96# the executable mode bit, but doesn't really do so.
97#
98# Define NO_IPV6 if you lack IPv6 support and getaddrinfo().
99#
100# Define NO_SOCKADDR_STORAGE if your platform does not have struct
101# sockaddr_storage.
102#
103# Define NO_ICONV if your libc does not properly support iconv.
104#
105# Define OLD_ICONV if your library has an old iconv(), where the second
106# (input buffer pointer) parameter is declared with type (const char **).
107#
108# Define NO_DEFLATE_BOUND if your zlib does not have deflateBound.
109#
110# Define NO_R_TO_GCC_LINKER if your gcc does not like "-R/path/lib"
111# that tells runtime paths to dynamic libraries;
112# "-Wl,-rpath=/path/lib" is used instead.
113#
114# Define USE_NSEC below if you want perf to care about sub-second file mtimes
115# and ctimes. Note that you need recent glibc (at least 2.2.4) for this, and
116# it will BREAK YOUR LOCAL DIFFS! show-diff and anything using it will likely
117# randomly break unless your underlying filesystem supports those sub-second
118# times (my ext3 doesn't).
119#
120# Define USE_ST_TIMESPEC if your "struct stat" uses "st_ctimespec" instead of
121# "st_ctim"
122#
123# Define NO_NSEC if your "struct stat" does not have "st_ctim.tv_nsec"
124# available. This automatically turns USE_NSEC off.
125#
126# Define USE_STDEV below if you want perf to care about the underlying device
127# change being considered an inode change from the update-index perspective.
128#
129# Define NO_ST_BLOCKS_IN_STRUCT_STAT if your platform does not have st_blocks
130# field that counts the on-disk footprint in 512-byte blocks.
131#
132# Define ASCIIDOC8 if you want to format documentation with AsciiDoc 8
133#
134# Define DOCBOOK_XSL_172 if you want to format man pages with DocBook XSL v1.72.
135#
136# Define NO_PERL_MAKEMAKER if you cannot use Makefiles generated by perl's
137# MakeMaker (e.g. using ActiveState under Cygwin).
138#
139# Define NO_PERL if you do not want Perl scripts or libraries at all.
140#
141# Define INTERNAL_QSORT to use Git's implementation of qsort(), which
142# is a simplified version of the merge sort used in glibc. This is
143# recommended if Git triggers O(n^2) behavior in your platform's qsort().
144#
145# Define NO_EXTERNAL_GREP if you don't want "perf grep" to ever call
146# your external grep (e.g., if your system lacks grep, if its grep is
147# broken, or spawning external process is slower than built-in grep perf has).
148
149PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE
150 @$(SHELL_PATH) util/PERF-VERSION-GEN
151-include PERF-VERSION-FILE
152
153uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not')
154uname_M := $(shell sh -c 'uname -m 2>/dev/null || echo not')
155uname_O := $(shell sh -c 'uname -o 2>/dev/null || echo not')
156uname_R := $(shell sh -c 'uname -r 2>/dev/null || echo not')
157uname_P := $(shell sh -c 'uname -p 2>/dev/null || echo not')
158uname_V := $(shell sh -c 'uname -v 2>/dev/null || echo not')
159
160# CFLAGS and LDFLAGS are for the users to override from the command line.
161
162CFLAGS = -g -O2 -Wall
163LDFLAGS = -lpthread -lrt
164ALL_CFLAGS = $(CFLAGS)
165ALL_LDFLAGS = $(LDFLAGS)
166STRIP ?= strip
167
168# Among the variables below, these:
169# perfexecdir
170# template_dir
171# mandir
172# infodir
173# htmldir
174# ETC_PERFCONFIG (but not sysconfdir)
175# can be specified as a relative path some/where/else;
176# this is interpreted as relative to $(prefix) and "perf" at
177# runtime figures out where they are based on the path to the executable.
178# This can help installing the suite in a relocatable way.
179
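# For example (hypothetical paths), installing under /usr with a relative
# perfexecdir that is resolved against the prefix at run time:
#
#	make prefix=/usr perfexecdir=libexec/perf-core install
#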
180prefix = $(HOME)
181bindir_relative = bin
182bindir = $(prefix)/$(bindir_relative)
183mandir = share/man
184infodir = share/info
185perfexecdir = libexec/perf-core
186sharedir = $(prefix)/share
187template_dir = share/perf-core/templates
188htmldir = share/doc/perf-doc
189ifeq ($(prefix),/usr)
190sysconfdir = /etc
191ETC_PERFCONFIG = $(sysconfdir)/perfconfig
192else
193sysconfdir = $(prefix)/etc
194ETC_PERFCONFIG = etc/perfconfig
195endif
196lib = lib
197# DESTDIR=
198
199export prefix bindir sharedir sysconfdir
200
201CC = gcc
202AR = ar
203RM = rm -f
204TAR = tar
205FIND = find
206INSTALL = install
207RPMBUILD = rpmbuild
208PTHREAD_LIBS = -lpthread
209
210# sparse is architecture-neutral, which means that we need to tell it
211# explicitly what architecture to check for. Fix this up for yours..
212SPARSE_FLAGS = -D__BIG_ENDIAN__ -D__powerpc__
213
214
215
216### --- END CONFIGURATION SECTION ---
217
218# Those must not be GNU-specific; they are shared with perl/ which may
219# be built by a different compiler. (Note that this is an artifact now
220# but it still might be nice to keep that distinction.)
221BASIC_CFLAGS =
222BASIC_LDFLAGS =
223
224# Guard against environment variables
225BUILTIN_OBJS =
226BUILT_INS =
227COMPAT_CFLAGS =
228COMPAT_OBJS =
229LIB_H =
230LIB_OBJS =
231PROGRAMS = perf-report
232SCRIPT_PERL =
233SCRIPT_SH =
234TEST_PROGRAMS =
235
236#
237# No scripts right now:
238#
239
240# SCRIPT_SH += perf-am.sh
241
242#
243# No Perl scripts right now:
244#
245
246# SCRIPT_PERL += perf-add--interactive.perl
247
248SCRIPTS = $(patsubst %.sh,%,$(SCRIPT_SH)) \
249 $(patsubst %.perl,%,$(SCRIPT_PERL))
250
251# Empty...
252EXTRA_PROGRAMS =
253
254# ... and all the rest that could be moved out of bindir to perfexecdir
255PROGRAMS += $(EXTRA_PROGRAMS)
256
257#
258# Single 'perf' binary right now:
259#
260PROGRAMS += perf
261
262# List built-in command $C whose implementation cmd_$C() is not in
263# builtin-$C.o but is linked in as part of some other command.
264BUILT_INS += $(patsubst builtin-%.o,perf-%$X,$(BUILTIN_OBJS))
265
266#
267# None right now:
268#
269# BUILT_INS += perf-init $X
270
271# what 'all' will build and 'install' will install, in perfexecdir
272ALL_PROGRAMS = $(PROGRAMS) $(SCRIPTS)
273
274# what 'all' will build but not install in perfexecdir
275OTHER_PROGRAMS = perf$X
276
277# Set paths to tools early so that they can be used for version tests.
278ifndef SHELL_PATH
279 SHELL_PATH = /bin/sh
280endif
281ifndef PERL_PATH
282 PERL_PATH = /usr/bin/perl
283endif
284
285export PERL_PATH
286
287LIB_FILE=libperf.a
288
289LIB_H += ../../include/linux/perf_counter.h
290LIB_H += perf.h
291LIB_H += util/levenshtein.h
292LIB_H += util/parse-options.h
293LIB_H += util/quote.h
294LIB_H += util/util.h
295LIB_H += util/help.h
296LIB_H += util/strbuf.h
297LIB_H += util/run-command.h
298
299LIB_OBJS += util/abspath.o
300LIB_OBJS += util/alias.o
301LIB_OBJS += util/config.o
302LIB_OBJS += util/ctype.o
303LIB_OBJS += util/exec_cmd.o
304LIB_OBJS += util/help.o
305LIB_OBJS += util/levenshtein.o
306LIB_OBJS += util/parse-options.o
307LIB_OBJS += util/path.o
308LIB_OBJS += util/run-command.o
309LIB_OBJS += util/quote.o
310LIB_OBJS += util/strbuf.o
311LIB_OBJS += util/usage.o
312LIB_OBJS += util/wrapper.o
313
314BUILTIN_OBJS += builtin-help.o
315BUILTIN_OBJS += builtin-record.o
316BUILTIN_OBJS += builtin-stat.o
317BUILTIN_OBJS += builtin-top.o
318
319PERFLIBS = $(LIB_FILE)
320EXTLIBS =
321
322#
323# Platform specific tweaks
324#
325
326# We choose to avoid "if .. else if .. else .. endif endif"
327# because maintaining the nesting to match is a pain. If
328# we had "elif" things would have been much nicer...
329
330-include config.mak.autogen
331-include config.mak
332
333ifeq ($(uname_S),Darwin)
334 ifndef NO_FINK
335 ifeq ($(shell test -d /sw/lib && echo y),y)
336 BASIC_CFLAGS += -I/sw/include
337 BASIC_LDFLAGS += -L/sw/lib
338 endif
339 endif
340 ifndef NO_DARWIN_PORTS
341 ifeq ($(shell test -d /opt/local/lib && echo y),y)
342 BASIC_CFLAGS += -I/opt/local/include
343 BASIC_LDFLAGS += -L/opt/local/lib
344 endif
345 endif
346 PTHREAD_LIBS =
347endif
348
349ifndef CC_LD_DYNPATH
350 ifdef NO_R_TO_GCC_LINKER
351 # Some gcc does not accept and pass -R to the linker to specify
352 # the runtime dynamic library path.
353 CC_LD_DYNPATH = -Wl,-rpath,
354 else
355 CC_LD_DYNPATH = -R
356 endif
357endif
358
359ifdef ZLIB_PATH
360 BASIC_CFLAGS += -I$(ZLIB_PATH)/include
361 EXTLIBS += -L$(ZLIB_PATH)/$(lib) $(CC_LD_DYNPATH)$(ZLIB_PATH)/$(lib)
362endif
363EXTLIBS += -lz
364
365ifdef NEEDS_SOCKET
366 EXTLIBS += -lsocket
367endif
368ifdef NEEDS_NSL
369 EXTLIBS += -lnsl
370endif
371ifdef NO_D_TYPE_IN_DIRENT
372 BASIC_CFLAGS += -DNO_D_TYPE_IN_DIRENT
373endif
374ifdef NO_D_INO_IN_DIRENT
375 BASIC_CFLAGS += -DNO_D_INO_IN_DIRENT
376endif
377ifdef NO_ST_BLOCKS_IN_STRUCT_STAT
378 BASIC_CFLAGS += -DNO_ST_BLOCKS_IN_STRUCT_STAT
379endif
380ifdef USE_NSEC
381 BASIC_CFLAGS += -DUSE_NSEC
382endif
383ifdef USE_ST_TIMESPEC
384 BASIC_CFLAGS += -DUSE_ST_TIMESPEC
385endif
386ifdef NO_NSEC
387 BASIC_CFLAGS += -DNO_NSEC
388endif
389ifdef NO_C99_FORMAT
390 BASIC_CFLAGS += -DNO_C99_FORMAT
391endif
392ifdef SNPRINTF_RETURNS_BOGUS
393 COMPAT_CFLAGS += -DSNPRINTF_RETURNS_BOGUS
394 COMPAT_OBJS += compat/snprintf.o
395endif
396ifdef FREAD_READS_DIRECTORIES
397 COMPAT_CFLAGS += -DFREAD_READS_DIRECTORIES
398 COMPAT_OBJS += compat/fopen.o
399endif
400ifdef NO_SYMLINK_HEAD
401 BASIC_CFLAGS += -DNO_SYMLINK_HEAD
402endif
403ifdef NO_STRCASESTR
404 COMPAT_CFLAGS += -DNO_STRCASESTR
405 COMPAT_OBJS += compat/strcasestr.o
406endif
407ifdef NO_STRTOUMAX
408 COMPAT_CFLAGS += -DNO_STRTOUMAX
409 COMPAT_OBJS += compat/strtoumax.o
410endif
411ifdef NO_STRTOULL
412 COMPAT_CFLAGS += -DNO_STRTOULL
413endif
414ifdef NO_SETENV
415 COMPAT_CFLAGS += -DNO_SETENV
416 COMPAT_OBJS += compat/setenv.o
417endif
418ifdef NO_MKDTEMP
419 COMPAT_CFLAGS += -DNO_MKDTEMP
420 COMPAT_OBJS += compat/mkdtemp.o
421endif
422ifdef NO_UNSETENV
423 COMPAT_CFLAGS += -DNO_UNSETENV
424 COMPAT_OBJS += compat/unsetenv.o
425endif
426ifdef NO_SYS_SELECT_H
427 BASIC_CFLAGS += -DNO_SYS_SELECT_H
428endif
429ifdef NO_MMAP
430 COMPAT_CFLAGS += -DNO_MMAP
431 COMPAT_OBJS += compat/mmap.o
432else
433 ifdef USE_WIN32_MMAP
434 COMPAT_CFLAGS += -DUSE_WIN32_MMAP
435 COMPAT_OBJS += compat/win32mmap.o
436 endif
437endif
438ifdef NO_PREAD
439 COMPAT_CFLAGS += -DNO_PREAD
440 COMPAT_OBJS += compat/pread.o
441endif
442ifdef NO_FAST_WORKING_DIRECTORY
443 BASIC_CFLAGS += -DNO_FAST_WORKING_DIRECTORY
444endif
445ifdef NO_TRUSTABLE_FILEMODE
446 BASIC_CFLAGS += -DNO_TRUSTABLE_FILEMODE
447endif
448ifdef NO_IPV6
449 BASIC_CFLAGS += -DNO_IPV6
450endif
451ifdef NO_UINTMAX_T
452 BASIC_CFLAGS += -Duintmax_t=uint32_t
453endif
454ifdef NO_SOCKADDR_STORAGE
455ifdef NO_IPV6
456 BASIC_CFLAGS += -Dsockaddr_storage=sockaddr_in
457else
458 BASIC_CFLAGS += -Dsockaddr_storage=sockaddr_in6
459endif
460endif
461ifdef NO_INET_NTOP
462 LIB_OBJS += compat/inet_ntop.o
463endif
464ifdef NO_INET_PTON
465 LIB_OBJS += compat/inet_pton.o
466endif
467
468ifdef NO_ICONV
469 BASIC_CFLAGS += -DNO_ICONV
470endif
471
472ifdef OLD_ICONV
473 BASIC_CFLAGS += -DOLD_ICONV
474endif
475
476ifdef NO_DEFLATE_BOUND
477 BASIC_CFLAGS += -DNO_DEFLATE_BOUND
478endif
479
480ifdef PPC_SHA1
481 SHA1_HEADER = "ppc/sha1.h"
482 LIB_OBJS += ppc/sha1.o ppc/sha1ppc.o
483else
484ifdef ARM_SHA1
485 SHA1_HEADER = "arm/sha1.h"
486 LIB_OBJS += arm/sha1.o arm/sha1_arm.o
487else
488ifdef MOZILLA_SHA1
489 SHA1_HEADER = "mozilla-sha1/sha1.h"
490 LIB_OBJS += mozilla-sha1/sha1.o
491else
492 SHA1_HEADER = <openssl/sha.h>
493 EXTLIBS += $(LIB_4_CRYPTO)
494endif
495endif
496endif
497ifdef NO_PERL_MAKEMAKER
498 export NO_PERL_MAKEMAKER
499endif
500ifdef NO_HSTRERROR
501 COMPAT_CFLAGS += -DNO_HSTRERROR
502 COMPAT_OBJS += compat/hstrerror.o
503endif
504ifdef NO_MEMMEM
505 COMPAT_CFLAGS += -DNO_MEMMEM
506 COMPAT_OBJS += compat/memmem.o
507endif
508ifdef INTERNAL_QSORT
509 COMPAT_CFLAGS += -DINTERNAL_QSORT
510 COMPAT_OBJS += compat/qsort.o
511endif
512ifdef RUNTIME_PREFIX
513 COMPAT_CFLAGS += -DRUNTIME_PREFIX
514endif
515
516ifdef DIR_HAS_BSD_GROUP_SEMANTICS
517 COMPAT_CFLAGS += -DDIR_HAS_BSD_GROUP_SEMANTICS
518endif
519ifdef NO_EXTERNAL_GREP
520 BASIC_CFLAGS += -DNO_EXTERNAL_GREP
521endif
522
523ifeq ($(PERL_PATH),)
524NO_PERL=NoThanks
525endif
526
527QUIET_SUBDIR0 = +$(MAKE) -C # space to separate -C and subdir
528QUIET_SUBDIR1 =
529
530ifneq ($(findstring $(MAKEFLAGS),w),w)
531PRINT_DIR = --no-print-directory
532else # "make -w"
533NO_SUBDIR = :
534endif
535
536ifneq ($(findstring $(MAKEFLAGS),s),s)
537ifndef V
538 QUIET_CC = @echo ' ' CC $@;
539 QUIET_AR = @echo ' ' AR $@;
540 QUIET_LINK = @echo ' ' LINK $@;
541 QUIET_BUILT_IN = @echo ' ' BUILTIN $@;
542 QUIET_GEN = @echo ' ' GEN $@;
543 QUIET_SUBDIR0 = +@subdir=
544 QUIET_SUBDIR1 = ;$(NO_SUBDIR) echo ' ' SUBDIR $$subdir; \
545 $(MAKE) $(PRINT_DIR) -C $$subdir
546 export V
547 export QUIET_GEN
548 export QUIET_BUILT_IN
549endif
550endif
551
552ifdef ASCIIDOC8
553 export ASCIIDOC8
554endif
555
556# Shell quote (do not use $(call) to accommodate ancient setups);
557
558SHA1_HEADER_SQ = $(subst ','\'',$(SHA1_HEADER))
559ETC_PERFCONFIG_SQ = $(subst ','\'',$(ETC_PERFCONFIG))
560
561DESTDIR_SQ = $(subst ','\'',$(DESTDIR))
562bindir_SQ = $(subst ','\'',$(bindir))
563bindir_relative_SQ = $(subst ','\'',$(bindir_relative))
564mandir_SQ = $(subst ','\'',$(mandir))
565infodir_SQ = $(subst ','\'',$(infodir))
566perfexecdir_SQ = $(subst ','\'',$(perfexecdir))
567template_dir_SQ = $(subst ','\'',$(template_dir))
568htmldir_SQ = $(subst ','\'',$(htmldir))
569prefix_SQ = $(subst ','\'',$(prefix))
570
571SHELL_PATH_SQ = $(subst ','\'',$(SHELL_PATH))
572PERL_PATH_SQ = $(subst ','\'',$(PERL_PATH))
573
574LIBS = $(PERFLIBS) $(EXTLIBS)
575
576BASIC_CFLAGS += -DSHA1_HEADER='$(SHA1_HEADER_SQ)' \
577 $(COMPAT_CFLAGS)
578LIB_OBJS += $(COMPAT_OBJS)
579
580ALL_CFLAGS += $(BASIC_CFLAGS)
581ALL_LDFLAGS += $(BASIC_LDFLAGS)
582
583export TAR INSTALL DESTDIR SHELL_PATH
584
585
586### Build rules
587
588SHELL = $(SHELL_PATH)
589
590all:: shell_compatibility_test $(ALL_PROGRAMS) $(BUILT_INS) $(OTHER_PROGRAMS) PERF-BUILD-OPTIONS
591ifneq (,$X)
592 $(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) perf$X)), test '$p' -ef '$p$X' || $(RM) '$p';)
593endif
594
595all::
596
597please_set_SHELL_PATH_to_a_more_modern_shell:
598 @$$(:)
599
600shell_compatibility_test: please_set_SHELL_PATH_to_a_more_modern_shell
601
602strip: $(PROGRAMS) perf$X
603 $(STRIP) $(STRIP_OPTS) $(PROGRAMS) perf$X
604
605perf.o: perf.c common-cmds.h PERF-CFLAGS
606 $(QUIET_CC)$(CC) -DPERF_VERSION='"$(PERF_VERSION)"' \
607 '-DPERF_HTML_PATH="$(htmldir_SQ)"' \
608 $(ALL_CFLAGS) -c $(filter %.c,$^)
609
610perf$X: perf.o $(BUILTIN_OBJS) $(PERFLIBS)
611 $(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ perf.o \
612 $(BUILTIN_OBJS) $(ALL_LDFLAGS) $(LIBS)
613
614builtin-help.o: builtin-help.c common-cmds.h PERF-CFLAGS
615 $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) \
616 '-DPERF_HTML_PATH="$(htmldir_SQ)"' \
617 '-DPERF_MAN_PATH="$(mandir_SQ)"' \
618 '-DPERF_INFO_PATH="$(infodir_SQ)"' $<
619
620$(BUILT_INS): perf$X
621 $(QUIET_BUILT_IN)$(RM) $@ && \
622 ln perf$X $@ 2>/dev/null || \
623 ln -s perf$X $@ 2>/dev/null || \
624 cp perf$X $@
625
626common-cmds.h: util/generate-cmdlist.sh command-list.txt
627
628common-cmds.h: $(wildcard Documentation/perf-*.txt)
629 $(QUIET_GEN)util/generate-cmdlist.sh > $@+ && mv $@+ $@
630
631$(patsubst %.sh,%,$(SCRIPT_SH)) : % : %.sh
632 $(QUIET_GEN)$(RM) $@ $@+ && \
633 sed -e '1s|#!.*/sh|#!$(SHELL_PATH_SQ)|' \
634 -e 's|@SHELL_PATH@|$(SHELL_PATH_SQ)|' \
635 -e 's|@@PERL@@|$(PERL_PATH_SQ)|g' \
636 -e 's/@@PERF_VERSION@@/$(PERF_VERSION)/g' \
637 -e 's/@@NO_CURL@@/$(NO_CURL)/g' \
638 $@.sh >$@+ && \
639 chmod +x $@+ && \
640 mv $@+ $@
641
642configure: configure.ac
643 $(QUIET_GEN)$(RM) $@ $<+ && \
644 sed -e 's/@@PERF_VERSION@@/$(PERF_VERSION)/g' \
645 $< > $<+ && \
646 autoconf -o $@ $<+ && \
647 $(RM) $<+
648
649# These can record PERF_VERSION
650perf.o perf.spec \
651 $(patsubst %.sh,%,$(SCRIPT_SH)) \
652 $(patsubst %.perl,%,$(SCRIPT_PERL)) \
653 : PERF-VERSION-FILE
654
655%.o: %.c PERF-CFLAGS
656 $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) $<
657%.s: %.c PERF-CFLAGS
658 $(QUIET_CC)$(CC) -S $(ALL_CFLAGS) $<
659%.o: %.S
660 $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) $<
661
662util/exec_cmd.o: util/exec_cmd.c PERF-CFLAGS
663 $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) \
664 '-DPERF_EXEC_PATH="$(perfexecdir_SQ)"' \
665 '-DBINDIR="$(bindir_relative_SQ)"' \
666 '-DPREFIX="$(prefix_SQ)"' \
667 $<
668
669builtin-init-db.o: builtin-init-db.c PERF-CFLAGS
670 $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DDEFAULT_PERF_TEMPLATE_DIR='"$(template_dir_SQ)"' $<
671
672util/config.o: util/config.c PERF-CFLAGS
673 $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
674
675perf-%$X: %.o $(PERFLIBS)
676 $(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) $(LIBS)
677
678$(LIB_OBJS) $(BUILTIN_OBJS): $(LIB_H)
679$(patsubst perf-%$X,%.o,$(PROGRAMS)): $(LIB_H) $(wildcard */*.h)
680builtin-revert.o wt-status.o: wt-status.h
681
682$(LIB_FILE): $(LIB_OBJS)
683 $(QUIET_AR)$(RM) $@ && $(AR) rcs $@ $(LIB_OBJS)
684
685TAGS:
686 $(RM) TAGS
687 $(FIND) . -name '*.[hcS]' -print | xargs etags -a
688
689tags:
690 $(RM) tags
691 $(FIND) . -name '*.[hcS]' -print | xargs ctags -a
692
693cscope:
694 $(RM) cscope*
695 $(FIND) . -name '*.[hcS]' -print | xargs cscope -b
696
697### Detect prefix changes
698TRACK_CFLAGS = $(subst ','\'',$(ALL_CFLAGS)):\
699 $(bindir_SQ):$(perfexecdir_SQ):$(template_dir_SQ):$(prefix_SQ)
700
701PERF-CFLAGS: .FORCE-PERF-CFLAGS
702 @FLAGS='$(TRACK_CFLAGS)'; \
703 if test x"$$FLAGS" != x"`cat PERF-CFLAGS 2>/dev/null`" ; then \
704 echo 1>&2 " * new build flags or prefix"; \
705 echo "$$FLAGS" >PERF-CFLAGS; \
706 fi
707
708# We need to apply sq twice, once to protect from the shell
709# that runs PERF-BUILD-OPTIONS, and then again to protect it
710# and the first level quoting from the shell that runs "echo".
711PERF-BUILD-OPTIONS: .FORCE-PERF-BUILD-OPTIONS
712 @echo SHELL_PATH=\''$(subst ','\'',$(SHELL_PATH_SQ))'\' >$@
713 @echo TAR=\''$(subst ','\'',$(subst ','\'',$(TAR)))'\' >>$@
714 @echo NO_CURL=\''$(subst ','\'',$(subst ','\'',$(NO_CURL)))'\' >>$@
715 @echo NO_PERL=\''$(subst ','\'',$(subst ','\'',$(NO_PERL)))'\' >>$@
716
717### Testing rules
718
719#
720# None right now:
721#
722# TEST_PROGRAMS += test-something$X
723
724all:: $(TEST_PROGRAMS)
725
726# GNU make supports exporting all variables by "export" without parameters.
727# However, the environment gets quite big, and some programs have problems
728# with that.
729
730export NO_SVN_TESTS
731
732check: common-cmds.h
733 if sparse; \
734 then \
735 for i in *.c */*.c; \
736 do \
737 sparse $(ALL_CFLAGS) $(SPARSE_FLAGS) $$i || exit; \
738 done; \
739 else \
740 echo 2>&1 "Did you mean 'make test'?"; \
741 exit 1; \
742 fi
743
744remove-dashes:
745 ./fixup-builtins $(BUILT_INS) $(PROGRAMS) $(SCRIPTS)
746
747### Installation rules
748
749ifneq ($(filter /%,$(firstword $(template_dir))),)
750template_instdir = $(template_dir)
751else
752template_instdir = $(prefix)/$(template_dir)
753endif
754export template_instdir
755
756ifneq ($(filter /%,$(firstword $(perfexecdir))),)
757perfexec_instdir = $(perfexecdir)
758else
759perfexec_instdir = $(prefix)/$(perfexecdir)
760endif
761perfexec_instdir_SQ = $(subst ','\'',$(perfexec_instdir))
762export perfexec_instdir
763
764install: all
765 $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(bindir_SQ)'
766 $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
767 $(INSTALL) $(ALL_PROGRAMS) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
768ifneq (,$X)
769 $(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) perf$X)), $(RM) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/$p';)
770endif
771
772### Maintainer's dist rules
773
774perf.spec: perf.spec.in
775 sed -e 's/@@VERSION@@/$(PERF_VERSION)/g' < $< > $@+
776 mv $@+ $@
777
778PERF_TARNAME=perf-$(PERF_VERSION)
779dist: perf.spec perf-archive$(X) configure
780 ./perf-archive --format=tar \
781 --prefix=$(PERF_TARNAME)/ HEAD^{tree} > $(PERF_TARNAME).tar
782 @mkdir -p $(PERF_TARNAME)
783 @cp perf.spec configure $(PERF_TARNAME)
784 @echo $(PERF_VERSION) > $(PERF_TARNAME)/version
785 $(TAR) rf $(PERF_TARNAME).tar \
786 $(PERF_TARNAME)/perf.spec \
787 $(PERF_TARNAME)/configure \
788 $(PERF_TARNAME)/version
789 @$(RM) -r $(PERF_TARNAME)
790 gzip -f -9 $(PERF_TARNAME).tar
791
792rpm: dist
793 $(RPMBUILD) -ta $(PERF_TARNAME).tar.gz
794
795### Cleaning rules
796
797distclean: clean
798 $(RM) configure
799
800clean:
801 $(RM) *.o */*.o $(LIB_FILE)
802 $(RM) $(ALL_PROGRAMS) $(BUILT_INS) perf$X
803 $(RM) $(TEST_PROGRAMS)
804 $(RM) *.spec *.pyc *.pyo */*.pyc */*.pyo common-cmds.h TAGS tags cscope*
805 $(RM) -r autom4te.cache
806 $(RM) config.log config.mak.autogen config.mak.append config.status config.cache
807 $(RM) -r $(PERF_TARNAME) .doc-tmp-dir
808 $(RM) $(PERF_TARNAME).tar.gz perf-core_$(PERF_VERSION)-*.tar.gz
809 $(RM) $(htmldocs).tar.gz $(manpages).tar.gz
810 $(RM) PERF-VERSION-FILE PERF-CFLAGS PERF-BUILD-OPTIONS
811
812# temporary hack:
813perf-report: perf-report.cc ../../include/linux/perf_counter.h Makefile
814 g++ -g -O2 -Wall -lrt -o $@ $<
815
816.PHONY: all install clean strip
817.PHONY: shell_compatibility_test please_set_SHELL_PATH_to_a_more_modern_shell
818.PHONY: .FORCE-PERF-VERSION-FILE TAGS tags cscope .FORCE-PERF-CFLAGS
819.PHONY: .FORCE-PERF-BUILD-OPTIONS
820
821### Make sure built-ins do not have dups and are listed in perf.c
822#
823check-builtins::
824 ./check-builtins.sh
825
826### Test suite coverage testing
827#
828.PHONY: coverage coverage-clean coverage-build coverage-report
829
830coverage:
831 $(MAKE) coverage-build
832 $(MAKE) coverage-report
833
834coverage-clean:
835 rm -f *.gcda *.gcno
836
837COVERAGE_CFLAGS = $(CFLAGS) -O0 -ftest-coverage -fprofile-arcs
838COVERAGE_LDFLAGS = $(CFLAGS) -O0 -lgcov
839
840coverage-build: coverage-clean
841 $(MAKE) CFLAGS="$(COVERAGE_CFLAGS)" LDFLAGS="$(COVERAGE_LDFLAGS)" all
842 $(MAKE) CFLAGS="$(COVERAGE_CFLAGS)" LDFLAGS="$(COVERAGE_LDFLAGS)" \
843 -j1 test
844
845coverage-report:
846 gcov -b *.c */*.c
847 grep '^function.*called 0 ' *.c.gcov */*.c.gcov \
848 | sed -e 's/\([^:]*\)\.gcov: *function \([^ ]*\) called.*/\1: \2/' \
849 | tee coverage-untested-functions
diff --git a/Documentation/perf_counter/builtin-help.c b/Documentation/perf_counter/builtin-help.c
new file mode 100644
index 000000000000..6616de0ef053
--- /dev/null
+++ b/Documentation/perf_counter/builtin-help.c
@@ -0,0 +1,461 @@
1/*
2 * builtin-help.c
3 *
4 * Builtin help command
5 */
6#include "util/cache.h"
7#include "builtin.h"
8#include "util/exec_cmd.h"
9#include "common-cmds.h"
10#include "util/parse-options.h"
11#include "util/run-command.h"
12#include "util/help.h"
13
14static struct man_viewer_list {
15 struct man_viewer_list *next;
16 char name[FLEX_ARRAY];
17} *man_viewer_list;
18
19static struct man_viewer_info_list {
20 struct man_viewer_info_list *next;
21 const char *info;
22 char name[FLEX_ARRAY];
23} *man_viewer_info_list;
24
25enum help_format {
26 HELP_FORMAT_MAN,
27 HELP_FORMAT_INFO,
28 HELP_FORMAT_WEB,
29};
30
31static int show_all = 0;
32static enum help_format help_format = HELP_FORMAT_MAN;
33static struct option builtin_help_options[] = {
34 OPT_BOOLEAN('a', "all", &show_all, "print all available commands"),
35 OPT_SET_INT('m', "man", &help_format, "show man page", HELP_FORMAT_MAN),
36 OPT_SET_INT('w', "web", &help_format, "show manual in web browser",
37 HELP_FORMAT_WEB),
38 OPT_SET_INT('i', "info", &help_format, "show info page",
39 HELP_FORMAT_INFO),
40 OPT_END(),
41};
42
43static const char * const builtin_help_usage[] = {
44 "perf help [--all] [--man|--web|--info] [command]",
45 NULL
46};
47
48static enum help_format parse_help_format(const char *format)
49{
50 if (!strcmp(format, "man"))
51 return HELP_FORMAT_MAN;
52 if (!strcmp(format, "info"))
53 return HELP_FORMAT_INFO;
54 if (!strcmp(format, "web") || !strcmp(format, "html"))
55 return HELP_FORMAT_WEB;
56 die("unrecognized help format '%s'", format);
57}
58
59static const char *get_man_viewer_info(const char *name)
60{
61 struct man_viewer_info_list *viewer;
62
63 for (viewer = man_viewer_info_list; viewer; viewer = viewer->next)
64 {
65 if (!strcasecmp(name, viewer->name))
66 return viewer->info;
67 }
68 return NULL;
69}
70
71static int check_emacsclient_version(void)
72{
73 struct strbuf buffer = STRBUF_INIT;
74 struct child_process ec_process;
75 const char *argv_ec[] = { "emacsclient", "--version", NULL };
76 int version;
77
78 /* emacsclient prints its version number on stderr */
79 memset(&ec_process, 0, sizeof(ec_process));
80 ec_process.argv = argv_ec;
81 ec_process.err = -1;
82 ec_process.stdout_to_stderr = 1;
83 if (start_command(&ec_process)) {
84 fprintf(stderr, "Failed to start emacsclient.\n");
85 return -1;
86 }
87 strbuf_read(&buffer, ec_process.err, 20);
88 close(ec_process.err);
89
90 /*
91 * Don't bother checking return value, because "emacsclient --version"
92 * seems to always exit with code 1.
93 */
94 finish_command(&ec_process);
95
96 if (prefixcmp(buffer.buf, "emacsclient")) {
97 fprintf(stderr, "Failed to parse emacsclient version.\n");
98 strbuf_release(&buffer);
99 return -1;
100 }
101
102 strbuf_remove(&buffer, 0, strlen("emacsclient"));
103 version = atoi(buffer.buf);
104
105 if (version < 22) {
106 fprintf(stderr,
107 "emacsclient version '%d' too old (< 22).\n",
108 version);
109 strbuf_release(&buffer);
110 return -1;
111 }
112
113 strbuf_release(&buffer);
114 return 0;
115}
116
117static void exec_woman_emacs(const char* path, const char *page)
118{
119 if (!check_emacsclient_version()) {
120 /* This works only with emacsclient version >= 22. */
121 struct strbuf man_page = STRBUF_INIT;
122
123 if (!path)
124 path = "emacsclient";
125 strbuf_addf(&man_page, "(woman \"%s\")", page);
126 execlp(path, "emacsclient", "-e", man_page.buf, NULL);
127 warning("failed to exec '%s': %s", path, strerror(errno));
128 }
129}
130
131static void exec_man_konqueror(const char* path, const char *page)
132{
133 const char *display = getenv("DISPLAY");
134 if (display && *display) {
135 struct strbuf man_page = STRBUF_INIT;
136 const char *filename = "kfmclient";
137
138 /* It's simpler to launch konqueror using kfmclient. */
139 if (path) {
140 const char *file = strrchr(path, '/');
141 if (file && !strcmp(file + 1, "konqueror")) {
142 char *new = strdup(path);
143 char *dest = strrchr(new, '/');
144
145 /* strlen("konqueror") == strlen("kfmclient") */
146 strcpy(dest + 1, "kfmclient");
147 path = new;
148 }
149 if (file)
150 filename = file;
151 } else
152 path = "kfmclient";
153 strbuf_addf(&man_page, "man:%s(1)", page);
154 execlp(path, filename, "newTab", man_page.buf, NULL);
155 warning("failed to exec '%s': %s", path, strerror(errno));
156 }
157}
158
159static void exec_man_man(const char *path, const char *page)
160{
161 if (!path)
162 path = "man";
163 execlp(path, "man", page, NULL);
164 warning("failed to exec '%s': %s", path, strerror(errno));
165}
166
167static void exec_man_cmd(const char *cmd, const char *page)
168{
169 struct strbuf shell_cmd = STRBUF_INIT;
170 strbuf_addf(&shell_cmd, "%s %s", cmd, page);
171 execl("/bin/sh", "sh", "-c", shell_cmd.buf, NULL);
172 warning("failed to exec '%s': %s", cmd, strerror(errno));
173}
174
175static void add_man_viewer(const char *name)
176{
177 struct man_viewer_list **p = &man_viewer_list;
178 size_t len = strlen(name);
179
180 while (*p)
181 p = &((*p)->next);
182 *p = calloc(1, (sizeof(**p) + len + 1));
183 strncpy((*p)->name, name, len);
184}
185
186static int supported_man_viewer(const char *name, size_t len)
187{
188 return (!strncasecmp("man", name, len) ||
189 !strncasecmp("woman", name, len) ||
190 !strncasecmp("konqueror", name, len));
191}
192
193static void do_add_man_viewer_info(const char *name,
194 size_t len,
195 const char *value)
196{
197 struct man_viewer_info_list *new = calloc(1, sizeof(*new) + len + 1);
198
199 strncpy(new->name, name, len);
200 new->info = strdup(value);
201 new->next = man_viewer_info_list;
202 man_viewer_info_list = new;
203}
204
205static int add_man_viewer_path(const char *name,
206 size_t len,
207 const char *value)
208{
209 if (supported_man_viewer(name, len))
210 do_add_man_viewer_info(name, len, value);
211 else
212 warning("'%s': path for unsupported man viewer.\n"
213 "Please consider using 'man.<tool>.cmd' instead.",
214 name);
215
216 return 0;
217}
218
219static int add_man_viewer_cmd(const char *name,
220 size_t len,
221 const char *value)
222{
223 if (supported_man_viewer(name, len))
224 warning("'%s': cmd for supported man viewer.\n"
225 "Please consider using 'man.<tool>.path' instead.",
226 name);
227 else
228 do_add_man_viewer_info(name, len, value);
229
230 return 0;
231}
232
233static int add_man_viewer_info(const char *var, const char *value)
234{
235 const char *name = var + 4;
236 const char *subkey = strrchr(name, '.');
237
238 if (!subkey)
239 return error("Config with no key for man viewer: %s", name);
240
241 if (!strcmp(subkey, ".path")) {
242 if (!value)
243 return config_error_nonbool(var);
244 return add_man_viewer_path(name, subkey - name, value);
245 }
246 if (!strcmp(subkey, ".cmd")) {
247 if (!value)
248 return config_error_nonbool(var);
249 return add_man_viewer_cmd(name, subkey - name, value);
250 }
251
252 warning("'%s': unsupported man viewer sub key.", subkey);
253 return 0;
254}
255
256static int perf_help_config(const char *var, const char *value, void *cb)
257{
258 if (!strcmp(var, "help.format")) {
259 if (!value)
260 return config_error_nonbool(var);
261 help_format = parse_help_format(value);
262 return 0;
263 }
264 if (!strcmp(var, "man.viewer")) {
265 if (!value)
266 return config_error_nonbool(var);
267 add_man_viewer(value);
268 return 0;
269 }
270 if (!prefixcmp(var, "man."))
271 return add_man_viewer_info(var, value);
272
273 return perf_default_config(var, value, cb);
274}
275
276static struct cmdnames main_cmds, other_cmds;
277
278void list_common_cmds_help(void)
279{
280 int i, longest = 0;
281
282 for (i = 0; i < ARRAY_SIZE(common_cmds); i++) {
283 if (longest < strlen(common_cmds[i].name))
284 longest = strlen(common_cmds[i].name);
285 }
286
287 puts("The most commonly used perf commands are:");
288 for (i = 0; i < ARRAY_SIZE(common_cmds); i++) {
289 printf(" %s ", common_cmds[i].name);
290 mput_char(' ', longest - strlen(common_cmds[i].name));
291 puts(common_cmds[i].help);
292 }
293}
294
295static int is_perf_command(const char *s)
296{
297 return is_in_cmdlist(&main_cmds, s) ||
298 is_in_cmdlist(&other_cmds, s);
299}
300
301static const char *prepend(const char *prefix, const char *cmd)
302{
303 size_t pre_len = strlen(prefix);
304 size_t cmd_len = strlen(cmd);
305 char *p = malloc(pre_len + cmd_len + 1);
306 memcpy(p, prefix, pre_len);
307 strcpy(p + pre_len, cmd);
308 return p;
309}
310
311static const char *cmd_to_page(const char *perf_cmd)
312{
313 if (!perf_cmd)
314 return "perf";
315 else if (!prefixcmp(perf_cmd, "perf"))
316 return perf_cmd;
317 else if (is_perf_command(perf_cmd))
318 return prepend("perf-", perf_cmd);
319 else
320 return prepend("perf", perf_cmd);
321}
322
323static void setup_man_path(void)
324{
325 struct strbuf new_path = STRBUF_INIT;
326 const char *old_path = getenv("MANPATH");
327
328 /* We should always put ':' after our path. If there is no
329 * old_path, the ':' at the end will let 'man' try
330 * system-wide paths after ours to find the manual page. If
331 * there is old_path, we need ':' as delimiter. */
332 strbuf_addstr(&new_path, system_path(PERF_MAN_PATH));
333 strbuf_addch(&new_path, ':');
334 if (old_path)
335 strbuf_addstr(&new_path, old_path);
336
337 setenv("MANPATH", new_path.buf, 1);
338
339 strbuf_release(&new_path);
340}
341
342static void exec_viewer(const char *name, const char *page)
343{
344 const char *info = get_man_viewer_info(name);
345
346 if (!strcasecmp(name, "man"))
347 exec_man_man(info, page);
348 else if (!strcasecmp(name, "woman"))
349 exec_woman_emacs(info, page);
350 else if (!strcasecmp(name, "konqueror"))
351 exec_man_konqueror(info, page);
352 else if (info)
353 exec_man_cmd(info, page);
354 else
355 warning("'%s': unknown man viewer.", name);
356}
357
358static void show_man_page(const char *perf_cmd)
359{
360 struct man_viewer_list *viewer;
361 const char *page = cmd_to_page(perf_cmd);
362 const char *fallback = getenv("PERF_MAN_VIEWER");
363
364 setup_man_path();
365 for (viewer = man_viewer_list; viewer; viewer = viewer->next)
366 {
367 exec_viewer(viewer->name, page); /* will return when unable */
368 }
369 if (fallback)
370 exec_viewer(fallback, page);
371 exec_viewer("man", page);
372 die("no man viewer handled the request");
373}
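The fallback logic above leans on exec() semantics: exec_viewer() replaces the process when a viewer starts successfully and only returns when the exec failed, so trying the configured viewers, then $PERF_MAN_VIEWER, then plain "man" in sequence gives a fallback chain that ends in die(). A minimal standalone sketch of that pattern; the try_viewer() helper and the viewer list are illustrative, not part of the file above:

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Hypothetical helper: exec()s a viewer; returns only if the exec failed. */
static void try_viewer(const char *prog, const char *page)
{
	execlp(prog, prog, page, (char *)NULL);
	fprintf(stderr, "failed to exec '%s': %s\n", prog, strerror(errno));
}

int main(void)
{
	static const char *viewers[] = { "my-viewer", "man" };	/* illustrative list */
	unsigned int i;

	for (i = 0; i < sizeof(viewers) / sizeof(viewers[0]); i++)
		try_viewer(viewers[i], "perf");	/* returns only on failure */

	fprintf(stderr, "no man viewer handled the request\n");
	return 1;
}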
374
375static void show_info_page(const char *perf_cmd)
376{
377 const char *page = cmd_to_page(perf_cmd);
378 setenv("INFOPATH", system_path(PERF_INFO_PATH), 1);
379 execlp("info", "info", "perfman", page, NULL);
380}
381
382static void get_html_page_path(struct strbuf *page_path, const char *page)
383{
384 struct stat st;
385 const char *html_path = system_path(PERF_HTML_PATH);
386
387 /* Check that we have a perf documentation directory. */
388 if (stat(mkpath("%s/perf.html", html_path), &st)
389 || !S_ISREG(st.st_mode))
390 die("'%s': not a documentation directory.", html_path);
391
392 strbuf_init(page_path, 0);
393 strbuf_addf(page_path, "%s/%s.html", html_path, page);
394}
395
396/*
397 * If open_html is not defined in a platform-specific way (see for
398 * example compat/mingw.h), we use the script web--browse to display
399 * HTML.
400 */
401#ifndef open_html
402void open_html(const char *path)
403{
404 execl_perf_cmd("web--browse", "-c", "help.browser", path, NULL);
405}
406#endif
407
408static void show_html_page(const char *perf_cmd)
409{
410 const char *page = cmd_to_page(perf_cmd);
411 struct strbuf page_path; /* it leaks but we exec below */
412
413 get_html_page_path(&page_path, page);
414
415 open_html(page_path.buf);
416}
417
418int cmd_help(int argc, const char **argv, const char *prefix)
419{
420 const char *alias;
421 load_command_list("perf-", &main_cmds, &other_cmds);
422
423 perf_config(perf_help_config, NULL);
424
425 argc = parse_options(argc, argv, builtin_help_options,
426 builtin_help_usage, 0);
427
428 if (show_all) {
429 printf("usage: %s\n\n", perf_usage_string);
430 list_commands("perf commands", &main_cmds, &other_cmds);
431 printf("%s\n", perf_more_info_string);
432 return 0;
433 }
434
435 if (!argv[0]) {
436 printf("usage: %s\n\n", perf_usage_string);
437 list_common_cmds_help();
438 printf("\n%s\n", perf_more_info_string);
439 return 0;
440 }
441
442 alias = alias_lookup(argv[0]);
443 if (alias && !is_perf_command(argv[0])) {
444 printf("`perf %s' is aliased to `%s'\n", argv[0], alias);
445 return 0;
446 }
447
448 switch (help_format) {
449 case HELP_FORMAT_MAN:
450 show_man_page(argv[0]);
451 break;
452 case HELP_FORMAT_INFO:
453 show_info_page(argv[0]);
454 break;
455 case HELP_FORMAT_WEB:
456 show_html_page(argv[0]);
457 break;
458 }
459
460 return 0;
461}
diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
new file mode 100644
index 000000000000..5f5e6df0260d
--- /dev/null
+++ b/Documentation/perf_counter/builtin-record.c
@@ -0,0 +1,484 @@
1
2
3#include "util/util.h"
4
5#include <sys/types.h>
6#include <sys/stat.h>
7#include <sys/time.h>
8#include <unistd.h>
9#include <stdint.h>
10#include <stdlib.h>
11#include <string.h>
12#include <limits.h>
13#include <getopt.h>
14#include <assert.h>
15#include <fcntl.h>
16#include <stdio.h>
17#include <errno.h>
18#include <time.h>
19#include <sched.h>
20#include <pthread.h>
21
22#include <sys/syscall.h>
23#include <sys/ioctl.h>
24#include <sys/poll.h>
25#include <sys/prctl.h>
26#include <sys/wait.h>
27#include <sys/uio.h>
28#include <sys/mman.h>
29
30#include <linux/unistd.h>
31#include <linux/types.h>
32
33#include "../../include/linux/perf_counter.h"
34
35#include "perf.h"
36
37static int nr_counters = 0;
38static __u64 event_id[MAX_COUNTERS] = { };
39static int default_interval = 100000;
40static int event_count[MAX_COUNTERS];
41static int fd[MAX_NR_CPUS][MAX_COUNTERS];
42static int nr_cpus = 0;
43static unsigned int page_size;
44static unsigned int mmap_pages = 16;
45static int output;
46static char *output_name = "output.perf";
47static int group = 0;
48static unsigned int realtime_prio = 0;
49static int system_wide = 0;
50static int inherit = 1;
51static int nmi = 1;
52
53const unsigned int default_count[] = {
54 1000000,
55 1000000,
56 10000,
57 10000,
58 1000000,
59 10000,
60};
61
62struct event_symbol {
63 __u64 event;
64 char *symbol;
65};
66
67static struct event_symbol event_symbols[] = {
68 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cpu-cycles", },
69 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cycles", },
70 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), "instructions", },
71 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), "cache-references", },
72 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), "cache-misses", },
73 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branch-instructions", },
74 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branches", },
75 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES), "branch-misses", },
76 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES), "bus-cycles", },
77
78 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK), "cpu-clock", },
79 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), "task-clock", },
80 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "page-faults", },
81 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "faults", },
82 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN), "minor-faults", },
83 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ), "major-faults", },
84 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "context-switches", },
85 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "cs", },
86 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "cpu-migrations", },
87 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "migrations", },
88};
89
90/*
91 * Each event can have multiple symbolic names.
92 * Symbolic names are (almost) exactly matched.
93 */
94static __u64 match_event_symbols(char *str)
95{
96 __u64 config, id;
97 int type;
98 unsigned int i;
99
100 if (sscanf(str, "r%llx", &config) == 1)
101 return config | PERF_COUNTER_RAW_MASK;
102
103 if (sscanf(str, "%d:%llu", &type, &id) == 2)
104 return EID(type, id);
105
106 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
107 if (!strncmp(str, event_symbols[i].symbol,
108 strlen(event_symbols[i].symbol)))
109 return event_symbols[i].event;
110 }
111
112 return ~0ULL;
113}
114
115static int parse_events(char *str)
116{
117 __u64 config;
118
119again:
120 if (nr_counters == MAX_COUNTERS)
121 return -1;
122
123 config = match_event_symbols(str);
124 if (config == ~0ULL)
125 return -1;
126
127 event_id[nr_counters] = config;
128 nr_counters++;
129
130 str = strstr(str, ",");
131 if (str) {
132 str++;
133 goto again;
134 }
135
136 return 0;
137}
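As a usage sketch (the event string is illustrative): parse_events() walks a comma-separated list, mapping each token through match_event_symbols() — a raw "rNNN" hex spec, a "type:id" pair, or a prefix match against event_symbols[] — and appends the packed config to event_id[]. This fragment only shows the expected outcome; it is not part of the file:

	/* Illustrative fragment: how -e cycles,faults,r01ad would be parsed. */
	char spec[] = "cycles,faults,r01ad";

	if (!parse_events(spec)) {
		/*
		 * nr_counters == 3 and:
		 *   event_id[0] == EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES)
		 *   event_id[1] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS)
		 *   event_id[2] == 0x01ad | PERF_COUNTER_RAW_MASK
		 */
	}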
138
139#define __PERF_COUNTER_FIELD(config, name) \
140 ((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
141
142#define PERF_COUNTER_RAW(config) __PERF_COUNTER_FIELD(config, RAW)
143#define PERF_COUNTER_CONFIG(config) __PERF_COUNTER_FIELD(config, CONFIG)
144#define PERF_COUNTER_TYPE(config) __PERF_COUNTER_FIELD(config, TYPE)
145#define PERF_COUNTER_ID(config) __PERF_COUNTER_FIELD(config, EVENT)
146
147static void display_events_help(void)
148{
149 unsigned int i;
150 __u64 e;
151
152 printf(
153 " -e EVENT --event=EVENT # symbolic-name abbreviations");
154
155 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
156 int type, id;
157
158 e = event_symbols[i].event;
159 type = PERF_COUNTER_TYPE(e);
160 id = PERF_COUNTER_ID(e);
161
162 printf("\n %d:%d: %-20s",
163 type, id, event_symbols[i].symbol);
164 }
165
166 printf("\n"
167 " rNNN: raw PMU events (eventsel+umask)\n\n");
168}
169
170static void display_help(void)
171{
172 printf(
173 "Usage: perf-record [<options>] <cmd>\n"
174 "perf-record Options (up to %d event types can be specified at once):\n\n",
175 MAX_COUNTERS);
176
177 display_events_help();
178
179 printf(
180 " -c CNT --count=CNT # event period to sample\n"
181 " -m pages --mmap_pages=<pages> # number of mmap data pages\n"
182 " -o file --output=<file> # output file\n"
183 " -r prio --realtime=<prio> # use RT prio\n"
184 " -s --system # system wide profiling\n"
185 );
186
187 exit(0);
188}
189
190static void process_options(int argc, const char *argv[])
191{
192 int error = 0, counter;
193
194 for (;;) {
195 int option_index = 0;
196 /** Options for getopt */
197 static struct option long_options[] = {
198 {"count", required_argument, NULL, 'c'},
199 {"event", required_argument, NULL, 'e'},
200 {"mmap_pages", required_argument, NULL, 'm'},
201 {"output", required_argument, NULL, 'o'},
202 {"realtime", required_argument, NULL, 'r'},
203 {"system", no_argument, NULL, 's'},
204 {"inherit", no_argument, NULL, 'i'},
205 {"nmi", no_argument, NULL, 'n'},
206 {NULL, 0, NULL, 0 }
207 };
208 int c = getopt_long(argc, argv, "+:c:e:m:o:r:sin",
209 long_options, &option_index);
210 if (c == -1)
211 break;
212
213 switch (c) {
214 case 'c': default_interval = atoi(optarg); break;
215 case 'e': error = parse_events(optarg); break;
216 case 'm': mmap_pages = atoi(optarg); break;
217 case 'o': output_name = strdup(optarg); break;
218 case 'r': realtime_prio = atoi(optarg); break;
219 case 's': system_wide ^= 1; break;
220 case 'i': inherit ^= 1; break;
221 case 'n': nmi ^= 1; break;
222 default: error = 1; break;
223 }
224 }
225
226 if (argc - optind == 0)
227 error = 1;
228
229 if (error)
230 display_help();
231
232 if (!nr_counters) {
233 nr_counters = 1;
234 event_id[0] = 0;
235 }
236
237 for (counter = 0; counter < nr_counters; counter++) {
238 if (event_count[counter])
239 continue;
240
241 event_count[counter] = default_interval;
242 }
243}
244
245struct mmap_data {
246 int counter;
247 void *base;
248 unsigned int mask;
249 unsigned int prev;
250};
251
252static unsigned int mmap_read_head(struct mmap_data *md)
253{
254 struct perf_counter_mmap_page *pc = md->base;
255 int head;
256
257 head = pc->data_head;
258 rmb();
259
260 return head;
261}
262
263static long events;
264static struct timeval last_read, this_read;
265
266static void mmap_read(struct mmap_data *md)
267{
268 unsigned int head = mmap_read_head(md);
269 unsigned int old = md->prev;
270 unsigned char *data = md->base + page_size;
271 unsigned long size;
272 void *buf;
273 int diff;
274
275 gettimeofday(&this_read, NULL);
276
277 /*
278 * If we're further behind than half the buffer, there's a chance
279 * the writer will bite our tail and screw up the events under us.
280 *
281 * If we somehow ended up ahead of the head, we got messed up.
282 *
283 * In either case, truncate and restart at head.
284 */
285 diff = head - old;
286 if (diff > md->mask / 2 || diff < 0) {
287 struct timeval iv;
288 unsigned long msecs;
289
290 timersub(&this_read, &last_read, &iv);
291 msecs = iv.tv_sec*1000 + iv.tv_usec/1000;
292
293 fprintf(stderr, "WARNING: failed to keep up with mmap data."
294 " Last read %lu msecs ago.\n", msecs);
295
296 /*
297 * head points to a known good entry, start there.
298 */
299 old = head;
300 }
301
302 last_read = this_read;
303
304 if (old != head)
305 events++;
306
307 size = head - old;
308
309 if ((old & md->mask) + size != (head & md->mask)) {
310 buf = &data[old & md->mask];
311 size = md->mask + 1 - (old & md->mask);
312 old += size;
313 while (size) {
314 int ret = write(output, buf, size);
315 if (ret < 0) {
316 perror("failed to write");
317 exit(-1);
318 }
319 size -= ret;
320 buf += ret;
321 }
322 }
323
324 buf = &data[old & md->mask];
325 size = head - old;
326 old += size;
327 while (size) {
328 int ret = write(output, buf, size);
329 if (ret < 0) {
330 perror("failed to write");
331 exit(-1);
332 }
333 size -= ret;
334 buf += ret;
335 }
336
337 md->prev = old;
338}
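mmap_read() treats the kernel buffer as a power-of-two ring: data_head grows without wrapping, offsets are reduced with "& md->mask", and a region that crosses the end of the buffer is flushed as two writes. A small standalone sketch of the same masked, two-part copy follows; the buffer contents, sizes and the memcpy() sink are illustrative (the real code writes to the output file instead):

#include <stdio.h>
#include <string.h>

/* Copy the bytes between ring offsets 'old' and 'head' out of 'base'. */
static void ring_copy(const unsigned char *base, unsigned int mask,
		      unsigned int old, unsigned int head,
		      unsigned char *out)
{
	unsigned long size = head - old;

	/* Region wraps past the end of the buffer: flush the tail first... */
	if ((old & mask) + size != (head & mask)) {
		unsigned long chunk = mask + 1 - (old & mask);

		memcpy(out, base + (old & mask), chunk);
		out += chunk;
		old += chunk;
		size -= chunk;
	}
	/* ...then the rest from the start of the buffer. */
	memcpy(out, base + (old & mask), size);
}

int main(void)
{
	unsigned char buf[] = "ABCDEFGH";	/* 8 data bytes, mask = 7 */
	unsigned char out[9] = { 0 };

	/* old = 6, head = 10: bytes 6..7 are "GH", then "AB" wraps around. */
	ring_copy(buf, 7, 6, 10, out);
	printf("%s\n", out);			/* prints GHAB */
	return 0;
}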
339
340static volatile int done = 0;
341
342static void sig_handler(int sig)
343{
344 done = 1;
345}
346
347static struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
348static struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
349
350static int nr_poll;
351static int nr_cpu;
352
353static void open_counters(int cpu)
354{
355 struct perf_counter_hw_event hw_event;
356 int counter, group_fd;
357 int track = 1;
358 pid_t pid = -1;
359
360 if (cpu < 0)
361 pid = 0;
362
363 group_fd = -1;
364 for (counter = 0; counter < nr_counters; counter++) {
365
366 memset(&hw_event, 0, sizeof(hw_event));
367 hw_event.config = event_id[counter];
368 hw_event.irq_period = event_count[counter];
369 hw_event.record_type = PERF_RECORD_IP | PERF_RECORD_TID;
370 hw_event.nmi = nmi;
371 hw_event.mmap = track;
372 hw_event.comm = track;
373 hw_event.inherit = (cpu < 0) && inherit;
374
375 track = 0; // only the first counter needs these
376
377 fd[nr_cpu][counter] =
378 sys_perf_counter_open(&hw_event, pid, cpu, group_fd, 0);
379
380 if (fd[nr_cpu][counter] < 0) {
381 int err = errno;
382 printf("perf-record error: syscall returned with %d (%s)\n",
383 fd[nr_cpu][counter], strerror(err));
384 if (err == EPERM)
385 printf("Are you root?\n");
386 exit(-1);
387 }
388 assert(fd[nr_cpu][counter] >= 0);
389 fcntl(fd[nr_cpu][counter], F_SETFL, O_NONBLOCK);
390
391 /*
392 * First counter acts as the group leader:
393 */
394 if (group && group_fd == -1)
395 group_fd = fd[nr_cpu][counter];
396
397 event_array[nr_poll].fd = fd[nr_cpu][counter];
398 event_array[nr_poll].events = POLLIN;
399 nr_poll++;
400
401 mmap_array[nr_cpu][counter].counter = counter;
402 mmap_array[nr_cpu][counter].prev = 0;
403 mmap_array[nr_cpu][counter].mask = mmap_pages*page_size - 1;
404 mmap_array[nr_cpu][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
405 PROT_READ, MAP_SHARED, fd[nr_cpu][counter], 0);
406 if (mmap_array[nr_cpu][counter].base == MAP_FAILED) {
407 printf("perf-record error: failed to mmap with %d (%s)\n",
408 errno, strerror(errno));
409 exit(-1);
410 }
411 }
412 nr_cpu++;
413}
414
415int cmd_record(int argc, const char **argv)
416{
417 int i, counter;
418 pid_t pid;
419 int ret;
420
421 page_size = sysconf(_SC_PAGE_SIZE);
422
423 process_options(argc, argv);
424
425 nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
426 assert(nr_cpus <= MAX_NR_CPUS);
427 assert(nr_cpus >= 0);
428
429 output = open(output_name, O_CREAT|O_RDWR, S_IRWXU);
430 if (output < 0) {
431 perror("failed to create output file");
432 exit(-1);
433 }
434
435 argc -= optind;
436 argv += optind;
437
438 if (!system_wide)
439 open_counters(-1);
440 else for (i = 0; i < nr_cpus; i++)
441 open_counters(i);
442
443 signal(SIGCHLD, sig_handler);
444 signal(SIGINT, sig_handler);
445
446 pid = fork();
447 if (pid < 0)
448 perror("failed to fork");
449
450 if (!pid) {
451 if (execvp(argv[0], argv)) {
452 perror(argv[0]);
453 exit(-1);
454 }
455 }
456
457 if (realtime_prio) {
458 struct sched_param param;
459
460 param.sched_priority = realtime_prio;
461 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
462 printf("Could not set realtime priority.\n");
463 exit(-1);
464 }
465 }
466
467 /*
468 * TODO: store the current /proc/$/maps information somewhere
469 */
470
471 while (!done) {
472 int hits = events;
473
474 for (i = 0; i < nr_cpu; i++) {
475 for (counter = 0; counter < nr_counters; counter++)
476 mmap_read(&mmap_array[i][counter]);
477 }
478
479 if (hits == events)
480 ret = poll(event_array, nr_poll, 100);
481 }
482
483 return 0;
484}
diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
new file mode 100644
index 000000000000..e2fa117eab58
--- /dev/null
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -0,0 +1,554 @@
1/*
2 * kerneltop.c: show top kernel functions - performance counters showcase
3
4 Build with:
5
6 cc -O6 -Wall -c -o kerneltop.o kerneltop.c -lrt
7
8 Sample output:
9
10------------------------------------------------------------------------------
11 KernelTop: 2669 irqs/sec [NMI, cache-misses/cache-refs], (all, cpu: 2)
12------------------------------------------------------------------------------
13
14 weight RIP kernel function
15 ______ ________________ _______________
16
17 35.20 - ffffffff804ce74b : skb_copy_and_csum_dev
18 33.00 - ffffffff804cb740 : sock_alloc_send_skb
19 31.26 - ffffffff804ce808 : skb_push
20 22.43 - ffffffff80510004 : tcp_established_options
21 19.00 - ffffffff8027d250 : find_get_page
22 15.76 - ffffffff804e4fc9 : eth_type_trans
23 15.20 - ffffffff804d8baa : dst_release
24 14.86 - ffffffff804cf5d8 : skb_release_head_state
25 14.00 - ffffffff802217d5 : read_hpet
26 12.00 - ffffffff804ffb7f : __ip_local_out
27 11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
28 8.54 - ffffffff805001a3 : ip_queue_xmit
29 */
30
31/*
32 * perfstat: /usr/bin/time -alike performance counter statistics utility
33
34 It summarizes the counter events of all tasks (and child tasks),
35 covering all CPUs that the command (or workload) executes on.
36 It only counts the per-task events of the workload started,
37 independent of how many other tasks run on those CPUs.
38
39 Sample output:
40
41 $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
42
43 Performance counter stats for 'ls':
44
45 163516953 instructions
46 2295 cache-misses
47 2855182 branch-misses
48 */
49
50 /*
51 * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
52 *
53 * Improvements and fixes by:
54 *
55 * Arjan van de Ven <arjan@linux.intel.com>
56 * Yanmin Zhang <yanmin.zhang@intel.com>
57 * Wu Fengguang <fengguang.wu@intel.com>
58 * Mike Galbraith <efault@gmx.de>
59 * Paul Mackerras <paulus@samba.org>
60 *
61 * Released under the GPL v2. (and only v2, not any later version)
62 */
63
64#include "util/util.h"
65
66#include <getopt.h>
67#include <assert.h>
68#include <fcntl.h>
69#include <stdio.h>
70#include <errno.h>
71#include <time.h>
72#include <sched.h>
73#include <pthread.h>
74
75#include <sys/syscall.h>
76#include <sys/ioctl.h>
77#include <sys/poll.h>
78#include <sys/prctl.h>
79#include <sys/wait.h>
80#include <sys/uio.h>
81#include <sys/mman.h>
82
83#include <linux/unistd.h>
84#include <linux/types.h>
85
86#include "../../include/linux/perf_counter.h"
87
88#include "perf.h"
89
90#define EVENT_MASK_KERNEL 1
91#define EVENT_MASK_USER 2
92
93static int system_wide = 0;
94
95static int nr_counters = 0;
96static __u64 event_id[MAX_COUNTERS] = {
97 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
98 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),
99 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
100 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),
101
102 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),
103 EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),
104 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),
105 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),
106};
107static int default_interval = 100000;
108static int event_count[MAX_COUNTERS];
109static int fd[MAX_NR_CPUS][MAX_COUNTERS];
110static int event_mask[MAX_COUNTERS];
111
112static int tid = -1;
113static int profile_cpu = -1;
114static int nr_cpus = 0;
115static int nmi = 1;
116static int group = 0;
117static unsigned int page_size;
118
119static int zero;
120
121static int scale = 1;
122
123static const unsigned int default_count[] = {
124 1000000,
125 1000000,
126 10000,
127 10000,
128 1000000,
129 10000,
130};
131
132static char *hw_event_names[] = {
133 "CPU cycles",
134 "instructions",
135 "cache references",
136 "cache misses",
137 "branches",
138 "branch misses",
139 "bus cycles",
140};
141
142static char *sw_event_names[] = {
143 "cpu clock ticks",
144 "task clock ticks",
145 "pagefaults",
146 "context switches",
147 "CPU migrations",
148 "minor faults",
149 "major faults",
150};
151
152struct event_symbol {
153 __u64 event;
154 char *symbol;
155};
156
157static struct event_symbol event_symbols[] = {
158 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cpu-cycles", },
159 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cycles", },
160 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), "instructions", },
161 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), "cache-references", },
162 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), "cache-misses", },
163 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branch-instructions", },
164 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branches", },
165 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES), "branch-misses", },
166 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES), "bus-cycles", },
167
168 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK), "cpu-clock", },
169 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), "task-clock", },
170 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "page-faults", },
171 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "faults", },
172 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN), "minor-faults", },
173 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ), "major-faults", },
174 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "context-switches", },
175 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "cs", },
176 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "cpu-migrations", },
177 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "migrations", },
178};
179
180#define __PERF_COUNTER_FIELD(config, name) \
181 ((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
182
183#define PERF_COUNTER_RAW(config) __PERF_COUNTER_FIELD(config, RAW)
184#define PERF_COUNTER_CONFIG(config) __PERF_COUNTER_FIELD(config, CONFIG)
185#define PERF_COUNTER_TYPE(config) __PERF_COUNTER_FIELD(config, TYPE)
186#define PERF_COUNTER_ID(config) __PERF_COUNTER_FIELD(config, EVENT)
187
188static void display_events_help(void)
189{
190 unsigned int i;
191 __u64 e;
192
193 printf(
194 " -e EVENT --event=EVENT # symbolic-name abbreviations");
195
196 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
197 int type, id;
198
199 e = event_symbols[i].event;
200 type = PERF_COUNTER_TYPE(e);
201 id = PERF_COUNTER_ID(e);
202
203 printf("\n %d:%d: %-20s",
204 type, id, event_symbols[i].symbol);
205 }
206
207 printf("\n"
208 " rNNN: raw PMU events (eventsel+umask)\n\n");
209}
210
211static void display_help(void)
212{
213 printf(
214 "Usage: perfstat [<events...>] <cmd...>\n\n"
215 "PerfStat Options (up to %d event types can be specified):\n\n",
216 MAX_COUNTERS);
217
218 display_events_help();
219
220 printf(
221 " -l # scale counter values\n"
222 " -a # system-wide collection\n");
223 exit(0);
224}
225
226static char *event_name(int ctr)
227{
228 __u64 config = event_id[ctr];
229 int type = PERF_COUNTER_TYPE(config);
230 int id = PERF_COUNTER_ID(config);
231 static char buf[32];
232
233 if (PERF_COUNTER_RAW(config)) {
234 sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config));
235 return buf;
236 }
237
238 switch (type) {
239 case PERF_TYPE_HARDWARE:
240 if (id < PERF_HW_EVENTS_MAX)
241 return hw_event_names[id];
242 return "unknown-hardware";
243
244 case PERF_TYPE_SOFTWARE:
245 if (id < PERF_SW_EVENTS_MAX)
246 return sw_event_names[id];
247 return "unknown-software";
248
249 default:
250 break;
251 }
252
253 return "unknown";
254}
255
256/*
257 * Each event can have multiple symbolic names.
258 * Symbolic names are (almost) exactly matched.
259 */
260static __u64 match_event_symbols(char *str)
261{
262 __u64 config, id;
263 int type;
264 unsigned int i;
265 char mask_str[4];
266
267 if (sscanf(str, "r%llx", &config) == 1)
268 return config | PERF_COUNTER_RAW_MASK;
269
270 switch (sscanf(str, "%d:%llu:%2s", &type, &id, mask_str)) {
271 case 3:
272 if (strchr(mask_str, 'u'))
273 event_mask[nr_counters] |= EVENT_MASK_USER;
274 if (strchr(mask_str, 'k'))
275 event_mask[nr_counters] |= EVENT_MASK_KERNEL;
276 case 2:
277 return EID(type, id);
278
279 default:
280 break;
281 }
282
283 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
284 if (!strncmp(str, event_symbols[i].symbol,
285 strlen(event_symbols[i].symbol)))
286 return event_symbols[i].event;
287 }
288
289 return ~0ULL;
290}
291
292static int parse_events(char *str)
293{
294 __u64 config;
295
296again:
297 if (nr_counters == MAX_COUNTERS)
298 return -1;
299
300 config = match_event_symbols(str);
301 if (config == ~0ULL)
302 return -1;
303
304 event_id[nr_counters] = config;
305 nr_counters++;
306
307 str = strstr(str, ",");
308 if (str) {
309 str++;
310 goto again;
311 }
312
313 return 0;
314}
315
316
317/*
318 * perfstat
319 */
320
321char fault_here[1000000];
322
323static void create_perfstat_counter(int counter)
324{
325 struct perf_counter_hw_event hw_event;
326
327 memset(&hw_event, 0, sizeof(hw_event));
328 hw_event.config = event_id[counter];
329 hw_event.record_type = 0;
330 hw_event.nmi = 0;
331 hw_event.exclude_kernel = event_mask[counter] & EVENT_MASK_KERNEL;
332 hw_event.exclude_user = event_mask[counter] & EVENT_MASK_USER;
333
334 printf("exclude: %d\n", event_mask[counter]);
335
336 if (scale)
337 hw_event.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
338 PERF_FORMAT_TOTAL_TIME_RUNNING;
339
340 if (system_wide) {
341 int cpu;
342 for (cpu = 0; cpu < nr_cpus; cpu ++) {
343 fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0);
344 if (fd[cpu][counter] < 0) {
345 printf("perfstat error: syscall returned with %d (%s)\n",
346 fd[cpu][counter], strerror(errno));
347 exit(-1);
348 }
349 }
350 } else {
351 hw_event.inherit = 1;
352 hw_event.disabled = 1;
353
354 fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
355 if (fd[0][counter] < 0) {
356 printf("perfstat error: syscall returned with %d (%s)\n",
357 fd[0][counter], strerror(errno));
358 exit(-1);
359 }
360 }
361}
362
363int do_perfstat(int argc, char *argv[])
364{
365 unsigned long long t0, t1;
366 int counter;
367 ssize_t res;
368 int status;
369 int pid;
370
371 if (!system_wide)
372 nr_cpus = 1;
373
374 for (counter = 0; counter < nr_counters; counter++)
375 create_perfstat_counter(counter);
376
377 argc -= optind;
378 argv += optind;
379
380 if (!argc)
381 display_help();
382
383 /*
384 * Enable counters and exec the command:
385 */
386 t0 = rdclock();
387 prctl(PR_TASK_PERF_COUNTERS_ENABLE);
388
389 if ((pid = fork()) < 0)
390 perror("failed to fork");
391 if (!pid) {
392 if (execvp(argv[0], argv)) {
393 perror(argv[0]);
394 exit(-1);
395 }
396 }
397 while (wait(&status) >= 0)
398 ;
399 prctl(PR_TASK_PERF_COUNTERS_DISABLE);
400 t1 = rdclock();
401
402 fflush(stdout);
403
404 fprintf(stderr, "\n");
405 fprintf(stderr, " Performance counter stats for \'%s\':\n",
406 argv[0]);
407 fprintf(stderr, "\n");
408
409 for (counter = 0; counter < nr_counters; counter++) {
410 int cpu, nv;
411 __u64 count[3], single_count[3];
412 int scaled;
413
414 count[0] = count[1] = count[2] = 0;
415 nv = scale ? 3 : 1;
416 for (cpu = 0; cpu < nr_cpus; cpu ++) {
417 res = read(fd[cpu][counter],
418 single_count, nv * sizeof(__u64));
419 assert(res == nv * sizeof(__u64));
420
421 count[0] += single_count[0];
422 if (scale) {
423 count[1] += single_count[1];
424 count[2] += single_count[2];
425 }
426 }
427
428 scaled = 0;
429 if (scale) {
430 if (count[2] == 0) {
431 fprintf(stderr, " %14s %-20s\n",
432 "<not counted>", event_name(counter));
433 continue;
434 }
435 if (count[2] < count[1]) {
436 scaled = 1;
437 count[0] = (unsigned long long)
438 ((double)count[0] * count[1] / count[2] + 0.5);
439 }
440 }
441
442 if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK) ||
443 event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) {
444
445 double msecs = (double)count[0] / 1000000;
446
447 fprintf(stderr, " %14.6f %-20s (msecs)",
448 msecs, event_name(counter));
449 } else {
450 fprintf(stderr, " %14Ld %-20s (events)",
451 count[0], event_name(counter));
452 }
453 if (scaled)
454 fprintf(stderr, " (scaled from %.2f%%)",
455 (double) count[2] / count[1] * 100);
456 fprintf(stderr, "\n");
457 }
458 fprintf(stderr, "\n");
459 fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
460 (double)(t1-t0)/1e6);
461 fprintf(stderr, "\n");
462
463 return 0;
464}
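When scaling is enabled, each read() above returns three __u64 values per counter — the raw count, the time the counter was enabled, and the time it was actually running — and a multiplexed counter (running < enabled) is extrapolated by enabled/running. A worked example of that arithmetic with made-up numbers, standalone and not part of the file:

#include <stdio.h>

int main(void)
{
	unsigned long long count[3] = {
		1200000,	/* count[0]: raw event count   */
		5000000,	/* count[1]: time enabled (ns) */
		2000000,	/* count[2]: time running (ns) */
	};

	if (count[2] < count[1])	/* counter was multiplexed */
		count[0] = (unsigned long long)
			((double)count[0] * count[1] / count[2] + 0.5);

	printf("%llu (scaled from %.2f%%)\n",
	       count[0], (double)count[2] / count[1] * 100);
	/* prints: 3000000 (scaled from 40.00%) */
	return 0;
}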
465
466static void process_options(int argc, char **argv)
467{
468 int error = 0, counter;
469
470 for (;;) {
471 int option_index = 0;
472 /** Options for getopt */
473 static struct option long_options[] = {
474 {"count", required_argument, NULL, 'c'},
475 {"cpu", required_argument, NULL, 'C'},
476 {"delay", required_argument, NULL, 'd'},
477 {"dump_symtab", no_argument, NULL, 'D'},
478 {"event", required_argument, NULL, 'e'},
479 {"filter", required_argument, NULL, 'f'},
480 {"group", required_argument, NULL, 'g'},
481 {"help", no_argument, NULL, 'h'},
482 {"nmi", required_argument, NULL, 'n'},
483 {"munmap_info", no_argument, NULL, 'U'},
484 {"pid", required_argument, NULL, 'p'},
485 {"realtime", required_argument, NULL, 'r'},
486 {"scale", no_argument, NULL, 'l'},
487 {"symbol", required_argument, NULL, 's'},
488 {"stat", no_argument, NULL, 'S'},
489 {"vmlinux", required_argument, NULL, 'x'},
490 {"zero", no_argument, NULL, 'z'},
491 {NULL, 0, NULL, 0 }
492 };
493 int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:r:s:Sx:zMU",
494 long_options, &option_index);
495 if (c == -1)
496 break;
497
498 switch (c) {
499 case 'a': system_wide = 1; break;
500 case 'c': default_interval = atoi(optarg); break;
501 case 'C':
502 /* CPU and PID are mutually exclusive */
503 if (tid != -1) {
504 printf("WARNING: CPU switch overriding PID\n");
505 sleep(1);
506 tid = -1;
507 }
508 profile_cpu = atoi(optarg); break;
509
510 case 'e': error = parse_events(optarg); break;
511
512 case 'g': group = atoi(optarg); break;
513 case 'h': display_help(); break;
514 case 'l': scale = 1; break;
515 case 'n': nmi = atoi(optarg); break;
516 case 'p':
517 /* CPU and PID are mutually exclusive */
518 if (profile_cpu != -1) {
519 printf("WARNING: PID switch overriding CPU\n");
520 sleep(1);
521 profile_cpu = -1;
522 }
523 tid = atoi(optarg); break;
524 case 'z': zero = 1; break;
525 default: error = 1; break;
526 }
527 }
528 if (error)
529 display_help();
530
531 if (!nr_counters) {
532 nr_counters = 8;
533 }
534
535 for (counter = 0; counter < nr_counters; counter++) {
536 if (event_count[counter])
537 continue;
538
539 event_count[counter] = default_interval;
540 }
541}
542
543int cmd_stat(int argc, char **argv, const char *prefix)
544{
545 page_size = sysconf(_SC_PAGE_SIZE);
546
547 process_options(argc, argv);
548
549 nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
550 assert(nr_cpus <= MAX_NR_CPUS);
551 assert(nr_cpus >= 0);
552
553 return do_perfstat(argc, argv);
554}
diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
new file mode 100644
index 000000000000..cd6f61d73418
--- /dev/null
+++ b/Documentation/perf_counter/builtin-top.c
@@ -0,0 +1,1142 @@
1/*
2 * kerneltop.c: show top kernel functions - performance counters showcase
3
4 Build with:
5
6 make -C Documentation/perf_counter/
7
8 Sample output:
9
10------------------------------------------------------------------------------
11 KernelTop: 2669 irqs/sec [NMI, cache-misses/cache-refs], (all, cpu: 2)
12------------------------------------------------------------------------------
13
14 weight RIP kernel function
15 ______ ________________ _______________
16
17 35.20 - ffffffff804ce74b : skb_copy_and_csum_dev
18 33.00 - ffffffff804cb740 : sock_alloc_send_skb
19 31.26 - ffffffff804ce808 : skb_push
20 22.43 - ffffffff80510004 : tcp_established_options
21 19.00 - ffffffff8027d250 : find_get_page
22 15.76 - ffffffff804e4fc9 : eth_type_trans
23 15.20 - ffffffff804d8baa : dst_release
24 14.86 - ffffffff804cf5d8 : skb_release_head_state
25 14.00 - ffffffff802217d5 : read_hpet
26 12.00 - ffffffff804ffb7f : __ip_local_out
27 11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
28 8.54 - ffffffff805001a3 : ip_queue_xmit
29 */
30
31 /*
32 * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
33 *
34 * Improvements and fixes by:
35 *
36 * Arjan van de Ven <arjan@linux.intel.com>
37 * Yanmin Zhang <yanmin.zhang@intel.com>
38 * Wu Fengguang <fengguang.wu@intel.com>
39 * Mike Galbraith <efault@gmx.de>
40 * Paul Mackerras <paulus@samba.org>
41 *
42 * Released under the GPL v2. (and only v2, not any later version)
43 */
44
45#include "util/util.h"
46
47#include <getopt.h>
48#include <assert.h>
49#include <fcntl.h>
50#include <stdio.h>
51#include <errno.h>
52#include <time.h>
53#include <sched.h>
54#include <pthread.h>
55
56#include <sys/syscall.h>
57#include <sys/ioctl.h>
58#include <sys/poll.h>
59#include <sys/prctl.h>
60#include <sys/wait.h>
61#include <sys/uio.h>
62#include <sys/mman.h>
63
64#include <linux/unistd.h>
65#include <linux/types.h>
66
67#include "../../include/linux/perf_counter.h"
68
69#include "perf.h"
70
71static int system_wide = 0;
72
73static int nr_counters = 0;
74static __u64 event_id[MAX_COUNTERS] = {
75 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
76 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),
77 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
78 EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),
79
80 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),
81 EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),
82 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),
83 EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),
84};
85static int default_interval = 100000;
86static int event_count[MAX_COUNTERS];
87static int fd[MAX_NR_CPUS][MAX_COUNTERS];
88
89static __u64 count_filter = 100;
90
91static int tid = -1;
92static int profile_cpu = -1;
93static int nr_cpus = 0;
94static int nmi = 1;
95static unsigned int realtime_prio = 0;
96static int group = 0;
97static unsigned int page_size;
98static unsigned int mmap_pages = 16;
99static int use_mmap = 0;
100static int use_munmap = 0;
101
102static char *vmlinux;
103
104static char *sym_filter;
105static unsigned long filter_start;
106static unsigned long filter_end;
107
108static int delay_secs = 2;
109static int zero;
110static int dump_symtab;
111
112static int scale;
113
114struct source_line {
115 uint64_t EIP;
116 unsigned long count;
117 char *line;
118 struct source_line *next;
119};
120
121static struct source_line *lines;
122static struct source_line **lines_tail;
123
124static const unsigned int default_count[] = {
125 1000000,
126 1000000,
127 10000,
128 10000,
129 1000000,
130 10000,
131};
132
133static char *hw_event_names[] = {
134 "CPU cycles",
135 "instructions",
136 "cache references",
137 "cache misses",
138 "branches",
139 "branch misses",
140 "bus cycles",
141};
142
143static char *sw_event_names[] = {
144 "cpu clock ticks",
145 "task clock ticks",
146 "pagefaults",
147 "context switches",
148 "CPU migrations",
149 "minor faults",
150 "major faults",
151};
152
153struct event_symbol {
154 __u64 event;
155 char *symbol;
156};
157
158static struct event_symbol event_symbols[] = {
159 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cpu-cycles", },
160 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cycles", },
161 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), "instructions", },
162 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), "cache-references", },
163 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), "cache-misses", },
164 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branch-instructions", },
165 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branches", },
166 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES), "branch-misses", },
167 {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES), "bus-cycles", },
168
169 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK), "cpu-clock", },
170 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), "task-clock", },
171 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "page-faults", },
172 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "faults", },
173 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN), "minor-faults", },
174 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ), "major-faults", },
175 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "context-switches", },
176 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "cs", },
177 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "cpu-migrations", },
178 {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "migrations", },
179};
180
181#define __PERF_COUNTER_FIELD(config, name) \
182 ((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
183
184#define PERF_COUNTER_RAW(config) __PERF_COUNTER_FIELD(config, RAW)
185#define PERF_COUNTER_CONFIG(config) __PERF_COUNTER_FIELD(config, CONFIG)
186#define PERF_COUNTER_TYPE(config) __PERF_COUNTER_FIELD(config, TYPE)
187#define PERF_COUNTER_ID(config) __PERF_COUNTER_FIELD(config, EVENT)
188
189static void display_events_help(void)
190{
191 unsigned int i;
192 __u64 e;
193
194 printf(
195 " -e EVENT --event=EVENT # symbolic-name abbreviations");
196
197 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
198 int type, id;
199
200 e = event_symbols[i].event;
201 type = PERF_COUNTER_TYPE(e);
202 id = PERF_COUNTER_ID(e);
203
204 printf("\n %d:%d: %-20s",
205 type, id, event_symbols[i].symbol);
206 }
207
208 printf("\n"
209 " rNNN: raw PMU events (eventsel+umask)\n\n");
210}
211
212static void display_help(void)
213{
214 printf(
215 "Usage: kerneltop [<options>]\n"
216 " Or: kerneltop -S [<options>] COMMAND [ARGS]\n\n"
217 "KernelTop Options (up to %d event types can be specified at once):\n\n",
218 MAX_COUNTERS);
219
220 display_events_help();
221
222 printf(
223 " -c CNT --count=CNT # event period to sample\n\n"
224 " -C CPU --cpu=CPU # CPU (-1 for all) [default: -1]\n"
225 " -p PID --pid=PID # PID of sampled task (-1 for all) [default: -1]\n\n"
226 " -l # show scale factor for RR events\n"
227 " -d delay --delay=<seconds> # sampling/display delay [default: 2]\n"
228 " -f CNT --filter=CNT # min-event-count filter [default: 100]\n\n"
229 " -r prio --realtime=<prio> # event acquisition runs with SCHED_FIFO policy\n"
230 " -s symbol --symbol=<symbol> # function to be shown annotated one-shot\n"
231 " -x path --vmlinux=<path> # the vmlinux binary, required for -s use\n"
232 " -z --zero # zero counts after display\n"
233 " -D --dump_symtab # dump symbol table to stderr on startup\n"
234 " -m pages --mmap_pages=<pages> # number of mmap data pages\n"
235 " -M --mmap_info # print mmap info stream\n"
236 " -U --munmap_info # print munmap info stream\n"
237 );
238
239 exit(0);
240}
241
242static char *event_name(int ctr)
243{
244 __u64 config = event_id[ctr];
245 int type = PERF_COUNTER_TYPE(config);
246 int id = PERF_COUNTER_ID(config);
247 static char buf[32];
248
249 if (PERF_COUNTER_RAW(config)) {
250 sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config));
251 return buf;
252 }
253
254 switch (type) {
255 case PERF_TYPE_HARDWARE:
256 if (id < PERF_HW_EVENTS_MAX)
257 return hw_event_names[id];
258 return "unknown-hardware";
259
260 case PERF_TYPE_SOFTWARE:
261 if (id < PERF_SW_EVENTS_MAX)
262 return sw_event_names[id];
263 return "unknown-software";
264
265 default:
266 break;
267 }
268
269 return "unknown";
270}
271
272/*
273 * Each event can have multiple symbolic names.
274 * Symbolic names are (almost) exactly matched.
275 */
276static __u64 match_event_symbols(char *str)
277{
278 __u64 config, id;
279 int type;
280 unsigned int i;
281
282 if (sscanf(str, "r%llx", &config) == 1)
283 return config | PERF_COUNTER_RAW_MASK;
284
285 if (sscanf(str, "%d:%llu", &type, &id) == 2)
286 return EID(type, id);
287
288 for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
289 if (!strncmp(str, event_symbols[i].symbol,
290 strlen(event_symbols[i].symbol)))
291 return event_symbols[i].event;
292 }
293
294 return ~0ULL;
295}
296
297static int parse_events(char *str)
298{
299 __u64 config;
300
301again:
302 if (nr_counters == MAX_COUNTERS)
303 return -1;
304
305 config = match_event_symbols(str);
306 if (config == ~0ULL)
307 return -1;
308
309 event_id[nr_counters] = config;
310 nr_counters++;
311
312 str = strstr(str, ",");
313 if (str) {
314 str++;
315 goto again;
316 }
317
318 return 0;
319}
320
321/*
322 * Symbols
323 */
324
325static uint64_t min_ip;
326static uint64_t max_ip = -1ll;
327
328struct sym_entry {
329 unsigned long long addr;
330 char *sym;
331 unsigned long count[MAX_COUNTERS];
332 int skip;
333 struct source_line *source;
334};
335
336#define MAX_SYMS 100000
337
338static int sym_table_count;
339
340struct sym_entry *sym_filter_entry;
341
342static struct sym_entry sym_table[MAX_SYMS];
343
344static void show_details(struct sym_entry *sym);
345
346/*
347 * Ordering weight: count-1 * count-2 * ... / count-n
348 */
349static double sym_weight(const struct sym_entry *sym)
350{
351 double weight;
352 int counter;
353
354 weight = sym->count[0];
355
356 for (counter = 1; counter < nr_counters-1; counter++)
357 weight *= sym->count[counter];
358
359 weight /= (sym->count[counter] + 1);
360
361 return weight;
362}
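For instance (illustrative counts only), with three counters and per-symbol counts of 40, 10 and 5, the weight is 40 * 10 / (5 + 1) ≈ 66.7, so symbols scoring high on the leading events and low on the last one sort toward the top.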
363
364static int compare(const void *__sym1, const void *__sym2)
365{
366 const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
367
368 return sym_weight(sym1) < sym_weight(sym2);
369}
370
371static long events;
372static long userspace_events;
373static const char CONSOLE_CLEAR[] = "\033[H\033[2J";
374
375static struct sym_entry tmp[MAX_SYMS];
376
377static void print_sym_table(void)
378{
379 int i, printed;
380 int counter;
381 float events_per_sec = events/delay_secs;
382 float kevents_per_sec = (events-userspace_events)/delay_secs;
383 float sum_kevents = 0.0;
384
385 events = userspace_events = 0;
386 memcpy(tmp, sym_table, sizeof(sym_table[0])*sym_table_count);
387 qsort(tmp, sym_table_count, sizeof(tmp[0]), compare);
388
389 for (i = 0; i < sym_table_count && tmp[i].count[0]; i++)
390 sum_kevents += tmp[i].count[0];
391
392 write(1, CONSOLE_CLEAR, strlen(CONSOLE_CLEAR));
393
394 printf(
395"------------------------------------------------------------------------------\n");
396 printf( " KernelTop:%8.0f irqs/sec kernel:%4.1f%% [%s, ",
397 events_per_sec,
398 100.0 - (100.0*((events_per_sec-kevents_per_sec)/events_per_sec)),
399 nmi ? "NMI" : "IRQ");
400
401 if (nr_counters == 1)
402 printf("%d ", event_count[0]);
403
404 for (counter = 0; counter < nr_counters; counter++) {
405 if (counter)
406 printf("/");
407
408 printf("%s", event_name(counter));
409 }
410
411 printf( "], ");
412
413 if (tid != -1)
414 printf(" (tid: %d", tid);
415 else
416 printf(" (all");
417
418 if (profile_cpu != -1)
419 printf(", cpu: %d)\n", profile_cpu);
420 else {
421 if (tid != -1)
422 printf(")\n");
423 else
424 printf(", %d CPUs)\n", nr_cpus);
425 }
426
427 printf("------------------------------------------------------------------------------\n\n");
428
429 if (nr_counters == 1)
430 printf(" events pcnt");
431 else
432 printf(" weight events pcnt");
433
434 printf(" RIP kernel function\n"
435 " ______ ______ _____ ________________ _______________\n\n"
436 );
437
438 for (i = 0, printed = 0; i < sym_table_count; i++) {
439 float pcnt;
440 int count;
441
442 if (printed <= 18 && tmp[i].count[0] >= count_filter) {
443 pcnt = 100.0 - (100.0*((sum_kevents-tmp[i].count[0])/sum_kevents));
444
445 if (nr_counters == 1)
446 printf("%19.2f - %4.1f%% - %016llx : %s\n",
447 sym_weight(tmp + i),
448 pcnt, tmp[i].addr, tmp[i].sym);
449 else
450 printf("%8.1f %10ld - %4.1f%% - %016llx : %s\n",
451 sym_weight(tmp + i),
452 tmp[i].count[0],
453 pcnt, tmp[i].addr, tmp[i].sym);
454 printed++;
455 }
456 /*
457 * Add decay to the counts:
458 */
459 for (count = 0; count < nr_counters; count++)
460 sym_table[i].count[count] = zero ? 0 : sym_table[i].count[count] * 7 / 8;
461 }
462
463 if (sym_filter_entry)
464 show_details(sym_filter_entry);
465
466 {
467 struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
468
469 if (poll(&stdin_poll, 1, 0) == 1) {
470 printf("key pressed - exiting.\n");
471 exit(0);
472 }
473 }
474}
475
476static void *display_thread(void *arg)
477{
478 printf("KernelTop refresh period: %d seconds\n", delay_secs);
479
480 while (!sleep(delay_secs))
481 print_sym_table();
482
483 return NULL;
484}
485
486static int read_symbol(FILE *in, struct sym_entry *s)
487{
488 static int filter_match = 0;
489 char *sym, stype;
490 char str[500];
491 int rc, pos;
492
493 rc = fscanf(in, "%llx %c %499s", &s->addr, &stype, str);
494 if (rc == EOF)
495 return -1;
496
497 assert(rc == 3);
498
499 /* skip until end of line: */
500 pos = strlen(str);
501 do {
502 rc = fgetc(in);
503 if (rc == '\n' || rc == EOF || pos >= 499)
504 break;
505 str[pos] = rc;
506 pos++;
507 } while (1);
508 str[pos] = 0;
509
510 sym = str;
511
512 /* Filter out known duplicates and non-text symbols. */
513 if (!strcmp(sym, "_text"))
514 return 1;
515 if (!min_ip && !strcmp(sym, "_stext"))
516 return 1;
517 if (!strcmp(sym, "_etext") || !strcmp(sym, "_sinittext"))
518 return 1;
519 if (stype != 'T' && stype != 't')
520 return 1;
521 if (!strncmp("init_module", sym, 11) || !strncmp("cleanup_module", sym, 14))
522 return 1;
523 if (strstr(sym, "_text_start") || strstr(sym, "_text_end"))
524 return 1;
525
526 s->sym = malloc(strlen(str) + 1);
527 assert(s->sym);
528
529 strcpy((char *)s->sym, str);
530 s->skip = 0;
531
532 /* Tag events to be skipped. */
533 if (!strcmp("default_idle", s->sym) || !strcmp("cpu_idle", s->sym))
534 s->skip = 1;
535 else if (!strcmp("enter_idle", s->sym) || !strcmp("exit_idle", s->sym))
536 s->skip = 1;
537 else if (!strcmp("mwait_idle", s->sym))
538 s->skip = 1;
539
540 if (filter_match == 1) {
541 filter_end = s->addr;
542 filter_match = -1;
543 if (filter_end - filter_start > 10000) {
544 printf("hm, too large filter symbol <%s> - skipping.\n",
545 sym_filter);
546 printf("symbol filter start: %016lx\n", filter_start);
547 printf(" end: %016lx\n", filter_end);
548 filter_end = filter_start = 0;
549 sym_filter = NULL;
550 sleep(1);
551 }
552 }
553 if (filter_match == 0 && sym_filter && !strcmp(s->sym, sym_filter)) {
554 filter_match = 1;
555 filter_start = s->addr;
556 }
557
558 return 0;
559}
560
561static int compare_addr(const void *__sym1, const void *__sym2)
562{
563 const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
564
565 return sym1->addr > sym2->addr;
566}
567
568static void sort_symbol_table(void)
569{
570 int i, dups;
571
572 do {
573 qsort(sym_table, sym_table_count, sizeof(sym_table[0]), compare_addr);
574 for (i = 0, dups = 0; i + 1 < sym_table_count; i++) {
575 if (sym_table[i].addr == sym_table[i+1].addr) {
576 sym_table[i+1].addr = -1ll;
577 dups++;
578 }
579 }
580 sym_table_count -= dups;
581 } while(dups);
582}
583
584static void parse_symbols(void)
585{
586 struct sym_entry *last;
587
588 FILE *kallsyms = fopen("/proc/kallsyms", "r");
589
590 if (!kallsyms) {
591 printf("Could not open /proc/kallsyms - no CONFIG_KALLSYMS_ALL=y?\n");
592 exit(-1);
593 }
594
595 while (!feof(kallsyms)) {
596 if (read_symbol(kallsyms, &sym_table[sym_table_count]) == 0) {
597 sym_table_count++;
598 assert(sym_table_count <= MAX_SYMS);
599 }
600 }
601
602 sort_symbol_table();
603 min_ip = sym_table[0].addr;
604 max_ip = sym_table[sym_table_count-1].addr;
605 last = sym_table + sym_table_count++;
606
607 last->addr = -1ll;
608 last->sym = "<end>";
609
610 if (filter_end) {
611 int count;
612 for (count=0; count < sym_table_count; count ++) {
613 if (!strcmp(sym_table[count].sym, sym_filter)) {
614 sym_filter_entry = &sym_table[count];
615 break;
616 }
617 }
618 }
619 if (dump_symtab) {
620 int i;
621
622 for (i = 0; i < sym_table_count; i++)
623 fprintf(stderr, "%llx %s\n",
624 sym_table[i].addr, sym_table[i].sym);
625 }
626}
627
628/*
629 * Source lines
630 */
631
632static void parse_vmlinux(char *filename)
633{
634 FILE *file;
635 char command[PATH_MAX*2];
636 if (!filename)
637 return;
638
639 sprintf(command, "objdump --start-address=0x%016lx --stop-address=0x%016lx -dS %s", filter_start, filter_end, filename);
640
641 file = popen(command, "r");
642 if (!file)
643 return;
644
645 lines_tail = &lines;
646 while (!feof(file)) {
647 struct source_line *src;
648 size_t dummy = 0;
649 char *c;
650
651 src = malloc(sizeof(struct source_line));
652 assert(src != NULL);
653 memset(src, 0, sizeof(struct source_line));
654
655 if (getline(&src->line, &dummy, file) < 0)
656 break;
657 if (!src->line)
658 break;
659
660 c = strchr(src->line, '\n');
661 if (c)
662 *c = 0;
663
664 src->next = NULL;
665 *lines_tail = src;
666 lines_tail = &src->next;
667
668 if (strlen(src->line)>8 && src->line[8] == ':')
669 src->EIP = strtoull(src->line, NULL, 16);
670 if (strlen(src->line)>16 && src->line[16] == ':')
671 src->EIP = strtoull(src->line, NULL, 16);
672 }
673 pclose(file);
674}
675
676static void record_precise_ip(uint64_t ip)
677{
678 struct source_line *line;
679
680 for (line = lines; line; line = line->next) {
681 if (line->EIP == ip)
682 line->count++;
683 if (line->EIP > ip)
684 break;
685 }
686}
687
688static void lookup_sym_in_vmlinux(struct sym_entry *sym)
689{
690 struct source_line *line;
691 char pattern[PATH_MAX];
692 sprintf(pattern, "<%s>:", sym->sym);
693
694 for (line = lines; line; line = line->next) {
695 if (strstr(line->line, pattern)) {
696 sym->source = line;
697 break;
698 }
699 }
700}
701
702static void show_lines(struct source_line *line_queue, int line_queue_count)
703{
704 int i;
705 struct source_line *line;
706
707 line = line_queue;
708 for (i = 0; i < line_queue_count; i++) {
709 printf("%8li\t%s\n", line->count, line->line);
710 line = line->next;
711 }
712}
713
714#define TRACE_COUNT 3
715
716static void show_details(struct sym_entry *sym)
717{
718 struct source_line *line;
719 struct source_line *line_queue = NULL;
720 int displayed = 0;
721 int line_queue_count = 0;
722
723 if (!sym->source)
724 lookup_sym_in_vmlinux(sym);
725 if (!sym->source)
726 return;
727
728 printf("Showing details for %s\n", sym->sym);
729
730 line = sym->source;
731 while (line) {
732 if (displayed && strstr(line->line, ">:"))
733 break;
734
735 if (!line_queue_count)
736 line_queue = line;
737 line_queue_count ++;
738
739 if (line->count >= count_filter) {
740 show_lines(line_queue, line_queue_count);
741 line_queue_count = 0;
742 line_queue = NULL;
743 } else if (line_queue_count > TRACE_COUNT) {
744 line_queue = line_queue->next;
745 line_queue_count --;
746 }
747
748 line->count = 0;
749 displayed++;
750 if (displayed > 300)
751 break;
752 line = line->next;
753 }
754}
755
756/*
757 * Binary search in the histogram table and record the hit:
758 */
759static void record_ip(uint64_t ip, int counter)
760{
761 int left_idx, middle_idx, right_idx, idx;
762 unsigned long left, middle, right;
763
764 record_precise_ip(ip);
765
766 left_idx = 0;
767 right_idx = sym_table_count-1;
768 assert(ip <= max_ip && ip >= min_ip);
769
770 while (left_idx + 1 < right_idx) {
771 middle_idx = (left_idx + right_idx) / 2;
772
773 left = sym_table[ left_idx].addr;
774 middle = sym_table[middle_idx].addr;
775 right = sym_table[ right_idx].addr;
776
777 if (!(left <= middle && middle <= right)) {
778 printf("%016lx...\n%016lx...\n%016lx\n", left, middle, right);
779 printf("%d %d %d\n", left_idx, middle_idx, right_idx);
780 }
781 assert(left <= middle && middle <= right);
782 if (!(left <= ip && ip <= right)) {
783 printf(" left: %016lx\n", left);
784 printf(" ip: %016lx\n", (unsigned long)ip);
785 printf("right: %016lx\n", right);
786 }
787 assert(left <= ip && ip <= right);
788 /*
789 * [ left .... target .... middle .... right ]
790 * => right := middle
791 */
792 if (ip < middle) {
793 right_idx = middle_idx;
794 continue;
795 }
796 /*
797 * [ left .... middle ... target ... right ]
798 * => left := middle
799 */
800 left_idx = middle_idx;
801 }
802
803 idx = left_idx;
804
805 if (!sym_table[idx].skip)
806 sym_table[idx].count[counter]++;
807 else events--;
808}
809
810static void process_event(uint64_t ip, int counter)
811{
812 events++;
813
814 if (ip < min_ip || ip > max_ip) {
815 userspace_events++;
816 return;
817 }
818
819 record_ip(ip, counter);
820}
821
822static void process_options(int argc, char **argv)
823{
824 int error = 0, counter;
825
826 for (;;) {
827 int option_index = 0;
828 /** Options for getopt */
829 static struct option long_options[] = {
830 {"count", required_argument, NULL, 'c'},
831 {"cpu", required_argument, NULL, 'C'},
832 {"delay", required_argument, NULL, 'd'},
833 {"dump_symtab", no_argument, NULL, 'D'},
834 {"event", required_argument, NULL, 'e'},
835 {"filter", required_argument, NULL, 'f'},
836 {"group", required_argument, NULL, 'g'},
837 {"help", no_argument, NULL, 'h'},
838 {"nmi", required_argument, NULL, 'n'},
839 {"mmap_info", no_argument, NULL, 'M'},
840 {"mmap_pages", required_argument, NULL, 'm'},
841 {"munmap_info", no_argument, NULL, 'U'},
842 {"pid", required_argument, NULL, 'p'},
843 {"realtime", required_argument, NULL, 'r'},
844 {"scale", no_argument, NULL, 'l'},
845 {"symbol", required_argument, NULL, 's'},
846 {"stat", no_argument, NULL, 'S'},
847 {"vmlinux", required_argument, NULL, 'x'},
848 {"zero", no_argument, NULL, 'z'},
849 {NULL, 0, NULL, 0 }
850 };
851 int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:r:s:Sx:zMU",
852 long_options, &option_index);
853 if (c == -1)
854 break;
855
856 switch (c) {
857 case 'a': system_wide = 1; break;
858 case 'c': default_interval = atoi(optarg); break;
859 case 'C':
860 /* CPU and PID are mutually exclusive */
861 if (tid != -1) {
862 printf("WARNING: CPU switch overriding PID\n");
863 sleep(1);
864 tid = -1;
865 }
866 profile_cpu = atoi(optarg); break;
867 case 'd': delay_secs = atoi(optarg); break;
868 case 'D': dump_symtab = 1; break;
869
870 case 'e': error = parse_events(optarg); break;
871
872 case 'f': count_filter = atoi(optarg); break;
873 case 'g': group = atoi(optarg); break;
874 case 'h': display_help(); break;
875 case 'l': scale = 1; break;
876 case 'n': nmi = atoi(optarg); break;
877 case 'p':
878 /* CPU and PID are mutually exclusive */
879 if (profile_cpu != -1) {
880 printf("WARNING: PID switch overriding CPU\n");
881 sleep(1);
882 profile_cpu = -1;
883 }
884 tid = atoi(optarg); break;
885 case 'r': realtime_prio = atoi(optarg); break;
886 case 's': sym_filter = strdup(optarg); break;
887 case 'x': vmlinux = strdup(optarg); break;
888 case 'z': zero = 1; break;
889 case 'm': mmap_pages = atoi(optarg); break;
890 case 'M': use_mmap = 1; break;
891 case 'U': use_munmap = 1; break;
892 default: error = 1; break;
893 }
894 }
895 if (error)
896 display_help();
897
898 if (!nr_counters) {
899 nr_counters = 1;
900 event_id[0] = 0;
901 }
902
903 for (counter = 0; counter < nr_counters; counter++) {
904 if (event_count[counter])
905 continue;
906
907 event_count[counter] = default_interval;
908 }
909}
910
911struct mmap_data {
912 int counter;
913 void *base;
914 unsigned int mask;
915 unsigned int prev;
916};
917
918static unsigned int mmap_read_head(struct mmap_data *md)
919{
920 struct perf_counter_mmap_page *pc = md->base;
921 int head;
922
923 head = pc->data_head;
924 rmb();
925
926 return head;
927}
928
929struct timeval last_read, this_read;
930
931static void mmap_read(struct mmap_data *md)
932{
933 unsigned int head = mmap_read_head(md);
934 unsigned int old = md->prev;
935 unsigned char *data = md->base + page_size;
936 int diff;
937
938 gettimeofday(&this_read, NULL);
939
940 /*
941 * If we're further behind than half the buffer, there's a chance
942 * the writer will bite our tail and screw up the events under us.
943 *
944 * If we somehow ended up ahead of the head, we got messed up.
945 *
946 * In either case, truncate and restart at head.
947 */
948 diff = head - old;
949 if (diff > md->mask / 2 || diff < 0) {
950 struct timeval iv;
951 unsigned long msecs;
952
953 timersub(&this_read, &last_read, &iv);
954 msecs = iv.tv_sec*1000 + iv.tv_usec/1000;
955
956 fprintf(stderr, "WARNING: failed to keep up with mmap data."
957 " Last read %lu msecs ago.\n", msecs);
958
959 /*
960 * head points to a known good entry, start there.
961 */
962 old = head;
963 }
964
965 last_read = this_read;
966
967 for (; old != head;) {
968 struct ip_event {
969 struct perf_event_header header;
970 __u64 ip;
971 __u32 pid, tid;
972 };
973 struct mmap_event {
974 struct perf_event_header header;
975 __u32 pid, tid;
976 __u64 start;
977 __u64 len;
978 __u64 pgoff;
979 char filename[PATH_MAX];
980 };
981
982 typedef union event_union {
983 struct perf_event_header header;
984 struct ip_event ip;
985 struct mmap_event mmap;
986 } event_t;
987
988 event_t *event = (event_t *)&data[old & md->mask];
989
990 event_t event_copy;
991
992 size_t size = event->header.size;
993
994 /*
995 * Event straddles the mmap boundary -- header should always
996 * be inside due to u64 alignment of output.
997 */
998 if ((old & md->mask) + size != ((old + size) & md->mask)) {
999 unsigned int offset = old;
1000 unsigned int len = min(sizeof(*event), size), cpy;
1001 void *dst = &event_copy;
1002
1003 do {
1004 cpy = min(md->mask + 1 - (offset & md->mask), len);
1005 memcpy(dst, &data[offset & md->mask], cpy);
1006 offset += cpy;
1007 dst += cpy;
1008 len -= cpy;
1009 } while (len);
1010
1011 event = &event_copy;
1012 }
1013
1014 old += size;
1015
1016 if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) {
1017 if (event->header.type & PERF_RECORD_IP)
1018 process_event(event->ip.ip, md->counter);
1019 } else {
1020 switch (event->header.type) {
1021 case PERF_EVENT_MMAP:
1022 case PERF_EVENT_MUNMAP:
1023 printf("%s: %Lu %Lu %Lu %s\n",
1024 event->header.type == PERF_EVENT_MMAP
1025 ? "mmap" : "munmap",
1026 event->mmap.start,
1027 event->mmap.len,
1028 event->mmap.pgoff,
1029 event->mmap.filename);
1030 break;
1031 }
1032 }
1033 }
1034
1035 md->prev = old;
1036}
1037
1038int cmd_top(int argc, char **argv, const char *prefix)
1039{
1040 struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
1041 struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
1042 struct perf_counter_hw_event hw_event;
1043 pthread_t thread;
1044 int i, counter, group_fd, nr_poll = 0;
1045 unsigned int cpu;
1046 int ret;
1047
1048 page_size = sysconf(_SC_PAGE_SIZE);
1049
1050 process_options(argc, argv);
1051
1052 nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
1053 assert(nr_cpus <= MAX_NR_CPUS);
1054 assert(nr_cpus >= 0);
1055
1056 if (tid != -1 || profile_cpu != -1)
1057 nr_cpus = 1;
1058
1059 parse_symbols();
1060 if (vmlinux && sym_filter_entry)
1061 parse_vmlinux(vmlinux);
1062
1063 for (i = 0; i < nr_cpus; i++) {
1064 group_fd = -1;
1065 for (counter = 0; counter < nr_counters; counter++) {
1066
1067 cpu = profile_cpu;
1068 if (tid == -1 && profile_cpu == -1)
1069 cpu = i;
1070
1071 memset(&hw_event, 0, sizeof(hw_event));
1072 hw_event.config = event_id[counter];
1073 hw_event.irq_period = event_count[counter];
1074 hw_event.record_type = PERF_RECORD_IP | PERF_RECORD_TID;
1075 hw_event.nmi = nmi;
1076 hw_event.mmap = use_mmap;
1077 hw_event.munmap = use_munmap;
1078
1079 fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0);
1080 if (fd[i][counter] < 0) {
1081 int err = errno;
1082 printf("kerneltop error: syscall returned with %d (%s)\n",
1083 fd[i][counter], strerror(err));
1084 if (err == EPERM)
1085 printf("Are you root?\n");
1086 exit(-1);
1087 }
1088 assert(fd[i][counter] >= 0);
1089 fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
1090
1091 /*
1092 * First counter acts as the group leader:
1093 */
1094 if (group && group_fd == -1)
1095 group_fd = fd[i][counter];
1096
1097 event_array[nr_poll].fd = fd[i][counter];
1098 event_array[nr_poll].events = POLLIN;
1099 nr_poll++;
1100
1101 mmap_array[i][counter].counter = counter;
1102 mmap_array[i][counter].prev = 0;
1103 mmap_array[i][counter].mask = mmap_pages*page_size - 1;
1104 mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
1105 PROT_READ, MAP_SHARED, fd[i][counter], 0);
1106 if (mmap_array[i][counter].base == MAP_FAILED) {
1107 printf("kerneltop error: failed to mmap with %d (%s)\n",
1108 errno, strerror(errno));
1109 exit(-1);
1110 }
1111 }
1112 }
1113
1114 if (pthread_create(&thread, NULL, display_thread, NULL)) {
1115 printf("Could not create display thread.\n");
1116 exit(-1);
1117 }
1118
1119 if (realtime_prio) {
1120 struct sched_param param;
1121
1122 param.sched_priority = realtime_prio;
1123 if (sched_setscheduler(0, SCHED_FIFO, &param)) {
1124 printf("Could not set realtime priority.\n");
1125 exit(-1);
1126 }
1127 }
1128
1129 while (1) {
1130 int hits = events;
1131
1132 for (i = 0; i < nr_cpus; i++) {
1133 for (counter = 0; counter < nr_counters; counter++)
1134 mmap_read(&mmap_array[i][counter]);
1135 }
1136
1137 if (hits == events)
1138 ret = poll(event_array, nr_poll, 100);
1139 }
1140
1141 return 0;
1142}
diff --git a/Documentation/perf_counter/builtin.h b/Documentation/perf_counter/builtin.h
new file mode 100644
index 000000000000..d32318aed8cf
--- /dev/null
+++ b/Documentation/perf_counter/builtin.h
@@ -0,0 +1,22 @@
1#ifndef BUILTIN_H
2#define BUILTIN_H
3
4#include "util/util.h"
5#include "util/strbuf.h"
6
7extern const char perf_version_string[];
8extern const char perf_usage_string[];
9extern const char perf_more_info_string[];
10
11extern void list_common_cmds_help(void);
12extern const char *help_unknown_cmd(const char *cmd);
13extern void prune_packed_objects(int);
14extern int read_line_with_nul(char *buf, int size, FILE *file);
15extern int check_pager_config(const char *cmd);
16
17extern int cmd_help(int argc, const char **argv, const char *prefix);
18extern int cmd_record(int argc, const char **argv, const char *prefix);
19extern int cmd_stat(int argc, const char **argv, const char *prefix);
20extern int cmd_top(int argc, const char **argv, const char *prefix);
21extern int cmd_version(int argc, const char **argv, const char *prefix);
22#endif
diff --git a/Documentation/perf_counter/command-list.txt b/Documentation/perf_counter/command-list.txt
new file mode 100644
index 000000000000..d15210aa0cae
--- /dev/null
+++ b/Documentation/perf_counter/command-list.txt
@@ -0,0 +1,6 @@
1# List of known perf commands.
2# command name category [deprecated] [common]
3perf-record mainporcelain common
4perf-stat mainporcelain common
5perf-top mainporcelain common
6
diff --git a/Documentation/perf_counter/design.txt b/Documentation/perf_counter/design.txt
new file mode 100644
index 000000000000..9930c4bddc6f
--- /dev/null
+++ b/Documentation/perf_counter/design.txt
@@ -0,0 +1,449 @@
1
2Performance Counters for Linux
3------------------------------
4
5Performance counters are special hardware registers available on most modern
6CPUs. These registers count the number of certain types of hw events, such
7as instructions executed, cache misses suffered, or branches mis-predicted -
8without slowing down the kernel or applications. These registers can also
9trigger interrupts when a threshold number of events has been reached - and can
10thus be used to profile the code that runs on that CPU.
11
12The Linux Performance Counter subsystem provides an abstraction of these
13hardware capabilities. It provides per task and per CPU counters, counter
14groups, and event capabilities on top of those. It
15provides "virtual" 64-bit counters, regardless of the width of the
16underlying hardware counters.
17
18Performance counters are accessed via special file descriptors.
19There's one file descriptor per virtual counter used.
20
21The special file descriptor is opened via the perf_counter_open()
22system call:
23
24 int sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr,
25 pid_t pid, int cpu, int group_fd,
26 unsigned long flags);
27
28The syscall returns the new fd. The fd can be used via the normal
29VFS system calls: read() can be used to read the counter, fcntl()
30can be used to set the blocking mode, etc.
31
32Multiple counters can be kept open at a time, and the counters
33can be poll()ed.
34
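For example, an illustrative sketch of counting instructions executed by the
current task on any CPU (headers and error handling omitted;
sys_perf_counter_open() is the wrapper from perf.h, and the event ids are
described further below):

	struct perf_counter_hw_event hw_event;
	__u64 count;
	int fd;

	memset(&hw_event, 0, sizeof(hw_event));
	hw_event.config = PERF_COUNT_INSTRUCTIONS;	/* PERF_TYPE_HARDWARE, see below */

	/* pid == 0: this task, cpu == -1: any CPU, no group, no flags */
	fd = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);

	/* ... run the code to be measured ... */

	read(fd, &count, sizeof(count));	/* current counter value */
	close(fd);
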
35When creating a new counter fd, 'perf_counter_hw_event' is:
36
37struct perf_counter_hw_event {
38 /*
39 * The MSB of the config word signifies if the rest contains cpu
40 * specific (raw) counter configuration data; if unset, the next
41 * 7 bits are an event type and the rest of the bits are the event
42 * identifier.
43 */
44 __u64 config;
45
46 __u64 irq_period;
47 __u32 record_type;
48 __u32 read_format;
49
50 __u64 disabled : 1, /* off by default */
51 nmi : 1, /* NMI sampling */
52 inherit : 1, /* children inherit it */
53 pinned : 1, /* must always be on PMU */
54 exclusive : 1, /* only group on PMU */
55 exclude_user : 1, /* don't count user */
56 exclude_kernel : 1, /* ditto kernel */
57 exclude_hv : 1, /* ditto hypervisor */
58 exclude_idle : 1, /* don't count when idle */
59 mmap : 1, /* include mmap data */
60 munmap : 1, /* include munmap data */
61 comm : 1, /* include comm data */
62
63 __reserved_1 : 52;
64
65 __u32 extra_config_len;
66 __u32 wakeup_events; /* wakeup every n events */
67
68 __u64 __reserved_2;
69 __u64 __reserved_3;
70};
71
72The 'config' field specifies what the counter should count. It
73is divided into 3 bit-fields:
74
75raw_type: 1 bit (most significant bit) 0x8000_0000_0000_0000
76type: 7 bits (next most significant) 0x7f00_0000_0000_0000
77event_id: 56 bits (least significant) 0x00ff_ffff_ffff_ffff
78
79If 'raw_type' is 1, then the counter will count a hardware event
80specified by the remaining 63 bits of 'config'. The encoding is
81machine-specific.
82
83If 'raw_type' is 0, then the 'type' field says what kind of counter
84this is, with the following encoding:
85
86enum perf_event_types {
87 PERF_TYPE_HARDWARE = 0,
88 PERF_TYPE_SOFTWARE = 1,
89 PERF_TYPE_TRACEPOINT = 2,
90};
91
92A counter of PERF_TYPE_HARDWARE will count the hardware event
93specified by 'event_id':
94
95/*
96 * Generalized performance counter event types, used by the hw_event.event_id
97 * parameter of the sys_perf_counter_open() syscall:
98 */
99enum hw_event_ids {
100 /*
101 * Common hardware events, generalized by the kernel:
102 */
103 PERF_COUNT_CPU_CYCLES = 0,
104 PERF_COUNT_INSTRUCTIONS = 1,
105 PERF_COUNT_CACHE_REFERENCES = 2,
106 PERF_COUNT_CACHE_MISSES = 3,
107 PERF_COUNT_BRANCH_INSTRUCTIONS = 4,
108 PERF_COUNT_BRANCH_MISSES = 5,
109 PERF_COUNT_BUS_CYCLES = 6,
110};
111
112These are standardized types of events that work relatively uniformly
113on all CPUs that implement Performance Counters support under Linux,
114although there may be variations (e.g., different CPUs might count
115cache references and misses at different levels of the cache hierarchy).
116If a CPU is not able to count the selected event, then the system call
117will return -EINVAL.
118
119More hw_event_types are supported as well, but they are CPU-specific
120and accessed as raw events. For example, to count "External bus
121cycles while bus lock signal asserted" events on Intel Core CPUs, pass
122in a 0x4064 event_id value and set the raw_type bit (the MSB of 'config').
123
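An illustrative sketch of such a raw counter setup (only the config
initialization is shown; 0x4064 is just the example value quoted above):

	struct perf_counter_hw_event hw_event;

	memset(&hw_event, 0, sizeof(hw_event));
	/* MSB set: raw event, CPU-specific event code in the low bits: */
	hw_event.config = (1ULL << 63) | 0x4064;
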
124A counter of type PERF_TYPE_SOFTWARE will count one of the available
125software events, selected by 'event_id':
126
127/*
128 * Special "software" counters provided by the kernel, even if the hardware
129 * does not support performance counters. These counters measure various
130 * physical and sw events of the kernel (and allow the profiling of them as
131 * well):
132 */
133enum sw_event_ids {
134 PERF_COUNT_CPU_CLOCK = 0,
135 PERF_COUNT_TASK_CLOCK = 1,
136 PERF_COUNT_PAGE_FAULTS = 2,
137 PERF_COUNT_CONTEXT_SWITCHES = 3,
138 PERF_COUNT_CPU_MIGRATIONS = 4,
139 PERF_COUNT_PAGE_FAULTS_MIN = 5,
140 PERF_COUNT_PAGE_FAULTS_MAJ = 6,
141};
142
143Counters of the type PERF_TYPE_TRACEPOINT are available when the ftrace event
144tracer is available, and event_id values can be obtained from
145/debug/tracing/events/*/*/id
146
147
148Counters come in two flavours: counting counters and sampling
149counters. A "counting" counter is one that is used for counting the
150number of events that occur, and is characterised by having
151irq_period = 0.
152
153
154A read() on a counter returns the current value of the counter and possible
155additional values as specified by 'read_format'; each value is a u64 (8 bytes)
156in size.
157
158/*
159 * Bits that can be set in hw_event.read_format to request that
160 * reads on the counter should return the indicated quantities,
161 * in increasing order of bit value, after the counter value.
162 */
163enum perf_counter_read_format {
164 PERF_FORMAT_TOTAL_TIME_ENABLED = 1,
165 PERF_FORMAT_TOTAL_TIME_RUNNING = 2,
166};
167
168Using these additional values one can establish the overcommit ratio for a
169particular counter, allowing one to take the round-robin scheduling effect
170into account.
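
For instance, an illustrative sketch of such a scaled read (both read_format
bits set; headers and error handling omitted):

	struct perf_counter_hw_event hw_event;
	__u64 values[3];	/* counter value, time_enabled, time_running */
	__u64 count;
	int fd;

	memset(&hw_event, 0, sizeof(hw_event));
	hw_event.config      = PERF_COUNT_CPU_CYCLES;
	hw_event.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
			       PERF_FORMAT_TOTAL_TIME_RUNNING;

	fd = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);

	/* ... run the measured workload ... */

	read(fd, values, sizeof(values));

	/* scale up for the time the counter was not on the PMU: */
	count = values[0];
	if (values[2])
		count = (__u64)((double)values[0] * values[1] / values[2]);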
171
172
173A "sampling" counter is one that is set up to generate an interrupt
174every N events, where N is given by 'irq_period'. A sampling counter
175has irq_period > 0. The record_type controls what data is recorded on each
176interrupt:
177
178/*
179 * Bits that can be set in hw_event.record_type to request information
180 * in the overflow packets.
181 */
182enum perf_counter_record_format {
183 PERF_RECORD_IP = 1U << 0,
184 PERF_RECORD_TID = 1U << 1,
185 PERF_RECORD_TIME = 1U << 2,
186 PERF_RECORD_ADDR = 1U << 3,
187 PERF_RECORD_GROUP = 1U << 4,
188 PERF_RECORD_CALLCHAIN = 1U << 5,
189};
190
191Such (and other) events will be recorded in a ring-buffer, which is
192available to user-space using mmap() (see below).
193
194The 'disabled' bit specifies whether the counter starts out disabled
195or enabled. If it is initially disabled, it can be enabled by ioctl
196or prctl (see below).
197
198The 'nmi' bit specifies, for hardware events, whether the counter
199should be set up to request non-maskable interrupts (NMIs) or normal
200interrupts. This bit is ignored if the user doesn't have
201CAP_SYS_ADMIN privilege (i.e. is not root) or if the CPU doesn't
202generate NMIs from hardware counters.
203
204The 'inherit' bit, if set, specifies that this counter should count
205events on descendant tasks as well as the task specified. This only
206applies to new descendants, not to any existing descendants at the
207time the counter is created (nor to any new descendants of existing
208descendants).
209
210The 'pinned' bit, if set, specifies that the counter should always be
211on the CPU if at all possible. It only applies to hardware counters
212and only to group leaders. If a pinned counter cannot be put onto the
213CPU (e.g. because there are not enough hardware counters or because of
214a conflict with some other event), then the counter goes into an
215'error' state, where reads return end-of-file (i.e. read() returns 0)
216until the counter is subsequently enabled or disabled.
217
218The 'exclusive' bit, if set, specifies that when this counter's group
219is on the CPU, it should be the only group using the CPU's counters.
220In future, this will allow sophisticated monitoring programs to supply
221extra configuration information via 'extra_config_len' to exploit
222advanced features of the CPU's Performance Monitor Unit (PMU) that are
223not otherwise accessible and that might disrupt other hardware
224counters.
225
226The 'exclude_user', 'exclude_kernel' and 'exclude_hv' bits provide a
227way to request that counting of events be restricted to times when the
228CPU is in user, kernel and/or hypervisor mode.
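
For instance, counting kernel-only events could be requested with an
illustrative fragment like:

	hw_event.exclude_user = 1;
	hw_event.exclude_hv   = 1;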
229
230The 'mmap' and 'munmap' bits allow recording of PROT_EXEC mmap/munmap
231operations. These can be used to relate userspace IP addresses to actual
232code, even after the mapping (or even the whole process) is gone;
233these events are recorded in the ring-buffer (see below).
234
235The 'comm' bit allows tracking of process comm data on process creation.
236This too is recorded in the ring-buffer (see below).
237
238The 'pid' parameter to the perf_counter_open() system call allows the
239counter to be specific to a task:
240
241 pid == 0: if the pid parameter is zero, the counter is attached to the
242 current task.
243
244 pid > 0: the counter is attached to a specific task (if the current task
245 has sufficient privilege to do so)
246
247 pid < 0: all tasks are counted (per cpu counters)
248
249The 'cpu' parameter allows a counter to be made specific to a CPU:
250
251 cpu >= 0: the counter is restricted to a specific CPU
252 cpu == -1: the counter counts on all CPUs
253
254(Note: the combination of 'pid == -1' and 'cpu == -1' is not valid.)
255
256A 'pid > 0' and 'cpu == -1' counter is a per task counter that counts
257events of that task and 'follows' that task to whatever CPU the task
258gets scheduled to. Per task counters can be created by any user, for
259their own tasks.
260
261A 'pid == -1' and 'cpu == x' counter is a per CPU counter that counts
262all events on CPU-x. Per CPU counters need CAP_SYS_ADMIN privilege.
263
264The 'flags' parameter is currently unused and must be zero.
265
266The 'group_fd' parameter allows counter "groups" to be set up. A
267counter group has one counter which is the group "leader". The leader
268is created first, with group_fd = -1 in the perf_counter_open call
269that creates it. The rest of the group members are created
270subsequently, with group_fd giving the fd of the group leader.
271(A single counter on its own is created with group_fd = -1 and is
272considered to be a group with only 1 member.)
273
274A counter group is scheduled onto the CPU as a unit, that is, it will
275only be put onto the CPU if all of the counters in the group can be
276put onto the CPU. This means that the values of the member counters
277can be meaningfully compared, added, divided (to get ratios), etc.,
278with each other, since they have counted events for the same set of
279executed instructions.
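
For example, an illustrative sketch of a two-counter group whose member
values can be divided to obtain instructions per cycle (error handling
omitted):

	struct perf_counter_hw_event hw_event;
	int leader_fd, member_fd;

	memset(&hw_event, 0, sizeof(hw_event));
	hw_event.config = PERF_COUNT_CPU_CYCLES;
	leader_fd = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);

	hw_event.config = PERF_COUNT_INSTRUCTIONS;
	member_fd = sys_perf_counter_open(&hw_event, 0, -1, leader_fd, 0);

	/* both counters are now scheduled on and off the PMU as one unit */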
280
281
282As stated, asynchronous events, like counter overflow or PROT_EXEC mmap
283tracking, are logged into a ring-buffer. This ring-buffer is created and
284accessed through mmap().
285
286The mmap size should be 1+2^n pages, where the first page is a meta-data page
287(struct perf_counter_mmap_page) that contains various bits of information such
288as where the ring-buffer head is.
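
An illustrative sketch of mapping such a buffer with 2^n == 8 data pages
('fd' is a counter file descriptor as above; error handling abbreviated):

	struct perf_counter_mmap_page *header;
	unsigned int data_pages = 8;		/* must be a power of two */
	long page_size = sysconf(_SC_PAGE_SIZE);
	void *base;

	base = mmap(NULL, (data_pages + 1) * page_size,
		    PROT_READ, MAP_SHARED, fd, 0);
	if (base == MAP_FAILED)
		perror("mmap");

	header = base;				/* the meta-data page */
	/* the ring-buffer data starts at base + page_size */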
289
290/*
291 * Structure of the page that can be mapped via mmap
292 */
293struct perf_counter_mmap_page {
294 __u32 version; /* version number of this structure */
295 __u32 compat_version; /* lowest version this is compat with */
296
297 /*
298 * Bits needed to read the hw counters in user-space.
299 *
300 * u32 seq;
301 * s64 count;
302 *
303 * do {
304 * seq = pc->lock;
305 *
306 * barrier()
307 * if (pc->index) {
308 * count = pmc_read(pc->index - 1);
309 * count += pc->offset;
310 * } else
311 * goto regular_read;
312 *
313 * barrier();
314 * } while (pc->lock != seq);
315 *
316 * NOTE: for obvious reasons this only works on self-monitoring
317 * processes.
318 */
319 __u32 lock; /* seqlock for synchronization */
320 __u32 index; /* hardware counter identifier */
321 __s64 offset; /* add to hardware counter value */
322
323 /*
324 * Control data for the mmap() data buffer.
325 *
326 * User-space should issue an rmb(), on SMP capable
327 * platforms, after reading this value -- see perf_counter_wakeup().
328 */
329 __u32 data_head; /* head in the data section */
330};
331
332NOTE: the hw-counter userspace bits are arch specific and are currently only
333 implemented on powerpc.
334
335The following 2^n pages are the ring-buffer which contains events of the form:
336
337#define PERF_EVENT_MISC_KERNEL (1 << 0)
338#define PERF_EVENT_MISC_USER (1 << 1)
339#define PERF_EVENT_MISC_OVERFLOW (1 << 2)
340
341struct perf_event_header {
342 __u32 type;
343 __u16 misc;
344 __u16 size;
345};
346
347enum perf_event_type {
348
349 /*
350 * The MMAP events record the PROT_EXEC mappings so that we can
351 * correlate userspace IPs to code. They have the following structure:
352 *
353 * struct {
354 * struct perf_event_header header;
355 *
356 * u32 pid, tid;
357 * u64 addr;
358 * u64 len;
359 * u64 pgoff;
360 * char filename[];
361 * };
362 */
363 PERF_EVENT_MMAP = 1,
364 PERF_EVENT_MUNMAP = 2,
365
366 /*
367 * struct {
368 * struct perf_event_header header;
369 *
370 * u32 pid, tid;
371 * char comm[];
372 * };
373 */
374 PERF_EVENT_COMM = 3,
375
376 /*
377 * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field
378 * will be PERF_RECORD_*
379 *
380 * struct {
381 * struct perf_event_header header;
382 *
383 * { u64 ip; } && PERF_RECORD_IP
384 * { u32 pid, tid; } && PERF_RECORD_TID
385 * { u64 time; } && PERF_RECORD_TIME
386 * { u64 addr; } && PERF_RECORD_ADDR
387 *
388 * { u64 nr;
389 * { u64 event, val; } cnt[nr]; } && PERF_RECORD_GROUP
390 *
391 * { u16 nr,
392 * hv,
393 * kernel,
394 * user;
395 * u64 ips[nr]; } && PERF_RECORD_CALLCHAIN
396 * };
397 */
398};
399
400NOTE: PERF_RECORD_CALLCHAIN is arch specific and currently only implemented
401 on x86.
402
403Notification of new events is possible through poll()/select()/epoll() and
404fcntl() managing signals.
405
406Normally a notification is generated for every page filled; however, one can
407additionally set perf_counter_hw_event.wakeup_events to generate one every
408so many counter overflow events.
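
An illustrative sketch of waiting for new data with poll() and consuming it,
using the rmb() definition from perf.h ('header' points at the mmap()ed
meta-data page from above):

	struct pollfd pfd = { .fd = fd, .events = POLLIN };
	unsigned int last_head = 0;

	while (poll(&pfd, 1, -1) > 0) {
		unsigned int head = header->data_head;

		rmb();		/* see the data_head note above */

		/* parse perf_event_header records in (last_head, head] here */
		last_head = head;
	}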
409
410Future work will include a splice() interface to the ring-buffer.
411
412
413Counters can be enabled and disabled in two ways: via ioctl and via
414prctl. When a counter is disabled, it doesn't count or generate
415events but does continue to exist and maintain its count value.
416
417An individual counter or counter group can be enabled with
418
419 ioctl(fd, PERF_COUNTER_IOC_ENABLE);
420
421or disabled with
422
423 ioctl(fd, PERF_COUNTER_IOC_DISABLE);
424
425Enabling or disabling the leader of a group enables or disables the
426whole group; that is, while the group leader is disabled, none of the
427counters in the group will count. Enabling or disabling a member of a
428group other than the leader only affects that counter - disabling a
429non-leader stops that counter from counting but doesn't affect any
430other counter.
431
432Additionally, non-inherited overflow counters can use
433
434 ioctl(fd, PERF_COUNTER_IOC_REFRESH, nr);
435
436to enable a counter for 'nr' events, after which it gets disabled again.
437
438A process can enable or disable all the counter groups that are
439attached to it, using prctl:
440
441 prctl(PR_TASK_PERF_COUNTERS_ENABLE);
442
443 prctl(PR_TASK_PERF_COUNTERS_DISABLE);
444
445This applies to all counters on the current process, whether created
446by this process or by another, and doesn't affect any counters that
447this process has created on other processes. It only enables or
448disables the group leaders, not any other members in the groups.
449
diff --git a/Documentation/perf_counter/perf-report.cc b/Documentation/perf_counter/perf-report.cc
new file mode 100644
index 000000000000..8855107fe6b3
--- /dev/null
+++ b/Documentation/perf_counter/perf-report.cc
@@ -0,0 +1,515 @@
1#define _GNU_SOURCE
2#include <sys/types.h>
3#include <sys/stat.h>
4#include <sys/time.h>
5#include <unistd.h>
6#include <stdint.h>
7#include <stdlib.h>
8#include <string.h>
9#include <limits.h>
10#include <fcntl.h>
11#include <stdio.h>
12#include <errno.h>
13#include <ctype.h>
14#include <time.h>
15#include <getopt.h>
16#include <assert.h>
17
18#include <sys/ioctl.h>
19#include <sys/poll.h>
20#include <sys/prctl.h>
21#include <sys/wait.h>
22#include <sys/mman.h>
23#include <sys/types.h>
24#include <sys/stat.h>
25
26#include <linux/unistd.h>
27#include <linux/types.h>
28
29#include "../../include/linux/perf_counter.h"
30
31#include <set>
32#include <map>
33#include <string>
34
35
36#define SHOW_KERNEL 1
37#define SHOW_USER 2
38#define SHOW_HV 4
39
40static char const *input_name = "output.perf";
41static int input;
42static int show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
43
44static unsigned long page_size;
45static unsigned long mmap_window = 32;
46
47struct ip_event {
48 struct perf_event_header header;
49 __u64 ip;
50 __u32 pid, tid;
51};
52struct mmap_event {
53 struct perf_event_header header;
54 __u32 pid, tid;
55 __u64 start;
56 __u64 len;
57 __u64 pgoff;
58 char filename[PATH_MAX];
59};
60struct comm_event {
61 struct perf_event_header header;
62 __u32 pid,tid;
63 char comm[16];
64};
65
66typedef union event_union {
67 struct perf_event_header header;
68 struct ip_event ip;
69 struct mmap_event mmap;
70 struct comm_event comm;
71} event_t;
72
73struct section {
74 uint64_t start;
75 uint64_t end;
76
77 uint64_t offset;
78
79 std::string name;
80
81 section() { };
82
83 section(uint64_t stab) : end(stab) { };
84
85 section(uint64_t start, uint64_t size, uint64_t offset, std::string name) :
86 start(start), end(start + size), offset(offset), name(name)
87 { };
88
89 bool operator < (const struct section &s) const {
90 return end < s.end;
91 };
92};
93
94typedef std::set<struct section> sections_t;
95
96struct symbol {
97 uint64_t start;
98 uint64_t end;
99
100 std::string name;
101
102 symbol() { };
103
104 symbol(uint64_t ip) : start(ip) { }
105
106 symbol(uint64_t start, uint64_t len, std::string name) :
107 start(start), end(start + len), name(name)
108 { };
109
110 bool operator < (const struct symbol &s) const {
111 return start < s.start;
112 };
113};
114
115typedef std::set<struct symbol> symbols_t;
116
117struct dso {
118 sections_t sections;
119 symbols_t syms;
120};
121
122static std::map<std::string, struct dso> dsos;
123
124static void load_dso_sections(std::string dso_name)
125{
126 struct dso &dso = dsos[dso_name];
127
128 std::string cmd = "readelf -DSW " + dso_name;
129
130 FILE *file = popen(cmd.c_str(), "r");
131 if (!file) {
132 perror("failed to open pipe");
133 exit(-1);
134 }
135
136 char *line = NULL;
137 size_t n = 0;
138
139 while (!feof(file)) {
140 uint64_t addr, off, size;
141 char name[32];
142
143 if (getline(&line, &n, file) < 0)
144 break;
145 if (!line)
146 break;
147
148 if (sscanf(line, " [%*2d] %16s %*14s %Lx %Lx %Lx",
149 name, &addr, &off, &size) == 4) {
150
151 dso.sections.insert(section(addr, size, addr - off, name));
152 }
153#if 0
154 /*
155 * for reading readelf symbols (-s), however these don't seem
156 * to include nearly everything, so use nm for that.
157 */
158 if (sscanf(line, " %*4d %*3d: %Lx %5Lu %*7s %*6s %*7s %3d %s",
159 &start, &size, &section, sym) == 4) {
160
161 start -= dso.section_offsets[section];
162
163 dso.syms.insert(symbol(start, size, std::string(sym)));
164 }
165#endif
166 }
167 pclose(file);
168}
169
170static void load_dso_symbols(std::string dso_name, std::string args)
171{
172 struct dso &dso = dsos[dso_name];
173
174 std::string cmd = "nm -nSC " + args + " " + dso_name;
175
176 FILE *file = popen(cmd.c_str(), "r");
177 if (!file) {
178 perror("failed to open pipe");
179 exit(-1);
180 }
181
182 char *line = NULL;
183 size_t n = 0;
184
185 while (!feof(file)) {
186 uint64_t start, size;
187 char c;
188 char sym[1024];
189
190 if (getline(&line, &n, file) < 0)
191 break;
192 if (!line)
193 break;
194
195
196 if (sscanf(line, "%Lx %Lx %c %s", &start, &size, &c, sym) == 4) {
197 sections_t::const_iterator si =
198 dso.sections.upper_bound(section(start));
199 if (si == dso.sections.end()) {
200 printf("symbol in unknown section: %s\n", sym);
201 continue;
202 }
203
204 start -= si->offset;
205
206 dso.syms.insert(symbol(start, size, sym));
207 }
208 }
209 pclose(file);
210}
211
212static void load_dso(std::string dso_name)
213{
214 load_dso_sections(dso_name);
215 load_dso_symbols(dso_name, "-D"); /* dynamic symbols */
216 load_dso_symbols(dso_name, ""); /* regular ones */
217}
218
219void load_kallsyms(void)
220{
221 struct dso &dso = dsos["[kernel]"];
222
223 FILE *file = fopen("/proc/kallsyms", "r");
224 if (!file) {
225 perror("failed to open kallsyms");
226 exit(-1);
227 }
228
229	char *line = NULL;
230	size_t n = 0;
231
232 while (!feof(file)) {
233 uint64_t start;
234 char c;
235 char sym[1024000];
236
237 if (getline(&line, &n, file) < 0)
238 break;
239 if (!line)
240 break;
241
242 if (sscanf(line, "%Lx %c %s", &start, &c, sym) == 3)
243 dso.syms.insert(symbol(start, 0x1000000, std::string(sym)));
244 }
245 fclose(file);
246}
247
248struct map {
249 uint64_t start;
250 uint64_t end;
251 uint64_t pgoff;
252
253 std::string dso;
254
255 map() { };
256
257 map(uint64_t ip) : end(ip) { }
258
259 map(mmap_event *mmap) {
260 start = mmap->start;
261 end = mmap->start + mmap->len;
262 pgoff = mmap->pgoff;
263
264 dso = std::string(mmap->filename);
265
266 if (dsos.find(dso) == dsos.end())
267 load_dso(dso);
268 };
269
270 bool operator < (const struct map &m) const {
271 return end < m.end;
272 };
273};
274
275typedef std::set<struct map> maps_t;
276
277static std::map<int, maps_t> maps;
278
279static std::map<int, std::string> comms;
280
281static std::map<std::string, int> hist;
282static std::multimap<int, std::string> rev_hist;
283
284static std::string resolve_comm(int pid)
285{
286 std::string comm;
287
288 std::map<int, std::string>::const_iterator ci = comms.find(pid);
289 if (ci != comms.end()) {
290 comm = ci->second;
291 } else {
292 char pid_str[30];
293
294 sprintf(pid_str, ":%d", pid);
295 comm = pid_str;
296 }
297
298 return comm;
299}
300
301static std::string resolve_user_symbol(int pid, uint64_t ip)
302{
303 std::string sym = "<unknown>";
304
305 maps_t &m = maps[pid];
306 maps_t::const_iterator mi = m.upper_bound(map(ip));
307 if (mi == m.end())
308 return sym;
309
310 ip -= mi->start + mi->pgoff;
311
312 symbols_t &s = dsos[mi->dso].syms;
313 symbols_t::const_iterator si = s.upper_bound(symbol(ip));
314
315 sym = mi->dso + ": <unknown>";
316
317 if (si == s.begin())
318 return sym;
319 si--;
320
321 if (si->start <= ip && ip < si->end)
322 sym = mi->dso + ": " + si->name;
323#if 0
324 else if (si->start <= ip)
325 sym = mi->dso + ": ?" + si->name;
326#endif
327
328 return sym;
329}
330
331static std::string resolve_kernel_symbol(uint64_t ip)
332{
333 std::string sym = "<unknown>";
334
335 symbols_t &s = dsos["[kernel]"].syms;
336 symbols_t::const_iterator si = s.upper_bound(symbol(ip));
337
338 if (si == s.begin())
339 return sym;
340 si--;
341
342 if (si->start <= ip && ip < si->end)
343 sym = si->name;
344
345 return sym;
346}
347
348static void display_help(void)
349{
350 printf(
351 "Usage: perf-report [<options>]\n"
352 " -i file --input=<file> # input file\n"
353 );
354
355 exit(0);
356}
357
358static void process_options(int argc, char *argv[])
359{
360 int error = 0;
361
362 for (;;) {
363 int option_index = 0;
364 /** Options for getopt */
365 static struct option long_options[] = {
366 {"input", required_argument, NULL, 'i'},
367 {"no-user", no_argument, NULL, 'u'},
368 {"no-kernel", no_argument, NULL, 'k'},
369 {"no-hv", no_argument, NULL, 'h'},
370 {NULL, 0, NULL, 0 }
371 };
372 int c = getopt_long(argc, argv, "+:i:kuh",
373 long_options, &option_index);
374 if (c == -1)
375 break;
376
377 switch (c) {
378 case 'i': input_name = strdup(optarg); break;
379 case 'k': show_mask &= ~SHOW_KERNEL; break;
380 case 'u': show_mask &= ~SHOW_USER; break;
381 case 'h': show_mask &= ~SHOW_HV; break;
382 default: error = 1; break;
383 }
384 }
385
386 if (error)
387 display_help();
388}
389
390int main(int argc, char *argv[])
391{
392 unsigned long offset = 0;
393 unsigned long head = 0;
394 struct stat stat;
395 char *buf;
396 event_t *event;
397 int ret;
398 unsigned long total = 0;
399
400 page_size = getpagesize();
401
402 process_options(argc, argv);
403
404 input = open(input_name, O_RDONLY);
405 if (input < 0) {
406 perror("failed to open file");
407 exit(-1);
408 }
409
410 ret = fstat(input, &stat);
411 if (ret < 0) {
412 perror("failed to stat file");
413 exit(-1);
414 }
415
416 if (!stat.st_size) {
417 fprintf(stderr, "zero-sized file, nothing to do!\n");
418 exit(0);
419 }
420
421 load_kallsyms();
422
423remap:
424 buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ,
425 MAP_SHARED, input, offset);
426 if (buf == MAP_FAILED) {
427 perror("failed to mmap file");
428 exit(-1);
429 }
430
431more:
432 event = (event_t *)(buf + head);
433
434 if (head + event->header.size >= page_size * mmap_window) {
435 unsigned long shift = page_size * (head / page_size);
436 int ret;
437
438 ret = munmap(buf, page_size * mmap_window);
439 assert(ret == 0);
440
441 offset += shift;
442 head -= shift;
443 goto remap;
444 }
445
446
447 if (!event->header.size) {
448 fprintf(stderr, "zero-sized event at file offset %ld\n", offset + head);
449 fprintf(stderr, "skipping %ld bytes of events.\n", stat.st_size - offset - head);
450 goto done;
451 }
452
453 head += event->header.size;
454
455 if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) {
456 std::string comm, sym, level;
457 int show = 0;
458 char output[1024];
459
460 if (event->header.misc & PERF_EVENT_MISC_KERNEL) {
461 show |= SHOW_KERNEL;
462 level = " [k] ";
463 sym = resolve_kernel_symbol(event->ip.ip);
464 } else if (event->header.misc & PERF_EVENT_MISC_USER) {
465 show |= SHOW_USER;
466 level = " [.] ";
467 sym = resolve_user_symbol(event->ip.pid, event->ip.ip);
468 } else {
469 show |= SHOW_HV;
470 level = " [H] ";
471 }
472
473 if (show & show_mask) {
474 comm = resolve_comm(event->ip.pid);
475 snprintf(output, sizeof(output), "%16s %s %s",
476 comm.c_str(), level.c_str(), sym.c_str());
477 hist[output]++;
478 }
479
480 total++;
481
482 } else switch (event->header.type) {
483 case PERF_EVENT_MMAP:
484 maps[event->mmap.pid].insert(map(&event->mmap));
485 break;
486
487 case PERF_EVENT_COMM:
488 comms[event->comm.pid] = std::string(event->comm.comm);
489 break;
490 }
491
492 if (offset + head < stat.st_size)
493 goto more;
494
495done:
496
497 close(input);
498
499 std::map<std::string, int>::iterator hi = hist.begin();
500
501 while (hi != hist.end()) {
502 rev_hist.insert(std::pair<int, std::string>(hi->second, hi->first));
503 hist.erase(hi++);
504 }
505
506 std::multimap<int, std::string>::const_iterator ri = rev_hist.begin();
507
508 while (ri != rev_hist.end()) {
509 printf(" %5.2f %s\n", (100.0 * ri->first)/total, ri->second.c_str());
510 ri++;
511 }
512
513 return 0;
514}
515
diff --git a/Documentation/perf_counter/perf.c b/Documentation/perf_counter/perf.c
new file mode 100644
index 000000000000..594d270be390
--- /dev/null
+++ b/Documentation/perf_counter/perf.c
@@ -0,0 +1,414 @@
1#include "builtin.h"
2#include "util/exec_cmd.h"
3#include "util/cache.h"
4#include "util/quote.h"
5#include "util/run-command.h"
6
7const char perf_usage_string[] =
8 "perf [--version] [--help] COMMAND [ARGS]";
9
10const char perf_more_info_string[] =
11 "See 'perf help COMMAND' for more information on a specific command.";
12
13static int use_pager = -1;
14struct pager_config {
15 const char *cmd;
16 int val;
17};
18
19static int pager_command_config(const char *var, const char *value, void *data)
20{
21 struct pager_config *c = data;
22 if (!prefixcmp(var, "pager.") && !strcmp(var + 6, c->cmd))
23 c->val = perf_config_bool(var, value);
24 return 0;
25}
26
27/* returns 0 for "no pager", 1 for "use pager", and -1 for "not specified" */
28int check_pager_config(const char *cmd)
29{
30 struct pager_config c;
31 c.cmd = cmd;
32 c.val = -1;
33 perf_config(pager_command_config, &c);
34 return c.val;
35}
36
37static void commit_pager_choice(void) {
38 switch (use_pager) {
39 case 0:
40 setenv("PERF_PAGER", "cat", 1);
41 break;
42 case 1:
43 /* setup_pager(); */
44 break;
45 default:
46 break;
47 }
48}
49
50static int handle_options(const char*** argv, int* argc, int* envchanged)
51{
52 int handled = 0;
53
54 while (*argc > 0) {
55 const char *cmd = (*argv)[0];
56 if (cmd[0] != '-')
57 break;
58
59 /*
60 * For legacy reasons, the "version" and "help"
61 * commands can be written with "--" prepended
62 * to make them look like flags.
63 */
64 if (!strcmp(cmd, "--help") || !strcmp(cmd, "--version"))
65 break;
66
67 /*
68 * Check remaining flags.
69 */
70 if (!prefixcmp(cmd, "--exec-path")) {
71 cmd += 11;
72 if (*cmd == '=')
73 perf_set_argv_exec_path(cmd + 1);
74 else {
75 puts(perf_exec_path());
76 exit(0);
77 }
78 } else if (!strcmp(cmd, "--html-path")) {
79 puts(system_path(PERF_HTML_PATH));
80 exit(0);
81 } else if (!strcmp(cmd, "-p") || !strcmp(cmd, "--paginate")) {
82 use_pager = 1;
83 } else if (!strcmp(cmd, "--no-pager")) {
84 use_pager = 0;
85 if (envchanged)
86 *envchanged = 1;
87 } else if (!strcmp(cmd, "--perf-dir")) {
88 if (*argc < 2) {
89 fprintf(stderr, "No directory given for --perf-dir.\n" );
90 usage(perf_usage_string);
91 }
92 setenv(PERF_DIR_ENVIRONMENT, (*argv)[1], 1);
93 if (envchanged)
94 *envchanged = 1;
95 (*argv)++;
96 (*argc)--;
97 handled++;
98 } else if (!prefixcmp(cmd, "--perf-dir=")) {
99			setenv(PERF_DIR_ENVIRONMENT, cmd + 11, 1);
100 if (envchanged)
101 *envchanged = 1;
102 } else if (!strcmp(cmd, "--work-tree")) {
103 if (*argc < 2) {
104 fprintf(stderr, "No directory given for --work-tree.\n" );
105 usage(perf_usage_string);
106 }
107 setenv(PERF_WORK_TREE_ENVIRONMENT, (*argv)[1], 1);
108 if (envchanged)
109 *envchanged = 1;
110 (*argv)++;
111 (*argc)--;
112 } else if (!prefixcmp(cmd, "--work-tree=")) {
113 setenv(PERF_WORK_TREE_ENVIRONMENT, cmd + 12, 1);
114 if (envchanged)
115 *envchanged = 1;
116 } else {
117 fprintf(stderr, "Unknown option: %s\n", cmd);
118 usage(perf_usage_string);
119 }
120
121 (*argv)++;
122 (*argc)--;
123 handled++;
124 }
125 return handled;
126}
127
128static int handle_alias(int *argcp, const char ***argv)
129{
130 int envchanged = 0, ret = 0, saved_errno = errno;
131 int count, option_count;
132 const char** new_argv;
133 const char *alias_command;
134 char *alias_string;
135
136 alias_command = (*argv)[0];
137 alias_string = alias_lookup(alias_command);
138 if (alias_string) {
139 if (alias_string[0] == '!') {
140 if (*argcp > 1) {
141 struct strbuf buf;
142
143 strbuf_init(&buf, PATH_MAX);
144 strbuf_addstr(&buf, alias_string);
145 sq_quote_argv(&buf, (*argv) + 1, PATH_MAX);
146 free(alias_string);
147 alias_string = buf.buf;
148 }
149 ret = system(alias_string + 1);
150 if (ret >= 0 && WIFEXITED(ret) &&
151 WEXITSTATUS(ret) != 127)
152 exit(WEXITSTATUS(ret));
153 die("Failed to run '%s' when expanding alias '%s'",
154 alias_string + 1, alias_command);
155 }
156 count = split_cmdline(alias_string, &new_argv);
157 if (count < 0)
158 die("Bad alias.%s string", alias_command);
159 option_count = handle_options(&new_argv, &count, &envchanged);
160 if (envchanged)
161 die("alias '%s' changes environment variables\n"
162 "You can use '!perf' in the alias to do this.",
163 alias_command);
164 memmove(new_argv - option_count, new_argv,
165 count * sizeof(char *));
166 new_argv -= option_count;
167
168 if (count < 1)
169 die("empty alias for %s", alias_command);
170
171 if (!strcmp(alias_command, new_argv[0]))
172 die("recursive alias: %s", alias_command);
173
174 new_argv = realloc(new_argv, sizeof(char*) *
175 (count + *argcp + 1));
176 /* insert after command name */
177 memcpy(new_argv + count, *argv + 1, sizeof(char*) * *argcp);
178 new_argv[count+*argcp] = NULL;
179
180 *argv = new_argv;
181 *argcp += count - 1;
182
183 ret = 1;
184 }
185
186 errno = saved_errno;
187
188 return ret;
189}
190
191const char perf_version_string[] = PERF_VERSION;
192
193#define RUN_SETUP (1<<0)
194#define USE_PAGER (1<<1)
195/*
196 * require working tree to be present -- anything that uses this needs
197 * RUN_SETUP for reading from the configuration file.
198 */
199#define NEED_WORK_TREE (1<<2)
200
201struct cmd_struct {
202 const char *cmd;
203 int (*fn)(int, const char **, const char *);
204 int option;
205};
206
207static int run_builtin(struct cmd_struct *p, int argc, const char **argv)
208{
209 int status;
210 struct stat st;
211 const char *prefix;
212
213 prefix = NULL;
214 if (p->option & RUN_SETUP)
215 prefix = NULL; /* setup_perf_directory(); */
216
217 if (use_pager == -1 && p->option & RUN_SETUP)
218 use_pager = check_pager_config(p->cmd);
219 if (use_pager == -1 && p->option & USE_PAGER)
220 use_pager = 1;
221 commit_pager_choice();
222
223 if (p->option & NEED_WORK_TREE)
224 /* setup_work_tree() */;
225
226 status = p->fn(argc, argv, prefix);
227 if (status)
228 return status & 0xff;
229
230 /* Somebody closed stdout? */
231 if (fstat(fileno(stdout), &st))
232 return 0;
233 /* Ignore write errors for pipes and sockets.. */
234 if (S_ISFIFO(st.st_mode) || S_ISSOCK(st.st_mode))
235 return 0;
236
237 /* Check for ENOSPC and EIO errors.. */
238 if (fflush(stdout))
239 die("write failure on standard output: %s", strerror(errno));
240 if (ferror(stdout))
241 die("unknown write failure on standard output");
242 if (fclose(stdout))
243 die("close failed on standard output: %s", strerror(errno));
244 return 0;
245}
246
247static void handle_internal_command(int argc, const char **argv)
248{
249 const char *cmd = argv[0];
250 static struct cmd_struct commands[] = {
251 { "help", cmd_help, 0 },
252 { "record", cmd_record, 0 },
253 { "stat", cmd_stat, 0 },
254 { "top", cmd_top, 0 },
255 { "version", cmd_version, 0 },
256 };
257 int i;
258 static const char ext[] = STRIP_EXTENSION;
259
260 if (sizeof(ext) > 1) {
261 i = strlen(argv[0]) - strlen(ext);
262 if (i > 0 && !strcmp(argv[0] + i, ext)) {
263 char *argv0 = strdup(argv[0]);
264 argv[0] = cmd = argv0;
265 argv0[i] = '\0';
266 }
267 }
268
269 /* Turn "perf cmd --help" into "perf help cmd" */
270 if (argc > 1 && !strcmp(argv[1], "--help")) {
271 argv[1] = argv[0];
272 argv[0] = cmd = "help";
273 }
274
275 for (i = 0; i < ARRAY_SIZE(commands); i++) {
276 struct cmd_struct *p = commands+i;
277 if (strcmp(p->cmd, cmd))
278 continue;
279 exit(run_builtin(p, argc, argv));
280 }
281}
282
283static void execv_dashed_external(const char **argv)
284{
285 struct strbuf cmd = STRBUF_INIT;
286 const char *tmp;
287 int status;
288
289 strbuf_addf(&cmd, "perf-%s", argv[0]);
290
291 /*
292 * argv[0] must be the perf command, but the argv array
293 * belongs to the caller, and may be reused in
294 * subsequent loop iterations. Save argv[0] and
295 * restore it on error.
296 */
297 tmp = argv[0];
298 argv[0] = cmd.buf;
299
300 /*
301 * if we fail because the command is not found, it is
302 * OK to return. Otherwise, we just pass along the status code.
303 */
304 status = run_command_v_opt(argv, 0);
305 if (status != -ERR_RUN_COMMAND_EXEC) {
306 if (IS_RUN_COMMAND_ERR(status))
307 die("unable to run '%s'", argv[0]);
308 exit(-status);
309 }
310 errno = ENOENT; /* as if we called execvp */
311
312 argv[0] = tmp;
313
314 strbuf_release(&cmd);
315}
316
317static int run_argv(int *argcp, const char ***argv)
318{
319 int done_alias = 0;
320
321 while (1) {
322 /* See if it's an internal command */
323 handle_internal_command(*argcp, *argv);
324
325 /* .. then try the external ones */
326 execv_dashed_external(*argv);
327
328 /* It could be an alias -- this works around the insanity
329 * of overriding "perf log" with "perf show" by having
330 * alias.log = show
331 */
332 if (done_alias || !handle_alias(argcp, argv))
333 break;
334 done_alias = 1;
335 }
336
337 return done_alias;
338}
339
340
341int main(int argc, const char **argv)
342{
343 const char *cmd;
344
345 cmd = perf_extract_argv0_path(argv[0]);
346 if (!cmd)
347 cmd = "perf-help";
348
349 /*
350 * "perf-xxxx" is the same as "perf xxxx", but we obviously:
351 *
352 * - cannot take flags in between the "perf" and the "xxxx".
353 * - cannot execute it externally (since it would just do
354 * the same thing over again)
355 *
356 * So we just directly call the internal command handler, and
357 * die if that one cannot handle it.
358 */
359 if (!prefixcmp(cmd, "perf-")) {
360		cmd += 5;
361 argv[0] = cmd;
362 handle_internal_command(argc, argv);
363 die("cannot handle %s internally", cmd);
364 }
365
366 /* Look for flags.. */
367 argv++;
368 argc--;
369 handle_options(&argv, &argc, NULL);
370 commit_pager_choice();
371 if (argc > 0) {
372 if (!prefixcmp(argv[0], "--"))
373 argv[0] += 2;
374 } else {
375 /* The user didn't specify a command; give them help */
376 printf("usage: %s\n\n", perf_usage_string);
377 list_common_cmds_help();
378 printf("\n%s\n", perf_more_info_string);
379 exit(1);
380 }
381 cmd = argv[0];
382
383 /*
384 * We use PATH to find perf commands, but we prepend some higher
385 * precedence paths: the "--exec-path" option, the PERF_EXEC_PATH
386 * environment, and the $(perfexecdir) from the Makefile at build
387 * time.
388 */
389 setup_path();
390
391 while (1) {
392 static int done_help = 0;
393 static int was_alias = 0;
394 was_alias = run_argv(&argc, &argv);
395 if (errno != ENOENT)
396 break;
397 if (was_alias) {
398 fprintf(stderr, "Expansion of alias '%s' failed; "
399 "'%s' is not a perf-command\n",
400 cmd, argv[0]);
401 exit(1);
402 }
403 if (!done_help) {
404 cmd = argv[0] = help_unknown_cmd(cmd);
405 done_help = 1;
406 } else
407 break;
408 }
409
410 fprintf(stderr, "Failed to run command '%s': %s\n",
411 cmd, strerror(errno));
412
413 return 1;
414}
diff --git a/Documentation/perf_counter/perf.h b/Documentation/perf_counter/perf.h
new file mode 100644
index 000000000000..6fa3656399f4
--- /dev/null
+++ b/Documentation/perf_counter/perf.h
@@ -0,0 +1,62 @@
1#ifndef _PERF_PERF_H
2#define _PERF_PERF_H
3
4/*
5 * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
6 * counters in the current task.
7 */
8#define PR_TASK_PERF_COUNTERS_DISABLE 31
9#define PR_TASK_PERF_COUNTERS_ENABLE 32
10
11#ifndef NSEC_PER_SEC
12# define NSEC_PER_SEC 1000000000ULL
13#endif
14
15static inline unsigned long long rdclock(void)
16{
17 struct timespec ts;
18
19 clock_gettime(CLOCK_MONOTONIC, &ts);
20 return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
21}
22
23/*
24 * Pick up some kernel type conventions:
25 */
26#define __user
27#define asmlinkage
28
29#if defined(__x86_64__) || defined(__i386__)
30#include "../../arch/x86/include/asm/unistd.h"
31#define rmb() asm volatile("lfence" ::: "memory")
32#define cpu_relax() asm volatile("rep; nop" ::: "memory");
33#endif
34
35#ifdef __powerpc__
36#include "../../arch/powerpc/include/asm/unistd.h"
37#define rmb() asm volatile ("sync" ::: "memory")
38#define cpu_relax() asm volatile ("" ::: "memory");
39#endif
40
41#define unlikely(x) __builtin_expect(!!(x), 0)
42#define min(x, y) ({ \
43 typeof(x) _min1 = (x); \
44 typeof(y) _min2 = (y); \
45 (void) (&_min1 == &_min2); \
46 _min1 < _min2 ? _min1 : _min2; })
47
48static inline int
49sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr,
50 pid_t pid, int cpu, int group_fd,
51 unsigned long flags)
52{
53 return syscall(__NR_perf_counter_open, hw_event_uptr, pid, cpu,
54 group_fd, flags);
55}
56
57#define MAX_COUNTERS 64
58#define MAX_NR_CPUS 256
59
60#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
61
62#endif
diff --git a/Documentation/perf_counter/util/PERF-VERSION-GEN b/Documentation/perf_counter/util/PERF-VERSION-GEN
new file mode 100755
index 000000000000..c561d1538c03
--- /dev/null
+++ b/Documentation/perf_counter/util/PERF-VERSION-GEN
@@ -0,0 +1,42 @@
1#!/bin/sh
2
3GVF=PERF-VERSION-FILE
4DEF_VER=v0.0.1.PERF
5
6LF='
7'
8
9# First see if there is a version file (included in release tarballs),
10# then try git-describe, then default.
11if test -f version
12then
13 VN=$(cat version) || VN="$DEF_VER"
14elif test -d .git -o -f .git &&
15 VN=$(git describe --abbrev=4 HEAD 2>/dev/null) &&
16 case "$VN" in
17 *$LF*) (exit 1) ;;
18 v[0-9]*)
19 git update-index -q --refresh
20 test -z "$(git diff-index --name-only HEAD --)" ||
21 VN="$VN-dirty" ;;
22 esac
23then
24 VN=$(echo "$VN" | sed -e 's/-/./g');
25else
26 VN="$DEF_VER"
27fi
28
29VN=$(expr "$VN" : v*'\(.*\)')
30
31if test -r $GVF
32then
33 VC=$(sed -e 's/^PERF_VERSION = //' <$GVF)
34else
35 VC=unset
36fi
37test "$VN" = "$VC" || {
38 echo >&2 "PERF_VERSION = $VN"
39 echo "PERF_VERSION = $VN" >$GVF
40}
41
42
diff --git a/Documentation/perf_counter/util/abspath.c b/Documentation/perf_counter/util/abspath.c
new file mode 100644
index 000000000000..649f34f83365
--- /dev/null
+++ b/Documentation/perf_counter/util/abspath.c
@@ -0,0 +1,117 @@
1#include "cache.h"
2
3/*
4 * Do not use this for inspecting *tracked* content. When path is a
5 * symlink to a directory, we do not want to say it is a directory when
6 * dealing with tracked content in the working tree.
7 */
8int is_directory(const char *path)
9{
10 struct stat st;
11 return (!stat(path, &st) && S_ISDIR(st.st_mode));
12}
13
14/* We allow "recursive" symbolic links. Only within reason, though. */
15#define MAXDEPTH 5
16
17const char *make_absolute_path(const char *path)
18{
19 static char bufs[2][PATH_MAX + 1], *buf = bufs[0], *next_buf = bufs[1];
20 char cwd[1024] = "";
21 int buf_index = 1, len;
22
23 int depth = MAXDEPTH;
24 char *last_elem = NULL;
25 struct stat st;
26
27 if (strlcpy(buf, path, PATH_MAX) >= PATH_MAX)
28 die ("Too long path: %.*s", 60, path);
29
30 while (depth--) {
31 if (!is_directory(buf)) {
32 char *last_slash = strrchr(buf, '/');
33 if (last_slash) {
34 *last_slash = '\0';
35 last_elem = xstrdup(last_slash + 1);
36 } else {
37 last_elem = xstrdup(buf);
38 *buf = '\0';
39 }
40 }
41
42 if (*buf) {
43 if (!*cwd && !getcwd(cwd, sizeof(cwd)))
44 die ("Could not get current working directory");
45
46 if (chdir(buf))
47 die ("Could not switch to '%s'", buf);
48 }
49 if (!getcwd(buf, PATH_MAX))
50 die ("Could not get current working directory");
51
52 if (last_elem) {
53 int len = strlen(buf);
54 if (len + strlen(last_elem) + 2 > PATH_MAX)
55 die ("Too long path name: '%s/%s'",
56 buf, last_elem);
57 buf[len] = '/';
58 strcpy(buf + len + 1, last_elem);
59 free(last_elem);
60 last_elem = NULL;
61 }
62
63 if (!lstat(buf, &st) && S_ISLNK(st.st_mode)) {
64 len = readlink(buf, next_buf, PATH_MAX);
65 if (len < 0)
66 die ("Invalid symlink: %s", buf);
67 if (PATH_MAX <= len)
68 die("symbolic link too long: %s", buf);
69 next_buf[len] = '\0';
70 buf = next_buf;
71 buf_index = 1 - buf_index;
72 next_buf = bufs[buf_index];
73 } else
74 break;
75 }
76
77 if (*cwd && chdir(cwd))
78 die ("Could not change back to '%s'", cwd);
79
80 return buf;
81}
82
83static const char *get_pwd_cwd(void)
84{
85 static char cwd[PATH_MAX + 1];
86 char *pwd;
87 struct stat cwd_stat, pwd_stat;
88 if (getcwd(cwd, PATH_MAX) == NULL)
89 return NULL;
90 pwd = getenv("PWD");
91 if (pwd && strcmp(pwd, cwd)) {
92 stat(cwd, &cwd_stat);
93 if (!stat(pwd, &pwd_stat) &&
94 pwd_stat.st_dev == cwd_stat.st_dev &&
95 pwd_stat.st_ino == cwd_stat.st_ino) {
96 strlcpy(cwd, pwd, PATH_MAX);
97 }
98 }
99 return cwd;
100}
101
102const char *make_nonrelative_path(const char *path)
103{
104 static char buf[PATH_MAX + 1];
105
106 if (is_absolute_path(path)) {
107 if (strlcpy(buf, path, PATH_MAX) >= PATH_MAX)
108 die("Too long path: %.*s", 60, path);
109 } else {
110 const char *cwd = get_pwd_cwd();
111 if (!cwd)
112 die("Cannot determine the current working directory");
113 if (snprintf(buf, PATH_MAX, "%s/%s", cwd, path) >= PATH_MAX)
114 die("Too long path: %.*s", 60, path);
115 }
116 return buf;
117}
diff --git a/Documentation/perf_counter/util/alias.c b/Documentation/perf_counter/util/alias.c
new file mode 100644
index 000000000000..9b3dd2b428df
--- /dev/null
+++ b/Documentation/perf_counter/util/alias.c
@@ -0,0 +1,77 @@
1#include "cache.h"
2
3static const char *alias_key;
4static char *alias_val;
5
6static int alias_lookup_cb(const char *k, const char *v, void *cb)
7{
8 if (!prefixcmp(k, "alias.") && !strcmp(k+6, alias_key)) {
9 if (!v)
10 return config_error_nonbool(k);
11 alias_val = strdup(v);
12 return 0;
13 }
14 return 0;
15}
16
17char *alias_lookup(const char *alias)
18{
19 alias_key = alias;
20 alias_val = NULL;
21 perf_config(alias_lookup_cb, NULL);
22 return alias_val;
23}
24
25int split_cmdline(char *cmdline, const char ***argv)
26{
27 int src, dst, count = 0, size = 16;
28 char quoted = 0;
29
30 *argv = malloc(sizeof(char*) * size);
31
32 /* split alias_string */
33 (*argv)[count++] = cmdline;
34 for (src = dst = 0; cmdline[src];) {
35 char c = cmdline[src];
36 if (!quoted && isspace(c)) {
37 cmdline[dst++] = 0;
38 while (cmdline[++src]
39 && isspace(cmdline[src]))
40 ; /* skip */
41 if (count >= size) {
42 size += 16;
43 *argv = realloc(*argv, sizeof(char*) * size);
44 }
45 (*argv)[count++] = cmdline + dst;
46 } else if (!quoted && (c == '\'' || c == '"')) {
47 quoted = c;
48 src++;
49 } else if (c == quoted) {
50 quoted = 0;
51 src++;
52 } else {
53 if (c == '\\' && quoted != '\'') {
54 src++;
55 c = cmdline[src];
56 if (!c) {
57 free(*argv);
58 *argv = NULL;
59 return error("cmdline ends with \\");
60 }
61 }
62 cmdline[dst++] = c;
63 src++;
64 }
65 }
66
67 cmdline[dst] = 0;
68
69 if (quoted) {
70 free(*argv);
71 *argv = NULL;
72 return error("unclosed quote");
73 }
74
75 return count;
76}
77
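For illustration (not part of the commit; the alias value is made up): split_cmdline() tokenizes an alias string in place, stripping quotes, and returns the token count or a negative error code. The returned vector points into the modified buffer, so only the vector itself needs to be freed.

	#include <stdio.h>
	#include <stdlib.h>
	#include "cache.h"	/* split_cmdline(), alias_lookup() */

	int main(void)
	{
		/* what alias_lookup("rec") might return for:  [alias]  rec = record -F 99 -e 'cpu-clock' */
		char alias[] = "record -F 99 -e 'cpu-clock'";
		const char **argv;
		int i, argc = split_cmdline(alias, &argv);

		if (argc < 0)			/* e.g. "unclosed quote" */
			return 1;
		for (i = 0; i < argc; i++)
			printf("argv[%d] = %s\n", i, argv[i]);
		/* record, -F, 99, -e, cpu-clock */
		free(argv);
		return 0;
	}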
diff --git a/Documentation/perf_counter/util/cache.h b/Documentation/perf_counter/util/cache.h
new file mode 100644
index 000000000000..71080512fa86
--- /dev/null
+++ b/Documentation/perf_counter/util/cache.h
@@ -0,0 +1,117 @@
1#ifndef CACHE_H
2#define CACHE_H
3
4#include "util.h"
5#include "strbuf.h"
6
7#define PERF_DIR_ENVIRONMENT "PERF_DIR"
8#define PERF_WORK_TREE_ENVIRONMENT "PERF_WORK_TREE"
9#define DEFAULT_PERF_DIR_ENVIRONMENT ".perf"
10#define DB_ENVIRONMENT "PERF_OBJECT_DIRECTORY"
11#define INDEX_ENVIRONMENT "PERF_INDEX_FILE"
12#define GRAFT_ENVIRONMENT "PERF_GRAFT_FILE"
13#define TEMPLATE_DIR_ENVIRONMENT "PERF_TEMPLATE_DIR"
14#define CONFIG_ENVIRONMENT "PERF_CONFIG"
15#define EXEC_PATH_ENVIRONMENT "PERF_EXEC_PATH"
16#define CEILING_DIRECTORIES_ENVIRONMENT "PERF_CEILING_DIRECTORIES"
17#define PERFATTRIBUTES_FILE ".perfattributes"
18#define INFOATTRIBUTES_FILE "info/attributes"
19#define ATTRIBUTE_MACRO_PREFIX "[attr]"
20
21typedef int (*config_fn_t)(const char *, const char *, void *);
22extern int perf_default_config(const char *, const char *, void *);
23extern int perf_config_from_file(config_fn_t fn, const char *, void *);
24extern int perf_config(config_fn_t fn, void *);
25extern int perf_parse_ulong(const char *, unsigned long *);
26extern int perf_config_int(const char *, const char *);
27extern unsigned long perf_config_ulong(const char *, const char *);
28extern int perf_config_bool_or_int(const char *, const char *, int *);
29extern int perf_config_bool(const char *, const char *);
30extern int perf_config_string(const char **, const char *, const char *);
31extern int perf_config_set(const char *, const char *);
32extern int perf_config_set_multivar(const char *, const char *, const char *, int);
33extern int perf_config_rename_section(const char *, const char *);
34extern const char *perf_etc_perfconfig(void);
35extern int check_repository_format_version(const char *var, const char *value, void *cb);
36extern int perf_config_system(void);
37extern int perf_config_global(void);
38extern int config_error_nonbool(const char *);
39extern const char *config_exclusive_filename;
40
41#define MAX_PERFNAME (1000)
42extern char perf_default_email[MAX_PERFNAME];
43extern char perf_default_name[MAX_PERFNAME];
44extern int user_ident_explicitly_given;
45
46extern const char *perf_log_output_encoding;
47extern const char *perf_mailmap_file;
48
49/* IO helper functions */
50extern void maybe_flush_or_die(FILE *, const char *);
51extern int copy_fd(int ifd, int ofd);
52extern int copy_file(const char *dst, const char *src, int mode);
53extern ssize_t read_in_full(int fd, void *buf, size_t count);
54extern ssize_t write_in_full(int fd, const void *buf, size_t count);
55extern void write_or_die(int fd, const void *buf, size_t count);
56extern int write_or_whine(int fd, const void *buf, size_t count, const char *msg);
57extern int write_or_whine_pipe(int fd, const void *buf, size_t count, const char *msg);
58extern void fsync_or_die(int fd, const char *);
59
60/* pager.c */
61extern void setup_pager(void);
62extern const char *pager_program;
63extern int pager_in_use(void);
64extern int pager_use_color;
65
66extern const char *editor_program;
67extern const char *excludes_file;
68
69char *alias_lookup(const char *alias);
70int split_cmdline(char *cmdline, const char ***argv);
71
72#define alloc_nr(x) (((x)+16)*3/2)
73
74/*
75 * Realloc the buffer pointed at by variable 'x' so that it can hold
76 * at least 'nr' entries; the number of entries currently allocated
77 * is 'alloc', using the standard growing factor alloc_nr() macro.
78 *
79 * DO NOT USE any expression with side-effect for 'x' or 'alloc'.
80 */
81#define ALLOC_GROW(x, nr, alloc) \
82 do { \
83 if ((nr) > alloc) { \
84 if (alloc_nr(alloc) < (nr)) \
85 alloc = (nr); \
86 else \
87 alloc = alloc_nr(alloc); \
88 x = xrealloc((x), alloc * sizeof(*(x))); \
89 } \
90 } while(0)
91
92
93static inline int is_absolute_path(const char *path)
94{
95 return path[0] == '/';
96}
97
98const char *make_absolute_path(const char *path);
99const char *make_nonrelative_path(const char *path);
100const char *make_relative_path(const char *abs, const char *base);
101int normalize_path_copy(char *dst, const char *src);
102int longest_ancestor_length(const char *path, const char *prefix_list);
103char *strip_path_suffix(const char *path, const char *suffix);
104
105extern char *mkpath(const char *fmt, ...) __attribute__((format (printf, 1, 2)));
106extern char *perf_path(const char *fmt, ...) __attribute__((format (printf, 1, 2)));
107
108extern char *mksnpath(char *buf, size_t n, const char *fmt, ...)
109 __attribute__((format (printf, 3, 4)));
110extern char *perf_snpath(char *buf, size_t n, const char *fmt, ...)
111 __attribute__((format (printf, 3, 4)));
112extern char *perf_pathdup(const char *fmt, ...)
113 __attribute__((format (printf, 1, 2)));
114
115extern size_t strlcpy(char *dest, const char *src, size_t size);
116
117#endif /* CACHE_H */
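ALLOC_GROW() above is the array-growth primitive the other helpers rely on (help.c grows its command lists with it). A small sketch of the calling convention, assuming xrealloc() from util.h; this example is not taken from the commit:

	#include "cache.h"

	struct int_list {
		int *items;
		int nr;		/* entries in use */
		int alloc;	/* entries allocated */
	};

	static void int_list_append(struct int_list *list, int value)
	{
		/* ensure room for nr+1 entries, growing amortized via alloc_nr() */
		ALLOC_GROW(list->items, list->nr + 1, list->alloc);
		list->items[list->nr++] = value;
	}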
diff --git a/Documentation/perf_counter/util/config.c b/Documentation/perf_counter/util/config.c
new file mode 100644
index 000000000000..3dd13faa6a27
--- /dev/null
+++ b/Documentation/perf_counter/util/config.c
@@ -0,0 +1,873 @@
1/*
2 * GIT - The information manager from hell
3 *
4 * Copyright (C) Linus Torvalds, 2005
5 * Copyright (C) Johannes Schindelin, 2005
6 *
7 */
8#include "util.h"
9#include "cache.h"
10#include "exec_cmd.h"
11
12#define MAXNAME (256)
13
14static FILE *config_file;
15static const char *config_file_name;
16static int config_linenr;
17static int config_file_eof;
18
19const char *config_exclusive_filename = NULL;
20
21static int get_next_char(void)
22{
23 int c;
24 FILE *f;
25
26 c = '\n';
27 if ((f = config_file) != NULL) {
28 c = fgetc(f);
29 if (c == '\r') {
30 /* DOS like systems */
31 c = fgetc(f);
32 if (c != '\n') {
33 ungetc(c, f);
34 c = '\r';
35 }
36 }
37 if (c == '\n')
38 config_linenr++;
39 if (c == EOF) {
40 config_file_eof = 1;
41 c = '\n';
42 }
43 }
44 return c;
45}
46
47static char *parse_value(void)
48{
49 static char value[1024];
50 int quote = 0, comment = 0, len = 0, space = 0;
51
52 for (;;) {
53 int c = get_next_char();
54 if (len >= sizeof(value) - 1)
55 return NULL;
56 if (c == '\n') {
57 if (quote)
58 return NULL;
59 value[len] = 0;
60 return value;
61 }
62 if (comment)
63 continue;
64 if (isspace(c) && !quote) {
65 space = 1;
66 continue;
67 }
68 if (!quote) {
69 if (c == ';' || c == '#') {
70 comment = 1;
71 continue;
72 }
73 }
74 if (space) {
75 if (len)
76 value[len++] = ' ';
77 space = 0;
78 }
79 if (c == '\\') {
80 c = get_next_char();
81 switch (c) {
82 case '\n':
83 continue;
84 case 't':
85 c = '\t';
86 break;
87 case 'b':
88 c = '\b';
89 break;
90 case 'n':
91 c = '\n';
92 break;
93 /* Some characters escape as themselves */
94 case '\\': case '"':
95 break;
96 /* Reject unknown escape sequences */
97 default:
98 return NULL;
99 }
100 value[len++] = c;
101 continue;
102 }
103 if (c == '"') {
104 quote = 1-quote;
105 continue;
106 }
107 value[len++] = c;
108 }
109}
110
111static inline int iskeychar(int c)
112{
113 return isalnum(c) || c == '-';
114}
115
116static int get_value(config_fn_t fn, void *data, char *name, unsigned int len)
117{
118 int c;
119 char *value;
120
121 /* Get the full name */
122 for (;;) {
123 c = get_next_char();
124 if (config_file_eof)
125 break;
126 if (!iskeychar(c))
127 break;
128 name[len++] = tolower(c);
129 if (len >= MAXNAME)
130 return -1;
131 }
132 name[len] = 0;
133 while (c == ' ' || c == '\t')
134 c = get_next_char();
135
136 value = NULL;
137 if (c != '\n') {
138 if (c != '=')
139 return -1;
140 value = parse_value();
141 if (!value)
142 return -1;
143 }
144 return fn(name, value, data);
145}
146
147static int get_extended_base_var(char *name, int baselen, int c)
148{
149 do {
150 if (c == '\n')
151 return -1;
152 c = get_next_char();
153 } while (isspace(c));
154
155 /* We require the format to be '[base "extension"]' */
156 if (c != '"')
157 return -1;
158 name[baselen++] = '.';
159
160 for (;;) {
161 int c = get_next_char();
162 if (c == '\n')
163 return -1;
164 if (c == '"')
165 break;
166 if (c == '\\') {
167 c = get_next_char();
168 if (c == '\n')
169 return -1;
170 }
171 name[baselen++] = c;
172 if (baselen > MAXNAME / 2)
173 return -1;
174 }
175
176 /* Final ']' */
177 if (get_next_char() != ']')
178 return -1;
179 return baselen;
180}
181
182static int get_base_var(char *name)
183{
184 int baselen = 0;
185
186 for (;;) {
187 int c = get_next_char();
188 if (config_file_eof)
189 return -1;
190 if (c == ']')
191 return baselen;
192 if (isspace(c))
193 return get_extended_base_var(name, baselen, c);
194 if (!iskeychar(c) && c != '.')
195 return -1;
196 if (baselen > MAXNAME / 2)
197 return -1;
198 name[baselen++] = tolower(c);
199 }
200}
201
202static int perf_parse_file(config_fn_t fn, void *data)
203{
204 int comment = 0;
205 int baselen = 0;
206 static char var[MAXNAME];
207
208 /* U+FEFF Byte Order Mark in UTF8 */
209 static const unsigned char *utf8_bom = (unsigned char *) "\xef\xbb\xbf";
210 const unsigned char *bomptr = utf8_bom;
211
212 for (;;) {
213 int c = get_next_char();
214 if (bomptr && *bomptr) {
215 /* We are at the file beginning; skip UTF8-encoded BOM
216 * if present. Sane editors won't put this in on their
217 * own, but e.g. Windows Notepad will do it happily. */
218 if ((unsigned char) c == *bomptr) {
219 bomptr++;
220 continue;
221 } else {
222 /* Do not tolerate partial BOM. */
223 if (bomptr != utf8_bom)
224 break;
225 /* No BOM at file beginning. Cool. */
226 bomptr = NULL;
227 }
228 }
229 if (c == '\n') {
230 if (config_file_eof)
231 return 0;
232 comment = 0;
233 continue;
234 }
235 if (comment || isspace(c))
236 continue;
237 if (c == '#' || c == ';') {
238 comment = 1;
239 continue;
240 }
241 if (c == '[') {
242 baselen = get_base_var(var);
243 if (baselen <= 0)
244 break;
245 var[baselen++] = '.';
246 var[baselen] = 0;
247 continue;
248 }
249 if (!isalpha(c))
250 break;
251 var[baselen] = tolower(c);
252 if (get_value(fn, data, var, baselen+1) < 0)
253 break;
254 }
255 die("bad config file line %d in %s", config_linenr, config_file_name);
256}
257
258static int parse_unit_factor(const char *end, unsigned long *val)
259{
260 if (!*end)
261 return 1;
262 else if (!strcasecmp(end, "k")) {
263 *val *= 1024;
264 return 1;
265 }
266 else if (!strcasecmp(end, "m")) {
267 *val *= 1024 * 1024;
268 return 1;
269 }
270 else if (!strcasecmp(end, "g")) {
271 *val *= 1024 * 1024 * 1024;
272 return 1;
273 }
274 return 0;
275}
276
277static int perf_parse_long(const char *value, long *ret)
278{
279 if (value && *value) {
280 char *end;
281 long val = strtol(value, &end, 0);
282 unsigned long factor = 1;
283 if (!parse_unit_factor(end, &factor))
284 return 0;
285 *ret = val * factor;
286 return 1;
287 }
288 return 0;
289}
290
291int perf_parse_ulong(const char *value, unsigned long *ret)
292{
293 if (value && *value) {
294 char *end;
295 unsigned long val = strtoul(value, &end, 0);
296 if (!parse_unit_factor(end, &val))
297 return 0;
298 *ret = val;
299 return 1;
300 }
301 return 0;
302}
303
304static void die_bad_config(const char *name)
305{
306 if (config_file_name)
307 die("bad config value for '%s' in %s", name, config_file_name);
308 die("bad config value for '%s'", name);
309}
310
311int perf_config_int(const char *name, const char *value)
312{
313 long ret = 0;
314 if (!perf_parse_long(value, &ret))
315 die_bad_config(name);
316 return ret;
317}
318
319unsigned long perf_config_ulong(const char *name, const char *value)
320{
321 unsigned long ret;
322 if (!perf_parse_ulong(value, &ret))
323 die_bad_config(name);
324 return ret;
325}
326
327int perf_config_bool_or_int(const char *name, const char *value, int *is_bool)
328{
329 *is_bool = 1;
330 if (!value)
331 return 1;
332 if (!*value)
333 return 0;
334 if (!strcasecmp(value, "true") || !strcasecmp(value, "yes") || !strcasecmp(value, "on"))
335 return 1;
336 if (!strcasecmp(value, "false") || !strcasecmp(value, "no") || !strcasecmp(value, "off"))
337 return 0;
338 *is_bool = 0;
339 return perf_config_int(name, value);
340}
341
342int perf_config_bool(const char *name, const char *value)
343{
344 int discard;
345 return !!perf_config_bool_or_int(name, value, &discard);
346}
347
348int perf_config_string(const char **dest, const char *var, const char *value)
349{
350 if (!value)
351 return config_error_nonbool(var);
352 *dest = strdup(value);
353 return 0;
354}
355
356static int perf_default_core_config(const char *var, const char *value)
357{
358 /* Add other config variables here and to Documentation/config.txt. */
359 return 0;
360}
361
362int perf_default_config(const char *var, const char *value, void *dummy)
363{
364 if (!prefixcmp(var, "core."))
365 return perf_default_core_config(var, value);
366
367 /* Add other config variables here and to Documentation/config.txt. */
368 return 0;
369}
370
371int perf_config_from_file(config_fn_t fn, const char *filename, void *data)
372{
373 int ret;
374 FILE *f = fopen(filename, "r");
375
376 ret = -1;
377 if (f) {
378 config_file = f;
379 config_file_name = filename;
380 config_linenr = 1;
381 config_file_eof = 0;
382 ret = perf_parse_file(fn, data);
383 fclose(f);
384 config_file_name = NULL;
385 }
386 return ret;
387}
388
389const char *perf_etc_perfconfig(void)
390{
391 static const char *system_wide;
392 if (!system_wide)
393 system_wide = system_path(ETC_PERFCONFIG);
394 return system_wide;
395}
396
397static int perf_env_bool(const char *k, int def)
398{
399 const char *v = getenv(k);
400 return v ? perf_config_bool(k, v) : def;
401}
402
403int perf_config_system(void)
404{
405 return !perf_env_bool("PERF_CONFIG_NOSYSTEM", 0);
406}
407
408int perf_config_global(void)
409{
410 return !perf_env_bool("PERF_CONFIG_NOGLOBAL", 0);
411}
412
413int perf_config(config_fn_t fn, void *data)
414{
415 int ret = 0, found = 0;
416 char *repo_config = NULL;
417 const char *home = NULL;
418
419 /* Setting $PERF_CONFIG makes perf read _only_ the given config file. */
420 if (config_exclusive_filename)
421 return perf_config_from_file(fn, config_exclusive_filename, data);
422 if (perf_config_system() && !access(perf_etc_perfconfig(), R_OK)) {
423 ret += perf_config_from_file(fn, perf_etc_perfconfig(),
424 data);
425 found += 1;
426 }
427
428 home = getenv("HOME");
429 if (perf_config_global() && home) {
430 char *user_config = strdup(mkpath("%s/.perfconfig", home));
431 if (!access(user_config, R_OK)) {
432 ret += perf_config_from_file(fn, user_config, data);
433 found += 1;
434 }
435 free(user_config);
436 }
437
438 repo_config = perf_pathdup("config");
439 if (!access(repo_config, R_OK)) {
440 ret += perf_config_from_file(fn, repo_config, data);
441 found += 1;
442 }
443 free(repo_config);
444 if (found == 0)
445 return -1;
446 return ret;
447}
448
449/*
450 * Find all the stuff for perf_config_set() below.
451 */
452
453#define MAX_MATCHES 512
454
455static struct {
456 int baselen;
457 char* key;
458 int do_not_match;
459 regex_t* value_regex;
460 int multi_replace;
461 size_t offset[MAX_MATCHES];
462 enum { START, SECTION_SEEN, SECTION_END_SEEN, KEY_SEEN } state;
463 int seen;
464} store;
465
466static int matches(const char* key, const char* value)
467{
468 return !strcmp(key, store.key) &&
469 (store.value_regex == NULL ||
470 (store.do_not_match ^
471 !regexec(store.value_regex, value, 0, NULL, 0)));
472}
473
474static int store_aux(const char* key, const char* value, void *cb)
475{
476 const char *ep;
477 size_t section_len;
478
479 switch (store.state) {
480 case KEY_SEEN:
481 if (matches(key, value)) {
482 if (store.seen == 1 && store.multi_replace == 0) {
483 warning("%s has multiple values", key);
484 } else if (store.seen >= MAX_MATCHES) {
485 error("too many matches for %s", key);
486 return 1;
487 }
488
489 store.offset[store.seen] = ftell(config_file);
490 store.seen++;
491 }
492 break;
493 case SECTION_SEEN:
494 /*
495 * What we are looking for is in store.key (both
496 * section and var), and its section part is baselen
497 * long. We found key (again, both section and var).
498 * We would want to know if this key is in the same
499 * section as what we are looking for. We already
500 * know we are in the same section as what should
501 * hold store.key.
502 */
503 ep = strrchr(key, '.');
504 section_len = ep - key;
505
506 if ((section_len != store.baselen) ||
507 memcmp(key, store.key, section_len+1)) {
508 store.state = SECTION_END_SEEN;
509 break;
510 }
511
512 /*
513 * Do not increment matches: this is no match, but we
514 * just made sure we are in the desired section.
515 */
516 store.offset[store.seen] = ftell(config_file);
517 /* fallthru */
518 case SECTION_END_SEEN:
519 case START:
520 if (matches(key, value)) {
521 store.offset[store.seen] = ftell(config_file);
522 store.state = KEY_SEEN;
523 store.seen++;
524 } else {
525 if (strrchr(key, '.') - key == store.baselen &&
526 !strncmp(key, store.key, store.baselen)) {
527 store.state = SECTION_SEEN;
528 store.offset[store.seen] = ftell(config_file);
529 }
530 }
531 }
532 return 0;
533}
534
535static int store_write_section(int fd, const char* key)
536{
537 const char *dot;
538 int i, success;
539 struct strbuf sb = STRBUF_INIT;
540
541 dot = memchr(key, '.', store.baselen);
542 if (dot) {
543 strbuf_addf(&sb, "[%.*s \"", (int)(dot - key), key);
544 for (i = dot - key + 1; i < store.baselen; i++) {
545 if (key[i] == '"' || key[i] == '\\')
546 strbuf_addch(&sb, '\\');
547 strbuf_addch(&sb, key[i]);
548 }
549 strbuf_addstr(&sb, "\"]\n");
550 } else {
551 strbuf_addf(&sb, "[%.*s]\n", store.baselen, key);
552 }
553
554 success = write_in_full(fd, sb.buf, sb.len) == sb.len;
555 strbuf_release(&sb);
556
557 return success;
558}
559
560static int store_write_pair(int fd, const char* key, const char* value)
561{
562 int i, success;
563 int length = strlen(key + store.baselen + 1);
564 const char *quote = "";
565 struct strbuf sb = STRBUF_INIT;
566
567 /*
568 * Check to see if the value needs to be surrounded with a dq pair.
569 * Note that problematic characters are always backslash-quoted; this
570 * check is about not losing leading or trailing SP and strings that
571 * follow beginning-of-comment characters (i.e. ';' and '#') by the
572 * configuration parser.
573 */
574 if (value[0] == ' ')
575 quote = "\"";
576 for (i = 0; value[i]; i++)
577 if (value[i] == ';' || value[i] == '#')
578 quote = "\"";
579 if (i && value[i - 1] == ' ')
580 quote = "\"";
581
582 strbuf_addf(&sb, "\t%.*s = %s",
583 length, key + store.baselen + 1, quote);
584
585 for (i = 0; value[i]; i++)
586 switch (value[i]) {
587 case '\n':
588 strbuf_addstr(&sb, "\\n");
589 break;
590 case '\t':
591 strbuf_addstr(&sb, "\\t");
592 break;
593 case '"':
594 case '\\':
595 strbuf_addch(&sb, '\\');
596 default:
597 strbuf_addch(&sb, value[i]);
598 break;
599 }
600 strbuf_addf(&sb, "%s\n", quote);
601
602 success = write_in_full(fd, sb.buf, sb.len) == sb.len;
603 strbuf_release(&sb);
604
605 return success;
606}
607
608static ssize_t find_beginning_of_line(const char* contents, size_t size,
609 size_t offset_, int* found_bracket)
610{
611 size_t equal_offset = size, bracket_offset = size;
612 ssize_t offset;
613
614contline:
615 for (offset = offset_-2; offset > 0
616 && contents[offset] != '\n'; offset--)
617 switch (contents[offset]) {
618 case '=': equal_offset = offset; break;
619 case ']': bracket_offset = offset; break;
620 }
621 if (offset > 0 && contents[offset-1] == '\\') {
622 offset_ = offset;
623 goto contline;
624 }
625 if (bracket_offset < equal_offset) {
626 *found_bracket = 1;
627 offset = bracket_offset+1;
628 } else
629 offset++;
630
631 return offset;
632}
633
634int perf_config_set(const char* key, const char* value)
635{
636 return perf_config_set_multivar(key, value, NULL, 0);
637}
638
639/*
 640 * If value==NULL, unset (i.e. remove from) the config;
 641 * if value_regex!=NULL, disregard key/value pairs where value does not match;
 642 * if multi_replace==0, nothing or only one matching key/value is replaced,
 643 * else all matching key/values (regardless of how many) are removed
 644 * before the new pair is written.
645 *
646 * Returns 0 on success.
647 *
648 * This function does this:
649 *
650 * - it locks the config file by creating ".perf/config.lock"
651 *
652 * - it then parses the config using store_aux() as validator to find
 653 * the position of the key/value pair to replace. If it is to be unset,
654 * it must be found exactly once.
655 *
656 * - the config file is mmap()ed and the part before the match (if any) is
657 * written to the lock file, then the changed part and the rest.
658 *
659 * - the config file is removed and the lock file rename()d to it.
660 *
661 */
662int perf_config_set_multivar(const char* key, const char* value,
663 const char* value_regex, int multi_replace)
664{
665 int i, dot;
666 int fd = -1, in_fd;
667 int ret = 0;
668 char* config_filename;
669 const char* last_dot = strrchr(key, '.');
670
671 if (config_exclusive_filename)
672 config_filename = strdup(config_exclusive_filename);
673 else
674 config_filename = perf_pathdup("config");
675
676 /*
677 * Since "key" actually contains the section name and the real
678 * key name separated by a dot, we have to know where the dot is.
679 */
680
681 if (last_dot == NULL) {
682 error("key does not contain a section: %s", key);
683 ret = 2;
684 goto out_free;
685 }
686 store.baselen = last_dot - key;
687
688 store.multi_replace = multi_replace;
689
690 /*
691 * Validate the key and while at it, lower case it for matching.
692 */
693 store.key = malloc(strlen(key) + 1);
694 dot = 0;
695 for (i = 0; key[i]; i++) {
696 unsigned char c = key[i];
697 if (c == '.')
698 dot = 1;
699 /* Leave the extended basename untouched.. */
700 if (!dot || i > store.baselen) {
701 if (!iskeychar(c) || (i == store.baselen+1 && !isalpha(c))) {
702 error("invalid key: %s", key);
703 free(store.key);
704 ret = 1;
705 goto out_free;
706 }
707 c = tolower(c);
708 } else if (c == '\n') {
709 error("invalid key (newline): %s", key);
710 free(store.key);
711 ret = 1;
712 goto out_free;
713 }
714 store.key[i] = c;
715 }
716 store.key[i] = 0;
717
718 /*
719 * If .perf/config does not exist yet, write a minimal version.
720 */
721 in_fd = open(config_filename, O_RDONLY);
722	if (in_fd < 0) {
723 free(store.key);
724
725		if (errno != ENOENT) {
726 error("opening %s: %s", config_filename,
727 strerror(errno));
728 ret = 3; /* same as "invalid config file" */
729 goto out_free;
730 }
731 /* if nothing to unset, error out */
732 if (value == NULL) {
733 ret = 5;
734 goto out_free;
735 }
736
737 store.key = (char*)key;
738 if (!store_write_section(fd, key) ||
739 !store_write_pair(fd, key, value))
740 goto write_err_out;
741 } else {
742 struct stat st;
743 char* contents;
744 size_t contents_sz, copy_begin, copy_end;
745 int i, new_line = 0;
746
747 if (value_regex == NULL)
748 store.value_regex = NULL;
749 else {
750 if (value_regex[0] == '!') {
751 store.do_not_match = 1;
752 value_regex++;
753 } else
754 store.do_not_match = 0;
755
756 store.value_regex = (regex_t*)malloc(sizeof(regex_t));
757 if (regcomp(store.value_regex, value_regex,
758 REG_EXTENDED)) {
759 error("invalid pattern: %s", value_regex);
760 free(store.value_regex);
761 ret = 6;
762 goto out_free;
763 }
764 }
765
766 store.offset[0] = 0;
767 store.state = START;
768 store.seen = 0;
769
770 /*
771 * After this, store.offset will contain the *end* offset
772 * of the last match, or remain at 0 if no match was found.
773 * As a side effect, we make sure to transform only a valid
774 * existing config file.
775 */
776 if (perf_config_from_file(store_aux, config_filename, NULL)) {
777 error("invalid config file %s", config_filename);
778 free(store.key);
779 if (store.value_regex != NULL) {
780 regfree(store.value_regex);
781 free(store.value_regex);
782 }
783 ret = 3;
784 goto out_free;
785 }
786
787 free(store.key);
788 if (store.value_regex != NULL) {
789 regfree(store.value_regex);
790 free(store.value_regex);
791 }
792
793 /* if nothing to unset, or too many matches, error out */
794 if ((store.seen == 0 && value == NULL) ||
795 (store.seen > 1 && multi_replace == 0)) {
796 ret = 5;
797 goto out_free;
798 }
799
800 fstat(in_fd, &st);
801 contents_sz = xsize_t(st.st_size);
802 contents = mmap(NULL, contents_sz, PROT_READ,
803 MAP_PRIVATE, in_fd, 0);
804 close(in_fd);
805
806 if (store.seen == 0)
807 store.seen = 1;
808
809 for (i = 0, copy_begin = 0; i < store.seen; i++) {
810 if (store.offset[i] == 0) {
811 store.offset[i] = copy_end = contents_sz;
812 } else if (store.state != KEY_SEEN) {
813 copy_end = store.offset[i];
814 } else
815 copy_end = find_beginning_of_line(
816 contents, contents_sz,
817 store.offset[i]-2, &new_line);
818
819 if (copy_end > 0 && contents[copy_end-1] != '\n')
820 new_line = 1;
821
822 /* write the first part of the config */
823 if (copy_end > copy_begin) {
824 if (write_in_full(fd, contents + copy_begin,
825 copy_end - copy_begin) <
826 copy_end - copy_begin)
827 goto write_err_out;
828 if (new_line &&
829 write_in_full(fd, "\n", 1) != 1)
830 goto write_err_out;
831 }
832 copy_begin = store.offset[i];
833 }
834
835 /* write the pair (value == NULL means unset) */
836 if (value != NULL) {
837 if (store.state == START) {
838 if (!store_write_section(fd, key))
839 goto write_err_out;
840 }
841 if (!store_write_pair(fd, key, value))
842 goto write_err_out;
843 }
844
845 /* write the rest of the config */
846 if (copy_begin < contents_sz)
847 if (write_in_full(fd, contents + copy_begin,
848 contents_sz - copy_begin) <
849 contents_sz - copy_begin)
850 goto write_err_out;
851
852 munmap(contents, contents_sz);
853 }
854
855 ret = 0;
856
857out_free:
858 free(config_filename);
859 return ret;
860
861write_err_out:
862	ret = 4;	/* report the failed write instead of silently returning success */
863	goto out_free;
864}
865
866/*
867 * Call this to report error for your variable that should not
868 * get a boolean value (i.e. "[my] var" means "true").
869 */
870int config_error_nonbool(const char *var)
871{
872 return error("Missing value for '%s'", var);
873}
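A hypothetical example (not from the commit; the variable names are invented) of the callback convention perf_config() expects: the callback is invoked once per "section.key" pair in the order system file, ~/.perfconfig, then the repository config, and unrecognized keys are normally forwarded to perf_default_config().

	#include <string.h>
	#include "cache.h"

	static int use_tui = 1;
	static const char *sort_order;

	static int example_config_cb(const char *var, const char *value, void *cb)
	{
		/* a key without "= value" is a boolean "true"; perf_config_bool() handles that */
		if (!strcmp(var, "example.usetui")) {
			use_tui = perf_config_bool(var, value);
			return 0;
		}
		/* strings must carry a value; config_error_nonbool() complains otherwise */
		if (!strcmp(var, "example.sortorder"))
			return perf_config_string(&sort_order, var, value);

		return perf_default_config(var, value, cb);
	}

	/* in a command's setup code:  perf_config(example_config_cb, NULL);  */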
diff --git a/Documentation/perf_counter/util/ctype.c b/Documentation/perf_counter/util/ctype.c
new file mode 100644
index 000000000000..b90ec004f29c
--- /dev/null
+++ b/Documentation/perf_counter/util/ctype.c
@@ -0,0 +1,26 @@
1/*
2 * Sane locale-independent, ASCII ctype.
3 *
4 * No surprises, and works with signed and unsigned chars.
5 */
6#include "cache.h"
7
8enum {
9 S = GIT_SPACE,
10 A = GIT_ALPHA,
11 D = GIT_DIGIT,
12 G = GIT_GLOB_SPECIAL, /* *, ?, [, \\ */
13 R = GIT_REGEX_SPECIAL, /* $, (, ), +, ., ^, {, | * */
14};
15
16unsigned char sane_ctype[256] = {
17 0, 0, 0, 0, 0, 0, 0, 0, 0, S, S, 0, 0, S, 0, 0, /* 0.. 15 */
18 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16.. 31 */
19 S, 0, 0, 0, R, 0, 0, 0, R, R, G, R, 0, 0, R, 0, /* 32.. 47 */
20 D, D, D, D, D, D, D, D, D, D, 0, 0, 0, 0, 0, G, /* 48.. 63 */
21 0, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, /* 64.. 79 */
22 A, A, A, A, A, A, A, A, A, A, A, G, G, 0, R, 0, /* 80.. 95 */
23 0, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, /* 96..111 */
24 A, A, A, A, A, A, A, A, A, A, A, R, R, 0, 0, 0, /* 112..127 */
25 /* Nothing in the 128.. range */
26};
diff --git a/Documentation/perf_counter/util/exec_cmd.c b/Documentation/perf_counter/util/exec_cmd.c
new file mode 100644
index 000000000000..d39292263153
--- /dev/null
+++ b/Documentation/perf_counter/util/exec_cmd.c
@@ -0,0 +1,165 @@
1#include "cache.h"
2#include "exec_cmd.h"
3#include "quote.h"
4#define MAX_ARGS 32
5
6extern char **environ;
7static const char *argv_exec_path;
8static const char *argv0_path;
9
10const char *system_path(const char *path)
11{
12#ifdef RUNTIME_PREFIX
13 static const char *prefix;
14#else
15 static const char *prefix = PREFIX;
16#endif
17 struct strbuf d = STRBUF_INIT;
18
19 if (is_absolute_path(path))
20 return path;
21
22#ifdef RUNTIME_PREFIX
23 assert(argv0_path);
24 assert(is_absolute_path(argv0_path));
25
26 if (!prefix &&
27 !(prefix = strip_path_suffix(argv0_path, PERF_EXEC_PATH)) &&
28 !(prefix = strip_path_suffix(argv0_path, BINDIR)) &&
29 !(prefix = strip_path_suffix(argv0_path, "perf"))) {
30 prefix = PREFIX;
31 fprintf(stderr, "RUNTIME_PREFIX requested, "
32 "but prefix computation failed. "
33 "Using static fallback '%s'.\n", prefix);
34 }
35#endif
36
37 strbuf_addf(&d, "%s/%s", prefix, path);
38 path = strbuf_detach(&d, NULL);
39 return path;
40}
41
42const char *perf_extract_argv0_path(const char *argv0)
43{
44 const char *slash;
45
46 if (!argv0 || !*argv0)
47 return NULL;
48 slash = argv0 + strlen(argv0);
49
50 while (argv0 <= slash && !is_dir_sep(*slash))
51 slash--;
52
53 if (slash >= argv0) {
54 argv0_path = strndup(argv0, slash - argv0);
55 return slash + 1;
56 }
57
58 return argv0;
59}
60
61void perf_set_argv_exec_path(const char *exec_path)
62{
63 argv_exec_path = exec_path;
64 /*
65 * Propagate this setting to external programs.
66 */
67 setenv(EXEC_PATH_ENVIRONMENT, exec_path, 1);
68}
69
70
 71/* Returns the highest-priority location to look for perf programs. */
72const char *perf_exec_path(void)
73{
74 const char *env;
75
76 if (argv_exec_path)
77 return argv_exec_path;
78
79 env = getenv(EXEC_PATH_ENVIRONMENT);
80 if (env && *env) {
81 return env;
82 }
83
84 return system_path(PERF_EXEC_PATH);
85}
86
87static void add_path(struct strbuf *out, const char *path)
88{
89 if (path && *path) {
90 if (is_absolute_path(path))
91 strbuf_addstr(out, path);
92 else
93 strbuf_addstr(out, make_nonrelative_path(path));
94
95 strbuf_addch(out, PATH_SEP);
96 }
97}
98
99void setup_path(void)
100{
101 const char *old_path = getenv("PATH");
102 struct strbuf new_path = STRBUF_INIT;
103
104 add_path(&new_path, perf_exec_path());
105 add_path(&new_path, argv0_path);
106
107 if (old_path)
108 strbuf_addstr(&new_path, old_path);
109 else
110 strbuf_addstr(&new_path, "/usr/local/bin:/usr/bin:/bin");
111
112 setenv("PATH", new_path.buf, 1);
113
114 strbuf_release(&new_path);
115}
116
117const char **prepare_perf_cmd(const char **argv)
118{
119 int argc;
120 const char **nargv;
121
122 for (argc = 0; argv[argc]; argc++)
123 ; /* just counting */
124 nargv = malloc(sizeof(*nargv) * (argc + 2));
125
126 nargv[0] = "perf";
127 for (argc = 0; argv[argc]; argc++)
128 nargv[argc + 1] = argv[argc];
129 nargv[argc + 1] = NULL;
130 return nargv;
131}
132
133int execv_perf_cmd(const char **argv) {
134 const char **nargv = prepare_perf_cmd(argv);
135
136 /* execvp() can only ever return if it fails */
137 execvp("perf", (char **)nargv);
138
139 free(nargv);
140 return -1;
141}
142
143
144int execl_perf_cmd(const char *cmd,...)
145{
146 int argc;
147 const char *argv[MAX_ARGS + 1];
148 const char *arg;
149 va_list param;
150
151 va_start(param, cmd);
152 argv[0] = cmd;
153 argc = 1;
154 while (argc < MAX_ARGS) {
155 arg = argv[argc++] = va_arg(param, char *);
156 if (!arg)
157 break;
158 }
159 va_end(param);
160 if (MAX_ARGS <= argc)
161 return error("too many args to run %s", cmd);
162
163 argv[argc] = NULL;
164 return execv_perf_cmd(argv);
165}
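For illustration only (the sub-command used here is hypothetical): prepare_perf_cmd() re-prefixes the argument vector with "perf", so re-executing a sub-command from C reduces to the sketch below; like execvp(), execl_perf_cmd() only returns on failure.

	#include "cache.h"
	#include "exec_cmd.h"

	static void rerun_help(void)
	{
		/* becomes execvp("perf", {"perf", "help", "record", NULL}) */
		if (execl_perf_cmd("help", "record", NULL) < 0)
			die("could not exec perf help");
	}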
diff --git a/Documentation/perf_counter/util/exec_cmd.h b/Documentation/perf_counter/util/exec_cmd.h
new file mode 100644
index 000000000000..effe25eb1545
--- /dev/null
+++ b/Documentation/perf_counter/util/exec_cmd.h
@@ -0,0 +1,13 @@
1#ifndef PERF_EXEC_CMD_H
2#define PERF_EXEC_CMD_H
3
4extern void perf_set_argv_exec_path(const char *exec_path);
5extern const char *perf_extract_argv0_path(const char *path);
6extern const char *perf_exec_path(void);
7extern void setup_path(void);
8extern const char **prepare_perf_cmd(const char **argv);
9extern int execv_perf_cmd(const char **argv); /* NULL terminated */
10extern int execl_perf_cmd(const char *cmd, ...);
11extern const char *system_path(const char *path);
12
13#endif /* PERF_EXEC_CMD_H */
diff --git a/Documentation/perf_counter/util/generate-cmdlist.sh b/Documentation/perf_counter/util/generate-cmdlist.sh
new file mode 100755
index 000000000000..f06f6fd148f8
--- /dev/null
+++ b/Documentation/perf_counter/util/generate-cmdlist.sh
@@ -0,0 +1,24 @@
1#!/bin/sh
2
3echo "/* Automatically generated by $0 */
4struct cmdname_help
5{
6 char name[16];
7 char help[80];
8};
9
10static struct cmdname_help common_cmds[] = {"
11
12sed -n -e 's/^perf-\([^ ]*\)[ ].* common.*/\1/p' command-list.txt |
13sort |
14while read cmd
15do
16 sed -n '
17 /^NAME/,/perf-'"$cmd"'/H
18 ${
19 x
20 s/.*perf-'"$cmd"' - \(.*\)/ {"'"$cmd"'", "\1"},/
21 p
22 }' "Documentation/perf-$cmd.txt"
23done
24echo "};"
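The script turns command-list.txt plus the per-command documentation into a C table. Given a hypothetical command-list.txt line "perf-top mainporcelain common" and a perf-top.txt NAME section reading "perf-top - Profile a running system", the generated header would look roughly like this:

	/* Automatically generated by util/generate-cmdlist.sh */
	struct cmdname_help
	{
	    char name[16];
	    char help[80];
	};

	static struct cmdname_help common_cmds[] = {
	 {"top", "Profile a running system"},
	};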
diff --git a/Documentation/perf_counter/util/help.c b/Documentation/perf_counter/util/help.c
new file mode 100644
index 000000000000..edde541d238d
--- /dev/null
+++ b/Documentation/perf_counter/util/help.c
@@ -0,0 +1,366 @@
1#include "cache.h"
2#include "../builtin.h"
3#include "exec_cmd.h"
4#include "levenshtein.h"
5#include "help.h"
6
7/* most GUI terminals set COLUMNS (although some don't export it) */
8static int term_columns(void)
9{
10 char *col_string = getenv("COLUMNS");
11 int n_cols;
12
13 if (col_string && (n_cols = atoi(col_string)) > 0)
14 return n_cols;
15
16#ifdef TIOCGWINSZ
17 {
18 struct winsize ws;
19 if (!ioctl(1, TIOCGWINSZ, &ws)) {
20 if (ws.ws_col)
21 return ws.ws_col;
22 }
23 }
24#endif
25
26 return 80;
27}
28
29void add_cmdname(struct cmdnames *cmds, const char *name, int len)
30{
31 struct cmdname *ent = malloc(sizeof(*ent) + len + 1);
32
33 ent->len = len;
34 memcpy(ent->name, name, len);
35 ent->name[len] = 0;
36
37 ALLOC_GROW(cmds->names, cmds->cnt + 1, cmds->alloc);
38 cmds->names[cmds->cnt++] = ent;
39}
40
41static void clean_cmdnames(struct cmdnames *cmds)
42{
43 int i;
44 for (i = 0; i < cmds->cnt; ++i)
45 free(cmds->names[i]);
46 free(cmds->names);
47 cmds->cnt = 0;
48 cmds->alloc = 0;
49}
50
51static int cmdname_compare(const void *a_, const void *b_)
52{
53 struct cmdname *a = *(struct cmdname **)a_;
54 struct cmdname *b = *(struct cmdname **)b_;
55 return strcmp(a->name, b->name);
56}
57
58static void uniq(struct cmdnames *cmds)
59{
60 int i, j;
61
62 if (!cmds->cnt)
63 return;
64
65 for (i = j = 1; i < cmds->cnt; i++)
66 if (strcmp(cmds->names[i]->name, cmds->names[i-1]->name))
67 cmds->names[j++] = cmds->names[i];
68
69 cmds->cnt = j;
70}
71
72void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes)
73{
74 int ci, cj, ei;
75 int cmp;
76
77 ci = cj = ei = 0;
78 while (ci < cmds->cnt && ei < excludes->cnt) {
79 cmp = strcmp(cmds->names[ci]->name, excludes->names[ei]->name);
80 if (cmp < 0)
81 cmds->names[cj++] = cmds->names[ci++];
82 else if (cmp == 0)
83 ci++, ei++;
84 else if (cmp > 0)
85 ei++;
86 }
87
88 while (ci < cmds->cnt)
89 cmds->names[cj++] = cmds->names[ci++];
90
91 cmds->cnt = cj;
92}
93
94static void pretty_print_string_list(struct cmdnames *cmds, int longest)
95{
96 int cols = 1, rows;
97 int space = longest + 1; /* min 1 SP between words */
98 int max_cols = term_columns() - 1; /* don't print *on* the edge */
99 int i, j;
100
101 if (space < max_cols)
102 cols = max_cols / space;
103 rows = (cmds->cnt + cols - 1) / cols;
104
105 for (i = 0; i < rows; i++) {
106 printf(" ");
107
108 for (j = 0; j < cols; j++) {
109 int n = j * rows + i;
110 int size = space;
111 if (n >= cmds->cnt)
112 break;
113 if (j == cols-1 || n + rows >= cmds->cnt)
114 size = 1;
115 printf("%-*s", size, cmds->names[n]->name);
116 }
117 putchar('\n');
118 }
119}
120
121static int is_executable(const char *name)
122{
123 struct stat st;
124
125 if (stat(name, &st) || /* stat, not lstat */
126 !S_ISREG(st.st_mode))
127 return 0;
128
129#ifdef __MINGW32__
130 /* cannot trust the executable bit, peek into the file instead */
131 char buf[3] = { 0 };
132 int n;
133 int fd = open(name, O_RDONLY);
134 st.st_mode &= ~S_IXUSR;
135 if (fd >= 0) {
136 n = read(fd, buf, 2);
137 if (n == 2)
138 /* DOS executables start with "MZ" */
139 if (!strcmp(buf, "#!") || !strcmp(buf, "MZ"))
140 st.st_mode |= S_IXUSR;
141 close(fd);
142 }
143#endif
144 return st.st_mode & S_IXUSR;
145}
146
147static void list_commands_in_dir(struct cmdnames *cmds,
148 const char *path,
149 const char *prefix)
150{
151 int prefix_len;
152 DIR *dir = opendir(path);
153 struct dirent *de;
154 struct strbuf buf = STRBUF_INIT;
155 int len;
156
157 if (!dir)
158 return;
159 if (!prefix)
160 prefix = "perf-";
161 prefix_len = strlen(prefix);
162
163 strbuf_addf(&buf, "%s/", path);
164 len = buf.len;
165
166 while ((de = readdir(dir)) != NULL) {
167 int entlen;
168
169 if (prefixcmp(de->d_name, prefix))
170 continue;
171
172 strbuf_setlen(&buf, len);
173 strbuf_addstr(&buf, de->d_name);
174 if (!is_executable(buf.buf))
175 continue;
176
177 entlen = strlen(de->d_name) - prefix_len;
178 if (has_extension(de->d_name, ".exe"))
179 entlen -= 4;
180
181 add_cmdname(cmds, de->d_name + prefix_len, entlen);
182 }
183 closedir(dir);
184 strbuf_release(&buf);
185}
186
187void load_command_list(const char *prefix,
188 struct cmdnames *main_cmds,
189 struct cmdnames *other_cmds)
190{
191 const char *env_path = getenv("PATH");
192 const char *exec_path = perf_exec_path();
193
194 if (exec_path) {
195 list_commands_in_dir(main_cmds, exec_path, prefix);
196 qsort(main_cmds->names, main_cmds->cnt,
197 sizeof(*main_cmds->names), cmdname_compare);
198 uniq(main_cmds);
199 }
200
201 if (env_path) {
202 char *paths, *path, *colon;
203 path = paths = strdup(env_path);
204 while (1) {
205 if ((colon = strchr(path, PATH_SEP)))
206 *colon = 0;
207 if (!exec_path || strcmp(path, exec_path))
208 list_commands_in_dir(other_cmds, path, prefix);
209
210 if (!colon)
211 break;
212 path = colon + 1;
213 }
214 free(paths);
215
216 qsort(other_cmds->names, other_cmds->cnt,
217 sizeof(*other_cmds->names), cmdname_compare);
218 uniq(other_cmds);
219 }
220 exclude_cmds(other_cmds, main_cmds);
221}
222
223void list_commands(const char *title, struct cmdnames *main_cmds,
224 struct cmdnames *other_cmds)
225{
226 int i, longest = 0;
227
228 for (i = 0; i < main_cmds->cnt; i++)
229 if (longest < main_cmds->names[i]->len)
230 longest = main_cmds->names[i]->len;
231 for (i = 0; i < other_cmds->cnt; i++)
232 if (longest < other_cmds->names[i]->len)
233 longest = other_cmds->names[i]->len;
234
235 if (main_cmds->cnt) {
236 const char *exec_path = perf_exec_path();
237 printf("available %s in '%s'\n", title, exec_path);
238 printf("----------------");
239 mput_char('-', strlen(title) + strlen(exec_path));
240 putchar('\n');
241 pretty_print_string_list(main_cmds, longest);
242 putchar('\n');
243 }
244
245 if (other_cmds->cnt) {
246 printf("%s available from elsewhere on your $PATH\n", title);
247 printf("---------------------------------------");
248 mput_char('-', strlen(title));
249 putchar('\n');
250 pretty_print_string_list(other_cmds, longest);
251 putchar('\n');
252 }
253}
254
255int is_in_cmdlist(struct cmdnames *c, const char *s)
256{
257 int i;
258 for (i = 0; i < c->cnt; i++)
259 if (!strcmp(s, c->names[i]->name))
260 return 1;
261 return 0;
262}
263
264static int autocorrect;
265static struct cmdnames aliases;
266
267static int perf_unknown_cmd_config(const char *var, const char *value, void *cb)
268{
269 if (!strcmp(var, "help.autocorrect"))
270 autocorrect = perf_config_int(var,value);
271 /* Also use aliases for command lookup */
272 if (!prefixcmp(var, "alias."))
273 add_cmdname(&aliases, var + 6, strlen(var + 6));
274
275 return perf_default_config(var, value, cb);
276}
277
278static int levenshtein_compare(const void *p1, const void *p2)
279{
280 const struct cmdname *const *c1 = p1, *const *c2 = p2;
281 const char *s1 = (*c1)->name, *s2 = (*c2)->name;
282 int l1 = (*c1)->len;
283 int l2 = (*c2)->len;
284 return l1 != l2 ? l1 - l2 : strcmp(s1, s2);
285}
286
287static void add_cmd_list(struct cmdnames *cmds, struct cmdnames *old)
288{
289 int i;
290 ALLOC_GROW(cmds->names, cmds->cnt + old->cnt, cmds->alloc);
291
292 for (i = 0; i < old->cnt; i++)
293 cmds->names[cmds->cnt++] = old->names[i];
294 free(old->names);
295 old->cnt = 0;
296 old->names = NULL;
297}
298
299const char *help_unknown_cmd(const char *cmd)
300{
301 int i, n, best_similarity = 0;
302 struct cmdnames main_cmds, other_cmds;
303
304 memset(&main_cmds, 0, sizeof(main_cmds));
305	memset(&other_cmds, 0, sizeof(other_cmds));
306 memset(&aliases, 0, sizeof(aliases));
307
308 perf_config(perf_unknown_cmd_config, NULL);
309
310 load_command_list("perf-", &main_cmds, &other_cmds);
311
312 add_cmd_list(&main_cmds, &aliases);
313 add_cmd_list(&main_cmds, &other_cmds);
314	qsort(main_cmds.names, main_cmds.cnt,
315	      sizeof(*main_cmds.names), cmdname_compare);
316 uniq(&main_cmds);
317
318 /* This reuses cmdname->len for similarity index */
319 for (i = 0; i < main_cmds.cnt; ++i)
320 main_cmds.names[i]->len =
321 levenshtein(cmd, main_cmds.names[i]->name, 0, 2, 1, 4);
322
323 qsort(main_cmds.names, main_cmds.cnt,
324 sizeof(*main_cmds.names), levenshtein_compare);
325
326 if (!main_cmds.cnt)
327		die ("Uh oh. Your system reports no perf commands at all.");
328
329 best_similarity = main_cmds.names[0]->len;
330 n = 1;
331 while (n < main_cmds.cnt && best_similarity == main_cmds.names[n]->len)
332 ++n;
333 if (autocorrect && n == 1) {
334 const char *assumed = main_cmds.names[0]->name;
335 main_cmds.names[0] = NULL;
336 clean_cmdnames(&main_cmds);
337		fprintf(stderr, "WARNING: You called a perf program named '%s', "
338 "which does not exist.\n"
339 "Continuing under the assumption that you meant '%s'\n",
340 cmd, assumed);
341 if (autocorrect > 0) {
342 fprintf(stderr, "in %0.1f seconds automatically...\n",
343 (float)autocorrect/10.0);
344 poll(NULL, 0, autocorrect * 100);
345 }
346 return assumed;
347 }
348
349 fprintf(stderr, "perf: '%s' is not a perf-command. See 'perf --help'.\n", cmd);
350
351 if (best_similarity < 6) {
352 fprintf(stderr, "\nDid you mean %s?\n",
353 n < 2 ? "this": "one of these");
354
355 for (i = 0; i < n; i++)
356 fprintf(stderr, "\t%s\n", main_cmds.names[i]->name);
357 }
358
359 exit(1);
360}
361
362int cmd_version(int argc, const char **argv, const char *prefix)
363{
364 printf("perf version %s\n", perf_version_string);
365 return 0;
366}
diff --git a/Documentation/perf_counter/util/help.h b/Documentation/perf_counter/util/help.h
new file mode 100644
index 000000000000..56bc15406ffc
--- /dev/null
+++ b/Documentation/perf_counter/util/help.h
@@ -0,0 +1,29 @@
1#ifndef HELP_H
2#define HELP_H
3
4struct cmdnames {
5 int alloc;
6 int cnt;
7 struct cmdname {
8 size_t len; /* also used for similarity index in help.c */
9 char name[FLEX_ARRAY];
10 } **names;
11};
12
13static inline void mput_char(char c, unsigned int num)
14{
15 while(num--)
16 putchar(c);
17}
18
19void load_command_list(const char *prefix,
20 struct cmdnames *main_cmds,
21 struct cmdnames *other_cmds);
22void add_cmdname(struct cmdnames *cmds, const char *name, int len);
23/* Here we require that excludes is a sorted list. */
24void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes);
25int is_in_cmdlist(struct cmdnames *c, const char *s);
26void list_commands(const char *title, struct cmdnames *main_cmds,
27 struct cmdnames *other_cmds);
28
29#endif /* HELP_H */
diff --git a/Documentation/perf_counter/util/levenshtein.c b/Documentation/perf_counter/util/levenshtein.c
new file mode 100644
index 000000000000..e521d1516df6
--- /dev/null
+++ b/Documentation/perf_counter/util/levenshtein.c
@@ -0,0 +1,84 @@
1#include "cache.h"
2#include "levenshtein.h"
3
4/*
5 * This function implements the Damerau-Levenshtein algorithm to
6 * calculate a distance between strings.
7 *
8 * Basically, it says how many letters need to be swapped, substituted,
9 * deleted from, or added to string1, at least, to get string2.
10 *
11 * The idea is to build a distance matrix for the substrings of both
12 * strings. To avoid a large space complexity, only the last three rows
13 * are kept in memory (if swaps had the same or higher cost as one deletion
14 * plus one insertion, only two rows would be needed).
15 *
16 * At any stage, "i + 1" denotes the length of the current substring of
17 * string1 that the distance is calculated for.
18 *
19 * row2 holds the current row, row1 the previous row (i.e. for the substring
20 * of string1 of length "i"), and row0 the row before that.
21 *
22 * In other words, at the start of the big loop, row2[j + 1] contains the
23 * Damerau-Levenshtein distance between the substring of string1 of length
24 * "i" and the substring of string2 of length "j + 1".
25 *
26 * All the big loop does is determine the partial minimum-cost paths.
27 *
28 * It does so by calculating the costs of the path ending in characters
29 * i (in string1) and j (in string2), respectively, given that the last
 30 * operation is a substitution, a swap, a deletion, or an insertion.
31 *
32 * This implementation allows the costs to be weighted:
33 *
34 * - w (as in "sWap")
35 * - s (as in "Substitution")
36 * - a (for insertion, AKA "Add")
37 * - d (as in "Deletion")
38 *
39 * Note that this algorithm calculates a distance _iff_ d == a.
40 */
41int levenshtein(const char *string1, const char *string2,
42 int w, int s, int a, int d)
43{
44 int len1 = strlen(string1), len2 = strlen(string2);
45 int *row0 = malloc(sizeof(int) * (len2 + 1));
46 int *row1 = malloc(sizeof(int) * (len2 + 1));
47 int *row2 = malloc(sizeof(int) * (len2 + 1));
48 int i, j;
49
50 for (j = 0; j <= len2; j++)
51 row1[j] = j * a;
52 for (i = 0; i < len1; i++) {
53 int *dummy;
54
55 row2[0] = (i + 1) * d;
56 for (j = 0; j < len2; j++) {
57 /* substitution */
58 row2[j + 1] = row1[j] + s * (string1[i] != string2[j]);
59 /* swap */
60 if (i > 0 && j > 0 && string1[i - 1] == string2[j] &&
61 string1[i] == string2[j - 1] &&
62 row2[j + 1] > row0[j - 1] + w)
63 row2[j + 1] = row0[j - 1] + w;
64 /* deletion */
65 if (row2[j + 1] > row1[j + 1] + d)
66 row2[j + 1] = row1[j + 1] + d;
67 /* insertion */
68 if (row2[j + 1] > row2[j] + a)
69 row2[j + 1] = row2[j] + a;
70 }
71
72 dummy = row0;
73 row0 = row1;
74 row1 = row2;
75 row2 = dummy;
76 }
77
78 i = row1[len2];
79 free(row0);
80 free(row1);
81 free(row2);
82
83 return i;
84}
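An illustration of the weights as help.c uses them (this snippet is not part of the commit): help_unknown_cmd() calls levenshtein(cmd, name, 0, 2, 1, 4), i.e. adjacent transpositions are free, substitutions cost 2, insertions 1 and deletions 4, so letters typed in the wrong order are the most forgivable kind of typo, and candidates are only suggested while the best distance stays below 6.

	#include <stdio.h>
	#include "levenshtein.h"

	int main(void)
	{
		printf("%d\n", levenshtein("top",   "tpo",    0, 2, 1, 4));	/* 0: one transposition */
		printf("%d\n", levenshtein("recod", "record", 0, 2, 1, 4));	/* 1: one missing letter */
		printf("%d\n", levenshtein("stat",  "top",    0, 2, 1, 4));	/* 8: above the "< 6" cut-off */
		return 0;
	}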
diff --git a/Documentation/perf_counter/util/levenshtein.h b/Documentation/perf_counter/util/levenshtein.h
new file mode 100644
index 000000000000..0173abeef52c
--- /dev/null
+++ b/Documentation/perf_counter/util/levenshtein.h
@@ -0,0 +1,8 @@
1#ifndef LEVENSHTEIN_H
2#define LEVENSHTEIN_H
3
4int levenshtein(const char *string1, const char *string2,
  5		int swap_penalty, int substitution_penalty,
6 int insertion_penalty, int deletion_penalty);
7
8#endif
diff --git a/Documentation/perf_counter/util/parse-options.c b/Documentation/perf_counter/util/parse-options.c
new file mode 100644
index 000000000000..28b34c1c29cf
--- /dev/null
+++ b/Documentation/perf_counter/util/parse-options.c
@@ -0,0 +1,492 @@
1#include "util.h"
2#include "parse-options.h"
3#include "cache.h"
4
5#define OPT_SHORT 1
6#define OPT_UNSET 2
7
8static int opterror(const struct option *opt, const char *reason, int flags)
9{
10 if (flags & OPT_SHORT)
11 return error("switch `%c' %s", opt->short_name, reason);
12 if (flags & OPT_UNSET)
13 return error("option `no-%s' %s", opt->long_name, reason);
14 return error("option `%s' %s", opt->long_name, reason);
15}
16
17static int get_arg(struct parse_opt_ctx_t *p, const struct option *opt,
18 int flags, const char **arg)
19{
20 if (p->opt) {
21 *arg = p->opt;
22 p->opt = NULL;
23 } else if (p->argc == 1 && (opt->flags & PARSE_OPT_LASTARG_DEFAULT)) {
24 *arg = (const char *)opt->defval;
25 } else if (p->argc > 1) {
26 p->argc--;
27 *arg = *++p->argv;
28 } else
29 return opterror(opt, "requires a value", flags);
30 return 0;
31}
32
33static int get_value(struct parse_opt_ctx_t *p,
34 const struct option *opt, int flags)
35{
36 const char *s, *arg;
37 const int unset = flags & OPT_UNSET;
38
39 if (unset && p->opt)
40 return opterror(opt, "takes no value", flags);
41 if (unset && (opt->flags & PARSE_OPT_NONEG))
42 return opterror(opt, "isn't available", flags);
43
44 if (!(flags & OPT_SHORT) && p->opt) {
45 switch (opt->type) {
46 case OPTION_CALLBACK:
47 if (!(opt->flags & PARSE_OPT_NOARG))
48 break;
49 /* FALLTHROUGH */
50 case OPTION_BOOLEAN:
51 case OPTION_BIT:
52 case OPTION_SET_INT:
53 case OPTION_SET_PTR:
54 return opterror(opt, "takes no value", flags);
55 default:
56 break;
57 }
58 }
59
60 switch (opt->type) {
61 case OPTION_BIT:
62 if (unset)
63 *(int *)opt->value &= ~opt->defval;
64 else
65 *(int *)opt->value |= opt->defval;
66 return 0;
67
68 case OPTION_BOOLEAN:
69 *(int *)opt->value = unset ? 0 : *(int *)opt->value + 1;
70 return 0;
71
72 case OPTION_SET_INT:
73 *(int *)opt->value = unset ? 0 : opt->defval;
74 return 0;
75
76 case OPTION_SET_PTR:
77 *(void **)opt->value = unset ? NULL : (void *)opt->defval;
78 return 0;
79
80 case OPTION_STRING:
81 if (unset)
82 *(const char **)opt->value = NULL;
83 else if (opt->flags & PARSE_OPT_OPTARG && !p->opt)
84 *(const char **)opt->value = (const char *)opt->defval;
85 else
86 return get_arg(p, opt, flags, (const char **)opt->value);
87 return 0;
88
89 case OPTION_CALLBACK:
90 if (unset)
91 return (*opt->callback)(opt, NULL, 1) ? (-1) : 0;
92 if (opt->flags & PARSE_OPT_NOARG)
93 return (*opt->callback)(opt, NULL, 0) ? (-1) : 0;
94 if (opt->flags & PARSE_OPT_OPTARG && !p->opt)
95 return (*opt->callback)(opt, NULL, 0) ? (-1) : 0;
96 if (get_arg(p, opt, flags, &arg))
97 return -1;
98 return (*opt->callback)(opt, arg, 0) ? (-1) : 0;
99
100 case OPTION_INTEGER:
101 if (unset) {
102 *(int *)opt->value = 0;
103 return 0;
104 }
105 if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
106 *(int *)opt->value = opt->defval;
107 return 0;
108 }
109 if (get_arg(p, opt, flags, &arg))
110 return -1;
111 *(int *)opt->value = strtol(arg, (char **)&s, 10);
112 if (*s)
113 return opterror(opt, "expects a numerical value", flags);
114 return 0;
115
116 default:
117 die("should not happen, someone must be hit on the forehead");
118 }
119}
120
121static int parse_short_opt(struct parse_opt_ctx_t *p, const struct option *options)
122{
123 for (; options->type != OPTION_END; options++) {
124 if (options->short_name == *p->opt) {
125 p->opt = p->opt[1] ? p->opt + 1 : NULL;
126 return get_value(p, options, OPT_SHORT);
127 }
128 }
129 return -2;
130}
131
132static int parse_long_opt(struct parse_opt_ctx_t *p, const char *arg,
133 const struct option *options)
134{
135 const char *arg_end = strchr(arg, '=');
136 const struct option *abbrev_option = NULL, *ambiguous_option = NULL;
137 int abbrev_flags = 0, ambiguous_flags = 0;
138
139 if (!arg_end)
140 arg_end = arg + strlen(arg);
141
142 for (; options->type != OPTION_END; options++) {
143 const char *rest;
144 int flags = 0;
145
146 if (!options->long_name)
147 continue;
148
149 rest = skip_prefix(arg, options->long_name);
150 if (options->type == OPTION_ARGUMENT) {
151 if (!rest)
152 continue;
153 if (*rest == '=')
154 return opterror(options, "takes no value", flags);
155 if (*rest)
156 continue;
157 p->out[p->cpidx++] = arg - 2;
158 return 0;
159 }
160 if (!rest) {
161 /* abbreviated? */
162 if (!strncmp(options->long_name, arg, arg_end - arg)) {
163is_abbreviated:
164 if (abbrev_option) {
165 /*
166 * If this is abbreviated, it is
167 * ambiguous. So when there is no
168 * exact match later, we need to
169 * error out.
170 */
171 ambiguous_option = abbrev_option;
172 ambiguous_flags = abbrev_flags;
173 }
174 if (!(flags & OPT_UNSET) && *arg_end)
175 p->opt = arg_end + 1;
176 abbrev_option = options;
177 abbrev_flags = flags;
178 continue;
179 }
180 /* negated and abbreviated very much? */
181 if (!prefixcmp("no-", arg)) {
182 flags |= OPT_UNSET;
183 goto is_abbreviated;
184 }
185 /* negated? */
186 if (strncmp(arg, "no-", 3))
187 continue;
188 flags |= OPT_UNSET;
189 rest = skip_prefix(arg + 3, options->long_name);
190 /* abbreviated and negated? */
191 if (!rest && !prefixcmp(options->long_name, arg + 3))
192 goto is_abbreviated;
193 if (!rest)
194 continue;
195 }
196 if (*rest) {
197 if (*rest != '=')
198 continue;
199 p->opt = rest + 1;
200 }
201 return get_value(p, options, flags);
202 }
203
204 if (ambiguous_option)
205 return error("Ambiguous option: %s "
206 "(could be --%s%s or --%s%s)",
207 arg,
208 (ambiguous_flags & OPT_UNSET) ? "no-" : "",
209 ambiguous_option->long_name,
210 (abbrev_flags & OPT_UNSET) ? "no-" : "",
211 abbrev_option->long_name);
212 if (abbrev_option)
213 return get_value(p, abbrev_option, abbrev_flags);
214 return -2;
215}
216
217static void check_typos(const char *arg, const struct option *options)
218{
219 if (strlen(arg) < 3)
220 return;
221
222 if (!prefixcmp(arg, "no-")) {
223		error ("did you mean `--%s` (with two dashes?)", arg);
224 exit(129);
225 }
226
227 for (; options->type != OPTION_END; options++) {
228 if (!options->long_name)
229 continue;
230 if (!prefixcmp(options->long_name, arg)) {
231			error ("did you mean `--%s` (with two dashes?)", arg);
232 exit(129);
233 }
234 }
235}
236
237void parse_options_start(struct parse_opt_ctx_t *ctx,
238 int argc, const char **argv, int flags)
239{
240 memset(ctx, 0, sizeof(*ctx));
241 ctx->argc = argc - 1;
242 ctx->argv = argv + 1;
243 ctx->out = argv;
244 ctx->cpidx = ((flags & PARSE_OPT_KEEP_ARGV0) != 0);
245 ctx->flags = flags;
246 if ((flags & PARSE_OPT_KEEP_UNKNOWN) &&
247 (flags & PARSE_OPT_STOP_AT_NON_OPTION))
248 die("STOP_AT_NON_OPTION and KEEP_UNKNOWN don't go together");
249}
250
251static int usage_with_options_internal(const char * const *,
252 const struct option *, int);
253
254int parse_options_step(struct parse_opt_ctx_t *ctx,
255 const struct option *options,
256 const char * const usagestr[])
257{
258 int internal_help = !(ctx->flags & PARSE_OPT_NO_INTERNAL_HELP);
259
260	/* we must reset ->opt; an unknown short option may leave it dangling */
261 ctx->opt = NULL;
262
263 for (; ctx->argc; ctx->argc--, ctx->argv++) {
264 const char *arg = ctx->argv[0];
265
266 if (*arg != '-' || !arg[1]) {
267 if (ctx->flags & PARSE_OPT_STOP_AT_NON_OPTION)
268 break;
269 ctx->out[ctx->cpidx++] = ctx->argv[0];
270 continue;
271 }
272
273 if (arg[1] != '-') {
274 ctx->opt = arg + 1;
275 if (internal_help && *ctx->opt == 'h')
276 return parse_options_usage(usagestr, options);
277 switch (parse_short_opt(ctx, options)) {
278 case -1:
279 return parse_options_usage(usagestr, options);
280 case -2:
281 goto unknown;
282 }
283 if (ctx->opt)
284 check_typos(arg + 1, options);
285 while (ctx->opt) {
286 if (internal_help && *ctx->opt == 'h')
287 return parse_options_usage(usagestr, options);
288 switch (parse_short_opt(ctx, options)) {
289 case -1:
290 return parse_options_usage(usagestr, options);
291 case -2:
292 /* fake a short option thing to hide the fact that we may have
293 * started to parse aggregated stuff
294 *
295 * This is leaky, too bad.
296 */
297 ctx->argv[0] = strdup(ctx->opt - 1);
298 *(char *)ctx->argv[0] = '-';
299 goto unknown;
300 }
301 }
302 continue;
303 }
304
305 if (!arg[2]) { /* "--" */
306 if (!(ctx->flags & PARSE_OPT_KEEP_DASHDASH)) {
307 ctx->argc--;
308 ctx->argv++;
309 }
310 break;
311 }
312
313 if (internal_help && !strcmp(arg + 2, "help-all"))
314 return usage_with_options_internal(usagestr, options, 1);
315 if (internal_help && !strcmp(arg + 2, "help"))
316 return parse_options_usage(usagestr, options);
317 switch (parse_long_opt(ctx, arg + 2, options)) {
318 case -1:
319 return parse_options_usage(usagestr, options);
320 case -2:
321 goto unknown;
322 }
323 continue;
324unknown:
325 if (!(ctx->flags & PARSE_OPT_KEEP_UNKNOWN))
326 return PARSE_OPT_UNKNOWN;
327 ctx->out[ctx->cpidx++] = ctx->argv[0];
328 ctx->opt = NULL;
329 }
330 return PARSE_OPT_DONE;
331}
332
333int parse_options_end(struct parse_opt_ctx_t *ctx)
334{
335 memmove(ctx->out + ctx->cpidx, ctx->argv, ctx->argc * sizeof(*ctx->out));
336 ctx->out[ctx->cpidx + ctx->argc] = NULL;
337 return ctx->cpidx + ctx->argc;
338}
339
340int parse_options(int argc, const char **argv, const struct option *options,
341 const char * const usagestr[], int flags)
342{
343 struct parse_opt_ctx_t ctx;
344
345 parse_options_start(&ctx, argc, argv, flags);
346 switch (parse_options_step(&ctx, options, usagestr)) {
347 case PARSE_OPT_HELP:
348 exit(129);
349 case PARSE_OPT_DONE:
350 break;
351 default: /* PARSE_OPT_UNKNOWN */
352 if (ctx.argv[0][1] == '-') {
353 error("unknown option `%s'", ctx.argv[0] + 2);
354 } else {
355 error("unknown switch `%c'", *ctx.opt);
356 }
357 usage_with_options(usagestr, options);
358 }
359
360 return parse_options_end(&ctx);
361}
362
363#define USAGE_OPTS_WIDTH 24
364#define USAGE_GAP 2
365
366int usage_with_options_internal(const char * const *usagestr,
367 const struct option *opts, int full)
368{
369 if (!usagestr)
370 return PARSE_OPT_HELP;
371
372 fprintf(stderr, "usage: %s\n", *usagestr++);
373 while (*usagestr && **usagestr)
374 fprintf(stderr, " or: %s\n", *usagestr++);
375 while (*usagestr) {
376 fprintf(stderr, "%s%s\n",
377 **usagestr ? " " : "",
378 *usagestr);
379 usagestr++;
380 }
381
382 if (opts->type != OPTION_GROUP)
383 fputc('\n', stderr);
384
385 for (; opts->type != OPTION_END; opts++) {
386 size_t pos;
387 int pad;
388
389 if (opts->type == OPTION_GROUP) {
390 fputc('\n', stderr);
391 if (*opts->help)
392 fprintf(stderr, "%s\n", opts->help);
393 continue;
394 }
395 if (!full && (opts->flags & PARSE_OPT_HIDDEN))
396 continue;
397
398 pos = fprintf(stderr, " ");
399 if (opts->short_name)
400 pos += fprintf(stderr, "-%c", opts->short_name);
401 if (opts->long_name && opts->short_name)
402 pos += fprintf(stderr, ", ");
403 if (opts->long_name)
404 pos += fprintf(stderr, "--%s", opts->long_name);
405
406 switch (opts->type) {
407 case OPTION_ARGUMENT:
408 break;
409 case OPTION_INTEGER:
410 if (opts->flags & PARSE_OPT_OPTARG)
411 if (opts->long_name)
412 pos += fprintf(stderr, "[=<n>]");
413 else
414 pos += fprintf(stderr, "[<n>]");
415 else
416 pos += fprintf(stderr, " <n>");
417 break;
418 case OPTION_CALLBACK:
419 if (opts->flags & PARSE_OPT_NOARG)
420 break;
421 /* FALLTHROUGH */
422 case OPTION_STRING:
423 if (opts->argh) {
424 if (opts->flags & PARSE_OPT_OPTARG)
425 if (opts->long_name)
426 pos += fprintf(stderr, "[=<%s>]", opts->argh);
427 else
428 pos += fprintf(stderr, "[<%s>]", opts->argh);
429 else
430 pos += fprintf(stderr, " <%s>", opts->argh);
431 } else {
432 if (opts->flags & PARSE_OPT_OPTARG)
433 if (opts->long_name)
434 pos += fprintf(stderr, "[=...]");
435 else
436 pos += fprintf(stderr, "[...]");
437 else
438 pos += fprintf(stderr, " ...");
439 }
440 break;
441 default: /* OPTION_{BIT,BOOLEAN,SET_INT,SET_PTR} */
442 break;
443 }
444
445 if (pos <= USAGE_OPTS_WIDTH)
446 pad = USAGE_OPTS_WIDTH - pos;
447 else {
448 fputc('\n', stderr);
449 pad = USAGE_OPTS_WIDTH;
450 }
451 fprintf(stderr, "%*s%s\n", pad + USAGE_GAP, "", opts->help);
452 }
453 fputc('\n', stderr);
454
455 return PARSE_OPT_HELP;
456}
457
458void usage_with_options(const char * const *usagestr,
459 const struct option *opts)
460{
461 usage_with_options_internal(usagestr, opts, 0);
462 exit(129);
463}
464
465int parse_options_usage(const char * const *usagestr,
466 const struct option *opts)
467{
468 return usage_with_options_internal(usagestr, opts, 0);
469}
470
471
472int parse_opt_verbosity_cb(const struct option *opt, const char *arg,
473 int unset)
474{
475 int *target = opt->value;
476
477 if (unset)
478 /* --no-quiet, --no-verbose */
479 *target = 0;
480 else if (opt->short_name == 'v') {
481 if (*target >= 0)
482 (*target)++;
483 else
484 *target = 1;
485 } else {
486 if (*target <= 0)
487 (*target)--;
488 else
489 *target = -1;
490 }
491 return 0;
492}
diff --git a/Documentation/perf_counter/util/parse-options.h b/Documentation/perf_counter/util/parse-options.h
new file mode 100644
index 000000000000..a81c7faff68e
--- /dev/null
+++ b/Documentation/perf_counter/util/parse-options.h
@@ -0,0 +1,172 @@
1#ifndef PARSE_OPTIONS_H
2#define PARSE_OPTIONS_H
3
4enum parse_opt_type {
5 /* special types */
6 OPTION_END,
7 OPTION_ARGUMENT,
8 OPTION_GROUP,
9 /* options with no arguments */
10 OPTION_BIT,
11 OPTION_BOOLEAN, /* _INCR would have been a better name */
12 OPTION_SET_INT,
13 OPTION_SET_PTR,
14 /* options with arguments (usually) */
15 OPTION_STRING,
16 OPTION_INTEGER,
17 OPTION_CALLBACK,
18};
19
20enum parse_opt_flags {
21 PARSE_OPT_KEEP_DASHDASH = 1,
22 PARSE_OPT_STOP_AT_NON_OPTION = 2,
23 PARSE_OPT_KEEP_ARGV0 = 4,
24 PARSE_OPT_KEEP_UNKNOWN = 8,
25 PARSE_OPT_NO_INTERNAL_HELP = 16,
26};
27
28enum parse_opt_option_flags {
29 PARSE_OPT_OPTARG = 1,
30 PARSE_OPT_NOARG = 2,
31 PARSE_OPT_NONEG = 4,
32 PARSE_OPT_HIDDEN = 8,
33 PARSE_OPT_LASTARG_DEFAULT = 16,
34};
35
36struct option;
37typedef int parse_opt_cb(const struct option *, const char *arg, int unset);
38
39/*
40 * `type`::
41 * holds the type of the option; you must have an OPTION_END last in your
42 * array.
43 *
44 * `short_name`::
45 * the character to use as a short option name, '\0' if none.
46 *
47 * `long_name`::
48 * the long option name, without the leading dashes, NULL if none.
49 *
50 * `value`::
51 * stores pointers to the values to be filled.
52 *
53 * `argh`::
54 * token to explain the kind of argument this option wants. Keep it
55 * homogeneous across the repository.
56 *
57 * `help`::
58 * the short help associated to what the option does.
59 * Must never be NULL (except for OPTION_END).
60 * OPTION_GROUP uses this pointer to store the group header.
61 *
62 * `flags`::
63 * mask of parse_opt_option_flags.
64 * PARSE_OPT_OPTARG: says that the argument is optional (not for BOOLEANs)
65 * PARSE_OPT_NOARG: says that this option takes no argument, for CALLBACKs
66 * PARSE_OPT_NONEG: says that this option cannot be negated
67 * PARSE_OPT_HIDDEN: this option is skipped in the default usage, shown in
68 * the long one.
69 *
70 * `callback`::
71 * pointer to the callback to use for OPTION_CALLBACK.
72 *
73 * `defval`::
74 * default value to fill (*->value) with for PARSE_OPT_OPTARG.
75 * OPTION_{BIT,SET_INT,SET_PTR} store the {mask,integer,pointer} to put in
76 * the value when met.
77 * CALLBACKS can use it like they want.
78 */
79struct option {
80 enum parse_opt_type type;
81 int short_name;
82 const char *long_name;
83 void *value;
84 const char *argh;
85 const char *help;
86
87 int flags;
88 parse_opt_cb *callback;
89 intptr_t defval;
90};
91
92#define OPT_END() { OPTION_END }
93#define OPT_ARGUMENT(l, h) { OPTION_ARGUMENT, 0, (l), NULL, NULL, (h) }
94#define OPT_GROUP(h) { OPTION_GROUP, 0, NULL, NULL, NULL, (h) }
95#define OPT_BIT(s, l, v, h, b) { OPTION_BIT, (s), (l), (v), NULL, (h), 0, NULL, (b) }
96#define OPT_BOOLEAN(s, l, v, h) { OPTION_BOOLEAN, (s), (l), (v), NULL, (h) }
97#define OPT_SET_INT(s, l, v, h, i) { OPTION_SET_INT, (s), (l), (v), NULL, (h), 0, NULL, (i) }
98#define OPT_SET_PTR(s, l, v, h, p) { OPTION_SET_PTR, (s), (l), (v), NULL, (h), 0, NULL, (p) }
99#define OPT_INTEGER(s, l, v, h) { OPTION_INTEGER, (s), (l), (v), NULL, (h) }
100#define OPT_STRING(s, l, v, a, h) { OPTION_STRING, (s), (l), (v), (a), (h) }
101#define OPT_DATE(s, l, v, h) \
102 { OPTION_CALLBACK, (s), (l), (v), "time",(h), 0, \
103 parse_opt_approxidate_cb }
104#define OPT_CALLBACK(s, l, v, a, h, f) \
105 { OPTION_CALLBACK, (s), (l), (v), (a), (h), 0, (f) }
106
107/* parse_options() will filter out the processed options and leave the
108 * non-option arguments in argv[].
109 * Returns the number of arguments left in argv[].
110 */
111extern int parse_options(int argc, const char **argv,
112 const struct option *options,
113 const char * const usagestr[], int flags);
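/*
 * A minimal usage sketch: a command defines a static option table plus a
 * usage string array and hands argc/argv to parse_options().  The names
 * here (verbose, count, output_name, record_usage, cmd_record) are
 * hypothetical.
 *
 *	static int verbose;
 *	static int count = 1;
 *	static const char *output_name;
 *
 *	static const char * const record_usage[] = {
 *		"perf record [<options>] <command>",
 *		NULL
 *	};
 *
 *	static const struct option options[] = {
 *		OPT_BOOLEAN('v', "verbose", &verbose, "be verbose"),
 *		OPT_INTEGER('c', "count", &count, "event period to sample"),
 *		OPT_STRING('o', "output", &output_name, "file", "output file name"),
 *		OPT_END()
 *	};
 *
 *	int cmd_record(int argc, const char **argv)
 *	{
 *		argc = parse_options(argc, argv, options, record_usage, 0);
 *		// argv[0..argc-1] now holds the non-option arguments
 *		return 0;
 *	}
 */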
114
115extern NORETURN void usage_with_options(const char * const *usagestr,
116 const struct option *options);
117
118/*----- incremental advanced APIs -----*/
119
120enum {
121 PARSE_OPT_HELP = -1,
122 PARSE_OPT_DONE,
123 PARSE_OPT_UNKNOWN,
124};
125
126/*
127 * It's okay for the caller to consume argv/argc in the usual way.
128 * Other fields of that structure are private to parse-options and should not
129 * be modified in any way.
130 */
131struct parse_opt_ctx_t {
132 const char **argv;
133 const char **out;
134 int argc, cpidx;
135 const char *opt;
136 int flags;
137};
138
139extern int parse_options_usage(const char * const *usagestr,
140 const struct option *opts);
141
142extern void parse_options_start(struct parse_opt_ctx_t *ctx,
143 int argc, const char **argv, int flags);
144
145extern int parse_options_step(struct parse_opt_ctx_t *ctx,
146 const struct option *options,
147 const char * const usagestr[]);
148
149extern int parse_options_end(struct parse_opt_ctx_t *ctx);
150
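/*
 * A rough sketch of the incremental API (options and usage_msg are
 * hypothetical): useful when a command wants to see unknown options itself
 * instead of letting parse_options() die on them.  The caller may consume
 * ctx.argv/ctx.argc in the usual way; the other fields stay private.
 *
 *	struct parse_opt_ctx_t ctx;
 *
 *	parse_options_start(&ctx, argc, argv, 0);
 *	for (;;) {
 *		switch (parse_options_step(&ctx, options, usage_msg)) {
 *		case PARSE_OPT_DONE:
 *			goto done;
 *		case PARSE_OPT_HELP:
 *			exit(129);
 *		default:			// PARSE_OPT_UNKNOWN
 *			// handle ctx.argv[0] here, then step past it
 *			ctx.argc--;
 *			ctx.argv++;
 *			break;
 *		}
 *	}
 * done:
 *	argc = parse_options_end(&ctx);
 */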
151
152/*----- some often used options -----*/
153extern int parse_opt_abbrev_cb(const struct option *, const char *, int);
154extern int parse_opt_approxidate_cb(const struct option *, const char *, int);
155extern int parse_opt_verbosity_cb(const struct option *, const char *, int);
156
157#define OPT__VERBOSE(var) OPT_BOOLEAN('v', "verbose", (var), "be verbose")
158#define OPT__QUIET(var) OPT_BOOLEAN('q', "quiet", (var), "be quiet")
159#define OPT__VERBOSITY(var) \
160 { OPTION_CALLBACK, 'v', "verbose", (var), NULL, "be more verbose", \
161 PARSE_OPT_NOARG, &parse_opt_verbosity_cb, 0 }, \
162 { OPTION_CALLBACK, 'q', "quiet", (var), NULL, "be more quiet", \
163 PARSE_OPT_NOARG, &parse_opt_verbosity_cb, 0 }
164#define OPT__DRY_RUN(var) OPT_BOOLEAN('n', "dry-run", (var), "dry run")
165#define OPT__ABBREV(var) \
166 { OPTION_CALLBACK, 0, "abbrev", (var), "n", \
167 "use <n> digits to display SHA-1s", \
168 PARSE_OPT_OPTARG, &parse_opt_abbrev_cb, 0 }
169
170extern const char *parse_options_fix_filename(const char *prefix, const char *file);
171
172#endif
diff --git a/Documentation/perf_counter/util/path.c b/Documentation/perf_counter/util/path.c
new file mode 100644
index 000000000000..a501a40dd2cb
--- /dev/null
+++ b/Documentation/perf_counter/util/path.c
@@ -0,0 +1,353 @@
1/*
2 * I'm tired of doing "vsnprintf()" etc just to open a
3 * file, so here's a "return static buffer with printf"
4 * interface for paths.
5 *
6 * It's obviously not thread-safe. Sue me. But it's quite
7 * useful for doing things like
8 *
9 * f = open(mkpath("%s/%s.perf", base, name), O_RDONLY);
10 *
11 * which is what it's designed for.
12 */
13#include "cache.h"
14
15static char bad_path[] = "/bad-path/";
16/*
17 * Two hacks:
18 */
19
20static char *get_perf_dir(void)
21{
22 return ".";
23}
24
25size_t strlcpy(char *dest, const char *src, size_t size)
26{
27 size_t ret = strlen(src);
28
29 if (size) {
30 size_t len = (ret >= size) ? size - 1 : ret;
31 memcpy(dest, src, len);
32 dest[len] = '\0';
33 }
34 return ret;
35}
36
37
38static char *get_pathname(void)
39{
40 static char pathname_array[4][PATH_MAX];
41 static int index;
42 return pathname_array[3 & ++index];
43}
44
45static char *cleanup_path(char *path)
46{
47 /* Clean it up */
48 if (!memcmp(path, "./", 2)) {
49 path += 2;
50 while (*path == '/')
51 path++;
52 }
53 return path;
54}
55
56char *mksnpath(char *buf, size_t n, const char *fmt, ...)
57{
58 va_list args;
59 unsigned len;
60
61 va_start(args, fmt);
62 len = vsnprintf(buf, n, fmt, args);
63 va_end(args);
64 if (len >= n) {
65 strlcpy(buf, bad_path, n);
66 return buf;
67 }
68 return cleanup_path(buf);
69}
70
71static char *perf_vsnpath(char *buf, size_t n, const char *fmt, va_list args)
72{
73 const char *perf_dir = get_perf_dir();
74 size_t len;
75
76 len = strlen(perf_dir);
77 if (n < len + 1)
78 goto bad;
79 memcpy(buf, perf_dir, len);
80 if (len && !is_dir_sep(perf_dir[len-1]))
81 buf[len++] = '/';
82 len += vsnprintf(buf + len, n - len, fmt, args);
83 if (len >= n)
84 goto bad;
85 return cleanup_path(buf);
86bad:
87 strlcpy(buf, bad_path, n);
88 return buf;
89}
90
91char *perf_snpath(char *buf, size_t n, const char *fmt, ...)
92{
93 va_list args;
94 va_start(args, fmt);
95 (void)perf_vsnpath(buf, n, fmt, args);
96 va_end(args);
97 return buf;
98}
99
100char *perf_pathdup(const char *fmt, ...)
101{
102 char path[PATH_MAX];
103 va_list args;
104 va_start(args, fmt);
105 (void)perf_vsnpath(path, sizeof(path), fmt, args);
106 va_end(args);
107 return xstrdup(path);
108}
109
110char *mkpath(const char *fmt, ...)
111{
112 va_list args;
113 unsigned len;
114 char *pathname = get_pathname();
115
116 va_start(args, fmt);
117 len = vsnprintf(pathname, PATH_MAX, fmt, args);
118 va_end(args);
119 if (len >= PATH_MAX)
120 return bad_path;
121 return cleanup_path(pathname);
122}
123
124char *perf_path(const char *fmt, ...)
125{
126 const char *perf_dir = get_perf_dir();
127 char *pathname = get_pathname();
128 va_list args;
129 unsigned len;
130
131 len = strlen(perf_dir);
132 if (len > PATH_MAX-100)
133 return bad_path;
134 memcpy(pathname, perf_dir, len);
135 if (len && perf_dir[len-1] != '/')
136 pathname[len++] = '/';
137 va_start(args, fmt);
138 len += vsnprintf(pathname + len, PATH_MAX - len, fmt, args);
139 va_end(args);
140 if (len >= PATH_MAX)
141 return bad_path;
142 return cleanup_path(pathname);
143}
144
145
146/* perf_mkstemp() - create tmp file honoring TMPDIR variable */
147int perf_mkstemp(char *path, size_t len, const char *template)
148{
149 const char *tmp;
150 size_t n;
151
152 tmp = getenv("TMPDIR");
153 if (!tmp)
154 tmp = "/tmp";
155 n = snprintf(path, len, "%s/%s", tmp, template);
156 if (len <= n) {
157 errno = ENAMETOOLONG;
158 return -1;
159 }
160 return mkstemp(path);
161}
162
163
164const char *make_relative_path(const char *abs, const char *base)
165{
166 static char buf[PATH_MAX + 1];
167 int baselen;
168 if (!base)
169 return abs;
170 baselen = strlen(base);
171 if (prefixcmp(abs, base))
172 return abs;
173 if (abs[baselen] == '/')
174 baselen++;
175 else if (base[baselen - 1] != '/')
176 return abs;
177 strcpy(buf, abs + baselen);
178 return buf;
179}
180
181/*
182 * It is okay if dst == src, but they should not overlap otherwise.
183 *
184 * Performs the following normalizations on src, storing the result in dst:
185 * - Ensures that components are separated by '/' (Windows only)
186 * - Squashes sequences of '/'.
187 * - Removes "." components.
188 * - Removes ".." components, and the components that precede them.
189 * Returns failure (non-zero) if a ".." component appears as first path
190 * component anytime during the normalization. Otherwise, returns success (0).
191 *
192 * Note that this function is purely textual. It does not follow symlinks,
193 * verify the existence of the path, or make any system calls.
194 */
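/*
 * For example (hypothetical buffers):
 *
 *	normalize_path_copy(dst, "a/./b//c/../d")  -> dst is "a/b/d", returns 0
 *	normalize_path_copy(dst, "../x")           -> returns -1
 */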
195int normalize_path_copy(char *dst, const char *src)
196{
197 char *dst0;
198
199 if (has_dos_drive_prefix(src)) {
200 *dst++ = *src++;
201 *dst++ = *src++;
202 }
203 dst0 = dst;
204
205 if (is_dir_sep(*src)) {
206 *dst++ = '/';
207 while (is_dir_sep(*src))
208 src++;
209 }
210
211 for (;;) {
212 char c = *src;
213
214 /*
215 * A path component that begins with . could be
216 * special:
217 * (1) "." and ends -- ignore and terminate.
218 * (2) "./" -- ignore them, eat slash and continue.
219 * (3) ".." and ends -- strip one and terminate.
220 * (4) "../" -- strip one, eat slash and continue.
221 */
222 if (c == '.') {
223 if (!src[1]) {
224 /* (1) */
225 src++;
226 } else if (is_dir_sep(src[1])) {
227 /* (2) */
228 src += 2;
229 while (is_dir_sep(*src))
230 src++;
231 continue;
232 } else if (src[1] == '.') {
233 if (!src[2]) {
234 /* (3) */
235 src += 2;
236 goto up_one;
237 } else if (is_dir_sep(src[2])) {
238 /* (4) */
239 src += 3;
240 while (is_dir_sep(*src))
241 src++;
242 goto up_one;
243 }
244 }
245 }
246
247 /* copy up to the next '/', and eat all '/' */
248 while ((c = *src++) != '\0' && !is_dir_sep(c))
249 *dst++ = c;
250 if (is_dir_sep(c)) {
251 *dst++ = '/';
252 while (is_dir_sep(c))
253 c = *src++;
254 src--;
255 } else if (!c)
256 break;
257 continue;
258
259 up_one:
260 /*
261 * dst0..dst is prefix portion, and dst[-1] is '/';
262 * go up one level.
263 */
264 dst--; /* go to trailing '/' */
265 if (dst <= dst0)
266 return -1;
267 /* Windows: dst[-1] cannot be backslash anymore */
268 while (dst0 < dst && dst[-1] != '/')
269 dst--;
270 }
271 *dst = '\0';
272 return 0;
273}
274
275/*
276 * path = Canonical absolute path
277 * prefix_list = Colon-separated list of absolute paths
278 *
279 * Determines, for each path in prefix_list, whether the "prefix" really
280 * is an ancestor directory of path. Returns the length of the longest
281 * ancestor directory, excluding any trailing slashes, or -1 if no prefix
282 * is an ancestor. (Note that this means 0 is returned if prefix_list is
283 * "/".) "/foo" is not considered an ancestor of "/foobar". Directories
284 * are not considered to be their own ancestors. path must be in a
285 * canonical form: empty components, or "." or ".." components are not
286 * allowed. prefix_list may be null, which is like "".
287 */
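/*
 * For example (hypothetical arguments):
 *
 *	longest_ancestor_length("/home/user/repo", "/home:/tmp") == 5
 *	longest_ancestor_length("/home/user/repo", "/usr")       == -1
 */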
288int longest_ancestor_length(const char *path, const char *prefix_list)
289{
290 char buf[PATH_MAX+1];
291 const char *ceil, *colon;
292 int len, max_len = -1;
293
294 if (prefix_list == NULL || !strcmp(path, "/"))
295 return -1;
296
297 for (colon = ceil = prefix_list; *colon; ceil = colon+1) {
298 for (colon = ceil; *colon && *colon != PATH_SEP; colon++);
299 len = colon - ceil;
300 if (len == 0 || len > PATH_MAX || !is_absolute_path(ceil))
301 continue;
302 strlcpy(buf, ceil, len+1);
303 if (normalize_path_copy(buf, buf) < 0)
304 continue;
305 len = strlen(buf);
306 if (len > 0 && buf[len-1] == '/')
307 buf[--len] = '\0';
308
309 if (!strncmp(path, buf, len) &&
310 path[len] == '/' &&
311 len > max_len) {
312 max_len = len;
313 }
314 }
315
316 return max_len;
317}
318
319/* strip arbitrary amount of directory separators at end of path */
320static inline int chomp_trailing_dir_sep(const char *path, int len)
321{
322 while (len && is_dir_sep(path[len - 1]))
323 len--;
324 return len;
325}
326
327/*
328 * If path ends with suffix (complete path components), returns the
329 * part before suffix (sans trailing directory separators).
330 * Otherwise returns NULL.
331 */
332char *strip_path_suffix(const char *path, const char *suffix)
333{
334 int path_len = strlen(path), suffix_len = strlen(suffix);
335
336 while (suffix_len) {
337 if (!path_len)
338 return NULL;
339
340 if (is_dir_sep(path[path_len - 1])) {
341 if (!is_dir_sep(suffix[suffix_len - 1]))
342 return NULL;
343 path_len = chomp_trailing_dir_sep(path, path_len);
344 suffix_len = chomp_trailing_dir_sep(suffix, suffix_len);
345 }
346 else if (path[--path_len] != suffix[--suffix_len])
347 return NULL;
348 }
349
350 if (path_len && !is_dir_sep(path[path_len - 1]))
351 return NULL;
352 return xstrndup(path, chomp_trailing_dir_sep(path, path_len));
353}
diff --git a/Documentation/perf_counter/util/quote.c b/Documentation/perf_counter/util/quote.c
new file mode 100644
index 000000000000..7a49fcf69671
--- /dev/null
+++ b/Documentation/perf_counter/util/quote.c
@@ -0,0 +1,478 @@
1#include "cache.h"
2#include "quote.h"
3
4int quote_path_fully = 1;
5
6/* Help to copy the thing properly quoted for the shell safety.
7 * any single quote is replaced with '\'', any exclamation point
8 * is replaced with '\!', and the whole thing is enclosed in a
9 * single quote pair.
10 * E.g.
11 * original sq_quote result
12 * name ==> name ==> 'name'
13 * a b ==> a b ==> 'a b'
14 * a'b ==> a'\''b ==> 'a'\''b'
15 * a!b ==> a'\!'b ==> 'a'\!'b'
16 */
17static inline int need_bs_quote(char c)
18{
19 return (c == '\'' || c == '!');
20}
21
22void sq_quote_buf(struct strbuf *dst, const char *src)
23{
24 char *to_free = NULL;
25
26 if (dst->buf == src)
27 to_free = strbuf_detach(dst, NULL);
28
29 strbuf_addch(dst, '\'');
30 while (*src) {
31 size_t len = strcspn(src, "'!");
32 strbuf_add(dst, src, len);
33 src += len;
34 while (need_bs_quote(*src)) {
35 strbuf_addstr(dst, "'\\");
36 strbuf_addch(dst, *src++);
37 strbuf_addch(dst, '\'');
38 }
39 }
40 strbuf_addch(dst, '\'');
41 free(to_free);
42}
43
44void sq_quote_print(FILE *stream, const char *src)
45{
46 char c;
47
48 fputc('\'', stream);
49 while ((c = *src++)) {
50 if (need_bs_quote(c)) {
51 fputs("'\\", stream);
52 fputc(c, stream);
53 fputc('\'', stream);
54 } else {
55 fputc(c, stream);
56 }
57 }
58 fputc('\'', stream);
59}
60
61void sq_quote_argv(struct strbuf *dst, const char** argv, size_t maxlen)
62{
63 int i;
64
65 /* Copy into destination buffer. */
66 strbuf_grow(dst, 255);
67 for (i = 0; argv[i]; ++i) {
68 strbuf_addch(dst, ' ');
69 sq_quote_buf(dst, argv[i]);
70 if (maxlen && dst->len > maxlen)
71 die("Too many or long arguments");
72 }
73}
74
75char *sq_dequote_step(char *arg, char **next)
76{
77 char *dst = arg;
78 char *src = arg;
79 char c;
80
81 if (*src != '\'')
82 return NULL;
83 for (;;) {
84 c = *++src;
85 if (!c)
86 return NULL;
87 if (c != '\'') {
88 *dst++ = c;
89 continue;
90 }
91 /* We stepped out of sq */
92 switch (*++src) {
93 case '\0':
94 *dst = 0;
95 if (next)
96 *next = NULL;
97 return arg;
98 case '\\':
99 c = *++src;
100 if (need_bs_quote(c) && *++src == '\'') {
101 *dst++ = c;
102 continue;
103 }
104 /* Fallthrough */
105 default:
106 if (!next || !isspace(*src))
107 return NULL;
108 do {
109 c = *++src;
110 } while (isspace(c));
111 *dst = 0;
112 *next = src;
113 return arg;
114 }
115 }
116}
117
118char *sq_dequote(char *arg)
119{
120 return sq_dequote_step(arg, NULL);
121}
122
123int sq_dequote_to_argv(char *arg, const char ***argv, int *nr, int *alloc)
124{
125 char *next = arg;
126
127 if (!*arg)
128 return 0;
129 do {
130 char *dequoted = sq_dequote_step(next, &next);
131 if (!dequoted)
132 return -1;
133 ALLOC_GROW(*argv, *nr + 1, *alloc);
134 (*argv)[(*nr)++] = dequoted;
135 } while (next);
136
137 return 0;
138}
139
140/* 1 means: quote as octal
141 * 0 means: quote as octal if (quote_path_fully)
142 * -1 means: never quote
143 * c: quote as "\\c"
144 */
145#define X8(x) x, x, x, x, x, x, x, x
146#define X16(x) X8(x), X8(x)
147static signed char const sq_lookup[256] = {
148 /* 0 1 2 3 4 5 6 7 */
149 /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 'a',
150 /* 0x08 */ 'b', 't', 'n', 'v', 'f', 'r', 1, 1,
151 /* 0x10 */ X16(1),
152 /* 0x20 */ -1, -1, '"', -1, -1, -1, -1, -1,
153 /* 0x28 */ X16(-1), X16(-1), X16(-1),
154 /* 0x58 */ -1, -1, -1, -1,'\\', -1, -1, -1,
155 /* 0x60 */ X16(-1), X8(-1),
156 /* 0x78 */ -1, -1, -1, -1, -1, -1, -1, 1,
157 /* 0x80 */ /* set to 0 */
158};
159
160static inline int sq_must_quote(char c)
161{
162 return sq_lookup[(unsigned char)c] + quote_path_fully > 0;
163}
164
165/* returns the longest prefix not needing a quote up to maxlen if positive.
166 This stops at the first \0 because it's marked as a character needing an
167 escape */
168static size_t next_quote_pos(const char *s, ssize_t maxlen)
169{
170 size_t len;
171 if (maxlen < 0) {
172 for (len = 0; !sq_must_quote(s[len]); len++);
173 } else {
174 for (len = 0; len < maxlen && !sq_must_quote(s[len]); len++);
175 }
176 return len;
177}
178
179/*
180 * C-style name quoting.
181 *
182 * (1) if sb and fp are both NULL, it inspects the input name and counts the
183 * number of bytes that are needed to hold the c_style quoted version of name,
184 * counting the double quotes around it but not the terminating NUL, and
185 * returns it.
186 * However, if name does not need c_style quoting, it returns 0.
187 *
188 * (2) if sb or fp are not NULL, it emits the c_style quoted version
189 * of name, enclosed with double quotes if asked and needed only.
190 * Return value is the same as in (1).
191 */
192static size_t quote_c_style_counted(const char *name, ssize_t maxlen,
193 struct strbuf *sb, FILE *fp, int no_dq)
194{
195#undef EMIT
196#define EMIT(c) \
197 do { \
198 if (sb) strbuf_addch(sb, (c)); \
199 if (fp) fputc((c), fp); \
200 count++; \
201 } while (0)
202#define EMITBUF(s, l) \
203 do { \
204 if (sb) strbuf_add(sb, (s), (l)); \
205 if (fp) fwrite((s), (l), 1, fp); \
206 count += (l); \
207 } while (0)
208
209 size_t len, count = 0;
210 const char *p = name;
211
212 for (;;) {
213 int ch;
214
215 len = next_quote_pos(p, maxlen);
216 if (len == maxlen || !p[len])
217 break;
218
219 if (!no_dq && p == name)
220 EMIT('"');
221
222 EMITBUF(p, len);
223 EMIT('\\');
224 p += len;
225 ch = (unsigned char)*p++;
226 if (sq_lookup[ch] >= ' ') {
227 EMIT(sq_lookup[ch]);
228 } else {
229 EMIT(((ch >> 6) & 03) + '0');
230 EMIT(((ch >> 3) & 07) + '0');
231 EMIT(((ch >> 0) & 07) + '0');
232 }
233 }
234
235 EMITBUF(p, len);
236 if (p == name) /* no ending quote needed */
237 return 0;
238
239 if (!no_dq)
240 EMIT('"');
241 return count;
242}
243
244size_t quote_c_style(const char *name, struct strbuf *sb, FILE *fp, int nodq)
245{
246 return quote_c_style_counted(name, -1, sb, fp, nodq);
247}
248
249void quote_two_c_style(struct strbuf *sb, const char *prefix, const char *path, int nodq)
250{
251 if (quote_c_style(prefix, NULL, NULL, 0) ||
252 quote_c_style(path, NULL, NULL, 0)) {
253 if (!nodq)
254 strbuf_addch(sb, '"');
255 quote_c_style(prefix, sb, NULL, 1);
256 quote_c_style(path, sb, NULL, 1);
257 if (!nodq)
258 strbuf_addch(sb, '"');
259 } else {
260 strbuf_addstr(sb, prefix);
261 strbuf_addstr(sb, path);
262 }
263}
264
265void write_name_quoted(const char *name, FILE *fp, int terminator)
266{
267 if (terminator) {
268 quote_c_style(name, NULL, fp, 0);
269 } else {
270 fputs(name, fp);
271 }
272 fputc(terminator, fp);
273}
274
275extern void write_name_quotedpfx(const char *pfx, size_t pfxlen,
276 const char *name, FILE *fp, int terminator)
277{
278 int needquote = 0;
279
280 if (terminator) {
281 needquote = next_quote_pos(pfx, pfxlen) < pfxlen
282 || name[next_quote_pos(name, -1)];
283 }
284 if (needquote) {
285 fputc('"', fp);
286 quote_c_style_counted(pfx, pfxlen, NULL, fp, 1);
287 quote_c_style(name, NULL, fp, 1);
288 fputc('"', fp);
289 } else {
290 fwrite(pfx, pfxlen, 1, fp);
291 fputs(name, fp);
292 }
293 fputc(terminator, fp);
294}
295
296/* quote path as relative to the given prefix */
297char *quote_path_relative(const char *in, int len,
298 struct strbuf *out, const char *prefix)
299{
300 int needquote;
301
302 if (len < 0)
303 len = strlen(in);
304
305 /* "../" prefix itself does not need quoting, but "in" might. */
306 needquote = next_quote_pos(in, len) < len;
307 strbuf_setlen(out, 0);
308 strbuf_grow(out, len);
309
310 if (needquote)
311 strbuf_addch(out, '"');
312 if (prefix) {
313 int off = 0;
314 while (prefix[off] && off < len && prefix[off] == in[off])
315 if (prefix[off] == '/') {
316 prefix += off + 1;
317 in += off + 1;
318 len -= off + 1;
319 off = 0;
320 } else
321 off++;
322
323 for (; *prefix; prefix++)
324 if (*prefix == '/')
325 strbuf_addstr(out, "../");
326 }
327
328 quote_c_style_counted (in, len, out, NULL, 1);
329
330 if (needquote)
331 strbuf_addch(out, '"');
332 if (!out->len)
333 strbuf_addstr(out, "./");
334
335 return out->buf;
336}
337
338/*
339 * C-style name unquoting.
340 *
341 * Quoted should point at the opening double quote.
342 * + Returns 0 if it was able to unquote the string properly, and appends the
343 * result in the strbuf `sb'.
344 * + Returns -1 in case of error, and doesn't touch the strbuf. Though note
345 * that this function will allocate memory in the strbuf, so calling
346 * strbuf_release is mandatory whichever result unquote_c_style returns.
347 *
348 * Updates endp pointer to point at one past the ending double quote if given.
349 */
350int unquote_c_style(struct strbuf *sb, const char *quoted, const char **endp)
351{
352 size_t oldlen = sb->len, len;
353 int ch, ac;
354
355 if (*quoted++ != '"')
356 return -1;
357
358 for (;;) {
359 len = strcspn(quoted, "\"\\");
360 strbuf_add(sb, quoted, len);
361 quoted += len;
362
363 switch (*quoted++) {
364 case '"':
365 if (endp)
366 *endp = quoted;
367 return 0;
368 case '\\':
369 break;
370 default:
371 goto error;
372 }
373
374 switch ((ch = *quoted++)) {
375 case 'a': ch = '\a'; break;
376 case 'b': ch = '\b'; break;
377 case 'f': ch = '\f'; break;
378 case 'n': ch = '\n'; break;
379 case 'r': ch = '\r'; break;
380 case 't': ch = '\t'; break;
381 case 'v': ch = '\v'; break;
382
383 case '\\': case '"':
384 break; /* verbatim */
385
386 /* octal values with first digit over 4 overflow */
387 case '0': case '1': case '2': case '3':
388 ac = ((ch - '0') << 6);
389 if ((ch = *quoted++) < '0' || '7' < ch)
390 goto error;
391 ac |= ((ch - '0') << 3);
392 if ((ch = *quoted++) < '0' || '7' < ch)
393 goto error;
394 ac |= (ch - '0');
395 ch = ac;
396 break;
397 default:
398 goto error;
399 }
400 strbuf_addch(sb, ch);
401 }
402
403 error:
404 strbuf_setlen(sb, oldlen);
405 return -1;
406}
407
408/* quoting as a string literal for other languages */
409
410void perl_quote_print(FILE *stream, const char *src)
411{
412 const char sq = '\'';
413 const char bq = '\\';
414 char c;
415
416 fputc(sq, stream);
417 while ((c = *src++)) {
418 if (c == sq || c == bq)
419 fputc(bq, stream);
420 fputc(c, stream);
421 }
422 fputc(sq, stream);
423}
424
425void python_quote_print(FILE *stream, const char *src)
426{
427 const char sq = '\'';
428 const char bq = '\\';
429 const char nl = '\n';
430 char c;
431
432 fputc(sq, stream);
433 while ((c = *src++)) {
434 if (c == nl) {
435 fputc(bq, stream);
436 fputc('n', stream);
437 continue;
438 }
439 if (c == sq || c == bq)
440 fputc(bq, stream);
441 fputc(c, stream);
442 }
443 fputc(sq, stream);
444}
445
446void tcl_quote_print(FILE *stream, const char *src)
447{
448 char c;
449
450 fputc('"', stream);
451 while ((c = *src++)) {
452 switch (c) {
453 case '[': case ']':
454 case '{': case '}':
455 case '$': case '\\': case '"':
456 fputc('\\', stream);
457 default:
458 fputc(c, stream);
459 break;
460 case '\f':
461 fputs("\\f", stream);
462 break;
463 case '\r':
464 fputs("\\r", stream);
465 break;
466 case '\n':
467 fputs("\\n", stream);
468 break;
469 case '\t':
470 fputs("\\t", stream);
471 break;
472 case '\v':
473 fputs("\\v", stream);
474 break;
475 }
476 }
477 fputc('"', stream);
478}
diff --git a/Documentation/perf_counter/util/quote.h b/Documentation/perf_counter/util/quote.h
new file mode 100644
index 000000000000..5dfad89816db
--- /dev/null
+++ b/Documentation/perf_counter/util/quote.h
@@ -0,0 +1,68 @@
1#ifndef QUOTE_H
2#define QUOTE_H
3
4#include <stddef.h>
5#include <stdio.h>
6
7/* Help to copy the thing properly quoted for the shell safety.
8 * any single quote is replaced with '\'', any exclamation point
9 * is replaced with '\!', and the whole thing is enclosed in a
10 * single quote pair.
11 *
12 * For example, if you are passing the result to system() as an
13 * argument:
14 *
15 * sprintf(cmd, "foobar %s %s", sq_quote(arg0), sq_quote(arg1))
16 *
17 * would be appropriate. If the system() is going to call ssh to
18 * run the command on the other side:
19 *
20 * sprintf(cmd, "git-diff-tree %s %s", sq_quote(arg0), sq_quote(arg1));
21 * sprintf(rcmd, "ssh %s %s", sq_quote(host), sq_quote(cmd));
22 *
23 * Note that the above examples leak memory! Remember to free result from
24 * sq_quote() in a real application.
25 *
26 * sq_quote_buf() appends the quoted string to an existing strbuf,
27 * growing it as needed, so the caller does not have to size the
28 * destination buffer in advance.
29 */
30
31extern void sq_quote_print(FILE *stream, const char *src);
32
33extern void sq_quote_buf(struct strbuf *, const char *src);
34extern void sq_quote_argv(struct strbuf *, const char **argv, size_t maxlen);
35
36/* This unwraps what sq_quote() produces in place, but returns
37 * NULL if the input does not look like what sq_quote would have
38 * produced.
39 */
40extern char *sq_dequote(char *);
41
42/*
43 * Same as the above, but can be used to unwrap many arguments in the
44 * same string separated by space. "next" is changed to point to the
45 * next argument that should be passed as first parameter. When there
46 * is no more argument to be dequoted, "next" is updated to point to NULL.
47 */
48extern char *sq_dequote_step(char *arg, char **next);
49extern int sq_dequote_to_argv(char *arg, const char ***argv, int *nr, int *alloc);
50
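/*
 * A small sketch of sq_dequote_to_argv() (line, args, nr and alloc are
 * hypothetical variables); the input is modified in place and the argv
 * entries point into it:
 *
 *	const char **args = NULL;
 *	int nr = 0, alloc = 0;
 *	char line[] = "'perf' 'record' 'a b'";
 *
 *	if (sq_dequote_to_argv(line, &args, &nr, &alloc))
 *		die("malformed quoting");
 *	// nr == 3: args[0] == "perf", args[1] == "record", args[2] == "a b"
 */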
51extern int unquote_c_style(struct strbuf *, const char *quoted, const char **endp);
52extern size_t quote_c_style(const char *name, struct strbuf *, FILE *, int no_dq);
53extern void quote_two_c_style(struct strbuf *, const char *, const char *, int);
54
55extern void write_name_quoted(const char *name, FILE *, int terminator);
56extern void write_name_quotedpfx(const char *pfx, size_t pfxlen,
57 const char *name, FILE *, int terminator);
58
59/* quote path as relative to the given prefix */
60char *quote_path_relative(const char *in, int len,
61 struct strbuf *out, const char *prefix);
62
63/* quoting as a string literal for other languages */
64extern void perl_quote_print(FILE *stream, const char *src);
65extern void python_quote_print(FILE *stream, const char *src);
66extern void tcl_quote_print(FILE *stream, const char *src);
67
68#endif
diff --git a/Documentation/perf_counter/util/run-command.c b/Documentation/perf_counter/util/run-command.c
new file mode 100644
index 000000000000..b2f5e854f40a
--- /dev/null
+++ b/Documentation/perf_counter/util/run-command.c
@@ -0,0 +1,395 @@
1#include "cache.h"
2#include "run-command.h"
3#include "exec_cmd.h"
4
5static inline void close_pair(int fd[2])
6{
7 close(fd[0]);
8 close(fd[1]);
9}
10
11static inline void dup_devnull(int to)
12{
13 int fd = open("/dev/null", O_RDWR);
14 dup2(fd, to);
15 close(fd);
16}
17
18int start_command(struct child_process *cmd)
19{
20 int need_in, need_out, need_err;
21 int fdin[2], fdout[2], fderr[2];
22
23 /*
24 * In case of errors we must keep the promise to close FDs
25 * that have been passed in via ->in and ->out.
26 */
27
28 need_in = !cmd->no_stdin && cmd->in < 0;
29 if (need_in) {
30 if (pipe(fdin) < 0) {
31 if (cmd->out > 0)
32 close(cmd->out);
33 return -ERR_RUN_COMMAND_PIPE;
34 }
35 cmd->in = fdin[1];
36 }
37
38 need_out = !cmd->no_stdout
39 && !cmd->stdout_to_stderr
40 && cmd->out < 0;
41 if (need_out) {
42 if (pipe(fdout) < 0) {
43 if (need_in)
44 close_pair(fdin);
45 else if (cmd->in)
46 close(cmd->in);
47 return -ERR_RUN_COMMAND_PIPE;
48 }
49 cmd->out = fdout[0];
50 }
51
52 need_err = !cmd->no_stderr && cmd->err < 0;
53 if (need_err) {
54 if (pipe(fderr) < 0) {
55 if (need_in)
56 close_pair(fdin);
57 else if (cmd->in)
58 close(cmd->in);
59 if (need_out)
60 close_pair(fdout);
61 else if (cmd->out)
62 close(cmd->out);
63 return -ERR_RUN_COMMAND_PIPE;
64 }
65 cmd->err = fderr[0];
66 }
67
68#ifndef __MINGW32__
69 fflush(NULL);
70 cmd->pid = fork();
71 if (!cmd->pid) {
72 if (cmd->no_stdin)
73 dup_devnull(0);
74 else if (need_in) {
75 dup2(fdin[0], 0);
76 close_pair(fdin);
77 } else if (cmd->in) {
78 dup2(cmd->in, 0);
79 close(cmd->in);
80 }
81
82 if (cmd->no_stderr)
83 dup_devnull(2);
84 else if (need_err) {
85 dup2(fderr[1], 2);
86 close_pair(fderr);
87 }
88
89 if (cmd->no_stdout)
90 dup_devnull(1);
91 else if (cmd->stdout_to_stderr)
92 dup2(2, 1);
93 else if (need_out) {
94 dup2(fdout[1], 1);
95 close_pair(fdout);
96 } else if (cmd->out > 1) {
97 dup2(cmd->out, 1);
98 close(cmd->out);
99 }
100
101 if (cmd->dir && chdir(cmd->dir))
102 die("exec %s: cd to %s failed (%s)", cmd->argv[0],
103 cmd->dir, strerror(errno));
104 if (cmd->env) {
105 for (; *cmd->env; cmd->env++) {
106 if (strchr(*cmd->env, '='))
107 putenv((char*)*cmd->env);
108 else
109 unsetenv(*cmd->env);
110 }
111 }
112 if (cmd->preexec_cb)
113 cmd->preexec_cb();
114 if (cmd->perf_cmd) {
115 execv_perf_cmd(cmd->argv);
116 } else {
117 execvp(cmd->argv[0], (char *const*) cmd->argv);
118 }
119 exit(127);
120 }
121#else
122 int s0 = -1, s1 = -1, s2 = -1; /* backups of stdin, stdout, stderr */
123 const char **sargv = cmd->argv;
124 char **env = environ;
125
126 if (cmd->no_stdin) {
127 s0 = dup(0);
128 dup_devnull(0);
129 } else if (need_in) {
130 s0 = dup(0);
131 dup2(fdin[0], 0);
132 } else if (cmd->in) {
133 s0 = dup(0);
134 dup2(cmd->in, 0);
135 }
136
137 if (cmd->no_stderr) {
138 s2 = dup(2);
139 dup_devnull(2);
140 } else if (need_err) {
141 s2 = dup(2);
142 dup2(fderr[1], 2);
143 }
144
145 if (cmd->no_stdout) {
146 s1 = dup(1);
147 dup_devnull(1);
148 } else if (cmd->stdout_to_stderr) {
149 s1 = dup(1);
150 dup2(2, 1);
151 } else if (need_out) {
152 s1 = dup(1);
153 dup2(fdout[1], 1);
154 } else if (cmd->out > 1) {
155 s1 = dup(1);
156 dup2(cmd->out, 1);
157 }
158
159 if (cmd->dir)
160 die("chdir in start_command() not implemented");
161 if (cmd->env) {
162 env = copy_environ();
163 for (; *cmd->env; cmd->env++)
164 env = env_setenv(env, *cmd->env);
165 }
166
167 if (cmd->perf_cmd) {
168 cmd->argv = prepare_perf_cmd(cmd->argv);
169 }
170
171 cmd->pid = mingw_spawnvpe(cmd->argv[0], cmd->argv, env);
172
173 if (cmd->env)
174 free_environ(env);
175 if (cmd->perf_cmd)
176 free(cmd->argv);
177
178 cmd->argv = sargv;
179 if (s0 >= 0)
180 dup2(s0, 0), close(s0);
181 if (s1 >= 0)
182 dup2(s1, 1), close(s1);
183 if (s2 >= 0)
184 dup2(s2, 2), close(s2);
185#endif
186
187 if (cmd->pid < 0) {
188 int err = errno;
189 if (need_in)
190 close_pair(fdin);
191 else if (cmd->in)
192 close(cmd->in);
193 if (need_out)
194 close_pair(fdout);
195 else if (cmd->out)
196 close(cmd->out);
197 if (need_err)
198 close_pair(fderr);
199 return err == ENOENT ?
200 -ERR_RUN_COMMAND_EXEC :
201 -ERR_RUN_COMMAND_FORK;
202 }
203
204 if (need_in)
205 close(fdin[0]);
206 else if (cmd->in)
207 close(cmd->in);
208
209 if (need_out)
210 close(fdout[1]);
211 else if (cmd->out)
212 close(cmd->out);
213
214 if (need_err)
215 close(fderr[1]);
216
217 return 0;
218}
219
220static int wait_or_whine(pid_t pid)
221{
222 for (;;) {
223 int status, code;
224 pid_t waiting = waitpid(pid, &status, 0);
225
226 if (waiting < 0) {
227 if (errno == EINTR)
228 continue;
229 error("waitpid failed (%s)", strerror(errno));
230 return -ERR_RUN_COMMAND_WAITPID;
231 }
232 if (waiting != pid)
233 return -ERR_RUN_COMMAND_WAITPID_WRONG_PID;
234 if (WIFSIGNALED(status))
235 return -ERR_RUN_COMMAND_WAITPID_SIGNAL;
236
237 if (!WIFEXITED(status))
238 return -ERR_RUN_COMMAND_WAITPID_NOEXIT;
239 code = WEXITSTATUS(status);
240 switch (code) {
241 case 127:
242 return -ERR_RUN_COMMAND_EXEC;
243 case 0:
244 return 0;
245 default:
246 return -code;
247 }
248 }
249}
250
251int finish_command(struct child_process *cmd)
252{
253 return wait_or_whine(cmd->pid);
254}
255
256int run_command(struct child_process *cmd)
257{
258 int code = start_command(cmd);
259 if (code)
260 return code;
261 return finish_command(cmd);
262}
263
264static void prepare_run_command_v_opt(struct child_process *cmd,
265 const char **argv,
266 int opt)
267{
268 memset(cmd, 0, sizeof(*cmd));
269 cmd->argv = argv;
270 cmd->no_stdin = opt & RUN_COMMAND_NO_STDIN ? 1 : 0;
271 cmd->perf_cmd = opt & RUN_PERF_CMD ? 1 : 0;
272 cmd->stdout_to_stderr = opt & RUN_COMMAND_STDOUT_TO_STDERR ? 1 : 0;
273}
274
275int run_command_v_opt(const char **argv, int opt)
276{
277 struct child_process cmd;
278 prepare_run_command_v_opt(&cmd, argv, opt);
279 return run_command(&cmd);
280}
281
282int run_command_v_opt_cd_env(const char **argv, int opt, const char *dir, const char *const *env)
283{
284 struct child_process cmd;
285 prepare_run_command_v_opt(&cmd, argv, opt);
286 cmd.dir = dir;
287 cmd.env = env;
288 return run_command(&cmd);
289}
290
291#ifdef __MINGW32__
292static __stdcall unsigned run_thread(void *data)
293{
294 struct async *async = data;
295 return async->proc(async->fd_for_proc, async->data);
296}
297#endif
298
299int start_async(struct async *async)
300{
301 int pipe_out[2];
302
303 if (pipe(pipe_out) < 0)
304 return error("cannot create pipe: %s", strerror(errno));
305 async->out = pipe_out[0];
306
307#ifndef __MINGW32__
308 /* Flush stdio before fork() to avoid cloning buffers */
309 fflush(NULL);
310
311 async->pid = fork();
312 if (async->pid < 0) {
313 error("fork (async) failed: %s", strerror(errno));
314 close_pair(pipe_out);
315 return -1;
316 }
317 if (!async->pid) {
318 close(pipe_out[0]);
319 exit(!!async->proc(pipe_out[1], async->data));
320 }
321 close(pipe_out[1]);
322#else
323 async->fd_for_proc = pipe_out[1];
324 async->tid = (HANDLE) _beginthreadex(NULL, 0, run_thread, async, 0, NULL);
325 if (!async->tid) {
326 error("cannot create thread: %s", strerror(errno));
327 close_pair(pipe_out);
328 return -1;
329 }
330#endif
331 return 0;
332}
333
334int finish_async(struct async *async)
335{
336#ifndef __MINGW32__
337 int ret = 0;
338
339 if (wait_or_whine(async->pid))
340 ret = error("waitpid (async) failed");
341#else
342 DWORD ret = 0;
343 if (WaitForSingleObject(async->tid, INFINITE) != WAIT_OBJECT_0)
344 ret = error("waiting for thread failed: %lu", GetLastError());
345 else if (!GetExitCodeThread(async->tid, &ret))
346 ret = error("cannot get thread exit code: %lu", GetLastError());
347 CloseHandle(async->tid);
348#endif
349 return ret;
350}
351
352int run_hook(const char *index_file, const char *name, ...)
353{
354 struct child_process hook;
355 const char **argv = NULL, *env[2];
356 char index[PATH_MAX];
357 va_list args;
358 int ret;
359 size_t i = 0, alloc = 0;
360
361 if (access(perf_path("hooks/%s", name), X_OK) < 0)
362 return 0;
363
364 va_start(args, name);
365 ALLOC_GROW(argv, i + 1, alloc);
366 argv[i++] = perf_path("hooks/%s", name);
367 while (argv[i-1]) {
368 ALLOC_GROW(argv, i + 1, alloc);
369 argv[i++] = va_arg(args, const char *);
370 }
371 va_end(args);
372
373 memset(&hook, 0, sizeof(hook));
374 hook.argv = argv;
375 hook.no_stdin = 1;
376 hook.stdout_to_stderr = 1;
377 if (index_file) {
378 snprintf(index, sizeof(index), "PERF_INDEX_FILE=%s", index_file);
379 env[0] = index;
380 env[1] = NULL;
381 hook.env = env;
382 }
383
384 ret = start_command(&hook);
385 if (ret) {
386 warning("Could not spawn %s", argv[0]);
387 free(argv);
388 return ret;
389 }
390 ret = finish_command(&hook);
391 if (ret == -ERR_RUN_COMMAND_WAITPID_SIGNAL)
392 warning("%s exited due to uncaught signal", argv[0]);
393 free(argv);
394 return ret;
395}
diff --git a/Documentation/perf_counter/util/run-command.h b/Documentation/perf_counter/util/run-command.h
new file mode 100644
index 000000000000..328289f23669
--- /dev/null
+++ b/Documentation/perf_counter/util/run-command.h
@@ -0,0 +1,93 @@
1#ifndef RUN_COMMAND_H
2#define RUN_COMMAND_H
3
4enum {
5 ERR_RUN_COMMAND_FORK = 10000,
6 ERR_RUN_COMMAND_EXEC,
7 ERR_RUN_COMMAND_PIPE,
8 ERR_RUN_COMMAND_WAITPID,
9 ERR_RUN_COMMAND_WAITPID_WRONG_PID,
10 ERR_RUN_COMMAND_WAITPID_SIGNAL,
11 ERR_RUN_COMMAND_WAITPID_NOEXIT,
12};
13#define IS_RUN_COMMAND_ERR(x) (-(x) >= ERR_RUN_COMMAND_FORK)
14
15struct child_process {
16 const char **argv;
17 pid_t pid;
18 /*
19 * Using .in, .out, .err:
20 * - Specify 0 for no redirections (child inherits stdin, stdout,
21 * stderr from parent).
22 * - Specify -1 to have a pipe allocated as follows:
23 * .in: returns the writable pipe end; parent writes to it,
24 * the readable pipe end becomes child's stdin
25 * .out, .err: returns the readable pipe end; parent reads from
26 * it, the writable pipe end becomes child's stdout/stderr
27 * The caller of start_command() must close the returned FDs
28 * after it has completed reading from/writing to it!
29 * - Specify > 0 to set a channel to a particular FD as follows:
30 * .in: a readable FD, becomes child's stdin
31 * .out: a writable FD, becomes child's stdout/stderr
32 * .err > 0 not supported
33 * The specified FD is closed by start_command(), even in case
34 * of errors!
35 */
36 int in;
37 int out;
38 int err;
39 const char *dir;
40 const char *const *env;
41 unsigned no_stdin:1;
42 unsigned no_stdout:1;
43 unsigned no_stderr:1;
44 unsigned perf_cmd:1; /* if this is to be perf sub-command */
45 unsigned stdout_to_stderr:1;
46 void (*preexec_cb)(void);
47};
48
49int start_command(struct child_process *);
50int finish_command(struct child_process *);
51int run_command(struct child_process *);
52
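/*
 * A minimal sketch of capturing a child's stdout (hypothetical caller):
 * .out = -1 asks start_command() for a pipe whose readable end comes back
 * in .out; the caller reads it, closes it, then reaps the child.
 *
 *	struct child_process child;
 *	const char *argv[] = { "uname", "-r", NULL };
 *	char buf[256];
 *	ssize_t n;
 *
 *	memset(&child, 0, sizeof(child));
 *	child.argv = argv;
 *	child.no_stdin = 1;
 *	child.out = -1;				// request a pipe on child's stdout
 *
 *	if (start_command(&child))
 *		die("unable to start uname");
 *	while ((n = read(child.out, buf, sizeof(buf))) > 0)
 *		;				// consume the output
 *	close(child.out);
 *	if (finish_command(&child))
 *		die("uname failed");
 */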
53extern int run_hook(const char *index_file, const char *name, ...);
54
55#define RUN_COMMAND_NO_STDIN 1
56#define RUN_PERF_CMD 2 /* If this is to be a perf sub-command */
57#define RUN_COMMAND_STDOUT_TO_STDERR 4
58int run_command_v_opt(const char **argv, int opt);
59
60/*
61 * env (the environment) is to be formatted like environ: "VAR=VALUE".
62 * To unset an environment variable use just "VAR".
63 */
64int run_command_v_opt_cd_env(const char **argv, int opt, const char *dir, const char *const *env);
65
66/*
67 * The purpose of the following functions is to feed a pipe by running
68 * a function asynchronously and providing output that the caller reads.
69 *
70 * The caller and the feed function are expected to need no synchronization
71 * or mutual exclusion, so that the feed function can run in a thread
72 * without interfering with the caller.
73 */
74struct async {
75 /*
76 * proc writes to fd and closes it;
77 * returns 0 on success, non-zero on failure
78 */
79 int (*proc)(int fd, void *data);
80 void *data;
81 int out; /* caller reads from here and closes it */
82#ifndef __MINGW32__
83 pid_t pid;
84#else
85 HANDLE tid;
86 int fd_for_proc;
87#endif
88};
89
90int start_async(struct async *async);
91int finish_async(struct async *async);
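/*
 * A rough sketch of the async helpers (produce and buf are hypothetical):
 * the proc callback writes into the fd it is handed and closes it; the
 * caller reads the other end of the pipe from async.out.
 *
 *	static int produce(int fd, void *data)
 *	{
 *		const char *msg = data;
 *		write(fd, msg, strlen(msg));
 *		close(fd);
 *		return 0;
 *	}
 *
 *	struct async as;
 *	char buf[64];
 *
 *	memset(&as, 0, sizeof(as));
 *	as.proc = produce;
 *	as.data = (void *)"hello\n";
 *	if (start_async(&as))
 *		die("cannot start producer");
 *	read(as.out, buf, sizeof(buf));
 *	close(as.out);
 *	if (finish_async(&as))
 *		die("producer failed");
 */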
92
93#endif
diff --git a/Documentation/perf_counter/util/strbuf.c b/Documentation/perf_counter/util/strbuf.c
new file mode 100644
index 000000000000..eaba09306802
--- /dev/null
+++ b/Documentation/perf_counter/util/strbuf.c
@@ -0,0 +1,359 @@
1#include "cache.h"
2
3int prefixcmp(const char *str, const char *prefix)
4{
5 for (; ; str++, prefix++)
6 if (!*prefix)
7 return 0;
8 else if (*str != *prefix)
9 return (unsigned char)*prefix - (unsigned char)*str;
10}
11
12/*
13 * Used as the default ->buf value, so that people can always assume
14 * buf is non NULL and ->buf is NUL terminated even for a freshly
15 * initialized strbuf.
16 */
17char strbuf_slopbuf[1];
18
19void strbuf_init(struct strbuf *sb, size_t hint)
20{
21 sb->alloc = sb->len = 0;
22 sb->buf = strbuf_slopbuf;
23 if (hint)
24 strbuf_grow(sb, hint);
25}
26
27void strbuf_release(struct strbuf *sb)
28{
29 if (sb->alloc) {
30 free(sb->buf);
31 strbuf_init(sb, 0);
32 }
33}
34
35char *strbuf_detach(struct strbuf *sb, size_t *sz)
36{
37 char *res = sb->alloc ? sb->buf : NULL;
38 if (sz)
39 *sz = sb->len;
40 strbuf_init(sb, 0);
41 return res;
42}
43
44void strbuf_attach(struct strbuf *sb, void *buf, size_t len, size_t alloc)
45{
46 strbuf_release(sb);
47 sb->buf = buf;
48 sb->len = len;
49 sb->alloc = alloc;
50 strbuf_grow(sb, 0);
51 sb->buf[sb->len] = '\0';
52}
53
54void strbuf_grow(struct strbuf *sb, size_t extra)
55{
56 if (sb->len + extra + 1 <= sb->len)
57 die("you want to use way too much memory");
58 if (!sb->alloc)
59 sb->buf = NULL;
60 ALLOC_GROW(sb->buf, sb->len + extra + 1, sb->alloc);
61}
62
63void strbuf_trim(struct strbuf *sb)
64{
65 char *b = sb->buf;
66 while (sb->len > 0 && isspace((unsigned char)sb->buf[sb->len - 1]))
67 sb->len--;
68 while (sb->len > 0 && isspace(*b)) {
69 b++;
70 sb->len--;
71 }
72 memmove(sb->buf, b, sb->len);
73 sb->buf[sb->len] = '\0';
74}
75void strbuf_rtrim(struct strbuf *sb)
76{
77 while (sb->len > 0 && isspace((unsigned char)sb->buf[sb->len - 1]))
78 sb->len--;
79 sb->buf[sb->len] = '\0';
80}
81
82void strbuf_ltrim(struct strbuf *sb)
83{
84 char *b = sb->buf;
85 while (sb->len > 0 && isspace(*b)) {
86 b++;
87 sb->len--;
88 }
89 memmove(sb->buf, b, sb->len);
90 sb->buf[sb->len] = '\0';
91}
92
93void strbuf_tolower(struct strbuf *sb)
94{
95 int i;
96 for (i = 0; i < sb->len; i++)
97 sb->buf[i] = tolower(sb->buf[i]);
98}
99
100struct strbuf **strbuf_split(const struct strbuf *sb, int delim)
101{
102 int alloc = 2, pos = 0;
103 char *n, *p;
104 struct strbuf **ret;
105 struct strbuf *t;
106
107 ret = calloc(alloc, sizeof(struct strbuf *));
108 p = n = sb->buf;
109 while (n < sb->buf + sb->len) {
110 int len;
111 n = memchr(n, delim, sb->len - (n - sb->buf));
112 if (pos + 1 >= alloc) {
113 alloc = alloc * 2;
114 ret = realloc(ret, sizeof(struct strbuf *) * alloc);
115 }
116 if (!n)
117 n = sb->buf + sb->len - 1;
118 len = n - p + 1;
119 t = malloc(sizeof(struct strbuf));
120 strbuf_init(t, len);
121 strbuf_add(t, p, len);
122 ret[pos] = t;
123 ret[++pos] = NULL;
124 p = ++n;
125 }
126 return ret;
127}
128
129void strbuf_list_free(struct strbuf **sbs)
130{
131 struct strbuf **s = sbs;
132
133 while (*s) {
134 strbuf_release(*s);
135 free(*s++);
136 }
137 free(sbs);
138}
139
140int strbuf_cmp(const struct strbuf *a, const struct strbuf *b)
141{
142 int len = a->len < b->len ? a->len: b->len;
143 int cmp = memcmp(a->buf, b->buf, len);
144 if (cmp)
145 return cmp;
146 return a->len < b->len ? -1: a->len != b->len;
147}
148
149void strbuf_splice(struct strbuf *sb, size_t pos, size_t len,
150 const void *data, size_t dlen)
151{
152 if (pos + len < pos)
153 die("you want to use way too much memory");
154 if (pos > sb->len)
155 die("`pos' is too far after the end of the buffer");
156 if (pos + len > sb->len)
157 die("`pos + len' is too far after the end of the buffer");
158
159 if (dlen >= len)
160 strbuf_grow(sb, dlen - len);
161 memmove(sb->buf + pos + dlen,
162 sb->buf + pos + len,
163 sb->len - pos - len);
164 memcpy(sb->buf + pos, data, dlen);
165 strbuf_setlen(sb, sb->len + dlen - len);
166}
167
168void strbuf_insert(struct strbuf *sb, size_t pos, const void *data, size_t len)
169{
170 strbuf_splice(sb, pos, 0, data, len);
171}
172
173void strbuf_remove(struct strbuf *sb, size_t pos, size_t len)
174{
175 strbuf_splice(sb, pos, len, NULL, 0);
176}
177
178void strbuf_add(struct strbuf *sb, const void *data, size_t len)
179{
180 strbuf_grow(sb, len);
181 memcpy(sb->buf + sb->len, data, len);
182 strbuf_setlen(sb, sb->len + len);
183}
184
185void strbuf_adddup(struct strbuf *sb, size_t pos, size_t len)
186{
187 strbuf_grow(sb, len);
188 memcpy(sb->buf + sb->len, sb->buf + pos, len);
189 strbuf_setlen(sb, sb->len + len);
190}
191
192void strbuf_addf(struct strbuf *sb, const char *fmt, ...)
193{
194 int len;
195 va_list ap;
196
197 if (!strbuf_avail(sb))
198 strbuf_grow(sb, 64);
199 va_start(ap, fmt);
200 len = vsnprintf(sb->buf + sb->len, sb->alloc - sb->len, fmt, ap);
201 va_end(ap);
202 if (len < 0)
203 die("your vsnprintf is broken");
204 if (len > strbuf_avail(sb)) {
205 strbuf_grow(sb, len);
206 va_start(ap, fmt);
207 len = vsnprintf(sb->buf + sb->len, sb->alloc - sb->len, fmt, ap);
208 va_end(ap);
209 if (len > strbuf_avail(sb)) {
210 die("this should not happen, your snprintf is broken");
211 }
212 }
213 strbuf_setlen(sb, sb->len + len);
214}
215
216void strbuf_expand(struct strbuf *sb, const char *format, expand_fn_t fn,
217 void *context)
218{
219 for (;;) {
220 const char *percent;
221 size_t consumed;
222
223 percent = strchrnul(format, '%');
224 strbuf_add(sb, format, percent - format);
225 if (!*percent)
226 break;
227 format = percent + 1;
228
229 consumed = fn(sb, format, context);
230 if (consumed)
231 format += consumed;
232 else
233 strbuf_addch(sb, '%');
234 }
235}
236
237size_t strbuf_expand_dict_cb(struct strbuf *sb, const char *placeholder,
238 void *context)
239{
240 struct strbuf_expand_dict_entry *e = context;
241 size_t len;
242
243 for (; e->placeholder && (len = strlen(e->placeholder)); e++) {
244 if (!strncmp(placeholder, e->placeholder, len)) {
245 if (e->value)
246 strbuf_addstr(sb, e->value);
247 return len;
248 }
249 }
250 return 0;
251}
252
253size_t strbuf_fread(struct strbuf *sb, size_t size, FILE *f)
254{
255 size_t res;
256 size_t oldalloc = sb->alloc;
257
258 strbuf_grow(sb, size);
259 res = fread(sb->buf + sb->len, 1, size, f);
260 if (res > 0)
261 strbuf_setlen(sb, sb->len + res);
262 else if (res < 0 && oldalloc == 0)
263 strbuf_release(sb);
264 return res;
265}
266
267ssize_t strbuf_read(struct strbuf *sb, int fd, size_t hint)
268{
269 size_t oldlen = sb->len;
270 size_t oldalloc = sb->alloc;
271
272 strbuf_grow(sb, hint ? hint : 8192);
273 for (;;) {
274 ssize_t cnt;
275
276 cnt = read(fd, sb->buf + sb->len, sb->alloc - sb->len - 1);
277 if (cnt < 0) {
278 if (oldalloc == 0)
279 strbuf_release(sb);
280 else
281 strbuf_setlen(sb, oldlen);
282 return -1;
283 }
284 if (!cnt)
285 break;
286 sb->len += cnt;
287 strbuf_grow(sb, 8192);
288 }
289
290 sb->buf[sb->len] = '\0';
291 return sb->len - oldlen;
292}
293
294#define STRBUF_MAXLINK (2*PATH_MAX)
295
296int strbuf_readlink(struct strbuf *sb, const char *path, size_t hint)
297{
298 size_t oldalloc = sb->alloc;
299
300 if (hint < 32)
301 hint = 32;
302
303 while (hint < STRBUF_MAXLINK) {
304 int len;
305
306 strbuf_grow(sb, hint);
307 len = readlink(path, sb->buf, hint);
308 if (len < 0) {
309 if (errno != ERANGE)
310 break;
311 } else if (len < hint) {
312 strbuf_setlen(sb, len);
313 return 0;
314 }
315
316 /* .. the buffer was too small - try again */
317 hint *= 2;
318 }
319 if (oldalloc == 0)
320 strbuf_release(sb);
321 return -1;
322}
323
324int strbuf_getline(struct strbuf *sb, FILE *fp, int term)
325{
326 int ch;
327
328 strbuf_grow(sb, 0);
329 if (feof(fp))
330 return EOF;
331
332 strbuf_reset(sb);
333 while ((ch = fgetc(fp)) != EOF) {
334 if (ch == term)
335 break;
336 strbuf_grow(sb, 1);
337 sb->buf[sb->len++] = ch;
338 }
339 if (ch == EOF && sb->len == 0)
340 return EOF;
341
342 sb->buf[sb->len] = '\0';
343 return 0;
344}
345
346int strbuf_read_file(struct strbuf *sb, const char *path, size_t hint)
347{
348 int fd, len;
349
350 fd = open(path, O_RDONLY);
351 if (fd < 0)
352 return -1;
353 len = strbuf_read(sb, fd, hint);
354 close(fd);
355 if (len < 0)
356 return -1;
357
358 return len;
359}
diff --git a/Documentation/perf_counter/util/strbuf.h b/Documentation/perf_counter/util/strbuf.h
new file mode 100644
index 000000000000..9ee908a3ec5d
--- /dev/null
+++ b/Documentation/perf_counter/util/strbuf.h
@@ -0,0 +1,137 @@
1#ifndef STRBUF_H
2#define STRBUF_H
3
4/*
5 * Strbufs can be used in many ways: as a byte array, or to store arbitrarily
6 * long, overflow-safe strings.
7 *
8 * Strbufs have some invariants that are very important to keep in mind:
9 *
10 * 1. the ->buf member is always malloc-ed, hence strbuf's can be used to
11 * build complex strings/buffers whose final size isn't easily known.
12 *
13 * It is NOT legal to copy the ->buf pointer away.
14 * `strbuf_detach' is the operation that detaches a buffer from its shell
15 * while keeping the shell valid wrt its invariants.
16 *
17 * 2. the ->buf member is a byte array that has at least ->len + 1 bytes
18 * allocated. The extra byte is used to store a '\0', allowing the ->buf
19 * member to be a valid C-string. Every strbuf function ensures this
20 * invariant is preserved.
21 *
22 * Note that it is OK to "play" with the buffer directly if you work it
23 * that way:
24 *
25 * strbuf_grow(sb, SOME_SIZE);
26 * ... Here, the memory array starting at sb->buf, and of length
27 * ... strbuf_avail(sb) is all yours, and you are sure that
28 * ... strbuf_avail(sb) is at least SOME_SIZE.
29 * strbuf_setlen(sb, sb->len + SOME_OTHER_SIZE);
30 *
31 * Of course, SOME_OTHER_SIZE must be smaller or equal to strbuf_avail(sb).
32 *
33 * Doing so is safe, though if it has to be done in many places, adding the
34 * missing API to the strbuf module is the way to go.
35 *
36 * XXX: do _not_ assume that the area that is yours is of size ->alloc - 1
37 * even if it's true in the current implementation. Alloc is somehow a
38 * "private" member that should not be messed with.
39 */
40
41#include <assert.h>
42
43extern char strbuf_slopbuf[];
44struct strbuf {
45 size_t alloc;
46 size_t len;
47 char *buf;
48};
49
50#define STRBUF_INIT { 0, 0, strbuf_slopbuf }
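/*
 * A minimal life-cycle sketch (values are hypothetical): start from
 * STRBUF_INIT or strbuf_init(), append with strbuf_addstr()/strbuf_addf(),
 * use ->buf as a plain C string, and strbuf_release() when done.
 *
 *	struct strbuf sb = STRBUF_INIT;
 *
 *	strbuf_addstr(&sb, "counter ");
 *	strbuf_addf(&sb, "%d of %d", 3, 8);
 *	printf("%s\n", sb.buf);		// prints "counter 3 of 8"
 *	strbuf_release(&sb);
 */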
51
52/*----- strbuf life cycle -----*/
53extern void strbuf_init(struct strbuf *, size_t);
54extern void strbuf_release(struct strbuf *);
55extern char *strbuf_detach(struct strbuf *, size_t *);
56extern void strbuf_attach(struct strbuf *, void *, size_t, size_t);
57static inline void strbuf_swap(struct strbuf *a, struct strbuf *b) {
58 struct strbuf tmp = *a;
59 *a = *b;
60 *b = tmp;
61}
62
63/*----- strbuf size related -----*/
64static inline size_t strbuf_avail(const struct strbuf *sb) {
65 return sb->alloc ? sb->alloc - sb->len - 1 : 0;
66}
67
68extern void strbuf_grow(struct strbuf *, size_t);
69
70static inline void strbuf_setlen(struct strbuf *sb, size_t len) {
71 if (!sb->alloc)
72 strbuf_grow(sb, 0);
73 assert(len < sb->alloc);
74 sb->len = len;
75 sb->buf[len] = '\0';
76}
77#define strbuf_reset(sb) strbuf_setlen(sb, 0)
78
79/*----- content related -----*/
80extern void strbuf_trim(struct strbuf *);
81extern void strbuf_rtrim(struct strbuf *);
82extern void strbuf_ltrim(struct strbuf *);
83extern int strbuf_cmp(const struct strbuf *, const struct strbuf *);
84extern void strbuf_tolower(struct strbuf *);
85
86extern struct strbuf **strbuf_split(const struct strbuf *, int delim);
87extern void strbuf_list_free(struct strbuf **);
88
89/*----- add data in your buffer -----*/
90static inline void strbuf_addch(struct strbuf *sb, int c) {
91 strbuf_grow(sb, 1);
92 sb->buf[sb->len++] = c;
93 sb->buf[sb->len] = '\0';
94}
95
96extern void strbuf_insert(struct strbuf *, size_t pos, const void *, size_t);
97extern void strbuf_remove(struct strbuf *, size_t pos, size_t len);
98
99/* splice pos..pos+len with given data */
100extern void strbuf_splice(struct strbuf *, size_t pos, size_t len,
101 const void *, size_t);
102
103extern void strbuf_add(struct strbuf *, const void *, size_t);
104static inline void strbuf_addstr(struct strbuf *sb, const char *s) {
105 strbuf_add(sb, s, strlen(s));
106}
107static inline void strbuf_addbuf(struct strbuf *sb, const struct strbuf *sb2) {
108 strbuf_add(sb, sb2->buf, sb2->len);
109}
110extern void strbuf_adddup(struct strbuf *sb, size_t pos, size_t len);
111
112typedef size_t (*expand_fn_t) (struct strbuf *sb, const char *placeholder, void *context);
113extern void strbuf_expand(struct strbuf *sb, const char *format, expand_fn_t fn, void *context);
114struct strbuf_expand_dict_entry {
115 const char *placeholder;
116 const char *value;
117};
118extern size_t strbuf_expand_dict_cb(struct strbuf *sb, const char *placeholder, void *context);
119
120__attribute__((format(printf,2,3)))
121extern void strbuf_addf(struct strbuf *sb, const char *fmt, ...);
122
123extern size_t strbuf_fread(struct strbuf *, size_t, FILE *);
124/* XXX: if read fails, any partial read is undone */
125extern ssize_t strbuf_read(struct strbuf *, int fd, size_t hint);
126extern int strbuf_read_file(struct strbuf *sb, const char *path, size_t hint);
127extern int strbuf_readlink(struct strbuf *sb, const char *path, size_t hint);
128
129extern int strbuf_getline(struct strbuf *, FILE *, int);
130
131extern void stripspace(struct strbuf *buf, int skip_comments);
132extern int launch_editor(const char *path, struct strbuf *buffer, const char *const *env);
133
134extern int strbuf_branchname(struct strbuf *sb, const char *name);
135extern int strbuf_check_branch_ref(struct strbuf *sb, const char *name);
136
137#endif /* STRBUF_H */
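As a minimal sketch of the grow/setlen pattern described in the header comment above, assuming the strbuf implementation from util/strbuf.c is linked in and the header is reachable as "strbuf.h", appending two raw bytes directly could look like this (the helper name is hypothetical):

#include <string.h>
#include "strbuf.h"

/* Write two hex digits straight into the buffer, then fix up the length. */
static void strbuf_add_hex_byte(struct strbuf *sb, unsigned char byte)
{
	static const char hex[] = "0123456789abcdef";

	strbuf_grow(sb, 2);			/* strbuf_avail(sb) is now >= 2 */
	sb->buf[sb->len]     = hex[byte >> 4];
	sb->buf[sb->len + 1] = hex[byte & 0xf];
	strbuf_setlen(sb, sb->len + 2);		/* restores the '\0' invariant */
}

The point is that the direct write never exceeds strbuf_avail(sb), and strbuf_setlen() re-establishes the terminating NUL.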
diff --git a/Documentation/perf_counter/util/usage.c b/Documentation/perf_counter/util/usage.c
new file mode 100644
index 000000000000..7a10421fe6b4
--- /dev/null
+++ b/Documentation/perf_counter/util/usage.c
@@ -0,0 +1,80 @@
1/*
2 * GIT - The information manager from hell
3 *
4 * Copyright (C) Linus Torvalds, 2005
5 */
6#include "util.h"
7
8static void report(const char *prefix, const char *err, va_list params)
9{
10 char msg[1024];
11 vsnprintf(msg, sizeof(msg), err, params);
12 fprintf(stderr, "%s%s\n", prefix, msg);
13}
14
15static NORETURN void usage_builtin(const char *err)
16{
17 fprintf(stderr, "usage: %s\n", err);
18 exit(129);
19}
20
21static NORETURN void die_builtin(const char *err, va_list params)
22{
23 report("fatal: ", err, params);
24 exit(128);
25}
26
27static void error_builtin(const char *err, va_list params)
28{
29 report("error: ", err, params);
30}
31
32static void warn_builtin(const char *warn, va_list params)
33{
34 report("warning: ", warn, params);
35}
36
37/* If we are in a dlopen()ed .so, a write to a global variable would segfault
38 * (ugh), so keep things static. */
39static void (*usage_routine)(const char *err) NORETURN = usage_builtin;
40static void (*die_routine)(const char *err, va_list params) NORETURN = die_builtin;
41static void (*error_routine)(const char *err, va_list params) = error_builtin;
42static void (*warn_routine)(const char *err, va_list params) = warn_builtin;
43
44void set_die_routine(void (*routine)(const char *err, va_list params) NORETURN)
45{
46 die_routine = routine;
47}
48
49void usage(const char *err)
50{
51 usage_routine(err);
52}
53
54void die(const char *err, ...)
55{
56 va_list params;
57
58 va_start(params, err);
59 die_routine(err, params);
60 va_end(params);
61}
62
63int error(const char *err, ...)
64{
65 va_list params;
66
67 va_start(params, err);
68 error_routine(err, params);
69 va_end(params);
70 return -1;
71}
72
73void warning(const char *warn, ...)
74{
75 va_list params;
76
77 va_start(params, warn);
78 warn_routine(warn, params);
79 va_end(params);
80}
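A short usage sketch for the routines above, assuming util.h is on the include path and that a caller wants to override the default die behaviour with set_die_routine(); the handler name, program name and messages below are made up:

#include "util.h"

/* Custom die handler: same shape as die_builtin(), different prefix. */
static NORETURN void my_die(const char *err, va_list params)
{
	char msg[1024];

	vsnprintf(msg, sizeof(msg), err, params);
	fprintf(stderr, "myprog fatal: %s\n", msg);
	exit(128);
}

int main(int argc, char **argv)
{
	set_die_routine(my_die);

	if (argc < 2) {
		warning("no arguments given");
		error("nothing to do");	/* prints "error: ..." and returns -1 */
		return 1;
	}
	die("unexpected argument: %s", argv[1]);
}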
diff --git a/Documentation/perf_counter/util/util.h b/Documentation/perf_counter/util/util.h
new file mode 100644
index 000000000000..36e40c38e093
--- /dev/null
+++ b/Documentation/perf_counter/util/util.h
@@ -0,0 +1,408 @@
1#ifndef GIT_COMPAT_UTIL_H
2#define GIT_COMPAT_UTIL_H
3
4#define _FILE_OFFSET_BITS 64
5
6#ifndef FLEX_ARRAY
7/*
8 * See if our compiler is known to support flexible array members.
9 */
10#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
11# define FLEX_ARRAY /* empty */
12#elif defined(__GNUC__)
13# if (__GNUC__ >= 3)
14# define FLEX_ARRAY /* empty */
15# else
16# define FLEX_ARRAY 0 /* older GNU extension */
17# endif
18#endif
19
20/*
21 * Otherwise, default to safer but a bit wasteful traditional style
22 */
23#ifndef FLEX_ARRAY
24# define FLEX_ARRAY 1
25#endif
26#endif
27
28#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0]))
29
30#ifdef __GNUC__
31#define TYPEOF(x) (__typeof__(x))
32#else
33#define TYPEOF(x)
34#endif
35
36#define MSB(x, bits) ((x) & TYPEOF(x)(~0ULL << (sizeof(x) * 8 - (bits))))
37#define HAS_MULTI_BITS(i) ((i) & ((i) - 1)) /* checks if an integer has more than 1 bit set */
38
39/* Approximation of the length of the decimal representation of this type. */
40#define decimal_length(x) ((int)(sizeof(x) * 2.56 + 0.5) + 1)
41
42#if !defined(__APPLE__) && !defined(__FreeBSD__) && !defined(__USLC__) && !defined(_M_UNIX)
43#define _XOPEN_SOURCE 600 /* glibc2 and AIX 5.3L need 500, OpenBSD needs 600 for S_ISLNK() */
44#define _XOPEN_SOURCE_EXTENDED 1 /* AIX 5.3L needs this */
45#endif
46#define _ALL_SOURCE 1
47#define _GNU_SOURCE 1
48#define _BSD_SOURCE 1
49
50#include <unistd.h>
51#include <stdio.h>
52#include <sys/stat.h>
53#include <fcntl.h>
54#include <stddef.h>
55#include <stdlib.h>
56#include <stdarg.h>
57#include <string.h>
58#include <errno.h>
59#include <limits.h>
60#include <sys/param.h>
61#include <sys/types.h>
62#include <dirent.h>
63#include <sys/time.h>
64#include <time.h>
65#include <signal.h>
66#include <fnmatch.h>
67#include <assert.h>
68#include <regex.h>
69#include <utime.h>
70#ifndef __MINGW32__
71#include <sys/wait.h>
72#include <sys/poll.h>
73#include <sys/socket.h>
74#include <sys/ioctl.h>
75#ifndef NO_SYS_SELECT_H
76#include <sys/select.h>
77#endif
78#include <netinet/in.h>
79#include <netinet/tcp.h>
80#include <arpa/inet.h>
81#include <netdb.h>
82#include <pwd.h>
83#include <inttypes.h>
84#if defined(__CYGWIN__)
85#undef _XOPEN_SOURCE
86#include <grp.h>
87#define _XOPEN_SOURCE 600
88#include "compat/cygwin.h"
89#else
90#undef _ALL_SOURCE /* AIX 5.3L defines a struct list with _ALL_SOURCE. */
91#include <grp.h>
92#define _ALL_SOURCE 1
93#endif
94#else /* __MINGW32__ */
95/* pull in Windows compatibility stuff */
96#include "compat/mingw.h"
97#endif /* __MINGW32__ */
98
99#ifndef NO_ICONV
100#include <iconv.h>
101#endif
102
103#ifndef NO_OPENSSL
104#include <openssl/ssl.h>
105#include <openssl/err.h>
106#endif
107
108/* On most systems <limits.h> would have given us this, but
109 * not on some systems (e.g. GNU/Hurd).
110 */
111#ifndef PATH_MAX
112#define PATH_MAX 4096
113#endif
114
115#ifndef PRIuMAX
116#define PRIuMAX "llu"
117#endif
118
119#ifndef PRIu32
120#define PRIu32 "u"
121#endif
122
123#ifndef PRIx32
124#define PRIx32 "x"
125#endif
126
127#ifndef PATH_SEP
128#define PATH_SEP ':'
129#endif
130
131#ifndef STRIP_EXTENSION
132#define STRIP_EXTENSION ""
133#endif
134
135#ifndef has_dos_drive_prefix
136#define has_dos_drive_prefix(path) 0
137#endif
138
139#ifndef is_dir_sep
140#define is_dir_sep(c) ((c) == '/')
141#endif
142
143#ifdef __GNUC__
144#define NORETURN __attribute__((__noreturn__))
145#else
146#define NORETURN
147#ifndef __attribute__
148#define __attribute__(x)
149#endif
150#endif
151
152/* General helper functions */
153extern void usage(const char *err) NORETURN;
154extern void die(const char *err, ...) NORETURN __attribute__((format (printf, 1, 2)));
155extern int error(const char *err, ...) __attribute__((format (printf, 1, 2)));
156extern void warning(const char *err, ...) __attribute__((format (printf, 1, 2)));
157
158extern void set_die_routine(void (*routine)(const char *err, va_list params) NORETURN);
159
160extern int prefixcmp(const char *str, const char *prefix);
161extern time_t tm_to_time_t(const struct tm *tm);
162
163static inline const char *skip_prefix(const char *str, const char *prefix)
164{
165 size_t len = strlen(prefix);
166 return strncmp(str, prefix, len) ? NULL : str + len;
167}
168
169#if defined(NO_MMAP) || defined(USE_WIN32_MMAP)
170
171#ifndef PROT_READ
172#define PROT_READ 1
173#define PROT_WRITE 2
174#define MAP_PRIVATE 1
175#define MAP_FAILED ((void*)-1)
176#endif
177
178#define mmap git_mmap
179#define munmap git_munmap
180extern void *git_mmap(void *start, size_t length, int prot, int flags, int fd, off_t offset);
181extern int git_munmap(void *start, size_t length);
182
183#else /* NO_MMAP || USE_WIN32_MMAP */
184
185#include <sys/mman.h>
186
187#endif /* NO_MMAP || USE_WIN32_MMAP */
188
189#ifdef NO_MMAP
190
191/* This value must be multiple of (pagesize * 2) */
192#define DEFAULT_PACKED_GIT_WINDOW_SIZE (1 * 1024 * 1024)
193
194#else /* NO_MMAP */
195
196/* This value must be multiple of (pagesize * 2) */
197#define DEFAULT_PACKED_GIT_WINDOW_SIZE \
198 (sizeof(void*) >= 8 \
199 ? 1 * 1024 * 1024 * 1024 \
200 : 32 * 1024 * 1024)
201
202#endif /* NO_MMAP */
203
204#ifdef NO_ST_BLOCKS_IN_STRUCT_STAT
205#define on_disk_bytes(st) ((st).st_size)
206#else
207#define on_disk_bytes(st) ((st).st_blocks * 512)
208#endif
209
210#define DEFAULT_PACKED_GIT_LIMIT \
211 ((1024L * 1024L) * (sizeof(void*) >= 8 ? 8192 : 256))
212
213#ifdef NO_PREAD
214#define pread git_pread
215extern ssize_t git_pread(int fd, void *buf, size_t count, off_t offset);
216#endif
217/*
218 * Forward decl that will remind us if its twin in cache.h changes.
219 * This function is used in compat/pread.c. But we can't include
220 * cache.h there.
221 */
222extern ssize_t read_in_full(int fd, void *buf, size_t count);
223
224#ifdef NO_SETENV
225#define setenv gitsetenv
226extern int gitsetenv(const char *, const char *, int);
227#endif
228
229#ifdef NO_MKDTEMP
230#define mkdtemp gitmkdtemp
231extern char *gitmkdtemp(char *);
232#endif
233
234#ifdef NO_UNSETENV
235#define unsetenv gitunsetenv
236extern void gitunsetenv(const char *);
237#endif
238
239#ifdef NO_STRCASESTR
240#define strcasestr gitstrcasestr
241extern char *gitstrcasestr(const char *haystack, const char *needle);
242#endif
243
244#ifdef NO_STRLCPY
245#define strlcpy gitstrlcpy
246extern size_t gitstrlcpy(char *, const char *, size_t);
247#endif
248
249#ifdef NO_STRTOUMAX
250#define strtoumax gitstrtoumax
251extern uintmax_t gitstrtoumax(const char *, char **, int);
252#endif
253
254#ifdef NO_HSTRERROR
255#define hstrerror githstrerror
256extern const char *githstrerror(int herror);
257#endif
258
259#ifdef NO_MEMMEM
260#define memmem gitmemmem
261void *gitmemmem(const void *haystack, size_t haystacklen,
262 const void *needle, size_t needlelen);
263#endif
264
265#ifdef FREAD_READS_DIRECTORIES
266#ifdef fopen
267#undef fopen
268#endif
269#define fopen(a,b) git_fopen(a,b)
270extern FILE *git_fopen(const char*, const char*);
271#endif
272
273#ifdef SNPRINTF_RETURNS_BOGUS
274#define snprintf git_snprintf
275extern int git_snprintf(char *str, size_t maxsize,
276 const char *format, ...);
277#define vsnprintf git_vsnprintf
278extern int git_vsnprintf(char *str, size_t maxsize,
279 const char *format, va_list ap);
280#endif
281
282#ifdef __GLIBC_PREREQ
283#if __GLIBC_PREREQ(2, 1)
284#define HAVE_STRCHRNUL
285#endif
286#endif
287
288#ifndef HAVE_STRCHRNUL
289#define strchrnul gitstrchrnul
290static inline char *gitstrchrnul(const char *s, int c)
291{
292 while (*s && *s != c)
293 s++;
294 return (char *)s;
295}
296#endif
297
298/*
299 * Wrappers:
300 */
301extern char *xstrdup(const char *str);
302extern void *xmalloc(size_t size);
303extern void *xmemdupz(const void *data, size_t len);
304extern char *xstrndup(const char *str, size_t len);
305extern void *xrealloc(void *ptr, size_t size);
306extern void *xcalloc(size_t nmemb, size_t size);
307extern void *xmmap(void *start, size_t length, int prot, int flags, int fd, off_t offset);
308extern ssize_t xread(int fd, void *buf, size_t len);
309extern ssize_t xwrite(int fd, const void *buf, size_t len);
310extern int xdup(int fd);
311extern FILE *xfdopen(int fd, const char *mode);
312static inline size_t xsize_t(off_t len)
313{
314 return (size_t)len;
315}
316
317static inline int has_extension(const char *filename, const char *ext)
318{
319 size_t len = strlen(filename);
320 size_t extlen = strlen(ext);
321 return len > extlen && !memcmp(filename + len - extlen, ext, extlen);
322}
323
324/* Sane ctype - no locale, and works with signed chars */
325#undef isascii
326#undef isspace
327#undef isdigit
328#undef isalpha
329#undef isalnum
330#undef tolower
331#undef toupper
332extern unsigned char sane_ctype[256];
333#define GIT_SPACE 0x01
334#define GIT_DIGIT 0x02
335#define GIT_ALPHA 0x04
336#define GIT_GLOB_SPECIAL 0x08
337#define GIT_REGEX_SPECIAL 0x10
338#define sane_istest(x,mask) ((sane_ctype[(unsigned char)(x)] & (mask)) != 0)
339#define isascii(x) (((x) & ~0x7f) == 0)
340#define isspace(x) sane_istest(x,GIT_SPACE)
341#define isdigit(x) sane_istest(x,GIT_DIGIT)
342#define isalpha(x) sane_istest(x,GIT_ALPHA)
343#define isalnum(x) sane_istest(x,GIT_ALPHA | GIT_DIGIT)
344#define is_glob_special(x) sane_istest(x,GIT_GLOB_SPECIAL)
345#define is_regex_special(x) sane_istest(x,GIT_GLOB_SPECIAL | GIT_REGEX_SPECIAL)
346#define tolower(x) sane_case((unsigned char)(x), 0x20)
347#define toupper(x) sane_case((unsigned char)(x), 0)
348
349static inline int sane_case(int x, int high)
350{
351 if (sane_istest(x, GIT_ALPHA))
352 x = (x & ~0x20) | high;
353 return x;
354}
355
356static inline int strtoul_ui(char const *s, int base, unsigned int *result)
357{
358 unsigned long ul;
359 char *p;
360
361 errno = 0;
362 ul = strtoul(s, &p, base);
363 if (errno || *p || p == s || (unsigned int) ul != ul)
364 return -1;
365 *result = ul;
366 return 0;
367}
368
369static inline int strtol_i(char const *s, int base, int *result)
370{
371 long ul;
372 char *p;
373
374 errno = 0;
375 ul = strtol(s, &p, base);
376 if (errno || *p || p == s || (int) ul != ul)
377 return -1;
378 *result = ul;
379 return 0;
380}
381
382#ifdef INTERNAL_QSORT
383void git_qsort(void *base, size_t nmemb, size_t size,
384 int(*compar)(const void *, const void *));
385#define qsort git_qsort
386#endif
387
388#ifndef DIR_HAS_BSD_GROUP_SEMANTICS
389# define FORCE_DIR_SET_GID S_ISGID
390#else
391# define FORCE_DIR_SET_GID 0
392#endif
393
394#ifdef NO_NSEC
395#undef USE_NSEC
396#define ST_CTIME_NSEC(st) 0
397#define ST_MTIME_NSEC(st) 0
398#else
399#ifdef USE_ST_TIMESPEC
400#define ST_CTIME_NSEC(st) ((unsigned int)((st).st_ctimespec.tv_nsec))
401#define ST_MTIME_NSEC(st) ((unsigned int)((st).st_mtimespec.tv_nsec))
402#else
403#define ST_CTIME_NSEC(st) ((unsigned int)((st).st_ctim.tv_nsec))
404#define ST_MTIME_NSEC(st) ((unsigned int)((st).st_mtim.tv_nsec))
405#endif
406#endif
407
408#endif
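To illustrate the locale-independent ctype macros and the checked strtoul_ui() helper declared above, here is a small sketch, assuming util.h is on the include path and that util/ctype.c (for sane_ctype[]) and util/usage.c (for die()) are linked in; count_digits() is a made-up helper:

#include "util.h"

/* Count decimal digits using the signed-char-safe isdigit() from above. */
static int count_digits(const char *s)
{
	int n = 0;

	for (; *s; s++)
		if (isdigit(*s))
			n++;
	return n;
}

int main(void)
{
	const char *str = "4096";
	unsigned int value;

	if (strtoul_ui(str, 10, &value))
		die("'%s' is not a valid unsigned integer", str);
	printf("%s: %d digits, value %u\n", str, count_digits(str), value);
	return 0;
}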
diff --git a/Documentation/perf_counter/util/wrapper.c b/Documentation/perf_counter/util/wrapper.c
new file mode 100644
index 000000000000..6350d65f6d9e
--- /dev/null
+++ b/Documentation/perf_counter/util/wrapper.c
@@ -0,0 +1,206 @@
1/*
2 * Various trivial helper wrappers around standard functions
3 */
4#include "cache.h"
5
6/*
7 * There's no pack memory to release - but stay close to the Git
8 * version so wrap this away:
9 */
10static inline void release_pack_memory(size_t size, int flag)
11{
12}
13
14char *xstrdup(const char *str)
15{
16 char *ret = strdup(str);
17 if (!ret) {
18 release_pack_memory(strlen(str) + 1, -1);
19 ret = strdup(str);
20 if (!ret)
21 die("Out of memory, strdup failed");
22 }
23 return ret;
24}
25
26void *xmalloc(size_t size)
27{
28 void *ret = malloc(size);
29 if (!ret && !size)
30 ret = malloc(1);
31 if (!ret) {
32 release_pack_memory(size, -1);
33 ret = malloc(size);
34 if (!ret && !size)
35 ret = malloc(1);
36 if (!ret)
37 die("Out of memory, malloc failed");
38 }
39#ifdef XMALLOC_POISON
40 memset(ret, 0xA5, size);
41#endif
42 return ret;
43}
44
45/*
46 * xmemdupz() allocates (len + 1) bytes of memory, duplicates "len" bytes of
47 * "data" to the allocated memory, zero terminates the allocated memory,
48 * and returns a pointer to the allocated memory. If the allocation fails,
49 * the program dies.
50 */
51void *xmemdupz(const void *data, size_t len)
52{
53 char *p = xmalloc(len + 1);
54 memcpy(p, data, len);
55 p[len] = '\0';
56 return p;
57}
58
59char *xstrndup(const char *str, size_t len)
60{
61 char *p = memchr(str, '\0', len);
62 return xmemdupz(str, p ? p - str : len);
63}
64
65void *xrealloc(void *ptr, size_t size)
66{
67 void *ret = realloc(ptr, size);
68 if (!ret && !size)
69 ret = realloc(ptr, 1);
70 if (!ret) {
71 release_pack_memory(size, -1);
72 ret = realloc(ptr, size);
73 if (!ret && !size)
74 ret = realloc(ptr, 1);
75 if (!ret)
76 die("Out of memory, realloc failed");
77 }
78 return ret;
79}
80
81void *xcalloc(size_t nmemb, size_t size)
82{
83 void *ret = calloc(nmemb, size);
84 if (!ret && (!nmemb || !size))
85 ret = calloc(1, 1);
86 if (!ret) {
87 release_pack_memory(nmemb * size, -1);
88 ret = calloc(nmemb, size);
89 if (!ret && (!nmemb || !size))
90 ret = calloc(1, 1);
91 if (!ret)
92 die("Out of memory, calloc failed");
93 }
94 return ret;
95}
96
97void *xmmap(void *start, size_t length,
98 int prot, int flags, int fd, off_t offset)
99{
100 void *ret = mmap(start, length, prot, flags, fd, offset);
101 if (ret == MAP_FAILED) {
102 if (!length)
103 return NULL;
104 release_pack_memory(length, fd);
105 ret = mmap(start, length, prot, flags, fd, offset);
106 if (ret == MAP_FAILED)
107 die("Out of memory? mmap failed: %s", strerror(errno));
108 }
109 return ret;
110}
111
112/*
113 * xread() is the same as read(), but it automatically restarts read()
114 * operations with a recoverable error (EAGAIN or EINTR). xread()
115 * DOES NOT GUARANTEE that "len" bytes are read even if the data is available.
116 */
117ssize_t xread(int fd, void *buf, size_t len)
118{
119 ssize_t nr;
120 while (1) {
121 nr = read(fd, buf, len);
122 if ((nr < 0) && (errno == EAGAIN || errno == EINTR))
123 continue;
124 return nr;
125 }
126}
127
128/*
129 * xwrite() is the same as write(), but it automatically restarts write()
130 * operations with a recoverable error (EAGAIN or EINTR). xwrite() DOES NOT
131 * GUARANTEE that "len" bytes are written even if the operation is successful.
132 */
133ssize_t xwrite(int fd, const void *buf, size_t len)
134{
135 ssize_t nr;
136 while (1) {
137 nr = write(fd, buf, len);
138 if ((nr < 0) && (errno == EAGAIN || errno == EINTR))
139 continue;
140 return nr;
141 }
142}
143
144ssize_t read_in_full(int fd, void *buf, size_t count)
145{
146 char *p = buf;
147 ssize_t total = 0;
148
149 while (count > 0) {
150 ssize_t loaded = xread(fd, p, count);
151 if (loaded <= 0)
152 return total ? total : loaded;
153 count -= loaded;
154 p += loaded;
155 total += loaded;
156 }
157
158 return total;
159}
160
161ssize_t write_in_full(int fd, const void *buf, size_t count)
162{
163 const char *p = buf;
164 ssize_t total = 0;
165
166 while (count > 0) {
167 ssize_t written = xwrite(fd, p, count);
168 if (written < 0)
169 return -1;
170 if (!written) {
171 errno = ENOSPC;
172 return -1;
173 }
174 count -= written;
175 p += written;
176 total += written;
177 }
178
179 return total;
180}
181
182int xdup(int fd)
183{
184 int ret = dup(fd);
185 if (ret < 0)
186 die("dup failed: %s", strerror(errno));
187 return ret;
188}
189
190FILE *xfdopen(int fd, const char *mode)
191{
192 FILE *stream = fdopen(fd, mode);
193 if (stream == NULL)
194 die("Out of memory? fdopen failed: %s", strerror(errno));
195 return stream;
196}
197
198int xmkstemp(char *template)
199{
200 int fd;
201
202 fd = mkstemp(template);
203 if (fd < 0)
204 die("Unable to create temporary file: %s", strerror(errno));
205 return fd;
206}
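A hedged sketch of how the restartable I/O wrappers above are meant to be used, assuming the declarations of read_in_full() and write_in_full() come from cache.h as in the Git tree this code was copied from, and that any short transfer should be fatal; copy_exact() is a hypothetical helper:

#include "cache.h"

/* Copy exactly 'size' bytes between two file descriptors, or die trying. */
static void copy_exact(int from_fd, int to_fd, size_t size)
{
	char buf[4096];

	while (size > 0) {
		size_t chunk = size < sizeof(buf) ? size : sizeof(buf);

		/* read_in_full() loops over xread() until 'chunk' bytes arrive. */
		if (read_in_full(from_fd, buf, chunk) != (ssize_t)chunk)
			die("short read");
		/* write_in_full() loops over xwrite() and fails on ENOSPC. */
		if (write_in_full(to_fd, buf, chunk) != (ssize_t)chunk)
			die("short write: %s", strerror(errno));
		size -= chunk;
	}
}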
diff --git a/MAINTAINERS b/MAINTAINERS
index c547f4a2bb62..5114b5341df4 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4375,6 +4375,16 @@ S: Maintained
4375F: include/linux/delayacct.h 4375F: include/linux/delayacct.h
4376F: kernel/delayacct.c 4376F: kernel/delayacct.c
4377 4377
4378PERFORMANCE COUNTER SUBSYSTEM
4379P: Peter Zijlstra
4380M: a.p.zijlstra@chello.nl
4381P: Paul Mackerras
4382M: paulus@samba.org
4383P: Ingo Molnar
4384M: mingo@elte.hu
4385L: linux-kernel@vger.kernel.org
4386S: Supported
4387
4378PERSONALITY HANDLING 4388PERSONALITY HANDLING
4379P: Christoph Hellwig 4389P: Christoph Hellwig
4380M: hch@infradead.org 4390M: hch@infradead.org
diff --git a/Makefile b/Makefile
index 9e5dc8f0ef47..eb38c8399261 100644
--- a/Makefile
+++ b/Makefile
@@ -1,8 +1,8 @@
1VERSION = 2 1VERSION = 2
2PATCHLEVEL = 6 2PATCHLEVEL = 6
3SUBLEVEL = 30 3SUBLEVEL = 30
4EXTRAVERSION = -rc3 4EXTRAVERSION = -rc4
5NAME = Temporary Tasmanian Devil 5NAME = Vindictive Armadillo
6 6
7# *DOCUMENTATION* 7# *DOCUMENTATION*
8# To see a list of typical targets execute "make help" 8# To see a list of typical targets execute "make help"
diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
index b7e034b0a6dd..20a44d0c9fdd 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -131,5 +131,44 @@ static inline int irqs_disabled_flags(unsigned long flags)
131 */ 131 */
132struct irq_chip; 132struct irq_chip;
133 133
134#ifdef CONFIG_PERF_COUNTERS
135static inline unsigned long test_perf_counter_pending(void)
136{
137 unsigned long x;
138
139 asm volatile("lbz %0,%1(13)"
140 : "=r" (x)
141 : "i" (offsetof(struct paca_struct, perf_counter_pending)));
142 return x;
143}
144
145static inline void set_perf_counter_pending(void)
146{
147 asm volatile("stb %0,%1(13)" : :
148 "r" (1),
149 "i" (offsetof(struct paca_struct, perf_counter_pending)));
150}
151
152static inline void clear_perf_counter_pending(void)
153{
154 asm volatile("stb %0,%1(13)" : :
155 "r" (0),
156 "i" (offsetof(struct paca_struct, perf_counter_pending)));
157}
158
159extern void perf_counter_do_pending(void);
160
161#else
162
163static inline unsigned long test_perf_counter_pending(void)
164{
165 return 0;
166}
167
168static inline void set_perf_counter_pending(void) {}
169static inline void clear_perf_counter_pending(void) {}
170static inline void perf_counter_do_pending(void) {}
171#endif /* CONFIG_PERF_COUNTERS */
172
134#endif /* __KERNEL__ */ 173#endif /* __KERNEL__ */
135#endif /* _ASM_POWERPC_HW_IRQ_H */ 174#endif /* _ASM_POWERPC_HW_IRQ_H */
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 082b3aedf145..6ef055723019 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -99,6 +99,7 @@ struct paca_struct {
99 u8 soft_enabled; /* irq soft-enable flag */ 99 u8 soft_enabled; /* irq soft-enable flag */
100 u8 hard_enabled; /* set if irqs are enabled in MSR */ 100 u8 hard_enabled; /* set if irqs are enabled in MSR */
101 u8 io_sync; /* writel() needs spin_unlock sync */ 101 u8 io_sync; /* writel() needs spin_unlock sync */
102 u8 perf_counter_pending; /* PM interrupt while soft-disabled */
102 103
103 /* Stuff for accurate time accounting */ 104 /* Stuff for accurate time accounting */
104 u64 user_time; /* accumulated usermode TB ticks */ 105 u64 user_time; /* accumulated usermode TB ticks */
diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h
new file mode 100644
index 000000000000..56d66c38143b
--- /dev/null
+++ b/arch/powerpc/include/asm/perf_counter.h
@@ -0,0 +1,83 @@
1/*
2 * Performance counter support - PowerPC-specific definitions.
3 *
4 * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/types.h>
12
13#define MAX_HWCOUNTERS 8
14#define MAX_EVENT_ALTERNATIVES 8
15#define MAX_LIMITED_HWCOUNTERS 2
16
17/*
18 * This struct provides the constants and functions needed to
19 * describe the PMU on a particular POWER-family CPU.
20 */
21struct power_pmu {
22 int n_counter;
23 int max_alternatives;
24 u64 add_fields;
25 u64 test_adder;
26 int (*compute_mmcr)(unsigned int events[], int n_ev,
27 unsigned int hwc[], u64 mmcr[]);
28 int (*get_constraint)(unsigned int event, u64 *mskp, u64 *valp);
29 int (*get_alternatives)(unsigned int event, unsigned int flags,
30 unsigned int alt[]);
31 void (*disable_pmc)(unsigned int pmc, u64 mmcr[]);
32 int (*limited_pmc_event)(unsigned int event);
33 int limited_pmc5_6; /* PMC5 and PMC6 have limited function */
34 int n_generic;
35 int *generic_events;
36};
37
38extern struct power_pmu *ppmu;
39
40/*
41 * Values for flags to get_alternatives()
42 */
43#define PPMU_LIMITED_PMC_OK 1 /* can put this on a limited PMC */
44#define PPMU_LIMITED_PMC_REQD 2 /* have to put this on a limited PMC */
45#define PPMU_ONLY_COUNT_RUN 4 /* only counting in run state */
46
47/*
48 * The power_pmu.get_constraint function returns a 64-bit value and
49 * a 64-bit mask that express the constraints between this event and
50 * other events.
51 *
52 * The value and mask are divided up into (non-overlapping) bitfields
53 * of three different types:
54 *
55 * Select field: this expresses the constraint that some set of bits
56 * in MMCR* needs to be set to a specific value for this event. For a
57 * select field, the mask contains 1s in every bit of the field, and
58 * the value contains a unique value for each possible setting of the
59 * MMCR* bits. The constraint checking code will ensure that two events
60 * that set the same field in their masks have the same value in their
61 * value dwords.
62 *
63 * Add field: this expresses the constraint that there can be at most
64 * N events in a particular class. A field of k bits can be used for
65 * N <= 2^(k-1) - 1. The mask has the most significant bit of the field
66 * set (and the other bits 0), and the value has only the least significant
67 * bit of the field set. In addition, the 'add_fields' and 'test_adder'
68 * in the struct power_pmu for this processor come into play. The
69 * add_fields value contains 1 in the LSB of the field, and the
70 * test_adder contains 2^(k-1) - 1 - N in the field.
71 *
72 * NAND field: this expresses the constraint that you may not have events
73 * in all of a set of classes. (For example, on PPC970, you can't select
74 * events from the FPU, ISU and IDU simultaneously, although any two are
75 * possible.) For N classes, the field is N+1 bits wide, and each class
76 * is assigned one bit from the least-significant N bits. The mask has
77 * only the most-significant bit set, and the value has only the bit
78 * for the event's class set. The test_adder has the least significant
79 * bit set in the field.
80 *
81 * If an event is not subject to the constraint expressed by a particular
82 * field, then it will have 0 in both the mask and value for that field.
83 */
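The add-field arithmetic above is easiest to see with numbers. Below is a standalone sketch (not kernel code) that emulates the check later performed by power_check_constraints() for a single hypothetical 3-bit class field that allows at most N = 2 events, so test_adder holds 2^(k-1) - 1 - N = 1:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Hypothetical 3-bit add field at bits 0..2, at most N = 2 events. */
	const uint64_t amask  = 0x4;	/* MSB of the field */
	const uint64_t avalue = 0x1;	/* LSB of the field */
	const uint64_t addf   = 0x1;	/* add_fields: 1 in the field's LSB */
	const uint64_t tadd   = 0x1;	/* test_adder: 2^(k-1) - 1 - N */
	uint64_t value = 0, mask = 0, nv;
	int i;

	for (i = 1; i <= 3; i++) {
		/* Same arithmetic as the feasibility check in perf_counter.c. */
		nv = (value | avalue) + (value & avalue & addf);
		if ((((nv + tadd) ^ value) & mask) ||
		    (((nv + tadd) ^ avalue) & amask)) {
			printf("event %d rejected, the class is full\n", i);
			break;
		}
		value = nv;
		mask |= amask;
		printf("event %d accepted, field value now %llu\n",
		       i, (unsigned long long)value);
	}
	return 0;
}

The first two events are accepted (the field counts up to 2); adding a third makes value + test_adder spill into the field's most significant bit, which the mask test catches.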
diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h
index d98a30dfd41c..a0b92de51c7e 100644
--- a/arch/powerpc/include/asm/systbl.h
+++ b/arch/powerpc/include/asm/systbl.h
@@ -322,6 +322,6 @@ SYSCALL_SPU(epoll_create1)
322SYSCALL_SPU(dup3) 322SYSCALL_SPU(dup3)
323SYSCALL_SPU(pipe2) 323SYSCALL_SPU(pipe2)
324SYSCALL(inotify_init1) 324SYSCALL(inotify_init1)
325SYSCALL(ni_syscall) 325SYSCALL_SPU(perf_counter_open)
326COMPAT_SYS_SPU(preadv) 326COMPAT_SYS_SPU(preadv)
327COMPAT_SYS_SPU(pwritev) 327COMPAT_SYS_SPU(pwritev)
diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h
index 3f06f8ec81c5..4badac2d11d1 100644
--- a/arch/powerpc/include/asm/unistd.h
+++ b/arch/powerpc/include/asm/unistd.h
@@ -341,6 +341,7 @@
341#define __NR_dup3 316 341#define __NR_dup3 316
342#define __NR_pipe2 317 342#define __NR_pipe2 317
343#define __NR_inotify_init1 318 343#define __NR_inotify_init1 318
344#define __NR_perf_counter_open 319
344#define __NR_preadv 320 345#define __NR_preadv 320
345#define __NR_pwritev 321 346#define __NR_pwritev 321
346 347
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 71901fbda4a5..9ba1bb731fcc 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -94,6 +94,8 @@ obj64-$(CONFIG_AUDIT) += compat_audit.o
94 94
95obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o 95obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
96obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o 96obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
97obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o power4-pmu.o ppc970-pmu.o \
98 power5-pmu.o power5+-pmu.o power6-pmu.o
97 99
98obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o 100obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o
99 101
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 1e40bc053946..e981d1ce1914 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -131,6 +131,7 @@ int main(void)
131 DEFINE(PACAKMSR, offsetof(struct paca_struct, kernel_msr)); 131 DEFINE(PACAKMSR, offsetof(struct paca_struct, kernel_msr));
132 DEFINE(PACASOFTIRQEN, offsetof(struct paca_struct, soft_enabled)); 132 DEFINE(PACASOFTIRQEN, offsetof(struct paca_struct, soft_enabled));
133 DEFINE(PACAHARDIRQEN, offsetof(struct paca_struct, hard_enabled)); 133 DEFINE(PACAHARDIRQEN, offsetof(struct paca_struct, hard_enabled));
134 DEFINE(PACAPERFPEND, offsetof(struct paca_struct, perf_counter_pending));
134 DEFINE(PACASLBCACHE, offsetof(struct paca_struct, slb_cache)); 135 DEFINE(PACASLBCACHE, offsetof(struct paca_struct, slb_cache));
135 DEFINE(PACASLBCACHEPTR, offsetof(struct paca_struct, slb_cache_ptr)); 136 DEFINE(PACASLBCACHEPTR, offsetof(struct paca_struct, slb_cache_ptr));
136 DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id)); 137 DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id));
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index abfc32330479..43e073477c34 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -526,6 +526,15 @@ ALT_FW_FTR_SECTION_END_IFCLR(FW_FEATURE_ISERIES)
5262: 5262:
527 TRACE_AND_RESTORE_IRQ(r5); 527 TRACE_AND_RESTORE_IRQ(r5);
528 528
529#ifdef CONFIG_PERF_COUNTERS
530 /* check paca->perf_counter_pending if we're enabling ints */
531 lbz r3,PACAPERFPEND(r13)
532 and. r3,r3,r5
533 beq 27f
534 bl .perf_counter_do_pending
53527:
536#endif /* CONFIG_PERF_COUNTERS */
537
529 /* extract EE bit and use it to restore paca->hard_enabled */ 538 /* extract EE bit and use it to restore paca->hard_enabled */
530 ld r3,_MSR(r1) 539 ld r3,_MSR(r1)
531 rldicl r4,r3,49,63 /* r0 = (r3 >> 15) & 1 */ 540 rldicl r4,r3,49,63 /* r0 = (r3 >> 15) & 1 */
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 8c1a4966867e..feff792ed0f9 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -135,6 +135,11 @@ notrace void raw_local_irq_restore(unsigned long en)
135 iseries_handle_interrupts(); 135 iseries_handle_interrupts();
136 } 136 }
137 137
138 if (test_perf_counter_pending()) {
139 clear_perf_counter_pending();
140 perf_counter_do_pending();
141 }
142
138 /* 143 /*
139 * if (get_paca()->hard_enabled) return; 144 * if (get_paca()->hard_enabled) return;
140 * But again we need to take care that gcc gets hard_enabled directly 145 * But again we need to take care that gcc gets hard_enabled directly
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
new file mode 100644
index 000000000000..15cdc8e67229
--- /dev/null
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -0,0 +1,1092 @@
1/*
2 * Performance counter support - powerpc architecture code
3 *
4 * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/kernel.h>
12#include <linux/sched.h>
13#include <linux/perf_counter.h>
14#include <linux/percpu.h>
15#include <linux/hardirq.h>
16#include <asm/reg.h>
17#include <asm/pmc.h>
18#include <asm/machdep.h>
19#include <asm/firmware.h>
20
21struct cpu_hw_counters {
22 int n_counters;
23 int n_percpu;
24 int disabled;
25 int n_added;
26 int n_limited;
27 u8 pmcs_enabled;
28 struct perf_counter *counter[MAX_HWCOUNTERS];
29 unsigned int events[MAX_HWCOUNTERS];
30 unsigned int flags[MAX_HWCOUNTERS];
31 u64 mmcr[3];
32 struct perf_counter *limited_counter[MAX_LIMITED_HWCOUNTERS];
33 u8 limited_hwidx[MAX_LIMITED_HWCOUNTERS];
34};
35DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
36
37struct power_pmu *ppmu;
38
39/*
40 * Normally, to ignore kernel events we set the FCS (freeze counters
41 * in supervisor mode) bit in MMCR0, but if the kernel runs with the
42 * hypervisor bit set in the MSR, or if we are running on a processor
43 * where the hypervisor bit is forced to 1 (as on Apple G5 processors),
44 * then we need to use the FCHV bit to ignore kernel events.
45 */
46static unsigned int freeze_counters_kernel = MMCR0_FCS;
47
48static void perf_counter_interrupt(struct pt_regs *regs);
49
50void perf_counter_print_debug(void)
51{
52}
53
54/*
55 * Read one performance monitor counter (PMC).
56 */
57static unsigned long read_pmc(int idx)
58{
59 unsigned long val;
60
61 switch (idx) {
62 case 1:
63 val = mfspr(SPRN_PMC1);
64 break;
65 case 2:
66 val = mfspr(SPRN_PMC2);
67 break;
68 case 3:
69 val = mfspr(SPRN_PMC3);
70 break;
71 case 4:
72 val = mfspr(SPRN_PMC4);
73 break;
74 case 5:
75 val = mfspr(SPRN_PMC5);
76 break;
77 case 6:
78 val = mfspr(SPRN_PMC6);
79 break;
80 case 7:
81 val = mfspr(SPRN_PMC7);
82 break;
83 case 8:
84 val = mfspr(SPRN_PMC8);
85 break;
86 default:
87 printk(KERN_ERR "oops trying to read PMC%d\n", idx);
88 val = 0;
89 }
90 return val;
91}
92
93/*
94 * Write one PMC.
95 */
96static void write_pmc(int idx, unsigned long val)
97{
98 switch (idx) {
99 case 1:
100 mtspr(SPRN_PMC1, val);
101 break;
102 case 2:
103 mtspr(SPRN_PMC2, val);
104 break;
105 case 3:
106 mtspr(SPRN_PMC3, val);
107 break;
108 case 4:
109 mtspr(SPRN_PMC4, val);
110 break;
111 case 5:
112 mtspr(SPRN_PMC5, val);
113 break;
114 case 6:
115 mtspr(SPRN_PMC6, val);
116 break;
117 case 7:
118 mtspr(SPRN_PMC7, val);
119 break;
120 case 8:
121 mtspr(SPRN_PMC8, val);
122 break;
123 default:
124 printk(KERN_ERR "oops trying to write PMC%d\n", idx);
125 }
126}
127
128/*
129 * Check if a set of events can all go on the PMU at once.
130 * If they can't, this will look at alternative codes for the events
131 * and see if any combination of alternative codes is feasible.
132 * The feasible set is returned in event[].
133 */
134static int power_check_constraints(unsigned int event[], unsigned int cflags[],
135 int n_ev)
136{
137 u64 mask, value, nv;
138 unsigned int alternatives[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
139 u64 amasks[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
140 u64 avalues[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
141 u64 smasks[MAX_HWCOUNTERS], svalues[MAX_HWCOUNTERS];
142 int n_alt[MAX_HWCOUNTERS], choice[MAX_HWCOUNTERS];
143 int i, j;
144 u64 addf = ppmu->add_fields;
145 u64 tadd = ppmu->test_adder;
146
147 if (n_ev > ppmu->n_counter)
148 return -1;
149
150 /* First see if the events will go on as-is */
151 for (i = 0; i < n_ev; ++i) {
152 if ((cflags[i] & PPMU_LIMITED_PMC_REQD)
153 && !ppmu->limited_pmc_event(event[i])) {
154 ppmu->get_alternatives(event[i], cflags[i],
155 alternatives[i]);
156 event[i] = alternatives[i][0];
157 }
158 if (ppmu->get_constraint(event[i], &amasks[i][0],
159 &avalues[i][0]))
160 return -1;
161 }
162 value = mask = 0;
163 for (i = 0; i < n_ev; ++i) {
164 nv = (value | avalues[i][0]) + (value & avalues[i][0] & addf);
165 if ((((nv + tadd) ^ value) & mask) != 0 ||
166 (((nv + tadd) ^ avalues[i][0]) & amasks[i][0]) != 0)
167 break;
168 value = nv;
169 mask |= amasks[i][0];
170 }
171 if (i == n_ev)
172 return 0; /* all OK */
173
174 /* doesn't work, gather alternatives... */
175 if (!ppmu->get_alternatives)
176 return -1;
177 for (i = 0; i < n_ev; ++i) {
178 choice[i] = 0;
179 n_alt[i] = ppmu->get_alternatives(event[i], cflags[i],
180 alternatives[i]);
181 for (j = 1; j < n_alt[i]; ++j)
182 ppmu->get_constraint(alternatives[i][j],
183 &amasks[i][j], &avalues[i][j]);
184 }
185
186 /* enumerate all possibilities and see if any will work */
187 i = 0;
188 j = -1;
189 value = mask = nv = 0;
190 while (i < n_ev) {
191 if (j >= 0) {
192 /* we're backtracking, restore context */
193 value = svalues[i];
194 mask = smasks[i];
195 j = choice[i];
196 }
197 /*
198 * See if any alternative k for event i,
199 * where k > j, will satisfy the constraints.
200 */
201 while (++j < n_alt[i]) {
202 nv = (value | avalues[i][j]) +
203 (value & avalues[i][j] & addf);
204 if ((((nv + tadd) ^ value) & mask) == 0 &&
205 (((nv + tadd) ^ avalues[i][j])
206 & amasks[i][j]) == 0)
207 break;
208 }
209 if (j >= n_alt[i]) {
210 /*
211 * No feasible alternative, backtrack
212 * to event i-1 and continue enumerating its
213 * alternatives from where we got up to.
214 */
215 if (--i < 0)
216 return -1;
217 } else {
218 /*
219 * Found a feasible alternative for event i,
220 * remember where we got up to with this event,
221 * go on to the next event, and start with
222 * the first alternative for it.
223 */
224 choice[i] = j;
225 svalues[i] = value;
226 smasks[i] = mask;
227 value = nv;
228 mask |= amasks[i][j];
229 ++i;
230 j = -1;
231 }
232 }
233
234 /* OK, we have a feasible combination, tell the caller the solution */
235 for (i = 0; i < n_ev; ++i)
236 event[i] = alternatives[i][choice[i]];
237 return 0;
238}
239
240/*
241 * Check if newly-added counters have consistent settings for
242 * exclude_{user,kernel,hv} with each other and any previously
243 * added counters.
244 */
245static int check_excludes(struct perf_counter **ctrs, unsigned int cflags[],
246 int n_prev, int n_new)
247{
248 int eu = 0, ek = 0, eh = 0;
249 int i, n, first;
250 struct perf_counter *counter;
251
252 n = n_prev + n_new;
253 if (n <= 1)
254 return 0;
255
256 first = 1;
257 for (i = 0; i < n; ++i) {
258 if (cflags[i] & PPMU_LIMITED_PMC_OK) {
259 cflags[i] &= ~PPMU_LIMITED_PMC_REQD;
260 continue;
261 }
262 counter = ctrs[i];
263 if (first) {
264 eu = counter->hw_event.exclude_user;
265 ek = counter->hw_event.exclude_kernel;
266 eh = counter->hw_event.exclude_hv;
267 first = 0;
268 } else if (counter->hw_event.exclude_user != eu ||
269 counter->hw_event.exclude_kernel != ek ||
270 counter->hw_event.exclude_hv != eh) {
271 return -EAGAIN;
272 }
273 }
274
275 if (eu || ek || eh)
276 for (i = 0; i < n; ++i)
277 if (cflags[i] & PPMU_LIMITED_PMC_OK)
278 cflags[i] |= PPMU_LIMITED_PMC_REQD;
279
280 return 0;
281}
282
283static void power_pmu_read(struct perf_counter *counter)
284{
285 long val, delta, prev;
286
287 if (!counter->hw.idx)
288 return;
289 /*
290 * Performance monitor interrupts come even when interrupts
291 * are soft-disabled, as long as interrupts are hard-enabled.
292 * Therefore we treat them like NMIs.
293 */
294 do {
295 prev = atomic64_read(&counter->hw.prev_count);
296 barrier();
297 val = read_pmc(counter->hw.idx);
298 } while (atomic64_cmpxchg(&counter->hw.prev_count, prev, val) != prev);
299
300 /* The counters are only 32 bits wide */
301 delta = (val - prev) & 0xfffffffful;
302 atomic64_add(delta, &counter->count);
303 atomic64_sub(delta, &counter->hw.period_left);
304}
305
306/*
307 * On some machines, PMC5 and PMC6 can't be written, don't respect
308 * the freeze conditions, and don't generate interrupts. This tells
309 * us if `counter' is using such a PMC.
310 */
311static int is_limited_pmc(int pmcnum)
312{
313 return ppmu->limited_pmc5_6 && (pmcnum == 5 || pmcnum == 6);
314}
315
316static void freeze_limited_counters(struct cpu_hw_counters *cpuhw,
317 unsigned long pmc5, unsigned long pmc6)
318{
319 struct perf_counter *counter;
320 u64 val, prev, delta;
321 int i;
322
323 for (i = 0; i < cpuhw->n_limited; ++i) {
324 counter = cpuhw->limited_counter[i];
325 if (!counter->hw.idx)
326 continue;
327 val = (counter->hw.idx == 5) ? pmc5 : pmc6;
328 prev = atomic64_read(&counter->hw.prev_count);
329 counter->hw.idx = 0;
330 delta = (val - prev) & 0xfffffffful;
331 atomic64_add(delta, &counter->count);
332 }
333}
334
335static void thaw_limited_counters(struct cpu_hw_counters *cpuhw,
336 unsigned long pmc5, unsigned long pmc6)
337{
338 struct perf_counter *counter;
339 u64 val;
340 int i;
341
342 for (i = 0; i < cpuhw->n_limited; ++i) {
343 counter = cpuhw->limited_counter[i];
344 counter->hw.idx = cpuhw->limited_hwidx[i];
345 val = (counter->hw.idx == 5) ? pmc5 : pmc6;
346 atomic64_set(&counter->hw.prev_count, val);
347 perf_counter_update_userpage(counter);
348 }
349}
350
351/*
352 * Since limited counters don't respect the freeze conditions, we
353 * have to read them immediately after freezing or unfreezing the
354 * other counters. We try to keep the values from the limited
355 * counters as consistent as possible by keeping the delay (in
356 * cycles and instructions) between freezing/unfreezing and reading
357 * the limited counters as small and consistent as possible.
358 * Therefore, if any limited counters are in use, we read them
359 * both, and always in the same order, to minimize variability,
360 * and do it inside the same asm that writes MMCR0.
361 */
362static void write_mmcr0(struct cpu_hw_counters *cpuhw, unsigned long mmcr0)
363{
364 unsigned long pmc5, pmc6;
365
366 if (!cpuhw->n_limited) {
367 mtspr(SPRN_MMCR0, mmcr0);
368 return;
369 }
370
371 /*
372 * Write MMCR0, then read PMC5 and PMC6 immediately.
373 */
374 asm volatile("mtspr %3,%2; mfspr %0,%4; mfspr %1,%5"
375 : "=&r" (pmc5), "=&r" (pmc6)
376 : "r" (mmcr0), "i" (SPRN_MMCR0),
377 "i" (SPRN_PMC5), "i" (SPRN_PMC6));
378
379 if (mmcr0 & MMCR0_FC)
380 freeze_limited_counters(cpuhw, pmc5, pmc6);
381 else
382 thaw_limited_counters(cpuhw, pmc5, pmc6);
383}
384
385/*
386 * Disable all counters to prevent PMU interrupts and to allow
387 * counters to be added or removed.
388 */
389u64 hw_perf_save_disable(void)
390{
391 struct cpu_hw_counters *cpuhw;
392 unsigned long ret;
393 unsigned long flags;
394
395 local_irq_save(flags);
396 cpuhw = &__get_cpu_var(cpu_hw_counters);
397
398 ret = cpuhw->disabled;
399 if (!ret) {
400 cpuhw->disabled = 1;
401 cpuhw->n_added = 0;
402
403 /*
404 * Check if we ever enabled the PMU on this cpu.
405 */
406 if (!cpuhw->pmcs_enabled) {
407 if (ppc_md.enable_pmcs)
408 ppc_md.enable_pmcs();
409 cpuhw->pmcs_enabled = 1;
410 }
411
412 /*
413 * Disable instruction sampling if it was enabled
414 */
415 if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
416 mtspr(SPRN_MMCRA,
417 cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
418 mb();
419 }
420
421 /*
422 * Set the 'freeze counters' bit.
423 * The barrier is to make sure the mtspr has been
424 * executed and the PMU has frozen the counters
425 * before we return.
426 */
427 write_mmcr0(cpuhw, mfspr(SPRN_MMCR0) | MMCR0_FC);
428 mb();
429 }
430 local_irq_restore(flags);
431 return ret;
432}
433
434/*
435 * Re-enable all counters if disable == 0.
436 * If we were previously disabled and counters were added, then
437 * put the new config on the PMU.
438 */
439void hw_perf_restore(u64 disable)
440{
441 struct perf_counter *counter;
442 struct cpu_hw_counters *cpuhw;
443 unsigned long flags;
444 long i;
445 unsigned long val;
446 s64 left;
447 unsigned int hwc_index[MAX_HWCOUNTERS];
448 int n_lim;
449 int idx;
450
451 if (disable)
452 return;
453 local_irq_save(flags);
454 cpuhw = &__get_cpu_var(cpu_hw_counters);
455 cpuhw->disabled = 0;
456
457 /*
458 * If we didn't change anything, or only removed counters,
459 * no need to recalculate MMCR* settings and reset the PMCs.
460 * Just reenable the PMU with the current MMCR* settings
461 * (possibly updated for removal of counters).
462 */
463 if (!cpuhw->n_added) {
464 mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
465 mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
466 if (cpuhw->n_counters == 0)
467 get_lppaca()->pmcregs_in_use = 0;
468 goto out_enable;
469 }
470
471 /*
472 * Compute MMCR* values for the new set of counters
473 */
474 if (ppmu->compute_mmcr(cpuhw->events, cpuhw->n_counters, hwc_index,
475 cpuhw->mmcr)) {
476 /* shouldn't ever get here */
477 printk(KERN_ERR "oops compute_mmcr failed\n");
478 goto out;
479 }
480
481 /*
482 * Add in MMCR0 freeze bits corresponding to the
483 * hw_event.exclude_* bits for the first counter.
484 * We have already checked that all counters have the
485 * same values for these bits as the first counter.
486 */
487 counter = cpuhw->counter[0];
488 if (counter->hw_event.exclude_user)
489 cpuhw->mmcr[0] |= MMCR0_FCP;
490 if (counter->hw_event.exclude_kernel)
491 cpuhw->mmcr[0] |= freeze_counters_kernel;
492 if (counter->hw_event.exclude_hv)
493 cpuhw->mmcr[0] |= MMCR0_FCHV;
494
495 /*
496 * Write the new configuration to MMCR* with the freeze
497 * bit set and set the hardware counters to their initial values.
498 * Then unfreeze the counters.
499 */
500 get_lppaca()->pmcregs_in_use = 1;
501 mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
502 mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
503 mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE))
504 | MMCR0_FC);
505
506 /*
507 * Read off any pre-existing counters that need to move
508 * to another PMC.
509 */
510 for (i = 0; i < cpuhw->n_counters; ++i) {
511 counter = cpuhw->counter[i];
512 if (counter->hw.idx && counter->hw.idx != hwc_index[i] + 1) {
513 power_pmu_read(counter);
514 write_pmc(counter->hw.idx, 0);
515 counter->hw.idx = 0;
516 }
517 }
518
519 /*
520 * Initialize the PMCs for all the new and moved counters.
521 */
522 cpuhw->n_limited = n_lim = 0;
523 for (i = 0; i < cpuhw->n_counters; ++i) {
524 counter = cpuhw->counter[i];
525 if (counter->hw.idx)
526 continue;
527 idx = hwc_index[i] + 1;
528 if (is_limited_pmc(idx)) {
529 cpuhw->limited_counter[n_lim] = counter;
530 cpuhw->limited_hwidx[n_lim] = idx;
531 ++n_lim;
532 continue;
533 }
534 val = 0;
535 if (counter->hw_event.irq_period) {
536 left = atomic64_read(&counter->hw.period_left);
537 if (left < 0x80000000L)
538 val = 0x80000000L - left;
539 }
540 atomic64_set(&counter->hw.prev_count, val);
541 counter->hw.idx = idx;
542 write_pmc(idx, val);
543 perf_counter_update_userpage(counter);
544 }
545 cpuhw->n_limited = n_lim;
546 cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;
547
548 out_enable:
549 mb();
550 write_mmcr0(cpuhw, cpuhw->mmcr[0]);
551
552 /*
553 * Enable instruction sampling if necessary
554 */
555 if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
556 mb();
557 mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
558 }
559
560 out:
561 local_irq_restore(flags);
562}
563
564static int collect_events(struct perf_counter *group, int max_count,
565 struct perf_counter *ctrs[], unsigned int *events,
566 unsigned int *flags)
567{
568 int n = 0;
569 struct perf_counter *counter;
570
571 if (!is_software_counter(group)) {
572 if (n >= max_count)
573 return -1;
574 ctrs[n] = group;
575 flags[n] = group->hw.counter_base;
576 events[n++] = group->hw.config;
577 }
578 list_for_each_entry(counter, &group->sibling_list, list_entry) {
579 if (!is_software_counter(counter) &&
580 counter->state != PERF_COUNTER_STATE_OFF) {
581 if (n >= max_count)
582 return -1;
583 ctrs[n] = counter;
584 flags[n] = counter->hw.counter_base;
585 events[n++] = counter->hw.config;
586 }
587 }
588 return n;
589}
590
591static void counter_sched_in(struct perf_counter *counter, int cpu)
592{
593 counter->state = PERF_COUNTER_STATE_ACTIVE;
594 counter->oncpu = cpu;
595 counter->tstamp_running += counter->ctx->time - counter->tstamp_stopped;
596 if (is_software_counter(counter))
597 counter->pmu->enable(counter);
598}
599
600/*
601 * Called to enable a whole group of counters.
602 * Returns 1 if the group was enabled, or -EAGAIN if it could not be.
603 * Assumes the caller has disabled interrupts and has
604 * frozen the PMU with hw_perf_save_disable.
605 */
606int hw_perf_group_sched_in(struct perf_counter *group_leader,
607 struct perf_cpu_context *cpuctx,
608 struct perf_counter_context *ctx, int cpu)
609{
610 struct cpu_hw_counters *cpuhw;
611 long i, n, n0;
612 struct perf_counter *sub;
613
614 cpuhw = &__get_cpu_var(cpu_hw_counters);
615 n0 = cpuhw->n_counters;
616 n = collect_events(group_leader, ppmu->n_counter - n0,
617 &cpuhw->counter[n0], &cpuhw->events[n0],
618 &cpuhw->flags[n0]);
619 if (n < 0)
620 return -EAGAIN;
621 if (check_excludes(cpuhw->counter, cpuhw->flags, n0, n))
622 return -EAGAIN;
623 i = power_check_constraints(cpuhw->events, cpuhw->flags, n + n0);
624 if (i < 0)
625 return -EAGAIN;
626 cpuhw->n_counters = n0 + n;
627 cpuhw->n_added += n;
628
629 /*
630 * OK, this group can go on; update counter states etc.,
631 * and enable any software counters
632 */
633 for (i = n0; i < n0 + n; ++i)
634 cpuhw->counter[i]->hw.config = cpuhw->events[i];
635 cpuctx->active_oncpu += n;
636 n = 1;
637 counter_sched_in(group_leader, cpu);
638 list_for_each_entry(sub, &group_leader->sibling_list, list_entry) {
639 if (sub->state != PERF_COUNTER_STATE_OFF) {
640 counter_sched_in(sub, cpu);
641 ++n;
642 }
643 }
644 ctx->nr_active += n;
645
646 return 1;
647}
648
649/*
650 * Add a counter to the PMU.
651 * If all counters are not already frozen, then we disable and
652 * re-enable the PMU in order to get hw_perf_restore to do the
653 * actual work of reconfiguring the PMU.
654 */
655static int power_pmu_enable(struct perf_counter *counter)
656{
657 struct cpu_hw_counters *cpuhw;
658 unsigned long flags;
659 u64 pmudis;
660 int n0;
661 int ret = -EAGAIN;
662
663 local_irq_save(flags);
664 pmudis = hw_perf_save_disable();
665
666 /*
667 * Add the counter to the list (if there is room)
668 * and check whether the total set is still feasible.
669 */
670 cpuhw = &__get_cpu_var(cpu_hw_counters);
671 n0 = cpuhw->n_counters;
672 if (n0 >= ppmu->n_counter)
673 goto out;
674 cpuhw->counter[n0] = counter;
675 cpuhw->events[n0] = counter->hw.config;
676 cpuhw->flags[n0] = counter->hw.counter_base;
677 if (check_excludes(cpuhw->counter, cpuhw->flags, n0, 1))
678 goto out;
679 if (power_check_constraints(cpuhw->events, cpuhw->flags, n0 + 1))
680 goto out;
681
682 counter->hw.config = cpuhw->events[n0];
683 ++cpuhw->n_counters;
684 ++cpuhw->n_added;
685
686 ret = 0;
687 out:
688 hw_perf_restore(pmudis);
689 local_irq_restore(flags);
690 return ret;
691}
692
693/*
694 * Remove a counter from the PMU.
695 */
696static void power_pmu_disable(struct perf_counter *counter)
697{
698 struct cpu_hw_counters *cpuhw;
699 long i;
700 u64 pmudis;
701 unsigned long flags;
702
703 local_irq_save(flags);
704 pmudis = hw_perf_save_disable();
705
706 power_pmu_read(counter);
707
708 cpuhw = &__get_cpu_var(cpu_hw_counters);
709 for (i = 0; i < cpuhw->n_counters; ++i) {
710 if (counter == cpuhw->counter[i]) {
711 while (++i < cpuhw->n_counters)
712 cpuhw->counter[i-1] = cpuhw->counter[i];
713 --cpuhw->n_counters;
714 ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr);
715 if (counter->hw.idx) {
716 write_pmc(counter->hw.idx, 0);
717 counter->hw.idx = 0;
718 }
719 perf_counter_update_userpage(counter);
720 break;
721 }
722 }
723 for (i = 0; i < cpuhw->n_limited; ++i)
724 if (counter == cpuhw->limited_counter[i])
725 break;
726 if (i < cpuhw->n_limited) {
727 while (++i < cpuhw->n_limited) {
728 cpuhw->limited_counter[i-1] = cpuhw->limited_counter[i];
729 cpuhw->limited_hwidx[i-1] = cpuhw->limited_hwidx[i];
730 }
731 --cpuhw->n_limited;
732 }
733 if (cpuhw->n_counters == 0) {
734 /* disable exceptions if no counters are running */
735 cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE);
736 }
737
738 hw_perf_restore(pmudis);
739 local_irq_restore(flags);
740}
741
742struct pmu power_pmu = {
743 .enable = power_pmu_enable,
744 .disable = power_pmu_disable,
745 .read = power_pmu_read,
746};
747
748/*
749 * Return 1 if we might be able to put counter on a limited PMC,
750 * or 0 if not.
751 * A counter can only go on a limited PMC if it counts something
752 * that a limited PMC can count, doesn't require interrupts, and
753 * doesn't exclude any processor mode.
754 */
755static int can_go_on_limited_pmc(struct perf_counter *counter, unsigned int ev,
756 unsigned int flags)
757{
758 int n;
759 unsigned int alt[MAX_EVENT_ALTERNATIVES];
760
761 if (counter->hw_event.exclude_user
762 || counter->hw_event.exclude_kernel
763 || counter->hw_event.exclude_hv
764 || counter->hw_event.irq_period)
765 return 0;
766
767 if (ppmu->limited_pmc_event(ev))
768 return 1;
769
770 /*
771 * The requested event isn't on a limited PMC already;
772 * see if any alternative code goes on a limited PMC.
773 */
774 if (!ppmu->get_alternatives)
775 return 0;
776
777 flags |= PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD;
778 n = ppmu->get_alternatives(ev, flags, alt);
779 if (n)
780 return alt[0];
781
782 return 0;
783}
784
785/*
786 * Find an alternative event that goes on a normal PMC, if possible,
787 * and return the event code, or 0 if there is no such alternative.
788 * (Note: event code 0 is "don't count" on all machines.)
789 */
790static unsigned long normal_pmc_alternative(unsigned long ev,
791 unsigned long flags)
792{
793 unsigned int alt[MAX_EVENT_ALTERNATIVES];
794 int n;
795
796 flags &= ~(PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD);
797 n = ppmu->get_alternatives(ev, flags, alt);
798 if (!n)
799 return 0;
800 return alt[0];
801}
802
803/* Number of perf_counters counting hardware events */
804static atomic_t num_counters;
805/* Used to avoid races in calling reserve/release_pmc_hardware */
806static DEFINE_MUTEX(pmc_reserve_mutex);
807
808/*
809 * Release the PMU if this is the last perf_counter.
810 */
811static void hw_perf_counter_destroy(struct perf_counter *counter)
812{
813 if (!atomic_add_unless(&num_counters, -1, 1)) {
814 mutex_lock(&pmc_reserve_mutex);
815 if (atomic_dec_return(&num_counters) == 0)
816 release_pmc_hardware();
817 mutex_unlock(&pmc_reserve_mutex);
818 }
819}
820
821const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
822{
823 unsigned long ev, flags;
824 struct perf_counter *ctrs[MAX_HWCOUNTERS];
825 unsigned int events[MAX_HWCOUNTERS];
826 unsigned int cflags[MAX_HWCOUNTERS];
827 int n;
828 int err;
829
830 if (!ppmu)
831 return ERR_PTR(-ENXIO);
832 if ((s64)counter->hw_event.irq_period < 0)
833 return ERR_PTR(-EINVAL);
834 if (!perf_event_raw(&counter->hw_event)) {
835 ev = perf_event_id(&counter->hw_event);
836 if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
837 return ERR_PTR(-EOPNOTSUPP);
838 ev = ppmu->generic_events[ev];
839 } else {
840 ev = perf_event_config(&counter->hw_event);
841 }
842 counter->hw.config_base = ev;
843 counter->hw.idx = 0;
844
845 /*
846 * If we are not running on a hypervisor, force the
847 * exclude_hv bit to 0 so that we don't care what
848 * the user set it to.
849 */
850 if (!firmware_has_feature(FW_FEATURE_LPAR))
851 counter->hw_event.exclude_hv = 0;
852
853 /*
854 * If this is a per-task counter, then we can use
855 * PM_RUN_* events interchangeably with their non RUN_*
856 * equivalents, e.g. PM_RUN_CYC instead of PM_CYC.
857 * XXX we should check if the task is an idle task.
858 */
859 flags = 0;
860 if (counter->ctx->task)
861 flags |= PPMU_ONLY_COUNT_RUN;
862
863 /*
864 * If this machine has limited counters, check whether this
865 * event could go on a limited counter.
866 */
867 if (ppmu->limited_pmc5_6) {
868 if (can_go_on_limited_pmc(counter, ev, flags)) {
869 flags |= PPMU_LIMITED_PMC_OK;
870 } else if (ppmu->limited_pmc_event(ev)) {
871 /*
872 * The requested event is on a limited PMC,
873 * but we can't use a limited PMC; see if any
874 * alternative goes on a normal PMC.
875 */
876 ev = normal_pmc_alternative(ev, flags);
877 if (!ev)
878 return ERR_PTR(-EINVAL);
879 }
880 }
881
882 /*
883 * If this is in a group, check if it can go on with all the
884 * other hardware counters in the group. We assume the counter
885 * hasn't been linked into its leader's sibling list at this point.
886 */
887 n = 0;
888 if (counter->group_leader != counter) {
889 n = collect_events(counter->group_leader, ppmu->n_counter - 1,
890 ctrs, events, cflags);
891 if (n < 0)
892 return ERR_PTR(-EINVAL);
893 }
894 events[n] = ev;
895 ctrs[n] = counter;
896 cflags[n] = flags;
897 if (check_excludes(ctrs, cflags, n, 1))
898 return ERR_PTR(-EINVAL);
899 if (power_check_constraints(events, cflags, n + 1))
900 return ERR_PTR(-EINVAL);
901
902 counter->hw.config = events[n];
903 counter->hw.counter_base = cflags[n];
904 atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period);
905
906 /*
907 * See if we need to reserve the PMU.
908 * If no counters are currently in use, then we have to take a
909 * mutex to ensure that we don't race with another task doing
910 * reserve_pmc_hardware or release_pmc_hardware.
911 */
912 err = 0;
913 if (!atomic_inc_not_zero(&num_counters)) {
914 mutex_lock(&pmc_reserve_mutex);
915 if (atomic_read(&num_counters) == 0 &&
916 reserve_pmc_hardware(perf_counter_interrupt))
917 err = -EBUSY;
918 else
919 atomic_inc(&num_counters);
920 mutex_unlock(&pmc_reserve_mutex);
921 }
922 counter->destroy = hw_perf_counter_destroy;
923
924 if (err)
925 return ERR_PTR(err);
926 return &power_pmu;
927}
928
929/*
930 * A counter has overflowed; update its count and record
931 * things if requested. Note that interrupts are hard-disabled
932 * here so there is no possibility of being interrupted.
933 */
934static void record_and_restart(struct perf_counter *counter, long val,
935 struct pt_regs *regs, int nmi)
936{
937 s64 prev, delta, left;
938 int record = 0;
939
940 /* we don't have to worry about interrupts here */
941 prev = atomic64_read(&counter->hw.prev_count);
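	/*
	 * The PMCs are 32 bits wide, so masking the difference to 32 bits
	 * gives the correct delta even if the counter wrapped since
	 * prev_count was last read.
	 */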
942 delta = (val - prev) & 0xfffffffful;
943 atomic64_add(delta, &counter->count);
944
945 /*
946 * See if the total period for this counter has expired,
947 * and update for the next period.
948 */
949 val = 0;
950 left = atomic64_read(&counter->hw.period_left) - delta;
951 if (counter->hw_event.irq_period) {
952 if (left <= 0) {
953 left += counter->hw_event.irq_period;
954 if (left <= 0)
955 left = counter->hw_event.irq_period;
956 record = 1;
957 }
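		/*
		 * When the remaining period fits in 31 bits, reload the PMC
		 * so that it goes negative (the condition that raises the
		 * next PMU interrupt) after "left" more events are counted.
		 */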
958 if (left < 0x80000000L)
959 val = 0x80000000L - left;
960 }
961 write_pmc(counter->hw.idx, val);
962 atomic64_set(&counter->hw.prev_count, val);
963 atomic64_set(&counter->hw.period_left, left);
964 perf_counter_update_userpage(counter);
965
966 /*
967 * Finally record data if requested.
968 */
969 if (record)
970 perf_counter_overflow(counter, nmi, regs, 0);
971}
972
973/*
974 * Performance monitor interrupt stuff
975 */
976static void perf_counter_interrupt(struct pt_regs *regs)
977{
978 int i;
979 struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
980 struct perf_counter *counter;
981 long val;
982 int found = 0;
983 int nmi;
984
985 if (cpuhw->n_limited)
986 freeze_limited_counters(cpuhw, mfspr(SPRN_PMC5),
987 mfspr(SPRN_PMC6));
988
989 /*
990 * If interrupts were soft-disabled when this PMU interrupt
991 * occurred, treat it as an NMI.
992 */
993 nmi = !regs->softe;
994 if (nmi)
995 nmi_enter();
996 else
997 irq_enter();
998
999 for (i = 0; i < cpuhw->n_counters; ++i) {
1000 counter = cpuhw->counter[i];
1001 if (is_limited_pmc(counter->hw.idx))
1002 continue;
1003 val = read_pmc(counter->hw.idx);
1004 if ((int)val < 0) {
1005 /* counter has overflowed */
1006 found = 1;
1007 record_and_restart(counter, val, regs, nmi);
1008 }
1009 }
1010
1011 /*
1012 * In case we didn't find and reset the counter that caused
1013 * the interrupt, scan all counters and reset any that are
1014 * negative, to avoid getting continual interrupts.
1015 * Any that we processed in the previous loop will not be negative.
1016 */
1017 if (!found) {
1018 for (i = 0; i < ppmu->n_counter; ++i) {
1019 if (is_limited_pmc(i + 1))
1020 continue;
1021 val = read_pmc(i + 1);
1022 if ((int)val < 0)
1023 write_pmc(i + 1, 0);
1024 }
1025 }
1026
1027 /*
1028 * Reset MMCR0 to its normal value. This will set PMXE and
1029 * clear FC (freeze counters) and PMAO (perf mon alert occurred)
1030 * and thus allow interrupts to occur again.
1031 * XXX might want to use MSR.PM to keep the counters frozen until
1032 * we get back out of this interrupt.
1033 */
1034 write_mmcr0(cpuhw, cpuhw->mmcr[0]);
1035
1036 if (nmi)
1037 nmi_exit();
1038 else
1039 irq_exit();
1040}
1041
1042void hw_perf_counter_setup(int cpu)
1043{
1044 struct cpu_hw_counters *cpuhw = &per_cpu(cpu_hw_counters, cpu);
1045
1046 memset(cpuhw, 0, sizeof(*cpuhw));
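	/* Start out with all counters frozen (MMCR0 freeze-counters bit). */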
1047 cpuhw->mmcr[0] = MMCR0_FC;
1048}
1049
1050extern struct power_pmu power4_pmu;
1051extern struct power_pmu ppc970_pmu;
1052extern struct power_pmu power5_pmu;
1053extern struct power_pmu power5p_pmu;
1054extern struct power_pmu power6_pmu;
1055
1056static int init_perf_counters(void)
1057{
1058 unsigned long pvr;
1059
1060 /* XXX should get this from cputable */
1061 pvr = mfspr(SPRN_PVR);
1062 switch (PVR_VER(pvr)) {
1063 case PV_POWER4:
1064 case PV_POWER4p:
1065 ppmu = &power4_pmu;
1066 break;
1067 case PV_970:
1068 case PV_970FX:
1069 case PV_970MP:
1070 ppmu = &ppc970_pmu;
1071 break;
1072 case PV_POWER5:
1073 ppmu = &power5_pmu;
1074 break;
1075 case PV_POWER5p:
1076 ppmu = &power5p_pmu;
1077 break;
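	/* PVR version 0x3e identifies POWER6; no PV_ constant is used here. */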
1078 case 0x3e:
1079 ppmu = &power6_pmu;
1080 break;
1081 }
1082
1083 /*
1084 * Use FCHV to ignore kernel events if MSR.HV is set.
1085 */
1086 if (mfmsr() & MSR_HV)
1087 freeze_counters_kernel = MMCR0_FCHV;
1088
1089 return 0;
1090}
1091
1092arch_initcall(init_perf_counters);
diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c
new file mode 100644
index 000000000000..744a2756958e
--- /dev/null
+++ b/arch/powerpc/kernel/power4-pmu.c
@@ -0,0 +1,558 @@
1/*
2 * Performance counter support for POWER4 (GP) and POWER4+ (GQ) processors.
3 *
4 * Copyright 2009 Paul Mackerras, IBM Corporation.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/kernel.h>
12#include <linux/perf_counter.h>
13#include <asm/reg.h>
14
15/*
16 * Bits in event code for POWER4
17 */
18#define PM_PMC_SH 12 /* PMC number (1-based) for direct events */
19#define PM_PMC_MSK 0xf
20#define PM_UNIT_SH 8 /* TTMMUX number and setting - unit select */
21#define PM_UNIT_MSK 0xf
22#define PM_LOWER_SH 6
23#define PM_LOWER_MSK 1
24#define PM_LOWER_MSKS 0x40
25#define PM_BYTE_SH 4 /* Byte number of event bus to use */
26#define PM_BYTE_MSK 3
27#define PM_PMCSEL_MSK 7
28
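/*
 * Example decode using the fields above (illustrative): the branch
 * event 0x330 (PM_BR_ISSUED, see p4_generic_events below) has a PMC
 * field of 0 (no specific counter), unit 3 (IFU), byte 3 of the event
 * bus, and PMCSEL 0.
 */
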
29/*
30 * Unit code values
31 */
32#define PM_FPU 1
33#define PM_ISU1 2
34#define PM_IFU 3
35#define PM_IDU0 4
36#define PM_ISU1_ALT 6
37#define PM_ISU2 7
38#define PM_IFU_ALT 8
39#define PM_LSU0 9
40#define PM_LSU1 0xc
41#define PM_GPS 0xf
42
43/*
44 * Bits in MMCR0 for POWER4
45 */
46#define MMCR0_PMC1SEL_SH 8
47#define MMCR0_PMC2SEL_SH 1
48#define MMCR_PMCSEL_MSK 0x1f
49
50/*
51 * Bits in MMCR1 for POWER4
52 */
53#define MMCR1_TTM0SEL_SH 62
54#define MMCR1_TTC0SEL_SH 61
55#define MMCR1_TTM1SEL_SH 59
56#define MMCR1_TTC1SEL_SH 58
57#define MMCR1_TTM2SEL_SH 56
58#define MMCR1_TTC2SEL_SH 55
59#define MMCR1_TTM3SEL_SH 53
60#define MMCR1_TTC3SEL_SH 52
61#define MMCR1_TTMSEL_MSK 3
62#define MMCR1_TD_CP_DBG0SEL_SH 50
63#define MMCR1_TD_CP_DBG1SEL_SH 48
64#define MMCR1_TD_CP_DBG2SEL_SH 46
65#define MMCR1_TD_CP_DBG3SEL_SH 44
66#define MMCR1_DEBUG0SEL_SH 43
67#define MMCR1_DEBUG1SEL_SH 42
68#define MMCR1_DEBUG2SEL_SH 41
69#define MMCR1_DEBUG3SEL_SH 40
70#define MMCR1_PMC1_ADDER_SEL_SH 39
71#define MMCR1_PMC2_ADDER_SEL_SH 38
72#define MMCR1_PMC6_ADDER_SEL_SH 37
73#define MMCR1_PMC5_ADDER_SEL_SH 36
74#define MMCR1_PMC8_ADDER_SEL_SH 35
75#define MMCR1_PMC7_ADDER_SEL_SH 34
76#define MMCR1_PMC3_ADDER_SEL_SH 33
77#define MMCR1_PMC4_ADDER_SEL_SH 32
78#define MMCR1_PMC3SEL_SH 27
79#define MMCR1_PMC4SEL_SH 22
80#define MMCR1_PMC5SEL_SH 17
81#define MMCR1_PMC6SEL_SH 12
82#define MMCR1_PMC7SEL_SH 7
83#define MMCR1_PMC8SEL_SH 2 /* note bit 0 is in MMCRA for GP */
84
85static short mmcr1_adder_bits[8] = {
86 MMCR1_PMC1_ADDER_SEL_SH,
87 MMCR1_PMC2_ADDER_SEL_SH,
88 MMCR1_PMC3_ADDER_SEL_SH,
89 MMCR1_PMC4_ADDER_SEL_SH,
90 MMCR1_PMC5_ADDER_SEL_SH,
91 MMCR1_PMC6_ADDER_SEL_SH,
92 MMCR1_PMC7_ADDER_SEL_SH,
93 MMCR1_PMC8_ADDER_SEL_SH
94};
95
96/*
97 * Bits in MMCRA
98 */
99#define MMCRA_PMC8SEL0_SH 17 /* PMC8SEL bit 0 for GP */
100
101/*
102 * Layout of constraint bits:
103 * 6666555555555544444444443333333333222222222211111111110000000000
104 * 3210987654321098765432109876543210987654321098765432109876543210
105 * |[ >[ >[ >|||[ >[ >< >< >< >< ><><><><><><><><>
106 * | UC1 UC2 UC3 ||| PS1 PS2 B0 B1 B2 B3 P1P2P3P4P5P6P7P8
107 * \SMPL ||\TTC3SEL
108 * |\TTC_IFU_SEL
109 * \TTM2SEL0
110 *
111 * SMPL - SAMPLE_ENABLE constraint
112 * 56: SAMPLE_ENABLE value 0x0100_0000_0000_0000
113 *
114 * UC1 - unit constraint 1: can't have all three of FPU/ISU1/IDU0|ISU2
115 * 55: UC1 error 0x0080_0000_0000_0000
116 * 54: FPU events needed 0x0040_0000_0000_0000
117 * 53: ISU1 events needed 0x0020_0000_0000_0000
118 * 52: IDU0|ISU2 events needed 0x0010_0000_0000_0000
119 *
120 * UC2 - unit constraint 2: can't have all three of FPU/IFU/LSU0
121 * 51: UC2 error 0x0008_0000_0000_0000
122 * 50: FPU events needed 0x0004_0000_0000_0000
123 * 49: IFU events needed 0x0002_0000_0000_0000
124 * 48: LSU0 events needed 0x0001_0000_0000_0000
125 *
126 * UC3 - unit constraint 3: can't have all four of LSU0/IFU/IDU0|ISU2/ISU1
127 * 47: UC3 error 0x8000_0000_0000
128 * 46: LSU0 events needed 0x4000_0000_0000
129 * 45: IFU events needed 0x2000_0000_0000
130 * 44: IDU0|ISU2 events needed 0x1000_0000_0000
131 * 43: ISU1 events needed 0x0800_0000_0000
132 *
133 * TTM2SEL0
134 * 42: 0 = IDU0 events needed
135 * 1 = ISU2 events needed 0x0400_0000_0000
136 *
137 * TTC_IFU_SEL
138 * 41: 0 = IFU.U events needed
139 * 1 = IFU.L events needed 0x0200_0000_0000
140 *
141 * TTC3SEL
142 * 40: 0 = LSU1.U events needed
143 * 1 = LSU1.L events needed 0x0100_0000_0000
144 *
145 * PS1
146 * 39: PS1 error 0x0080_0000_0000
147 * 36-38: count of events needing PMC1/2/5/6 0x0070_0000_0000
148 *
149 * PS2
150 * 35: PS2 error 0x0008_0000_0000
151 * 32-34: count of events needing PMC3/4/7/8 0x0007_0000_0000
152 *
153 * B0
154 * 28-31: Byte 0 event source 0xf000_0000
155 * 1 = FPU
156 * 2 = ISU1
157 * 3 = IFU
158 * 4 = IDU0
159 * 7 = ISU2
160 * 9 = LSU0
161 * c = LSU1
162 * f = GPS
163 *
164 * B1, B2, B3
165 * 24-27, 20-23, 16-19: Byte 1, 2, 3 event sources
166 *
167 * P8
168 * 15: P8 error 0x8000
169 * 14-15: Count of events needing PMC8
170 *
171 * P1..P7
172 * 0-13: Count of events needing PMC1..PMC7
173 *
174 * Note: this doesn't allow events using IFU.U to be combined with events
175 * using IFU.L, though that is feasible (using TTM0 and TTM2). However
176 * there are no listed events for IFU.L (they are debug events not
177 * verified for performance monitoring) so this shouldn't cause a
178 * problem.
179 */
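
/*
 * Worked example (illustrative, not part of the table above): an event
 * that names PMC1 contributes a 1 in bit 0 of the P1 count field and
 * masks bit 1, so two events both requiring PMC1 carry into the masked
 * bit and the combination is rejected by power_check_constraints().
 */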
180
181static struct unitinfo {
182 u64 value, mask;
183 int unit;
184 int lowerbit;
185} p4_unitinfo[16] = {
186 [PM_FPU] = { 0x44000000000000ull, 0x88000000000000ull, PM_FPU, 0 },
187 [PM_ISU1] = { 0x20080000000000ull, 0x88000000000000ull, PM_ISU1, 0 },
188 [PM_ISU1_ALT] =
189 { 0x20080000000000ull, 0x88000000000000ull, PM_ISU1, 0 },
190 [PM_IFU] = { 0x02200000000000ull, 0x08820000000000ull, PM_IFU, 41 },
191 [PM_IFU_ALT] =
192 { 0x02200000000000ull, 0x08820000000000ull, PM_IFU, 41 },
193 [PM_IDU0] = { 0x10100000000000ull, 0x80840000000000ull, PM_IDU0, 1 },
194 [PM_ISU2] = { 0x10140000000000ull, 0x80840000000000ull, PM_ISU2, 0 },
195 [PM_LSU0] = { 0x01400000000000ull, 0x08800000000000ull, PM_LSU0, 0 },
196 [PM_LSU1] = { 0x00000000000000ull, 0x00010000000000ull, PM_LSU1, 40 },
197 [PM_GPS] = { 0x00000000000000ull, 0x00000000000000ull, PM_GPS, 0 }
198};
199
200static unsigned char direct_marked_event[8] = {
201 (1<<2) | (1<<3), /* PMC1: PM_MRK_GRP_DISP, PM_MRK_ST_CMPL */
202 (1<<3) | (1<<5), /* PMC2: PM_THRESH_TIMEO, PM_MRK_BRU_FIN */
203 (1<<3), /* PMC3: PM_MRK_ST_CMPL_INT */
204 (1<<4) | (1<<5), /* PMC4: PM_MRK_GRP_CMPL, PM_MRK_CRU_FIN */
205 (1<<4) | (1<<5), /* PMC5: PM_MRK_GRP_TIMEO */
206 (1<<3) | (1<<4) | (1<<5),
207 /* PMC6: PM_MRK_ST_GPS, PM_MRK_FXU_FIN, PM_MRK_GRP_ISSUED */
208 (1<<4) | (1<<5), /* PMC7: PM_MRK_FPU_FIN, PM_MRK_INST_FIN */
209 (1<<4), /* PMC8: PM_MRK_LSU_FIN */
210};
211
212/*
213 * Returns 1 if event counts things relating to marked instructions
214 * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
215 */
216static int p4_marked_instr_event(unsigned int event)
217{
218 int pmc, psel, unit, byte, bit;
219 unsigned int mask;
220
221 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
222 psel = event & PM_PMCSEL_MSK;
223 if (pmc) {
224 if (direct_marked_event[pmc - 1] & (1 << psel))
225 return 1;
226 if (psel == 0) /* add events */
227 bit = (pmc <= 4)? pmc - 1: 8 - pmc;
228 else if (psel == 6) /* decode events */
229 bit = 4;
230 else
231 return 0;
232 } else
233 bit = psel;
234
235 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
236 unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
237 mask = 0;
238 switch (unit) {
239 case PM_LSU1:
240 if (event & PM_LOWER_MSKS)
241 mask = 1 << 28; /* byte 7 bit 4 */
242 else
243 mask = 6 << 24; /* byte 3 bits 1 and 2 */
244 break;
245 case PM_LSU0:
246 /* byte 3, bit 3; byte 2 bits 0,2,3,4,5; byte 1 */
247 mask = 0x083dff00;
248 }
249 return (mask >> (byte * 8 + bit)) & 1;
250}
251
252static int p4_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
253{
254 int pmc, byte, unit, lower, sh;
255 u64 mask = 0, value = 0;
256 int grp = -1;
257
258 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
259 if (pmc) {
260 if (pmc > 8)
261 return -1;
262 sh = (pmc - 1) * 2;
263 mask |= 2 << sh;
264 value |= 1 << sh;
265 grp = ((pmc - 1) >> 1) & 1;
266 }
267 unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
268 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
269 if (unit) {
270 lower = (event >> PM_LOWER_SH) & PM_LOWER_MSK;
271
272 /*
273 * Bus events on bytes 0 and 2 can be counted
274 * on PMC1/2/5/6; bytes 1 and 3 on PMC3/4/7/8.
275 */
276 if (!pmc)
277 grp = byte & 1;
278
279 if (!p4_unitinfo[unit].unit)
280 return -1;
281 mask |= p4_unitinfo[unit].mask;
282 value |= p4_unitinfo[unit].value;
283 sh = p4_unitinfo[unit].lowerbit;
284 if (sh > 1)
285 value |= (u64)lower << sh;
286 else if (lower != sh)
287 return -1;
288 unit = p4_unitinfo[unit].unit;
289
290 /* Set byte lane select field */
291 mask |= 0xfULL << (28 - 4 * byte);
292 value |= (u64)unit << (28 - 4 * byte);
293 }
294 if (grp == 0) {
295 /* increment PMC1/2/5/6 field */
296 mask |= 0x8000000000ull;
297 value |= 0x1000000000ull;
298 } else {
299 /* increment PMC3/4/7/8 field */
300 mask |= 0x800000000ull;
301 value |= 0x100000000ull;
302 }
303
304 /* Marked instruction events need sample_enable set */
305 if (p4_marked_instr_event(event)) {
306 mask |= 1ull << 56;
307 value |= 1ull << 56;
308 }
309
310 /* PMCSEL=6 decode events on byte 2 need sample_enable clear */
311 if (pmc && (event & PM_PMCSEL_MSK) == 6 && byte == 2)
312 mask |= 1ull << 56;
313
314 *maskp = mask;
315 *valp = value;
316 return 0;
317}
318
319static unsigned int ppc_inst_cmpl[] = {
320 0x1001, 0x4001, 0x6001, 0x7001, 0x8001
321};
322
323static int p4_get_alternatives(unsigned int event, unsigned int flags,
324 unsigned int alt[])
325{
326 int i, j, na;
327
328 alt[0] = event;
329 na = 1;
330
331 /* 2 possibilities for PM_GRP_DISP_REJECT */
332 if (event == 0x8003 || event == 0x0224) {
333 alt[1] = event ^ (0x8003 ^ 0x0224);
334 return 2;
335 }
336
337 /* 2 possibilities for PM_ST_MISS_L1 */
338 if (event == 0x0c13 || event == 0x0c23) {
339 alt[1] = event ^ (0x0c13 ^ 0x0c23);
340 return 2;
341 }
342
343 /* several possibilities for PM_INST_CMPL */
344 for (i = 0; i < ARRAY_SIZE(ppc_inst_cmpl); ++i) {
345 if (event == ppc_inst_cmpl[i]) {
346 for (j = 0; j < ARRAY_SIZE(ppc_inst_cmpl); ++j)
347 if (j != i)
348 alt[na++] = ppc_inst_cmpl[j];
349 break;
350 }
351 }
352
353 return na;
354}
355
356static int p4_compute_mmcr(unsigned int event[], int n_ev,
357 unsigned int hwc[], u64 mmcr[])
358{
359 u64 mmcr0 = 0, mmcr1 = 0, mmcra = 0;
360 unsigned int pmc, unit, byte, psel, lower;
361 unsigned int ttm, grp;
362 unsigned int pmc_inuse = 0;
363 unsigned int pmc_grp_use[2];
364 unsigned char busbyte[4];
365 unsigned char unituse[16];
366 unsigned int unitlower = 0;
367 int i;
368
369 if (n_ev > 8)
370 return -1;
371
372 /* First pass to count resource use */
373 pmc_grp_use[0] = pmc_grp_use[1] = 0;
374 memset(busbyte, 0, sizeof(busbyte));
375 memset(unituse, 0, sizeof(unituse));
376 for (i = 0; i < n_ev; ++i) {
377 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
378 if (pmc) {
379 if (pmc_inuse & (1 << (pmc - 1)))
380 return -1;
381 pmc_inuse |= 1 << (pmc - 1);
382 /* count 1/2/5/6 vs 3/4/7/8 use */
383 ++pmc_grp_use[((pmc - 1) >> 1) & 1];
384 }
385 unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
386 byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
387 lower = (event[i] >> PM_LOWER_SH) & PM_LOWER_MSK;
388 if (unit) {
389 if (!pmc)
390 ++pmc_grp_use[byte & 1];
391 if (unit == 6 || unit == 8)
392 /* map alt ISU1/IFU codes: 6->2, 8->3 */
393 unit = (unit >> 1) - 1;
394 if (busbyte[byte] && busbyte[byte] != unit)
395 return -1;
396 busbyte[byte] = unit;
397 lower <<= unit;
398 if (unituse[unit] && lower != (unitlower & lower))
399 return -1;
400 unituse[unit] = 1;
401 unitlower |= lower;
402 }
403 }
404 if (pmc_grp_use[0] > 4 || pmc_grp_use[1] > 4)
405 return -1;
406
407 /*
408 * Assign resources and set multiplexer selects.
409 *
410 * Units 1,2,3 are on TTM0, 4,6,7 on TTM1, 8,9 on TTM2.
411 * Each TTMx can only select one unit, but since
412 * units 2 and 6 are both ISU1, and 3 and 8 are both IFU,
413 * we have some choices.
414 */
415 if (unituse[2] & (unituse[1] | (unituse[3] & unituse[9]))) {
416 unituse[6] = 1; /* Move 2 to 6 */
417 unituse[2] = 0;
418 }
419 if (unituse[3] & (unituse[1] | unituse[2])) {
420 unituse[8] = 1; /* Move 3 to 8 */
421 unituse[3] = 0;
422 unitlower = (unitlower & ~8) | ((unitlower & 8) << 5);
423 }
424 /* Check only one unit per TTMx */
425 if (unituse[1] + unituse[2] + unituse[3] > 1 ||
426 unituse[4] + unituse[6] + unituse[7] > 1 ||
427 unituse[8] + unituse[9] > 1 ||
428 (unituse[5] | unituse[10] | unituse[11] |
429 unituse[13] | unituse[14]))
430 return -1;
431
432 /* Set TTMxSEL fields. Note, units 1-3 => TTM0SEL codes 0-2 */
433 mmcr1 |= (u64)(unituse[3] * 2 + unituse[2]) << MMCR1_TTM0SEL_SH;
434 mmcr1 |= (u64)(unituse[7] * 3 + unituse[6] * 2) << MMCR1_TTM1SEL_SH;
435 mmcr1 |= (u64)unituse[9] << MMCR1_TTM2SEL_SH;
436
437 /* Set TTCxSEL fields. */
438 if (unitlower & 0xe)
439 mmcr1 |= 1ull << MMCR1_TTC0SEL_SH;
440 if (unitlower & 0xf0)
441 mmcr1 |= 1ull << MMCR1_TTC1SEL_SH;
442 if (unitlower & 0xf00)
443 mmcr1 |= 1ull << MMCR1_TTC2SEL_SH;
444 if (unitlower & 0x7000)
445 mmcr1 |= 1ull << MMCR1_TTC3SEL_SH;
446
447 /* Set byte lane select fields. */
448 for (byte = 0; byte < 4; ++byte) {
449 unit = busbyte[byte];
450 if (!unit)
451 continue;
452 if (unit == 0xf) {
453 /* special case for GPS */
454 mmcr1 |= 1ull << (MMCR1_DEBUG0SEL_SH - byte);
455 } else {
456 if (!unituse[unit])
457 ttm = unit - 1; /* 2->1, 3->2 */
458 else
459 ttm = unit >> 2;
460 mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2*byte);
461 }
462 }
463
464 /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
465 for (i = 0; i < n_ev; ++i) {
466 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
467 unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
468 byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
469 psel = event[i] & PM_PMCSEL_MSK;
470 if (!pmc) {
471 /* Bus event or 00xxx direct event (off or cycles) */
472 if (unit)
473 psel |= 0x10 | ((byte & 2) << 2);
474 for (pmc = 0; pmc < 8; ++pmc) {
475 if (pmc_inuse & (1 << pmc))
476 continue;
477 grp = (pmc >> 1) & 1;
478 if (unit) {
479 if (grp == (byte & 1))
480 break;
481 } else if (pmc_grp_use[grp] < 4) {
482 ++pmc_grp_use[grp];
483 break;
484 }
485 }
486 pmc_inuse |= 1 << pmc;
487 } else {
488 /* Direct event */
489 --pmc;
490 if (psel == 0 && (byte & 2))
491 /* add events on higher-numbered bus */
492 mmcr1 |= 1ull << mmcr1_adder_bits[pmc];
493 else if (psel == 6 && byte == 3)
494 /* seem to need to set sample_enable here */
495 mmcra |= MMCRA_SAMPLE_ENABLE;
496 psel |= 8;
497 }
498 if (pmc <= 1)
499 mmcr0 |= psel << (MMCR0_PMC1SEL_SH - 7 * pmc);
500 else
501 mmcr1 |= psel << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2));
502 if (pmc == 7) /* PMC8 */
503 mmcra |= (psel & 1) << MMCRA_PMC8SEL0_SH;
504 hwc[i] = pmc;
505 if (p4_marked_instr_event(event[i]))
506 mmcra |= MMCRA_SAMPLE_ENABLE;
507 }
508
509 if (pmc_inuse & 1)
510 mmcr0 |= MMCR0_PMC1CE;
511 if (pmc_inuse & 0xfe)
512 mmcr0 |= MMCR0_PMCjCE;
513
514 mmcra |= 0x2000; /* mark only one IOP per PPC instruction */
515
516 /* Return MMCRx values */
517 mmcr[0] = mmcr0;
518 mmcr[1] = mmcr1;
519 mmcr[2] = mmcra;
520 return 0;
521}
522
523static void p4_disable_pmc(unsigned int pmc, u64 mmcr[])
524{
525 /*
526 * Setting the PMCxSEL field to 0 disables PMC x.
527 * (Note that pmc is 0-based here, not 1-based.)
528 */
529 if (pmc <= 1) {
530 mmcr[0] &= ~(0x1fUL << (MMCR0_PMC1SEL_SH - 7 * pmc));
531 } else {
532 mmcr[1] &= ~(0x1fUL << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2)));
533 if (pmc == 7)
534 mmcr[2] &= ~(1UL << MMCRA_PMC8SEL0_SH);
535 }
536}
537
538static int p4_generic_events[] = {
539 [PERF_COUNT_CPU_CYCLES] = 7,
540 [PERF_COUNT_INSTRUCTIONS] = 0x1001,
541 [PERF_COUNT_CACHE_REFERENCES] = 0x8c10, /* PM_LD_REF_L1 */
542 [PERF_COUNT_CACHE_MISSES] = 0x3c10, /* PM_LD_MISS_L1 */
543 [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x330, /* PM_BR_ISSUED */
544 [PERF_COUNT_BRANCH_MISSES] = 0x331, /* PM_BR_MPRED_CR */
545};
546
547struct power_pmu power4_pmu = {
548 .n_counter = 8,
549 .max_alternatives = 5,
550 .add_fields = 0x0000001100005555ull,
551 .test_adder = 0x0011083300000000ull,
552 .compute_mmcr = p4_compute_mmcr,
553 .get_constraint = p4_get_constraint,
554 .get_alternatives = p4_get_alternatives,
555 .disable_pmc = p4_disable_pmc,
556 .n_generic = ARRAY_SIZE(p4_generic_events),
557 .generic_events = p4_generic_events,
558};
diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c
new file mode 100644
index 000000000000..8154eaa2404f
--- /dev/null
+++ b/arch/powerpc/kernel/power5+-pmu.c
@@ -0,0 +1,630 @@
1/*
2 * Performance counter support for POWER5+/++ (not POWER5) processors.
3 *
4 * Copyright 2009 Paul Mackerras, IBM Corporation.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/kernel.h>
12#include <linux/perf_counter.h>
13#include <asm/reg.h>
14
15/*
16 * Bits in event code for POWER5+ (POWER5 GS) and POWER5++ (POWER5 GS DD3)
17 */
18#define PM_PMC_SH 20 /* PMC number (1-based) for direct events */
19#define PM_PMC_MSK 0xf
20#define PM_PMC_MSKS (PM_PMC_MSK << PM_PMC_SH)
21#define PM_UNIT_SH 16 /* TTMMUX number and setting - unit select */
22#define PM_UNIT_MSK 0xf
23#define PM_BYTE_SH 12 /* Byte number of event bus to use */
24#define PM_BYTE_MSK 7
25#define PM_GRS_SH 8 /* Storage subsystem mux select */
26#define PM_GRS_MSK 7
27#define PM_BUSEVENT_MSK 0x80 /* Set if event uses event bus */
28#define PM_PMCSEL_MSK 0x7f
29
30/* Values in PM_UNIT field */
31#define PM_FPU 0
32#define PM_ISU0 1
33#define PM_IFU 2
34#define PM_ISU1 3
35#define PM_IDU 4
36#define PM_ISU0_ALT 6
37#define PM_GRS 7
38#define PM_LSU0 8
39#define PM_LSU1 0xc
40#define PM_LASTUNIT 0xc
41
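/*
 * Example decode using the fields above (illustrative): the branch
 * event 0x230e4 (BR_ISSUED, see power5p_generic_events below) is a
 * bus event (PM_BUSEVENT_MSK set) with no fixed PMC, unit 2 (IFU),
 * byte 3 of the event bus, and PMCSEL 0x64.
 */
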
42/*
43 * Bits in MMCR1 for POWER5+
44 */
45#define MMCR1_TTM0SEL_SH 62
46#define MMCR1_TTM1SEL_SH 60
47#define MMCR1_TTM2SEL_SH 58
48#define MMCR1_TTM3SEL_SH 56
49#define MMCR1_TTMSEL_MSK 3
50#define MMCR1_TD_CP_DBG0SEL_SH 54
51#define MMCR1_TD_CP_DBG1SEL_SH 52
52#define MMCR1_TD_CP_DBG2SEL_SH 50
53#define MMCR1_TD_CP_DBG3SEL_SH 48
54#define MMCR1_GRS_L2SEL_SH 46
55#define MMCR1_GRS_L2SEL_MSK 3
56#define MMCR1_GRS_L3SEL_SH 44
57#define MMCR1_GRS_L3SEL_MSK 3
58#define MMCR1_GRS_MCSEL_SH 41
59#define MMCR1_GRS_MCSEL_MSK 7
60#define MMCR1_GRS_FABSEL_SH 39
61#define MMCR1_GRS_FABSEL_MSK 3
62#define MMCR1_PMC1_ADDER_SEL_SH 35
63#define MMCR1_PMC2_ADDER_SEL_SH 34
64#define MMCR1_PMC3_ADDER_SEL_SH 33
65#define MMCR1_PMC4_ADDER_SEL_SH 32
66#define MMCR1_PMC1SEL_SH 25
67#define MMCR1_PMC2SEL_SH 17
68#define MMCR1_PMC3SEL_SH 9
69#define MMCR1_PMC4SEL_SH 1
70#define MMCR1_PMCSEL_SH(n) (MMCR1_PMC1SEL_SH - (n) * 8)
71#define MMCR1_PMCSEL_MSK 0x7f
72
73/*
74 * Bits in MMCRA
75 */
76
77/*
78 * Layout of constraint bits:
79 * 6666555555555544444444443333333333222222222211111111110000000000
80 * 3210987654321098765432109876543210987654321098765432109876543210
81 * [ ><><>< ><> <><>[ > < >< >< >< ><><><><><><>
82 * NC G0G1G2 G3 T0T1 UC B0 B1 B2 B3 P6P5P4P3P2P1
83 *
84 * NC - number of counters
85 * 51: NC error 0x0008_0000_0000_0000
86 * 48-50: number of events needing PMC1-4 0x0007_0000_0000_0000
87 *
88 * G0..G3 - GRS mux constraints
89 * 46-47: GRS_L2SEL value
90 * 44-45: GRS_L3SEL value
91 * 41-43: GRS_MCSEL value
92 * 39-40: GRS_FABSEL value
93 * Note that these match up with their bit positions in MMCR1
94 *
95 * T0 - TTM0 constraint
96 * 36-37: TTM0SEL value (0=FPU, 2=IFU, 3=ISU1) 0x30_0000_0000
97 *
98 * T1 - TTM1 constraint
99 * 34-35: TTM1SEL value (0=IDU, 3=GRS) 0x0c_0000_0000
100 *
101 * UC - unit constraint: can't have all three of FPU|IFU|ISU1, ISU0, IDU|GRS
102 * 33: UC3 error 0x02_0000_0000
103 * 32: FPU|IFU|ISU1 events needed 0x01_0000_0000
104 * 31: ISU0 events needed 0x00_8000_0000
105 * 30: IDU|GRS events needed 0x00_4000_0000
106 *
107 * B0
108 * 24-27: Byte 0 event source 0x0f00_0000
109 * Encoding as for the event code
110 *
111 * B1, B2, B3
112 * 20-23, 16-19, 12-15: Byte 1, 2, 3 event sources
113 *
114 * P6
115 * 11: P6 error 0x800
116 * 10-11: Count of events needing PMC6
117 *
118 * P1..P5
119 * 0-9: Count of events needing PMC1..PMC5
120 */
121
122static const int grsel_shift[8] = {
123 MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH,
124 MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH,
125 MMCR1_GRS_MCSEL_SH, MMCR1_GRS_FABSEL_SH
126};
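
/*
 * grsel_shift[] is indexed by the low 3 bits of a GRS event's PMCSEL
 * value and gives the MMCR1 shift of the GRS_*SEL field that the event
 * needs (used by power5p_get_constraint() and power5p_compute_mmcr()).
 */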
127
128/* Masks and values for using events from the various units */
129static u64 unit_cons[PM_LASTUNIT+1][2] = {
130 [PM_FPU] = { 0x3200000000ull, 0x0100000000ull },
131 [PM_ISU0] = { 0x0200000000ull, 0x0080000000ull },
132 [PM_ISU1] = { 0x3200000000ull, 0x3100000000ull },
133 [PM_IFU] = { 0x3200000000ull, 0x2100000000ull },
134 [PM_IDU] = { 0x0e00000000ull, 0x0040000000ull },
135 [PM_GRS] = { 0x0e00000000ull, 0x0c40000000ull },
136};
137
138static int power5p_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
139{
140 int pmc, byte, unit, sh;
141 int bit, fmask;
142 u64 mask = 0, value = 0;
143
144 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
145 if (pmc) {
146 if (pmc > 6)
147 return -1;
148 sh = (pmc - 1) * 2;
149 mask |= 2 << sh;
150 value |= 1 << sh;
151 if (pmc >= 5 && !(event == 0x500009 || event == 0x600005))
152 return -1;
153 }
154 if (event & PM_BUSEVENT_MSK) {
155 unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
156 if (unit > PM_LASTUNIT)
157 return -1;
158 if (unit == PM_ISU0_ALT)
159 unit = PM_ISU0;
160 mask |= unit_cons[unit][0];
161 value |= unit_cons[unit][1];
162 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
163 if (byte >= 4) {
164 if (unit != PM_LSU1)
165 return -1;
166 /* Map LSU1 low word (bytes 4-7) to unit LSU1+1 */
167 ++unit;
168 byte &= 3;
169 }
170 if (unit == PM_GRS) {
171 bit = event & 7;
172 fmask = (bit == 6)? 7: 3;
173 sh = grsel_shift[bit];
174 mask |= (u64)fmask << sh;
175 value |= (u64)((event >> PM_GRS_SH) & fmask) << sh;
176 }
177 /* Set byte lane select field */
178 mask |= 0xfULL << (24 - 4 * byte);
179 value |= (u64)unit << (24 - 4 * byte);
180 }
181 if (pmc < 5) {
182 /* need a counter from PMC1-4 set */
183 mask |= 0x8000000000000ull;
184 value |= 0x1000000000000ull;
185 }
186 *maskp = mask;
187 *valp = value;
188 return 0;
189}
190
191static int power5p_limited_pmc_event(unsigned int event)
192{
193 int pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
194
195 return pmc == 5 || pmc == 6;
196}
197
198#define MAX_ALT 3 /* at most 3 alternatives for any event */
199
200static const unsigned int event_alternatives[][MAX_ALT] = {
201 { 0x100c0, 0x40001f }, /* PM_GCT_FULL_CYC */
202 { 0x120e4, 0x400002 }, /* PM_GRP_DISP_REJECT */
203 { 0x230e2, 0x323087 }, /* PM_BR_PRED_CR */
204 { 0x230e3, 0x223087, 0x3230a0 }, /* PM_BR_PRED_TA */
205 { 0x410c7, 0x441084 }, /* PM_THRD_L2MISS_BOTH_CYC */
206 { 0x800c4, 0xc20e0 }, /* PM_DTLB_MISS */
207 { 0xc50c6, 0xc60e0 }, /* PM_MRK_DTLB_MISS */
208 { 0x100005, 0x600005 }, /* PM_RUN_CYC */
209 { 0x100009, 0x200009 }, /* PM_INST_CMPL */
210 { 0x200015, 0x300015 }, /* PM_LSU_LMQ_SRQ_EMPTY_CYC */
211 { 0x300009, 0x400009 }, /* PM_INST_DISP */
212};
213
214/*
215 * Scan the alternatives table for a match and return the
216 * index into the alternatives table if found, else -1.
217 */
218static int find_alternative(unsigned int event)
219{
220 int i, j;
221
222 for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
223 if (event < event_alternatives[i][0])
224 break;
225 for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j)
226 if (event == event_alternatives[i][j])
227 return i;
228 }
229 return -1;
230}
231
232static const unsigned char bytedecode_alternatives[4][4] = {
233 /* PMC 1 */ { 0x21, 0x23, 0x25, 0x27 },
234 /* PMC 2 */ { 0x07, 0x17, 0x0e, 0x1e },
235 /* PMC 3 */ { 0x20, 0x22, 0x24, 0x26 },
236 /* PMC 4 */ { 0x07, 0x17, 0x0e, 0x1e }
237};
238
239/*
240 * Some direct events for decodes of event bus byte 3 have alternative
241 * PMCSEL values on other counters. This returns the alternative
242 * event code for those that do, or -1 otherwise. This also handles
243 * alternative PCMSEL values for add events.
244 * alternative PMCSEL values for add events.
245static int find_alternative_bdecode(unsigned int event)
246{
247 int pmc, altpmc, pp, j;
248
249 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
250 if (pmc == 0 || pmc > 4)
251 return -1;
252 altpmc = 5 - pmc; /* 1 <-> 4, 2 <-> 3 */
253 pp = event & PM_PMCSEL_MSK;
254 for (j = 0; j < 4; ++j) {
255 if (bytedecode_alternatives[pmc - 1][j] == pp) {
256 return (event & ~(PM_PMC_MSKS | PM_PMCSEL_MSK)) |
257 (altpmc << PM_PMC_SH) |
258 bytedecode_alternatives[altpmc - 1][j];
259 }
260 }
261
262 /* new decode alternatives for power5+ */
263 if (pmc == 1 && (pp == 0x0d || pp == 0x0e))
264 return event + (2 << PM_PMC_SH) + (0x2e - 0x0d);
265 if (pmc == 3 && (pp == 0x2e || pp == 0x2f))
266 return event - (2 << PM_PMC_SH) - (0x2e - 0x0d);
267
268 /* alternative add event encodings */
269 if (pp == 0x10 || pp == 0x28)
270 return ((event ^ (0x10 ^ 0x28)) & ~PM_PMC_MSKS) |
271 (altpmc << PM_PMC_SH);
272
273 return -1;
274}
275
276static int power5p_get_alternatives(unsigned int event, unsigned int flags,
277 unsigned int alt[])
278{
279 int i, j, ae, nalt = 1;
280 int nlim;
281
282 alt[0] = event;
283 nalt = 1;
284 nlim = power5p_limited_pmc_event(event);
285 i = find_alternative(event);
286 if (i >= 0) {
287 for (j = 0; j < MAX_ALT; ++j) {
288 ae = event_alternatives[i][j];
289 if (ae && ae != event)
290 alt[nalt++] = ae;
291 nlim += power5p_limited_pmc_event(ae);
292 }
293 } else {
294 ae = find_alternative_bdecode(event);
295 if (ae > 0)
296 alt[nalt++] = ae;
297 }
298
299 if (flags & PPMU_ONLY_COUNT_RUN) {
300 /*
301 * We're only counting in RUN state,
302 * so PM_CYC is equivalent to PM_RUN_CYC
303 * and PM_INST_CMPL === PM_RUN_INST_CMPL.
304 * This doesn't include alternatives that don't provide
305 * any extra flexibility in assigning PMCs (e.g.
306 * 0x100005 for PM_RUN_CYC vs. 0xf for PM_CYC).
307 * Note that even with these additional alternatives
308 * we never end up with more than 3 alternatives for any event.
309 */
310 j = nalt;
311 for (i = 0; i < nalt; ++i) {
312 switch (alt[i]) {
313 case 0xf: /* PM_CYC */
314 alt[j++] = 0x600005; /* PM_RUN_CYC */
315 ++nlim;
316 break;
317 case 0x600005: /* PM_RUN_CYC */
318 alt[j++] = 0xf;
319 break;
320 case 0x100009: /* PM_INST_CMPL */
321 alt[j++] = 0x500009; /* PM_RUN_INST_CMPL */
322 ++nlim;
323 break;
324 case 0x500009: /* PM_RUN_INST_CMPL */
325 alt[j++] = 0x100009; /* PM_INST_CMPL */
326 alt[j++] = 0x200009;
327 break;
328 }
329 }
330 nalt = j;
331 }
332
333 if (!(flags & PPMU_LIMITED_PMC_OK) && nlim) {
334 /* remove the limited PMC events */
335 j = 0;
336 for (i = 0; i < nalt; ++i) {
337 if (!power5p_limited_pmc_event(alt[i])) {
338 alt[j] = alt[i];
339 ++j;
340 }
341 }
342 nalt = j;
343 } else if ((flags & PPMU_LIMITED_PMC_REQD) && nlim < nalt) {
344 /* remove all but the limited PMC events */
345 j = 0;
346 for (i = 0; i < nalt; ++i) {
347 if (power5p_limited_pmc_event(alt[i])) {
348 alt[j] = alt[i];
349 ++j;
350 }
351 }
352 nalt = j;
353 }
354
355 return nalt;
356}
357
358/*
359 * Map of which direct events on which PMCs are marked instruction events.
360 * Indexed by PMCSEL value, bit i (LE) set if PMC i is a marked event.
361 * Bit 0 is set if it is marked for all PMCs.
362 * The 0x80 bit indicates a byte decode PMCSEL value.
363 */
364static unsigned char direct_event_is_marked[0x28] = {
365 0, /* 00 */
366 0x1f, /* 01 PM_IOPS_CMPL */
367 0x2, /* 02 PM_MRK_GRP_DISP */
368 0xe, /* 03 PM_MRK_ST_CMPL, PM_MRK_ST_GPS, PM_MRK_ST_CMPL_INT */
369 0, /* 04 */
370 0x1c, /* 05 PM_MRK_BRU_FIN, PM_MRK_INST_FIN, PM_MRK_CRU_FIN */
371 0x80, /* 06 */
372 0x80, /* 07 */
373 0, 0, 0,/* 08 - 0a */
374 0x18, /* 0b PM_THRESH_TIMEO, PM_MRK_GRP_TIMEO */
375 0, /* 0c */
376 0x80, /* 0d */
377 0x80, /* 0e */
378 0, /* 0f */
379 0, /* 10 */
380 0x14, /* 11 PM_MRK_GRP_BR_REDIR, PM_MRK_GRP_IC_MISS */
381 0, /* 12 */
382 0x10, /* 13 PM_MRK_GRP_CMPL */
383 0x1f, /* 14 PM_GRP_MRK, PM_MRK_{FXU,FPU,LSU}_FIN */
384 0x2, /* 15 PM_MRK_GRP_ISSUED */
385 0x80, /* 16 */
386 0x80, /* 17 */
387 0, 0, 0, 0, 0,
388 0x80, /* 1d */
389 0x80, /* 1e */
390 0, /* 1f */
391 0x80, /* 20 */
392 0x80, /* 21 */
393 0x80, /* 22 */
394 0x80, /* 23 */
395 0x80, /* 24 */
396 0x80, /* 25 */
397 0x80, /* 26 */
398 0x80, /* 27 */
399};
400
401/*
402 * Returns 1 if event counts things relating to marked instructions
403 * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
404 */
405static int power5p_marked_instr_event(unsigned int event)
406{
407 int pmc, psel;
408 int bit, byte, unit;
409 u32 mask;
410
411 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
412 psel = event & PM_PMCSEL_MSK;
413 if (pmc >= 5)
414 return 0;
415
416 bit = -1;
417 if (psel < sizeof(direct_event_is_marked)) {
418 if (direct_event_is_marked[psel] & (1 << pmc))
419 return 1;
420 if (direct_event_is_marked[psel] & 0x80)
421 bit = 4;
422 else if (psel == 0x08)
423 bit = pmc - 1;
424 else if (psel == 0x10)
425 bit = 4 - pmc;
426 else if (psel == 0x1b && (pmc == 1 || pmc == 3))
427 bit = 4;
428 } else if ((psel & 0x48) == 0x40) {
429 bit = psel & 7;
430 } else if (psel == 0x28) {
431 bit = pmc - 1;
432 } else if (pmc == 3 && (psel == 0x2e || psel == 0x2f)) {
433 bit = 4;
434 }
435
436 if (!(event & PM_BUSEVENT_MSK) || bit == -1)
437 return 0;
438
439 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
440 unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
441 if (unit == PM_LSU0) {
442 /* byte 1 bits 0-7, byte 2 bits 0,2-4,6 */
443 mask = 0x5dff00;
444 } else if (unit == PM_LSU1 && byte >= 4) {
445 byte -= 4;
446 /* byte 5 bits 6-7, byte 6 bits 0,4, byte 7 bits 0-4,6 */
447 mask = 0x5f11c000;
448 } else
449 return 0;
450
451 return (mask >> (byte * 8 + bit)) & 1;
452}
453
454static int power5p_compute_mmcr(unsigned int event[], int n_ev,
455 unsigned int hwc[], u64 mmcr[])
456{
457 u64 mmcr1 = 0;
458 u64 mmcra = 0;
459 unsigned int pmc, unit, byte, psel;
460 unsigned int ttm;
461 int i, isbus, bit, grsel;
462 unsigned int pmc_inuse = 0;
463 unsigned char busbyte[4];
464 unsigned char unituse[16];
465 int ttmuse;
466
467 if (n_ev > 6)
468 return -1;
469
470 /* First pass to count resource use */
471 memset(busbyte, 0, sizeof(busbyte));
472 memset(unituse, 0, sizeof(unituse));
473 for (i = 0; i < n_ev; ++i) {
474 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
475 if (pmc) {
476 if (pmc > 6)
477 return -1;
478 if (pmc_inuse & (1 << (pmc - 1)))
479 return -1;
480 pmc_inuse |= 1 << (pmc - 1);
481 }
482 if (event[i] & PM_BUSEVENT_MSK) {
483 unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
484 byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
485 if (unit > PM_LASTUNIT)
486 return -1;
487 if (unit == PM_ISU0_ALT)
488 unit = PM_ISU0;
489 if (byte >= 4) {
490 if (unit != PM_LSU1)
491 return -1;
492 ++unit;
493 byte &= 3;
494 }
495 if (busbyte[byte] && busbyte[byte] != unit)
496 return -1;
497 busbyte[byte] = unit;
498 unituse[unit] = 1;
499 }
500 }
501
502 /*
503 * Assign resources and set multiplexer selects.
504 *
505 * PM_ISU0 can go either on TTM0 or TTM1, but that's the only
506 * choice we have to deal with.
507 */
508 if (unituse[PM_ISU0] &
509 (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_ISU1])) {
510 unituse[PM_ISU0_ALT] = 1; /* move ISU to TTM1 */
511 unituse[PM_ISU0] = 0;
512 }
513 /* Set TTM[01]SEL fields. */
514 ttmuse = 0;
515 for (i = PM_FPU; i <= PM_ISU1; ++i) {
516 if (!unituse[i])
517 continue;
518 if (ttmuse++)
519 return -1;
520 mmcr1 |= (u64)i << MMCR1_TTM0SEL_SH;
521 }
522 ttmuse = 0;
523 for (; i <= PM_GRS; ++i) {
524 if (!unituse[i])
525 continue;
526 if (ttmuse++)
527 return -1;
528 mmcr1 |= (u64)(i & 3) << MMCR1_TTM1SEL_SH;
529 }
530 if (ttmuse > 1)
531 return -1;
532
533 /* Set byte lane select fields, TTM[23]SEL and GRS_*SEL. */
534 for (byte = 0; byte < 4; ++byte) {
535 unit = busbyte[byte];
536 if (!unit)
537 continue;
538 if (unit == PM_ISU0 && unituse[PM_ISU0_ALT]) {
539 /* get ISU0 through TTM1 rather than TTM0 */
540 unit = PM_ISU0_ALT;
541 } else if (unit == PM_LSU1 + 1) {
542 /* select lower word of LSU1 for this byte */
543 mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte);
544 }
545 ttm = unit >> 2;
546 mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte);
547 }
548
549 /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
550 for (i = 0; i < n_ev; ++i) {
551 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
552 unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
553 byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
554 psel = event[i] & PM_PMCSEL_MSK;
555 isbus = event[i] & PM_BUSEVENT_MSK;
556 if (!pmc) {
557 /* Bus event or any-PMC direct event */
558 for (pmc = 0; pmc < 4; ++pmc) {
559 if (!(pmc_inuse & (1 << pmc)))
560 break;
561 }
562 if (pmc >= 4)
563 return -1;
564 pmc_inuse |= 1 << pmc;
565 } else if (pmc <= 4) {
566 /* Direct event */
567 --pmc;
568 if (isbus && (byte & 2) &&
569 (psel == 8 || psel == 0x10 || psel == 0x28))
570 /* add events on higher-numbered bus */
571 mmcr1 |= 1ull << (MMCR1_PMC1_ADDER_SEL_SH - pmc);
572 } else {
573 /* Instructions or run cycles on PMC5/6 */
574 --pmc;
575 }
576 if (isbus && unit == PM_GRS) {
577 bit = psel & 7;
578 grsel = (event[i] >> PM_GRS_SH) & PM_GRS_MSK;
579 mmcr1 |= (u64)grsel << grsel_shift[bit];
580 }
581 if (power5p_marked_instr_event(event[i]))
582 mmcra |= MMCRA_SAMPLE_ENABLE;
583 if ((psel & 0x58) == 0x40 && (byte & 1) != ((pmc >> 1) & 1))
584 /* select alternate byte lane */
585 psel |= 0x10;
586 if (pmc <= 3)
587 mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc);
588 hwc[i] = pmc;
589 }
590
591 /* Return MMCRx values */
592 mmcr[0] = 0;
593 if (pmc_inuse & 1)
594 mmcr[0] = MMCR0_PMC1CE;
595 if (pmc_inuse & 0x3e)
596 mmcr[0] |= MMCR0_PMCjCE;
597 mmcr[1] = mmcr1;
598 mmcr[2] = mmcra;
599 return 0;
600}
601
602static void power5p_disable_pmc(unsigned int pmc, u64 mmcr[])
603{
604 if (pmc <= 3)
605 mmcr[1] &= ~(0x7fUL << MMCR1_PMCSEL_SH(pmc));
606}
607
608static int power5p_generic_events[] = {
609 [PERF_COUNT_CPU_CYCLES] = 0xf,
610 [PERF_COUNT_INSTRUCTIONS] = 0x100009,
611 [PERF_COUNT_CACHE_REFERENCES] = 0x1c10a8, /* LD_REF_L1 */
612 [PERF_COUNT_CACHE_MISSES] = 0x3c1088, /* LD_MISS_L1 */
613 [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x230e4, /* BR_ISSUED */
614 [PERF_COUNT_BRANCH_MISSES] = 0x230e5, /* BR_MPRED_CR */
615};
616
617struct power_pmu power5p_pmu = {
618 .n_counter = 6,
619 .max_alternatives = MAX_ALT,
620 .add_fields = 0x7000000000055ull,
621 .test_adder = 0x3000040000000ull,
622 .compute_mmcr = power5p_compute_mmcr,
623 .get_constraint = power5p_get_constraint,
624 .get_alternatives = power5p_get_alternatives,
625 .disable_pmc = power5p_disable_pmc,
626 .n_generic = ARRAY_SIZE(power5p_generic_events),
627 .generic_events = power5p_generic_events,
628 .limited_pmc5_6 = 1,
629 .limited_pmc_event = power5p_limited_pmc_event,
630};
diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c
new file mode 100644
index 000000000000..6e667dc86470
--- /dev/null
+++ b/arch/powerpc/kernel/power5-pmu.c
@@ -0,0 +1,570 @@
1/*
2 * Performance counter support for POWER5 (not POWER5+/++) processors.
3 *
4 * Copyright 2009 Paul Mackerras, IBM Corporation.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/kernel.h>
12#include <linux/perf_counter.h>
13#include <asm/reg.h>
14
15/*
16 * Bits in event code for POWER5 (not POWER5++)
17 */
18#define PM_PMC_SH 20 /* PMC number (1-based) for direct events */
19#define PM_PMC_MSK 0xf
20#define PM_PMC_MSKS (PM_PMC_MSK << PM_PMC_SH)
21#define PM_UNIT_SH 16 /* TTMMUX number and setting - unit select */
22#define PM_UNIT_MSK 0xf
23#define PM_BYTE_SH 12 /* Byte number of event bus to use */
24#define PM_BYTE_MSK 7
25#define PM_GRS_SH 8 /* Storage subsystem mux select */
26#define PM_GRS_MSK 7
27#define PM_BUSEVENT_MSK 0x80 /* Set if event uses event bus */
28#define PM_PMCSEL_MSK 0x7f
29
30/* Values in PM_UNIT field */
31#define PM_FPU 0
32#define PM_ISU0 1
33#define PM_IFU 2
34#define PM_ISU1 3
35#define PM_IDU 4
36#define PM_ISU0_ALT 6
37#define PM_GRS 7
38#define PM_LSU0 8
39#define PM_LSU1 0xc
40#define PM_LASTUNIT 0xc
41
42/*
43 * Bits in MMCR1 for POWER5
44 */
45#define MMCR1_TTM0SEL_SH 62
46#define MMCR1_TTM1SEL_SH 60
47#define MMCR1_TTM2SEL_SH 58
48#define MMCR1_TTM3SEL_SH 56
49#define MMCR1_TTMSEL_MSK 3
50#define MMCR1_TD_CP_DBG0SEL_SH 54
51#define MMCR1_TD_CP_DBG1SEL_SH 52
52#define MMCR1_TD_CP_DBG2SEL_SH 50
53#define MMCR1_TD_CP_DBG3SEL_SH 48
54#define MMCR1_GRS_L2SEL_SH 46
55#define MMCR1_GRS_L2SEL_MSK 3
56#define MMCR1_GRS_L3SEL_SH 44
57#define MMCR1_GRS_L3SEL_MSK 3
58#define MMCR1_GRS_MCSEL_SH 41
59#define MMCR1_GRS_MCSEL_MSK 7
60#define MMCR1_GRS_FABSEL_SH 39
61#define MMCR1_GRS_FABSEL_MSK 3
62#define MMCR1_PMC1_ADDER_SEL_SH 35
63#define MMCR1_PMC2_ADDER_SEL_SH 34
64#define MMCR1_PMC3_ADDER_SEL_SH 33
65#define MMCR1_PMC4_ADDER_SEL_SH 32
66#define MMCR1_PMC1SEL_SH 25
67#define MMCR1_PMC2SEL_SH 17
68#define MMCR1_PMC3SEL_SH 9
69#define MMCR1_PMC4SEL_SH 1
70#define MMCR1_PMCSEL_SH(n) (MMCR1_PMC1SEL_SH - (n) * 8)
71#define MMCR1_PMCSEL_MSK 0x7f
72
73/*
74 * Bits in MMCRA
75 */
76
77/*
78 * Layout of constraint bits:
79 * 6666555555555544444444443333333333222222222211111111110000000000
80 * 3210987654321098765432109876543210987654321098765432109876543210
81 * <><>[ ><><>< ><> [ >[ >[ >< >< >< >< ><><><><><><>
82 * T0T1 NC G0G1G2 G3 UC PS1PS2 B0 B1 B2 B3 P6P5P4P3P2P1
83 *
84 * T0 - TTM0 constraint
85 * 54-55: TTM0SEL value (0=FPU, 2=IFU, 3=ISU1) 0xc0_0000_0000_0000
86 *
87 * T1 - TTM1 constraint
88 * 52-53: TTM1SEL value (0=IDU, 3=GRS) 0x30_0000_0000_0000
89 *
90 * NC - number of counters
91 * 51: NC error 0x0008_0000_0000_0000
92 * 48-50: number of events needing PMC1-4 0x0007_0000_0000_0000
93 *
94 * G0..G3 - GRS mux constraints
95 * 46-47: GRS_L2SEL value
96 * 44-45: GRS_L3SEL value
97 * 41-43: GRS_MCSEL value
98 * 39-40: GRS_FABSEL value
99 * Note that these match up with their bit positions in MMCR1
100 *
101 * UC - unit constraint: can't have all three of FPU|IFU|ISU1, ISU0, IDU|GRS
102 * 37: UC3 error 0x20_0000_0000
103 * 36: FPU|IFU|ISU1 events needed 0x10_0000_0000
104 * 35: ISU0 events needed 0x08_0000_0000
105 * 34: IDU|GRS events needed 0x04_0000_0000
106 *
107 * PS1
108 * 33: PS1 error 0x2_0000_0000
109 * 31-32: count of events needing PMC1/2 0x1_8000_0000
110 *
111 * PS2
112 * 30: PS2 error 0x4000_0000
113 * 28-29: count of events needing PMC3/4 0x3000_0000
114 *
115 * B0
116 * 24-27: Byte 0 event source 0x0f00_0000
117 * Encoding as for the event code
118 *
119 * B1, B2, B3
120 * 20-23, 16-19, 12-15: Byte 1, 2, 3 event sources
121 *
122 * P1..P6
123 * 0-11: Count of events needing PMC1..PMC6
124 */
125
126static const int grsel_shift[8] = {
127 MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH,
128 MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH,
129 MMCR1_GRS_MCSEL_SH, MMCR1_GRS_FABSEL_SH
130};
131
132/* Masks and values for using events from the various units */
133static u64 unit_cons[PM_LASTUNIT+1][2] = {
134 [PM_FPU] = { 0xc0002000000000ull, 0x00001000000000ull },
135 [PM_ISU0] = { 0x00002000000000ull, 0x00000800000000ull },
136 [PM_ISU1] = { 0xc0002000000000ull, 0xc0001000000000ull },
137 [PM_IFU] = { 0xc0002000000000ull, 0x80001000000000ull },
138 [PM_IDU] = { 0x30002000000000ull, 0x00000400000000ull },
139 [PM_GRS] = { 0x30002000000000ull, 0x30000400000000ull },
140};
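
/*
 * For example (illustrative), the PM_FPU entry above constrains the
 * TTM0SEL field (bits 54-55) and the UC error bit (37) in its mask,
 * and its value sets the "FPU|IFU|ISU1 events needed" bit (36),
 * matching the constraint layout described earlier.
 */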
141
142static int power5_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
143{
144 int pmc, byte, unit, sh;
145 int bit, fmask;
146 u64 mask = 0, value = 0;
147 int grp = -1;
148
149 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
150 if (pmc) {
151 if (pmc > 6)
152 return -1;
153 sh = (pmc - 1) * 2;
154 mask |= 2 << sh;
155 value |= 1 << sh;
156 if (pmc <= 4)
157 grp = (pmc - 1) >> 1;
158 else if (event != 0x500009 && event != 0x600005)
159 return -1;
160 }
161 if (event & PM_BUSEVENT_MSK) {
162 unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
163 if (unit > PM_LASTUNIT)
164 return -1;
165 if (unit == PM_ISU0_ALT)
166 unit = PM_ISU0;
167 mask |= unit_cons[unit][0];
168 value |= unit_cons[unit][1];
169 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
170 if (byte >= 4) {
171 if (unit != PM_LSU1)
172 return -1;
173 /* Map LSU1 low word (bytes 4-7) to unit LSU1+1 */
174 ++unit;
175 byte &= 3;
176 }
177 if (unit == PM_GRS) {
178 bit = event & 7;
179 fmask = (bit == 6)? 7: 3;
180 sh = grsel_shift[bit];
181 mask |= (u64)fmask << sh;
182 value |= (u64)((event >> PM_GRS_SH) & fmask) << sh;
183 }
184 /*
185 * Bus events on bytes 0 and 2 can be counted
186 * on PMC1/2; bytes 1 and 3 on PMC3/4.
187 */
188 if (!pmc)
189 grp = byte & 1;
190 /* Set byte lane select field */
191 mask |= 0xfULL << (24 - 4 * byte);
192 value |= (u64)unit << (24 - 4 * byte);
193 }
194 if (grp == 0) {
195 /* increment PMC1/2 field */
196 mask |= 0x200000000ull;
197 value |= 0x080000000ull;
198 } else if (grp == 1) {
199 /* increment PMC3/4 field */
200 mask |= 0x40000000ull;
201 value |= 0x10000000ull;
202 }
203 if (pmc < 5) {
204 /* need a counter from PMC1-4 set */
205 mask |= 0x8000000000000ull;
206 value |= 0x1000000000000ull;
207 }
208 *maskp = mask;
209 *valp = value;
210 return 0;
211}
212
213#define MAX_ALT 3 /* at most 3 alternatives for any event */
214
215static const unsigned int event_alternatives[][MAX_ALT] = {
216 { 0x120e4, 0x400002 }, /* PM_GRP_DISP_REJECT */
217 { 0x410c7, 0x441084 }, /* PM_THRD_L2MISS_BOTH_CYC */
218 { 0x100005, 0x600005 }, /* PM_RUN_CYC */
219 { 0x100009, 0x200009, 0x500009 }, /* PM_INST_CMPL */
220 { 0x300009, 0x400009 }, /* PM_INST_DISP */
221};
222
223/*
224 * Scan the alternatives table for a match and return the
225 * index into the alternatives table if found, else -1.
226 */
227static int find_alternative(unsigned int event)
228{
229 int i, j;
230
231 for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
232 if (event < event_alternatives[i][0])
233 break;
234 for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j)
235 if (event == event_alternatives[i][j])
236 return i;
237 }
238 return -1;
239}
240
241static const unsigned char bytedecode_alternatives[4][4] = {
242 /* PMC 1 */ { 0x21, 0x23, 0x25, 0x27 },
243 /* PMC 2 */ { 0x07, 0x17, 0x0e, 0x1e },
244 /* PMC 3 */ { 0x20, 0x22, 0x24, 0x26 },
245 /* PMC 4 */ { 0x07, 0x17, 0x0e, 0x1e }
246};
247
248/*
249 * Some direct events for decodes of event bus byte 3 have alternative
250 * PMCSEL values on other counters. This returns the alternative
251 * event code for those that do, or -1 otherwise.
252 */
253static int find_alternative_bdecode(unsigned int event)
254{
255 int pmc, altpmc, pp, j;
256
257 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
258 if (pmc == 0 || pmc > 4)
259 return -1;
260 altpmc = 5 - pmc; /* 1 <-> 4, 2 <-> 3 */
261 pp = event & PM_PMCSEL_MSK;
262 for (j = 0; j < 4; ++j) {
263 if (bytedecode_alternatives[pmc - 1][j] == pp) {
264 return (event & ~(PM_PMC_MSKS | PM_PMCSEL_MSK)) |
265 (altpmc << PM_PMC_SH) |
266 bytedecode_alternatives[altpmc - 1][j];
267 }
268 }
269 return -1;
270}
271
272static int power5_get_alternatives(unsigned int event, unsigned int flags,
273 unsigned int alt[])
274{
275 int i, j, ae, nalt = 1;
276
277 alt[0] = event;
278 nalt = 1;
279 i = find_alternative(event);
280 if (i >= 0) {
281 for (j = 0; j < MAX_ALT; ++j) {
282 ae = event_alternatives[i][j];
283 if (ae && ae != event)
284 alt[nalt++] = ae;
285 }
286 } else {
287 ae = find_alternative_bdecode(event);
288 if (ae > 0)
289 alt[nalt++] = ae;
290 }
291 return nalt;
292}
293
294/*
295 * Map of which direct events on which PMCs are marked instruction events.
296 * Indexed by PMCSEL value, bit i (LE) set if PMC i is a marked event.
297 * Bit 0 is set if it is marked for all PMCs.
298 * The 0x80 bit indicates a byte decode PMCSEL value.
299 */
300static unsigned char direct_event_is_marked[0x28] = {
301 0, /* 00 */
302 0x1f, /* 01 PM_IOPS_CMPL */
303 0x2, /* 02 PM_MRK_GRP_DISP */
304 0xe, /* 03 PM_MRK_ST_CMPL, PM_MRK_ST_GPS, PM_MRK_ST_CMPL_INT */
305 0, /* 04 */
306 0x1c, /* 05 PM_MRK_BRU_FIN, PM_MRK_INST_FIN, PM_MRK_CRU_FIN */
307 0x80, /* 06 */
308 0x80, /* 07 */
309 0, 0, 0,/* 08 - 0a */
310 0x18, /* 0b PM_THRESH_TIMEO, PM_MRK_GRP_TIMEO */
311 0, /* 0c */
312 0x80, /* 0d */
313 0x80, /* 0e */
314 0, /* 0f */
315 0, /* 10 */
316 0x14, /* 11 PM_MRK_GRP_BR_REDIR, PM_MRK_GRP_IC_MISS */
317 0, /* 12 */
318 0x10, /* 13 PM_MRK_GRP_CMPL */
319 0x1f, /* 14 PM_GRP_MRK, PM_MRK_{FXU,FPU,LSU}_FIN */
320 0x2, /* 15 PM_MRK_GRP_ISSUED */
321 0x80, /* 16 */
322 0x80, /* 17 */
323 0, 0, 0, 0, 0,
324 0x80, /* 1d */
325 0x80, /* 1e */
326 0, /* 1f */
327 0x80, /* 20 */
328 0x80, /* 21 */
329 0x80, /* 22 */
330 0x80, /* 23 */
331 0x80, /* 24 */
332 0x80, /* 25 */
333 0x80, /* 26 */
334 0x80, /* 27 */
335};
336
337/*
338 * Returns 1 if event counts things relating to marked instructions
339 * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
340 */
341static int power5_marked_instr_event(unsigned int event)
342{
343 int pmc, psel;
344 int bit, byte, unit;
345 u32 mask;
346
347 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
348 psel = event & PM_PMCSEL_MSK;
349 if (pmc >= 5)
350 return 0;
351
352 bit = -1;
353 if (psel < sizeof(direct_event_is_marked)) {
354 if (direct_event_is_marked[psel] & (1 << pmc))
355 return 1;
356 if (direct_event_is_marked[psel] & 0x80)
357 bit = 4;
358 else if (psel == 0x08)
359 bit = pmc - 1;
360 else if (psel == 0x10)
361 bit = 4 - pmc;
362 else if (psel == 0x1b && (pmc == 1 || pmc == 3))
363 bit = 4;
364 } else if ((psel & 0x58) == 0x40)
365 bit = psel & 7;
366
367 if (!(event & PM_BUSEVENT_MSK))
368 return 0;
369
370 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
371 unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
372 if (unit == PM_LSU0) {
373 /* byte 1 bits 0-7, byte 2 bits 0,2-4,6 */
374 mask = 0x5dff00;
375 } else if (unit == PM_LSU1 && byte >= 4) {
376 byte -= 4;
377 /* byte 4 bits 1,3,5,7, byte 5 bits 6-7, byte 7 bits 0-4,6 */
378 mask = 0x5f00c0aa;
379 } else
380 return 0;
381
382 return (mask >> (byte * 8 + bit)) & 1;
383}
384
385static int power5_compute_mmcr(unsigned int event[], int n_ev,
386 unsigned int hwc[], u64 mmcr[])
387{
388 u64 mmcr1 = 0;
389 u64 mmcra = 0;
390 unsigned int pmc, unit, byte, psel;
391 unsigned int ttm, grp;
392 int i, isbus, bit, grsel;
393 unsigned int pmc_inuse = 0;
394 unsigned int pmc_grp_use[2];
395 unsigned char busbyte[4];
396 unsigned char unituse[16];
397 int ttmuse;
398
399 if (n_ev > 6)
400 return -1;
401
402 /* First pass to count resource use */
403 pmc_grp_use[0] = pmc_grp_use[1] = 0;
404 memset(busbyte, 0, sizeof(busbyte));
405 memset(unituse, 0, sizeof(unituse));
406 for (i = 0; i < n_ev; ++i) {
407 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
408 if (pmc) {
409 if (pmc > 6)
410 return -1;
411 if (pmc_inuse & (1 << (pmc - 1)))
412 return -1;
413 pmc_inuse |= 1 << (pmc - 1);
414 /* count 1/2 vs 3/4 use */
415 if (pmc <= 4)
416 ++pmc_grp_use[(pmc - 1) >> 1];
417 }
418 if (event[i] & PM_BUSEVENT_MSK) {
419 unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
420 byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
421 if (unit > PM_LASTUNIT)
422 return -1;
423 if (unit == PM_ISU0_ALT)
424 unit = PM_ISU0;
425 if (byte >= 4) {
426 if (unit != PM_LSU1)
427 return -1;
428 ++unit;
429 byte &= 3;
430 }
431 if (!pmc)
432 ++pmc_grp_use[byte & 1];
433 if (busbyte[byte] && busbyte[byte] != unit)
434 return -1;
435 busbyte[byte] = unit;
436 unituse[unit] = 1;
437 }
438 }
439 if (pmc_grp_use[0] > 2 || pmc_grp_use[1] > 2)
440 return -1;
441
442 /*
443 * Assign resources and set multiplexer selects.
444 *
445 * PM_ISU0 can go either on TTM0 or TTM1, but that's the only
446 * choice we have to deal with.
447 */
448 if (unituse[PM_ISU0] &
449 (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_ISU1])) {
450 unituse[PM_ISU0_ALT] = 1; /* move ISU to TTM1 */
451 unituse[PM_ISU0] = 0;
452 }
453 /* Set TTM[01]SEL fields. */
454 ttmuse = 0;
455 for (i = PM_FPU; i <= PM_ISU1; ++i) {
456 if (!unituse[i])
457 continue;
458 if (ttmuse++)
459 return -1;
460 mmcr1 |= (u64)i << MMCR1_TTM0SEL_SH;
461 }
462 ttmuse = 0;
463 for (; i <= PM_GRS; ++i) {
464 if (!unituse[i])
465 continue;
466 if (ttmuse++)
467 return -1;
468 mmcr1 |= (u64)(i & 3) << MMCR1_TTM1SEL_SH;
469 }
470 if (ttmuse > 1)
471 return -1;
472
473 /* Set byte lane select fields, TTM[23]SEL and GRS_*SEL. */
474 for (byte = 0; byte < 4; ++byte) {
475 unit = busbyte[byte];
476 if (!unit)
477 continue;
478 if (unit == PM_ISU0 && unituse[PM_ISU0_ALT]) {
479 /* get ISU0 through TTM1 rather than TTM0 */
480 unit = PM_ISU0_ALT;
481 } else if (unit == PM_LSU1 + 1) {
482 /* select lower word of LSU1 for this byte */
483 mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte);
484 }
485 ttm = unit >> 2;
486 mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte);
487 }
488
489 /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
490 for (i = 0; i < n_ev; ++i) {
491 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
492 unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
493 byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
494 psel = event[i] & PM_PMCSEL_MSK;
495 isbus = event[i] & PM_BUSEVENT_MSK;
496 if (!pmc) {
497 /* Bus event or any-PMC direct event */
498 for (pmc = 0; pmc < 4; ++pmc) {
499 if (pmc_inuse & (1 << pmc))
500 continue;
501 grp = (pmc >> 1) & 1;
502 if (isbus) {
503 if (grp == (byte & 1))
504 break;
505 } else if (pmc_grp_use[grp] < 2) {
506 ++pmc_grp_use[grp];
507 break;
508 }
509 }
510 pmc_inuse |= 1 << pmc;
511 } else if (pmc <= 4) {
512 /* Direct event */
513 --pmc;
514 if ((psel == 8 || psel == 0x10) && isbus && (byte & 2))
515 /* add events on higher-numbered bus */
516 mmcr1 |= 1ull << (MMCR1_PMC1_ADDER_SEL_SH - pmc);
517 } else {
518 /* Instructions or run cycles on PMC5/6 */
519 --pmc;
520 }
521 if (isbus && unit == PM_GRS) {
522 bit = psel & 7;
523 grsel = (event[i] >> PM_GRS_SH) & PM_GRS_MSK;
524 mmcr1 |= (u64)grsel << grsel_shift[bit];
525 }
526 if (power5_marked_instr_event(event[i]))
527 mmcra |= MMCRA_SAMPLE_ENABLE;
528 if (pmc <= 3)
529 mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc);
530 hwc[i] = pmc;
531 }
532
533 /* Return MMCRx values */
534 mmcr[0] = 0;
535 if (pmc_inuse & 1)
536 mmcr[0] = MMCR0_PMC1CE;
537 if (pmc_inuse & 0x3e)
538 mmcr[0] |= MMCR0_PMCjCE;
539 mmcr[1] = mmcr1;
540 mmcr[2] = mmcra;
541 return 0;
542}
543
544static void power5_disable_pmc(unsigned int pmc, u64 mmcr[])
545{
546 if (pmc <= 3)
547 mmcr[1] &= ~(0x7fUL << MMCR1_PMCSEL_SH(pmc));
548}
549
550static int power5_generic_events[] = {
551 [PERF_COUNT_CPU_CYCLES] = 0xf,
552 [PERF_COUNT_INSTRUCTIONS] = 0x100009,
553 [PERF_COUNT_CACHE_REFERENCES] = 0x4c1090, /* LD_REF_L1 */
554 [PERF_COUNT_CACHE_MISSES] = 0x3c1088, /* LD_MISS_L1 */
555 [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x230e4, /* BR_ISSUED */
556 [PERF_COUNT_BRANCH_MISSES] = 0x230e5, /* BR_MPRED_CR */
557};
558
559struct power_pmu power5_pmu = {
560 .n_counter = 6,
561 .max_alternatives = MAX_ALT,
562 .add_fields = 0x7000090000555ull,
563 .test_adder = 0x3000490000000ull,
564 .compute_mmcr = power5_compute_mmcr,
565 .get_constraint = power5_get_constraint,
566 .get_alternatives = power5_get_alternatives,
567 .disable_pmc = power5_disable_pmc,
568 .n_generic = ARRAY_SIZE(power5_generic_events),
569 .generic_events = power5_generic_events,
570};
diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c
new file mode 100644
index 000000000000..d44049f0ae27
--- /dev/null
+++ b/arch/powerpc/kernel/power6-pmu.c
@@ -0,0 +1,490 @@
1/*
2 * Performance counter support for POWER6 processors.
3 *
4 * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/kernel.h>
12#include <linux/perf_counter.h>
13#include <asm/reg.h>
14
15/*
16 * Bits in event code for POWER6
17 */
18#define PM_PMC_SH 20 /* PMC number (1-based) for direct events */
19#define PM_PMC_MSK 0x7
20#define PM_PMC_MSKS (PM_PMC_MSK << PM_PMC_SH)
21#define PM_UNIT_SH	16	/* Unit event comes from (TTMxSEL encoding) */
22#define PM_UNIT_MSK 0xf
23#define PM_UNIT_MSKS (PM_UNIT_MSK << PM_UNIT_SH)
24#define PM_LLAV 0x8000 /* Load lookahead match value */
25#define PM_LLA 0x4000 /* Load lookahead match enable */
26#define PM_BYTE_SH 12 /* Byte of event bus to use */
27#define PM_BYTE_MSK 3
28#define PM_SUBUNIT_SH 8 /* Subunit event comes from (NEST_SEL enc.) */
29#define PM_SUBUNIT_MSK 7
30#define PM_SUBUNIT_MSKS (PM_SUBUNIT_MSK << PM_SUBUNIT_SH)
31#define PM_PMCSEL_MSK 0xff /* PMCxSEL value */
32#define PM_BUSEVENT_MSK 0xf3700
33
34/*
35 * Bits in MMCR1 for POWER6
36 */
37#define MMCR1_TTM0SEL_SH 60
38#define MMCR1_TTMSEL_SH(n) (MMCR1_TTM0SEL_SH - (n) * 4)
39#define MMCR1_TTMSEL_MSK 0xf
40#define MMCR1_TTMSEL(m, n) (((m) >> MMCR1_TTMSEL_SH(n)) & MMCR1_TTMSEL_MSK)
41#define MMCR1_NESTSEL_SH 45
42#define MMCR1_NESTSEL_MSK 0x7
43#define MMCR1_NESTSEL(m) (((m) >> MMCR1_NESTSEL_SH) & MMCR1_NESTSEL_MSK)
44#define MMCR1_PMC1_LLA ((u64)1 << 44)
45#define MMCR1_PMC1_LLA_VALUE ((u64)1 << 39)
46#define MMCR1_PMC1_ADDR_SEL ((u64)1 << 35)
47#define MMCR1_PMC1SEL_SH 24
48#define MMCR1_PMCSEL_SH(n) (MMCR1_PMC1SEL_SH - (n) * 8)
49#define MMCR1_PMCSEL_MSK 0xff
50
51/*
52 * Map of which direct events on which PMCs are marked instruction events.
53 * Indexed by PMCSEL value >> 1.
54 * Bottom 4 bits are a map of which PMCs are interesting,
55 * top 4 bits say what sort of event:
56 * 0 = direct marked event,
57 * 1 = byte decode event,
58 * 4 = add/and event (PMC1 -> bits 0 & 4),
59 * 5 = add/and event (PMC1 -> bits 1 & 5),
60 * 6 = add/and event (PMC1 -> bits 2 & 6),
61 * 7 = add/and event (PMC1 -> bits 3 & 7).
62 */
63static unsigned char direct_event_is_marked[0x60 >> 1] = {
64 0, /* 00 */
65 0, /* 02 */
66 0, /* 04 */
67 0x07, /* 06 PM_MRK_ST_CMPL, PM_MRK_ST_GPS, PM_MRK_ST_CMPL_INT */
68 0x04, /* 08 PM_MRK_DFU_FIN */
69 0x06, /* 0a PM_MRK_IFU_FIN, PM_MRK_INST_FIN */
70 0, /* 0c */
71 0, /* 0e */
72 0x02, /* 10 PM_MRK_INST_DISP */
73 0x08, /* 12 PM_MRK_LSU_DERAT_MISS */
74 0, /* 14 */
75 0, /* 16 */
76 0x0c, /* 18 PM_THRESH_TIMEO, PM_MRK_INST_FIN */
77 0x0f, /* 1a PM_MRK_INST_DISP, PM_MRK_{FXU,FPU,LSU}_FIN */
78 0x01, /* 1c PM_MRK_INST_ISSUED */
79 0, /* 1e */
80 0, /* 20 */
81 0, /* 22 */
82 0, /* 24 */
83 0, /* 26 */
84 0x15, /* 28 PM_MRK_DATA_FROM_L2MISS, PM_MRK_DATA_FROM_L3MISS */
85 0, /* 2a */
86 0, /* 2c */
87 0, /* 2e */
88 0x4f, /* 30 */
89 0x7f, /* 32 */
90 0x4f, /* 34 */
91 0x5f, /* 36 */
92 0x6f, /* 38 */
93 0x4f, /* 3a */
94 0, /* 3c */
95 0x08, /* 3e PM_MRK_INST_TIMEO */
96 0x1f, /* 40 */
97 0x1f, /* 42 */
98 0x1f, /* 44 */
99 0x1f, /* 46 */
100 0x1f, /* 48 */
101 0x1f, /* 4a */
102 0x1f, /* 4c */
103 0x1f, /* 4e */
104 0, /* 50 */
105 0x05, /* 52 PM_MRK_BR_TAKEN, PM_MRK_BR_MPRED */
106 0x1c, /* 54 PM_MRK_PTEG_FROM_L3MISS, PM_MRK_PTEG_FROM_L2MISS */
107 0x02, /* 56 PM_MRK_LD_MISS_L1 */
108 0, /* 58 */
109 0, /* 5a */
110 0, /* 5c */
111 0, /* 5e */
112};
113
114/*
115 * Masks showing for each unit which bits are marked events.
116 * These masks are in LE order, i.e. 0x00000001 is byte 0, bit 0.
117 */
118static u32 marked_bus_events[16] = {
119 0x01000000, /* direct events set 1: byte 3 bit 0 */
120 0x00010000, /* direct events set 2: byte 2 bit 0 */
121 0, 0, 0, 0, /* IDU, IFU, nest: nothing */
122 0x00000088, /* VMX set 1: byte 0 bits 3, 7 */
123 0x000000c0, /* VMX set 2: byte 0 bits 4-7 */
124 0x04010000, /* LSU set 1: byte 2 bit 0, byte 3 bit 2 */
125 0xff010000u, /* LSU set 2: byte 2 bit 0, all of byte 3 */
126 0, /* LSU set 3 */
127 0x00000010, /* VMX set 3: byte 0 bit 4 */
128 0, /* BFP set 1 */
129 0x00000022, /* BFP set 2: byte 0 bits 1, 5 */
130 0, 0
131};
132
133/*
134 * Returns 1 if event counts things relating to marked instructions
135 * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
136 */
137static int power6_marked_instr_event(unsigned int event)
138{
139 int pmc, psel, ptype;
140 int bit, byte, unit;
141 u32 mask;
142
143 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
144 psel = (event & PM_PMCSEL_MSK) >> 1; /* drop edge/level bit */
145 if (pmc >= 5)
146 return 0;
147
148 bit = -1;
149 if (psel < sizeof(direct_event_is_marked)) {
150 ptype = direct_event_is_marked[psel];
151 if (pmc == 0 || !(ptype & (1 << (pmc - 1))))
152 return 0;
153 ptype >>= 4;
154 if (ptype == 0)
155 return 1;
156 if (ptype == 1)
157 bit = 0;
158 else
159 bit = ptype ^ (pmc - 1);
160 } else if ((psel & 0x48) == 0x40)
161 bit = psel & 7;
162
163 if (!(event & PM_BUSEVENT_MSK) || bit == -1)
164 return 0;
165
166 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
167 unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
168 mask = marked_bus_events[unit];
169 return (mask >> (byte * 8 + bit)) & 1;
170}
171
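The POWER6 event codes used throughout this file pack the PMC number, unit, event-bus byte and PMCSEL value into one word using the shifts defined at the top of the file. A small standalone decoder, handy for checking encodings by hand (illustrative only, not part of the patch):

#include <stdio.h>

/* Field layout for POWER6 event codes, as defined above */
#define PM_PMC_SH	20
#define PM_PMC_MSK	0x7
#define PM_UNIT_SH	16
#define PM_UNIT_MSK	0xf
#define PM_BYTE_SH	12
#define PM_BYTE_MSK	3
#define PM_PMCSEL_MSK	0xff

static void decode_event(unsigned int ev)
{
	printf("event %#x: pmc=%u unit=%u byte=%u psel=%#x\n", ev,
	       (ev >> PM_PMC_SH) & PM_PMC_MSK,
	       (ev >> PM_UNIT_SH) & PM_UNIT_MSK,
	       (ev >> PM_BYTE_SH) & PM_BYTE_MSK,
	       ev & PM_PMCSEL_MSK);
}

int main(void)
{
	decode_event(0x10000a);	/* PM_RUN_CYC: pmc=1, psel=0x0a */
	decode_event(0x30000c);	/* a PM_LD_MISS_L1 encoding: pmc=3, psel=0x0c */
	return 0;
}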
172/*
173 * Assign PMC numbers and compute MMCR1 value for a set of events
174 */
175static int p6_compute_mmcr(unsigned int event[], int n_ev,
176 unsigned int hwc[], u64 mmcr[])
177{
178 u64 mmcr1 = 0;
179 u64 mmcra = 0;
180 int i;
181 unsigned int pmc, ev, b, u, s, psel;
182 unsigned int ttmset = 0;
183 unsigned int pmc_inuse = 0;
184
185 if (n_ev > 6)
186 return -1;
187 for (i = 0; i < n_ev; ++i) {
188 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
189 if (pmc) {
190 if (pmc_inuse & (1 << (pmc - 1)))
191 return -1; /* collision! */
192 pmc_inuse |= 1 << (pmc - 1);
193 }
194 }
195 for (i = 0; i < n_ev; ++i) {
196 ev = event[i];
197 pmc = (ev >> PM_PMC_SH) & PM_PMC_MSK;
198 if (pmc) {
199 --pmc;
200 } else {
201 /* can go on any PMC; find a free one */
202 for (pmc = 0; pmc < 4; ++pmc)
203 if (!(pmc_inuse & (1 << pmc)))
204 break;
205 if (pmc >= 4)
206 return -1;
207 pmc_inuse |= 1 << pmc;
208 }
209 hwc[i] = pmc;
210 psel = ev & PM_PMCSEL_MSK;
211 if (ev & PM_BUSEVENT_MSK) {
212 /* this event uses the event bus */
213 b = (ev >> PM_BYTE_SH) & PM_BYTE_MSK;
214 u = (ev >> PM_UNIT_SH) & PM_UNIT_MSK;
215 /* check for conflict on this byte of event bus */
216 if ((ttmset & (1 << b)) && MMCR1_TTMSEL(mmcr1, b) != u)
217 return -1;
218 mmcr1 |= (u64)u << MMCR1_TTMSEL_SH(b);
219 ttmset |= 1 << b;
220 if (u == 5) {
221 /* Nest events have a further mux */
222 s = (ev >> PM_SUBUNIT_SH) & PM_SUBUNIT_MSK;
223 if ((ttmset & 0x10) &&
224 MMCR1_NESTSEL(mmcr1) != s)
225 return -1;
226 ttmset |= 0x10;
227 mmcr1 |= (u64)s << MMCR1_NESTSEL_SH;
228 }
229 if (0x30 <= psel && psel <= 0x3d) {
230 /* these need the PMCx_ADDR_SEL bits */
231 if (b >= 2)
232 mmcr1 |= MMCR1_PMC1_ADDR_SEL >> pmc;
233 }
234 /* bus select values are different for PMC3/4 */
235 if (pmc >= 2 && (psel & 0x90) == 0x80)
236 psel ^= 0x20;
237 }
238 if (ev & PM_LLA) {
239 mmcr1 |= MMCR1_PMC1_LLA >> pmc;
240 if (ev & PM_LLAV)
241 mmcr1 |= MMCR1_PMC1_LLA_VALUE >> pmc;
242 }
243 if (power6_marked_instr_event(event[i]))
244 mmcra |= MMCRA_SAMPLE_ENABLE;
245 if (pmc < 4)
246 mmcr1 |= (u64)psel << MMCR1_PMCSEL_SH(pmc);
247 }
248 mmcr[0] = 0;
249 if (pmc_inuse & 1)
250 mmcr[0] = MMCR0_PMC1CE;
251 if (pmc_inuse & 0xe)
252 mmcr[0] |= MMCR0_PMCjCE;
253 mmcr[1] = mmcr1;
254 mmcr[2] = mmcra;
255 return 0;
256}
257
258/*
259 * Layout of constraint bits:
260 *
261 * 0-1 add field: number of uses of PMC1 (max 1)
262 * 2-3, 4-5, 6-7, 8-9, 10-11: ditto for PMC2, 3, 4, 5, 6
263 * 12-15 add field: number of uses of PMC1-4 (max 4)
264 * 16-19 select field: unit on byte 0 of event bus
265 * 20-23, 24-27, 28-31 ditto for bytes 1, 2, 3
266 * 32-34 select field: nest (subunit) event selector
267 */
268static int p6_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
269{
270 int pmc, byte, sh, subunit;
271 u64 mask = 0, value = 0;
272
273 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
274 if (pmc) {
275 if (pmc > 4 && !(event == 0x500009 || event == 0x600005))
276 return -1;
277 sh = (pmc - 1) * 2;
278 mask |= 2 << sh;
279 value |= 1 << sh;
280 }
281 if (event & PM_BUSEVENT_MSK) {
282 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
283 sh = byte * 4 + (16 - PM_UNIT_SH);
284 mask |= PM_UNIT_MSKS << sh;
285 value |= (u64)(event & PM_UNIT_MSKS) << sh;
286 if ((event & PM_UNIT_MSKS) == (5 << PM_UNIT_SH)) {
287 subunit = (event >> PM_SUBUNIT_SH) & PM_SUBUNIT_MSK;
288 mask |= (u64)PM_SUBUNIT_MSK << 32;
289 value |= (u64)subunit << 32;
290 }
291 }
292 if (pmc <= 4) {
293 mask |= 0x8000; /* add field for count of PMC1-4 uses */
294 value |= 0x1000;
295 }
296 *maskp = mask;
297 *valp = value;
298 return 0;
299}
300
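A hand-worked example of the constraint layout above, computed for two of the PM_LD_MISS_L1 encodings listed in event_alternatives[] further down. This is a sketch only; the kernel's actual constraint solver lives elsewhere in the PowerPC perf_counter core.

#include <stdio.h>
#include <stdint.h>

/*
 * Constraint words for two PM_LD_MISS_L1 encodings (0x080080 and
 * 0x4000f0), computed by hand from p6_get_constraint() above.
 * Illustrative only.
 */
int main(void)
{
	/* 0x080080: any-PMC bus event, unit 8 on byte 0, PMCSEL 0x80 */
	uint64_t mask_a  = 0xf0000 | 0x8000;	/* byte-0 unit select + PMC1-4 count */
	uint64_t value_a = 0x80000 | 0x1000;	/* unit 8 in bits 16-19, one PMC1-4 use */

	/* 0x4000f0: direct event on PMC4, PMCSEL 0xf0, not a bus event */
	uint64_t mask_b  = (2 << 6) | 0x8000;	/* PMC4 use-count field + PMC1-4 count */
	uint64_t value_b = (1 << 6) | 0x1000;

	printf("0x080080 -> mask %#llx value %#llx\n",
	       (unsigned long long)mask_a, (unsigned long long)value_a);
	printf("0x4000f0 -> mask %#llx value %#llx\n",
	       (unsigned long long)mask_b, (unsigned long long)value_b);
	return 0;
}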
301static int p6_limited_pmc_event(unsigned int event)
302{
303 int pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
304
305 return pmc == 5 || pmc == 6;
306}
307
308#define MAX_ALT 4 /* at most 4 alternatives for any event */
309
310static const unsigned int event_alternatives[][MAX_ALT] = {
311 { 0x0130e8, 0x2000f6, 0x3000fc }, /* PM_PTEG_RELOAD_VALID */
312 { 0x080080, 0x10000d, 0x30000c, 0x4000f0 }, /* PM_LD_MISS_L1 */
313 { 0x080088, 0x200054, 0x3000f0 }, /* PM_ST_MISS_L1 */
314 { 0x10000a, 0x2000f4, 0x600005 }, /* PM_RUN_CYC */
315 { 0x10000b, 0x2000f5 }, /* PM_RUN_COUNT */
316 { 0x10000e, 0x400010 }, /* PM_PURR */
317 { 0x100010, 0x4000f8 }, /* PM_FLUSH */
318 { 0x10001a, 0x200010 }, /* PM_MRK_INST_DISP */
319 { 0x100026, 0x3000f8 }, /* PM_TB_BIT_TRANS */
320 { 0x100054, 0x2000f0 }, /* PM_ST_FIN */
321 { 0x100056, 0x2000fc }, /* PM_L1_ICACHE_MISS */
322 { 0x1000f0, 0x40000a }, /* PM_INST_IMC_MATCH_CMPL */
323 { 0x1000f8, 0x200008 }, /* PM_GCT_EMPTY_CYC */
324 { 0x1000fc, 0x400006 }, /* PM_LSU_DERAT_MISS_CYC */
325 { 0x20000e, 0x400007 }, /* PM_LSU_DERAT_MISS */
326 { 0x200012, 0x300012 }, /* PM_INST_DISP */
327 { 0x2000f2, 0x3000f2 }, /* PM_INST_DISP */
328 { 0x2000f8, 0x300010 }, /* PM_EXT_INT */
329 { 0x2000fe, 0x300056 }, /* PM_DATA_FROM_L2MISS */
330 { 0x2d0030, 0x30001a }, /* PM_MRK_FPU_FIN */
331 { 0x30000a, 0x400018 }, /* PM_MRK_INST_FIN */
332 { 0x3000f6, 0x40000e }, /* PM_L1_DCACHE_RELOAD_VALID */
333 { 0x3000fe, 0x400056 }, /* PM_DATA_FROM_L3MISS */
334};
335
336/*
337 * This could be made more efficient with a binary search on
338 * a presorted list, if necessary
339 */
340static int find_alternatives_list(unsigned int event)
341{
342 int i, j;
343 unsigned int alt;
344
345 for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
346 if (event < event_alternatives[i][0])
347 return -1;
348 for (j = 0; j < MAX_ALT; ++j) {
349 alt = event_alternatives[i][j];
350 if (!alt || event < alt)
351 break;
352 if (event == alt)
353 return i;
354 }
355 }
356 return -1;
357}
358
359static int p6_get_alternatives(unsigned int event, unsigned int flags,
360 unsigned int alt[])
361{
362 int i, j, nlim;
363 unsigned int aevent, psel, pmc;
364 unsigned int nalt = 1;
365
366 alt[0] = event;
367 nlim = p6_limited_pmc_event(event);
368
369 /* check the alternatives table */
370 i = find_alternatives_list(event);
371 if (i >= 0) {
372 /* copy out alternatives from list */
373 for (j = 0; j < MAX_ALT; ++j) {
374 aevent = event_alternatives[i][j];
375 if (!aevent)
376 break;
377 if (aevent != event)
378 alt[nalt++] = aevent;
379 nlim += p6_limited_pmc_event(aevent);
380 }
381
382 } else {
383 /* Check for alternative ways of computing sum events */
384 /* PMCSEL 0x32 counter N == PMCSEL 0x34 counter 5-N */
385 psel = event & (PM_PMCSEL_MSK & ~1); /* ignore edge bit */
386 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
387 if (pmc && (psel == 0x32 || psel == 0x34))
388 alt[nalt++] = ((event ^ 0x6) & ~PM_PMC_MSKS) |
389 ((5 - pmc) << PM_PMC_SH);
390
391 /* PMCSEL 0x38 counter N == PMCSEL 0x3a counter N+/-2 */
392 if (pmc && (psel == 0x38 || psel == 0x3a))
393 alt[nalt++] = ((event ^ 0x2) & ~PM_PMC_MSKS) |
394 ((pmc > 2? pmc - 2: pmc + 2) << PM_PMC_SH);
395 }
396
397 if (flags & PPMU_ONLY_COUNT_RUN) {
398 /*
399 * We're only counting in RUN state,
400 * so PM_CYC is equivalent to PM_RUN_CYC,
401 * PM_INST_CMPL === PM_RUN_INST_CMPL, PM_PURR === PM_RUN_PURR.
402 * This doesn't include alternatives that don't provide
403 * any extra flexibility in assigning PMCs (e.g.
404 * 0x10000a for PM_RUN_CYC vs. 0x1e for PM_CYC).
405 * Note that even with these additional alternatives
406 * we never end up with more than 4 alternatives for any event.
407 */
408 j = nalt;
409 for (i = 0; i < nalt; ++i) {
410 switch (alt[i]) {
411 case 0x1e: /* PM_CYC */
412 alt[j++] = 0x600005; /* PM_RUN_CYC */
413 ++nlim;
414 break;
415 case 0x10000a: /* PM_RUN_CYC */
416 alt[j++] = 0x1e; /* PM_CYC */
417 break;
418 case 2: /* PM_INST_CMPL */
419 alt[j++] = 0x500009; /* PM_RUN_INST_CMPL */
420 ++nlim;
421 break;
422 case 0x500009: /* PM_RUN_INST_CMPL */
423 alt[j++] = 2; /* PM_INST_CMPL */
424 break;
425 case 0x10000e: /* PM_PURR */
426 alt[j++] = 0x4000f4; /* PM_RUN_PURR */
427 break;
428 case 0x4000f4: /* PM_RUN_PURR */
429 alt[j++] = 0x10000e; /* PM_PURR */
430 break;
431 }
432 }
433 nalt = j;
434 }
435
436 if (!(flags & PPMU_LIMITED_PMC_OK) && nlim) {
437 /* remove the limited PMC events */
438 j = 0;
439 for (i = 0; i < nalt; ++i) {
440 if (!p6_limited_pmc_event(alt[i])) {
441 alt[j] = alt[i];
442 ++j;
443 }
444 }
445 nalt = j;
446 } else if ((flags & PPMU_LIMITED_PMC_REQD) && nlim < nalt) {
447 /* remove all but the limited PMC events */
448 j = 0;
449 for (i = 0; i < nalt; ++i) {
450 if (p6_limited_pmc_event(alt[i])) {
451 alt[j] = alt[i];
452 ++j;
453 }
454 }
455 nalt = j;
456 }
457
458 return nalt;
459}
460
461static void p6_disable_pmc(unsigned int pmc, u64 mmcr[])
462{
463 /* Set PMCxSEL to 0 to disable PMCx */
464 if (pmc <= 3)
465 mmcr[1] &= ~(0xffUL << MMCR1_PMCSEL_SH(pmc));
466}
467
468static int power6_generic_events[] = {
469 [PERF_COUNT_CPU_CYCLES] = 0x1e,
470 [PERF_COUNT_INSTRUCTIONS] = 2,
471 [PERF_COUNT_CACHE_REFERENCES] = 0x280030, /* LD_REF_L1 */
472 [PERF_COUNT_CACHE_MISSES] = 0x30000c, /* LD_MISS_L1 */
473 [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x410a0, /* BR_PRED */
474 [PERF_COUNT_BRANCH_MISSES] = 0x400052, /* BR_MPRED */
475};
476
477struct power_pmu power6_pmu = {
478 .n_counter = 6,
479 .max_alternatives = MAX_ALT,
480 .add_fields = 0x1555,
481 .test_adder = 0x3000,
482 .compute_mmcr = p6_compute_mmcr,
483 .get_constraint = p6_get_constraint,
484 .get_alternatives = p6_get_alternatives,
485 .disable_pmc = p6_disable_pmc,
486 .n_generic = ARRAY_SIZE(power6_generic_events),
487 .generic_events = power6_generic_events,
488 .limited_pmc5_6 = 1,
489 .limited_pmc_event = p6_limited_pmc_event,
490};
diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c
new file mode 100644
index 000000000000..af2d1884058c
--- /dev/null
+++ b/arch/powerpc/kernel/ppc970-pmu.c
@@ -0,0 +1,442 @@
1/*
2 * Performance counter support for PPC970-family processors.
3 *
4 * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/string.h>
12#include <linux/perf_counter.h>
13#include <asm/reg.h>
14
15/*
16 * Bits in event code for PPC970
17 */
18#define PM_PMC_SH 12 /* PMC number (1-based) for direct events */
19#define PM_PMC_MSK 0xf
20#define PM_UNIT_SH 8 /* TTMMUX number and setting - unit select */
21#define PM_UNIT_MSK 0xf
22#define PM_SPCSEL_SH 6
23#define PM_SPCSEL_MSK 3
24#define PM_BYTE_SH 4 /* Byte number of event bus to use */
25#define PM_BYTE_MSK 3
26#define PM_PMCSEL_MSK 0xf
27
28/* Values in PM_UNIT field */
29#define PM_NONE 0
30#define PM_FPU 1
31#define PM_VPU 2
32#define PM_ISU 3
33#define PM_IFU 4
34#define PM_IDU 5
35#define PM_STS 6
36#define PM_LSU0 7
37#define PM_LSU1U 8
38#define PM_LSU1L 9
39#define PM_LASTUNIT 9
40
41/*
42 * Bits in MMCR0 for PPC970
43 */
44#define MMCR0_PMC1SEL_SH 8
45#define MMCR0_PMC2SEL_SH 1
46#define MMCR_PMCSEL_MSK 0x1f
47
48/*
49 * Bits in MMCR1 for PPC970
50 */
51#define MMCR1_TTM0SEL_SH 62
52#define MMCR1_TTM1SEL_SH 59
53#define MMCR1_TTM3SEL_SH 53
54#define MMCR1_TTMSEL_MSK 3
55#define MMCR1_TD_CP_DBG0SEL_SH 50
56#define MMCR1_TD_CP_DBG1SEL_SH 48
57#define MMCR1_TD_CP_DBG2SEL_SH 46
58#define MMCR1_TD_CP_DBG3SEL_SH 44
59#define MMCR1_PMC1_ADDER_SEL_SH 39
60#define MMCR1_PMC2_ADDER_SEL_SH 38
61#define MMCR1_PMC6_ADDER_SEL_SH 37
62#define MMCR1_PMC5_ADDER_SEL_SH 36
63#define MMCR1_PMC8_ADDER_SEL_SH 35
64#define MMCR1_PMC7_ADDER_SEL_SH 34
65#define MMCR1_PMC3_ADDER_SEL_SH 33
66#define MMCR1_PMC4_ADDER_SEL_SH 32
67#define MMCR1_PMC3SEL_SH 27
68#define MMCR1_PMC4SEL_SH 22
69#define MMCR1_PMC5SEL_SH 17
70#define MMCR1_PMC6SEL_SH 12
71#define MMCR1_PMC7SEL_SH 7
72#define MMCR1_PMC8SEL_SH 2
73
74static short mmcr1_adder_bits[8] = {
75 MMCR1_PMC1_ADDER_SEL_SH,
76 MMCR1_PMC2_ADDER_SEL_SH,
77 MMCR1_PMC3_ADDER_SEL_SH,
78 MMCR1_PMC4_ADDER_SEL_SH,
79 MMCR1_PMC5_ADDER_SEL_SH,
80 MMCR1_PMC6_ADDER_SEL_SH,
81 MMCR1_PMC7_ADDER_SEL_SH,
82 MMCR1_PMC8_ADDER_SEL_SH
83};
84
85/*
86 * Bits in MMCRA
87 */
88
89/*
90 * Layout of constraint bits:
91 * 6666555555555544444444443333333333222222222211111111110000000000
92 * 3210987654321098765432109876543210987654321098765432109876543210
93 * <><><>[ >[ >[ >< >< >< >< ><><><><><><><><>
94 * SPT0T1 UC PS1 PS2 B0 B1 B2 B3 P1P2P3P4P5P6P7P8
95 *
96 * SP - SPCSEL constraint
97 * 48-49: SPCSEL value 0x3_0000_0000_0000
98 *
99 * T0 - TTM0 constraint
100 * 46-47: TTM0SEL value (0=FPU, 2=IFU, 3=VPU) 0xC000_0000_0000
101 *
102 * T1 - TTM1 constraint
103 * 44-45: TTM1SEL value (0=IDU, 3=STS) 0x3000_0000_0000
104 *
105 * UC - unit constraint: can't have all three of FPU|IFU|VPU, ISU, IDU|STS
106 * 43: UC3 error 0x0800_0000_0000
107 * 42: FPU|IFU|VPU events needed 0x0400_0000_0000
108 * 41: ISU events needed 0x0200_0000_0000
109 * 40: IDU|STS events needed 0x0100_0000_0000
110 *
111 * PS1
112 * 39: PS1 error 0x0080_0000_0000
113 * 36-38: count of events needing PMC1/2/5/6 0x0070_0000_0000
114 *
115 * PS2
116 * 35: PS2 error 0x0008_0000_0000
117 * 32-34: count of events needing PMC3/4/7/8 0x0007_0000_0000
118 *
119 * B0
120 * 28-31: Byte 0 event source 0xf000_0000
121 * Encoding as for the event code
122 *
123 * B1, B2, B3
124 * 24-27, 20-23, 16-19: Byte 1, 2, 3 event sources
125 *
126 * P1
127 * 15: P1 error 0x8000
128 * 14-15: Count of events needing PMC1
129 *
130 * P2..P8
131 * 0-13: Count of events needing PMC2..PMC8
132 */
133
134static unsigned char direct_marked_event[8] = {
135 (1<<2) | (1<<3), /* PMC1: PM_MRK_GRP_DISP, PM_MRK_ST_CMPL */
136 (1<<3) | (1<<5), /* PMC2: PM_THRESH_TIMEO, PM_MRK_BRU_FIN */
137 (1<<3) | (1<<5), /* PMC3: PM_MRK_ST_CMPL_INT, PM_MRK_VMX_FIN */
138 (1<<4) | (1<<5), /* PMC4: PM_MRK_GRP_CMPL, PM_MRK_CRU_FIN */
139 (1<<4) | (1<<5), /* PMC5: PM_GRP_MRK, PM_MRK_GRP_TIMEO */
140 (1<<3) | (1<<4) | (1<<5),
141 /* PMC6: PM_MRK_ST_STS, PM_MRK_FXU_FIN, PM_MRK_GRP_ISSUED */
142 (1<<4) | (1<<5), /* PMC7: PM_MRK_FPU_FIN, PM_MRK_INST_FIN */
143 (1<<4) /* PMC8: PM_MRK_LSU_FIN */
144};
145
146/*
147 * Returns 1 if event counts things relating to marked instructions
148 * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
149 */
150static int p970_marked_instr_event(unsigned int event)
151{
152 int pmc, psel, unit, byte, bit;
153 unsigned int mask;
154
155 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
156 psel = event & PM_PMCSEL_MSK;
157 if (pmc) {
158 if (direct_marked_event[pmc - 1] & (1 << psel))
159 return 1;
160 if (psel == 0) /* add events */
161 bit = (pmc <= 4)? pmc - 1: 8 - pmc;
162 else if (psel == 7 || psel == 13) /* decode events */
163 bit = 4;
164 else
165 return 0;
166 } else
167 bit = psel;
168
169 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
170 unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
171 mask = 0;
172 switch (unit) {
173 case PM_VPU:
 174 mask = 0x4c; break; /* byte 0 bits 2,3,6 */
 175 case PM_LSU0:
 176 /* byte 2 bits 0,2,3,4,6; all of byte 1 */
 177 mask = 0x085dff00; break;
178 case PM_LSU1L:
179 mask = 0x50 << 24; /* byte 3 bits 4,6 */
180 break;
181 }
182 return (mask >> (byte * 8 + bit)) & 1;
183}
184
185/* Masks and values for using events from the various units */
186static u64 unit_cons[PM_LASTUNIT+1][2] = {
187 [PM_FPU] = { 0xc80000000000ull, 0x040000000000ull },
188 [PM_VPU] = { 0xc80000000000ull, 0xc40000000000ull },
189 [PM_ISU] = { 0x080000000000ull, 0x020000000000ull },
190 [PM_IFU] = { 0xc80000000000ull, 0x840000000000ull },
191 [PM_IDU] = { 0x380000000000ull, 0x010000000000ull },
192 [PM_STS] = { 0x380000000000ull, 0x310000000000ull },
193};
194
195static int p970_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
196{
197 int pmc, byte, unit, sh, spcsel;
198 u64 mask = 0, value = 0;
199 int grp = -1;
200
201 pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
202 if (pmc) {
203 if (pmc > 8)
204 return -1;
205 sh = (pmc - 1) * 2;
206 mask |= 2 << sh;
207 value |= 1 << sh;
208 grp = ((pmc - 1) >> 1) & 1;
209 }
210 unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
211 if (unit) {
212 if (unit > PM_LASTUNIT)
213 return -1;
214 mask |= unit_cons[unit][0];
215 value |= unit_cons[unit][1];
216 byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
217 /*
218 * Bus events on bytes 0 and 2 can be counted
219 * on PMC1/2/5/6; bytes 1 and 3 on PMC3/4/7/8.
220 */
221 if (!pmc)
222 grp = byte & 1;
223 /* Set byte lane select field */
224 mask |= 0xfULL << (28 - 4 * byte);
225 value |= (u64)unit << (28 - 4 * byte);
226 }
227 if (grp == 0) {
228 /* increment PMC1/2/5/6 field */
229 mask |= 0x8000000000ull;
230 value |= 0x1000000000ull;
231 } else if (grp == 1) {
232 /* increment PMC3/4/7/8 field */
233 mask |= 0x800000000ull;
234 value |= 0x100000000ull;
235 }
236 spcsel = (event >> PM_SPCSEL_SH) & PM_SPCSEL_MSK;
237 if (spcsel) {
238 mask |= 3ull << 48;
239 value |= (u64)spcsel << 48;
240 }
241 *maskp = mask;
242 *valp = value;
243 return 0;
244}
245
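A hand-worked example of p970_get_constraint() above for a direct event on PMC3 with no unit or SPCSEL field, following the constraint bit layout documented earlier in this file; illustrative only.

#include <stdio.h>
#include <stdint.h>

/* PPC970 constraint for a direct PMC3 event, per p970_get_constraint() */
int main(void)
{
	int pmc = 3;
	int sh = (pmc - 1) * 2;			/* per-PMC use-count field */
	int grp = ((pmc - 1) >> 1) & 1;		/* PMC3/4/7/8 group */
	uint64_t mask = 2ull << sh, value = 1ull << sh;

	if (grp == 1) {
		mask  |= 0x800000000ull;	/* PS2 field: one more PMC3/4/7/8 event */
		value |= 0x100000000ull;
	}
	printf("mask=%#llx value=%#llx\n",
	       (unsigned long long)mask, (unsigned long long)value);
	/* -> mask=0x800000020 value=0x100000010 */
	return 0;
}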
246static int p970_get_alternatives(unsigned int event, unsigned int flags,
247 unsigned int alt[])
248{
249 alt[0] = event;
250
251 /* 2 alternatives for LSU empty */
252 if (event == 0x2002 || event == 0x3002) {
253 alt[1] = event ^ 0x1000;
254 return 2;
255 }
256
257 return 1;
258}
259
260static int p970_compute_mmcr(unsigned int event[], int n_ev,
261 unsigned int hwc[], u64 mmcr[])
262{
263 u64 mmcr0 = 0, mmcr1 = 0, mmcra = 0;
264 unsigned int pmc, unit, byte, psel;
265 unsigned int ttm, grp;
266 unsigned int pmc_inuse = 0;
267 unsigned int pmc_grp_use[2];
268 unsigned char busbyte[4];
269 unsigned char unituse[16];
270 unsigned char unitmap[] = { 0, 0<<3, 3<<3, 1<<3, 2<<3, 0|4, 3|4 };
271 unsigned char ttmuse[2];
272 unsigned char pmcsel[8];
273 int i;
274 int spcsel;
275
276 if (n_ev > 8)
277 return -1;
278
279 /* First pass to count resource use */
280 pmc_grp_use[0] = pmc_grp_use[1] = 0;
281 memset(busbyte, 0, sizeof(busbyte));
282 memset(unituse, 0, sizeof(unituse));
283 for (i = 0; i < n_ev; ++i) {
284 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
285 if (pmc) {
286 if (pmc_inuse & (1 << (pmc - 1)))
287 return -1;
288 pmc_inuse |= 1 << (pmc - 1);
289 /* count 1/2/5/6 vs 3/4/7/8 use */
290 ++pmc_grp_use[((pmc - 1) >> 1) & 1];
291 }
292 unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
293 byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
294 if (unit) {
295 if (unit > PM_LASTUNIT)
296 return -1;
297 if (!pmc)
298 ++pmc_grp_use[byte & 1];
299 if (busbyte[byte] && busbyte[byte] != unit)
300 return -1;
301 busbyte[byte] = unit;
302 unituse[unit] = 1;
303 }
304 }
305 if (pmc_grp_use[0] > 4 || pmc_grp_use[1] > 4)
306 return -1;
307
308 /*
309 * Assign resources and set multiplexer selects.
310 *
311 * PM_ISU can go either on TTM0 or TTM1, but that's the only
312 * choice we have to deal with.
313 */
314 if (unituse[PM_ISU] &
315 (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_VPU]))
316 unitmap[PM_ISU] = 2 | 4; /* move ISU to TTM1 */
317 /* Set TTM[01]SEL fields. */
318 ttmuse[0] = ttmuse[1] = 0;
319 for (i = PM_FPU; i <= PM_STS; ++i) {
320 if (!unituse[i])
321 continue;
322 ttm = unitmap[i];
323 ++ttmuse[(ttm >> 2) & 1];
324 mmcr1 |= (u64)(ttm & ~4) << MMCR1_TTM1SEL_SH;
325 }
326 /* Check only one unit per TTMx */
327 if (ttmuse[0] > 1 || ttmuse[1] > 1)
328 return -1;
329
330 /* Set byte lane select fields and TTM3SEL. */
331 for (byte = 0; byte < 4; ++byte) {
332 unit = busbyte[byte];
333 if (!unit)
334 continue;
335 if (unit <= PM_STS)
336 ttm = (unitmap[unit] >> 2) & 1;
337 else if (unit == PM_LSU0)
338 ttm = 2;
339 else {
340 ttm = 3;
341 if (unit == PM_LSU1L && byte >= 2)
342 mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte);
343 }
344 mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte);
345 }
346
347 /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
348 memset(pmcsel, 0x8, sizeof(pmcsel)); /* 8 means don't count */
349 for (i = 0; i < n_ev; ++i) {
350 pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
351 unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
352 byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
353 psel = event[i] & PM_PMCSEL_MSK;
354 if (!pmc) {
355 /* Bus event or any-PMC direct event */
356 if (unit)
357 psel |= 0x10 | ((byte & 2) << 2);
358 else
359 psel |= 8;
360 for (pmc = 0; pmc < 8; ++pmc) {
361 if (pmc_inuse & (1 << pmc))
362 continue;
363 grp = (pmc >> 1) & 1;
364 if (unit) {
365 if (grp == (byte & 1))
366 break;
367 } else if (pmc_grp_use[grp] < 4) {
368 ++pmc_grp_use[grp];
369 break;
370 }
371 }
372 pmc_inuse |= 1 << pmc;
373 } else {
374 /* Direct event */
375 --pmc;
376 if (psel == 0 && (byte & 2))
377 /* add events on higher-numbered bus */
378 mmcr1 |= 1ull << mmcr1_adder_bits[pmc];
379 }
380 pmcsel[pmc] = psel;
381 hwc[i] = pmc;
382 spcsel = (event[i] >> PM_SPCSEL_SH) & PM_SPCSEL_MSK;
383 mmcr1 |= spcsel;
384 if (p970_marked_instr_event(event[i]))
385 mmcra |= MMCRA_SAMPLE_ENABLE;
386 }
387 for (pmc = 0; pmc < 2; ++pmc)
388 mmcr0 |= pmcsel[pmc] << (MMCR0_PMC1SEL_SH - 7 * pmc);
389 for (; pmc < 8; ++pmc)
390 mmcr1 |= (u64)pmcsel[pmc] << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2));
391 if (pmc_inuse & 1)
392 mmcr0 |= MMCR0_PMC1CE;
393 if (pmc_inuse & 0xfe)
394 mmcr0 |= MMCR0_PMCjCE;
395
396 mmcra |= 0x2000; /* mark only one IOP per PPC instruction */
397
398 /* Return MMCRx values */
399 mmcr[0] = mmcr0;
400 mmcr[1] = mmcr1;
401 mmcr[2] = mmcra;
402 return 0;
403}
404
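The PMCSEL packing at the end of p970_compute_mmcr() above (and undone by p970_disable_pmc() below) splits the selector fields across two registers: PMC1/2 live in MMCR0, PMC3-8 in MMCR1. A standalone sketch of that register/shift mapping (illustrative only, not part of the patch):

#include <stdio.h>

#define MMCR0_PMC1SEL_SH	8
#define MMCR1_PMC3SEL_SH	27

/* Which register and shift hold the PMCSEL field for a 0-based PMC */
static void pmcsel_location(int pmc, int *reg, int *shift)
{
	if (pmc <= 1) {
		*reg = 0;				/* MMCR0 */
		*shift = MMCR0_PMC1SEL_SH - 7 * pmc;
	} else {
		*reg = 1;				/* MMCR1 */
		*shift = MMCR1_PMC3SEL_SH - 5 * (pmc - 2);
	}
}

int main(void)
{
	int pmc, reg, shift;

	for (pmc = 0; pmc < 8; ++pmc) {
		pmcsel_location(pmc, &reg, &shift);
		printf("PMC%d: MMCR%d shift %d\n", pmc + 1, reg, shift);
	}
	return 0;
}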
405static void p970_disable_pmc(unsigned int pmc, u64 mmcr[])
406{
407 int shift, i;
408
409 if (pmc <= 1) {
410 shift = MMCR0_PMC1SEL_SH - 7 * pmc;
411 i = 0;
412 } else {
413 shift = MMCR1_PMC3SEL_SH - 5 * (pmc - 2);
414 i = 1;
415 }
416 /*
417 * Setting the PMCxSEL field to 0x08 disables PMC x.
418 */
419 mmcr[i] = (mmcr[i] & ~(0x1fUL << shift)) | (0x08UL << shift);
420}
421
422static int ppc970_generic_events[] = {
423 [PERF_COUNT_CPU_CYCLES] = 7,
424 [PERF_COUNT_INSTRUCTIONS] = 1,
425 [PERF_COUNT_CACHE_REFERENCES] = 0x8810, /* PM_LD_REF_L1 */
426 [PERF_COUNT_CACHE_MISSES] = 0x3810, /* PM_LD_MISS_L1 */
427 [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x431, /* PM_BR_ISSUED */
428 [PERF_COUNT_BRANCH_MISSES] = 0x327, /* PM_GRP_BR_MPRED */
429};
430
431struct power_pmu ppc970_pmu = {
432 .n_counter = 8,
433 .max_alternatives = 2,
434 .add_fields = 0x001100005555ull,
435 .test_adder = 0x013300000000ull,
436 .compute_mmcr = p970_compute_mmcr,
437 .get_constraint = p970_get_constraint,
438 .get_alternatives = p970_get_alternatives,
439 .disable_pmc = p970_disable_pmc,
440 .n_generic = ARRAY_SIZE(ppc970_generic_events),
441 .generic_events = ppc970_generic_events,
442};
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 76993941cac9..ac0e112031b2 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -29,6 +29,7 @@
29#include <linux/module.h>
30#include <linux/kprobes.h>
31#include <linux/kdebug.h>
32#include <linux/perf_counter.h>
33
34#include <asm/firmware.h>
35#include <asm/page.h>
@@ -170,6 +171,8 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
171 die("Weird page fault", regs, SIGSEGV);
172 }
173
174 perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs, address);
175
176 /* When running in the kernel we expect faults to occur only to
177 * addresses in user space. All other faults represent errors in the
178 * kernel and should generate an OOPS. Unfortunately, in the case of an
@@ -309,6 +312,8 @@ good_area:
312 }
313 if (ret & VM_FAULT_MAJOR) {
314 current->maj_flt++;
315 perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0,
316 regs, address);
317#ifdef CONFIG_PPC_SMLPAR
318 if (firmware_has_feature(FW_FEATURE_CMO)) {
319 preempt_disable();
@@ -316,8 +321,11 @@ good_area:
321 preempt_enable();
322 }
323#endif
324 } else {
325 current->min_flt++;
326 perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0,
327 regs, address);
328 }
329 up_read(&mm->mmap_sem);
330 return 0;
331
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 9da795e49337..732ee93a8e98 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -1,6 +1,7 @@
1config PPC64
2 bool "64-bit kernel"
3 default n
4 select HAVE_PERF_COUNTERS
5 help
6 This option selects whether a 32-bit or a 64-bit kernel
7 will be built.
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index df9e885eee14..32ada97c964d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -727,6 +727,7 @@ config X86_UP_IOAPIC
727config X86_LOCAL_APIC
728 def_bool y
729 depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC
730 select HAVE_PERF_COUNTERS if (!M386 && !M486)
731
732config X86_IO_APIC
733 def_bool y
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index a505202086e8..e590261ba059 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -825,9 +825,11 @@ ia32_sys_call_table:
825 .quad compat_sys_signalfd4
826 .quad sys_eventfd2
827 .quad sys_epoll_create1
828 .quad sys_dup3 /* 330 */
829 .quad sys_pipe2
830 .quad sys_inotify_init1
831 .quad compat_sys_preadv
832 .quad compat_sys_pwritev
833 .quad compat_sys_rt_tgsigqueueinfo /* 335 */
834 .quad sys_perf_counter_open
835ia32_syscall_end:
diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h
index 85b46fba4229..aff9f1fcdcd7 100644
--- a/arch/x86/include/asm/atomic_32.h
+++ b/arch/x86/include/asm/atomic_32.h
@@ -247,5 +247,241 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
247#define smp_mb__before_atomic_inc() barrier()
248#define smp_mb__after_atomic_inc() barrier()
249
250/* A 64-bit atomic type */
251
252typedef struct {
253 unsigned long long counter;
254} atomic64_t;
255
256#define ATOMIC64_INIT(val) { (val) }
257
258/**
259 * atomic64_read - read atomic64 variable
260 * @v: pointer of type atomic64_t
261 *
262 * Atomically reads the value of @v.
263 * Doesn't imply a read memory barrier.
264 */
265#define __atomic64_read(ptr) ((ptr)->counter)
266
267static inline unsigned long long
268cmpxchg8b(unsigned long long *ptr, unsigned long long old, unsigned long long new)
269{
270 asm volatile(
271
272 LOCK_PREFIX "cmpxchg8b (%[ptr])\n"
273
274 : "=A" (old)
275
276 : [ptr] "D" (ptr),
277 "A" (old),
278 "b" (ll_low(new)),
279 "c" (ll_high(new))
280
281 : "memory");
282
283 return old;
284}
285
286static inline unsigned long long
287atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val,
288 unsigned long long new_val)
289{
290 return cmpxchg8b(&ptr->counter, old_val, new_val);
291}
292
293/**
294 * atomic64_xchg - xchg atomic64 variable
295 * @ptr: pointer to type atomic64_t
296 * @new_val: value to assign
297 * @old_val: old value that was there
298 *
299 * Atomically xchgs the value of @ptr to @new_val and returns
300 * the old value.
301 */
302
303static inline unsigned long long
304atomic64_xchg(atomic64_t *ptr, unsigned long long new_val)
305{
306 unsigned long long old_val;
307
308 do {
309 old_val = atomic_read(ptr);
310 } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
311
312 return old_val;
313}
314
315/**
316 * atomic64_set - set atomic64 variable
317 * @ptr: pointer to type atomic64_t
318 * @new_val: value to assign
319 *
320 * Atomically sets the value of @ptr to @new_val.
321 */
322static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val)
323{
324 atomic64_xchg(ptr, new_val);
325}
326
327/**
328 * atomic64_read - read atomic64 variable
329 * @ptr: pointer to type atomic64_t
330 *
331 * Atomically reads the value of @ptr and returns it.
332 */
333static inline unsigned long long atomic64_read(atomic64_t *ptr)
334{
335 unsigned long long curr_val;
336
337 do {
338 curr_val = __atomic64_read(ptr);
339 } while (atomic64_cmpxchg(ptr, curr_val, curr_val) != curr_val);
340
341 return curr_val;
342}
343
344/**
345 * atomic64_add_return - add and return
346 * @delta: integer value to add
347 * @ptr: pointer to type atomic64_t
348 *
349 * Atomically adds @delta to @ptr and returns @delta + *@ptr
350 */
351static inline unsigned long long
352atomic64_add_return(unsigned long long delta, atomic64_t *ptr)
353{
354 unsigned long long old_val, new_val;
355
356 do {
357 old_val = atomic_read(ptr);
358 new_val = old_val + delta;
359
360 } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
361
362 return new_val;
363}
364
365static inline long atomic64_sub_return(unsigned long long delta, atomic64_t *ptr)
366{
367 return atomic64_add_return(-delta, ptr);
368}
369
370static inline long atomic64_inc_return(atomic64_t *ptr)
371{
372 return atomic64_add_return(1, ptr);
373}
374
375static inline long atomic64_dec_return(atomic64_t *ptr)
376{
377 return atomic64_sub_return(1, ptr);
378}
379
380/**
381 * atomic64_add - add integer to atomic64 variable
382 * @delta: integer value to add
383 * @ptr: pointer to type atomic64_t
384 *
385 * Atomically adds @delta to @ptr.
386 */
387static inline void atomic64_add(unsigned long long delta, atomic64_t *ptr)
388{
389 atomic64_add_return(delta, ptr);
390}
391
392/**
 393 * atomic64_sub - subtract integer from atomic64 variable
394 * @delta: integer value to subtract
395 * @ptr: pointer to type atomic64_t
396 *
397 * Atomically subtracts @delta from @ptr.
398 */
399static inline void atomic64_sub(unsigned long long delta, atomic64_t *ptr)
400{
401 atomic64_add(-delta, ptr);
402}
403
404/**
405 * atomic64_sub_and_test - subtract value from variable and test result
406 * @delta: integer value to subtract
407 * @ptr: pointer to type atomic64_t
408 *
409 * Atomically subtracts @delta from @ptr and returns
410 * true if the result is zero, or false for all
411 * other cases.
412 */
413static inline int
414atomic64_sub_and_test(unsigned long long delta, atomic64_t *ptr)
415{
416 unsigned long long old_val = atomic64_sub_return(delta, ptr);
417
418 return old_val == 0;
419}
420
421/**
422 * atomic64_inc - increment atomic64 variable
423 * @ptr: pointer to type atomic64_t
424 *
425 * Atomically increments @ptr by 1.
426 */
427static inline void atomic64_inc(atomic64_t *ptr)
428{
429 atomic64_add(1, ptr);
430}
431
432/**
433 * atomic64_dec - decrement atomic64 variable
434 * @ptr: pointer to type atomic64_t
435 *
436 * Atomically decrements @ptr by 1.
437 */
438static inline void atomic64_dec(atomic64_t *ptr)
439{
440 atomic64_sub(1, ptr);
441}
442
443/**
444 * atomic64_dec_and_test - decrement and test
445 * @ptr: pointer to type atomic64_t
446 *
447 * Atomically decrements @ptr by 1 and
448 * returns true if the result is 0, or false for all other
449 * cases.
450 */
451static inline int atomic64_dec_and_test(atomic64_t *ptr)
452{
453 return atomic64_sub_and_test(1, ptr);
454}
455
456/**
457 * atomic64_inc_and_test - increment and test
458 * @ptr: pointer to type atomic64_t
459 *
460 * Atomically increments @ptr by 1
461 * and returns true if the result is zero, or false for all
462 * other cases.
463 */
464static inline int atomic64_inc_and_test(atomic64_t *ptr)
465{
466 return atomic64_sub_and_test(-1, ptr);
467}
468
469/**
470 * atomic64_add_negative - add and test if negative
471 * @delta: integer value to add
472 * @ptr: pointer to type atomic64_t
473 *
474 * Atomically adds @delta to @ptr and returns true
475 * if the result is negative, or false when
476 * result is greater than or equal to zero.
477 */
478static inline int
479atomic64_add_negative(unsigned long long delta, atomic64_t *ptr)
480{
481 long long old_val = atomic64_add_return(delta, ptr);
482
483 return old_val < 0;
484}
485
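A userspace analog of the compare-and-swap loop the atomic64 helpers above are built on; GCC's __sync_val_compare_and_swap stands in for the LOCK CMPXCHG8B issued by cmpxchg8b(). Sketch only, not the kernel code.

#include <stdio.h>

typedef struct { unsigned long long counter; } atomic64_demo_t;

/* 64-bit atomic add built from a compare-and-swap primitive */
static unsigned long long
atomic64_demo_add_return(unsigned long long delta, atomic64_demo_t *ptr)
{
	unsigned long long old_val, new_val;

	do {
		old_val = ptr->counter;
		new_val = old_val + delta;
	} while (__sync_val_compare_and_swap(&ptr->counter,
					     old_val, new_val) != old_val);
	return new_val;
}

int main(void)
{
	atomic64_demo_t v = { 0 };

	atomic64_demo_add_return(5, &v);
	atomic64_demo_add_return(-2ULL, &v);	/* subtract = add the negation */
	printf("%llu\n", v.counter);		/* 3 */
	return 0;
}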
486#include <asm-generic/atomic.h>
487#endif /* _ASM_X86_ATOMIC_32_H */
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index c2e6bedaf258..fe24d2802490 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -50,6 +50,7 @@ BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
50
51#ifdef CONFIG_PERF_COUNTERS
52BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR)
53BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR)
54#endif
55
56#ifdef CONFIG_X86_MCE_P4THERMAL
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 37555e52f980..9ebc5c255032 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -13,6 +13,8 @@ typedef struct {
13 unsigned int irq_spurious_count;
14#endif
15 unsigned int generic_irqs; /* arch dependent */
16 unsigned int apic_perf_irqs;
17 unsigned int apic_pending_irqs;
18#ifdef CONFIG_SMP
19 unsigned int irq_resched_count;
20 unsigned int irq_call_count;
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index b762ea49bd70..7309c0ad6902 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -29,6 +29,9 @@
29extern void apic_timer_interrupt(void);
30extern void generic_interrupt(void);
31extern void error_interrupt(void);
32extern void perf_counter_interrupt(void);
33extern void perf_pending_interrupt(void);
34
35extern void spurious_interrupt(void);
36extern void thermal_interrupt(void);
37extern void reschedule_interrupt(void);
diff --git a/arch/x86/include/asm/intel_arch_perfmon.h b/arch/x86/include/asm/intel_arch_perfmon.h
deleted file mode 100644
index fa0fd068bc2e..000000000000
--- a/arch/x86/include/asm/intel_arch_perfmon.h
+++ /dev/null
@@ -1,31 +0,0 @@
1#ifndef _ASM_X86_INTEL_ARCH_PERFMON_H
2#define _ASM_X86_INTEL_ARCH_PERFMON_H
3
4#define MSR_ARCH_PERFMON_PERFCTR0 0xc1
5#define MSR_ARCH_PERFMON_PERFCTR1 0xc2
6
7#define MSR_ARCH_PERFMON_EVENTSEL0 0x186
8#define MSR_ARCH_PERFMON_EVENTSEL1 0x187
9
10#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22)
11#define ARCH_PERFMON_EVENTSEL_INT (1 << 20)
12#define ARCH_PERFMON_EVENTSEL_OS (1 << 17)
13#define ARCH_PERFMON_EVENTSEL_USR (1 << 16)
14
15#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL (0x3c)
16#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
17#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX (0)
18#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
19 (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
20
21union cpuid10_eax {
22 struct {
23 unsigned int version_id:8;
24 unsigned int num_counters:8;
25 unsigned int bit_width:8;
26 unsigned int mask_length:8;
27 } split;
28 unsigned int full;
29};
30
31#endif /* _ASM_X86_INTEL_ARCH_PERFMON_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 3cbd79bbb47c..545bb811ccb5 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -117,6 +117,11 @@
117#define GENERIC_INTERRUPT_VECTOR 0xed
118
119/*
120 * Performance monitoring pending work vector:
121 */
122#define LOCAL_PENDING_VECTOR 0xec
123
124/*
125 * First APIC vector available to drivers: (vectors 0x30-0xee) we
126 * start at 0x31(0x41) to spread out vectors evenly between priority
127 * levels. (0x80 is the syscall vector)
diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
new file mode 100644
index 000000000000..d08dd52cb8ff
--- /dev/null
+++ b/arch/x86/include/asm/perf_counter.h
@@ -0,0 +1,100 @@
1#ifndef _ASM_X86_PERF_COUNTER_H
2#define _ASM_X86_PERF_COUNTER_H
3
4/*
5 * Performance counter hw details:
6 */
7
8#define X86_PMC_MAX_GENERIC 8
9#define X86_PMC_MAX_FIXED 3
10
11#define X86_PMC_IDX_GENERIC 0
12#define X86_PMC_IDX_FIXED 32
13#define X86_PMC_IDX_MAX 64
14
15#define MSR_ARCH_PERFMON_PERFCTR0 0xc1
16#define MSR_ARCH_PERFMON_PERFCTR1 0xc2
17
18#define MSR_ARCH_PERFMON_EVENTSEL0 0x186
19#define MSR_ARCH_PERFMON_EVENTSEL1 0x187
20
21#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22)
22#define ARCH_PERFMON_EVENTSEL_INT (1 << 20)
23#define ARCH_PERFMON_EVENTSEL_OS (1 << 17)
24#define ARCH_PERFMON_EVENTSEL_USR (1 << 16)
25
26/*
27 * Includes eventsel and unit mask as well:
28 */
29#define ARCH_PERFMON_EVENT_MASK 0xffff
30
31#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c
32#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
33#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0
34#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
35 (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
36
37#define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6
38
39/*
40 * Intel "Architectural Performance Monitoring" CPUID
41 * detection/enumeration details:
42 */
43union cpuid10_eax {
44 struct {
45 unsigned int version_id:8;
46 unsigned int num_counters:8;
47 unsigned int bit_width:8;
48 unsigned int mask_length:8;
49 } split;
50 unsigned int full;
51};
52
53union cpuid10_edx {
54 struct {
55 unsigned int num_counters_fixed:4;
56 unsigned int reserved:28;
57 } split;
58 unsigned int full;
59};
60
61
62/*
63 * Fixed-purpose performance counters:
64 */
65
66/*
67 * All 3 fixed-mode PMCs are configured via this single MSR:
68 */
69#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL 0x38d
70
71/*
72 * The counts are available in three separate MSRs:
73 */
74
75/* Instr_Retired.Any: */
76#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309
77#define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0)
78
79/* CPU_CLK_Unhalted.Core: */
80#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a
81#define X86_PMC_IDX_FIXED_CPU_CYCLES (X86_PMC_IDX_FIXED + 1)
82
83/* CPU_CLK_Unhalted.Ref: */
84#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b
85#define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2)
86
87extern void set_perf_counter_pending(void);
88
89#define clear_perf_counter_pending() do { } while (0)
90#define test_perf_counter_pending() (0)
91
92#ifdef CONFIG_PERF_COUNTERS
93extern void init_hw_perf_counters(void);
94extern void perf_counters_lapic_init(int nmi);
95#else
96static inline void init_hw_perf_counters(void) { }
97static inline void perf_counters_lapic_init(int nmi) { }
98#endif
99
100#endif /* _ASM_X86_PERF_COUNTER_H */
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 6e72d74cf8dc..732a30706153 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -340,6 +340,8 @@
340#define __NR_inotify_init1 332
341#define __NR_preadv 333
342#define __NR_pwritev 334
343#define __NR_rt_tgsigqueueinfo 335
344#define __NR_perf_counter_open 336
345
346#ifdef __KERNEL__
347
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index f81829462325..900e1617e672 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -657,7 +657,10 @@ __SYSCALL(__NR_inotify_init1, sys_inotify_init1)
657__SYSCALL(__NR_preadv, sys_preadv)
658#define __NR_pwritev 296
659__SYSCALL(__NR_pwritev, sys_pwritev)
660#define __NR_rt_tgsigqueueinfo 297
661__SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
662#define __NR_perf_counter_open 298
663__SYSCALL(__NR_perf_counter_open, sys_perf_counter_open)
664
665#ifndef __NO_STUBS
666#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index f2870920f246..e9021a908020 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -34,6 +34,7 @@
34#include <linux/smp.h>
35#include <linux/mm.h>
36
37#include <asm/perf_counter.h>
38#include <asm/pgalloc.h>
39#include <asm/atomic.h>
40#include <asm/mpspec.h>
@@ -761,6 +762,8 @@ static void local_apic_timer_interrupt(void)
762 inc_irq_stat(apic_timer_irqs);
763
764 evt->event_handler(evt);
765
766 perf_counter_unthrottle();
767}
768
769/*
@@ -1133,6 +1136,7 @@ void __cpuinit setup_local_APIC(void)
1136 apic_write(APIC_ESR, 0);
1137 }
1138#endif
1139 perf_counters_lapic_init(0);
1140
1141 preempt_disable();
1142
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 4e242f9a06e4..3efcb2b96a15 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -1,5 +1,5 @@
1#
2# Makefile for x86-compatible CPU details, features and quirks
3#
4
5# Don't trace early stages of a secondary CPU boot
@@ -23,11 +23,13 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
23obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
24obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
25
26obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
27
28obj-$(CONFIG_X86_MCE) += mcheck/
29obj-$(CONFIG_MTRR) += mtrr/
30obj-$(CONFIG_CPU_FREQ) += cpufreq/
31
32obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
33
34quiet_cmd_mkcapflags = MKCAP $@
35 cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c4f667896c28..a86769efe0df 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -13,6 +13,7 @@
13#include <linux/io.h>
14
15#include <asm/stackprotector.h>
16#include <asm/perf_counter.h>
17#include <asm/mmu_context.h>
18#include <asm/hypervisor.h>
19#include <asm/processor.h>
@@ -854,6 +855,7 @@ void __init identify_boot_cpu(void)
855#else
856 vgetcpu_set_mode();
857#endif
858 init_hw_perf_counters();
859}
860
861void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
new file mode 100644
index 000000000000..a6878b0798e5
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -0,0 +1,1214 @@
1/*
2 * Performance counter x86 architecture code
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2009 Jaswinder Singh Rajput
7 * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
8 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
9 *
10 * For licencing details see kernel-base/COPYING
11 */
12
13#include <linux/perf_counter.h>
14#include <linux/capability.h>
15#include <linux/notifier.h>
16#include <linux/hardirq.h>
17#include <linux/kprobes.h>
18#include <linux/module.h>
19#include <linux/kdebug.h>
20#include <linux/sched.h>
21#include <linux/uaccess.h>
22
23#include <asm/apic.h>
24#include <asm/stacktrace.h>
25#include <asm/nmi.h>
26
27static u64 perf_counter_mask __read_mostly;
28
29struct cpu_hw_counters {
30 struct perf_counter *counters[X86_PMC_IDX_MAX];
31 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
32 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
33 unsigned long interrupts;
34 u64 throttle_ctrl;
35 int enabled;
36};
37
38/*
39 * struct x86_pmu - generic x86 pmu
40 */
41struct x86_pmu {
42 const char *name;
43 int version;
44 int (*handle_irq)(struct pt_regs *, int);
45 u64 (*save_disable_all)(void);
46 void (*restore_all)(u64);
47 void (*enable)(struct hw_perf_counter *, int);
48 void (*disable)(struct hw_perf_counter *, int);
49 unsigned eventsel;
50 unsigned perfctr;
51 u64 (*event_map)(int);
52 u64 (*raw_event)(u64);
53 int max_events;
54 int num_counters;
55 int num_counters_fixed;
56 int counter_bits;
57 u64 counter_mask;
58 u64 max_period;
59};
60
61static struct x86_pmu x86_pmu __read_mostly;
62
63static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
64 .enabled = 1,
65};
66
67/*
68 * Intel PerfMon v3. Used on Core2 and later.
69 */
70static const u64 intel_perfmon_event_map[] =
71{
72 [PERF_COUNT_CPU_CYCLES] = 0x003c,
73 [PERF_COUNT_INSTRUCTIONS] = 0x00c0,
74 [PERF_COUNT_CACHE_REFERENCES] = 0x4f2e,
75 [PERF_COUNT_CACHE_MISSES] = 0x412e,
76 [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x00c4,
77 [PERF_COUNT_BRANCH_MISSES] = 0x00c5,
78 [PERF_COUNT_BUS_CYCLES] = 0x013c,
79};
80
81static u64 intel_pmu_event_map(int event)
82{
83 return intel_perfmon_event_map[event];
84}
85
86static u64 intel_pmu_raw_event(u64 event)
87{
88#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
89#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
90#define CORE_EVNTSEL_COUNTER_MASK 0xFF000000ULL
91
92#define CORE_EVNTSEL_MASK \
93 (CORE_EVNTSEL_EVENT_MASK | \
94 CORE_EVNTSEL_UNIT_MASK | \
95 CORE_EVNTSEL_COUNTER_MASK)
96
97 return event & CORE_EVNTSEL_MASK;
98}
99
100/*
101 * AMD Performance Monitor K7 and later.
102 */
103static const u64 amd_perfmon_event_map[] =
104{
105 [PERF_COUNT_CPU_CYCLES] = 0x0076,
106 [PERF_COUNT_INSTRUCTIONS] = 0x00c0,
107 [PERF_COUNT_CACHE_REFERENCES] = 0x0080,
108 [PERF_COUNT_CACHE_MISSES] = 0x0081,
109 [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x00c4,
110 [PERF_COUNT_BRANCH_MISSES] = 0x00c5,
111};
112
113static u64 amd_pmu_event_map(int event)
114{
115 return amd_perfmon_event_map[event];
116}
117
118static u64 amd_pmu_raw_event(u64 event)
119{
120#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL
121#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL
122#define K7_EVNTSEL_COUNTER_MASK 0x0FF000000ULL
123
124#define K7_EVNTSEL_MASK \
125 (K7_EVNTSEL_EVENT_MASK | \
126 K7_EVNTSEL_UNIT_MASK | \
127 K7_EVNTSEL_COUNTER_MASK)
128
129 return event & K7_EVNTSEL_MASK;
130}
131
132/*
133 * Propagate counter elapsed time into the generic counter.
134 * Can only be executed on the CPU where the counter is active.
135 * Returns the delta events processed.
136 */
137static u64
138x86_perf_counter_update(struct perf_counter *counter,
139 struct hw_perf_counter *hwc, int idx)
140{
141 u64 prev_raw_count, new_raw_count, delta;
142
143 /*
144 * Careful: an NMI might modify the previous counter value.
145 *
146 * Our tactic to handle this is to first atomically read and
147 * exchange a new raw count - then add that new-prev delta
148 * count to the generic counter atomically:
149 */
150again:
151 prev_raw_count = atomic64_read(&hwc->prev_count);
152 rdmsrl(hwc->counter_base + idx, new_raw_count);
153
154 if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
155 new_raw_count) != prev_raw_count)
156 goto again;
157
158 /*
159 * Now we have the new raw value and have updated the prev
160 * timestamp already. We can now calculate the elapsed delta
161 * (counter-)time and add that to the generic counter.
162 *
163 * Careful, not all hw sign-extends above the physical width
164 * of the count, so we do that by clipping the delta to 32 bits:
165 */
166 delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count);
167
168 atomic64_add(delta, &counter->count);
169 atomic64_sub(delta, &hwc->period_left);
170
171 return new_raw_count;
172}
173
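A worked example of the 32-bit delta clipping in x86_perf_counter_update() above: subtracting the sign-extended low words gives the correct event count even across a low-word wrap, provided fewer than 2^31 events elapsed between the two reads. Illustrative only.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t prev_raw_count = 0xfffffff0ULL;	/* just before a 32-bit wrap */
	uint64_t new_raw_count  = 0x100000010ULL;	/* 0x20 events later */
	uint64_t delta;

	/* same cast chain as the kernel code above */
	delta = (uint64_t)(uint32_t)((int32_t)new_raw_count -
				     (int32_t)prev_raw_count);
	printf("delta = %llu\n", (unsigned long long)delta);	/* 32 */
	return 0;
}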
174static atomic_t active_counters;
175static DEFINE_MUTEX(pmc_reserve_mutex);
176
177static bool reserve_pmc_hardware(void)
178{
179 int i;
180
181 if (nmi_watchdog == NMI_LOCAL_APIC)
182 disable_lapic_nmi_watchdog();
183
184 for (i = 0; i < x86_pmu.num_counters; i++) {
185 if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
186 goto perfctr_fail;
187 }
188
189 for (i = 0; i < x86_pmu.num_counters; i++) {
190 if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
191 goto eventsel_fail;
192 }
193
194 return true;
195
196eventsel_fail:
197 for (i--; i >= 0; i--)
198 release_evntsel_nmi(x86_pmu.eventsel + i);
199
200 i = x86_pmu.num_counters;
201
202perfctr_fail:
203 for (i--; i >= 0; i--)
204 release_perfctr_nmi(x86_pmu.perfctr + i);
205
206 if (nmi_watchdog == NMI_LOCAL_APIC)
207 enable_lapic_nmi_watchdog();
208
209 return false;
210}
211
212static void release_pmc_hardware(void)
213{
214 int i;
215
216 for (i = 0; i < x86_pmu.num_counters; i++) {
217 release_perfctr_nmi(x86_pmu.perfctr + i);
218 release_evntsel_nmi(x86_pmu.eventsel + i);
219 }
220
221 if (nmi_watchdog == NMI_LOCAL_APIC)
222 enable_lapic_nmi_watchdog();
223}
224
225static void hw_perf_counter_destroy(struct perf_counter *counter)
226{
227 if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) {
228 release_pmc_hardware();
229 mutex_unlock(&pmc_reserve_mutex);
230 }
231}
232
233static inline int x86_pmu_initialized(void)
234{
235 return x86_pmu.handle_irq != NULL;
236}
237
238/*
239 * Setup the hardware configuration for a given hw_event_type
240 */
241static int __hw_perf_counter_init(struct perf_counter *counter)
242{
243 struct perf_counter_hw_event *hw_event = &counter->hw_event;
244 struct hw_perf_counter *hwc = &counter->hw;
245 int err;
246
247 if (!x86_pmu_initialized())
248 return -ENODEV;
249
250 err = 0;
251 if (!atomic_inc_not_zero(&active_counters)) {
252 mutex_lock(&pmc_reserve_mutex);
253 if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware())
254 err = -EBUSY;
255 else
256 atomic_inc(&active_counters);
257 mutex_unlock(&pmc_reserve_mutex);
258 }
259 if (err)
260 return err;
261
262 /*
263 * Generate PMC IRQs:
264 * (keep 'enabled' bit clear for now)
265 */
266 hwc->config = ARCH_PERFMON_EVENTSEL_INT;
267
268 /*
269 * Count user and OS events unless requested not to.
270 */
271 if (!hw_event->exclude_user)
272 hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
273 if (!hw_event->exclude_kernel)
274 hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
275
276 /*
277 * If privileged enough, allow NMI events:
278 */
279 hwc->nmi = 0;
280 if (capable(CAP_SYS_ADMIN) && hw_event->nmi)
281 hwc->nmi = 1;
282
283 hwc->irq_period = hw_event->irq_period;
284 if ((s64)hwc->irq_period <= 0 || hwc->irq_period > x86_pmu.max_period)
285 hwc->irq_period = x86_pmu.max_period;
286
287 atomic64_set(&hwc->period_left, hwc->irq_period);
288
289 /*
 290 * Raw event types provide the config in the event structure
291 */
292 if (perf_event_raw(hw_event)) {
293 hwc->config |= x86_pmu.raw_event(perf_event_config(hw_event));
294 } else {
295 if (perf_event_id(hw_event) >= x86_pmu.max_events)
296 return -EINVAL;
297 /*
298 * The generic map:
299 */
300 hwc->config |= x86_pmu.event_map(perf_event_id(hw_event));
301 }
302
303 counter->destroy = hw_perf_counter_destroy;
304
305 return 0;
306}
307
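A worked example of how __hw_perf_counter_init() above assembles the EVENTSEL config for a generic event: the interrupt-enable and user/kernel bits OR'ed with the model's event code (here the Intel cycles event 0x003c). The global enable bit is deliberately left clear, as in the function above. Illustrative only.

#include <stdio.h>

#define ARCH_PERFMON_EVENTSEL_INT	(1 << 20)
#define ARCH_PERFMON_EVENTSEL_OS	(1 << 17)
#define ARCH_PERFMON_EVENTSEL_USR	(1 << 16)

int main(void)
{
	unsigned long long config = ARCH_PERFMON_EVENTSEL_INT;

	config |= ARCH_PERFMON_EVENTSEL_USR;	/* count user mode */
	config |= ARCH_PERFMON_EVENTSEL_OS;	/* count kernel mode */
	config |= 0x003c;			/* UNHALTED_CORE_CYCLES */

	printf("%#llx\n", config);		/* 0x13003c */
	return 0;
}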
308static u64 intel_pmu_save_disable_all(void)
309{
310 u64 ctrl;
311
312 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
313 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
314
315 return ctrl;
316}
317
318static u64 amd_pmu_save_disable_all(void)
319{
320 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
321 int enabled, idx;
322
323 enabled = cpuc->enabled;
324 cpuc->enabled = 0;
325 /*
326	 * Make sure the cleared 'enabled' flag is visible before we start
327	 * disabling the individual counters, so that amd_pmu_enable_counter()
328	 * does the right thing.
329 */
330 barrier();
331
332 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
333 u64 val;
334
335 if (!test_bit(idx, cpuc->active_mask))
336 continue;
337 rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
338 if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE))
339 continue;
340 val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
341 wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
342 }
343
344 return enabled;
345}
346
347u64 hw_perf_save_disable(void)
348{
349 if (!x86_pmu_initialized())
350 return 0;
351 return x86_pmu.save_disable_all();
352}
353/*
354 * Exported because of ACPI idle
355 */
356EXPORT_SYMBOL_GPL(hw_perf_save_disable);
357
358static void intel_pmu_restore_all(u64 ctrl)
359{
360 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
361}
362
363static void amd_pmu_restore_all(u64 ctrl)
364{
365 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
366 int idx;
367
368 cpuc->enabled = ctrl;
369 barrier();
370 if (!ctrl)
371 return;
372
373 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
374 u64 val;
375
376 if (!test_bit(idx, cpuc->active_mask))
377 continue;
378 rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
379 if (val & ARCH_PERFMON_EVENTSEL0_ENABLE)
380 continue;
381 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
382 wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
383 }
384}
385
386void hw_perf_restore(u64 ctrl)
387{
388 if (!x86_pmu_initialized())
389 return;
390 x86_pmu.restore_all(ctrl);
391}
392/*
393 * Exported because of ACPI idle
394 */
395EXPORT_SYMBOL_GPL(hw_perf_restore);
396
397static inline u64 intel_pmu_get_status(void)
398{
399 u64 status;
400
401 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
402
403 return status;
404}
405
406static inline void intel_pmu_ack_status(u64 ack)
407{
408 wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
409}
410
411static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
412{
413 int err;
414 err = checking_wrmsrl(hwc->config_base + idx,
415 hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
416}
417
418static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
419{
420 int err;
421 err = checking_wrmsrl(hwc->config_base + idx,
422 hwc->config);
423}
424
425static inline void
426intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx)
427{
428 int idx = __idx - X86_PMC_IDX_FIXED;
429 u64 ctrl_val, mask;
430 int err;
431
432 mask = 0xfULL << (idx * 4);
433
434 rdmsrl(hwc->config_base, ctrl_val);
435 ctrl_val &= ~mask;
436 err = checking_wrmsrl(hwc->config_base, ctrl_val);
437}
438
439static inline void
440intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
441{
442 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
443 intel_pmu_disable_fixed(hwc, idx);
444 return;
445 }
446
447 x86_pmu_disable_counter(hwc, idx);
448}
449
450static inline void
451amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
452{
453 x86_pmu_disable_counter(hwc, idx);
454}
455
456static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
457
458/*
459 * Set the next IRQ period, based on the hwc->period_left value.
460 * To be called with the counter disabled in hw:
461 */
462static void
463x86_perf_counter_set_period(struct perf_counter *counter,
464 struct hw_perf_counter *hwc, int idx)
465{
466 s64 left = atomic64_read(&hwc->period_left);
467 s64 period = hwc->irq_period;
468 int err;
469
470 /*
471	 * If we are way outside a reasonable range then just skip forward:
472 */
473 if (unlikely(left <= -period)) {
474 left = period;
475 atomic64_set(&hwc->period_left, left);
476 }
477
478 if (unlikely(left <= 0)) {
479 left += period;
480 atomic64_set(&hwc->period_left, left);
481 }
482
483 per_cpu(prev_left[idx], smp_processor_id()) = left;
484
485 /*
486 * The hw counter starts counting from this counter offset,
487	 * mark it to be able to extract future deltas:
488 */
489 atomic64_set(&hwc->prev_count, (u64)-left);
490
491 err = checking_wrmsrl(hwc->counter_base + idx,
492 (u64)(-left) & x86_pmu.counter_mask);
493}
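
Writing (u64)(-left) into the counter is what arms the next interrupt: an up-counter preloaded that way overflows after exactly 'left' increments. A small sketch of that relation (increments_until_overflow is a hypothetical helper, assuming an N-bit counter with mask (1 << N) - 1):

#include <stdint.h>

/* increments needed for an N-bit counter preloaded with -left to wrap */
static uint64_t increments_until_overflow(uint64_t left, int counter_bits)
{
	uint64_t mask  = (1ULL << counter_bits) - 1;
	uint64_t start = (uint64_t)(-left) & mask;

	return mask - start + 1;	/* == left, for 0 < left <= mask */
}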
494
495static inline void
496intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx)
497{
498 int idx = __idx - X86_PMC_IDX_FIXED;
499 u64 ctrl_val, bits, mask;
500 int err;
501
502 /*
503 * Enable IRQ generation (0x8),
504 * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
505 * if requested:
506 */
507 bits = 0x8ULL;
508 if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
509 bits |= 0x2;
510 if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
511 bits |= 0x1;
512 bits <<= (idx * 4);
513 mask = 0xfULL << (idx * 4);
514
515 rdmsrl(hwc->config_base, ctrl_val);
516 ctrl_val &= ~mask;
517 ctrl_val |= bits;
518 err = checking_wrmsrl(hwc->config_base, ctrl_val);
519}
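
For reference, the fixed-counter control MSR packs one 4-bit field per fixed counter; the function above clears the field for this counter and ORs the new bits in. A sketch of the field layout (fixed_ctrl_field is a hypothetical helper mirroring the bit meanings in the comment above):

#include <stdint.h>

/*
 * 4-bit control field for fixed counter 'fixed_idx':
 *   bit 0 - count in ring 0 (OS), bit 1 - count in ring 3 (USR),
 *   bit 3 - generate a PMI on overflow
 */
static uint64_t fixed_ctrl_field(int fixed_idx, int os, int usr, int pmi)
{
	uint64_t bits = 0;

	if (pmi)
		bits |= 0x8;
	if (usr)
		bits |= 0x2;
	if (os)
		bits |= 0x1;

	return bits << (fixed_idx * 4);
}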
520
521static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
522{
523 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
524 intel_pmu_enable_fixed(hwc, idx);
525 return;
526 }
527
528 x86_pmu_enable_counter(hwc, idx);
529}
530
531static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
532{
533 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
534
535 if (cpuc->enabled)
536 x86_pmu_enable_counter(hwc, idx);
537 else
538 x86_pmu_disable_counter(hwc, idx);
539}
540
541static int
542fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
543{
544 unsigned int event;
545
546 if (!x86_pmu.num_counters_fixed)
547 return -1;
548
549 if (unlikely(hwc->nmi))
550 return -1;
551
552 event = hwc->config & ARCH_PERFMON_EVENT_MASK;
553
554 if (unlikely(event == x86_pmu.event_map(PERF_COUNT_INSTRUCTIONS)))
555 return X86_PMC_IDX_FIXED_INSTRUCTIONS;
556 if (unlikely(event == x86_pmu.event_map(PERF_COUNT_CPU_CYCLES)))
557 return X86_PMC_IDX_FIXED_CPU_CYCLES;
558 if (unlikely(event == x86_pmu.event_map(PERF_COUNT_BUS_CYCLES)))
559 return X86_PMC_IDX_FIXED_BUS_CYCLES;
560
561 return -1;
562}
563
564/*
565 * Find a PMC slot for the freshly enabled / scheduled in counter:
566 */
567static int x86_pmu_enable(struct perf_counter *counter)
568{
569 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
570 struct hw_perf_counter *hwc = &counter->hw;
571 int idx;
572
573 idx = fixed_mode_idx(counter, hwc);
574 if (idx >= 0) {
575 /*
576 * Try to get the fixed counter, if that is already taken
577 * then try to get a generic counter:
578 */
579 if (test_and_set_bit(idx, cpuc->used_mask))
580 goto try_generic;
581
582 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
583 /*
584 * We set it so that counter_base + idx in wrmsr/rdmsr maps to
585 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
586 */
587 hwc->counter_base =
588 MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
589 hwc->idx = idx;
590 } else {
591 idx = hwc->idx;
592 /* Try to get the previous generic counter again */
593 if (test_and_set_bit(idx, cpuc->used_mask)) {
594try_generic:
595 idx = find_first_zero_bit(cpuc->used_mask,
596 x86_pmu.num_counters);
597 if (idx == x86_pmu.num_counters)
598 return -EAGAIN;
599
600 set_bit(idx, cpuc->used_mask);
601 hwc->idx = idx;
602 }
603 hwc->config_base = x86_pmu.eventsel;
604 hwc->counter_base = x86_pmu.perfctr;
605 }
606
607 perf_counters_lapic_init(hwc->nmi);
608
609 x86_pmu.disable(hwc, idx);
610
611 cpuc->counters[idx] = counter;
612 set_bit(idx, cpuc->active_mask);
613
614 x86_perf_counter_set_period(counter, hwc, idx);
615 x86_pmu.enable(hwc, idx);
616
617 return 0;
618}
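
The slot allocation above is a first-fit search over used_mask: a fixed-purpose slot is tried first, then the counter's previous generic slot, then any free generic slot. A user-space analogue of the generic-slot search, using a plain word instead of the kernel bitmap helpers (alloc_generic_slot is illustrative only):

/* claim the first free generic slot below num_counters, or -1 if all busy */
static int alloc_generic_slot(unsigned long *used_mask, int num_counters)
{
	int idx;

	for (idx = 0; idx < num_counters; idx++) {
		if (!(*used_mask & (1UL << idx))) {
			*used_mask |= 1UL << idx;
			return idx;
		}
	}
	return -1;
}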
619
620void perf_counter_print_debug(void)
621{
622 u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
623 struct cpu_hw_counters *cpuc;
624 int cpu, idx;
625
626 if (!x86_pmu.num_counters)
627 return;
628
629 local_irq_disable();
630
631 cpu = smp_processor_id();
632 cpuc = &per_cpu(cpu_hw_counters, cpu);
633
634 if (x86_pmu.version >= 2) {
635 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
636 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
637 rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
638 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
639
640 pr_info("\n");
641 pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl);
642 pr_info("CPU#%d: status: %016llx\n", cpu, status);
643 pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
644 pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
645 }
646 pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask);
647
648 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
649 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
650 rdmsrl(x86_pmu.perfctr + idx, pmc_count);
651
652 prev_left = per_cpu(prev_left[idx], cpu);
653
654 pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n",
655 cpu, idx, pmc_ctrl);
656 pr_info("CPU#%d: gen-PMC%d count: %016llx\n",
657 cpu, idx, pmc_count);
658 pr_info("CPU#%d: gen-PMC%d left: %016llx\n",
659 cpu, idx, prev_left);
660 }
661 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
662 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
663
664 pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
665 cpu, idx, pmc_count);
666 }
667 local_irq_enable();
668}
669
670static void x86_pmu_disable(struct perf_counter *counter)
671{
672 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
673 struct hw_perf_counter *hwc = &counter->hw;
674 int idx = hwc->idx;
675
676 /*
677 * Must be done before we disable, otherwise the nmi handler
678	 * could reenable it:
679 */
680 clear_bit(idx, cpuc->active_mask);
681 x86_pmu.disable(hwc, idx);
682
683 /*
684 * Make sure the cleared pointer becomes visible before we
685 * (potentially) free the counter:
686 */
687 barrier();
688
689 /*
690 * Drain the remaining delta count out of a counter
691 * that we are disabling:
692 */
693 x86_perf_counter_update(counter, hwc, idx);
694 cpuc->counters[idx] = NULL;
695 clear_bit(idx, cpuc->used_mask);
696}
697
698/*
699 * Save and restart an expired counter. Called by NMI contexts,
700 * so it has to be careful about preempting normal counter ops:
701 */
702static void intel_pmu_save_and_restart(struct perf_counter *counter)
703{
704 struct hw_perf_counter *hwc = &counter->hw;
705 int idx = hwc->idx;
706
707 x86_perf_counter_update(counter, hwc, idx);
708 x86_perf_counter_set_period(counter, hwc, idx);
709
710 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
711 intel_pmu_enable_counter(hwc, idx);
712}
713
714/*
715 * Maximum interrupt frequency of 100KHz per CPU
716 */
717#define PERFMON_MAX_INTERRUPTS (100000/HZ)
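
As a worked example: with HZ == 1000, PERFMON_MAX_INTERRUPTS evaluates to 100, i.e. the handlers below service at most 100 PMU interrupts per tick before throttling, which caps the per-CPU rate at roughly 100000 interrupts per second.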
718
719/*
720 * This handler is triggered by the local APIC, so the APIC IRQ handling
721 * rules apply:
722 */
723static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
724{
725 int bit, cpu = smp_processor_id();
726 u64 ack, status;
727 struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu);
728 int ret = 0;
729
730 cpuc->throttle_ctrl = intel_pmu_save_disable_all();
731
732 status = intel_pmu_get_status();
733 if (!status)
734 goto out;
735
736 ret = 1;
737again:
738 inc_irq_stat(apic_perf_irqs);
739 ack = status;
740 for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
741 struct perf_counter *counter = cpuc->counters[bit];
742
743 clear_bit(bit, (unsigned long *) &status);
744 if (!test_bit(bit, cpuc->active_mask))
745 continue;
746
747 intel_pmu_save_and_restart(counter);
748 if (perf_counter_overflow(counter, nmi, regs, 0))
749 intel_pmu_disable_counter(&counter->hw, bit);
750 }
751
752 intel_pmu_ack_status(ack);
753
754 /*
755 * Repeat if there is more work to be done:
756 */
757 status = intel_pmu_get_status();
758 if (status)
759 goto again;
760out:
761 /*
762 * Restore - do not reenable when global enable is off or throttled:
763 */
764 if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS)
765 intel_pmu_restore_all(cpuc->throttle_ctrl);
766
767 return ret;
768}
769
770static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
771{
772 int cpu = smp_processor_id();
773 struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu);
774 u64 val;
775 int handled = 0;
776 struct perf_counter *counter;
777 struct hw_perf_counter *hwc;
778 int idx;
779
780 ++cpuc->interrupts;
781 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
782 if (!test_bit(idx, cpuc->active_mask))
783 continue;
784 counter = cpuc->counters[idx];
785 hwc = &counter->hw;
786 val = x86_perf_counter_update(counter, hwc, idx);
787 if (val & (1ULL << (x86_pmu.counter_bits - 1)))
788 continue;
789 /* counter overflow */
790 x86_perf_counter_set_period(counter, hwc, idx);
791 handled = 1;
792 inc_irq_stat(apic_perf_irqs);
793 if (perf_counter_overflow(counter, nmi, regs, 0))
794 amd_pmu_disable_counter(hwc, idx);
795 else if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS)
796 /*
797 * do not reenable when throttled, but reload
798 * the register
799 */
800 amd_pmu_disable_counter(hwc, idx);
801 else if (counter->state == PERF_COUNTER_STATE_ACTIVE)
802 amd_pmu_enable_counter(hwc, idx);
803 }
804 return handled;
805}
806
807void perf_counter_unthrottle(void)
808{
809 struct cpu_hw_counters *cpuc;
810
811 if (!x86_pmu_initialized())
812 return;
813
814 cpuc = &__get_cpu_var(cpu_hw_counters);
815 if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) {
816 if (printk_ratelimit())
817 printk(KERN_WARNING "PERFMON: max interrupts exceeded!\n");
818 hw_perf_restore(cpuc->throttle_ctrl);
819 }
820 cpuc->interrupts = 0;
821}
822
823void smp_perf_counter_interrupt(struct pt_regs *regs)
824{
825 irq_enter();
826 apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
827 ack_APIC_irq();
828 x86_pmu.handle_irq(regs, 0);
829 irq_exit();
830}
831
832void smp_perf_pending_interrupt(struct pt_regs *regs)
833{
834 irq_enter();
835 ack_APIC_irq();
836 inc_irq_stat(apic_pending_irqs);
837 perf_counter_do_pending();
838 irq_exit();
839}
840
841void set_perf_counter_pending(void)
842{
843 apic->send_IPI_self(LOCAL_PENDING_VECTOR);
844}
845
846void perf_counters_lapic_init(int nmi)
847{
848 u32 apic_val;
849
850 if (!x86_pmu_initialized())
851 return;
852
853 /*
854 * Enable the performance counter vector in the APIC LVT:
855 */
856 apic_val = apic_read(APIC_LVTERR);
857
858 apic_write(APIC_LVTERR, apic_val | APIC_LVT_MASKED);
859 if (nmi)
860 apic_write(APIC_LVTPC, APIC_DM_NMI);
861 else
862 apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
863 apic_write(APIC_LVTERR, apic_val);
864}
865
866static int __kprobes
867perf_counter_nmi_handler(struct notifier_block *self,
868 unsigned long cmd, void *__args)
869{
870 struct die_args *args = __args;
871 struct pt_regs *regs;
872 int ret;
873
874 if (!atomic_read(&active_counters))
875 return NOTIFY_DONE;
876
877 switch (cmd) {
878 case DIE_NMI:
879 case DIE_NMI_IPI:
880 break;
881
882 default:
883 return NOTIFY_DONE;
884 }
885
886 regs = args->regs;
887
888 apic_write(APIC_LVTPC, APIC_DM_NMI);
889 ret = x86_pmu.handle_irq(regs, 1);
890
891 return ret ? NOTIFY_STOP : NOTIFY_OK;
892}
893
894static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
895 .notifier_call = perf_counter_nmi_handler,
896 .next = NULL,
897 .priority = 1
898};
899
900static struct x86_pmu intel_pmu = {
901 .name = "Intel",
902 .handle_irq = intel_pmu_handle_irq,
903 .save_disable_all = intel_pmu_save_disable_all,
904 .restore_all = intel_pmu_restore_all,
905 .enable = intel_pmu_enable_counter,
906 .disable = intel_pmu_disable_counter,
907 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
908 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
909 .event_map = intel_pmu_event_map,
910 .raw_event = intel_pmu_raw_event,
911 .max_events = ARRAY_SIZE(intel_perfmon_event_map),
912 /*
913	 * Intel PMCs cannot be accessed sanely above 32-bit width,
914 * so we install an artificial 1<<31 period regardless of
915 * the generic counter period:
916 */
917 .max_period = (1ULL << 31) - 1,
918};
919
920static struct x86_pmu amd_pmu = {
921 .name = "AMD",
922 .handle_irq = amd_pmu_handle_irq,
923 .save_disable_all = amd_pmu_save_disable_all,
924 .restore_all = amd_pmu_restore_all,
925 .enable = amd_pmu_enable_counter,
926 .disable = amd_pmu_disable_counter,
927 .eventsel = MSR_K7_EVNTSEL0,
928 .perfctr = MSR_K7_PERFCTR0,
929 .event_map = amd_pmu_event_map,
930 .raw_event = amd_pmu_raw_event,
931 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
932 .num_counters = 4,
933 .counter_bits = 48,
934 .counter_mask = (1ULL << 48) - 1,
935 /* use highest bit to detect overflow */
936 .max_period = (1ULL << 47) - 1,
937};
938
939static int intel_pmu_init(void)
940{
941 union cpuid10_edx edx;
942 union cpuid10_eax eax;
943 unsigned int unused;
944 unsigned int ebx;
945 int version;
946
947 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
948 return -ENODEV;
949
950 /*
951 * Check whether the Architectural PerfMon supports
952	 * the Branch Misses Retired event:
953 */
954 cpuid(10, &eax.full, &ebx, &unused, &edx.full);
955 if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
956 return -ENODEV;
957
958 version = eax.split.version_id;
959 if (version < 2)
960 return -ENODEV;
961
962 x86_pmu = intel_pmu;
963 x86_pmu.version = version;
964 x86_pmu.num_counters = eax.split.num_counters;
965
966 /*
967 * Quirk: v2 perfmon does not report fixed-purpose counters, so
968 * assume at least 3 counters:
969 */
970 x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
971
972 x86_pmu.counter_bits = eax.split.bit_width;
973 x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1;
974
975 return 0;
976}
977
978static int amd_pmu_init(void)
979{
980 x86_pmu = amd_pmu;
981 return 0;
982}
983
984void __init init_hw_perf_counters(void)
985{
986 int err;
987
988 switch (boot_cpu_data.x86_vendor) {
989 case X86_VENDOR_INTEL:
990 err = intel_pmu_init();
991 break;
992 case X86_VENDOR_AMD:
993 err = amd_pmu_init();
994 break;
995 default:
996 return;
997 }
998 if (err != 0)
999 return;
1000
1001 pr_info("%s Performance Monitoring support detected.\n", x86_pmu.name);
1002 pr_info("... version: %d\n", x86_pmu.version);
1003 pr_info("... bit width: %d\n", x86_pmu.counter_bits);
1004
1005 pr_info("... num counters: %d\n", x86_pmu.num_counters);
1006 if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
1007 x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
1008 WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
1009 x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
1010 }
1011 perf_counter_mask = (1 << x86_pmu.num_counters) - 1;
1012 perf_max_counters = x86_pmu.num_counters;
1013
1014 pr_info("... value mask: %016Lx\n", x86_pmu.counter_mask);
1015 pr_info("... max period: %016Lx\n", x86_pmu.max_period);
1016
1017 if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
1018 x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
1019 WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
1020 x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
1021 }
1022 pr_info("... fixed counters: %d\n", x86_pmu.num_counters_fixed);
1023
1024 perf_counter_mask |=
1025 ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
1026
1027 pr_info("... counter mask: %016Lx\n", perf_counter_mask);
1028
1029 perf_counters_lapic_init(0);
1030 register_die_notifier(&perf_counter_nmi_notifier);
1031}
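
As a concrete example of the mask printed above (assuming X86_PMC_IDX_FIXED is 32, as in this tree): with 4 generic and 3 fixed counters, perf_counter_mask becomes ((1 << 4) - 1) | (((1LL << 3) - 1) << 32) = 0x000000070000000f.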
1032
1033static inline void x86_pmu_read(struct perf_counter *counter)
1034{
1035 x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
1036}
1037
1038static const struct pmu pmu = {
1039 .enable = x86_pmu_enable,
1040 .disable = x86_pmu_disable,
1041 .read = x86_pmu_read,
1042};
1043
1044const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
1045{
1046 int err;
1047
1048 err = __hw_perf_counter_init(counter);
1049 if (err)
1050 return ERR_PTR(err);
1051
1052 return &pmu;
1053}
1054
1055/*
1056 * callchain support
1057 */
1058
1059static inline
1060void callchain_store(struct perf_callchain_entry *entry, unsigned long ip)
1061{
1062 if (entry->nr < MAX_STACK_DEPTH)
1063 entry->ip[entry->nr++] = ip;
1064}
1065
1066static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry);
1067static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry);
1068
1069
1070static void
1071backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
1072{
1073 /* Ignore warnings */
1074}
1075
1076static void backtrace_warning(void *data, char *msg)
1077{
1078 /* Ignore warnings */
1079}
1080
1081static int backtrace_stack(void *data, char *name)
1082{
1083 /* Don't bother with IRQ stacks for now */
1084 return -1;
1085}
1086
1087static void backtrace_address(void *data, unsigned long addr, int reliable)
1088{
1089 struct perf_callchain_entry *entry = data;
1090
1091 if (reliable)
1092 callchain_store(entry, addr);
1093}
1094
1095static const struct stacktrace_ops backtrace_ops = {
1096 .warning = backtrace_warning,
1097 .warning_symbol = backtrace_warning_symbol,
1098 .stack = backtrace_stack,
1099 .address = backtrace_address,
1100};
1101
1102static void
1103perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
1104{
1105 unsigned long bp;
1106 char *stack;
1107 int nr = entry->nr;
1108
1109 callchain_store(entry, instruction_pointer(regs));
1110
1111 stack = ((char *)regs + sizeof(struct pt_regs));
1112#ifdef CONFIG_FRAME_POINTER
1113 bp = frame_pointer(regs);
1114#else
1115 bp = 0;
1116#endif
1117
1118 dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry);
1119
1120 entry->kernel = entry->nr - nr;
1121}
1122
1123
1124struct stack_frame {
1125 const void __user *next_fp;
1126 unsigned long return_address;
1127};
1128
1129static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
1130{
1131 int ret;
1132
1133 if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
1134 return 0;
1135
1136 ret = 1;
1137 pagefault_disable();
1138 if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
1139 ret = 0;
1140 pagefault_enable();
1141
1142 return ret;
1143}
1144
1145static void
1146perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
1147{
1148 struct stack_frame frame;
1149 const void __user *fp;
1150 int nr = entry->nr;
1151
1152 regs = (struct pt_regs *)current->thread.sp0 - 1;
1153 fp = (void __user *)regs->bp;
1154
1155 callchain_store(entry, regs->ip);
1156
1157 while (entry->nr < MAX_STACK_DEPTH) {
1158 frame.next_fp = NULL;
1159 frame.return_address = 0;
1160
1161 if (!copy_stack_frame(fp, &frame))
1162 break;
1163
1164 if ((unsigned long)fp < user_stack_pointer(regs))
1165 break;
1166
1167 callchain_store(entry, frame.return_address);
1168 fp = frame.next_fp;
1169 }
1170
1171 entry->user = entry->nr - nr;
1172}
1173
1174static void
1175perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
1176{
1177 int is_user;
1178
1179 if (!regs)
1180 return;
1181
1182 is_user = user_mode(regs);
1183
1184 if (!current || current->pid == 0)
1185 return;
1186
1187 if (is_user && current->state != TASK_RUNNING)
1188 return;
1189
1190 if (!is_user)
1191 perf_callchain_kernel(regs, entry);
1192
1193 if (current->mm)
1194 perf_callchain_user(regs, entry);
1195}
1196
1197struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
1198{
1199 struct perf_callchain_entry *entry;
1200
1201 if (in_nmi())
1202 entry = &__get_cpu_var(nmi_entry);
1203 else
1204 entry = &__get_cpu_var(irq_entry);
1205
1206 entry->nr = 0;
1207 entry->hv = 0;
1208 entry->kernel = 0;
1209 entry->user = 0;
1210
1211 perf_do_callchain(regs, entry);
1212
1213 return entry;
1214}
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index f6c70a164e32..d6f5b9fbde32 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -19,8 +19,8 @@
19#include <linux/nmi.h> 19#include <linux/nmi.h>
20#include <linux/kprobes.h> 20#include <linux/kprobes.h>
21 21
22#include <asm/genapic.h> 22#include <asm/apic.h>
23#include <asm/intel_arch_perfmon.h> 23#include <asm/perf_counter.h>
24 24
25struct nmi_watchdog_ctlblk { 25struct nmi_watchdog_ctlblk {
26 unsigned int cccr_msr; 26 unsigned int cccr_msr;
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 38946c6e8433..891004619142 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1025,6 +1025,13 @@ apicinterrupt ERROR_APIC_VECTOR \
1025apicinterrupt SPURIOUS_APIC_VECTOR \ 1025apicinterrupt SPURIOUS_APIC_VECTOR \
1026 spurious_interrupt smp_spurious_interrupt 1026 spurious_interrupt smp_spurious_interrupt
1027 1027
1028#ifdef CONFIG_PERF_COUNTERS
1029apicinterrupt LOCAL_PERF_VECTOR \
1030 perf_counter_interrupt smp_perf_counter_interrupt
1031apicinterrupt LOCAL_PENDING_VECTOR \
1032 perf_pending_interrupt smp_perf_pending_interrupt
1033#endif
1034
1028/* 1035/*
1029 * Exception entry points. 1036 * Exception entry points.
1030 */ 1037 */
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index c3fe010d74c8..8279fb8df17f 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -63,6 +63,14 @@ static int show_other_interrupts(struct seq_file *p, int prec)
63 for_each_online_cpu(j) 63 for_each_online_cpu(j)
64 seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); 64 seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
65 seq_printf(p, " Spurious interrupts\n"); 65 seq_printf(p, " Spurious interrupts\n");
66 seq_printf(p, "%*s: ", prec, "CNT");
67 for_each_online_cpu(j)
68 seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
69 seq_printf(p, " Performance counter interrupts\n");
70 seq_printf(p, "%*s: ", prec, "PND");
71 for_each_online_cpu(j)
72 seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs);
73 seq_printf(p, " Performance pending work\n");
66#endif 74#endif
67 if (generic_interrupt_extension) { 75 if (generic_interrupt_extension) {
68 seq_printf(p, "%*s: ", prec, "PLT"); 76 seq_printf(p, "%*s: ", prec, "PLT");
@@ -166,6 +174,8 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
166#ifdef CONFIG_X86_LOCAL_APIC 174#ifdef CONFIG_X86_LOCAL_APIC
167 sum += irq_stats(cpu)->apic_timer_irqs; 175 sum += irq_stats(cpu)->apic_timer_irqs;
168 sum += irq_stats(cpu)->irq_spurious_count; 176 sum += irq_stats(cpu)->irq_spurious_count;
177 sum += irq_stats(cpu)->apic_perf_irqs;
178 sum += irq_stats(cpu)->apic_pending_irqs;
169#endif 179#endif
170 if (generic_interrupt_extension) 180 if (generic_interrupt_extension)
171 sum += irq_stats(cpu)->generic_irqs; 181 sum += irq_stats(cpu)->generic_irqs;
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 368b0a8836f9..3190a6b961e6 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -118,28 +118,8 @@ int vector_used_by_percpu_irq(unsigned int vector)
118 return 0; 118 return 0;
119} 119}
120 120
121/* Overridden in paravirt.c */ 121static void __init smp_intr_init(void)
122void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
123
124void __init native_init_IRQ(void)
125{ 122{
126 int i;
127
128 /* Execute any quirks before the call gates are initialised: */
129 x86_quirk_pre_intr_init();
130
131 /*
132 * Cover the whole vector space, no vector can escape
133 * us. (some of these will be overridden and become
134 * 'special' SMP interrupts)
135 */
136 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
137 /* SYSCALL_VECTOR was reserved in trap_init. */
138 if (i != SYSCALL_VECTOR)
139 set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
140 }
141
142
143#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP) 123#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP)
144 /* 124 /*
145 * The reschedule interrupt is a CPU-to-CPU reschedule-helper 125 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
@@ -168,6 +148,11 @@ void __init native_init_IRQ(void)
168 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); 148 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
169 set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); 149 set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
170#endif 150#endif
151}
152
153static void __init apic_intr_init(void)
154{
155 smp_intr_init();
171 156
172#ifdef CONFIG_X86_LOCAL_APIC 157#ifdef CONFIG_X86_LOCAL_APIC
173 /* self generated IPI for local APIC timer */ 158 /* self generated IPI for local APIC timer */
@@ -179,12 +164,41 @@ void __init native_init_IRQ(void)
179 /* IPI vectors for APIC spurious and error interrupts */ 164 /* IPI vectors for APIC spurious and error interrupts */
180 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 165 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
181 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); 166 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
182#endif 167# ifdef CONFIG_PERF_COUNTERS
168 alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
169 alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
170# endif
183 171
184#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) 172# ifdef CONFIG_X86_MCE_P4THERMAL
185 /* thermal monitor LVT interrupt */ 173 /* thermal monitor LVT interrupt */
186 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); 174 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
175# endif
187#endif 176#endif
177}
178
179/* Overridden in paravirt.c */
180void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
181
182void __init native_init_IRQ(void)
183{
184 int i;
185
186 /* Execute any quirks before the call gates are initialised: */
187 x86_quirk_pre_intr_init();
188
189 apic_intr_init();
190
191 /*
192 * Cover the whole vector space, no vector can escape
193 * us. (some of these will be overridden and become
194 * 'special' SMP interrupts)
195 */
196 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
197 int vector = FIRST_EXTERNAL_VECTOR + i;
198 /* SYSCALL_VECTOR was reserved in trap_init. */
199 if (!test_bit(vector, used_vectors))
200 set_intr_gate(vector, interrupt[i]);
201 }
188 202
189 if (!acpi_ioapic) 203 if (!acpi_ioapic)
190 setup_irq(2, &irq2); 204 setup_irq(2, &irq2);
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 8cd10537fd46..53ceb26f80ff 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -152,6 +152,12 @@ static void __init apic_intr_init(void)
152 /* IPI vectors for APIC spurious and error interrupts */ 152 /* IPI vectors for APIC spurious and error interrupts */
153 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 153 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
154 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); 154 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
155
156 /* Performance monitoring interrupt: */
157#ifdef CONFIG_PERF_COUNTERS
158 alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
159 alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
160#endif
155} 161}
156 162
157void __init native_init_IRQ(void) 163void __init native_init_IRQ(void)
@@ -159,6 +165,9 @@ void __init native_init_IRQ(void)
159 int i; 165 int i;
160 166
161 init_ISA_irqs(); 167 init_ISA_irqs();
168
169 apic_intr_init();
170
162 /* 171 /*
163 * Cover the whole vector space, no vector can escape 172 * Cover the whole vector space, no vector can escape
164 * us. (some of these will be overridden and become 173 * us. (some of these will be overridden and become
@@ -166,12 +175,10 @@ void __init native_init_IRQ(void)
166 */ 175 */
167 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { 176 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
168 int vector = FIRST_EXTERNAL_VECTOR + i; 177 int vector = FIRST_EXTERNAL_VECTOR + i;
169 if (vector != IA32_SYSCALL_VECTOR) 178 if (!test_bit(vector, used_vectors))
170 set_intr_gate(vector, interrupt[i]); 179 set_intr_gate(vector, interrupt[i]);
171 } 180 }
172 181
173 apic_intr_init();
174
175 if (!acpi_ioapic) 182 if (!acpi_ioapic)
176 setup_irq(2, &irq2); 183 setup_irq(2, &irq2);
177} 184}
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 14425166b8e3..0a813b17b172 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -6,7 +6,6 @@
6 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes 6 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
7 * 2000-2002 x86-64 support by Andi Kleen 7 * 2000-2002 x86-64 support by Andi Kleen
8 */ 8 */
9
10#include <linux/sched.h> 9#include <linux/sched.h>
11#include <linux/mm.h> 10#include <linux/mm.h>
12#include <linux/smp.h> 11#include <linux/smp.h>
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index ff5c8736b491..d51321ddafda 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -334,3 +334,5 @@ ENTRY(sys_call_table)
334 .long sys_inotify_init1 334 .long sys_inotify_init1
335 .long sys_preadv 335 .long sys_preadv
336 .long sys_pwritev 336 .long sys_pwritev
337 .long sys_rt_tgsigqueueinfo /* 335 */
338 .long sys_perf_counter_open
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a1d288327ff0..2cc162e09c4b 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -945,8 +945,13 @@ void __init trap_init(void)
945#endif 945#endif
946 set_intr_gate(19, &simd_coprocessor_error); 946 set_intr_gate(19, &simd_coprocessor_error);
947 947
948 /* Reserve all the builtin and the syscall vector: */
949 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
950 set_bit(i, used_vectors);
951
948#ifdef CONFIG_IA32_EMULATION 952#ifdef CONFIG_IA32_EMULATION
949 set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); 953 set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
954 set_bit(IA32_SYSCALL_VECTOR, used_vectors);
950#endif 955#endif
951 956
952#ifdef CONFIG_X86_32 957#ifdef CONFIG_X86_32
@@ -963,17 +968,9 @@ void __init trap_init(void)
963 } 968 }
964 969
965 set_system_trap_gate(SYSCALL_VECTOR, &system_call); 970 set_system_trap_gate(SYSCALL_VECTOR, &system_call);
966#endif
967
968 /* Reserve all the builtin and the syscall vector: */
969 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
970 set_bit(i, used_vectors);
971
972#ifdef CONFIG_X86_64
973 set_bit(IA32_SYSCALL_VECTOR, used_vectors);
974#else
975 set_bit(SYSCALL_VECTOR, used_vectors); 971 set_bit(SYSCALL_VECTOR, used_vectors);
976#endif 972#endif
973
977 /* 974 /*
978 * Should be a barrier for any external CPU state: 975 * Should be a barrier for any external CPU state:
979 */ 976 */
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index a03b7279efa0..6f9df2babe48 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -27,6 +27,7 @@
27#include <linux/tty.h> 27#include <linux/tty.h>
28#include <linux/smp.h> 28#include <linux/smp.h>
29#include <linux/mm.h> 29#include <linux/mm.h>
30#include <linux/perf_counter.h>
30 31
31#include <asm-generic/sections.h> 32#include <asm-generic/sections.h>
32 33
@@ -1044,6 +1045,8 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
1044 if (unlikely(error_code & PF_RSVD)) 1045 if (unlikely(error_code & PF_RSVD))
1045 pgtable_bad(regs, error_code, address); 1046 pgtable_bad(regs, error_code, address);
1046 1047
1048 perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs, address);
1049
1047 /* 1050 /*
1048 * If we're in an interrupt, have no user context or are running 1051 * If we're in an interrupt, have no user context or are running
1049 * in an atomic region then we must not take the fault: 1052 * in an atomic region then we must not take the fault:
@@ -1137,10 +1140,15 @@ good_area:
1137 return; 1140 return;
1138 } 1141 }
1139 1142
1140 if (fault & VM_FAULT_MAJOR) 1143 if (fault & VM_FAULT_MAJOR) {
1141 tsk->maj_flt++; 1144 tsk->maj_flt++;
1142 else 1145 perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0,
1146 regs, address);
1147 } else {
1143 tsk->min_flt++; 1148 tsk->min_flt++;
1149 perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0,
1150 regs, address);
1151 }
1144 1152
1145 check_v8086_mode(regs, address, tsk); 1153 check_v8086_mode(regs, address, tsk);
1146 1154
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 202864ad49a7..c638685136e1 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -40,8 +40,9 @@ static int profile_exceptions_notify(struct notifier_block *self,
40 40
41 switch (val) { 41 switch (val) {
42 case DIE_NMI: 42 case DIE_NMI:
43 if (model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu))) 43 case DIE_NMI_IPI:
44 ret = NOTIFY_STOP; 44 model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu));
45 ret = NOTIFY_STOP;
45 break; 46 break;
46 default: 47 default:
47 break; 48 break;
@@ -134,7 +135,7 @@ static void nmi_cpu_setup(void *dummy)
134static struct notifier_block profile_exceptions_nb = { 135static struct notifier_block profile_exceptions_nb = {
135 .notifier_call = profile_exceptions_notify, 136 .notifier_call = profile_exceptions_notify,
136 .next = NULL, 137 .next = NULL,
137 .priority = 0 138 .priority = 2
138}; 139};
139 140
140static int nmi_setup(void) 141static int nmi_setup(void)
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index 10131fbdaada..4da7230b3d17 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -18,7 +18,7 @@
18#include <asm/msr.h> 18#include <asm/msr.h>
19#include <asm/apic.h> 19#include <asm/apic.h>
20#include <asm/nmi.h> 20#include <asm/nmi.h>
21#include <asm/intel_arch_perfmon.h> 21#include <asm/perf_counter.h>
22 22
23#include "op_x86_model.h" 23#include "op_x86_model.h"
24#include "op_counter.h" 24#include "op_counter.h"
@@ -136,6 +136,13 @@ static int ppro_check_ctrs(struct pt_regs * const regs,
136 u64 val; 136 u64 val;
137 int i; 137 int i;
138 138
139 /*
140 * This can happen if perf counters are in use when
141 * we steal the die notifier NMI.
142 */
143 if (unlikely(!reset_value))
144 goto out;
145
139 for (i = 0 ; i < num_counters; ++i) { 146 for (i = 0 ; i < num_counters; ++i) {
140 if (!reset_value[i]) 147 if (!reset_value[i])
141 continue; 148 continue;
@@ -146,6 +153,7 @@ static int ppro_check_ctrs(struct pt_regs * const regs,
146 } 153 }
147 } 154 }
148 155
156out:
149 /* Only P6 based Pentium M need to re-unmask the apic vector but it 157 /* Only P6 based Pentium M need to re-unmask the apic vector but it
150 * doesn't hurt other P6 variant */ 158 * doesn't hurt other P6 variant */
151 apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED); 159 apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
diff --git a/drivers/Makefile b/drivers/Makefile
index 2618a6169a13..1266ead6ace0 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -36,7 +36,7 @@ obj-$(CONFIG_FB_INTEL) += video/intelfb/
36 36
37obj-y += serial/ 37obj-y += serial/
38obj-$(CONFIG_PARPORT) += parport/ 38obj-$(CONFIG_PARPORT) += parport/
39obj-y += base/ block/ misc/ mfd/ media/ 39obj-y += base/ block/ misc/ mfd/
40obj-$(CONFIG_NUBUS) += nubus/ 40obj-$(CONFIG_NUBUS) += nubus/
41obj-y += macintosh/ 41obj-y += macintosh/
42obj-$(CONFIG_IDE) += ide/ 42obj-$(CONFIG_IDE) += ide/
@@ -71,7 +71,7 @@ obj-$(CONFIG_GAMEPORT) += input/gameport/
71obj-$(CONFIG_INPUT) += input/ 71obj-$(CONFIG_INPUT) += input/
72obj-$(CONFIG_I2O) += message/ 72obj-$(CONFIG_I2O) += message/
73obj-$(CONFIG_RTC_LIB) += rtc/ 73obj-$(CONFIG_RTC_LIB) += rtc/
74obj-y += i2c/ 74obj-y += i2c/ media/
75obj-$(CONFIG_W1) += w1/ 75obj-$(CONFIG_W1) += w1/
76obj-$(CONFIG_POWER_SUPPLY) += power/ 76obj-$(CONFIG_POWER_SUPPLY) += power/
77obj-$(CONFIG_HWMON) += hwmon/ 77obj-$(CONFIG_HWMON) += hwmon/
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index f7ca8c55956b..d2830f39d46b 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -763,8 +763,11 @@ static int acpi_idle_bm_check(void)
763 */ 763 */
764static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx) 764static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx)
765{ 765{
766 u64 perf_flags;
767
766 /* Don't trace irqs off for idle */ 768 /* Don't trace irqs off for idle */
767 stop_critical_timings(); 769 stop_critical_timings();
770 perf_flags = hw_perf_save_disable();
768 if (cx->entry_method == ACPI_CSTATE_FFH) { 771 if (cx->entry_method == ACPI_CSTATE_FFH) {
769 /* Call into architectural FFH based C-state */ 772 /* Call into architectural FFH based C-state */
770 acpi_processor_ffh_cstate_enter(cx); 773 acpi_processor_ffh_cstate_enter(cx);
@@ -779,6 +782,7 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx)
779 gets asserted in time to freeze execution properly. */ 782 gets asserted in time to freeze execution properly. */
780 unused = inl(acpi_gbl_FADT.xpm_timer_block.address); 783 unused = inl(acpi_gbl_FADT.xpm_timer_block.address);
781 } 784 }
785 hw_perf_restore(perf_flags);
782 start_critical_timings(); 786 start_critical_timings();
783} 787}
784 788
diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c
index b0a6a3e51924..aed2b2936ecf 100644
--- a/drivers/char/sysrq.c
+++ b/drivers/char/sysrq.c
@@ -25,6 +25,7 @@
25#include <linux/kbd_kern.h> 25#include <linux/kbd_kern.h>
26#include <linux/proc_fs.h> 26#include <linux/proc_fs.h>
27#include <linux/quotaops.h> 27#include <linux/quotaops.h>
28#include <linux/perf_counter.h>
28#include <linux/kernel.h> 29#include <linux/kernel.h>
29#include <linux/module.h> 30#include <linux/module.h>
30#include <linux/suspend.h> 31#include <linux/suspend.h>
@@ -243,6 +244,7 @@ static void sysrq_handle_showregs(int key, struct tty_struct *tty)
243 struct pt_regs *regs = get_irq_regs(); 244 struct pt_regs *regs = get_irq_regs();
244 if (regs) 245 if (regs)
245 show_regs(regs); 246 show_regs(regs);
247 perf_counter_print_debug();
246} 248}
247static struct sysrq_key_op sysrq_showregs_op = { 249static struct sysrq_key_op sysrq_showregs_op = {
248 .handler = sysrq_handle_showregs, 250 .handler = sysrq_handle_showregs,
diff --git a/drivers/isdn/capi/kcapi.c b/drivers/isdn/capi/kcapi.c
index 5360c4fd4739..f33170368cd1 100644
--- a/drivers/isdn/capi/kcapi.c
+++ b/drivers/isdn/capi/kcapi.c
@@ -270,6 +270,15 @@ static void recv_handler(struct work_struct *work)
270 mutex_unlock(&ap->recv_mtx); 270 mutex_unlock(&ap->recv_mtx);
271} 271}
272 272
273/**
274 * capi_ctr_handle_message() - handle incoming CAPI message
275 * @card: controller descriptor structure.
276 * @appl: application ID.
277 * @skb: message.
278 *
279 * Called by hardware driver to pass a CAPI message to the application.
280 */
281
273void capi_ctr_handle_message(struct capi_ctr * card, u16 appl, struct sk_buff *skb) 282void capi_ctr_handle_message(struct capi_ctr * card, u16 appl, struct sk_buff *skb)
274{ 283{
275 struct capi20_appl *ap; 284 struct capi20_appl *ap;
@@ -348,6 +357,13 @@ error:
348 357
349EXPORT_SYMBOL(capi_ctr_handle_message); 358EXPORT_SYMBOL(capi_ctr_handle_message);
350 359
360/**
361 * capi_ctr_ready() - signal CAPI controller ready
362 * @card: controller descriptor structure.
363 *
364 * Called by hardware driver to signal that the controller is up and running.
365 */
366
351void capi_ctr_ready(struct capi_ctr * card) 367void capi_ctr_ready(struct capi_ctr * card)
352{ 368{
353 card->cardstate = CARD_RUNNING; 369 card->cardstate = CARD_RUNNING;
@@ -360,6 +376,14 @@ void capi_ctr_ready(struct capi_ctr * card)
360 376
361EXPORT_SYMBOL(capi_ctr_ready); 377EXPORT_SYMBOL(capi_ctr_ready);
362 378
379/**
380 * capi_ctr_reseted() - signal CAPI controller reset
381 * @card: controller descriptor structure.
382 *
383 * Called by hardware driver to signal that the controller is down and
384 * unavailable for use.
385 */
386
363void capi_ctr_reseted(struct capi_ctr * card) 387void capi_ctr_reseted(struct capi_ctr * card)
364{ 388{
365 u16 appl; 389 u16 appl;
@@ -391,6 +415,13 @@ void capi_ctr_reseted(struct capi_ctr * card)
391 415
392EXPORT_SYMBOL(capi_ctr_reseted); 416EXPORT_SYMBOL(capi_ctr_reseted);
393 417
418/**
419 * capi_ctr_suspend_output() - suspend controller
420 * @card: controller descriptor structure.
421 *
422 * Called by hardware driver to stop data flow.
423 */
424
394void capi_ctr_suspend_output(struct capi_ctr *card) 425void capi_ctr_suspend_output(struct capi_ctr *card)
395{ 426{
396 if (!card->blocked) { 427 if (!card->blocked) {
@@ -401,6 +432,13 @@ void capi_ctr_suspend_output(struct capi_ctr *card)
401 432
402EXPORT_SYMBOL(capi_ctr_suspend_output); 433EXPORT_SYMBOL(capi_ctr_suspend_output);
403 434
435/**
436 * capi_ctr_resume_output() - resume controller
437 * @card: controller descriptor structure.
438 *
439 * Called by hardware driver to resume data flow.
440 */
441
404void capi_ctr_resume_output(struct capi_ctr *card) 442void capi_ctr_resume_output(struct capi_ctr *card)
405{ 443{
406 if (card->blocked) { 444 if (card->blocked) {
@@ -413,6 +451,14 @@ EXPORT_SYMBOL(capi_ctr_resume_output);
413 451
414/* ------------------------------------------------------------- */ 452/* ------------------------------------------------------------- */
415 453
454/**
455 * attach_capi_ctr() - register CAPI controller
456 * @card: controller descriptor structure.
457 *
458 * Called by hardware driver to register a controller with the CAPI subsystem.
459 * Return value: 0 on success, error code < 0 on error
460 */
461
416int 462int
417attach_capi_ctr(struct capi_ctr *card) 463attach_capi_ctr(struct capi_ctr *card)
418{ 464{
@@ -459,6 +505,15 @@ attach_capi_ctr(struct capi_ctr *card)
459 505
460EXPORT_SYMBOL(attach_capi_ctr); 506EXPORT_SYMBOL(attach_capi_ctr);
461 507
508/**
509 * detach_capi_ctr() - unregister CAPI controller
510 * @card: controller descriptor structure.
511 *
512 * Called by hardware driver to remove the registration of a controller
513 * with the CAPI subsystem.
514 * Return value: 0 on success, error code < 0 on error
515 */
516
462int detach_capi_ctr(struct capi_ctr *card) 517int detach_capi_ctr(struct capi_ctr *card)
463{ 518{
464 if (card->cardstate != CARD_DETECTED) 519 if (card->cardstate != CARD_DETECTED)
@@ -479,6 +534,13 @@ int detach_capi_ctr(struct capi_ctr *card)
479 534
480EXPORT_SYMBOL(detach_capi_ctr); 535EXPORT_SYMBOL(detach_capi_ctr);
481 536
537/**
538 * register_capi_driver() - register CAPI driver
539 * @driver: driver descriptor structure.
540 *
541 * Called by hardware driver to register itself with the CAPI subsystem.
542 */
543
482void register_capi_driver(struct capi_driver *driver) 544void register_capi_driver(struct capi_driver *driver)
483{ 545{
484 unsigned long flags; 546 unsigned long flags;
@@ -490,6 +552,13 @@ void register_capi_driver(struct capi_driver *driver)
490 552
491EXPORT_SYMBOL(register_capi_driver); 553EXPORT_SYMBOL(register_capi_driver);
492 554
555/**
556 * unregister_capi_driver() - unregister CAPI driver
557 * @driver: driver descriptor structure.
558 *
559 * Called by hardware driver to unregister itself from the CAPI subsystem.
560 */
561
493void unregister_capi_driver(struct capi_driver *driver) 562void unregister_capi_driver(struct capi_driver *driver)
494{ 563{
495 unsigned long flags; 564 unsigned long flags;
@@ -505,6 +574,13 @@ EXPORT_SYMBOL(unregister_capi_driver);
505/* -------- CAPI2.0 Interface ---------------------------------- */ 574/* -------- CAPI2.0 Interface ---------------------------------- */
506/* ------------------------------------------------------------- */ 575/* ------------------------------------------------------------- */
507 576
577/**
578 * capi20_isinstalled() - CAPI 2.0 operation CAPI_INSTALLED
579 *
580 * Return value: CAPI result code (CAPI_NOERROR if at least one ISDN controller
581 * is ready for use, CAPI_REGNOTINSTALLED otherwise)
582 */
583
508u16 capi20_isinstalled(void) 584u16 capi20_isinstalled(void)
509{ 585{
510 int i; 586 int i;
@@ -517,6 +593,18 @@ u16 capi20_isinstalled(void)
517 593
518EXPORT_SYMBOL(capi20_isinstalled); 594EXPORT_SYMBOL(capi20_isinstalled);
519 595
596/**
597 * capi20_register() - CAPI 2.0 operation CAPI_REGISTER
598 * @ap: CAPI application descriptor structure.
599 *
600 * Register an application's presence with CAPI.
601 * A unique application ID is assigned and stored in @ap->applid.
602 * After this function returns successfully, the message receive
603 * callback function @ap->recv_message() may be called at any time
604 * until capi20_release() has been called for the same @ap.
605 * Return value: CAPI result code
606 */
607
520u16 capi20_register(struct capi20_appl *ap) 608u16 capi20_register(struct capi20_appl *ap)
521{ 609{
522 int i; 610 int i;
@@ -571,6 +659,16 @@ u16 capi20_register(struct capi20_appl *ap)
571 659
572EXPORT_SYMBOL(capi20_register); 660EXPORT_SYMBOL(capi20_register);
573 661
662/**
663 * capi20_release() - CAPI 2.0 operation CAPI_RELEASE
664 * @ap: CAPI application descriptor structure.
665 *
666 * Terminate an application's registration with CAPI.
667 * After this function returns successfully, the message receive
668 * callback function @ap->recv_message() will no longer be called.
669 * Return value: CAPI result code
670 */
671
574u16 capi20_release(struct capi20_appl *ap) 672u16 capi20_release(struct capi20_appl *ap)
575{ 673{
576 int i; 674 int i;
@@ -603,6 +701,15 @@ u16 capi20_release(struct capi20_appl *ap)
603 701
604EXPORT_SYMBOL(capi20_release); 702EXPORT_SYMBOL(capi20_release);
605 703
704/**
705 * capi20_put_message() - CAPI 2.0 operation CAPI_PUT_MESSAGE
706 * @ap: CAPI application descriptor structure.
707 * @skb: CAPI message.
708 *
709 * Transfer a single message to CAPI.
710 * Return value: CAPI result code
711 */
712
606u16 capi20_put_message(struct capi20_appl *ap, struct sk_buff *skb) 713u16 capi20_put_message(struct capi20_appl *ap, struct sk_buff *skb)
607{ 714{
608 struct capi_ctr *card; 715 struct capi_ctr *card;
@@ -668,6 +775,16 @@ u16 capi20_put_message(struct capi20_appl *ap, struct sk_buff *skb)
668 775
669EXPORT_SYMBOL(capi20_put_message); 776EXPORT_SYMBOL(capi20_put_message);
670 777
778/**
779 * capi20_get_manufacturer() - CAPI 2.0 operation CAPI_GET_MANUFACTURER
780 * @contr: controller number.
781 * @buf: result buffer (64 bytes).
782 *
783 * Retrieve information about the manufacturer of the specified ISDN controller
784 * or (for @contr == 0) the driver itself.
785 * Return value: CAPI result code
786 */
787
671u16 capi20_get_manufacturer(u32 contr, u8 *buf) 788u16 capi20_get_manufacturer(u32 contr, u8 *buf)
672{ 789{
673 struct capi_ctr *card; 790 struct capi_ctr *card;
@@ -685,6 +802,16 @@ u16 capi20_get_manufacturer(u32 contr, u8 *buf)
685 802
686EXPORT_SYMBOL(capi20_get_manufacturer); 803EXPORT_SYMBOL(capi20_get_manufacturer);
687 804
805/**
806 * capi20_get_version() - CAPI 2.0 operation CAPI_GET_VERSION
807 * @contr: controller number.
808 * @verp: result structure.
809 *
810 * Retrieve version information for the specified ISDN controller
811 * or (for @contr == 0) the driver itself.
812 * Return value: CAPI result code
813 */
814
688u16 capi20_get_version(u32 contr, struct capi_version *verp) 815u16 capi20_get_version(u32 contr, struct capi_version *verp)
689{ 816{
690 struct capi_ctr *card; 817 struct capi_ctr *card;
@@ -703,6 +830,16 @@ u16 capi20_get_version(u32 contr, struct capi_version *verp)
703 830
704EXPORT_SYMBOL(capi20_get_version); 831EXPORT_SYMBOL(capi20_get_version);
705 832
833/**
834 * capi20_get_serial() - CAPI 2.0 operation CAPI_GET_SERIAL_NUMBER
835 * @contr: controller number.
836 * @serial: result buffer (8 bytes).
837 *
838 * Retrieve the serial number of the specified ISDN controller
839 * or (for @contr == 0) the driver itself.
840 * Return value: CAPI result code
841 */
842
706u16 capi20_get_serial(u32 contr, u8 *serial) 843u16 capi20_get_serial(u32 contr, u8 *serial)
707{ 844{
708 struct capi_ctr *card; 845 struct capi_ctr *card;
@@ -721,6 +858,16 @@ u16 capi20_get_serial(u32 contr, u8 *serial)
721 858
722EXPORT_SYMBOL(capi20_get_serial); 859EXPORT_SYMBOL(capi20_get_serial);
723 860
861/**
862 * capi20_get_profile() - CAPI 2.0 operation CAPI_GET_PROFILE
863 * @contr: controller number.
864 * @profp: result structure.
865 *
866 * Retrieve capability information for the specified ISDN controller
867 * or (for @contr == 0) the number of installed controllers.
868 * Return value: CAPI result code
869 */
870
724u16 capi20_get_profile(u32 contr, struct capi_profile *profp) 871u16 capi20_get_profile(u32 contr, struct capi_profile *profp)
725{ 872{
726 struct capi_ctr *card; 873 struct capi_ctr *card;
@@ -903,6 +1050,15 @@ static int old_capi_manufacturer(unsigned int cmd, void __user *data)
903} 1050}
904#endif 1051#endif
905 1052
1053/**
1054 * capi20_manufacturer() - CAPI 2.0 operation CAPI_MANUFACTURER
1055 * @cmd: command.
1056 * @data: parameter.
1057 *
1058 * Perform manufacturer specific command.
1059 * Return value: CAPI result code
1060 */
1061
906int capi20_manufacturer(unsigned int cmd, void __user *data) 1062int capi20_manufacturer(unsigned int cmd, void __user *data)
907{ 1063{
908 struct capi_ctr *card; 1064 struct capi_ctr *card;
@@ -981,6 +1137,21 @@ int capi20_manufacturer(unsigned int cmd, void __user *data)
981EXPORT_SYMBOL(capi20_manufacturer); 1137EXPORT_SYMBOL(capi20_manufacturer);
982 1138
983/* temporary hack */ 1139/* temporary hack */
1140
1141/**
1142 * capi20_set_callback() - set CAPI application notification callback function
1143 * @ap: CAPI application descriptor structure.
1144 * @callback: callback function (NULL to remove).
1145 *
1146 * If not NULL, the callback function will be called to notify the
1147 * application of the addition or removal of a controller.
1148 * The first argument (cmd) will tell whether the controller was added
1149 * (KCI_CONTRUP) or removed (KCI_CONTRDOWN).
1150 * The second argument (contr) will be the controller number.
1151 * For cmd==KCI_CONTRUP the third argument (data) will be a pointer to the
1152 * new controller's capability profile structure.
1153 */
1154
984void capi20_set_callback(struct capi20_appl *ap, 1155void capi20_set_callback(struct capi20_appl *ap,
985 void (*callback) (unsigned int cmd, __u32 contr, void *data)) 1156 void (*callback) (unsigned int cmd, __u32 contr, void *data))
986{ 1157{
diff --git a/drivers/media/video/au0828/au0828-core.c b/drivers/media/video/au0828/au0828-core.c
index 4cee0b92eeee..a1e4c0d769a6 100644
--- a/drivers/media/video/au0828/au0828-core.c
+++ b/drivers/media/video/au0828/au0828-core.c
@@ -192,8 +192,6 @@ static int au0828_usb_probe(struct usb_interface *interface,
192 dev->usbdev = usbdev; 192 dev->usbdev = usbdev;
193 dev->boardnr = id->driver_info; 193 dev->boardnr = id->driver_info;
194 194
195 usb_set_intfdata(interface, dev);
196
197 /* Create the v4l2_device */ 195 /* Create the v4l2_device */
198 retval = v4l2_device_register(&interface->dev, &dev->v4l2_dev); 196 retval = v4l2_device_register(&interface->dev, &dev->v4l2_dev);
199 if (retval) { 197 if (retval) {
@@ -222,6 +220,10 @@ static int au0828_usb_probe(struct usb_interface *interface,
222 /* Digital TV */ 220 /* Digital TV */
223 au0828_dvb_register(dev); 221 au0828_dvb_register(dev);
224 222
223 /* Store the pointer to the au0828_dev so it can be accessed in
224 au0828_usb_disconnect */
225 usb_set_intfdata(interface, dev);
226
225 printk(KERN_INFO "Registered device AU0828 [%s]\n", 227 printk(KERN_INFO "Registered device AU0828 [%s]\n",
226 dev->board.name == NULL ? "Unset" : dev->board.name); 228 dev->board.name == NULL ? "Unset" : dev->board.name);
227 229
diff --git a/drivers/media/video/cx18/cx18-audio.c b/drivers/media/video/cx18/cx18-audio.c
index 1519e91c677a..7a8ad5963de8 100644
--- a/drivers/media/video/cx18/cx18-audio.c
+++ b/drivers/media/video/cx18/cx18-audio.c
@@ -44,7 +44,7 @@ int cx18_audio_set_io(struct cx18 *cx)
44 44
45 /* handle muxer chips */ 45 /* handle muxer chips */
46 v4l2_subdev_call(cx->sd_extmux, audio, s_routing, 46 v4l2_subdev_call(cx->sd_extmux, audio, s_routing,
47 in->audio_input, 0, 0); 47 (u32) in->muxer_input, 0, 0);
48 48
49 err = cx18_call_hw_err(cx, cx->card->hw_audio_ctrl, 49 err = cx18_call_hw_err(cx, cx->card->hw_audio_ctrl,
50 audio, s_routing, in->audio_input, 0, 0); 50 audio, s_routing, in->audio_input, 0, 0);
diff --git a/drivers/media/video/cx18/cx18-i2c.c b/drivers/media/video/cx18/cx18-i2c.c
index b9b7064a2be8..8591e4fc359f 100644
--- a/drivers/media/video/cx18/cx18-i2c.c
+++ b/drivers/media/video/cx18/cx18-i2c.c
@@ -211,7 +211,7 @@ static struct i2c_algo_bit_data cx18_i2c_algo_template = {
211/* init + register i2c algo-bit adapter */ 211/* init + register i2c algo-bit adapter */
212int init_cx18_i2c(struct cx18 *cx) 212int init_cx18_i2c(struct cx18 *cx)
213{ 213{
214 int i; 214 int i, err;
215 CX18_DEBUG_I2C("i2c init\n"); 215 CX18_DEBUG_I2C("i2c init\n");
216 216
217 for (i = 0; i < 2; i++) { 217 for (i = 0; i < 2; i++) {
@@ -268,8 +268,18 @@ int init_cx18_i2c(struct cx18 *cx)
268 cx18_call_hw(cx, CX18_HW_GPIO_RESET_CTRL, 268 cx18_call_hw(cx, CX18_HW_GPIO_RESET_CTRL,
269 core, reset, (u32) CX18_GPIO_RESET_I2C); 269 core, reset, (u32) CX18_GPIO_RESET_I2C);
270 270
271 return i2c_bit_add_bus(&cx->i2c_adap[0]) || 271 err = i2c_bit_add_bus(&cx->i2c_adap[0]);
272 i2c_bit_add_bus(&cx->i2c_adap[1]); 272 if (err)
273 goto err;
274 err = i2c_bit_add_bus(&cx->i2c_adap[1]);
275 if (err)
276 goto err_del_bus_0;
277 return 0;
278
279 err_del_bus_0:
280 i2c_del_adapter(&cx->i2c_adap[0]);
281 err:
282 return err;
273} 283}
274 284
275void exit_cx18_i2c(struct cx18 *cx) 285void exit_cx18_i2c(struct cx18 *cx)
diff --git a/drivers/media/video/cx231xx/Kconfig b/drivers/media/video/cx231xx/Kconfig
index 91156546a07a..477d4ab5e9ac 100644
--- a/drivers/media/video/cx231xx/Kconfig
+++ b/drivers/media/video/cx231xx/Kconfig
@@ -1,12 +1,11 @@
1config VIDEO_CX231XX 1config VIDEO_CX231XX
2 tristate "Conexant cx231xx USB video capture support" 2 tristate "Conexant cx231xx USB video capture support"
3 depends on VIDEO_DEV && I2C && INPUT 3 depends on VIDEO_DEV && I2C && INPUT
4 select VIDEO_TUNER 4 select VIDEO_TUNER
5 select VIDEO_TVEEPROM 5 select VIDEO_TVEEPROM
6 select VIDEO_IR 6 select VIDEO_IR
7 select VIDEOBUF_VMALLOC 7 select VIDEOBUF_VMALLOC
8 select VIDEO_CX25840 8 select VIDEO_CX25840
9 select VIDEO_CX231XX_ALSA
10 9
11 ---help--- 10 ---help---
12 This is a video4linux driver for Conexant 231xx USB based TV cards. 11 This is a video4linux driver for Conexant 231xx USB based TV cards.
@@ -15,21 +14,22 @@ config VIDEO_CX231XX
15 module will be called cx231xx 14 module will be called cx231xx
16 15
17config VIDEO_CX231XX_ALSA 16config VIDEO_CX231XX_ALSA
18 tristate "Conexant Cx231xx ALSA audio module" 17 tristate "Conexant Cx231xx ALSA audio module"
19 depends on VIDEO_CX231XX && SND 18 depends on VIDEO_CX231XX && SND
20 select SND_PCM 19 select SND_PCM
21 20
22 ---help--- 21 ---help---
23 This is an ALSA driver for Cx231xx USB based TV cards. 22 This is an ALSA driver for Cx231xx USB based TV cards.
24 23
25 To compile this driver as a module, choose M here: the 24 To compile this driver as a module, choose M here: the
26 module will be called cx231xx-alsa 25 module will be called cx231xx-alsa
27 26
28config VIDEO_CX231XX_DVB 27config VIDEO_CX231XX_DVB
29 tristate "DVB/ATSC Support for Cx231xx based TV cards" 28 tristate "DVB/ATSC Support for Cx231xx based TV cards"
30 depends on VIDEO_CX231XX && DVB_CORE 29 depends on VIDEO_CX231XX && DVB_CORE
31 select VIDEOBUF_DVB 30 select VIDEOBUF_DVB
32 select MEDIA_TUNER_XC5000 if !DVB_FE_CUSTOMISE 31 select MEDIA_TUNER_XC5000 if !DVB_FE_CUSTOMISE
33 ---help--- 32
34 This adds support for DVB cards based on the 33 ---help---
35 Conexant cx231xx chips. 34 This adds support for DVB cards based on the
35 Conexant cx231xx chips.
diff --git a/drivers/media/video/cx23885/cx23885-cards.c b/drivers/media/video/cx23885/cx23885-cards.c
index a3c0565be1a9..6d6293f7d428 100644
--- a/drivers/media/video/cx23885/cx23885-cards.c
+++ b/drivers/media/video/cx23885/cx23885-cards.c
@@ -441,9 +441,9 @@ int cx23885_tuner_callback(void *priv, int component, int command, int arg)
441 case CX23885_BOARD_DVICO_FUSIONHDTV_DVB_T_DUAL_EXP: 441 case CX23885_BOARD_DVICO_FUSIONHDTV_DVB_T_DUAL_EXP:
442 /* Two identical tuners on two different i2c buses, 442 /* Two identical tuners on two different i2c buses,
443 * we need to reset the correct gpio. */ 443 * we need to reset the correct gpio. */
444 if (port->nr == 0) 444 if (port->nr == 1)
445 bitmask = 0x01; 445 bitmask = 0x01;
446 else if (port->nr == 1) 446 else if (port->nr == 2)
447 bitmask = 0x04; 447 bitmask = 0x04;
448 break; 448 break;
449 } 449 }
diff --git a/drivers/media/video/cx23885/cx23885-dvb.c b/drivers/media/video/cx23885/cx23885-dvb.c
index f48454ab3900..0c49a98213c4 100644
--- a/drivers/media/video/cx23885/cx23885-dvb.c
+++ b/drivers/media/video/cx23885/cx23885-dvb.c
@@ -314,6 +314,7 @@ static struct zl10353_config dvico_fusionhdtv_xc3028 = {
314 .demod_address = 0x0f, 314 .demod_address = 0x0f,
315 .if2 = 45600, 315 .if2 = 45600,
316 .no_tuner = 1, 316 .no_tuner = 1,
317 .disable_i2c_gate_ctrl = 1,
317}; 318};
318 319
319static struct stv0900_config netup_stv0900_config = { 320static struct stv0900_config netup_stv0900_config = {
diff --git a/drivers/media/video/mx3_camera.c b/drivers/media/video/mx3_camera.c
index c462b811e994..2d0781118eb0 100644
--- a/drivers/media/video/mx3_camera.c
+++ b/drivers/media/video/mx3_camera.c
@@ -1063,10 +1063,6 @@ static struct soc_camera_host_ops mx3_soc_camera_host_ops = {
1063 .owner = THIS_MODULE, 1063 .owner = THIS_MODULE,
1064 .add = mx3_camera_add_device, 1064 .add = mx3_camera_add_device,
1065 .remove = mx3_camera_remove_device, 1065 .remove = mx3_camera_remove_device,
1066#ifdef CONFIG_PM
1067 .suspend = mx3_camera_suspend,
1068 .resume = mx3_camera_resume,
1069#endif
1070 .set_crop = mx3_camera_set_crop, 1066 .set_crop = mx3_camera_set_crop,
1071 .set_fmt = mx3_camera_set_fmt, 1067 .set_fmt = mx3_camera_set_fmt,
1072 .try_fmt = mx3_camera_try_fmt, 1068 .try_fmt = mx3_camera_try_fmt,
diff --git a/drivers/media/video/s2255drv.c b/drivers/media/video/s2255drv.c
index 5202cadb2aae..30f4698be90a 100644
--- a/drivers/media/video/s2255drv.c
+++ b/drivers/media/video/s2255drv.c
@@ -1237,6 +1237,7 @@ static int s2255_set_mode(struct s2255_dev *dev, unsigned long chn,
1237 buffer[1] = (u32) chn_rev; 1237 buffer[1] = (u32) chn_rev;
1238 buffer[2] = CMD_SET_MODE; 1238 buffer[2] = CMD_SET_MODE;
1239 memcpy(&buffer[3], &dev->mode[chn], sizeof(struct s2255_mode)); 1239 memcpy(&buffer[3], &dev->mode[chn], sizeof(struct s2255_mode));
1240 dev->setmode_ready[chn] = 0;
1240 res = s2255_write_config(dev->udev, (unsigned char *)buffer, 512); 1241 res = s2255_write_config(dev->udev, (unsigned char *)buffer, 512);
1241 if (debug) 1242 if (debug)
1242 dump_verify_mode(dev, mode); 1243 dump_verify_mode(dev, mode);
@@ -1245,7 +1246,6 @@ static int s2255_set_mode(struct s2255_dev *dev, unsigned long chn,
1245 1246
1246 /* wait at least 3 frames before continuing */ 1247 /* wait at least 3 frames before continuing */
1247 if (mode->restart) { 1248 if (mode->restart) {
1248 dev->setmode_ready[chn] = 0;
1249 wait_event_timeout(dev->wait_setmode[chn], 1249 wait_event_timeout(dev->wait_setmode[chn],
1250 (dev->setmode_ready[chn] != 0), 1250 (dev->setmode_ready[chn] != 0),
1251 msecs_to_jiffies(S2255_SETMODE_TIMEOUT)); 1251 msecs_to_jiffies(S2255_SETMODE_TIMEOUT));
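
The point of this hunk is ordering: setmode_ready is now cleared before the command goes out, so a completion that arrives before wait_event_timeout() runs cannot be missed. Stated generically (all names below are hypothetical, not from the driver):

	dev->ready = 0;                     /* arm the flag first              */
	send_command(dev);                  /* completion handler sets ->ready */
	wait_event_timeout(dev->wait_queue, dev->ready != 0,
			   msecs_to_jiffies(TIMEOUT_MS));
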
diff --git a/drivers/media/video/saa5246a.c b/drivers/media/video/saa5246a.c
index da47b2f05288..155804b061e9 100644
--- a/drivers/media/video/saa5246a.c
+++ b/drivers/media/video/saa5246a.c
@@ -1092,9 +1092,8 @@ static int saa5246a_probe(struct i2c_client *client,
1092 /* Register it */ 1092 /* Register it */
1093 err = video_register_device(t->vdev, VFL_TYPE_VTX, -1); 1093 err = video_register_device(t->vdev, VFL_TYPE_VTX, -1);
1094 if (err < 0) { 1094 if (err < 0) {
1095 kfree(t);
1096 video_device_release(t->vdev); 1095 video_device_release(t->vdev);
1097 t->vdev = NULL; 1096 kfree(t);
1098 return err; 1097 return err;
1099 } 1098 }
1100 return 0; 1099 return 0;
diff --git a/drivers/media/video/saa5249.c b/drivers/media/video/saa5249.c
index 48b27fe48087..271d6e931b75 100644
--- a/drivers/media/video/saa5249.c
+++ b/drivers/media/video/saa5249.c
@@ -598,6 +598,7 @@ static int saa5249_probe(struct i2c_client *client,
598 /* Now create a video4linux device */ 598 /* Now create a video4linux device */
599 t->vdev = video_device_alloc(); 599 t->vdev = video_device_alloc();
600 if (t->vdev == NULL) { 600 if (t->vdev == NULL) {
601 kfree(t);
601 kfree(client); 602 kfree(client);
602 return -ENOMEM; 603 return -ENOMEM;
603 } 604 }
@@ -617,9 +618,8 @@ static int saa5249_probe(struct i2c_client *client,
617 /* Register it */ 618 /* Register it */
618 err = video_register_device(t->vdev, VFL_TYPE_VTX, -1); 619 err = video_register_device(t->vdev, VFL_TYPE_VTX, -1);
619 if (err < 0) { 620 if (err < 0) {
620 kfree(t);
621 video_device_release(t->vdev); 621 video_device_release(t->vdev);
622 t->vdev = NULL; 622 kfree(t);
623 return err; 623 return err;
624 } 624 }
625 return 0; 625 return 0;
diff --git a/drivers/net/e100.c b/drivers/net/e100.c
index 5c0b457c7868..0f9ee1348552 100644
--- a/drivers/net/e100.c
+++ b/drivers/net/e100.c
@@ -2728,7 +2728,7 @@ static void __devexit e100_remove(struct pci_dev *pdev)
2728#define E100_82552_SMARTSPEED 0x14 /* SmartSpeed Ctrl register */ 2728#define E100_82552_SMARTSPEED 0x14 /* SmartSpeed Ctrl register */
2729#define E100_82552_REV_ANEG 0x0200 /* Reverse auto-negotiation */ 2729#define E100_82552_REV_ANEG 0x0200 /* Reverse auto-negotiation */
2730#define E100_82552_ANEG_NOW 0x0400 /* Auto-negotiate now */ 2730#define E100_82552_ANEG_NOW 0x0400 /* Auto-negotiate now */
2731static int e100_suspend(struct pci_dev *pdev, pm_message_t state) 2731static void __e100_shutdown(struct pci_dev *pdev, bool *enable_wake)
2732{ 2732{
2733 struct net_device *netdev = pci_get_drvdata(pdev); 2733 struct net_device *netdev = pci_get_drvdata(pdev);
2734 struct nic *nic = netdev_priv(netdev); 2734 struct nic *nic = netdev_priv(netdev);
@@ -2749,19 +2749,32 @@ static int e100_suspend(struct pci_dev *pdev, pm_message_t state)
2749 E100_82552_SMARTSPEED, smartspeed | 2749 E100_82552_SMARTSPEED, smartspeed |
2750 E100_82552_REV_ANEG | E100_82552_ANEG_NOW); 2750 E100_82552_REV_ANEG | E100_82552_ANEG_NOW);
2751 } 2751 }
2752 if (pci_enable_wake(pdev, PCI_D3cold, true)) 2752 *enable_wake = true;
2753 pci_enable_wake(pdev, PCI_D3hot, true);
2754 } else { 2753 } else {
2755 pci_enable_wake(pdev, PCI_D3hot, false); 2754 *enable_wake = false;
2756 } 2755 }
2757 2756
2758 pci_disable_device(pdev); 2757 pci_disable_device(pdev);
2759 pci_set_power_state(pdev, PCI_D3hot); 2758}
2760 2759
2761 return 0; 2760static int __e100_power_off(struct pci_dev *pdev, bool wake)
2761{
2762 if (wake) {
2763 return pci_prepare_to_sleep(pdev);
2764 } else {
2765 pci_wake_from_d3(pdev, false);
2766 return pci_set_power_state(pdev, PCI_D3hot);
2767 }
2762} 2768}
2763 2769
2764#ifdef CONFIG_PM 2770#ifdef CONFIG_PM
2771static int e100_suspend(struct pci_dev *pdev, pm_message_t state)
2772{
2773 bool wake;
2774 __e100_shutdown(pdev, &wake);
2775 return __e100_power_off(pdev, wake);
2776}
2777
2765static int e100_resume(struct pci_dev *pdev) 2778static int e100_resume(struct pci_dev *pdev)
2766{ 2779{
2767 struct net_device *netdev = pci_get_drvdata(pdev); 2780 struct net_device *netdev = pci_get_drvdata(pdev);
@@ -2792,7 +2805,10 @@ static int e100_resume(struct pci_dev *pdev)
2792 2805
2793static void e100_shutdown(struct pci_dev *pdev) 2806static void e100_shutdown(struct pci_dev *pdev)
2794{ 2807{
2795 e100_suspend(pdev, PMSG_SUSPEND); 2808 bool wake;
2809 __e100_shutdown(pdev, &wake);
2810 if (system_state == SYSTEM_POWER_OFF)
2811 __e100_power_off(pdev, wake);
2796} 2812}
2797 2813
2798/* ------------------ PCI Error Recovery infrastructure -------------- */ 2814/* ------------------ PCI Error Recovery infrastructure -------------- */
diff --git a/drivers/net/forcedeth.c b/drivers/net/forcedeth.c
index 11d5db16ed9c..f9a846b1b92f 100644
--- a/drivers/net/forcedeth.c
+++ b/drivers/net/forcedeth.c
@@ -1880,6 +1880,7 @@ static void nv_init_tx(struct net_device *dev)
1880 np->tx_pkts_in_progress = 0; 1880 np->tx_pkts_in_progress = 0;
1881 np->tx_change_owner = NULL; 1881 np->tx_change_owner = NULL;
1882 np->tx_end_flip = NULL; 1882 np->tx_end_flip = NULL;
1883 np->tx_stop = 0;
1883 1884
1884 for (i = 0; i < np->tx_ring_size; i++) { 1885 for (i = 0; i < np->tx_ring_size; i++) {
1885 if (!nv_optimized(np)) { 1886 if (!nv_optimized(np)) {
@@ -2530,6 +2531,8 @@ static void nv_tx_timeout(struct net_device *dev)
2530 struct fe_priv *np = netdev_priv(dev); 2531 struct fe_priv *np = netdev_priv(dev);
2531 u8 __iomem *base = get_hwbase(dev); 2532 u8 __iomem *base = get_hwbase(dev);
2532 u32 status; 2533 u32 status;
2534 union ring_type put_tx;
2535 int saved_tx_limit;
2533 2536
2534 if (np->msi_flags & NV_MSI_X_ENABLED) 2537 if (np->msi_flags & NV_MSI_X_ENABLED)
2535 status = readl(base + NvRegMSIXIrqStatus) & NVREG_IRQSTAT_MASK; 2538 status = readl(base + NvRegMSIXIrqStatus) & NVREG_IRQSTAT_MASK;
@@ -2589,24 +2592,32 @@ static void nv_tx_timeout(struct net_device *dev)
2589 /* 1) stop tx engine */ 2592 /* 1) stop tx engine */
2590 nv_stop_tx(dev); 2593 nv_stop_tx(dev);
2591 2594
2592 /* 2) check that the packets were not sent already: */ 2595 /* 2) complete any outstanding tx and do not give HW any limited tx pkts */
2596 saved_tx_limit = np->tx_limit;
2597 np->tx_limit = 0; /* prevent giving HW any limited pkts */
2598 np->tx_stop = 0; /* prevent waking tx queue */
2593 if (!nv_optimized(np)) 2599 if (!nv_optimized(np))
2594 nv_tx_done(dev, np->tx_ring_size); 2600 nv_tx_done(dev, np->tx_ring_size);
2595 else 2601 else
2596 nv_tx_done_optimized(dev, np->tx_ring_size); 2602 nv_tx_done_optimized(dev, np->tx_ring_size);
2597 2603
2598 /* 3) if there are dead entries: clear everything */ 2604 /* save current HW position */
2599 if (np->get_tx_ctx != np->put_tx_ctx) { 2605 if (np->tx_change_owner)
2600 printk(KERN_DEBUG "%s: tx_timeout: dead entries!\n", dev->name); 2606 put_tx.ex = np->tx_change_owner->first_tx_desc;
2601 nv_drain_tx(dev); 2607 else
2602 nv_init_tx(dev); 2608 put_tx = np->put_tx;
2603 setup_hw_rings(dev, NV_SETUP_TX_RING);
2604 }
2605 2609
2606 netif_wake_queue(dev); 2610 /* 3) clear all tx state */
2611 nv_drain_tx(dev);
2612 nv_init_tx(dev);
2613
2614 /* 4) restore state to current HW position */
2615 np->get_tx = np->put_tx = put_tx;
2616 np->tx_limit = saved_tx_limit;
2607 2617
2608 /* 4) restart tx engine */ 2618 /* 5) restart tx engine */
2609 nv_start_tx(dev); 2619 nv_start_tx(dev);
2620 netif_wake_queue(dev);
2610 spin_unlock_irq(&np->lock); 2621 spin_unlock_irq(&np->lock);
2611} 2622}
2612 2623
diff --git a/drivers/net/ixgbe/ixgbe_common.c b/drivers/net/ixgbe/ixgbe_common.c
index 5567519676d5..186a65069b33 100644
--- a/drivers/net/ixgbe/ixgbe_common.c
+++ b/drivers/net/ixgbe/ixgbe_common.c
@@ -50,7 +50,6 @@ static u16 ixgbe_calc_eeprom_checksum(struct ixgbe_hw *hw);
50static void ixgbe_enable_rar(struct ixgbe_hw *hw, u32 index); 50static void ixgbe_enable_rar(struct ixgbe_hw *hw, u32 index);
51static void ixgbe_disable_rar(struct ixgbe_hw *hw, u32 index); 51static void ixgbe_disable_rar(struct ixgbe_hw *hw, u32 index);
52static s32 ixgbe_mta_vector(struct ixgbe_hw *hw, u8 *mc_addr); 52static s32 ixgbe_mta_vector(struct ixgbe_hw *hw, u8 *mc_addr);
53static void ixgbe_add_mc_addr(struct ixgbe_hw *hw, u8 *mc_addr);
54static void ixgbe_add_uc_addr(struct ixgbe_hw *hw, u8 *addr, u32 vmdq); 53static void ixgbe_add_uc_addr(struct ixgbe_hw *hw, u8 *addr, u32 vmdq);
55 54
56/** 55/**
@@ -1377,8 +1376,7 @@ s32 ixgbe_update_uc_addr_list_generic(struct ixgbe_hw *hw, u8 *addr_list,
1377 * Clear accounting of old secondary address list, 1376 * Clear accounting of old secondary address list,
1378 * don't count RAR[0] 1377 * don't count RAR[0]
1379 */ 1378 */
1380 uc_addr_in_use = hw->addr_ctrl.rar_used_count - 1379 uc_addr_in_use = hw->addr_ctrl.rar_used_count - 1;
1381 hw->addr_ctrl.mc_addr_in_rar_count - 1;
1382 hw->addr_ctrl.rar_used_count -= uc_addr_in_use; 1380 hw->addr_ctrl.rar_used_count -= uc_addr_in_use;
1383 hw->addr_ctrl.overflow_promisc = 0; 1381 hw->addr_ctrl.overflow_promisc = 0;
1384 1382
@@ -1493,40 +1491,6 @@ static void ixgbe_set_mta(struct ixgbe_hw *hw, u8 *mc_addr)
1493} 1491}
1494 1492
1495/** 1493/**
1496 * ixgbe_add_mc_addr - Adds a multicast address.
1497 * @hw: pointer to hardware structure
1498 * @mc_addr: new multicast address
1499 *
1500 * Adds it to unused receive address register or to the multicast table.
1501 **/
1502static void ixgbe_add_mc_addr(struct ixgbe_hw *hw, u8 *mc_addr)
1503{
1504 u32 rar_entries = hw->mac.num_rar_entries;
1505 u32 rar;
1506
1507 hw_dbg(hw, " MC Addr =%.2X %.2X %.2X %.2X %.2X %.2X\n",
1508 mc_addr[0], mc_addr[1], mc_addr[2],
1509 mc_addr[3], mc_addr[4], mc_addr[5]);
1510
1511 /*
1512 * Place this multicast address in the RAR if there is room,
1513 * else put it in the MTA
1514 */
1515 if (hw->addr_ctrl.rar_used_count < rar_entries) {
1516 /* use RAR from the end up for multicast */
1517 rar = rar_entries - hw->addr_ctrl.mc_addr_in_rar_count - 1;
1518 hw->mac.ops.set_rar(hw, rar, mc_addr, 0, IXGBE_RAH_AV);
1519 hw_dbg(hw, "Added a multicast address to RAR[%d]\n", rar);
1520 hw->addr_ctrl.rar_used_count++;
1521 hw->addr_ctrl.mc_addr_in_rar_count++;
1522 } else {
1523 ixgbe_set_mta(hw, mc_addr);
1524 }
1525
1526 hw_dbg(hw, "ixgbe_add_mc_addr Complete\n");
1527}
1528
1529/**
1530 * ixgbe_update_mc_addr_list_generic - Updates MAC list of multicast addresses 1494 * ixgbe_update_mc_addr_list_generic - Updates MAC list of multicast addresses
1531 * @hw: pointer to hardware structure 1495 * @hw: pointer to hardware structure
1532 * @mc_addr_list: the list of new multicast addresses 1496 * @mc_addr_list: the list of new multicast addresses
@@ -1542,7 +1506,6 @@ s32 ixgbe_update_mc_addr_list_generic(struct ixgbe_hw *hw, u8 *mc_addr_list,
1542 u32 mc_addr_count, ixgbe_mc_addr_itr next) 1506 u32 mc_addr_count, ixgbe_mc_addr_itr next)
1543{ 1507{
1544 u32 i; 1508 u32 i;
1545 u32 rar_entries = hw->mac.num_rar_entries;
1546 u32 vmdq; 1509 u32 vmdq;
1547 1510
1548 /* 1511 /*
@@ -1550,18 +1513,8 @@ s32 ixgbe_update_mc_addr_list_generic(struct ixgbe_hw *hw, u8 *mc_addr_list,
1550 * use. 1513 * use.
1551 */ 1514 */
1552 hw->addr_ctrl.num_mc_addrs = mc_addr_count; 1515 hw->addr_ctrl.num_mc_addrs = mc_addr_count;
1553 hw->addr_ctrl.rar_used_count -= hw->addr_ctrl.mc_addr_in_rar_count;
1554 hw->addr_ctrl.mc_addr_in_rar_count = 0;
1555 hw->addr_ctrl.mta_in_use = 0; 1516 hw->addr_ctrl.mta_in_use = 0;
1556 1517
1557 /* Zero out the other receive addresses. */
1558 hw_dbg(hw, "Clearing RAR[%d-%d]\n", hw->addr_ctrl.rar_used_count,
1559 rar_entries - 1);
1560 for (i = hw->addr_ctrl.rar_used_count; i < rar_entries; i++) {
1561 IXGBE_WRITE_REG(hw, IXGBE_RAL(i), 0);
1562 IXGBE_WRITE_REG(hw, IXGBE_RAH(i), 0);
1563 }
1564
1565 /* Clear the MTA */ 1518 /* Clear the MTA */
1566 hw_dbg(hw, " Clearing MTA\n"); 1519 hw_dbg(hw, " Clearing MTA\n");
1567 for (i = 0; i < hw->mac.mcft_size; i++) 1520 for (i = 0; i < hw->mac.mcft_size; i++)
@@ -1570,7 +1523,7 @@ s32 ixgbe_update_mc_addr_list_generic(struct ixgbe_hw *hw, u8 *mc_addr_list,
1570 /* Add the new addresses */ 1523 /* Add the new addresses */
1571 for (i = 0; i < mc_addr_count; i++) { 1524 for (i = 0; i < mc_addr_count; i++) {
1572 hw_dbg(hw, " Adding the multicast addresses:\n"); 1525 hw_dbg(hw, " Adding the multicast addresses:\n");
1573 ixgbe_add_mc_addr(hw, next(hw, &mc_addr_list, &vmdq)); 1526 ixgbe_set_mta(hw, next(hw, &mc_addr_list, &vmdq));
1574 } 1527 }
1575 1528
1576 /* Enable mta */ 1529 /* Enable mta */
diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index 01884256f4c9..07e778d3e5d2 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -3646,6 +3646,8 @@ static int ixgbe_resume(struct pci_dev *pdev)
3646 3646
3647 ixgbe_reset(adapter); 3647 ixgbe_reset(adapter);
3648 3648
3649 IXGBE_WRITE_REG(&adapter->hw, IXGBE_WUS, ~0);
3650
3649 if (netif_running(netdev)) { 3651 if (netif_running(netdev)) {
3650 err = ixgbe_open(adapter->netdev); 3652 err = ixgbe_open(adapter->netdev);
3651 if (err) 3653 if (err)
@@ -4575,7 +4577,6 @@ static int __devinit ixgbe_probe(struct pci_dev *pdev,
4575 const struct ixgbe_info *ii = ixgbe_info_tbl[ent->driver_data]; 4577 const struct ixgbe_info *ii = ixgbe_info_tbl[ent->driver_data];
4576 static int cards_found; 4578 static int cards_found;
4577 int i, err, pci_using_dac; 4579 int i, err, pci_using_dac;
4578 u16 pm_value = 0;
4579 u32 part_num, eec; 4580 u32 part_num, eec;
4580 4581
4581 err = pci_enable_device(pdev); 4582 err = pci_enable_device(pdev);
@@ -4763,11 +4764,8 @@ static int __devinit ixgbe_probe(struct pci_dev *pdev,
4763 4764
4764 switch (pdev->device) { 4765 switch (pdev->device) {
4765 case IXGBE_DEV_ID_82599_KX4: 4766 case IXGBE_DEV_ID_82599_KX4:
4766#define IXGBE_PCIE_PMCSR 0x44 4767 adapter->wol = (IXGBE_WUFC_MAG | IXGBE_WUFC_EX |
4767 adapter->wol = IXGBE_WUFC_MAG; 4768 IXGBE_WUFC_MC | IXGBE_WUFC_BC);
4768 pci_read_config_word(pdev, IXGBE_PCIE_PMCSR, &pm_value);
4769 pci_write_config_word(pdev, IXGBE_PCIE_PMCSR,
4770 (pm_value | (1 << 8)));
4771 break; 4769 break;
4772 default: 4770 default:
4773 adapter->wol = 0; 4771 adapter->wol = 0;
diff --git a/drivers/net/mlx4/en_netdev.c b/drivers/net/mlx4/en_netdev.c
index 438678ab2a10..7bcc49de1637 100644
--- a/drivers/net/mlx4/en_netdev.c
+++ b/drivers/net/mlx4/en_netdev.c
@@ -583,7 +583,7 @@ int mlx4_en_start_port(struct net_device *dev)
583 err = mlx4_en_activate_cq(priv, cq); 583 err = mlx4_en_activate_cq(priv, cq);
584 if (err) { 584 if (err) {
585 mlx4_err(mdev, "Failed activating Rx CQ\n"); 585 mlx4_err(mdev, "Failed activating Rx CQ\n");
586 goto rx_err; 586 goto cq_err;
587 } 587 }
588 for (j = 0; j < cq->size; j++) 588 for (j = 0; j < cq->size; j++)
589 cq->buf[j].owner_sr_opcode = MLX4_CQE_OWNER_MASK; 589 cq->buf[j].owner_sr_opcode = MLX4_CQE_OWNER_MASK;
diff --git a/drivers/net/mlx4/en_rx.c b/drivers/net/mlx4/en_rx.c
index 0cbb78ca7b29..7942c4d3cd88 100644
--- a/drivers/net/mlx4/en_rx.c
+++ b/drivers/net/mlx4/en_rx.c
@@ -610,6 +610,10 @@ static struct sk_buff *mlx4_en_rx_skb(struct mlx4_en_priv *priv,
610 used_frags = mlx4_en_complete_rx_desc(priv, rx_desc, skb_frags, 610 used_frags = mlx4_en_complete_rx_desc(priv, rx_desc, skb_frags,
611 skb_shinfo(skb)->frags, 611 skb_shinfo(skb)->frags,
612 page_alloc, length); 612 page_alloc, length);
613 if (unlikely(!used_frags)) {
614 kfree_skb(skb);
615 return NULL;
616 }
613 skb_shinfo(skb)->nr_frags = used_frags; 617 skb_shinfo(skb)->nr_frags = used_frags;
614 618
615 /* Copy headers into the skb linear buffer */ 619 /* Copy headers into the skb linear buffer */
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 015db1cece72..8e56fcf0a0e3 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -210,14 +210,11 @@ rx_drop:
210 210
211static struct net_device_stats *veth_get_stats(struct net_device *dev) 211static struct net_device_stats *veth_get_stats(struct net_device *dev)
212{ 212{
213 struct veth_priv *priv; 213 struct veth_priv *priv = netdev_priv(dev);
214 struct net_device_stats *dev_stats; 214 struct net_device_stats *dev_stats = &dev->stats;
215 int cpu; 215 unsigned int cpu;
216 struct veth_net_stats *stats; 216 struct veth_net_stats *stats;
217 217
218 priv = netdev_priv(dev);
219 dev_stats = &dev->stats;
220
221 dev_stats->rx_packets = 0; 218 dev_stats->rx_packets = 0;
222 dev_stats->tx_packets = 0; 219 dev_stats->tx_packets = 0;
223 dev_stats->rx_bytes = 0; 220 dev_stats->rx_bytes = 0;
@@ -225,16 +222,17 @@ static struct net_device_stats *veth_get_stats(struct net_device *dev)
225 dev_stats->tx_dropped = 0; 222 dev_stats->tx_dropped = 0;
226 dev_stats->rx_dropped = 0; 223 dev_stats->rx_dropped = 0;
227 224
228 for_each_online_cpu(cpu) { 225 if (priv->stats)
229 stats = per_cpu_ptr(priv->stats, cpu); 226 for_each_online_cpu(cpu) {
227 stats = per_cpu_ptr(priv->stats, cpu);
230 228
231 dev_stats->rx_packets += stats->rx_packets; 229 dev_stats->rx_packets += stats->rx_packets;
232 dev_stats->tx_packets += stats->tx_packets; 230 dev_stats->tx_packets += stats->tx_packets;
233 dev_stats->rx_bytes += stats->rx_bytes; 231 dev_stats->rx_bytes += stats->rx_bytes;
234 dev_stats->tx_bytes += stats->tx_bytes; 232 dev_stats->tx_bytes += stats->tx_bytes;
235 dev_stats->tx_dropped += stats->tx_dropped; 233 dev_stats->tx_dropped += stats->tx_dropped;
236 dev_stats->rx_dropped += stats->rx_dropped; 234 dev_stats->rx_dropped += stats->rx_dropped;
237 } 235 }
238 236
239 return dev_stats; 237 return dev_stats;
240} 238}
@@ -261,6 +259,8 @@ static int veth_close(struct net_device *dev)
261 netif_carrier_off(dev); 259 netif_carrier_off(dev);
262 netif_carrier_off(priv->peer); 260 netif_carrier_off(priv->peer);
263 261
262 free_percpu(priv->stats);
263 priv->stats = NULL;
264 return 0; 264 return 0;
265} 265}
266 266
@@ -291,15 +291,6 @@ static int veth_dev_init(struct net_device *dev)
291 return 0; 291 return 0;
292} 292}
293 293
294static void veth_dev_free(struct net_device *dev)
295{
296 struct veth_priv *priv;
297
298 priv = netdev_priv(dev);
299 free_percpu(priv->stats);
300 free_netdev(dev);
301}
302
303static const struct net_device_ops veth_netdev_ops = { 294static const struct net_device_ops veth_netdev_ops = {
304 .ndo_init = veth_dev_init, 295 .ndo_init = veth_dev_init,
305 .ndo_open = veth_open, 296 .ndo_open = veth_open,
@@ -317,7 +308,7 @@ static void veth_setup(struct net_device *dev)
317 dev->netdev_ops = &veth_netdev_ops; 308 dev->netdev_ops = &veth_netdev_ops;
318 dev->ethtool_ops = &veth_ethtool_ops; 309 dev->ethtool_ops = &veth_ethtool_ops;
319 dev->features |= NETIF_F_LLTX; 310 dev->features |= NETIF_F_LLTX;
320 dev->destructor = veth_dev_free; 311 dev->destructor = free_netdev;
321} 312}
322 313
323/* 314/*
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 93bc0f8174a7..2f0945d63297 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -667,7 +667,7 @@ ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
667 lower_buf = kmalloc(lower_bufsiz, GFP_KERNEL); 667 lower_buf = kmalloc(lower_bufsiz, GFP_KERNEL);
668 if (lower_buf == NULL) { 668 if (lower_buf == NULL) {
669 printk(KERN_ERR "%s: Out of memory whilst attempting to " 669 printk(KERN_ERR "%s: Out of memory whilst attempting to "
670 "kmalloc [%d] bytes\n", __func__, lower_bufsiz); 670 "kmalloc [%zd] bytes\n", __func__, lower_bufsiz);
671 rc = -ENOMEM; 671 rc = -ENOMEM;
672 goto out; 672 goto out;
673 } 673 }
@@ -690,7 +690,7 @@ ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
690 } 690 }
691 /* Check for bufsiz <= 0 done in sys_readlinkat() */ 691 /* Check for bufsiz <= 0 done in sys_readlinkat() */
692 rc = copy_to_user(buf, plaintext_name, 692 rc = copy_to_user(buf, plaintext_name,
693 min((unsigned) bufsiz, plaintext_name_size)); 693 min((size_t) bufsiz, plaintext_name_size));
694 if (rc) 694 if (rc)
695 rc = -EFAULT; 695 rc = -EFAULT;
696 else 696 else
diff --git a/fs/exec.c b/fs/exec.c
index a3a8ce83940f..fe75dcff023a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -33,6 +33,7 @@
33#include <linux/string.h> 33#include <linux/string.h>
34#include <linux/init.h> 34#include <linux/init.h>
35#include <linux/pagemap.h> 35#include <linux/pagemap.h>
36#include <linux/perf_counter.h>
36#include <linux/highmem.h> 37#include <linux/highmem.h>
37#include <linux/spinlock.h> 38#include <linux/spinlock.h>
38#include <linux/key.h> 39#include <linux/key.h>
@@ -950,6 +951,7 @@ void set_task_comm(struct task_struct *tsk, char *buf)
950 task_lock(tsk); 951 task_lock(tsk);
951 strlcpy(tsk->comm, buf, sizeof(tsk->comm)); 952 strlcpy(tsk->comm, buf, sizeof(tsk->comm));
952 task_unlock(tsk); 953 task_unlock(tsk);
954 perf_counter_comm(tsk);
953} 955}
954 956
955int flush_old_exec(struct linux_binprm * bprm) 957int flush_old_exec(struct linux_binprm * bprm)
@@ -1018,6 +1020,13 @@ int flush_old_exec(struct linux_binprm * bprm)
1018 1020
1019 current->personality &= ~bprm->per_clear; 1021 current->personality &= ~bprm->per_clear;
1020 1022
1023 /*
1024 * Flush performance counters when crossing a
1025 * security domain:
1026 */
1027 if (!get_dumpable(current->mm))
1028 perf_counter_exit_task(current);
1029
1021 /* An exec changes our domain. We are no longer part of the thread 1030 /* An exec changes our domain. We are no longer part of the thread
1022 group */ 1031 group */
1023 1032
diff --git a/include/linux/compat.h b/include/linux/compat.h
index f2ded21f9a3c..af931ee43dd8 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -222,6 +222,8 @@ int copy_siginfo_from_user32(siginfo_t *to, struct compat_siginfo __user *from);
222int copy_siginfo_to_user32(struct compat_siginfo __user *to, siginfo_t *from); 222int copy_siginfo_to_user32(struct compat_siginfo __user *to, siginfo_t *from);
223int get_compat_sigevent(struct sigevent *event, 223int get_compat_sigevent(struct sigevent *event,
224 const struct compat_sigevent __user *u_event); 224 const struct compat_sigevent __user *u_event);
225long compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig,
226 struct compat_siginfo __user *uinfo);
225 227
226static inline int compat_timeval_compare(struct compat_timeval *lhs, 228static inline int compat_timeval_compare(struct compat_timeval *lhs,
227 struct compat_timeval *rhs) 229 struct compat_timeval *rhs)
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index d87247d2641f..503afaa0afa7 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -108,6 +108,18 @@ extern struct group_info init_groups;
108 108
109extern struct cred init_cred; 109extern struct cred init_cred;
110 110
111#ifdef CONFIG_PERF_COUNTERS
112# define INIT_PERF_COUNTERS(tsk) \
113 .perf_counter_ctx.counter_list = \
114 LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list), \
115 .perf_counter_ctx.event_list = \
116 LIST_HEAD_INIT(tsk.perf_counter_ctx.event_list), \
117 .perf_counter_ctx.lock = \
118 __SPIN_LOCK_UNLOCKED(tsk.perf_counter_ctx.lock),
119#else
120# define INIT_PERF_COUNTERS(tsk)
121#endif
122
111/* 123/*
112 * INIT_TASK is used to set up the first task table, touch at 124 * INIT_TASK is used to set up the first task table, touch at
113 * your own risk!. Base=0, limit=0x1fffff (=2MB) 125 * your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -171,6 +183,7 @@ extern struct cred init_cred;
171 }, \ 183 }, \
172 .dirties = INIT_PROP_LOCAL_SINGLE(dirties), \ 184 .dirties = INIT_PROP_LOCAL_SINGLE(dirties), \
173 INIT_IDS \ 185 INIT_IDS \
186 INIT_PERF_COUNTERS(tsk) \
174 INIT_TRACE_IRQFLAGS \ 187 INIT_TRACE_IRQFLAGS \
175 INIT_LOCKDEP \ 188 INIT_LOCKDEP \
176 INIT_FTRACE_GRAPH \ 189 INIT_FTRACE_GRAPH \
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 0c8b89f28a95..a77c6007dc99 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -81,7 +81,12 @@ static inline unsigned int kstat_irqs(unsigned int irq)
81 return sum; 81 return sum;
82} 82}
83 83
84
85/*
86 * Lock/unlock the current runqueue - to extract task statistics:
87 */
84extern unsigned long long task_delta_exec(struct task_struct *); 88extern unsigned long long task_delta_exec(struct task_struct *);
89
85extern void account_user_time(struct task_struct *, cputime_t, cputime_t); 90extern void account_user_time(struct task_struct *, cputime_t, cputime_t);
86extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t); 91extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t);
87extern void account_steal_time(cputime_t); 92extern void account_steal_time(cputime_t);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2e7783f4a755..5a96a1a406e9 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -104,7 +104,7 @@ struct wireless_dev;
104# else 104# else
105# define LL_MAX_HEADER 96 105# define LL_MAX_HEADER 96
106# endif 106# endif
107#elif defined(CONFIG_TR) 107#elif defined(CONFIG_TR) || defined(CONFIG_TR_MODULE)
108# define LL_MAX_HEADER 48 108# define LL_MAX_HEADER 48
109#else 109#else
110# define LL_MAX_HEADER 32 110# define LL_MAX_HEADER 32
@@ -500,7 +500,7 @@ struct netdev_queue {
500 * 500 *
501 * int (*ndo_set_mac_address)(struct net_device *dev, void *addr); 501 * int (*ndo_set_mac_address)(struct net_device *dev, void *addr);
502 * This function is called when the Media Access Control address 502 * This function is called when the Media Access Control address
503 * needs to be changed. If not this interface is not defined, the 503 * needs to be changed. If this interface is not defined, the
504 * mac address can not be changed. 504 * mac address can not be changed.
505 * 505 *
506 * int (*ndo_validate_addr)(struct net_device *dev); 506 * int (*ndo_validate_addr)(struct net_device *dev);
diff --git a/include/linux/netfilter/nfnetlink_conntrack.h b/include/linux/netfilter/nfnetlink_conntrack.h
index 29fe9ea1d346..1a865e48b8eb 100644
--- a/include/linux/netfilter/nfnetlink_conntrack.h
+++ b/include/linux/netfilter/nfnetlink_conntrack.h
@@ -100,6 +100,7 @@ enum ctattr_protoinfo_tcp {
100enum ctattr_protoinfo_dccp { 100enum ctattr_protoinfo_dccp {
101 CTA_PROTOINFO_DCCP_UNSPEC, 101 CTA_PROTOINFO_DCCP_UNSPEC,
102 CTA_PROTOINFO_DCCP_STATE, 102 CTA_PROTOINFO_DCCP_STATE,
103 CTA_PROTOINFO_DCCP_ROLE,
103 __CTA_PROTOINFO_DCCP_MAX, 104 __CTA_PROTOINFO_DCCP_MAX,
104}; 105};
105#define CTA_PROTOINFO_DCCP_MAX (__CTA_PROTOINFO_DCCP_MAX - 1) 106#define CTA_PROTOINFO_DCCP_MAX (__CTA_PROTOINFO_DCCP_MAX - 1)
diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 7b1a652066c0..1b2e43502ef7 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -354,9 +354,6 @@ struct xt_table
354 /* What hooks you will enter on */ 354 /* What hooks you will enter on */
355 unsigned int valid_hooks; 355 unsigned int valid_hooks;
356 356
357 /* Lock for the curtain */
358 struct mutex lock;
359
360 /* Man behind the curtain... */ 357 /* Man behind the curtain... */
361 struct xt_table_info *private; 358 struct xt_table_info *private;
362 359
@@ -434,8 +431,74 @@ extern void xt_proto_fini(struct net *net, u_int8_t af);
434 431
435extern struct xt_table_info *xt_alloc_table_info(unsigned int size); 432extern struct xt_table_info *xt_alloc_table_info(unsigned int size);
436extern void xt_free_table_info(struct xt_table_info *info); 433extern void xt_free_table_info(struct xt_table_info *info);
437extern void xt_table_entry_swap_rcu(struct xt_table_info *old, 434
438 struct xt_table_info *new); 435/*
436 * Per-CPU spinlock associated with per-cpu table entries, and
437 * with a counter for the "reading" side that allows a recursive
438 * reader to avoid taking the lock and deadlocking.
439 *
440 * "reading" is used by ip/arp/ip6 tables rule processing which runs per-cpu.
441 * It needs to ensure that the rules are not being changed while the packet
442 * is being processed. In some cases, the read lock will be acquired
443 * twice on the same CPU; this is okay because of the count.
444 *
445 * "writing" is used when reading counters.
446 * During replace any readers that are using the old tables have to complete
447 * before freeing the old table. This is handled by the write locking
448 * necessary for reading the counters.
449 */
450struct xt_info_lock {
451 spinlock_t lock;
452 unsigned char readers;
453};
454DECLARE_PER_CPU(struct xt_info_lock, xt_info_locks);
455
456/*
457 * Note: we need to ensure that preemption is disabled before acquiring
458 * the per-cpu-variable, so we do it as a two step process rather than
459 * using "spin_lock_bh()".
460 *
461 * We _also_ need to disable bottom half processing before updating our
462 * nesting count, to make sure that the only kind of re-entrancy is this
463 * code being called by itself: since the count+lock is not an atomic
464 * operation, we can allow no races.
465 *
466 * _Only_ that special combination of being per-cpu and never getting
467 * re-entered asynchronously means that the count is safe.
468 */
469static inline void xt_info_rdlock_bh(void)
470{
471 struct xt_info_lock *lock;
472
473 local_bh_disable();
474 lock = &__get_cpu_var(xt_info_locks);
475 if (!lock->readers++)
476 spin_lock(&lock->lock);
477}
478
479static inline void xt_info_rdunlock_bh(void)
480{
481 struct xt_info_lock *lock = &__get_cpu_var(xt_info_locks);
482
483 if (!--lock->readers)
484 spin_unlock(&lock->lock);
485 local_bh_enable();
486}
487
488/*
489 * The "writer" side needs to get exclusive access to the lock,
490 * regardless of readers. This must be called with bottom half
491 * processing (and thus also preemption) disabled.
492 */
493static inline void xt_info_wrlock(unsigned int cpu)
494{
495 spin_lock(&per_cpu(xt_info_locks, cpu).lock);
496}
497
498static inline void xt_info_wrunlock(unsigned int cpu)
499{
500 spin_unlock(&per_cpu(xt_info_locks, cpu).lock);
501}
439 502
440/* 503/*
441 * This helper is performance critical and must be inlined 504 * This helper is performance critical and must be inlined
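
The header only defines the primitives; a sketch of the intended call patterns follows (table_lookup() and add_counters_from_cpu() are placeholder names, not part of the patch):

	/* per-packet fast path: runs on the local CPU, may nest once (e.g. REJECT) */
	static unsigned int sketch_do_table(struct sk_buff *skb, struct xt_table *table)
	{
		unsigned int verdict;

		xt_info_rdlock_bh();
		verdict = table_lookup(skb, table->private);	/* hypothetical */
		xt_info_rdunlock_bh();

		return verdict;
	}

	/* counter readout: exclude the readers one CPU at a time, BHs off */
	static void sketch_get_counters(struct xt_table_info *info,
					struct xt_counters *counters)
	{
		unsigned int cpu;

		local_bh_disable();
		for_each_possible_cpu(cpu) {
			xt_info_wrlock(cpu);
			add_counters_from_cpu(counters, info, cpu);	/* hypothetical */
			xt_info_wrunlock(cpu);
		}
		local_bh_enable();
	}
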
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
new file mode 100644
index 000000000000..00081d84169f
--- /dev/null
+++ b/include/linux/perf_counter.h
@@ -0,0 +1,617 @@
1/*
2 * Performance counters:
3 *
4 * Copyright(C) 2008, Thomas Gleixner <tglx@linutronix.de>
5 * Copyright(C) 2008, Red Hat, Inc., Ingo Molnar
6 *
7 * Data type definitions, declarations, prototypes.
8 *
9 * Started by: Thomas Gleixner and Ingo Molnar
10 *
11 * For licencing details see kernel-base/COPYING
12 */
13#ifndef _LINUX_PERF_COUNTER_H
14#define _LINUX_PERF_COUNTER_H
15
16#include <linux/types.h>
17#include <linux/ioctl.h>
18#include <asm/byteorder.h>
19
20/*
21 * User-space ABI bits:
22 */
23
24/*
25 * hw_event.type
26 */
27enum perf_event_types {
28 PERF_TYPE_HARDWARE = 0,
29 PERF_TYPE_SOFTWARE = 1,
30 PERF_TYPE_TRACEPOINT = 2,
31
32 /*
33 * available TYPE space, raw is the max value.
34 */
35
36 PERF_TYPE_RAW = 128,
37};
38
39/*
40 * Generalized performance counter event types, used by the hw_event.event_id
41 * parameter of the sys_perf_counter_open() syscall:
42 */
43enum hw_event_ids {
44 /*
45 * Common hardware events, generalized by the kernel:
46 */
47 PERF_COUNT_CPU_CYCLES = 0,
48 PERF_COUNT_INSTRUCTIONS = 1,
49 PERF_COUNT_CACHE_REFERENCES = 2,
50 PERF_COUNT_CACHE_MISSES = 3,
51 PERF_COUNT_BRANCH_INSTRUCTIONS = 4,
52 PERF_COUNT_BRANCH_MISSES = 5,
53 PERF_COUNT_BUS_CYCLES = 6,
54
55 PERF_HW_EVENTS_MAX = 7,
56};
57
58/*
59 * Special "software" counters provided by the kernel, even if the hardware
60 * does not support performance counters. These counters measure various
61 * physical and sw events of the kernel (and allow the profiling of them as
62 * well):
63 */
64enum sw_event_ids {
65 PERF_COUNT_CPU_CLOCK = 0,
66 PERF_COUNT_TASK_CLOCK = 1,
67 PERF_COUNT_PAGE_FAULTS = 2,
68 PERF_COUNT_CONTEXT_SWITCHES = 3,
69 PERF_COUNT_CPU_MIGRATIONS = 4,
70 PERF_COUNT_PAGE_FAULTS_MIN = 5,
71 PERF_COUNT_PAGE_FAULTS_MAJ = 6,
72
73 PERF_SW_EVENTS_MAX = 7,
74};
75
76#define __PERF_COUNTER_MASK(name) \
77 (((1ULL << PERF_COUNTER_##name##_BITS) - 1) << \
78 PERF_COUNTER_##name##_SHIFT)
79
80#define PERF_COUNTER_RAW_BITS 1
81#define PERF_COUNTER_RAW_SHIFT 63
82#define PERF_COUNTER_RAW_MASK __PERF_COUNTER_MASK(RAW)
83
84#define PERF_COUNTER_CONFIG_BITS 63
85#define PERF_COUNTER_CONFIG_SHIFT 0
86#define PERF_COUNTER_CONFIG_MASK __PERF_COUNTER_MASK(CONFIG)
87
88#define PERF_COUNTER_TYPE_BITS 7
89#define PERF_COUNTER_TYPE_SHIFT 56
90#define PERF_COUNTER_TYPE_MASK __PERF_COUNTER_MASK(TYPE)
91
92#define PERF_COUNTER_EVENT_BITS 56
93#define PERF_COUNTER_EVENT_SHIFT 0
94#define PERF_COUNTER_EVENT_MASK __PERF_COUNTER_MASK(EVENT)
95
96/*
97 * Bits that can be set in hw_event.record_type to request information
98 * in the overflow packets.
99 */
100enum perf_counter_record_format {
101 PERF_RECORD_IP = 1U << 0,
102 PERF_RECORD_TID = 1U << 1,
103 PERF_RECORD_TIME = 1U << 2,
104 PERF_RECORD_ADDR = 1U << 3,
105 PERF_RECORD_GROUP = 1U << 4,
106 PERF_RECORD_CALLCHAIN = 1U << 5,
107};
108
109/*
110 * Bits that can be set in hw_event.read_format to request that
111 * reads on the counter should return the indicated quantities,
112 * in increasing order of bit value, after the counter value.
113 */
114enum perf_counter_read_format {
115 PERF_FORMAT_TOTAL_TIME_ENABLED = 1,
116 PERF_FORMAT_TOTAL_TIME_RUNNING = 2,
117};
118
119/*
120 * Hardware event to monitor via a performance monitoring counter:
121 */
122struct perf_counter_hw_event {
123 /*
124 * The MSB of the config word signifies if the rest contains cpu
125 * specific (raw) counter configuration data, if unset, the next
126 * 7 bits are an event type and the rest of the bits are the event
127 * identifier.
128 */
129 __u64 config;
130
131 __u64 irq_period;
132 __u32 record_type;
133 __u32 read_format;
134
135 __u64 disabled : 1, /* off by default */
136 nmi : 1, /* NMI sampling */
137 inherit : 1, /* children inherit it */
138 pinned : 1, /* must always be on PMU */
139 exclusive : 1, /* only group on PMU */
140 exclude_user : 1, /* don't count user */
141 exclude_kernel : 1, /* ditto kernel */
142 exclude_hv : 1, /* ditto hypervisor */
143 exclude_idle : 1, /* don't count when idle */
144 mmap : 1, /* include mmap data */
145 munmap : 1, /* include munmap data */
146 comm : 1, /* include comm data */
147
148 __reserved_1 : 52;
149
150 __u32 extra_config_len;
151 __u32 wakeup_events; /* wakeup every n events */
152
153 __u64 __reserved_2;
154 __u64 __reserved_3;
155};
156
157/*
158 * Ioctls that can be done on a perf counter fd:
159 */
160#define PERF_COUNTER_IOC_ENABLE _IO ('$', 0)
161#define PERF_COUNTER_IOC_DISABLE _IO ('$', 1)
162#define PERF_COUNTER_IOC_REFRESH _IOW('$', 2, u32)
163#define PERF_COUNTER_IOC_RESET _IO ('$', 3)
164
165/*
166 * Structure of the page that can be mapped via mmap
167 */
168struct perf_counter_mmap_page {
169 __u32 version; /* version number of this structure */
170 __u32 compat_version; /* lowest version this is compat with */
171
172 /*
173 * Bits needed to read the hw counters in user-space.
174 *
175 * u32 seq;
176 * s64 count;
177 *
178 * do {
179 * seq = pc->lock;
180 *
181 * barrier()
182 * if (pc->index) {
183 * count = pmc_read(pc->index - 1);
184 * count += pc->offset;
185 * } else
186 * goto regular_read;
187 *
188 * barrier();
189 * } while (pc->lock != seq);
190 *
191 * NOTE: for obvious reasons this only works on self-monitoring
192 * processes.
193 */
194 __u32 lock; /* seqlock for synchronization */
195 __u32 index; /* hardware counter identifier */
196 __s64 offset; /* add to hardware counter value */
197
198 /*
199 * Control data for the mmap() data buffer.
200 *
201 * User-space reading this value should issue an rmb(), on SMP capable
202 * platforms, after reading this value -- see perf_counter_wakeup().
203 */
204 __u32 data_head; /* head in the data section */
205};
206
207#define PERF_EVENT_MISC_KERNEL (1 << 0)
208#define PERF_EVENT_MISC_USER (1 << 1)
209#define PERF_EVENT_MISC_OVERFLOW (1 << 2)
210
211struct perf_event_header {
212 __u32 type;
213 __u16 misc;
214 __u16 size;
215};
216
217enum perf_event_type {
218
219 /*
220 * The MMAP events record the PROT_EXEC mappings so that we can
221 * correlate userspace IPs to code. They have the following structure:
222 *
223 * struct {
224 * struct perf_event_header header;
225 *
226 * u32 pid, tid;
227 * u64 addr;
228 * u64 len;
229 * u64 pgoff;
230 * char filename[];
231 * };
232 */
233 PERF_EVENT_MMAP = 1,
234 PERF_EVENT_MUNMAP = 2,
235
236 /*
237 * struct {
238 * struct perf_event_header header;
239 *
240 * u32 pid, tid;
241 * char comm[];
242 * };
243 */
244 PERF_EVENT_COMM = 3,
245
246 /*
247 * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field
248 * will be PERF_RECORD_*
249 *
250 * struct {
251 * struct perf_event_header header;
252 *
253 * { u64 ip; } && PERF_RECORD_IP
254 * { u32 pid, tid; } && PERF_RECORD_TID
255 * { u64 time; } && PERF_RECORD_TIME
256 * { u64 addr; } && PERF_RECORD_ADDR
257 *
258 * { u64 nr;
259 * { u64 event, val; } cnt[nr]; } && PERF_RECORD_GROUP
260 *
261 * { u16 nr,
262 * hv,
263 * kernel,
264 * user;
265 * u64 ips[nr]; } && PERF_RECORD_CALLCHAIN
266 * };
267 */
268};
269
270#ifdef __KERNEL__
271/*
272 * Kernel-internal data types and definitions:
273 */
274
275#ifdef CONFIG_PERF_COUNTERS
276# include <asm/perf_counter.h>
277#endif
278
279#include <linux/list.h>
280#include <linux/mutex.h>
281#include <linux/rculist.h>
282#include <linux/rcupdate.h>
283#include <linux/spinlock.h>
284#include <linux/hrtimer.h>
285#include <linux/fs.h>
286#include <asm/atomic.h>
287
288struct task_struct;
289
290static inline u64 perf_event_raw(struct perf_counter_hw_event *hw_event)
291{
292 return hw_event->config & PERF_COUNTER_RAW_MASK;
293}
294
295static inline u64 perf_event_config(struct perf_counter_hw_event *hw_event)
296{
297 return hw_event->config & PERF_COUNTER_CONFIG_MASK;
298}
299
300static inline u64 perf_event_type(struct perf_counter_hw_event *hw_event)
301{
302 return (hw_event->config & PERF_COUNTER_TYPE_MASK) >>
303 PERF_COUNTER_TYPE_SHIFT;
304}
305
306static inline u64 perf_event_id(struct perf_counter_hw_event *hw_event)
307{
308 return hw_event->config & PERF_COUNTER_EVENT_MASK;
309}
310
311/**
312 * struct hw_perf_counter - performance counter hardware details:
313 */
314struct hw_perf_counter {
315#ifdef CONFIG_PERF_COUNTERS
316 union {
317 struct { /* hardware */
318 u64 config;
319 unsigned long config_base;
320 unsigned long counter_base;
321 int nmi;
322 int idx;
323 };
324 union { /* software */
325 atomic64_t count;
326 struct hrtimer hrtimer;
327 };
328 };
329 atomic64_t prev_count;
330 u64 irq_period;
331 atomic64_t period_left;
332#endif
333};
334
335struct perf_counter;
336
337/**
338 * struct pmu - generic performance monitoring unit
339 */
340struct pmu {
341 int (*enable) (struct perf_counter *counter);
342 void (*disable) (struct perf_counter *counter);
343 void (*read) (struct perf_counter *counter);
344};
345
346/**
347 * enum perf_counter_active_state - the states of a counter
348 */
349enum perf_counter_active_state {
350 PERF_COUNTER_STATE_ERROR = -2,
351 PERF_COUNTER_STATE_OFF = -1,
352 PERF_COUNTER_STATE_INACTIVE = 0,
353 PERF_COUNTER_STATE_ACTIVE = 1,
354};
355
356struct file;
357
358struct perf_mmap_data {
359 struct rcu_head rcu_head;
360 int nr_pages; /* nr of data pages */
361 int nr_locked; /* nr pages mlocked */
362
363 atomic_t poll; /* POLL_ for wakeups */
364 atomic_t head; /* write position */
365 atomic_t events; /* event limit */
366
367 atomic_t done_head; /* completed head */
368 atomic_t lock; /* concurrent writes */
369
370 atomic_t wakeup; /* needs a wakeup */
371
372 struct perf_counter_mmap_page *user_page;
373 void *data_pages[0];
374};
375
376struct perf_pending_entry {
377 struct perf_pending_entry *next;
378 void (*func)(struct perf_pending_entry *);
379};
380
381/**
382 * struct perf_counter - performance counter kernel representation:
383 */
384struct perf_counter {
385#ifdef CONFIG_PERF_COUNTERS
386 struct list_head list_entry;
387 struct list_head event_entry;
388 struct list_head sibling_list;
389 int nr_siblings;
390 struct perf_counter *group_leader;
391 const struct pmu *pmu;
392
393 enum perf_counter_active_state state;
394 enum perf_counter_active_state prev_state;
395 atomic64_t count;
396
397 /*
398 * These are the total time in nanoseconds that the counter
399 * has been enabled (i.e. eligible to run, and the task has
400 * been scheduled in, if this is a per-task counter)
401 * and running (scheduled onto the CPU), respectively.
402 *
403 * They are computed from tstamp_enabled, tstamp_running and
404 * tstamp_stopped when the counter is in INACTIVE or ACTIVE state.
405 */
406 u64 total_time_enabled;
407 u64 total_time_running;
408
409 /*
410 * These are timestamps used for computing total_time_enabled
411 * and total_time_running when the counter is in INACTIVE or
412 * ACTIVE state, measured in nanoseconds from an arbitrary point
413 * in time.
414 * tstamp_enabled: the notional time when the counter was enabled
415 * tstamp_running: the notional time when the counter was scheduled on
416 * tstamp_stopped: in INACTIVE state, the notional time when the
417 * counter was scheduled off.
418 */
419 u64 tstamp_enabled;
420 u64 tstamp_running;
421 u64 tstamp_stopped;
422
423 struct perf_counter_hw_event hw_event;
424 struct hw_perf_counter hw;
425
426 struct perf_counter_context *ctx;
427 struct task_struct *task;
428 struct file *filp;
429
430 struct perf_counter *parent;
431 struct list_head child_list;
432
433 /*
434 * These accumulate total time (in nanoseconds) that children
435 * counters have been enabled and running, respectively.
436 */
437 atomic64_t child_total_time_enabled;
438 atomic64_t child_total_time_running;
439
440 /*
441 * Protect attach/detach and child_list:
442 */
443 struct mutex mutex;
444
445 int oncpu;
446 int cpu;
447
448 /* mmap bits */
449 struct mutex mmap_mutex;
450 atomic_t mmap_count;
451 struct perf_mmap_data *data;
452
453 /* poll related */
454 wait_queue_head_t waitq;
455 struct fasync_struct *fasync;
456
457 /* delayed work for NMIs and such */
458 int pending_wakeup;
459 int pending_kill;
460 int pending_disable;
461 struct perf_pending_entry pending;
462
463 atomic_t event_limit;
464
465 void (*destroy)(struct perf_counter *);
466 struct rcu_head rcu_head;
467#endif
468};
469
470/**
471 * struct perf_counter_context - counter context structure
472 *
473 * Used as a container for task counters and CPU counters as well:
474 */
475struct perf_counter_context {
476#ifdef CONFIG_PERF_COUNTERS
477 /*
478 * Protect the states of the counters in the list,
479 * nr_active, and the list:
480 */
481 spinlock_t lock;
482 /*
483 * Protect the list of counters. Locking either mutex or lock
484 * is sufficient to ensure the list doesn't change; to change
485 * the list you need to lock both the mutex and the spinlock.
486 */
487 struct mutex mutex;
488
489 struct list_head counter_list;
490 struct list_head event_list;
491 int nr_counters;
492 int nr_active;
493 int is_active;
494 struct task_struct *task;
495
496 /*
497 * Context clock, runs when context enabled.
498 */
499 u64 time;
500 u64 timestamp;
501#endif
502};
503
504/**
505 * struct perf_counter_cpu_context - per cpu counter context structure
506 */
507struct perf_cpu_context {
508 struct perf_counter_context ctx;
509 struct perf_counter_context *task_ctx;
510 int active_oncpu;
511 int max_pertask;
512 int exclusive;
513
514 /*
515 * Recursion avoidance:
516 *
517 * task, softirq, irq, nmi context
518 */
519 int recursion[4];
520};
521
522#ifdef CONFIG_PERF_COUNTERS
523
524/*
525 * Set by architecture code:
526 */
527extern int perf_max_counters;
528
529extern const struct pmu *hw_perf_counter_init(struct perf_counter *counter);
530
531extern void perf_counter_task_sched_in(struct task_struct *task, int cpu);
532extern void perf_counter_task_sched_out(struct task_struct *task, int cpu);
533extern void perf_counter_task_tick(struct task_struct *task, int cpu);
534extern void perf_counter_init_task(struct task_struct *child);
535extern void perf_counter_exit_task(struct task_struct *child);
536extern void perf_counter_do_pending(void);
537extern void perf_counter_print_debug(void);
538extern void perf_counter_unthrottle(void);
539extern u64 hw_perf_save_disable(void);
540extern void hw_perf_restore(u64 ctrl);
541extern int perf_counter_task_disable(void);
542extern int perf_counter_task_enable(void);
543extern int hw_perf_group_sched_in(struct perf_counter *group_leader,
544 struct perf_cpu_context *cpuctx,
545 struct perf_counter_context *ctx, int cpu);
546extern void perf_counter_update_userpage(struct perf_counter *counter);
547
548extern int perf_counter_overflow(struct perf_counter *counter,
549 int nmi, struct pt_regs *regs, u64 addr);
550/*
551 * Return 1 for a software counter, 0 for a hardware counter
552 */
553static inline int is_software_counter(struct perf_counter *counter)
554{
555 return !perf_event_raw(&counter->hw_event) &&
556 perf_event_type(&counter->hw_event) != PERF_TYPE_HARDWARE;
557}
558
559extern void perf_swcounter_event(u32, u64, int, struct pt_regs *, u64);
560
561extern void perf_counter_mmap(unsigned long addr, unsigned long len,
562 unsigned long pgoff, struct file *file);
563
564extern void perf_counter_munmap(unsigned long addr, unsigned long len,
565 unsigned long pgoff, struct file *file);
566
567extern void perf_counter_comm(struct task_struct *tsk);
568
569#define MAX_STACK_DEPTH 255
570
571struct perf_callchain_entry {
572 u16 nr, hv, kernel, user;
573 u64 ip[MAX_STACK_DEPTH];
574};
575
576extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
577
578extern int sysctl_perf_counter_priv;
579extern int sysctl_perf_counter_mlock;
580
581extern void perf_counter_init(void);
582
583#else
584static inline void
585perf_counter_task_sched_in(struct task_struct *task, int cpu) { }
586static inline void
587perf_counter_task_sched_out(struct task_struct *task, int cpu) { }
588static inline void
589perf_counter_task_tick(struct task_struct *task, int cpu) { }
590static inline void perf_counter_init_task(struct task_struct *child) { }
591static inline void perf_counter_exit_task(struct task_struct *child) { }
592static inline void perf_counter_do_pending(void) { }
593static inline void perf_counter_print_debug(void) { }
594static inline void perf_counter_unthrottle(void) { }
595static inline void hw_perf_restore(u64 ctrl) { }
596static inline u64 hw_perf_save_disable(void) { return 0; }
597static inline int perf_counter_task_disable(void) { return -EINVAL; }
598static inline int perf_counter_task_enable(void) { return -EINVAL; }
599
600static inline void
601perf_swcounter_event(u32 event, u64 nr, int nmi,
602 struct pt_regs *regs, u64 addr) { }
603
604static inline void
605perf_counter_mmap(unsigned long addr, unsigned long len,
606 unsigned long pgoff, struct file *file) { }
607
608static inline void
609perf_counter_munmap(unsigned long addr, unsigned long len,
610 unsigned long pgoff, struct file *file) { }
611
612static inline void perf_counter_comm(struct task_struct *tsk) { }
613static inline void perf_counter_init(void) { }
614#endif
615
616#endif /* __KERNEL__ */
617#endif /* _LINUX_PERF_COUNTER_H */
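
The read sequence documented in struct perf_counter_mmap_page translates into roughly the following user-space helper. This is a sketch, not part of the patch: pmc_read() stands in for an architecture-specific rdpmc-style instruction, slow_read() for a fallback read() on the counter fd, and as the header notes it only works for self-monitoring processes.

	#include <stdint.h>
	#include <linux/perf_counter.h>		/* the header added above */

	#define barrier()	__asm__ __volatile__("" ::: "memory")

	uint64_t pmc_read(unsigned int idx);	/* hypothetical: rdpmc-style wrapper  */
	uint64_t slow_read(void);		/* hypothetical: read() on counter fd */

	uint64_t read_self_counter(volatile struct perf_counter_mmap_page *pc)
	{
		uint32_t seq;
		uint64_t count;

		do {
			seq = pc->lock;		/* snapshot the seqlock generation   */
			barrier();
			if (pc->index)		/* counter is live in a hardware PMC */
				count = pmc_read(pc->index - 1) + pc->offset;
			else
				count = slow_read();
			barrier();
		} while (pc->lock != seq);	/* kernel moved the counter: retry   */

		return count;
	}
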
diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index 48d887e3c6e7..b00df4c79c63 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -85,4 +85,7 @@
85#define PR_SET_TIMERSLACK 29 85#define PR_SET_TIMERSLACK 29
86#define PR_GET_TIMERSLACK 30 86#define PR_GET_TIMERSLACK 30
87 87
88#define PR_TASK_PERF_COUNTERS_DISABLE 31
89#define PR_TASK_PERF_COUNTERS_ENABLE 32
90
88#endif /* _LINUX_PRCTL_H */ 91#endif /* _LINUX_PRCTL_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b4c38bc8049c..d1857580a132 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -71,6 +71,7 @@ struct sched_param {
71#include <linux/path.h> 71#include <linux/path.h>
72#include <linux/compiler.h> 72#include <linux/compiler.h>
73#include <linux/completion.h> 73#include <linux/completion.h>
74#include <linux/perf_counter.h>
74#include <linux/pid.h> 75#include <linux/pid.h>
75#include <linux/percpu.h> 76#include <linux/percpu.h>
76#include <linux/topology.h> 77#include <linux/topology.h>
@@ -137,6 +138,7 @@ extern unsigned long nr_running(void);
137extern unsigned long nr_uninterruptible(void); 138extern unsigned long nr_uninterruptible(void);
138extern unsigned long nr_active(void); 139extern unsigned long nr_active(void);
139extern unsigned long nr_iowait(void); 140extern unsigned long nr_iowait(void);
141extern u64 cpu_nr_migrations(int cpu);
140 142
141extern unsigned long get_parent_ip(unsigned long addr); 143extern unsigned long get_parent_ip(unsigned long addr);
142 144
@@ -1052,9 +1054,10 @@ struct sched_entity {
1052 u64 last_wakeup; 1054 u64 last_wakeup;
1053 u64 avg_overlap; 1055 u64 avg_overlap;
1054 1056
1057 u64 nr_migrations;
1058
1055 u64 start_runtime; 1059 u64 start_runtime;
1056 u64 avg_wakeup; 1060 u64 avg_wakeup;
1057 u64 nr_migrations;
1058 1061
1059#ifdef CONFIG_SCHEDSTATS 1062#ifdef CONFIG_SCHEDSTATS
1060 u64 wait_start; 1063 u64 wait_start;
@@ -1380,6 +1383,7 @@ struct task_struct {
1380 struct list_head pi_state_list; 1383 struct list_head pi_state_list;
1381 struct futex_pi_state *pi_state_cache; 1384 struct futex_pi_state *pi_state_cache;
1382#endif 1385#endif
1386 struct perf_counter_context perf_counter_ctx;
1383#ifdef CONFIG_NUMA 1387#ifdef CONFIG_NUMA
1384 struct mempolicy *mempolicy; 1388 struct mempolicy *mempolicy;
1385 short il_next; 1389 short il_next;
@@ -2388,6 +2392,13 @@ static inline void inc_syscw(struct task_struct *tsk)
2388#define TASK_SIZE_OF(tsk) TASK_SIZE 2392#define TASK_SIZE_OF(tsk) TASK_SIZE
2389#endif 2393#endif
2390 2394
2395/*
2396 * Call the function if the target task is executing on a CPU right now:
2397 */
2398extern void task_oncpu_function_call(struct task_struct *p,
2399 void (*func) (void *info), void *info);
2400
2401
2391#ifdef CONFIG_MM_OWNER 2402#ifdef CONFIG_MM_OWNER
2392extern void mm_update_next_owner(struct mm_struct *mm); 2403extern void mm_update_next_owner(struct mm_struct *mm);
2393extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p); 2404extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p);
diff --git a/include/linux/signal.h b/include/linux/signal.h
index 84f997f8aa53..c7552836bd95 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -235,6 +235,8 @@ static inline int valid_signal(unsigned long sig)
235extern int next_signal(struct sigpending *pending, sigset_t *mask); 235extern int next_signal(struct sigpending *pending, sigset_t *mask);
236extern int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p); 236extern int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p);
237extern int __group_send_sig_info(int, struct siginfo *, struct task_struct *); 237extern int __group_send_sig_info(int, struct siginfo *, struct task_struct *);
238extern long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig,
239 siginfo_t *info);
238extern long do_sigpending(void __user *, unsigned long); 240extern long do_sigpending(void __user *, unsigned long);
239extern int sigprocmask(int, sigset_t *, sigset_t *); 241extern int sigprocmask(int, sigset_t *, sigset_t *);
240extern int show_unhandled_signals; 242extern int show_unhandled_signals;
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 40617c1d8976..677d159fe5f4 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -55,6 +55,7 @@ struct compat_timeval;
55struct robust_list_head; 55struct robust_list_head;
56struct getcpu_cache; 56struct getcpu_cache;
57struct old_linux_dirent; 57struct old_linux_dirent;
58struct perf_counter_hw_event;
58 59
59#include <linux/types.h> 60#include <linux/types.h>
60#include <linux/aio_abi.h> 61#include <linux/aio_abi.h>
@@ -754,4 +755,8 @@ asmlinkage long sys_pipe(int __user *);
754 755
755int kernel_execve(const char *filename, char *const argv[], char *const envp[]); 756int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
756 757
758
759asmlinkage long sys_perf_counter_open(
760 const struct perf_counter_hw_event __user *hw_event_uptr,
761 pid_t pid, int cpu, int group_fd, unsigned long flags);
757#endif 762#endif
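For illustration only: user space reaches this entry point via syscall(2). The syscall number is assigned per architecture elsewhere in this patch and the event-configuration fields of struct perf_counter_hw_event live in linux/perf_counter.h, so both are assumptions in this sketch:

    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/perf_counter.h>   /* struct perf_counter_hw_event */

    static int perf_counter_open(struct perf_counter_hw_event *hw_event,
                                 pid_t pid, int cpu, int group_fd,
                                 unsigned long flags)
    {
        /* __NR_perf_counter_open is arch-specific (see the syscall tables) */
        return syscall(__NR_perf_counter_open, hw_event, pid, cpu,
                       group_fd, flags);
    }

    static int open_counter_on_current_task(void)
    {
        struct perf_counter_hw_event hw_event;

        memset(&hw_event, 0, sizeof(hw_event));
        /* the event type/period fields from the header would be set here */

        /* pid == 0: this task, cpu == -1: any CPU, no group leader, no flags */
        return perf_counter_open(&hw_event, 0, -1, -1, 0);
    }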
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 5d631c17eaee..bc024632f365 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -440,13 +440,15 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
440int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key); 440int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
441int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key); 441int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
442 442
443#define DEFINE_WAIT(name) \ 443#define DEFINE_WAIT_FUNC(name, function) \
444 wait_queue_t name = { \ 444 wait_queue_t name = { \
445 .private = current, \ 445 .private = current, \
446 .func = autoremove_wake_function, \ 446 .func = function, \
447 .task_list = LIST_HEAD_INIT((name).task_list), \ 447 .task_list = LIST_HEAD_INIT((name).task_list), \
448 } 448 }
449 449
450#define DEFINE_WAIT(name) DEFINE_WAIT_FUNC(name, autoremove_wake_function)
451
450#define DEFINE_WAIT_BIT(name, word, bit) \ 452#define DEFINE_WAIT_BIT(name, word, bit) \
451 struct wait_bit_queue name = { \ 453 struct wait_bit_queue name = { \
452 .key = __WAIT_BIT_KEY_INITIALIZER(word, bit), \ 454 .key = __WAIT_BIT_KEY_INITIALIZER(word, bit), \
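A sketch of how a waiter could use the new DEFINE_WAIT_FUNC() with a custom wake callback (the wait-queue head and the flag it waits on are assumed to exist in the caller's code):

    #include <linux/wait.h>
    #include <linux/sched.h>

    static int my_wake_function(wait_queue_t *wait, unsigned mode,
                                int sync, void *key)
    {
        /* per-waiter bookkeeping could go here, then the usual autoremove */
        return autoremove_wake_function(wait, mode, sync, key);
    }

    static void wait_for_flag(wait_queue_head_t *wq, int *flag)
    {
        DEFINE_WAIT_FUNC(wait, my_wake_function);

        for (;;) {
            prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
            if (*flag)
                break;
            schedule();
        }
        finish_wait(wq, &wait);
    }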
diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h
index f69f015bbcc0..ed3aea1605e8 100644
--- a/include/net/bluetooth/hci.h
+++ b/include/net/bluetooth/hci.h
@@ -101,6 +101,7 @@ enum {
101/* HCI timeouts */ 101/* HCI timeouts */
102#define HCI_CONNECT_TIMEOUT (40000) /* 40 seconds */ 102#define HCI_CONNECT_TIMEOUT (40000) /* 40 seconds */
103#define HCI_DISCONN_TIMEOUT (2000) /* 2 seconds */ 103#define HCI_DISCONN_TIMEOUT (2000) /* 2 seconds */
104#define HCI_PAIRING_TIMEOUT (60000) /* 60 seconds */
104#define HCI_IDLE_TIMEOUT (6000) /* 6 seconds */ 105#define HCI_IDLE_TIMEOUT (6000) /* 6 seconds */
105#define HCI_INIT_TIMEOUT (10000) /* 10 seconds */ 106#define HCI_INIT_TIMEOUT (10000) /* 10 seconds */
106 107
diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 01f9316b4c23..be5bd713d2c9 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -171,6 +171,7 @@ struct hci_conn {
171 __u8 auth_type; 171 __u8 auth_type;
172 __u8 sec_level; 172 __u8 sec_level;
173 __u8 power_save; 173 __u8 power_save;
174 __u16 disc_timeout;
174 unsigned long pend; 175 unsigned long pend;
175 176
176 unsigned int sent; 177 unsigned int sent;
@@ -180,7 +181,8 @@ struct hci_conn {
180 struct timer_list disc_timer; 181 struct timer_list disc_timer;
181 struct timer_list idle_timer; 182 struct timer_list idle_timer;
182 183
183 struct work_struct work; 184 struct work_struct work_add;
185 struct work_struct work_del;
184 186
185 struct device dev; 187 struct device dev;
186 188
@@ -348,9 +350,9 @@ static inline void hci_conn_put(struct hci_conn *conn)
348 if (conn->type == ACL_LINK) { 350 if (conn->type == ACL_LINK) {
349 del_timer(&conn->idle_timer); 351 del_timer(&conn->idle_timer);
350 if (conn->state == BT_CONNECTED) { 352 if (conn->state == BT_CONNECTED) {
351 timeo = msecs_to_jiffies(HCI_DISCONN_TIMEOUT); 353 timeo = msecs_to_jiffies(conn->disc_timeout);
352 if (!conn->out) 354 if (!conn->out)
353 timeo *= 5; 355 timeo *= 2;
354 } else 356 } else
355 timeo = msecs_to_jiffies(10); 357 timeo = msecs_to_jiffies(10);
356 } else 358 } else
diff --git a/init/Kconfig b/init/Kconfig
index 7be4d3836745..8158f1f44694 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -933,6 +933,41 @@ config AIO
933 by some high performance threaded applications. Disabling 933 by some high performance threaded applications. Disabling
934 this option saves about 7k. 934 this option saves about 7k.
935 935
936config HAVE_PERF_COUNTERS
937 bool
938
939menu "Performance Counters"
940
941config PERF_COUNTERS
942 bool "Kernel Performance Counters"
943 depends on HAVE_PERF_COUNTERS
944 default y
945 select ANON_INODES
946 help
947 Enable kernel support for performance counter hardware.
948
949 Performance counters are special hardware registers available
950 on most modern CPUs. These registers count the number of certain
 951 types of hw events, such as instructions executed, cache misses
952 suffered, or branches mis-predicted - without slowing down the
953 kernel or applications. These registers can also trigger interrupts
954 when a threshold number of events have passed - and can thus be
955 used to profile the code that runs on that CPU.
956
957 The Linux Performance Counter subsystem provides an abstraction of
958 these hardware capabilities, available via a system call. It
959 provides per task and per CPU counters, and it provides event
960 capabilities on top of those.
961
962 Say Y if unsure.
963
964config EVENT_PROFILE
965 bool "Tracepoint profile sources"
966 depends on PERF_COUNTERS && EVENT_TRACER
967 default y
968
969endmenu
970
936config VM_EVENT_COUNTERS 971config VM_EVENT_COUNTERS
937 default y 972 default y
938 bool "Enable VM event counters for /proc/vmstat" if EMBEDDED 973 bool "Enable VM event counters for /proc/vmstat" if EMBEDDED
diff --git a/kernel/Makefile b/kernel/Makefile
index 42423665660a..e914ca992d70 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -95,6 +95,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/
95obj-$(CONFIG_TRACING) += trace/ 95obj-$(CONFIG_TRACING) += trace/
96obj-$(CONFIG_SMP) += sched_cpupri.o 96obj-$(CONFIG_SMP) += sched_cpupri.o
97obj-$(CONFIG_SLOW_WORK) += slow-work.o 97obj-$(CONFIG_SLOW_WORK) += slow-work.o
98obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
98 99
99ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 100ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
100# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 101# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/compat.c b/kernel/compat.c
index 42d56544460f..f6c204f07ea6 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -882,6 +882,17 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
882 882
883} 883}
884 884
885asmlinkage long
886compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig,
887 struct compat_siginfo __user *uinfo)
888{
889 siginfo_t info;
890
891 if (copy_siginfo_from_user32(&info, uinfo))
892 return -EFAULT;
893 return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
894}
895
885#ifdef __ARCH_WANT_COMPAT_SYS_TIME 896#ifdef __ARCH_WANT_COMPAT_SYS_TIME
886 897
887/* compat_time_t is a 32 bit "long" and needs to get converted. */ 898/* compat_time_t is a 32 bit "long" and needs to get converted. */
diff --git a/kernel/exit.c b/kernel/exit.c
index abf9cf3b95c6..4741376c8dec 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -158,6 +158,9 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
158{ 158{
159 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); 159 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
160 160
161#ifdef CONFIG_PERF_COUNTERS
162 WARN_ON_ONCE(!list_empty(&tsk->perf_counter_ctx.counter_list));
163#endif
161 trace_sched_process_free(tsk); 164 trace_sched_process_free(tsk);
162 put_task_struct(tsk); 165 put_task_struct(tsk);
163} 166}
@@ -981,10 +984,6 @@ NORET_TYPE void do_exit(long code)
981 tsk->mempolicy = NULL; 984 tsk->mempolicy = NULL;
982#endif 985#endif
983#ifdef CONFIG_FUTEX 986#ifdef CONFIG_FUTEX
984 /*
985 * This must happen late, after the PID is not
986 * hashed anymore:
987 */
988 if (unlikely(!list_empty(&tsk->pi_state_list))) 987 if (unlikely(!list_empty(&tsk->pi_state_list)))
989 exit_pi_state_list(tsk); 988 exit_pi_state_list(tsk);
990 if (unlikely(current->pi_state_cache)) 989 if (unlikely(current->pi_state_cache))
@@ -1251,6 +1250,12 @@ static int wait_task_zombie(struct task_struct *p, int options,
1251 */ 1250 */
1252 read_unlock(&tasklist_lock); 1251 read_unlock(&tasklist_lock);
1253 1252
1253 /*
1254 * Flush inherited counters to the parent - before the parent
1255 * gets woken up by child-exit notifications.
1256 */
1257 perf_counter_exit_task(p);
1258
1254 retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; 1259 retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
1255 status = (p->signal->flags & SIGNAL_GROUP_EXIT) 1260 status = (p->signal->flags & SIGNAL_GROUP_EXIT)
1256 ? p->signal->group_exit_code : p->exit_code; 1261 ? p->signal->group_exit_code : p->exit_code;
diff --git a/kernel/fork.c b/kernel/fork.c
index b9e2edd00726..d32fef4d38e5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -983,6 +983,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
983 goto fork_out; 983 goto fork_out;
984 984
985 rt_mutex_init_task(p); 985 rt_mutex_init_task(p);
986 perf_counter_init_task(p);
986 987
987#ifdef CONFIG_PROVE_LOCKING 988#ifdef CONFIG_PROVE_LOCKING
988 DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); 989 DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
diff --git a/kernel/mutex.c b/kernel/mutex.c
index e2d25e9e62ae..f788a5ace24b 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -89,7 +89,7 @@ __mutex_lock_slowpath(atomic_t *lock_count);
89 * 89 *
90 * This function is similar to (but not equivalent to) down(). 90 * This function is similar to (but not equivalent to) down().
91 */ 91 */
92void inline __sched mutex_lock(struct mutex *lock) 92void __sched mutex_lock(struct mutex *lock)
93{ 93{
94 might_sleep(); 94 might_sleep();
95 /* 95 /*
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
new file mode 100644
index 000000000000..60e55f0b48f4
--- /dev/null
+++ b/kernel/perf_counter.c
@@ -0,0 +1,3406 @@
1/*
2 * Performance counter core code
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 *
9 * For licensing details see kernel-base/COPYING
10 */
11
12#include <linux/fs.h>
13#include <linux/mm.h>
14#include <linux/cpu.h>
15#include <linux/smp.h>
16#include <linux/file.h>
17#include <linux/poll.h>
18#include <linux/sysfs.h>
19#include <linux/ptrace.h>
20#include <linux/percpu.h>
21#include <linux/vmstat.h>
22#include <linux/hardirq.h>
23#include <linux/rculist.h>
24#include <linux/uaccess.h>
25#include <linux/syscalls.h>
26#include <linux/anon_inodes.h>
27#include <linux/kernel_stat.h>
28#include <linux/perf_counter.h>
29#include <linux/dcache.h>
30
31#include <asm/irq_regs.h>
32
33/*
34 * Each CPU has a list of per CPU counters:
35 */
36DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
37
38int perf_max_counters __read_mostly = 1;
39static int perf_reserved_percpu __read_mostly;
40static int perf_overcommit __read_mostly = 1;
41
42static atomic_t nr_mmap_tracking __read_mostly;
43static atomic_t nr_munmap_tracking __read_mostly;
44static atomic_t nr_comm_tracking __read_mostly;
45
46int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */
47int sysctl_perf_counter_mlock __read_mostly = 128; /* 'free' kb per counter */
48
49/*
50 * Lock for (sysadmin-configurable) counter reservations:
51 */
52static DEFINE_SPINLOCK(perf_resource_lock);
53
54/*
55 * Architecture provided APIs - weak aliases:
56 */
57extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
58{
59 return NULL;
60}
61
62u64 __weak hw_perf_save_disable(void) { return 0; }
63void __weak hw_perf_restore(u64 ctrl) { barrier(); }
64void __weak hw_perf_counter_setup(int cpu) { barrier(); }
65int __weak hw_perf_group_sched_in(struct perf_counter *group_leader,
66 struct perf_cpu_context *cpuctx,
67 struct perf_counter_context *ctx, int cpu)
68{
69 return 0;
70}
71
72void __weak perf_counter_print_debug(void) { }
73
74static void
75list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
76{
77 struct perf_counter *group_leader = counter->group_leader;
78
79 /*
80 * Depending on whether it is a standalone or sibling counter,
81 * add it straight to the context's counter list, or to the group
82 * leader's sibling list:
83 */
84 if (counter->group_leader == counter)
85 list_add_tail(&counter->list_entry, &ctx->counter_list);
86 else {
87 list_add_tail(&counter->list_entry, &group_leader->sibling_list);
88 group_leader->nr_siblings++;
89 }
90
91 list_add_rcu(&counter->event_entry, &ctx->event_list);
92}
93
94static void
95list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
96{
97 struct perf_counter *sibling, *tmp;
98
99 list_del_init(&counter->list_entry);
100 list_del_rcu(&counter->event_entry);
101
102 if (counter->group_leader != counter)
103 counter->group_leader->nr_siblings--;
104
105 /*
106 * If this was a group counter with sibling counters then
107 * upgrade the siblings to singleton counters by adding them
108 * to the context list directly:
109 */
110 list_for_each_entry_safe(sibling, tmp,
111 &counter->sibling_list, list_entry) {
112
113 list_move_tail(&sibling->list_entry, &ctx->counter_list);
114 sibling->group_leader = sibling;
115 }
116}
117
118static void
119counter_sched_out(struct perf_counter *counter,
120 struct perf_cpu_context *cpuctx,
121 struct perf_counter_context *ctx)
122{
123 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
124 return;
125
126 counter->state = PERF_COUNTER_STATE_INACTIVE;
127 counter->tstamp_stopped = ctx->time;
128 counter->pmu->disable(counter);
129 counter->oncpu = -1;
130
131 if (!is_software_counter(counter))
132 cpuctx->active_oncpu--;
133 ctx->nr_active--;
134 if (counter->hw_event.exclusive || !cpuctx->active_oncpu)
135 cpuctx->exclusive = 0;
136}
137
138static void
139group_sched_out(struct perf_counter *group_counter,
140 struct perf_cpu_context *cpuctx,
141 struct perf_counter_context *ctx)
142{
143 struct perf_counter *counter;
144
145 if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
146 return;
147
148 counter_sched_out(group_counter, cpuctx, ctx);
149
150 /*
151 * Schedule out siblings (if any):
152 */
153 list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
154 counter_sched_out(counter, cpuctx, ctx);
155
156 if (group_counter->hw_event.exclusive)
157 cpuctx->exclusive = 0;
158}
159
160/*
161 * Cross CPU call to remove a performance counter
162 *
163 * We disable the counter on the hardware level first. After that we
164 * remove it from the context list.
165 */
166static void __perf_counter_remove_from_context(void *info)
167{
168 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
169 struct perf_counter *counter = info;
170 struct perf_counter_context *ctx = counter->ctx;
171 unsigned long flags;
172 u64 perf_flags;
173
174 /*
175 * If this is a task context, we need to check whether it is
176 * the current task context of this cpu. If not it has been
177 * scheduled out before the smp call arrived.
178 */
179 if (ctx->task && cpuctx->task_ctx != ctx)
180 return;
181
182 spin_lock_irqsave(&ctx->lock, flags);
183
184 counter_sched_out(counter, cpuctx, ctx);
185
186 counter->task = NULL;
187 ctx->nr_counters--;
188
189 /*
190 * Protect the list operation against NMI by disabling the
191 * counters on a global level. NOP for non NMI based counters.
192 */
193 perf_flags = hw_perf_save_disable();
194 list_del_counter(counter, ctx);
195 hw_perf_restore(perf_flags);
196
197 if (!ctx->task) {
198 /*
199 * Allow more per task counters with respect to the
200 * reservation:
201 */
202 cpuctx->max_pertask =
203 min(perf_max_counters - ctx->nr_counters,
204 perf_max_counters - perf_reserved_percpu);
205 }
206
207 spin_unlock_irqrestore(&ctx->lock, flags);
208}
209
210
211/*
212 * Remove the counter from a task's (or a CPU's) list of counters.
213 *
214 * Must be called with counter->mutex and ctx->mutex held.
215 *
216 * CPU counters are removed with a smp call. For task counters we only
217 * call when the task is on a CPU.
218 */
219static void perf_counter_remove_from_context(struct perf_counter *counter)
220{
221 struct perf_counter_context *ctx = counter->ctx;
222 struct task_struct *task = ctx->task;
223
224 if (!task) {
225 /*
226 * Per cpu counters are removed via an smp call and
 227 * the removal is always successful.
228 */
229 smp_call_function_single(counter->cpu,
230 __perf_counter_remove_from_context,
231 counter, 1);
232 return;
233 }
234
235retry:
236 task_oncpu_function_call(task, __perf_counter_remove_from_context,
237 counter);
238
239 spin_lock_irq(&ctx->lock);
240 /*
241 * If the context is active we need to retry the smp call.
242 */
243 if (ctx->nr_active && !list_empty(&counter->list_entry)) {
244 spin_unlock_irq(&ctx->lock);
245 goto retry;
246 }
247
248 /*
 249 * The lock prevents this context from being scheduled in, so we
 250 * can remove the counter safely if the call above did not
 251 * succeed.
252 */
253 if (!list_empty(&counter->list_entry)) {
254 ctx->nr_counters--;
255 list_del_counter(counter, ctx);
256 counter->task = NULL;
257 }
258 spin_unlock_irq(&ctx->lock);
259}
260
261static inline u64 perf_clock(void)
262{
263 return cpu_clock(smp_processor_id());
264}
265
266/*
267 * Update the record of the current time in a context.
268 */
269static void update_context_time(struct perf_counter_context *ctx)
270{
271 u64 now = perf_clock();
272
273 ctx->time += now - ctx->timestamp;
274 ctx->timestamp = now;
275}
276
277/*
278 * Update the total_time_enabled and total_time_running fields for a counter.
279 */
280static void update_counter_times(struct perf_counter *counter)
281{
282 struct perf_counter_context *ctx = counter->ctx;
283 u64 run_end;
284
285 if (counter->state < PERF_COUNTER_STATE_INACTIVE)
286 return;
287
288 counter->total_time_enabled = ctx->time - counter->tstamp_enabled;
289
290 if (counter->state == PERF_COUNTER_STATE_INACTIVE)
291 run_end = counter->tstamp_stopped;
292 else
293 run_end = ctx->time;
294
295 counter->total_time_running = run_end - counter->tstamp_running;
296}
297
298/*
299 * Update total_time_enabled and total_time_running for all counters in a group.
300 */
301static void update_group_times(struct perf_counter *leader)
302{
303 struct perf_counter *counter;
304
305 update_counter_times(leader);
306 list_for_each_entry(counter, &leader->sibling_list, list_entry)
307 update_counter_times(counter);
308}
309
310/*
311 * Cross CPU call to disable a performance counter
312 */
313static void __perf_counter_disable(void *info)
314{
315 struct perf_counter *counter = info;
316 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
317 struct perf_counter_context *ctx = counter->ctx;
318 unsigned long flags;
319
320 /*
321 * If this is a per-task counter, need to check whether this
322 * counter's task is the current task on this cpu.
323 */
324 if (ctx->task && cpuctx->task_ctx != ctx)
325 return;
326
327 spin_lock_irqsave(&ctx->lock, flags);
328
329 /*
330 * If the counter is on, turn it off.
331 * If it is in error state, leave it in error state.
332 */
333 if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
334 update_context_time(ctx);
335 update_counter_times(counter);
336 if (counter == counter->group_leader)
337 group_sched_out(counter, cpuctx, ctx);
338 else
339 counter_sched_out(counter, cpuctx, ctx);
340 counter->state = PERF_COUNTER_STATE_OFF;
341 }
342
343 spin_unlock_irqrestore(&ctx->lock, flags);
344}
345
346/*
347 * Disable a counter.
348 */
349static void perf_counter_disable(struct perf_counter *counter)
350{
351 struct perf_counter_context *ctx = counter->ctx;
352 struct task_struct *task = ctx->task;
353
354 if (!task) {
355 /*
356 * Disable the counter on the cpu that it's on
357 */
358 smp_call_function_single(counter->cpu, __perf_counter_disable,
359 counter, 1);
360 return;
361 }
362
363 retry:
364 task_oncpu_function_call(task, __perf_counter_disable, counter);
365
366 spin_lock_irq(&ctx->lock);
367 /*
368 * If the counter is still active, we need to retry the cross-call.
369 */
370 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
371 spin_unlock_irq(&ctx->lock);
372 goto retry;
373 }
374
375 /*
376 * Since we have the lock this context can't be scheduled
377 * in, so we can change the state safely.
378 */
379 if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
380 update_counter_times(counter);
381 counter->state = PERF_COUNTER_STATE_OFF;
382 }
383
384 spin_unlock_irq(&ctx->lock);
385}
386
387/*
388 * Disable a counter and all its children.
389 */
390static void perf_counter_disable_family(struct perf_counter *counter)
391{
392 struct perf_counter *child;
393
394 perf_counter_disable(counter);
395
396 /*
397 * Lock the mutex to protect the list of children
398 */
399 mutex_lock(&counter->mutex);
400 list_for_each_entry(child, &counter->child_list, child_list)
401 perf_counter_disable(child);
402 mutex_unlock(&counter->mutex);
403}
404
405static int
406counter_sched_in(struct perf_counter *counter,
407 struct perf_cpu_context *cpuctx,
408 struct perf_counter_context *ctx,
409 int cpu)
410{
411 if (counter->state <= PERF_COUNTER_STATE_OFF)
412 return 0;
413
414 counter->state = PERF_COUNTER_STATE_ACTIVE;
415 counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */
416 /*
417 * The new state must be visible before we turn it on in the hardware:
418 */
419 smp_wmb();
420
421 if (counter->pmu->enable(counter)) {
422 counter->state = PERF_COUNTER_STATE_INACTIVE;
423 counter->oncpu = -1;
424 return -EAGAIN;
425 }
426
427 counter->tstamp_running += ctx->time - counter->tstamp_stopped;
428
429 if (!is_software_counter(counter))
430 cpuctx->active_oncpu++;
431 ctx->nr_active++;
432
433 if (counter->hw_event.exclusive)
434 cpuctx->exclusive = 1;
435
436 return 0;
437}
438
439/*
440 * Return 1 for a group consisting entirely of software counters,
441 * 0 if the group contains any hardware counters.
442 */
443static int is_software_only_group(struct perf_counter *leader)
444{
445 struct perf_counter *counter;
446
447 if (!is_software_counter(leader))
448 return 0;
449
450 list_for_each_entry(counter, &leader->sibling_list, list_entry)
451 if (!is_software_counter(counter))
452 return 0;
453
454 return 1;
455}
456
457/*
458 * Work out whether we can put this counter group on the CPU now.
459 */
460static int group_can_go_on(struct perf_counter *counter,
461 struct perf_cpu_context *cpuctx,
462 int can_add_hw)
463{
464 /*
465 * Groups consisting entirely of software counters can always go on.
466 */
467 if (is_software_only_group(counter))
468 return 1;
469 /*
470 * If an exclusive group is already on, no other hardware
471 * counters can go on.
472 */
473 if (cpuctx->exclusive)
474 return 0;
475 /*
476 * If this group is exclusive and there are already
477 * counters on the CPU, it can't go on.
478 */
479 if (counter->hw_event.exclusive && cpuctx->active_oncpu)
480 return 0;
481 /*
482 * Otherwise, try to add it if all previous groups were able
483 * to go on.
484 */
485 return can_add_hw;
486}
487
488static void add_counter_to_ctx(struct perf_counter *counter,
489 struct perf_counter_context *ctx)
490{
491 list_add_counter(counter, ctx);
492 ctx->nr_counters++;
493 counter->prev_state = PERF_COUNTER_STATE_OFF;
494 counter->tstamp_enabled = ctx->time;
495 counter->tstamp_running = ctx->time;
496 counter->tstamp_stopped = ctx->time;
497}
498
499/*
500 * Cross CPU call to install and enable a performance counter
501 */
502static void __perf_install_in_context(void *info)
503{
504 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
505 struct perf_counter *counter = info;
506 struct perf_counter_context *ctx = counter->ctx;
507 struct perf_counter *leader = counter->group_leader;
508 int cpu = smp_processor_id();
509 unsigned long flags;
510 u64 perf_flags;
511 int err;
512
513 /*
514 * If this is a task context, we need to check whether it is
515 * the current task context of this cpu. If not it has been
516 * scheduled out before the smp call arrived.
517 */
518 if (ctx->task && cpuctx->task_ctx != ctx)
519 return;
520
521 spin_lock_irqsave(&ctx->lock, flags);
522 update_context_time(ctx);
523
524 /*
525 * Protect the list operation against NMI by disabling the
526 * counters on a global level. NOP for non NMI based counters.
527 */
528 perf_flags = hw_perf_save_disable();
529
530 add_counter_to_ctx(counter, ctx);
531
532 /*
533 * Don't put the counter on if it is disabled or if
534 * it is in a group and the group isn't on.
535 */
536 if (counter->state != PERF_COUNTER_STATE_INACTIVE ||
537 (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE))
538 goto unlock;
539
540 /*
541 * An exclusive counter can't go on if there are already active
542 * hardware counters, and no hardware counter can go on if there
543 * is already an exclusive counter on.
544 */
545 if (!group_can_go_on(counter, cpuctx, 1))
546 err = -EEXIST;
547 else
548 err = counter_sched_in(counter, cpuctx, ctx, cpu);
549
550 if (err) {
551 /*
552 * This counter couldn't go on. If it is in a group
553 * then we have to pull the whole group off.
554 * If the counter group is pinned then put it in error state.
555 */
556 if (leader != counter)
557 group_sched_out(leader, cpuctx, ctx);
558 if (leader->hw_event.pinned) {
559 update_group_times(leader);
560 leader->state = PERF_COUNTER_STATE_ERROR;
561 }
562 }
563
564 if (!err && !ctx->task && cpuctx->max_pertask)
565 cpuctx->max_pertask--;
566
567 unlock:
568 hw_perf_restore(perf_flags);
569
570 spin_unlock_irqrestore(&ctx->lock, flags);
571}
572
573/*
574 * Attach a performance counter to a context
575 *
576 * First we add the counter to the list with the hardware enable bit
577 * in counter->hw_config cleared.
578 *
579 * If the counter is attached to a task which is on a CPU we use a smp
580 * call to enable it in the task context. The task might have been
581 * scheduled away, but we check this in the smp call again.
582 *
583 * Must be called with ctx->mutex held.
584 */
585static void
586perf_install_in_context(struct perf_counter_context *ctx,
587 struct perf_counter *counter,
588 int cpu)
589{
590 struct task_struct *task = ctx->task;
591
592 if (!task) {
593 /*
594 * Per cpu counters are installed via an smp call and
 595 * the install is always successful.
596 */
597 smp_call_function_single(cpu, __perf_install_in_context,
598 counter, 1);
599 return;
600 }
601
602 counter->task = task;
603retry:
604 task_oncpu_function_call(task, __perf_install_in_context,
605 counter);
606
607 spin_lock_irq(&ctx->lock);
608 /*
609 * we need to retry the smp call.
610 */
611 if (ctx->is_active && list_empty(&counter->list_entry)) {
612 spin_unlock_irq(&ctx->lock);
613 goto retry;
614 }
615
616 /*
 617 * The lock prevents this context from being scheduled in, so we
 618 * can add the counter safely if the call above did not
 619 * succeed.
620 */
621 if (list_empty(&counter->list_entry))
622 add_counter_to_ctx(counter, ctx);
623 spin_unlock_irq(&ctx->lock);
624}
625
626/*
627 * Cross CPU call to enable a performance counter
628 */
629static void __perf_counter_enable(void *info)
630{
631 struct perf_counter *counter = info;
632 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
633 struct perf_counter_context *ctx = counter->ctx;
634 struct perf_counter *leader = counter->group_leader;
635 unsigned long flags;
636 int err;
637
638 /*
639 * If this is a per-task counter, need to check whether this
640 * counter's task is the current task on this cpu.
641 */
642 if (ctx->task && cpuctx->task_ctx != ctx)
643 return;
644
645 spin_lock_irqsave(&ctx->lock, flags);
646 update_context_time(ctx);
647
648 counter->prev_state = counter->state;
649 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
650 goto unlock;
651 counter->state = PERF_COUNTER_STATE_INACTIVE;
652 counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
653
654 /*
655 * If the counter is in a group and isn't the group leader,
656 * then don't put it on unless the group is on.
657 */
658 if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
659 goto unlock;
660
661 if (!group_can_go_on(counter, cpuctx, 1))
662 err = -EEXIST;
663 else
664 err = counter_sched_in(counter, cpuctx, ctx,
665 smp_processor_id());
666
667 if (err) {
668 /*
669 * If this counter can't go on and it's part of a
670 * group, then the whole group has to come off.
671 */
672 if (leader != counter)
673 group_sched_out(leader, cpuctx, ctx);
674 if (leader->hw_event.pinned) {
675 update_group_times(leader);
676 leader->state = PERF_COUNTER_STATE_ERROR;
677 }
678 }
679
680 unlock:
681 spin_unlock_irqrestore(&ctx->lock, flags);
682}
683
684/*
685 * Enable a counter.
686 */
687static void perf_counter_enable(struct perf_counter *counter)
688{
689 struct perf_counter_context *ctx = counter->ctx;
690 struct task_struct *task = ctx->task;
691
692 if (!task) {
693 /*
694 * Enable the counter on the cpu that it's on
695 */
696 smp_call_function_single(counter->cpu, __perf_counter_enable,
697 counter, 1);
698 return;
699 }
700
701 spin_lock_irq(&ctx->lock);
702 if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
703 goto out;
704
705 /*
706 * If the counter is in error state, clear that first.
707 * That way, if we see the counter in error state below, we
708 * know that it has gone back into error state, as distinct
709 * from the task having been scheduled away before the
710 * cross-call arrived.
711 */
712 if (counter->state == PERF_COUNTER_STATE_ERROR)
713 counter->state = PERF_COUNTER_STATE_OFF;
714
715 retry:
716 spin_unlock_irq(&ctx->lock);
717 task_oncpu_function_call(task, __perf_counter_enable, counter);
718
719 spin_lock_irq(&ctx->lock);
720
721 /*
722 * If the context is active and the counter is still off,
723 * we need to retry the cross-call.
724 */
725 if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF)
726 goto retry;
727
728 /*
729 * Since we have the lock this context can't be scheduled
730 * in, so we can change the state safely.
731 */
732 if (counter->state == PERF_COUNTER_STATE_OFF) {
733 counter->state = PERF_COUNTER_STATE_INACTIVE;
734 counter->tstamp_enabled =
735 ctx->time - counter->total_time_enabled;
736 }
737 out:
738 spin_unlock_irq(&ctx->lock);
739}
740
741static int perf_counter_refresh(struct perf_counter *counter, int refresh)
742{
743 /*
744 * not supported on inherited counters
745 */
746 if (counter->hw_event.inherit)
747 return -EINVAL;
748
749 atomic_add(refresh, &counter->event_limit);
750 perf_counter_enable(counter);
751
752 return 0;
753}
754
755/*
756 * Enable a counter and all its children.
757 */
758static void perf_counter_enable_family(struct perf_counter *counter)
759{
760 struct perf_counter *child;
761
762 perf_counter_enable(counter);
763
764 /*
765 * Lock the mutex to protect the list of children
766 */
767 mutex_lock(&counter->mutex);
768 list_for_each_entry(child, &counter->child_list, child_list)
769 perf_counter_enable(child);
770 mutex_unlock(&counter->mutex);
771}
772
773void __perf_counter_sched_out(struct perf_counter_context *ctx,
774 struct perf_cpu_context *cpuctx)
775{
776 struct perf_counter *counter;
777 u64 flags;
778
779 spin_lock(&ctx->lock);
780 ctx->is_active = 0;
781 if (likely(!ctx->nr_counters))
782 goto out;
783 update_context_time(ctx);
784
785 flags = hw_perf_save_disable();
786 if (ctx->nr_active) {
787 list_for_each_entry(counter, &ctx->counter_list, list_entry)
788 group_sched_out(counter, cpuctx, ctx);
789 }
790 hw_perf_restore(flags);
791 out:
792 spin_unlock(&ctx->lock);
793}
794
795/*
796 * Called from scheduler to remove the counters of the current task,
797 * with interrupts disabled.
798 *
799 * We stop each counter and update the counter value in counter->count.
800 *
801 * This does not protect us against NMI, but disable()
802 * sets the disabled bit in the control field of counter _before_
803 * accessing the counter control register. If a NMI hits, then it will
804 * not restart the counter.
805 */
806void perf_counter_task_sched_out(struct task_struct *task, int cpu)
807{
808 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
809 struct perf_counter_context *ctx = &task->perf_counter_ctx;
810 struct pt_regs *regs;
811
812 if (likely(!cpuctx->task_ctx))
813 return;
814
815 update_context_time(ctx);
816
817 regs = task_pt_regs(task);
818 perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0);
819 __perf_counter_sched_out(ctx, cpuctx);
820
821 cpuctx->task_ctx = NULL;
822}
823
824static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
825{
826 __perf_counter_sched_out(&cpuctx->ctx, cpuctx);
827}
828
829static int
830group_sched_in(struct perf_counter *group_counter,
831 struct perf_cpu_context *cpuctx,
832 struct perf_counter_context *ctx,
833 int cpu)
834{
835 struct perf_counter *counter, *partial_group;
836 int ret;
837
838 if (group_counter->state == PERF_COUNTER_STATE_OFF)
839 return 0;
840
841 ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
842 if (ret)
843 return ret < 0 ? ret : 0;
844
845 group_counter->prev_state = group_counter->state;
846 if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
847 return -EAGAIN;
848
849 /*
850 * Schedule in siblings as one group (if any):
851 */
852 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
853 counter->prev_state = counter->state;
854 if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
855 partial_group = counter;
856 goto group_error;
857 }
858 }
859
860 return 0;
861
862group_error:
863 /*
864 * Groups can be scheduled in as one unit only, so undo any
865 * partial group before returning:
866 */
867 list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
868 if (counter == partial_group)
869 break;
870 counter_sched_out(counter, cpuctx, ctx);
871 }
872 counter_sched_out(group_counter, cpuctx, ctx);
873
874 return -EAGAIN;
875}
876
877static void
878__perf_counter_sched_in(struct perf_counter_context *ctx,
879 struct perf_cpu_context *cpuctx, int cpu)
880{
881 struct perf_counter *counter;
882 u64 flags;
883 int can_add_hw = 1;
884
885 spin_lock(&ctx->lock);
886 ctx->is_active = 1;
887 if (likely(!ctx->nr_counters))
888 goto out;
889
890 ctx->timestamp = perf_clock();
891
892 flags = hw_perf_save_disable();
893
894 /*
895 * First go through the list and put on any pinned groups
896 * in order to give them the best chance of going on.
897 */
898 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
899 if (counter->state <= PERF_COUNTER_STATE_OFF ||
900 !counter->hw_event.pinned)
901 continue;
902 if (counter->cpu != -1 && counter->cpu != cpu)
903 continue;
904
905 if (group_can_go_on(counter, cpuctx, 1))
906 group_sched_in(counter, cpuctx, ctx, cpu);
907
908 /*
909 * If this pinned group hasn't been scheduled,
910 * put it in error state.
911 */
912 if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
913 update_group_times(counter);
914 counter->state = PERF_COUNTER_STATE_ERROR;
915 }
916 }
917
918 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
919 /*
920 * Ignore counters in OFF or ERROR state, and
921 * ignore pinned counters since we did them already.
922 */
923 if (counter->state <= PERF_COUNTER_STATE_OFF ||
924 counter->hw_event.pinned)
925 continue;
926
927 /*
928 * Listen to the 'cpu' scheduling filter constraint
929 * of counters:
930 */
931 if (counter->cpu != -1 && counter->cpu != cpu)
932 continue;
933
934 if (group_can_go_on(counter, cpuctx, can_add_hw)) {
935 if (group_sched_in(counter, cpuctx, ctx, cpu))
936 can_add_hw = 0;
937 }
938 }
939 hw_perf_restore(flags);
940 out:
941 spin_unlock(&ctx->lock);
942}
943
944/*
945 * Called from scheduler to add the counters of the current task
946 * with interrupts disabled.
947 *
948 * We restore the counter value and then enable it.
949 *
950 * This does not protect us against NMI, but enable()
951 * sets the enabled bit in the control field of counter _before_
952 * accessing the counter control register. If a NMI hits, then it will
953 * keep the counter running.
954 */
955void perf_counter_task_sched_in(struct task_struct *task, int cpu)
956{
957 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
958 struct perf_counter_context *ctx = &task->perf_counter_ctx;
959
960 __perf_counter_sched_in(ctx, cpuctx, cpu);
961 cpuctx->task_ctx = ctx;
962}
963
964static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
965{
966 struct perf_counter_context *ctx = &cpuctx->ctx;
967
968 __perf_counter_sched_in(ctx, cpuctx, cpu);
969}
970
971int perf_counter_task_disable(void)
972{
973 struct task_struct *curr = current;
974 struct perf_counter_context *ctx = &curr->perf_counter_ctx;
975 struct perf_counter *counter;
976 unsigned long flags;
977 u64 perf_flags;
978 int cpu;
979
980 if (likely(!ctx->nr_counters))
981 return 0;
982
983 local_irq_save(flags);
984 cpu = smp_processor_id();
985
986 perf_counter_task_sched_out(curr, cpu);
987
988 spin_lock(&ctx->lock);
989
990 /*
991 * Disable all the counters:
992 */
993 perf_flags = hw_perf_save_disable();
994
995 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
996 if (counter->state != PERF_COUNTER_STATE_ERROR) {
997 update_group_times(counter);
998 counter->state = PERF_COUNTER_STATE_OFF;
999 }
1000 }
1001
1002 hw_perf_restore(perf_flags);
1003
1004 spin_unlock_irqrestore(&ctx->lock, flags);
1005
1006 return 0;
1007}
1008
1009int perf_counter_task_enable(void)
1010{
1011 struct task_struct *curr = current;
1012 struct perf_counter_context *ctx = &curr->perf_counter_ctx;
1013 struct perf_counter *counter;
1014 unsigned long flags;
1015 u64 perf_flags;
1016 int cpu;
1017
1018 if (likely(!ctx->nr_counters))
1019 return 0;
1020
1021 local_irq_save(flags);
1022 cpu = smp_processor_id();
1023
1024 perf_counter_task_sched_out(curr, cpu);
1025
1026 spin_lock(&ctx->lock);
1027
1028 /*
1029 * Disable all the counters:
1030 */
1031 perf_flags = hw_perf_save_disable();
1032
1033 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1034 if (counter->state > PERF_COUNTER_STATE_OFF)
1035 continue;
1036 counter->state = PERF_COUNTER_STATE_INACTIVE;
1037 counter->tstamp_enabled =
1038 ctx->time - counter->total_time_enabled;
1039 counter->hw_event.disabled = 0;
1040 }
1041 hw_perf_restore(perf_flags);
1042
1043 spin_unlock(&ctx->lock);
1044
1045 perf_counter_task_sched_in(curr, cpu);
1046
1047 local_irq_restore(flags);
1048
1049 return 0;
1050}
1051
1052/*
1053 * Round-robin a context's counters:
1054 */
1055static void rotate_ctx(struct perf_counter_context *ctx)
1056{
1057 struct perf_counter *counter;
1058 u64 perf_flags;
1059
1060 if (!ctx->nr_counters)
1061 return;
1062
1063 spin_lock(&ctx->lock);
1064 /*
1065 * Rotate the first entry last (works just fine for group counters too):
1066 */
1067 perf_flags = hw_perf_save_disable();
1068 list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1069 list_move_tail(&counter->list_entry, &ctx->counter_list);
1070 break;
1071 }
1072 hw_perf_restore(perf_flags);
1073
1074 spin_unlock(&ctx->lock);
1075}
1076
1077void perf_counter_task_tick(struct task_struct *curr, int cpu)
1078{
1079 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1080 struct perf_counter_context *ctx = &curr->perf_counter_ctx;
1081
1082 perf_counter_cpu_sched_out(cpuctx);
1083 perf_counter_task_sched_out(curr, cpu);
1084
1085 rotate_ctx(&cpuctx->ctx);
1086 rotate_ctx(ctx);
1087
1088 perf_counter_cpu_sched_in(cpuctx, cpu);
1089 perf_counter_task_sched_in(curr, cpu);
1090}
1091
1092/*
1093 * Cross CPU call to read the hardware counter
1094 */
1095static void __read(void *info)
1096{
1097 struct perf_counter *counter = info;
1098 struct perf_counter_context *ctx = counter->ctx;
1099 unsigned long flags;
1100
1101 local_irq_save(flags);
1102 if (ctx->is_active)
1103 update_context_time(ctx);
1104 counter->pmu->read(counter);
1105 update_counter_times(counter);
1106 local_irq_restore(flags);
1107}
1108
1109static u64 perf_counter_read(struct perf_counter *counter)
1110{
1111 /*
1112 * If counter is enabled and currently active on a CPU, update the
1113 * value in the counter structure:
1114 */
1115 if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
1116 smp_call_function_single(counter->oncpu,
1117 __read, counter, 1);
1118 } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
1119 update_counter_times(counter);
1120 }
1121
1122 return atomic64_read(&counter->count);
1123}
1124
1125static void put_context(struct perf_counter_context *ctx)
1126{
1127 if (ctx->task)
1128 put_task_struct(ctx->task);
1129}
1130
1131static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1132{
1133 struct perf_cpu_context *cpuctx;
1134 struct perf_counter_context *ctx;
1135 struct task_struct *task;
1136
1137 /*
1138 * If cpu is not a wildcard then this is a percpu counter:
1139 */
1140 if (cpu != -1) {
1141 /* Must be root to operate on a CPU counter: */
1142 if (sysctl_perf_counter_priv && !capable(CAP_SYS_ADMIN))
1143 return ERR_PTR(-EACCES);
1144
1145 if (cpu < 0 || cpu > num_possible_cpus())
1146 return ERR_PTR(-EINVAL);
1147
1148 /*
1149 * We could be clever and allow to attach a counter to an
1150 * offline CPU and activate it when the CPU comes up, but
1151 * that's for later.
1152 */
1153 if (!cpu_isset(cpu, cpu_online_map))
1154 return ERR_PTR(-ENODEV);
1155
1156 cpuctx = &per_cpu(perf_cpu_context, cpu);
1157 ctx = &cpuctx->ctx;
1158
1159 return ctx;
1160 }
1161
1162 rcu_read_lock();
1163 if (!pid)
1164 task = current;
1165 else
1166 task = find_task_by_vpid(pid);
1167 if (task)
1168 get_task_struct(task);
1169 rcu_read_unlock();
1170
1171 if (!task)
1172 return ERR_PTR(-ESRCH);
1173
1174 ctx = &task->perf_counter_ctx;
1175 ctx->task = task;
1176
1177 /* Reuse ptrace permission checks for now. */
1178 if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
1179 put_context(ctx);
1180 return ERR_PTR(-EACCES);
1181 }
1182
1183 return ctx;
1184}
1185
1186static void free_counter_rcu(struct rcu_head *head)
1187{
1188 struct perf_counter *counter;
1189
1190 counter = container_of(head, struct perf_counter, rcu_head);
1191 kfree(counter);
1192}
1193
1194static void perf_pending_sync(struct perf_counter *counter);
1195
1196static void free_counter(struct perf_counter *counter)
1197{
1198 perf_pending_sync(counter);
1199
1200 if (counter->hw_event.mmap)
1201 atomic_dec(&nr_mmap_tracking);
1202 if (counter->hw_event.munmap)
1203 atomic_dec(&nr_munmap_tracking);
1204 if (counter->hw_event.comm)
1205 atomic_dec(&nr_comm_tracking);
1206
1207 if (counter->destroy)
1208 counter->destroy(counter);
1209
1210 call_rcu(&counter->rcu_head, free_counter_rcu);
1211}
1212
1213/*
1214 * Called when the last reference to the file is gone.
1215 */
1216static int perf_release(struct inode *inode, struct file *file)
1217{
1218 struct perf_counter *counter = file->private_data;
1219 struct perf_counter_context *ctx = counter->ctx;
1220
1221 file->private_data = NULL;
1222
1223 mutex_lock(&ctx->mutex);
1224 mutex_lock(&counter->mutex);
1225
1226 perf_counter_remove_from_context(counter);
1227
1228 mutex_unlock(&counter->mutex);
1229 mutex_unlock(&ctx->mutex);
1230
1231 free_counter(counter);
1232 put_context(ctx);
1233
1234 return 0;
1235}
1236
1237/*
 1238 * Read the performance counter - simple non-blocking version for now
1239 */
1240static ssize_t
1241perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1242{
1243 u64 values[3];
1244 int n;
1245
1246 /*
1247 * Return end-of-file for a read on a counter that is in
1248 * error state (i.e. because it was pinned but it couldn't be
1249 * scheduled on to the CPU at some point).
1250 */
1251 if (counter->state == PERF_COUNTER_STATE_ERROR)
1252 return 0;
1253
1254 mutex_lock(&counter->mutex);
1255 values[0] = perf_counter_read(counter);
1256 n = 1;
1257 if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1258 values[n++] = counter->total_time_enabled +
1259 atomic64_read(&counter->child_total_time_enabled);
1260 if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1261 values[n++] = counter->total_time_running +
1262 atomic64_read(&counter->child_total_time_running);
1263 mutex_unlock(&counter->mutex);
1264
1265 if (count < n * sizeof(u64))
1266 return -EINVAL;
1267 count = n * sizeof(u64);
1268
1269 if (copy_to_user(buf, values, count))
1270 return -EFAULT;
1271
1272 return count;
1273}
1274
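The matching user-space side is a plain read() on the counter fd. A sketch, assuming the counter was opened with both PERF_FORMAT_TOTAL_TIME_* bits set in hw_event.read_format:

    #include <stdio.h>
    #include <stdint.h>
    #include <unistd.h>

    static void print_counter(int counter_fd)
    {
        uint64_t values[3];   /* count, time_enabled, time_running */
        ssize_t n = read(counter_fd, values, sizeof(values));

        if (n == 0)           /* error state: pinned but never scheduled */
            printf("counter in error state\n");
        else if (n == sizeof(values))
            printf("count %llu enabled %llu running %llu\n",
                   (unsigned long long)values[0],
                   (unsigned long long)values[1],
                   (unsigned long long)values[2]);
    }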
1275static ssize_t
1276perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
1277{
1278 struct perf_counter *counter = file->private_data;
1279
1280 return perf_read_hw(counter, buf, count);
1281}
1282
1283static unsigned int perf_poll(struct file *file, poll_table *wait)
1284{
1285 struct perf_counter *counter = file->private_data;
1286 struct perf_mmap_data *data;
1287 unsigned int events = POLL_HUP;
1288
1289 rcu_read_lock();
1290 data = rcu_dereference(counter->data);
1291 if (data)
1292 events = atomic_xchg(&data->poll, 0);
1293 rcu_read_unlock();
1294
1295 poll_wait(file, &counter->waitq, wait);
1296
1297 return events;
1298}
1299
1300static void perf_counter_reset(struct perf_counter *counter)
1301{
 1302 atomic64_set(&counter->count, 0);
1303}
1304
1305static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1306{
1307 struct perf_counter *counter = file->private_data;
1308 int err = 0;
1309
1310 switch (cmd) {
1311 case PERF_COUNTER_IOC_ENABLE:
1312 perf_counter_enable_family(counter);
1313 break;
1314 case PERF_COUNTER_IOC_DISABLE:
1315 perf_counter_disable_family(counter);
1316 break;
1317 case PERF_COUNTER_IOC_REFRESH:
1318 err = perf_counter_refresh(counter, arg);
1319 break;
1320 case PERF_COUNTER_IOC_RESET:
1321 perf_counter_reset(counter);
1322 break;
1323 default:
1324 err = -ENOTTY;
1325 }
1326 return err;
1327}
1328
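User space drives these commands with ordinary ioctl() calls on the counter fd; a sketch, assuming the PERF_COUNTER_IOC_* definitions from linux/perf_counter.h:

    #include <sys/ioctl.h>
    #include <linux/perf_counter.h>

    static void restart_counter(int counter_fd)
    {
        /* zero the value, then (re-)enable the counter and its children */
        ioctl(counter_fd, PERF_COUNTER_IOC_RESET);
        ioctl(counter_fd, PERF_COUNTER_IOC_ENABLE);
    }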
1329/*
1330 * Callers need to ensure there can be no nesting of this function, otherwise
1331 * the seqlock logic goes bad. We can not serialize this because the arch
1332 * code calls this from NMI context.
1333 */
1334void perf_counter_update_userpage(struct perf_counter *counter)
1335{
1336 struct perf_mmap_data *data;
1337 struct perf_counter_mmap_page *userpg;
1338
1339 rcu_read_lock();
1340 data = rcu_dereference(counter->data);
1341 if (!data)
1342 goto unlock;
1343
1344 userpg = data->user_page;
1345
1346 /*
1347 * Disable preemption so as to not let the corresponding user-space
1348 * spin too long if we get preempted.
1349 */
1350 preempt_disable();
1351 ++userpg->lock;
1352 barrier();
1353 userpg->index = counter->hw.idx;
1354 userpg->offset = atomic64_read(&counter->count);
1355 if (counter->state == PERF_COUNTER_STATE_ACTIVE)
1356 userpg->offset -= atomic64_read(&counter->hw.prev_count);
1357
1358 barrier();
1359 ++userpg->lock;
1360 preempt_enable();
1361unlock:
1362 rcu_read_unlock();
1363}
1364
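The user-space reader of this page mirrors the seqlock: sample ->lock, read the fields, and retry if ->lock changed. A sketch only (struct perf_counter_mmap_page comes from linux/perf_counter.h; completing the count with a user-level counter read such as rdpmc when ->index is nonzero is omitted):

    #include <linux/perf_counter.h>

    static unsigned long long
    read_user_page(volatile struct perf_counter_mmap_page *pg)
    {
        unsigned int seq;
        unsigned long long count;

        do {
            seq = pg->lock;
            __sync_synchronize();       /* pairs with the kernel barriers */

            count = pg->offset;         /* plus the hardware counter value
                                         * when pg->index != 0 (not shown) */

            __sync_synchronize();
        } while (pg->lock != seq);

        return count;
    }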
1365static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1366{
1367 struct perf_counter *counter = vma->vm_file->private_data;
1368 struct perf_mmap_data *data;
1369 int ret = VM_FAULT_SIGBUS;
1370
1371 rcu_read_lock();
1372 data = rcu_dereference(counter->data);
1373 if (!data)
1374 goto unlock;
1375
1376 if (vmf->pgoff == 0) {
1377 vmf->page = virt_to_page(data->user_page);
1378 } else {
1379 int nr = vmf->pgoff - 1;
1380
1381 if ((unsigned)nr > data->nr_pages)
1382 goto unlock;
1383
1384 vmf->page = virt_to_page(data->data_pages[nr]);
1385 }
1386 get_page(vmf->page);
1387 ret = 0;
1388unlock:
1389 rcu_read_unlock();
1390
1391 return ret;
1392}
1393
1394static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
1395{
1396 struct perf_mmap_data *data;
1397 unsigned long size;
1398 int i;
1399
1400 WARN_ON(atomic_read(&counter->mmap_count));
1401
1402 size = sizeof(struct perf_mmap_data);
1403 size += nr_pages * sizeof(void *);
1404
1405 data = kzalloc(size, GFP_KERNEL);
1406 if (!data)
1407 goto fail;
1408
1409 data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
1410 if (!data->user_page)
1411 goto fail_user_page;
1412
1413 for (i = 0; i < nr_pages; i++) {
1414 data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
1415 if (!data->data_pages[i])
1416 goto fail_data_pages;
1417 }
1418
1419 data->nr_pages = nr_pages;
1420 atomic_set(&data->lock, -1);
1421
1422 rcu_assign_pointer(counter->data, data);
1423
1424 return 0;
1425
1426fail_data_pages:
1427 for (i--; i >= 0; i--)
1428 free_page((unsigned long)data->data_pages[i]);
1429
1430 free_page((unsigned long)data->user_page);
1431
1432fail_user_page:
1433 kfree(data);
1434
1435fail:
1436 return -ENOMEM;
1437}
1438
1439static void __perf_mmap_data_free(struct rcu_head *rcu_head)
1440{
1441 struct perf_mmap_data *data = container_of(rcu_head,
1442 struct perf_mmap_data, rcu_head);
1443 int i;
1444
1445 free_page((unsigned long)data->user_page);
1446 for (i = 0; i < data->nr_pages; i++)
1447 free_page((unsigned long)data->data_pages[i]);
1448 kfree(data);
1449}
1450
1451static void perf_mmap_data_free(struct perf_counter *counter)
1452{
1453 struct perf_mmap_data *data = counter->data;
1454
1455 WARN_ON(atomic_read(&counter->mmap_count));
1456
1457 rcu_assign_pointer(counter->data, NULL);
1458 call_rcu(&data->rcu_head, __perf_mmap_data_free);
1459}
1460
1461static void perf_mmap_open(struct vm_area_struct *vma)
1462{
1463 struct perf_counter *counter = vma->vm_file->private_data;
1464
1465 atomic_inc(&counter->mmap_count);
1466}
1467
1468static void perf_mmap_close(struct vm_area_struct *vma)
1469{
1470 struct perf_counter *counter = vma->vm_file->private_data;
1471
1472 if (atomic_dec_and_mutex_lock(&counter->mmap_count,
1473 &counter->mmap_mutex)) {
1474 vma->vm_mm->locked_vm -= counter->data->nr_locked;
1475 perf_mmap_data_free(counter);
1476 mutex_unlock(&counter->mmap_mutex);
1477 }
1478}
1479
1480static struct vm_operations_struct perf_mmap_vmops = {
1481 .open = perf_mmap_open,
1482 .close = perf_mmap_close,
1483 .fault = perf_mmap_fault,
1484};
1485
1486static int perf_mmap(struct file *file, struct vm_area_struct *vma)
1487{
1488 struct perf_counter *counter = file->private_data;
1489 unsigned long vma_size;
1490 unsigned long nr_pages;
1491 unsigned long locked, lock_limit;
1492 int ret = 0;
1493 long extra;
1494
1495 if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE))
1496 return -EINVAL;
1497
1498 vma_size = vma->vm_end - vma->vm_start;
1499 nr_pages = (vma_size / PAGE_SIZE) - 1;
1500
1501 /*
1502 * If we have data pages ensure they're a power-of-two number, so we
1503 * can do bitmasks instead of modulo.
1504 */
1505 if (nr_pages != 0 && !is_power_of_2(nr_pages))
1506 return -EINVAL;
1507
1508 if (vma_size != PAGE_SIZE * (1 + nr_pages))
1509 return -EINVAL;
1510
1511 if (vma->vm_pgoff != 0)
1512 return -EINVAL;
1513
1514 mutex_lock(&counter->mmap_mutex);
1515 if (atomic_inc_not_zero(&counter->mmap_count)) {
1516 if (nr_pages != counter->data->nr_pages)
1517 ret = -EINVAL;
1518 goto unlock;
1519 }
1520
1521 extra = nr_pages /* + 1 only account the data pages */;
1522 extra -= sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10);
1523 if (extra < 0)
1524 extra = 0;
1525
1526 locked = vma->vm_mm->locked_vm + extra;
1527
1528 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
1529 lock_limit >>= PAGE_SHIFT;
1530
1531 if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
1532 ret = -EPERM;
1533 goto unlock;
1534 }
1535
1536 WARN_ON(counter->data);
1537 ret = perf_mmap_data_alloc(counter, nr_pages);
1538 if (ret)
1539 goto unlock;
1540
1541 atomic_set(&counter->mmap_count, 1);
1542 vma->vm_mm->locked_vm += extra;
1543 counter->data->nr_locked = extra;
1544unlock:
1545 mutex_unlock(&counter->mmap_mutex);
1546
1547 vma->vm_flags &= ~VM_MAYWRITE;
1548 vma->vm_flags |= VM_RESERVED;
1549 vma->vm_ops = &perf_mmap_vmops;
1550
1551 return ret;
1552}
1553
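From user space the buffer is mapped shared and read-only, one control page plus a power-of-two number of data pages, matching the checks above; a sketch:

    #include <sys/mman.h>
    #include <unistd.h>
    #include <linux/perf_counter.h>

    static struct perf_counter_mmap_page *
    map_counter(int counter_fd, int data_pages)
    {
        size_t len = (size_t)(1 + data_pages) * getpagesize();
        void *base;

        /* data_pages must be 0 or a power of two; PROT_WRITE is refused */
        base = mmap(NULL, len, PROT_READ, MAP_SHARED, counter_fd, 0);
        if (base == MAP_FAILED)
            return NULL;

        /* page 0 is the control page; the data area starts one page in */
        return base;
    }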
1554static int perf_fasync(int fd, struct file *filp, int on)
1555{
1556 struct perf_counter *counter = filp->private_data;
1557 struct inode *inode = filp->f_path.dentry->d_inode;
1558 int retval;
1559
1560 mutex_lock(&inode->i_mutex);
1561 retval = fasync_helper(fd, filp, on, &counter->fasync);
1562 mutex_unlock(&inode->i_mutex);
1563
1564 if (retval < 0)
1565 return retval;
1566
1567 return 0;
1568}
1569
1570static const struct file_operations perf_fops = {
1571 .release = perf_release,
1572 .read = perf_read,
1573 .poll = perf_poll,
1574 .unlocked_ioctl = perf_ioctl,
1575 .compat_ioctl = perf_ioctl,
1576 .mmap = perf_mmap,
1577 .fasync = perf_fasync,
1578};
1579
1580/*
1581 * Perf counter wakeup
1582 *
1583 * If there's data, ensure we set the poll() state and publish everything
1584 * to user-space before waking everybody up.
1585 */
1586
1587void perf_counter_wakeup(struct perf_counter *counter)
1588{
1589 wake_up_all(&counter->waitq);
1590
1591 if (counter->pending_kill) {
1592 kill_fasync(&counter->fasync, SIGIO, counter->pending_kill);
1593 counter->pending_kill = 0;
1594 }
1595}
1596
1597/*
1598 * Pending wakeups
1599 *
 1600 * Handle the case where we need to wake up from NMI (or rq->lock) context.
1601 *
1602 * The NMI bit means we cannot possibly take locks. Therefore, maintain a
 1603 * singly-linked list and use cmpxchg() to add entries locklessly.
1604 */
1605
1606static void perf_pending_counter(struct perf_pending_entry *entry)
1607{
1608 struct perf_counter *counter = container_of(entry,
1609 struct perf_counter, pending);
1610
1611 if (counter->pending_disable) {
1612 counter->pending_disable = 0;
1613 perf_counter_disable(counter);
1614 }
1615
1616 if (counter->pending_wakeup) {
1617 counter->pending_wakeup = 0;
1618 perf_counter_wakeup(counter);
1619 }
1620}
1621
1622#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
1623
1624static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
1625 PENDING_TAIL,
1626};
1627
1628static void perf_pending_queue(struct perf_pending_entry *entry,
1629 void (*func)(struct perf_pending_entry *))
1630{
1631 struct perf_pending_entry **head;
1632
1633 if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
1634 return;
1635
1636 entry->func = func;
1637
1638 head = &get_cpu_var(perf_pending_head);
1639
1640 do {
1641 entry->next = *head;
1642 } while (cmpxchg(head, entry->next, entry) != entry->next);
1643
1644 set_perf_counter_pending();
1645
1646 put_cpu_var(perf_pending_head);
1647}
1648
1649static int __perf_pending_run(void)
1650{
1651 struct perf_pending_entry *list;
1652 int nr = 0;
1653
1654 list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
1655 while (list != PENDING_TAIL) {
1656 void (*func)(struct perf_pending_entry *);
1657 struct perf_pending_entry *entry = list;
1658
1659 list = list->next;
1660
1661 func = entry->func;
1662 entry->next = NULL;
1663 /*
1664 * Ensure we observe the unqueue before we issue the wakeup,
1665 * so that we won't be waiting forever.
1666 * -- see perf_not_pending().
1667 */
1668 smp_wmb();
1669
1670 func(entry);
1671 nr++;
1672 }
1673
1674 return nr;
1675}
1676
1677static inline int perf_not_pending(struct perf_counter *counter)
1678{
1679 /*
1680 * If we flush on whatever cpu we run, there is a chance we don't
1681 * need to wait.
1682 */
1683 get_cpu();
1684 __perf_pending_run();
1685 put_cpu();
1686
1687 /*
1688 * Ensure we see the proper queue state before going to sleep
1689 * so that we do not miss the wakeup. -- see perf_pending_handle()
1690 */
1691 smp_rmb();
1692 return counter->pending.next == NULL;
1693}
1694
1695static void perf_pending_sync(struct perf_counter *counter)
1696{
1697 wait_event(counter->waitq, perf_not_pending(counter));
1698}
1699
1700void perf_counter_do_pending(void)
1701{
1702 __perf_pending_run();
1703}
1704
1705/*
1706 * Callchain support -- arch specific
1707 */
1708
1709__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
1710{
1711 return NULL;
1712}
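An illustrative aside, not from this patch: the weak default returns NULL (no callchain); an architecture that implements it hands back a buffer whose wire layout the output code below relies on, a count followed by that many instruction pointers, which is why callchain_size is computed as (1 + nr) * sizeof(u64) in perf_counter_output(). Roughly:

struct callchain_view {			/* illustrative; mirrors what gets copied out */
	unsigned long long	nr;	/* number of entries that follow */
	unsigned long long	ip[];	/* nr return addresses */
};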
1713
1714/*
1715 * Output
1716 */
1717
1718struct perf_output_handle {
1719 struct perf_counter *counter;
1720 struct perf_mmap_data *data;
1721 unsigned int offset;
1722 unsigned int head;
1723 int nmi;
1724 int overflow;
1725 int locked;
1726 unsigned long flags;
1727};
1728
1729static void perf_output_wakeup(struct perf_output_handle *handle)
1730{
1731 atomic_set(&handle->data->poll, POLL_IN);
1732
1733 if (handle->nmi) {
1734 handle->counter->pending_wakeup = 1;
1735 perf_pending_queue(&handle->counter->pending,
1736 perf_pending_counter);
1737 } else
1738 perf_counter_wakeup(handle->counter);
1739}
1740
1741/*
1742 * Curious locking construct.
1743 *
1744 * We need to ensure a later event doesn't publish a head when a former
1745 * event isn't done writing. However, since we need to deal with
1746 * cannot fully serialize things.
1747 *
1748 * What we do is serialize between CPUs so we only have to deal with NMI
1749 * nesting on a single CPU.
1750 *
1751 * We only publish the head (and generate a wakeup) when the outer-most
1752 * event completes.
1753 */
1754static void perf_output_lock(struct perf_output_handle *handle)
1755{
1756 struct perf_mmap_data *data = handle->data;
1757 int cpu;
1758
1759 handle->locked = 0;
1760
1761 local_irq_save(handle->flags);
1762 cpu = smp_processor_id();
1763
1764 if (in_nmi() && atomic_read(&data->lock) == cpu)
1765 return;
1766
1767 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
1768 cpu_relax();
1769
1770 handle->locked = 1;
1771}
1772
1773static void perf_output_unlock(struct perf_output_handle *handle)
1774{
1775 struct perf_mmap_data *data = handle->data;
1776 int head, cpu;
1777
1778 data->done_head = data->head;
1779
1780 if (!handle->locked)
1781 goto out;
1782
1783again:
1784 /*
1785 * The xchg implies a full barrier that ensures all writes are done
1786 * before we publish the new head, matched by a rmb() in userspace when
1787 * reading this position.
1788 */
1789 while ((head = atomic_xchg(&data->done_head, 0)))
1790 data->user_page->data_head = head;
1791
1792 /*
1793 * NMI can happen here, which means we can miss a done_head update.
1794 */
1795
1796 cpu = atomic_xchg(&data->lock, -1);
1797 WARN_ON_ONCE(cpu != smp_processor_id());
1798
1799 /*
1800 * Therefore we have to check that we did not in fact miss one.
1801 */
1802 if (unlikely(atomic_read(&data->done_head))) {
1803 /*
1804 * Since we had it locked, we can lock it again.
1805 */
1806 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
1807 cpu_relax();
1808
1809 goto again;
1810 }
1811
1812 if (atomic_xchg(&data->wakeup, 0))
1813 perf_output_wakeup(handle);
1814out:
1815 local_irq_restore(handle->flags);
1816}
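An illustrative aside, not from this patch: the done_head/data_head publishing above is what the mmap()ed reader pairs with. User space loads data_head, issues the read barrier the comment mentions, and only then consumes bytes up to that offset. A rough sketch of that consumer side, assuming the metadata page followed by a power-of-two number of data pages that the mmap and output code in this file implies; struct mmap_meta_view is a hypothetical stand-in for the real metadata page structure:

#include <stdatomic.h>

struct mmap_meta_view {			/* stand-in for the real metadata page */
	volatile unsigned int data_head;
	/* other fields live here in the real layout */
};

static void consume(void *base, unsigned long page_size,
		    unsigned long nr_data_pages, unsigned int *tail)
{
	struct mmap_meta_view *meta = base;
	unsigned char *data = (unsigned char *)base + page_size;
	unsigned long mask = nr_data_pages * page_size - 1;	/* power of two */
	unsigned int head = meta->data_head;

	atomic_thread_fence(memory_order_acquire);	/* the rmb() the comment refers to */

	while (*tail != head) {
		unsigned char byte = data[*tail & mask];

		/* feed 'byte' (or whole records) to a decoder here */
		(void)byte;
		(*tail)++;
	}
}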
1817
1818static int perf_output_begin(struct perf_output_handle *handle,
1819 struct perf_counter *counter, unsigned int size,
1820 int nmi, int overflow)
1821{
1822 struct perf_mmap_data *data;
1823 unsigned int offset, head;
1824
1825 /*
1826 * For inherited counters we send all the output towards the parent.
1827 */
1828 if (counter->parent)
1829 counter = counter->parent;
1830
1831 rcu_read_lock();
1832 data = rcu_dereference(counter->data);
1833 if (!data)
1834 goto out;
1835
1836 handle->data = data;
1837 handle->counter = counter;
1838 handle->nmi = nmi;
1839 handle->overflow = overflow;
1840
1841 if (!data->nr_pages)
1842 goto fail;
1843
1844 perf_output_lock(handle);
1845
1846 do {
1847 offset = head = atomic_read(&data->head);
1848 head += size;
1849 } while (atomic_cmpxchg(&data->head, offset, head) != offset);
1850
1851 handle->offset = offset;
1852 handle->head = head;
1853
1854 if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
1855 atomic_set(&data->wakeup, 1);
1856
1857 return 0;
1858
1859fail:
1860 perf_output_wakeup(handle);
1861out:
1862 rcu_read_unlock();
1863
1864 return -ENOSPC;
1865}
1866
1867static void perf_output_copy(struct perf_output_handle *handle,
1868 void *buf, unsigned int len)
1869{
1870 unsigned int pages_mask;
1871 unsigned int offset;
1872 unsigned int size;
1873 void **pages;
1874
1875 offset = handle->offset;
1876 pages_mask = handle->data->nr_pages - 1;
1877 pages = handle->data->data_pages;
1878
1879 do {
1880 unsigned int page_offset;
1881 int nr;
1882
1883 nr = (offset >> PAGE_SHIFT) & pages_mask;
1884 page_offset = offset & (PAGE_SIZE - 1);
1885 size = min_t(unsigned int, PAGE_SIZE - page_offset, len);
1886
1887 memcpy(pages[nr] + page_offset, buf, size);
1888
1889 len -= size;
1890 buf += size;
1891 offset += size;
1892 } while (len);
1893
1894 handle->offset = offset;
1895
1896 WARN_ON_ONCE(handle->offset > handle->head);
1897}
1898
1899#define perf_output_put(handle, x) \
1900 perf_output_copy((handle), &(x), sizeof(x))
1901
1902static void perf_output_end(struct perf_output_handle *handle)
1903{
1904 struct perf_counter *counter = handle->counter;
1905 struct perf_mmap_data *data = handle->data;
1906
1907 int wakeup_events = counter->hw_event.wakeup_events;
1908
1909 if (handle->overflow && wakeup_events) {
1910 int events = atomic_inc_return(&data->events);
1911 if (events >= wakeup_events) {
1912 atomic_sub(wakeup_events, &data->events);
1913 atomic_set(&data->wakeup, 1);
1914 }
1915 }
1916
1917 perf_output_unlock(handle);
1918 rcu_read_unlock();
1919}
1920
1921static void perf_counter_output(struct perf_counter *counter,
1922 int nmi, struct pt_regs *regs, u64 addr)
1923{
1924 int ret;
1925 u64 record_type = counter->hw_event.record_type;
1926 struct perf_output_handle handle;
1927 struct perf_event_header header;
1928 u64 ip;
1929 struct {
1930 u32 pid, tid;
1931 } tid_entry;
1932 struct {
1933 u64 event;
1934 u64 counter;
1935 } group_entry;
1936 struct perf_callchain_entry *callchain = NULL;
1937 int callchain_size = 0;
1938 u64 time;
1939
1940 header.type = 0;
1941 header.size = sizeof(header);
1942
1943 header.misc = PERF_EVENT_MISC_OVERFLOW;
1944 header.misc |= user_mode(regs) ?
1945 PERF_EVENT_MISC_USER : PERF_EVENT_MISC_KERNEL;
1946
1947 if (record_type & PERF_RECORD_IP) {
1948 ip = instruction_pointer(regs);
1949 header.type |= PERF_RECORD_IP;
1950 header.size += sizeof(ip);
1951 }
1952
1953 if (record_type & PERF_RECORD_TID) {
1954 /* namespace issues */
1955 tid_entry.pid = current->group_leader->pid;
1956 tid_entry.tid = current->pid;
1957
1958 header.type |= PERF_RECORD_TID;
1959 header.size += sizeof(tid_entry);
1960 }
1961
1962 if (record_type & PERF_RECORD_TIME) {
1963 /*
1964 * Maybe do better on x86 and provide cpu_clock_nmi()
1965 */
1966 time = sched_clock();
1967
1968 header.type |= PERF_RECORD_TIME;
1969 header.size += sizeof(u64);
1970 }
1971
1972 if (record_type & PERF_RECORD_ADDR) {
1973 header.type |= PERF_RECORD_ADDR;
1974 header.size += sizeof(u64);
1975 }
1976
1977 if (record_type & PERF_RECORD_GROUP) {
1978 header.type |= PERF_RECORD_GROUP;
1979 header.size += sizeof(u64) +
1980 counter->nr_siblings * sizeof(group_entry);
1981 }
1982
1983 if (record_type & PERF_RECORD_CALLCHAIN) {
1984 callchain = perf_callchain(regs);
1985
1986 if (callchain) {
1987 callchain_size = (1 + callchain->nr) * sizeof(u64);
1988
1989 header.type |= PERF_RECORD_CALLCHAIN;
1990 header.size += callchain_size;
1991 }
1992 }
1993
1994 ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
1995 if (ret)
1996 return;
1997
1998 perf_output_put(&handle, header);
1999
2000 if (record_type & PERF_RECORD_IP)
2001 perf_output_put(&handle, ip);
2002
2003 if (record_type & PERF_RECORD_TID)
2004 perf_output_put(&handle, tid_entry);
2005
2006 if (record_type & PERF_RECORD_TIME)
2007 perf_output_put(&handle, time);
2008
2009 if (record_type & PERF_RECORD_ADDR)
2010 perf_output_put(&handle, addr);
2011
2012 /*
2013 * XXX PERF_RECORD_GROUP vs inherited counters seems difficult.
2014 */
2015 if (record_type & PERF_RECORD_GROUP) {
2016 struct perf_counter *leader, *sub;
2017 u64 nr = counter->nr_siblings;
2018
2019 perf_output_put(&handle, nr);
2020
2021 leader = counter->group_leader;
2022 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
2023 if (sub != counter)
2024 sub->pmu->read(sub);
2025
2026 group_entry.event = sub->hw_event.config;
2027 group_entry.counter = atomic64_read(&sub->count);
2028
2029 perf_output_put(&handle, group_entry);
2030 }
2031 }
2032
2033 if (callchain)
2034 perf_output_copy(&handle, callchain, callchain_size);
2035
2036 perf_output_end(&handle);
2037}
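An illustrative aside, not from this patch: perf_counter_output() appends the optional fields in a fixed order (IP, TID, TIME, ADDR, GROUP, CALLCHAIN), each present only when its PERF_RECORD_* bit is set in header.type, so a reader can decode a record with a straight-line walk. A hedged sketch of such a decoder; struct sample_header just mirrors the layout this function emits, and the PERF_RECORD_* constants are assumed to come from the perf_counter header added by this patch:

#include <stdint.h>
#include <string.h>

/* PERF_RECORD_* values come from the perf_counter header added by this patch */
struct sample_header {			/* mirrors struct perf_event_header */
	uint32_t	type;
	uint16_t	misc;
	uint16_t	size;
};

static void decode_record(const unsigned char *rec)
{
	const struct sample_header *hdr = (const void *)rec;
	const unsigned char *p = rec + sizeof(*hdr);
	uint64_t ip, time, addr, nr;

	if (hdr->type & PERF_RECORD_IP)   { memcpy(&ip, p, 8);   p += 8; }
	if (hdr->type & PERF_RECORD_TID)  { /* u32 pid; u32 tid */ p += 8; }
	if (hdr->type & PERF_RECORD_TIME) { memcpy(&time, p, 8); p += 8; }
	if (hdr->type & PERF_RECORD_ADDR) { memcpy(&addr, p, 8); p += 8; }
	if (hdr->type & PERF_RECORD_GROUP) {
		memcpy(&nr, p, 8);
		p += 8 + nr * 16;	/* nr x { u64 event; u64 counter; } */
	}
	if (hdr->type & PERF_RECORD_CALLCHAIN) {
		memcpy(&nr, p, 8);	/* followed by nr x u64 return addresses */
	}
}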
2038
2039/*
2040 * comm tracking
2041 */
2042
2043struct perf_comm_event {
2044 struct task_struct *task;
2045 char *comm;
2046 int comm_size;
2047
2048 struct {
2049 struct perf_event_header header;
2050
2051 u32 pid;
2052 u32 tid;
2053 } event;
2054};
2055
2056static void perf_counter_comm_output(struct perf_counter *counter,
2057 struct perf_comm_event *comm_event)
2058{
2059 struct perf_output_handle handle;
2060 int size = comm_event->event.header.size;
2061 int ret = perf_output_begin(&handle, counter, size, 0, 0);
2062
2063 if (ret)
2064 return;
2065
2066 perf_output_put(&handle, comm_event->event);
2067 perf_output_copy(&handle, comm_event->comm,
2068 comm_event->comm_size);
2069 perf_output_end(&handle);
2070}
2071
2072static int perf_counter_comm_match(struct perf_counter *counter,
2073 struct perf_comm_event *comm_event)
2074{
2075 if (counter->hw_event.comm &&
2076 comm_event->event.header.type == PERF_EVENT_COMM)
2077 return 1;
2078
2079 return 0;
2080}
2081
2082static void perf_counter_comm_ctx(struct perf_counter_context *ctx,
2083 struct perf_comm_event *comm_event)
2084{
2085 struct perf_counter *counter;
2086
2087 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
2088 return;
2089
2090 rcu_read_lock();
2091 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
2092 if (perf_counter_comm_match(counter, comm_event))
2093 perf_counter_comm_output(counter, comm_event);
2094 }
2095 rcu_read_unlock();
2096}
2097
2098static void perf_counter_comm_event(struct perf_comm_event *comm_event)
2099{
2100 struct perf_cpu_context *cpuctx;
2101 unsigned int size;
2102 char *comm = comm_event->task->comm;
2103
2104 size = ALIGN(strlen(comm)+1, sizeof(u64));
2105
2106 comm_event->comm = comm;
2107 comm_event->comm_size = size;
2108
2109 comm_event->event.header.size = sizeof(comm_event->event) + size;
2110
2111 cpuctx = &get_cpu_var(perf_cpu_context);
2112 perf_counter_comm_ctx(&cpuctx->ctx, comm_event);
2113 put_cpu_var(perf_cpu_context);
2114
2115 perf_counter_comm_ctx(&current->perf_counter_ctx, comm_event);
2116}
2117
2118void perf_counter_comm(struct task_struct *task)
2119{
2120 struct perf_comm_event comm_event;
2121
2122 if (!atomic_read(&nr_comm_tracking))
2123 return;
2124
2125 comm_event = (struct perf_comm_event){
2126 .task = task,
2127 .event = {
2128 .header = { .type = PERF_EVENT_COMM, },
2129 .pid = task->group_leader->pid,
2130 .tid = task->pid,
2131 },
2132 };
2133
2134 perf_counter_comm_event(&comm_event);
2135}
2136
2137/*
2138 * mmap tracking
2139 */
2140
2141struct perf_mmap_event {
2142 struct file *file;
2143 char *file_name;
2144 int file_size;
2145
2146 struct {
2147 struct perf_event_header header;
2148
2149 u32 pid;
2150 u32 tid;
2151 u64 start;
2152 u64 len;
2153 u64 pgoff;
2154 } event;
2155};
2156
2157static void perf_counter_mmap_output(struct perf_counter *counter,
2158 struct perf_mmap_event *mmap_event)
2159{
2160 struct perf_output_handle handle;
2161 int size = mmap_event->event.header.size;
2162 int ret = perf_output_begin(&handle, counter, size, 0, 0);
2163
2164 if (ret)
2165 return;
2166
2167 perf_output_put(&handle, mmap_event->event);
2168 perf_output_copy(&handle, mmap_event->file_name,
2169 mmap_event->file_size);
2170 perf_output_end(&handle);
2171}
2172
2173static int perf_counter_mmap_match(struct perf_counter *counter,
2174 struct perf_mmap_event *mmap_event)
2175{
2176 if (counter->hw_event.mmap &&
2177 mmap_event->event.header.type == PERF_EVENT_MMAP)
2178 return 1;
2179
2180 if (counter->hw_event.munmap &&
2181 mmap_event->event.header.type == PERF_EVENT_MUNMAP)
2182 return 1;
2183
2184 return 0;
2185}
2186
2187static void perf_counter_mmap_ctx(struct perf_counter_context *ctx,
2188 struct perf_mmap_event *mmap_event)
2189{
2190 struct perf_counter *counter;
2191
2192 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
2193 return;
2194
2195 rcu_read_lock();
2196 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
2197 if (perf_counter_mmap_match(counter, mmap_event))
2198 perf_counter_mmap_output(counter, mmap_event);
2199 }
2200 rcu_read_unlock();
2201}
2202
2203static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
2204{
2205 struct perf_cpu_context *cpuctx;
2206 struct file *file = mmap_event->file;
2207 unsigned int size;
2208 char tmp[16];
2209 char *buf = NULL;
2210 char *name;
2211
2212 if (file) {
2213 buf = kzalloc(PATH_MAX, GFP_KERNEL);
2214 if (!buf) {
2215 name = strncpy(tmp, "//enomem", sizeof(tmp));
2216 goto got_name;
2217 }
2218 name = d_path(&file->f_path, buf, PATH_MAX);
2219 if (IS_ERR(name)) {
2220 name = strncpy(tmp, "//toolong", sizeof(tmp));
2221 goto got_name;
2222 }
2223 } else {
2224 name = strncpy(tmp, "//anon", sizeof(tmp));
2225 goto got_name;
2226 }
2227
2228got_name:
2229 size = ALIGN(strlen(name)+1, sizeof(u64));
2230
2231 mmap_event->file_name = name;
2232 mmap_event->file_size = size;
2233
2234 mmap_event->event.header.size = sizeof(mmap_event->event) + size;
2235
2236 cpuctx = &get_cpu_var(perf_cpu_context);
2237 perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event);
2238 put_cpu_var(perf_cpu_context);
2239
2240 perf_counter_mmap_ctx(&current->perf_counter_ctx, mmap_event);
2241
2242 kfree(buf);
2243}
2244
2245void perf_counter_mmap(unsigned long addr, unsigned long len,
2246 unsigned long pgoff, struct file *file)
2247{
2248 struct perf_mmap_event mmap_event;
2249
2250 if (!atomic_read(&nr_mmap_tracking))
2251 return;
2252
2253 mmap_event = (struct perf_mmap_event){
2254 .file = file,
2255 .event = {
2256 .header = { .type = PERF_EVENT_MMAP, },
2257 .pid = current->group_leader->pid,
2258 .tid = current->pid,
2259 .start = addr,
2260 .len = len,
2261 .pgoff = pgoff,
2262 },
2263 };
2264
2265 perf_counter_mmap_event(&mmap_event);
2266}
2267
2268void perf_counter_munmap(unsigned long addr, unsigned long len,
2269 unsigned long pgoff, struct file *file)
2270{
2271 struct perf_mmap_event mmap_event;
2272
2273 if (!atomic_read(&nr_munmap_tracking))
2274 return;
2275
2276 mmap_event = (struct perf_mmap_event){
2277 .file = file,
2278 .event = {
2279 .header = { .type = PERF_EVENT_MUNMAP, },
2280 .pid = current->group_leader->pid,
2281 .tid = current->pid,
2282 .start = addr,
2283 .len = len,
2284 .pgoff = pgoff,
2285 },
2286 };
2287
2288 perf_counter_mmap_event(&mmap_event);
2289}
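An illustrative aside, not from this patch: for both the mmap and munmap paths the wire format is the anonymous struct embedded in perf_mmap_event above, followed by the pathname padded with NULs up to the next u64 boundary (that is what ALIGN(strlen(name)+1, sizeof(u64)) computes), so a reader can size the name as header.size minus the fixed part. A reader-side view of one such record, reusing the sample_header sketch from earlier:

struct mmap_record_view {		/* illustrative reader-side view only */
	struct sample_header	header;	/* type: PERF_EVENT_MMAP or PERF_EVENT_MUNMAP */
	uint32_t		pid, tid;
	uint64_t		start, len, pgoff;
	char			filename[];	/* NUL-padded to an 8-byte multiple */
};
/* name bytes available: header.size - offsetof(struct mmap_record_view, filename) */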
2290
2291/*
2292 * Generic counter overflow handling.
2293 */
2294
2295int perf_counter_overflow(struct perf_counter *counter,
2296 int nmi, struct pt_regs *regs, u64 addr)
2297{
2298 int events = atomic_read(&counter->event_limit);
2299 int ret = 0;
2300
2301 /*
2302 * XXX event_limit might not quite work as expected on inherited
2303 * counters
2304 */
2305
2306 counter->pending_kill = POLL_IN;
2307 if (events && atomic_dec_and_test(&counter->event_limit)) {
2308 ret = 1;
2309 counter->pending_kill = POLL_HUP;
2310 if (nmi) {
2311 counter->pending_disable = 1;
2312 perf_pending_queue(&counter->pending,
2313 perf_pending_counter);
2314 } else
2315 perf_counter_disable(counter);
2316 }
2317
2318 perf_counter_output(counter, nmi, regs, addr);
2319 return ret;
2320}
2321
2322/*
2323 * Generic software counter infrastructure
2324 */
2325
2326static void perf_swcounter_update(struct perf_counter *counter)
2327{
2328 struct hw_perf_counter *hwc = &counter->hw;
2329 u64 prev, now;
2330 s64 delta;
2331
2332again:
2333 prev = atomic64_read(&hwc->prev_count);
2334 now = atomic64_read(&hwc->count);
2335 if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev)
2336 goto again;
2337
2338 delta = now - prev;
2339
2340 atomic64_add(delta, &counter->count);
2341 atomic64_sub(delta, &hwc->period_left);
2342}
2343
2344static void perf_swcounter_set_period(struct perf_counter *counter)
2345{
2346 struct hw_perf_counter *hwc = &counter->hw;
2347 s64 left = atomic64_read(&hwc->period_left);
2348 s64 period = hwc->irq_period;
2349
2350 if (unlikely(left <= -period)) {
2351 left = period;
2352 atomic64_set(&hwc->period_left, left);
2353 }
2354
2355 if (unlikely(left <= 0)) {
2356 left += period;
2357 atomic64_add(period, &hwc->period_left);
2358 }
2359
2360 atomic64_set(&hwc->prev_count, -left);
2361 atomic64_set(&hwc->count, -left);
2362}
2363
2364static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
2365{
2366 enum hrtimer_restart ret = HRTIMER_RESTART;
2367 struct perf_counter *counter;
2368 struct pt_regs *regs;
2369
2370 counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
2371 counter->pmu->read(counter);
2372
2373 regs = get_irq_regs();
2374 /*
2375 * In case we exclude kernel IPs or are somehow not in interrupt
2376 * context, provide the next best thing, the user IP.
2377 */
2378 if ((counter->hw_event.exclude_kernel || !regs) &&
2379 !counter->hw_event.exclude_user)
2380 regs = task_pt_regs(current);
2381
2382 if (regs) {
2383 if (perf_counter_overflow(counter, 0, regs, 0))
2384 ret = HRTIMER_NORESTART;
2385 }
2386
2387 hrtimer_forward_now(hrtimer, ns_to_ktime(counter->hw.irq_period));
2388
2389 return ret;
2390}
2391
2392static void perf_swcounter_overflow(struct perf_counter *counter,
2393 int nmi, struct pt_regs *regs, u64 addr)
2394{
2395 perf_swcounter_update(counter);
2396 perf_swcounter_set_period(counter);
2397 if (perf_counter_overflow(counter, nmi, regs, addr))
2398 /* soft-disable the counter */
2399 ;
2400
2401}
2402
2403static int perf_swcounter_match(struct perf_counter *counter,
2404 enum perf_event_types type,
2405 u32 event, struct pt_regs *regs)
2406{
2407 if (counter->state != PERF_COUNTER_STATE_ACTIVE)
2408 return 0;
2409
2410 if (perf_event_raw(&counter->hw_event))
2411 return 0;
2412
2413 if (perf_event_type(&counter->hw_event) != type)
2414 return 0;
2415
2416 if (perf_event_id(&counter->hw_event) != event)
2417 return 0;
2418
2419 if (counter->hw_event.exclude_user && user_mode(regs))
2420 return 0;
2421
2422 if (counter->hw_event.exclude_kernel && !user_mode(regs))
2423 return 0;
2424
2425 return 1;
2426}
2427
2428static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
2429 int nmi, struct pt_regs *regs, u64 addr)
2430{
2431 int neg = atomic64_add_negative(nr, &counter->hw.count);
2432 if (counter->hw.irq_period && !neg)
2433 perf_swcounter_overflow(counter, nmi, regs, addr);
2434}
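An illustrative aside, not from this patch: the period handling above primes hw.count and prev_count to -left, so a period elapses exactly when the running sum crosses zero. atomic64_add_negative() keeps returning true while the count is still negative, and the first non-negative result sends perf_swcounter_add() into perf_swcounter_overflow(), which folds the delta into counter->count and re-arms the period. A tiny worked version of that arithmetic in plain C:

#include <stdio.h>

int main(void)
{
	const long long period = 100;		/* irq_period */
	long long count = -period;		/* perf_swcounter_set_period() */
	long long events;

	for (events = 1; events <= 250; events++) {
		count += 1;			/* one software event, perf_swcounter_add() */
		if (count >= 0) {		/* !atomic64_add_negative() */
			printf("overflow after %lld events\n", events);
			count = -period;	/* overflow path re-arms the period */
		}
	}
	return 0;				/* prints 100 and 200 */
}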
2435
2436static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
2437 enum perf_event_types type, u32 event,
2438 u64 nr, int nmi, struct pt_regs *regs,
2439 u64 addr)
2440{
2441 struct perf_counter *counter;
2442
2443 if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
2444 return;
2445
2446 rcu_read_lock();
2447 list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
2448 if (perf_swcounter_match(counter, type, event, regs))
2449 perf_swcounter_add(counter, nr, nmi, regs, addr);
2450 }
2451 rcu_read_unlock();
2452}
2453
2454static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
2455{
2456 if (in_nmi())
2457 return &cpuctx->recursion[3];
2458
2459 if (in_irq())
2460 return &cpuctx->recursion[2];
2461
2462 if (in_softirq())
2463 return &cpuctx->recursion[1];
2464
2465 return &cpuctx->recursion[0];
2466}
2467
2468static void __perf_swcounter_event(enum perf_event_types type, u32 event,
2469 u64 nr, int nmi, struct pt_regs *regs,
2470 u64 addr)
2471{
2472 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
2473 int *recursion = perf_swcounter_recursion_context(cpuctx);
2474
2475 if (*recursion)
2476 goto out;
2477
2478 (*recursion)++;
2479 barrier();
2480
2481 perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
2482 nr, nmi, regs, addr);
2483 if (cpuctx->task_ctx) {
2484 perf_swcounter_ctx_event(cpuctx->task_ctx, type, event,
2485 nr, nmi, regs, addr);
2486 }
2487
2488 barrier();
2489 (*recursion)--;
2490
2491out:
2492 put_cpu_var(perf_cpu_context);
2493}
2494
2495void
2496perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
2497{
2498 __perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs, addr);
2499}
2500
2501static void perf_swcounter_read(struct perf_counter *counter)
2502{
2503 perf_swcounter_update(counter);
2504}
2505
2506static int perf_swcounter_enable(struct perf_counter *counter)
2507{
2508 perf_swcounter_set_period(counter);
2509 return 0;
2510}
2511
2512static void perf_swcounter_disable(struct perf_counter *counter)
2513{
2514 perf_swcounter_update(counter);
2515}
2516
2517static const struct pmu perf_ops_generic = {
2518 .enable = perf_swcounter_enable,
2519 .disable = perf_swcounter_disable,
2520 .read = perf_swcounter_read,
2521};
2522
2523/*
2524 * Software counter: cpu wall time clock
2525 */
2526
2527static void cpu_clock_perf_counter_update(struct perf_counter *counter)
2528{
2529 int cpu = raw_smp_processor_id();
2530 s64 prev;
2531 u64 now;
2532
2533 now = cpu_clock(cpu);
2534 prev = atomic64_read(&counter->hw.prev_count);
2535 atomic64_set(&counter->hw.prev_count, now);
2536 atomic64_add(now - prev, &counter->count);
2537}
2538
2539static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
2540{
2541 struct hw_perf_counter *hwc = &counter->hw;
2542 int cpu = raw_smp_processor_id();
2543
2544 atomic64_set(&hwc->prev_count, cpu_clock(cpu));
2545 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2546 hwc->hrtimer.function = perf_swcounter_hrtimer;
2547 if (hwc->irq_period) {
2548 __hrtimer_start_range_ns(&hwc->hrtimer,
2549 ns_to_ktime(hwc->irq_period), 0,
2550 HRTIMER_MODE_REL, 0);
2551 }
2552
2553 return 0;
2554}
2555
2556static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
2557{
2558 hrtimer_cancel(&counter->hw.hrtimer);
2559 cpu_clock_perf_counter_update(counter);
2560}
2561
2562static void cpu_clock_perf_counter_read(struct perf_counter *counter)
2563{
2564 cpu_clock_perf_counter_update(counter);
2565}
2566
2567static const struct pmu perf_ops_cpu_clock = {
2568 .enable = cpu_clock_perf_counter_enable,
2569 .disable = cpu_clock_perf_counter_disable,
2570 .read = cpu_clock_perf_counter_read,
2571};
2572
2573/*
2574 * Software counter: task time clock
2575 */
2576
2577static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
2578{
2579 u64 prev;
2580 s64 delta;
2581
2582 prev = atomic64_xchg(&counter->hw.prev_count, now);
2583 delta = now - prev;
2584 atomic64_add(delta, &counter->count);
2585}
2586
2587static int task_clock_perf_counter_enable(struct perf_counter *counter)
2588{
2589 struct hw_perf_counter *hwc = &counter->hw;
2590 u64 now;
2591
2592 now = counter->ctx->time;
2593
2594 atomic64_set(&hwc->prev_count, now);
2595 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2596 hwc->hrtimer.function = perf_swcounter_hrtimer;
2597 if (hwc->irq_period) {
2598 __hrtimer_start_range_ns(&hwc->hrtimer,
2599 ns_to_ktime(hwc->irq_period), 0,
2600 HRTIMER_MODE_REL, 0);
2601 }
2602
2603 return 0;
2604}
2605
2606static void task_clock_perf_counter_disable(struct perf_counter *counter)
2607{
2608 hrtimer_cancel(&counter->hw.hrtimer);
2609 task_clock_perf_counter_update(counter, counter->ctx->time);
2610
2611}
2612
2613static void task_clock_perf_counter_read(struct perf_counter *counter)
2614{
2615 u64 time;
2616
2617 if (!in_nmi()) {
2618 update_context_time(counter->ctx);
2619 time = counter->ctx->time;
2620 } else {
2621 u64 now = perf_clock();
2622 u64 delta = now - counter->ctx->timestamp;
2623 time = counter->ctx->time + delta;
2624 }
2625
2626 task_clock_perf_counter_update(counter, time);
2627}
2628
2629static const struct pmu perf_ops_task_clock = {
2630 .enable = task_clock_perf_counter_enable,
2631 .disable = task_clock_perf_counter_disable,
2632 .read = task_clock_perf_counter_read,
2633};
2634
2635/*
2636 * Software counter: cpu migrations
2637 */
2638
2639static inline u64 get_cpu_migrations(struct perf_counter *counter)
2640{
2641 struct task_struct *curr = counter->ctx->task;
2642
2643 if (curr)
2644 return curr->se.nr_migrations;
2645 return cpu_nr_migrations(smp_processor_id());
2646}
2647
2648static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
2649{
2650 u64 prev, now;
2651 s64 delta;
2652
2653 prev = atomic64_read(&counter->hw.prev_count);
2654 now = get_cpu_migrations(counter);
2655
2656 atomic64_set(&counter->hw.prev_count, now);
2657
2658 delta = now - prev;
2659
2660 atomic64_add(delta, &counter->count);
2661}
2662
2663static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
2664{
2665 cpu_migrations_perf_counter_update(counter);
2666}
2667
2668static int cpu_migrations_perf_counter_enable(struct perf_counter *counter)
2669{
2670 if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
2671 atomic64_set(&counter->hw.prev_count,
2672 get_cpu_migrations(counter));
2673 return 0;
2674}
2675
2676static void cpu_migrations_perf_counter_disable(struct perf_counter *counter)
2677{
2678 cpu_migrations_perf_counter_update(counter);
2679}
2680
2681static const struct pmu perf_ops_cpu_migrations = {
2682 .enable = cpu_migrations_perf_counter_enable,
2683 .disable = cpu_migrations_perf_counter_disable,
2684 .read = cpu_migrations_perf_counter_read,
2685};
2686
2687#ifdef CONFIG_EVENT_PROFILE
2688void perf_tpcounter_event(int event_id)
2689{
2690 struct pt_regs *regs = get_irq_regs();
2691
2692 if (!regs)
2693 regs = task_pt_regs(current);
2694
2695 __perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs, 0);
2696}
2697EXPORT_SYMBOL_GPL(perf_tpcounter_event);
2698
2699extern int ftrace_profile_enable(int);
2700extern void ftrace_profile_disable(int);
2701
2702static void tp_perf_counter_destroy(struct perf_counter *counter)
2703{
2704 ftrace_profile_disable(perf_event_id(&counter->hw_event));
2705}
2706
2707static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
2708{
2709 int event_id = perf_event_id(&counter->hw_event);
2710 int ret;
2711
2712 ret = ftrace_profile_enable(event_id);
2713 if (ret)
2714 return NULL;
2715
2716 counter->destroy = tp_perf_counter_destroy;
2717 counter->hw.irq_period = counter->hw_event.irq_period;
2718
2719 return &perf_ops_generic;
2720}
2721#else
2722static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
2723{
2724 return NULL;
2725}
2726#endif
2727
2728static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
2729{
2730 struct perf_counter_hw_event *hw_event = &counter->hw_event;
2731 const struct pmu *pmu = NULL;
2732 struct hw_perf_counter *hwc = &counter->hw;
2733
2734 /*
2735 * Software counters (currently) can't in general distinguish
2736 * between user, kernel and hypervisor events.
2737 * However, context switches and cpu migrations are considered
2738 * to be kernel events, and page faults are never hypervisor
2739 * events.
2740 */
2741 switch (perf_event_id(&counter->hw_event)) {
2742 case PERF_COUNT_CPU_CLOCK:
2743 pmu = &perf_ops_cpu_clock;
2744
2745 if (hw_event->irq_period && hw_event->irq_period < 10000)
2746 hw_event->irq_period = 10000;
2747 break;
2748 case PERF_COUNT_TASK_CLOCK:
2749 /*
2750 * If the user instantiates this as a per-cpu counter,
2751 * use the cpu_clock counter instead.
2752 */
2753 if (counter->ctx->task)
2754 pmu = &perf_ops_task_clock;
2755 else
2756 pmu = &perf_ops_cpu_clock;
2757
2758 if (hw_event->irq_period && hw_event->irq_period < 10000)
2759 hw_event->irq_period = 10000;
2760 break;
2761 case PERF_COUNT_PAGE_FAULTS:
2762 case PERF_COUNT_PAGE_FAULTS_MIN:
2763 case PERF_COUNT_PAGE_FAULTS_MAJ:
2764 case PERF_COUNT_CONTEXT_SWITCHES:
2765 pmu = &perf_ops_generic;
2766 break;
2767 case PERF_COUNT_CPU_MIGRATIONS:
2768 if (!counter->hw_event.exclude_kernel)
2769 pmu = &perf_ops_cpu_migrations;
2770 break;
2771 }
2772
2773 if (pmu)
2774 hwc->irq_period = hw_event->irq_period;
2775
2776 return pmu;
2777}
2778
2779/*
2780 * Allocate and initialize a counter structure
2781 */
2782static struct perf_counter *
2783perf_counter_alloc(struct perf_counter_hw_event *hw_event,
2784 int cpu,
2785 struct perf_counter_context *ctx,
2786 struct perf_counter *group_leader,
2787 gfp_t gfpflags)
2788{
2789 const struct pmu *pmu;
2790 struct perf_counter *counter;
2791 long err;
2792
2793 counter = kzalloc(sizeof(*counter), gfpflags);
2794 if (!counter)
2795 return ERR_PTR(-ENOMEM);
2796
2797 /*
2798 * Single counters are their own group leaders, with an
2799 * empty sibling list:
2800 */
2801 if (!group_leader)
2802 group_leader = counter;
2803
2804 mutex_init(&counter->mutex);
2805 INIT_LIST_HEAD(&counter->list_entry);
2806 INIT_LIST_HEAD(&counter->event_entry);
2807 INIT_LIST_HEAD(&counter->sibling_list);
2808 init_waitqueue_head(&counter->waitq);
2809
2810 mutex_init(&counter->mmap_mutex);
2811
2812 INIT_LIST_HEAD(&counter->child_list);
2813
2814 counter->cpu = cpu;
2815 counter->hw_event = *hw_event;
2816 counter->group_leader = group_leader;
2817 counter->pmu = NULL;
2818 counter->ctx = ctx;
2819
2820 counter->state = PERF_COUNTER_STATE_INACTIVE;
2821 if (hw_event->disabled)
2822 counter->state = PERF_COUNTER_STATE_OFF;
2823
2824 pmu = NULL;
2825
2826 /*
2827 * we currently do not support PERF_RECORD_GROUP on inherited counters
2828 */
2829 if (hw_event->inherit && (hw_event->record_type & PERF_RECORD_GROUP))
2830 goto done;
2831
2832 if (perf_event_raw(hw_event)) {
2833 pmu = hw_perf_counter_init(counter);
2834 goto done;
2835 }
2836
2837 switch (perf_event_type(hw_event)) {
2838 case PERF_TYPE_HARDWARE:
2839 pmu = hw_perf_counter_init(counter);
2840 break;
2841
2842 case PERF_TYPE_SOFTWARE:
2843 pmu = sw_perf_counter_init(counter);
2844 break;
2845
2846 case PERF_TYPE_TRACEPOINT:
2847 pmu = tp_perf_counter_init(counter);
2848 break;
2849 }
2850done:
2851 err = 0;
2852 if (!pmu)
2853 err = -EINVAL;
2854 else if (IS_ERR(pmu))
2855 err = PTR_ERR(pmu);
2856
2857 if (err) {
2858 kfree(counter);
2859 return ERR_PTR(err);
2860 }
2861
2862 counter->pmu = pmu;
2863
2864 if (counter->hw_event.mmap)
2865 atomic_inc(&nr_mmap_tracking);
2866 if (counter->hw_event.munmap)
2867 atomic_inc(&nr_munmap_tracking);
2868 if (counter->hw_event.comm)
2869 atomic_inc(&nr_comm_tracking);
2870
2871 return counter;
2872}
2873
2874/**
2875 * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
2876 *
2877 * @hw_event_uptr: event type attributes for monitoring/sampling
2878 * @pid: target pid
2879 * @cpu: target cpu
2880 * @group_fd: group leader counter fd
2881 */
2882SYSCALL_DEFINE5(perf_counter_open,
2883 const struct perf_counter_hw_event __user *, hw_event_uptr,
2884 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
2885{
2886 struct perf_counter *counter, *group_leader;
2887 struct perf_counter_hw_event hw_event;
2888 struct perf_counter_context *ctx;
2889 struct file *counter_file = NULL;
2890 struct file *group_file = NULL;
2891 int fput_needed = 0;
2892 int fput_needed2 = 0;
2893 int ret;
2894
2895 /* for future expandability... */
2896 if (flags)
2897 return -EINVAL;
2898
2899 if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
2900 return -EFAULT;
2901
2902 /*
2903 * Get the target context (task or percpu):
2904 */
2905 ctx = find_get_context(pid, cpu);
2906 if (IS_ERR(ctx))
2907 return PTR_ERR(ctx);
2908
2909 /*
2910 * Look up the group leader (we will attach this counter to it):
2911 */
2912 group_leader = NULL;
2913 if (group_fd != -1) {
2914 ret = -EINVAL;
2915 group_file = fget_light(group_fd, &fput_needed);
2916 if (!group_file)
2917 goto err_put_context;
2918 if (group_file->f_op != &perf_fops)
2919 goto err_put_context;
2920
2921 group_leader = group_file->private_data;
2922 /*
2923 * Do not allow a recursive hierarchy (this new sibling
2924 * becoming part of another group-sibling):
2925 */
2926 if (group_leader->group_leader != group_leader)
2927 goto err_put_context;
2928 /*
2929 * Do not allow attaching to a group in a different
2930 * task or CPU context:
2931 */
2932 if (group_leader->ctx != ctx)
2933 goto err_put_context;
2934 /*
2935 * Only a group leader can be exclusive or pinned
2936 */
2937 if (hw_event.exclusive || hw_event.pinned)
2938 goto err_put_context;
2939 }
2940
2941 counter = perf_counter_alloc(&hw_event, cpu, ctx, group_leader,
2942 GFP_KERNEL);
2943 ret = PTR_ERR(counter);
2944 if (IS_ERR(counter))
2945 goto err_put_context;
2946
2947 ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
2948 if (ret < 0)
2949 goto err_free_put_context;
2950
2951 counter_file = fget_light(ret, &fput_needed2);
2952 if (!counter_file)
2953 goto err_free_put_context;
2954
2955 counter->filp = counter_file;
2956 mutex_lock(&ctx->mutex);
2957 perf_install_in_context(ctx, counter, cpu);
2958 mutex_unlock(&ctx->mutex);
2959
2960 fput_light(counter_file, fput_needed2);
2961
2962out_fput:
2963 fput_light(group_file, fput_needed);
2964
2965 return ret;
2966
2967err_free_put_context:
2968 kfree(counter);
2969
2970err_put_context:
2971 put_context(ctx);
2972
2973 goto out_fput;
2974}
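An illustrative aside, not from this patch: a minimal user-space sketch of calling this syscall to count task time for one task and reading the result back. The struct, the PERF_TYPE_SOFTWARE and PERF_COUNT_TASK_CLOCK constants, and __NR_perf_counter_open come from headers added elsewhere in this patch; perf_event_config() is a hypothetical helper standing in for however that header packs the event type and id into hw_event.config:

#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/syscall.h>

/* perf_event_config() below is hypothetical; the real packing of type and
 * id into hw_event.config is defined by the header added in this patch. */

static int open_task_clock_counter(pid_t pid)
{
	struct perf_counter_hw_event hw_event;

	memset(&hw_event, 0, sizeof(hw_event));
	hw_event.config   = perf_event_config(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK);
	hw_event.disabled = 0;

	/* args: (hw_event, pid, cpu = -1 for per-task, group_fd = -1, flags = 0) */
	return syscall(__NR_perf_counter_open, &hw_event, pid, -1, -1, 0);
}

static uint64_t read_counter(int fd)
{
	uint64_t value = 0;

	/* with no extra read-format bits set, the basic read is a single u64 */
	read(fd, &value, sizeof(value));
	return value;
}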
2975
2976/*
2977 * Initialize the perf_counter context in a task_struct:
2978 */
2979static void
2980__perf_counter_init_context(struct perf_counter_context *ctx,
2981 struct task_struct *task)
2982{
2983 memset(ctx, 0, sizeof(*ctx));
2984 spin_lock_init(&ctx->lock);
2985 mutex_init(&ctx->mutex);
2986 INIT_LIST_HEAD(&ctx->counter_list);
2987 INIT_LIST_HEAD(&ctx->event_list);
2988 ctx->task = task;
2989}
2990
2991/*
2992 * inherit a counter from parent task to child task:
2993 */
2994static struct perf_counter *
2995inherit_counter(struct perf_counter *parent_counter,
2996 struct task_struct *parent,
2997 struct perf_counter_context *parent_ctx,
2998 struct task_struct *child,
2999 struct perf_counter *group_leader,
3000 struct perf_counter_context *child_ctx)
3001{
3002 struct perf_counter *child_counter;
3003
3004 /*
3005 * Instead of creating recursive hierarchies of counters,
3006 * we link inherited counters back to the original parent,
3007 * which is guaranteed to have a filp that we use as the
3008 * reference count:
3009 */
3010 if (parent_counter->parent)
3011 parent_counter = parent_counter->parent;
3012
3013 child_counter = perf_counter_alloc(&parent_counter->hw_event,
3014 parent_counter->cpu, child_ctx,
3015 group_leader, GFP_KERNEL);
3016 if (IS_ERR(child_counter))
3017 return child_counter;
3018
3019 /*
3020 * Link it up in the child's context:
3021 */
3022 child_counter->task = child;
3023 add_counter_to_ctx(child_counter, child_ctx);
3024
3025 child_counter->parent = parent_counter;
3026 /*
3027 * inherit into child's child as well:
3028 */
3029 child_counter->hw_event.inherit = 1;
3030
3031 /*
3032 * Get a reference to the parent filp - we will fput it
3033 * when the child counter exits. This is safe to do because
3034 * we are in the parent and we know that the filp still
3035 * exists and has a nonzero count:
3036 */
3037 atomic_long_inc(&parent_counter->filp->f_count);
3038
3039 /*
3040 * Link this into the parent counter's child list
3041 */
3042 mutex_lock(&parent_counter->mutex);
3043 list_add_tail(&child_counter->child_list, &parent_counter->child_list);
3044
3045 /*
3046 * Make the child state follow the state of the parent counter,
3047 * not its hw_event.disabled bit. We hold the parent's mutex,
3048 * so we won't race with perf_counter_{en,dis}able_family.
3049 */
3050 if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
3051 child_counter->state = PERF_COUNTER_STATE_INACTIVE;
3052 else
3053 child_counter->state = PERF_COUNTER_STATE_OFF;
3054
3055 mutex_unlock(&parent_counter->mutex);
3056
3057 return child_counter;
3058}
3059
3060static int inherit_group(struct perf_counter *parent_counter,
3061 struct task_struct *parent,
3062 struct perf_counter_context *parent_ctx,
3063 struct task_struct *child,
3064 struct perf_counter_context *child_ctx)
3065{
3066 struct perf_counter *leader;
3067 struct perf_counter *sub;
3068 struct perf_counter *child_ctr;
3069
3070 leader = inherit_counter(parent_counter, parent, parent_ctx,
3071 child, NULL, child_ctx);
3072 if (IS_ERR(leader))
3073 return PTR_ERR(leader);
3074 list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
3075 child_ctr = inherit_counter(sub, parent, parent_ctx,
3076 child, leader, child_ctx);
3077 if (IS_ERR(child_ctr))
3078 return PTR_ERR(child_ctr);
3079 }
3080 return 0;
3081}
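An illustrative aside, not from this patch: inherit_counter()/inherit_group(), together with sync_child_counter() just below, are what make hw_event.inherit work. A counter opened with .inherit = 1 before fork() is cloned into each child in perf_counter_init_task(), and when a child exits its counts are added back into the parent counter, so one fd ends up covering the whole process tree. A hedged sketch of that usage, reusing the headers and the hypothetical perf_event_config() helper from the sys_perf_counter_open sketch above, plus <sys/wait.h>; run_child_workload() is likewise hypothetical:

static void measure_process_tree(void)
{
	struct perf_counter_hw_event hw_event;
	uint64_t total;
	int fd, status;

	memset(&hw_event, 0, sizeof(hw_event));
	hw_event.config  = perf_event_config(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK);
	hw_event.inherit = 1;		/* cloned into children at fork() */

	/* pid 0: attach to the calling task; cpu -1: follow it on any cpu */
	fd = syscall(__NR_perf_counter_open, &hw_event, 0, -1, -1, 0);

	if (fork() == 0) {
		run_child_workload();	/* hypothetical child work */
		_exit(0);
	}
	wait(&status);

	/* includes the exited child's counts, folded back by sync_child_counter() */
	read(fd, &total, sizeof(total));
}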
3082
3083static void sync_child_counter(struct perf_counter *child_counter,
3084 struct perf_counter *parent_counter)
3085{
3086 u64 parent_val, child_val;
3087
3088 parent_val = atomic64_read(&parent_counter->count);
3089 child_val = atomic64_read(&child_counter->count);
3090
3091 /*
3092 * Add back the child's count to the parent's count:
3093 */
3094 atomic64_add(child_val, &parent_counter->count);
3095 atomic64_add(child_counter->total_time_enabled,
3096 &parent_counter->child_total_time_enabled);
3097 atomic64_add(child_counter->total_time_running,
3098 &parent_counter->child_total_time_running);
3099
3100 /*
3101 * Remove this counter from the parent's list
3102 */
3103 mutex_lock(&parent_counter->mutex);
3104 list_del_init(&child_counter->child_list);
3105 mutex_unlock(&parent_counter->mutex);
3106
3107 /*
3108 * Release the parent counter, if this was the last
3109 * reference to it.
3110 */
3111 fput(parent_counter->filp);
3112}
3113
3114static void
3115__perf_counter_exit_task(struct task_struct *child,
3116 struct perf_counter *child_counter,
3117 struct perf_counter_context *child_ctx)
3118{
3119 struct perf_counter *parent_counter;
3120 struct perf_counter *sub, *tmp;
3121
3122 /*
3123 * If we do not self-reap then we have to wait for the
3124 * child task to unschedule (it will happen for sure),
3125 * so that its counter is at its final count. (This
3126 * condition triggers rarely - child tasks usually get
3127 * off their CPU before the parent has a chance to
3128 * get this far into the reaping action)
3129 */
3130 if (child != current) {
3131 wait_task_inactive(child, 0);
3132 list_del_init(&child_counter->list_entry);
3133 update_counter_times(child_counter);
3134 } else {
3135 struct perf_cpu_context *cpuctx;
3136 unsigned long flags;
3137 u64 perf_flags;
3138
3139 /*
3140 * Disable and unlink this counter.
3141 *
3142 * Be careful about zapping the list - IRQ/NMI context
3143 * could still be processing it:
3144 */
3145 local_irq_save(flags);
3146 perf_flags = hw_perf_save_disable();
3147
3148 cpuctx = &__get_cpu_var(perf_cpu_context);
3149
3150 group_sched_out(child_counter, cpuctx, child_ctx);
3151 update_counter_times(child_counter);
3152
3153 list_del_init(&child_counter->list_entry);
3154
3155 child_ctx->nr_counters--;
3156
3157 hw_perf_restore(perf_flags);
3158 local_irq_restore(flags);
3159 }
3160
3161 parent_counter = child_counter->parent;
3162 /*
3163 * It can happen that parent exits first, and has counters
3164 * that are still around due to the child reference. These
3165 * counters need to be zapped here; otherwise they would linger.
3166 */
3167 if (parent_counter) {
3168 sync_child_counter(child_counter, parent_counter);
3169 list_for_each_entry_safe(sub, tmp, &child_counter->sibling_list,
3170 list_entry) {
3171 if (sub->parent) {
3172 sync_child_counter(sub, sub->parent);
3173 free_counter(sub);
3174 }
3175 }
3176 free_counter(child_counter);
3177 }
3178}
3179
3180/*
3181 * When a child task exits, feed back counter values to parent counters.
3182 *
3183 * Note: we may be running in child context, but the PID is not hashed
3184 * anymore so new counters will not be added.
3185 */
3186void perf_counter_exit_task(struct task_struct *child)
3187{
3188 struct perf_counter *child_counter, *tmp;
3189 struct perf_counter_context *child_ctx;
3190
3191 child_ctx = &child->perf_counter_ctx;
3192
3193 if (likely(!child_ctx->nr_counters))
3194 return;
3195
3196 list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
3197 list_entry)
3198 __perf_counter_exit_task(child, child_counter, child_ctx);
3199}
3200
3201/*
3202 * Initialize the perf_counter context in task_struct
3203 */
3204void perf_counter_init_task(struct task_struct *child)
3205{
3206 struct perf_counter_context *child_ctx, *parent_ctx;
3207 struct perf_counter *counter;
3208 struct task_struct *parent = current;
3209
3210 child_ctx = &child->perf_counter_ctx;
3211 parent_ctx = &parent->perf_counter_ctx;
3212
3213 __perf_counter_init_context(child_ctx, child);
3214
3215 /*
3216 * This is executed from the parent task context, so inherit
3217 * counters that have been marked for cloning:
3218 */
3219
3220 if (likely(!parent_ctx->nr_counters))
3221 return;
3222
3223 /*
3224 * Lock the parent list. No need to lock the child - not PID
3225 * hashed yet and not running, so nobody can access it.
3226 */
3227 mutex_lock(&parent_ctx->mutex);
3228
3229 /*
3230 * We don't have to disable NMIs - we are only looking at
3231 * the list, not manipulating it:
3232 */
3233 list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) {
3234 if (!counter->hw_event.inherit)
3235 continue;
3236
3237 if (inherit_group(counter, parent,
3238 parent_ctx, child, child_ctx))
3239 break;
3240 }
3241
3242 mutex_unlock(&parent_ctx->mutex);
3243}
3244
3245static void __cpuinit perf_counter_init_cpu(int cpu)
3246{
3247 struct perf_cpu_context *cpuctx;
3248
3249 cpuctx = &per_cpu(perf_cpu_context, cpu);
3250 __perf_counter_init_context(&cpuctx->ctx, NULL);
3251
3252 spin_lock(&perf_resource_lock);
3253 cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
3254 spin_unlock(&perf_resource_lock);
3255
3256 hw_perf_counter_setup(cpu);
3257}
3258
3259#ifdef CONFIG_HOTPLUG_CPU
3260static void __perf_counter_exit_cpu(void *info)
3261{
3262 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
3263 struct perf_counter_context *ctx = &cpuctx->ctx;
3264 struct perf_counter *counter, *tmp;
3265
3266 list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
3267 __perf_counter_remove_from_context(counter);
3268}
3269static void perf_counter_exit_cpu(int cpu)
3270{
3271 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
3272 struct perf_counter_context *ctx = &cpuctx->ctx;
3273
3274 mutex_lock(&ctx->mutex);
3275 smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
3276 mutex_unlock(&ctx->mutex);
3277}
3278#else
3279static inline void perf_counter_exit_cpu(int cpu) { }
3280#endif
3281
3282static int __cpuinit
3283perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
3284{
3285 unsigned int cpu = (long)hcpu;
3286
3287 switch (action) {
3288
3289 case CPU_UP_PREPARE:
3290 case CPU_UP_PREPARE_FROZEN:
3291 perf_counter_init_cpu(cpu);
3292 break;
3293
3294 case CPU_DOWN_PREPARE:
3295 case CPU_DOWN_PREPARE_FROZEN:
3296 perf_counter_exit_cpu(cpu);
3297 break;
3298
3299 default:
3300 break;
3301 }
3302
3303 return NOTIFY_OK;
3304}
3305
3306static struct notifier_block __cpuinitdata perf_cpu_nb = {
3307 .notifier_call = perf_cpu_notify,
3308};
3309
3310void __init perf_counter_init(void)
3311{
3312 perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
3313 (void *)(long)smp_processor_id());
3314 register_cpu_notifier(&perf_cpu_nb);
3315}
3316
3317static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
3318{
3319 return sprintf(buf, "%d\n", perf_reserved_percpu);
3320}
3321
3322static ssize_t
3323perf_set_reserve_percpu(struct sysdev_class *class,
3324 const char *buf,
3325 size_t count)
3326{
3327 struct perf_cpu_context *cpuctx;
3328 unsigned long val;
3329 int err, cpu, mpt;
3330
3331 err = strict_strtoul(buf, 10, &val);
3332 if (err)
3333 return err;
3334 if (val > perf_max_counters)
3335 return -EINVAL;
3336
3337 spin_lock(&perf_resource_lock);
3338 perf_reserved_percpu = val;
3339 for_each_online_cpu(cpu) {
3340 cpuctx = &per_cpu(perf_cpu_context, cpu);
3341 spin_lock_irq(&cpuctx->ctx.lock);
3342 mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
3343 perf_max_counters - perf_reserved_percpu);
3344 cpuctx->max_pertask = mpt;
3345 spin_unlock_irq(&cpuctx->ctx.lock);
3346 }
3347 spin_unlock(&perf_resource_lock);
3348
3349 return count;
3350}
3351
3352static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
3353{
3354 return sprintf(buf, "%d\n", perf_overcommit);
3355}
3356
3357static ssize_t
3358perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
3359{
3360 unsigned long val;
3361 int err;
3362
3363 err = strict_strtoul(buf, 10, &val);
3364 if (err)
3365 return err;
3366 if (val > 1)
3367 return -EINVAL;
3368
3369 spin_lock(&perf_resource_lock);
3370 perf_overcommit = val;
3371 spin_unlock(&perf_resource_lock);
3372
3373 return count;
3374}
3375
3376static SYSDEV_CLASS_ATTR(
3377 reserve_percpu,
3378 0644,
3379 perf_show_reserve_percpu,
3380 perf_set_reserve_percpu
3381 );
3382
3383static SYSDEV_CLASS_ATTR(
3384 overcommit,
3385 0644,
3386 perf_show_overcommit,
3387 perf_set_overcommit
3388 );
3389
3390static struct attribute *perfclass_attrs[] = {
3391 &attr_reserve_percpu.attr,
3392 &attr_overcommit.attr,
3393 NULL
3394};
3395
3396static struct attribute_group perfclass_attr_group = {
3397 .attrs = perfclass_attrs,
3398 .name = "perf_counters",
3399};
3400
3401static int __init perf_counter_sysfs_init(void)
3402{
3403 return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
3404 &perfclass_attr_group);
3405}
3406device_initcall(perf_counter_sysfs_init);
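An illustrative aside, not from this patch: the two class attributes registered above expose perf_reserved_percpu and perf_overcommit as writable files under the cpu sysdev class; with the group named "perf_counters" they would typically show up as /sys/devices/system/cpu/perf_counters/{reserve_percpu,overcommit}, though the exact path is an assumption, not something this patch spells out. A minimal sketch of adjusting the per-cpu reservation from user space:

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static void set_perf_reserve_percpu(const char *val)
{
	/* path assumed from the cpu sysdev class plus the "perf_counters" group name */
	int fd = open("/sys/devices/system/cpu/perf_counters/reserve_percpu", O_WRONLY);

	if (fd >= 0) {
		write(fd, val, strlen(val));
		close(fd);
	}
}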
diff --git a/kernel/sched.c b/kernel/sched.c
index b902e587a3a0..a728976a3a6c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -39,6 +39,7 @@
39#include <linux/completion.h> 39#include <linux/completion.h>
40#include <linux/kernel_stat.h> 40#include <linux/kernel_stat.h>
41#include <linux/debug_locks.h> 41#include <linux/debug_locks.h>
42#include <linux/perf_counter.h>
42#include <linux/security.h> 43#include <linux/security.h>
43#include <linux/notifier.h> 44#include <linux/notifier.h>
44#include <linux/profile.h> 45#include <linux/profile.h>
@@ -584,6 +585,7 @@ struct rq {
584 struct load_weight load; 585 struct load_weight load;
585 unsigned long nr_load_updates; 586 unsigned long nr_load_updates;
586 u64 nr_switches; 587 u64 nr_switches;
588 u64 nr_migrations_in;
587 589
588 struct cfs_rq cfs; 590 struct cfs_rq cfs;
589 struct rt_rq rt; 591 struct rt_rq rt;
@@ -692,7 +694,7 @@ static inline int cpu_of(struct rq *rq)
692#define task_rq(p) cpu_rq(task_cpu(p)) 694#define task_rq(p) cpu_rq(task_cpu(p))
693#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 695#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
694 696
695static inline void update_rq_clock(struct rq *rq) 697inline void update_rq_clock(struct rq *rq)
696{ 698{
697 rq->clock = sched_clock_cpu(cpu_of(rq)); 699 rq->clock = sched_clock_cpu(cpu_of(rq));
698} 700}
@@ -1967,12 +1969,15 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1967 p->se.sleep_start -= clock_offset; 1969 p->se.sleep_start -= clock_offset;
1968 if (p->se.block_start) 1970 if (p->se.block_start)
1969 p->se.block_start -= clock_offset; 1971 p->se.block_start -= clock_offset;
1972#endif
1970 if (old_cpu != new_cpu) { 1973 if (old_cpu != new_cpu) {
1971 schedstat_inc(p, se.nr_migrations); 1974 p->se.nr_migrations++;
1975 new_rq->nr_migrations_in++;
1976#ifdef CONFIG_SCHEDSTATS
1972 if (task_hot(p, old_rq->clock, NULL)) 1977 if (task_hot(p, old_rq->clock, NULL))
1973 schedstat_inc(p, se.nr_forced2_migrations); 1978 schedstat_inc(p, se.nr_forced2_migrations);
1974 }
1975#endif 1979#endif
1980 }
1976 p->se.vruntime -= old_cfsrq->min_vruntime - 1981 p->se.vruntime -= old_cfsrq->min_vruntime -
1977 new_cfsrq->min_vruntime; 1982 new_cfsrq->min_vruntime;
1978 1983
@@ -2324,6 +2329,27 @@ static int sched_balance_self(int cpu, int flag)
2324 2329
2325#endif /* CONFIG_SMP */ 2330#endif /* CONFIG_SMP */
2326 2331
2332/**
2333 * task_oncpu_function_call - call a function on the cpu on which a task runs
2334 * @p: the task to evaluate
2335 * @func: the function to be called
2336 * @info: the function call argument
2337 *
2338 * Calls the function @func when the task is currently running. This might
2339 * be on the current CPU, which just calls the function directly
2340 */
2341void task_oncpu_function_call(struct task_struct *p,
2342 void (*func) (void *info), void *info)
2343{
2344 int cpu;
2345
2346 preempt_disable();
2347 cpu = task_cpu(p);
2348 if (task_curr(p))
2349 smp_call_function_single(cpu, func, info, 1);
2350 preempt_enable();
2351}
2352
2327/*** 2353/***
2328 * try_to_wake_up - wake up a thread 2354 * try_to_wake_up - wake up a thread
2329 * @p: the to-be-woken-up thread 2355 * @p: the to-be-woken-up thread
@@ -2480,6 +2506,7 @@ static void __sched_fork(struct task_struct *p)
2480 p->se.exec_start = 0; 2506 p->se.exec_start = 0;
2481 p->se.sum_exec_runtime = 0; 2507 p->se.sum_exec_runtime = 0;
2482 p->se.prev_sum_exec_runtime = 0; 2508 p->se.prev_sum_exec_runtime = 0;
2509 p->se.nr_migrations = 0;
2483 p->se.last_wakeup = 0; 2510 p->se.last_wakeup = 0;
2484 p->se.avg_overlap = 0; 2511 p->se.avg_overlap = 0;
2485 p->se.start_runtime = 0; 2512 p->se.start_runtime = 0;
@@ -2710,6 +2737,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2710 */ 2737 */
2711 prev_state = prev->state; 2738 prev_state = prev->state;
2712 finish_arch_switch(prev); 2739 finish_arch_switch(prev);
2740 perf_counter_task_sched_in(current, cpu_of(rq));
2713 finish_lock_switch(rq, prev); 2741 finish_lock_switch(rq, prev);
2714#ifdef CONFIG_SMP 2742#ifdef CONFIG_SMP
2715 if (post_schedule) 2743 if (post_schedule)
@@ -2872,6 +2900,15 @@ unsigned long nr_active(void)
2872} 2900}
2873 2901
2874/* 2902/*
2903 * Externally visible per-cpu scheduler statistics:
2904 * cpu_nr_migrations(cpu) - number of migrations into that cpu
2905 */
2906u64 cpu_nr_migrations(int cpu)
2907{
2908 return cpu_rq(cpu)->nr_migrations_in;
2909}
2910
2911/*
2875 * Update rq->cpu_load[] statistics. This function is usually called every 2912 * Update rq->cpu_load[] statistics. This function is usually called every
2876 * scheduler tick (TICK_NSEC). 2913 * scheduler tick (TICK_NSEC).
2877 */ 2914 */
@@ -4838,6 +4875,7 @@ void scheduler_tick(void)
4838 update_rq_clock(rq); 4875 update_rq_clock(rq);
4839 update_cpu_load(rq); 4876 update_cpu_load(rq);
4840 curr->sched_class->task_tick(rq, curr, 0); 4877 curr->sched_class->task_tick(rq, curr, 0);
4878 perf_counter_task_tick(curr, cpu);
4841 spin_unlock(&rq->lock); 4879 spin_unlock(&rq->lock);
4842 4880
4843#ifdef CONFIG_SMP 4881#ifdef CONFIG_SMP
@@ -5053,6 +5091,7 @@ need_resched_nonpreemptible:
5053 5091
5054 if (likely(prev != next)) { 5092 if (likely(prev != next)) {
5055 sched_info_switch(prev, next); 5093 sched_info_switch(prev, next);
5094 perf_counter_task_sched_out(prev, cpu);
5056 5095
5057 rq->nr_switches++; 5096 rq->nr_switches++;
5058 rq->curr = next; 5097 rq->curr = next;
@@ -8958,7 +8997,7 @@ void __init sched_init(void)
8958 * 1024) and two child groups A0 and A1 (of weight 1024 each), 8997 * 1024) and two child groups A0 and A1 (of weight 1024 each),
8959 * then A0's share of the cpu resource is: 8998 * then A0's share of the cpu resource is:
8960 * 8999 *
8961 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% 9000 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
8962 * 9001 *
8963 * We achieve this by letting init_task_group's tasks sit 9002 * We achieve this by letting init_task_group's tasks sit
8964 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 9003 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
@@ -9059,6 +9098,8 @@ void __init sched_init(void)
9059 alloc_bootmem_cpumask_var(&cpu_isolated_map); 9098 alloc_bootmem_cpumask_var(&cpu_isolated_map);
9060#endif /* SMP */ 9099#endif /* SMP */
9061 9100
9101 perf_counter_init();
9102
9062 scheduler_running = 1; 9103 scheduler_running = 1;
9063} 9104}
9064 9105
diff --git a/kernel/signal.c b/kernel/signal.c
index d8034737db4c..f79b3b9f8375 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2278,24 +2278,17 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
2278 return kill_something_info(sig, &info, pid); 2278 return kill_something_info(sig, &info, pid);
2279} 2279}
2280 2280
2281static int do_tkill(pid_t tgid, pid_t pid, int sig) 2281static int
2282do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
2282{ 2283{
2283 int error;
2284 struct siginfo info;
2285 struct task_struct *p; 2284 struct task_struct *p;
2286 unsigned long flags; 2285 unsigned long flags;
2287 2286 int error = -ESRCH;
2288 error = -ESRCH;
2289 info.si_signo = sig;
2290 info.si_errno = 0;
2291 info.si_code = SI_TKILL;
2292 info.si_pid = task_tgid_vnr(current);
2293 info.si_uid = current_uid();
2294 2287
2295 rcu_read_lock(); 2288 rcu_read_lock();
2296 p = find_task_by_vpid(pid); 2289 p = find_task_by_vpid(pid);
2297 if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) { 2290 if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) {
2298 error = check_kill_permission(sig, &info, p); 2291 error = check_kill_permission(sig, info, p);
2299 /* 2292 /*
2300 * The null signal is a permissions and process existence 2293 * The null signal is a permissions and process existence
2301 * probe. No signal is actually delivered. 2294 * probe. No signal is actually delivered.
@@ -2305,7 +2298,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig)
2305 * signal is private anyway. 2298 * signal is private anyway.
2306 */ 2299 */
2307 if (!error && sig && lock_task_sighand(p, &flags)) { 2300 if (!error && sig && lock_task_sighand(p, &flags)) {
2308 error = specific_send_sig_info(sig, &info, p); 2301 error = specific_send_sig_info(sig, info, p);
2309 unlock_task_sighand(p, &flags); 2302 unlock_task_sighand(p, &flags);
2310 } 2303 }
2311 } 2304 }
@@ -2314,6 +2307,19 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig)
2314 return error; 2307 return error;
2315} 2308}
2316 2309
2310static int do_tkill(pid_t tgid, pid_t pid, int sig)
2311{
2312 struct siginfo info;
2313
2314 info.si_signo = sig;
2315 info.si_errno = 0;
2316 info.si_code = SI_TKILL;
2317 info.si_pid = task_tgid_vnr(current);
2318 info.si_uid = current_uid();
2319
2320 return do_send_specific(tgid, pid, sig, &info);
2321}
2322
2317/** 2323/**
2318 * sys_tgkill - send signal to one specific thread 2324 * sys_tgkill - send signal to one specific thread
2319 * @tgid: the thread group ID of the thread 2325 * @tgid: the thread group ID of the thread
@@ -2363,6 +2369,32 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
2363 return kill_proc_info(sig, &info, pid); 2369 return kill_proc_info(sig, &info, pid);
2364} 2370}
2365 2371
2372long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
2373{
2374 /* This is only valid for single tasks */
2375 if (pid <= 0 || tgid <= 0)
2376 return -EINVAL;
2377
2378 /* Not even root can pretend to send signals from the kernel.
2379 Nor can they impersonate a kill(), which adds source info. */
2380 if (info->si_code >= 0)
2381 return -EPERM;
2382 info->si_signo = sig;
2383
2384 return do_send_specific(tgid, pid, sig, info);
2385}
2386
2387SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig,
2388 siginfo_t __user *, uinfo)
2389{
2390 siginfo_t info;
2391
2392 if (copy_from_user(&info, uinfo, sizeof(siginfo_t)))
2393 return -EFAULT;
2394
2395 return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
2396}
2397
2366int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) 2398int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
2367{ 2399{
2368 struct task_struct *t = current; 2400 struct task_struct *t = current;
diff --git a/kernel/sys.c b/kernel/sys.c
index e7998cf31498..438d99a38c87 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -14,6 +14,7 @@
14#include <linux/prctl.h> 14#include <linux/prctl.h>
15#include <linux/highuid.h> 15#include <linux/highuid.h>
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/perf_counter.h>
17#include <linux/resource.h> 18#include <linux/resource.h>
18#include <linux/kernel.h> 19#include <linux/kernel.h>
19#include <linux/kexec.h> 20#include <linux/kexec.h>
@@ -1793,6 +1794,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1793 case PR_SET_TSC: 1794 case PR_SET_TSC:
1794 error = SET_TSC_CTL(arg2); 1795 error = SET_TSC_CTL(arg2);
1795 break; 1796 break;
1797 case PR_TASK_PERF_COUNTERS_DISABLE:
1798 error = perf_counter_task_disable();
1799 break;
1800 case PR_TASK_PERF_COUNTERS_ENABLE:
1801 error = perf_counter_task_enable();
1802 break;
1796 case PR_GET_TIMERSLACK: 1803 case PR_GET_TIMERSLACK:
1797 error = current->timer_slack_ns; 1804 error = current->timer_slack_ns;
1798 break; 1805 break;
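
The two prctl options wired up here let a task disable and re-enable its own performance counters around a code region. A minimal sketch of the call pattern (the PR_TASK_PERF_COUNTERS_* constants come from the <linux/prctl.h> update elsewhere in this series and are assumed to be visible to user space):

/* Sketch: pause per-task counting around an uninstrumented region. */
#include <sys/prctl.h>

static void run_uncounted(void (*fn)(void))
{
	prctl(PR_TASK_PERF_COUNTERS_DISABLE, 0, 0, 0, 0);
	fn();
	prctl(PR_TASK_PERF_COUNTERS_ENABLE, 0, 0, 0, 0);
}
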
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 27dad2967387..68320f6b07b5 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -175,3 +175,6 @@ cond_syscall(compat_sys_timerfd_settime);
175cond_syscall(compat_sys_timerfd_gettime); 175cond_syscall(compat_sys_timerfd_gettime);
176cond_syscall(sys_eventfd); 176cond_syscall(sys_eventfd);
177cond_syscall(sys_eventfd2); 177cond_syscall(sys_eventfd2);
178
179/* performance counters: */
180cond_syscall(sys_perf_counter_open);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e3d2c7dd59b9..3b05c2b088d2 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -49,6 +49,7 @@
49#include <linux/reboot.h> 49#include <linux/reboot.h>
50#include <linux/ftrace.h> 50#include <linux/ftrace.h>
51#include <linux/slow-work.h> 51#include <linux/slow-work.h>
52#include <linux/perf_counter.h>
52 53
53#include <asm/uaccess.h> 54#include <asm/uaccess.h>
54#include <asm/processor.h> 55#include <asm/processor.h>
@@ -910,6 +911,24 @@ static struct ctl_table kern_table[] = {
910 .child = slow_work_sysctls, 911 .child = slow_work_sysctls,
911 }, 912 },
912#endif 913#endif
914#ifdef CONFIG_PERF_COUNTERS
915 {
916 .ctl_name = CTL_UNNUMBERED,
917 .procname = "perf_counter_privileged",
918 .data = &sysctl_perf_counter_priv,
919 .maxlen = sizeof(sysctl_perf_counter_priv),
920 .mode = 0644,
921 .proc_handler = &proc_dointvec,
922 },
923 {
924 .ctl_name = CTL_UNNUMBERED,
925 .procname = "perf_counter_mlock_kb",
926 .data = &sysctl_perf_counter_mlock,
927 .maxlen = sizeof(sysctl_perf_counter_mlock),
928 .mode = 0644,
929 .proc_handler = &proc_dointvec,
930 },
931#endif
913/* 932/*
914 * NOTE: do not add new entries to this table unless you have read 933 * NOTE: do not add new entries to this table unless you have read
915 * Documentation/sysctl/ctl_unnumbered.txt 934 * Documentation/sysctl/ctl_unnumbered.txt
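
Because both entries are added to kern_table, they surface as /proc/sys/kernel/perf_counter_privileged and /proc/sys/kernel/perf_counter_mlock_kb when CONFIG_PERF_COUNTERS is enabled. A small sketch of writing the first knob from user space (the path is inferred from the .procname fields above; the knob's exact semantics are defined in the perf_counter core, not in this hunk):

/* Sketch: set the perf_counter_privileged sysctl via procfs. */
#include <stdio.h>

static int set_perf_counter_priv(int val)
{
	FILE *f = fopen("/proc/sys/kernel/perf_counter_privileged", "w");

	if (!f)
		return -1;
	fprintf(f, "%d\n", val);
	return fclose(f);
}
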
diff --git a/kernel/timer.c b/kernel/timer.c
index cffffad01c31..fed53be44fd9 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -37,6 +37,7 @@
37#include <linux/delay.h> 37#include <linux/delay.h>
38#include <linux/tick.h> 38#include <linux/tick.h>
39#include <linux/kallsyms.h> 39#include <linux/kallsyms.h>
40#include <linux/perf_counter.h>
40 41
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
42#include <asm/unistd.h> 43#include <asm/unistd.h>
@@ -1170,6 +1171,8 @@ static void run_timer_softirq(struct softirq_action *h)
1170{ 1171{
1171 struct tvec_base *base = __get_cpu_var(tvec_bases); 1172 struct tvec_base *base = __get_cpu_var(tvec_bases);
1172 1173
1174 perf_counter_do_pending();
1175
1173 hrtimer_run_pending(); 1176 hrtimer_run_pending();
1174 1177
1175 if (time_after_eq(jiffies, base->timer_jiffies)) 1178 if (time_after_eq(jiffies, base->timer_jiffies))
diff --git a/mm/mmap.c b/mm/mmap.c
index 3303d1ba8e87..8a49df4c7363 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -28,6 +28,7 @@
28#include <linux/mempolicy.h> 28#include <linux/mempolicy.h>
29#include <linux/rmap.h> 29#include <linux/rmap.h>
30#include <linux/mmu_notifier.h> 30#include <linux/mmu_notifier.h>
31#include <linux/perf_counter.h>
31 32
32#include <asm/uaccess.h> 33#include <asm/uaccess.h>
33#include <asm/cacheflush.h> 34#include <asm/cacheflush.h>
@@ -1223,6 +1224,9 @@ munmap_back:
1223 if (correct_wcount) 1224 if (correct_wcount)
1224 atomic_inc(&inode->i_writecount); 1225 atomic_inc(&inode->i_writecount);
1225out: 1226out:
1227 if (vm_flags & VM_EXEC)
1228 perf_counter_mmap(addr, len, pgoff, file);
1229
1226 mm->total_vm += len >> PAGE_SHIFT; 1230 mm->total_vm += len >> PAGE_SHIFT;
1227 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); 1231 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
1228 if (vm_flags & VM_LOCKED) { 1232 if (vm_flags & VM_LOCKED) {
@@ -1756,6 +1760,12 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
1756 do { 1760 do {
1757 long nrpages = vma_pages(vma); 1761 long nrpages = vma_pages(vma);
1758 1762
1763 if (vma->vm_flags & VM_EXEC) {
1764 perf_counter_munmap(vma->vm_start,
1765 nrpages << PAGE_SHIFT,
1766 vma->vm_pgoff, vma->vm_file);
1767 }
1768
1759 mm->total_vm -= nrpages; 1769 mm->total_vm -= nrpages;
1760 vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); 1770 vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
1761 vma = remove_vma(vma); 1771 vma = remove_vma(vma);
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 2b7390e377b3..d1e10546eb85 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -492,6 +492,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
492 continue; 492 continue;
493 493
494 dev_change_flags(vlandev, flgs & ~IFF_UP); 494 dev_change_flags(vlandev, flgs & ~IFF_UP);
495 vlan_transfer_operstate(dev, vlandev);
495 } 496 }
496 break; 497 break;
497 498
@@ -507,6 +508,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
507 continue; 508 continue;
508 509
509 dev_change_flags(vlandev, flgs | IFF_UP); 510 dev_change_flags(vlandev, flgs | IFF_UP);
511 vlan_transfer_operstate(dev, vlandev);
510 } 512 }
511 break; 513 break;
512 514
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 6b0921364014..b4b9068e55a7 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -462,6 +462,7 @@ static int vlan_dev_open(struct net_device *dev)
462 if (vlan->flags & VLAN_FLAG_GVRP) 462 if (vlan->flags & VLAN_FLAG_GVRP)
463 vlan_gvrp_request_join(dev); 463 vlan_gvrp_request_join(dev);
464 464
465 netif_carrier_on(dev);
465 return 0; 466 return 0;
466 467
467clear_allmulti: 468clear_allmulti:
@@ -471,6 +472,7 @@ del_unicast:
471 if (compare_ether_addr(dev->dev_addr, real_dev->dev_addr)) 472 if (compare_ether_addr(dev->dev_addr, real_dev->dev_addr))
472 dev_unicast_delete(real_dev, dev->dev_addr, ETH_ALEN); 473 dev_unicast_delete(real_dev, dev->dev_addr, ETH_ALEN);
473out: 474out:
475 netif_carrier_off(dev);
474 return err; 476 return err;
475} 477}
476 478
@@ -492,6 +494,7 @@ static int vlan_dev_stop(struct net_device *dev)
492 if (compare_ether_addr(dev->dev_addr, real_dev->dev_addr)) 494 if (compare_ether_addr(dev->dev_addr, real_dev->dev_addr))
493 dev_unicast_delete(real_dev, dev->dev_addr, dev->addr_len); 495 dev_unicast_delete(real_dev, dev->dev_addr, dev->addr_len);
494 496
497 netif_carrier_off(dev);
495 return 0; 498 return 0;
496} 499}
497 500
@@ -612,6 +615,8 @@ static int vlan_dev_init(struct net_device *dev)
612 struct net_device *real_dev = vlan_dev_info(dev)->real_dev; 615 struct net_device *real_dev = vlan_dev_info(dev)->real_dev;
613 int subclass = 0; 616 int subclass = 0;
614 617
618 netif_carrier_off(dev);
619
615 /* IFF_BROADCAST|IFF_MULTICAST; ??? */ 620 /* IFF_BROADCAST|IFF_MULTICAST; ??? */
616 dev->flags = real_dev->flags & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI); 621 dev->flags = real_dev->flags & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI);
617 dev->iflink = real_dev->ifindex; 622 dev->iflink = real_dev->ifindex;
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 1181db08d9de..375f4b4f7f79 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -215,6 +215,7 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst)
215 conn->state = BT_OPEN; 215 conn->state = BT_OPEN;
216 216
217 conn->power_save = 1; 217 conn->power_save = 1;
218 conn->disc_timeout = HCI_DISCONN_TIMEOUT;
218 219
219 switch (type) { 220 switch (type) {
220 case ACL_LINK: 221 case ACL_LINK:
@@ -424,12 +425,9 @@ int hci_conn_security(struct hci_conn *conn, __u8 sec_level, __u8 auth_type)
424 if (sec_level == BT_SECURITY_SDP) 425 if (sec_level == BT_SECURITY_SDP)
425 return 1; 426 return 1;
426 427
427 if (sec_level == BT_SECURITY_LOW) { 428 if (sec_level == BT_SECURITY_LOW &&
428 if (conn->ssp_mode > 0 && conn->hdev->ssp_mode > 0) 429 (!conn->ssp_mode || !conn->hdev->ssp_mode))
429 return hci_conn_auth(conn, sec_level, auth_type); 430 return 1;
430 else
431 return 1;
432 }
433 431
434 if (conn->link_mode & HCI_LM_ENCRYPT) 432 if (conn->link_mode & HCI_LM_ENCRYPT)
435 return hci_conn_auth(conn, sec_level, auth_type); 433 return hci_conn_auth(conn, sec_level, auth_type);
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 15f40ea8d544..4e7cb88e5da9 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -883,6 +883,7 @@ static inline void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *s
883 if (conn->type == ACL_LINK) { 883 if (conn->type == ACL_LINK) {
884 conn->state = BT_CONFIG; 884 conn->state = BT_CONFIG;
885 hci_conn_hold(conn); 885 hci_conn_hold(conn);
886 conn->disc_timeout = HCI_DISCONN_TIMEOUT;
886 } else 887 } else
887 conn->state = BT_CONNECTED; 888 conn->state = BT_CONNECTED;
888 889
@@ -1063,9 +1064,14 @@ static inline void hci_auth_complete_evt(struct hci_dev *hdev, struct sk_buff *s
1063 hci_proto_connect_cfm(conn, ev->status); 1064 hci_proto_connect_cfm(conn, ev->status);
1064 hci_conn_put(conn); 1065 hci_conn_put(conn);
1065 } 1066 }
1066 } else 1067 } else {
1067 hci_auth_cfm(conn, ev->status); 1068 hci_auth_cfm(conn, ev->status);
1068 1069
1070 hci_conn_hold(conn);
1071 conn->disc_timeout = HCI_DISCONN_TIMEOUT;
1072 hci_conn_put(conn);
1073 }
1074
1069 if (test_bit(HCI_CONN_ENCRYPT_PEND, &conn->pend)) { 1075 if (test_bit(HCI_CONN_ENCRYPT_PEND, &conn->pend)) {
1070 if (!ev->status) { 1076 if (!ev->status) {
1071 struct hci_cp_set_conn_encrypt cp; 1077 struct hci_cp_set_conn_encrypt cp;
@@ -1479,7 +1485,21 @@ static inline void hci_mode_change_evt(struct hci_dev *hdev, struct sk_buff *skb
1479 1485
1480static inline void hci_pin_code_request_evt(struct hci_dev *hdev, struct sk_buff *skb) 1486static inline void hci_pin_code_request_evt(struct hci_dev *hdev, struct sk_buff *skb)
1481{ 1487{
1488 struct hci_ev_pin_code_req *ev = (void *) skb->data;
1489 struct hci_conn *conn;
1490
1482 BT_DBG("%s", hdev->name); 1491 BT_DBG("%s", hdev->name);
1492
1493 hci_dev_lock(hdev);
1494
1495 conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr);
1496 if (conn) {
1497 hci_conn_hold(conn);
1498 conn->disc_timeout = HCI_PAIRING_TIMEOUT;
1499 hci_conn_put(conn);
1500 }
1501
1502 hci_dev_unlock(hdev);
1483} 1503}
1484 1504
1485static inline void hci_link_key_request_evt(struct hci_dev *hdev, struct sk_buff *skb) 1505static inline void hci_link_key_request_evt(struct hci_dev *hdev, struct sk_buff *skb)
@@ -1489,7 +1509,21 @@ static inline void hci_link_key_request_evt(struct hci_dev *hdev, struct sk_buff
1489 1509
1490static inline void hci_link_key_notify_evt(struct hci_dev *hdev, struct sk_buff *skb) 1510static inline void hci_link_key_notify_evt(struct hci_dev *hdev, struct sk_buff *skb)
1491{ 1511{
1512 struct hci_ev_link_key_notify *ev = (void *) skb->data;
1513 struct hci_conn *conn;
1514
1492 BT_DBG("%s", hdev->name); 1515 BT_DBG("%s", hdev->name);
1516
1517 hci_dev_lock(hdev);
1518
1519 conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr);
1520 if (conn) {
1521 hci_conn_hold(conn);
1522 conn->disc_timeout = HCI_DISCONN_TIMEOUT;
1523 hci_conn_put(conn);
1524 }
1525
1526 hci_dev_unlock(hdev);
1493} 1527}
1494 1528
1495static inline void hci_clock_offset_evt(struct hci_dev *hdev, struct sk_buff *skb) 1529static inline void hci_clock_offset_evt(struct hci_dev *hdev, struct sk_buff *skb)
diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c
index ed82796d4a0f..b7c51082ddeb 100644
--- a/net/bluetooth/hci_sysfs.c
+++ b/net/bluetooth/hci_sysfs.c
@@ -9,8 +9,7 @@
9struct class *bt_class = NULL; 9struct class *bt_class = NULL;
10EXPORT_SYMBOL_GPL(bt_class); 10EXPORT_SYMBOL_GPL(bt_class);
11 11
12static struct workqueue_struct *btaddconn; 12static struct workqueue_struct *bluetooth;
13static struct workqueue_struct *btdelconn;
14 13
15static inline char *link_typetostr(int type) 14static inline char *link_typetostr(int type)
16{ 15{
@@ -88,9 +87,10 @@ static struct device_type bt_link = {
88 87
89static void add_conn(struct work_struct *work) 88static void add_conn(struct work_struct *work)
90{ 89{
91 struct hci_conn *conn = container_of(work, struct hci_conn, work); 90 struct hci_conn *conn = container_of(work, struct hci_conn, work_add);
92 91
93 flush_workqueue(btdelconn); 92 /* ensure previous add/del is complete */
93 flush_workqueue(bluetooth);
94 94
95 if (device_add(&conn->dev) < 0) { 95 if (device_add(&conn->dev) < 0) {
96 BT_ERR("Failed to register connection device"); 96 BT_ERR("Failed to register connection device");
@@ -114,9 +114,9 @@ void hci_conn_add_sysfs(struct hci_conn *conn)
114 114
115 device_initialize(&conn->dev); 115 device_initialize(&conn->dev);
116 116
117 INIT_WORK(&conn->work, add_conn); 117 INIT_WORK(&conn->work_add, add_conn);
118 118
119 queue_work(btaddconn, &conn->work); 119 queue_work(bluetooth, &conn->work_add);
120} 120}
121 121
122/* 122/*
@@ -131,9 +131,12 @@ static int __match_tty(struct device *dev, void *data)
131 131
132static void del_conn(struct work_struct *work) 132static void del_conn(struct work_struct *work)
133{ 133{
134 struct hci_conn *conn = container_of(work, struct hci_conn, work); 134 struct hci_conn *conn = container_of(work, struct hci_conn, work_del);
135 struct hci_dev *hdev = conn->hdev; 135 struct hci_dev *hdev = conn->hdev;
136 136
137 /* ensure previous add/del is complete */
138 flush_workqueue(bluetooth);
139
137 while (1) { 140 while (1) {
138 struct device *dev; 141 struct device *dev;
139 142
@@ -156,9 +159,9 @@ void hci_conn_del_sysfs(struct hci_conn *conn)
156 if (!device_is_registered(&conn->dev)) 159 if (!device_is_registered(&conn->dev))
157 return; 160 return;
158 161
159 INIT_WORK(&conn->work, del_conn); 162 INIT_WORK(&conn->work_del, del_conn);
160 163
161 queue_work(btdelconn, &conn->work); 164 queue_work(bluetooth, &conn->work_del);
162} 165}
163 166
164static inline char *host_typetostr(int type) 167static inline char *host_typetostr(int type)
@@ -435,20 +438,13 @@ void hci_unregister_sysfs(struct hci_dev *hdev)
435 438
436int __init bt_sysfs_init(void) 439int __init bt_sysfs_init(void)
437{ 440{
438 btaddconn = create_singlethread_workqueue("btaddconn"); 441 bluetooth = create_singlethread_workqueue("bluetooth");
439 if (!btaddconn) 442 if (!bluetooth)
440 return -ENOMEM;
441
442 btdelconn = create_singlethread_workqueue("btdelconn");
443 if (!btdelconn) {
444 destroy_workqueue(btaddconn);
445 return -ENOMEM; 443 return -ENOMEM;
446 }
447 444
448 bt_class = class_create(THIS_MODULE, "bluetooth"); 445 bt_class = class_create(THIS_MODULE, "bluetooth");
449 if (IS_ERR(bt_class)) { 446 if (IS_ERR(bt_class)) {
450 destroy_workqueue(btdelconn); 447 destroy_workqueue(bluetooth);
451 destroy_workqueue(btaddconn);
452 return PTR_ERR(bt_class); 448 return PTR_ERR(bt_class);
453 } 449 }
454 450
@@ -457,8 +453,7 @@ int __init bt_sysfs_init(void)
457 453
458void bt_sysfs_cleanup(void) 454void bt_sysfs_cleanup(void)
459{ 455{
460 destroy_workqueue(btaddconn); 456 destroy_workqueue(bluetooth);
461 destroy_workqueue(btdelconn);
462 457
463 class_destroy(bt_class); 458 class_destroy(bt_class);
464} 459}
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 3953ac4214c8..e4a418fcb35b 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -788,15 +788,23 @@ static unsigned int br_nf_local_out(unsigned int hook, struct sk_buff *skb,
788 return NF_STOLEN; 788 return NF_STOLEN;
789} 789}
790 790
791#if defined(CONFIG_NF_CONNTRACK_IPV4) || defined(CONFIG_NF_CONNTRACK_IPV4_MODULE)
791static int br_nf_dev_queue_xmit(struct sk_buff *skb) 792static int br_nf_dev_queue_xmit(struct sk_buff *skb)
792{ 793{
793 if (skb->protocol == htons(ETH_P_IP) && 794 if (skb->nfct != NULL &&
795 (skb->protocol == htons(ETH_P_IP) || IS_VLAN_IP(skb)) &&
794 skb->len > skb->dev->mtu && 796 skb->len > skb->dev->mtu &&
795 !skb_is_gso(skb)) 797 !skb_is_gso(skb))
796 return ip_fragment(skb, br_dev_queue_push_xmit); 798 return ip_fragment(skb, br_dev_queue_push_xmit);
797 else 799 else
798 return br_dev_queue_push_xmit(skb); 800 return br_dev_queue_push_xmit(skb);
799} 801}
802#else
803static int br_nf_dev_queue_xmit(struct sk_buff *skb)
804{
805 return br_dev_queue_push_xmit(skb);
806}
807#endif
800 808
801/* PF_BRIDGE/POST_ROUTING ********************************************/ 809/* PF_BRIDGE/POST_ROUTING ********************************************/
802static unsigned int br_nf_post_routing(unsigned int hook, struct sk_buff *skb, 810static unsigned int br_nf_post_routing(unsigned int hook, struct sk_buff *skb,
diff --git a/net/core/datagram.c b/net/core/datagram.c
index d0de644b378d..b01a76abe1d2 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -64,13 +64,25 @@ static inline int connection_based(struct sock *sk)
64 return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM; 64 return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM;
65} 65}
66 66
67static int receiver_wake_function(wait_queue_t *wait, unsigned mode, int sync,
68 void *key)
69{
70 unsigned long bits = (unsigned long)key;
71
72 /*
73 * Avoid a wakeup if event not interesting for us
74 */
75 if (bits && !(bits & (POLLIN | POLLERR)))
76 return 0;
77 return autoremove_wake_function(wait, mode, sync, key);
78}
67/* 79/*
68 * Wait for a packet.. 80 * Wait for a packet..
69 */ 81 */
70static int wait_for_packet(struct sock *sk, int *err, long *timeo_p) 82static int wait_for_packet(struct sock *sk, int *err, long *timeo_p)
71{ 83{
72 int error; 84 int error;
73 DEFINE_WAIT(wait); 85 DEFINE_WAIT_FUNC(wait, receiver_wake_function);
74 86
75 prepare_to_wait_exclusive(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); 87 prepare_to_wait_exclusive(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
76 88
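
The switch from DEFINE_WAIT() to DEFINE_WAIT_FUNC() above installs receiver_wake_function() as the wait entry's wake callback, so keyed wakeups whose poll bits do not include POLLIN or POLLERR no longer wake a blocked datagram receiver. Conceptually the macro sets up something like the following (a sketch assuming the <linux/wait.h> definitions of that era, not a verbatim copy):

/* Sketch: what DEFINE_WAIT_FUNC(wait, receiver_wake_function) sets up. */
wait_queue_t wait = {
	.private	= current,
	.func		= receiver_wake_function,
	.task_list	= LIST_HEAD_INIT(wait.task_list),
};
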
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 5ba533d234db..831fe1879dc0 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -253,9 +253,9 @@ unsigned int arpt_do_table(struct sk_buff *skb,
253 indev = in ? in->name : nulldevname; 253 indev = in ? in->name : nulldevname;
254 outdev = out ? out->name : nulldevname; 254 outdev = out ? out->name : nulldevname;
255 255
256 rcu_read_lock_bh(); 256 xt_info_rdlock_bh();
257 private = rcu_dereference(table->private); 257 private = table->private;
258 table_base = rcu_dereference(private->entries[smp_processor_id()]); 258 table_base = private->entries[smp_processor_id()];
259 259
260 e = get_entry(table_base, private->hook_entry[hook]); 260 e = get_entry(table_base, private->hook_entry[hook]);
261 back = get_entry(table_base, private->underflow[hook]); 261 back = get_entry(table_base, private->underflow[hook]);
@@ -273,6 +273,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
273 273
274 hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) + 274 hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) +
275 (2 * skb->dev->addr_len); 275 (2 * skb->dev->addr_len);
276
276 ADD_COUNTER(e->counters, hdr_len, 1); 277 ADD_COUNTER(e->counters, hdr_len, 1);
277 278
278 t = arpt_get_target(e); 279 t = arpt_get_target(e);
@@ -328,8 +329,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
328 e = (void *)e + e->next_offset; 329 e = (void *)e + e->next_offset;
329 } 330 }
330 } while (!hotdrop); 331 } while (!hotdrop);
331 332 xt_info_rdunlock_bh();
332 rcu_read_unlock_bh();
333 333
334 if (hotdrop) 334 if (hotdrop)
335 return NF_DROP; 335 return NF_DROP;
@@ -711,9 +711,12 @@ static void get_counters(const struct xt_table_info *t,
711 /* Instead of clearing (by a previous call to memset()) 711 /* Instead of clearing (by a previous call to memset())
712 * the counters and using adds, we set the counters 712 * the counters and using adds, we set the counters
713 * with data used by 'current' CPU 713 * with data used by 'current' CPU
714 * We dont care about preemption here. 714 *
715 * Bottom half has to be disabled to prevent deadlock
716 * if new softirq were to run and call ipt_do_table
715 */ 717 */
716 curcpu = raw_smp_processor_id(); 718 local_bh_disable();
719 curcpu = smp_processor_id();
717 720
718 i = 0; 721 i = 0;
719 ARPT_ENTRY_ITERATE(t->entries[curcpu], 722 ARPT_ENTRY_ITERATE(t->entries[curcpu],
@@ -726,73 +729,22 @@ static void get_counters(const struct xt_table_info *t,
726 if (cpu == curcpu) 729 if (cpu == curcpu)
727 continue; 730 continue;
728 i = 0; 731 i = 0;
732 xt_info_wrlock(cpu);
729 ARPT_ENTRY_ITERATE(t->entries[cpu], 733 ARPT_ENTRY_ITERATE(t->entries[cpu],
730 t->size, 734 t->size,
731 add_entry_to_counter, 735 add_entry_to_counter,
732 counters, 736 counters,
733 &i); 737 &i);
738 xt_info_wrunlock(cpu);
734 } 739 }
735}
736
737
738/* We're lazy, and add to the first CPU; overflow works its fey magic
739 * and everything is OK. */
740static int
741add_counter_to_entry(struct arpt_entry *e,
742 const struct xt_counters addme[],
743 unsigned int *i)
744{
745 ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
746
747 (*i)++;
748 return 0;
749}
750
751/* Take values from counters and add them back onto the current cpu */
752static void put_counters(struct xt_table_info *t,
753 const struct xt_counters counters[])
754{
755 unsigned int i, cpu;
756
757 local_bh_disable();
758 cpu = smp_processor_id();
759 i = 0;
760 ARPT_ENTRY_ITERATE(t->entries[cpu],
761 t->size,
762 add_counter_to_entry,
763 counters,
764 &i);
765 local_bh_enable(); 740 local_bh_enable();
766} 741}
767 742
768static inline int
769zero_entry_counter(struct arpt_entry *e, void *arg)
770{
771 e->counters.bcnt = 0;
772 e->counters.pcnt = 0;
773 return 0;
774}
775
776static void
777clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
778{
779 unsigned int cpu;
780 const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
781
782 memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
783 for_each_possible_cpu(cpu) {
784 memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
785 ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
786 zero_entry_counter, NULL);
787 }
788}
789
790static struct xt_counters *alloc_counters(struct xt_table *table) 743static struct xt_counters *alloc_counters(struct xt_table *table)
791{ 744{
792 unsigned int countersize; 745 unsigned int countersize;
793 struct xt_counters *counters; 746 struct xt_counters *counters;
794 struct xt_table_info *private = table->private; 747 struct xt_table_info *private = table->private;
795 struct xt_table_info *info;
796 748
797 /* We need atomic snapshot of counters: rest doesn't change 749 /* We need atomic snapshot of counters: rest doesn't change
798 * (other than comefrom, which userspace doesn't care 750 * (other than comefrom, which userspace doesn't care
@@ -802,30 +754,11 @@ static struct xt_counters *alloc_counters(struct xt_table *table)
802 counters = vmalloc_node(countersize, numa_node_id()); 754 counters = vmalloc_node(countersize, numa_node_id());
803 755
804 if (counters == NULL) 756 if (counters == NULL)
805 goto nomem; 757 return ERR_PTR(-ENOMEM);
806
807 info = xt_alloc_table_info(private->size);
808 if (!info)
809 goto free_counters;
810
811 clone_counters(info, private);
812
813 mutex_lock(&table->lock);
814 xt_table_entry_swap_rcu(private, info);
815 synchronize_net(); /* Wait until smoke has cleared */
816 758
817 get_counters(info, counters); 759 get_counters(private, counters);
818 put_counters(private, counters);
819 mutex_unlock(&table->lock);
820
821 xt_free_table_info(info);
822 760
823 return counters; 761 return counters;
824
825 free_counters:
826 vfree(counters);
827 nomem:
828 return ERR_PTR(-ENOMEM);
829} 762}
830 763
831static int copy_entries_to_user(unsigned int total_size, 764static int copy_entries_to_user(unsigned int total_size,
@@ -1094,8 +1027,9 @@ static int __do_replace(struct net *net, const char *name,
1094 (newinfo->number <= oldinfo->initial_entries)) 1027 (newinfo->number <= oldinfo->initial_entries))
1095 module_put(t->me); 1028 module_put(t->me);
1096 1029
1097 /* Get the old counters. */ 1030 /* Get the old counters, and synchronize with replace */
1098 get_counters(oldinfo, counters); 1031 get_counters(oldinfo, counters);
1032
1099 /* Decrease module usage counts and free resource */ 1033 /* Decrease module usage counts and free resource */
1100 loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; 1034 loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
1101 ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, 1035 ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,
@@ -1165,10 +1099,23 @@ static int do_replace(struct net *net, void __user *user, unsigned int len)
1165 return ret; 1099 return ret;
1166} 1100}
1167 1101
1102/* We're lazy, and add to the first CPU; overflow works its fey magic
1103 * and everything is OK. */
1104static int
1105add_counter_to_entry(struct arpt_entry *e,
1106 const struct xt_counters addme[],
1107 unsigned int *i)
1108{
1109 ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
1110
1111 (*i)++;
1112 return 0;
1113}
1114
1168static int do_add_counters(struct net *net, void __user *user, unsigned int len, 1115static int do_add_counters(struct net *net, void __user *user, unsigned int len,
1169 int compat) 1116 int compat)
1170{ 1117{
1171 unsigned int i; 1118 unsigned int i, curcpu;
1172 struct xt_counters_info tmp; 1119 struct xt_counters_info tmp;
1173 struct xt_counters *paddc; 1120 struct xt_counters *paddc;
1174 unsigned int num_counters; 1121 unsigned int num_counters;
@@ -1224,26 +1171,26 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
1224 goto free; 1171 goto free;
1225 } 1172 }
1226 1173
1227 mutex_lock(&t->lock); 1174 local_bh_disable();
1228 private = t->private; 1175 private = t->private;
1229 if (private->number != num_counters) { 1176 if (private->number != num_counters) {
1230 ret = -EINVAL; 1177 ret = -EINVAL;
1231 goto unlock_up_free; 1178 goto unlock_up_free;
1232 } 1179 }
1233 1180
1234 preempt_disable();
1235 i = 0; 1181 i = 0;
1236 /* Choose the copy that is on our node */ 1182 /* Choose the copy that is on our node */
1237 loc_cpu_entry = private->entries[smp_processor_id()]; 1183 curcpu = smp_processor_id();
1184 loc_cpu_entry = private->entries[curcpu];
1185 xt_info_wrlock(curcpu);
1238 ARPT_ENTRY_ITERATE(loc_cpu_entry, 1186 ARPT_ENTRY_ITERATE(loc_cpu_entry,
1239 private->size, 1187 private->size,
1240 add_counter_to_entry, 1188 add_counter_to_entry,
1241 paddc, 1189 paddc,
1242 &i); 1190 &i);
1243 preempt_enable(); 1191 xt_info_wrunlock(curcpu);
1244 unlock_up_free: 1192 unlock_up_free:
1245 mutex_unlock(&t->lock); 1193 local_bh_enable();
1246
1247 xt_table_unlock(t); 1194 xt_table_unlock(t);
1248 module_put(t->me); 1195 module_put(t->me);
1249 free: 1196 free:
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 810c0b62c7d4..2ec8d7290c40 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -338,10 +338,9 @@ ipt_do_table(struct sk_buff *skb,
338 tgpar.hooknum = hook; 338 tgpar.hooknum = hook;
339 339
340 IP_NF_ASSERT(table->valid_hooks & (1 << hook)); 340 IP_NF_ASSERT(table->valid_hooks & (1 << hook));
341 341 xt_info_rdlock_bh();
342 rcu_read_lock_bh(); 342 private = table->private;
343 private = rcu_dereference(table->private); 343 table_base = private->entries[smp_processor_id()];
344 table_base = rcu_dereference(private->entries[smp_processor_id()]);
345 344
346 e = get_entry(table_base, private->hook_entry[hook]); 345 e = get_entry(table_base, private->hook_entry[hook]);
347 346
@@ -436,8 +435,7 @@ ipt_do_table(struct sk_buff *skb,
436 e = (void *)e + e->next_offset; 435 e = (void *)e + e->next_offset;
437 } 436 }
438 } while (!hotdrop); 437 } while (!hotdrop);
439 438 xt_info_rdunlock_bh();
440 rcu_read_unlock_bh();
441 439
442#ifdef DEBUG_ALLOW_ALL 440#ifdef DEBUG_ALLOW_ALL
443 return NF_ACCEPT; 441 return NF_ACCEPT;
@@ -896,10 +894,13 @@ get_counters(const struct xt_table_info *t,
896 894
897 /* Instead of clearing (by a previous call to memset()) 895 /* Instead of clearing (by a previous call to memset())
898 * the counters and using adds, we set the counters 896 * the counters and using adds, we set the counters
899 * with data used by 'current' CPU 897 * with data used by 'current' CPU.
900 * We dont care about preemption here. 898 *
899 * Bottom half has to be disabled to prevent deadlock
900 * if new softirq were to run and call ipt_do_table
901 */ 901 */
902 curcpu = raw_smp_processor_id(); 902 local_bh_disable();
903 curcpu = smp_processor_id();
903 904
904 i = 0; 905 i = 0;
905 IPT_ENTRY_ITERATE(t->entries[curcpu], 906 IPT_ENTRY_ITERATE(t->entries[curcpu],
@@ -912,74 +913,22 @@ get_counters(const struct xt_table_info *t,
912 if (cpu == curcpu) 913 if (cpu == curcpu)
913 continue; 914 continue;
914 i = 0; 915 i = 0;
916 xt_info_wrlock(cpu);
915 IPT_ENTRY_ITERATE(t->entries[cpu], 917 IPT_ENTRY_ITERATE(t->entries[cpu],
916 t->size, 918 t->size,
917 add_entry_to_counter, 919 add_entry_to_counter,
918 counters, 920 counters,
919 &i); 921 &i);
922 xt_info_wrunlock(cpu);
920 } 923 }
921
922}
923
924/* We're lazy, and add to the first CPU; overflow works its fey magic
925 * and everything is OK. */
926static int
927add_counter_to_entry(struct ipt_entry *e,
928 const struct xt_counters addme[],
929 unsigned int *i)
930{
931 ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
932
933 (*i)++;
934 return 0;
935}
936
937/* Take values from counters and add them back onto the current cpu */
938static void put_counters(struct xt_table_info *t,
939 const struct xt_counters counters[])
940{
941 unsigned int i, cpu;
942
943 local_bh_disable();
944 cpu = smp_processor_id();
945 i = 0;
946 IPT_ENTRY_ITERATE(t->entries[cpu],
947 t->size,
948 add_counter_to_entry,
949 counters,
950 &i);
951 local_bh_enable(); 924 local_bh_enable();
952} 925}
953 926
954
955static inline int
956zero_entry_counter(struct ipt_entry *e, void *arg)
957{
958 e->counters.bcnt = 0;
959 e->counters.pcnt = 0;
960 return 0;
961}
962
963static void
964clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
965{
966 unsigned int cpu;
967 const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
968
969 memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
970 for_each_possible_cpu(cpu) {
971 memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
972 IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
973 zero_entry_counter, NULL);
974 }
975}
976
977static struct xt_counters * alloc_counters(struct xt_table *table) 927static struct xt_counters * alloc_counters(struct xt_table *table)
978{ 928{
979 unsigned int countersize; 929 unsigned int countersize;
980 struct xt_counters *counters; 930 struct xt_counters *counters;
981 struct xt_table_info *private = table->private; 931 struct xt_table_info *private = table->private;
982 struct xt_table_info *info;
983 932
984 /* We need atomic snapshot of counters: rest doesn't change 933 /* We need atomic snapshot of counters: rest doesn't change
985 (other than comefrom, which userspace doesn't care 934 (other than comefrom, which userspace doesn't care
@@ -988,30 +937,11 @@ static struct xt_counters * alloc_counters(struct xt_table *table)
988 counters = vmalloc_node(countersize, numa_node_id()); 937 counters = vmalloc_node(countersize, numa_node_id());
989 938
990 if (counters == NULL) 939 if (counters == NULL)
991 goto nomem; 940 return ERR_PTR(-ENOMEM);
992 941
993 info = xt_alloc_table_info(private->size); 942 get_counters(private, counters);
994 if (!info)
995 goto free_counters;
996
997 clone_counters(info, private);
998
999 mutex_lock(&table->lock);
1000 xt_table_entry_swap_rcu(private, info);
1001 synchronize_net(); /* Wait until smoke has cleared */
1002
1003 get_counters(info, counters);
1004 put_counters(private, counters);
1005 mutex_unlock(&table->lock);
1006
1007 xt_free_table_info(info);
1008 943
1009 return counters; 944 return counters;
1010
1011 free_counters:
1012 vfree(counters);
1013 nomem:
1014 return ERR_PTR(-ENOMEM);
1015} 945}
1016 946
1017static int 947static int
@@ -1306,8 +1236,9 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
1306 (newinfo->number <= oldinfo->initial_entries)) 1236 (newinfo->number <= oldinfo->initial_entries))
1307 module_put(t->me); 1237 module_put(t->me);
1308 1238
1309 /* Get the old counters. */ 1239 /* Get the old counters, and synchronize with replace */
1310 get_counters(oldinfo, counters); 1240 get_counters(oldinfo, counters);
1241
1311 /* Decrease module usage counts and free resource */ 1242 /* Decrease module usage counts and free resource */
1312 loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; 1243 loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
1313 IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, 1244 IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,
@@ -1377,11 +1308,23 @@ do_replace(struct net *net, void __user *user, unsigned int len)
1377 return ret; 1308 return ret;
1378} 1309}
1379 1310
1311/* We're lazy, and add to the first CPU; overflow works its fey magic
1312 * and everything is OK. */
1313static int
1314add_counter_to_entry(struct ipt_entry *e,
1315 const struct xt_counters addme[],
1316 unsigned int *i)
1317{
1318 ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
1319
1320 (*i)++;
1321 return 0;
1322}
1380 1323
1381static int 1324static int
1382do_add_counters(struct net *net, void __user *user, unsigned int len, int compat) 1325do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
1383{ 1326{
1384 unsigned int i; 1327 unsigned int i, curcpu;
1385 struct xt_counters_info tmp; 1328 struct xt_counters_info tmp;
1386 struct xt_counters *paddc; 1329 struct xt_counters *paddc;
1387 unsigned int num_counters; 1330 unsigned int num_counters;
@@ -1437,25 +1380,26 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat
1437 goto free; 1380 goto free;
1438 } 1381 }
1439 1382
1440 mutex_lock(&t->lock); 1383 local_bh_disable();
1441 private = t->private; 1384 private = t->private;
1442 if (private->number != num_counters) { 1385 if (private->number != num_counters) {
1443 ret = -EINVAL; 1386 ret = -EINVAL;
1444 goto unlock_up_free; 1387 goto unlock_up_free;
1445 } 1388 }
1446 1389
1447 preempt_disable();
1448 i = 0; 1390 i = 0;
1449 /* Choose the copy that is on our node */ 1391 /* Choose the copy that is on our node */
1450 loc_cpu_entry = private->entries[raw_smp_processor_id()]; 1392 curcpu = smp_processor_id();
1393 loc_cpu_entry = private->entries[curcpu];
1394 xt_info_wrlock(curcpu);
1451 IPT_ENTRY_ITERATE(loc_cpu_entry, 1395 IPT_ENTRY_ITERATE(loc_cpu_entry,
1452 private->size, 1396 private->size,
1453 add_counter_to_entry, 1397 add_counter_to_entry,
1454 paddc, 1398 paddc,
1455 &i); 1399 &i);
1456 preempt_enable(); 1400 xt_info_wrunlock(curcpu);
1457 unlock_up_free: 1401 unlock_up_free:
1458 mutex_unlock(&t->lock); 1402 local_bh_enable();
1459 xt_table_unlock(t); 1403 xt_table_unlock(t);
1460 module_put(t->me); 1404 module_put(t->me);
1461 free: 1405 free:
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index c40debe51b38..c4c60e9f068a 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -3397,7 +3397,7 @@ int __init ip_rt_init(void)
3397 0, 3397 0,
3398 &rt_hash_log, 3398 &rt_hash_log,
3399 &rt_hash_mask, 3399 &rt_hash_mask,
3400 0); 3400 rhash_entries ? 0 : 512 * 1024);
3401 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket)); 3401 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3402 rt_hash_lock_init(); 3402 rt_hash_lock_init();
3403 3403
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 800ae8542471..219e165aea10 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -365,9 +365,9 @@ ip6t_do_table(struct sk_buff *skb,
365 365
366 IP_NF_ASSERT(table->valid_hooks & (1 << hook)); 366 IP_NF_ASSERT(table->valid_hooks & (1 << hook));
367 367
368 rcu_read_lock_bh(); 368 xt_info_rdlock_bh();
369 private = rcu_dereference(table->private); 369 private = table->private;
370 table_base = rcu_dereference(private->entries[smp_processor_id()]); 370 table_base = private->entries[smp_processor_id()];
371 371
372 e = get_entry(table_base, private->hook_entry[hook]); 372 e = get_entry(table_base, private->hook_entry[hook]);
373 373
@@ -466,7 +466,7 @@ ip6t_do_table(struct sk_buff *skb,
466#ifdef CONFIG_NETFILTER_DEBUG 466#ifdef CONFIG_NETFILTER_DEBUG
467 ((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON; 467 ((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON;
468#endif 468#endif
469 rcu_read_unlock_bh(); 469 xt_info_rdunlock_bh();
470 470
471#ifdef DEBUG_ALLOW_ALL 471#ifdef DEBUG_ALLOW_ALL
472 return NF_ACCEPT; 472 return NF_ACCEPT;
@@ -926,9 +926,12 @@ get_counters(const struct xt_table_info *t,
926 /* Instead of clearing (by a previous call to memset()) 926 /* Instead of clearing (by a previous call to memset())
927 * the counters and using adds, we set the counters 927 * the counters and using adds, we set the counters
928 * with data used by 'current' CPU 928 * with data used by 'current' CPU
929 * We dont care about preemption here. 929 *
930 * Bottom half has to be disabled to prevent deadlock
931 * if new softirq were to run and call ipt_do_table
930 */ 932 */
931 curcpu = raw_smp_processor_id(); 933 local_bh_disable();
934 curcpu = smp_processor_id();
932 935
933 i = 0; 936 i = 0;
934 IP6T_ENTRY_ITERATE(t->entries[curcpu], 937 IP6T_ENTRY_ITERATE(t->entries[curcpu],
@@ -941,72 +944,22 @@ get_counters(const struct xt_table_info *t,
941 if (cpu == curcpu) 944 if (cpu == curcpu)
942 continue; 945 continue;
943 i = 0; 946 i = 0;
947 xt_info_wrlock(cpu);
944 IP6T_ENTRY_ITERATE(t->entries[cpu], 948 IP6T_ENTRY_ITERATE(t->entries[cpu],
945 t->size, 949 t->size,
946 add_entry_to_counter, 950 add_entry_to_counter,
947 counters, 951 counters,
948 &i); 952 &i);
953 xt_info_wrunlock(cpu);
949 } 954 }
950}
951
952/* We're lazy, and add to the first CPU; overflow works its fey magic
953 * and everything is OK. */
954static int
955add_counter_to_entry(struct ip6t_entry *e,
956 const struct xt_counters addme[],
957 unsigned int *i)
958{
959 ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
960
961 (*i)++;
962 return 0;
963}
964
965/* Take values from counters and add them back onto the current cpu */
966static void put_counters(struct xt_table_info *t,
967 const struct xt_counters counters[])
968{
969 unsigned int i, cpu;
970
971 local_bh_disable();
972 cpu = smp_processor_id();
973 i = 0;
974 IP6T_ENTRY_ITERATE(t->entries[cpu],
975 t->size,
976 add_counter_to_entry,
977 counters,
978 &i);
979 local_bh_enable(); 955 local_bh_enable();
980} 956}
981 957
982static inline int
983zero_entry_counter(struct ip6t_entry *e, void *arg)
984{
985 e->counters.bcnt = 0;
986 e->counters.pcnt = 0;
987 return 0;
988}
989
990static void
991clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
992{
993 unsigned int cpu;
994 const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
995
996 memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
997 for_each_possible_cpu(cpu) {
998 memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
999 IP6T_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
1000 zero_entry_counter, NULL);
1001 }
1002}
1003
1004static struct xt_counters *alloc_counters(struct xt_table *table) 958static struct xt_counters *alloc_counters(struct xt_table *table)
1005{ 959{
1006 unsigned int countersize; 960 unsigned int countersize;
1007 struct xt_counters *counters; 961 struct xt_counters *counters;
1008 struct xt_table_info *private = table->private; 962 struct xt_table_info *private = table->private;
1009 struct xt_table_info *info;
1010 963
1011 /* We need atomic snapshot of counters: rest doesn't change 964 /* We need atomic snapshot of counters: rest doesn't change
1012 (other than comefrom, which userspace doesn't care 965 (other than comefrom, which userspace doesn't care
@@ -1015,30 +968,11 @@ static struct xt_counters *alloc_counters(struct xt_table *table)
1015 counters = vmalloc_node(countersize, numa_node_id()); 968 counters = vmalloc_node(countersize, numa_node_id());
1016 969
1017 if (counters == NULL) 970 if (counters == NULL)
1018 goto nomem; 971 return ERR_PTR(-ENOMEM);
1019 972
1020 info = xt_alloc_table_info(private->size); 973 get_counters(private, counters);
1021 if (!info)
1022 goto free_counters;
1023
1024 clone_counters(info, private);
1025
1026 mutex_lock(&table->lock);
1027 xt_table_entry_swap_rcu(private, info);
1028 synchronize_net(); /* Wait until smoke has cleared */
1029
1030 get_counters(info, counters);
1031 put_counters(private, counters);
1032 mutex_unlock(&table->lock);
1033
1034 xt_free_table_info(info);
1035 974
1036 return counters; 975 return counters;
1037
1038 free_counters:
1039 vfree(counters);
1040 nomem:
1041 return ERR_PTR(-ENOMEM);
1042} 976}
1043 977
1044static int 978static int
@@ -1334,8 +1268,9 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
1334 (newinfo->number <= oldinfo->initial_entries)) 1268 (newinfo->number <= oldinfo->initial_entries))
1335 module_put(t->me); 1269 module_put(t->me);
1336 1270
1337 /* Get the old counters. */ 1271 /* Get the old counters, and synchronize with replace */
1338 get_counters(oldinfo, counters); 1272 get_counters(oldinfo, counters);
1273
1339 /* Decrease module usage counts and free resource */ 1274 /* Decrease module usage counts and free resource */
1340 loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; 1275 loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
1341 IP6T_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, 1276 IP6T_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,
@@ -1405,11 +1340,24 @@ do_replace(struct net *net, void __user *user, unsigned int len)
1405 return ret; 1340 return ret;
1406} 1341}
1407 1342
1343/* We're lazy, and add to the first CPU; overflow works its fey magic
1344 * and everything is OK. */
1345static int
1346add_counter_to_entry(struct ip6t_entry *e,
1347 const struct xt_counters addme[],
1348 unsigned int *i)
1349{
1350 ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
1351
1352 (*i)++;
1353 return 0;
1354}
1355
1408static int 1356static int
1409do_add_counters(struct net *net, void __user *user, unsigned int len, 1357do_add_counters(struct net *net, void __user *user, unsigned int len,
1410 int compat) 1358 int compat)
1411{ 1359{
1412 unsigned int i; 1360 unsigned int i, curcpu;
1413 struct xt_counters_info tmp; 1361 struct xt_counters_info tmp;
1414 struct xt_counters *paddc; 1362 struct xt_counters *paddc;
1415 unsigned int num_counters; 1363 unsigned int num_counters;
@@ -1465,25 +1413,28 @@ do_add_counters(struct net *net, void __user *user, unsigned int len,
1465 goto free; 1413 goto free;
1466 } 1414 }
1467 1415
1468 mutex_lock(&t->lock); 1416
1417 local_bh_disable();
1469 private = t->private; 1418 private = t->private;
1470 if (private->number != num_counters) { 1419 if (private->number != num_counters) {
1471 ret = -EINVAL; 1420 ret = -EINVAL;
1472 goto unlock_up_free; 1421 goto unlock_up_free;
1473 } 1422 }
1474 1423
1475 preempt_disable();
1476 i = 0; 1424 i = 0;
1477 /* Choose the copy that is on our node */ 1425 /* Choose the copy that is on our node */
1478 loc_cpu_entry = private->entries[raw_smp_processor_id()]; 1426 curcpu = smp_processor_id();
1427 xt_info_wrlock(curcpu);
1428 loc_cpu_entry = private->entries[curcpu];
1479 IP6T_ENTRY_ITERATE(loc_cpu_entry, 1429 IP6T_ENTRY_ITERATE(loc_cpu_entry,
1480 private->size, 1430 private->size,
1481 add_counter_to_entry, 1431 add_counter_to_entry,
1482 paddc, 1432 paddc,
1483 &i); 1433 &i);
1484 preempt_enable(); 1434 xt_info_wrunlock(curcpu);
1435
1485 unlock_up_free: 1436 unlock_up_free:
1486 mutex_unlock(&t->lock); 1437 local_bh_enable();
1487 xt_table_unlock(t); 1438 xt_table_unlock(t);
1488 module_put(t->me); 1439 module_put(t->me);
1489 free: 1440 free:
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 2329c5f50551..881203c4a142 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -275,6 +275,8 @@ config NF_CT_NETLINK
275 help 275 help
276 This option enables support for a netlink-based userspace interface 276 This option enables support for a netlink-based userspace interface
277 277
278endif # NF_CONNTRACK
279
278# transparent proxy support 280# transparent proxy support
279config NETFILTER_TPROXY 281config NETFILTER_TPROXY
280 tristate "Transparent proxying support (EXPERIMENTAL)" 282 tristate "Transparent proxying support (EXPERIMENTAL)"
@@ -290,8 +292,6 @@ config NETFILTER_TPROXY
290 292
291 To compile it as a module, choose M here. If unsure, say N. 293 To compile it as a module, choose M here. If unsure, say N.
292 294
293endif # NF_CONNTRACK
294
295config NETFILTER_XTABLES 295config NETFILTER_XTABLES
296 tristate "Netfilter Xtables support (required for ip_tables)" 296 tristate "Netfilter Xtables support (required for ip_tables)"
297 default m if NETFILTER_ADVANCED=n 297 default m if NETFILTER_ADVANCED=n
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index 50dac8dbe7d8..8e757dd53396 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -633,6 +633,8 @@ static int dccp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
633 if (!nest_parms) 633 if (!nest_parms)
634 goto nla_put_failure; 634 goto nla_put_failure;
635 NLA_PUT_U8(skb, CTA_PROTOINFO_DCCP_STATE, ct->proto.dccp.state); 635 NLA_PUT_U8(skb, CTA_PROTOINFO_DCCP_STATE, ct->proto.dccp.state);
636 NLA_PUT_U8(skb, CTA_PROTOINFO_DCCP_ROLE,
637 ct->proto.dccp.role[IP_CT_DIR_ORIGINAL]);
636 nla_nest_end(skb, nest_parms); 638 nla_nest_end(skb, nest_parms);
637 read_unlock_bh(&dccp_lock); 639 read_unlock_bh(&dccp_lock);
638 return 0; 640 return 0;
@@ -644,6 +646,7 @@ nla_put_failure:
644 646
645static const struct nla_policy dccp_nla_policy[CTA_PROTOINFO_DCCP_MAX + 1] = { 647static const struct nla_policy dccp_nla_policy[CTA_PROTOINFO_DCCP_MAX + 1] = {
646 [CTA_PROTOINFO_DCCP_STATE] = { .type = NLA_U8 }, 648 [CTA_PROTOINFO_DCCP_STATE] = { .type = NLA_U8 },
649 [CTA_PROTOINFO_DCCP_ROLE] = { .type = NLA_U8 },
647}; 650};
648 651
649static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct) 652static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct)
@@ -661,11 +664,21 @@ static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct)
661 return err; 664 return err;
662 665
663 if (!tb[CTA_PROTOINFO_DCCP_STATE] || 666 if (!tb[CTA_PROTOINFO_DCCP_STATE] ||
664 nla_get_u8(tb[CTA_PROTOINFO_DCCP_STATE]) >= CT_DCCP_IGNORE) 667 !tb[CTA_PROTOINFO_DCCP_ROLE] ||
668 nla_get_u8(tb[CTA_PROTOINFO_DCCP_ROLE]) > CT_DCCP_ROLE_MAX ||
669 nla_get_u8(tb[CTA_PROTOINFO_DCCP_STATE]) >= CT_DCCP_IGNORE) {
665 return -EINVAL; 670 return -EINVAL;
671 }
666 672
667 write_lock_bh(&dccp_lock); 673 write_lock_bh(&dccp_lock);
668 ct->proto.dccp.state = nla_get_u8(tb[CTA_PROTOINFO_DCCP_STATE]); 674 ct->proto.dccp.state = nla_get_u8(tb[CTA_PROTOINFO_DCCP_STATE]);
675 if (nla_get_u8(tb[CTA_PROTOINFO_DCCP_ROLE]) == CT_DCCP_ROLE_CLIENT) {
676 ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_CLIENT;
677 ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_SERVER;
678 } else {
679 ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_SERVER;
680 ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_CLIENT;
681 }
669 write_unlock_bh(&dccp_lock); 682 write_unlock_bh(&dccp_lock);
670 return 0; 683 return 0;
671} 684}
@@ -777,6 +790,7 @@ static struct nf_conntrack_l4proto dccp_proto6 __read_mostly = {
777 .print_conntrack = dccp_print_conntrack, 790 .print_conntrack = dccp_print_conntrack,
778#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) 791#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
779 .to_nlattr = dccp_to_nlattr, 792 .to_nlattr = dccp_to_nlattr,
793 .nlattr_size = dccp_nlattr_size,
780 .from_nlattr = nlattr_to_dccp, 794 .from_nlattr = nlattr_to_dccp,
781 .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, 795 .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
782 .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, 796 .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c
index 4614696c1b88..0badedc542d3 100644
--- a/net/netfilter/nf_conntrack_proto_udplite.c
+++ b/net/netfilter/nf_conntrack_proto_udplite.c
@@ -204,6 +204,7 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly =
204 .error = udplite_error, 204 .error = udplite_error,
205#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) 205#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
206 .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, 206 .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr,
207 .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size,
207 .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, 208 .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple,
208 .nla_policy = nf_ct_port_nla_policy, 209 .nla_policy = nf_ct_port_nla_policy,
209#endif 210#endif
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 509a95621f9f..150e5cf62f85 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -625,20 +625,6 @@ void xt_free_table_info(struct xt_table_info *info)
625} 625}
626EXPORT_SYMBOL(xt_free_table_info); 626EXPORT_SYMBOL(xt_free_table_info);
627 627
628void xt_table_entry_swap_rcu(struct xt_table_info *oldinfo,
629 struct xt_table_info *newinfo)
630{
631 unsigned int cpu;
632
633 for_each_possible_cpu(cpu) {
634 void *p = oldinfo->entries[cpu];
635 rcu_assign_pointer(oldinfo->entries[cpu], newinfo->entries[cpu]);
636 newinfo->entries[cpu] = p;
637 }
638
639}
640EXPORT_SYMBOL_GPL(xt_table_entry_swap_rcu);
641
642/* Find table by name, grabs mutex & ref. Returns ERR_PTR() on error. */ 628/* Find table by name, grabs mutex & ref. Returns ERR_PTR() on error. */
643struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, 629struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
644 const char *name) 630 const char *name)
@@ -676,32 +662,43 @@ void xt_compat_unlock(u_int8_t af)
676EXPORT_SYMBOL_GPL(xt_compat_unlock); 662EXPORT_SYMBOL_GPL(xt_compat_unlock);
677#endif 663#endif
678 664
665DEFINE_PER_CPU(struct xt_info_lock, xt_info_locks);
666EXPORT_PER_CPU_SYMBOL_GPL(xt_info_locks);
667
668
679struct xt_table_info * 669struct xt_table_info *
680xt_replace_table(struct xt_table *table, 670xt_replace_table(struct xt_table *table,
681 unsigned int num_counters, 671 unsigned int num_counters,
682 struct xt_table_info *newinfo, 672 struct xt_table_info *newinfo,
683 int *error) 673 int *error)
684{ 674{
685 struct xt_table_info *oldinfo, *private; 675 struct xt_table_info *private;
686 676
687 /* Do the substitution. */ 677 /* Do the substitution. */
688 mutex_lock(&table->lock); 678 local_bh_disable();
689 private = table->private; 679 private = table->private;
680
690 /* Check inside lock: is the old number correct? */ 681 /* Check inside lock: is the old number correct? */
691 if (num_counters != private->number) { 682 if (num_counters != private->number) {
692 duprintf("num_counters != table->private->number (%u/%u)\n", 683 duprintf("num_counters != table->private->number (%u/%u)\n",
693 num_counters, private->number); 684 num_counters, private->number);
694 mutex_unlock(&table->lock); 685 local_bh_enable();
695 *error = -EAGAIN; 686 *error = -EAGAIN;
696 return NULL; 687 return NULL;
697 } 688 }
698 oldinfo = private;
699 rcu_assign_pointer(table->private, newinfo);
700 newinfo->initial_entries = oldinfo->initial_entries;
701 mutex_unlock(&table->lock);
702 689
703 synchronize_net(); 690 table->private = newinfo;
704 return oldinfo; 691 newinfo->initial_entries = private->initial_entries;
692
693 /*
694 * Even though table entries have now been swapped, other CPU's
695 * may still be using the old entries. This is okay, because
696 * resynchronization happens because of the locking done
697 * during the get_counters() routine.
698 */
699 local_bh_enable();
700
701 return private;
705} 702}
706EXPORT_SYMBOL_GPL(xt_replace_table); 703EXPORT_SYMBOL_GPL(xt_replace_table);
707 704
@@ -734,7 +731,6 @@ struct xt_table *xt_register_table(struct net *net, struct xt_table *table,
734 731
735 /* Simplifies replace_table code. */ 732 /* Simplifies replace_table code. */
736 table->private = bootstrap; 733 table->private = bootstrap;
737 mutex_init(&table->lock);
738 734
739 if (!xt_replace_table(table, 0, newinfo, &ret)) 735 if (!xt_replace_table(table, 0, newinfo, &ret))
740 goto unlock; 736 goto unlock;
@@ -1147,7 +1143,14 @@ static struct pernet_operations xt_net_ops = {
1147 1143
1148static int __init xt_init(void) 1144static int __init xt_init(void)
1149{ 1145{
1150 int i, rv; 1146 unsigned int i;
1147 int rv;
1148
1149 for_each_possible_cpu(i) {
1150 struct xt_info_lock *lock = &per_cpu(xt_info_locks, i);
1151 spin_lock_init(&lock->lock);
1152 lock->readers = 0;
1153 }
1151 1154
1152 xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL); 1155 xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL);
1153 if (!xt) 1156 if (!xt)
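
The per-cpu xt_info_locks defined and initialised here replace the previous RCU-based scheme: packet-processing paths (ipt_do_table and friends) take the local CPU's lock in a recursion-tolerant read mode with bottom halves disabled, while get_counters() and do_add_counters() take each CPU's lock in write mode. The read/write helpers themselves live in the matching include/linux/netfilter/x_tables.h change, which is not among the hunks shown; reconstructed from the call sites above, they look roughly like this (a sketch, not a verbatim copy of that header):

/* Sketch of the xt_info_rdlock_bh()/xt_info_wrlock() helpers used above. */
struct xt_info_lock {
	spinlock_t lock;
	unsigned char readers;
};
DECLARE_PER_CPU(struct xt_info_lock, xt_info_locks);

static inline void xt_info_rdlock_bh(void)
{
	struct xt_info_lock *lock;

	local_bh_disable();
	lock = &__get_cpu_var(xt_info_locks);
	if (likely(!lock->readers++))		/* recursion on this CPU is allowed */
		spin_lock(&lock->lock);
}

static inline void xt_info_rdunlock_bh(void)
{
	struct xt_info_lock *lock = &__get_cpu_var(xt_info_locks);

	if (likely(!--lock->readers))
		spin_unlock(&lock->lock);
	local_bh_enable();
}

static inline void xt_info_wrlock(unsigned int cpu)
{
	spin_lock(&per_cpu(xt_info_locks, cpu).lock);
}

static inline void xt_info_wrunlock(unsigned int cpu)
{
	spin_unlock(&per_cpu(xt_info_locks, cpu).lock);
}
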
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
index 791e030ea903..eb0ceb846527 100644
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -474,7 +474,7 @@ static ssize_t recent_old_proc_write(struct file *file,
474 struct recent_table *t = pde->data; 474 struct recent_table *t = pde->data;
475 struct recent_entry *e; 475 struct recent_entry *e;
476 char buf[sizeof("+255.255.255.255")], *c = buf; 476 char buf[sizeof("+255.255.255.255")], *c = buf;
477 __be32 addr; 477 union nf_inet_addr addr = {};
478 int add; 478 int add;
479 479
480 if (size > sizeof(buf)) 480 if (size > sizeof(buf))
@@ -506,14 +506,13 @@ static ssize_t recent_old_proc_write(struct file *file,
506 add = 1; 506 add = 1;
507 break; 507 break;
508 } 508 }
509 addr = in_aton(c); 509 addr.ip = in_aton(c);
510 510
511 spin_lock_bh(&recent_lock); 511 spin_lock_bh(&recent_lock);
512 e = recent_entry_lookup(t, (const void *)&addr, NFPROTO_IPV4, 0); 512 e = recent_entry_lookup(t, &addr, NFPROTO_IPV4, 0);
513 if (e == NULL) { 513 if (e == NULL) {
514 if (add) 514 if (add)
515 recent_entry_init(t, (const void *)&addr, 515 recent_entry_init(t, &addr, NFPROTO_IPV4, 0);
516 NFPROTO_IPV4, 0);
517 } else { 516 } else {
518 if (add) 517 if (add)
519 recent_entry_update(t, e); 518 recent_entry_update(t, e);
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 82271720d970..5f1f86565f16 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -794,7 +794,7 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr,
794{ 794{
795 static xfrm_address_t saddr_wildcard = { }; 795 static xfrm_address_t saddr_wildcard = { };
796 struct net *net = xp_net(pol); 796 struct net *net = xp_net(pol);
797 unsigned int h; 797 unsigned int h, h_wildcard;
798 struct hlist_node *entry; 798 struct hlist_node *entry;
799 struct xfrm_state *x, *x0, *to_put; 799 struct xfrm_state *x, *x0, *to_put;
800 int acquire_in_progress = 0; 800 int acquire_in_progress = 0;
@@ -819,8 +819,8 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr,
819 if (best) 819 if (best)
820 goto found; 820 goto found;
821 821
822 h = xfrm_dst_hash(net, daddr, &saddr_wildcard, tmpl->reqid, family); 822 h_wildcard = xfrm_dst_hash(net, daddr, &saddr_wildcard, tmpl->reqid, family);
823 hlist_for_each_entry(x, entry, net->xfrm.state_bydst+h, bydst) { 823 hlist_for_each_entry(x, entry, net->xfrm.state_bydst+h_wildcard, bydst) {
824 if (x->props.family == family && 824 if (x->props.family == family &&
825 x->props.reqid == tmpl->reqid && 825 x->props.reqid == tmpl->reqid &&
826 !(x->props.flags & XFRM_STATE_WILDRECV) && 826 !(x->props.flags & XFRM_STATE_WILDRECV) &&