author     Rusty Russell <rusty@rustcorp.com.au>  2008-12-13 06:25:51 -0500
committer  Rusty Russell <rusty@rustcorp.com.au>  2008-12-13 06:25:51 -0500
commit     968ea6d80e395cf11a51143cfa1b9a14ada676df (patch)
tree       dc2acec8c9bdced33afe1e273ee5e0b0b93d2703
parent     7be7585393d311866653564fbcd10a3232773c0b (diff)
parent     8299608f140ae321e4eb5d1306184265d2b9511e (diff)
Merge ../linux-2.6-x86
Conflicts:
	arch/x86/kernel/io_apic.c
	kernel/sched.c
	kernel/sched_stats.h
-rw-r--r--  Documentation/controllers/cpuacct.txt | 32
-rw-r--r--  Documentation/ftrace.txt | 149
-rw-r--r--  Documentation/kernel-parameters.txt | 8
-rw-r--r--  Documentation/lockstat.txt | 51
-rw-r--r--  Documentation/markers.txt | 29
-rw-r--r--  Documentation/scheduler/sched-arch.txt | 4
-rw-r--r--  Documentation/tracepoints.txt | 94
-rw-r--r--  arch/ia64/Kconfig | 2
-rw-r--r--  arch/ia64/include/asm/topology.h | 2
-rw-r--r--  arch/m32r/Kconfig | 2
-rw-r--r--  arch/mips/Kconfig | 2
-rw-r--r--  arch/mips/include/asm/mach-ip27/topology.h | 1
-rw-r--r--  arch/powerpc/Kconfig | 2
-rw-r--r--  arch/powerpc/include/asm/ftrace.h | 14
-rw-r--r--  arch/powerpc/include/asm/module.h | 16
-rw-r--r--  arch/powerpc/include/asm/topology.h | 1
-rw-r--r--  arch/powerpc/kernel/Makefile | 1
-rw-r--r--  arch/powerpc/kernel/entry_32.S | 40
-rw-r--r--  arch/powerpc/kernel/entry_64.S | 12
-rw-r--r--  arch/powerpc/kernel/ftrace.c | 461
-rw-r--r--  arch/powerpc/kernel/idle.c | 5
-rw-r--r--  arch/powerpc/kernel/module_32.c | 10
-rw-r--r--  arch/powerpc/kernel/module_64.c | 13
-rw-r--r--  arch/powerpc/lib/Makefile | 3
-rw-r--r--  arch/s390/kernel/topology.c | 5
-rw-r--r--  arch/sh/include/asm/topology.h | 1
-rw-r--r--  arch/um/include/asm/system.h | 14
-rw-r--r--  arch/x86/Kconfig | 28
-rw-r--r--  arch/x86/Kconfig.cpu | 1
-rw-r--r--  arch/x86/Kconfig.debug | 4
-rw-r--r--  arch/x86/include/asm/apic.h | 1
-rw-r--r--  arch/x86/include/asm/bigsmp/apic.h | 2
-rw-r--r--  arch/x86/include/asm/ds.h | 134
-rw-r--r--  arch/x86/include/asm/emergency-restart.h | 4
-rw-r--r--  arch/x86/include/asm/es7000/apic.h | 79
-rw-r--r--  arch/x86/include/asm/es7000/wakecpu.h | 41
-rw-r--r--  arch/x86/include/asm/ftrace.h | 34
-rw-r--r--  arch/x86/include/asm/genapic_32.h | 19
-rw-r--r--  arch/x86/include/asm/genapic_64.h | 2
-rw-r--r--  arch/x86/include/asm/io_apic.h | 9
-rw-r--r--  arch/x86/include/asm/irq_vectors.h | 11
-rw-r--r--  arch/x86/include/asm/mach-default/mach_apic.h | 2
-rw-r--r--  arch/x86/include/asm/mach-default/mach_wakecpu.h | 24
-rw-r--r--  arch/x86/include/asm/mach-default/smpboot_hooks.h | 8
-rw-r--r--  arch/x86/include/asm/mach-generic/mach_apic.h | 1
-rw-r--r--  arch/x86/include/asm/mach-generic/mach_wakecpu.h | 12
-rw-r--r--  arch/x86/include/asm/numaq/wakecpu.h | 24
-rw-r--r--  arch/x86/include/asm/setup.h | 3
-rw-r--r--  arch/x86/include/asm/system.h | 2
-rw-r--r--  arch/x86/include/asm/thread_info.h | 2
-rw-r--r--  arch/x86/include/asm/uaccess.h | 2
-rw-r--r--  arch/x86/include/asm/uaccess_32.h | 8
-rw-r--r--  arch/x86/include/asm/uaccess_64.h | 6
-rw-r--r--  arch/x86/kernel/Makefile | 3
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 11
-rw-r--r--  arch/x86/kernel/apm_32.c | 4
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 4
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 3
-rw-r--r--  arch/x86/kernel/ds.c | 692
-rw-r--r--  arch/x86/kernel/dumpstack.c | 351
-rw-r--r--  arch/x86/kernel/dumpstack.h | 39
-rw-r--r--  arch/x86/kernel/dumpstack_32.c | 307
-rw-r--r--  arch/x86/kernel/dumpstack_64.c | 289
-rw-r--r--  arch/x86/kernel/entry_32.S | 51
-rw-r--r--  arch/x86/kernel/entry_64.S | 83
-rw-r--r--  arch/x86/kernel/es7000_32.c | 62
-rw-r--r--  arch/x86/kernel/ftrace.c | 390
-rw-r--r--  arch/x86/kernel/genapic_64.c | 4
-rw-r--r--  arch/x86/kernel/io_apic.c | 631
-rw-r--r--  arch/x86/kernel/irq.c | 3
-rw-r--r--  arch/x86/kernel/irq_32.c | 2
-rw-r--r--  arch/x86/kernel/irq_64.c | 2
-rw-r--r--  arch/x86/kernel/irqinit_32.c | 3
-rw-r--r--  arch/x86/kernel/irqinit_64.c | 3
-rw-r--r--  arch/x86/kernel/mpparse.c | 25
-rw-r--r--  arch/x86/kernel/numaq_32.c | 10
-rw-r--r--  arch/x86/kernel/process.c | 32
-rw-r--r--  arch/x86/kernel/process_32.c | 4
-rw-r--r--  arch/x86/kernel/process_64.c | 4
-rw-r--r--  arch/x86/kernel/ptrace.c | 98
-rw-r--r--  arch/x86/kernel/reboot.c | 31
-rw-r--r--  arch/x86/kernel/setup.c | 17
-rw-r--r--  arch/x86/kernel/smp.c | 13
-rw-r--r--  arch/x86/kernel/smpboot.c | 17
-rw-r--r--  arch/x86/kernel/stacktrace.c | 64
-rw-r--r--  arch/x86/kernel/vsyscall_64.c | 3
-rw-r--r--  arch/x86/lib/usercopy_32.c | 8
-rw-r--r--  arch/x86/lib/usercopy_64.c | 4
-rw-r--r--  arch/x86/mach-generic/bigsmp.c | 1
-rw-r--r--  arch/x86/mach-generic/default.c | 1
-rw-r--r--  arch/x86/mach-generic/es7000.c | 14
-rw-r--r--  arch/x86/mach-generic/probe.c | 16
-rw-r--r--  arch/x86/mach-generic/summit.c | 1
-rw-r--r--  arch/x86/mm/Makefile | 3
-rw-r--r--  arch/x86/mm/fault.c | 13
-rw-r--r--  arch/x86/pci/direct.c | 4
-rw-r--r--  arch/x86/pci/pci.h | 1
-rw-r--r--  arch/x86/vdso/vclock_gettime.c | 3
-rw-r--r--  block/Kconfig | 1
-rw-r--r--  block/blk-core.c | 46
-rw-r--r--  block/blktrace.c | 332
-rw-r--r--  block/elevator.c | 12
-rw-r--r--  drivers/char/random.c | 22
-rw-r--r--  drivers/char/sysrq.c | 18
-rw-r--r--  drivers/md/dm.c | 8
-rw-r--r--  drivers/pci/intr_remapping.c | 76
-rw-r--r--  drivers/pci/msi.c | 55
-rw-r--r--  drivers/xen/events.c | 12
-rw-r--r--  fs/bio.c | 5
-rw-r--r--  fs/proc/stat.c | 19
-rw-r--r--  fs/seq_file.c | 14
-rw-r--r--  include/asm-generic/vmlinux.lds.h | 21
-rw-r--r--  include/asm-m32r/system.h | 2
-rw-r--r--  include/linux/Kbuild | 1
-rw-r--r--  include/linux/blktrace_api.h | 172
-rw-r--r--  include/linux/compiler.h | 84
-rw-r--r--  include/linux/debug_locks.h | 2
-rw-r--r--  include/linux/ftrace.h | 272
-rw-r--r--  include/linux/ftrace_irq.h | 13
-rw-r--r--  include/linux/futex.h | 2
-rw-r--r--  include/linux/hardirq.h | 15
-rw-r--r--  include/linux/interrupt.h | 2
-rw-r--r--  include/linux/irq.h | 45
-rw-r--r--  include/linux/irqnr.h | 26
-rw-r--r--  include/linux/kernel.h | 11
-rw-r--r--  include/linux/kernel_stat.h | 14
-rw-r--r--  include/linux/lockdep.h | 31
-rw-r--r--  include/linux/marker.h | 75
-rw-r--r--  include/linux/msi.h | 3
-rw-r--r--  include/linux/mutex.h | 2
-rw-r--r--  include/linux/pid.h | 4
-rw-r--r--  include/linux/random.h | 51
-rw-r--r--  include/linux/rcuclassic.h | 2
-rw-r--r--  include/linux/rcupdate.h | 2
-rw-r--r--  include/linux/ring_buffer.h | 16
-rw-r--r--  include/linux/sched.h | 68
-rw-r--r--  include/linux/seq_file.h | 1
-rw-r--r--  include/linux/stacktrace.h | 8
-rw-r--r--  include/linux/topology.h | 2
-rw-r--r--  include/linux/tracepoint.h | 57
-rw-r--r--  include/linux/tty.h | 2
-rw-r--r--  include/linux/uaccess.h | 2
-rw-r--r--  include/trace/block.h | 76
-rw-r--r--  include/trace/boot.h | 60
-rw-r--r--  include/trace/sched.h | 24
-rw-r--r--  init/Kconfig | 1
-rw-r--r--  init/main.c | 46
-rw-r--r--  kernel/Makefile | 3
-rw-r--r--  kernel/exit.c | 7
-rw-r--r--  kernel/extable.c | 21
-rw-r--r--  kernel/fork.c | 12
-rw-r--r--  kernel/futex.c | 290
-rw-r--r--  kernel/irq/autoprobe.c | 15
-rw-r--r--  kernel/irq/chip.c | 3
-rw-r--r--  kernel/irq/handle.c | 181
-rw-r--r--  kernel/irq/proc.c | 6
-rw-r--r--  kernel/irq/spurious.c | 5
-rw-r--r--  kernel/kthread.c | 3
-rw-r--r--  kernel/lockdep.c | 34
-rw-r--r--  kernel/lockdep_proc.c | 28
-rw-r--r--  kernel/marker.c | 192
-rw-r--r--  kernel/module.c | 13
-rw-r--r--  kernel/mutex.c | 10
-rw-r--r--  kernel/notifier.c | 8
-rw-r--r--  kernel/posix-cpu-timers.c | 10
-rw-r--r--  kernel/power/disk.c | 13
-rw-r--r--  kernel/power/main.c | 5
-rw-r--r--  kernel/profile.c | 2
-rw-r--r--  kernel/rcuclassic.c | 6
-rw-r--r--  kernel/sched.c | 1148
-rw-r--r--  kernel/sched_cpupri.c | 39
-rw-r--r--  kernel/sched_cpupri.h | 5
-rw-r--r--  kernel/sched_debug.c | 57
-rw-r--r--  kernel/sched_fair.c | 14
-rw-r--r--  kernel/sched_rt.c | 80
-rw-r--r--  kernel/sched_stats.h | 3
-rw-r--r--  kernel/signal.c | 2
-rw-r--r--  kernel/softlockup.c | 2
-rw-r--r--  kernel/sys.c | 2
-rw-r--r--  kernel/sysctl.c | 10
-rw-r--r--  kernel/time/tick-sched.c | 10
-rw-r--r--  kernel/trace/Kconfig | 102
-rw-r--r--  kernel/trace/Makefile | 9
-rw-r--r--  kernel/trace/ftrace.c | 914
-rw-r--r--  kernel/trace/ring_buffer.c | 684
-rw-r--r--  kernel/trace/trace.c | 901
-rw-r--r--  kernel/trace/trace.h | 265
-rw-r--r--  kernel/trace/trace_boot.c | 166
-rw-r--r--  kernel/trace/trace_branch.c | 342
-rw-r--r--  kernel/trace/trace_bts.c | 276
-rw-r--r--  kernel/trace/trace_functions.c | 18
-rw-r--r--  kernel/trace/trace_functions_graph.c | 611
-rw-r--r--  kernel/trace/trace_irqsoff.c | 61
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 27
-rw-r--r--  kernel/trace/trace_nop.c | 65
-rw-r--r--  kernel/trace/trace_power.c | 179
-rw-r--r--  kernel/trace/trace_sched_switch.c | 106
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 70
-rw-r--r--  kernel/trace/trace_selftest.c | 173
-rw-r--r--  kernel/trace/trace_stack.c | 21
-rw-r--r--  kernel/trace/trace_sysprof.c | 19
-rw-r--r--  kernel/tracepoint.c | 295
-rw-r--r--  kernel/user.c | 2
-rw-r--r--  lib/Kconfig.debug | 10
-rw-r--r--  mm/bounce.c | 5
-rw-r--r--  mm/memory.c | 15
-rw-r--r--  samples/tracepoints/tp-samples-trace.h | 4
-rw-r--r--  samples/tracepoints/tracepoint-probe-sample.c | 1
-rw-r--r--  samples/tracepoints/tracepoint-probe-sample2.c | 1
-rw-r--r--  samples/tracepoints/tracepoint-sample.c | 3
-rw-r--r--  scripts/Makefile.build | 12
-rw-r--r--  scripts/bootgraph.pl | 16
-rwxr-xr-x  scripts/recordmcount.pl | 50
-rw-r--r--  scripts/trace/power.pl | 108
-rw-r--r--  scripts/tracing/draw_functrace.py | 130
215 files changed, 10580 insertions, 3916 deletions
diff --git a/Documentation/controllers/cpuacct.txt b/Documentation/controllers/cpuacct.txt
new file mode 100644
index 000000000000..bb775fbe43d7
--- /dev/null
+++ b/Documentation/controllers/cpuacct.txt
@@ -0,0 +1,32 @@
1CPU Accounting Controller
2-------------------------
3
4The CPU accounting controller is used to group tasks using cgroups and
5account the CPU usage of these groups of tasks.
6
7The CPU accounting controller supports multi-hierarchy groups. An accounting
8group accumulates the CPU usage of all of its child groups and the tasks
9directly present in its group.
10
11Accounting groups can be created by first mounting the cgroup filesystem.
12
13# mkdir /cgroups
14# mount -t cgroup -ocpuacct none /cgroups
15
16With the above step, the initial or the parent accounting group
17becomes visible at /cgroups. At bootup, this group includes all the
18tasks in the system. /cgroups/tasks lists the tasks in this cgroup.
19/cgroups/cpuacct.usage gives the CPU time (in nanoseconds) obtained by
20this group which is essentially the CPU time obtained by all the tasks
21in the system.
22
23New accounting groups can be created under the parent group /cgroups.
24
25# cd /cgroups
26# mkdir g1
27# echo $$ > g1
28
29The above steps create a new group g1 and move the current shell
30process (bash) into it. CPU time consumed by this bash and its children
31can be obtained from g1/cpuacct.usage and the same is accumulated in
32/cgroups/cpuacct.usage also.
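
As a small illustration of the interface described above (an editor's sketch, not part of this commit), a userspace program could read a group's accumulated CPU time like this; it assumes the cpuacct hierarchy is mounted at /cgroups as in the example, and the group name "g1" is the document's example name:

#include <stdio.h>

int main(int argc, char **argv)
{
	/* default to the root group; pass a name such as "g1" for a child group */
	const char *group = (argc > 1) ? argv[1] : "";
	char path[256];
	unsigned long long ns;
	FILE *f;

	snprintf(path, sizeof(path), "/cgroups/%s/cpuacct.usage", group);
	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return 1;
	}
	if (fscanf(f, "%llu", &ns) != 1) {
		fclose(f);
		return 1;
	}
	fclose(f);

	/* cpuacct.usage reports CPU time in nanoseconds */
	printf("%s: %llu.%09llu s of CPU time\n",
	       group[0] ? group : "(root)", ns / 1000000000ULL, ns % 1000000000ULL);
	return 0;
}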
diff --git a/Documentation/ftrace.txt b/Documentation/ftrace.txt
index 9cc4d685dde5..803b1318b13d 100644
--- a/Documentation/ftrace.txt
+++ b/Documentation/ftrace.txt
@@ -82,7 +82,7 @@ of ftrace. Here is a list of some of the key files:
82 tracer is not adding more data, they will display 82 tracer is not adding more data, they will display
83 the same information every time they are read. 83 the same information every time they are read.
84 84
85 iter_ctrl: This file lets the user control the amount of data 85 trace_options: This file lets the user control the amount of data
86 that is displayed in one of the above output 86 that is displayed in one of the above output
87 files. 87 files.
88 88
@@ -94,10 +94,10 @@ of ftrace. Here is a list of some of the key files:
94 only be recorded if the latency is greater than 94 only be recorded if the latency is greater than
95 the value in this file. (in microseconds) 95 the value in this file. (in microseconds)
96 96
97 trace_entries: This sets or displays the number of bytes each CPU 97 buffer_size_kb: This sets or displays the number of kilobytes each CPU
98 buffer can hold. The tracer buffers are the same size 98 buffer can hold. The tracer buffers are the same size
99 for each CPU. The displayed number is the size of the 99 for each CPU. The displayed number is the size of the
100 CPU buffer and not total size of all buffers. The 100 CPU buffer and not total size of all buffers. The
101 trace buffers are allocated in pages (blocks of memory 101 trace buffers are allocated in pages (blocks of memory
102 that the kernel uses for allocation, usually 4 KB in size). 102 that the kernel uses for allocation, usually 4 KB in size).
103 If the last page allocated has room for more bytes 103 If the last page allocated has room for more bytes
@@ -127,6 +127,8 @@ of ftrace. Here is a list of some of the key files:
127 be traced. If a function exists in both set_ftrace_filter 127 be traced. If a function exists in both set_ftrace_filter
128 and set_ftrace_notrace, the function will _not_ be traced. 128 and set_ftrace_notrace, the function will _not_ be traced.
129 129
130 set_ftrace_pid: Have the function tracer only trace a single thread.
131
130 available_filter_functions: This lists the functions that ftrace 132 available_filter_functions: This lists the functions that ftrace
131 has processed and can trace. These are the function 133 has processed and can trace. These are the function
132 names that you can pass to "set_ftrace_filter" or 134 names that you can pass to "set_ftrace_filter" or
@@ -316,23 +318,23 @@ The above is mostly meaningful for kernel developers.
316 The rest is the same as the 'trace' file. 318 The rest is the same as the 'trace' file.
317 319
318 320
319iter_ctrl 321trace_options
320--------- 322-------------
321 323
322The iter_ctrl file is used to control what gets printed in the trace 324The trace_options file is used to control what gets printed in the trace
323output. To see what is available, simply cat the file: 325output. To see what is available, simply cat the file:
324 326
325 cat /debug/tracing/iter_ctrl 327 cat /debug/tracing/trace_options
326 print-parent nosym-offset nosym-addr noverbose noraw nohex nobin \ 328 print-parent nosym-offset nosym-addr noverbose noraw nohex nobin \
327 noblock nostacktrace nosched-tree 329 noblock nostacktrace nosched-tree nouserstacktrace nosym-userobj
328 330
329To disable one of the options, echo in the option prepended with "no". 331To disable one of the options, echo in the option prepended with "no".
330 332
331 echo noprint-parent > /debug/tracing/iter_ctrl 333 echo noprint-parent > /debug/tracing/trace_options
332 334
333To enable an option, leave off the "no". 335To enable an option, leave off the "no".
334 336
335 echo sym-offset > /debug/tracing/iter_ctrl 337 echo sym-offset > /debug/tracing/trace_options
336 338
337Here are the available options: 339Here are the available options:
338 340
@@ -378,6 +380,20 @@ Here are the available options:
378 When a trace is recorded, so is the stack of functions. 380 When a trace is recorded, so is the stack of functions.
379 This allows for back traces of trace sites. 381 This allows for back traces of trace sites.
380 382
383 userstacktrace - This option changes the trace.
384 It records a stacktrace of the current userspace thread.
385
386 sym-userobj - when user stacktrace are enabled, look up which object the
387 address belongs to, and print a relative address
388 This is especially useful when ASLR is on, otherwise you don't
389 get a chance to resolve the address to object/file/line after the app is no
390 longer running
391
392 The lookup is performed when you read trace,trace_pipe,latency_trace. Example:
393
394 a.out-1623 [000] 40874.465068: /root/a.out[+0x480] <-/root/a.out[+0
395x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6]
396
381 sched-tree - TBD (any users??) 397 sched-tree - TBD (any users??)
382 398
383 399
@@ -1059,6 +1075,83 @@ For simple one time traces, the above is sufficent. For anything else,
1059a search through /proc/mounts may be needed to find where the debugfs 1075a search through /proc/mounts may be needed to find where the debugfs
1060file-system is mounted. 1076file-system is mounted.
1061 1077
1078
1079Single thread tracing
1080---------------------
1081
1082By writing into /debug/tracing/set_ftrace_pid you can trace a
1083single thread. For example:
1084
1085# cat /debug/tracing/set_ftrace_pid
1086no pid
1087# echo 3111 > /debug/tracing/set_ftrace_pid
1088# cat /debug/tracing/set_ftrace_pid
10893111
1090# echo function > /debug/tracing/current_tracer
1091# cat /debug/tracing/trace | head
1092 # tracer: function
1093 #
1094 # TASK-PID CPU# TIMESTAMP FUNCTION
1095 # | | | | |
1096 yum-updatesd-3111 [003] 1637.254676: finish_task_switch <-thread_return
1097 yum-updatesd-3111 [003] 1637.254681: hrtimer_cancel <-schedule_hrtimeout_range
1098 yum-updatesd-3111 [003] 1637.254682: hrtimer_try_to_cancel <-hrtimer_cancel
1099 yum-updatesd-3111 [003] 1637.254683: lock_hrtimer_base <-hrtimer_try_to_cancel
1100 yum-updatesd-3111 [003] 1637.254685: fget_light <-do_sys_poll
1101 yum-updatesd-3111 [003] 1637.254686: pipe_poll <-do_sys_poll
1102# echo -1 > /debug/tracing/set_ftrace_pid
1103# cat /debug/tracing/trace |head
1104 # tracer: function
1105 #
1106 # TASK-PID CPU# TIMESTAMP FUNCTION
1107 # | | | | |
1108 ##### CPU 3 buffer started ####
1109 yum-updatesd-3111 [003] 1701.957688: free_poll_entry <-poll_freewait
1110 yum-updatesd-3111 [003] 1701.957689: remove_wait_queue <-free_poll_entry
1111 yum-updatesd-3111 [003] 1701.957691: fput <-free_poll_entry
1112 yum-updatesd-3111 [003] 1701.957692: audit_syscall_exit <-sysret_audit
1113 yum-updatesd-3111 [003] 1701.957693: path_put <-audit_syscall_exit
1114
1115If you want to trace a function when executing, you could use
1116something like this simple program:
1117
1118#include <stdio.h>
1119#include <stdlib.h>
1120#include <sys/types.h>
1121#include <sys/stat.h>
1122#include <fcntl.h>
1123#include <unistd.h>
1124
1125int main (int argc, char **argv)
1126{
1127 if (argc < 1)
1128 exit(-1);
1129
1130 if (fork() > 0) {
1131 int fd, ffd;
1132 char line[64];
1133 int s;
1134
1135 ffd = open("/debug/tracing/current_tracer", O_WRONLY);
1136 if (ffd < 0)
1137 exit(-1);
1138 write(ffd, "nop", 3);
1139
1140 fd = open("/debug/tracing/set_ftrace_pid", O_WRONLY);
1141 s = sprintf(line, "%d\n", getpid());
1142 write(fd, line, s);
1143
1144 write(ffd, "function", 8);
1145
1146 close(fd);
1147 close(ffd);
1148
1149 execvp(argv[1], argv+1);
1150 }
1151
1152 return 0;
1153}
1154
1062dynamic ftrace 1155dynamic ftrace
1063-------------- 1156--------------
1064 1157
@@ -1158,7 +1251,11 @@ These are the only wild cards which are supported.
1158 1251
1159 <match>*<match> will not work. 1252 <match>*<match> will not work.
1160 1253
1161 # echo hrtimer_* > /debug/tracing/set_ftrace_filter 1254Note: It is better to use quotes to enclose the wild cards, otherwise
1255 the shell may expand the parameters into names of files in the local
1256 directory.
1257
1258 # echo 'hrtimer_*' > /debug/tracing/set_ftrace_filter
1162 1259
1163Produces: 1260Produces:
1164 1261
@@ -1213,7 +1310,7 @@ Again, now we want to append.
1213 # echo sys_nanosleep > /debug/tracing/set_ftrace_filter 1310 # echo sys_nanosleep > /debug/tracing/set_ftrace_filter
1214 # cat /debug/tracing/set_ftrace_filter 1311 # cat /debug/tracing/set_ftrace_filter
1215sys_nanosleep 1312sys_nanosleep
1216 # echo hrtimer_* >> /debug/tracing/set_ftrace_filter 1313 # echo 'hrtimer_*' >> /debug/tracing/set_ftrace_filter
1217 # cat /debug/tracing/set_ftrace_filter 1314 # cat /debug/tracing/set_ftrace_filter
1218hrtimer_run_queues 1315hrtimer_run_queues
1219hrtimer_run_pending 1316hrtimer_run_pending
@@ -1299,41 +1396,29 @@ trace entries
1299------------- 1396-------------
1300 1397
1301Having too much or not enough data can be troublesome in diagnosing 1398Having too much or not enough data can be troublesome in diagnosing
1302an issue in the kernel. The file trace_entries is used to modify 1399an issue in the kernel. The file buffer_size_kb is used to modify
1303the size of the internal trace buffers. The number listed 1400the size of the internal trace buffers. The number listed
1304is the number of entries that can be recorded per CPU. To know 1401is the number of entries that can be recorded per CPU. To know
1305the full size, multiply the number of possible CPUS with the 1402the full size, multiply the number of possible CPUS with the
1306number of entries. 1403number of entries.
1307 1404
1308 # cat /debug/tracing/trace_entries 1405 # cat /debug/tracing/buffer_size_kb
130965620 14061408 (units kilobytes)
1310 1407
1311Note, to modify this, you must have tracing completely disabled. To do that, 1408Note, to modify this, you must have tracing completely disabled. To do that,
1312echo "nop" into the current_tracer. If the current_tracer is not set 1409echo "nop" into the current_tracer. If the current_tracer is not set
1313to "nop", an EINVAL error will be returned. 1410to "nop", an EINVAL error will be returned.
1314 1411
1315 # echo nop > /debug/tracing/current_tracer 1412 # echo nop > /debug/tracing/current_tracer
1316 # echo 100000 > /debug/tracing/trace_entries 1413 # echo 10000 > /debug/tracing/buffer_size_kb
1317 # cat /debug/tracing/trace_entries 1414 # cat /debug/tracing/buffer_size_kb
1318100045 141510000 (units kilobytes)
1319
1320
1321Notice that we echoed in 100,000 but the size is 100,045. The entries
1322are held in individual pages. It allocates the number of pages it takes
1323to fulfill the request. If more entries may fit on the last page
1324then they will be added.
1325
1326 # echo 1 > /debug/tracing/trace_entries
1327 # cat /debug/tracing/trace_entries
132885
1329
1330This shows us that 85 entries can fit in a single page.
1331 1416
1332The number of pages which will be allocated is limited to a percentage 1417The number of pages which will be allocated is limited to a percentage
1333of available memory. Allocating too much will produce an error. 1418of available memory. Allocating too much will produce an error.
1334 1419
1335 # echo 1000000000000 > /debug/tracing/trace_entries 1420 # echo 1000000000000 > /debug/tracing/buffer_size_kb
1336-bash: echo: write error: Cannot allocate memory 1421-bash: echo: write error: Cannot allocate memory
1337 # cat /debug/tracing/trace_entries 1422 # cat /debug/tracing/buffer_size_kb
133885 142385
1339 1424
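
The resize procedure above can also be driven from a program. The sketch below is an editor's illustration (not part of the commit) and only assumes debugfs is mounted at /debug, as elsewhere in this document: write "nop" to current_tracer first, then the new per-CPU size in kilobytes to buffer_size_kb.

#include <string.h>
#include <fcntl.h>
#include <unistd.h>

static int write_str(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, val, strlen(val)) < 0) {
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	/* tracing must be fully disabled before the buffers can be resized */
	if (write_str("/debug/tracing/current_tracer", "nop"))
		return 1;

	/* request 10000 KB per CPU; the kernel refuses sizes it cannot allocate */
	if (write_str("/debug/tracing/buffer_size_kb", "10000"))
		return 1;

	return 0;
}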
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index e0f346d201ed..2919a2e91938 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -750,6 +750,14 @@ and is between 256 and 4096 characters. It is defined in the file
750 parameter will force ia64_sal_cache_flush to call 750 parameter will force ia64_sal_cache_flush to call
751 ia64_pal_cache_flush instead of SAL_CACHE_FLUSH. 751 ia64_pal_cache_flush instead of SAL_CACHE_FLUSH.
752 752
753 ftrace=[tracer]
754 [ftrace] will set and start the specified tracer
755 as early as possible in order to facilitate early
756 boot debugging.
757
758 ftrace_dump_on_oops
759 [ftrace] will dump the trace buffers on oops.
760
753 gamecon.map[2|3]= 761 gamecon.map[2|3]=
754 [HW,JOY] Multisystem joystick and NES/SNES/PSX pad 762 [HW,JOY] Multisystem joystick and NES/SNES/PSX pad
755 support via parallel port (up to 5 devices per port) 763 support via parallel port (up to 5 devices per port)
diff --git a/Documentation/lockstat.txt b/Documentation/lockstat.txt
index 4ba4664ce5c3..9cb9138f7a79 100644
--- a/Documentation/lockstat.txt
+++ b/Documentation/lockstat.txt
@@ -71,35 +71,50 @@ Look at the current lock statistics:
71 71
72# less /proc/lock_stat 72# less /proc/lock_stat
73 73
7401 lock_stat version 0.2 7401 lock_stat version 0.3
7502 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 7502 -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
7603 class name con-bounces contentions waittime-min waittime-max waittime-total acq-bounces acquisitions holdtime-min holdtime-max holdtime-total 7603 class name con-bounces contentions waittime-min waittime-max waittime-total acq-bounces acquisitions holdtime-min holdtime-max holdtime-total
7704 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 7704 -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
7805 7805
7906 &inode->i_data.tree_lock-W: 15 21657 0.18 1093295.30 11547131054.85 58 10415 0.16 87.51 6387.60 7906 &mm->mmap_sem-W: 233 538 18446744073708 22924.27 607243.51 1342 45806 1.71 8595.89 1180582.34
8007 &inode->i_data.tree_lock-R: 0 0 0.00 0.00 0.00 23302 231198 0.25 8.45 98023.38 8007 &mm->mmap_sem-R: 205 587 18446744073708 28403.36 731975.00 1940 412426 0.58 187825.45 6307502.88
8108 -------------------------- 8108 ---------------
8209 &inode->i_data.tree_lock 0 [<ffffffff8027c08f>] add_to_page_cache+0x5f/0x190 8209 &mm->mmap_sem 487 [<ffffffff8053491f>] do_page_fault+0x466/0x928
8310 8310 &mm->mmap_sem 179 [<ffffffff802a6200>] sys_mprotect+0xcd/0x21d
8411 ............................................................................................................................................................................................... 8411 &mm->mmap_sem 279 [<ffffffff80210a57>] sys_mmap+0x75/0xce
8512 8512 &mm->mmap_sem 76 [<ffffffff802a490b>] sys_munmap+0x32/0x59
8613 dcache_lock: 1037 1161 0.38 45.32 774.51 6611 243371 0.15 306.48 77387.24 8613 ---------------
8714 ----------- 8714 &mm->mmap_sem 270 [<ffffffff80210a57>] sys_mmap+0x75/0xce
8815 dcache_lock 180 [<ffffffff802c0d7e>] sys_getcwd+0x11e/0x230 8815 &mm->mmap_sem 431 [<ffffffff8053491f>] do_page_fault+0x466/0x928
8916 dcache_lock 165 [<ffffffff802c002a>] d_alloc+0x15a/0x210 8916 &mm->mmap_sem 138 [<ffffffff802a490b>] sys_munmap+0x32/0x59
9017 dcache_lock 33 [<ffffffff8035818d>] _atomic_dec_and_lock+0x4d/0x70 9017 &mm->mmap_sem 145 [<ffffffff802a6200>] sys_mprotect+0xcd/0x21d
9118 dcache_lock 1 [<ffffffff802beef8>] shrink_dcache_parent+0x18/0x130 9118
9219 ...............................................................................................................................................................................................
9320
9421 dcache_lock: 621 623 0.52 118.26 1053.02 6745 91930 0.29 316.29 118423.41
9522 -----------
9623 dcache_lock 179 [<ffffffff80378274>] _atomic_dec_and_lock+0x34/0x54
9724 dcache_lock 113 [<ffffffff802cc17b>] d_alloc+0x19a/0x1eb
9825 dcache_lock 99 [<ffffffff802ca0dc>] d_rehash+0x1b/0x44
9926 dcache_lock 104 [<ffffffff802cbca0>] d_instantiate+0x36/0x8a
10027 -----------
10128 dcache_lock 192 [<ffffffff80378274>] _atomic_dec_and_lock+0x34/0x54
10229 dcache_lock 98 [<ffffffff802ca0dc>] d_rehash+0x1b/0x44
10330 dcache_lock 72 [<ffffffff802cc17b>] d_alloc+0x19a/0x1eb
10431 dcache_lock 112 [<ffffffff802cbca0>] d_instantiate+0x36/0x8a
92 105
93This excerpt shows the first two lock class statistics. Line 01 shows the 106This excerpt shows the first two lock class statistics. Line 01 shows the
94output version - each time the format changes this will be updated. Line 02-04 107output version - each time the format changes this will be updated. Line 02-04
95show the header with column descriptions. Lines 05-10 and 13-18 show the actual 108show the header with column descriptions. Lines 05-18 and 20-31 show the actual
96statistics. These statistics come in two parts; the actual stats separated by a 109statistics. These statistics come in two parts; the actual stats separated by a
97short separator (line 08, 14) from the contention points. 110short separator (line 08, 13) from the contention points.
98 111
99The first lock (05-10) is a read/write lock, and shows two lines above the 112The first lock (05-18) is a read/write lock, and shows two lines above the
100short separator. The contention points don't match the column descriptors, 113short separator. The contention points don't match the column descriptors,
101they have two: contentions and [<IP>] symbol. 114they have two: contentions and [<IP>] symbol. The second set of contention
115points are the points we're contending with.
102 116
117The integer part of the time values is in us.
103 118
104View the top contending locks: 119View the top contending locks:
105 120
diff --git a/Documentation/markers.txt b/Documentation/markers.txt
index 089f6138fcd9..d2b3d0e91b26 100644
--- a/Documentation/markers.txt
+++ b/Documentation/markers.txt
@@ -51,11 +51,16 @@ to call) for the specific marker through marker_probe_register() and can be
51activated by calling marker_arm(). Marker deactivation can be done by calling 51activated by calling marker_arm(). Marker deactivation can be done by calling
52marker_disarm() as many times as marker_arm() has been called. Removing a probe 52marker_disarm() as many times as marker_arm() has been called. Removing a probe
53is done through marker_probe_unregister(); it will disarm the probe. 53is done through marker_probe_unregister(); it will disarm the probe.
54marker_synchronize_unregister() must be called before the end of the module exit 54
55function to make sure there is no caller left using the probe. This, and the 55marker_synchronize_unregister() must be called between probe unregistration and
56fact that preemption is disabled around the probe call, make sure that probe 56the first occurrence of
57removal and module unload are safe. See the "Probe example" section below for a 57- the end of module exit function,
58sample probe module. 58 to make sure there is no caller left using the probe;
59- the free of any resource used by the probes,
60 to make sure the probes wont be accessing invalid data.
61This, and the fact that preemption is disabled around the probe call, make sure
62that probe removal and module unload are safe. See the "Probe example" section
63below for a sample probe module.
59 64
60The marker mechanism supports inserting multiple instances of the same marker. 65The marker mechanism supports inserting multiple instances of the same marker.
61Markers can be put in inline functions, inlined static functions, and 66Markers can be put in inline functions, inlined static functions, and
@@ -70,6 +75,20 @@ a printk warning which identifies the inconsistency:
70 75
71"Format mismatch for probe probe_name (format), marker (format)" 76"Format mismatch for probe probe_name (format), marker (format)"
72 77
78Another way to use markers is to simply define the marker without generating any
79function call to actually call into the marker. This is useful in combination
80with tracepoint probes in a scheme like this :
81
82void probe_tracepoint_name(unsigned int arg1, struct task_struct *tsk);
83
84DEFINE_MARKER_TP(marker_eventname, tracepoint_name, probe_tracepoint_name,
85 "arg1 %u pid %d");
86
87notrace void probe_tracepoint_name(unsigned int arg1, struct task_struct *tsk)
88{
89 struct marker *marker = &GET_MARKER(kernel_irq_entry);
90 /* write data to trace buffers ... */
91}
73 92
74* Probe / marker example 93* Probe / marker example
75 94
diff --git a/Documentation/scheduler/sched-arch.txt b/Documentation/scheduler/sched-arch.txt
index 941615a9769b..d43dbcbd163b 100644
--- a/Documentation/scheduler/sched-arch.txt
+++ b/Documentation/scheduler/sched-arch.txt
@@ -8,7 +8,7 @@ Context switch
8By default, the switch_to arch function is called with the runqueue 8By default, the switch_to arch function is called with the runqueue
9locked. This is usually not a problem unless switch_to may need to 9locked. This is usually not a problem unless switch_to may need to
10take the runqueue lock. This is usually due to a wake up operation in 10take the runqueue lock. This is usually due to a wake up operation in
11the context switch. See include/asm-ia64/system.h for an example. 11the context switch. See arch/ia64/include/asm/system.h for an example.
12 12
13To request the scheduler call switch_to with the runqueue unlocked, 13To request the scheduler call switch_to with the runqueue unlocked,
14you must `#define __ARCH_WANT_UNLOCKED_CTXSW` in a header file 14you must `#define __ARCH_WANT_UNLOCKED_CTXSW` in a header file
@@ -23,7 +23,7 @@ disabled. Interrupts may be enabled over the call if it is likely to
23introduce a significant interrupt latency by adding the line 23introduce a significant interrupt latency by adding the line
24`#define __ARCH_WANT_INTERRUPTS_ON_CTXSW` in the same place as for 24`#define __ARCH_WANT_INTERRUPTS_ON_CTXSW` in the same place as for
25unlocked context switches. This define also implies 25unlocked context switches. This define also implies
26`__ARCH_WANT_UNLOCKED_CTXSW`. See include/asm-arm/system.h for an 26`__ARCH_WANT_UNLOCKED_CTXSW`. See arch/arm/include/asm/system.h for an
27example. 27example.
28 28
29 29
diff --git a/Documentation/tracepoints.txt b/Documentation/tracepoints.txt
index 5d354e167494..6f0a044f5b5e 100644
--- a/Documentation/tracepoints.txt
+++ b/Documentation/tracepoints.txt
@@ -3,28 +3,30 @@
3 Mathieu Desnoyers 3 Mathieu Desnoyers
4 4
5 5
6This document introduces Linux Kernel Tracepoints and their use. It provides 6This document introduces Linux Kernel Tracepoints and their use. It
7examples of how to insert tracepoints in the kernel and connect probe functions 7provides examples of how to insert tracepoints in the kernel and
8to them and provides some examples of probe functions. 8connect probe functions to them and provides some examples of probe
9functions.
9 10
10 11
11* Purpose of tracepoints 12* Purpose of tracepoints
12 13
13A tracepoint placed in code provides a hook to call a function (probe) that you 14A tracepoint placed in code provides a hook to call a function (probe)
14can provide at runtime. A tracepoint can be "on" (a probe is connected to it) or 15that you can provide at runtime. A tracepoint can be "on" (a probe is
15"off" (no probe is attached). When a tracepoint is "off" it has no effect, 16connected to it) or "off" (no probe is attached). When a tracepoint is
16except for adding a tiny time penalty (checking a condition for a branch) and 17"off" it has no effect, except for adding a tiny time penalty
17space penalty (adding a few bytes for the function call at the end of the 18(checking a condition for a branch) and space penalty (adding a few
18instrumented function and adds a data structure in a separate section). When a 19bytes for the function call at the end of the instrumented function
19tracepoint is "on", the function you provide is called each time the tracepoint 20and adds a data structure in a separate section). When a tracepoint
20is executed, in the execution context of the caller. When the function provided 21is "on", the function you provide is called each time the tracepoint
21ends its execution, it returns to the caller (continuing from the tracepoint 22is executed, in the execution context of the caller. When the function
22site). 23provided ends its execution, it returns to the caller (continuing from
24the tracepoint site).
23 25
24You can put tracepoints at important locations in the code. They are 26You can put tracepoints at important locations in the code. They are
25lightweight hooks that can pass an arbitrary number of parameters, 27lightweight hooks that can pass an arbitrary number of parameters,
26which prototypes are described in a tracepoint declaration placed in a header 28which prototypes are described in a tracepoint declaration placed in a
27file. 29header file.
28 30
29They can be used for tracing and performance accounting. 31They can be used for tracing and performance accounting.
30 32
@@ -42,14 +44,16 @@ In include/trace/subsys.h :
42 44
43#include <linux/tracepoint.h> 45#include <linux/tracepoint.h>
44 46
45DEFINE_TRACE(subsys_eventname, 47DECLARE_TRACE(subsys_eventname,
46 TPPTOTO(int firstarg, struct task_struct *p), 48 TPPROTO(int firstarg, struct task_struct *p),
47 TPARGS(firstarg, p)); 49 TPARGS(firstarg, p));
48 50
49In subsys/file.c (where the tracing statement must be added) : 51In subsys/file.c (where the tracing statement must be added) :
50 52
51#include <trace/subsys.h> 53#include <trace/subsys.h>
52 54
55DEFINE_TRACE(subsys_eventname);
56
53void somefct(void) 57void somefct(void)
54{ 58{
55 ... 59 ...
@@ -61,31 +65,41 @@ Where :
61- subsys_eventname is an identifier unique to your event 65- subsys_eventname is an identifier unique to your event
62 - subsys is the name of your subsystem. 66 - subsys is the name of your subsystem.
63 - eventname is the name of the event to trace. 67 - eventname is the name of the event to trace.
64- TPPTOTO(int firstarg, struct task_struct *p) is the prototype of the function
65 called by this tracepoint.
66- TPARGS(firstarg, p) are the parameters names, same as found in the prototype.
67 68
68Connecting a function (probe) to a tracepoint is done by providing a probe 69- TPPROTO(int firstarg, struct task_struct *p) is the prototype of the
69(function to call) for the specific tracepoint through 70 function called by this tracepoint.
70register_trace_subsys_eventname(). Removing a probe is done through
71unregister_trace_subsys_eventname(); it will remove the probe sure there is no
72caller left using the probe when it returns. Probe removal is preempt-safe
73because preemption is disabled around the probe call. See the "Probe example"
74section below for a sample probe module.
75
76The tracepoint mechanism supports inserting multiple instances of the same
77tracepoint, but a single definition must be made of a given tracepoint name over
78all the kernel to make sure no type conflict will occur. Name mangling of the
79tracepoints is done using the prototypes to make sure typing is correct.
80Verification of probe type correctness is done at the registration site by the
81compiler. Tracepoints can be put in inline functions, inlined static functions,
82and unrolled loops as well as regular functions.
83
84The naming scheme "subsys_event" is suggested here as a convention intended
85to limit collisions. Tracepoint names are global to the kernel: they are
86considered as being the same whether they are in the core kernel image or in
87modules.
88 71
72- TPARGS(firstarg, p) are the parameters names, same as found in the
73 prototype.
74
75Connecting a function (probe) to a tracepoint is done by providing a
76probe (function to call) for the specific tracepoint through
77register_trace_subsys_eventname(). Removing a probe is done through
78unregister_trace_subsys_eventname(); it will remove the probe.
79
80tracepoint_synchronize_unregister() must be called before the end of
81the module exit function to make sure there is no caller left using
82the probe. This, and the fact that preemption is disabled around the
83probe call, make sure that probe removal and module unload are safe.
84See the "Probe example" section below for a sample probe module.
85
86The tracepoint mechanism supports inserting multiple instances of the
87same tracepoint, but a single definition must be made of a given
88tracepoint name over all the kernel to make sure no type conflict will
89occur. Name mangling of the tracepoints is done using the prototypes
90to make sure typing is correct. Verification of probe type correctness
91is done at the registration site by the compiler. Tracepoints can be
92put in inline functions, inlined static functions, and unrolled loops
93as well as regular functions.
94
95The naming scheme "subsys_event" is suggested here as a convention
96intended to limit collisions. Tracepoint names are global to the
97kernel: they are considered as being the same whether they are in the
98core kernel image or in modules.
99
100If the tracepoint has to be used in kernel modules, an
101EXPORT_TRACEPOINT_SYMBOL_GPL() or EXPORT_TRACEPOINT_SYMBOL() can be
102used to export the defined tracepoints.
89 103
90* Probe / tracepoint example 104* Probe / tracepoint example
91 105
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 6bd91ed7cd03..7fa8f615ba6e 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -99,7 +99,7 @@ config GENERIC_IOMAP
99 bool 99 bool
100 default y 100 default y
101 101
102config SCHED_NO_NO_OMIT_FRAME_POINTER 102config SCHED_OMIT_FRAME_POINTER
103 bool 103 bool
104 default y 104 default y
105 105
diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h
index 35bcb641c9e5..a3cc9f65f954 100644
--- a/arch/ia64/include/asm/topology.h
+++ b/arch/ia64/include/asm/topology.h
@@ -55,7 +55,6 @@
55void build_cpu_to_node_map(void); 55void build_cpu_to_node_map(void);
56 56
57#define SD_CPU_INIT (struct sched_domain) { \ 57#define SD_CPU_INIT (struct sched_domain) { \
58 .span = CPU_MASK_NONE, \
59 .parent = NULL, \ 58 .parent = NULL, \
60 .child = NULL, \ 59 .child = NULL, \
61 .groups = NULL, \ 60 .groups = NULL, \
@@ -80,7 +79,6 @@ void build_cpu_to_node_map(void);
80 79
81/* sched_domains SD_NODE_INIT for IA64 NUMA machines */ 80/* sched_domains SD_NODE_INIT for IA64 NUMA machines */
82#define SD_NODE_INIT (struct sched_domain) { \ 81#define SD_NODE_INIT (struct sched_domain) { \
83 .span = CPU_MASK_NONE, \
84 .parent = NULL, \ 82 .parent = NULL, \
85 .child = NULL, \ 83 .child = NULL, \
86 .groups = NULL, \ 84 .groups = NULL, \
diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig
index 17a6dab09319..cabba332cc48 100644
--- a/arch/m32r/Kconfig
+++ b/arch/m32r/Kconfig
@@ -274,7 +274,7 @@ config GENERIC_CALIBRATE_DELAY
274 bool 274 bool
275 default y 275 default y
276 276
277config SCHED_NO_NO_OMIT_FRAME_POINTER 277config SCHED_OMIT_FRAME_POINTER
278 bool 278 bool
279 default y 279 default y
280 280
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index f4af967a6b30..a5255e7c79e0 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -653,7 +653,7 @@ config GENERIC_CMOS_UPDATE
653 bool 653 bool
654 default y 654 default y
655 655
656config SCHED_NO_NO_OMIT_FRAME_POINTER 656config SCHED_OMIT_FRAME_POINTER
657 bool 657 bool
658 default y 658 default y
659 659
diff --git a/arch/mips/include/asm/mach-ip27/topology.h b/arch/mips/include/asm/mach-ip27/topology.h
index 7785bec732f2..1fb959f98982 100644
--- a/arch/mips/include/asm/mach-ip27/topology.h
+++ b/arch/mips/include/asm/mach-ip27/topology.h
@@ -37,7 +37,6 @@ extern unsigned char __node_distances[MAX_COMPACT_NODES][MAX_COMPACT_NODES];
37 37
38/* sched_domains SD_NODE_INIT for SGI IP27 machines */ 38/* sched_domains SD_NODE_INIT for SGI IP27 machines */
39#define SD_NODE_INIT (struct sched_domain) { \ 39#define SD_NODE_INIT (struct sched_domain) { \
40 .span = CPU_MASK_NONE, \
41 .parent = NULL, \ 40 .parent = NULL, \
42 .child = NULL, \ 41 .child = NULL, \
43 .groups = NULL, \ 42 .groups = NULL, \
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 525c13a4de93..adb23ea1c1ef 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -141,7 +141,7 @@ config GENERIC_NVRAM
141 bool 141 bool
142 default y if PPC32 142 default y if PPC32
143 143
144config SCHED_NO_NO_OMIT_FRAME_POINTER 144config SCHED_OMIT_FRAME_POINTER
145 bool 145 bool
146 default y 146 default y
147 147
diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h
index b298f7a631e6..e5f2ae8362f7 100644
--- a/arch/powerpc/include/asm/ftrace.h
+++ b/arch/powerpc/include/asm/ftrace.h
@@ -7,7 +7,19 @@
7 7
8#ifndef __ASSEMBLY__ 8#ifndef __ASSEMBLY__
9extern void _mcount(void); 9extern void _mcount(void);
10#endif 10
11#ifdef CONFIG_DYNAMIC_FTRACE
12static inline unsigned long ftrace_call_adjust(unsigned long addr)
13{
14 /* reloction of mcount call site is the same as the address */
15 return addr;
16}
17
18struct dyn_arch_ftrace {
19 struct module *mod;
20};
21#endif /* CONFIG_DYNAMIC_FTRACE */
22#endif /* __ASSEMBLY__ */
11 23
12#endif 24#endif
13 25
diff --git a/arch/powerpc/include/asm/module.h b/arch/powerpc/include/asm/module.h
index e5f14b13ccf0..08454880a2c0 100644
--- a/arch/powerpc/include/asm/module.h
+++ b/arch/powerpc/include/asm/module.h
@@ -34,11 +34,19 @@ struct mod_arch_specific {
34#ifdef __powerpc64__ 34#ifdef __powerpc64__
35 unsigned int stubs_section; /* Index of stubs section in module */ 35 unsigned int stubs_section; /* Index of stubs section in module */
36 unsigned int toc_section; /* What section is the TOC? */ 36 unsigned int toc_section; /* What section is the TOC? */
37#else 37#ifdef CONFIG_DYNAMIC_FTRACE
38 unsigned long toc;
39 unsigned long tramp;
40#endif
41
42#else /* powerpc64 */
38 /* Indices of PLT sections within module. */ 43 /* Indices of PLT sections within module. */
39 unsigned int core_plt_section; 44 unsigned int core_plt_section;
40 unsigned int init_plt_section; 45 unsigned int init_plt_section;
46#ifdef CONFIG_DYNAMIC_FTRACE
47 unsigned long tramp;
41#endif 48#endif
49#endif /* powerpc64 */
42 50
43 /* List of BUG addresses, source line numbers and filenames */ 51 /* List of BUG addresses, source line numbers and filenames */
44 struct list_head bug_list; 52 struct list_head bug_list;
@@ -68,6 +76,12 @@ struct mod_arch_specific {
68# endif /* MODULE */ 76# endif /* MODULE */
69#endif 77#endif
70 78
79#ifdef CONFIG_DYNAMIC_FTRACE
80# ifdef MODULE
81 asm(".section .ftrace.tramp,\"ax\",@nobits; .align 3; .previous");
82# endif /* MODULE */
83#endif
84
71 85
72struct exception_table_entry; 86struct exception_table_entry;
73void sort_ex_table(struct exception_table_entry *start, 87void sort_ex_table(struct exception_table_entry *start,
diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index c32da6f97999..373fca394a54 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -48,7 +48,6 @@ static inline int pcibus_to_node(struct pci_bus *bus)
48 48
49/* sched_domains SD_NODE_INIT for PPC64 machines */ 49/* sched_domains SD_NODE_INIT for PPC64 machines */
50#define SD_NODE_INIT (struct sched_domain) { \ 50#define SD_NODE_INIT (struct sched_domain) { \
51 .span = CPU_MASK_NONE, \
52 .parent = NULL, \ 51 .parent = NULL, \
53 .child = NULL, \ 52 .child = NULL, \
54 .groups = NULL, \ 53 .groups = NULL, \
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 92673b43858d..d17edb4a2f9d 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -17,6 +17,7 @@ ifdef CONFIG_FUNCTION_TRACER
17CFLAGS_REMOVE_cputable.o = -pg -mno-sched-epilog 17CFLAGS_REMOVE_cputable.o = -pg -mno-sched-epilog
18CFLAGS_REMOVE_prom_init.o = -pg -mno-sched-epilog 18CFLAGS_REMOVE_prom_init.o = -pg -mno-sched-epilog
19CFLAGS_REMOVE_btext.o = -pg -mno-sched-epilog 19CFLAGS_REMOVE_btext.o = -pg -mno-sched-epilog
20CFLAGS_REMOVE_prom.o = -pg -mno-sched-epilog
20 21
21ifdef CONFIG_DYNAMIC_FTRACE 22ifdef CONFIG_DYNAMIC_FTRACE
22# dynamic ftrace setup. 23# dynamic ftrace setup.
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index 7ecc0d1855c3..6f7eb7e00c79 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -1162,39 +1162,17 @@ machine_check_in_rtas:
1162#ifdef CONFIG_DYNAMIC_FTRACE 1162#ifdef CONFIG_DYNAMIC_FTRACE
1163_GLOBAL(mcount) 1163_GLOBAL(mcount)
1164_GLOBAL(_mcount) 1164_GLOBAL(_mcount)
1165 stwu r1,-48(r1) 1165 /*
1166 stw r3, 12(r1) 1166 * It is required that _mcount on PPC32 must preserve the
1167 stw r4, 16(r1) 1167 * link register. But we have r0 to play with. We use r0
1168 stw r5, 20(r1) 1168 * to push the return address back to the caller of mcount
1169 stw r6, 24(r1) 1169 * into the ctr register, restore the link register and
1170 mflr r3 1170 * then jump back using the ctr register.
1171 stw r7, 28(r1) 1171 */
1172 mfcr r5 1172 mflr r0
1173 stw r8, 32(r1)
1174 stw r9, 36(r1)
1175 stw r10,40(r1)
1176 stw r3, 44(r1)
1177 stw r5, 8(r1)
1178 subi r3, r3, MCOUNT_INSN_SIZE
1179 .globl mcount_call
1180mcount_call:
1181 bl ftrace_stub
1182 nop
1183 lwz r6, 8(r1)
1184 lwz r0, 44(r1)
1185 lwz r3, 12(r1)
1186 mtctr r0 1173 mtctr r0
1187 lwz r4, 16(r1) 1174 lwz r0, 4(r1)
1188 mtcr r6
1189 lwz r5, 20(r1)
1190 lwz r6, 24(r1)
1191 lwz r0, 52(r1)
1192 lwz r7, 28(r1)
1193 lwz r8, 32(r1)
1194 mtlr r0 1175 mtlr r0
1195 lwz r9, 36(r1)
1196 lwz r10,40(r1)
1197 addi r1, r1, 48
1198 bctr 1176 bctr
1199 1177
1200_GLOBAL(ftrace_caller) 1178_GLOBAL(ftrace_caller)
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index e0bcf9354286..383ed6eb0085 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -894,18 +894,6 @@ _GLOBAL(enter_prom)
894#ifdef CONFIG_DYNAMIC_FTRACE 894#ifdef CONFIG_DYNAMIC_FTRACE
895_GLOBAL(mcount) 895_GLOBAL(mcount)
896_GLOBAL(_mcount) 896_GLOBAL(_mcount)
897 /* Taken from output of objdump from lib64/glibc */
898 mflr r3
899 stdu r1, -112(r1)
900 std r3, 128(r1)
901 subi r3, r3, MCOUNT_INSN_SIZE
902 .globl mcount_call
903mcount_call:
904 bl ftrace_stub
905 nop
906 ld r0, 128(r1)
907 mtlr r0
908 addi r1, r1, 112
909 blr 897 blr
910 898
911_GLOBAL(ftrace_caller) 899_GLOBAL(ftrace_caller)
diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c
index f4b006ed0ab1..5355244c99ff 100644
--- a/arch/powerpc/kernel/ftrace.c
+++ b/arch/powerpc/kernel/ftrace.c
@@ -9,22 +9,30 @@
9 9
10#include <linux/spinlock.h> 10#include <linux/spinlock.h>
11#include <linux/hardirq.h> 11#include <linux/hardirq.h>
12#include <linux/uaccess.h>
13#include <linux/module.h>
12#include <linux/ftrace.h> 14#include <linux/ftrace.h>
13#include <linux/percpu.h> 15#include <linux/percpu.h>
14#include <linux/init.h> 16#include <linux/init.h>
15#include <linux/list.h> 17#include <linux/list.h>
16 18
17#include <asm/cacheflush.h> 19#include <asm/cacheflush.h>
20#include <asm/code-patching.h>
18#include <asm/ftrace.h> 21#include <asm/ftrace.h>
19 22
23#if 0
24#define DEBUGP printk
25#else
26#define DEBUGP(fmt , ...) do { } while (0)
27#endif
20 28
21static unsigned int ftrace_nop = 0x60000000; 29static unsigned int ftrace_nop = PPC_NOP_INSTR;
22 30
23#ifdef CONFIG_PPC32 31#ifdef CONFIG_PPC32
24# define GET_ADDR(addr) addr 32# define GET_ADDR(addr) addr
25#else 33#else
26/* PowerPC64's functions are data that points to the functions */ 34/* PowerPC64's functions are data that points to the functions */
27# define GET_ADDR(addr) *(unsigned long *)addr 35# define GET_ADDR(addr) (*(unsigned long *)addr)
28#endif 36#endif
29 37
30 38
@@ -33,12 +41,12 @@ static unsigned int ftrace_calc_offset(long ip, long addr)
33 return (int)(addr - ip); 41 return (int)(addr - ip);
34} 42}
35 43
36unsigned char *ftrace_nop_replace(void) 44static unsigned char *ftrace_nop_replace(void)
37{ 45{
38 return (char *)&ftrace_nop; 46 return (char *)&ftrace_nop;
39} 47}
40 48
41unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) 49static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
42{ 50{
43 static unsigned int op; 51 static unsigned int op;
44 52
@@ -68,49 +76,422 @@ unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
68# define _ASM_PTR " .long " 76# define _ASM_PTR " .long "
69#endif 77#endif
70 78
71int 79static int
72ftrace_modify_code(unsigned long ip, unsigned char *old_code, 80ftrace_modify_code(unsigned long ip, unsigned char *old_code,
73 unsigned char *new_code) 81 unsigned char *new_code)
74{ 82{
75 unsigned replaced; 83 unsigned char replaced[MCOUNT_INSN_SIZE];
76 unsigned old = *(unsigned *)old_code;
77 unsigned new = *(unsigned *)new_code;
78 int faulted = 0;
79 84
80 /* 85 /*
81 * Note: Due to modules and __init, code can 86 * Note: Due to modules and __init, code can
82 * disappear and change, we need to protect against faulting 87 * disappear and change, we need to protect against faulting
83 * as well as code changing. 88 * as well as code changing. We do this by using the
89 * probe_kernel_* functions.
84 * 90 *
85 * No real locking needed, this code is run through 91 * No real locking needed, this code is run through
86 * kstop_machine. 92 * kstop_machine, or before SMP starts.
87 */ 93 */
88 asm volatile ( 94
89 "1: lwz %1, 0(%2)\n" 95 /* read the text we want to modify */
90 " cmpw %1, %5\n" 96 if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
91 " bne 2f\n" 97 return -EFAULT;
92 " stwu %3, 0(%2)\n" 98
93 "2:\n" 99 /* Make sure it is what we expect it to be */
94 ".section .fixup, \"ax\"\n" 100 if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0)
95 "3: li %0, 1\n" 101 return -EINVAL;
96 " b 2b\n" 102
97 ".previous\n" 103 /* replace the text with the new text */
98 ".section __ex_table,\"a\"\n" 104 if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE))
99 _ASM_ALIGN "\n" 105 return -EPERM;
100 _ASM_PTR "1b, 3b\n" 106
101 ".previous" 107 flush_icache_range(ip, ip + 8);
102 : "=r"(faulted), "=r"(replaced) 108
103 : "r"(ip), "r"(new), 109 return 0;
104 "0"(faulted), "r"(old) 110}
105 : "memory"); 111
106 112/*
107 if (replaced != old && replaced != new) 113 * Helper functions that are the same for both PPC64 and PPC32.
108 faulted = 2; 114 */
109 115static int test_24bit_addr(unsigned long ip, unsigned long addr)
110 if (!faulted) 116{
111 flush_icache_range(ip, ip + 8); 117
112 118 /* use the create_branch to verify that this offset can be branched */
113 return faulted; 119 return create_branch((unsigned int *)ip, addr, 0);
120}
121
122static int is_bl_op(unsigned int op)
123{
124 return (op & 0xfc000003) == 0x48000001;
125}
126
127static unsigned long find_bl_target(unsigned long ip, unsigned int op)
128{
129 static int offset;
130
131 offset = (op & 0x03fffffc);
132 /* make it signed */
133 if (offset & 0x02000000)
134 offset |= 0xfe000000;
135
136 return ip + (long)offset;
137}
138
139#ifdef CONFIG_PPC64
140static int
141__ftrace_make_nop(struct module *mod,
142 struct dyn_ftrace *rec, unsigned long addr)
143{
144 unsigned int op;
145 unsigned int jmp[5];
146 unsigned long ptr;
147 unsigned long ip = rec->ip;
148 unsigned long tramp;
149 int offset;
150
151 /* read where this goes */
152 if (probe_kernel_read(&op, (void *)ip, sizeof(int)))
153 return -EFAULT;
154
155 /* Make sure that that this is still a 24bit jump */
156 if (!is_bl_op(op)) {
157 printk(KERN_ERR "Not expected bl: opcode is %x\n", op);
158 return -EINVAL;
159 }
160
161 /* lets find where the pointer goes */
162 tramp = find_bl_target(ip, op);
163
164 /*
165 * On PPC64 the trampoline looks like:
166 * 0x3d, 0x82, 0x00, 0x00, addis r12,r2, <high>
167 * 0x39, 0x8c, 0x00, 0x00, addi r12,r12, <low>
168 * Where the bytes 2,3,6 and 7 make up the 32bit offset
169 * to the TOC that holds the pointer.
170 * to jump to.
171 * 0xf8, 0x41, 0x00, 0x28, std r2,40(r1)
172 * 0xe9, 0x6c, 0x00, 0x20, ld r11,32(r12)
173 * The actually address is 32 bytes from the offset
174 * into the TOC.
175 * 0xe8, 0x4c, 0x00, 0x28, ld r2,40(r12)
176 */
177
178 DEBUGP("ip:%lx jumps to %lx r2: %lx", ip, tramp, mod->arch.toc);
179
180 /* Find where the trampoline jumps to */
181 if (probe_kernel_read(jmp, (void *)tramp, sizeof(jmp))) {
182 printk(KERN_ERR "Failed to read %lx\n", tramp);
183 return -EFAULT;
184 }
185
186 DEBUGP(" %08x %08x", jmp[0], jmp[1]);
187
188 /* verify that this is what we expect it to be */
189 if (((jmp[0] & 0xffff0000) != 0x3d820000) ||
190 ((jmp[1] & 0xffff0000) != 0x398c0000) ||
191 (jmp[2] != 0xf8410028) ||
192 (jmp[3] != 0xe96c0020) ||
193 (jmp[4] != 0xe84c0028)) {
194 printk(KERN_ERR "Not a trampoline\n");
195 return -EINVAL;
196 }
197
198 offset = (unsigned)((unsigned short)jmp[0]) << 16 |
199 (unsigned)((unsigned short)jmp[1]);
200
201 DEBUGP(" %x ", offset);
202
203 /* get the address this jumps too */
204 tramp = mod->arch.toc + offset + 32;
205 DEBUGP("toc: %lx", tramp);
206
207 if (probe_kernel_read(jmp, (void *)tramp, 8)) {
208 printk(KERN_ERR "Failed to read %lx\n", tramp);
209 return -EFAULT;
210 }
211
212 DEBUGP(" %08x %08x\n", jmp[0], jmp[1]);
213
214 ptr = ((unsigned long)jmp[0] << 32) + jmp[1];
215
216 /* This should match what was called */
217 if (ptr != GET_ADDR(addr)) {
218 printk(KERN_ERR "addr does not match %lx\n", ptr);
219 return -EINVAL;
220 }
221
222 /*
223 * We want to nop the line, but the next line is
224 * 0xe8, 0x41, 0x00, 0x28 ld r2,40(r1)
225 * This needs to be turned to a nop too.
226 */
227 if (probe_kernel_read(&op, (void *)(ip+4), MCOUNT_INSN_SIZE))
228 return -EFAULT;
229
230 if (op != 0xe8410028) {
231 printk(KERN_ERR "Next line is not ld! (%08x)\n", op);
232 return -EINVAL;
233 }
234
235 /*
236 * Milton Miller pointed out that we can not blindly do nops.
237 * If a task was preempted when calling a trace function,
238 * the nops will remove the way to restore the TOC in r2
239 * and the r2 TOC will get corrupted.
240 */
241
242 /*
243 * Replace:
244 * bl <tramp> <==== will be replaced with "b 1f"
245 * ld r2,40(r1)
246 * 1:
247 */
248 op = 0x48000008; /* b +8 */
249
250 if (probe_kernel_write((void *)ip, &op, MCOUNT_INSN_SIZE))
251 return -EPERM;
252
253
254 flush_icache_range(ip, ip + 8);
255
256 return 0;
257}
258
259#else /* !PPC64 */
260static int
261__ftrace_make_nop(struct module *mod,
262 struct dyn_ftrace *rec, unsigned long addr)
263{
264 unsigned int op;
265 unsigned int jmp[4];
266 unsigned long ip = rec->ip;
267 unsigned long tramp;
268
269 if (probe_kernel_read(&op, (void *)ip, MCOUNT_INSN_SIZE))
270 return -EFAULT;
271
272 /* Make sure that this is still a 24bit jump */
273 if (!is_bl_op(op)) {
274 printk(KERN_ERR "Not expected bl: opcode is %x\n", op);
275 return -EINVAL;
276 }
277
278 /* lets find where the pointer goes */
279 tramp = find_bl_target(ip, op);
280
281 /*
282 * On PPC32 the trampoline looks like:
283 * 0x3d, 0x60, 0x00, 0x00 lis r11,sym@ha
284 * 0x39, 0x6b, 0x00, 0x00 addi r11,r11,sym@l
285 * 0x7d, 0x69, 0x03, 0xa6 mtctr r11
286 * 0x4e, 0x80, 0x04, 0x20 bctr
287 */
288
289 DEBUGP("ip:%lx jumps to %lx", ip, tramp);
290
291 /* Find where the trampoline jumps to */
292 if (probe_kernel_read(jmp, (void *)tramp, sizeof(jmp))) {
293 printk(KERN_ERR "Failed to read %lx\n", tramp);
294 return -EFAULT;
295 }
296
297 DEBUGP(" %08x %08x ", jmp[0], jmp[1]);
298
299 /* verify that this is what we expect it to be */
300 if (((jmp[0] & 0xffff0000) != 0x3d600000) ||
301 ((jmp[1] & 0xffff0000) != 0x396b0000) ||
302 (jmp[2] != 0x7d6903a6) ||
303 (jmp[3] != 0x4e800420)) {
304 printk(KERN_ERR "Not a trampoline\n");
305 return -EINVAL;
306 }
307
308 tramp = (jmp[1] & 0xffff) |
309 ((jmp[0] & 0xffff) << 16);
310 if (tramp & 0x8000)
311 tramp -= 0x10000;
312
313 DEBUGP(" %x ", tramp);
314
315 if (tramp != addr) {
316 printk(KERN_ERR
317 "Trampoline location %08lx does not match addr\n",
318 tramp);
319 return -EINVAL;
320 }
321
322 op = PPC_NOP_INSTR;
323
324 if (probe_kernel_write((void *)ip, &op, MCOUNT_INSN_SIZE))
325 return -EPERM;
326
327 flush_icache_range(ip, ip + 8);
328
329 return 0;
330}
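The recombination of the lis/addi halves above undoes how a 32-bit module PLT stub is typically assembled: since addi sign-extends its 16-bit operand, the lis value is normally built from (addr + 0x8000) >> 16, so a set bit 15 in the low half means the high half was rounded up by one. A worked example with a hypothetical target address of 0x12348900:

/*
 * Hypothetical stub for 0x12348900:
 *   lis  r11,0x1235        because (0x12348900 + 0x8000) >> 16 == 0x1235
 *   addi r11,r11,-0x7700   the low half 0x8900 sign-extends to -0x7700
 * Recombining as in __ftrace_make_nop():
 *   (0x1235 << 16) | 0x8900 == 0x12358900
 *   bit 15 of the low half is set, so subtract 0x10000 -> 0x12348900
 */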
331#endif /* PPC64 */
332
333int ftrace_make_nop(struct module *mod,
334 struct dyn_ftrace *rec, unsigned long addr)
335{
336 unsigned char *old, *new;
337 unsigned long ip = rec->ip;
338
339 /*
340 * If the calling address is more than 24 bits away,
341 * then we had to use a trampoline to make the call.
342 * Otherwise just update the call site.
343 */
344 if (test_24bit_addr(ip, addr)) {
345 /* within range */
346 old = ftrace_call_replace(ip, addr);
347 new = ftrace_nop_replace();
348 return ftrace_modify_code(ip, old, new);
349 }
350
351 /*
352 * Out of range jumps are called from modules.
353 * We should either already have a pointer to the module
354 * or it has been passed in.
355 */
356 if (!rec->arch.mod) {
357 if (!mod) {
358 printk(KERN_ERR "No module loaded addr=%lx\n",
359 addr);
360 return -EFAULT;
361 }
362 rec->arch.mod = mod;
363 } else if (mod) {
364 if (mod != rec->arch.mod) {
365 printk(KERN_ERR
366 "Record mod %p not equal to passed in mod %p\n",
367 rec->arch.mod, mod);
368 return -EINVAL;
369 }
370 /* nothing to do if mod == rec->arch.mod */
371 } else
372 mod = rec->arch.mod;
373
374 return __ftrace_make_nop(mod, rec, addr);
375
376}
377
378#ifdef CONFIG_PPC64
379static int
380__ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
381{
382 unsigned int op[2];
383 unsigned long ip = rec->ip;
384
385 /* read where this goes */
386 if (probe_kernel_read(op, (void *)ip, MCOUNT_INSN_SIZE * 2))
387 return -EFAULT;
388
389 /*
390 * It should be pointing to two nops or
391 * b +8; ld r2,40(r1)
392 */
393 if (((op[0] != 0x48000008) || (op[1] != 0xe8410028)) &&
394 ((op[0] != PPC_NOP_INSTR) || (op[1] != PPC_NOP_INSTR))) {
395 printk(KERN_ERR "Expected NOPs but have %x %x\n", op[0], op[1]);
396 return -EINVAL;
397 }
398
399 /* If we never set up a trampoline to ftrace_caller, then bail */
400 if (!rec->arch.mod->arch.tramp) {
401 printk(KERN_ERR "No ftrace trampoline\n");
402 return -EINVAL;
403 }
404
405 /* create the branch to the trampoline */
406 op[0] = create_branch((unsigned int *)ip,
407 rec->arch.mod->arch.tramp, BRANCH_SET_LINK);
408 if (!op[0]) {
409 printk(KERN_ERR "REL24 out of range!\n");
410 return -EINVAL;
411 }
412
413 /* ld r2,40(r1) */
414 op[1] = 0xe8410028;
415
416 DEBUGP("write to %lx\n", rec->ip);
417
418 if (probe_kernel_write((void *)ip, op, MCOUNT_INSN_SIZE * 2))
419 return -EPERM;
420
421 flush_icache_range(ip, ip + 8);
422
423 return 0;
424}
425#else
426static int
427__ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
428{
429 unsigned int op;
430 unsigned long ip = rec->ip;
431
432 /* read where this goes */
433 if (probe_kernel_read(&op, (void *)ip, MCOUNT_INSN_SIZE))
434 return -EFAULT;
435
436 /* It should be pointing to a nop */
437 if (op != PPC_NOP_INSTR) {
438 printk(KERN_ERR "Expected NOP but have %x\n", op);
439 return -EINVAL;
440 }
441
442 /* If we never set up a trampoline to ftrace_caller, then bail */
443 if (!rec->arch.mod->arch.tramp) {
444 printk(KERN_ERR "No ftrace trampoline\n");
445 return -EINVAL;
446 }
447
448 /* create the branch to the trampoline */
449 op = create_branch((unsigned int *)ip,
450 rec->arch.mod->arch.tramp, BRANCH_SET_LINK);
451 if (!op) {
452 printk(KERN_ERR "REL24 out of range!\n");
453 return -EINVAL;
454 }
455
456 DEBUGP("write to %lx\n", rec->ip);
457
458 if (probe_kernel_write((void *)ip, &op, MCOUNT_INSN_SIZE))
459 return -EPERM;
460
461 flush_icache_range(ip, ip + 8);
462
463 return 0;
464}
465#endif /* CONFIG_PPC64 */
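Both variants hand the actual instruction encoding to create_branch(), which lives in arch/powerpc/lib/code-patching.c and returns 0 when the target cannot be reached by a 24-bit relative branch; that zero return is what the "REL24 out of range" paths above catch. A simplified sketch of the encoding it performs:

unsigned int create_branch(const unsigned int *addr,
			   unsigned long target, int flags)
{
	long offset = target;

	if (!(flags & BRANCH_ABSOLUTE))
		offset -= (unsigned long)addr;

	/* the I-form LI field covers +/- 32MB and must be word aligned */
	if (offset < -0x2000000 || offset > 0x1fffffc || (offset & 0x3))
		return 0;

	/* primary opcode 18 | AA/LK flags | LI */
	return 0x48000000 | (flags & 0x3) | (offset & 0x03fffffc);
}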
466
467int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
468{
469 unsigned char *old, *new;
470 unsigned long ip = rec->ip;
471
472 /*
473 * If the calling address is more than 24 bits away,
474 * then we had to use a trampoline to make the call.
475 * Otherwise just update the call site.
476 */
477 if (test_24bit_addr(ip, addr)) {
478 /* within range */
479 old = ftrace_nop_replace();
480 new = ftrace_call_replace(ip, addr);
481 return ftrace_modify_code(ip, old, new);
482 }
483
484 /*
485 * Out of range jumps are called from modules.
486 * Since we are converting from a nop, it had better
487 * already have a module defined.
488 */
489 if (!rec->arch.mod) {
490 printk(KERN_ERR "No module loaded\n");
491 return -EINVAL;
492 }
493
494 return __ftrace_make_call(rec, addr);
114} 495}
115 496
116int ftrace_update_ftrace_func(ftrace_func_t func) 497int ftrace_update_ftrace_func(ftrace_func_t func)
@@ -128,10 +509,10 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
128 509
129int __init ftrace_dyn_arch_init(void *data) 510int __init ftrace_dyn_arch_init(void *data)
130{ 511{
131 /* This is running in kstop_machine */ 512 /* caller expects data to be zero */
513 unsigned long *p = data;
132 514
133 ftrace_mcount_set(data); 515 *p = 0;
134 516
135 return 0; 517 return 0;
136} 518}
137
diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c
index 31982d05d81a..88d9c1d5e5fb 100644
--- a/arch/powerpc/kernel/idle.c
+++ b/arch/powerpc/kernel/idle.c
@@ -69,10 +69,15 @@ void cpu_idle(void)
69 smp_mb(); 69 smp_mb();
70 local_irq_disable(); 70 local_irq_disable();
71 71
72 /* Don't trace irqs off for idle */
73 stop_critical_timings();
74
72 /* check again after disabling irqs */ 75 /* check again after disabling irqs */
73 if (!need_resched() && !cpu_should_die()) 76 if (!need_resched() && !cpu_should_die())
74 ppc_md.power_save(); 77 ppc_md.power_save();
75 78
79 start_critical_timings();
80
76 local_irq_enable(); 81 local_irq_enable();
77 set_thread_flag(TIF_POLLING_NRFLAG); 82 set_thread_flag(TIF_POLLING_NRFLAG);
78 83
diff --git a/arch/powerpc/kernel/module_32.c b/arch/powerpc/kernel/module_32.c
index 2df91a03462a..f832773fc28e 100644
--- a/arch/powerpc/kernel/module_32.c
+++ b/arch/powerpc/kernel/module_32.c
@@ -22,6 +22,7 @@
22#include <linux/fs.h> 22#include <linux/fs.h>
23#include <linux/string.h> 23#include <linux/string.h>
24#include <linux/kernel.h> 24#include <linux/kernel.h>
25#include <linux/ftrace.h>
25#include <linux/cache.h> 26#include <linux/cache.h>
26#include <linux/bug.h> 27#include <linux/bug.h>
27#include <linux/sort.h> 28#include <linux/sort.h>
@@ -53,6 +54,9 @@ static unsigned int count_relocs(const Elf32_Rela *rela, unsigned int num)
53 r_addend = rela[i].r_addend; 54 r_addend = rela[i].r_addend;
54 } 55 }
55 56
57#ifdef CONFIG_DYNAMIC_FTRACE
58 _count_relocs++; /* add one for ftrace_caller */
59#endif
56 return _count_relocs; 60 return _count_relocs;
57} 61}
58 62
@@ -306,5 +310,11 @@ int apply_relocate_add(Elf32_Shdr *sechdrs,
306 return -ENOEXEC; 310 return -ENOEXEC;
307 } 311 }
308 } 312 }
313#ifdef CONFIG_DYNAMIC_FTRACE
314 module->arch.tramp =
315 do_plt_call(module->module_core,
316 (unsigned long)ftrace_caller,
317 sechdrs, module);
318#endif
309 return 0; 319 return 0;
310} 320}
diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c
index 1af2377e4992..8992b031a7b6 100644
--- a/arch/powerpc/kernel/module_64.c
+++ b/arch/powerpc/kernel/module_64.c
@@ -20,6 +20,7 @@
20#include <linux/moduleloader.h> 20#include <linux/moduleloader.h>
21#include <linux/err.h> 21#include <linux/err.h>
22#include <linux/vmalloc.h> 22#include <linux/vmalloc.h>
23#include <linux/ftrace.h>
23#include <linux/bug.h> 24#include <linux/bug.h>
24#include <asm/module.h> 25#include <asm/module.h>
25#include <asm/firmware.h> 26#include <asm/firmware.h>
@@ -163,6 +164,11 @@ static unsigned long get_stubs_size(const Elf64_Ehdr *hdr,
163 } 164 }
164 } 165 }
165 166
167#ifdef CONFIG_DYNAMIC_FTRACE
168 /* make the trampoline to the ftrace_caller */
169 relocs++;
170#endif
171
166 DEBUGP("Looks like a total of %lu stubs, max\n", relocs); 172 DEBUGP("Looks like a total of %lu stubs, max\n", relocs);
167 return relocs * sizeof(struct ppc64_stub_entry); 173 return relocs * sizeof(struct ppc64_stub_entry);
168} 174}
@@ -441,5 +447,12 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
441 } 447 }
442 } 448 }
443 449
450#ifdef CONFIG_DYNAMIC_FTRACE
451 me->arch.toc = my_r2(sechdrs, me);
452 me->arch.tramp = stub_for_addr(sechdrs,
453 (unsigned long)ftrace_caller,
454 me);
455#endif
456
444 return 0; 457 return 0;
445} 458}
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index d69912c07ce7..8db35278a4b4 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -6,6 +6,9 @@ ifeq ($(CONFIG_PPC64),y)
6EXTRA_CFLAGS += -mno-minimal-toc 6EXTRA_CFLAGS += -mno-minimal-toc
7endif 7endif
8 8
9CFLAGS_REMOVE_code-patching.o = -pg
10CFLAGS_REMOVE_feature-fixups.o = -pg
11
9obj-y := string.o alloc.o \ 12obj-y := string.o alloc.o \
10 checksum_$(CONFIG_WORD_SIZE).o 13 checksum_$(CONFIG_WORD_SIZE).o
11obj-$(CONFIG_PPC32) += div64.o copy_32.o crtsavres.o 14obj-$(CONFIG_PPC32) += div64.o copy_32.o crtsavres.o
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c
index a947899dcba1..bf96f1b5c6ec 100644
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -212,7 +212,7 @@ static void update_cpu_core_map(void)
212 cpu_core_map[cpu] = cpu_coregroup_map(cpu); 212 cpu_core_map[cpu] = cpu_coregroup_map(cpu);
213} 213}
214 214
215void arch_update_cpu_topology(void) 215int arch_update_cpu_topology(void)
216{ 216{
217 struct tl_info *info = tl_info; 217 struct tl_info *info = tl_info;
218 struct sys_device *sysdev; 218 struct sys_device *sysdev;
@@ -221,7 +221,7 @@ void arch_update_cpu_topology(void)
221 if (!machine_has_topology) { 221 if (!machine_has_topology) {
222 update_cpu_core_map(); 222 update_cpu_core_map();
223 topology_update_polarization_simple(); 223 topology_update_polarization_simple();
224 return; 224 return 0;
225 } 225 }
226 stsi(info, 15, 1, 2); 226 stsi(info, 15, 1, 2);
227 tl_to_cores(info); 227 tl_to_cores(info);
@@ -230,6 +230,7 @@ void arch_update_cpu_topology(void)
230 sysdev = get_cpu_sysdev(cpu); 230 sysdev = get_cpu_sysdev(cpu);
231 kobject_uevent(&sysdev->kobj, KOBJ_CHANGE); 231 kobject_uevent(&sysdev->kobj, KOBJ_CHANGE);
232 } 232 }
233 return 1;
233} 234}
234 235
235static void topology_work_fn(struct work_struct *work) 236static void topology_work_fn(struct work_struct *work)
diff --git a/arch/sh/include/asm/topology.h b/arch/sh/include/asm/topology.h
index 95f0085e098a..279d9cc4a007 100644
--- a/arch/sh/include/asm/topology.h
+++ b/arch/sh/include/asm/topology.h
@@ -5,7 +5,6 @@
5 5
6/* sched_domains SD_NODE_INIT for sh machines */ 6/* sched_domains SD_NODE_INIT for sh machines */
7#define SD_NODE_INIT (struct sched_domain) { \ 7#define SD_NODE_INIT (struct sched_domain) { \
8 .span = CPU_MASK_NONE, \
9 .parent = NULL, \ 8 .parent = NULL, \
10 .child = NULL, \ 9 .child = NULL, \
11 .groups = NULL, \ 10 .groups = NULL, \
diff --git a/arch/um/include/asm/system.h b/arch/um/include/asm/system.h
index 753346e2cdfd..ae5f94d6317d 100644
--- a/arch/um/include/asm/system.h
+++ b/arch/um/include/asm/system.h
@@ -11,21 +11,21 @@ extern int get_signals(void);
11extern void block_signals(void); 11extern void block_signals(void);
12extern void unblock_signals(void); 12extern void unblock_signals(void);
13 13
14#define local_save_flags(flags) do { typecheck(unsigned long, flags); \ 14#define raw_local_save_flags(flags) do { typecheck(unsigned long, flags); \
15 (flags) = get_signals(); } while(0) 15 (flags) = get_signals(); } while(0)
16#define local_irq_restore(flags) do { typecheck(unsigned long, flags); \ 16#define raw_local_irq_restore(flags) do { typecheck(unsigned long, flags); \
17 set_signals(flags); } while(0) 17 set_signals(flags); } while(0)
18 18
19#define local_irq_save(flags) do { local_save_flags(flags); \ 19#define raw_local_irq_save(flags) do { raw_local_save_flags(flags); \
20 local_irq_disable(); } while(0) 20 raw_local_irq_disable(); } while(0)
21 21
22#define local_irq_enable() unblock_signals() 22#define raw_local_irq_enable() unblock_signals()
23#define local_irq_disable() block_signals() 23#define raw_local_irq_disable() block_signals()
24 24
25#define irqs_disabled() \ 25#define irqs_disabled() \
26({ \ 26({ \
27 unsigned long flags; \ 27 unsigned long flags; \
28 local_save_flags(flags); \ 28 raw_local_save_flags(flags); \
29 (flags == 0); \ 29 (flags == 0); \
30}) 30})
31 31
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index ac22bb7719f7..d99eeb7915c6 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -29,11 +29,14 @@ config X86
29 select HAVE_FTRACE_MCOUNT_RECORD 29 select HAVE_FTRACE_MCOUNT_RECORD
30 select HAVE_DYNAMIC_FTRACE 30 select HAVE_DYNAMIC_FTRACE
31 select HAVE_FUNCTION_TRACER 31 select HAVE_FUNCTION_TRACER
32 select HAVE_FUNCTION_GRAPH_TRACER
33 select HAVE_FUNCTION_TRACE_MCOUNT_TEST
32 select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) 34 select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
33 select HAVE_ARCH_KGDB if !X86_VOYAGER 35 select HAVE_ARCH_KGDB if !X86_VOYAGER
34 select HAVE_ARCH_TRACEHOOK 36 select HAVE_ARCH_TRACEHOOK
35 select HAVE_GENERIC_DMA_COHERENT if X86_32 37 select HAVE_GENERIC_DMA_COHERENT if X86_32
36 select HAVE_EFFICIENT_UNALIGNED_ACCESS 38 select HAVE_EFFICIENT_UNALIGNED_ACCESS
39 select USER_STACKTRACE_SUPPORT
37 40
38config ARCH_DEFCONFIG 41config ARCH_DEFCONFIG
39 string 42 string
@@ -238,6 +241,16 @@ config X86_HAS_BOOT_CPU_ID
238 def_bool y 241 def_bool y
239 depends on X86_VOYAGER 242 depends on X86_VOYAGER
240 243
244config SPARSE_IRQ
245 bool "Support sparse irq numbering"
246 depends on (PCI_MSI || HT_IRQ) && SMP
247 default y
248 help
249 This enables support for sparse irq, especially for MSI/MSI-X. You may
250 need it if you have lots of cards supporting MSI-X installed.
251
252 If you don't know what to do here, say Y.
253
241config X86_FIND_SMP_CONFIG 254config X86_FIND_SMP_CONFIG
242 def_bool y 255 def_bool y
243 depends on X86_MPPARSE || X86_VOYAGER 256 depends on X86_MPPARSE || X86_VOYAGER
@@ -367,10 +380,10 @@ config X86_RDC321X
367 as R-8610-(G). 380 as R-8610-(G).
368 If you don't have one of these chips, you should say N here. 381 If you don't have one of these chips, you should say N here.
369 382
370config SCHED_NO_NO_OMIT_FRAME_POINTER 383config SCHED_OMIT_FRAME_POINTER
371 def_bool y 384 def_bool y
372 prompt "Single-depth WCHAN output" 385 prompt "Single-depth WCHAN output"
373 depends on X86_32 386 depends on X86
374 help 387 help
375 Calculate simpler /proc/<PID>/wchan values. If this option 388 Calculate simpler /proc/<PID>/wchan values. If this option
376 is disabled then wchan values will recurse back to the 389 is disabled then wchan values will recurse back to the
@@ -465,10 +478,6 @@ config X86_CYCLONE_TIMER
465 def_bool y 478 def_bool y
466 depends on X86_GENERICARCH 479 depends on X86_GENERICARCH
467 480
468config ES7000_CLUSTERED_APIC
469 def_bool y
470 depends on SMP && X86_ES7000 && MPENTIUMIII
471
472source "arch/x86/Kconfig.cpu" 481source "arch/x86/Kconfig.cpu"
473 482
474config HPET_TIMER 483config HPET_TIMER
@@ -1632,13 +1641,6 @@ config APM_ALLOW_INTS
1632 many of the newer IBM Thinkpads. If you experience hangs when you 1641 many of the newer IBM Thinkpads. If you experience hangs when you
1633 suspend, try setting this to Y. Otherwise, say N. 1642 suspend, try setting this to Y. Otherwise, say N.
1634 1643
1635config APM_REAL_MODE_POWER_OFF
1636 bool "Use real mode APM BIOS call to power off"
1637 help
1638 Use real mode APM BIOS calls to switch off the computer. This is
1639 a work-around for a number of buggy BIOSes. Switch this option on if
1640 your computer crashes instead of powering off properly.
1641
1642endif # APM 1644endif # APM
1643 1645
1644source "arch/x86/kernel/cpu/cpufreq/Kconfig" 1646source "arch/x86/kernel/cpu/cpufreq/Kconfig"
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index b815664fe370..85a78575956c 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -515,6 +515,7 @@ config CPU_SUP_UMC_32
515config X86_DS 515config X86_DS
516 def_bool X86_PTRACE_BTS 516 def_bool X86_PTRACE_BTS
517 depends on X86_DEBUGCTLMSR 517 depends on X86_DEBUGCTLMSR
518 select HAVE_HW_BRANCH_TRACER
518 519
519config X86_PTRACE_BTS 520config X86_PTRACE_BTS
520 bool "Branch Trace Store" 521 bool "Branch Trace Store"
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 2a3dfbd5e677..fa013f529b74 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -186,14 +186,10 @@ config IOMMU_LEAK
186 Add a simple leak tracer to the IOMMU code. This is useful when you 186 Add a simple leak tracer to the IOMMU code. This is useful when you
187 are debugging a buggy device driver that leaks IOMMU mappings. 187 are debugging a buggy device driver that leaks IOMMU mappings.
188 188
189config MMIOTRACE_HOOKS
190 bool
191
192config MMIOTRACE 189config MMIOTRACE
193 bool "Memory mapped IO tracing" 190 bool "Memory mapped IO tracing"
194 depends on DEBUG_KERNEL && PCI 191 depends on DEBUG_KERNEL && PCI
195 select TRACING 192 select TRACING
196 select MMIOTRACE_HOOKS
197 help 193 help
198 Mmiotrace traces Memory Mapped I/O access and is meant for 194 Mmiotrace traces Memory Mapped I/O access and is meant for
199 debugging and reverse engineering. It is called from the ioremap 195 debugging and reverse engineering. It is called from the ioremap
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 3b1510b4fc57..25caa0738af5 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -193,6 +193,7 @@ extern u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask);
193static inline void lapic_shutdown(void) { } 193static inline void lapic_shutdown(void) { }
194#define local_apic_timer_c2_ok 1 194#define local_apic_timer_c2_ok 1
195static inline void init_apic_mappings(void) { } 195static inline void init_apic_mappings(void) { }
196static inline void disable_local_APIC(void) { }
196 197
197#endif /* !CONFIG_X86_LOCAL_APIC */ 198#endif /* !CONFIG_X86_LOCAL_APIC */
198 199
diff --git a/arch/x86/include/asm/bigsmp/apic.h b/arch/x86/include/asm/bigsmp/apic.h
index 1d9543b9d358..ce547f24a1cd 100644
--- a/arch/x86/include/asm/bigsmp/apic.h
+++ b/arch/x86/include/asm/bigsmp/apic.h
@@ -24,8 +24,6 @@ static inline cpumask_t target_cpus(void)
24#define INT_DELIVERY_MODE (dest_Fixed) 24#define INT_DELIVERY_MODE (dest_Fixed)
25#define INT_DEST_MODE (0) /* phys delivery to target proc */ 25#define INT_DEST_MODE (0) /* phys delivery to target proc */
26#define NO_BALANCE_IRQ (0) 26#define NO_BALANCE_IRQ (0)
27#define WAKE_SECONDARY_VIA_INIT
28
29 27
30static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid) 28static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid)
31{ 29{
diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h
index a95008457ea4..99b6c39774a4 100644
--- a/arch/x86/include/asm/ds.h
+++ b/arch/x86/include/asm/ds.h
@@ -7,13 +7,12 @@
7 * 7 *
8 * It manages: 8 * It manages:
9 * - per-thread and per-cpu allocation of BTS and PEBS 9 * - per-thread and per-cpu allocation of BTS and PEBS
10 * - buffer memory allocation (optional) 10 * - buffer overflow handling (to be done)
11 * - buffer overflow handling
12 * - buffer access 11 * - buffer access
13 * 12 *
14 * It assumes: 13 * It assumes:
15 * - get_task_struct on all parameter tasks 14 * - get_task_struct on all traced tasks
16 * - current is allowed to trace parameter tasks 15 * - current is allowed to trace tasks
17 * 16 *
18 * 17 *
19 * Copyright (C) 2007-2008 Intel Corporation. 18 * Copyright (C) 2007-2008 Intel Corporation.
@@ -26,11 +25,18 @@
26 25
27#include <linux/types.h> 26#include <linux/types.h>
28#include <linux/init.h> 27#include <linux/init.h>
28#include <linux/err.h>
29 29
30 30
31#ifdef CONFIG_X86_DS 31#ifdef CONFIG_X86_DS
32 32
33struct task_struct; 33struct task_struct;
34struct ds_tracer;
35struct bts_tracer;
36struct pebs_tracer;
37
38typedef void (*bts_ovfl_callback_t)(struct bts_tracer *);
39typedef void (*pebs_ovfl_callback_t)(struct pebs_tracer *);
34 40
35/* 41/*
36 * Request BTS or PEBS 42 * Request BTS or PEBS
@@ -38,60 +44,62 @@ struct task_struct;
38 * Due to alignement constraints, the actual buffer may be slightly 44 * Due to alignement constraints, the actual buffer may be slightly
39 * smaller than the requested or provided buffer. 45 * smaller than the requested or provided buffer.
40 * 46 *
41 * Returns 0 on success; -Eerrno otherwise 47 * Returns a pointer to a tracer structure on success, or
48 * ERR_PTR(errcode) on failure.
49 *
50 * The interrupt threshold is independent from the overflow callback
51 * to allow users to use their own overflow interrupt handling mechanism.
42 * 52 *
43 * task: the task to request recording for; 53 * task: the task to request recording for;
44 * NULL for per-cpu recording on the current cpu 54 * NULL for per-cpu recording on the current cpu
45 * base: the base pointer for the (non-pageable) buffer; 55 * base: the base pointer for the (non-pageable) buffer;
46 * NULL if buffer allocation requested 56 * size: the size of the provided buffer in bytes
47 * size: the size of the requested or provided buffer
48 * ovfl: pointer to a function to be called on buffer overflow; 57 * ovfl: pointer to a function to be called on buffer overflow;
49 * NULL if cyclic buffer requested 58 * NULL if cyclic buffer requested
59 * th: the interrupt threshold in records from the end of the buffer;
60 * -1 if no interrupt threshold is requested.
50 */ 61 */
51typedef void (*ds_ovfl_callback_t)(struct task_struct *); 62extern struct bts_tracer *ds_request_bts(struct task_struct *task,
52extern int ds_request_bts(struct task_struct *task, void *base, size_t size, 63 void *base, size_t size,
53 ds_ovfl_callback_t ovfl); 64 bts_ovfl_callback_t ovfl, size_t th);
54extern int ds_request_pebs(struct task_struct *task, void *base, size_t size, 65extern struct pebs_tracer *ds_request_pebs(struct task_struct *task,
55 ds_ovfl_callback_t ovfl); 66 void *base, size_t size,
67 pebs_ovfl_callback_t ovfl,
68 size_t th);
56 69
57/* 70/*
58 * Release BTS or PEBS resources 71 * Release BTS or PEBS resources
59 * 72 *
60 * Frees buffers allocated on ds_request.
61 *
62 * Returns 0 on success; -Eerrno otherwise 73 * Returns 0 on success; -Eerrno otherwise
63 * 74 *
64 * task: the task to release resources for; 75 * tracer: the tracer handle returned from ds_request_~()
65 * NULL to release resources for the current cpu
66 */ 76 */
67extern int ds_release_bts(struct task_struct *task); 77extern int ds_release_bts(struct bts_tracer *tracer);
68extern int ds_release_pebs(struct task_struct *task); 78extern int ds_release_pebs(struct pebs_tracer *tracer);
69 79
70/* 80/*
71 * Return the (array) index of the write pointer. 81 * Get the (array) index of the write pointer.
72 * (assuming an array of BTS/PEBS records) 82 * (assuming an array of BTS/PEBS records)
73 * 83 *
74 * Returns -Eerrno on error 84 * Returns 0 on success; -Eerrno on error
75 * 85 *
76 * task: the task to access; 86 * tracer: the tracer handle returned from ds_request_~()
77 * NULL to access the current cpu 87 * pos (out): will hold the result
78 * pos (out): if not NULL, will hold the result
79 */ 88 */
80extern int ds_get_bts_index(struct task_struct *task, size_t *pos); 89extern int ds_get_bts_index(struct bts_tracer *tracer, size_t *pos);
81extern int ds_get_pebs_index(struct task_struct *task, size_t *pos); 90extern int ds_get_pebs_index(struct pebs_tracer *tracer, size_t *pos);
82 91
83/* 92/*
84 * Return the (array) index one record beyond the end of the array. 93 * Get the (array) index one record beyond the end of the array.
85 * (assuming an array of BTS/PEBS records) 94 * (assuming an array of BTS/PEBS records)
86 * 95 *
87 * Returns -Eerrno on error 96 * Returns 0 on success; -Eerrno on error
88 * 97 *
89 * task: the task to access; 98 * tracer: the tracer handle returned from ds_request_~()
90 * NULL to access the current cpu 99 * pos (out): will hold the result
91 * pos (out): if not NULL, will hold the result
92 */ 100 */
93extern int ds_get_bts_end(struct task_struct *task, size_t *pos); 101extern int ds_get_bts_end(struct bts_tracer *tracer, size_t *pos);
94extern int ds_get_pebs_end(struct task_struct *task, size_t *pos); 102extern int ds_get_pebs_end(struct pebs_tracer *tracer, size_t *pos);
95 103
96/* 104/*
97 * Provide a pointer to the BTS/PEBS record at parameter index. 105 * Provide a pointer to the BTS/PEBS record at parameter index.
@@ -102,14 +110,13 @@ extern int ds_get_pebs_end(struct task_struct *task, size_t *pos);
102 * 110 *
103 * Returns the size of a single record on success; -Eerrno on error 111 * Returns the size of a single record on success; -Eerrno on error
104 * 112 *
105 * task: the task to access; 113 * tracer: the tracer handle returned from ds_request_~()
106 * NULL to access the current cpu
107 * index: the index of the requested record 114 * index: the index of the requested record
108 * record (out): pointer to the requested record 115 * record (out): pointer to the requested record
109 */ 116 */
110extern int ds_access_bts(struct task_struct *task, 117extern int ds_access_bts(struct bts_tracer *tracer,
111 size_t index, const void **record); 118 size_t index, const void **record);
112extern int ds_access_pebs(struct task_struct *task, 119extern int ds_access_pebs(struct pebs_tracer *tracer,
113 size_t index, const void **record); 120 size_t index, const void **record);
114 121
115/* 122/*
@@ -129,38 +136,24 @@ extern int ds_access_pebs(struct task_struct *task,
129 * 136 *
130 * Returns the number of bytes written or -Eerrno. 137 * Returns the number of bytes written or -Eerrno.
131 * 138 *
132 * task: the task to access; 139 * tracer: the tracer handle returned from ds_request_~()
133 * NULL to access the current cpu
134 * buffer: the buffer to write 140 * buffer: the buffer to write
135 * size: the size of the buffer 141 * size: the size of the buffer
136 */ 142 */
137extern int ds_write_bts(struct task_struct *task, 143extern int ds_write_bts(struct bts_tracer *tracer,
138 const void *buffer, size_t size); 144 const void *buffer, size_t size);
139extern int ds_write_pebs(struct task_struct *task, 145extern int ds_write_pebs(struct pebs_tracer *tracer,
140 const void *buffer, size_t size); 146 const void *buffer, size_t size);
141 147
142/* 148/*
143 * Same as ds_write_bts/pebs, but omit ownership checks.
144 *
145 * This is needed to have some other task than the owner of the
146 * BTS/PEBS buffer or the parameter task itself write into the
147 * respective buffer.
148 */
149extern int ds_unchecked_write_bts(struct task_struct *task,
150 const void *buffer, size_t size);
151extern int ds_unchecked_write_pebs(struct task_struct *task,
152 const void *buffer, size_t size);
153
154/*
155 * Reset the write pointer of the BTS/PEBS buffer. 149 * Reset the write pointer of the BTS/PEBS buffer.
156 * 150 *
157 * Returns 0 on success; -Eerrno on error 151 * Returns 0 on success; -Eerrno on error
158 * 152 *
159 * task: the task to access; 153 * tracer: the tracer handle returned from ds_request_~()
160 * NULL to access the current cpu
161 */ 154 */
162extern int ds_reset_bts(struct task_struct *task); 155extern int ds_reset_bts(struct bts_tracer *tracer);
163extern int ds_reset_pebs(struct task_struct *task); 156extern int ds_reset_pebs(struct pebs_tracer *tracer);
164 157
165/* 158/*
166 * Clear the BTS/PEBS buffer and reset the write pointer. 159 * Clear the BTS/PEBS buffer and reset the write pointer.
@@ -168,33 +161,30 @@ extern int ds_reset_pebs(struct task_struct *task);
168 * 161 *
169 * Returns 0 on success; -Eerrno on error 162 * Returns 0 on success; -Eerrno on error
170 * 163 *
171 * task: the task to access; 164 * tracer: the tracer handle returned from ds_request_~()
172 * NULL to access the current cpu
173 */ 165 */
174extern int ds_clear_bts(struct task_struct *task); 166extern int ds_clear_bts(struct bts_tracer *tracer);
175extern int ds_clear_pebs(struct task_struct *task); 167extern int ds_clear_pebs(struct pebs_tracer *tracer);
176 168
177/* 169/*
178 * Provide the PEBS counter reset value. 170 * Provide the PEBS counter reset value.
179 * 171 *
180 * Returns 0 on success; -Eerrno on error 172 * Returns 0 on success; -Eerrno on error
181 * 173 *
182 * task: the task to access; 174 * tracer: the tracer handle returned from ds_request_pebs()
183 * NULL to access the current cpu
184 * value (out): the counter reset value 175 * value (out): the counter reset value
185 */ 176 */
186extern int ds_get_pebs_reset(struct task_struct *task, u64 *value); 177extern int ds_get_pebs_reset(struct pebs_tracer *tracer, u64 *value);
187 178
188/* 179/*
189 * Set the PEBS counter reset value. 180 * Set the PEBS counter reset value.
190 * 181 *
191 * Returns 0 on success; -Eerrno on error 182 * Returns 0 on success; -Eerrno on error
192 * 183 *
193 * task: the task to access; 184 * tracer: the tracer handle returned from ds_request_pebs()
194 * NULL to access the current cpu
195 * value: the new counter reset value 185 * value: the new counter reset value
196 */ 186 */
197extern int ds_set_pebs_reset(struct task_struct *task, u64 value); 187extern int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value);
198 188
199/* 189/*
200 * Initialization 190 * Initialization
@@ -207,17 +197,13 @@ extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *);
207/* 197/*
208 * The DS context - part of struct thread_struct. 198 * The DS context - part of struct thread_struct.
209 */ 199 */
200#define MAX_SIZEOF_DS (12 * 8)
201
210struct ds_context { 202struct ds_context {
211 /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */ 203 /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */
212 unsigned char *ds; 204 unsigned char ds[MAX_SIZEOF_DS];
213 /* the owner of the BTS and PEBS configuration, respectively */ 205 /* the owner of the BTS and PEBS configuration, respectively */
214 struct task_struct *owner[2]; 206 struct ds_tracer *owner[2];
215 /* buffer overflow notification function for BTS and PEBS */
216 ds_ovfl_callback_t callback[2];
217 /* the original buffer address */
218 void *buffer[2];
219 /* the number of allocated pages for on-request allocated buffers */
220 unsigned int pages[2];
221 /* use count */ 207 /* use count */
222 unsigned long count; 208 unsigned long count;
223 /* a pointer to the context location inside the thread_struct 209 /* a pointer to the context location inside the thread_struct
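With the handle-based interface declared above, callers keep the tracer returned by ds_request_~() and pass it back to every accessor instead of a task pointer. A minimal, hypothetical per-cpu BTS user (buffer size and function names invented for illustration) could look like:

#include <linux/kernel.h>
#include <linux/err.h>
#include <asm/ds.h>

static struct bts_tracer *tracer;
static unsigned char bts_buf[4096];	/* caller-supplied, non-pageable buffer */

static int start_cpu_bts(void)
{
	/* NULL task => per-cpu tracing; no overflow callback, no threshold */
	tracer = ds_request_bts(NULL, bts_buf, sizeof(bts_buf),
				NULL, (size_t)-1);
	if (IS_ERR(tracer))
		return PTR_ERR(tracer);
	return 0;
}

static void stop_cpu_bts(void)
{
	size_t index;

	if (!ds_get_bts_index(tracer, &index))	/* returns 0 on success */
		printk(KERN_INFO "BTS write index: %zu\n", index);

	ds_release_bts(tracer);
}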
diff --git a/arch/x86/include/asm/emergency-restart.h b/arch/x86/include/asm/emergency-restart.h
index 94826cf87455..cc70c1c78ca4 100644
--- a/arch/x86/include/asm/emergency-restart.h
+++ b/arch/x86/include/asm/emergency-restart.h
@@ -8,7 +8,9 @@ enum reboot_type {
8 BOOT_BIOS = 'b', 8 BOOT_BIOS = 'b',
9#endif 9#endif
10 BOOT_ACPI = 'a', 10 BOOT_ACPI = 'a',
11 BOOT_EFI = 'e' 11 BOOT_EFI = 'e',
12 BOOT_CF9 = 'p',
13 BOOT_CF9_COND = 'q',
12}; 14};
13 15
14extern enum reboot_type reboot_type; 16extern enum reboot_type reboot_type;
diff --git a/arch/x86/include/asm/es7000/apic.h b/arch/x86/include/asm/es7000/apic.h
index 380f0b4f17ed..e24ef876915f 100644
--- a/arch/x86/include/asm/es7000/apic.h
+++ b/arch/x86/include/asm/es7000/apic.h
@@ -9,31 +9,27 @@ static inline int apic_id_registered(void)
9 return (1); 9 return (1);
10} 10}
11 11
12static inline cpumask_t target_cpus(void) 12static inline cpumask_t target_cpus_cluster(void)
13{ 13{
14#if defined CONFIG_ES7000_CLUSTERED_APIC
15 return CPU_MASK_ALL; 14 return CPU_MASK_ALL;
16#else 15}
16
17static inline cpumask_t target_cpus(void)
18{
17 return cpumask_of_cpu(smp_processor_id()); 19 return cpumask_of_cpu(smp_processor_id());
18#endif
19} 20}
20 21
21#if defined CONFIG_ES7000_CLUSTERED_APIC 22#define APIC_DFR_VALUE_CLUSTER (APIC_DFR_CLUSTER)
22#define APIC_DFR_VALUE (APIC_DFR_CLUSTER) 23#define INT_DELIVERY_MODE_CLUSTER (dest_LowestPrio)
23#define INT_DELIVERY_MODE (dest_LowestPrio) 24#define INT_DEST_MODE_CLUSTER (1) /* logical delivery broadcast to all procs */
24#define INT_DEST_MODE (1) /* logical delivery broadcast to all procs */ 25#define NO_BALANCE_IRQ_CLUSTER (1)
25#define NO_BALANCE_IRQ (1) 26
26#undef WAKE_SECONDARY_VIA_INIT
27#define WAKE_SECONDARY_VIA_MIP
28#else
29#define APIC_DFR_VALUE (APIC_DFR_FLAT) 27#define APIC_DFR_VALUE (APIC_DFR_FLAT)
30#define INT_DELIVERY_MODE (dest_Fixed) 28#define INT_DELIVERY_MODE (dest_Fixed)
31#define INT_DEST_MODE (0) /* phys delivery to target procs */ 29#define INT_DEST_MODE (0) /* phys delivery to target procs */
32#define NO_BALANCE_IRQ (0) 30#define NO_BALANCE_IRQ (0)
33#undef APIC_DEST_LOGICAL 31#undef APIC_DEST_LOGICAL
34#define APIC_DEST_LOGICAL 0x0 32#define APIC_DEST_LOGICAL 0x0
35#define WAKE_SECONDARY_VIA_INIT
36#endif
37 33
38static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid) 34static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid)
39{ 35{
@@ -60,6 +56,16 @@ static inline unsigned long calculate_ldr(int cpu)
60 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel 56 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
61 * document number 292116). So here it goes... 57 * document number 292116). So here it goes...
62 */ 58 */
59static inline void init_apic_ldr_cluster(void)
60{
61 unsigned long val;
62 int cpu = smp_processor_id();
63
64 apic_write(APIC_DFR, APIC_DFR_VALUE_CLUSTER);
65 val = calculate_ldr(cpu);
66 apic_write(APIC_LDR, val);
67}
68
63static inline void init_apic_ldr(void) 69static inline void init_apic_ldr(void)
64{ 70{
65 unsigned long val; 71 unsigned long val;
@@ -70,10 +76,6 @@ static inline void init_apic_ldr(void)
70 apic_write(APIC_LDR, val); 76 apic_write(APIC_LDR, val);
71} 77}
72 78
73#ifndef CONFIG_X86_GENERICARCH
74extern void enable_apic_mode(void);
75#endif
76
77extern int apic_version [MAX_APICS]; 79extern int apic_version [MAX_APICS];
78static inline void setup_apic_routing(void) 80static inline void setup_apic_routing(void)
79{ 81{
@@ -144,7 +146,7 @@ static inline int check_phys_apicid_present(int cpu_physical_apicid)
144 return (1); 146 return (1);
145} 147}
146 148
147static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) 149static inline unsigned int cpu_mask_to_apicid_cluster(cpumask_t cpumask)
148{ 150{
149 int num_bits_set; 151 int num_bits_set;
150 int cpus_found = 0; 152 int cpus_found = 0;
@@ -154,11 +156,7 @@ static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
154 num_bits_set = cpus_weight(cpumask); 156 num_bits_set = cpus_weight(cpumask);
155 /* Return id to all */ 157 /* Return id to all */
156 if (num_bits_set == NR_CPUS) 158 if (num_bits_set == NR_CPUS)
157#if defined CONFIG_ES7000_CLUSTERED_APIC
158 return 0xFF; 159 return 0xFF;
159#else
160 return cpu_to_logical_apicid(0);
161#endif
162 /* 160 /*
163 * The cpus in the mask must all be on the apic cluster. If are not 161 * The cpus in the mask must all be on the apic cluster. If are not
164 * on the same apicid cluster return default value of TARGET_CPUS. 162 * on the same apicid cluster return default value of TARGET_CPUS.
@@ -171,11 +169,40 @@ static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
171 if (apicid_cluster(apicid) != 169 if (apicid_cluster(apicid) !=
172 apicid_cluster(new_apicid)){ 170 apicid_cluster(new_apicid)){
173 printk ("%s: Not a valid mask!\n", __func__); 171 printk ("%s: Not a valid mask!\n", __func__);
174#if defined CONFIG_ES7000_CLUSTERED_APIC
175 return 0xFF; 172 return 0xFF;
176#else 173 }
174 apicid = new_apicid;
175 cpus_found++;
176 }
177 cpu++;
178 }
179 return apicid;
180}
181
182static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
183{
184 int num_bits_set;
185 int cpus_found = 0;
186 int cpu;
187 int apicid;
188
189 num_bits_set = cpus_weight(cpumask);
190 /* Return id to all */
191 if (num_bits_set == NR_CPUS)
192 return cpu_to_logical_apicid(0);
193 /*
194 * The cpus in the mask must all be on the apic cluster. If they are not
195 * on the same apicid cluster, return the default value of TARGET_CPUS.
196 */
197 cpu = first_cpu(cpumask);
198 apicid = cpu_to_logical_apicid(cpu);
199 while (cpus_found < num_bits_set) {
200 if (cpu_isset(cpu, cpumask)) {
201 int new_apicid = cpu_to_logical_apicid(cpu);
202 if (apicid_cluster(apicid) !=
203 apicid_cluster(new_apicid)){
204 printk ("%s: Not a valid mask!\n", __func__);
177 return cpu_to_logical_apicid(0); 205 return cpu_to_logical_apicid(0);
178#endif
179 } 206 }
180 apicid = new_apicid; 207 apicid = new_apicid;
181 cpus_found++; 208 cpus_found++;
diff --git a/arch/x86/include/asm/es7000/wakecpu.h b/arch/x86/include/asm/es7000/wakecpu.h
index 398493461913..78f0daaee436 100644
--- a/arch/x86/include/asm/es7000/wakecpu.h
+++ b/arch/x86/include/asm/es7000/wakecpu.h
@@ -1,36 +1,12 @@
1#ifndef __ASM_ES7000_WAKECPU_H 1#ifndef __ASM_ES7000_WAKECPU_H
2#define __ASM_ES7000_WAKECPU_H 2#define __ASM_ES7000_WAKECPU_H
3 3
4/* 4#define TRAMPOLINE_PHYS_LOW 0x467
5 * This file copes with machines that wakeup secondary CPUs by the 5#define TRAMPOLINE_PHYS_HIGH 0x469
6 * INIT, INIT, STARTUP sequence.
7 */
8
9#ifdef CONFIG_ES7000_CLUSTERED_APIC
10#define WAKE_SECONDARY_VIA_MIP
11#else
12#define WAKE_SECONDARY_VIA_INIT
13#endif
14
15#ifdef WAKE_SECONDARY_VIA_MIP
16extern int es7000_start_cpu(int cpu, unsigned long eip);
17static inline int
18wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
19{
20 int boot_error = 0;
21 boot_error = es7000_start_cpu(phys_apicid, start_eip);
22 return boot_error;
23}
24#endif
25
26#define TRAMPOLINE_LOW phys_to_virt(0x467)
27#define TRAMPOLINE_HIGH phys_to_virt(0x469)
28
29#define boot_cpu_apicid boot_cpu_physical_apicid
30 6
31static inline void wait_for_init_deassert(atomic_t *deassert) 7static inline void wait_for_init_deassert(atomic_t *deassert)
32{ 8{
33#ifdef WAKE_SECONDARY_VIA_INIT 9#ifndef CONFIG_ES7000_CLUSTERED_APIC
34 while (!atomic_read(deassert)) 10 while (!atomic_read(deassert))
35 cpu_relax(); 11 cpu_relax();
36#endif 12#endif
@@ -50,9 +26,12 @@ static inline void restore_NMI_vector(unsigned short *high, unsigned short *low)
50{ 26{
51} 27}
52 28
53#define inquire_remote_apic(apicid) do { \ 29extern void __inquire_remote_apic(int apicid);
54 if (apic_verbosity >= APIC_DEBUG) \ 30
55 __inquire_remote_apic(apicid); \ 31static inline void inquire_remote_apic(int apicid)
56 } while (0) 32{
33 if (apic_verbosity >= APIC_DEBUG)
34 __inquire_remote_apic(apicid);
35}
57 36
58#endif /* __ASM_MACH_WAKECPU_H */ 37#endif /* __ASM_MACH_WAKECPU_H */
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index 9e8bc29b8b17..7e61b4ceb9a4 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -17,8 +17,40 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr)
17 */ 17 */
18 return addr - 1; 18 return addr - 1;
19} 19}
20#endif
21 20
21#ifdef CONFIG_DYNAMIC_FTRACE
22
23struct dyn_arch_ftrace {
24 /* No extra data needed for x86 */
25};
26
27#endif /* CONFIG_DYNAMIC_FTRACE */
28#endif /* __ASSEMBLY__ */
22#endif /* CONFIG_FUNCTION_TRACER */ 29#endif /* CONFIG_FUNCTION_TRACER */
23 30
31#ifdef CONFIG_FUNCTION_GRAPH_TRACER
32
33#ifndef __ASSEMBLY__
34
35/*
36 * Stack of return addresses for functions
37 * of a thread.
38 * Used in struct thread_info
39 */
40struct ftrace_ret_stack {
41 unsigned long ret;
42 unsigned long func;
43 unsigned long long calltime;
44};
45
46/*
47 * Primary handler of a function return.
48 * It relies on ftrace_return_to_handler.
49 * Defined in entry_32.S
50 */
51extern void return_to_handler(void);
52
53#endif /* __ASSEMBLY__ */
54#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
55
24#endif /* _ASM_X86_FTRACE_H */ 56#endif /* _ASM_X86_FTRACE_H */
diff --git a/arch/x86/include/asm/genapic_32.h b/arch/x86/include/asm/genapic_32.h
index 5cbd4fcc06fd..0ac17d33a8c7 100644
--- a/arch/x86/include/asm/genapic_32.h
+++ b/arch/x86/include/asm/genapic_32.h
@@ -2,6 +2,7 @@
2#define _ASM_X86_GENAPIC_32_H 2#define _ASM_X86_GENAPIC_32_H
3 3
4#include <asm/mpspec.h> 4#include <asm/mpspec.h>
5#include <asm/atomic.h>
5 6
6/* 7/*
7 * Generic APIC driver interface. 8 * Generic APIC driver interface.
@@ -65,6 +66,14 @@ struct genapic {
65 void (*send_IPI_allbutself)(int vector); 66 void (*send_IPI_allbutself)(int vector);
66 void (*send_IPI_all)(int vector); 67 void (*send_IPI_all)(int vector);
67#endif 68#endif
69 int (*wakeup_cpu)(int apicid, unsigned long start_eip);
70 int trampoline_phys_low;
71 int trampoline_phys_high;
72 void (*wait_for_init_deassert)(atomic_t *deassert);
73 void (*smp_callin_clear_local_apic)(void);
74 void (*store_NMI_vector)(unsigned short *high, unsigned short *low);
75 void (*restore_NMI_vector)(unsigned short *high, unsigned short *low);
76 void (*inquire_remote_apic)(int apicid);
68}; 77};
69 78
70#define APICFUNC(x) .x = x, 79#define APICFUNC(x) .x = x,
@@ -105,16 +114,24 @@ struct genapic {
105 APICFUNC(get_apic_id) \ 114 APICFUNC(get_apic_id) \
106 .apic_id_mask = APIC_ID_MASK, \ 115 .apic_id_mask = APIC_ID_MASK, \
107 APICFUNC(cpu_mask_to_apicid) \ 116 APICFUNC(cpu_mask_to_apicid) \
108 APICFUNC(vector_allocation_domain) \ 117 APICFUNC(vector_allocation_domain) \
109 APICFUNC(acpi_madt_oem_check) \ 118 APICFUNC(acpi_madt_oem_check) \
110 IPIFUNC(send_IPI_mask) \ 119 IPIFUNC(send_IPI_mask) \
111 IPIFUNC(send_IPI_allbutself) \ 120 IPIFUNC(send_IPI_allbutself) \
112 IPIFUNC(send_IPI_all) \ 121 IPIFUNC(send_IPI_all) \
113 APICFUNC(enable_apic_mode) \ 122 APICFUNC(enable_apic_mode) \
114 APICFUNC(phys_pkg_id) \ 123 APICFUNC(phys_pkg_id) \
124 .trampoline_phys_low = TRAMPOLINE_PHYS_LOW, \
125 .trampoline_phys_high = TRAMPOLINE_PHYS_HIGH, \
126 APICFUNC(wait_for_init_deassert) \
127 APICFUNC(smp_callin_clear_local_apic) \
128 APICFUNC(store_NMI_vector) \
129 APICFUNC(restore_NMI_vector) \
130 APICFUNC(inquire_remote_apic) \
115} 131}
116 132
117extern struct genapic *genapic; 133extern struct genapic *genapic;
134extern void es7000_update_genapic_to_cluster(void);
118 135
119enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC}; 136enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
120#define get_uv_system_type() UV_NONE 137#define get_uv_system_type() UV_NONE
diff --git a/arch/x86/include/asm/genapic_64.h b/arch/x86/include/asm/genapic_64.h
index 13c4e96199ea..2cae011668b7 100644
--- a/arch/x86/include/asm/genapic_64.h
+++ b/arch/x86/include/asm/genapic_64.h
@@ -32,6 +32,8 @@ struct genapic {
32 unsigned int (*get_apic_id)(unsigned long x); 32 unsigned int (*get_apic_id)(unsigned long x);
33 unsigned long (*set_apic_id)(unsigned int id); 33 unsigned long (*set_apic_id)(unsigned int id);
34 unsigned long apic_id_mask; 34 unsigned long apic_id_mask;
35 /* wakeup_secondary_cpu */
36 int (*wakeup_cpu)(int apicid, unsigned long start_eip);
35}; 37};
36 38
37extern struct genapic *genapic; 39extern struct genapic *genapic;
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 6afd9933a7dd..25d527ca1362 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -188,17 +188,14 @@ extern void restore_IO_APIC_setup(void);
188extern void reinit_intr_remapped_IO_APIC(int); 188extern void reinit_intr_remapped_IO_APIC(int);
189#endif 189#endif
190 190
191extern int probe_nr_irqs(void); 191extern void probe_nr_irqs_gsi(void);
192 192
193#else /* !CONFIG_X86_IO_APIC */ 193#else /* !CONFIG_X86_IO_APIC */
194#define io_apic_assign_pci_irqs 0 194#define io_apic_assign_pci_irqs 0
195static const int timer_through_8259 = 0; 195static const int timer_through_8259 = 0;
196static inline void ioapic_init_mappings(void) { } 196static inline void ioapic_init_mappings(void) { }
197 197
198static inline int probe_nr_irqs(void) 198static inline void probe_nr_irqs_gsi(void) { }
199{
200 return NR_IRQS;
201}
202#endif 199#endif
203 200
204#endif /* _ASM_X86_IO_APIC_H */ 201#endif /* _ASM_X86_IO_APIC_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 0005adb0f941..f7ff65032b9d 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -101,12 +101,23 @@
101#define LAST_VM86_IRQ 15 101#define LAST_VM86_IRQ 15
102#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15) 102#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15)
103 103
104#define NR_IRQS_LEGACY 16
105
104#if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_X86_VOYAGER) 106#if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_X86_VOYAGER)
107
108#ifndef CONFIG_SPARSE_IRQ
105# if NR_CPUS < MAX_IO_APICS 109# if NR_CPUS < MAX_IO_APICS
106# define NR_IRQS (NR_VECTORS + (32 * NR_CPUS)) 110# define NR_IRQS (NR_VECTORS + (32 * NR_CPUS))
107# else 111# else
108# define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS)) 112# define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS))
109# endif 113# endif
114#else
115# if (8 * NR_CPUS) > (32 * MAX_IO_APICS)
116# define NR_IRQS (NR_VECTORS + (8 * NR_CPUS))
117# else
118# define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS))
119# endif
120#endif
110 121
111#elif defined(CONFIG_X86_VOYAGER) 122#elif defined(CONFIG_X86_VOYAGER)
112 123
diff --git a/arch/x86/include/asm/mach-default/mach_apic.h b/arch/x86/include/asm/mach-default/mach_apic.h
index ff3a6c236c00..6cb3a467e067 100644
--- a/arch/x86/include/asm/mach-default/mach_apic.h
+++ b/arch/x86/include/asm/mach-default/mach_apic.h
@@ -32,11 +32,13 @@ static inline cpumask_t target_cpus(void)
32#define vector_allocation_domain (genapic->vector_allocation_domain) 32#define vector_allocation_domain (genapic->vector_allocation_domain)
33#define read_apic_id() (GET_APIC_ID(apic_read(APIC_ID))) 33#define read_apic_id() (GET_APIC_ID(apic_read(APIC_ID)))
34#define send_IPI_self (genapic->send_IPI_self) 34#define send_IPI_self (genapic->send_IPI_self)
35#define wakeup_secondary_cpu (genapic->wakeup_cpu)
35extern void setup_apic_routing(void); 36extern void setup_apic_routing(void);
36#else 37#else
37#define INT_DELIVERY_MODE dest_LowestPrio 38#define INT_DELIVERY_MODE dest_LowestPrio
38#define INT_DEST_MODE 1 /* logical delivery broadcast to all procs */ 39#define INT_DEST_MODE 1 /* logical delivery broadcast to all procs */
39#define TARGET_CPUS (target_cpus()) 40#define TARGET_CPUS (target_cpus())
41#define wakeup_secondary_cpu wakeup_secondary_cpu_via_init
40/* 42/*
41 * Set up the logical destination ID. 43 * Set up the logical destination ID.
42 * 44 *
diff --git a/arch/x86/include/asm/mach-default/mach_wakecpu.h b/arch/x86/include/asm/mach-default/mach_wakecpu.h
index 9d80db91e992..ceb013660146 100644
--- a/arch/x86/include/asm/mach-default/mach_wakecpu.h
+++ b/arch/x86/include/asm/mach-default/mach_wakecpu.h
@@ -1,17 +1,8 @@
1#ifndef _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H 1#ifndef _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H
2#define _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H 2#define _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H
3 3
4/* 4#define TRAMPOLINE_PHYS_LOW (0x467)
5 * This file copes with machines that wakeup secondary CPUs by the 5#define TRAMPOLINE_PHYS_HIGH (0x469)
6 * INIT, INIT, STARTUP sequence.
7 */
8
9#define WAKE_SECONDARY_VIA_INIT
10
11#define TRAMPOLINE_LOW phys_to_virt(0x467)
12#define TRAMPOLINE_HIGH phys_to_virt(0x469)
13
14#define boot_cpu_apicid boot_cpu_physical_apicid
15 6
16static inline void wait_for_init_deassert(atomic_t *deassert) 7static inline void wait_for_init_deassert(atomic_t *deassert)
17{ 8{
@@ -33,9 +24,12 @@ static inline void restore_NMI_vector(unsigned short *high, unsigned short *low)
33{ 24{
34} 25}
35 26
36#define inquire_remote_apic(apicid) do { \ 27extern void __inquire_remote_apic(int apicid);
37 if (apic_verbosity >= APIC_DEBUG) \ 28
38 __inquire_remote_apic(apicid); \ 29static inline void inquire_remote_apic(int apicid)
39 } while (0) 30{
31 if (apic_verbosity >= APIC_DEBUG)
32 __inquire_remote_apic(apicid);
33}
40 34
41#endif /* _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H */ 35#endif /* _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H */
diff --git a/arch/x86/include/asm/mach-default/smpboot_hooks.h b/arch/x86/include/asm/mach-default/smpboot_hooks.h
index dbab36d64d48..23bf52103b89 100644
--- a/arch/x86/include/asm/mach-default/smpboot_hooks.h
+++ b/arch/x86/include/asm/mach-default/smpboot_hooks.h
@@ -13,9 +13,11 @@ static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
13 CMOS_WRITE(0xa, 0xf); 13 CMOS_WRITE(0xa, 0xf);
14 local_flush_tlb(); 14 local_flush_tlb();
15 pr_debug("1.\n"); 15 pr_debug("1.\n");
16 *((volatile unsigned short *) TRAMPOLINE_HIGH) = start_eip >> 4; 16 *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) =
17 start_eip >> 4;
17 pr_debug("2.\n"); 18 pr_debug("2.\n");
18 *((volatile unsigned short *) TRAMPOLINE_LOW) = start_eip & 0xf; 19 *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) =
20 start_eip & 0xf;
19 pr_debug("3.\n"); 21 pr_debug("3.\n");
20} 22}
21 23
@@ -32,7 +34,7 @@ static inline void smpboot_restore_warm_reset_vector(void)
32 */ 34 */
33 CMOS_WRITE(0, 0xf); 35 CMOS_WRITE(0, 0xf);
34 36
35 *((volatile long *) phys_to_virt(0x467)) = 0; 37 *((volatile long *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
36} 38}
37 39
38static inline void __init smpboot_setup_io_apic(void) 40static inline void __init smpboot_setup_io_apic(void)
diff --git a/arch/x86/include/asm/mach-generic/mach_apic.h b/arch/x86/include/asm/mach-generic/mach_apic.h
index 5180bd7478fb..e430f47df667 100644
--- a/arch/x86/include/asm/mach-generic/mach_apic.h
+++ b/arch/x86/include/asm/mach-generic/mach_apic.h
@@ -27,6 +27,7 @@
27#define vector_allocation_domain (genapic->vector_allocation_domain) 27#define vector_allocation_domain (genapic->vector_allocation_domain)
28#define enable_apic_mode (genapic->enable_apic_mode) 28#define enable_apic_mode (genapic->enable_apic_mode)
29#define phys_pkg_id (genapic->phys_pkg_id) 29#define phys_pkg_id (genapic->phys_pkg_id)
30#define wakeup_secondary_cpu (genapic->wakeup_cpu)
30 31
31extern void generic_bigsmp_probe(void); 32extern void generic_bigsmp_probe(void);
32 33
diff --git a/arch/x86/include/asm/mach-generic/mach_wakecpu.h b/arch/x86/include/asm/mach-generic/mach_wakecpu.h
new file mode 100644
index 000000000000..1ab16b168c8a
--- /dev/null
+++ b/arch/x86/include/asm/mach-generic/mach_wakecpu.h
@@ -0,0 +1,12 @@
1#ifndef _ASM_X86_MACH_GENERIC_MACH_WAKECPU_H
2#define _ASM_X86_MACH_GENERIC_MACH_WAKECPU_H
3
4#define TRAMPOLINE_PHYS_LOW (genapic->trampoline_phys_low)
5#define TRAMPOLINE_PHYS_HIGH (genapic->trampoline_phys_high)
6#define wait_for_init_deassert (genapic->wait_for_init_deassert)
7#define smp_callin_clear_local_apic (genapic->smp_callin_clear_local_apic)
8#define store_NMI_vector (genapic->store_NMI_vector)
9#define restore_NMI_vector (genapic->restore_NMI_vector)
10#define inquire_remote_apic (genapic->inquire_remote_apic)
11
12#endif /* _ASM_X86_MACH_GENERIC_MACH_WAKECPU_H */
diff --git a/arch/x86/include/asm/numaq/wakecpu.h b/arch/x86/include/asm/numaq/wakecpu.h
index c577bda5b1c5..6f499df8eddb 100644
--- a/arch/x86/include/asm/numaq/wakecpu.h
+++ b/arch/x86/include/asm/numaq/wakecpu.h
@@ -3,12 +3,8 @@
3 3
4/* This file copes with machines that wakeup secondary CPUs by NMIs */ 4/* This file copes with machines that wakeup secondary CPUs by NMIs */
5 5
6#define WAKE_SECONDARY_VIA_NMI 6#define TRAMPOLINE_PHYS_LOW (0x8)
7 7#define TRAMPOLINE_PHYS_HIGH (0xa)
8#define TRAMPOLINE_LOW phys_to_virt(0x8)
9#define TRAMPOLINE_HIGH phys_to_virt(0xa)
10
11#define boot_cpu_apicid boot_cpu_logical_apicid
12 8
13/* We don't do anything here because we use NMI's to boot instead */ 9/* We don't do anything here because we use NMI's to boot instead */
14static inline void wait_for_init_deassert(atomic_t *deassert) 10static inline void wait_for_init_deassert(atomic_t *deassert)
@@ -27,17 +23,23 @@ static inline void smp_callin_clear_local_apic(void)
27static inline void store_NMI_vector(unsigned short *high, unsigned short *low) 23static inline void store_NMI_vector(unsigned short *high, unsigned short *low)
28{ 24{
29 printk("Storing NMI vector\n"); 25 printk("Storing NMI vector\n");
30 *high = *((volatile unsigned short *) TRAMPOLINE_HIGH); 26 *high =
31 *low = *((volatile unsigned short *) TRAMPOLINE_LOW); 27 *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH));
28 *low =
29 *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW));
32} 30}
33 31
34static inline void restore_NMI_vector(unsigned short *high, unsigned short *low) 32static inline void restore_NMI_vector(unsigned short *high, unsigned short *low)
35{ 33{
36 printk("Restoring NMI vector\n"); 34 printk("Restoring NMI vector\n");
37 *((volatile unsigned short *) TRAMPOLINE_HIGH) = *high; 35 *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) =
38 *((volatile unsigned short *) TRAMPOLINE_LOW) = *low; 36 *high;
37 *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) =
38 *low;
39} 39}
40 40
41#define inquire_remote_apic(apicid) {} 41static inline void inquire_remote_apic(int apicid)
42{
43}
42 44
43#endif /* __ASM_NUMAQ_WAKECPU_H */ 45#endif /* __ASM_NUMAQ_WAKECPU_H */
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index f12d37237465..294daeb3a006 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -16,6 +16,8 @@ static inline void visws_early_detect(void) { }
16static inline int is_visws_box(void) { return 0; } 16static inline int is_visws_box(void) { return 0; }
17#endif 17#endif
18 18
19extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip);
20extern int wakeup_secondary_cpu_via_init(int apicid, unsigned long start_eip);
19/* 21/*
20 * Any setup quirks to be performed? 22 * Any setup quirks to be performed?
21 */ 23 */
@@ -39,6 +41,7 @@ struct x86_quirks {
39 void (*smp_read_mpc_oem)(struct mp_config_oemtable *oemtable, 41 void (*smp_read_mpc_oem)(struct mp_config_oemtable *oemtable,
40 unsigned short oemsize); 42 unsigned short oemsize);
41 int (*setup_ioapic_ids)(void); 43 int (*setup_ioapic_ids)(void);
44 int (*update_genapic)(void);
42}; 45};
43 46
44extern struct x86_quirks *x86_quirks; 47extern struct x86_quirks *x86_quirks;
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index 2ed3f0f44ff7..07c3e4048991 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -314,6 +314,8 @@ extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
314 314
315void default_idle(void); 315void default_idle(void);
316 316
317void stop_this_cpu(void *dummy);
318
317/* 319/*
318 * Force strict CPU ordering. 320 * Force strict CPU ordering.
319 * And yes, this is required on UP too when we're talking 321 * And yes, this is required on UP too when we're talking
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index e44d379faad2..0921b4018c11 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -20,6 +20,8 @@
20struct task_struct; 20struct task_struct;
21struct exec_domain; 21struct exec_domain;
22#include <asm/processor.h> 22#include <asm/processor.h>
23#include <asm/ftrace.h>
24#include <asm/atomic.h>
23 25
24struct thread_info { 26struct thread_info {
25 struct task_struct *task; /* main task structure */ 27 struct task_struct *task; /* main task structure */
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 35c54921b2e4..99192bb55a53 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -157,6 +157,7 @@ extern int __get_user_bad(void);
157 int __ret_gu; \ 157 int __ret_gu; \
158 unsigned long __val_gu; \ 158 unsigned long __val_gu; \
159 __chk_user_ptr(ptr); \ 159 __chk_user_ptr(ptr); \
160 might_fault(); \
160 switch (sizeof(*(ptr))) { \ 161 switch (sizeof(*(ptr))) { \
161 case 1: \ 162 case 1: \
162 __get_user_x(1, __ret_gu, __val_gu, ptr); \ 163 __get_user_x(1, __ret_gu, __val_gu, ptr); \
@@ -241,6 +242,7 @@ extern void __put_user_8(void);
241 int __ret_pu; \ 242 int __ret_pu; \
242 __typeof__(*(ptr)) __pu_val; \ 243 __typeof__(*(ptr)) __pu_val; \
243 __chk_user_ptr(ptr); \ 244 __chk_user_ptr(ptr); \
245 might_fault(); \
244 __pu_val = x; \ 246 __pu_val = x; \
245 switch (sizeof(*(ptr))) { \ 247 switch (sizeof(*(ptr))) { \
246 case 1: \ 248 case 1: \
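The might_fault() annotations added to get_user()/put_user() document that these macros may sleep while servicing a page fault, so they must only be used in process context. A short usage sketch (the ioctl handler is hypothetical; the debug checks only fire when the relevant config options are enabled):

#include <linux/fs.h>
#include <linux/uaccess.h>

static long example_ioctl(struct file *file, unsigned int cmd,
			  unsigned long arg)
{
	int val;

	/* fine here: process context, no locks held, may sleep on a fault */
	if (get_user(val, (int __user *)arg))
		return -EFAULT;

	/* Doing the same with a spinlock held or preemption disabled is a
	 * bug; might_fault() lets the debug machinery flag it immediately
	 * instead of leaving it to crash intermittently. */
	return put_user(val + 1, (int __user *)arg);
}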
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index d095a3aeea1b..5e06259e90e5 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -82,8 +82,8 @@ __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)
82static __always_inline unsigned long __must_check 82static __always_inline unsigned long __must_check
83__copy_to_user(void __user *to, const void *from, unsigned long n) 83__copy_to_user(void __user *to, const void *from, unsigned long n)
84{ 84{
85 might_sleep(); 85 might_fault();
86 return __copy_to_user_inatomic(to, from, n); 86 return __copy_to_user_inatomic(to, from, n);
87} 87}
88 88
89static __always_inline unsigned long 89static __always_inline unsigned long
@@ -137,7 +137,7 @@ __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
137static __always_inline unsigned long 137static __always_inline unsigned long
138__copy_from_user(void *to, const void __user *from, unsigned long n) 138__copy_from_user(void *to, const void __user *from, unsigned long n)
139{ 139{
140 might_sleep(); 140 might_fault();
141 if (__builtin_constant_p(n)) { 141 if (__builtin_constant_p(n)) {
142 unsigned long ret; 142 unsigned long ret;
143 143
@@ -159,7 +159,7 @@ __copy_from_user(void *to, const void __user *from, unsigned long n)
159static __always_inline unsigned long __copy_from_user_nocache(void *to, 159static __always_inline unsigned long __copy_from_user_nocache(void *to,
160 const void __user *from, unsigned long n) 160 const void __user *from, unsigned long n)
161{ 161{
162 might_sleep(); 162 might_fault();
163 if (__builtin_constant_p(n)) { 163 if (__builtin_constant_p(n)) {
164 unsigned long ret; 164 unsigned long ret;
165 165
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index f8cfd00db450..84210c479fca 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -29,6 +29,8 @@ static __always_inline __must_check
29int __copy_from_user(void *dst, const void __user *src, unsigned size) 29int __copy_from_user(void *dst, const void __user *src, unsigned size)
30{ 30{
31 int ret = 0; 31 int ret = 0;
32
33 might_fault();
32 if (!__builtin_constant_p(size)) 34 if (!__builtin_constant_p(size))
33 return copy_user_generic(dst, (__force void *)src, size); 35 return copy_user_generic(dst, (__force void *)src, size);
34 switch (size) { 36 switch (size) {
@@ -71,6 +73,8 @@ static __always_inline __must_check
71int __copy_to_user(void __user *dst, const void *src, unsigned size) 73int __copy_to_user(void __user *dst, const void *src, unsigned size)
72{ 74{
73 int ret = 0; 75 int ret = 0;
76
77 might_fault();
74 if (!__builtin_constant_p(size)) 78 if (!__builtin_constant_p(size))
75 return copy_user_generic((__force void *)dst, src, size); 79 return copy_user_generic((__force void *)dst, src, size);
76 switch (size) { 80 switch (size) {
@@ -113,6 +117,8 @@ static __always_inline __must_check
113int __copy_in_user(void __user *dst, const void __user *src, unsigned size) 117int __copy_in_user(void __user *dst, const void __user *src, unsigned size)
114{ 118{
115 int ret = 0; 119 int ret = 0;
120
121 might_fault();
116 if (!__builtin_constant_p(size)) 122 if (!__builtin_constant_p(size))
117 return copy_user_generic((__force void *)dst, 123 return copy_user_generic((__force void *)dst,
118 (__force void *)src, size); 124 (__force void *)src, size);
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index b62a7667828e..1cad9318d217 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -25,7 +25,7 @@ CFLAGS_tsc.o := $(nostackp)
25 25
26obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o 26obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o
27obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o 27obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
28obj-y += time_$(BITS).o ioport.o ldt.o 28obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o
29obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o 29obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o
30obj-$(CONFIG_X86_VISWS) += visws_quirks.o 30obj-$(CONFIG_X86_VISWS) += visws_quirks.o
31obj-$(CONFIG_X86_32) += probe_roms_32.o 31obj-$(CONFIG_X86_32) += probe_roms_32.o
@@ -65,6 +65,7 @@ obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
65obj-$(CONFIG_X86_IO_APIC) += io_apic.o 65obj-$(CONFIG_X86_IO_APIC) += io_apic.o
66obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o 66obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
67obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o 67obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
68obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
68obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o 69obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
69obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o 70obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
70obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o 71obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 4c51a2f8fd31..65d0b72777ea 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1360,6 +1360,17 @@ static void __init acpi_process_madt(void)
1360 disable_acpi(); 1360 disable_acpi();
1361 } 1361 }
1362 } 1362 }
1363
1364 /*
1365 * ACPI supports both logical (e.g. Hyper-Threading) and physical
1366 * processors, where MPS only supports physical.
1367 */
1368 if (acpi_lapic && acpi_ioapic)
1369 printk(KERN_INFO "Using ACPI (MADT) for SMP configuration "
1370 "information\n");
1371 else if (acpi_lapic)
1372 printk(KERN_INFO "Using ACPI for processor (LAPIC) "
1373 "configuration information\n");
1363#endif 1374#endif
1364 return; 1375 return;
1365} 1376}
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 5145a6e72bbb..3a26525a3f31 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -391,11 +391,7 @@ static int power_off;
391#else 391#else
392static int power_off = 1; 392static int power_off = 1;
393#endif 393#endif
394#ifdef CONFIG_APM_REAL_MODE_POWER_OFF
395static int realmode_power_off = 1;
396#else
397static int realmode_power_off; 394static int realmode_power_off;
398#endif
399#ifdef CONFIG_APM_ALLOW_INTS 395#ifdef CONFIG_APM_ALLOW_INTS
400static int allow_ints = 1; 396static int allow_ints = 1;
401#else 397#else
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 8e48c5d4467d..88ea02dcb622 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -33,6 +33,7 @@
33#include <linux/cpufreq.h> 33#include <linux/cpufreq.h>
34#include <linux/compiler.h> 34#include <linux/compiler.h>
35#include <linux/dmi.h> 35#include <linux/dmi.h>
36#include <linux/ftrace.h>
36 37
37#include <linux/acpi.h> 38#include <linux/acpi.h>
38#include <acpi/processor.h> 39#include <acpi/processor.h>
@@ -391,6 +392,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
391 unsigned int next_perf_state = 0; /* Index into perf table */ 392 unsigned int next_perf_state = 0; /* Index into perf table */
392 unsigned int i; 393 unsigned int i;
393 int result = 0; 394 int result = 0;
395 struct power_trace it;
394 396
395 dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu); 397 dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu);
396 398
@@ -427,6 +429,8 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
427 } 429 }
428 } 430 }
429 431
432 trace_power_mark(&it, POWER_PSTATE, next_perf_state);
433
430 switch (data->cpu_feature) { 434 switch (data->cpu_feature) {
431 case SYSTEM_INTEL_MSR_CAPABLE: 435 case SYSTEM_INTEL_MSR_CAPABLE:
432 cmd.type = SYSTEM_INTEL_MSR_CAPABLE; 436 cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
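The acpi-cpufreq change instruments the P-state switch with the power tracer. Below is a sketch of the same pattern at another, hypothetical frequency-change site; struct power_trace, trace_power_mark() and POWER_PSTATE are exactly what the hunk above uses, everything else is illustrative:

#include <linux/ftrace.h>

static void example_set_pstate(unsigned int target_state)
{
	struct power_trace it;

	/* one-shot marker: records the requested P-state for the tracer */
	trace_power_mark(&it, POWER_PSTATE, target_state);

	/* ... program the hardware to enter target_state ... */
}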
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index cce0b6118d55..816f27f289b1 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -307,12 +307,11 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
307 set_cpu_cap(c, X86_FEATURE_P4); 307 set_cpu_cap(c, X86_FEATURE_P4);
308 if (c->x86 == 6) 308 if (c->x86 == 6)
309 set_cpu_cap(c, X86_FEATURE_P3); 309 set_cpu_cap(c, X86_FEATURE_P3);
310#endif
310 311
311 if (cpu_has_bts) 312 if (cpu_has_bts)
312 ptrace_bts_init_intel(c); 313 ptrace_bts_init_intel(c);
313 314
314#endif
315
316 detect_extended_topology(c); 315 detect_extended_topology(c);
317 if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) { 316 if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
318 /* 317 /*
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index a2d1176c38ee..19a8c2c0389f 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -7,13 +7,12 @@
7 * 7 *
8 * It manages: 8 * It manages:
9 * - per-thread and per-cpu allocation of BTS and PEBS 9 * - per-thread and per-cpu allocation of BTS and PEBS
10 * - buffer memory allocation (optional) 10 * - buffer overflow handling (to be done)
11 * - buffer overflow handling
12 * - buffer access 11 * - buffer access
13 * 12 *
14 * It assumes: 13 * It assumes:
15 * - get_task_struct on all parameter tasks 14 * - get_task_struct on all traced tasks
16 * - current is allowed to trace parameter tasks 15 * - current is allowed to trace tasks
17 * 16 *
18 * 17 *
19 * Copyright (C) 2007-2008 Intel Corporation. 18 * Copyright (C) 2007-2008 Intel Corporation.
@@ -28,6 +27,7 @@
28#include <linux/slab.h> 27#include <linux/slab.h>
29#include <linux/sched.h> 28#include <linux/sched.h>
30#include <linux/mm.h> 29#include <linux/mm.h>
30#include <linux/kernel.h>
31 31
32 32
33/* 33/*
@@ -44,6 +44,33 @@ struct ds_configuration {
44}; 44};
45static struct ds_configuration ds_cfg; 45static struct ds_configuration ds_cfg;
46 46
47/*
48 * A BTS or PEBS tracer.
49 *
50 * This holds the configuration of the tracer and serves as a handle
51 * to identify tracers.
52 */
53struct ds_tracer {
54 /* the DS context (partially) owned by this tracer */
55 struct ds_context *context;
56 /* the buffer provided on ds_request() and its size in bytes */
57 void *buffer;
58 size_t size;
59};
60
61struct bts_tracer {
62 /* the common DS part */
63 struct ds_tracer ds;
64 /* buffer overflow notification function */
65 bts_ovfl_callback_t ovfl;
66};
67
68struct pebs_tracer {
69 /* the common DS part */
70 struct ds_tracer ds;
71 /* buffer overflow notification function */
72 pebs_ovfl_callback_t ovfl;
73};
47 74
48/* 75/*
49 * Debug Store (DS) save area configuration (see Intel64 and IA32 76 * Debug Store (DS) save area configuration (see Intel64 and IA32
@@ -107,34 +134,13 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
107 (*(unsigned long *)base) = value; 134 (*(unsigned long *)base) = value;
108} 135}
109 136
137#define DS_ALIGNMENT (1 << 3) /* BTS and PEBS buffer alignment */
110 138
111/*
112 * Locking is done only for allocating BTS or PEBS resources and for
113 * guarding context and buffer memory allocation.
114 *
115 * Most functions require the current task to own the ds context part
116 * they are going to access. All the locking is done when validating
117 * access to the context.
118 */
119static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock);
120 139
121/* 140/*
122 * Validate that the current task is allowed to access the BTS/PEBS 141 * Locking is done only for allocating BTS or PEBS resources.
123 * buffer of the parameter task.
124 *
125 * Returns 0, if access is granted; -Eerrno, otherwise.
126 */ 142 */
127static inline int ds_validate_access(struct ds_context *context, 143static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock);
128 enum ds_qualifier qual)
129{
130 if (!context)
131 return -EPERM;
132
133 if (context->owner[qual] == current)
134 return 0;
135
136 return -EPERM;
137}
138 144
139 145
140/* 146/*
@@ -183,51 +189,13 @@ static inline int check_tracer(struct task_struct *task)
183 * 189 *
184 * Contexts are use-counted. They are allocated on first access and 190 * Contexts are use-counted. They are allocated on first access and
185 * deallocated when the last user puts the context. 191 * deallocated when the last user puts the context.
186 *
187 * We distinguish between an allocating and a non-allocating get of a
188 * context:
189 * - the allocating get is used for requesting BTS/PEBS resources. It
190 * requires the caller to hold the global ds_lock.
191 * - the non-allocating get is used for all other cases. A
192 * non-existing context indicates an error. It acquires and releases
193 * the ds_lock itself for obtaining the context.
194 *
195 * A context and its DS configuration are allocated and deallocated
196 * together. A context always has a DS configuration of the
197 * appropriate size.
198 */ 192 */
199static DEFINE_PER_CPU(struct ds_context *, system_context); 193static DEFINE_PER_CPU(struct ds_context *, system_context);
200 194
201#define this_system_context per_cpu(system_context, smp_processor_id()) 195#define this_system_context per_cpu(system_context, smp_processor_id())
202 196
203/*
204 * Returns the pointer to the parameter task's context or to the
205 * system-wide context, if task is NULL.
206 *
207 * Increases the use count of the returned context, if not NULL.
208 */
209static inline struct ds_context *ds_get_context(struct task_struct *task) 197static inline struct ds_context *ds_get_context(struct task_struct *task)
210{ 198{
211 struct ds_context *context;
212 unsigned long irq;
213
214 spin_lock_irqsave(&ds_lock, irq);
215
216 context = (task ? task->thread.ds_ctx : this_system_context);
217 if (context)
218 context->count++;
219
220 spin_unlock_irqrestore(&ds_lock, irq);
221
222 return context;
223}
224
225/*
226 * Same as ds_get_context, but allocates the context and it's DS
227 * structure, if necessary; returns NULL; if out of memory.
228 */
229static inline struct ds_context *ds_alloc_context(struct task_struct *task)
230{
231 struct ds_context **p_context = 199 struct ds_context **p_context =
232 (task ? &task->thread.ds_ctx : &this_system_context); 200 (task ? &task->thread.ds_ctx : &this_system_context);
233 struct ds_context *context = *p_context; 201 struct ds_context *context = *p_context;
@@ -238,16 +206,9 @@ static inline struct ds_context *ds_alloc_context(struct task_struct *task)
238 if (!context) 206 if (!context)
239 return NULL; 207 return NULL;
240 208
241 context->ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL);
242 if (!context->ds) {
243 kfree(context);
244 return NULL;
245 }
246
247 spin_lock_irqsave(&ds_lock, irq); 209 spin_lock_irqsave(&ds_lock, irq);
248 210
249 if (*p_context) { 211 if (*p_context) {
250 kfree(context->ds);
251 kfree(context); 212 kfree(context);
252 213
253 context = *p_context; 214 context = *p_context;
@@ -272,10 +233,6 @@ static inline struct ds_context *ds_alloc_context(struct task_struct *task)
272 return context; 233 return context;
273} 234}
274 235
275/*
276 * Decreases the use count of the parameter context, if not NULL.
277 * Deallocates the context, if the use count reaches zero.
278 */
279static inline void ds_put_context(struct ds_context *context) 236static inline void ds_put_context(struct ds_context *context)
280{ 237{
281 unsigned long irq; 238 unsigned long irq;
@@ -296,13 +253,6 @@ static inline void ds_put_context(struct ds_context *context)
296 if (!context->task || (context->task == current)) 253 if (!context->task || (context->task == current))
297 wrmsrl(MSR_IA32_DS_AREA, 0); 254 wrmsrl(MSR_IA32_DS_AREA, 0);
298 255
299 put_tracer(context->task);
300
301 /* free any leftover buffers from tracers that did not
302 * deallocate them properly. */
303 kfree(context->buffer[ds_bts]);
304 kfree(context->buffer[ds_pebs]);
305 kfree(context->ds);
306 kfree(context); 256 kfree(context);
307 out: 257 out:
308 spin_unlock_irqrestore(&ds_lock, irq); 258 spin_unlock_irqrestore(&ds_lock, irq);
@@ -312,345 +262,342 @@ static inline void ds_put_context(struct ds_context *context)
312/* 262/*
313 * Handle a buffer overflow 263 * Handle a buffer overflow
314 * 264 *
315 * task: the task whose buffers are overflowing;
316 * NULL for a buffer overflow on the current cpu
317 * context: the ds context 265 * context: the ds context
318 * qual: the buffer type 266 * qual: the buffer type
319 */ 267 */
320static void ds_overflow(struct task_struct *task, struct ds_context *context, 268static void ds_overflow(struct ds_context *context, enum ds_qualifier qual)
321 enum ds_qualifier qual) 269{
322{ 270 switch (qual) {
323 if (!context) 271 case ds_bts: {
324 return; 272 struct bts_tracer *tracer =
325 273 container_of(context->owner[qual],
326 if (context->callback[qual]) 274 struct bts_tracer, ds);
327 (*context->callback[qual])(task); 275 if (tracer->ovfl)
328 276 tracer->ovfl(tracer);
329 /* todo: do some more overflow handling */ 277 }
278 break;
279 case ds_pebs: {
280 struct pebs_tracer *tracer =
281 container_of(context->owner[qual],
282 struct pebs_tracer, ds);
283 if (tracer->ovfl)
284 tracer->ovfl(tracer);
285 }
286 break;
287 }
330} 288}
331 289
332 290
333/* 291static void ds_install_ds_config(struct ds_context *context,
334 * Allocate a non-pageable buffer of the parameter size. 292 enum ds_qualifier qual,
335 * Checks the memory and the locked memory rlimit. 293 void *base, size_t size, size_t ith)
336 *
337 * Returns the buffer, if successful;
338 * NULL, if out of memory or rlimit exceeded.
339 *
340 * size: the requested buffer size in bytes
341 * pages (out): if not NULL, contains the number of pages reserved
342 */
343static inline void *ds_allocate_buffer(size_t size, unsigned int *pages)
344{ 294{
345 unsigned long rlim, vm, pgsz; 295 unsigned long buffer, adj;
346 void *buffer;
347
348 pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
349
350 rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
351 vm = current->mm->total_vm + pgsz;
352 if (rlim < vm)
353 return NULL;
354 296
355 rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; 297 /* adjust the buffer address and size to meet alignment
356 vm = current->mm->locked_vm + pgsz; 298 * constraints:
357 if (rlim < vm) 299 * - buffer is double-word aligned
358 return NULL; 300 * - size is multiple of record size
301 *
302 * We checked the size at the very beginning; we have enough
303 * space to do the adjustment.
304 */
305 buffer = (unsigned long)base;
359 306
360 buffer = kzalloc(size, GFP_KERNEL); 307 adj = ALIGN(buffer, DS_ALIGNMENT) - buffer;
361 if (!buffer) 308 buffer += adj;
362 return NULL; 309 size -= adj;
363 310
364 current->mm->total_vm += pgsz; 311 size /= ds_cfg.sizeof_rec[qual];
365 current->mm->locked_vm += pgsz; 312 size *= ds_cfg.sizeof_rec[qual];
366 313
367 if (pages) 314 ds_set(context->ds, qual, ds_buffer_base, buffer);
368 *pages = pgsz; 315 ds_set(context->ds, qual, ds_index, buffer);
316 ds_set(context->ds, qual, ds_absolute_maximum, buffer + size);
369 317
370 return buffer; 318 /* The value for 'no threshold' is -1, which will set the
319 * threshold outside of the buffer, just like we want it.
320 */
321 ds_set(context->ds, qual,
322 ds_interrupt_threshold, buffer + size - ith);
371} 323}
372 324
373static int ds_request(struct task_struct *task, void *base, size_t size, 325static int ds_request(struct ds_tracer *tracer, enum ds_qualifier qual,
374 ds_ovfl_callback_t ovfl, enum ds_qualifier qual) 326 struct task_struct *task,
327 void *base, size_t size, size_t th)
375{ 328{
376 struct ds_context *context; 329 struct ds_context *context;
377 unsigned long buffer, adj;
378 const unsigned long alignment = (1 << 3);
379 unsigned long irq; 330 unsigned long irq;
380 int error = 0; 331 int error;
381 332
333 error = -EOPNOTSUPP;
382 if (!ds_cfg.sizeof_ds) 334 if (!ds_cfg.sizeof_ds)
383 return -EOPNOTSUPP; 335 goto out;
336
337 error = -EINVAL;
338 if (!base)
339 goto out;
384 340
385 /* we require some space to do alignment adjustments below */ 341 /* we require some space to do alignment adjustments below */
386 if (size < (alignment + ds_cfg.sizeof_rec[qual])) 342 error = -EINVAL;
387 return -EINVAL; 343 if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual]))
344 goto out;
388 345
389 /* buffer overflow notification is not yet implemented */ 346 if (th != (size_t)-1) {
390 if (ovfl) 347 th *= ds_cfg.sizeof_rec[qual];
391 return -EOPNOTSUPP; 348
349 error = -EINVAL;
350 if (size <= th)
351 goto out;
352 }
392 353
354 tracer->buffer = base;
355 tracer->size = size;
393 356
394 context = ds_alloc_context(task); 357 error = -ENOMEM;
358 context = ds_get_context(task);
395 if (!context) 359 if (!context)
396 return -ENOMEM; 360 goto out;
361 tracer->context = context;
362
397 363
398 spin_lock_irqsave(&ds_lock, irq); 364 spin_lock_irqsave(&ds_lock, irq);
399 365
400 error = -EPERM; 366 error = -EPERM;
401 if (!check_tracer(task)) 367 if (!check_tracer(task))
402 goto out_unlock; 368 goto out_unlock;
403
404 get_tracer(task); 369 get_tracer(task);
405 370
406 error = -EALREADY;
407 if (context->owner[qual] == current)
408 goto out_put_tracer;
409 error = -EPERM; 371 error = -EPERM;
410 if (context->owner[qual] != NULL) 372 if (context->owner[qual])
411 goto out_put_tracer; 373 goto out_put_tracer;
412 context->owner[qual] = current; 374 context->owner[qual] = tracer;
413 375
414 spin_unlock_irqrestore(&ds_lock, irq); 376 spin_unlock_irqrestore(&ds_lock, irq);
415 377
416 378
417 error = -ENOMEM; 379 ds_install_ds_config(context, qual, base, size, th);
418 if (!base) {
419 base = ds_allocate_buffer(size, &context->pages[qual]);
420 if (!base)
421 goto out_release;
422
423 context->buffer[qual] = base;
424 }
425 error = 0;
426 380
427 context->callback[qual] = ovfl; 381 return 0;
428
429 /* adjust the buffer address and size to meet alignment
430 * constraints:
431 * - buffer is double-word aligned
432 * - size is multiple of record size
433 *
434 * We checked the size at the very beginning; we have enough
435 * space to do the adjustment.
436 */
437 buffer = (unsigned long)base;
438
439 adj = ALIGN(buffer, alignment) - buffer;
440 buffer += adj;
441 size -= adj;
442
443 size /= ds_cfg.sizeof_rec[qual];
444 size *= ds_cfg.sizeof_rec[qual];
445
446 ds_set(context->ds, qual, ds_buffer_base, buffer);
447 ds_set(context->ds, qual, ds_index, buffer);
448 ds_set(context->ds, qual, ds_absolute_maximum, buffer + size);
449
450 if (ovfl) {
451 /* todo: select a suitable interrupt threshold */
452 } else
453 ds_set(context->ds, qual,
454 ds_interrupt_threshold, buffer + size + 1);
455
456 /* we keep the context until ds_release */
457 return error;
458
459 out_release:
460 context->owner[qual] = NULL;
461 ds_put_context(context);
462 put_tracer(task);
463 return error;
464 382
465 out_put_tracer: 383 out_put_tracer:
466 spin_unlock_irqrestore(&ds_lock, irq);
467 ds_put_context(context);
468 put_tracer(task); 384 put_tracer(task);
469 return error;
470
471 out_unlock: 385 out_unlock:
472 spin_unlock_irqrestore(&ds_lock, irq); 386 spin_unlock_irqrestore(&ds_lock, irq);
473 ds_put_context(context); 387 ds_put_context(context);
388 tracer->context = NULL;
389 out:
474 return error; 390 return error;
475} 391}
476 392
477int ds_request_bts(struct task_struct *task, void *base, size_t size, 393struct bts_tracer *ds_request_bts(struct task_struct *task,
478 ds_ovfl_callback_t ovfl) 394 void *base, size_t size,
395 bts_ovfl_callback_t ovfl, size_t th)
479{ 396{
480 return ds_request(task, base, size, ovfl, ds_bts); 397 struct bts_tracer *tracer;
481} 398 int error;
482 399
483int ds_request_pebs(struct task_struct *task, void *base, size_t size, 400 /* buffer overflow notification is not yet implemented */
484 ds_ovfl_callback_t ovfl) 401 error = -EOPNOTSUPP;
485{ 402 if (ovfl)
486 return ds_request(task, base, size, ovfl, ds_pebs); 403 goto out;
404
405 error = -ENOMEM;
406 tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
407 if (!tracer)
408 goto out;
409 tracer->ovfl = ovfl;
410
411 error = ds_request(&tracer->ds, ds_bts, task, base, size, th);
412 if (error < 0)
413 goto out_tracer;
414
415 return tracer;
416
417 out_tracer:
418 kfree(tracer);
419 out:
420 return ERR_PTR(error);
487} 421}
488 422
489static int ds_release(struct task_struct *task, enum ds_qualifier qual) 423struct pebs_tracer *ds_request_pebs(struct task_struct *task,
424 void *base, size_t size,
425 pebs_ovfl_callback_t ovfl, size_t th)
490{ 426{
491 struct ds_context *context; 427 struct pebs_tracer *tracer;
492 int error; 428 int error;
493 429
494 context = ds_get_context(task); 430 /* buffer overflow notification is not yet implemented */
495 error = ds_validate_access(context, qual); 431 error = -EOPNOTSUPP;
496 if (error < 0) 432 if (ovfl)
497 goto out; 433 goto out;
498 434
499 kfree(context->buffer[qual]); 435 error = -ENOMEM;
500 context->buffer[qual] = NULL; 436 tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
437 if (!tracer)
438 goto out;
439 tracer->ovfl = ovfl;
501 440
502 current->mm->total_vm -= context->pages[qual]; 441 error = ds_request(&tracer->ds, ds_pebs, task, base, size, th);
503 current->mm->locked_vm -= context->pages[qual]; 442 if (error < 0)
504 context->pages[qual] = 0; 443 goto out_tracer;
505 context->owner[qual] = NULL;
506 444
507 /* 445 return tracer;
508 * we put the context twice: 446
509 * once for the ds_get_context 447 out_tracer:
510 * once for the corresponding ds_request 448 kfree(tracer);
511 */
512 ds_put_context(context);
513 out: 449 out:
514 ds_put_context(context); 450 return ERR_PTR(error);
515 return error;
516} 451}
517 452
518int ds_release_bts(struct task_struct *task) 453static void ds_release(struct ds_tracer *tracer, enum ds_qualifier qual)
519{ 454{
520 return ds_release(task, ds_bts); 455 BUG_ON(tracer->context->owner[qual] != tracer);
456 tracer->context->owner[qual] = NULL;
457
458 put_tracer(tracer->context->task);
459 ds_put_context(tracer->context);
521} 460}
522 461
523int ds_release_pebs(struct task_struct *task) 462int ds_release_bts(struct bts_tracer *tracer)
524{ 463{
525 return ds_release(task, ds_pebs); 464 if (!tracer)
465 return -EINVAL;
466
467 ds_release(&tracer->ds, ds_bts);
468 kfree(tracer);
469
470 return 0;
526} 471}
527 472
528static int ds_get_index(struct task_struct *task, size_t *pos, 473int ds_release_pebs(struct pebs_tracer *tracer)
529 enum ds_qualifier qual)
530{ 474{
531 struct ds_context *context; 475 if (!tracer)
532 unsigned long base, index; 476 return -EINVAL;
533 int error;
534 477
535 context = ds_get_context(task); 478 ds_release(&tracer->ds, ds_pebs);
536 error = ds_validate_access(context, qual); 479 kfree(tracer);
537 if (error < 0) 480
538 goto out; 481 return 0;
482}
483
484static size_t ds_get_index(struct ds_context *context, enum ds_qualifier qual)
485{
486 unsigned long base, index;
539 487
540 base = ds_get(context->ds, qual, ds_buffer_base); 488 base = ds_get(context->ds, qual, ds_buffer_base);
541 index = ds_get(context->ds, qual, ds_index); 489 index = ds_get(context->ds, qual, ds_index);
542 490
543 error = ((index - base) / ds_cfg.sizeof_rec[qual]); 491 return (index - base) / ds_cfg.sizeof_rec[qual];
544 if (pos)
545 *pos = error;
546 out:
547 ds_put_context(context);
548 return error;
549} 492}
550 493
551int ds_get_bts_index(struct task_struct *task, size_t *pos) 494int ds_get_bts_index(struct bts_tracer *tracer, size_t *pos)
552{ 495{
553 return ds_get_index(task, pos, ds_bts); 496 if (!tracer)
497 return -EINVAL;
498
499 if (!pos)
500 return -EINVAL;
501
502 *pos = ds_get_index(tracer->ds.context, ds_bts);
503
504 return 0;
554} 505}
555 506
556int ds_get_pebs_index(struct task_struct *task, size_t *pos) 507int ds_get_pebs_index(struct pebs_tracer *tracer, size_t *pos)
557{ 508{
558 return ds_get_index(task, pos, ds_pebs); 509 if (!tracer)
510 return -EINVAL;
511
512 if (!pos)
513 return -EINVAL;
514
515 *pos = ds_get_index(tracer->ds.context, ds_pebs);
516
517 return 0;
559} 518}
560 519
561static int ds_get_end(struct task_struct *task, size_t *pos, 520static size_t ds_get_end(struct ds_context *context, enum ds_qualifier qual)
562 enum ds_qualifier qual)
563{ 521{
564 struct ds_context *context; 522 unsigned long base, max;
565 unsigned long base, end;
566 int error;
567
568 context = ds_get_context(task);
569 error = ds_validate_access(context, qual);
570 if (error < 0)
571 goto out;
572 523
573 base = ds_get(context->ds, qual, ds_buffer_base); 524 base = ds_get(context->ds, qual, ds_buffer_base);
574 end = ds_get(context->ds, qual, ds_absolute_maximum); 525 max = ds_get(context->ds, qual, ds_absolute_maximum);
575 526
576 error = ((end - base) / ds_cfg.sizeof_rec[qual]); 527 return (max - base) / ds_cfg.sizeof_rec[qual];
577 if (pos)
578 *pos = error;
579 out:
580 ds_put_context(context);
581 return error;
582} 528}
583 529
584int ds_get_bts_end(struct task_struct *task, size_t *pos) 530int ds_get_bts_end(struct bts_tracer *tracer, size_t *pos)
585{ 531{
586 return ds_get_end(task, pos, ds_bts); 532 if (!tracer)
533 return -EINVAL;
534
535 if (!pos)
536 return -EINVAL;
537
538 *pos = ds_get_end(tracer->ds.context, ds_bts);
539
540 return 0;
587} 541}
588 542
589int ds_get_pebs_end(struct task_struct *task, size_t *pos) 543int ds_get_pebs_end(struct pebs_tracer *tracer, size_t *pos)
590{ 544{
591 return ds_get_end(task, pos, ds_pebs); 545 if (!tracer)
546 return -EINVAL;
547
548 if (!pos)
549 return -EINVAL;
550
551 *pos = ds_get_end(tracer->ds.context, ds_pebs);
552
553 return 0;
592} 554}
593 555
594static int ds_access(struct task_struct *task, size_t index, 556static int ds_access(struct ds_context *context, enum ds_qualifier qual,
595 const void **record, enum ds_qualifier qual) 557 size_t index, const void **record)
596{ 558{
597 struct ds_context *context;
598 unsigned long base, idx; 559 unsigned long base, idx;
599 int error;
600 560
601 if (!record) 561 if (!record)
602 return -EINVAL; 562 return -EINVAL;
603 563
604 context = ds_get_context(task);
605 error = ds_validate_access(context, qual);
606 if (error < 0)
607 goto out;
608
609 base = ds_get(context->ds, qual, ds_buffer_base); 564 base = ds_get(context->ds, qual, ds_buffer_base);
610 idx = base + (index * ds_cfg.sizeof_rec[qual]); 565 idx = base + (index * ds_cfg.sizeof_rec[qual]);
611 566
612 error = -EINVAL;
613 if (idx > ds_get(context->ds, qual, ds_absolute_maximum)) 567 if (idx > ds_get(context->ds, qual, ds_absolute_maximum))
614 goto out; 568 return -EINVAL;
615 569
616 *record = (const void *)idx; 570 *record = (const void *)idx;
617 error = ds_cfg.sizeof_rec[qual]; 571
618 out: 572 return ds_cfg.sizeof_rec[qual];
619 ds_put_context(context);
620 return error;
621} 573}
622 574
623int ds_access_bts(struct task_struct *task, size_t index, const void **record) 575int ds_access_bts(struct bts_tracer *tracer, size_t index,
576 const void **record)
624{ 577{
625 return ds_access(task, index, record, ds_bts); 578 if (!tracer)
579 return -EINVAL;
580
581 return ds_access(tracer->ds.context, ds_bts, index, record);
626} 582}
627 583
628int ds_access_pebs(struct task_struct *task, size_t index, const void **record) 584int ds_access_pebs(struct pebs_tracer *tracer, size_t index,
585 const void **record)
629{ 586{
630 return ds_access(task, index, record, ds_pebs); 587 if (!tracer)
588 return -EINVAL;
589
590 return ds_access(tracer->ds.context, ds_pebs, index, record);
631} 591}
632 592
633static int ds_write(struct task_struct *task, const void *record, size_t size, 593static int ds_write(struct ds_context *context, enum ds_qualifier qual,
634 enum ds_qualifier qual, int force) 594 const void *record, size_t size)
635{ 595{
636 struct ds_context *context; 596 int bytes_written = 0;
637 int error;
638 597
639 if (!record) 598 if (!record)
640 return -EINVAL; 599 return -EINVAL;
641 600
642 error = -EPERM;
643 context = ds_get_context(task);
644 if (!context)
645 goto out;
646
647 if (!force) {
648 error = ds_validate_access(context, qual);
649 if (error < 0)
650 goto out;
651 }
652
653 error = 0;
654 while (size) { 601 while (size) {
655 unsigned long base, index, end, write_end, int_th; 602 unsigned long base, index, end, write_end, int_th;
656 unsigned long write_size, adj_write_size; 603 unsigned long write_size, adj_write_size;
@@ -678,14 +625,14 @@ static int ds_write(struct task_struct *task, const void *record, size_t size,
678 write_end = end; 625 write_end = end;
679 626
680 if (write_end <= index) 627 if (write_end <= index)
681 goto out; 628 break;
682 629
683 write_size = min((unsigned long) size, write_end - index); 630 write_size = min((unsigned long) size, write_end - index);
684 memcpy((void *)index, record, write_size); 631 memcpy((void *)index, record, write_size);
685 632
686 record = (const char *)record + write_size; 633 record = (const char *)record + write_size;
687 size -= write_size; 634 size -= write_size;
688 error += write_size; 635 bytes_written += write_size;
689 636
690 adj_write_size = write_size / ds_cfg.sizeof_rec[qual]; 637 adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
691 adj_write_size *= ds_cfg.sizeof_rec[qual]; 638 adj_write_size *= ds_cfg.sizeof_rec[qual];
@@ -700,47 +647,32 @@ static int ds_write(struct task_struct *task, const void *record, size_t size,
700 ds_set(context->ds, qual, ds_index, index); 647 ds_set(context->ds, qual, ds_index, index);
701 648
702 if (index >= int_th) 649 if (index >= int_th)
703 ds_overflow(task, context, qual); 650 ds_overflow(context, qual);
704 } 651 }
705 652
706 out: 653 return bytes_written;
707 ds_put_context(context);
708 return error;
709} 654}
710 655
711int ds_write_bts(struct task_struct *task, const void *record, size_t size) 656int ds_write_bts(struct bts_tracer *tracer, const void *record, size_t size)
712{ 657{
713 return ds_write(task, record, size, ds_bts, /* force = */ 0); 658 if (!tracer)
714} 659 return -EINVAL;
715 660
716int ds_write_pebs(struct task_struct *task, const void *record, size_t size) 661 return ds_write(tracer->ds.context, ds_bts, record, size);
717{
718 return ds_write(task, record, size, ds_pebs, /* force = */ 0);
719} 662}
720 663
721int ds_unchecked_write_bts(struct task_struct *task, 664int ds_write_pebs(struct pebs_tracer *tracer, const void *record, size_t size)
722 const void *record, size_t size)
723{ 665{
724 return ds_write(task, record, size, ds_bts, /* force = */ 1); 666 if (!tracer)
725} 667 return -EINVAL;
726 668
727int ds_unchecked_write_pebs(struct task_struct *task, 669 return ds_write(tracer->ds.context, ds_pebs, record, size);
728 const void *record, size_t size)
729{
730 return ds_write(task, record, size, ds_pebs, /* force = */ 1);
731} 670}
732 671
733static int ds_reset_or_clear(struct task_struct *task, 672static void ds_reset_or_clear(struct ds_context *context,
734 enum ds_qualifier qual, int clear) 673 enum ds_qualifier qual, int clear)
735{ 674{
736 struct ds_context *context;
737 unsigned long base, end; 675 unsigned long base, end;
738 int error;
739
740 context = ds_get_context(task);
741 error = ds_validate_access(context, qual);
742 if (error < 0)
743 goto out;
744 676
745 base = ds_get(context->ds, qual, ds_buffer_base); 677 base = ds_get(context->ds, qual, ds_buffer_base);
746 end = ds_get(context->ds, qual, ds_absolute_maximum); 678 end = ds_get(context->ds, qual, ds_absolute_maximum);
@@ -749,70 +681,69 @@ static int ds_reset_or_clear(struct task_struct *task,
749 memset((void *)base, 0, end - base); 681 memset((void *)base, 0, end - base);
750 682
751 ds_set(context->ds, qual, ds_index, base); 683 ds_set(context->ds, qual, ds_index, base);
752
753 error = 0;
754 out:
755 ds_put_context(context);
756 return error;
757} 684}
758 685
759int ds_reset_bts(struct task_struct *task) 686int ds_reset_bts(struct bts_tracer *tracer)
760{ 687{
761 return ds_reset_or_clear(task, ds_bts, /* clear = */ 0); 688 if (!tracer)
689 return -EINVAL;
690
691 ds_reset_or_clear(tracer->ds.context, ds_bts, /* clear = */ 0);
692
693 return 0;
762} 694}
763 695
764int ds_reset_pebs(struct task_struct *task) 696int ds_reset_pebs(struct pebs_tracer *tracer)
765{ 697{
766 return ds_reset_or_clear(task, ds_pebs, /* clear = */ 0); 698 if (!tracer)
699 return -EINVAL;
700
701 ds_reset_or_clear(tracer->ds.context, ds_pebs, /* clear = */ 0);
702
703 return 0;
767} 704}
768 705
769int ds_clear_bts(struct task_struct *task) 706int ds_clear_bts(struct bts_tracer *tracer)
770{ 707{
771 return ds_reset_or_clear(task, ds_bts, /* clear = */ 1); 708 if (!tracer)
709 return -EINVAL;
710
711 ds_reset_or_clear(tracer->ds.context, ds_bts, /* clear = */ 1);
712
713 return 0;
772} 714}
773 715
774int ds_clear_pebs(struct task_struct *task) 716int ds_clear_pebs(struct pebs_tracer *tracer)
775{ 717{
776 return ds_reset_or_clear(task, ds_pebs, /* clear = */ 1); 718 if (!tracer)
719 return -EINVAL;
720
721 ds_reset_or_clear(tracer->ds.context, ds_pebs, /* clear = */ 1);
722
723 return 0;
777} 724}
778 725
779int ds_get_pebs_reset(struct task_struct *task, u64 *value) 726int ds_get_pebs_reset(struct pebs_tracer *tracer, u64 *value)
780{ 727{
781 struct ds_context *context; 728 if (!tracer)
782 int error; 729 return -EINVAL;
783 730
784 if (!value) 731 if (!value)
785 return -EINVAL; 732 return -EINVAL;
786 733
787 context = ds_get_context(task); 734 *value = *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8));
788 error = ds_validate_access(context, ds_pebs);
789 if (error < 0)
790 goto out;
791 735
792 *value = *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)); 736 return 0;
793
794 error = 0;
795 out:
796 ds_put_context(context);
797 return error;
798} 737}
799 738
800int ds_set_pebs_reset(struct task_struct *task, u64 value) 739int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value)
801{ 740{
802 struct ds_context *context; 741 if (!tracer)
803 int error; 742 return -EINVAL;
804
805 context = ds_get_context(task);
806 error = ds_validate_access(context, ds_pebs);
807 if (error < 0)
808 goto out;
809 743
810 *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)) = value; 744 *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)) = value;
811 745
812 error = 0; 746 return 0;
813 out:
814 ds_put_context(context);
815 return error;
816} 747}
817 748
818static const struct ds_configuration ds_cfg_var = { 749static const struct ds_configuration ds_cfg_var = {
@@ -840,6 +771,10 @@ static inline void
840ds_configure(const struct ds_configuration *cfg) 771ds_configure(const struct ds_configuration *cfg)
841{ 772{
842 ds_cfg = *cfg; 773 ds_cfg = *cfg;
774
775 printk(KERN_INFO "DS available\n");
776
777 BUG_ON(MAX_SIZEOF_DS < ds_cfg.sizeof_ds);
843} 778}
844 779
845void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) 780void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
@@ -847,17 +782,16 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
847 switch (c->x86) { 782 switch (c->x86) {
848 case 0x6: 783 case 0x6:
849 switch (c->x86_model) { 784 switch (c->x86_model) {
785 case 0 ... 0xC:
786 /* sorry, don't know about them */
787 break;
850 case 0xD: 788 case 0xD:
851 case 0xE: /* Pentium M */ 789 case 0xE: /* Pentium M */
852 ds_configure(&ds_cfg_var); 790 ds_configure(&ds_cfg_var);
853 break; 791 break;
854 case 0xF: /* Core2 */ 792 default: /* Core2, Atom, ... */
855 case 0x1C: /* Atom */
856 ds_configure(&ds_cfg_64); 793 ds_configure(&ds_cfg_64);
857 break; 794 break;
858 default:
859 /* sorry, don't know about them */
860 break;
861 } 795 }
862 break; 796 break;
863 case 0xF: 797 case 0xF:
@@ -884,6 +818,8 @@ void ds_free(struct ds_context *context)
884 * is dying. There should not be any user of that context left 818 * is dying. There should not be any user of that context left
885 * to disturb us, anymore. */ 819 * to disturb us, anymore. */
886 unsigned long leftovers = context->count; 820 unsigned long leftovers = context->count;
887 while (leftovers--) 821 while (leftovers--) {
822 put_tracer(context->task);
888 ds_put_context(context); 823 ds_put_context(context);
824 }
889} 825}
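The reworked ds.c replaces the task-keyed API with opaque tracer handles and makes the caller supply the buffer. A usage sketch built only from the prototypes introduced above (buffer size, error handling and the record loop are illustrative):

#include <linux/err.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <asm/ds.h>

static struct bts_tracer *example_start_bts(struct task_struct *task)
{
	void *buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
	struct bts_tracer *tracer;

	if (!buf)
		return ERR_PTR(-ENOMEM);

	/* no overflow callback yet; (size_t)-1 means "no interrupt threshold" */
	tracer = ds_request_bts(task, buf, PAGE_SIZE, NULL, (size_t)-1);
	if (IS_ERR(tracer))
		kfree(buf);
	return tracer;
}

static void example_stop_bts(struct bts_tracer *tracer, void *buf)
{
	size_t index, i;
	const void *record;

	if (!ds_get_bts_index(tracer, &index))
		for (i = 0; i < index; i++)
			if (ds_access_bts(tracer, i, &record) > 0)
				; /* consume one raw BTS record */

	ds_release_bts(tracer);		/* drops the DS context */
	kfree(buf);			/* the buffer stays ours to free */
}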
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
new file mode 100644
index 000000000000..6b1f6f6f8661
--- /dev/null
+++ b/arch/x86/kernel/dumpstack.c
@@ -0,0 +1,351 @@
1/*
2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
4 */
5#include <linux/kallsyms.h>
6#include <linux/kprobes.h>
7#include <linux/uaccess.h>
8#include <linux/utsname.h>
9#include <linux/hardirq.h>
10#include <linux/kdebug.h>
11#include <linux/module.h>
12#include <linux/ptrace.h>
13#include <linux/kexec.h>
14#include <linux/bug.h>
15#include <linux/nmi.h>
16#include <linux/sysfs.h>
17
18#include <asm/stacktrace.h>
19
20#include "dumpstack.h"
21
22int panic_on_unrecovered_nmi;
23unsigned int code_bytes = 64;
24int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
25static int die_counter;
26
27void printk_address(unsigned long address, int reliable)
28{
29 printk(" [<%p>] %s%pS\n", (void *) address,
30 reliable ? "" : "? ", (void *) address);
31}
32
33#ifdef CONFIG_FUNCTION_GRAPH_TRACER
34static void
35print_ftrace_graph_addr(unsigned long addr, void *data,
36 const struct stacktrace_ops *ops,
37 struct thread_info *tinfo, int *graph)
38{
39 struct task_struct *task = tinfo->task;
40 unsigned long ret_addr;
41 int index = task->curr_ret_stack;
42
43 if (addr != (unsigned long)return_to_handler)
44 return;
45
46 if (!task->ret_stack || index < *graph)
47 return;
48
49 index -= *graph;
50 ret_addr = task->ret_stack[index].ret;
51
52 ops->address(data, ret_addr, 1);
53
54 (*graph)++;
55}
56#else
57static inline void
58print_ftrace_graph_addr(unsigned long addr, void *data,
59 const struct stacktrace_ops *ops,
60 struct thread_info *tinfo, int *graph)
61{ }
62#endif
63
64/*
65 * x86-64 can have up to three kernel stacks:
66 * process stack
67 * interrupt stack
68 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
69 */
70
71static inline int valid_stack_ptr(struct thread_info *tinfo,
72 void *p, unsigned int size, void *end)
73{
74 void *t = tinfo;
75 if (end) {
76 if (p < end && p >= (end-THREAD_SIZE))
77 return 1;
78 else
79 return 0;
80 }
81 return p > t && p < t + THREAD_SIZE - size;
82}
83
84unsigned long
85print_context_stack(struct thread_info *tinfo,
86 unsigned long *stack, unsigned long bp,
87 const struct stacktrace_ops *ops, void *data,
88 unsigned long *end, int *graph)
89{
90 struct stack_frame *frame = (struct stack_frame *)bp;
91
92 while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
93 unsigned long addr;
94
95 addr = *stack;
96 if (__kernel_text_address(addr)) {
97 if ((unsigned long) stack == bp + sizeof(long)) {
98 ops->address(data, addr, 1);
99 frame = frame->next_frame;
100 bp = (unsigned long) frame;
101 } else {
102 ops->address(data, addr, bp == 0);
103 }
104 print_ftrace_graph_addr(addr, data, ops, tinfo, graph);
105 }
106 stack++;
107 }
108 return bp;
109}
110
111
112static void
113print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
114{
115 printk(data);
116 print_symbol(msg, symbol);
117 printk("\n");
118}
119
120static void print_trace_warning(void *data, char *msg)
121{
122 printk("%s%s\n", (char *)data, msg);
123}
124
125static int print_trace_stack(void *data, char *name)
126{
127 printk("%s <%s> ", (char *)data, name);
128 return 0;
129}
130
131/*
132 * Print one address/symbol entries per line.
133 */
134static void print_trace_address(void *data, unsigned long addr, int reliable)
135{
136 touch_nmi_watchdog();
137 printk(data);
138 printk_address(addr, reliable);
139}
140
141static const struct stacktrace_ops print_trace_ops = {
142 .warning = print_trace_warning,
143 .warning_symbol = print_trace_warning_symbol,
144 .stack = print_trace_stack,
145 .address = print_trace_address,
146};
147
148void
149show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
150 unsigned long *stack, unsigned long bp, char *log_lvl)
151{
152 printk("%sCall Trace:\n", log_lvl);
153 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
154}
155
156void show_trace(struct task_struct *task, struct pt_regs *regs,
157 unsigned long *stack, unsigned long bp)
158{
159 show_trace_log_lvl(task, regs, stack, bp, "");
160}
161
162void show_stack(struct task_struct *task, unsigned long *sp)
163{
164 show_stack_log_lvl(task, NULL, sp, 0, "");
165}
166
167/*
168 * The architecture-independent dump_stack generator
169 */
170void dump_stack(void)
171{
172 unsigned long bp = 0;
173 unsigned long stack;
174
175#ifdef CONFIG_FRAME_POINTER
176 if (!bp)
177 get_bp(bp);
178#endif
179
180 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
181 current->pid, current->comm, print_tainted(),
182 init_utsname()->release,
183 (int)strcspn(init_utsname()->version, " "),
184 init_utsname()->version);
185 show_trace(NULL, NULL, &stack, bp);
186}
187EXPORT_SYMBOL(dump_stack);
188
189static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
190static int die_owner = -1;
191static unsigned int die_nest_count;
192
193unsigned __kprobes long oops_begin(void)
194{
195 int cpu;
196 unsigned long flags;
197
198 oops_enter();
199
200 /* racy, but better than risking deadlock. */
201 raw_local_irq_save(flags);
202 cpu = smp_processor_id();
203 if (!__raw_spin_trylock(&die_lock)) {
204 if (cpu == die_owner)
205 /* nested oops. should stop eventually */;
206 else
207 __raw_spin_lock(&die_lock);
208 }
209 die_nest_count++;
210 die_owner = cpu;
211 console_verbose();
212 bust_spinlocks(1);
213 return flags;
214}
215
216void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
217{
218 if (regs && kexec_should_crash(current))
219 crash_kexec(regs);
220
221 bust_spinlocks(0);
222 die_owner = -1;
223 add_taint(TAINT_DIE);
224 die_nest_count--;
225 if (!die_nest_count)
226 /* Nest count reaches zero, release the lock. */
227 __raw_spin_unlock(&die_lock);
228 raw_local_irq_restore(flags);
229 oops_exit();
230
231 if (!signr)
232 return;
233 if (in_interrupt())
234 panic("Fatal exception in interrupt");
235 if (panic_on_oops)
236 panic("Fatal exception");
237 do_exit(signr);
238}
239
240int __kprobes __die(const char *str, struct pt_regs *regs, long err)
241{
242#ifdef CONFIG_X86_32
243 unsigned short ss;
244 unsigned long sp;
245#endif
246 printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
247#ifdef CONFIG_PREEMPT
248 printk("PREEMPT ");
249#endif
250#ifdef CONFIG_SMP
251 printk("SMP ");
252#endif
253#ifdef CONFIG_DEBUG_PAGEALLOC
254 printk("DEBUG_PAGEALLOC");
255#endif
256 printk("\n");
257 sysfs_printk_last_file();
258 if (notify_die(DIE_OOPS, str, regs, err,
259 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
260 return 1;
261
262 show_registers(regs);
263#ifdef CONFIG_X86_32
264 sp = (unsigned long) (&regs->sp);
265 savesegment(ss, ss);
266 if (user_mode(regs)) {
267 sp = regs->sp;
268 ss = regs->ss & 0xffff;
269 }
270 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
271 print_symbol("%s", regs->ip);
272 printk(" SS:ESP %04x:%08lx\n", ss, sp);
273#else
274 /* Executive summary in case the oops scrolled away */
275 printk(KERN_ALERT "RIP ");
276 printk_address(regs->ip, 1);
277 printk(" RSP <%016lx>\n", regs->sp);
278#endif
279 return 0;
280}
281
282/*
283 * This is gone through when something in the kernel has done something bad
284 * and is about to be terminated:
285 */
286void die(const char *str, struct pt_regs *regs, long err)
287{
288 unsigned long flags = oops_begin();
289 int sig = SIGSEGV;
290
291 if (!user_mode_vm(regs))
292 report_bug(regs->ip, regs);
293
294 if (__die(str, regs, err))
295 sig = 0;
296 oops_end(flags, regs, sig);
297}
298
299void notrace __kprobes
300die_nmi(char *str, struct pt_regs *regs, int do_panic)
301{
302 unsigned long flags;
303
304 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
305 return;
306
307 /*
308 * We are in trouble anyway, lets at least try
309 * to get a message out.
310 */
311 flags = oops_begin();
312 printk(KERN_EMERG "%s", str);
313 printk(" on CPU%d, ip %08lx, registers:\n",
314 smp_processor_id(), regs->ip);
315 show_registers(regs);
316 oops_end(flags, regs, 0);
317 if (do_panic || panic_on_oops)
318 panic("Non maskable interrupt");
319 nmi_exit();
320 local_irq_enable();
321 do_exit(SIGBUS);
322}
323
324static int __init oops_setup(char *s)
325{
326 if (!s)
327 return -EINVAL;
328 if (!strcmp(s, "panic"))
329 panic_on_oops = 1;
330 return 0;
331}
332early_param("oops", oops_setup);
333
334static int __init kstack_setup(char *s)
335{
336 if (!s)
337 return -EINVAL;
338 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
339 return 0;
340}
341early_param("kstack", kstack_setup);
342
343static int __init code_bytes_setup(char *s)
344{
345 code_bytes = simple_strtoul(s, NULL, 0);
346 if (code_bytes > 8192)
347 code_bytes = 8192;
348
349 return 1;
350}
351__setup("code_bytes=", code_bytes_setup);
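dumpstack.c above is the new home of the shared stack-walking machinery; dump_trace() drives an arbitrary struct stacktrace_ops, with print_trace_ops being just one consumer. A sketch of another consumer that merely counts reliable frames (the counting logic is illustrative):

#include <linux/sched.h>
#include <asm/stacktrace.h>

static void count_warning(void *data, char *msg) { }
static void count_warning_symbol(void *data, char *msg, unsigned long sym) { }

static int count_stack(void *data, char *name)
{
	return 0;			/* keep walking into the next stack */
}

static void count_address(void *data, unsigned long addr, int reliable)
{
	if (reliable)
		(*(unsigned int *)data)++;
}

static const struct stacktrace_ops count_ops = {
	.warning	= count_warning,
	.warning_symbol	= count_warning_symbol,
	.stack		= count_stack,
	.address	= count_address,
};

static unsigned int example_count_reliable_frames(void)
{
	unsigned int frames = 0;

	dump_trace(current, NULL, NULL, 0, &count_ops, &frames);
	return frames;
}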
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
new file mode 100644
index 000000000000..da87590b8698
--- /dev/null
+++ b/arch/x86/kernel/dumpstack.h
@@ -0,0 +1,39 @@
1/*
2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
4 */
5
6#ifndef DUMPSTACK_H
7#define DUMPSTACK_H
8
9#ifdef CONFIG_X86_32
10#define STACKSLOTS_PER_LINE 8
11#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
12#else
13#define STACKSLOTS_PER_LINE 4
14#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
15#endif
16
17extern unsigned long
18print_context_stack(struct thread_info *tinfo,
19 unsigned long *stack, unsigned long bp,
20 const struct stacktrace_ops *ops, void *data,
21 unsigned long *end, int *graph);
22
23extern void
24show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
25 unsigned long *stack, unsigned long bp, char *log_lvl);
26
27extern void
28show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
29 unsigned long *sp, unsigned long bp, char *log_lvl);
30
31extern unsigned int code_bytes;
32extern int kstack_depth_to_print;
33
34/* The form of the top of the frame on the stack */
35struct stack_frame {
36 struct stack_frame *next_frame;
37 unsigned long return_address;
38};
39#endif
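struct stack_frame above describes the saved frame-pointer pair that print_context_stack() follows. A stripped-down sketch of that walk (the bounds checks done by valid_stack_ptr() are omitted here):

#include <linux/kernel.h>

static void example_walk_frames(unsigned long bp)
{
	struct stack_frame *frame = (struct stack_frame *)bp;

	while (frame) {
		unsigned long ret = frame->return_address;

		if (!__kernel_text_address(ret))
			break;
		/* "ret" is a reliable caller address at this point */
		frame = frame->next_frame;
	}
}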
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index b3614752197b..d593cd1f58dc 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -17,69 +17,14 @@
17 17
18#include <asm/stacktrace.h> 18#include <asm/stacktrace.h>
19 19
20#define STACKSLOTS_PER_LINE 8 20#include "dumpstack.h"
21#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
22
23int panic_on_unrecovered_nmi;
24int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
25static unsigned int code_bytes = 64;
26static int die_counter;
27
28void printk_address(unsigned long address, int reliable)
29{
30 printk(" [<%p>] %s%pS\n", (void *) address,
31 reliable ? "" : "? ", (void *) address);
32}
33
34static inline int valid_stack_ptr(struct thread_info *tinfo,
35 void *p, unsigned int size, void *end)
36{
37 void *t = tinfo;
38 if (end) {
39 if (p < end && p >= (end-THREAD_SIZE))
40 return 1;
41 else
42 return 0;
43 }
44 return p > t && p < t + THREAD_SIZE - size;
45}
46
47/* The form of the top of the frame on the stack */
48struct stack_frame {
49 struct stack_frame *next_frame;
50 unsigned long return_address;
51};
52
53static inline unsigned long
54print_context_stack(struct thread_info *tinfo,
55 unsigned long *stack, unsigned long bp,
56 const struct stacktrace_ops *ops, void *data,
57 unsigned long *end)
58{
59 struct stack_frame *frame = (struct stack_frame *)bp;
60
61 while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
62 unsigned long addr;
63
64 addr = *stack;
65 if (__kernel_text_address(addr)) {
66 if ((unsigned long) stack == bp + sizeof(long)) {
67 ops->address(data, addr, 1);
68 frame = frame->next_frame;
69 bp = (unsigned long) frame;
70 } else {
71 ops->address(data, addr, bp == 0);
72 }
73 }
74 stack++;
75 }
76 return bp;
77}
78 21
79void dump_trace(struct task_struct *task, struct pt_regs *regs, 22void dump_trace(struct task_struct *task, struct pt_regs *regs,
80 unsigned long *stack, unsigned long bp, 23 unsigned long *stack, unsigned long bp,
81 const struct stacktrace_ops *ops, void *data) 24 const struct stacktrace_ops *ops, void *data)
82{ 25{
26 int graph = 0;
27
83 if (!task) 28 if (!task)
84 task = current; 29 task = current;
85 30
@@ -107,7 +52,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
107 52
108 context = (struct thread_info *) 53 context = (struct thread_info *)
109 ((unsigned long)stack & (~(THREAD_SIZE - 1))); 54 ((unsigned long)stack & (~(THREAD_SIZE - 1)));
110 bp = print_context_stack(context, stack, bp, ops, data, NULL); 55 bp = print_context_stack(context, stack, bp, ops,
56 data, NULL, &graph);
111 57
112 stack = (unsigned long *)context->previous_esp; 58 stack = (unsigned long *)context->previous_esp;
113 if (!stack) 59 if (!stack)
@@ -119,57 +65,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
119} 65}
120EXPORT_SYMBOL(dump_trace); 66EXPORT_SYMBOL(dump_trace);
121 67
122static void 68void
123print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
124{
125 printk(data);
126 print_symbol(msg, symbol);
127 printk("\n");
128}
129
130static void print_trace_warning(void *data, char *msg)
131{
132 printk("%s%s\n", (char *)data, msg);
133}
134
135static int print_trace_stack(void *data, char *name)
136{
137 printk("%s <%s> ", (char *)data, name);
138 return 0;
139}
140
141/*
142 * Print one address/symbol entries per line.
143 */
144static void print_trace_address(void *data, unsigned long addr, int reliable)
145{
146 touch_nmi_watchdog();
147 printk(data);
148 printk_address(addr, reliable);
149}
150
151static const struct stacktrace_ops print_trace_ops = {
152 .warning = print_trace_warning,
153 .warning_symbol = print_trace_warning_symbol,
154 .stack = print_trace_stack,
155 .address = print_trace_address,
156};
157
158static void
159show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
160 unsigned long *stack, unsigned long bp, char *log_lvl)
161{
162 printk("%sCall Trace:\n", log_lvl);
163 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
164}
165
166void show_trace(struct task_struct *task, struct pt_regs *regs,
167 unsigned long *stack, unsigned long bp)
168{
169 show_trace_log_lvl(task, regs, stack, bp, "");
170}
171
172static void
173show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, 69show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
174 unsigned long *sp, unsigned long bp, char *log_lvl) 70 unsigned long *sp, unsigned long bp, char *log_lvl)
175{ 71{
@@ -196,33 +92,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
196 show_trace_log_lvl(task, regs, sp, bp, log_lvl); 92 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
197} 93}
198 94
199void show_stack(struct task_struct *task, unsigned long *sp)
200{
201 show_stack_log_lvl(task, NULL, sp, 0, "");
202}
203
204/*
205 * The architecture-independent dump_stack generator
206 */
207void dump_stack(void)
208{
209 unsigned long bp = 0;
210 unsigned long stack;
211
212#ifdef CONFIG_FRAME_POINTER
213 if (!bp)
214 get_bp(bp);
215#endif
216
217 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
218 current->pid, current->comm, print_tainted(),
219 init_utsname()->release,
220 (int)strcspn(init_utsname()->version, " "),
221 init_utsname()->version);
222 show_trace(NULL, NULL, &stack, bp);
223}
224
225EXPORT_SYMBOL(dump_stack);
226 95
227void show_registers(struct pt_regs *regs) 96void show_registers(struct pt_regs *regs)
228{ 97{
@@ -283,167 +152,3 @@ int is_valid_bugaddr(unsigned long ip)
283 return ud2 == 0x0b0f; 152 return ud2 == 0x0b0f;
284} 153}
285 154
286static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
287static int die_owner = -1;
288static unsigned int die_nest_count;
289
290unsigned __kprobes long oops_begin(void)
291{
292 unsigned long flags;
293
294 oops_enter();
295
296 if (die_owner != raw_smp_processor_id()) {
297 console_verbose();
298 raw_local_irq_save(flags);
299 __raw_spin_lock(&die_lock);
300 die_owner = smp_processor_id();
301 die_nest_count = 0;
302 bust_spinlocks(1);
303 } else {
304 raw_local_irq_save(flags);
305 }
306 die_nest_count++;
307 return flags;
308}
309
310void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
311{
312 bust_spinlocks(0);
313 die_owner = -1;
314 add_taint(TAINT_DIE);
315 __raw_spin_unlock(&die_lock);
316 raw_local_irq_restore(flags);
317
318 if (!regs)
319 return;
320
321 if (kexec_should_crash(current))
322 crash_kexec(regs);
323 if (in_interrupt())
324 panic("Fatal exception in interrupt");
325 if (panic_on_oops)
326 panic("Fatal exception");
327 oops_exit();
328 do_exit(signr);
329}
330
331int __kprobes __die(const char *str, struct pt_regs *regs, long err)
332{
333 unsigned short ss;
334 unsigned long sp;
335
336 printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
337#ifdef CONFIG_PREEMPT
338 printk("PREEMPT ");
339#endif
340#ifdef CONFIG_SMP
341 printk("SMP ");
342#endif
343#ifdef CONFIG_DEBUG_PAGEALLOC
344 printk("DEBUG_PAGEALLOC");
345#endif
346 printk("\n");
347 sysfs_printk_last_file();
348 if (notify_die(DIE_OOPS, str, regs, err,
349 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
350 return 1;
351
352 show_registers(regs);
353 /* Executive summary in case the oops scrolled away */
354 sp = (unsigned long) (&regs->sp);
355 savesegment(ss, ss);
356 if (user_mode(regs)) {
357 sp = regs->sp;
358 ss = regs->ss & 0xffff;
359 }
360 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
361 print_symbol("%s", regs->ip);
362 printk(" SS:ESP %04x:%08lx\n", ss, sp);
363 return 0;
364}
365
366/*
367 * This is gone through when something in the kernel has done something bad
368 * and is about to be terminated:
369 */
370void die(const char *str, struct pt_regs *regs, long err)
371{
372 unsigned long flags = oops_begin();
373
374 if (die_nest_count < 3) {
375 report_bug(regs->ip, regs);
376
377 if (__die(str, regs, err))
378 regs = NULL;
379 } else {
380 printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
381 }
382
383 oops_end(flags, regs, SIGSEGV);
384}
385
386static DEFINE_SPINLOCK(nmi_print_lock);
387
388void notrace __kprobes
389die_nmi(char *str, struct pt_regs *regs, int do_panic)
390{
391 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
392 return;
393
394 spin_lock(&nmi_print_lock);
395 /*
396 * We are in trouble anyway, lets at least try
397 * to get a message out:
398 */
399 bust_spinlocks(1);
400 printk(KERN_EMERG "%s", str);
401 printk(" on CPU%d, ip %08lx, registers:\n",
402 smp_processor_id(), regs->ip);
403 show_registers(regs);
404 if (do_panic)
405 panic("Non maskable interrupt");
406 console_silent();
407 spin_unlock(&nmi_print_lock);
408
409 /*
410 * If we are in kernel we are probably nested up pretty bad
411 * and might aswell get out now while we still can:
412 */
413 if (!user_mode_vm(regs)) {
414 current->thread.trap_no = 2;
415 crash_kexec(regs);
416 }
417
418 bust_spinlocks(0);
419 do_exit(SIGSEGV);
420}
421
422static int __init oops_setup(char *s)
423{
424 if (!s)
425 return -EINVAL;
426 if (!strcmp(s, "panic"))
427 panic_on_oops = 1;
428 return 0;
429}
430early_param("oops", oops_setup);
431
432static int __init kstack_setup(char *s)
433{
434 if (!s)
435 return -EINVAL;
436 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
437 return 0;
438}
439early_param("kstack", kstack_setup);
440
441static int __init code_bytes_setup(char *s)
442{
443 code_bytes = simple_strtoul(s, NULL, 0);
444 if (code_bytes > 8192)
445 code_bytes = 8192;
446
447 return 1;
448}
449__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 96a5db7da8a7..c302d0707048 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -17,19 +17,7 @@
17 17
18#include <asm/stacktrace.h> 18#include <asm/stacktrace.h>
19 19
20#define STACKSLOTS_PER_LINE 4 20#include "dumpstack.h"
21#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
22
23int panic_on_unrecovered_nmi;
24int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
25static unsigned int code_bytes = 64;
26static int die_counter;
27
28void printk_address(unsigned long address, int reliable)
29{
30 printk(" [<%p>] %s%pS\n", (void *) address,
31 reliable ? "" : "? ", (void *) address);
32}
33 21
34static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, 22static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
35 unsigned *usedp, char **idp) 23 unsigned *usedp, char **idp)
@@ -113,51 +101,6 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
113 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack 101 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
114 */ 102 */
115 103
116static inline int valid_stack_ptr(struct thread_info *tinfo,
117 void *p, unsigned int size, void *end)
118{
119 void *t = tinfo;
120 if (end) {
121 if (p < end && p >= (end-THREAD_SIZE))
122 return 1;
123 else
124 return 0;
125 }
126 return p > t && p < t + THREAD_SIZE - size;
127}
128
129/* The form of the top of the frame on the stack */
130struct stack_frame {
131 struct stack_frame *next_frame;
132 unsigned long return_address;
133};
134
135static inline unsigned long
136print_context_stack(struct thread_info *tinfo,
137 unsigned long *stack, unsigned long bp,
138 const struct stacktrace_ops *ops, void *data,
139 unsigned long *end)
140{
141 struct stack_frame *frame = (struct stack_frame *)bp;
142
143 while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
144 unsigned long addr;
145
146 addr = *stack;
147 if (__kernel_text_address(addr)) {
148 if ((unsigned long) stack == bp + sizeof(long)) {
149 ops->address(data, addr, 1);
150 frame = frame->next_frame;
151 bp = (unsigned long) frame;
152 } else {
153 ops->address(data, addr, bp == 0);
154 }
155 }
156 stack++;
157 }
158 return bp;
159}
160
161void dump_trace(struct task_struct *task, struct pt_regs *regs, 104void dump_trace(struct task_struct *task, struct pt_regs *regs,
162 unsigned long *stack, unsigned long bp, 105 unsigned long *stack, unsigned long bp,
163 const struct stacktrace_ops *ops, void *data) 106 const struct stacktrace_ops *ops, void *data)
@@ -166,6 +109,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
166 unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; 109 unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
167 unsigned used = 0; 110 unsigned used = 0;
168 struct thread_info *tinfo; 111 struct thread_info *tinfo;
112 int graph = 0;
169 113
170 if (!task) 114 if (!task)
171 task = current; 115 task = current;
@@ -206,7 +150,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
206 break; 150 break;
207 151
208 bp = print_context_stack(tinfo, stack, bp, ops, 152 bp = print_context_stack(tinfo, stack, bp, ops,
209 data, estack_end); 153 data, estack_end, &graph);
210 ops->stack(data, "<EOE>"); 154 ops->stack(data, "<EOE>");
211 /* 155 /*
212 * We link to the next stack via the 156 * We link to the next stack via the
@@ -225,7 +169,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
225 if (ops->stack(data, "IRQ") < 0) 169 if (ops->stack(data, "IRQ") < 0)
226 break; 170 break;
227 bp = print_context_stack(tinfo, stack, bp, 171 bp = print_context_stack(tinfo, stack, bp,
228 ops, data, irqstack_end); 172 ops, data, irqstack_end, &graph);
229 /* 173 /*
230 * We link to the next stack (which would be 174 * We link to the next stack (which would be
231 * the process stack normally) the last 175 * the process stack normally) the last
@@ -243,62 +187,12 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
243 /* 187 /*
244 * This handles the process stack: 188 * This handles the process stack:
245 */ 189 */
246 bp = print_context_stack(tinfo, stack, bp, ops, data, NULL); 190 bp = print_context_stack(tinfo, stack, bp, ops, data, NULL, &graph);
247 put_cpu(); 191 put_cpu();
248} 192}
249EXPORT_SYMBOL(dump_trace); 193EXPORT_SYMBOL(dump_trace);
250 194
251static void 195void
252print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
253{
254 printk(data);
255 print_symbol(msg, symbol);
256 printk("\n");
257}
258
259static void print_trace_warning(void *data, char *msg)
260{
261 printk("%s%s\n", (char *)data, msg);
262}
263
264static int print_trace_stack(void *data, char *name)
265{
266 printk("%s <%s> ", (char *)data, name);
267 return 0;
268}
269
270/*
271 * Print one address/symbol entries per line.
272 */
273static void print_trace_address(void *data, unsigned long addr, int reliable)
274{
275 touch_nmi_watchdog();
276 printk(data);
277 printk_address(addr, reliable);
278}
279
280static const struct stacktrace_ops print_trace_ops = {
281 .warning = print_trace_warning,
282 .warning_symbol = print_trace_warning_symbol,
283 .stack = print_trace_stack,
284 .address = print_trace_address,
285};
286
287static void
288show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
289 unsigned long *stack, unsigned long bp, char *log_lvl)
290{
291 printk("%sCall Trace:\n", log_lvl);
292 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
293}
294
295void show_trace(struct task_struct *task, struct pt_regs *regs,
296 unsigned long *stack, unsigned long bp)
297{
298 show_trace_log_lvl(task, regs, stack, bp, "");
299}
300
301static void
302show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, 196show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
303 unsigned long *sp, unsigned long bp, char *log_lvl) 197 unsigned long *sp, unsigned long bp, char *log_lvl)
304{ 198{
@@ -342,33 +236,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
342 show_trace_log_lvl(task, regs, sp, bp, log_lvl); 236 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
343} 237}
344 238
345void show_stack(struct task_struct *task, unsigned long *sp)
346{
347 show_stack_log_lvl(task, NULL, sp, 0, "");
348}
349
350/*
351 * The architecture-independent dump_stack generator
352 */
353void dump_stack(void)
354{
355 unsigned long bp = 0;
356 unsigned long stack;
357
358#ifdef CONFIG_FRAME_POINTER
359 if (!bp)
360 get_bp(bp);
361#endif
362
363 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
364 current->pid, current->comm, print_tainted(),
365 init_utsname()->release,
366 (int)strcspn(init_utsname()->version, " "),
367 init_utsname()->version);
368 show_trace(NULL, NULL, &stack, bp);
369}
370EXPORT_SYMBOL(dump_stack);
371
372void show_registers(struct pt_regs *regs) 239void show_registers(struct pt_regs *regs)
373{ 240{
374 int i; 241 int i;
@@ -429,147 +296,3 @@ int is_valid_bugaddr(unsigned long ip)
429 return ud2 == 0x0b0f; 296 return ud2 == 0x0b0f;
430} 297}
431 298
432static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
433static int die_owner = -1;
434static unsigned int die_nest_count;
435
436unsigned __kprobes long oops_begin(void)
437{
438 int cpu;
439 unsigned long flags;
440
441 oops_enter();
442
443 /* racy, but better than risking deadlock. */
444 raw_local_irq_save(flags);
445 cpu = smp_processor_id();
446 if (!__raw_spin_trylock(&die_lock)) {
447 if (cpu == die_owner)
448 /* nested oops. should stop eventually */;
449 else
450 __raw_spin_lock(&die_lock);
451 }
452 die_nest_count++;
453 die_owner = cpu;
454 console_verbose();
455 bust_spinlocks(1);
456 return flags;
457}
458
459void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
460{
461 die_owner = -1;
462 bust_spinlocks(0);
463 die_nest_count--;
464 if (!die_nest_count)
465 /* Nest count reaches zero, release the lock. */
466 __raw_spin_unlock(&die_lock);
467 raw_local_irq_restore(flags);
468 if (!regs) {
469 oops_exit();
470 return;
471 }
472 if (in_interrupt())
473 panic("Fatal exception in interrupt");
474 if (panic_on_oops)
475 panic("Fatal exception");
476 oops_exit();
477 do_exit(signr);
478}
479
480int __kprobes __die(const char *str, struct pt_regs *regs, long err)
481{
482 printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
483#ifdef CONFIG_PREEMPT
484 printk("PREEMPT ");
485#endif
486#ifdef CONFIG_SMP
487 printk("SMP ");
488#endif
489#ifdef CONFIG_DEBUG_PAGEALLOC
490 printk("DEBUG_PAGEALLOC");
491#endif
492 printk("\n");
493 sysfs_printk_last_file();
494 if (notify_die(DIE_OOPS, str, regs, err,
495 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
496 return 1;
497
498 show_registers(regs);
499 add_taint(TAINT_DIE);
500 /* Executive summary in case the oops scrolled away */
501 printk(KERN_ALERT "RIP ");
502 printk_address(regs->ip, 1);
503 printk(" RSP <%016lx>\n", regs->sp);
504 if (kexec_should_crash(current))
505 crash_kexec(regs);
506 return 0;
507}
508
509void die(const char *str, struct pt_regs *regs, long err)
510{
511 unsigned long flags = oops_begin();
512
513 if (!user_mode(regs))
514 report_bug(regs->ip, regs);
515
516 if (__die(str, regs, err))
517 regs = NULL;
518 oops_end(flags, regs, SIGSEGV);
519}
520
521notrace __kprobes void
522die_nmi(char *str, struct pt_regs *regs, int do_panic)
523{
524 unsigned long flags;
525
526 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
527 return;
528
529 flags = oops_begin();
530 /*
531 * We are in trouble anyway, lets at least try
532 * to get a message out.
533 */
534 printk(KERN_EMERG "%s", str);
535 printk(" on CPU%d, ip %08lx, registers:\n",
536 smp_processor_id(), regs->ip);
537 show_registers(regs);
538 if (kexec_should_crash(current))
539 crash_kexec(regs);
540 if (do_panic || panic_on_oops)
541 panic("Non maskable interrupt");
542 oops_end(flags, NULL, SIGBUS);
543 nmi_exit();
544 local_irq_enable();
545 do_exit(SIGBUS);
546}
547
548static int __init oops_setup(char *s)
549{
550 if (!s)
551 return -EINVAL;
552 if (!strcmp(s, "panic"))
553 panic_on_oops = 1;
554 return 0;
555}
556early_param("oops", oops_setup);
557
558static int __init kstack_setup(char *s)
559{
560 if (!s)
561 return -EINVAL;
562 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
563 return 0;
564}
565early_param("kstack", kstack_setup);
566
567static int __init code_bytes_setup(char *s)
568{
569 code_bytes = simple_strtoul(s, NULL, 0);
570 if (code_bytes > 8192)
571 code_bytes = 8192;
572
573 return 1;
574}
575__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 28b597ef9ca1..43ceb3f454bf 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1157,6 +1157,9 @@ ENTRY(mcount)
1157END(mcount) 1157END(mcount)
1158 1158
1159ENTRY(ftrace_caller) 1159ENTRY(ftrace_caller)
1160 cmpl $0, function_trace_stop
1161 jne ftrace_stub
1162
1160 pushl %eax 1163 pushl %eax
1161 pushl %ecx 1164 pushl %ecx
1162 pushl %edx 1165 pushl %edx
@@ -1171,6 +1174,11 @@ ftrace_call:
1171 popl %edx 1174 popl %edx
1172 popl %ecx 1175 popl %ecx
1173 popl %eax 1176 popl %eax
1177#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1178.globl ftrace_graph_call
1179ftrace_graph_call:
1180 jmp ftrace_stub
1181#endif
1174 1182
1175.globl ftrace_stub 1183.globl ftrace_stub
1176ftrace_stub: 1184ftrace_stub:
@@ -1180,8 +1188,18 @@ END(ftrace_caller)
1180#else /* ! CONFIG_DYNAMIC_FTRACE */ 1188#else /* ! CONFIG_DYNAMIC_FTRACE */
1181 1189
1182ENTRY(mcount) 1190ENTRY(mcount)
1191 cmpl $0, function_trace_stop
1192 jne ftrace_stub
1193
1183 cmpl $ftrace_stub, ftrace_trace_function 1194 cmpl $ftrace_stub, ftrace_trace_function
1184 jnz trace 1195 jnz trace
1196#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1197 cmpl $ftrace_stub, ftrace_graph_return
1198 jnz ftrace_graph_caller
1199
1200 cmpl $ftrace_graph_entry_stub, ftrace_graph_entry
1201 jnz ftrace_graph_caller
1202#endif
1185.globl ftrace_stub 1203.globl ftrace_stub
1186ftrace_stub: 1204ftrace_stub:
1187 ret 1205 ret
@@ -1200,12 +1218,43 @@ trace:
1200 popl %edx 1218 popl %edx
1201 popl %ecx 1219 popl %ecx
1202 popl %eax 1220 popl %eax
1203
1204 jmp ftrace_stub 1221 jmp ftrace_stub
1205END(mcount) 1222END(mcount)
1206#endif /* CONFIG_DYNAMIC_FTRACE */ 1223#endif /* CONFIG_DYNAMIC_FTRACE */
1207#endif /* CONFIG_FUNCTION_TRACER */ 1224#endif /* CONFIG_FUNCTION_TRACER */
1208 1225
1226#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1227ENTRY(ftrace_graph_caller)
1228 cmpl $0, function_trace_stop
1229 jne ftrace_stub
1230
1231 pushl %eax
1232 pushl %ecx
1233 pushl %edx
1234 movl 0xc(%esp), %edx
1235 lea 0x4(%ebp), %eax
1236 subl $MCOUNT_INSN_SIZE, %edx
1237 call prepare_ftrace_return
1238 popl %edx
1239 popl %ecx
1240 popl %eax
1241 ret
1242END(ftrace_graph_caller)
1243
1244.globl return_to_handler
1245return_to_handler:
1246 pushl $0
1247 pushl %eax
1248 pushl %ecx
1249 pushl %edx
1250 call ftrace_return_to_handler
1251 movl %eax, 0xc(%esp)
1252 popl %edx
1253 popl %ecx
1254 popl %eax
1255 ret
1256#endif
1257
1209.section .rodata,"a" 1258.section .rodata,"a"
1210#include "syscall_table_32.S" 1259#include "syscall_table_32.S"
1211 1260
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b86f332c96a6..54e0bbdccb99 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -68,6 +68,8 @@ ENTRY(mcount)
68END(mcount) 68END(mcount)
69 69
70ENTRY(ftrace_caller) 70ENTRY(ftrace_caller)
71 cmpl $0, function_trace_stop
72 jne ftrace_stub
71 73
72 /* taken from glibc */ 74 /* taken from glibc */
73 subq $0x38, %rsp 75 subq $0x38, %rsp
@@ -96,6 +98,12 @@ ftrace_call:
96 movq (%rsp), %rax 98 movq (%rsp), %rax
97 addq $0x38, %rsp 99 addq $0x38, %rsp
98 100
101#ifdef CONFIG_FUNCTION_GRAPH_TRACER
102.globl ftrace_graph_call
103ftrace_graph_call:
104 jmp ftrace_stub
105#endif
106
99.globl ftrace_stub 107.globl ftrace_stub
100ftrace_stub: 108ftrace_stub:
101 retq 109 retq
@@ -103,8 +111,20 @@ END(ftrace_caller)
103 111
104#else /* ! CONFIG_DYNAMIC_FTRACE */ 112#else /* ! CONFIG_DYNAMIC_FTRACE */
105ENTRY(mcount) 113ENTRY(mcount)
114 cmpl $0, function_trace_stop
115 jne ftrace_stub
116
106 cmpq $ftrace_stub, ftrace_trace_function 117 cmpq $ftrace_stub, ftrace_trace_function
107 jnz trace 118 jnz trace
119
120#ifdef CONFIG_FUNCTION_GRAPH_TRACER
121 cmpq $ftrace_stub, ftrace_graph_return
122 jnz ftrace_graph_caller
123
124 cmpq $ftrace_graph_entry_stub, ftrace_graph_entry
125 jnz ftrace_graph_caller
126#endif
127
108.globl ftrace_stub 128.globl ftrace_stub
109ftrace_stub: 129ftrace_stub:
110 retq 130 retq
@@ -140,6 +160,69 @@ END(mcount)
140#endif /* CONFIG_DYNAMIC_FTRACE */ 160#endif /* CONFIG_DYNAMIC_FTRACE */
141#endif /* CONFIG_FUNCTION_TRACER */ 161#endif /* CONFIG_FUNCTION_TRACER */
142 162
163#ifdef CONFIG_FUNCTION_GRAPH_TRACER
164ENTRY(ftrace_graph_caller)
165 cmpl $0, function_trace_stop
166 jne ftrace_stub
167
168 subq $0x38, %rsp
169 movq %rax, (%rsp)
170 movq %rcx, 8(%rsp)
171 movq %rdx, 16(%rsp)
172 movq %rsi, 24(%rsp)
173 movq %rdi, 32(%rsp)
174 movq %r8, 40(%rsp)
175 movq %r9, 48(%rsp)
176
177 leaq 8(%rbp), %rdi
178 movq 0x38(%rsp), %rsi
179 subq $MCOUNT_INSN_SIZE, %rsi
180
181 call prepare_ftrace_return
182
183 movq 48(%rsp), %r9
184 movq 40(%rsp), %r8
185 movq 32(%rsp), %rdi
186 movq 24(%rsp), %rsi
187 movq 16(%rsp), %rdx
188 movq 8(%rsp), %rcx
189 movq (%rsp), %rax
190 addq $0x38, %rsp
191 retq
192END(ftrace_graph_caller)
193
194
195.globl return_to_handler
196return_to_handler:
197 subq $80, %rsp
198
199 movq %rax, (%rsp)
200 movq %rcx, 8(%rsp)
201 movq %rdx, 16(%rsp)
202 movq %rsi, 24(%rsp)
203 movq %rdi, 32(%rsp)
204 movq %r8, 40(%rsp)
205 movq %r9, 48(%rsp)
206 movq %r10, 56(%rsp)
207 movq %r11, 64(%rsp)
208
209 call ftrace_return_to_handler
210
211 movq %rax, 72(%rsp)
212 movq 64(%rsp), %r11
213 movq 56(%rsp), %r10
214 movq 48(%rsp), %r9
215 movq 40(%rsp), %r8
216 movq 32(%rsp), %rdi
217 movq 24(%rsp), %rsi
218 movq 16(%rsp), %rdx
219 movq 8(%rsp), %rcx
220 movq (%rsp), %rax
221 addq $72, %rsp
222 retq
223#endif
224
225
143#ifndef CONFIG_PREEMPT 226#ifndef CONFIG_PREEMPT
144#define retint_kernel retint_restore_args 227#define retint_kernel retint_restore_args
145#endif 228#endif
diff --git a/arch/x86/kernel/es7000_32.c b/arch/x86/kernel/es7000_32.c
index 0aa2c443d600..53699c931ad4 100644
--- a/arch/x86/kernel/es7000_32.c
+++ b/arch/x86/kernel/es7000_32.c
@@ -38,8 +38,11 @@
38#include <asm/io.h> 38#include <asm/io.h>
39#include <asm/nmi.h> 39#include <asm/nmi.h>
40#include <asm/smp.h> 40#include <asm/smp.h>
41#include <asm/atomic.h>
41#include <asm/apicdef.h> 42#include <asm/apicdef.h>
42#include <mach_mpparse.h> 43#include <mach_mpparse.h>
44#include <asm/genapic.h>
45#include <asm/setup.h>
43 46
44/* 47/*
45 * ES7000 chipsets 48 * ES7000 chipsets
@@ -161,6 +164,43 @@ es7000_rename_gsi(int ioapic, int gsi)
161 return gsi; 164 return gsi;
162} 165}
163 166
167static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
168{
169 unsigned long vect = 0, psaival = 0;
170
171 if (psai == NULL)
172 return -1;
173
174 vect = ((unsigned long)__pa(eip)/0x1000) << 16;
175 psaival = (0x1000000 | vect | cpu);
176
177 while (*psai & 0x1000000)
178 ;
179
180 *psai = psaival;
181
182 return 0;
183}
184
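A worked example of the mailbox word assembled above (all values are illustrative): waking cpu 3 with a trampoline whose __pa(eip) is 0x9000 gives

	/*
	 * vect    = (0x9000 / 0x1000) << 16  = 0x90000
	 * psaival = 0x1000000 | 0x90000 | 3  = 0x01090003
	 *
	 * Bit 24 (0x1000000) is the bit the while-loop above waits to see
	 * cleared before posting a new request into *psai.
	 */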
185static void noop_wait_for_deassert(atomic_t *deassert_not_used)
186{
187}
188
189static int __init es7000_update_genapic(void)
190{
191 genapic->wakeup_cpu = wakeup_secondary_cpu_via_mip;
192
193 /* MPENTIUMIII */
194 if (boot_cpu_data.x86 == 6 &&
 195 (boot_cpu_data.x86_model >= 7 && boot_cpu_data.x86_model <= 11)) {
196 es7000_update_genapic_to_cluster();
197 genapic->wait_for_init_deassert = noop_wait_for_deassert;
198 genapic->wakeup_cpu = wakeup_secondary_cpu_via_mip;
199 }
200
201 return 0;
202}
203
164void __init 204void __init
165setup_unisys(void) 205setup_unisys(void)
166{ 206{
@@ -176,6 +216,8 @@ setup_unisys(void)
176 else 216 else
177 es7000_plat = ES7000_CLASSIC; 217 es7000_plat = ES7000_CLASSIC;
178 ioapic_renumber_irq = es7000_rename_gsi; 218 ioapic_renumber_irq = es7000_rename_gsi;
219
220 x86_quirks->update_genapic = es7000_update_genapic;
179} 221}
180 222
181/* 223/*
@@ -317,26 +359,6 @@ es7000_mip_write(struct mip_reg *mip_reg)
317 return status; 359 return status;
318} 360}
319 361
320int
321es7000_start_cpu(int cpu, unsigned long eip)
322{
323 unsigned long vect = 0, psaival = 0;
324
325 if (psai == NULL)
326 return -1;
327
328 vect = ((unsigned long)__pa(eip)/0x1000) << 16;
329 psaival = (0x1000000 | vect | cpu);
330
331 while (*psai & 0x1000000)
332 ;
333
334 *psai = psaival;
335
336 return 0;
337
338}
339
340void __init 362void __init
341es7000_sw_apic(void) 363es7000_sw_apic(void)
342{ 364{
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 50ea0ac8c9bf..1b43086b097a 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -14,14 +14,17 @@
14#include <linux/uaccess.h> 14#include <linux/uaccess.h>
15#include <linux/ftrace.h> 15#include <linux/ftrace.h>
16#include <linux/percpu.h> 16#include <linux/percpu.h>
17#include <linux/sched.h>
17#include <linux/init.h> 18#include <linux/init.h>
18#include <linux/list.h> 19#include <linux/list.h>
19 20
20#include <asm/ftrace.h> 21#include <asm/ftrace.h>
22#include <linux/ftrace.h>
21#include <asm/nops.h> 23#include <asm/nops.h>
24#include <asm/nmi.h>
22 25
23 26
24static unsigned char ftrace_nop[MCOUNT_INSN_SIZE]; 27#ifdef CONFIG_DYNAMIC_FTRACE
25 28
26union ftrace_code_union { 29union ftrace_code_union {
27 char code[MCOUNT_INSN_SIZE]; 30 char code[MCOUNT_INSN_SIZE];
@@ -31,18 +34,12 @@ union ftrace_code_union {
31 } __attribute__((packed)); 34 } __attribute__((packed));
32}; 35};
33 36
34
35static int ftrace_calc_offset(long ip, long addr) 37static int ftrace_calc_offset(long ip, long addr)
36{ 38{
37 return (int)(addr - ip); 39 return (int)(addr - ip);
38} 40}
39 41
40unsigned char *ftrace_nop_replace(void) 42static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
41{
42 return ftrace_nop;
43}
44
45unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
46{ 43{
47 static union ftrace_code_union calc; 44 static union ftrace_code_union calc;
48 45
@@ -56,7 +53,142 @@ unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
56 return calc.code; 53 return calc.code;
57} 54}
58 55
59int 56/*
57 * Modifying code must take extra care. On an SMP machine, if
 58 * the code being modified is also being executed on another CPU,
 59 * that CPU will have undefined results and possibly take a GPF.
 60 * We use kstop_machine to stop other CPUs from executing code.
61 * But this does not stop NMIs from happening. We still need
62 * to protect against that. We separate out the modification of
63 * the code to take care of this.
64 *
65 * Two buffers are added: An IP buffer and a "code" buffer.
66 *
67 * 1) Put the instruction pointer into the IP buffer
68 * and the new code into the "code" buffer.
69 * 2) Set a flag that says we are modifying code
70 * 3) Wait for any running NMIs to finish.
71 * 4) Write the code
72 * 5) clear the flag.
73 * 6) Wait for any running NMIs to finish.
74 *
75 * If an NMI is executed, the first thing it does is to call
76 * "ftrace_nmi_enter". This will check if the flag is set to write
77 * and if it is, it will write what is in the IP and "code" buffers.
78 *
79 * The trick is, it does not matter if everyone is writing the same
80 * content to the code location. Also, if a CPU is executing code
81 * it is OK to write to that code location if the contents being written
82 * are the same as what exists.
83 */
84
85static atomic_t in_nmi = ATOMIC_INIT(0);
86static int mod_code_status; /* holds return value of text write */
87static int mod_code_write; /* set when NMI should do the write */
88static void *mod_code_ip; /* holds the IP to write to */
89static void *mod_code_newcode; /* holds the text to write to the IP */
90
91static unsigned nmi_wait_count;
92static atomic_t nmi_update_count = ATOMIC_INIT(0);
93
94int ftrace_arch_read_dyn_info(char *buf, int size)
95{
96 int r;
97
98 r = snprintf(buf, size, "%u %u",
99 nmi_wait_count,
100 atomic_read(&nmi_update_count));
101 return r;
102}
103
104static void ftrace_mod_code(void)
105{
106 /*
 107 * Yes, more than one CPU can be writing to mod_code_status
108 * (and the code itself)
109 * But if one were to fail, then they all should, and if one were
110 * to succeed, then they all should.
111 */
112 mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode,
113 MCOUNT_INSN_SIZE);
114}
115
116void ftrace_nmi_enter(void)
117{
118 atomic_inc(&in_nmi);
119 /* Must have in_nmi seen before reading write flag */
120 smp_mb();
121 if (mod_code_write) {
122 ftrace_mod_code();
123 atomic_inc(&nmi_update_count);
124 }
125}
126
127void ftrace_nmi_exit(void)
128{
129 /* Finish all executions before clearing in_nmi */
130 smp_wmb();
131 atomic_dec(&in_nmi);
132}
133
134static void wait_for_nmi(void)
135{
136 int waited = 0;
137
138 while (atomic_read(&in_nmi)) {
139 waited = 1;
140 cpu_relax();
141 }
142
143 if (waited)
144 nmi_wait_count++;
145}
146
147static int
148do_ftrace_mod_code(unsigned long ip, void *new_code)
149{
150 mod_code_ip = (void *)ip;
151 mod_code_newcode = new_code;
152
153 /* The buffers need to be visible before we let NMIs write them */
154 smp_wmb();
155
156 mod_code_write = 1;
157
158 /* Make sure write bit is visible before we wait on NMIs */
159 smp_mb();
160
161 wait_for_nmi();
162
163 /* Make sure all running NMIs have finished before we write the code */
164 smp_mb();
165
166 ftrace_mod_code();
167
168 /* Make sure the write happens before clearing the bit */
169 smp_wmb();
170
171 mod_code_write = 0;
172
173 /* make sure NMIs see the cleared bit */
174 smp_mb();
175
176 wait_for_nmi();
177
178 return mod_code_status;
179}
180
181
182
183
184static unsigned char ftrace_nop[MCOUNT_INSN_SIZE];
185
186static unsigned char *ftrace_nop_replace(void)
187{
188 return ftrace_nop;
189}
190
191static int
60ftrace_modify_code(unsigned long ip, unsigned char *old_code, 192ftrace_modify_code(unsigned long ip, unsigned char *old_code,
61 unsigned char *new_code) 193 unsigned char *new_code)
62{ 194{
@@ -81,7 +213,7 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
81 return -EINVAL; 213 return -EINVAL;
82 214
83 /* replace the text with the new text */ 215 /* replace the text with the new text */
84 if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE)) 216 if (do_ftrace_mod_code(ip, new_code))
85 return -EPERM; 217 return -EPERM;
86 218
87 sync_core(); 219 sync_core();
@@ -89,6 +221,29 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
89 return 0; 221 return 0;
90} 222}
91 223
224int ftrace_make_nop(struct module *mod,
225 struct dyn_ftrace *rec, unsigned long addr)
226{
227 unsigned char *new, *old;
228 unsigned long ip = rec->ip;
229
230 old = ftrace_call_replace(ip, addr);
231 new = ftrace_nop_replace();
232
233 return ftrace_modify_code(rec->ip, old, new);
234}
235
236int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
237{
238 unsigned char *new, *old;
239 unsigned long ip = rec->ip;
240
241 old = ftrace_nop_replace();
242 new = ftrace_call_replace(ip, addr);
243
244 return ftrace_modify_code(rec->ip, old, new);
245}
246
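Concretely, the two helpers above flip one mcount call site between the 5-byte call built by ftrace_call_replace() and the 5-byte nop returned by ftrace_nop_replace(); a worked example with a made-up address:

	/*
	 * rec->ip = 0xffffffff80301000, addr = (unsigned long)ftrace_caller
	 *
	 * call form: e8 <rel32>, rel32 = addr - (rec->ip + MCOUNT_INSN_SIZE)
	 * nop form:  the 5-byte buffer returned by ftrace_nop_replace()
	 *
	 * ftrace_modify_code() first verifies the bytes currently at rec->ip
	 * match old_code, so a mismatch shows up as -EINVAL rather than a
	 * silently corrupted call site.
	 */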
92int ftrace_update_ftrace_func(ftrace_func_t func) 247int ftrace_update_ftrace_func(ftrace_func_t func)
93{ 248{
94 unsigned long ip = (unsigned long)(&ftrace_call); 249 unsigned long ip = (unsigned long)(&ftrace_call);
@@ -165,3 +320,218 @@ int __init ftrace_dyn_arch_init(void *data)
165 320
166 return 0; 321 return 0;
167} 322}
323#endif
324
325#ifdef CONFIG_FUNCTION_GRAPH_TRACER
326
327#ifdef CONFIG_DYNAMIC_FTRACE
328extern void ftrace_graph_call(void);
329
330static int ftrace_mod_jmp(unsigned long ip,
331 int old_offset, int new_offset)
332{
333 unsigned char code[MCOUNT_INSN_SIZE];
334
335 if (probe_kernel_read(code, (void *)ip, MCOUNT_INSN_SIZE))
336 return -EFAULT;
337
338 if (code[0] != 0xe9 || old_offset != *(int *)(&code[1]))
339 return -EINVAL;
340
341 *(int *)(&code[1]) = new_offset;
342
343 if (do_ftrace_mod_code(ip, &code))
344 return -EPERM;
345
346 return 0;
347}
348
349int ftrace_enable_ftrace_graph_caller(void)
350{
351 unsigned long ip = (unsigned long)(&ftrace_graph_call);
352 int old_offset, new_offset;
353
354 old_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE);
355 new_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE);
356
357 return ftrace_mod_jmp(ip, old_offset, new_offset);
358}
359
360int ftrace_disable_ftrace_graph_caller(void)
361{
362 unsigned long ip = (unsigned long)(&ftrace_graph_call);
363 int old_offset, new_offset;
364
365 old_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE);
366 new_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE);
367
368 return ftrace_mod_jmp(ip, old_offset, new_offset);
369}
370
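A worked example of the displacement arithmetic used by the two helpers above (addresses are made up; MCOUNT_INSN_SIZE is the 5-byte slot at ftrace_graph_call):

	/*
	 * ip   = (unsigned long)&ftrace_graph_call   = 0xffffffff80200000
	 * dest = (unsigned long)&ftrace_graph_caller = 0xffffffff80200100
	 *
	 * new_offset = dest - (ip + MCOUNT_INSN_SIZE) = 0x100 - 5 = 0xfb
	 *
	 * so enabling the graph caller rewrites the site to the 5-byte jump
	 *   e9 fb 00 00 00        jmp ftrace_graph_caller
	 * and disabling it recomputes the displacement back to ftrace_stub.
	 */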
371#else /* CONFIG_DYNAMIC_FTRACE */
372
373/*
 374 * These functions are picked from those used in
 375 * this file for dynamic ftrace. They have been
376 * simplified to ignore all traces in NMI context.
377 */
378static atomic_t in_nmi;
379
380void ftrace_nmi_enter(void)
381{
382 atomic_inc(&in_nmi);
383}
384
385void ftrace_nmi_exit(void)
386{
387 atomic_dec(&in_nmi);
388}
389
390#endif /* !CONFIG_DYNAMIC_FTRACE */
391
 392/* Add a function return address to the trace stack on thread info. */
393static int push_return_trace(unsigned long ret, unsigned long long time,
394 unsigned long func, int *depth)
395{
396 int index;
397
398 if (!current->ret_stack)
399 return -EBUSY;
400
401 /* The return trace stack is full */
402 if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
403 atomic_inc(&current->trace_overrun);
404 return -EBUSY;
405 }
406
407 index = ++current->curr_ret_stack;
408 barrier();
409 current->ret_stack[index].ret = ret;
410 current->ret_stack[index].func = func;
411 current->ret_stack[index].calltime = time;
412 *depth = index;
413
414 return 0;
415}
416
 417/* Retrieve a function return address from the trace stack on thread info. */
418static void pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
419{
420 int index;
421
422 index = current->curr_ret_stack;
423
424 if (unlikely(index < 0)) {
425 ftrace_graph_stop();
426 WARN_ON(1);
 427 /* Might as well panic, otherwise we have nowhere to go */
428 *ret = (unsigned long)panic;
429 return;
430 }
431
432 *ret = current->ret_stack[index].ret;
433 trace->func = current->ret_stack[index].func;
434 trace->calltime = current->ret_stack[index].calltime;
435 trace->overrun = atomic_read(&current->trace_overrun);
436 trace->depth = index;
437 barrier();
438 current->curr_ret_stack--;
439
440}
441
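The push/pop pair above keeps one record per nested call on a small per-task array; a minimal sketch of the record implied by those accesses (illustrative only, the real structure lives in the ftrace headers):

	struct ftrace_ret_stack_sketch {
		unsigned long		ret;		/* original return address */
		unsigned long		func;		/* traced function */
		unsigned long long	calltime;	/* timestamp taken at entry */
	};

	/*
	 * current->curr_ret_stack indexes the top entry, bounded by
	 * FTRACE_RETFUNC_DEPTH; overflows are counted in
	 * current->trace_overrun and reported as trace->overrun on pop.
	 */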
442/*
443 * Send the trace to the ring-buffer.
444 * @return the original return address.
445 */
446unsigned long ftrace_return_to_handler(void)
447{
448 struct ftrace_graph_ret trace;
449 unsigned long ret;
450
451 pop_return_trace(&trace, &ret);
452 trace.rettime = cpu_clock(raw_smp_processor_id());
453 ftrace_graph_return(&trace);
454
455 if (unlikely(!ret)) {
456 ftrace_graph_stop();
457 WARN_ON(1);
458 /* Might as well panic. What else to do? */
459 ret = (unsigned long)panic;
460 }
461
462 return ret;
463}
464
465/*
466 * Hook the return address and push it in the stack of return addrs
467 * in current thread info.
468 */
469void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
470{
471 unsigned long old;
472 unsigned long long calltime;
473 int faulted;
474 struct ftrace_graph_ent trace;
475 unsigned long return_hooker = (unsigned long)
476 &return_to_handler;
477
 478 /* NMIs are currently unsupported */
479 if (unlikely(atomic_read(&in_nmi)))
480 return;
481
482 if (unlikely(atomic_read(&current->tracing_graph_pause)))
483 return;
484
485 /*
486 * Protect against fault, even if it shouldn't
 487 * happen. This tool is too intrusive to
488 * ignore such a protection.
489 */
490 asm volatile(
491 "1: " _ASM_MOV " (%[parent_old]), %[old]\n"
492 "2: " _ASM_MOV " %[return_hooker], (%[parent_replaced])\n"
493 " movl $0, %[faulted]\n"
494
495 ".section .fixup, \"ax\"\n"
496 "3: movl $1, %[faulted]\n"
497 ".previous\n"
498
499 _ASM_EXTABLE(1b, 3b)
500 _ASM_EXTABLE(2b, 3b)
501
502 : [parent_replaced] "=r" (parent), [old] "=r" (old),
503 [faulted] "=r" (faulted)
504 : [parent_old] "0" (parent), [return_hooker] "r" (return_hooker)
505 : "memory"
506 );
507
508 if (unlikely(faulted)) {
509 ftrace_graph_stop();
510 WARN_ON(1);
511 return;
512 }
513
514 if (unlikely(!__kernel_text_address(old))) {
515 ftrace_graph_stop();
516 *parent = old;
517 WARN_ON(1);
518 return;
519 }
520
521 calltime = cpu_clock(raw_smp_processor_id());
522
523 if (push_return_trace(old, calltime,
524 self_addr, &trace.depth) == -EBUSY) {
525 *parent = old;
526 return;
527 }
528
529 trace.func = self_addr;
530
531 /* Only trace if the calling function expects to */
532 if (!ftrace_graph_entry(&trace)) {
533 current->curr_ret_stack--;
534 *parent = old;
535 }
536}
537#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
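For readers skimming the inline assembly above, here is a minimal, fault-unsafe sketch of the same return hook (the _sketch name is illustrative; the real prepare_ftrace_return() goes through the exception table so a bad *parent cannot oops the kernel):

	static void prepare_ftrace_return_sketch(unsigned long *parent,
						 unsigned long self_addr)
	{
		struct ftrace_graph_ent trace;
		unsigned long old = *parent;	/* caller's real return address */

		/* Divert the return into the return_to_handler trampoline. */
		*parent = (unsigned long)&return_to_handler;

		if (push_return_trace(old, cpu_clock(raw_smp_processor_id()),
				      self_addr, &trace.depth) == -EBUSY) {
			*parent = old;		/* stack full: undo the hook */
			return;
		}

		trace.func = self_addr;
		if (!ftrace_graph_entry(&trace)) {
			current->curr_ret_stack--;	/* drop the pushed entry */
			*parent = old;			/* caller opted out */
		}
	}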
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c
index 6c9bfc9e1e95..2bced78b0b8e 100644
--- a/arch/x86/kernel/genapic_64.c
+++ b/arch/x86/kernel/genapic_64.c
@@ -21,6 +21,7 @@
21#include <asm/smp.h> 21#include <asm/smp.h>
22#include <asm/ipi.h> 22#include <asm/ipi.h>
23#include <asm/genapic.h> 23#include <asm/genapic.h>
24#include <asm/setup.h>
24 25
25extern struct genapic apic_flat; 26extern struct genapic apic_flat;
26extern struct genapic apic_physflat; 27extern struct genapic apic_physflat;
@@ -53,6 +54,9 @@ void __init setup_apic_routing(void)
53 genapic = &apic_physflat; 54 genapic = &apic_physflat;
54 printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name); 55 printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
55 } 56 }
57
58 if (x86_quirks->update_genapic)
59 x86_quirks->update_genapic();
56} 60}
57 61
58/* Same for both flat and physical. */ 62/* Same for both flat and physical. */
diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c
index 1184210e6d0c..d7f0993b8056 100644
--- a/arch/x86/kernel/io_apic.c
+++ b/arch/x86/kernel/io_apic.c
@@ -108,8 +108,33 @@ static int __init parse_noapic(char *str)
108early_param("noapic", parse_noapic); 108early_param("noapic", parse_noapic);
109 109
110struct irq_pin_list; 110struct irq_pin_list;
111
112/*
113 * This is performance-critical, we want to do it O(1)
114 *
115 * the indexing order of this array favors 1:1 mappings
116 * between pins and IRQs.
117 */
118
119struct irq_pin_list {
120 int apic, pin;
121 struct irq_pin_list *next;
122};
123
124static struct irq_pin_list *get_one_free_irq_2_pin(int cpu)
125{
126 struct irq_pin_list *pin;
127 int node;
128
129 node = cpu_to_node(cpu);
130
131 pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node);
132 printk(KERN_DEBUG " alloc irq_2_pin on cpu %d node %d\n", cpu, node);
133
134 return pin;
135}
136
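The irq_2_pin list hanging off each irq_cfg (declared just below) is a singly linked list of (apic, pin) pairs built from entries like the one allocated above; the traversal pattern used throughout this file (see io_apic_modify_irq() further down) is, roughly:

	/* Illustrative helper only: visit every I/O APIC pin attached to one IRQ. */
	static void for_each_irq_pin_sketch(struct irq_cfg *cfg,
					    void (*fn)(int apic, int pin))
	{
		struct irq_pin_list *entry;

		for (entry = cfg->irq_2_pin; entry; entry = entry->next)
			fn(entry->apic, entry->pin);
	}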
111struct irq_cfg { 137struct irq_cfg {
112 unsigned int irq;
113 struct irq_pin_list *irq_2_pin; 138 struct irq_pin_list *irq_2_pin;
114 cpumask_t domain; 139 cpumask_t domain;
115 cpumask_t old_domain; 140 cpumask_t old_domain;
@@ -119,81 +144,95 @@ struct irq_cfg {
119}; 144};
120 145
121/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ 146/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
147#ifdef CONFIG_SPARSE_IRQ
148static struct irq_cfg irq_cfgx[] = {
149#else
122static struct irq_cfg irq_cfgx[NR_IRQS] = { 150static struct irq_cfg irq_cfgx[NR_IRQS] = {
123 [0] = { .irq = 0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, }, 151#endif
124 [1] = { .irq = 1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, }, 152 [0] = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, },
125 [2] = { .irq = 2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, }, 153 [1] = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, },
126 [3] = { .irq = 3, .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, }, 154 [2] = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, },
127 [4] = { .irq = 4, .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, }, 155 [3] = { .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, },
128 [5] = { .irq = 5, .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, }, 156 [4] = { .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, },
129 [6] = { .irq = 6, .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, }, 157 [5] = { .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, },
130 [7] = { .irq = 7, .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, }, 158 [6] = { .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, },
131 [8] = { .irq = 8, .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, }, 159 [7] = { .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, },
132 [9] = { .irq = 9, .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, }, 160 [8] = { .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, },
133 [10] = { .irq = 10, .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, }, 161 [9] = { .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, },
134 [11] = { .irq = 11, .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, }, 162 [10] = { .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
135 [12] = { .irq = 12, .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, }, 163 [11] = { .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
136 [13] = { .irq = 13, .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, }, 164 [12] = { .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
137 [14] = { .irq = 14, .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, }, 165 [13] = { .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
138 [15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, }, 166 [14] = { .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
167 [15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
139}; 168};
140 169
141#define for_each_irq_cfg(irq, cfg) \ 170void __init arch_early_irq_init(void)
142 for (irq = 0, cfg = irq_cfgx; irq < nr_irqs; irq++, cfg++)
143
144static struct irq_cfg *irq_cfg(unsigned int irq)
145{ 171{
146 return irq < nr_irqs ? irq_cfgx + irq : NULL; 172 struct irq_cfg *cfg;
173 struct irq_desc *desc;
174 int count;
175 int i;
176
177 cfg = irq_cfgx;
178 count = ARRAY_SIZE(irq_cfgx);
179
180 for (i = 0; i < count; i++) {
181 desc = irq_to_desc(i);
182 desc->chip_data = &cfg[i];
183 }
147} 184}
148 185
149static struct irq_cfg *irq_cfg_alloc(unsigned int irq) 186#ifdef CONFIG_SPARSE_IRQ
187static struct irq_cfg *irq_cfg(unsigned int irq)
150{ 188{
151 return irq_cfg(irq); 189 struct irq_cfg *cfg = NULL;
190 struct irq_desc *desc;
191
192 desc = irq_to_desc(irq);
193 if (desc)
194 cfg = desc->chip_data;
195
196 return cfg;
152} 197}
153 198
154/* 199static struct irq_cfg *get_one_free_irq_cfg(int cpu)
155 * Rough estimation of how many shared IRQs there are, can be changed 200{
156 * anytime. 201 struct irq_cfg *cfg;
157 */ 202 int node;
158#define MAX_PLUS_SHARED_IRQS NR_IRQS
159#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
160 203
161/* 204 node = cpu_to_node(cpu);
162 * This is performance-critical, we want to do it O(1)
163 *
164 * the indexing order of this array favors 1:1 mappings
165 * between pins and IRQs.
166 */
167 205
168struct irq_pin_list { 206 cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
169 int apic, pin; 207 printk(KERN_DEBUG " alloc irq_cfg on cpu %d node %d\n", cpu, node);
170 struct irq_pin_list *next;
171};
172 208
173static struct irq_pin_list irq_2_pin_head[PIN_MAP_SIZE]; 209 return cfg;
174static struct irq_pin_list *irq_2_pin_ptr; 210}
175 211
176static void __init irq_2_pin_init(void) 212void arch_init_chip_data(struct irq_desc *desc, int cpu)
177{ 213{
178 struct irq_pin_list *pin = irq_2_pin_head; 214 struct irq_cfg *cfg;
179 int i;
180
181 for (i = 1; i < PIN_MAP_SIZE; i++)
182 pin[i-1].next = &pin[i];
183 215
184 irq_2_pin_ptr = &pin[0]; 216 cfg = desc->chip_data;
217 if (!cfg) {
218 desc->chip_data = get_one_free_irq_cfg(cpu);
219 if (!desc->chip_data) {
220 printk(KERN_ERR "can not alloc irq_cfg\n");
221 BUG_ON(1);
222 }
223 }
185} 224}
186 225
187static struct irq_pin_list *get_one_free_irq_2_pin(void) 226#else
227static struct irq_cfg *irq_cfg(unsigned int irq)
188{ 228{
189 struct irq_pin_list *pin = irq_2_pin_ptr; 229 return irq < nr_irqs ? irq_cfgx + irq : NULL;
230}
190 231
191 if (!pin) 232#endif
192 panic("can not get more irq_2_pin\n");
193 233
194 irq_2_pin_ptr = pin->next; 234static inline void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask)
195 pin->next = NULL; 235{
196 return pin;
197} 236}
198 237
199struct io_apic { 238struct io_apic {
@@ -237,11 +276,10 @@ static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned
237 writel(value, &io_apic->data); 276 writel(value, &io_apic->data);
238} 277}
239 278
240static bool io_apic_level_ack_pending(unsigned int irq) 279static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
241{ 280{
242 struct irq_pin_list *entry; 281 struct irq_pin_list *entry;
243 unsigned long flags; 282 unsigned long flags;
244 struct irq_cfg *cfg = irq_cfg(irq);
245 283
246 spin_lock_irqsave(&ioapic_lock, flags); 284 spin_lock_irqsave(&ioapic_lock, flags);
247 entry = cfg->irq_2_pin; 285 entry = cfg->irq_2_pin;
@@ -323,13 +361,12 @@ static void ioapic_mask_entry(int apic, int pin)
323} 361}
324 362
325#ifdef CONFIG_SMP 363#ifdef CONFIG_SMP
326static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) 364static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
327{ 365{
328 int apic, pin; 366 int apic, pin;
329 struct irq_cfg *cfg;
330 struct irq_pin_list *entry; 367 struct irq_pin_list *entry;
368 u8 vector = cfg->vector;
331 369
332 cfg = irq_cfg(irq);
333 entry = cfg->irq_2_pin; 370 entry = cfg->irq_2_pin;
334 for (;;) { 371 for (;;) {
335 unsigned int reg; 372 unsigned int reg;
@@ -359,24 +396,27 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
359 } 396 }
360} 397}
361 398
362static int assign_irq_vector(int irq, cpumask_t mask); 399static int assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask);
363 400
364static void set_ioapic_affinity_irq(unsigned int irq, 401static void set_ioapic_affinity_irq_desc(struct irq_desc *desc,
365 const struct cpumask *mask) 402 const struct cpumask *mask)
366{ 403{
367 struct irq_cfg *cfg; 404 struct irq_cfg *cfg;
368 unsigned long flags; 405 unsigned long flags;
369 unsigned int dest; 406 unsigned int dest;
370 cpumask_t tmp; 407 cpumask_t tmp;
371 struct irq_desc *desc; 408 unsigned int irq;
372 409
373 if (!cpumask_intersects(mask, cpu_online_mask)) 410 if (!cpumask_intersects(mask, cpu_online_mask))
374 return; 411 return;
375 412
376 cfg = irq_cfg(irq); 413 irq = desc->irq;
377 if (assign_irq_vector(irq, *mask)) 414 cfg = desc->chip_data;
415 if (assign_irq_vector(irq, cfg, *mask))
378 return; 416 return;
379 417
418 set_extra_move_desc(desc, *mask);
419
380 cpumask_and(&tmp, &cfg->domain, mask); 420 cpumask_and(&tmp, &cfg->domain, mask);
381 dest = cpu_mask_to_apicid(tmp); 421 dest = cpu_mask_to_apicid(tmp);
382 /* 422 /*
@@ -384,12 +424,21 @@ static void set_ioapic_affinity_irq(unsigned int irq,
384 */ 424 */
385 dest = SET_APIC_LOGICAL_ID(dest); 425 dest = SET_APIC_LOGICAL_ID(dest);
386 426
387 desc = irq_to_desc(irq);
388 spin_lock_irqsave(&ioapic_lock, flags); 427 spin_lock_irqsave(&ioapic_lock, flags);
389 __target_IO_APIC_irq(irq, dest, cfg->vector); 428 __target_IO_APIC_irq(irq, dest, cfg);
390 cpumask_copy(&desc->affinity, mask); 429 cpumask_copy(&desc->affinity, mask);
391 spin_unlock_irqrestore(&ioapic_lock, flags); 430 spin_unlock_irqrestore(&ioapic_lock, flags);
392} 431}
432
433static void set_ioapic_affinity_irq(unsigned int irq,
434 const struct cpumask *mask)
435{
436 struct irq_desc *desc;
437
438 desc = irq_to_desc(irq);
439
440 set_ioapic_affinity_irq_desc(desc, mask);
441}
393#endif /* CONFIG_SMP */ 442#endif /* CONFIG_SMP */
394 443
395/* 444/*
@@ -397,16 +446,18 @@ static void set_ioapic_affinity_irq(unsigned int irq,
397 * shared ISA-space IRQs, so we have to support them. We are super 446 * shared ISA-space IRQs, so we have to support them. We are super
398 * fast in the common case, and fast for shared ISA-space IRQs. 447 * fast in the common case, and fast for shared ISA-space IRQs.
399 */ 448 */
400static void add_pin_to_irq(unsigned int irq, int apic, int pin) 449static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
401{ 450{
402 struct irq_cfg *cfg;
403 struct irq_pin_list *entry; 451 struct irq_pin_list *entry;
404 452
405 /* first time to refer irq_cfg, so with new */
406 cfg = irq_cfg_alloc(irq);
407 entry = cfg->irq_2_pin; 453 entry = cfg->irq_2_pin;
408 if (!entry) { 454 if (!entry) {
409 entry = get_one_free_irq_2_pin(); 455 entry = get_one_free_irq_2_pin(cpu);
456 if (!entry) {
457 printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n",
458 apic, pin);
459 return;
460 }
410 cfg->irq_2_pin = entry; 461 cfg->irq_2_pin = entry;
411 entry->apic = apic; 462 entry->apic = apic;
412 entry->pin = pin; 463 entry->pin = pin;
@@ -421,7 +472,7 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin)
421 entry = entry->next; 472 entry = entry->next;
422 } 473 }
423 474
424 entry->next = get_one_free_irq_2_pin(); 475 entry->next = get_one_free_irq_2_pin(cpu);
425 entry = entry->next; 476 entry = entry->next;
426 entry->apic = apic; 477 entry->apic = apic;
427 entry->pin = pin; 478 entry->pin = pin;
@@ -430,11 +481,10 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin)
430/* 481/*
431 * Reroute an IRQ to a different pin. 482 * Reroute an IRQ to a different pin.
432 */ 483 */
433static void __init replace_pin_at_irq(unsigned int irq, 484static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu,
434 int oldapic, int oldpin, 485 int oldapic, int oldpin,
435 int newapic, int newpin) 486 int newapic, int newpin)
436{ 487{
437 struct irq_cfg *cfg = irq_cfg(irq);
438 struct irq_pin_list *entry = cfg->irq_2_pin; 488 struct irq_pin_list *entry = cfg->irq_2_pin;
439 int replaced = 0; 489 int replaced = 0;
440 490
@@ -451,18 +501,16 @@ static void __init replace_pin_at_irq(unsigned int irq,
451 501
452 /* why? call replace before add? */ 502 /* why? call replace before add? */
453 if (!replaced) 503 if (!replaced)
454 add_pin_to_irq(irq, newapic, newpin); 504 add_pin_to_irq_cpu(cfg, cpu, newapic, newpin);
455} 505}
456 506
457static inline void io_apic_modify_irq(unsigned int irq, 507static inline void io_apic_modify_irq(struct irq_cfg *cfg,
458 int mask_and, int mask_or, 508 int mask_and, int mask_or,
459 void (*final)(struct irq_pin_list *entry)) 509 void (*final)(struct irq_pin_list *entry))
460{ 510{
461 int pin; 511 int pin;
462 struct irq_cfg *cfg;
463 struct irq_pin_list *entry; 512 struct irq_pin_list *entry;
464 513
465 cfg = irq_cfg(irq);
466 for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) { 514 for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) {
467 unsigned int reg; 515 unsigned int reg;
468 pin = entry->pin; 516 pin = entry->pin;
@@ -475,9 +523,9 @@ static inline void io_apic_modify_irq(unsigned int irq,
475 } 523 }
476} 524}
477 525
478static void __unmask_IO_APIC_irq(unsigned int irq) 526static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
479{ 527{
480 io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, 0, NULL); 528 io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
481} 529}
482 530
483#ifdef CONFIG_X86_64 531#ifdef CONFIG_X86_64
@@ -492,47 +540,64 @@ void io_apic_sync(struct irq_pin_list *entry)
492 readl(&io_apic->data); 540 readl(&io_apic->data);
493} 541}
494 542
495static void __mask_IO_APIC_irq(unsigned int irq) 543static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
496{ 544{
497 io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); 545 io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
498} 546}
499#else /* CONFIG_X86_32 */ 547#else /* CONFIG_X86_32 */
500static void __mask_IO_APIC_irq(unsigned int irq) 548static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
501{ 549{
502 io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, NULL); 550 io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, NULL);
503} 551}
504 552
505static void __mask_and_edge_IO_APIC_irq(unsigned int irq) 553static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg)
506{ 554{
507 io_apic_modify_irq(irq, ~IO_APIC_REDIR_LEVEL_TRIGGER, 555 io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER,
508 IO_APIC_REDIR_MASKED, NULL); 556 IO_APIC_REDIR_MASKED, NULL);
509} 557}
510 558
511static void __unmask_and_level_IO_APIC_irq(unsigned int irq) 559static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg)
512{ 560{
513 io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, 561 io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED,
514 IO_APIC_REDIR_LEVEL_TRIGGER, NULL); 562 IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
515} 563}
516#endif /* CONFIG_X86_32 */ 564#endif /* CONFIG_X86_32 */
517 565
518static void mask_IO_APIC_irq (unsigned int irq) 566static void mask_IO_APIC_irq_desc(struct irq_desc *desc)
519{ 567{
568 struct irq_cfg *cfg = desc->chip_data;
520 unsigned long flags; 569 unsigned long flags;
521 570
571 BUG_ON(!cfg);
572
522 spin_lock_irqsave(&ioapic_lock, flags); 573 spin_lock_irqsave(&ioapic_lock, flags);
523 __mask_IO_APIC_irq(irq); 574 __mask_IO_APIC_irq(cfg);
524 spin_unlock_irqrestore(&ioapic_lock, flags); 575 spin_unlock_irqrestore(&ioapic_lock, flags);
525} 576}
526 577
527static void unmask_IO_APIC_irq (unsigned int irq) 578static void unmask_IO_APIC_irq_desc(struct irq_desc *desc)
528{ 579{
580 struct irq_cfg *cfg = desc->chip_data;
529 unsigned long flags; 581 unsigned long flags;
530 582
531 spin_lock_irqsave(&ioapic_lock, flags); 583 spin_lock_irqsave(&ioapic_lock, flags);
532 __unmask_IO_APIC_irq(irq); 584 __unmask_IO_APIC_irq(cfg);
533 spin_unlock_irqrestore(&ioapic_lock, flags); 585 spin_unlock_irqrestore(&ioapic_lock, flags);
534} 586}
535 587
588static void mask_IO_APIC_irq(unsigned int irq)
589{
590 struct irq_desc *desc = irq_to_desc(irq);
591
592 mask_IO_APIC_irq_desc(desc);
593}
594static void unmask_IO_APIC_irq(unsigned int irq)
595{
596 struct irq_desc *desc = irq_to_desc(irq);
597
598 unmask_IO_APIC_irq_desc(desc);
599}
600
536static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) 601static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
537{ 602{
538 struct IO_APIC_route_entry entry; 603 struct IO_APIC_route_entry entry;
@@ -809,7 +874,7 @@ EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
809 */ 874 */
810static int EISA_ELCR(unsigned int irq) 875static int EISA_ELCR(unsigned int irq)
811{ 876{
812 if (irq < 16) { 877 if (irq < NR_IRQS_LEGACY) {
813 unsigned int port = 0x4d0 + (irq >> 3); 878 unsigned int port = 0x4d0 + (irq >> 3);
814 return (inb(port) >> (irq & 7)) & 1; 879 return (inb(port) >> (irq & 7)) & 1;
815 } 880 }
@@ -1034,7 +1099,7 @@ void unlock_vector_lock(void)
1034 spin_unlock(&vector_lock); 1099 spin_unlock(&vector_lock);
1035} 1100}
1036 1101
1037static int __assign_irq_vector(int irq, cpumask_t mask) 1102static int __assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask)
1038{ 1103{
1039 /* 1104 /*
1040 * NOTE! The local APIC isn't very good at handling 1105 * NOTE! The local APIC isn't very good at handling
@@ -1050,16 +1115,13 @@ static int __assign_irq_vector(int irq, cpumask_t mask)
1050 static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; 1115 static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
1051 unsigned int old_vector; 1116 unsigned int old_vector;
1052 int cpu; 1117 int cpu;
1053 struct irq_cfg *cfg;
1054 1118
1055 cfg = irq_cfg(irq); 1119 if ((cfg->move_in_progress) || cfg->move_cleanup_count)
1120 return -EBUSY;
1056 1121
1057 /* Only try and allocate irqs on cpus that are present */ 1122 /* Only try and allocate irqs on cpus that are present */
1058 cpus_and(mask, mask, cpu_online_map); 1123 cpus_and(mask, mask, cpu_online_map);
1059 1124
1060 if ((cfg->move_in_progress) || cfg->move_cleanup_count)
1061 return -EBUSY;
1062
1063 old_vector = cfg->vector; 1125 old_vector = cfg->vector;
1064 if (old_vector) { 1126 if (old_vector) {
1065 cpumask_t tmp; 1127 cpumask_t tmp;
@@ -1113,24 +1175,22 @@ next:
1113 return -ENOSPC; 1175 return -ENOSPC;
1114} 1176}
1115 1177
1116static int assign_irq_vector(int irq, cpumask_t mask) 1178static int assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask)
1117{ 1179{
1118 int err; 1180 int err;
1119 unsigned long flags; 1181 unsigned long flags;
1120 1182
1121 spin_lock_irqsave(&vector_lock, flags); 1183 spin_lock_irqsave(&vector_lock, flags);
1122 err = __assign_irq_vector(irq, mask); 1184 err = __assign_irq_vector(irq, cfg, mask);
1123 spin_unlock_irqrestore(&vector_lock, flags); 1185 spin_unlock_irqrestore(&vector_lock, flags);
1124 return err; 1186 return err;
1125} 1187}
1126 1188
1127static void __clear_irq_vector(int irq) 1189static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
1128{ 1190{
1129 struct irq_cfg *cfg;
1130 cpumask_t mask; 1191 cpumask_t mask;
1131 int cpu, vector; 1192 int cpu, vector;
1132 1193
1133 cfg = irq_cfg(irq);
1134 BUG_ON(!cfg->vector); 1194 BUG_ON(!cfg->vector);
1135 1195
1136 vector = cfg->vector; 1196 vector = cfg->vector;
@@ -1162,9 +1222,13 @@ void __setup_vector_irq(int cpu)
1162 /* This function must be called with vector_lock held */ 1222 /* This function must be called with vector_lock held */
1163 int irq, vector; 1223 int irq, vector;
1164 struct irq_cfg *cfg; 1224 struct irq_cfg *cfg;
1225 struct irq_desc *desc;
1165 1226
1166 /* Mark the inuse vectors */ 1227 /* Mark the inuse vectors */
1167 for_each_irq_cfg(irq, cfg) { 1228 for_each_irq_desc(irq, desc) {
1229 if (!desc)
1230 continue;
1231 cfg = desc->chip_data;
1168 if (!cpu_isset(cpu, cfg->domain)) 1232 if (!cpu_isset(cpu, cfg->domain))
1169 continue; 1233 continue;
1170 vector = cfg->vector; 1234 vector = cfg->vector;
@@ -1215,11 +1279,8 @@ static inline int IO_APIC_irq_trigger(int irq)
1215} 1279}
1216#endif 1280#endif
1217 1281
1218static void ioapic_register_intr(int irq, unsigned long trigger) 1282static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long trigger)
1219{ 1283{
1220 struct irq_desc *desc;
1221
1222 desc = irq_to_desc(irq);
1223 1284
1224 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || 1285 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
1225 trigger == IOAPIC_LEVEL) 1286 trigger == IOAPIC_LEVEL)
@@ -1311,7 +1372,7 @@ static int setup_ioapic_entry(int apic, int irq,
1311 return 0; 1372 return 0;
1312} 1373}
1313 1374
1314static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, 1375static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, struct irq_desc *desc,
1315 int trigger, int polarity) 1376 int trigger, int polarity)
1316{ 1377{
1317 struct irq_cfg *cfg; 1378 struct irq_cfg *cfg;
@@ -1321,10 +1382,10 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
1321 if (!IO_APIC_IRQ(irq)) 1382 if (!IO_APIC_IRQ(irq))
1322 return; 1383 return;
1323 1384
1324 cfg = irq_cfg(irq); 1385 cfg = desc->chip_data;
1325 1386
1326 mask = TARGET_CPUS; 1387 mask = TARGET_CPUS;
1327 if (assign_irq_vector(irq, mask)) 1388 if (assign_irq_vector(irq, cfg, mask))
1328 return; 1389 return;
1329 1390
1330 cpus_and(mask, cfg->domain, mask); 1391 cpus_and(mask, cfg->domain, mask);
@@ -1341,12 +1402,12 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
1341 cfg->vector)) { 1402 cfg->vector)) {
1342 printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", 1403 printk("Failed to setup ioapic entry for ioapic %d, pin %d\n",
1343 mp_ioapics[apic].mp_apicid, pin); 1404 mp_ioapics[apic].mp_apicid, pin);
1344 __clear_irq_vector(irq); 1405 __clear_irq_vector(irq, cfg);
1345 return; 1406 return;
1346 } 1407 }
1347 1408
1348 ioapic_register_intr(irq, trigger); 1409 ioapic_register_intr(irq, desc, trigger);
1349 if (irq < 16) 1410 if (irq < NR_IRQS_LEGACY)
1350 disable_8259A_irq(irq); 1411 disable_8259A_irq(irq);
1351 1412
1352 ioapic_write_entry(apic, pin, entry); 1413 ioapic_write_entry(apic, pin, entry);
@@ -1356,6 +1417,9 @@ static void __init setup_IO_APIC_irqs(void)
1356{ 1417{
1357 int apic, pin, idx, irq; 1418 int apic, pin, idx, irq;
1358 int notcon = 0; 1419 int notcon = 0;
1420 struct irq_desc *desc;
1421 struct irq_cfg *cfg;
1422 int cpu = boot_cpu_id;
1359 1423
1360 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); 1424 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
1361 1425
@@ -1387,9 +1451,15 @@ static void __init setup_IO_APIC_irqs(void)
1387 if (multi_timer_check(apic, irq)) 1451 if (multi_timer_check(apic, irq))
1388 continue; 1452 continue;
1389#endif 1453#endif
1390 add_pin_to_irq(irq, apic, pin); 1454 desc = irq_to_desc_alloc_cpu(irq, cpu);
1455 if (!desc) {
1456 printk(KERN_INFO "can not get irq_desc for %d\n", irq);
1457 continue;
1458 }
1459 cfg = desc->chip_data;
1460 add_pin_to_irq_cpu(cfg, cpu, apic, pin);
1391 1461
1392 setup_IO_APIC_irq(apic, pin, irq, 1462 setup_IO_APIC_irq(apic, pin, irq, desc,
1393 irq_trigger(idx), irq_polarity(idx)); 1463 irq_trigger(idx), irq_polarity(idx));
1394 } 1464 }
1395 } 1465 }
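Condensing the setup path above into one sketch (identifiers are the ones shown in the hunk; the surrounding control flow is illustrative only, not a drop-in replacement for setup_IO_APIC_irqs()):

    struct irq_desc *desc;
    struct irq_cfg *cfg;
    int cpu = boot_cpu_id;

    desc = irq_to_desc_alloc_cpu(irq, cpu);          /* allocate the descriptor near the boot cpu */
    if (!desc) {
            printk(KERN_INFO "can not get irq_desc for %d\n", irq);
            return;                                  /* out of memory: skip this pin */
    }
    cfg = desc->chip_data;
    add_pin_to_irq_cpu(cfg, cpu, apic, pin);         /* record which ioapic/pin feeds this irq */
    setup_IO_APIC_irq(apic, pin, irq, desc,
                      irq_trigger(idx), irq_polarity(idx));

The descriptor is created before the pin is wired up, so setup_IO_APIC_irq() can take the desc and its cfg instead of re-looking them up by IRQ number.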
@@ -1448,6 +1518,7 @@ __apicdebuginit(void) print_IO_APIC(void)
1448 union IO_APIC_reg_03 reg_03; 1518 union IO_APIC_reg_03 reg_03;
1449 unsigned long flags; 1519 unsigned long flags;
1450 struct irq_cfg *cfg; 1520 struct irq_cfg *cfg;
1521 struct irq_desc *desc;
1451 unsigned int irq; 1522 unsigned int irq;
1452 1523
1453 if (apic_verbosity == APIC_QUIET) 1524 if (apic_verbosity == APIC_QUIET)
@@ -1537,8 +1608,13 @@ __apicdebuginit(void) print_IO_APIC(void)
1537 } 1608 }
1538 } 1609 }
1539 printk(KERN_DEBUG "IRQ to pin mappings:\n"); 1610 printk(KERN_DEBUG "IRQ to pin mappings:\n");
1540 for_each_irq_cfg(irq, cfg) { 1611 for_each_irq_desc(irq, desc) {
1541 struct irq_pin_list *entry = cfg->irq_2_pin; 1612 struct irq_pin_list *entry;
1613
1614 if (!desc)
1615 continue;
1616 cfg = desc->chip_data;
1617 entry = cfg->irq_2_pin;
1542 if (!entry) 1618 if (!entry)
1543 continue; 1619 continue;
1544 printk(KERN_DEBUG "IRQ%d ", irq); 1620 printk(KERN_DEBUG "IRQ%d ", irq);
@@ -2022,14 +2098,16 @@ static unsigned int startup_ioapic_irq(unsigned int irq)
2022{ 2098{
2023 int was_pending = 0; 2099 int was_pending = 0;
2024 unsigned long flags; 2100 unsigned long flags;
2101 struct irq_cfg *cfg;
2025 2102
2026 spin_lock_irqsave(&ioapic_lock, flags); 2103 spin_lock_irqsave(&ioapic_lock, flags);
2027 if (irq < 16) { 2104 if (irq < NR_IRQS_LEGACY) {
2028 disable_8259A_irq(irq); 2105 disable_8259A_irq(irq);
2029 if (i8259A_irq_pending(irq)) 2106 if (i8259A_irq_pending(irq))
2030 was_pending = 1; 2107 was_pending = 1;
2031 } 2108 }
2032 __unmask_IO_APIC_irq(irq); 2109 cfg = irq_cfg(irq);
2110 __unmask_IO_APIC_irq(cfg);
2033 spin_unlock_irqrestore(&ioapic_lock, flags); 2111 spin_unlock_irqrestore(&ioapic_lock, flags);
2034 2112
2035 return was_pending; 2113 return was_pending;
@@ -2092,35 +2170,37 @@ static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration);
2092 * as simple as edge triggered migration and we can do the irq migration 2170 * as simple as edge triggered migration and we can do the irq migration
2093 * with a simple atomic update to IO-APIC RTE. 2171 * with a simple atomic update to IO-APIC RTE.
2094 */ 2172 */
2095static void migrate_ioapic_irq(int irq, cpumask_t mask) 2173static void migrate_ioapic_irq_desc(struct irq_desc *desc, cpumask_t mask)
2096{ 2174{
2097 struct irq_cfg *cfg; 2175 struct irq_cfg *cfg;
2098 struct irq_desc *desc;
2099 cpumask_t tmp, cleanup_mask; 2176 cpumask_t tmp, cleanup_mask;
2100 struct irte irte; 2177 struct irte irte;
2101 int modify_ioapic_rte; 2178 int modify_ioapic_rte;
2102 unsigned int dest; 2179 unsigned int dest;
2103 unsigned long flags; 2180 unsigned long flags;
2181 unsigned int irq;
2104 2182
2105 cpus_and(tmp, mask, cpu_online_map); 2183 cpus_and(tmp, mask, cpu_online_map);
2106 if (cpus_empty(tmp)) 2184 if (cpus_empty(tmp))
2107 return; 2185 return;
2108 2186
2187 irq = desc->irq;
2109 if (get_irte(irq, &irte)) 2188 if (get_irte(irq, &irte))
2110 return; 2189 return;
2111 2190
2112 if (assign_irq_vector(irq, mask)) 2191 cfg = desc->chip_data;
2192 if (assign_irq_vector(irq, cfg, mask))
2113 return; 2193 return;
2114 2194
2115 cfg = irq_cfg(irq); 2195 set_extra_move_desc(desc, mask);
2196
2116 cpus_and(tmp, cfg->domain, mask); 2197 cpus_and(tmp, cfg->domain, mask);
2117 dest = cpu_mask_to_apicid(tmp); 2198 dest = cpu_mask_to_apicid(tmp);
2118 2199
2119 desc = irq_to_desc(irq);
2120 modify_ioapic_rte = desc->status & IRQ_LEVEL; 2200 modify_ioapic_rte = desc->status & IRQ_LEVEL;
2121 if (modify_ioapic_rte) { 2201 if (modify_ioapic_rte) {
2122 spin_lock_irqsave(&ioapic_lock, flags); 2202 spin_lock_irqsave(&ioapic_lock, flags);
2123 __target_IO_APIC_irq(irq, dest, cfg->vector); 2203 __target_IO_APIC_irq(irq, dest, cfg);
2124 spin_unlock_irqrestore(&ioapic_lock, flags); 2204 spin_unlock_irqrestore(&ioapic_lock, flags);
2125 } 2205 }
2126 2206
@@ -2142,14 +2222,14 @@ static void migrate_ioapic_irq(int irq, cpumask_t mask)
2142 desc->affinity = mask; 2222 desc->affinity = mask;
2143} 2223}
2144 2224
2145static int migrate_irq_remapped_level(int irq) 2225static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
2146{ 2226{
2147 int ret = -1; 2227 int ret = -1;
2148 struct irq_desc *desc = irq_to_desc(irq); 2228 struct irq_cfg *cfg = desc->chip_data;
2149 2229
2150 mask_IO_APIC_irq(irq); 2230 mask_IO_APIC_irq_desc(desc);
2151 2231
2152 if (io_apic_level_ack_pending(irq)) { 2232 if (io_apic_level_ack_pending(cfg)) {
2153 /* 2233 /*
2154 * Interrupt in progress. Migrating irq now will change the 2234 * Interrupt in progress. Migrating irq now will change the
2155 * vector information in the IO-APIC RTE and that will confuse 2235 * vector information in the IO-APIC RTE and that will confuse
@@ -2161,14 +2241,15 @@ static int migrate_irq_remapped_level(int irq)
2161 } 2241 }
2162 2242
2163 /* everthing is clear. we have right of way */ 2243 /* everthing is clear. we have right of way */
2164 migrate_ioapic_irq(irq, desc->pending_mask); 2244 migrate_ioapic_irq_desc(desc, desc->pending_mask);
2165 2245
2166 ret = 0; 2246 ret = 0;
2167 desc->status &= ~IRQ_MOVE_PENDING; 2247 desc->status &= ~IRQ_MOVE_PENDING;
2168 cpus_clear(desc->pending_mask); 2248 cpus_clear(desc->pending_mask);
2169 2249
2170unmask: 2250unmask:
2171 unmask_IO_APIC_irq(irq); 2251 unmask_IO_APIC_irq_desc(desc);
2252
2172 return ret; 2253 return ret;
2173} 2254}
2174 2255
@@ -2178,6 +2259,9 @@ static void ir_irq_migration(struct work_struct *work)
2178 struct irq_desc *desc; 2259 struct irq_desc *desc;
2179 2260
2180 for_each_irq_desc(irq, desc) { 2261 for_each_irq_desc(irq, desc) {
2262 if (!desc)
2263 continue;
2264
2181 if (desc->status & IRQ_MOVE_PENDING) { 2265 if (desc->status & IRQ_MOVE_PENDING) {
2182 unsigned long flags; 2266 unsigned long flags;
2183 2267
@@ -2198,19 +2282,24 @@ static void ir_irq_migration(struct work_struct *work)
2198/* 2282/*
2199 * Migrates the IRQ destination in the process context. 2283 * Migrates the IRQ destination in the process context.
2200 */ 2284 */
2201static void set_ir_ioapic_affinity_irq(unsigned int irq, 2285static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
2202 const struct cpumask *mask) 2286 const struct cpumask *mask)
2203{ 2287{
2204 struct irq_desc *desc = irq_to_desc(irq);
2205
2206 if (desc->status & IRQ_LEVEL) { 2288 if (desc->status & IRQ_LEVEL) {
2207 desc->status |= IRQ_MOVE_PENDING; 2289 desc->status |= IRQ_MOVE_PENDING;
2208 cpumask_copy(&desc->pending_mask, mask); 2290 cpumask_copy(&desc->pending_mask, mask);
2209 migrate_irq_remapped_level(irq); 2291 migrate_irq_remapped_level_desc(desc);
2210 return; 2292 return;
2211 } 2293 }
2212 2294
2213 migrate_ioapic_irq(irq, *mask); 2295 migrate_ioapic_irq_desc(desc, mask);
2296}
2297static void set_ir_ioapic_affinity_irq(unsigned int irq,
2298 const struct cpumask *mask)
2299{
2300 struct irq_desc *desc = irq_to_desc(irq);
2301
2302 set_ir_ioapic_affinity_irq_desc(desc, mask);
2214} 2303}
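A recurring shape in this file is the split into a *_desc worker plus a thin wrapper keyed by IRQ number (mask_IO_APIC_irq_desc/unmask_IO_APIC_irq_desc and migrate_ioapic_irq_desc follow the same scheme), so paths that already hold the descriptor avoid a second lookup. Schematically, as a sketch of the split shown in the hunk above:

    /* worker: callers that already hold the descriptor use this directly */
    static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
                                                const struct cpumask *mask);

    /* thin wrapper: keeps the old irq-number entry point alive */
    static void set_ir_ioapic_affinity_irq(unsigned int irq,
                                           const struct cpumask *mask)
    {
            struct irq_desc *desc = irq_to_desc(irq);

            set_ir_ioapic_affinity_irq_desc(desc, mask);
    }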
2215#endif 2304#endif
2216 2305
@@ -2230,6 +2319,9 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2230 struct irq_cfg *cfg; 2319 struct irq_cfg *cfg;
2231 irq = __get_cpu_var(vector_irq)[vector]; 2320 irq = __get_cpu_var(vector_irq)[vector];
2232 2321
2322 if (irq == -1)
2323 continue;
2324
2233 desc = irq_to_desc(irq); 2325 desc = irq_to_desc(irq);
2234 if (!desc) 2326 if (!desc)
2235 continue; 2327 continue;
@@ -2251,9 +2343,10 @@ unlock:
2251 irq_exit(); 2343 irq_exit();
2252} 2344}
2253 2345
2254static void irq_complete_move(unsigned int irq) 2346static void irq_complete_move(struct irq_desc **descp)
2255{ 2347{
2256 struct irq_cfg *cfg = irq_cfg(irq); 2348 struct irq_desc *desc = *descp;
2349 struct irq_cfg *cfg = desc->chip_data;
2257 unsigned vector, me; 2350 unsigned vector, me;
2258 2351
2259 if (likely(!cfg->move_in_progress)) 2352 if (likely(!cfg->move_in_progress))
@@ -2271,8 +2364,9 @@ static void irq_complete_move(unsigned int irq)
2271 } 2364 }
2272} 2365}
2273#else 2366#else
2274static inline void irq_complete_move(unsigned int irq) {} 2367static inline void irq_complete_move(struct irq_desc **descp) {}
2275#endif 2368#endif
2369
2276#ifdef CONFIG_INTR_REMAP 2370#ifdef CONFIG_INTR_REMAP
2277static void ack_x2apic_level(unsigned int irq) 2371static void ack_x2apic_level(unsigned int irq)
2278{ 2372{
@@ -2283,11 +2377,14 @@ static void ack_x2apic_edge(unsigned int irq)
2283{ 2377{
2284 ack_x2APIC_irq(); 2378 ack_x2APIC_irq();
2285} 2379}
2380
2286#endif 2381#endif
2287 2382
2288static void ack_apic_edge(unsigned int irq) 2383static void ack_apic_edge(unsigned int irq)
2289{ 2384{
2290 irq_complete_move(irq); 2385 struct irq_desc *desc = irq_to_desc(irq);
2386
2387 irq_complete_move(&desc);
2291 move_native_irq(irq); 2388 move_native_irq(irq);
2292 ack_APIC_irq(); 2389 ack_APIC_irq();
2293} 2390}
@@ -2296,18 +2393,21 @@ atomic_t irq_mis_count;
2296 2393
2297static void ack_apic_level(unsigned int irq) 2394static void ack_apic_level(unsigned int irq)
2298{ 2395{
2396 struct irq_desc *desc = irq_to_desc(irq);
2397
2299#ifdef CONFIG_X86_32 2398#ifdef CONFIG_X86_32
2300 unsigned long v; 2399 unsigned long v;
2301 int i; 2400 int i;
2302#endif 2401#endif
2402 struct irq_cfg *cfg;
2303 int do_unmask_irq = 0; 2403 int do_unmask_irq = 0;
2304 2404
2305 irq_complete_move(irq); 2405 irq_complete_move(&desc);
2306#ifdef CONFIG_GENERIC_PENDING_IRQ 2406#ifdef CONFIG_GENERIC_PENDING_IRQ
2307 /* If we are moving the irq we need to mask it */ 2407 /* If we are moving the irq we need to mask it */
2308 if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) { 2408 if (unlikely(desc->status & IRQ_MOVE_PENDING)) {
2309 do_unmask_irq = 1; 2409 do_unmask_irq = 1;
2310 mask_IO_APIC_irq(irq); 2410 mask_IO_APIC_irq_desc(desc);
2311 } 2411 }
2312#endif 2412#endif
2313 2413
@@ -2331,7 +2431,8 @@ static void ack_apic_level(unsigned int irq)
2331 * operation to prevent an edge-triggered interrupt escaping meanwhile. 2431 * operation to prevent an edge-triggered interrupt escaping meanwhile.
2332 * The idea is from Manfred Spraul. --macro 2432 * The idea is from Manfred Spraul. --macro
2333 */ 2433 */
2334 i = irq_cfg(irq)->vector; 2434 cfg = desc->chip_data;
2435 i = cfg->vector;
2335 2436
2336 v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); 2437 v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
2337#endif 2438#endif
@@ -2370,17 +2471,18 @@ static void ack_apic_level(unsigned int irq)
2370 * accurate and is causing problems then it is a hardware bug 2471 * accurate and is causing problems then it is a hardware bug
2371 * and you can go talk to the chipset vendor about it. 2472 * and you can go talk to the chipset vendor about it.
2372 */ 2473 */
2373 if (!io_apic_level_ack_pending(irq)) 2474 cfg = desc->chip_data;
2475 if (!io_apic_level_ack_pending(cfg))
2374 move_masked_irq(irq); 2476 move_masked_irq(irq);
2375 unmask_IO_APIC_irq(irq); 2477 unmask_IO_APIC_irq_desc(desc);
2376 } 2478 }
2377 2479
2378#ifdef CONFIG_X86_32 2480#ifdef CONFIG_X86_32
2379 if (!(v & (1 << (i & 0x1f)))) { 2481 if (!(v & (1 << (i & 0x1f)))) {
2380 atomic_inc(&irq_mis_count); 2482 atomic_inc(&irq_mis_count);
2381 spin_lock(&ioapic_lock); 2483 spin_lock(&ioapic_lock);
2382 __mask_and_edge_IO_APIC_irq(irq); 2484 __mask_and_edge_IO_APIC_irq(cfg);
2383 __unmask_and_level_IO_APIC_irq(irq); 2485 __unmask_and_level_IO_APIC_irq(cfg);
2384 spin_unlock(&ioapic_lock); 2486 spin_unlock(&ioapic_lock);
2385 } 2487 }
2386#endif 2488#endif
@@ -2431,20 +2533,22 @@ static inline void init_IO_APIC_traps(void)
2431 * Also, we've got to be careful not to trash gate 2533 * Also, we've got to be careful not to trash gate
2432 * 0x80, because int 0x80 is hm, kind of importantish. ;) 2534 * 0x80, because int 0x80 is hm, kind of importantish. ;)
2433 */ 2535 */
2434 for_each_irq_cfg(irq, cfg) { 2536 for_each_irq_desc(irq, desc) {
2435 if (IO_APIC_IRQ(irq) && !cfg->vector) { 2537 if (!desc)
2538 continue;
2539
2540 cfg = desc->chip_data;
2541 if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
2436 /* 2542 /*
2437 * Hmm.. We don't have an entry for this, 2543 * Hmm.. We don't have an entry for this,
2438 * so default to an old-fashioned 8259 2544 * so default to an old-fashioned 8259
2439 * interrupt if we can.. 2545 * interrupt if we can..
2440 */ 2546 */
2441 if (irq < 16) 2547 if (irq < NR_IRQS_LEGACY)
2442 make_8259A_irq(irq); 2548 make_8259A_irq(irq);
2443 else { 2549 else
2444 desc = irq_to_desc(irq);
2445 /* Strange. Oh, well.. */ 2550 /* Strange. Oh, well.. */
2446 desc->chip = &no_irq_chip; 2551 desc->chip = &no_irq_chip;
2447 }
2448 } 2552 }
2449 } 2553 }
2450} 2554}
@@ -2469,7 +2573,7 @@ static void unmask_lapic_irq(unsigned int irq)
2469 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); 2573 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
2470} 2574}
2471 2575
2472static void ack_lapic_irq (unsigned int irq) 2576static void ack_lapic_irq(unsigned int irq)
2473{ 2577{
2474 ack_APIC_irq(); 2578 ack_APIC_irq();
2475} 2579}
@@ -2481,11 +2585,8 @@ static struct irq_chip lapic_chip __read_mostly = {
2481 .ack = ack_lapic_irq, 2585 .ack = ack_lapic_irq,
2482}; 2586};
2483 2587
2484static void lapic_register_intr(int irq) 2588static void lapic_register_intr(int irq, struct irq_desc *desc)
2485{ 2589{
2486 struct irq_desc *desc;
2487
2488 desc = irq_to_desc(irq);
2489 desc->status &= ~IRQ_LEVEL; 2590 desc->status &= ~IRQ_LEVEL;
2490 set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, 2591 set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
2491 "edge"); 2592 "edge");
@@ -2589,7 +2690,9 @@ int timer_through_8259 __initdata;
2589 */ 2690 */
2590static inline void __init check_timer(void) 2691static inline void __init check_timer(void)
2591{ 2692{
2592 struct irq_cfg *cfg = irq_cfg(0); 2693 struct irq_desc *desc = irq_to_desc(0);
2694 struct irq_cfg *cfg = desc->chip_data;
2695 int cpu = boot_cpu_id;
2593 int apic1, pin1, apic2, pin2; 2696 int apic1, pin1, apic2, pin2;
2594 unsigned long flags; 2697 unsigned long flags;
2595 unsigned int ver; 2698 unsigned int ver;
@@ -2604,7 +2707,7 @@ static inline void __init check_timer(void)
2604 * get/set the timer IRQ vector: 2707 * get/set the timer IRQ vector:
2605 */ 2708 */
2606 disable_8259A_irq(0); 2709 disable_8259A_irq(0);
2607 assign_irq_vector(0, TARGET_CPUS); 2710 assign_irq_vector(0, cfg, TARGET_CPUS);
2608 2711
2609 /* 2712 /*
2610 * As IRQ0 is to be enabled in the 8259A, the virtual 2713 * As IRQ0 is to be enabled in the 8259A, the virtual
@@ -2655,10 +2758,10 @@ static inline void __init check_timer(void)
2655 * Ok, does IRQ0 through the IOAPIC work? 2758 * Ok, does IRQ0 through the IOAPIC work?
2656 */ 2759 */
2657 if (no_pin1) { 2760 if (no_pin1) {
2658 add_pin_to_irq(0, apic1, pin1); 2761 add_pin_to_irq_cpu(cfg, cpu, apic1, pin1);
2659 setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); 2762 setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
2660 } 2763 }
2661 unmask_IO_APIC_irq(0); 2764 unmask_IO_APIC_irq_desc(desc);
2662 if (timer_irq_works()) { 2765 if (timer_irq_works()) {
2663 if (nmi_watchdog == NMI_IO_APIC) { 2766 if (nmi_watchdog == NMI_IO_APIC) {
2664 setup_nmi(); 2767 setup_nmi();
@@ -2684,9 +2787,9 @@ static inline void __init check_timer(void)
2684 /* 2787 /*
2685 * legacy devices should be connected to IO APIC #0 2788 * legacy devices should be connected to IO APIC #0
2686 */ 2789 */
2687 replace_pin_at_irq(0, apic1, pin1, apic2, pin2); 2790 replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2);
2688 setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); 2791 setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
2689 unmask_IO_APIC_irq(0); 2792 unmask_IO_APIC_irq_desc(desc);
2690 enable_8259A_irq(0); 2793 enable_8259A_irq(0);
2691 if (timer_irq_works()) { 2794 if (timer_irq_works()) {
2692 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); 2795 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
@@ -2718,7 +2821,7 @@ static inline void __init check_timer(void)
2718 apic_printk(APIC_QUIET, KERN_INFO 2821 apic_printk(APIC_QUIET, KERN_INFO
2719 "...trying to set up timer as Virtual Wire IRQ...\n"); 2822 "...trying to set up timer as Virtual Wire IRQ...\n");
2720 2823
2721 lapic_register_intr(0); 2824 lapic_register_intr(0, desc);
2722 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ 2825 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
2723 enable_8259A_irq(0); 2826 enable_8259A_irq(0);
2724 2827
@@ -2903,22 +3006,26 @@ unsigned int create_irq_nr(unsigned int irq_want)
2903 unsigned int irq; 3006 unsigned int irq;
2904 unsigned int new; 3007 unsigned int new;
2905 unsigned long flags; 3008 unsigned long flags;
2906 struct irq_cfg *cfg_new; 3009 struct irq_cfg *cfg_new = NULL;
2907 3010 int cpu = boot_cpu_id;
2908 irq_want = nr_irqs - 1; 3011 struct irq_desc *desc_new = NULL;
2909 3012
2910 irq = 0; 3013 irq = 0;
2911 spin_lock_irqsave(&vector_lock, flags); 3014 spin_lock_irqsave(&vector_lock, flags);
2912 for (new = irq_want; new > 0; new--) { 3015 for (new = irq_want; new < NR_IRQS; new++) {
2913 if (platform_legacy_irq(new)) 3016 if (platform_legacy_irq(new))
2914 continue; 3017 continue;
2915 cfg_new = irq_cfg(new); 3018
2916 if (cfg_new && cfg_new->vector != 0) 3019 desc_new = irq_to_desc_alloc_cpu(new, cpu);
3020 if (!desc_new) {
3021 printk(KERN_INFO "can not get irq_desc for %d\n", new);
3022 continue;
3023 }
3024 cfg_new = desc_new->chip_data;
3025
3026 if (cfg_new->vector != 0)
2917 continue; 3027 continue;
2918 /* check if need to create one */ 3028 if (__assign_irq_vector(new, cfg_new, TARGET_CPUS) == 0)
2919 if (!cfg_new)
2920 cfg_new = irq_cfg_alloc(new);
2921 if (__assign_irq_vector(new, TARGET_CPUS) == 0)
2922 irq = new; 3029 irq = new;
2923 break; 3030 break;
2924 } 3031 }
@@ -2926,15 +3033,21 @@ unsigned int create_irq_nr(unsigned int irq_want)
2926 3033
2927 if (irq > 0) { 3034 if (irq > 0) {
2928 dynamic_irq_init(irq); 3035 dynamic_irq_init(irq);
3036 /* restore it, in case dynamic_irq_init clear it */
3037 if (desc_new)
3038 desc_new->chip_data = cfg_new;
2929 } 3039 }
2930 return irq; 3040 return irq;
2931} 3041}
2932 3042
3043static int nr_irqs_gsi = NR_IRQS_LEGACY;
2933int create_irq(void) 3044int create_irq(void)
2934{ 3045{
3046 unsigned int irq_want;
2935 int irq; 3047 int irq;
2936 3048
2937 irq = create_irq_nr(nr_irqs - 1); 3049 irq_want = nr_irqs_gsi;
3050 irq = create_irq_nr(irq_want);
2938 3051
2939 if (irq == 0) 3052 if (irq == 0)
2940 irq = -1; 3053 irq = -1;
@@ -2945,14 +3058,22 @@ int create_irq(void)
2945void destroy_irq(unsigned int irq) 3058void destroy_irq(unsigned int irq)
2946{ 3059{
2947 unsigned long flags; 3060 unsigned long flags;
3061 struct irq_cfg *cfg;
3062 struct irq_desc *desc;
2948 3063
3064 /* store it, in case dynamic_irq_cleanup clear it */
3065 desc = irq_to_desc(irq);
3066 cfg = desc->chip_data;
2949 dynamic_irq_cleanup(irq); 3067 dynamic_irq_cleanup(irq);
3068 /* connect back irq_cfg */
3069 if (desc)
3070 desc->chip_data = cfg;
2950 3071
2951#ifdef CONFIG_INTR_REMAP 3072#ifdef CONFIG_INTR_REMAP
2952 free_irte(irq); 3073 free_irte(irq);
2953#endif 3074#endif
2954 spin_lock_irqsave(&vector_lock, flags); 3075 spin_lock_irqsave(&vector_lock, flags);
2955 __clear_irq_vector(irq); 3076 __clear_irq_vector(irq, cfg);
2956 spin_unlock_irqrestore(&vector_lock, flags); 3077 spin_unlock_irqrestore(&vector_lock, flags);
2957} 3078}
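The create_irq_nr()/destroy_irq() changes above hinge on one subtlety spelled out in their comments: dynamic_irq_init() and dynamic_irq_cleanup() wipe the descriptor, including chip_data, so the irq_cfg pointer has to be saved and put back. A sketch of that dance, assuming only what the hunks show:

    /* create side: dynamic_irq_init() clears desc->chip_data ... */
    dynamic_irq_init(irq);
    if (desc_new)
            desc_new->chip_data = cfg_new;          /* ... so restore it */

    /* destroy side: same problem during cleanup */
    desc = irq_to_desc(irq);
    cfg = desc->chip_data;                          /* save before the wipe */
    dynamic_irq_cleanup(irq);
    if (desc)
            desc->chip_data = cfg;                  /* connect back irq_cfg */
    __clear_irq_vector(irq, cfg);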
2958 3079
@@ -2967,12 +3088,12 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
2967 unsigned dest; 3088 unsigned dest;
2968 cpumask_t tmp; 3089 cpumask_t tmp;
2969 3090
3091 cfg = irq_cfg(irq);
2970 tmp = TARGET_CPUS; 3092 tmp = TARGET_CPUS;
2971 err = assign_irq_vector(irq, tmp); 3093 err = assign_irq_vector(irq, cfg, tmp);
2972 if (err) 3094 if (err)
2973 return err; 3095 return err;
2974 3096
2975 cfg = irq_cfg(irq);
2976 cpus_and(tmp, cfg->domain, tmp); 3097 cpus_and(tmp, cfg->domain, tmp);
2977 dest = cpu_mask_to_apicid(tmp); 3098 dest = cpu_mask_to_apicid(tmp);
2978 3099
@@ -3030,34 +3151,34 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
3030#ifdef CONFIG_SMP 3151#ifdef CONFIG_SMP
3031static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) 3152static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3032{ 3153{
3154 struct irq_desc *desc = irq_to_desc(irq);
3033 struct irq_cfg *cfg; 3155 struct irq_cfg *cfg;
3034 struct msi_msg msg; 3156 struct msi_msg msg;
3035 unsigned int dest; 3157 unsigned int dest;
3036 cpumask_t tmp; 3158 cpumask_t tmp;
3037 struct irq_desc *desc;
3038 3159
3039 if (!cpumask_intersects(mask, cpu_online_mask)) 3160 if (!cpumask_intersects(mask, cpu_online_mask))
3040 return; 3161 return;
3041 3162
3042 if (assign_irq_vector(irq, *mask)) 3163 cfg = desc->chip_data;
3164 if (assign_irq_vector(irq, cfg, *mask))
3043 return; 3165 return;
3044 3166
3045 cfg = irq_cfg(irq); 3167 set_extra_move_desc(desc, *mask);
3168
3046 cpumask_and(&tmp, &cfg->domain, mask); 3169 cpumask_and(&tmp, &cfg->domain, mask);
3047 dest = cpu_mask_to_apicid(tmp); 3170 dest = cpu_mask_to_apicid(tmp);
3048 3171
3049 read_msi_msg(irq, &msg); 3172 read_msi_msg_desc(desc, &msg);
3050 3173
3051 msg.data &= ~MSI_DATA_VECTOR_MASK; 3174 msg.data &= ~MSI_DATA_VECTOR_MASK;
3052 msg.data |= MSI_DATA_VECTOR(cfg->vector); 3175 msg.data |= MSI_DATA_VECTOR(cfg->vector);
3053 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; 3176 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
3054 msg.address_lo |= MSI_ADDR_DEST_ID(dest); 3177 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
3055 3178
3056 write_msi_msg(irq, &msg); 3179 write_msi_msg_desc(desc, &msg);
3057 desc = irq_to_desc(irq);
3058 cpumask_copy(&desc->affinity, mask); 3180 cpumask_copy(&desc->affinity, mask);
3059} 3181}
3060
3061#ifdef CONFIG_INTR_REMAP 3182#ifdef CONFIG_INTR_REMAP
3062/* 3183/*
3063 * Migrate the MSI irq to another cpumask. This migration is 3184 * Migrate the MSI irq to another cpumask. This migration is
@@ -3066,11 +3187,11 @@ static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3066static void ir_set_msi_irq_affinity(unsigned int irq, 3187static void ir_set_msi_irq_affinity(unsigned int irq,
3067 const struct cpumask *mask) 3188 const struct cpumask *mask)
3068{ 3189{
3190 struct irq_desc *desc = irq_to_desc(irq);
3069 struct irq_cfg *cfg; 3191 struct irq_cfg *cfg;
3070 unsigned int dest; 3192 unsigned int dest;
3071 cpumask_t tmp, cleanup_mask; 3193 cpumask_t tmp, cleanup_mask;
3072 struct irte irte; 3194 struct irte irte;
3073 struct irq_desc *desc;
3074 3195
3075 if (!cpumask_intersects(mask, cpu_online_mask)) 3196 if (!cpumask_intersects(mask, cpu_online_mask))
3076 return; 3197 return;
@@ -3078,10 +3199,12 @@ static void ir_set_msi_irq_affinity(unsigned int irq,
3078 if (get_irte(irq, &irte)) 3199 if (get_irte(irq, &irte))
3079 return; 3200 return;
3080 3201
3081 if (assign_irq_vector(irq, *mask)) 3202 cfg = desc->chip_data;
3203 if (assign_irq_vector(irq, cfg, *mask))
3082 return; 3204 return;
3083 3205
3084 cfg = irq_cfg(irq); 3206 set_extra_move_desc(desc, mask);
3207
3085 cpumask_and(&tmp, &cfg->domain, mask); 3208 cpumask_and(&tmp, &cfg->domain, mask);
3086 dest = cpu_mask_to_apicid(tmp); 3209 dest = cpu_mask_to_apicid(tmp);
3087 3210
@@ -3105,9 +3228,9 @@ static void ir_set_msi_irq_affinity(unsigned int irq,
3105 cfg->move_in_progress = 0; 3228 cfg->move_in_progress = 0;
3106 } 3229 }
3107 3230
3108 desc = irq_to_desc(irq);
3109 cpumask_copy(&desc->affinity, mask); 3231 cpumask_copy(&desc->affinity, mask);
3110} 3232}
3233
3111#endif 3234#endif
3112#endif /* CONFIG_SMP */ 3235#endif /* CONFIG_SMP */
3113 3236
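Both MSI affinity paths above (and the DMAR, HPET and HT variants later in the file) now follow the same sequence: fetch cfg from the descriptor, pass it into assign_irq_vector(), record the move with set_extra_move_desc(), and use the desc-based message accessors. As a sketch built only from calls visible in these hunks:

    struct irq_desc *desc = irq_to_desc(irq);
    struct irq_cfg *cfg;
    struct msi_msg msg;
    unsigned int dest;
    cpumask_t tmp;

    if (!cpumask_intersects(mask, cpu_online_mask))
            return;

    cfg = desc->chip_data;
    if (assign_irq_vector(irq, cfg, *mask))         /* cfg handed in, no re-lookup */
            return;

    set_extra_move_desc(desc, *mask);               /* bookkeeping for the pending move */

    cpumask_and(&tmp, &cfg->domain, mask);
    dest = cpu_mask_to_apicid(tmp);

    read_msi_msg_desc(desc, &msg);                  /* desc-based accessors */
    /* ... rewrite the vector and destination fields as the real code does ... */
    write_msi_msg_desc(desc, &msg);
    cpumask_copy(&desc->affinity, mask);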
@@ -3166,7 +3289,7 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
3166} 3289}
3167#endif 3290#endif
3168 3291
3169static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq) 3292static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
3170{ 3293{
3171 int ret; 3294 int ret;
3172 struct msi_msg msg; 3295 struct msi_msg msg;
@@ -3175,7 +3298,7 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
3175 if (ret < 0) 3298 if (ret < 0)
3176 return ret; 3299 return ret;
3177 3300
3178 set_irq_msi(irq, desc); 3301 set_irq_msi(irq, msidesc);
3179 write_msi_msg(irq, &msg); 3302 write_msi_msg(irq, &msg);
3180 3303
3181#ifdef CONFIG_INTR_REMAP 3304#ifdef CONFIG_INTR_REMAP
@@ -3195,26 +3318,13 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
3195 return 0; 3318 return 0;
3196} 3319}
3197 3320
3198static unsigned int build_irq_for_pci_dev(struct pci_dev *dev) 3321int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc)
3199{
3200 unsigned int irq;
3201
3202 irq = dev->bus->number;
3203 irq <<= 8;
3204 irq |= dev->devfn;
3205 irq <<= 12;
3206
3207 return irq;
3208}
3209
3210int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
3211{ 3322{
3212 unsigned int irq; 3323 unsigned int irq;
3213 int ret; 3324 int ret;
3214 unsigned int irq_want; 3325 unsigned int irq_want;
3215 3326
3216 irq_want = build_irq_for_pci_dev(dev) + 0x100; 3327 irq_want = nr_irqs_gsi;
3217
3218 irq = create_irq_nr(irq_want); 3328 irq = create_irq_nr(irq_want);
3219 if (irq == 0) 3329 if (irq == 0)
3220 return -1; 3330 return -1;
@@ -3228,7 +3338,7 @@ int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
3228 goto error; 3338 goto error;
3229no_ir: 3339no_ir:
3230#endif 3340#endif
3231 ret = setup_msi_irq(dev, desc, irq); 3341 ret = setup_msi_irq(dev, msidesc, irq);
3232 if (ret < 0) { 3342 if (ret < 0) {
3233 destroy_irq(irq); 3343 destroy_irq(irq);
3234 return ret; 3344 return ret;
@@ -3246,7 +3356,7 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3246{ 3356{
3247 unsigned int irq; 3357 unsigned int irq;
3248 int ret, sub_handle; 3358 int ret, sub_handle;
3249 struct msi_desc *desc; 3359 struct msi_desc *msidesc;
3250 unsigned int irq_want; 3360 unsigned int irq_want;
3251 3361
3252#ifdef CONFIG_INTR_REMAP 3362#ifdef CONFIG_INTR_REMAP
@@ -3254,10 +3364,11 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3254 int index = 0; 3364 int index = 0;
3255#endif 3365#endif
3256 3366
3257 irq_want = build_irq_for_pci_dev(dev) + 0x100; 3367 irq_want = nr_irqs_gsi;
3258 sub_handle = 0; 3368 sub_handle = 0;
3259 list_for_each_entry(desc, &dev->msi_list, list) { 3369 list_for_each_entry(msidesc, &dev->msi_list, list) {
3260 irq = create_irq_nr(irq_want--); 3370 irq = create_irq_nr(irq_want);
3371 irq_want++;
3261 if (irq == 0) 3372 if (irq == 0)
3262 return -1; 3373 return -1;
3263#ifdef CONFIG_INTR_REMAP 3374#ifdef CONFIG_INTR_REMAP
@@ -3289,7 +3400,7 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3289 } 3400 }
3290no_ir: 3401no_ir:
3291#endif 3402#endif
3292 ret = setup_msi_irq(dev, desc, irq); 3403 ret = setup_msi_irq(dev, msidesc, irq);
3293 if (ret < 0) 3404 if (ret < 0)
3294 goto error; 3405 goto error;
3295 sub_handle++; 3406 sub_handle++;
@@ -3310,19 +3421,21 @@ void arch_teardown_msi_irq(unsigned int irq)
3310#ifdef CONFIG_SMP 3421#ifdef CONFIG_SMP
3311static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) 3422static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3312{ 3423{
3424 struct irq_desc *desc = irq_to_desc(irq);
3313 struct irq_cfg *cfg; 3425 struct irq_cfg *cfg;
3314 struct msi_msg msg; 3426 struct msi_msg msg;
3315 unsigned int dest; 3427 unsigned int dest;
3316 cpumask_t tmp; 3428 cpumask_t tmp;
3317 struct irq_desc *desc;
3318 3429
3319 if (!cpumask_intersects(mask, cpu_online_mask)) 3430 if (!cpumask_intersects(mask, cpu_online_mask))
3320 return; 3431 return;
3321 3432
3322 if (assign_irq_vector(irq, *mask)) 3433 cfg = desc->chip_data;
3434 if (assign_irq_vector(irq, cfg, *mask))
3323 return; 3435 return;
3324 3436
3325 cfg = irq_cfg(irq); 3437 set_extra_move_desc(desc, *mask);
3438
3326 cpumask_and(&tmp, &cfg->domain, mask); 3439 cpumask_and(&tmp, &cfg->domain, mask);
3327 dest = cpu_mask_to_apicid(tmp); 3440 dest = cpu_mask_to_apicid(tmp);
3328 3441
@@ -3334,9 +3447,9 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3334 msg.address_lo |= MSI_ADDR_DEST_ID(dest); 3447 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
3335 3448
3336 dmar_msi_write(irq, &msg); 3449 dmar_msi_write(irq, &msg);
3337 desc = irq_to_desc(irq);
3338 cpumask_copy(&desc->affinity, mask); 3450 cpumask_copy(&desc->affinity, mask);
3339} 3451}
3452
3340#endif /* CONFIG_SMP */ 3453#endif /* CONFIG_SMP */
3341 3454
3342struct irq_chip dmar_msi_type = { 3455struct irq_chip dmar_msi_type = {
@@ -3370,8 +3483,8 @@ int arch_setup_dmar_msi(unsigned int irq)
3370#ifdef CONFIG_SMP 3483#ifdef CONFIG_SMP
3371static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) 3484static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3372{ 3485{
3486 struct irq_desc *desc = irq_to_desc(irq);
3373 struct irq_cfg *cfg; 3487 struct irq_cfg *cfg;
3374 struct irq_desc *desc;
3375 struct msi_msg msg; 3488 struct msi_msg msg;
3376 unsigned int dest; 3489 unsigned int dest;
3377 cpumask_t tmp; 3490 cpumask_t tmp;
@@ -3379,10 +3492,12 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3379 if (!cpumask_intersects(mask, cpu_online_mask)) 3492 if (!cpumask_intersects(mask, cpu_online_mask))
3380 return; 3493 return;
3381 3494
3382 if (assign_irq_vector(irq, *mask)) 3495 cfg = desc->chip_data;
3496 if (assign_irq_vector(irq, cfg, *mask))
3383 return; 3497 return;
3384 3498
3385 cfg = irq_cfg(irq); 3499 set_extra_move_desc(desc, *mask);
3500
3386 cpumask_and(&tmp, &cfg->domain, mask); 3501 cpumask_and(&tmp, &cfg->domain, mask);
3387 dest = cpu_mask_to_apicid(tmp); 3502 dest = cpu_mask_to_apicid(tmp);
3388 3503
@@ -3394,9 +3509,9 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3394 msg.address_lo |= MSI_ADDR_DEST_ID(dest); 3509 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
3395 3510
3396 hpet_msi_write(irq, &msg); 3511 hpet_msi_write(irq, &msg);
3397 desc = irq_to_desc(irq);
3398 cpumask_copy(&desc->affinity, mask); 3512 cpumask_copy(&desc->affinity, mask);
3399} 3513}
3514
3400#endif /* CONFIG_SMP */ 3515#endif /* CONFIG_SMP */
3401 3516
3402struct irq_chip hpet_msi_type = { 3517struct irq_chip hpet_msi_type = {
@@ -3451,25 +3566,27 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
3451 3566
3452static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) 3567static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
3453{ 3568{
3569 struct irq_desc *desc = irq_to_desc(irq);
3454 struct irq_cfg *cfg; 3570 struct irq_cfg *cfg;
3455 unsigned int dest; 3571 unsigned int dest;
3456 cpumask_t tmp; 3572 cpumask_t tmp;
3457 struct irq_desc *desc;
3458 3573
3459 if (!cpumask_intersects(mask, cpu_online_mask)) 3574 if (!cpumask_intersects(mask, cpu_online_mask))
3460 return; 3575 return;
3461 3576
3462 if (assign_irq_vector(irq, *mask)) 3577 cfg = desc->chip_data;
3578 if (assign_irq_vector(irq, cfg, *mask))
3463 return; 3579 return;
3464 3580
3465 cfg = irq_cfg(irq); 3581 set_extra_move_desc(desc, *mask);
3582
3466 cpumask_and(&tmp, &cfg->domain, mask); 3583 cpumask_and(&tmp, &cfg->domain, mask);
3467 dest = cpu_mask_to_apicid(tmp); 3584 dest = cpu_mask_to_apicid(tmp);
3468 3585
3469 target_ht_irq(irq, dest, cfg->vector); 3586 target_ht_irq(irq, dest, cfg->vector);
3470 desc = irq_to_desc(irq);
3471 cpumask_copy(&desc->affinity, mask); 3587 cpumask_copy(&desc->affinity, mask);
3472} 3588}
3589
3473#endif 3590#endif
3474 3591
3475static struct irq_chip ht_irq_chip = { 3592static struct irq_chip ht_irq_chip = {
@@ -3489,13 +3606,13 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3489 int err; 3606 int err;
3490 cpumask_t tmp; 3607 cpumask_t tmp;
3491 3608
3609 cfg = irq_cfg(irq);
3492 tmp = TARGET_CPUS; 3610 tmp = TARGET_CPUS;
3493 err = assign_irq_vector(irq, tmp); 3611 err = assign_irq_vector(irq, cfg, tmp);
3494 if (!err) { 3612 if (!err) {
3495 struct ht_irq_msg msg; 3613 struct ht_irq_msg msg;
3496 unsigned dest; 3614 unsigned dest;
3497 3615
3498 cfg = irq_cfg(irq);
3499 cpus_and(tmp, cfg->domain, tmp); 3616 cpus_and(tmp, cfg->domain, tmp);
3500 dest = cpu_mask_to_apicid(tmp); 3617 dest = cpu_mask_to_apicid(tmp);
3501 3618
@@ -3541,7 +3658,9 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
3541 unsigned long flags; 3658 unsigned long flags;
3542 int err; 3659 int err;
3543 3660
3544 err = assign_irq_vector(irq, *eligible_cpu); 3661 cfg = irq_cfg(irq);
3662
3663 err = assign_irq_vector(irq, cfg, *eligible_cpu);
3545 if (err != 0) 3664 if (err != 0)
3546 return err; 3665 return err;
3547 3666
@@ -3550,8 +3669,6 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
3550 irq_name); 3669 irq_name);
3551 spin_unlock_irqrestore(&vector_lock, flags); 3670 spin_unlock_irqrestore(&vector_lock, flags);
3552 3671
3553 cfg = irq_cfg(irq);
3554
3555 mmr_value = 0; 3672 mmr_value = 0;
3556 entry = (struct uv_IO_APIC_route_entry *)&mmr_value; 3673 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
3557 BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); 3674 BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
@@ -3603,9 +3720,16 @@ int __init io_apic_get_redir_entries (int ioapic)
3603 return reg_01.bits.entries; 3720 return reg_01.bits.entries;
3604} 3721}
3605 3722
3606int __init probe_nr_irqs(void) 3723void __init probe_nr_irqs_gsi(void)
3607{ 3724{
3608 return NR_IRQS; 3725 int idx;
3726 int nr = 0;
3727
3728 for (idx = 0; idx < nr_ioapics; idx++)
3729 nr += io_apic_get_redir_entries(idx) + 1;
3730
3731 if (nr > nr_irqs_gsi)
3732 nr_irqs_gsi = nr;
3609} 3733}
3610 3734
3611/* -------------------------------------------------------------------------- 3735/* --------------------------------------------------------------------------
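probe_nr_irqs_gsi() above replaces the old probe_nr_irqs() (which simply returned NR_IRQS) with a count derived from the IOAPIC redirection tables; io_apic_get_redir_entries() returns the highest redirection entry index, hence the +1 per IOAPIC. As a purely illustrative example (the entry counts are assumed, not taken from this commit): a machine with two IOAPICs each reporting a highest index of 23 would yield nr = 2 * (23 + 1) = 48, and since nr_irqs_gsi starts at NR_IRQS_LEGACY it is only ever raised, never lowered.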
@@ -3704,19 +3828,31 @@ int __init io_apic_get_version(int ioapic)
3704 3828
3705int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity) 3829int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity)
3706{ 3830{
3831 struct irq_desc *desc;
3832 struct irq_cfg *cfg;
3833 int cpu = boot_cpu_id;
3834
3707 if (!IO_APIC_IRQ(irq)) { 3835 if (!IO_APIC_IRQ(irq)) {
3708 apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", 3836 apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
3709 ioapic); 3837 ioapic);
3710 return -EINVAL; 3838 return -EINVAL;
3711 } 3839 }
3712 3840
3841 desc = irq_to_desc_alloc_cpu(irq, cpu);
3842 if (!desc) {
3843 printk(KERN_INFO "can not get irq_desc %d\n", irq);
3844 return 0;
3845 }
3846
3713 /* 3847 /*
3714 * IRQs < 16 are already in the irq_2_pin[] map 3848 * IRQs < 16 are already in the irq_2_pin[] map
3715 */ 3849 */
3716 if (irq >= 16) 3850 if (irq >= NR_IRQS_LEGACY) {
3717 add_pin_to_irq(irq, ioapic, pin); 3851 cfg = desc->chip_data;
3852 add_pin_to_irq_cpu(cfg, cpu, ioapic, pin);
3853 }
3718 3854
3719 setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity); 3855 setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity);
3720 3856
3721 return 0; 3857 return 0;
3722} 3858}
@@ -3770,9 +3906,10 @@ void __init setup_ioapic_dest(void)
3770 * when you have too many devices, because at that time only boot 3906 * when you have too many devices, because at that time only boot
3771 * cpu is online. 3907 * cpu is online.
3772 */ 3908 */
3773 cfg = irq_cfg(irq); 3909 desc = irq_to_desc(irq);
3910 cfg = desc->chip_data;
3774 if (!cfg->vector) { 3911 if (!cfg->vector) {
3775 setup_IO_APIC_irq(ioapic, pin, irq, 3912 setup_IO_APIC_irq(ioapic, pin, irq, desc,
3776 irq_trigger(irq_entry), 3913 irq_trigger(irq_entry),
3777 irq_polarity(irq_entry)); 3914 irq_polarity(irq_entry));
3778 continue; 3915 continue;
@@ -3782,7 +3919,6 @@ void __init setup_ioapic_dest(void)
3782 /* 3919 /*
3783 * Honour affinities which have been set in early boot 3920 * Honour affinities which have been set in early boot
3784 */ 3921 */
3785 desc = irq_to_desc(irq);
3786 if (desc->status & 3922 if (desc->status &
3787 (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) 3923 (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
3788 mask = desc->affinity; 3924 mask = desc->affinity;
@@ -3791,10 +3927,10 @@ void __init setup_ioapic_dest(void)
3791 3927
3792#ifdef CONFIG_INTR_REMAP 3928#ifdef CONFIG_INTR_REMAP
3793 if (intr_remapping_enabled) 3929 if (intr_remapping_enabled)
3794 set_ir_ioapic_affinity_irq(irq, &mask); 3930 set_ir_ioapic_affinity_irq_desc(desc, &mask);
3795 else 3931 else
3796#endif 3932#endif
3797 set_ioapic_affinity_irq(irq, &mask); 3933 set_ioapic_affinity_irq_desc(desc, &mask);
3798 } 3934 }
3799 3935
3800 } 3936 }
@@ -3843,7 +3979,6 @@ void __init ioapic_init_mappings(void)
3843 struct resource *ioapic_res; 3979 struct resource *ioapic_res;
3844 int i; 3980 int i;
3845 3981
3846 irq_2_pin_init();
3847 ioapic_res = ioapic_setup_resources(); 3982 ioapic_res = ioapic_setup_resources();
3848 for (i = 0; i < nr_ioapics; i++) { 3983 for (i = 0; i < nr_ioapics; i++) {
3849 if (smp_found_config) { 3984 if (smp_found_config) {
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index d1d4dc52f649..3f1d9d18df67 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -118,6 +118,9 @@ int show_interrupts(struct seq_file *p, void *v)
118 } 118 }
119 119
120 desc = irq_to_desc(i); 120 desc = irq_to_desc(i);
121 if (!desc)
122 return 0;
123
121 spin_lock_irqsave(&desc->lock, flags); 124 spin_lock_irqsave(&desc->lock, flags);
122#ifndef CONFIG_SMP 125#ifndef CONFIG_SMP
123 any_count = kstat_irqs(i); 126 any_count = kstat_irqs(i);
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 87870a49be4e..9cf9cbbf7a02 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -242,6 +242,8 @@ void fixup_irqs(cpumask_t map)
242 for_each_irq_desc(irq, desc) { 242 for_each_irq_desc(irq, desc) {
243 cpumask_t mask; 243 cpumask_t mask;
244 244
245 if (!desc)
246 continue;
245 if (irq == 2) 247 if (irq == 2)
246 continue; 248 continue;
247 249
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 7d37f847544d..27f2307b0a34 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -94,6 +94,8 @@ void fixup_irqs(cpumask_t map)
94 int break_affinity = 0; 94 int break_affinity = 0;
95 int set_affinity = 1; 95 int set_affinity = 1;
96 96
97 if (!desc)
98 continue;
97 if (irq == 2) 99 if (irq == 2)
98 continue; 100 continue;
99 101
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 845aa9803e80..6a92f47c52e7 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -68,8 +68,7 @@ void __init init_ISA_irqs (void)
68 /* 68 /*
69 * 16 old-style INTA-cycle interrupts: 69 * 16 old-style INTA-cycle interrupts:
70 */ 70 */
71 for (i = 0; i < 16; i++) { 71 for (i = 0; i < NR_IRQS_LEGACY; i++) {
72 /* first time call this irq_desc */
73 struct irq_desc *desc = irq_to_desc(i); 72 struct irq_desc *desc = irq_to_desc(i);
74 73
75 desc->status = IRQ_DISABLED; 74 desc->status = IRQ_DISABLED;
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index ff0235391285..40c1e62ec785 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -142,8 +142,7 @@ void __init init_ISA_irqs(void)
142 init_bsp_APIC(); 142 init_bsp_APIC();
143 init_8259A(0); 143 init_8259A(0);
144 144
145 for (i = 0; i < 16; i++) { 145 for (i = 0; i < NR_IRQS_LEGACY; i++) {
146 /* first time call this irq_desc */
147 struct irq_desc *desc = irq_to_desc(i); 146 struct irq_desc *desc = irq_to_desc(i);
148 147
149 desc->status = IRQ_DISABLED; 148 desc->status = IRQ_DISABLED;
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 0f4c1fd5a1f4..45e3b69808ba 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -586,26 +586,23 @@ static void __init __get_smp_config(unsigned int early)
586{ 586{
587 struct intel_mp_floating *mpf = mpf_found; 587 struct intel_mp_floating *mpf = mpf_found;
588 588
589 if (x86_quirks->mach_get_smp_config) { 589 if (!mpf)
590 if (x86_quirks->mach_get_smp_config(early)) 590 return;
591 return; 591
592 }
593 if (acpi_lapic && early) 592 if (acpi_lapic && early)
594 return; 593 return;
594
595 /* 595 /*
596 * ACPI supports both logical (e.g. Hyper-Threading) and physical 596 * MPS doesn't support hyperthreading, aka only have
597 * processors, where MPS only supports physical. 597 * thread 0 apic id in MPS table
598 */ 598 */
599 if (acpi_lapic && acpi_ioapic) { 599 if (acpi_lapic && acpi_ioapic)
600 printk(KERN_INFO "Using ACPI (MADT) for SMP configuration "
601 "information\n");
602 return; 600 return;
603 } else if (acpi_lapic)
604 printk(KERN_INFO "Using ACPI for processor (LAPIC) "
605 "configuration information\n");
606 601
607 if (!mpf) 602 if (x86_quirks->mach_get_smp_config) {
608 return; 603 if (x86_quirks->mach_get_smp_config(early))
604 return;
605 }
609 606
610 printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", 607 printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
611 mpf->mpf_specification); 608 mpf->mpf_specification);
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c
index 4caff39078e0..0deea37a53cf 100644
--- a/arch/x86/kernel/numaq_32.c
+++ b/arch/x86/kernel/numaq_32.c
@@ -31,7 +31,7 @@
31#include <asm/numaq.h> 31#include <asm/numaq.h>
32#include <asm/topology.h> 32#include <asm/topology.h>
33#include <asm/processor.h> 33#include <asm/processor.h>
34#include <asm/mpspec.h> 34#include <asm/genapic.h>
35#include <asm/e820.h> 35#include <asm/e820.h>
36#include <asm/setup.h> 36#include <asm/setup.h>
37 37
@@ -235,6 +235,13 @@ static int __init numaq_setup_ioapic_ids(void)
235 return 1; 235 return 1;
236} 236}
237 237
238static int __init numaq_update_genapic(void)
239{
240 genapic->wakeup_cpu = wakeup_secondary_cpu_via_nmi;
241
242 return 0;
243}
244
238static struct x86_quirks numaq_x86_quirks __initdata = { 245static struct x86_quirks numaq_x86_quirks __initdata = {
239 .arch_pre_time_init = numaq_pre_time_init, 246 .arch_pre_time_init = numaq_pre_time_init,
240 .arch_time_init = NULL, 247 .arch_time_init = NULL,
@@ -250,6 +257,7 @@ static struct x86_quirks numaq_x86_quirks __initdata = {
250 .mpc_oem_pci_bus = mpc_oem_pci_bus, 257 .mpc_oem_pci_bus = mpc_oem_pci_bus,
251 .smp_read_mpc_oem = smp_read_mpc_oem, 258 .smp_read_mpc_oem = smp_read_mpc_oem,
252 .setup_ioapic_ids = numaq_setup_ioapic_ids, 259 .setup_ioapic_ids = numaq_setup_ioapic_ids,
260 .update_genapic = numaq_update_genapic,
253}; 261};
254 262
255void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem, 263void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index c622772744d8..95d811a9594f 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -7,7 +7,9 @@
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/pm.h> 8#include <linux/pm.h>
9#include <linux/clockchips.h> 9#include <linux/clockchips.h>
10#include <linux/ftrace.h>
10#include <asm/system.h> 11#include <asm/system.h>
12#include <asm/apic.h>
11 13
12unsigned long idle_halt; 14unsigned long idle_halt;
13EXPORT_SYMBOL(idle_halt); 15EXPORT_SYMBOL(idle_halt);
@@ -100,6 +102,9 @@ static inline int hlt_use_halt(void)
100void default_idle(void) 102void default_idle(void)
101{ 103{
102 if (hlt_use_halt()) { 104 if (hlt_use_halt()) {
105 struct power_trace it;
106
107 trace_power_start(&it, POWER_CSTATE, 1);
103 current_thread_info()->status &= ~TS_POLLING; 108 current_thread_info()->status &= ~TS_POLLING;
104 /* 109 /*
105 * TS_POLLING-cleared state must be visible before we 110 * TS_POLLING-cleared state must be visible before we
@@ -112,6 +117,7 @@ void default_idle(void)
112 else 117 else
113 local_irq_enable(); 118 local_irq_enable();
114 current_thread_info()->status |= TS_POLLING; 119 current_thread_info()->status |= TS_POLLING;
120 trace_power_end(&it);
115 } else { 121 } else {
116 local_irq_enable(); 122 local_irq_enable();
117 /* loop is done by the caller */ 123 /* loop is done by the caller */
@@ -122,6 +128,21 @@ void default_idle(void)
122EXPORT_SYMBOL(default_idle); 128EXPORT_SYMBOL(default_idle);
123#endif 129#endif
124 130
131void stop_this_cpu(void *dummy)
132{
133 local_irq_disable();
134 /*
135 * Remove this CPU:
136 */
137 cpu_clear(smp_processor_id(), cpu_online_map);
138 disable_local_APIC();
139
140 for (;;) {
141 if (hlt_works(smp_processor_id()))
142 halt();
143 }
144}
145
125static void do_nothing(void *unused) 146static void do_nothing(void *unused)
126{ 147{
127} 148}
@@ -154,24 +175,31 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
154 */ 175 */
155void mwait_idle_with_hints(unsigned long ax, unsigned long cx) 176void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
156{ 177{
178 struct power_trace it;
179
180 trace_power_start(&it, POWER_CSTATE, (ax>>4)+1);
157 if (!need_resched()) { 181 if (!need_resched()) {
158 __monitor((void *)&current_thread_info()->flags, 0, 0); 182 __monitor((void *)&current_thread_info()->flags, 0, 0);
159 smp_mb(); 183 smp_mb();
160 if (!need_resched()) 184 if (!need_resched())
161 __mwait(ax, cx); 185 __mwait(ax, cx);
162 } 186 }
187 trace_power_end(&it);
163} 188}
164 189
165/* Default MONITOR/MWAIT with no hints, used for default C1 state */ 190/* Default MONITOR/MWAIT with no hints, used for default C1 state */
166static void mwait_idle(void) 191static void mwait_idle(void)
167{ 192{
193 struct power_trace it;
168 if (!need_resched()) { 194 if (!need_resched()) {
195 trace_power_start(&it, POWER_CSTATE, 1);
169 __monitor((void *)&current_thread_info()->flags, 0, 0); 196 __monitor((void *)&current_thread_info()->flags, 0, 0);
170 smp_mb(); 197 smp_mb();
171 if (!need_resched()) 198 if (!need_resched())
172 __sti_mwait(0, 0); 199 __sti_mwait(0, 0);
173 else 200 else
174 local_irq_enable(); 201 local_irq_enable();
202 trace_power_end(&it);
175 } else 203 } else
176 local_irq_enable(); 204 local_irq_enable();
177} 205}
@@ -183,9 +211,13 @@ static void mwait_idle(void)
183 */ 211 */
184static void poll_idle(void) 212static void poll_idle(void)
185{ 213{
214 struct power_trace it;
215
216 trace_power_start(&it, POWER_CSTATE, 0);
186 local_irq_enable(); 217 local_irq_enable();
187 while (!need_resched()) 218 while (!need_resched())
188 cpu_relax(); 219 cpu_relax();
220 trace_power_end(&it);
189} 221}
190 222
191/* 223/*
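The process.c hunks above thread power-tracing markers through every idle routine, and the shape is always the same: a struct power_trace on the stack, opened with the target C-state before the idle primitive runs and closed afterwards. A minimal sketch of that instrumentation pattern, assuming only the calls visible in the hunks (the function name here is hypothetical):

    static void some_idle_routine(void)             /* hypothetical, for illustration */
    {
            struct power_trace it;

            trace_power_start(&it, POWER_CSTATE, 1); /* about to enter C1 */
            /* ... halt()/__mwait()/poll loop, as in the real routines ... */
            trace_power_end(&it);
    }

mwait_idle_with_hints() is the one variant that derives the C-state from its hint, via (ax >> 4) + 1, rather than hard-coding 1.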
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 0a1302fe6d45..24c2276aa453 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -38,6 +38,7 @@
38#include <linux/percpu.h> 38#include <linux/percpu.h>
39#include <linux/prctl.h> 39#include <linux/prctl.h>
40#include <linux/dmi.h> 40#include <linux/dmi.h>
41#include <linux/ftrace.h>
41 42
42#include <asm/uaccess.h> 43#include <asm/uaccess.h>
43#include <asm/pgtable.h> 44#include <asm/pgtable.h>
@@ -548,7 +549,8 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
548 * the task-switch, and shows up in ret_from_fork in entry.S, 549 * the task-switch, and shows up in ret_from_fork in entry.S,
549 * for example. 550 * for example.
550 */ 551 */
551struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) 552__notrace_funcgraph struct task_struct *
553__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
552{ 554{
553 struct thread_struct *prev = &prev_p->thread, 555 struct thread_struct *prev = &prev_p->thread,
554 *next = &next_p->thread; 556 *next = &next_p->thread;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index c958120fb1b6..fbb321d53d34 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -39,6 +39,7 @@
39#include <linux/prctl.h> 39#include <linux/prctl.h>
40#include <linux/uaccess.h> 40#include <linux/uaccess.h>
41#include <linux/io.h> 41#include <linux/io.h>
42#include <linux/ftrace.h>
42 43
43#include <asm/pgtable.h> 44#include <asm/pgtable.h>
44#include <asm/system.h> 45#include <asm/system.h>
@@ -551,8 +552,9 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
551 * - could test fs/gs bitsliced 552 * - could test fs/gs bitsliced
552 * 553 *
553 * Kprobes not supported here. Set the probe on schedule instead. 554 * Kprobes not supported here. Set the probe on schedule instead.
555 * Function graph tracer not supported too.
554 */ 556 */
555struct task_struct * 557__notrace_funcgraph struct task_struct *
556__switch_to(struct task_struct *prev_p, struct task_struct *next_p) 558__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
557{ 559{
558 struct thread_struct *prev = &prev_p->thread; 560 struct thread_struct *prev = &prev_p->thread;
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 0a6d8c12e10d..2c8ec1ba75e6 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -668,14 +668,14 @@ static int ptrace_bts_read_record(struct task_struct *child, size_t index,
668 size_t bts_index, bts_end; 668 size_t bts_index, bts_end;
669 int error; 669 int error;
670 670
671 error = ds_get_bts_end(child, &bts_end); 671 error = ds_get_bts_end(child->bts, &bts_end);
672 if (error < 0) 672 if (error < 0)
673 return error; 673 return error;
674 674
675 if (bts_end <= index) 675 if (bts_end <= index)
676 return -EINVAL; 676 return -EINVAL;
677 677
678 error = ds_get_bts_index(child, &bts_index); 678 error = ds_get_bts_index(child->bts, &bts_index);
679 if (error < 0) 679 if (error < 0)
680 return error; 680 return error;
681 681
@@ -684,7 +684,7 @@ static int ptrace_bts_read_record(struct task_struct *child, size_t index,
684 if (bts_end <= bts_index) 684 if (bts_end <= bts_index)
685 bts_index -= bts_end; 685 bts_index -= bts_end;
686 686
687 error = ds_access_bts(child, bts_index, &bts_record); 687 error = ds_access_bts(child->bts, bts_index, &bts_record);
688 if (error < 0) 688 if (error < 0)
689 return error; 689 return error;
690 690
@@ -705,14 +705,14 @@ static int ptrace_bts_drain(struct task_struct *child,
705 size_t end, i; 705 size_t end, i;
706 int error; 706 int error;
707 707
708 error = ds_get_bts_index(child, &end); 708 error = ds_get_bts_index(child->bts, &end);
709 if (error < 0) 709 if (error < 0)
710 return error; 710 return error;
711 711
712 if (size < (end * sizeof(struct bts_struct))) 712 if (size < (end * sizeof(struct bts_struct)))
713 return -EIO; 713 return -EIO;
714 714
715 error = ds_access_bts(child, 0, (const void **)&raw); 715 error = ds_access_bts(child->bts, 0, (const void **)&raw);
716 if (error < 0) 716 if (error < 0)
717 return error; 717 return error;
718 718
@@ -723,18 +723,13 @@ static int ptrace_bts_drain(struct task_struct *child,
723 return -EFAULT; 723 return -EFAULT;
724 } 724 }
725 725
726 error = ds_clear_bts(child); 726 error = ds_clear_bts(child->bts);
727 if (error < 0) 727 if (error < 0)
728 return error; 728 return error;
729 729
730 return end; 730 return end;
731} 731}
732 732
733static void ptrace_bts_ovfl(struct task_struct *child)
734{
735 send_sig(child->thread.bts_ovfl_signal, child, 0);
736}
737
738static int ptrace_bts_config(struct task_struct *child, 733static int ptrace_bts_config(struct task_struct *child,
739 long cfg_size, 734 long cfg_size,
740 const struct ptrace_bts_config __user *ucfg) 735 const struct ptrace_bts_config __user *ucfg)
@@ -760,23 +755,45 @@ static int ptrace_bts_config(struct task_struct *child,
760 goto errout; 755 goto errout;
761 756
762 if (cfg.flags & PTRACE_BTS_O_ALLOC) { 757 if (cfg.flags & PTRACE_BTS_O_ALLOC) {
763 ds_ovfl_callback_t ovfl = NULL; 758 bts_ovfl_callback_t ovfl = NULL;
764 unsigned int sig = 0; 759 unsigned int sig = 0;
765 760
766 /* we ignore the error in case we were not tracing child */ 761 error = -EINVAL;
767 (void)ds_release_bts(child); 762 if (cfg.size < (10 * bts_cfg.sizeof_bts))
763 goto errout;
768 764
769 if (cfg.flags & PTRACE_BTS_O_SIGNAL) { 765 if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
770 if (!cfg.signal) 766 if (!cfg.signal)
771 goto errout; 767 goto errout;
772 768
769 error = -EOPNOTSUPP;
770 goto errout;
771
773 sig = cfg.signal; 772 sig = cfg.signal;
774 ovfl = ptrace_bts_ovfl;
775 } 773 }
776 774
777 error = ds_request_bts(child, /* base = */ NULL, cfg.size, ovfl); 775 if (child->bts) {
778 if (error < 0) 776 (void)ds_release_bts(child->bts);
777 kfree(child->bts_buffer);
778
779 child->bts = NULL;
780 child->bts_buffer = NULL;
781 }
782
783 error = -ENOMEM;
784 child->bts_buffer = kzalloc(cfg.size, GFP_KERNEL);
785 if (!child->bts_buffer)
786 goto errout;
787
788 child->bts = ds_request_bts(child, child->bts_buffer, cfg.size,
789 ovfl, /* th = */ (size_t)-1);
790 if (IS_ERR(child->bts)) {
791 error = PTR_ERR(child->bts);
792 kfree(child->bts_buffer);
793 child->bts = NULL;
794 child->bts_buffer = NULL;
779 goto errout; 795 goto errout;
796 }
780 797
781 child->thread.bts_ovfl_signal = sig; 798 child->thread.bts_ovfl_signal = sig;
782 } 799 }
@@ -823,15 +840,15 @@ static int ptrace_bts_status(struct task_struct *child,
823 if (cfg_size < sizeof(cfg)) 840 if (cfg_size < sizeof(cfg))
824 return -EIO; 841 return -EIO;
825 842
826 error = ds_get_bts_end(child, &end); 843 error = ds_get_bts_end(child->bts, &end);
827 if (error < 0) 844 if (error < 0)
828 return error; 845 return error;
829 846
830 error = ds_access_bts(child, /* index = */ 0, &base); 847 error = ds_access_bts(child->bts, /* index = */ 0, &base);
831 if (error < 0) 848 if (error < 0)
832 return error; 849 return error;
833 850
834 error = ds_access_bts(child, /* index = */ end, &max); 851 error = ds_access_bts(child->bts, /* index = */ end, &max);
835 if (error < 0) 852 if (error < 0)
836 return error; 853 return error;
837 854
@@ -884,10 +901,7 @@ static int ptrace_bts_write_record(struct task_struct *child,
884 return -EINVAL; 901 return -EINVAL;
885 } 902 }
886 903
887 /* The writing task will be the switched-to task on a context 904 return ds_write_bts(child->bts, bts_record, bts_cfg.sizeof_bts);
888 * switch. It needs to write into the switched-from task's BTS
889 * buffer. */
890 return ds_unchecked_write_bts(child, bts_record, bts_cfg.sizeof_bts);
891} 905}
892 906
893void ptrace_bts_take_timestamp(struct task_struct *tsk, 907void ptrace_bts_take_timestamp(struct task_struct *tsk,
@@ -929,17 +943,16 @@ void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *c)
929 switch (c->x86) { 943 switch (c->x86) {
930 case 0x6: 944 case 0x6:
931 switch (c->x86_model) { 945 switch (c->x86_model) {
946 case 0 ... 0xC:
947 /* sorry, don't know about them */
948 break;
932 case 0xD: 949 case 0xD:
933 case 0xE: /* Pentium M */ 950 case 0xE: /* Pentium M */
934 bts_configure(&bts_cfg_pentium_m); 951 bts_configure(&bts_cfg_pentium_m);
935 break; 952 break;
936 case 0xF: /* Core2 */ 953 default: /* Core2, Atom, ... */
937 case 0x1C: /* Atom */
938 bts_configure(&bts_cfg_core2); 954 bts_configure(&bts_cfg_core2);
939 break; 955 break;
940 default:
941 /* sorry, don't know about them */
942 break;
943 } 956 }
944 break; 957 break;
945 case 0xF: 958 case 0xF:
@@ -973,13 +986,17 @@ void ptrace_disable(struct task_struct *child)
973 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); 986 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
974#endif 987#endif
975#ifdef CONFIG_X86_PTRACE_BTS 988#ifdef CONFIG_X86_PTRACE_BTS
976 (void)ds_release_bts(child); 989 if (child->bts) {
990 (void)ds_release_bts(child->bts);
991 kfree(child->bts_buffer);
992 child->bts_buffer = NULL;
977 993
978 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; 994 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
979 if (!child->thread.debugctlmsr) 995 if (!child->thread.debugctlmsr)
980 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); 996 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
981 997
982 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); 998 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
999 }
983#endif /* CONFIG_X86_PTRACE_BTS */ 1000#endif /* CONFIG_X86_PTRACE_BTS */
984} 1001}
985 1002
@@ -1111,9 +1128,16 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
1111 (child, data, (struct ptrace_bts_config __user *)addr); 1128 (child, data, (struct ptrace_bts_config __user *)addr);
1112 break; 1129 break;
1113 1130
1114 case PTRACE_BTS_SIZE: 1131 case PTRACE_BTS_SIZE: {
1115 ret = ds_get_bts_index(child, /* pos = */ NULL); 1132 size_t size;
1133
1134 ret = ds_get_bts_index(child->bts, &size);
1135 if (ret == 0) {
1136 BUG_ON(size != (int) size);
1137 ret = (int) size;
1138 }
1116 break; 1139 break;
1140 }
1117 1141
1118 case PTRACE_BTS_GET: 1142 case PTRACE_BTS_GET:
1119 ret = ptrace_bts_read_record 1143 ret = ptrace_bts_read_record
@@ -1121,7 +1145,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
1121 break; 1145 break;
1122 1146
1123 case PTRACE_BTS_CLEAR: 1147 case PTRACE_BTS_CLEAR:
1124 ret = ds_clear_bts(child); 1148 ret = ds_clear_bts(child->bts);
1125 break; 1149 break;
1126 1150
1127 case PTRACE_BTS_DRAIN: 1151 case PTRACE_BTS_DRAIN:
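
The reworked PTRACE_BTS_CONFIG path above moves BTS buffer ownership to the tracer: the buffer is allocated with kzalloc(), handed to ds_request_bts() together with an overflow callback and threshold, and both the returned handle and the buffer pointer are stored on the traced task so ptrace_disable() can release them later. A minimal sketch of that allocate/request/unwind pattern, assuming the child->bts and child->bts_buffer fields exactly as the hunk shows them (the illustrative function name and the generic void * buffer are assumptions, not part of the patch):

#include <linux/slab.h>
#include <linux/err.h>

/* Sketch only: mirrors the allocate-then-request flow of the hunk above. */
static int bts_attach_sketch(struct task_struct *child, size_t size,
                             bts_ovfl_callback_t ovfl)
{
        void *buf = kzalloc(size, GFP_KERNEL);

        if (!buf)
                return -ENOMEM;

        child->bts = ds_request_bts(child, buf, size, ovfl,
                                    /* th = */ (size_t)-1);
        if (IS_ERR(child->bts)) {
                int error = PTR_ERR(child->bts);

                kfree(buf);
                child->bts = NULL;
                return error;
        }

        child->bts_buffer = buf;
        return 0;
}

On failure the buffer is freed and both fields are cleared, matching the error path in the patch.
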
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index cc5a2545dd41..0e3dbc7b2bdb 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -36,7 +36,10 @@ int reboot_force;
36static int reboot_cpu = -1; 36static int reboot_cpu = -1;
37#endif 37#endif
38 38
39/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] 39/* This is set by the PCI code if either type 1 or type 2 PCI is detected */
40bool port_cf9_safe = false;
41
42/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci]
40 warm Don't set the cold reboot flag 43 warm Don't set the cold reboot flag
41 cold Set the cold reboot flag 44 cold Set the cold reboot flag
42 bios Reboot by jumping through the BIOS (only for X86_32) 45 bios Reboot by jumping through the BIOS (only for X86_32)
@@ -45,6 +48,7 @@ static int reboot_cpu = -1;
45 kbd Use the keyboard controller. cold reset (default) 48 kbd Use the keyboard controller. cold reset (default)
46 acpi Use the RESET_REG in the FADT 49 acpi Use the RESET_REG in the FADT
47 efi Use efi reset_system runtime service 50 efi Use efi reset_system runtime service
51 pci Use the so-called "PCI reset register", CF9
48 force Avoid anything that could hang. 52 force Avoid anything that could hang.
49 */ 53 */
50static int __init reboot_setup(char *str) 54static int __init reboot_setup(char *str)
@@ -79,6 +83,7 @@ static int __init reboot_setup(char *str)
79 case 'k': 83 case 'k':
80 case 't': 84 case 't':
81 case 'e': 85 case 'e':
86 case 'p':
82 reboot_type = *str; 87 reboot_type = *str;
83 break; 88 break;
84 89
@@ -404,12 +409,27 @@ static void native_machine_emergency_restart(void)
404 reboot_type = BOOT_KBD; 409 reboot_type = BOOT_KBD;
405 break; 410 break;
406 411
407
408 case BOOT_EFI: 412 case BOOT_EFI:
409 if (efi_enabled) 413 if (efi_enabled)
410 efi.reset_system(reboot_mode ? EFI_RESET_WARM : EFI_RESET_COLD, 414 efi.reset_system(reboot_mode ?
415 EFI_RESET_WARM :
416 EFI_RESET_COLD,
411 EFI_SUCCESS, 0, NULL); 417 EFI_SUCCESS, 0, NULL);
418 reboot_type = BOOT_KBD;
419 break;
420
421 case BOOT_CF9:
422 port_cf9_safe = true;
423 /* fall through */
412 424
425 case BOOT_CF9_COND:
426 if (port_cf9_safe) {
427 u8 cf9 = inb(0xcf9) & ~6;
428 outb(cf9|2, 0xcf9); /* Request hard reset */
429 udelay(50);
430 outb(cf9|6, 0xcf9); /* Actually do the reset */
431 udelay(50);
432 }
413 reboot_type = BOOT_KBD; 433 reboot_type = BOOT_KBD;
414 break; 434 break;
415 } 435 }
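
The new BOOT_CF9 and BOOT_CF9_COND cases reboot through I/O port 0xCF9, the so-called PCI reset register; BOOT_CF9_COND only does so when the PCI probe code has marked the port safe (port_cf9_safe, see the arch/x86/pci/direct.c hunk further down). A compact sketch of just the reset sequence, with the bit meanings spelled out as comments (the chipset-level interpretation of the bits is an assumption; the I/O sequence itself is taken from the hunk):

#include <asm/io.h>
#include <linux/delay.h>

static void cf9_hard_reset_sketch(void)
{
        u8 cf9 = inb(0xcf9) & ~6;       /* clear the reset-control bits */

        outb(cf9 | 2, 0xcf9);           /* bit 1: request a hard reset */
        udelay(50);
        outb(cf9 | 6, 0xcf9);           /* bit 2: perform the reset */
        udelay(50);
}
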
@@ -470,6 +490,11 @@ static void native_machine_restart(char *__unused)
470 490
471static void native_machine_halt(void) 491static void native_machine_halt(void)
472{ 492{
493 /* stop other cpus and apics */
494 machine_shutdown();
495
496 /* stop this cpu */
497 stop_this_cpu(NULL);
473} 498}
474 499
475static void native_machine_power_off(void) 500static void native_machine_power_off(void)
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 9d5674f7b6cc..b9018955a04f 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -583,7 +583,20 @@ static int __init setup_elfcorehdr(char *arg)
583early_param("elfcorehdr", setup_elfcorehdr); 583early_param("elfcorehdr", setup_elfcorehdr);
584#endif 584#endif
585 585
586static struct x86_quirks default_x86_quirks __initdata; 586static int __init default_update_genapic(void)
587{
588#ifdef CONFIG_X86_SMP
589# if defined(CONFIG_X86_GENERICARCH) || defined(CONFIG_X86_64)
590 genapic->wakeup_cpu = wakeup_secondary_cpu_via_init;
591# endif
592#endif
593
594 return 0;
595}
596
597static struct x86_quirks default_x86_quirks __initdata = {
598 .update_genapic = default_update_genapic,
599};
587 600
588struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; 601struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
589 602
@@ -1082,7 +1095,7 @@ void __init setup_arch(char **cmdline_p)
1082 ioapic_init_mappings(); 1095 ioapic_init_mappings();
1083 1096
1084 /* need to wait for io_apic is mapped */ 1097 /* need to wait for io_apic is mapped */
1085 nr_irqs = probe_nr_irqs(); 1098 probe_nr_irqs_gsi();
1086 1099
1087 kvm_guest_init(); 1100 kvm_guest_init();
1088 1101
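
default_update_genapic() is installed through the x86_quirks table so that generic code can call back into arch setup once an APIC driver has been selected; the mach-generic/probe.c hunks further down add the corresponding call sites. A reduced sketch of that hook pattern, with struct x86_quirks trimmed to the single member this patch touches (the _sketch names are illustrative only):

struct x86_quirks_sketch {
        int (*update_genapic)(void);
};

static int default_update_genapic_sketch(void)
{
        /* e.g. genapic->wakeup_cpu = wakeup_secondary_cpu_via_init; */
        return 0;
}

static struct x86_quirks_sketch quirks_sketch = {
        .update_genapic = default_update_genapic_sketch,
};

static void apic_driver_selected_sketch(void)
{
        if (quirks_sketch.update_genapic)
                quirks_sketch.update_genapic();
}
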
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 18f9b19f5f8f..3f92b134ab90 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -140,19 +140,6 @@ void native_send_call_func_ipi(cpumask_t mask)
140 send_IPI_mask(mask, CALL_FUNCTION_VECTOR); 140 send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
141} 141}
142 142
143static void stop_this_cpu(void *dummy)
144{
145 local_irq_disable();
146 /*
147 * Remove this CPU:
148 */
149 cpu_clear(smp_processor_id(), cpu_online_map);
150 disable_local_APIC();
151 if (hlt_works(smp_processor_id()))
152 for (;;) halt();
153 for (;;);
154}
155
156/* 143/*
157 * this function calls the 'stop' function on all other CPUs in the system. 144 * this function calls the 'stop' function on all other CPUs in the system.
158 */ 145 */
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 468c2f9d47ae..9d58134e0231 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -62,6 +62,7 @@
62#include <asm/mtrr.h> 62#include <asm/mtrr.h>
63#include <asm/vmi.h> 63#include <asm/vmi.h>
64#include <asm/genapic.h> 64#include <asm/genapic.h>
65#include <asm/setup.h>
65#include <linux/mc146818rtc.h> 66#include <linux/mc146818rtc.h>
66 67
67#include <mach_apic.h> 68#include <mach_apic.h>
@@ -530,7 +531,7 @@ static void impress_friends(void)
530 pr_debug("Before bogocount - setting activated=1.\n"); 531 pr_debug("Before bogocount - setting activated=1.\n");
531} 532}
532 533
533static inline void __inquire_remote_apic(int apicid) 534void __inquire_remote_apic(int apicid)
534{ 535{
535 unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; 536 unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
536 char *names[] = { "ID", "VERSION", "SPIV" }; 537 char *names[] = { "ID", "VERSION", "SPIV" };
@@ -569,14 +570,13 @@ static inline void __inquire_remote_apic(int apicid)
569 } 570 }
570} 571}
571 572
572#ifdef WAKE_SECONDARY_VIA_NMI
573/* 573/*
574 * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal 574 * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
575 * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this 575 * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
576 * won't ... remember to clear down the APIC, etc later. 576 * won't ... remember to clear down the APIC, etc later.
577 */ 577 */
578static int __devinit 578int __devinit
579wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) 579wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
580{ 580{
581 unsigned long send_status, accept_status = 0; 581 unsigned long send_status, accept_status = 0;
582 int maxlvt; 582 int maxlvt;
@@ -593,7 +593,7 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
593 * Give the other CPU some time to accept the IPI. 593 * Give the other CPU some time to accept the IPI.
594 */ 594 */
595 udelay(200); 595 udelay(200);
596 if (APIC_INTEGRATED(apic_version[phys_apicid])) { 596 if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
597 maxlvt = lapic_get_maxlvt(); 597 maxlvt = lapic_get_maxlvt();
598 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ 598 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
599 apic_write(APIC_ESR, 0); 599 apic_write(APIC_ESR, 0);
@@ -608,11 +608,9 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
608 608
609 return (send_status | accept_status); 609 return (send_status | accept_status);
610} 610}
611#endif /* WAKE_SECONDARY_VIA_NMI */
612 611
613#ifdef WAKE_SECONDARY_VIA_INIT 612int __devinit
614static int __devinit 613wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
615wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
616{ 614{
617 unsigned long send_status, accept_status = 0; 615 unsigned long send_status, accept_status = 0;
618 int maxlvt, num_starts, j; 616 int maxlvt, num_starts, j;
@@ -731,7 +729,6 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
731 729
732 return (send_status | accept_status); 730 return (send_status | accept_status);
733} 731}
734#endif /* WAKE_SECONDARY_VIA_INIT */
735 732
736struct create_idle { 733struct create_idle {
737 struct work_struct work; 734 struct work_struct work;
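
Dropping the WAKE_SECONDARY_VIA_* guards and the static qualifier turns the two wakeup routines into ordinary global functions, so the generic APIC layer can choose between the INIT and NMI variants at run time instead of at compile time. A sketch of that dispatch, assuming a genapic-style wakeup_cpu function pointer as installed by default_update_genapic() in the setup.c hunk above (struct genapic is reduced to the member of interest):

struct genapic_wakeup_sketch {
        int (*wakeup_cpu)(int apicid, unsigned long start_eip);
};

extern int wakeup_secondary_cpu_via_init(int phys_apicid,
                                         unsigned long start_eip);
extern int wakeup_secondary_cpu_via_nmi(int logical_apicid,
                                        unsigned long start_eip);

static struct genapic_wakeup_sketch wakeup_sketch = {
        .wakeup_cpu = wakeup_secondary_cpu_via_init,    /* default */
};

static int boot_secondary_sketch(int apicid, unsigned long start_eip)
{
        return wakeup_sketch.wakeup_cpu(apicid, start_eip);
}
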
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index a03e7f6d90c3..10786af95545 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -6,6 +6,7 @@
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <linux/stacktrace.h> 7#include <linux/stacktrace.h>
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/uaccess.h>
9#include <asm/stacktrace.h> 10#include <asm/stacktrace.h>
10 11
11static void save_stack_warning(void *data, char *msg) 12static void save_stack_warning(void *data, char *msg)
@@ -83,3 +84,66 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
83 trace->entries[trace->nr_entries++] = ULONG_MAX; 84 trace->entries[trace->nr_entries++] = ULONG_MAX;
84} 85}
85EXPORT_SYMBOL_GPL(save_stack_trace_tsk); 86EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
87
88/* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */
89
90struct stack_frame {
91 const void __user *next_fp;
92 unsigned long ret_addr;
93};
94
95static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
96{
97 int ret;
98
99 if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
100 return 0;
101
102 ret = 1;
103 pagefault_disable();
104 if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
105 ret = 0;
106 pagefault_enable();
107
108 return ret;
109}
110
111static inline void __save_stack_trace_user(struct stack_trace *trace)
112{
113 const struct pt_regs *regs = task_pt_regs(current);
114 const void __user *fp = (const void __user *)regs->bp;
115
116 if (trace->nr_entries < trace->max_entries)
117 trace->entries[trace->nr_entries++] = regs->ip;
118
119 while (trace->nr_entries < trace->max_entries) {
120 struct stack_frame frame;
121
122 frame.next_fp = NULL;
123 frame.ret_addr = 0;
124 if (!copy_stack_frame(fp, &frame))
125 break;
126 if ((unsigned long)fp < regs->sp)
127 break;
128 if (frame.ret_addr) {
129 trace->entries[trace->nr_entries++] =
130 frame.ret_addr;
131 }
132 if (fp == frame.next_fp)
133 break;
134 fp = frame.next_fp;
135 }
136}
137
138void save_stack_trace_user(struct stack_trace *trace)
139{
140 /*
141 * Trace user stack if we are not a kernel thread
142 */
143 if (current->mm) {
144 __save_stack_trace_user(trace);
145 }
146 if (trace->nr_entries < trace->max_entries)
147 trace->entries[trace->nr_entries++] = ULONG_MAX;
148}
149
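
The new user-space walker follows saved frame pointers starting from regs->bp, copying one struct stack_frame at a time with page faults disabled and stopping at the first frame that fails the sanity checks. A usage sketch for the resulting save_stack_trace_user() helper, assuming the usual struct stack_trace layout from <linux/stacktrace.h>:

#include <linux/stacktrace.h>

static void dump_user_stack_sketch(void)
{
        unsigned long entries[16];
        struct stack_trace trace = {
                .entries        = entries,
                .max_entries    = ARRAY_SIZE(entries),
                .nr_entries     = 0,
        };

        save_stack_trace_user(&trace);
        /*
         * trace.nr_entries slots are now filled with user return
         * addresses; the last one is ULONG_MAX when there was room
         * for the terminator.
         */
}
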
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 0b8b6690a86d..6f3d3d4cd973 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -17,6 +17,9 @@
17 * want per guest time just set the kernel.vsyscall64 sysctl to 0. 17 * want per guest time just set the kernel.vsyscall64 sysctl to 0.
18 */ 18 */
19 19
20/* Disable profiling for userspace code: */
21#define DISABLE_BRANCH_PROFILING
22
20#include <linux/time.h> 23#include <linux/time.h>
21#include <linux/init.h> 24#include <linux/init.h>
22#include <linux/kernel.h> 25#include <linux/kernel.h>
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index 9e68075544f6..4a20b2f9a381 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -39,7 +39,7 @@ static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned lon
39#define __do_strncpy_from_user(dst, src, count, res) \ 39#define __do_strncpy_from_user(dst, src, count, res) \
40do { \ 40do { \
41 int __d0, __d1, __d2; \ 41 int __d0, __d1, __d2; \
42 might_sleep(); \ 42 might_fault(); \
43 __asm__ __volatile__( \ 43 __asm__ __volatile__( \
44 " testl %1,%1\n" \ 44 " testl %1,%1\n" \
45 " jz 2f\n" \ 45 " jz 2f\n" \
@@ -126,7 +126,7 @@ EXPORT_SYMBOL(strncpy_from_user);
126#define __do_clear_user(addr,size) \ 126#define __do_clear_user(addr,size) \
127do { \ 127do { \
128 int __d0; \ 128 int __d0; \
129 might_sleep(); \ 129 might_fault(); \
130 __asm__ __volatile__( \ 130 __asm__ __volatile__( \
131 "0: rep; stosl\n" \ 131 "0: rep; stosl\n" \
132 " movl %2,%0\n" \ 132 " movl %2,%0\n" \
@@ -155,7 +155,7 @@ do { \
155unsigned long 155unsigned long
156clear_user(void __user *to, unsigned long n) 156clear_user(void __user *to, unsigned long n)
157{ 157{
158 might_sleep(); 158 might_fault();
159 if (access_ok(VERIFY_WRITE, to, n)) 159 if (access_ok(VERIFY_WRITE, to, n))
160 __do_clear_user(to, n); 160 __do_clear_user(to, n);
161 return n; 161 return n;
@@ -197,7 +197,7 @@ long strnlen_user(const char __user *s, long n)
197 unsigned long mask = -__addr_ok(s); 197 unsigned long mask = -__addr_ok(s);
198 unsigned long res, tmp; 198 unsigned long res, tmp;
199 199
200 might_sleep(); 200 might_fault();
201 201
202 __asm__ __volatile__( 202 __asm__ __volatile__(
203 " testl %0, %0\n" 203 " testl %0, %0\n"
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index f4df6e7c718b..64d6c84e6353 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -15,7 +15,7 @@
15#define __do_strncpy_from_user(dst,src,count,res) \ 15#define __do_strncpy_from_user(dst,src,count,res) \
16do { \ 16do { \
17 long __d0, __d1, __d2; \ 17 long __d0, __d1, __d2; \
18 might_sleep(); \ 18 might_fault(); \
19 __asm__ __volatile__( \ 19 __asm__ __volatile__( \
20 " testq %1,%1\n" \ 20 " testq %1,%1\n" \
21 " jz 2f\n" \ 21 " jz 2f\n" \
@@ -64,7 +64,7 @@ EXPORT_SYMBOL(strncpy_from_user);
64unsigned long __clear_user(void __user *addr, unsigned long size) 64unsigned long __clear_user(void __user *addr, unsigned long size)
65{ 65{
66 long __d0; 66 long __d0;
67 might_sleep(); 67 might_fault();
68 /* no memory constraint because it doesn't change any memory gcc knows 68 /* no memory constraint because it doesn't change any memory gcc knows
69 about */ 69 about */
70 asm volatile( 70 asm volatile(
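
The might_sleep() to might_fault() conversions in both usercopy files document that these helpers may sleep because touching user memory can fault, rather than asserting sleepability unconditionally. A conceptual sketch only, not a copy of the real <linux/kernel.h> definition (debug configurations may add further checks):

static inline void might_fault_sketch(void)
{
        might_sleep();
}
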
diff --git a/arch/x86/mach-generic/bigsmp.c b/arch/x86/mach-generic/bigsmp.c
index 3c3b471ea496..3624a364b7f3 100644
--- a/arch/x86/mach-generic/bigsmp.c
+++ b/arch/x86/mach-generic/bigsmp.c
@@ -17,6 +17,7 @@
17#include <asm/bigsmp/apic.h> 17#include <asm/bigsmp/apic.h>
18#include <asm/bigsmp/ipi.h> 18#include <asm/bigsmp/ipi.h>
19#include <asm/mach-default/mach_mpparse.h> 19#include <asm/mach-default/mach_mpparse.h>
20#include <asm/mach-default/mach_wakecpu.h>
20 21
21static int dmi_bigsmp; /* can be set by dmi scanners */ 22static int dmi_bigsmp; /* can be set by dmi scanners */
22 23
diff --git a/arch/x86/mach-generic/default.c b/arch/x86/mach-generic/default.c
index 9e835a11a13a..e63a4a76d8cd 100644
--- a/arch/x86/mach-generic/default.c
+++ b/arch/x86/mach-generic/default.c
@@ -16,6 +16,7 @@
16#include <asm/mach-default/mach_apic.h> 16#include <asm/mach-default/mach_apic.h>
17#include <asm/mach-default/mach_ipi.h> 17#include <asm/mach-default/mach_ipi.h>
18#include <asm/mach-default/mach_mpparse.h> 18#include <asm/mach-default/mach_mpparse.h>
19#include <asm/mach-default/mach_wakecpu.h>
19 20
20/* should be called last. */ 21/* should be called last. */
21static int probe_default(void) 22static int probe_default(void)
diff --git a/arch/x86/mach-generic/es7000.c b/arch/x86/mach-generic/es7000.c
index 28459cab3ddb..7b4e6d0d1690 100644
--- a/arch/x86/mach-generic/es7000.c
+++ b/arch/x86/mach-generic/es7000.c
@@ -16,7 +16,19 @@
16#include <asm/es7000/apic.h> 16#include <asm/es7000/apic.h>
17#include <asm/es7000/ipi.h> 17#include <asm/es7000/ipi.h>
18#include <asm/es7000/mpparse.h> 18#include <asm/es7000/mpparse.h>
19#include <asm/es7000/wakecpu.h> 19#include <asm/mach-default/mach_wakecpu.h>
20
21void __init es7000_update_genapic_to_cluster(void)
22{
23 genapic->target_cpus = target_cpus_cluster;
24 genapic->int_delivery_mode = INT_DELIVERY_MODE_CLUSTER;
25 genapic->int_dest_mode = INT_DEST_MODE_CLUSTER;
26 genapic->no_balance_irq = NO_BALANCE_IRQ_CLUSTER;
27
28 genapic->init_apic_ldr = init_apic_ldr_cluster;
29
30 genapic->cpu_mask_to_apicid = cpu_mask_to_apicid_cluster;
31}
20 32
21static int probe_es7000(void) 33static int probe_es7000(void)
22{ 34{
diff --git a/arch/x86/mach-generic/probe.c b/arch/x86/mach-generic/probe.c
index 5a7e4619e1c4..c346d9d0226f 100644
--- a/arch/x86/mach-generic/probe.c
+++ b/arch/x86/mach-generic/probe.c
@@ -15,6 +15,7 @@
15#include <asm/mpspec.h> 15#include <asm/mpspec.h>
16#include <asm/apicdef.h> 16#include <asm/apicdef.h>
17#include <asm/genapic.h> 17#include <asm/genapic.h>
18#include <asm/setup.h>
18 19
19extern struct genapic apic_numaq; 20extern struct genapic apic_numaq;
20extern struct genapic apic_summit; 21extern struct genapic apic_summit;
@@ -57,6 +58,9 @@ static int __init parse_apic(char *arg)
57 } 58 }
58 } 59 }
59 60
61 if (x86_quirks->update_genapic)
62 x86_quirks->update_genapic();
63
60 /* Parsed again by __setup for debug/verbose */ 64 /* Parsed again by __setup for debug/verbose */
61 return 0; 65 return 0;
62} 66}
@@ -72,12 +76,15 @@ void __init generic_bigsmp_probe(void)
72 * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support 76 * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support
73 */ 77 */
74 78
75 if (!cmdline_apic && genapic == &apic_default) 79 if (!cmdline_apic && genapic == &apic_default) {
76 if (apic_bigsmp.probe()) { 80 if (apic_bigsmp.probe()) {
77 genapic = &apic_bigsmp; 81 genapic = &apic_bigsmp;
82 if (x86_quirks->update_genapic)
83 x86_quirks->update_genapic();
78 printk(KERN_INFO "Overriding APIC driver with %s\n", 84 printk(KERN_INFO "Overriding APIC driver with %s\n",
79 genapic->name); 85 genapic->name);
80 } 86 }
87 }
81#endif 88#endif
82} 89}
83 90
@@ -94,6 +101,9 @@ void __init generic_apic_probe(void)
94 /* Not visible without early console */ 101 /* Not visible without early console */
95 if (!apic_probe[i]) 102 if (!apic_probe[i])
96 panic("Didn't find an APIC driver"); 103 panic("Didn't find an APIC driver");
104
105 if (x86_quirks->update_genapic)
106 x86_quirks->update_genapic();
97 } 107 }
98 printk(KERN_INFO "Using APIC driver %s\n", genapic->name); 108 printk(KERN_INFO "Using APIC driver %s\n", genapic->name);
99} 109}
@@ -108,6 +118,8 @@ int __init mps_oem_check(struct mp_config_table *mpc, char *oem,
108 if (apic_probe[i]->mps_oem_check(mpc, oem, productid)) { 118 if (apic_probe[i]->mps_oem_check(mpc, oem, productid)) {
109 if (!cmdline_apic) { 119 if (!cmdline_apic) {
110 genapic = apic_probe[i]; 120 genapic = apic_probe[i];
121 if (x86_quirks->update_genapic)
122 x86_quirks->update_genapic();
111 printk(KERN_INFO "Switched to APIC driver `%s'.\n", 123 printk(KERN_INFO "Switched to APIC driver `%s'.\n",
112 genapic->name); 124 genapic->name);
113 } 125 }
@@ -124,6 +136,8 @@ int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
124 if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) { 136 if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) {
125 if (!cmdline_apic) { 137 if (!cmdline_apic) {
126 genapic = apic_probe[i]; 138 genapic = apic_probe[i];
139 if (x86_quirks->update_genapic)
140 x86_quirks->update_genapic();
127 printk(KERN_INFO "Switched to APIC driver `%s'.\n", 141 printk(KERN_INFO "Switched to APIC driver `%s'.\n",
128 genapic->name); 142 genapic->name);
129 } 143 }
diff --git a/arch/x86/mach-generic/summit.c b/arch/x86/mach-generic/summit.c
index 6272b5e69da6..2c6d234e0009 100644
--- a/arch/x86/mach-generic/summit.c
+++ b/arch/x86/mach-generic/summit.c
@@ -16,6 +16,7 @@
16#include <asm/summit/apic.h> 16#include <asm/summit/apic.h>
17#include <asm/summit/ipi.h> 17#include <asm/summit/ipi.h>
18#include <asm/summit/mpparse.h> 18#include <asm/summit/mpparse.h>
19#include <asm/mach-default/mach_wakecpu.h>
19 20
20static int probe_summit(void) 21static int probe_summit(void)
21{ 22{
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index fea4565ff576..d8cc96a2738f 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -8,9 +8,8 @@ obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o
8 8
9obj-$(CONFIG_HIGHMEM) += highmem_32.o 9obj-$(CONFIG_HIGHMEM) += highmem_32.o
10 10
11obj-$(CONFIG_MMIOTRACE_HOOKS) += kmmio.o
12obj-$(CONFIG_MMIOTRACE) += mmiotrace.o 11obj-$(CONFIG_MMIOTRACE) += mmiotrace.o
13mmiotrace-y := pf_in.o mmio-mod.o 12mmiotrace-y := kmmio.o pf_in.o mmio-mod.o
14obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o 13obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
15 14
16obj-$(CONFIG_NUMA) += numa_$(BITS).o 15obj-$(CONFIG_NUMA) += numa_$(BITS).o
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 31e8730fa246..21e996a70d68 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -53,7 +53,7 @@
53 53
54static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) 54static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
55{ 55{
56#ifdef CONFIG_MMIOTRACE_HOOKS 56#ifdef CONFIG_MMIOTRACE
57 if (unlikely(is_kmmio_active())) 57 if (unlikely(is_kmmio_active()))
58 if (kmmio_handler(regs, addr) == 1) 58 if (kmmio_handler(regs, addr) == 1)
59 return -1; 59 return -1;
@@ -413,6 +413,7 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
413 unsigned long error_code) 413 unsigned long error_code)
414{ 414{
415 unsigned long flags = oops_begin(); 415 unsigned long flags = oops_begin();
416 int sig = SIGKILL;
416 struct task_struct *tsk; 417 struct task_struct *tsk;
417 418
418 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", 419 printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
@@ -423,8 +424,8 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
423 tsk->thread.trap_no = 14; 424 tsk->thread.trap_no = 14;
424 tsk->thread.error_code = error_code; 425 tsk->thread.error_code = error_code;
425 if (__die("Bad pagetable", regs, error_code)) 426 if (__die("Bad pagetable", regs, error_code))
426 regs = NULL; 427 sig = 0;
427 oops_end(flags, regs, SIGKILL); 428 oops_end(flags, regs, sig);
428} 429}
429#endif 430#endif
430 431
@@ -590,6 +591,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
590 int fault; 591 int fault;
591#ifdef CONFIG_X86_64 592#ifdef CONFIG_X86_64
592 unsigned long flags; 593 unsigned long flags;
594 int sig;
593#endif 595#endif
594 596
595 tsk = current; 597 tsk = current;
@@ -849,11 +851,12 @@ no_context:
849 bust_spinlocks(0); 851 bust_spinlocks(0);
850 do_exit(SIGKILL); 852 do_exit(SIGKILL);
851#else 853#else
854 sig = SIGKILL;
852 if (__die("Oops", regs, error_code)) 855 if (__die("Oops", regs, error_code))
853 regs = NULL; 856 sig = 0;
854 /* Executive summary in case the body of the oops scrolled away */ 857 /* Executive summary in case the body of the oops scrolled away */
855 printk(KERN_EMERG "CR2: %016lx\n", address); 858 printk(KERN_EMERG "CR2: %016lx\n", address);
856 oops_end(flags, regs, SIGKILL); 859 oops_end(flags, regs, sig);
857#endif 860#endif
858 861
859/* 862/*
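
In the fault.c oops paths the pt_regs pointer now stays intact and the decision is carried in a separate sig argument: passing sig == 0 tells the reworked oops_end() in the new dumpstack code not to kill the task, where the old code expressed the same thing by nulling regs. A condensed sketch of the new shape of that path; oops_end()'s behaviour is paraphrased from the patch, not quoted:

static void oops_path_sketch(struct pt_regs *regs, unsigned long flags,
                             unsigned long error_code)
{
        int sig = SIGKILL;

        if (__die("Oops", regs, error_code))
                sig = 0;        /* a notifier handled it: don't kill */
        oops_end(flags, regs, sig);
}
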
diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c
index 9915293500fb..9a5af6c8fbe9 100644
--- a/arch/x86/pci/direct.c
+++ b/arch/x86/pci/direct.c
@@ -173,7 +173,7 @@ static int pci_conf2_write(unsigned int seg, unsigned int bus,
173 173
174#undef PCI_CONF2_ADDRESS 174#undef PCI_CONF2_ADDRESS
175 175
176static struct pci_raw_ops pci_direct_conf2 = { 176struct pci_raw_ops pci_direct_conf2 = {
177 .read = pci_conf2_read, 177 .read = pci_conf2_read,
178 .write = pci_conf2_write, 178 .write = pci_conf2_write,
179}; 179};
@@ -289,6 +289,7 @@ int __init pci_direct_probe(void)
289 289
290 if (pci_check_type1()) { 290 if (pci_check_type1()) {
291 raw_pci_ops = &pci_direct_conf1; 291 raw_pci_ops = &pci_direct_conf1;
292 port_cf9_safe = true;
292 return 1; 293 return 1;
293 } 294 }
294 release_resource(region); 295 release_resource(region);
@@ -305,6 +306,7 @@ int __init pci_direct_probe(void)
305 306
306 if (pci_check_type2()) { 307 if (pci_check_type2()) {
307 raw_pci_ops = &pci_direct_conf2; 308 raw_pci_ops = &pci_direct_conf2;
309 port_cf9_safe = true;
308 return 2; 310 return 2;
309 } 311 }
310 312
diff --git a/arch/x86/pci/pci.h b/arch/x86/pci/pci.h
index 15b9cf6be729..1959018aac02 100644
--- a/arch/x86/pci/pci.h
+++ b/arch/x86/pci/pci.h
@@ -96,6 +96,7 @@ extern struct pci_raw_ops *raw_pci_ops;
96extern struct pci_raw_ops *raw_pci_ext_ops; 96extern struct pci_raw_ops *raw_pci_ext_ops;
97 97
98extern struct pci_raw_ops pci_direct_conf1; 98extern struct pci_raw_ops pci_direct_conf1;
99extern bool port_cf9_safe;
99 100
100/* arch_initcall level */ 101/* arch_initcall level */
101extern int pci_direct_probe(void); 102extern int pci_direct_probe(void);
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 1ef0f90813d6..d9d35824c56f 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -9,6 +9,9 @@
9 * Also alternative() doesn't work. 9 * Also alternative() doesn't work.
10 */ 10 */
11 11
12/* Disable profiling for userspace code: */
13#define DISABLE_BRANCH_PROFILING
14
12#include <linux/kernel.h> 15#include <linux/kernel.h>
13#include <linux/posix-timers.h> 16#include <linux/posix-timers.h>
14#include <linux/time.h> 17#include <linux/time.h>
diff --git a/block/Kconfig b/block/Kconfig
index 1ab7c15c8d7a..290b219fad9c 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -47,6 +47,7 @@ config BLK_DEV_IO_TRACE
47 depends on SYSFS 47 depends on SYSFS
48 select RELAY 48 select RELAY
49 select DEBUG_FS 49 select DEBUG_FS
50 select TRACEPOINTS
50 help 51 help
51 Say Y here if you want to be able to trace the block layer actions 52 Say Y here if you want to be able to trace the block layer actions
52 on a given queue. Tracing allows you to see any traffic happening 53 on a given queue. Tracing allows you to see any traffic happening
diff --git a/block/blk-core.c b/block/blk-core.c
index c36aa98fafa3..561e8a1b43a4 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -28,9 +28,23 @@
28#include <linux/task_io_accounting_ops.h> 28#include <linux/task_io_accounting_ops.h>
29#include <linux/blktrace_api.h> 29#include <linux/blktrace_api.h>
30#include <linux/fault-inject.h> 30#include <linux/fault-inject.h>
31#include <trace/block.h>
31 32
32#include "blk.h" 33#include "blk.h"
33 34
35DEFINE_TRACE(block_plug);
36DEFINE_TRACE(block_unplug_io);
37DEFINE_TRACE(block_unplug_timer);
38DEFINE_TRACE(block_getrq);
39DEFINE_TRACE(block_sleeprq);
40DEFINE_TRACE(block_rq_requeue);
41DEFINE_TRACE(block_bio_backmerge);
42DEFINE_TRACE(block_bio_frontmerge);
43DEFINE_TRACE(block_bio_queue);
44DEFINE_TRACE(block_rq_complete);
45DEFINE_TRACE(block_remap); /* Also used in drivers/md/dm.c */
46EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap);
47
34static int __make_request(struct request_queue *q, struct bio *bio); 48static int __make_request(struct request_queue *q, struct bio *bio);
35 49
36/* 50/*
@@ -205,7 +219,7 @@ void blk_plug_device(struct request_queue *q)
205 219
206 if (!queue_flag_test_and_set(QUEUE_FLAG_PLUGGED, q)) { 220 if (!queue_flag_test_and_set(QUEUE_FLAG_PLUGGED, q)) {
207 mod_timer(&q->unplug_timer, jiffies + q->unplug_delay); 221 mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
208 blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG); 222 trace_block_plug(q);
209 } 223 }
210} 224}
211EXPORT_SYMBOL(blk_plug_device); 225EXPORT_SYMBOL(blk_plug_device);
@@ -292,9 +306,7 @@ void blk_unplug_work(struct work_struct *work)
292 struct request_queue *q = 306 struct request_queue *q =
293 container_of(work, struct request_queue, unplug_work); 307 container_of(work, struct request_queue, unplug_work);
294 308
295 blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL, 309 trace_block_unplug_io(q);
296 q->rq.count[READ] + q->rq.count[WRITE]);
297
298 q->unplug_fn(q); 310 q->unplug_fn(q);
299} 311}
300 312
@@ -302,9 +314,7 @@ void blk_unplug_timeout(unsigned long data)
302{ 314{
303 struct request_queue *q = (struct request_queue *)data; 315 struct request_queue *q = (struct request_queue *)data;
304 316
305 blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL, 317 trace_block_unplug_timer(q);
306 q->rq.count[READ] + q->rq.count[WRITE]);
307
308 kblockd_schedule_work(q, &q->unplug_work); 318 kblockd_schedule_work(q, &q->unplug_work);
309} 319}
310 320
@@ -314,9 +324,7 @@ void blk_unplug(struct request_queue *q)
314 * devices don't necessarily have an ->unplug_fn defined 324 * devices don't necessarily have an ->unplug_fn defined
315 */ 325 */
316 if (q->unplug_fn) { 326 if (q->unplug_fn) {
317 blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL, 327 trace_block_unplug_io(q);
318 q->rq.count[READ] + q->rq.count[WRITE]);
319
320 q->unplug_fn(q); 328 q->unplug_fn(q);
321 } 329 }
322} 330}
@@ -822,7 +830,7 @@ rq_starved:
822 if (ioc_batching(q, ioc)) 830 if (ioc_batching(q, ioc))
823 ioc->nr_batch_requests--; 831 ioc->nr_batch_requests--;
824 832
825 blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ); 833 trace_block_getrq(q, bio, rw);
826out: 834out:
827 return rq; 835 return rq;
828} 836}
@@ -848,7 +856,7 @@ static struct request *get_request_wait(struct request_queue *q, int rw_flags,
848 prepare_to_wait_exclusive(&rl->wait[rw], &wait, 856 prepare_to_wait_exclusive(&rl->wait[rw], &wait,
849 TASK_UNINTERRUPTIBLE); 857 TASK_UNINTERRUPTIBLE);
850 858
851 blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ); 859 trace_block_sleeprq(q, bio, rw);
852 860
853 __generic_unplug_device(q); 861 __generic_unplug_device(q);
854 spin_unlock_irq(q->queue_lock); 862 spin_unlock_irq(q->queue_lock);
@@ -928,7 +936,7 @@ void blk_requeue_request(struct request_queue *q, struct request *rq)
928{ 936{
929 blk_delete_timer(rq); 937 blk_delete_timer(rq);
930 blk_clear_rq_complete(rq); 938 blk_clear_rq_complete(rq);
931 blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); 939 trace_block_rq_requeue(q, rq);
932 940
933 if (blk_rq_tagged(rq)) 941 if (blk_rq_tagged(rq))
934 blk_queue_end_tag(q, rq); 942 blk_queue_end_tag(q, rq);
@@ -1167,7 +1175,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
1167 if (!ll_back_merge_fn(q, req, bio)) 1175 if (!ll_back_merge_fn(q, req, bio))
1168 break; 1176 break;
1169 1177
1170 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); 1178 trace_block_bio_backmerge(q, bio);
1171 1179
1172 req->biotail->bi_next = bio; 1180 req->biotail->bi_next = bio;
1173 req->biotail = bio; 1181 req->biotail = bio;
@@ -1186,7 +1194,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
1186 if (!ll_front_merge_fn(q, req, bio)) 1194 if (!ll_front_merge_fn(q, req, bio))
1187 break; 1195 break;
1188 1196
1189 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); 1197 trace_block_bio_frontmerge(q, bio);
1190 1198
1191 bio->bi_next = req->bio; 1199 bio->bi_next = req->bio;
1192 req->bio = bio; 1200 req->bio = bio;
@@ -1269,7 +1277,7 @@ static inline void blk_partition_remap(struct bio *bio)
1269 bio->bi_sector += p->start_sect; 1277 bio->bi_sector += p->start_sect;
1270 bio->bi_bdev = bdev->bd_contains; 1278 bio->bi_bdev = bdev->bd_contains;
1271 1279
1272 blk_add_trace_remap(bdev_get_queue(bio->bi_bdev), bio, 1280 trace_block_remap(bdev_get_queue(bio->bi_bdev), bio,
1273 bdev->bd_dev, bio->bi_sector, 1281 bdev->bd_dev, bio->bi_sector,
1274 bio->bi_sector - p->start_sect); 1282 bio->bi_sector - p->start_sect);
1275 } 1283 }
@@ -1441,10 +1449,10 @@ end_io:
1441 goto end_io; 1449 goto end_io;
1442 1450
1443 if (old_sector != -1) 1451 if (old_sector != -1)
1444 blk_add_trace_remap(q, bio, old_dev, bio->bi_sector, 1452 trace_block_remap(q, bio, old_dev, bio->bi_sector,
1445 old_sector); 1453 old_sector);
1446 1454
1447 blk_add_trace_bio(q, bio, BLK_TA_QUEUE); 1455 trace_block_bio_queue(q, bio);
1448 1456
1449 old_sector = bio->bi_sector; 1457 old_sector = bio->bi_sector;
1450 old_dev = bio->bi_bdev->bd_dev; 1458 old_dev = bio->bi_bdev->bd_dev;
@@ -1678,7 +1686,7 @@ static int __end_that_request_first(struct request *req, int error,
1678 int total_bytes, bio_nbytes, next_idx = 0; 1686 int total_bytes, bio_nbytes, next_idx = 0;
1679 struct bio *bio; 1687 struct bio *bio;
1680 1688
1681 blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE); 1689 trace_block_rq_complete(req->q, req);
1682 1690
1683 /* 1691 /*
1684 * for a REQ_TYPE_BLOCK_PC request, we want to carry any eventual 1692 * for a REQ_TYPE_BLOCK_PC request, we want to carry any eventual
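
The blk_add_trace_*() calls removed above become generic tracepoints: each event is declared once (in <trace/block.h> here), defined in exactly one compilation unit with DEFINE_TRACE(), fired with a trace_*() call that stays inactive until a probe attaches, and consumed through register_trace_*() as blktrace.c does below. A sketch of the complete pattern for a single made-up event, using the TPPROTO/TPARGS spelling of this kernel generation:

#include <linux/tracepoint.h>

/* normally lives in a shared header such as <trace/block.h> */
DECLARE_TRACE(block_plug_sketch,
        TPPROTO(struct request_queue *q),
        TPARGS(q));

/* exactly one definition site per tracepoint */
DEFINE_TRACE(block_plug_sketch);

static void event_site_sketch(struct request_queue *q)
{
        trace_block_plug_sketch(q);     /* near no-op until a probe registers */
}

/* consumer side, e.g. a tracer like blktrace */
static void plug_probe_sketch(struct request_queue *q)
{
        /* record the event */
}

static int attach_probe_sketch(void)
{
        return register_trace_block_plug_sketch(plug_probe_sketch);
}
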
diff --git a/block/blktrace.c b/block/blktrace.c
index 85049a7e7a17..b0a2cae886db 100644
--- a/block/blktrace.c
+++ b/block/blktrace.c
@@ -23,10 +23,18 @@
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/debugfs.h> 24#include <linux/debugfs.h>
25#include <linux/time.h> 25#include <linux/time.h>
26#include <trace/block.h>
26#include <asm/uaccess.h> 27#include <asm/uaccess.h>
27 28
28static unsigned int blktrace_seq __read_mostly = 1; 29static unsigned int blktrace_seq __read_mostly = 1;
29 30
31/* Global reference count of probes */
32static DEFINE_MUTEX(blk_probe_mutex);
33static atomic_t blk_probes_ref = ATOMIC_INIT(0);
34
35static int blk_register_tracepoints(void);
36static void blk_unregister_tracepoints(void);
37
30/* 38/*
31 * Send out a notify message. 39 * Send out a notify message.
32 */ 40 */
@@ -119,7 +127,7 @@ static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK
119 * The worker for the various blk_add_trace*() types. Fills out a 127 * The worker for the various blk_add_trace*() types. Fills out a
120 * blk_io_trace structure and places it in a per-cpu subbuffer. 128 * blk_io_trace structure and places it in a per-cpu subbuffer.
121 */ 129 */
122void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, 130static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
123 int rw, u32 what, int error, int pdu_len, void *pdu_data) 131 int rw, u32 what, int error, int pdu_len, void *pdu_data)
124{ 132{
125 struct task_struct *tsk = current; 133 struct task_struct *tsk = current;
@@ -177,8 +185,6 @@ void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
177 local_irq_restore(flags); 185 local_irq_restore(flags);
178} 186}
179 187
180EXPORT_SYMBOL_GPL(__blk_add_trace);
181
182static struct dentry *blk_tree_root; 188static struct dentry *blk_tree_root;
183static DEFINE_MUTEX(blk_tree_mutex); 189static DEFINE_MUTEX(blk_tree_mutex);
184static unsigned int root_users; 190static unsigned int root_users;
@@ -237,6 +243,10 @@ static void blk_trace_cleanup(struct blk_trace *bt)
237 free_percpu(bt->sequence); 243 free_percpu(bt->sequence);
238 free_percpu(bt->msg_data); 244 free_percpu(bt->msg_data);
239 kfree(bt); 245 kfree(bt);
246 mutex_lock(&blk_probe_mutex);
247 if (atomic_dec_and_test(&blk_probes_ref))
248 blk_unregister_tracepoints();
249 mutex_unlock(&blk_probe_mutex);
240} 250}
241 251
242int blk_trace_remove(struct request_queue *q) 252int blk_trace_remove(struct request_queue *q)
@@ -428,6 +438,14 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
428 bt->pid = buts->pid; 438 bt->pid = buts->pid;
429 bt->trace_state = Blktrace_setup; 439 bt->trace_state = Blktrace_setup;
430 440
441 mutex_lock(&blk_probe_mutex);
442 if (atomic_add_return(1, &blk_probes_ref) == 1) {
443 ret = blk_register_tracepoints();
444 if (ret)
445 goto probe_err;
446 }
447 mutex_unlock(&blk_probe_mutex);
448
431 ret = -EBUSY; 449 ret = -EBUSY;
432 old_bt = xchg(&q->blk_trace, bt); 450 old_bt = xchg(&q->blk_trace, bt);
433 if (old_bt) { 451 if (old_bt) {
@@ -436,6 +454,9 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
436 } 454 }
437 455
438 return 0; 456 return 0;
457probe_err:
458 atomic_dec(&blk_probes_ref);
459 mutex_unlock(&blk_probe_mutex);
439err: 460err:
440 if (dir) 461 if (dir)
441 blk_remove_tree(dir); 462 blk_remove_tree(dir);
@@ -562,3 +583,308 @@ void blk_trace_shutdown(struct request_queue *q)
562 blk_trace_remove(q); 583 blk_trace_remove(q);
563 } 584 }
564} 585}
586
587/*
588 * blktrace probes
589 */
590
591/**
592 * blk_add_trace_rq - Add a trace for a request oriented action
593 * @q: queue the io is for
594 * @rq: the source request
595 * @what: the action
596 *
597 * Description:
598 * Records an action against a request. Will log the bio offset + size.
599 *
600 **/
601static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
602 u32 what)
603{
604 struct blk_trace *bt = q->blk_trace;
605 int rw = rq->cmd_flags & 0x03;
606
607 if (likely(!bt))
608 return;
609
610 if (blk_discard_rq(rq))
611 rw |= (1 << BIO_RW_DISCARD);
612
613 if (blk_pc_request(rq)) {
614 what |= BLK_TC_ACT(BLK_TC_PC);
615 __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors,
616 sizeof(rq->cmd), rq->cmd);
617 } else {
618 what |= BLK_TC_ACT(BLK_TC_FS);
619 __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
620 rw, what, rq->errors, 0, NULL);
621 }
622}
623
624static void blk_add_trace_rq_abort(struct request_queue *q, struct request *rq)
625{
626 blk_add_trace_rq(q, rq, BLK_TA_ABORT);
627}
628
629static void blk_add_trace_rq_insert(struct request_queue *q, struct request *rq)
630{
631 blk_add_trace_rq(q, rq, BLK_TA_INSERT);
632}
633
634static void blk_add_trace_rq_issue(struct request_queue *q, struct request *rq)
635{
636 blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
637}
638
639static void blk_add_trace_rq_requeue(struct request_queue *q, struct request *rq)
640{
641 blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
642}
643
644static void blk_add_trace_rq_complete(struct request_queue *q, struct request *rq)
645{
646 blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);
647}
648
649/**
650 * blk_add_trace_bio - Add a trace for a bio oriented action
651 * @q: queue the io is for
652 * @bio: the source bio
653 * @what: the action
654 *
655 * Description:
656 * Records an action against a bio. Will log the bio offset + size.
657 *
658 **/
659static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
660 u32 what)
661{
662 struct blk_trace *bt = q->blk_trace;
663
664 if (likely(!bt))
665 return;
666
667 __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what,
668 !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
669}
670
671static void blk_add_trace_bio_bounce(struct request_queue *q, struct bio *bio)
672{
673 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE);
674}
675
676static void blk_add_trace_bio_complete(struct request_queue *q, struct bio *bio)
677{
678 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE);
679}
680
681static void blk_add_trace_bio_backmerge(struct request_queue *q, struct bio *bio)
682{
683 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
684}
685
686static void blk_add_trace_bio_frontmerge(struct request_queue *q, struct bio *bio)
687{
688 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
689}
690
691static void blk_add_trace_bio_queue(struct request_queue *q, struct bio *bio)
692{
693 blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
694}
695
696static void blk_add_trace_getrq(struct request_queue *q, struct bio *bio, int rw)
697{
698 if (bio)
699 blk_add_trace_bio(q, bio, BLK_TA_GETRQ);
700 else {
701 struct blk_trace *bt = q->blk_trace;
702
703 if (bt)
704 __blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL);
705 }
706}
707
708
709static void blk_add_trace_sleeprq(struct request_queue *q, struct bio *bio, int rw)
710{
711 if (bio)
712 blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ);
713 else {
714 struct blk_trace *bt = q->blk_trace;
715
716 if (bt)
717 __blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ, 0, 0, NULL);
718 }
719}
720
721static void blk_add_trace_plug(struct request_queue *q)
722{
723 struct blk_trace *bt = q->blk_trace;
724
725 if (bt)
726 __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
727}
728
729static void blk_add_trace_unplug_io(struct request_queue *q)
730{
731 struct blk_trace *bt = q->blk_trace;
732
733 if (bt) {
734 unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE];
735 __be64 rpdu = cpu_to_be64(pdu);
736
737 __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0,
738 sizeof(rpdu), &rpdu);
739 }
740}
741
742static void blk_add_trace_unplug_timer(struct request_queue *q)
743{
744 struct blk_trace *bt = q->blk_trace;
745
746 if (bt) {
747 unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE];
748 __be64 rpdu = cpu_to_be64(pdu);
749
750 __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_TIMER, 0,
751 sizeof(rpdu), &rpdu);
752 }
753}
754
755static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
756 unsigned int pdu)
757{
758 struct blk_trace *bt = q->blk_trace;
759
760 if (bt) {
761 __be64 rpdu = cpu_to_be64(pdu);
762
763 __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw,
764 BLK_TA_SPLIT, !bio_flagged(bio, BIO_UPTODATE),
765 sizeof(rpdu), &rpdu);
766 }
767}
768
769/**
770 * blk_add_trace_remap - Add a trace for a remap operation
771 * @q: queue the io is for
772 * @bio: the source bio
773 * @dev: target device
774 * @from: source sector
775 * @to: target sector
776 *
777 * Description:
778 * Device mapper or raid target sometimes need to split a bio because
779 * it spans a stripe (or similar). Add a trace for that action.
780 *
781 **/
782static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
783 dev_t dev, sector_t from, sector_t to)
784{
785 struct blk_trace *bt = q->blk_trace;
786 struct blk_io_trace_remap r;
787
788 if (likely(!bt))
789 return;
790
791 r.device = cpu_to_be32(dev);
792 r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev);
793 r.sector = cpu_to_be64(to);
794
795 __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP,
796 !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
797}
798
799/**
800 * blk_add_driver_data - Add binary message with driver-specific data
801 * @q: queue the io is for
802 * @rq: io request
803 * @data: driver-specific data
804 * @len: length of driver-specific data
805 *
806 * Description:
807 * Some drivers might want to write driver-specific data per request.
808 *
809 **/
810void blk_add_driver_data(struct request_queue *q,
811 struct request *rq,
812 void *data, size_t len)
813{
814 struct blk_trace *bt = q->blk_trace;
815
816 if (likely(!bt))
817 return;
818
819 if (blk_pc_request(rq))
820 __blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA,
821 rq->errors, len, data);
822 else
823 __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
824 0, BLK_TA_DRV_DATA, rq->errors, len, data);
825}
826EXPORT_SYMBOL_GPL(blk_add_driver_data);
827
828static int blk_register_tracepoints(void)
829{
830 int ret;
831
832 ret = register_trace_block_rq_abort(blk_add_trace_rq_abort);
833 WARN_ON(ret);
834 ret = register_trace_block_rq_insert(blk_add_trace_rq_insert);
835 WARN_ON(ret);
836 ret = register_trace_block_rq_issue(blk_add_trace_rq_issue);
837 WARN_ON(ret);
838 ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue);
839 WARN_ON(ret);
840 ret = register_trace_block_rq_complete(blk_add_trace_rq_complete);
841 WARN_ON(ret);
842 ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce);
843 WARN_ON(ret);
844 ret = register_trace_block_bio_complete(blk_add_trace_bio_complete);
845 WARN_ON(ret);
846 ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge);
847 WARN_ON(ret);
848 ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge);
849 WARN_ON(ret);
850 ret = register_trace_block_bio_queue(blk_add_trace_bio_queue);
851 WARN_ON(ret);
852 ret = register_trace_block_getrq(blk_add_trace_getrq);
853 WARN_ON(ret);
854 ret = register_trace_block_sleeprq(blk_add_trace_sleeprq);
855 WARN_ON(ret);
856 ret = register_trace_block_plug(blk_add_trace_plug);
857 WARN_ON(ret);
858 ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer);
859 WARN_ON(ret);
860 ret = register_trace_block_unplug_io(blk_add_trace_unplug_io);
861 WARN_ON(ret);
862 ret = register_trace_block_split(blk_add_trace_split);
863 WARN_ON(ret);
864 ret = register_trace_block_remap(blk_add_trace_remap);
865 WARN_ON(ret);
866 return 0;
867}
868
869static void blk_unregister_tracepoints(void)
870{
871 unregister_trace_block_remap(blk_add_trace_remap);
872 unregister_trace_block_split(blk_add_trace_split);
873 unregister_trace_block_unplug_io(blk_add_trace_unplug_io);
874 unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer);
875 unregister_trace_block_plug(blk_add_trace_plug);
876 unregister_trace_block_sleeprq(blk_add_trace_sleeprq);
877 unregister_trace_block_getrq(blk_add_trace_getrq);
878 unregister_trace_block_bio_queue(blk_add_trace_bio_queue);
879 unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge);
880 unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge);
881 unregister_trace_block_bio_complete(blk_add_trace_bio_complete);
882 unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce);
883 unregister_trace_block_rq_complete(blk_add_trace_rq_complete);
884 unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue);
885 unregister_trace_block_rq_issue(blk_add_trace_rq_issue);
886 unregister_trace_block_rq_insert(blk_add_trace_rq_insert);
887 unregister_trace_block_rq_abort(blk_add_trace_rq_abort);
888
889 tracepoint_synchronize_unregister();
890}
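
blktrace only needs its probes attached while at least one trace is active, so registration is reference counted under blk_probe_mutex: the first do_blk_trace_setup() registers every probe, the last blk_trace_cleanup() unregisters them, and tracepoint_synchronize_unregister() waits for in-flight probes to finish. A condensed sketch of that pattern with the blktrace-specific register/unregister bodies elided:

static DEFINE_MUTEX(probe_mutex_sketch);
static atomic_t probes_ref_sketch = ATOMIC_INIT(0);

static int trace_setup_sketch(void)
{
        int ret = 0;

        mutex_lock(&probe_mutex_sketch);
        if (atomic_add_return(1, &probes_ref_sketch) == 1) {
                ret = blk_register_tracepoints();
                if (ret)
                        atomic_dec(&probes_ref_sketch);
        }
        mutex_unlock(&probe_mutex_sketch);

        return ret;
}

static void trace_cleanup_sketch(void)
{
        mutex_lock(&probe_mutex_sketch);
        if (atomic_dec_and_test(&probes_ref_sketch))
                blk_unregister_tracepoints();
        mutex_unlock(&probe_mutex_sketch);
}
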
diff --git a/block/elevator.c b/block/elevator.c
index a6951f76ba0c..86836dd179c0 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -33,6 +33,7 @@
33#include <linux/compiler.h> 33#include <linux/compiler.h>
34#include <linux/delay.h> 34#include <linux/delay.h>
35#include <linux/blktrace_api.h> 35#include <linux/blktrace_api.h>
36#include <trace/block.h>
36#include <linux/hash.h> 37#include <linux/hash.h>
37#include <linux/uaccess.h> 38#include <linux/uaccess.h>
38 39
@@ -41,6 +42,8 @@
41static DEFINE_SPINLOCK(elv_list_lock); 42static DEFINE_SPINLOCK(elv_list_lock);
42static LIST_HEAD(elv_list); 43static LIST_HEAD(elv_list);
43 44
45DEFINE_TRACE(block_rq_abort);
46
44/* 47/*
45 * Merge hash stuff. 48 * Merge hash stuff.
46 */ 49 */
@@ -52,6 +55,9 @@ static const int elv_hash_shift = 6;
52#define rq_hash_key(rq) ((rq)->sector + (rq)->nr_sectors) 55#define rq_hash_key(rq) ((rq)->sector + (rq)->nr_sectors)
53#define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash)) 56#define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash))
54 57
58DEFINE_TRACE(block_rq_insert);
59DEFINE_TRACE(block_rq_issue);
60
55/* 61/*
56 * Query io scheduler to see if the current process issuing bio may be 62 * Query io scheduler to see if the current process issuing bio may be
57 * merged with rq. 63 * merged with rq.
@@ -586,7 +592,7 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
586 unsigned ordseq; 592 unsigned ordseq;
587 int unplug_it = 1; 593 int unplug_it = 1;
588 594
589 blk_add_trace_rq(q, rq, BLK_TA_INSERT); 595 trace_block_rq_insert(q, rq);
590 596
591 rq->q = q; 597 rq->q = q;
592 598
@@ -772,7 +778,7 @@ struct request *elv_next_request(struct request_queue *q)
772 * not be passed by new incoming requests 778 * not be passed by new incoming requests
773 */ 779 */
774 rq->cmd_flags |= REQ_STARTED; 780 rq->cmd_flags |= REQ_STARTED;
775 blk_add_trace_rq(q, rq, BLK_TA_ISSUE); 781 trace_block_rq_issue(q, rq);
776 } 782 }
777 783
778 if (!q->boundary_rq || q->boundary_rq == rq) { 784 if (!q->boundary_rq || q->boundary_rq == rq) {
@@ -914,7 +920,7 @@ void elv_abort_queue(struct request_queue *q)
914 while (!list_empty(&q->queue_head)) { 920 while (!list_empty(&q->queue_head)) {
915 rq = list_entry_rq(q->queue_head.next); 921 rq = list_entry_rq(q->queue_head.next);
916 rq->cmd_flags |= REQ_QUIET; 922 rq->cmd_flags |= REQ_QUIET;
917 blk_add_trace_rq(q, rq, BLK_TA_ABORT); 923 trace_block_rq_abort(q, rq);
918 __blk_end_request(rq, -EIO, blk_rq_bytes(rq)); 924 __blk_end_request(rq, -EIO, blk_rq_bytes(rq));
919 } 925 }
920} 926}
diff --git a/drivers/char/random.c b/drivers/char/random.c
index 675076f5fca8..d26891bfcd41 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -558,23 +558,9 @@ struct timer_rand_state {
558 unsigned dont_count_entropy:1; 558 unsigned dont_count_entropy:1;
559}; 559};
560 560
561static struct timer_rand_state *irq_timer_state[NR_IRQS]; 561#ifndef CONFIG_SPARSE_IRQ
562 562struct timer_rand_state *irq_timer_state[NR_IRQS];
563static struct timer_rand_state *get_timer_rand_state(unsigned int irq) 563#endif
564{
565 if (irq >= nr_irqs)
566 return NULL;
567
568 return irq_timer_state[irq];
569}
570
571static void set_timer_rand_state(unsigned int irq, struct timer_rand_state *state)
572{
573 if (irq >= nr_irqs)
574 return;
575
576 irq_timer_state[irq] = state;
577}
578 564
579static struct timer_rand_state input_timer_state; 565static struct timer_rand_state input_timer_state;
580 566
@@ -933,8 +919,10 @@ void rand_initialize_irq(int irq)
933{ 919{
934 struct timer_rand_state *state; 920 struct timer_rand_state *state;
935 921
922#ifndef CONFIG_SPARSE_IRQ
936 if (irq >= nr_irqs) 923 if (irq >= nr_irqs)
937 return; 924 return;
925#endif
938 926
939 state = get_timer_rand_state(irq); 927 state = get_timer_rand_state(irq);
940 928
diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c
index ce0d9da52a8a..94966edfb44d 100644
--- a/drivers/char/sysrq.c
+++ b/drivers/char/sysrq.c
@@ -274,6 +274,22 @@ static struct sysrq_key_op sysrq_showstate_blocked_op = {
274 .enable_mask = SYSRQ_ENABLE_DUMP, 274 .enable_mask = SYSRQ_ENABLE_DUMP,
275}; 275};
276 276
277#ifdef CONFIG_TRACING
278#include <linux/ftrace.h>
279
280static void sysrq_ftrace_dump(int key, struct tty_struct *tty)
281{
282 ftrace_dump();
283}
284static struct sysrq_key_op sysrq_ftrace_dump_op = {
285 .handler = sysrq_ftrace_dump,
286 .help_msg = "dumpZ-ftrace-buffer",
287 .action_msg = "Dump ftrace buffer",
288 .enable_mask = SYSRQ_ENABLE_DUMP,
289};
290#else
291#define sysrq_ftrace_dump_op (*(struct sysrq_key_op *)0)
292#endif
277 293
278static void sysrq_handle_showmem(int key, struct tty_struct *tty) 294static void sysrq_handle_showmem(int key, struct tty_struct *tty)
279{ 295{
@@ -406,7 +422,7 @@ static struct sysrq_key_op *sysrq_key_table[36] = {
406 NULL, /* x */ 422 NULL, /* x */
407 /* y: May be registered on sparc64 for global register dump */ 423 /* y: May be registered on sparc64 for global register dump */
408 NULL, /* y */ 424 NULL, /* y */
409 NULL /* z */ 425 &sysrq_ftrace_dump_op, /* z */
410}; 426};
411 427
412/* key2index calculation, -1 on invalid index */ 428/* key2index calculation, -1 on invalid index */
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index c99e4728ff41..343094c3feeb 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -21,6 +21,7 @@
21#include <linux/idr.h> 21#include <linux/idr.h>
22#include <linux/hdreg.h> 22#include <linux/hdreg.h>
23#include <linux/blktrace_api.h> 23#include <linux/blktrace_api.h>
24#include <trace/block.h>
24 25
25#define DM_MSG_PREFIX "core" 26#define DM_MSG_PREFIX "core"
26 27
@@ -51,6 +52,8 @@ struct dm_target_io {
51 union map_info info; 52 union map_info info;
52}; 53};
53 54
55DEFINE_TRACE(block_bio_complete);
56
54union map_info *dm_get_mapinfo(struct bio *bio) 57union map_info *dm_get_mapinfo(struct bio *bio)
55{ 58{
56 if (bio && bio->bi_private) 59 if (bio && bio->bi_private)
@@ -504,8 +507,7 @@ static void dec_pending(struct dm_io *io, int error)
504 end_io_acct(io); 507 end_io_acct(io);
505 508
506 if (io->error != DM_ENDIO_REQUEUE) { 509 if (io->error != DM_ENDIO_REQUEUE) {
507 blk_add_trace_bio(io->md->queue, io->bio, 510 trace_block_bio_complete(io->md->queue, io->bio);
508 BLK_TA_COMPLETE);
509 511
510 bio_endio(io->bio, io->error); 512 bio_endio(io->bio, io->error);
511 } 513 }
@@ -598,7 +600,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
598 if (r == DM_MAPIO_REMAPPED) { 600 if (r == DM_MAPIO_REMAPPED) {
599 /* the bio has been remapped so dispatch it */ 601 /* the bio has been remapped so dispatch it */
600 602
601 blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone, 603 trace_block_remap(bdev_get_queue(clone->bi_bdev), clone,
602 tio->io->bio->bi_bdev->bd_dev, 604 tio->io->bio->bi_bdev->bd_dev,
603 clone->bi_sector, sector); 605 clone->bi_sector, sector);
604 606
diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c
index 2de5a3238c94..c9958ec5e25e 100644
--- a/drivers/pci/intr_remapping.c
+++ b/drivers/pci/intr_remapping.c
@@ -19,17 +19,75 @@ struct irq_2_iommu {
19 u8 irte_mask; 19 u8 irte_mask;
20}; 20};
21 21
22static struct irq_2_iommu irq_2_iommuX[NR_IRQS]; 22#ifdef CONFIG_SPARSE_IRQ
23static struct irq_2_iommu *get_one_free_irq_2_iommu(int cpu)
24{
25 struct irq_2_iommu *iommu;
26 int node;
27
28 node = cpu_to_node(cpu);
29
30 iommu = kzalloc_node(sizeof(*iommu), GFP_ATOMIC, node);
31 printk(KERN_DEBUG "alloc irq_2_iommu on cpu %d node %d\n", cpu, node);
32
33 return iommu;
34}
23 35
24static struct irq_2_iommu *irq_2_iommu(unsigned int irq) 36static struct irq_2_iommu *irq_2_iommu(unsigned int irq)
25{ 37{
26 return (irq < nr_irqs) ? irq_2_iommuX + irq : NULL; 38 struct irq_desc *desc;
39
40 desc = irq_to_desc(irq);
41
42 if (WARN_ON_ONCE(!desc))
43 return NULL;
44
45 return desc->irq_2_iommu;
46}
47
48static struct irq_2_iommu *irq_2_iommu_alloc_cpu(unsigned int irq, int cpu)
49{
50 struct irq_desc *desc;
51 struct irq_2_iommu *irq_iommu;
52
53 /*
54 * alloc irq desc if not allocated already.
55 */
56 desc = irq_to_desc_alloc_cpu(irq, cpu);
57 if (!desc) {
58 printk(KERN_INFO "can not get irq_desc for %d\n", irq);
59 return NULL;
60 }
61
62 irq_iommu = desc->irq_2_iommu;
63
64 if (!irq_iommu)
65 desc->irq_2_iommu = get_one_free_irq_2_iommu(cpu);
66
67 return desc->irq_2_iommu;
27} 68}
28 69
29static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq) 70static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq)
30{ 71{
72 return irq_2_iommu_alloc_cpu(irq, boot_cpu_id);
73}
74
75#else /* !CONFIG_SPARSE_IRQ */
76
77static struct irq_2_iommu irq_2_iommuX[NR_IRQS];
78
79static struct irq_2_iommu *irq_2_iommu(unsigned int irq)
80{
81 if (irq < nr_irqs)
82 return &irq_2_iommuX[irq];
83
84 return NULL;
85}
86static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq)
87{
31 return irq_2_iommu(irq); 88 return irq_2_iommu(irq);
32} 89}
90#endif
33 91
34static DEFINE_SPINLOCK(irq_2_ir_lock); 92static DEFINE_SPINLOCK(irq_2_ir_lock);
35 93
@@ -86,9 +144,11 @@ int alloc_irte(struct intel_iommu *iommu, int irq, u16 count)
86 if (!count) 144 if (!count)
87 return -1; 145 return -1;
88 146
147#ifndef CONFIG_SPARSE_IRQ
89 /* protect irq_2_iommu_alloc later */ 148 /* protect irq_2_iommu_alloc later */
90 if (irq >= nr_irqs) 149 if (irq >= nr_irqs)
91 return -1; 150 return -1;
151#endif
92 152
93 /* 153 /*
94 * start the IRTE search from index 0. 154 * start the IRTE search from index 0.
@@ -130,6 +190,12 @@ int alloc_irte(struct intel_iommu *iommu, int irq, u16 count)
130 table->base[i].present = 1; 190 table->base[i].present = 1;
131 191
132 irq_iommu = irq_2_iommu_alloc(irq); 192 irq_iommu = irq_2_iommu_alloc(irq);
193 if (!irq_iommu) {
194 spin_unlock(&irq_2_ir_lock);
195 printk(KERN_ERR "can't allocate irq_2_iommu\n");
196 return -1;
197 }
198
133 irq_iommu->iommu = iommu; 199 irq_iommu->iommu = iommu;
134 irq_iommu->irte_index = index; 200 irq_iommu->irte_index = index;
135 irq_iommu->sub_handle = 0; 201 irq_iommu->sub_handle = 0;
@@ -177,6 +243,12 @@ int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, u16 subhandle)
177 243
178 irq_iommu = irq_2_iommu_alloc(irq); 244 irq_iommu = irq_2_iommu_alloc(irq);
179 245
246 if (!irq_iommu) {
247 spin_unlock(&irq_2_ir_lock);
248 printk(KERN_ERR "can't allocate irq_2_iommu\n");
249 return -1;
250 }
251
180 irq_iommu->iommu = iommu; 252 irq_iommu->iommu = iommu;
181 irq_iommu->irte_index = index; 253 irq_iommu->irte_index = index;
182 irq_iommu->sub_handle = subhandle; 254 irq_iommu->sub_handle = subhandle;
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 74801f7df9c9..11a51f8ed3b3 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -103,11 +103,11 @@ static void msix_set_enable(struct pci_dev *dev, int enable)
103 } 103 }
104} 104}
105 105
106static void msix_flush_writes(unsigned int irq) 106static void msix_flush_writes(struct irq_desc *desc)
107{ 107{
108 struct msi_desc *entry; 108 struct msi_desc *entry;
109 109
110 entry = get_irq_msi(irq); 110 entry = get_irq_desc_msi(desc);
111 BUG_ON(!entry || !entry->dev); 111 BUG_ON(!entry || !entry->dev);
112 switch (entry->msi_attrib.type) { 112 switch (entry->msi_attrib.type) {
113 case PCI_CAP_ID_MSI: 113 case PCI_CAP_ID_MSI:
@@ -135,11 +135,11 @@ static void msix_flush_writes(unsigned int irq)
135 * Returns 1 if it succeeded in masking the interrupt and 0 if the device 135 * Returns 1 if it succeeded in masking the interrupt and 0 if the device
136 * doesn't support MSI masking. 136 * doesn't support MSI masking.
137 */ 137 */
138static int msi_set_mask_bits(unsigned int irq, u32 mask, u32 flag) 138static int msi_set_mask_bits(struct irq_desc *desc, u32 mask, u32 flag)
139{ 139{
140 struct msi_desc *entry; 140 struct msi_desc *entry;
141 141
142 entry = get_irq_msi(irq); 142 entry = get_irq_desc_msi(desc);
143 BUG_ON(!entry || !entry->dev); 143 BUG_ON(!entry || !entry->dev);
144 switch (entry->msi_attrib.type) { 144 switch (entry->msi_attrib.type) {
145 case PCI_CAP_ID_MSI: 145 case PCI_CAP_ID_MSI:
@@ -172,9 +172,9 @@ static int msi_set_mask_bits(unsigned int irq, u32 mask, u32 flag)
172 return 1; 172 return 1;
173} 173}
174 174
175void read_msi_msg(unsigned int irq, struct msi_msg *msg) 175void read_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg)
176{ 176{
177 struct msi_desc *entry = get_irq_msi(irq); 177 struct msi_desc *entry = get_irq_desc_msi(desc);
178 switch(entry->msi_attrib.type) { 178 switch(entry->msi_attrib.type) {
179 case PCI_CAP_ID_MSI: 179 case PCI_CAP_ID_MSI:
180 { 180 {
@@ -211,9 +211,16 @@ void read_msi_msg(unsigned int irq, struct msi_msg *msg)
211 } 211 }
212} 212}
213 213
214void write_msi_msg(unsigned int irq, struct msi_msg *msg) 214void read_msi_msg(unsigned int irq, struct msi_msg *msg)
215{ 215{
216 struct msi_desc *entry = get_irq_msi(irq); 216 struct irq_desc *desc = irq_to_desc(irq);
217
218 read_msi_msg_desc(desc, msg);
219}
220
221void write_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg)
222{
223 struct msi_desc *entry = get_irq_desc_msi(desc);
217 switch (entry->msi_attrib.type) { 224 switch (entry->msi_attrib.type) {
218 case PCI_CAP_ID_MSI: 225 case PCI_CAP_ID_MSI:
219 { 226 {
@@ -252,21 +259,31 @@ void write_msi_msg(unsigned int irq, struct msi_msg *msg)
252 entry->msg = *msg; 259 entry->msg = *msg;
253} 260}
254 261
262void write_msi_msg(unsigned int irq, struct msi_msg *msg)
263{
264 struct irq_desc *desc = irq_to_desc(irq);
265
266 write_msi_msg_desc(desc, msg);
267}
268
255void mask_msi_irq(unsigned int irq) 269void mask_msi_irq(unsigned int irq)
256{ 270{
257 msi_set_mask_bits(irq, 1, 1); 271 struct irq_desc *desc = irq_to_desc(irq);
258 msix_flush_writes(irq); 272
273 msi_set_mask_bits(desc, 1, 1);
274 msix_flush_writes(desc);
259} 275}
260 276
261void unmask_msi_irq(unsigned int irq) 277void unmask_msi_irq(unsigned int irq)
262{ 278{
263 msi_set_mask_bits(irq, 1, 0); 279 struct irq_desc *desc = irq_to_desc(irq);
264 msix_flush_writes(irq); 280
281 msi_set_mask_bits(desc, 1, 0);
282 msix_flush_writes(desc);
265} 283}
266 284
267static int msi_free_irqs(struct pci_dev* dev); 285static int msi_free_irqs(struct pci_dev* dev);
268 286
269
270static struct msi_desc* alloc_msi_entry(void) 287static struct msi_desc* alloc_msi_entry(void)
271{ 288{
272 struct msi_desc *entry; 289 struct msi_desc *entry;
@@ -303,9 +320,11 @@ static void __pci_restore_msi_state(struct pci_dev *dev)
303 pci_intx_for_msi(dev, 0); 320 pci_intx_for_msi(dev, 0);
304 msi_set_enable(dev, 0); 321 msi_set_enable(dev, 0);
305 write_msi_msg(dev->irq, &entry->msg); 322 write_msi_msg(dev->irq, &entry->msg);
306 if (entry->msi_attrib.maskbit) 323 if (entry->msi_attrib.maskbit) {
307 msi_set_mask_bits(dev->irq, entry->msi_attrib.maskbits_mask, 324 struct irq_desc *desc = irq_to_desc(dev->irq);
325 msi_set_mask_bits(desc, entry->msi_attrib.maskbits_mask,
308 entry->msi_attrib.masked); 326 entry->msi_attrib.masked);
327 }
309 328
310 pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &control); 329 pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &control);
311 control &= ~PCI_MSI_FLAGS_QSIZE; 330 control &= ~PCI_MSI_FLAGS_QSIZE;
@@ -327,8 +346,9 @@ static void __pci_restore_msix_state(struct pci_dev *dev)
327 msix_set_enable(dev, 0); 346 msix_set_enable(dev, 0);
328 347
329 list_for_each_entry(entry, &dev->msi_list, list) { 348 list_for_each_entry(entry, &dev->msi_list, list) {
349 struct irq_desc *desc = irq_to_desc(entry->irq);
330 write_msi_msg(entry->irq, &entry->msg); 350 write_msi_msg(entry->irq, &entry->msg);
331 msi_set_mask_bits(entry->irq, 1, entry->msi_attrib.masked); 351 msi_set_mask_bits(desc, 1, entry->msi_attrib.masked);
332 } 352 }
333 353
334 BUG_ON(list_empty(&dev->msi_list)); 354 BUG_ON(list_empty(&dev->msi_list));
@@ -596,7 +616,8 @@ void pci_msi_shutdown(struct pci_dev* dev)
596 /* Return the pci reset with msi irqs unmasked */ 616 /* Return the pci reset with msi irqs unmasked */
597 if (entry->msi_attrib.maskbit) { 617 if (entry->msi_attrib.maskbit) {
598 u32 mask = entry->msi_attrib.maskbits_mask; 618 u32 mask = entry->msi_attrib.maskbits_mask;
599 msi_set_mask_bits(dev->irq, mask, ~mask); 619 struct irq_desc *desc = irq_to_desc(dev->irq);
620 msi_set_mask_bits(desc, mask, ~mask);
600 } 621 }
601 if (!entry->dev || entry->msi_attrib.type != PCI_CAP_ID_MSI) 622 if (!entry->dev || entry->msi_attrib.type != PCI_CAP_ID_MSI)
602 return; 623 return;
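
The msi.c changes above split each irq-number based helper into a *_desc variant plus a thin wrapper, so callers that already hold an irq_desc pointer skip the extra lookup; the mask helpers now take the descriptor directly. A minimal sketch of the resulting call pattern (the variables are illustrative):

	struct irq_desc *desc = irq_to_desc(irq);
	struct msi_msg msg;

	/* preferred when the descriptor is already at hand */
	read_msi_msg_desc(desc, &msg);
	write_msi_msg_desc(desc, &msg);

	/* the irq-number wrappers remain and simply do the lookup */
	read_msi_msg(irq, &msg);
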
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index eba5ec5b020e..6c8193046e0d 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -141,8 +141,12 @@ static void init_evtchn_cpu_bindings(void)
141 int i; 141 int i;
142 142
143 /* By default all event channels notify CPU#0. */ 143 /* By default all event channels notify CPU#0. */
144 for_each_irq_desc(i, desc) 144 for_each_irq_desc(i, desc) {
145 if (!desc)
146 continue;
147
145 desc->affinity = cpumask_of_cpu(0); 148 desc->affinity = cpumask_of_cpu(0);
149 }
146#endif 150#endif
147 151
148 memset(cpu_evtchn, 0, sizeof(cpu_evtchn)); 152 memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
@@ -231,7 +235,7 @@ static int find_unbound_irq(void)
231 int irq; 235 int irq;
232 236
233 /* Only allocate from dynirq range */ 237 /* Only allocate from dynirq range */
234 for_each_irq_nr(irq) 238 for (irq = 0; irq < nr_irqs; irq++)
235 if (irq_bindcount[irq] == 0) 239 if (irq_bindcount[irq] == 0)
236 break; 240 break;
237 241
@@ -792,7 +796,7 @@ void xen_irq_resume(void)
792 mask_evtchn(evtchn); 796 mask_evtchn(evtchn);
793 797
794 /* No IRQ <-> event-channel mappings. */ 798 /* No IRQ <-> event-channel mappings. */
795 for_each_irq_nr(irq) 799 for (irq = 0; irq < nr_irqs; irq++)
796 irq_info[irq].evtchn = 0; /* zap event-channel binding */ 800 irq_info[irq].evtchn = 0; /* zap event-channel binding */
797 801
798 for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) 802 for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++)
@@ -824,7 +828,7 @@ void __init xen_init_IRQ(void)
824 mask_evtchn(i); 828 mask_evtchn(i);
825 829
826 /* Dynamic IRQ space is currently unbound. Zero the refcnts. */ 830 /* Dynamic IRQ space is currently unbound. Zero the refcnts. */
827 for_each_irq_nr(i) 831 for (i = 0; i < nr_irqs; i++)
828 irq_bindcount[i] = 0; 832 irq_bindcount[i] = 0;
829 833
830 irq_ctx_init(smp_processor_id()); 834 irq_ctx_init(smp_processor_id());
diff --git a/fs/bio.c b/fs/bio.c
index 77a55bcceedb..df99c882b807 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -26,8 +26,11 @@
26#include <linux/mempool.h> 26#include <linux/mempool.h>
27#include <linux/workqueue.h> 27#include <linux/workqueue.h>
28#include <linux/blktrace_api.h> 28#include <linux/blktrace_api.h>
29#include <trace/block.h>
29#include <scsi/sg.h> /* for struct sg_iovec */ 30#include <scsi/sg.h> /* for struct sg_iovec */
30 31
32DEFINE_TRACE(block_split);
33
31static struct kmem_cache *bio_slab __read_mostly; 34static struct kmem_cache *bio_slab __read_mostly;
32 35
33static mempool_t *bio_split_pool __read_mostly; 36static mempool_t *bio_split_pool __read_mostly;
@@ -1263,7 +1266,7 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors)
1263 if (!bp) 1266 if (!bp)
1264 return bp; 1267 return bp;
1265 1268
1266 blk_add_trace_pdu_int(bdev_get_queue(bi->bi_bdev), BLK_TA_SPLIT, bi, 1269 trace_block_split(bdev_get_queue(bi->bi_bdev), bi,
1267 bi->bi_sector + first_sectors); 1270 bi->bi_sector + first_sectors);
1268 1271
1269 BUG_ON(bi->bi_vcnt != 1); 1272 BUG_ON(bi->bi_vcnt != 1);
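
With the blk_add_trace_*() inlines gone (see the blktrace_api.h hunk below), interested code hooks the block tracepoints instead. A hedged sketch, assuming the block_split probe prototype mirrors the call above -- queue, bio and split sector -- as declared in <trace/block.h>; the probe and module hooks are hypothetical:

#include <linux/module.h>
#include <trace/block.h>

/* assumed prototype; it must match the DECLARE_TRACE() in <trace/block.h> */
static void probe_block_split(struct request_queue *q, struct bio *bio,
			      unsigned int pdu)
{
	/* e.g. record bio->bi_sector and the split point 'pdu' */
}

static int my_probe_init(void)
{
	return register_trace_block_split(probe_block_split);
}

static void my_probe_exit(void)
{
	unregister_trace_block_split(probe_block_split);
}

module_init(my_probe_init);
module_exit(my_probe_exit);
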
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 81904f07679d..3cb9492801c0 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -27,6 +27,7 @@ static int show_stat(struct seq_file *p, void *v)
27 u64 sum = 0; 27 u64 sum = 0;
28 struct timespec boottime; 28 struct timespec boottime;
29 unsigned int per_irq_sum; 29 unsigned int per_irq_sum;
30 struct irq_desc *desc;
30 31
31 user = nice = system = idle = iowait = 32 user = nice = system = idle = iowait =
32 irq = softirq = steal = cputime64_zero; 33 irq = softirq = steal = cputime64_zero;
@@ -44,10 +45,14 @@ static int show_stat(struct seq_file *p, void *v)
44 softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); 45 softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
45 steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); 46 steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
46 guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest); 47 guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
47 48 for_each_irq_nr(j) {
48 for_each_irq_nr(j) 49#ifdef CONFIG_SPARSE_IRQ
50 desc = irq_to_desc(j);
51 if (!desc)
52 continue;
53#endif
49 sum += kstat_irqs_cpu(j, i); 54 sum += kstat_irqs_cpu(j, i);
50 55 }
51 sum += arch_irq_stat_cpu(i); 56 sum += arch_irq_stat_cpu(i);
52 } 57 }
53 sum += arch_irq_stat(); 58 sum += arch_irq_stat();
@@ -92,7 +97,13 @@ static int show_stat(struct seq_file *p, void *v)
92 /* sum again ? it could be updated? */ 97 /* sum again ? it could be updated? */
93 for_each_irq_nr(j) { 98 for_each_irq_nr(j) {
94 per_irq_sum = 0; 99 per_irq_sum = 0;
95 100#ifdef CONFIG_SPARSE_IRQ
101 desc = irq_to_desc(j);
102 if (!desc) {
103 seq_printf(p, " %u", per_irq_sum);
104 continue;
105 }
106#endif
96 for_each_possible_cpu(i) 107 for_each_possible_cpu(i)
97 per_irq_sum += kstat_irqs_cpu(j, i); 108 per_irq_sum += kstat_irqs_cpu(j, i);
98 109
diff --git a/fs/seq_file.c b/fs/seq_file.c
index eba2eabcd2b8..16c211558c22 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -357,7 +357,18 @@ int seq_printf(struct seq_file *m, const char *f, ...)
357} 357}
358EXPORT_SYMBOL(seq_printf); 358EXPORT_SYMBOL(seq_printf);
359 359
360static char *mangle_path(char *s, char *p, char *esc) 360/**
361 * mangle_path - mangle and copy path to buffer beginning
362 * @s: buffer start
363 * @p: beginning of path in above buffer
364 * @esc: set of characters that need escaping
365 *
366 * Copy the path from @p to @s, replacing each occurrence of a character from
367 * @esc with its usual octal escape.
368 * Returns pointer past last written character in @s, or NULL in case of
369 * failure.
370 */
371char *mangle_path(char *s, char *p, char *esc)
361{ 372{
362 while (s <= p) { 373 while (s <= p) {
363 char c = *p++; 374 char c = *p++;
@@ -376,6 +387,7 @@ static char *mangle_path(char *s, char *p, char *esc)
376 } 387 }
377 return NULL; 388 return NULL;
378} 389}
390EXPORT_SYMBOL(mangle_path);
379 391
380/* 392/*
381 * return the absolute path of 'dentry' residing in mount 'mnt'. 393 * return the absolute path of 'dentry' residing in mount 'mnt'.
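
Since mangle_path() is now non-static and exported, seq_file users outside fs/seq_file.c can reuse it. A minimal sketch based on the kernel-doc above (the wrapper function and escape set are illustrative):

/* d_path() places the path at the tail of 'buf'; mangle_path() then copies
 * it to the front, octal-escaping the characters listed in the third arg. */
static int show_escaped_path(char *buf, int size, struct path *path)
{
	char *p = d_path(path, buf, size);
	char *end;

	if (IS_ERR(p))
		return PTR_ERR(p);

	end = mangle_path(buf, p, " \t\n\\");
	if (!end)
		return -ENAMETOOLONG;	/* path did not fit in the buffer */

	return end - buf;		/* length of the escaped path */
}
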
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 80744606bad1..eba835a2c2cd 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -45,6 +45,22 @@
45#define MCOUNT_REC() 45#define MCOUNT_REC()
46#endif 46#endif
47 47
48#ifdef CONFIG_TRACE_BRANCH_PROFILING
49#define LIKELY_PROFILE() VMLINUX_SYMBOL(__start_annotated_branch_profile) = .; \
50 *(_ftrace_annotated_branch) \
51 VMLINUX_SYMBOL(__stop_annotated_branch_profile) = .;
52#else
53#define LIKELY_PROFILE()
54#endif
55
56#ifdef CONFIG_PROFILE_ALL_BRANCHES
57#define BRANCH_PROFILE() VMLINUX_SYMBOL(__start_branch_profile) = .; \
58 *(_ftrace_branch) \
59 VMLINUX_SYMBOL(__stop_branch_profile) = .;
60#else
61#define BRANCH_PROFILE()
62#endif
63
48/* .data section */ 64/* .data section */
49#define DATA_DATA \ 65#define DATA_DATA \
50 *(.data) \ 66 *(.data) \
@@ -60,9 +76,12 @@
60 VMLINUX_SYMBOL(__start___markers) = .; \ 76 VMLINUX_SYMBOL(__start___markers) = .; \
61 *(__markers) \ 77 *(__markers) \
62 VMLINUX_SYMBOL(__stop___markers) = .; \ 78 VMLINUX_SYMBOL(__stop___markers) = .; \
79 . = ALIGN(32); \
63 VMLINUX_SYMBOL(__start___tracepoints) = .; \ 80 VMLINUX_SYMBOL(__start___tracepoints) = .; \
64 *(__tracepoints) \ 81 *(__tracepoints) \
65 VMLINUX_SYMBOL(__stop___tracepoints) = .; 82 VMLINUX_SYMBOL(__stop___tracepoints) = .; \
83 LIKELY_PROFILE() \
84 BRANCH_PROFILE()
66 85
67#define RO_DATA(align) \ 86#define RO_DATA(align) \
68 . = ALIGN((align)); \ 87 . = ALIGN((align)); \
diff --git a/include/asm-m32r/system.h b/include/asm-m32r/system.h
index 70a57c8c002b..c980f5ba8de7 100644
--- a/include/asm-m32r/system.h
+++ b/include/asm-m32r/system.h
@@ -23,7 +23,7 @@
23 */ 23 */
24 24
25#if defined(CONFIG_FRAME_POINTER) || \ 25#if defined(CONFIG_FRAME_POINTER) || \
26 !defined(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER) 26 !defined(CONFIG_SCHED_OMIT_FRAME_POINTER)
27#define M32R_PUSH_FP " push fp\n" 27#define M32R_PUSH_FP " push fp\n"
28#define M32R_POP_FP " pop fp\n" 28#define M32R_POP_FP " pop fp\n"
29#else 29#else
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index e531783e5d78..95ac82340c3b 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -313,6 +313,7 @@ unifdef-y += ptrace.h
313unifdef-y += qnx4_fs.h 313unifdef-y += qnx4_fs.h
314unifdef-y += quota.h 314unifdef-y += quota.h
315unifdef-y += random.h 315unifdef-y += random.h
316unifdef-y += irqnr.h
316unifdef-y += reboot.h 317unifdef-y += reboot.h
317unifdef-y += reiserfs_fs.h 318unifdef-y += reiserfs_fs.h
318unifdef-y += reiserfs_xattr.h 319unifdef-y += reiserfs_xattr.h
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index bdf505d33e77..1dba3493d520 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -160,7 +160,6 @@ struct blk_trace {
160 160
161extern int blk_trace_ioctl(struct block_device *, unsigned, char __user *); 161extern int blk_trace_ioctl(struct block_device *, unsigned, char __user *);
162extern void blk_trace_shutdown(struct request_queue *); 162extern void blk_trace_shutdown(struct request_queue *);
163extern void __blk_add_trace(struct blk_trace *, sector_t, int, int, u32, int, int, void *);
164extern int do_blk_trace_setup(struct request_queue *q, 163extern int do_blk_trace_setup(struct request_queue *q,
165 char *name, dev_t dev, struct blk_user_trace_setup *buts); 164 char *name, dev_t dev, struct blk_user_trace_setup *buts);
166extern void __trace_note_message(struct blk_trace *, const char *fmt, ...); 165extern void __trace_note_message(struct blk_trace *, const char *fmt, ...);
@@ -186,168 +185,8 @@ extern void __trace_note_message(struct blk_trace *, const char *fmt, ...);
186 } while (0) 185 } while (0)
187#define BLK_TN_MAX_MSG 128 186#define BLK_TN_MAX_MSG 128
188 187
189/** 188extern void blk_add_driver_data(struct request_queue *q, struct request *rq,
190 * blk_add_trace_rq - Add a trace for a request oriented action 189 void *data, size_t len);
191 * @q: queue the io is for
192 * @rq: the source request
193 * @what: the action
194 *
195 * Description:
196 * Records an action against a request. Will log the bio offset + size.
197 *
198 **/
199static inline void blk_add_trace_rq(struct request_queue *q, struct request *rq,
200 u32 what)
201{
202 struct blk_trace *bt = q->blk_trace;
203 int rw = rq->cmd_flags & 0x03;
204
205 if (likely(!bt))
206 return;
207
208 if (blk_discard_rq(rq))
209 rw |= (1 << BIO_RW_DISCARD);
210
211 if (blk_pc_request(rq)) {
212 what |= BLK_TC_ACT(BLK_TC_PC);
213 __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, sizeof(rq->cmd), rq->cmd);
214 } else {
215 what |= BLK_TC_ACT(BLK_TC_FS);
216 __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, rw, what, rq->errors, 0, NULL);
217 }
218}
219
220/**
221 * blk_add_trace_bio - Add a trace for a bio oriented action
222 * @q: queue the io is for
223 * @bio: the source bio
224 * @what: the action
225 *
226 * Description:
227 * Records an action against a bio. Will log the bio offset + size.
228 *
229 **/
230static inline void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
231 u32 what)
232{
233 struct blk_trace *bt = q->blk_trace;
234
235 if (likely(!bt))
236 return;
237
238 __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
239}
240
241/**
242 * blk_add_trace_generic - Add a trace for a generic action
243 * @q: queue the io is for
244 * @bio: the source bio
245 * @rw: the data direction
246 * @what: the action
247 *
248 * Description:
249 * Records a simple trace
250 *
251 **/
252static inline void blk_add_trace_generic(struct request_queue *q,
253 struct bio *bio, int rw, u32 what)
254{
255 struct blk_trace *bt = q->blk_trace;
256
257 if (likely(!bt))
258 return;
259
260 if (bio)
261 blk_add_trace_bio(q, bio, what);
262 else
263 __blk_add_trace(bt, 0, 0, rw, what, 0, 0, NULL);
264}
265
266/**
267 * blk_add_trace_pdu_int - Add a trace for a bio with an integer payload
268 * @q: queue the io is for
269 * @what: the action
270 * @bio: the source bio
271 * @pdu: the integer payload
272 *
273 * Description:
274 * Adds a trace with some integer payload. This might be an unplug
275 * option given as the action, with the depth at unplug time given
276 * as the payload
277 *
278 **/
279static inline void blk_add_trace_pdu_int(struct request_queue *q, u32 what,
280 struct bio *bio, unsigned int pdu)
281{
282 struct blk_trace *bt = q->blk_trace;
283 __be64 rpdu = cpu_to_be64(pdu);
284
285 if (likely(!bt))
286 return;
287
288 if (bio)
289 __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), sizeof(rpdu), &rpdu);
290 else
291 __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
292}
293
294/**
295 * blk_add_trace_remap - Add a trace for a remap operation
296 * @q: queue the io is for
297 * @bio: the source bio
298 * @dev: target device
299 * @from: source sector
300 * @to: target sector
301 *
302 * Description:
303 * Device mapper or raid target sometimes need to split a bio because
304 * it spans a stripe (or similar). Add a trace for that action.
305 *
306 **/
307static inline void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
308 dev_t dev, sector_t from, sector_t to)
309{
310 struct blk_trace *bt = q->blk_trace;
311 struct blk_io_trace_remap r;
312
313 if (likely(!bt))
314 return;
315
316 r.device = cpu_to_be32(dev);
317 r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev);
318 r.sector = cpu_to_be64(to);
319
320 __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
321}
322
323/**
324 * blk_add_driver_data - Add binary message with driver-specific data
325 * @q: queue the io is for
326 * @rq: io request
327 * @data: driver-specific data
328 * @len: length of driver-specific data
329 *
330 * Description:
331 * Some drivers might want to write driver-specific data per request.
332 *
333 **/
334static inline void blk_add_driver_data(struct request_queue *q,
335 struct request *rq,
336 void *data, size_t len)
337{
338 struct blk_trace *bt = q->blk_trace;
339
340 if (likely(!bt))
341 return;
342
343 if (blk_pc_request(rq))
344 __blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA,
345 rq->errors, len, data);
346 else
347 __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
348 0, BLK_TA_DRV_DATA, rq->errors, len, data);
349}
350
351extern int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, 190extern int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
352 char __user *arg); 191 char __user *arg);
353extern int blk_trace_startstop(struct request_queue *q, int start); 192extern int blk_trace_startstop(struct request_queue *q, int start);
@@ -356,13 +195,8 @@ extern int blk_trace_remove(struct request_queue *q);
356#else /* !CONFIG_BLK_DEV_IO_TRACE */ 195#else /* !CONFIG_BLK_DEV_IO_TRACE */
357#define blk_trace_ioctl(bdev, cmd, arg) (-ENOTTY) 196#define blk_trace_ioctl(bdev, cmd, arg) (-ENOTTY)
358#define blk_trace_shutdown(q) do { } while (0) 197#define blk_trace_shutdown(q) do { } while (0)
359#define blk_add_trace_rq(q, rq, what) do { } while (0)
360#define blk_add_trace_bio(q, rq, what) do { } while (0)
361#define blk_add_trace_generic(q, rq, rw, what) do { } while (0)
362#define blk_add_trace_pdu_int(q, what, bio, pdu) do { } while (0)
363#define blk_add_trace_remap(q, bio, dev, f, t) do {} while (0)
364#define blk_add_driver_data(q, rq, data, len) do {} while (0)
365#define do_blk_trace_setup(q, name, dev, buts) (-ENOTTY) 198#define do_blk_trace_setup(q, name, dev, buts) (-ENOTTY)
199#define blk_add_driver_data(q, rq, data, len) do {} while (0)
366#define blk_trace_setup(q, name, dev, arg) (-ENOTTY) 200#define blk_trace_setup(q, name, dev, arg) (-ENOTTY)
367#define blk_trace_startstop(q, start) (-ENOTTY) 201#define blk_trace_startstop(q, start) (-ENOTTY)
368#define blk_trace_remove(q) (-ENOTTY) 202#define blk_trace_remove(q) (-ENOTTY)
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 98115d9d04da..ea7c6be354b7 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -59,8 +59,88 @@ extern void __chk_io_ptr(const volatile void __iomem *);
59 * specific implementations come from the above header files 59 * specific implementations come from the above header files
60 */ 60 */
61 61
62#define likely(x) __builtin_expect(!!(x), 1) 62struct ftrace_branch_data {
63#define unlikely(x) __builtin_expect(!!(x), 0) 63 const char *func;
64 const char *file;
65 unsigned line;
66 union {
67 struct {
68 unsigned long correct;
69 unsigned long incorrect;
70 };
71 struct {
72 unsigned long miss;
73 unsigned long hit;
74 };
75 };
76};
77
78/*
79 * Note: DISABLE_BRANCH_PROFILING can be used by special lowlevel code
80 * to disable branch tracing on a per file basis.
81 */
82#if defined(CONFIG_TRACE_BRANCH_PROFILING) && !defined(DISABLE_BRANCH_PROFILING)
83void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect);
84
85#define likely_notrace(x) __builtin_expect(!!(x), 1)
86#define unlikely_notrace(x) __builtin_expect(!!(x), 0)
87
88#define __branch_check__(x, expect) ({ \
89 int ______r; \
90 static struct ftrace_branch_data \
91 __attribute__((__aligned__(4))) \
92 __attribute__((section("_ftrace_annotated_branch"))) \
93 ______f = { \
94 .func = __func__, \
95 .file = __FILE__, \
96 .line = __LINE__, \
97 }; \
98 ______r = likely_notrace(x); \
99 ftrace_likely_update(&______f, ______r, expect); \
100 ______r; \
101 })
102
103/*
104 * Using __builtin_constant_p(x) to ignore cases where the return
105 * value is always the same. This idea is taken from a similar patch
106 * written by Daniel Walker.
107 */
108# ifndef likely
109# define likely(x) (__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 1))
110# endif
111# ifndef unlikely
112# define unlikely(x) (__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 0))
113# endif
114
115#ifdef CONFIG_PROFILE_ALL_BRANCHES
116/*
117 * "Define 'is'", Bill Clinton
118 * "Define 'if'", Steven Rostedt
119 */
120#define if(cond) if (__builtin_constant_p((cond)) ? !!(cond) : \
121 ({ \
122 int ______r; \
123 static struct ftrace_branch_data \
124 __attribute__((__aligned__(4))) \
125 __attribute__((section("_ftrace_branch"))) \
126 ______f = { \
127 .func = __func__, \
128 .file = __FILE__, \
129 .line = __LINE__, \
130 }; \
131 ______r = !!(cond); \
132 if (______r) \
133 ______f.hit++; \
134 else \
135 ______f.miss++; \
136 ______r; \
137 }))
138#endif /* CONFIG_PROFILE_ALL_BRANCHES */
139
140#else
141# define likely(x) __builtin_expect(!!(x), 1)
142# define unlikely(x) __builtin_expect(!!(x), 0)
143#endif
64 144
65/* Optimization barrier */ 145/* Optimization barrier */
66#ifndef barrier 146#ifndef barrier
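
As the comment above notes, DISABLE_BRANCH_PROFILING lets low-level code opt out of the annotated-branch instrumentation on a per-file basis; it has to be defined before <linux/compiler.h> is (indirectly) pulled in. A minimal sketch, with a hypothetical file:

/* some-lowlevel-file.c: the define must precede any kernel header that
 * drags in <linux/compiler.h> */
#define DISABLE_BRANCH_PROFILING
#include <linux/kernel.h>

/* likely()/unlikely() in this file now expand to plain __builtin_expect(),
 * so no ftrace_branch_data records land in _ftrace_annotated_branch. */
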
diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h
index 4aaa4afb1cb9..096476f1fb35 100644
--- a/include/linux/debug_locks.h
+++ b/include/linux/debug_locks.h
@@ -17,7 +17,7 @@ extern int debug_locks_off(void);
17({ \ 17({ \
18 int __ret = 0; \ 18 int __ret = 0; \
19 \ 19 \
20 if (unlikely(c)) { \ 20 if (!oops_in_progress && unlikely(c)) { \
21 if (debug_locks_off() && !debug_locks_silent) \ 21 if (debug_locks_off() && !debug_locks_silent) \
22 WARN_ON(1); \ 22 WARN_ON(1); \
23 __ret = 1; \ 23 __ret = 1; \
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 9c5bc6be2b09..985b28dc2ba9 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -8,6 +8,8 @@
8#include <linux/types.h> 8#include <linux/types.h>
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/kallsyms.h> 10#include <linux/kallsyms.h>
11#include <linux/bitops.h>
12#include <linux/sched.h>
11 13
12#ifdef CONFIG_FUNCTION_TRACER 14#ifdef CONFIG_FUNCTION_TRACER
13 15
@@ -24,6 +26,45 @@ struct ftrace_ops {
24 struct ftrace_ops *next; 26 struct ftrace_ops *next;
25}; 27};
26 28
29extern int function_trace_stop;
30
31/*
32 * Type of the current tracing.
33 */
34enum ftrace_tracing_type_t {
35 FTRACE_TYPE_ENTER = 0, /* Hook the call of the function */
36 FTRACE_TYPE_RETURN, /* Hook the return of the function */
37};
38
39/* Current tracing type, default is FTRACE_TYPE_ENTER */
40extern enum ftrace_tracing_type_t ftrace_tracing_type;
41
42/**
43 * ftrace_stop - stop function tracer.
44 *
45 * A quick way to stop the function tracer. Note this is an on/off switch,
46 * it is not something that is recursive like preempt_disable.
47 * This does not disable the calling of mcount, it only stops the
48 * calling of functions from mcount.
49 */
50static inline void ftrace_stop(void)
51{
52 function_trace_stop = 1;
53}
54
55/**
56 * ftrace_start - start the function tracer.
57 *
58 * This function is the inverse of ftrace_stop. This does not enable
59 * the function tracing if the function tracer is disabled. This only
60 * sets the function tracer flag to continue calling the functions
61 * from mcount.
62 */
63static inline void ftrace_start(void)
64{
65 function_trace_stop = 0;
66}
67
27/* 68/*
28 * The ftrace_ops must be a static and should also 69 * The ftrace_ops must be a static and should also
29 * be read_mostly. These functions do modify read_mostly variables 70 * be read_mostly. These functions do modify read_mostly variables
@@ -42,9 +83,13 @@ extern void ftrace_stub(unsigned long a0, unsigned long a1);
42# define unregister_ftrace_function(ops) do { } while (0) 83# define unregister_ftrace_function(ops) do { } while (0)
43# define clear_ftrace_function(ops) do { } while (0) 84# define clear_ftrace_function(ops) do { } while (0)
44static inline void ftrace_kill(void) { } 85static inline void ftrace_kill(void) { }
86static inline void ftrace_stop(void) { }
87static inline void ftrace_start(void) { }
45#endif /* CONFIG_FUNCTION_TRACER */ 88#endif /* CONFIG_FUNCTION_TRACER */
46 89
47#ifdef CONFIG_DYNAMIC_FTRACE 90#ifdef CONFIG_DYNAMIC_FTRACE
91/* asm/ftrace.h must be defined for archs supporting dynamic ftrace */
92#include <asm/ftrace.h>
48 93
49enum { 94enum {
50 FTRACE_FL_FREE = (1 << 0), 95 FTRACE_FL_FREE = (1 << 0),
@@ -60,6 +105,7 @@ struct dyn_ftrace {
60 struct list_head list; 105 struct list_head list;
61 unsigned long ip; /* address of mcount call-site */ 106 unsigned long ip; /* address of mcount call-site */
62 unsigned long flags; 107 unsigned long flags;
108 struct dyn_arch_ftrace arch;
63}; 109};
64 110
65int ftrace_force_update(void); 111int ftrace_force_update(void);
@@ -67,19 +113,25 @@ void ftrace_set_filter(unsigned char *buf, int len, int reset);
67 113
68/* defined in arch */ 114/* defined in arch */
69extern int ftrace_ip_converted(unsigned long ip); 115extern int ftrace_ip_converted(unsigned long ip);
70extern unsigned char *ftrace_nop_replace(void);
71extern unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr);
72extern int ftrace_dyn_arch_init(void *data); 116extern int ftrace_dyn_arch_init(void *data);
73extern int ftrace_update_ftrace_func(ftrace_func_t func); 117extern int ftrace_update_ftrace_func(ftrace_func_t func);
74extern void ftrace_caller(void); 118extern void ftrace_caller(void);
75extern void ftrace_call(void); 119extern void ftrace_call(void);
76extern void mcount_call(void); 120extern void mcount_call(void);
121#ifdef CONFIG_FUNCTION_GRAPH_TRACER
122extern void ftrace_graph_caller(void);
123extern int ftrace_enable_ftrace_graph_caller(void);
124extern int ftrace_disable_ftrace_graph_caller(void);
125#else
126static inline int ftrace_enable_ftrace_graph_caller(void) { return 0; }
127static inline int ftrace_disable_ftrace_graph_caller(void) { return 0; }
128#endif
77 129
78/** 130/**
79 * ftrace_modify_code - modify code segment 131 * ftrace_make_nop - convert code into nop
80 * @ip: the address of the code segment 132 * @mod: module structure if called by module load initialization
81 * @old_code: the contents of what is expected to be there 133 * @rec: the mcount call site record
82 * @new_code: the code to patch in 134 * @addr: the address that the call site should be calling
83 * 135 *
84 * This is a very sensitive operation and great care needs 136 * This is a very sensitive operation and great care needs
85 * to be taken by the arch. The operation should carefully 137 * to be taken by the arch. The operation should carefully
@@ -87,6 +139,8 @@ extern void mcount_call(void);
87 * what we expect it to be, and then on success of the compare, 139 * what we expect it to be, and then on success of the compare,
88 * it should write to the location. 140 * it should write to the location.
89 * 141 *
142 * The code segment at @rec->ip should be a caller to @addr
143 *
90 * Return must be: 144 * Return must be:
91 * 0 on success 145 * 0 on success
92 * -EFAULT on error reading the location 146 * -EFAULT on error reading the location
@@ -94,8 +148,34 @@ extern void mcount_call(void);
94 * -EPERM on error writing to the location 148 * -EPERM on error writing to the location
95 * Any other value will be considered a failure. 149 * Any other value will be considered a failure.
96 */ 150 */
97extern int ftrace_modify_code(unsigned long ip, unsigned char *old_code, 151extern int ftrace_make_nop(struct module *mod,
98 unsigned char *new_code); 152 struct dyn_ftrace *rec, unsigned long addr);
153
154/**
155 * ftrace_make_call - convert a nop call site into a call to addr
156 * @rec: the mcount call site record
157 * @addr: the address that the call site should call
158 *
159 * This is a very sensitive operation and great care needs
160 * to be taken by the arch. The operation should carefully
161 * read the location, check to see if what is read is indeed
162 * what we expect it to be, and then on success of the compare,
163 * it should write to the location.
164 *
165 * The code segment at @rec->ip should be a nop
166 *
167 * Return must be:
168 * 0 on success
169 * -EFAULT on error reading the location
170 * -EINVAL on a failed compare of the contents
171 * -EPERM on error writing to the location
172 * Any other value will be considered a failure.
173 */
174extern int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr);
175
176
177/* May be defined in arch */
178extern int ftrace_arch_read_dyn_info(char *buf, int size);
99 179
100extern int skip_trace(unsigned long ip); 180extern int skip_trace(unsigned long ip);
101 181
@@ -103,7 +183,6 @@ extern void ftrace_release(void *start, unsigned long size);
103 183
104extern void ftrace_disable_daemon(void); 184extern void ftrace_disable_daemon(void);
105extern void ftrace_enable_daemon(void); 185extern void ftrace_enable_daemon(void);
106
107#else 186#else
108# define skip_trace(ip) ({ 0; }) 187# define skip_trace(ip) ({ 0; })
109# define ftrace_force_update() ({ 0; }) 188# define ftrace_force_update() ({ 0; })
@@ -182,6 +261,12 @@ static inline void __ftrace_enabled_restore(int enabled)
182#endif 261#endif
183 262
184#ifdef CONFIG_TRACING 263#ifdef CONFIG_TRACING
264extern int ftrace_dump_on_oops;
265
266extern void tracing_start(void);
267extern void tracing_stop(void);
268extern void ftrace_off_permanent(void);
269
185extern void 270extern void
186ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); 271ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3);
187 272
@@ -212,6 +297,9 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { }
212static inline int 297static inline int
213ftrace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 0))); 298ftrace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 0)));
214 299
300static inline void tracing_start(void) { }
301static inline void tracing_stop(void) { }
302static inline void ftrace_off_permanent(void) { }
215static inline int 303static inline int
216ftrace_printk(const char *fmt, ...) 304ftrace_printk(const char *fmt, ...)
217{ 305{
@@ -222,33 +310,167 @@ static inline void ftrace_dump(void) { }
222 310
223#ifdef CONFIG_FTRACE_MCOUNT_RECORD 311#ifdef CONFIG_FTRACE_MCOUNT_RECORD
224extern void ftrace_init(void); 312extern void ftrace_init(void);
225extern void ftrace_init_module(unsigned long *start, unsigned long *end); 313extern void ftrace_init_module(struct module *mod,
314 unsigned long *start, unsigned long *end);
226#else 315#else
227static inline void ftrace_init(void) { } 316static inline void ftrace_init(void) { }
228static inline void 317static inline void
229ftrace_init_module(unsigned long *start, unsigned long *end) { } 318ftrace_init_module(struct module *mod,
319 unsigned long *start, unsigned long *end) { }
230#endif 320#endif
231 321
322enum {
323 POWER_NONE = 0,
324 POWER_CSTATE = 1,
325 POWER_PSTATE = 2,
326};
327
328struct power_trace {
329#ifdef CONFIG_POWER_TRACER
330 ktime_t stamp;
331 ktime_t end;
332 int type;
333 int state;
334#endif
335};
336
337#ifdef CONFIG_POWER_TRACER
338extern void trace_power_start(struct power_trace *it, unsigned int type,
339 unsigned int state);
340extern void trace_power_mark(struct power_trace *it, unsigned int type,
341 unsigned int state);
342extern void trace_power_end(struct power_trace *it);
343#else
344static inline void trace_power_start(struct power_trace *it, unsigned int type,
345 unsigned int state) { }
346static inline void trace_power_mark(struct power_trace *it, unsigned int type,
347 unsigned int state) { }
348static inline void trace_power_end(struct power_trace *it) { }
349#endif
350
351
352/*
353 * Structure that defines an entry function trace.
354 */
355struct ftrace_graph_ent {
356 unsigned long func; /* Current function */
357 int depth;
358};
232 359
233struct boot_trace { 360/*
234 pid_t caller; 361 * Structure that defines a return function trace.
235 char func[KSYM_SYMBOL_LEN]; 362 */
236 int result; 363struct ftrace_graph_ret {
237 unsigned long long duration; /* usecs */ 364 unsigned long func; /* Current function */
238 ktime_t calltime; 365 unsigned long long calltime;
239 ktime_t rettime; 366 unsigned long long rettime;
367 /* Number of functions that overran the depth limit for current task */
368 unsigned long overrun;
369 int depth;
240}; 370};
241 371
242#ifdef CONFIG_BOOT_TRACER 372#ifdef CONFIG_FUNCTION_GRAPH_TRACER
243extern void trace_boot(struct boot_trace *it, initcall_t fn); 373
244extern void start_boot_trace(void); 374/*
245extern void stop_boot_trace(void); 375 * Sometimes we don't want to trace a function with the function
376 * graph tracer but we want them to keep traced by the usual function
377 * tracer if the function graph tracer is not configured.
378 */
379#define __notrace_funcgraph notrace
380
381#define FTRACE_RETFUNC_DEPTH 50
382#define FTRACE_RETSTACK_ALLOC_SIZE 32
383/* Type of the callback handlers for tracing function graph*/
384typedef void (*trace_func_graph_ret_t)(struct ftrace_graph_ret *); /* return */
385typedef int (*trace_func_graph_ent_t)(struct ftrace_graph_ent *); /* entry */
386
387extern int register_ftrace_graph(trace_func_graph_ret_t retfunc,
388 trace_func_graph_ent_t entryfunc);
389
390extern void ftrace_graph_stop(void);
391
392/* The current handlers in use */
393extern trace_func_graph_ret_t ftrace_graph_return;
394extern trace_func_graph_ent_t ftrace_graph_entry;
395
396extern void unregister_ftrace_graph(void);
397
398extern void ftrace_graph_init_task(struct task_struct *t);
399extern void ftrace_graph_exit_task(struct task_struct *t);
400
401static inline int task_curr_ret_stack(struct task_struct *t)
402{
403 return t->curr_ret_stack;
404}
405
406static inline void pause_graph_tracing(void)
407{
408 atomic_inc(&current->tracing_graph_pause);
409}
410
411static inline void unpause_graph_tracing(void)
412{
413 atomic_dec(&current->tracing_graph_pause);
414}
246#else 415#else
247static inline void trace_boot(struct boot_trace *it, initcall_t fn) { } 416
248static inline void start_boot_trace(void) { } 417#define __notrace_funcgraph
249static inline void stop_boot_trace(void) { } 418
419static inline void ftrace_graph_init_task(struct task_struct *t) { }
420static inline void ftrace_graph_exit_task(struct task_struct *t) { }
421
422static inline int task_curr_ret_stack(struct task_struct *tsk)
423{
424 return -1;
425}
426
427static inline void pause_graph_tracing(void) { }
428static inline void unpause_graph_tracing(void) { }
250#endif 429#endif
251 430
431#ifdef CONFIG_TRACING
432#include <linux/sched.h>
433
434/* flags for current->trace */
435enum {
436 TSK_TRACE_FL_TRACE_BIT = 0,
437 TSK_TRACE_FL_GRAPH_BIT = 1,
438};
439enum {
440 TSK_TRACE_FL_TRACE = 1 << TSK_TRACE_FL_TRACE_BIT,
441 TSK_TRACE_FL_GRAPH = 1 << TSK_TRACE_FL_GRAPH_BIT,
442};
443
444static inline void set_tsk_trace_trace(struct task_struct *tsk)
445{
446 set_bit(TSK_TRACE_FL_TRACE_BIT, &tsk->trace);
447}
448
449static inline void clear_tsk_trace_trace(struct task_struct *tsk)
450{
451 clear_bit(TSK_TRACE_FL_TRACE_BIT, &tsk->trace);
452}
453
454static inline int test_tsk_trace_trace(struct task_struct *tsk)
455{
456 return tsk->trace & TSK_TRACE_FL_TRACE;
457}
458
459static inline void set_tsk_trace_graph(struct task_struct *tsk)
460{
461 set_bit(TSK_TRACE_FL_GRAPH_BIT, &tsk->trace);
462}
463
464static inline void clear_tsk_trace_graph(struct task_struct *tsk)
465{
466 clear_bit(TSK_TRACE_FL_GRAPH_BIT, &tsk->trace);
467}
468
469static inline int test_tsk_trace_graph(struct task_struct *tsk)
470{
471 return tsk->trace & TSK_TRACE_FL_GRAPH;
472}
252 473
474#endif /* CONFIG_TRACING */
253 475
254#endif /* _LINUX_FTRACE_H */ 476#endif /* _LINUX_FTRACE_H */
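
The function-graph hooks above are registered as a pair -- the return handler first, then the entry handler, per the register_ftrace_graph() prototype. A hedged sketch of a minimal client; the handler names are hypothetical, and the assumption here is that a zero return from the entry handler tells the core not to trace that call:

#include <linux/ftrace.h>

static int my_graph_entry(struct ftrace_graph_ent *ent)
{
	return ent->depth < 3;		/* only follow a few levels deep */
}

static void my_graph_return(struct ftrace_graph_ret *ret)
{
	unsigned long long delta = ret->rettime - ret->calltime;
	/* ... account 'delta' against ret->func ... */
	(void)delta;
}

static int my_tracer_start(void)
{
	return register_ftrace_graph(my_graph_return, my_graph_entry);
}

static void my_tracer_stop(void)
{
	unregister_ftrace_graph();
}
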
diff --git a/include/linux/ftrace_irq.h b/include/linux/ftrace_irq.h
new file mode 100644
index 000000000000..366a054d0b05
--- /dev/null
+++ b/include/linux/ftrace_irq.h
@@ -0,0 +1,13 @@
1#ifndef _LINUX_FTRACE_IRQ_H
2#define _LINUX_FTRACE_IRQ_H
3
4
5#if defined(CONFIG_DYNAMIC_FTRACE) || defined(CONFIG_FUNCTION_GRAPH_TRACER)
6extern void ftrace_nmi_enter(void);
7extern void ftrace_nmi_exit(void);
8#else
9static inline void ftrace_nmi_enter(void) { }
10static inline void ftrace_nmi_exit(void) { }
11#endif
12
13#endif /* _LINUX_FTRACE_IRQ_H */
diff --git a/include/linux/futex.h b/include/linux/futex.h
index 586ab56a3ec3..8f627b9ae2b1 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -164,6 +164,8 @@ union futex_key {
164 } both; 164 } both;
165}; 165};
166 166
167#define FUTEX_KEY_INIT (union futex_key) { .both = { .ptr = NULL } }
168
167#ifdef CONFIG_FUTEX 169#ifdef CONFIG_FUTEX
168extern void exit_robust_list(struct task_struct *curr); 170extern void exit_robust_list(struct task_struct *curr);
169extern void exit_pi_state_list(struct task_struct *curr); 171extern void exit_pi_state_list(struct task_struct *curr);
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 181006cc94a0..89a56d79e4c6 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -4,6 +4,7 @@
4#include <linux/preempt.h> 4#include <linux/preempt.h>
5#include <linux/smp_lock.h> 5#include <linux/smp_lock.h>
6#include <linux/lockdep.h> 6#include <linux/lockdep.h>
7#include <linux/ftrace_irq.h>
7#include <asm/hardirq.h> 8#include <asm/hardirq.h>
8#include <asm/system.h> 9#include <asm/system.h>
9 10
@@ -161,7 +162,17 @@ extern void irq_enter(void);
161 */ 162 */
162extern void irq_exit(void); 163extern void irq_exit(void);
163 164
164#define nmi_enter() do { lockdep_off(); __irq_enter(); } while (0) 165#define nmi_enter() \
165#define nmi_exit() do { __irq_exit(); lockdep_on(); } while (0) 166 do { \
167 ftrace_nmi_enter(); \
168 lockdep_off(); \
169 __irq_enter(); \
170 } while (0)
171#define nmi_exit() \
172 do { \
173 __irq_exit(); \
174 lockdep_on(); \
175 ftrace_nmi_exit(); \
176 } while (0)
166 177
167#endif /* LINUX_HARDIRQ_H */ 178#endif /* LINUX_HARDIRQ_H */
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 48e63934fabe..7e85a6e89e41 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -14,6 +14,8 @@
14#include <linux/irqflags.h> 14#include <linux/irqflags.h>
15#include <linux/smp.h> 15#include <linux/smp.h>
16#include <linux/percpu.h> 16#include <linux/percpu.h>
17#include <linux/irqnr.h>
18
17#include <asm/atomic.h> 19#include <asm/atomic.h>
18#include <asm/ptrace.h> 20#include <asm/ptrace.h>
19#include <asm/system.h> 21#include <asm/system.h>
diff --git a/include/linux/irq.h b/include/linux/irq.h
index ab70fd604d3a..59525b74979f 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -130,6 +130,8 @@ struct irq_chip {
130 const char *typename; 130 const char *typename;
131}; 131};
132 132
133struct timer_rand_state;
134struct irq_2_iommu;
133/** 135/**
134 * struct irq_desc - interrupt descriptor 136 * struct irq_desc - interrupt descriptor
135 * @irq: interrupt number for this descriptor 137 * @irq: interrupt number for this descriptor
@@ -155,6 +157,13 @@ struct irq_chip {
155 */ 157 */
156struct irq_desc { 158struct irq_desc {
157 unsigned int irq; 159 unsigned int irq;
160#ifdef CONFIG_SPARSE_IRQ
161 struct timer_rand_state *timer_rand_state;
162 unsigned int *kstat_irqs;
163# ifdef CONFIG_INTR_REMAP
164 struct irq_2_iommu *irq_2_iommu;
165# endif
166#endif
158 irq_flow_handler_t handle_irq; 167 irq_flow_handler_t handle_irq;
159 struct irq_chip *chip; 168 struct irq_chip *chip;
160 struct msi_desc *msi_desc; 169 struct msi_desc *msi_desc;
@@ -182,14 +191,43 @@ struct irq_desc {
182 const char *name; 191 const char *name;
183} ____cacheline_internodealigned_in_smp; 192} ____cacheline_internodealigned_in_smp;
184 193
194extern void early_irq_init(void);
195extern void arch_early_irq_init(void);
196extern void arch_init_chip_data(struct irq_desc *desc, int cpu);
197extern void arch_init_copy_chip_data(struct irq_desc *old_desc,
198 struct irq_desc *desc, int cpu);
199extern void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc);
185 200
201#ifndef CONFIG_SPARSE_IRQ
186extern struct irq_desc irq_desc[NR_IRQS]; 202extern struct irq_desc irq_desc[NR_IRQS];
187 203
188static inline struct irq_desc *irq_to_desc(unsigned int irq) 204static inline struct irq_desc *irq_to_desc(unsigned int irq)
189{ 205{
190 return (irq < nr_irqs) ? irq_desc + irq : NULL; 206 return (irq < NR_IRQS) ? irq_desc + irq : NULL;
207}
208static inline struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
209{
210 return irq_to_desc(irq);
191} 211}
192 212
213#else
214
215extern struct irq_desc *irq_to_desc(unsigned int irq);
216extern struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu);
217extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu);
218
219# define for_each_irq_desc(irq, desc) \
220 for (irq = 0, desc = irq_to_desc(irq); irq < nr_irqs; irq++, desc = irq_to_desc(irq))
221# define for_each_irq_desc_reverse(irq, desc) \
222 for (irq = nr_irqs - 1, desc = irq_to_desc(irq); irq >= 0; irq--, desc = irq_to_desc(irq))
223
224#define kstat_irqs_this_cpu(DESC) \
225 ((DESC)->kstat_irqs[smp_processor_id()])
226#define kstat_incr_irqs_this_cpu(irqno, DESC) \
227 ((DESC)->kstat_irqs[smp_processor_id()]++)
228
229#endif
230
193/* 231/*
194 * Migration helpers for obsolete names, they will go away: 232 * Migration helpers for obsolete names, they will go away:
195 */ 233 */
@@ -381,6 +419,11 @@ extern int set_irq_msi(unsigned int irq, struct msi_desc *entry);
381#define get_irq_data(irq) (irq_to_desc(irq)->handler_data) 419#define get_irq_data(irq) (irq_to_desc(irq)->handler_data)
382#define get_irq_msi(irq) (irq_to_desc(irq)->msi_desc) 420#define get_irq_msi(irq) (irq_to_desc(irq)->msi_desc)
383 421
422#define get_irq_desc_chip(desc) ((desc)->chip)
423#define get_irq_desc_chip_data(desc) ((desc)->chip_data)
424#define get_irq_desc_data(desc) ((desc)->handler_data)
425#define get_irq_desc_msi(desc) ((desc)->msi_desc)
426
384#endif /* CONFIG_GENERIC_HARDIRQS */ 427#endif /* CONFIG_GENERIC_HARDIRQS */
385 428
386#endif /* !CONFIG_S390 */ 429#endif /* !CONFIG_S390 */
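
Under CONFIG_SPARSE_IRQ, irq_to_desc() may return NULL for an irq number that has no descriptor yet, so generic code has to tolerate holes -- exactly what the xen/events.c and fs/proc/stat.c hunks above add. A minimal sketch of the defensive pattern (some_irq is a placeholder):

	struct irq_desc *desc;
	int irq;

	for_each_irq_desc(irq, desc) {
		if (!desc)		/* hole in the sparse irq space */
			continue;
		/* ... operate on desc ... */
	}

	/* one-off lookups follow the same rule */
	desc = irq_to_desc(some_irq);
	if (!desc)
		return;			/* nothing allocated for this irq yet */
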
diff --git a/include/linux/irqnr.h b/include/linux/irqnr.h
index 452c280c8115..95d2b74641f5 100644
--- a/include/linux/irqnr.h
+++ b/include/linux/irqnr.h
@@ -1,24 +1,38 @@
1#ifndef _LINUX_IRQNR_H 1#ifndef _LINUX_IRQNR_H
2#define _LINUX_IRQNR_H 2#define _LINUX_IRQNR_H
3 3
4/*
5 * Generic irq_desc iterators:
6 */
7#ifdef __KERNEL__
8
4#ifndef CONFIG_GENERIC_HARDIRQS 9#ifndef CONFIG_GENERIC_HARDIRQS
5#include <asm/irq.h> 10#include <asm/irq.h>
6# define nr_irqs NR_IRQS 11# define nr_irqs NR_IRQS
7 12
8# define for_each_irq_desc(irq, desc) \ 13# define for_each_irq_desc(irq, desc) \
9 for (irq = 0; irq < nr_irqs; irq++) 14 for (irq = 0; irq < nr_irqs; irq++)
15
16# define for_each_irq_desc_reverse(irq, desc) \
17 for (irq = nr_irqs - 1; irq >= 0; irq--)
10#else 18#else
19
11extern int nr_irqs; 20extern int nr_irqs;
12 21
22#ifndef CONFIG_SPARSE_IRQ
23
24struct irq_desc;
13# define for_each_irq_desc(irq, desc) \ 25# define for_each_irq_desc(irq, desc) \
14 for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc++) 26 for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc++)
15 27# define for_each_irq_desc_reverse(irq, desc) \
16# define for_each_irq_desc_reverse(irq, desc) \ 28 for (irq = nr_irqs - 1, desc = irq_desc + (nr_irqs - 1); \
17 for (irq = nr_irqs - 1, desc = irq_desc + (nr_irqs - 1); \ 29 irq >= 0; irq--, desc--)
18 irq >= 0; irq--, desc--) 30#endif
19#endif 31#endif
20 32
21#define for_each_irq_nr(irq) \ 33#define for_each_irq_nr(irq) \
22 for (irq = 0; irq < nr_irqs; irq++) 34 for (irq = 0; irq < nr_irqs; irq++)
35
36#endif /* __KERNEL__ */
23 37
24#endif 38#endif
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index dc7e0d0a6474..269df5a17b30 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -141,6 +141,15 @@ extern int _cond_resched(void);
141 (__x < 0) ? -__x : __x; \ 141 (__x < 0) ? -__x : __x; \
142 }) 142 })
143 143
144#ifdef CONFIG_PROVE_LOCKING
145void might_fault(void);
146#else
147static inline void might_fault(void)
148{
149 might_sleep();
150}
151#endif
152
144extern struct atomic_notifier_head panic_notifier_list; 153extern struct atomic_notifier_head panic_notifier_list;
145extern long (*panic_blink)(long time); 154extern long (*panic_blink)(long time);
146NORET_TYPE void panic(const char * fmt, ...) 155NORET_TYPE void panic(const char * fmt, ...)
@@ -188,6 +197,8 @@ extern unsigned long long memparse(const char *ptr, char **retptr);
188extern int core_kernel_text(unsigned long addr); 197extern int core_kernel_text(unsigned long addr);
189extern int __kernel_text_address(unsigned long addr); 198extern int __kernel_text_address(unsigned long addr);
190extern int kernel_text_address(unsigned long addr); 199extern int kernel_text_address(unsigned long addr);
200extern int func_ptr_is_kernel_text(void *ptr);
201
191struct pid; 202struct pid;
192extern struct pid *session_of_pgrp(struct pid *pgrp); 203extern struct pid *session_of_pgrp(struct pid *pgrp);
193 204
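
might_fault() marks code that may take a page fault on user memory: with CONFIG_PROVE_LOCKING the out-of-line version declared above can perform extra checking, otherwise it simply falls back to might_sleep(). A hedged sketch of where such an annotation sits (the wrapper is illustrative):

static unsigned long copy_value_to_user(void __user *dst, const void *src,
					unsigned long n)
{
	might_fault();		/* we may sleep while servicing the fault */
	return __copy_to_user(dst, src, n);
}
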
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 4a145caeee07..4ee4b3d2316f 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -28,7 +28,9 @@ struct cpu_usage_stat {
28 28
29struct kernel_stat { 29struct kernel_stat {
30 struct cpu_usage_stat cpustat; 30 struct cpu_usage_stat cpustat;
31 unsigned int irqs[NR_IRQS]; 31#ifndef CONFIG_SPARSE_IRQ
32 unsigned int irqs[NR_IRQS];
33#endif
32}; 34};
33 35
34DECLARE_PER_CPU(struct kernel_stat, kstat); 36DECLARE_PER_CPU(struct kernel_stat, kstat);
@@ -39,6 +41,10 @@ DECLARE_PER_CPU(struct kernel_stat, kstat);
39 41
40extern unsigned long long nr_context_switches(void); 42extern unsigned long long nr_context_switches(void);
41 43
44#ifndef CONFIG_SPARSE_IRQ
45#define kstat_irqs_this_cpu(irq) \
46 (kstat_this_cpu.irqs[irq])
47
42struct irq_desc; 48struct irq_desc;
43 49
44static inline void kstat_incr_irqs_this_cpu(unsigned int irq, 50static inline void kstat_incr_irqs_this_cpu(unsigned int irq,
@@ -46,11 +52,17 @@ static inline void kstat_incr_irqs_this_cpu(unsigned int irq,
46{ 52{
47 kstat_this_cpu.irqs[irq]++; 53 kstat_this_cpu.irqs[irq]++;
48} 54}
55#endif
56
49 57
58#ifndef CONFIG_SPARSE_IRQ
50static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) 59static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
51{ 60{
52 return kstat_cpu(cpu).irqs[irq]; 61 return kstat_cpu(cpu).irqs[irq];
53} 62}
63#else
64extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu);
65#endif
54 66
55/* 67/*
56 * Number of interrupts per specific IRQ source, since bootup 68 * Number of interrupts per specific IRQ source, since bootup
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 29aec6e10020..8956daf64abd 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -73,6 +73,8 @@ struct lock_class_key {
73 struct lockdep_subclass_key subkeys[MAX_LOCKDEP_SUBCLASSES]; 73 struct lockdep_subclass_key subkeys[MAX_LOCKDEP_SUBCLASSES];
74}; 74};
75 75
76#define LOCKSTAT_POINTS 4
77
76/* 78/*
77 * The lock-class itself: 79 * The lock-class itself:
78 */ 80 */
@@ -119,7 +121,8 @@ struct lock_class {
119 int name_version; 121 int name_version;
120 122
121#ifdef CONFIG_LOCK_STAT 123#ifdef CONFIG_LOCK_STAT
122 unsigned long contention_point[4]; 124 unsigned long contention_point[LOCKSTAT_POINTS];
125 unsigned long contending_point[LOCKSTAT_POINTS];
123#endif 126#endif
124}; 127};
125 128
@@ -144,6 +147,7 @@ enum bounce_type {
144 147
145struct lock_class_stats { 148struct lock_class_stats {
146 unsigned long contention_point[4]; 149 unsigned long contention_point[4];
150 unsigned long contending_point[4];
147 struct lock_time read_waittime; 151 struct lock_time read_waittime;
148 struct lock_time write_waittime; 152 struct lock_time write_waittime;
149 struct lock_time read_holdtime; 153 struct lock_time read_holdtime;
@@ -165,6 +169,7 @@ struct lockdep_map {
165 const char *name; 169 const char *name;
166#ifdef CONFIG_LOCK_STAT 170#ifdef CONFIG_LOCK_STAT
167 int cpu; 171 int cpu;
172 unsigned long ip;
168#endif 173#endif
169}; 174};
170 175
@@ -356,7 +361,7 @@ struct lock_class_key { };
356#ifdef CONFIG_LOCK_STAT 361#ifdef CONFIG_LOCK_STAT
357 362
358extern void lock_contended(struct lockdep_map *lock, unsigned long ip); 363extern void lock_contended(struct lockdep_map *lock, unsigned long ip);
359extern void lock_acquired(struct lockdep_map *lock); 364extern void lock_acquired(struct lockdep_map *lock, unsigned long ip);
360 365
361#define LOCK_CONTENDED(_lock, try, lock) \ 366#define LOCK_CONTENDED(_lock, try, lock) \
362do { \ 367do { \
@@ -364,13 +369,13 @@ do { \
364 lock_contended(&(_lock)->dep_map, _RET_IP_); \ 369 lock_contended(&(_lock)->dep_map, _RET_IP_); \
365 lock(_lock); \ 370 lock(_lock); \
366 } \ 371 } \
367 lock_acquired(&(_lock)->dep_map); \ 372 lock_acquired(&(_lock)->dep_map, _RET_IP_); \
368} while (0) 373} while (0)
369 374
370#else /* CONFIG_LOCK_STAT */ 375#else /* CONFIG_LOCK_STAT */
371 376
372#define lock_contended(lockdep_map, ip) do {} while (0) 377#define lock_contended(lockdep_map, ip) do {} while (0)
373#define lock_acquired(lockdep_map) do {} while (0) 378#define lock_acquired(lockdep_map, ip) do {} while (0)
374 379
375#define LOCK_CONTENDED(_lock, try, lock) \ 380#define LOCK_CONTENDED(_lock, try, lock) \
376 lock(_lock) 381 lock(_lock)
@@ -481,4 +486,22 @@ static inline void print_irqtrace_events(struct task_struct *curr)
481# define lock_map_release(l) do { } while (0) 486# define lock_map_release(l) do { } while (0)
482#endif 487#endif
483 488
489#ifdef CONFIG_PROVE_LOCKING
490# define might_lock(lock) \
491do { \
492 typecheck(struct lockdep_map *, &(lock)->dep_map); \
493 lock_acquire(&(lock)->dep_map, 0, 0, 0, 2, NULL, _THIS_IP_); \
494 lock_release(&(lock)->dep_map, 0, _THIS_IP_); \
495} while (0)
496# define might_lock_read(lock) \
497do { \
498 typecheck(struct lockdep_map *, &(lock)->dep_map); \
499 lock_acquire(&(lock)->dep_map, 0, 0, 1, 2, NULL, _THIS_IP_); \
500 lock_release(&(lock)->dep_map, 0, _THIS_IP_); \
501} while (0)
502#else
503# define might_lock(lock) do { } while (0)
504# define might_lock_read(lock) do { } while (0)
505#endif
506
484#endif /* __LINUX_LOCKDEP_H */ 507#endif /* __LINUX_LOCKDEP_H */
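
might_lock() and might_lock_read() let a function that only occasionally takes a lock still report the dependency to lockdep on every call, via a dummy acquire/release on the lock's dep_map. A minimal sketch (the cache structure and its rwsem are hypothetical):

struct my_cache {
	struct rw_semaphore	lock;
	unsigned long		last_key;
	void			*last_obj;
};

static void *cache_lookup(struct my_cache *c, unsigned long key)
{
	void *obj = NULL;

	/* record the read-lock dependency even when the fast path wins */
	might_lock_read(&c->lock);

	if (key == c->last_key)
		return c->last_obj;	/* lock-free fast path */

	down_read(&c->lock);
	/* ... slow-path lookup into 'obj' ... */
	up_read(&c->lock);
	return obj;
}
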
diff --git a/include/linux/marker.h b/include/linux/marker.h
index 889196c7fbb1..b85e74ca782f 100644
--- a/include/linux/marker.h
+++ b/include/linux/marker.h
@@ -12,6 +12,7 @@
12 * See the file COPYING for more details. 12 * See the file COPYING for more details.
13 */ 13 */
14 14
15#include <stdarg.h>
15#include <linux/types.h> 16#include <linux/types.h>
16 17
17struct module; 18struct module;
@@ -48,10 +49,28 @@ struct marker {
48 void (*call)(const struct marker *mdata, void *call_private, ...); 49 void (*call)(const struct marker *mdata, void *call_private, ...);
49 struct marker_probe_closure single; 50 struct marker_probe_closure single;
50 struct marker_probe_closure *multi; 51 struct marker_probe_closure *multi;
52 const char *tp_name; /* Optional tracepoint name */
53 void *tp_cb; /* Optional tracepoint callback */
51} __attribute__((aligned(8))); 54} __attribute__((aligned(8)));
52 55
53#ifdef CONFIG_MARKERS 56#ifdef CONFIG_MARKERS
54 57
58#define _DEFINE_MARKER(name, tp_name_str, tp_cb, format) \
59 static const char __mstrtab_##name[] \
60 __attribute__((section("__markers_strings"))) \
61 = #name "\0" format; \
62 static struct marker __mark_##name \
63 __attribute__((section("__markers"), aligned(8))) = \
64 { __mstrtab_##name, &__mstrtab_##name[sizeof(#name)], \
65 0, 0, marker_probe_cb, { __mark_empty_function, NULL},\
66 NULL, tp_name_str, tp_cb }
67
68#define DEFINE_MARKER(name, format) \
69 _DEFINE_MARKER(name, NULL, NULL, format)
70
71#define DEFINE_MARKER_TP(name, tp_name, tp_cb, format) \
72 _DEFINE_MARKER(name, #tp_name, tp_cb, format)
73
55/* 74/*
56 * Note : the empty asm volatile with read constraint is used here instead of a 75 * Note : the empty asm volatile with read constraint is used here instead of a
57 * "used" attribute to fix a gcc 4.1.x bug. 76 * "used" attribute to fix a gcc 4.1.x bug.
@@ -65,14 +84,7 @@ struct marker {
65 */ 84 */
66#define __trace_mark(generic, name, call_private, format, args...) \ 85#define __trace_mark(generic, name, call_private, format, args...) \
67 do { \ 86 do { \
68 static const char __mstrtab_##name[] \ 87 DEFINE_MARKER(name, format); \
69 __attribute__((section("__markers_strings"))) \
70 = #name "\0" format; \
71 static struct marker __mark_##name \
72 __attribute__((section("__markers"), aligned(8))) = \
73 { __mstrtab_##name, &__mstrtab_##name[sizeof(#name)], \
74 0, 0, marker_probe_cb, \
75 { __mark_empty_function, NULL}, NULL }; \
76 __mark_check_format(format, ## args); \ 88 __mark_check_format(format, ## args); \
77 if (unlikely(__mark_##name.state)) { \ 89 if (unlikely(__mark_##name.state)) { \
78 (*__mark_##name.call) \ 90 (*__mark_##name.call) \
@@ -80,14 +92,39 @@ struct marker {
80 } \ 92 } \
81 } while (0) 93 } while (0)
82 94
95#define __trace_mark_tp(name, call_private, tp_name, tp_cb, format, args...) \
96 do { \
97 void __check_tp_type(void) \
98 { \
99 register_trace_##tp_name(tp_cb); \
100 } \
101 DEFINE_MARKER_TP(name, tp_name, tp_cb, format); \
102 __mark_check_format(format, ## args); \
103 (*__mark_##name.call)(&__mark_##name, call_private, \
104 ## args); \
105 } while (0)
106
83extern void marker_update_probe_range(struct marker *begin, 107extern void marker_update_probe_range(struct marker *begin,
84 struct marker *end); 108 struct marker *end);
109
110#define GET_MARKER(name) (__mark_##name)
111
85#else /* !CONFIG_MARKERS */ 112#else /* !CONFIG_MARKERS */
113#define DEFINE_MARKER(name, tp_name, tp_cb, format)
86#define __trace_mark(generic, name, call_private, format, args...) \ 114#define __trace_mark(generic, name, call_private, format, args...) \
87 __mark_check_format(format, ## args) 115 __mark_check_format(format, ## args)
116#define __trace_mark_tp(name, call_private, tp_name, tp_cb, format, args...) \
117 do { \
118 void __check_tp_type(void) \
119 { \
120 register_trace_##tp_name(tp_cb); \
121 } \
122 __mark_check_format(format, ## args); \
123 } while (0)
88static inline void marker_update_probe_range(struct marker *begin, 124static inline void marker_update_probe_range(struct marker *begin,
89 struct marker *end) 125 struct marker *end)
90{ } 126{ }
127#define GET_MARKER(name)
91#endif /* CONFIG_MARKERS */ 128#endif /* CONFIG_MARKERS */
92 129
93/** 130/**
@@ -117,6 +154,20 @@ static inline void marker_update_probe_range(struct marker *begin,
117 __trace_mark(1, name, NULL, format, ## args) 154 __trace_mark(1, name, NULL, format, ## args)
118 155
119/** 156/**
157 * trace_mark_tp - Marker in a tracepoint callback
158 * @name: marker name, not quoted.
159 * @tp_name: tracepoint name, not quoted.
160 * @tp_cb: tracepoint callback. Should have an associated global symbol so it
161 * is not optimized away by the compiler (should not be static).
162 * @format: format string
163 * @args...: variable argument list
164 *
165 * Places a marker in a tracepoint callback.
166 */
167#define trace_mark_tp(name, tp_name, tp_cb, format, args...) \
168 __trace_mark_tp(name, NULL, tp_name, tp_cb, format, ## args)
169
170/**
120 * MARK_NOARGS - Format string for a marker with no argument. 171 * MARK_NOARGS - Format string for a marker with no argument.
121 */ 172 */
122#define MARK_NOARGS " " 173#define MARK_NOARGS " "
@@ -136,8 +187,6 @@ extern marker_probe_func __mark_empty_function;
136 187
137extern void marker_probe_cb(const struct marker *mdata, 188extern void marker_probe_cb(const struct marker *mdata,
138 void *call_private, ...); 189 void *call_private, ...);
139extern void marker_probe_cb_noarg(const struct marker *mdata,
140 void *call_private, ...);
141 190
142/* 191/*
143 * Connect a probe to a marker. 192 * Connect a probe to a marker.
@@ -162,8 +211,10 @@ extern void *marker_get_private_data(const char *name, marker_probe_func *probe,
162 211
163/* 212/*
164 * marker_synchronize_unregister must be called between the last marker probe 213 * marker_synchronize_unregister must be called between the last marker probe
165 * unregistration and the end of module exit to make sure there is no caller 214 * unregistration and the first one of
166 * executing a probe when it is freed. 215 * - the end of module exit function
216 * - the free of any resource used by the probes
217 * to ensure the code and data are valid for any possibly running probes.
167 */ 218 */
168#define marker_synchronize_unregister() synchronize_sched() 219#define marker_synchronize_unregister() synchronize_sched()
169 220
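Following the trace_mark_tp() kerneldoc above, the marker is placed inside the tracepoint callback itself, and the callback is passed as @tp_cb so the hidden __check_tp_type() nested function can type-check it against register_trace_##tp_name(). A hedged sketch against the sched_wakeup tracepoint from include/trace/sched.h; the marker name and probe function are made up:

#include <linux/marker.h>
#include <trace/sched.h>

/* Not static: trace_mark_tp() expects tp_cb to have a global symbol. */
void probe_sched_wakeup(struct rq *rq, struct task_struct *p)
{
        trace_mark_tp(my_sched_wakeup, sched_wakeup, probe_sched_wakeup,
                      "pid %d", p->pid);
}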
diff --git a/include/linux/msi.h b/include/linux/msi.h
index 8f2939227207..d2b8a1e8ca11 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -10,8 +10,11 @@ struct msi_msg {
10}; 10};
11 11
12/* Helper functions */ 12/* Helper functions */
13struct irq_desc;
13extern void mask_msi_irq(unsigned int irq); 14extern void mask_msi_irq(unsigned int irq);
14extern void unmask_msi_irq(unsigned int irq); 15extern void unmask_msi_irq(unsigned int irq);
16extern void read_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg);
17extern void write_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg);
15extern void read_msi_msg(unsigned int irq, struct msi_msg *msg); 18extern void read_msi_msg(unsigned int irq, struct msi_msg *msg);
16extern void write_msi_msg(unsigned int irq, struct msi_msg *msg); 19extern void write_msi_msg(unsigned int irq, struct msi_msg *msg);
17 20
diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index bc6da10ceee0..7a0e5c4f8072 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -144,6 +144,8 @@ extern int __must_check mutex_lock_killable(struct mutex *lock);
144/* 144/*
145 * NOTE: mutex_trylock() follows the spin_trylock() convention, 145 * NOTE: mutex_trylock() follows the spin_trylock() convention,
146 * not the down_trylock() convention! 146 * not the down_trylock() convention!
147 *
148 * Returns 1 if the mutex has been acquired successfully, and 0 on contention.
147 */ 149 */
148extern int mutex_trylock(struct mutex *lock); 150extern int mutex_trylock(struct mutex *lock);
149extern void mutex_unlock(struct mutex *lock); 151extern void mutex_unlock(struct mutex *lock);
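The comment added above nails down the return convention, which is easy to get backwards coming from down_trylock(). A small sketch; the device structure is hypothetical:

#include <linux/mutex.h>

struct my_dev {                         /* hypothetical */
        struct mutex    lock;
        unsigned int    kicks;
};

static void my_dev_kick_if_idle(struct my_dev *dev)
{
        /* 1 == acquired (spin_trylock convention), 0 == contention */
        if (!mutex_trylock(&dev->lock))
                return;
        dev->kicks++;
        mutex_unlock(&dev->lock);
}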
diff --git a/include/linux/pid.h b/include/linux/pid.h
index d7e98ff8021e..bb206c56d1f0 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -147,9 +147,9 @@ pid_t pid_vnr(struct pid *pid);
147#define do_each_pid_task(pid, type, task) \ 147#define do_each_pid_task(pid, type, task) \
148 do { \ 148 do { \
149 struct hlist_node *pos___; \ 149 struct hlist_node *pos___; \
150 if (pid != NULL) \ 150 if ((pid) != NULL) \
151 hlist_for_each_entry_rcu((task), pos___, \ 151 hlist_for_each_entry_rcu((task), pos___, \
152 &pid->tasks[type], pids[type].node) { 152 &(pid)->tasks[type], pids[type].node) {
153 153
154 /* 154 /*
155 * Both old and new leaders may be attached to 155 * Both old and new leaders may be attached to
diff --git a/include/linux/random.h b/include/linux/random.h
index 36f125c0c603..adbf3bd3c6b3 100644
--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -8,6 +8,7 @@
8#define _LINUX_RANDOM_H 8#define _LINUX_RANDOM_H
9 9
10#include <linux/ioctl.h> 10#include <linux/ioctl.h>
11#include <linux/irqnr.h>
11 12
12/* ioctl()'s for the random number generator */ 13/* ioctl()'s for the random number generator */
13 14
@@ -44,6 +45,56 @@ struct rand_pool_info {
44 45
45extern void rand_initialize_irq(int irq); 46extern void rand_initialize_irq(int irq);
46 47
48struct timer_rand_state;
49#ifndef CONFIG_SPARSE_IRQ
50
51extern struct timer_rand_state *irq_timer_state[];
52
53static inline struct timer_rand_state *get_timer_rand_state(unsigned int irq)
54{
55 if (irq >= nr_irqs)
56 return NULL;
57
58 return irq_timer_state[irq];
59}
60
61static inline void set_timer_rand_state(unsigned int irq, struct timer_rand_state *state)
62{
63 if (irq >= nr_irqs)
64 return;
65
66 irq_timer_state[irq] = state;
67}
68
69#else
70
71#include <linux/irq.h>
72static inline struct timer_rand_state *get_timer_rand_state(unsigned int irq)
73{
74 struct irq_desc *desc;
75
76 desc = irq_to_desc(irq);
77
78 if (!desc)
79 return NULL;
80
81 return desc->timer_rand_state;
82}
83
84static inline void set_timer_rand_state(unsigned int irq, struct timer_rand_state *state)
85{
86 struct irq_desc *desc;
87
88 desc = irq_to_desc(irq);
89
90 if (!desc)
91 return;
92
93 desc->timer_rand_state = state;
94}
95#endif
96
97
47extern void add_input_randomness(unsigned int type, unsigned int code, 98extern void add_input_randomness(unsigned int type, unsigned int code,
48 unsigned int value); 99 unsigned int value);
49extern void add_interrupt_randomness(int irq); 100extern void add_interrupt_randomness(int irq);
diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h
index 5f89b62e6983..301dda829e37 100644
--- a/include/linux/rcuclassic.h
+++ b/include/linux/rcuclassic.h
@@ -41,7 +41,7 @@
41#include <linux/seqlock.h> 41#include <linux/seqlock.h>
42 42
43#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 43#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
44#define RCU_SECONDS_TILL_STALL_CHECK ( 3 * HZ) /* for rcp->jiffies_stall */ 44#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ) /* for rcp->jiffies_stall */
45#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rcp->jiffies_stall */ 45#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rcp->jiffies_stall */
46#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 46#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
47 47
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 86f1f5e43e33..895dc9c1088c 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -142,6 +142,7 @@ struct rcu_head {
142 * on the write-side to insure proper synchronization. 142 * on the write-side to insure proper synchronization.
143 */ 143 */
144#define rcu_read_lock_sched() preempt_disable() 144#define rcu_read_lock_sched() preempt_disable()
145#define rcu_read_lock_sched_notrace() preempt_disable_notrace()
145 146
146/* 147/*
147 * rcu_read_unlock_sched - marks the end of a RCU-classic critical section 148 * rcu_read_unlock_sched - marks the end of a RCU-classic critical section
@@ -149,6 +150,7 @@ struct rcu_head {
149 * See rcu_read_lock_sched for more information. 150 * See rcu_read_lock_sched for more information.
150 */ 151 */
151#define rcu_read_unlock_sched() preempt_enable() 152#define rcu_read_unlock_sched() preempt_enable()
153#define rcu_read_unlock_sched_notrace() preempt_enable_notrace()
152 154
153 155
154 156
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index e097c2e6b6dc..d363467c8f13 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -28,17 +28,19 @@ struct ring_buffer_event {
28 * size = 8 bytes 28 * size = 8 bytes
29 * 29 *
30 * @RINGBUF_TYPE_TIME_STAMP: Sync time stamp with external clock 30 * @RINGBUF_TYPE_TIME_STAMP: Sync time stamp with external clock
31 * array[0] = tv_nsec 31 * array[0] = tv_nsec
32 * array[1] = tv_sec 32 * array[1..2] = tv_sec
33 * size = 16 bytes 33 * size = 16 bytes
34 * 34 *
35 * @RINGBUF_TYPE_DATA: Data record 35 * @RINGBUF_TYPE_DATA: Data record
36 * If len is zero: 36 * If len is zero:
37 * array[0] holds the actual length 37 * array[0] holds the actual length
38 * array[1..(length+3)/4-1] holds data 38 * array[1..(length+3)/4] holds data
39 * size = 4 + 4 + length (bytes)
39 * else 40 * else
40 * length = len << 2 41 * length = len << 2
41 * array[0..(length+3)/4] holds data 42 * array[0..(length+3)/4-1] holds data
43 * size = 4 + length (bytes)
42 */ 44 */
43enum ring_buffer_type { 45enum ring_buffer_type {
44 RINGBUF_TYPE_PADDING, 46 RINGBUF_TYPE_PADDING,
@@ -122,6 +124,12 @@ void ring_buffer_normalize_time_stamp(int cpu, u64 *ts);
122 124
123void tracing_on(void); 125void tracing_on(void);
124void tracing_off(void); 126void tracing_off(void);
127void tracing_off_permanent(void);
128
129void *ring_buffer_alloc_read_page(struct ring_buffer *buffer);
130void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data);
131int ring_buffer_read_page(struct ring_buffer *buffer,
132 void **data_page, int cpu, int full);
125 133
126enum ring_buffer_flags { 134enum ring_buffer_flags {
127 RB_FL_OVERWRITE = 1 << 0, 135 RB_FL_OVERWRITE = 1 << 0,
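The alloc/read/free reader-page API added above suggests a per-CPU consumer along the following lines. This is only a sketch: the header does not document the return value of ring_buffer_read_page() or the exact meaning of full, so both are treated as assumptions here.

#include <linux/ring_buffer.h>

static void my_read_one_page(struct ring_buffer *buffer, int cpu)
{
        void *page = ring_buffer_alloc_read_page(buffer);
        int ret;

        if (!page)
                return;
        /* full=1: assumed to mean "only hand back a completely filled page" */
        ret = ring_buffer_read_page(buffer, &page, cpu, 1);
        if (ret >= 0) {
                /* walk the ring_buffer_event records in 'page' here */
        }
        ring_buffer_free_read_page(buffer, page);
}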
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 55e30d114477..4240f6bfa812 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -96,6 +96,7 @@ struct exec_domain;
96struct futex_pi_state; 96struct futex_pi_state;
97struct robust_list_head; 97struct robust_list_head;
98struct bio; 98struct bio;
99struct bts_tracer;
99 100
100/* 101/*
101 * List of flags we want to share for kernel threads, 102 * List of flags we want to share for kernel threads,
@@ -249,7 +250,7 @@ extern void init_idle_bootup_task(struct task_struct *idle);
249extern int runqueue_is_locked(void); 250extern int runqueue_is_locked(void);
250extern void task_rq_unlock_wait(struct task_struct *p); 251extern void task_rq_unlock_wait(struct task_struct *p);
251 252
252extern cpumask_t nohz_cpu_mask; 253extern cpumask_var_t nohz_cpu_mask;
253#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) 254#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
254extern int select_nohz_load_balancer(int cpu); 255extern int select_nohz_load_balancer(int cpu);
255#else 256#else
@@ -259,8 +260,6 @@ static inline int select_nohz_load_balancer(int cpu)
259} 260}
260#endif 261#endif
261 262
262extern unsigned long rt_needs_cpu(int cpu);
263
264/* 263/*
265 * Only dump TASK_* tasks. (0 for all tasks) 264 * Only dump TASK_* tasks. (0 for all tasks)
266 */ 265 */
@@ -777,7 +776,6 @@ enum cpu_idle_type {
777 776
778struct sched_group { 777struct sched_group {
779 struct sched_group *next; /* Must be a circular list */ 778 struct sched_group *next; /* Must be a circular list */
780 cpumask_t cpumask;
781 779
782 /* 780 /*
783 * CPU power of this group, SCHED_LOAD_SCALE being max power for a 781 * CPU power of this group, SCHED_LOAD_SCALE being max power for a
@@ -790,8 +788,15 @@ struct sched_group {
790 * (see include/linux/reciprocal_div.h) 788 * (see include/linux/reciprocal_div.h)
791 */ 789 */
792 u32 reciprocal_cpu_power; 790 u32 reciprocal_cpu_power;
791
792 unsigned long cpumask[];
793}; 793};
794 794
795static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
796{
797 return to_cpumask(sg->cpumask);
798}
799
795enum sched_domain_level { 800enum sched_domain_level {
796 SD_LV_NONE = 0, 801 SD_LV_NONE = 0,
797 SD_LV_SIBLING, 802 SD_LV_SIBLING,
@@ -815,7 +820,6 @@ struct sched_domain {
815 struct sched_domain *parent; /* top domain must be null terminated */ 820 struct sched_domain *parent; /* top domain must be null terminated */
816 struct sched_domain *child; /* bottom domain must be null terminated */ 821 struct sched_domain *child; /* bottom domain must be null terminated */
817 struct sched_group *groups; /* the balancing groups of the domain */ 822 struct sched_group *groups; /* the balancing groups of the domain */
818 cpumask_t span; /* span of all CPUs in this domain */
819 unsigned long min_interval; /* Minimum balance interval ms */ 823 unsigned long min_interval; /* Minimum balance interval ms */
820 unsigned long max_interval; /* Maximum balance interval ms */ 824 unsigned long max_interval; /* Maximum balance interval ms */
821 unsigned int busy_factor; /* less balancing by factor if busy */ 825 unsigned int busy_factor; /* less balancing by factor if busy */
@@ -870,9 +874,17 @@ struct sched_domain {
870#ifdef CONFIG_SCHED_DEBUG 874#ifdef CONFIG_SCHED_DEBUG
871 char *name; 875 char *name;
872#endif 876#endif
877
878 /* span of all CPUs in this domain */
879 unsigned long span[];
873}; 880};
874 881
875extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, 882static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
883{
884 return to_cpumask(sd->span);
885}
886
887extern void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
876 struct sched_domain_attr *dattr_new); 888 struct sched_domain_attr *dattr_new);
877extern int arch_reinit_sched_domains(void); 889extern int arch_reinit_sched_domains(void);
878 890
@@ -881,7 +893,7 @@ extern int arch_reinit_sched_domains(void);
881struct sched_domain_attr; 893struct sched_domain_attr;
882 894
883static inline void 895static inline void
884partition_sched_domains(int ndoms_new, cpumask_t *doms_new, 896partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
885 struct sched_domain_attr *dattr_new) 897 struct sched_domain_attr *dattr_new)
886{ 898{
887} 899}
@@ -963,7 +975,7 @@ struct sched_class {
963 void (*task_wake_up) (struct rq *this_rq, struct task_struct *task); 975 void (*task_wake_up) (struct rq *this_rq, struct task_struct *task);
964 976
965 void (*set_cpus_allowed)(struct task_struct *p, 977 void (*set_cpus_allowed)(struct task_struct *p,
966 const cpumask_t *newmask); 978 const struct cpumask *newmask);
967 979
968 void (*rq_online)(struct rq *rq); 980 void (*rq_online)(struct rq *rq);
969 void (*rq_offline)(struct rq *rq); 981 void (*rq_offline)(struct rq *rq);
@@ -1165,6 +1177,18 @@ struct task_struct {
1165 struct list_head ptraced; 1177 struct list_head ptraced;
1166 struct list_head ptrace_entry; 1178 struct list_head ptrace_entry;
1167 1179
1180#ifdef CONFIG_X86_PTRACE_BTS
1181 /*
1182 * This is the tracer handle for the ptrace BTS extension.
1183 * This field actually belongs to the ptracer task.
1184 */
1185 struct bts_tracer *bts;
1186 /*
1187 * The buffer to hold the BTS data.
1188 */
1189 void *bts_buffer;
1190#endif /* CONFIG_X86_PTRACE_BTS */
1191
1168 /* PID/PID hash table linkage. */ 1192 /* PID/PID hash table linkage. */
1169 struct pid_link pids[PIDTYPE_MAX]; 1193 struct pid_link pids[PIDTYPE_MAX];
1170 struct list_head thread_group; 1194 struct list_head thread_group;
@@ -1356,6 +1380,23 @@ struct task_struct {
1356 unsigned long default_timer_slack_ns; 1380 unsigned long default_timer_slack_ns;
1357 1381
1358 struct list_head *scm_work_list; 1382 struct list_head *scm_work_list;
1383#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1384 /* Index of current stored address in ret_stack */
1385 int curr_ret_stack;
1386 /* Stack of return addresses for return function tracing */
1387 struct ftrace_ret_stack *ret_stack;
1388 /*
1389 * Number of functions that haven't been traced
1390 * because of depth overrun.
1391 */
1392 atomic_t trace_overrun;
1393 /* Pause for the tracing */
1394 atomic_t tracing_graph_pause;
1395#endif
1396#ifdef CONFIG_TRACING
1397 /* state flags for use by tracers */
1398 unsigned long trace;
1399#endif
1359}; 1400};
1360 1401
1361/* 1402/*
@@ -1594,12 +1635,12 @@ extern cputime_t task_gtime(struct task_struct *p);
1594 1635
1595#ifdef CONFIG_SMP 1636#ifdef CONFIG_SMP
1596extern int set_cpus_allowed_ptr(struct task_struct *p, 1637extern int set_cpus_allowed_ptr(struct task_struct *p,
1597 const cpumask_t *new_mask); 1638 const struct cpumask *new_mask);
1598#else 1639#else
1599static inline int set_cpus_allowed_ptr(struct task_struct *p, 1640static inline int set_cpus_allowed_ptr(struct task_struct *p,
1600 const cpumask_t *new_mask) 1641 const struct cpumask *new_mask)
1601{ 1642{
1602 if (!cpu_isset(0, *new_mask)) 1643 if (!cpumask_test_cpu(0, new_mask))
1603 return -EINVAL; 1644 return -EINVAL;
1604 return 0; 1645 return 0;
1605} 1646}
@@ -2212,8 +2253,8 @@ __trace_special(void *__tr, void *__data,
2212} 2253}
2213#endif 2254#endif
2214 2255
2215extern long sched_setaffinity(pid_t pid, const cpumask_t *new_mask); 2256extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
2216extern long sched_getaffinity(pid_t pid, cpumask_t *mask); 2257extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
2217 2258
2218extern int sched_mc_power_savings, sched_smt_power_savings; 2259extern int sched_mc_power_savings, sched_smt_power_savings;
2219 2260
@@ -2224,6 +2265,7 @@ extern void normalize_rt_tasks(void);
2224extern struct task_group init_task_group; 2265extern struct task_group init_task_group;
2225#ifdef CONFIG_USER_SCHED 2266#ifdef CONFIG_USER_SCHED
2226extern struct task_group root_task_group; 2267extern struct task_group root_task_group;
2268extern void set_tg_uid(struct user_struct *user);
2227#endif 2269#endif
2228 2270
2229extern struct task_group *sched_create_group(struct task_group *parent); 2271extern struct task_group *sched_create_group(struct task_group *parent);
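With the cpumask_t members gone and the spans stored as trailing unsigned long arrays, users are expected to go through sched_domain_span() and sched_group_cpus() rather than touching the fields directly. A minimal sketch, assuming the new struct cpumask API (cpumask_weight()) introduced alongside this change:

#include <linux/sched.h>
#include <linux/cpumask.h>

/* How many CPUs does a domain cover, and how many does its first group cover? */
static void my_domain_sizes(struct sched_domain *sd,
                            unsigned int *span_w, unsigned int *group_w)
{
        *span_w  = cpumask_weight(sched_domain_span(sd));
        *group_w = cpumask_weight(sched_group_cpus(sd->groups));
}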
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index dc50bcc282a8..b3dfa72f13b9 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -34,6 +34,7 @@ struct seq_operations {
34 34
35#define SEQ_SKIP 1 35#define SEQ_SKIP 1
36 36
37char *mangle_path(char *s, char *p, char *esc);
37int seq_open(struct file *, const struct seq_operations *); 38int seq_open(struct file *, const struct seq_operations *);
38ssize_t seq_read(struct file *, char __user *, size_t, loff_t *); 39ssize_t seq_read(struct file *, char __user *, size_t, loff_t *);
39loff_t seq_lseek(struct file *, loff_t, int); 40loff_t seq_lseek(struct file *, loff_t, int);
diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h
index b106fd8e0d5c..1a8cecc4f38c 100644
--- a/include/linux/stacktrace.h
+++ b/include/linux/stacktrace.h
@@ -15,9 +15,17 @@ extern void save_stack_trace_tsk(struct task_struct *tsk,
15 struct stack_trace *trace); 15 struct stack_trace *trace);
16 16
17extern void print_stack_trace(struct stack_trace *trace, int spaces); 17extern void print_stack_trace(struct stack_trace *trace, int spaces);
18
19#ifdef CONFIG_USER_STACKTRACE_SUPPORT
20extern void save_stack_trace_user(struct stack_trace *trace);
21#else
22# define save_stack_trace_user(trace) do { } while (0)
23#endif
24
18#else 25#else
19# define save_stack_trace(trace) do { } while (0) 26# define save_stack_trace(trace) do { } while (0)
20# define save_stack_trace_tsk(tsk, trace) do { } while (0) 27# define save_stack_trace_tsk(tsk, trace) do { } while (0)
28# define save_stack_trace_user(trace) do { } while (0)
21# define print_stack_trace(trace, spaces) do { } while (0) 29# define print_stack_trace(trace, spaces) do { } while (0)
22#endif 30#endif
23 31
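save_stack_trace_user() fills a struct stack_trace just like the kernel-side variants, but only does real work when the architecture selects CONFIG_USER_STACKTRACE_SUPPORT; otherwise it compiles to a no-op, as the stubs above show. A sketch of a caller, assuming CONFIG_STACKTRACE is enabled:

#include <linux/kernel.h>
#include <linux/stacktrace.h>

static void my_dump_user_stack(void)
{
        unsigned long entries[16];
        struct stack_trace trace = {
                .entries        = entries,
                .max_entries    = ARRAY_SIZE(entries),
        };

        save_stack_trace_user(&trace);  /* no-op without user stacktrace support */
        print_stack_trace(&trace, 0);
}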
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 117f1b7405cf..0c5b5ac36d8e 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -49,7 +49,7 @@
49 for_each_online_node(node) \ 49 for_each_online_node(node) \
50 if (nr_cpus_node(node)) 50 if (nr_cpus_node(node))
51 51
52void arch_update_cpu_topology(void); 52int arch_update_cpu_topology(void);
53 53
54/* Conform to ACPI 2.0 SLIT distance definitions */ 54/* Conform to ACPI 2.0 SLIT distance definitions */
55#define LOCAL_DISTANCE 10 55#define LOCAL_DISTANCE 10
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index c5bb39c7a770..757005458366 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -24,8 +24,12 @@ struct tracepoint {
24 const char *name; /* Tracepoint name */ 24 const char *name; /* Tracepoint name */
25 int state; /* State. */ 25 int state; /* State. */
26 void **funcs; 26 void **funcs;
27} __attribute__((aligned(8))); 27} __attribute__((aligned(32))); /*
28 28 * Aligned on 32 bytes because it is
29 * globally visible and gcc happily
30 * align these on the structure size.
31 * Keep in sync with vmlinux.lds.h.
32 */
29 33
30#define TPPROTO(args...) args 34#define TPPROTO(args...) args
31#define TPARGS(args...) args 35#define TPARGS(args...) args
@@ -40,14 +44,14 @@ struct tracepoint {
40 do { \ 44 do { \
41 void **it_func; \ 45 void **it_func; \
42 \ 46 \
43 rcu_read_lock_sched(); \ 47 rcu_read_lock_sched_notrace(); \
44 it_func = rcu_dereference((tp)->funcs); \ 48 it_func = rcu_dereference((tp)->funcs); \
45 if (it_func) { \ 49 if (it_func) { \
46 do { \ 50 do { \
47 ((void(*)(proto))(*it_func))(args); \ 51 ((void(*)(proto))(*it_func))(args); \
48 } while (*(++it_func)); \ 52 } while (*(++it_func)); \
49 } \ 53 } \
50 rcu_read_unlock_sched(); \ 54 rcu_read_unlock_sched_notrace(); \
51 } while (0) 55 } while (0)
52 56
53/* 57/*
@@ -55,35 +59,40 @@ struct tracepoint {
55 * not add unwanted padding between the beginning of the section and the 59 * not add unwanted padding between the beginning of the section and the
56 * structure. Force alignment to the same alignment as the section start. 60 * structure. Force alignment to the same alignment as the section start.
57 */ 61 */
58#define DEFINE_TRACE(name, proto, args) \ 62#define DECLARE_TRACE(name, proto, args) \
63 extern struct tracepoint __tracepoint_##name; \
59 static inline void trace_##name(proto) \ 64 static inline void trace_##name(proto) \
60 { \ 65 { \
61 static const char __tpstrtab_##name[] \
62 __attribute__((section("__tracepoints_strings"))) \
63 = #name ":" #proto; \
64 static struct tracepoint __tracepoint_##name \
65 __attribute__((section("__tracepoints"), aligned(8))) = \
66 { __tpstrtab_##name, 0, NULL }; \
67 if (unlikely(__tracepoint_##name.state)) \ 66 if (unlikely(__tracepoint_##name.state)) \
68 __DO_TRACE(&__tracepoint_##name, \ 67 __DO_TRACE(&__tracepoint_##name, \
69 TPPROTO(proto), TPARGS(args)); \ 68 TPPROTO(proto), TPARGS(args)); \
70 } \ 69 } \
71 static inline int register_trace_##name(void (*probe)(proto)) \ 70 static inline int register_trace_##name(void (*probe)(proto)) \
72 { \ 71 { \
73 return tracepoint_probe_register(#name ":" #proto, \ 72 return tracepoint_probe_register(#name, (void *)probe); \
74 (void *)probe); \
75 } \ 73 } \
76 static inline void unregister_trace_##name(void (*probe)(proto))\ 74 static inline int unregister_trace_##name(void (*probe)(proto)) \
77 { \ 75 { \
78 tracepoint_probe_unregister(#name ":" #proto, \ 76 return tracepoint_probe_unregister(#name, (void *)probe);\
79 (void *)probe); \
80 } 77 }
81 78
79#define DEFINE_TRACE(name) \
80 static const char __tpstrtab_##name[] \
81 __attribute__((section("__tracepoints_strings"))) = #name; \
82 struct tracepoint __tracepoint_##name \
83 __attribute__((section("__tracepoints"), aligned(32))) = \
84 { __tpstrtab_##name, 0, NULL }
85
86#define EXPORT_TRACEPOINT_SYMBOL_GPL(name) \
87 EXPORT_SYMBOL_GPL(__tracepoint_##name)
88#define EXPORT_TRACEPOINT_SYMBOL(name) \
89 EXPORT_SYMBOL(__tracepoint_##name)
90
82extern void tracepoint_update_probe_range(struct tracepoint *begin, 91extern void tracepoint_update_probe_range(struct tracepoint *begin,
83 struct tracepoint *end); 92 struct tracepoint *end);
84 93
85#else /* !CONFIG_TRACEPOINTS */ 94#else /* !CONFIG_TRACEPOINTS */
86#define DEFINE_TRACE(name, proto, args) \ 95#define DECLARE_TRACE(name, proto, args) \
87 static inline void _do_trace_##name(struct tracepoint *tp, proto) \ 96 static inline void _do_trace_##name(struct tracepoint *tp, proto) \
88 { } \ 97 { } \
89 static inline void trace_##name(proto) \ 98 static inline void trace_##name(proto) \
@@ -92,8 +101,14 @@ extern void tracepoint_update_probe_range(struct tracepoint *begin,
92 { \ 101 { \
93 return -ENOSYS; \ 102 return -ENOSYS; \
94 } \ 103 } \
95 static inline void unregister_trace_##name(void (*probe)(proto))\ 104 static inline int unregister_trace_##name(void (*probe)(proto)) \
96 { } 105 { \
106 return -ENOSYS; \
107 }
108
109#define DEFINE_TRACE(name)
110#define EXPORT_TRACEPOINT_SYMBOL_GPL(name)
111#define EXPORT_TRACEPOINT_SYMBOL(name)
97 112
98static inline void tracepoint_update_probe_range(struct tracepoint *begin, 113static inline void tracepoint_update_probe_range(struct tracepoint *begin,
99 struct tracepoint *end) 114 struct tracepoint *end)
@@ -112,6 +127,10 @@ extern int tracepoint_probe_register(const char *name, void *probe);
112 */ 127 */
113extern int tracepoint_probe_unregister(const char *name, void *probe); 128extern int tracepoint_probe_unregister(const char *name, void *probe);
114 129
130extern int tracepoint_probe_register_noupdate(const char *name, void *probe);
131extern int tracepoint_probe_unregister_noupdate(const char *name, void *probe);
132extern void tracepoint_probe_update_all(void);
133
115struct tracepoint_iter { 134struct tracepoint_iter {
116 struct module *module; 135 struct module *module;
117 struct tracepoint *tracepoint; 136 struct tracepoint *tracepoint;
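The key change above is splitting the old DEFINE_TRACE() into DECLARE_TRACE() for headers and a separate, instantiating DEFINE_TRACE() plus optional EXPORT_TRACEPOINT_SYMBOL*() in exactly one .c file, with probes now registered by plain name. A condensed sketch using a hypothetical tracepoint; in real code the three parts live in a header, the defining .c file, and a probe module:

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/tracepoint.h>

/* header part: declaration visible to call sites and probes */
DECLARE_TRACE(my_subsys_event,
        TPPROTO(int value),
        TPARGS(value));

/* one .c file: the single instantiation (call sites simply do trace_my_subsys_event(v)) */
DEFINE_TRACE(my_subsys_event);
EXPORT_TRACEPOINT_SYMBOL_GPL(my_subsys_event);

/* probe side: attach/detach via the generated helpers */
static void probe_my_subsys_event(int value)
{
        pr_info("my_subsys_event: %d\n", value);
}

static int __init my_probe_init(void)
{
        return register_trace_my_subsys_event(probe_my_subsys_event);
}

static void __exit my_probe_exit(void)
{
        unregister_trace_my_subsys_event(probe_my_subsys_event);
}
module_init(my_probe_init);
module_exit(my_probe_exit);
MODULE_LICENSE("GPL");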
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 3b8121d4e36f..eaec37c9d83d 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -325,7 +325,7 @@ extern struct class *tty_class;
325 * go away 325 * go away
326 */ 326 */
327 327
328extern inline struct tty_struct *tty_kref_get(struct tty_struct *tty) 328static inline struct tty_struct *tty_kref_get(struct tty_struct *tty)
329{ 329{
330 if (tty) 330 if (tty)
331 kref_get(&tty->kref); 331 kref_get(&tty->kref);
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index fec6decfb983..6b58367d145e 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -78,7 +78,7 @@ static inline unsigned long __copy_from_user_nocache(void *to,
78 \ 78 \
79 set_fs(KERNEL_DS); \ 79 set_fs(KERNEL_DS); \
80 pagefault_disable(); \ 80 pagefault_disable(); \
81 ret = __get_user(retval, (__force typeof(retval) __user *)(addr)); \ 81 ret = __copy_from_user_inatomic(&(retval), (__force typeof(retval) __user *)(addr), sizeof(retval)); \
82 pagefault_enable(); \ 82 pagefault_enable(); \
83 set_fs(old_fs); \ 83 set_fs(old_fs); \
84 ret; \ 84 ret; \
diff --git a/include/trace/block.h b/include/trace/block.h
new file mode 100644
index 000000000000..25c6a1fd5b77
--- /dev/null
+++ b/include/trace/block.h
@@ -0,0 +1,76 @@
1#ifndef _TRACE_BLOCK_H
2#define _TRACE_BLOCK_H
3
4#include <linux/blkdev.h>
5#include <linux/tracepoint.h>
6
7DECLARE_TRACE(block_rq_abort,
8 TPPROTO(struct request_queue *q, struct request *rq),
9 TPARGS(q, rq));
10
11DECLARE_TRACE(block_rq_insert,
12 TPPROTO(struct request_queue *q, struct request *rq),
13 TPARGS(q, rq));
14
15DECLARE_TRACE(block_rq_issue,
16 TPPROTO(struct request_queue *q, struct request *rq),
17 TPARGS(q, rq));
18
19DECLARE_TRACE(block_rq_requeue,
20 TPPROTO(struct request_queue *q, struct request *rq),
21 TPARGS(q, rq));
22
23DECLARE_TRACE(block_rq_complete,
24 TPPROTO(struct request_queue *q, struct request *rq),
25 TPARGS(q, rq));
26
27DECLARE_TRACE(block_bio_bounce,
28 TPPROTO(struct request_queue *q, struct bio *bio),
29 TPARGS(q, bio));
30
31DECLARE_TRACE(block_bio_complete,
32 TPPROTO(struct request_queue *q, struct bio *bio),
33 TPARGS(q, bio));
34
35DECLARE_TRACE(block_bio_backmerge,
36 TPPROTO(struct request_queue *q, struct bio *bio),
37 TPARGS(q, bio));
38
39DECLARE_TRACE(block_bio_frontmerge,
40 TPPROTO(struct request_queue *q, struct bio *bio),
41 TPARGS(q, bio));
42
43DECLARE_TRACE(block_bio_queue,
44 TPPROTO(struct request_queue *q, struct bio *bio),
45 TPARGS(q, bio));
46
47DECLARE_TRACE(block_getrq,
48 TPPROTO(struct request_queue *q, struct bio *bio, int rw),
49 TPARGS(q, bio, rw));
50
51DECLARE_TRACE(block_sleeprq,
52 TPPROTO(struct request_queue *q, struct bio *bio, int rw),
53 TPARGS(q, bio, rw));
54
55DECLARE_TRACE(block_plug,
56 TPPROTO(struct request_queue *q),
57 TPARGS(q));
58
59DECLARE_TRACE(block_unplug_timer,
60 TPPROTO(struct request_queue *q),
61 TPARGS(q));
62
63DECLARE_TRACE(block_unplug_io,
64 TPPROTO(struct request_queue *q),
65 TPARGS(q));
66
67DECLARE_TRACE(block_split,
68 TPPROTO(struct request_queue *q, struct bio *bio, unsigned int pdu),
69 TPARGS(q, bio, pdu));
70
71DECLARE_TRACE(block_remap,
72 TPPROTO(struct request_queue *q, struct bio *bio, dev_t dev,
73 sector_t from, sector_t to),
74 TPARGS(q, bio, dev, from, to));
75
76#endif
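Each DECLARE_TRACE() above generates register_/unregister_trace_block_*() helpers that attach a probe by name (the tracepoints themselves are defined elsewhere in the block layer). A hedged sketch of a module hooking request completions; names other than the tracepoint API are made up:

#include <linux/kernel.h>
#include <linux/module.h>
#include <trace/block.h>

static void probe_block_rq_complete(struct request_queue *q, struct request *rq)
{
        /* inspect q and rq here; runs for every completed request */
}

static int __init my_blkprobe_init(void)
{
        return register_trace_block_rq_complete(probe_block_rq_complete);
}

static void __exit my_blkprobe_exit(void)
{
        unregister_trace_block_rq_complete(probe_block_rq_complete);
        /* some synchronization (e.g. synchronize_sched()) is assumed to be
         * needed before the probe code and data may be freed */
}
module_init(my_blkprobe_init);
module_exit(my_blkprobe_exit);
MODULE_LICENSE("GPL");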
diff --git a/include/trace/boot.h b/include/trace/boot.h
new file mode 100644
index 000000000000..088ea089e31d
--- /dev/null
+++ b/include/trace/boot.h
@@ -0,0 +1,60 @@
1#ifndef _LINUX_TRACE_BOOT_H
2#define _LINUX_TRACE_BOOT_H
3
4#include <linux/module.h>
5#include <linux/kallsyms.h>
6#include <linux/init.h>
7
8/*
9 * Structure which defines the trace of an initcall
10 * while it is called.
11 * You don't have to fill the func field since it is
12 * only used internally by the tracer.
13 */
14struct boot_trace_call {
15 pid_t caller;
16 char func[KSYM_SYMBOL_LEN];
17};
18
19/*
20 * Structure which defines the trace of an initcall
21 * while it returns.
22 */
23struct boot_trace_ret {
24 char func[KSYM_SYMBOL_LEN];
25 int result;
26 unsigned long long duration; /* nsecs */
27};
28
29#ifdef CONFIG_BOOT_TRACER
30/* Append the traces on the ring-buffer */
31extern void trace_boot_call(struct boot_trace_call *bt, initcall_t fn);
32extern void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn);
33
34/* Tells the tracer that smp_pre_initcall is finished.
35 * So we can start the tracing
36 */
37extern void start_boot_trace(void);
38
39/* Resume the tracing of other necessary events
40 * such as sched switches
41 */
42extern void enable_boot_trace(void);
43
44/* Suspend this tracing. Actually, only sched_switches tracing has
45 * to be suspended. Initcalls don't need it.
46 */
47extern void disable_boot_trace(void);
48#else
49static inline
50void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) { }
51
52static inline
53void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) { }
54
55static inline void start_boot_trace(void) { }
56static inline void enable_boot_trace(void) { }
57static inline void disable_boot_trace(void) { }
58#endif /* CONFIG_BOOT_TRACER */
59
60#endif /* __LINUX_TRACE_BOOT_H */
diff --git a/include/trace/sched.h b/include/trace/sched.h
index ad47369d01b5..9b2854abf7e2 100644
--- a/include/trace/sched.h
+++ b/include/trace/sched.h
@@ -4,52 +4,52 @@
4#include <linux/sched.h> 4#include <linux/sched.h>
5#include <linux/tracepoint.h> 5#include <linux/tracepoint.h>
6 6
7DEFINE_TRACE(sched_kthread_stop, 7DECLARE_TRACE(sched_kthread_stop,
8 TPPROTO(struct task_struct *t), 8 TPPROTO(struct task_struct *t),
9 TPARGS(t)); 9 TPARGS(t));
10 10
11DEFINE_TRACE(sched_kthread_stop_ret, 11DECLARE_TRACE(sched_kthread_stop_ret,
12 TPPROTO(int ret), 12 TPPROTO(int ret),
13 TPARGS(ret)); 13 TPARGS(ret));
14 14
15DEFINE_TRACE(sched_wait_task, 15DECLARE_TRACE(sched_wait_task,
16 TPPROTO(struct rq *rq, struct task_struct *p), 16 TPPROTO(struct rq *rq, struct task_struct *p),
17 TPARGS(rq, p)); 17 TPARGS(rq, p));
18 18
19DEFINE_TRACE(sched_wakeup, 19DECLARE_TRACE(sched_wakeup,
20 TPPROTO(struct rq *rq, struct task_struct *p), 20 TPPROTO(struct rq *rq, struct task_struct *p),
21 TPARGS(rq, p)); 21 TPARGS(rq, p));
22 22
23DEFINE_TRACE(sched_wakeup_new, 23DECLARE_TRACE(sched_wakeup_new,
24 TPPROTO(struct rq *rq, struct task_struct *p), 24 TPPROTO(struct rq *rq, struct task_struct *p),
25 TPARGS(rq, p)); 25 TPARGS(rq, p));
26 26
27DEFINE_TRACE(sched_switch, 27DECLARE_TRACE(sched_switch,
28 TPPROTO(struct rq *rq, struct task_struct *prev, 28 TPPROTO(struct rq *rq, struct task_struct *prev,
29 struct task_struct *next), 29 struct task_struct *next),
30 TPARGS(rq, prev, next)); 30 TPARGS(rq, prev, next));
31 31
32DEFINE_TRACE(sched_migrate_task, 32DECLARE_TRACE(sched_migrate_task,
33 TPPROTO(struct rq *rq, struct task_struct *p, int dest_cpu), 33 TPPROTO(struct rq *rq, struct task_struct *p, int dest_cpu),
34 TPARGS(rq, p, dest_cpu)); 34 TPARGS(rq, p, dest_cpu));
35 35
36DEFINE_TRACE(sched_process_free, 36DECLARE_TRACE(sched_process_free,
37 TPPROTO(struct task_struct *p), 37 TPPROTO(struct task_struct *p),
38 TPARGS(p)); 38 TPARGS(p));
39 39
40DEFINE_TRACE(sched_process_exit, 40DECLARE_TRACE(sched_process_exit,
41 TPPROTO(struct task_struct *p), 41 TPPROTO(struct task_struct *p),
42 TPARGS(p)); 42 TPARGS(p));
43 43
44DEFINE_TRACE(sched_process_wait, 44DECLARE_TRACE(sched_process_wait,
45 TPPROTO(struct pid *pid), 45 TPPROTO(struct pid *pid),
46 TPARGS(pid)); 46 TPARGS(pid));
47 47
48DEFINE_TRACE(sched_process_fork, 48DECLARE_TRACE(sched_process_fork,
49 TPPROTO(struct task_struct *parent, struct task_struct *child), 49 TPPROTO(struct task_struct *parent, struct task_struct *child),
50 TPARGS(parent, child)); 50 TPARGS(parent, child));
51 51
52DEFINE_TRACE(sched_signal_send, 52DECLARE_TRACE(sched_signal_send,
53 TPPROTO(int sig, struct task_struct *p), 53 TPPROTO(int sig, struct task_struct *p),
54 TPARGS(sig, p)); 54 TPARGS(sig, p));
55 55
diff --git a/init/Kconfig b/init/Kconfig
index 7656623f5006..b3782c6d5ede 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -808,6 +808,7 @@ config TRACEPOINTS
808 808
809config MARKERS 809config MARKERS
810 bool "Activate markers" 810 bool "Activate markers"
811 depends on TRACEPOINTS
811 help 812 help
812 Place an empty function call at each marker site. Can be 813 Place an empty function call at each marker site. Can be
813 dynamically changed for a probe function. 814 dynamically changed for a probe function.
diff --git a/init/main.c b/init/main.c
index 7e117a231af1..9d761aa53296 100644
--- a/init/main.c
+++ b/init/main.c
@@ -63,6 +63,7 @@
63#include <linux/signal.h> 63#include <linux/signal.h>
64#include <linux/idr.h> 64#include <linux/idr.h>
65#include <linux/ftrace.h> 65#include <linux/ftrace.h>
66#include <trace/boot.h>
66 67
67#include <asm/io.h> 68#include <asm/io.h>
68#include <asm/bugs.h> 69#include <asm/bugs.h>
@@ -539,6 +540,15 @@ void __init __weak thread_info_cache_init(void)
539{ 540{
540} 541}
541 542
543void __init __weak arch_early_irq_init(void)
544{
545}
546
547void __init __weak early_irq_init(void)
548{
549 arch_early_irq_init();
550}
551
542asmlinkage void __init start_kernel(void) 552asmlinkage void __init start_kernel(void)
543{ 553{
544 char * command_line; 554 char * command_line;
@@ -603,6 +613,8 @@ asmlinkage void __init start_kernel(void)
603 sort_main_extable(); 613 sort_main_extable();
604 trap_init(); 614 trap_init();
605 rcu_init(); 615 rcu_init();
616 /* init some links before init_ISA_irqs() */
617 early_irq_init();
606 init_IRQ(); 618 init_IRQ();
607 pidhash_init(); 619 pidhash_init();
608 init_timers(); 620 init_timers();
@@ -703,31 +715,35 @@ core_param(initcall_debug, initcall_debug, bool, 0644);
703int do_one_initcall(initcall_t fn) 715int do_one_initcall(initcall_t fn)
704{ 716{
705 int count = preempt_count(); 717 int count = preempt_count();
706 ktime_t delta; 718 ktime_t calltime, delta, rettime;
707 char msgbuf[64]; 719 char msgbuf[64];
708 struct boot_trace it; 720 struct boot_trace_call call;
721 struct boot_trace_ret ret;
709 722
710 if (initcall_debug) { 723 if (initcall_debug) {
711 it.caller = task_pid_nr(current); 724 call.caller = task_pid_nr(current);
712 printk("calling %pF @ %i\n", fn, it.caller); 725 printk("calling %pF @ %i\n", fn, call.caller);
713 it.calltime = ktime_get(); 726 calltime = ktime_get();
727 trace_boot_call(&call, fn);
728 enable_boot_trace();
714 } 729 }
715 730
716 it.result = fn(); 731 ret.result = fn();
717 732
718 if (initcall_debug) { 733 if (initcall_debug) {
719 it.rettime = ktime_get(); 734 disable_boot_trace();
720 delta = ktime_sub(it.rettime, it.calltime); 735 rettime = ktime_get();
721 it.duration = (unsigned long long) delta.tv64 >> 10; 736 delta = ktime_sub(rettime, calltime);
737 ret.duration = (unsigned long long) ktime_to_ns(delta) >> 10;
738 trace_boot_ret(&ret, fn);
722 printk("initcall %pF returned %d after %Ld usecs\n", fn, 739 printk("initcall %pF returned %d after %Ld usecs\n", fn,
723 it.result, it.duration); 740 ret.result, ret.duration);
724 trace_boot(&it, fn);
725 } 741 }
726 742
727 msgbuf[0] = 0; 743 msgbuf[0] = 0;
728 744
729 if (it.result && it.result != -ENODEV && initcall_debug) 745 if (ret.result && ret.result != -ENODEV && initcall_debug)
730 sprintf(msgbuf, "error code %d ", it.result); 746 sprintf(msgbuf, "error code %d ", ret.result);
731 747
732 if (preempt_count() != count) { 748 if (preempt_count() != count) {
733 strlcat(msgbuf, "preemption imbalance ", sizeof(msgbuf)); 749 strlcat(msgbuf, "preemption imbalance ", sizeof(msgbuf));
@@ -741,7 +757,7 @@ int do_one_initcall(initcall_t fn)
741 printk("initcall %pF returned with %s\n", fn, msgbuf); 757 printk("initcall %pF returned with %s\n", fn, msgbuf);
742 } 758 }
743 759
744 return it.result; 760 return ret.result;
745} 761}
746 762
747 763
@@ -882,7 +898,7 @@ static int __init kernel_init(void * unused)
882 * we're essentially up and running. Get rid of the 898 * we're essentially up and running. Get rid of the
883 * initmem segments and start the user-mode stuff.. 899 * initmem segments and start the user-mode stuff..
884 */ 900 */
885 stop_boot_trace(); 901
886 init_post(); 902 init_post();
887 return 0; 903 return 0;
888} 904}
diff --git a/kernel/Makefile b/kernel/Makefile
index 19fad003b19d..6a212b842d86 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -19,7 +19,6 @@ CFLAGS_REMOVE_mutex-debug.o = -pg
19CFLAGS_REMOVE_rtmutex-debug.o = -pg 19CFLAGS_REMOVE_rtmutex-debug.o = -pg
20CFLAGS_REMOVE_cgroup-debug.o = -pg 20CFLAGS_REMOVE_cgroup-debug.o = -pg
21CFLAGS_REMOVE_sched_clock.o = -pg 21CFLAGS_REMOVE_sched_clock.o = -pg
22CFLAGS_REMOVE_sched.o = -pg
23endif 22endif
24 23
25obj-$(CONFIG_FREEZER) += freezer.o 24obj-$(CONFIG_FREEZER) += freezer.o
@@ -90,7 +89,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/
90obj-$(CONFIG_TRACING) += trace/ 89obj-$(CONFIG_TRACING) += trace/
91obj-$(CONFIG_SMP) += sched_cpupri.o 90obj-$(CONFIG_SMP) += sched_cpupri.o
92 91
93ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) 92ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
94# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 93# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
95# needed for x86 only. Why this used to be enabled for all architectures is beyond 94# needed for x86 only. Why this used to be enabled for all architectures is beyond
96# me. I suspect most platforms don't need this, but until we know that for sure 95# me. I suspect most platforms don't need this, but until we know that for sure
diff --git a/kernel/exit.c b/kernel/exit.c
index 2d8be7ebb0f7..61ba5b4b10cf 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -53,6 +53,10 @@
53#include <asm/pgtable.h> 53#include <asm/pgtable.h>
54#include <asm/mmu_context.h> 54#include <asm/mmu_context.h>
55 55
56DEFINE_TRACE(sched_process_free);
57DEFINE_TRACE(sched_process_exit);
58DEFINE_TRACE(sched_process_wait);
59
56static void exit_mm(struct task_struct * tsk); 60static void exit_mm(struct task_struct * tsk);
57 61
58static inline int task_detached(struct task_struct *p) 62static inline int task_detached(struct task_struct *p)
@@ -1123,7 +1127,6 @@ NORET_TYPE void do_exit(long code)
1123 preempt_disable(); 1127 preempt_disable();
1124 /* causes final put_task_struct in finish_task_switch(). */ 1128 /* causes final put_task_struct in finish_task_switch(). */
1125 tsk->state = TASK_DEAD; 1129 tsk->state = TASK_DEAD;
1126
1127 schedule(); 1130 schedule();
1128 BUG(); 1131 BUG();
1129 /* Avoid "noreturn function does return". */ 1132 /* Avoid "noreturn function does return". */
@@ -1321,10 +1324,10 @@ static int wait_task_zombie(struct task_struct *p, int options,
1321 * group, which consolidates times for all threads in the 1324 * group, which consolidates times for all threads in the
1322 * group including the group leader. 1325 * group including the group leader.
1323 */ 1326 */
1327 thread_group_cputime(p, &cputime);
1324 spin_lock_irq(&p->parent->sighand->siglock); 1328 spin_lock_irq(&p->parent->sighand->siglock);
1325 psig = p->parent->signal; 1329 psig = p->parent->signal;
1326 sig = p->signal; 1330 sig = p->signal;
1327 thread_group_cputime(p, &cputime);
1328 psig->cutime = 1331 psig->cutime =
1329 cputime_add(psig->cutime, 1332 cputime_add(psig->cutime,
1330 cputime_add(cputime.utime, 1333 cputime_add(cputime.utime,
diff --git a/kernel/extable.c b/kernel/extable.c
index a26cb2e17023..e136ed8d82ba 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -17,6 +17,7 @@
17*/ 17*/
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/init.h> 19#include <linux/init.h>
20#include <linux/ftrace.h>
20#include <asm/uaccess.h> 21#include <asm/uaccess.h>
21#include <asm/sections.h> 22#include <asm/sections.h>
22 23
@@ -40,7 +41,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
40 return e; 41 return e;
41} 42}
42 43
43int core_kernel_text(unsigned long addr) 44__notrace_funcgraph int core_kernel_text(unsigned long addr)
44{ 45{
45 if (addr >= (unsigned long)_stext && 46 if (addr >= (unsigned long)_stext &&
46 addr <= (unsigned long)_etext) 47 addr <= (unsigned long)_etext)
@@ -53,7 +54,7 @@ int core_kernel_text(unsigned long addr)
53 return 0; 54 return 0;
54} 55}
55 56
56int __kernel_text_address(unsigned long addr) 57__notrace_funcgraph int __kernel_text_address(unsigned long addr)
57{ 58{
58 if (core_kernel_text(addr)) 59 if (core_kernel_text(addr))
59 return 1; 60 return 1;
@@ -66,3 +67,19 @@ int kernel_text_address(unsigned long addr)
66 return 1; 67 return 1;
67 return module_text_address(addr) != NULL; 68 return module_text_address(addr) != NULL;
68} 69}
70
71/*
72 * On some architectures (PPC64, IA64) function pointers
73 * are actually only tokens to some data that then holds the
74 * real function address. As a result, to find if a function
75 * pointer is part of the kernel text, we need to do some
76 * special dereferencing first.
77 */
78int func_ptr_is_kernel_text(void *ptr)
79{
80 unsigned long addr;
81 addr = (unsigned long) dereference_function_descriptor(ptr);
82 if (core_kernel_text(addr))
83 return 1;
84 return module_text_address(addr) != NULL;
85}
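func_ptr_is_kernel_text() exists because a C function pointer on PPC64/IA64 points at a descriptor rather than at code, so the descriptor must be dereferenced before the text-range check. A hypothetical caller validating a callback before invoking it (the prototype is assumed to be made available through a core header in this series):

#include <linux/kernel.h>
#include <linux/errno.h>

static int my_call_checked(void (*cb)(void))
{
        if (!func_ptr_is_kernel_text((void *)cb))
                return -EINVAL;         /* not kernel or module text */
        cb();
        return 0;
}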
diff --git a/kernel/fork.c b/kernel/fork.c
index 495da2e9a8b4..7b93da72d4a2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -47,6 +47,7 @@
47#include <linux/mount.h> 47#include <linux/mount.h>
48#include <linux/audit.h> 48#include <linux/audit.h>
49#include <linux/memcontrol.h> 49#include <linux/memcontrol.h>
50#include <linux/ftrace.h>
50#include <linux/profile.h> 51#include <linux/profile.h>
51#include <linux/rmap.h> 52#include <linux/rmap.h>
52#include <linux/acct.h> 53#include <linux/acct.h>
@@ -80,6 +81,8 @@ DEFINE_PER_CPU(unsigned long, process_counts) = 0;
80 81
81__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ 82__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
82 83
84DEFINE_TRACE(sched_process_fork);
85
83int nr_processes(void) 86int nr_processes(void)
84{ 87{
85 int cpu; 88 int cpu;
@@ -137,6 +140,7 @@ void free_task(struct task_struct *tsk)
137 prop_local_destroy_single(&tsk->dirties); 140 prop_local_destroy_single(&tsk->dirties);
138 free_thread_info(tsk->stack); 141 free_thread_info(tsk->stack);
139 rt_mutex_debug_task_free(tsk); 142 rt_mutex_debug_task_free(tsk);
143 ftrace_graph_exit_task(tsk);
140 free_task_struct(tsk); 144 free_task_struct(tsk);
141} 145}
142EXPORT_SYMBOL(free_task); 146EXPORT_SYMBOL(free_task);
@@ -1136,6 +1140,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1136 } 1140 }
1137 } 1141 }
1138 1142
1143 ftrace_graph_init_task(p);
1144
1139 p->pid = pid_nr(pid); 1145 p->pid = pid_nr(pid);
1140 p->tgid = p->pid; 1146 p->tgid = p->pid;
1141 if (clone_flags & CLONE_THREAD) 1147 if (clone_flags & CLONE_THREAD)
@@ -1144,7 +1150,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1144 if (current->nsproxy != p->nsproxy) { 1150 if (current->nsproxy != p->nsproxy) {
1145 retval = ns_cgroup_clone(p, pid); 1151 retval = ns_cgroup_clone(p, pid);
1146 if (retval) 1152 if (retval)
1147 goto bad_fork_free_pid; 1153 goto bad_fork_free_graph;
1148 } 1154 }
1149 1155
1150 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; 1156 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
@@ -1237,7 +1243,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1237 spin_unlock(&current->sighand->siglock); 1243 spin_unlock(&current->sighand->siglock);
1238 write_unlock_irq(&tasklist_lock); 1244 write_unlock_irq(&tasklist_lock);
1239 retval = -ERESTARTNOINTR; 1245 retval = -ERESTARTNOINTR;
1240 goto bad_fork_free_pid; 1246 goto bad_fork_free_graph;
1241 } 1247 }
1242 1248
1243 if (clone_flags & CLONE_THREAD) { 1249 if (clone_flags & CLONE_THREAD) {
@@ -1274,6 +1280,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1274 cgroup_post_fork(p); 1280 cgroup_post_fork(p);
1275 return p; 1281 return p;
1276 1282
1283bad_fork_free_graph:
1284 ftrace_graph_exit_task(p);
1277bad_fork_free_pid: 1285bad_fork_free_pid:
1278 if (pid != &init_struct_pid) 1286 if (pid != &init_struct_pid)
1279 free_pid(pid); 1287 free_pid(pid);
diff --git a/kernel/futex.c b/kernel/futex.c
index 8af10027514b..e10c5c8786a6 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -123,24 +123,6 @@ struct futex_hash_bucket {
123static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS]; 123static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
124 124
125/* 125/*
126 * Take mm->mmap_sem, when futex is shared
127 */
128static inline void futex_lock_mm(struct rw_semaphore *fshared)
129{
130 if (fshared)
131 down_read(fshared);
132}
133
134/*
135 * Release mm->mmap_sem, when the futex is shared
136 */
137static inline void futex_unlock_mm(struct rw_semaphore *fshared)
138{
139 if (fshared)
140 up_read(fshared);
141}
142
143/*
144 * We hash on the keys returned from get_futex_key (see below). 126 * We hash on the keys returned from get_futex_key (see below).
145 */ 127 */
146static struct futex_hash_bucket *hash_futex(union futex_key *key) 128static struct futex_hash_bucket *hash_futex(union futex_key *key)
@@ -161,6 +143,45 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
161 && key1->both.offset == key2->both.offset); 143 && key1->both.offset == key2->both.offset);
162} 144}
163 145
146/*
147 * Take a reference to the resource addressed by a key.
148 * Can be called while holding spinlocks.
149 *
150 */
151static void get_futex_key_refs(union futex_key *key)
152{
153 if (!key->both.ptr)
154 return;
155
156 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
157 case FUT_OFF_INODE:
158 atomic_inc(&key->shared.inode->i_count);
159 break;
160 case FUT_OFF_MMSHARED:
161 atomic_inc(&key->private.mm->mm_count);
162 break;
163 }
164}
165
166/*
167 * Drop a reference to the resource addressed by a key.
168 * The hash bucket spinlock must not be held.
169 */
170static void drop_futex_key_refs(union futex_key *key)
171{
172 if (!key->both.ptr)
173 return;
174
175 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
176 case FUT_OFF_INODE:
177 iput(key->shared.inode);
178 break;
179 case FUT_OFF_MMSHARED:
180 mmdrop(key->private.mm);
181 break;
182 }
183}
184
164/** 185/**
165 * get_futex_key - Get parameters which are the keys for a futex. 186 * get_futex_key - Get parameters which are the keys for a futex.
166 * @uaddr: virtual address of the futex 187 * @uaddr: virtual address of the futex
@@ -179,12 +200,10 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
179 * For other futexes, it points to &current->mm->mmap_sem and 200 * For other futexes, it points to &current->mm->mmap_sem and
180 * caller must have taken the reader lock. but NOT any spinlocks. 201 * caller must have taken the reader lock. but NOT any spinlocks.
181 */ 202 */
182static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared, 203static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
183 union futex_key *key)
184{ 204{
185 unsigned long address = (unsigned long)uaddr; 205 unsigned long address = (unsigned long)uaddr;
186 struct mm_struct *mm = current->mm; 206 struct mm_struct *mm = current->mm;
187 struct vm_area_struct *vma;
188 struct page *page; 207 struct page *page;
189 int err; 208 int err;
190 209
@@ -208,100 +227,50 @@ static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared,
208 return -EFAULT; 227 return -EFAULT;
209 key->private.mm = mm; 228 key->private.mm = mm;
210 key->private.address = address; 229 key->private.address = address;
230 get_futex_key_refs(key);
211 return 0; 231 return 0;
212 } 232 }
213 /*
214 * The futex is hashed differently depending on whether
215 * it's in a shared or private mapping. So check vma first.
216 */
217 vma = find_extend_vma(mm, address);
218 if (unlikely(!vma))
219 return -EFAULT;
220 233
221 /* 234again:
222 * Permissions. 235 err = get_user_pages_fast(address, 1, 0, &page);
223 */ 236 if (err < 0)
224 if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ)) 237 return err;
225 return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES; 238
239 lock_page(page);
240 if (!page->mapping) {
241 unlock_page(page);
242 put_page(page);
243 goto again;
244 }
226 245
227 /* 246 /*
228 * Private mappings are handled in a simple way. 247 * Private mappings are handled in a simple way.
229 * 248 *
230 * NOTE: When userspace waits on a MAP_SHARED mapping, even if 249 * NOTE: When userspace waits on a MAP_SHARED mapping, even if
231 * it's a read-only handle, it's expected that futexes attach to 250 * it's a read-only handle, it's expected that futexes attach to
232 * the object not the particular process. Therefore we use 251 * the object not the particular process.
233 * VM_MAYSHARE here, not VM_SHARED which is restricted to shared
234 * mappings of _writable_ handles.
235 */ 252 */
236 if (likely(!(vma->vm_flags & VM_MAYSHARE))) { 253 if (PageAnon(page)) {
237 key->both.offset |= FUT_OFF_MMSHARED; /* reference taken on mm */ 254 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
238 key->private.mm = mm; 255 key->private.mm = mm;
239 key->private.address = address; 256 key->private.address = address;
240 return 0; 257 } else {
258 key->both.offset |= FUT_OFF_INODE; /* inode-based key */
259 key->shared.inode = page->mapping->host;
260 key->shared.pgoff = page->index;
241 } 261 }
242 262
243 /* 263 get_futex_key_refs(key);
244 * Linear file mappings are also simple.
245 */
246 key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
247 key->both.offset |= FUT_OFF_INODE; /* inode-based key. */
248 if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
249 key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
250 + vma->vm_pgoff);
251 return 0;
252 }
253 264
254 /* 265 unlock_page(page);
255 * We could walk the page table to read the non-linear 266 put_page(page);
256 * pte, and get the page index without fetching the page 267 return 0;
257 * from swap. But that's a lot of code to duplicate here
258 * for a rare case, so we simply fetch the page.
259 */
260 err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL);
261 if (err >= 0) {
262 key->shared.pgoff =
263 page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
264 put_page(page);
265 return 0;
266 }
267 return err;
268}
269
270/*
271 * Take a reference to the resource addressed by a key.
272 * Can be called while holding spinlocks.
273 *
274 */
275static void get_futex_key_refs(union futex_key *key)
276{
277 if (key->both.ptr == NULL)
278 return;
279 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
280 case FUT_OFF_INODE:
281 atomic_inc(&key->shared.inode->i_count);
282 break;
283 case FUT_OFF_MMSHARED:
284 atomic_inc(&key->private.mm->mm_count);
285 break;
286 }
287} 268}
288 269
289/* 270static inline
290 * Drop a reference to the resource addressed by a key. 271void put_futex_key(int fshared, union futex_key *key)
291 * The hash bucket spinlock must not be held.
292 */
293static void drop_futex_key_refs(union futex_key *key)
294{ 272{
295 if (!key->both.ptr) 273 drop_futex_key_refs(key);
296 return;
297 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
298 case FUT_OFF_INODE:
299 iput(key->shared.inode);
300 break;
301 case FUT_OFF_MMSHARED:
302 mmdrop(key->private.mm);
303 break;
304 }
305} 274}
306 275
307static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) 276static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
@@ -328,10 +297,8 @@ static int get_futex_value_locked(u32 *dest, u32 __user *from)
328 297
329/* 298/*
330 * Fault handling. 299 * Fault handling.
331 * if fshared is non NULL, current->mm->mmap_sem is already held
332 */ 300 */
333static int futex_handle_fault(unsigned long address, 301static int futex_handle_fault(unsigned long address, int attempt)
334 struct rw_semaphore *fshared, int attempt)
335{ 302{
336 struct vm_area_struct * vma; 303 struct vm_area_struct * vma;
337 struct mm_struct *mm = current->mm; 304 struct mm_struct *mm = current->mm;
@@ -340,8 +307,7 @@ static int futex_handle_fault(unsigned long address,
340 if (attempt > 2) 307 if (attempt > 2)
341 return ret; 308 return ret;
342 309
343 if (!fshared) 310 down_read(&mm->mmap_sem);
344 down_read(&mm->mmap_sem);
345 vma = find_vma(mm, address); 311 vma = find_vma(mm, address);
346 if (vma && address >= vma->vm_start && 312 if (vma && address >= vma->vm_start &&
347 (vma->vm_flags & VM_WRITE)) { 313 (vma->vm_flags & VM_WRITE)) {
@@ -361,8 +327,7 @@ static int futex_handle_fault(unsigned long address,
361 current->min_flt++; 327 current->min_flt++;
362 } 328 }
363 } 329 }
364 if (!fshared) 330 up_read(&mm->mmap_sem);
365 up_read(&mm->mmap_sem);
366 return ret; 331 return ret;
367} 332}
368 333
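
Note on the hunk above: futex_handle_fault() loses its `struct rw_semaphore *fshared` parameter, so the caller no longer pre-holds mmap_sem and the function now takes and releases it unconditionally. A minimal userspace sketch of that simplified locking shape, with a pthread rwlock standing in for mmap_sem; everything but the control flow is illustrative:

	#include <pthread.h>

	static pthread_rwlock_t mmap_sem = PTHREAD_RWLOCK_INITIALIZER;

	/* Sketch only: mirrors the new control flow, not the kernel's fault logic. */
	static int futex_handle_fault_sketch(unsigned long address, int attempt)
	{
		int ret = -14;				/* stands in for -EFAULT */

		(void)address;				/* vma lookup elided here */
		if (attempt > 2)
			return ret;

		pthread_rwlock_rdlock(&mmap_sem);	/* unconditional down_read() */
		/* find_vma() and the write-fault handling would run here */
		ret = 0;
		pthread_rwlock_unlock(&mmap_sem);	/* unconditional up_read()   */
		return ret;
	}

	int main(void)
	{
		return futex_handle_fault_sketch(0, 1);	/* 0 in this toy model */
	}
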
@@ -385,6 +350,7 @@ static int refill_pi_state_cache(void)
385 /* pi_mutex gets initialized later */ 350 /* pi_mutex gets initialized later */
386 pi_state->owner = NULL; 351 pi_state->owner = NULL;
387 atomic_set(&pi_state->refcount, 1); 352 atomic_set(&pi_state->refcount, 1);
353 pi_state->key = FUTEX_KEY_INIT;
388 354
389 current->pi_state_cache = pi_state; 355 current->pi_state_cache = pi_state;
390 356
@@ -462,7 +428,7 @@ void exit_pi_state_list(struct task_struct *curr)
462 struct list_head *next, *head = &curr->pi_state_list; 428 struct list_head *next, *head = &curr->pi_state_list;
463 struct futex_pi_state *pi_state; 429 struct futex_pi_state *pi_state;
464 struct futex_hash_bucket *hb; 430 struct futex_hash_bucket *hb;
465 union futex_key key; 431 union futex_key key = FUTEX_KEY_INIT;
466 432
467 if (!futex_cmpxchg_enabled) 433 if (!futex_cmpxchg_enabled)
468 return; 434 return;
@@ -719,20 +685,17 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
719 * Wake up all waiters hashed on the physical page that is mapped 685 * Wake up all waiters hashed on the physical page that is mapped
720 * to this virtual address: 686 * to this virtual address:
721 */ 687 */
722static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, 688static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
723 int nr_wake, u32 bitset)
724{ 689{
725 struct futex_hash_bucket *hb; 690 struct futex_hash_bucket *hb;
726 struct futex_q *this, *next; 691 struct futex_q *this, *next;
727 struct plist_head *head; 692 struct plist_head *head;
728 union futex_key key; 693 union futex_key key = FUTEX_KEY_INIT;
729 int ret; 694 int ret;
730 695
731 if (!bitset) 696 if (!bitset)
732 return -EINVAL; 697 return -EINVAL;
733 698
734 futex_lock_mm(fshared);
735
736 ret = get_futex_key(uaddr, fshared, &key); 699 ret = get_futex_key(uaddr, fshared, &key);
737 if (unlikely(ret != 0)) 700 if (unlikely(ret != 0))
738 goto out; 701 goto out;
@@ -760,7 +723,7 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
760 723
761 spin_unlock(&hb->lock); 724 spin_unlock(&hb->lock);
762out: 725out:
763 futex_unlock_mm(fshared); 726 put_futex_key(fshared, &key);
764 return ret; 727 return ret;
765} 728}
766 729
@@ -769,19 +732,16 @@ out:
769 * to this virtual address: 732 * to this virtual address:
770 */ 733 */
771static int 734static int
772futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared, 735futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
773 u32 __user *uaddr2,
774 int nr_wake, int nr_wake2, int op) 736 int nr_wake, int nr_wake2, int op)
775{ 737{
776 union futex_key key1, key2; 738 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
777 struct futex_hash_bucket *hb1, *hb2; 739 struct futex_hash_bucket *hb1, *hb2;
778 struct plist_head *head; 740 struct plist_head *head;
779 struct futex_q *this, *next; 741 struct futex_q *this, *next;
780 int ret, op_ret, attempt = 0; 742 int ret, op_ret, attempt = 0;
781 743
782retryfull: 744retryfull:
783 futex_lock_mm(fshared);
784
785 ret = get_futex_key(uaddr1, fshared, &key1); 745 ret = get_futex_key(uaddr1, fshared, &key1);
786 if (unlikely(ret != 0)) 746 if (unlikely(ret != 0))
787 goto out; 747 goto out;
@@ -826,18 +786,12 @@ retry:
826 */ 786 */
827 if (attempt++) { 787 if (attempt++) {
828 ret = futex_handle_fault((unsigned long)uaddr2, 788 ret = futex_handle_fault((unsigned long)uaddr2,
829 fshared, attempt); 789 attempt);
830 if (ret) 790 if (ret)
831 goto out; 791 goto out;
832 goto retry; 792 goto retry;
833 } 793 }
834 794
835 /*
836 * If we would have faulted, release mmap_sem,
837 * fault it in and start all over again.
838 */
839 futex_unlock_mm(fshared);
840
841 ret = get_user(dummy, uaddr2); 795 ret = get_user(dummy, uaddr2);
842 if (ret) 796 if (ret)
843 return ret; 797 return ret;
@@ -873,7 +827,8 @@ retry:
873 if (hb1 != hb2) 827 if (hb1 != hb2)
874 spin_unlock(&hb2->lock); 828 spin_unlock(&hb2->lock);
875out: 829out:
876 futex_unlock_mm(fshared); 830 put_futex_key(fshared, &key2);
831 put_futex_key(fshared, &key1);
877 832
878 return ret; 833 return ret;
879} 834}
@@ -882,19 +837,16 @@ out:
882 * Requeue all waiters hashed on one physical page to another 837 * Requeue all waiters hashed on one physical page to another
883 * physical page. 838 * physical page.
884 */ 839 */
885static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared, 840static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
886 u32 __user *uaddr2,
887 int nr_wake, int nr_requeue, u32 *cmpval) 841 int nr_wake, int nr_requeue, u32 *cmpval)
888{ 842{
889 union futex_key key1, key2; 843 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
890 struct futex_hash_bucket *hb1, *hb2; 844 struct futex_hash_bucket *hb1, *hb2;
891 struct plist_head *head1; 845 struct plist_head *head1;
892 struct futex_q *this, *next; 846 struct futex_q *this, *next;
893 int ret, drop_count = 0; 847 int ret, drop_count = 0;
894 848
895 retry: 849 retry:
896 futex_lock_mm(fshared);
897
898 ret = get_futex_key(uaddr1, fshared, &key1); 850 ret = get_futex_key(uaddr1, fshared, &key1);
899 if (unlikely(ret != 0)) 851 if (unlikely(ret != 0))
900 goto out; 852 goto out;
@@ -917,12 +869,6 @@ static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared,
917 if (hb1 != hb2) 869 if (hb1 != hb2)
918 spin_unlock(&hb2->lock); 870 spin_unlock(&hb2->lock);
919 871
920 /*
921 * If we would have faulted, release mmap_sem, fault
922 * it in and start all over again.
923 */
924 futex_unlock_mm(fshared);
925
926 ret = get_user(curval, uaddr1); 872 ret = get_user(curval, uaddr1);
927 873
928 if (!ret) 874 if (!ret)
@@ -974,7 +920,8 @@ out_unlock:
974 drop_futex_key_refs(&key1); 920 drop_futex_key_refs(&key1);
975 921
976out: 922out:
977 futex_unlock_mm(fshared); 923 put_futex_key(fshared, &key2);
924 put_futex_key(fshared, &key1);
978 return ret; 925 return ret;
979} 926}
980 927
@@ -1096,8 +1043,7 @@ static void unqueue_me_pi(struct futex_q *q)
1096 * private futexes. 1043 * private futexes.
1097 */ 1044 */
1098static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, 1045static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1099 struct task_struct *newowner, 1046 struct task_struct *newowner, int fshared)
1100 struct rw_semaphore *fshared)
1101{ 1047{
1102 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; 1048 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
1103 struct futex_pi_state *pi_state = q->pi_state; 1049 struct futex_pi_state *pi_state = q->pi_state;
@@ -1176,7 +1122,7 @@ retry:
1176handle_fault: 1122handle_fault:
1177 spin_unlock(q->lock_ptr); 1123 spin_unlock(q->lock_ptr);
1178 1124
1179 ret = futex_handle_fault((unsigned long)uaddr, fshared, attempt++); 1125 ret = futex_handle_fault((unsigned long)uaddr, attempt++);
1180 1126
1181 spin_lock(q->lock_ptr); 1127 spin_lock(q->lock_ptr);
1182 1128
@@ -1200,7 +1146,7 @@ handle_fault:
1200 1146
1201static long futex_wait_restart(struct restart_block *restart); 1147static long futex_wait_restart(struct restart_block *restart);
1202 1148
1203static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, 1149static int futex_wait(u32 __user *uaddr, int fshared,
1204 u32 val, ktime_t *abs_time, u32 bitset) 1150 u32 val, ktime_t *abs_time, u32 bitset)
1205{ 1151{
1206 struct task_struct *curr = current; 1152 struct task_struct *curr = current;
@@ -1218,8 +1164,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1218 q.pi_state = NULL; 1164 q.pi_state = NULL;
1219 q.bitset = bitset; 1165 q.bitset = bitset;
1220 retry: 1166 retry:
1221 futex_lock_mm(fshared); 1167 q.key = FUTEX_KEY_INIT;
1222
1223 ret = get_futex_key(uaddr, fshared, &q.key); 1168 ret = get_futex_key(uaddr, fshared, &q.key);
1224 if (unlikely(ret != 0)) 1169 if (unlikely(ret != 0))
1225 goto out_release_sem; 1170 goto out_release_sem;
@@ -1251,12 +1196,6 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1251 if (unlikely(ret)) { 1196 if (unlikely(ret)) {
1252 queue_unlock(&q, hb); 1197 queue_unlock(&q, hb);
1253 1198
1254 /*
1255 * If we would have faulted, release mmap_sem, fault it in and
1256 * start all over again.
1257 */
1258 futex_unlock_mm(fshared);
1259
1260 ret = get_user(uval, uaddr); 1199 ret = get_user(uval, uaddr);
1261 1200
1262 if (!ret) 1201 if (!ret)
@@ -1271,12 +1210,6 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1271 queue_me(&q, hb); 1210 queue_me(&q, hb);
1272 1211
1273 /* 1212 /*
1274 * Now the futex is queued and we have checked the data, we
1275 * don't want to hold mmap_sem while we sleep.
1276 */
1277 futex_unlock_mm(fshared);
1278
1279 /*
1280 * There might have been scheduling since the queue_me(), as we 1213 * There might have been scheduling since the queue_me(), as we
1281 * cannot hold a spinlock across the get_user() in case it 1214 * cannot hold a spinlock across the get_user() in case it
1282 * faults, and we cannot just set TASK_INTERRUPTIBLE state when 1215 * faults, and we cannot just set TASK_INTERRUPTIBLE state when
@@ -1363,7 +1296,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1363 queue_unlock(&q, hb); 1296 queue_unlock(&q, hb);
1364 1297
1365 out_release_sem: 1298 out_release_sem:
1366 futex_unlock_mm(fshared); 1299 put_futex_key(fshared, &q.key);
1367 return ret; 1300 return ret;
1368} 1301}
1369 1302
@@ -1371,13 +1304,13 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1371static long futex_wait_restart(struct restart_block *restart) 1304static long futex_wait_restart(struct restart_block *restart)
1372{ 1305{
1373 u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; 1306 u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
1374 struct rw_semaphore *fshared = NULL; 1307 int fshared = 0;
1375 ktime_t t; 1308 ktime_t t;
1376 1309
1377 t.tv64 = restart->futex.time; 1310 t.tv64 = restart->futex.time;
1378 restart->fn = do_no_restart_syscall; 1311 restart->fn = do_no_restart_syscall;
1379 if (restart->futex.flags & FLAGS_SHARED) 1312 if (restart->futex.flags & FLAGS_SHARED)
1380 fshared = &current->mm->mmap_sem; 1313 fshared = 1;
1381 return (long)futex_wait(uaddr, fshared, restart->futex.val, &t, 1314 return (long)futex_wait(uaddr, fshared, restart->futex.val, &t,
1382 restart->futex.bitset); 1315 restart->futex.bitset);
1383} 1316}
@@ -1389,7 +1322,7 @@ static long futex_wait_restart(struct restart_block *restart)
1389 * if there are waiters then it will block, it does PI, etc. (Due to 1322 * if there are waiters then it will block, it does PI, etc. (Due to
1390 * races the kernel might see a 0 value of the futex too.) 1323 * races the kernel might see a 0 value of the futex too.)
1391 */ 1324 */
1392static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, 1325static int futex_lock_pi(u32 __user *uaddr, int fshared,
1393 int detect, ktime_t *time, int trylock) 1326 int detect, ktime_t *time, int trylock)
1394{ 1327{
1395 struct hrtimer_sleeper timeout, *to = NULL; 1328 struct hrtimer_sleeper timeout, *to = NULL;
@@ -1412,8 +1345,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1412 1345
1413 q.pi_state = NULL; 1346 q.pi_state = NULL;
1414 retry: 1347 retry:
1415 futex_lock_mm(fshared); 1348 q.key = FUTEX_KEY_INIT;
1416
1417 ret = get_futex_key(uaddr, fshared, &q.key); 1349 ret = get_futex_key(uaddr, fshared, &q.key);
1418 if (unlikely(ret != 0)) 1350 if (unlikely(ret != 0))
1419 goto out_release_sem; 1351 goto out_release_sem;
@@ -1502,7 +1434,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1502 * exit to complete. 1434 * exit to complete.
1503 */ 1435 */
1504 queue_unlock(&q, hb); 1436 queue_unlock(&q, hb);
1505 futex_unlock_mm(fshared);
1506 cond_resched(); 1437 cond_resched();
1507 goto retry; 1438 goto retry;
1508 1439
@@ -1534,12 +1465,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1534 */ 1465 */
1535 queue_me(&q, hb); 1466 queue_me(&q, hb);
1536 1467
1537 /*
1538 * Now the futex is queued and we have checked the data, we
1539 * don't want to hold mmap_sem while we sleep.
1540 */
1541 futex_unlock_mm(fshared);
1542
1543 WARN_ON(!q.pi_state); 1468 WARN_ON(!q.pi_state);
1544 /* 1469 /*
1545 * Block on the PI mutex: 1470 * Block on the PI mutex:
@@ -1552,7 +1477,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1552 ret = ret ? 0 : -EWOULDBLOCK; 1477 ret = ret ? 0 : -EWOULDBLOCK;
1553 } 1478 }
1554 1479
1555 futex_lock_mm(fshared);
1556 spin_lock(q.lock_ptr); 1480 spin_lock(q.lock_ptr);
1557 1481
1558 if (!ret) { 1482 if (!ret) {
@@ -1618,7 +1542,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1618 1542
1619 /* Unqueue and drop the lock */ 1543 /* Unqueue and drop the lock */
1620 unqueue_me_pi(&q); 1544 unqueue_me_pi(&q);
1621 futex_unlock_mm(fshared);
1622 1545
1623 if (to) 1546 if (to)
1624 destroy_hrtimer_on_stack(&to->timer); 1547 destroy_hrtimer_on_stack(&to->timer);
@@ -1628,7 +1551,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1628 queue_unlock(&q, hb); 1551 queue_unlock(&q, hb);
1629 1552
1630 out_release_sem: 1553 out_release_sem:
1631 futex_unlock_mm(fshared); 1554 put_futex_key(fshared, &q.key);
1632 if (to) 1555 if (to)
1633 destroy_hrtimer_on_stack(&to->timer); 1556 destroy_hrtimer_on_stack(&to->timer);
1634 return ret; 1557 return ret;
@@ -1645,15 +1568,12 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1645 queue_unlock(&q, hb); 1568 queue_unlock(&q, hb);
1646 1569
1647 if (attempt++) { 1570 if (attempt++) {
1648 ret = futex_handle_fault((unsigned long)uaddr, fshared, 1571 ret = futex_handle_fault((unsigned long)uaddr, attempt);
1649 attempt);
1650 if (ret) 1572 if (ret)
1651 goto out_release_sem; 1573 goto out_release_sem;
1652 goto retry_unlocked; 1574 goto retry_unlocked;
1653 } 1575 }
1654 1576
1655 futex_unlock_mm(fshared);
1656
1657 ret = get_user(uval, uaddr); 1577 ret = get_user(uval, uaddr);
1658 if (!ret && (uval != -EFAULT)) 1578 if (!ret && (uval != -EFAULT))
1659 goto retry; 1579 goto retry;
@@ -1668,13 +1588,13 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1668 * This is the in-kernel slowpath: we look up the PI state (if any), 1588 * This is the in-kernel slowpath: we look up the PI state (if any),
1669 * and do the rt-mutex unlock. 1589 * and do the rt-mutex unlock.
1670 */ 1590 */
1671static int futex_unlock_pi(u32 __user *uaddr, struct rw_semaphore *fshared) 1591static int futex_unlock_pi(u32 __user *uaddr, int fshared)
1672{ 1592{
1673 struct futex_hash_bucket *hb; 1593 struct futex_hash_bucket *hb;
1674 struct futex_q *this, *next; 1594 struct futex_q *this, *next;
1675 u32 uval; 1595 u32 uval;
1676 struct plist_head *head; 1596 struct plist_head *head;
1677 union futex_key key; 1597 union futex_key key = FUTEX_KEY_INIT;
1678 int ret, attempt = 0; 1598 int ret, attempt = 0;
1679 1599
1680retry: 1600retry:
@@ -1685,10 +1605,6 @@ retry:
1685 */ 1605 */
1686 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) 1606 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
1687 return -EPERM; 1607 return -EPERM;
1688 /*
1689 * First take all the futex related locks:
1690 */
1691 futex_lock_mm(fshared);
1692 1608
1693 ret = get_futex_key(uaddr, fshared, &key); 1609 ret = get_futex_key(uaddr, fshared, &key);
1694 if (unlikely(ret != 0)) 1610 if (unlikely(ret != 0))
@@ -1747,7 +1663,7 @@ retry_unlocked:
1747out_unlock: 1663out_unlock:
1748 spin_unlock(&hb->lock); 1664 spin_unlock(&hb->lock);
1749out: 1665out:
1750 futex_unlock_mm(fshared); 1666 put_futex_key(fshared, &key);
1751 1667
1752 return ret; 1668 return ret;
1753 1669
@@ -1763,16 +1679,13 @@ pi_faulted:
1763 spin_unlock(&hb->lock); 1679 spin_unlock(&hb->lock);
1764 1680
1765 if (attempt++) { 1681 if (attempt++) {
1766 ret = futex_handle_fault((unsigned long)uaddr, fshared, 1682 ret = futex_handle_fault((unsigned long)uaddr, attempt);
1767 attempt);
1768 if (ret) 1683 if (ret)
1769 goto out; 1684 goto out;
1770 uval = 0; 1685 uval = 0;
1771 goto retry_unlocked; 1686 goto retry_unlocked;
1772 } 1687 }
1773 1688
1774 futex_unlock_mm(fshared);
1775
1776 ret = get_user(uval, uaddr); 1689 ret = get_user(uval, uaddr);
1777 if (!ret && (uval != -EFAULT)) 1690 if (!ret && (uval != -EFAULT))
1778 goto retry; 1691 goto retry;
@@ -1898,8 +1811,7 @@ retry:
1898 * PI futexes happens in exit_pi_state(): 1811 * PI futexes happens in exit_pi_state():
1899 */ 1812 */
1900 if (!pi && (uval & FUTEX_WAITERS)) 1813 if (!pi && (uval & FUTEX_WAITERS))
1901 futex_wake(uaddr, &curr->mm->mmap_sem, 1, 1814 futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
1902 FUTEX_BITSET_MATCH_ANY);
1903 } 1815 }
1904 return 0; 1816 return 0;
1905} 1817}
@@ -1995,10 +1907,10 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
1995{ 1907{
1996 int ret = -ENOSYS; 1908 int ret = -ENOSYS;
1997 int cmd = op & FUTEX_CMD_MASK; 1909 int cmd = op & FUTEX_CMD_MASK;
1998 struct rw_semaphore *fshared = NULL; 1910 int fshared = 0;
1999 1911
2000 if (!(op & FUTEX_PRIVATE_FLAG)) 1912 if (!(op & FUTEX_PRIVATE_FLAG))
2001 fshared = &current->mm->mmap_sem; 1913 fshared = 1;
2002 1914
2003 switch (cmd) { 1915 switch (cmd) {
2004 case FUTEX_WAIT: 1916 case FUTEX_WAIT:
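
Taken together, the futex.c hunks replace the mmap_sem-based `struct rw_semaphore *fshared` plumbing with a plain `int fshared` flag and reference-counted keys: keys start as FUTEX_KEY_INIT, get_futex_key() pins the backing inode or mm via get_futex_key_refs(), and every exit path balances that with put_futex_key(). The compilable userspace model below illustrates only that lifecycle; the struct layout and the refs counter are simplifications, not the kernel's union futex_key:

	#include <stdio.h>

	#define FUT_OFF_MMSHARED 2	/* flag name from the kernel, value illustrative */

	struct futex_key_model {
		unsigned long ptr;	/* stands in for key->both.ptr */
		int offset;		/* carries the FUT_OFF_* bits  */
		int refs;		/* models the inode/mm pin     */
	};

	#define FUTEX_KEY_INIT	((struct futex_key_model){ 0, 0, 0 })

	static void get_futex_key_refs(struct futex_key_model *key)
	{
		if (!key->ptr)
			return;		/* private futex: nothing to pin */
		key->refs++;		/* shared: pin the inode or mm   */
	}

	static void put_futex_key(int fshared, struct futex_key_model *key)
	{
		(void)fshared;		/* kept only for call-site symmetry */
		if (!key->ptr)
			return;
		key->refs--;		/* drop_futex_key_refs()            */
	}

	int main(void)
	{
		struct futex_key_model key = FUTEX_KEY_INIT;

		/* get_futex_key() would fill these in and take the reference */
		key.ptr = 1;
		key.offset |= FUT_OFF_MMSHARED;
		get_futex_key_refs(&key);

		put_futex_key(1, &key);	/* every exit path now drops the ref */
		printf("refs after balanced get/put: %d\n", key.refs);
		return 0;
	}
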
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index cc0f7321b8ce..650ce4102a63 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -40,6 +40,9 @@ unsigned long probe_irq_on(void)
40 * flush such a longstanding irq before considering it as spurious. 40 * flush such a longstanding irq before considering it as spurious.
41 */ 41 */
42 for_each_irq_desc_reverse(i, desc) { 42 for_each_irq_desc_reverse(i, desc) {
43 if (!desc)
44 continue;
45
43 spin_lock_irq(&desc->lock); 46 spin_lock_irq(&desc->lock);
44 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 47 if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
45 /* 48 /*
@@ -68,6 +71,9 @@ unsigned long probe_irq_on(void)
68 * happened in the previous stage, it may have masked itself) 71 * happened in the previous stage, it may have masked itself)
69 */ 72 */
70 for_each_irq_desc_reverse(i, desc) { 73 for_each_irq_desc_reverse(i, desc) {
74 if (!desc)
75 continue;
76
71 spin_lock_irq(&desc->lock); 77 spin_lock_irq(&desc->lock);
72 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 78 if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
73 desc->status |= IRQ_AUTODETECT | IRQ_WAITING; 79 desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
@@ -86,6 +92,9 @@ unsigned long probe_irq_on(void)
86 * Now filter out any obviously spurious interrupts 92 * Now filter out any obviously spurious interrupts
87 */ 93 */
88 for_each_irq_desc(i, desc) { 94 for_each_irq_desc(i, desc) {
95 if (!desc)
96 continue;
97
89 spin_lock_irq(&desc->lock); 98 spin_lock_irq(&desc->lock);
90 status = desc->status; 99 status = desc->status;
91 100
@@ -124,6 +133,9 @@ unsigned int probe_irq_mask(unsigned long val)
124 int i; 133 int i;
125 134
126 for_each_irq_desc(i, desc) { 135 for_each_irq_desc(i, desc) {
136 if (!desc)
137 continue;
138
127 spin_lock_irq(&desc->lock); 139 spin_lock_irq(&desc->lock);
128 status = desc->status; 140 status = desc->status;
129 141
@@ -166,6 +178,9 @@ int probe_irq_off(unsigned long val)
166 unsigned int status; 178 unsigned int status;
167 179
168 for_each_irq_desc(i, desc) { 180 for_each_irq_desc(i, desc) {
181 if (!desc)
182 continue;
183
169 spin_lock_irq(&desc->lock); 184 spin_lock_irq(&desc->lock);
170 status = desc->status; 185 status = desc->status;
171 186
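
With the sparse IRQ table, for_each_irq_desc() can now hand back NULL entries, which is why each probe loop above gains an `if (!desc) continue;` guard. A small standalone illustration of the same skip-the-holes pattern; the array below is only a stand-in for the kernel's irq_desc_ptrs[] table:

	#include <stdio.h>

	struct irq_desc { int irq; };

	int main(void)
	{
		struct irq_desc d0 = { 0 }, d4 = { 4 };
		/* NULL holes model IRQs whose descriptors were never allocated. */
		struct irq_desc *irq_desc_ptrs[8] = { &d0, NULL, NULL, NULL, &d4 };
		int i;

		for (i = 0; i < 8; i++) {
			struct irq_desc *desc = irq_desc_ptrs[i];

			if (!desc)
				continue;	/* the guard each loop above gains */
			printf("probing irq %d\n", desc->irq);
		}
		return 0;
	}
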
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 58d8e31daa49..0af16aeee8b6 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -24,9 +24,10 @@
24 */ 24 */
25void dynamic_irq_init(unsigned int irq) 25void dynamic_irq_init(unsigned int irq)
26{ 26{
27 struct irq_desc *desc = irq_to_desc(irq); 27 struct irq_desc *desc;
28 unsigned long flags; 28 unsigned long flags;
29 29
30 desc = irq_to_desc(irq);
30 if (!desc) { 31 if (!desc) {
31 WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq); 32 WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);
32 return; 33 return;
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index c815b42d0f5b..8aa09547f5ef 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -15,9 +15,16 @@
15#include <linux/random.h> 15#include <linux/random.h>
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/kernel_stat.h> 17#include <linux/kernel_stat.h>
18#include <linux/rculist.h>
19#include <linux/hash.h>
18 20
19#include "internals.h" 21#include "internals.h"
20 22
23/*
24 * lockdep: we want to handle all irq_desc locks as a single lock-class:
25 */
26static struct lock_class_key irq_desc_lock_class;
27
21/** 28/**
22 * handle_bad_irq - handle spurious and unhandled irqs 29 * handle_bad_irq - handle spurious and unhandled irqs
23 * @irq: the interrupt number 30 * @irq: the interrupt number
@@ -49,6 +56,155 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
49int nr_irqs = NR_IRQS; 56int nr_irqs = NR_IRQS;
50EXPORT_SYMBOL_GPL(nr_irqs); 57EXPORT_SYMBOL_GPL(nr_irqs);
51 58
59void __init __attribute__((weak)) arch_early_irq_init(void)
60{
61}
62
63#ifdef CONFIG_SPARSE_IRQ
64static struct irq_desc irq_desc_init = {
65 .irq = -1,
66 .status = IRQ_DISABLED,
67 .chip = &no_irq_chip,
68 .handle_irq = handle_bad_irq,
69 .depth = 1,
70 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
71#ifdef CONFIG_SMP
72 .affinity = CPU_MASK_ALL
73#endif
74};
75
76static void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
77{
78 unsigned long bytes;
79 char *ptr;
80 int node;
81
82 /* Compute how many bytes we need per irq and allocate them */
83 bytes = nr * sizeof(unsigned int);
84
85 node = cpu_to_node(cpu);
86 ptr = kzalloc_node(bytes, GFP_ATOMIC, node);
87 printk(KERN_DEBUG " alloc kstat_irqs on cpu %d node %d\n", cpu, node);
88
89 if (ptr)
90 desc->kstat_irqs = (unsigned int *)ptr;
91}
92
93void __attribute__((weak)) arch_init_chip_data(struct irq_desc *desc, int cpu)
94{
95}
96
97static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
98{
99 memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
100 desc->irq = irq;
101#ifdef CONFIG_SMP
102 desc->cpu = cpu;
103#endif
104 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
105 init_kstat_irqs(desc, cpu, nr_cpu_ids);
106 if (!desc->kstat_irqs) {
107 printk(KERN_ERR "can not alloc kstat_irqs\n");
108 BUG_ON(1);
109 }
110 arch_init_chip_data(desc, cpu);
111}
112
113/*
114 * Protect the sparse_irqs:
115 */
116static DEFINE_SPINLOCK(sparse_irq_lock);
117
118struct irq_desc *irq_desc_ptrs[NR_IRQS] __read_mostly;
119
120static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
121 [0 ... NR_IRQS_LEGACY-1] = {
122 .irq = -1,
123 .status = IRQ_DISABLED,
124 .chip = &no_irq_chip,
125 .handle_irq = handle_bad_irq,
126 .depth = 1,
127 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
128#ifdef CONFIG_SMP
129 .affinity = CPU_MASK_ALL
130#endif
131 }
132};
133
134/* FIXME: use bootmem alloc ...*/
135static unsigned int kstat_irqs_legacy[NR_IRQS_LEGACY][NR_CPUS];
136
137void __init early_irq_init(void)
138{
139 struct irq_desc *desc;
140 int legacy_count;
141 int i;
142
143 desc = irq_desc_legacy;
144 legacy_count = ARRAY_SIZE(irq_desc_legacy);
145
146 for (i = 0; i < legacy_count; i++) {
147 desc[i].irq = i;
148 desc[i].kstat_irqs = kstat_irqs_legacy[i];
149
150 irq_desc_ptrs[i] = desc + i;
151 }
152
153 for (i = legacy_count; i < NR_IRQS; i++)
154 irq_desc_ptrs[i] = NULL;
155
156 arch_early_irq_init();
157}
158
159struct irq_desc *irq_to_desc(unsigned int irq)
160{
161 return (irq < NR_IRQS) ? irq_desc_ptrs[irq] : NULL;
162}
163
164struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
165{
166 struct irq_desc *desc;
167 unsigned long flags;
168 int node;
169
170 if (irq >= NR_IRQS) {
171 printk(KERN_WARNING "irq >= NR_IRQS in irq_to_desc_alloc: %d %d\n",
172 irq, NR_IRQS);
173 WARN_ON(1);
174 return NULL;
175 }
176
177 desc = irq_desc_ptrs[irq];
178 if (desc)
179 return desc;
180
181 spin_lock_irqsave(&sparse_irq_lock, flags);
182
183 /* We have to check it to avoid races with another CPU */
184 desc = irq_desc_ptrs[irq];
185 if (desc)
186 goto out_unlock;
187
188 node = cpu_to_node(cpu);
189 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
190 printk(KERN_DEBUG " alloc irq_desc for %d on cpu %d node %d\n",
191 irq, cpu, node);
192 if (!desc) {
193 printk(KERN_ERR "can not alloc irq_desc\n");
194 BUG_ON(1);
195 }
196 init_one_irq_desc(irq, desc, cpu);
197
198 irq_desc_ptrs[irq] = desc;
199
200out_unlock:
201 spin_unlock_irqrestore(&sparse_irq_lock, flags);
202
203 return desc;
204}
205
206#else
207
52struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { 208struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
53 [0 ... NR_IRQS-1] = { 209 [0 ... NR_IRQS-1] = {
54 .status = IRQ_DISABLED, 210 .status = IRQ_DISABLED,
@@ -62,6 +218,8 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
62 } 218 }
63}; 219};
64 220
221#endif
222
65/* 223/*
66 * What should we do if we get a hw irq event on an illegal vector? 224 * What should we do if we get a hw irq event on an illegal vector?
67 * Each architecture has to answer this themself. 225 * Each architecture has to answer this themself.
@@ -261,17 +419,28 @@ out:
261 419
262 420
263#ifdef CONFIG_TRACE_IRQFLAGS 421#ifdef CONFIG_TRACE_IRQFLAGS
264/*
265 * lockdep: we want to handle all irq_desc locks as a single lock-class:
266 */
267static struct lock_class_key irq_desc_lock_class;
268
269void early_init_irq_lock_class(void) 422void early_init_irq_lock_class(void)
270{ 423{
424#ifndef CONFIG_SPARSE_IRQ
271 struct irq_desc *desc; 425 struct irq_desc *desc;
272 int i; 426 int i;
273 427
274 for_each_irq_desc(i, desc) 428 for_each_irq_desc(i, desc) {
429 if (!desc)
430 continue;
431
275 lockdep_set_class(&desc->lock, &irq_desc_lock_class); 432 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
433 }
434#endif
435}
436#endif
437
438#ifdef CONFIG_SPARSE_IRQ
439unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
440{
441 struct irq_desc *desc = irq_to_desc(irq);
442 return desc->kstat_irqs[cpu];
276} 443}
277#endif 444#endif
445EXPORT_SYMBOL(kstat_irqs_cpu);
446
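
The new irq_to_desc_alloc_cpu() above uses a check, lock, re-check pattern so concurrent CPUs populate a sparse slot exactly once. A userspace sketch of that pattern, with a pthread mutex standing in for sparse_irq_lock and calloc() for kzalloc_node(); the table size and allocation size are arbitrary:

	#include <pthread.h>
	#include <stdlib.h>

	#define NR_SLOTS 16

	static void *table[NR_SLOTS];		/* sparse table, holes are NULL */
	static pthread_mutex_t sparse_lock = PTHREAD_MUTEX_INITIALIZER;

	static void *desc_alloc(unsigned int irq)
	{
		void *desc;

		if (irq >= NR_SLOTS)
			return NULL;

		desc = table[irq];
		if (desc)
			return desc;		/* fast path: already populated */

		pthread_mutex_lock(&sparse_lock);
		desc = table[irq];		/* re-check: another CPU may have won */
		if (!desc) {
			desc = calloc(1, 64);	/* kzalloc_node() in the kernel */
			table[irq] = desc;
		}
		pthread_mutex_unlock(&sparse_lock);
		return desc;
	}

	int main(void)
	{
		void *a = desc_alloc(3), *b = desc_alloc(3);

		return (a && a == b) ? 0 : 1;	/* same slot both times */
	}
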
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 8e91c9762520..d2c0e5ee53c5 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -252,7 +252,11 @@ void init_irq_proc(void)
252 /* 252 /*
253 * Create entries for all existing IRQs. 253 * Create entries for all existing IRQs.
254 */ 254 */
255 for_each_irq_desc(irq, desc) 255 for_each_irq_desc(irq, desc) {
256 if (!desc)
257 continue;
258
256 register_irq_proc(irq, desc); 259 register_irq_proc(irq, desc);
260 }
257} 261}
258 262
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index dd364c11e56e..3738107531fd 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -91,6 +91,9 @@ static int misrouted_irq(int irq)
91 int i, ok = 0; 91 int i, ok = 0;
92 92
93 for_each_irq_desc(i, desc) { 93 for_each_irq_desc(i, desc) {
94 if (!desc)
95 continue;
96
94 if (!i) 97 if (!i)
95 continue; 98 continue;
96 99
@@ -112,6 +115,8 @@ static void poll_spurious_irqs(unsigned long dummy)
112 for_each_irq_desc(i, desc) { 115 for_each_irq_desc(i, desc) {
113 unsigned int status; 116 unsigned int status;
114 117
118 if (!desc)
119 continue;
115 if (!i) 120 if (!i)
116 continue; 121 continue;
117 122
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 8e7a7ce3ed0a..4fbc456f393d 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -21,6 +21,9 @@ static DEFINE_SPINLOCK(kthread_create_lock);
21static LIST_HEAD(kthread_create_list); 21static LIST_HEAD(kthread_create_list);
22struct task_struct *kthreadd_task; 22struct task_struct *kthreadd_task;
23 23
24DEFINE_TRACE(sched_kthread_stop);
25DEFINE_TRACE(sched_kthread_stop_ret);
26
24struct kthread_create_info 27struct kthread_create_info
25{ 28{
26 /* Information passed to kthread() from kthreadd. */ 29 /* Information passed to kthread() from kthreadd. */
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 46a404173db2..c4c7df23f8c7 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -25,6 +25,7 @@
25 * Thanks to Arjan van de Ven for coming up with the initial idea of 25 * Thanks to Arjan van de Ven for coming up with the initial idea of
26 * mapping lock dependencies runtime. 26 * mapping lock dependencies runtime.
27 */ 27 */
28#define DISABLE_BRANCH_PROFILING
28#include <linux/mutex.h> 29#include <linux/mutex.h>
29#include <linux/sched.h> 30#include <linux/sched.h>
30#include <linux/delay.h> 31#include <linux/delay.h>
@@ -136,16 +137,16 @@ static inline struct lock_class *hlock_class(struct held_lock *hlock)
136#ifdef CONFIG_LOCK_STAT 137#ifdef CONFIG_LOCK_STAT
137static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats); 138static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats);
138 139
139static int lock_contention_point(struct lock_class *class, unsigned long ip) 140static int lock_point(unsigned long points[], unsigned long ip)
140{ 141{
141 int i; 142 int i;
142 143
143 for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) { 144 for (i = 0; i < LOCKSTAT_POINTS; i++) {
144 if (class->contention_point[i] == 0) { 145 if (points[i] == 0) {
145 class->contention_point[i] = ip; 146 points[i] = ip;
146 break; 147 break;
147 } 148 }
148 if (class->contention_point[i] == ip) 149 if (points[i] == ip)
149 break; 150 break;
150 } 151 }
151 152
@@ -185,6 +186,9 @@ struct lock_class_stats lock_stats(struct lock_class *class)
185 for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++) 186 for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++)
186 stats.contention_point[i] += pcs->contention_point[i]; 187 stats.contention_point[i] += pcs->contention_point[i];
187 188
189 for (i = 0; i < ARRAY_SIZE(stats.contending_point); i++)
190 stats.contending_point[i] += pcs->contending_point[i];
191
188 lock_time_add(&pcs->read_waittime, &stats.read_waittime); 192 lock_time_add(&pcs->read_waittime, &stats.read_waittime);
189 lock_time_add(&pcs->write_waittime, &stats.write_waittime); 193 lock_time_add(&pcs->write_waittime, &stats.write_waittime);
190 194
@@ -209,6 +213,7 @@ void clear_lock_stats(struct lock_class *class)
209 memset(cpu_stats, 0, sizeof(struct lock_class_stats)); 213 memset(cpu_stats, 0, sizeof(struct lock_class_stats));
210 } 214 }
211 memset(class->contention_point, 0, sizeof(class->contention_point)); 215 memset(class->contention_point, 0, sizeof(class->contention_point));
216 memset(class->contending_point, 0, sizeof(class->contending_point));
212} 217}
213 218
214static struct lock_class_stats *get_lock_stats(struct lock_class *class) 219static struct lock_class_stats *get_lock_stats(struct lock_class *class)
@@ -2999,7 +3004,7 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
2999 struct held_lock *hlock, *prev_hlock; 3004 struct held_lock *hlock, *prev_hlock;
3000 struct lock_class_stats *stats; 3005 struct lock_class_stats *stats;
3001 unsigned int depth; 3006 unsigned int depth;
3002 int i, point; 3007 int i, contention_point, contending_point;
3003 3008
3004 depth = curr->lockdep_depth; 3009 depth = curr->lockdep_depth;
3005 if (DEBUG_LOCKS_WARN_ON(!depth)) 3010 if (DEBUG_LOCKS_WARN_ON(!depth))
@@ -3023,18 +3028,22 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
3023found_it: 3028found_it:
3024 hlock->waittime_stamp = sched_clock(); 3029 hlock->waittime_stamp = sched_clock();
3025 3030
3026 point = lock_contention_point(hlock_class(hlock), ip); 3031 contention_point = lock_point(hlock_class(hlock)->contention_point, ip);
3032 contending_point = lock_point(hlock_class(hlock)->contending_point,
3033 lock->ip);
3027 3034
3028 stats = get_lock_stats(hlock_class(hlock)); 3035 stats = get_lock_stats(hlock_class(hlock));
3029 if (point < ARRAY_SIZE(stats->contention_point)) 3036 if (contention_point < LOCKSTAT_POINTS)
3030 stats->contention_point[point]++; 3037 stats->contention_point[contention_point]++;
3038 if (contending_point < LOCKSTAT_POINTS)
3039 stats->contending_point[contending_point]++;
3031 if (lock->cpu != smp_processor_id()) 3040 if (lock->cpu != smp_processor_id())
3032 stats->bounces[bounce_contended + !!hlock->read]++; 3041 stats->bounces[bounce_contended + !!hlock->read]++;
3033 put_lock_stats(stats); 3042 put_lock_stats(stats);
3034} 3043}
3035 3044
3036static void 3045static void
3037__lock_acquired(struct lockdep_map *lock) 3046__lock_acquired(struct lockdep_map *lock, unsigned long ip)
3038{ 3047{
3039 struct task_struct *curr = current; 3048 struct task_struct *curr = current;
3040 struct held_lock *hlock, *prev_hlock; 3049 struct held_lock *hlock, *prev_hlock;
@@ -3083,6 +3092,7 @@ found_it:
3083 put_lock_stats(stats); 3092 put_lock_stats(stats);
3084 3093
3085 lock->cpu = cpu; 3094 lock->cpu = cpu;
3095 lock->ip = ip;
3086} 3096}
3087 3097
3088void lock_contended(struct lockdep_map *lock, unsigned long ip) 3098void lock_contended(struct lockdep_map *lock, unsigned long ip)
@@ -3104,7 +3114,7 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
3104} 3114}
3105EXPORT_SYMBOL_GPL(lock_contended); 3115EXPORT_SYMBOL_GPL(lock_contended);
3106 3116
3107void lock_acquired(struct lockdep_map *lock) 3117void lock_acquired(struct lockdep_map *lock, unsigned long ip)
3108{ 3118{
3109 unsigned long flags; 3119 unsigned long flags;
3110 3120
@@ -3117,7 +3127,7 @@ void lock_acquired(struct lockdep_map *lock)
3117 raw_local_irq_save(flags); 3127 raw_local_irq_save(flags);
3118 check_flags(flags); 3128 check_flags(flags);
3119 current->lockdep_recursion = 1; 3129 current->lockdep_recursion = 1;
3120 __lock_acquired(lock); 3130 __lock_acquired(lock, ip);
3121 current->lockdep_recursion = 0; 3131 current->lockdep_recursion = 0;
3122 raw_local_irq_restore(flags); 3132 raw_local_irq_restore(flags);
3123} 3133}
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 20dbcbf9c7dd..13716b813896 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -470,11 +470,12 @@ static void seq_line(struct seq_file *m, char c, int offset, int length)
470 470
471static void snprint_time(char *buf, size_t bufsiz, s64 nr) 471static void snprint_time(char *buf, size_t bufsiz, s64 nr)
472{ 472{
473 unsigned long rem; 473 s64 div;
474 s32 rem;
474 475
475 nr += 5; /* for display rounding */ 476 nr += 5; /* for display rounding */
476 rem = do_div(nr, 1000); /* XXX: do_div_signed */ 477 div = div_s64_rem(nr, 1000, &rem);
477 snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, (int)rem/10); 478 snprintf(buf, bufsiz, "%lld.%02d", (long long)div, (int)rem/10);
478} 479}
479 480
480static void seq_time(struct seq_file *m, s64 time) 481static void seq_time(struct seq_file *m, s64 time)
@@ -556,7 +557,7 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
556 if (stats->read_holdtime.nr) 557 if (stats->read_holdtime.nr)
557 namelen += 2; 558 namelen += 2;
558 559
559 for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) { 560 for (i = 0; i < LOCKSTAT_POINTS; i++) {
560 char sym[KSYM_SYMBOL_LEN]; 561 char sym[KSYM_SYMBOL_LEN];
561 char ip[32]; 562 char ip[32];
562 563
@@ -573,6 +574,23 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
573 stats->contention_point[i], 574 stats->contention_point[i],
574 ip, sym); 575 ip, sym);
575 } 576 }
577 for (i = 0; i < LOCKSTAT_POINTS; i++) {
578 char sym[KSYM_SYMBOL_LEN];
579 char ip[32];
580
581 if (class->contending_point[i] == 0)
582 break;
583
584 if (!i)
585 seq_line(m, '-', 40-namelen, namelen);
586
587 sprint_symbol(sym, class->contending_point[i]);
588 snprintf(ip, sizeof(ip), "[<%p>]",
589 (void *)class->contending_point[i]);
590 seq_printf(m, "%40s %14lu %29s %s\n", name,
591 stats->contending_point[i],
592 ip, sym);
593 }
576 if (i) { 594 if (i) {
577 seq_puts(m, "\n"); 595 seq_puts(m, "\n");
578 seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1)); 596 seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1));
@@ -582,7 +600,7 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
582 600
583static void seq_header(struct seq_file *m) 601static void seq_header(struct seq_file *m)
584{ 602{
585 seq_printf(m, "lock_stat version 0.2\n"); 603 seq_printf(m, "lock_stat version 0.3\n");
586 seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); 604 seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1));
587 seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s " 605 seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s "
588 "%14s %14s\n", 606 "%14s %14s\n",
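
snprint_time() previously leaned on do_div(), which only handles unsigned dividends (hence the old "XXX: do_div_signed" note); it now uses div_s64_rem() so the quotient and remainder stay correctly signed. A userspace equivalent of the fixed formatting, with plain C99 signed division standing in for div_s64_rem():

	#include <stdio.h>
	#include <inttypes.h>

	static void snprint_time(char *buf, size_t bufsiz, int64_t nr)
	{
		int64_t div;
		int32_t rem;

		nr += 5;			/* display rounding, as in the kernel */
		div = nr / 1000;
		rem = (int32_t)(nr % 1000);
		snprintf(buf, bufsiz, "%lld.%02d", (long long)div, (int)rem / 10);
	}

	int main(void)
	{
		char buf[32];

		snprint_time(buf, sizeof(buf), 123456);	/* prints 123.46 */
		puts(buf);
		return 0;
	}
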
diff --git a/kernel/marker.c b/kernel/marker.c
index e9c6b2bc9400..ea54f2647868 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -43,6 +43,7 @@ static DEFINE_MUTEX(markers_mutex);
43 */ 43 */
44#define MARKER_HASH_BITS 6 44#define MARKER_HASH_BITS 6
45#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS) 45#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS)
46static struct hlist_head marker_table[MARKER_TABLE_SIZE];
46 47
47/* 48/*
48 * Note about RCU : 49 * Note about RCU :
@@ -64,11 +65,10 @@ struct marker_entry {
64 void *oldptr; 65 void *oldptr;
65 int rcu_pending; 66 int rcu_pending;
66 unsigned char ptype:1; 67 unsigned char ptype:1;
68 unsigned char format_allocated:1;
67 char name[0]; /* Contains name'\0'format'\0' */ 69 char name[0]; /* Contains name'\0'format'\0' */
68}; 70};
69 71
70static struct hlist_head marker_table[MARKER_TABLE_SIZE];
71
72/** 72/**
73 * __mark_empty_function - Empty probe callback 73 * __mark_empty_function - Empty probe callback
74 * @probe_private: probe private data 74 * @probe_private: probe private data
@@ -81,7 +81,7 @@ static struct hlist_head marker_table[MARKER_TABLE_SIZE];
81 * though the function pointer change and the marker enabling are two distinct 81 * though the function pointer change and the marker enabling are two distinct
82 * operations that modifies the execution flow of preemptible code. 82 * operations that modifies the execution flow of preemptible code.
83 */ 83 */
84void __mark_empty_function(void *probe_private, void *call_private, 84notrace void __mark_empty_function(void *probe_private, void *call_private,
85 const char *fmt, va_list *args) 85 const char *fmt, va_list *args)
86{ 86{
87} 87}
@@ -97,7 +97,8 @@ EXPORT_SYMBOL_GPL(__mark_empty_function);
97 * need to put a full smp_rmb() in this branch. This is why we do not use 97 * need to put a full smp_rmb() in this branch. This is why we do not use
98 * rcu_dereference() for the pointer read. 98 * rcu_dereference() for the pointer read.
99 */ 99 */
100void marker_probe_cb(const struct marker *mdata, void *call_private, ...) 100notrace void marker_probe_cb(const struct marker *mdata,
101 void *call_private, ...)
101{ 102{
102 va_list args; 103 va_list args;
103 char ptype; 104 char ptype;
@@ -107,7 +108,7 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...)
107 * sure the teardown of the callbacks can be done correctly when they 108 * sure the teardown of the callbacks can be done correctly when they
108 * are in modules and they insure RCU read coherency. 109 * are in modules and they insure RCU read coherency.
109 */ 110 */
110 rcu_read_lock_sched(); 111 rcu_read_lock_sched_notrace();
111 ptype = mdata->ptype; 112 ptype = mdata->ptype;
112 if (likely(!ptype)) { 113 if (likely(!ptype)) {
113 marker_probe_func *func; 114 marker_probe_func *func;
@@ -145,7 +146,7 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...)
145 va_end(args); 146 va_end(args);
146 } 147 }
147 } 148 }
148 rcu_read_unlock_sched(); 149 rcu_read_unlock_sched_notrace();
149} 150}
150EXPORT_SYMBOL_GPL(marker_probe_cb); 151EXPORT_SYMBOL_GPL(marker_probe_cb);
151 152
@@ -157,12 +158,13 @@ EXPORT_SYMBOL_GPL(marker_probe_cb);
157 * 158 *
158 * Should be connected to markers "MARK_NOARGS". 159 * Should be connected to markers "MARK_NOARGS".
159 */ 160 */
160void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...) 161static notrace void marker_probe_cb_noarg(const struct marker *mdata,
162 void *call_private, ...)
161{ 163{
162 va_list args; /* not initialized */ 164 va_list args; /* not initialized */
163 char ptype; 165 char ptype;
164 166
165 rcu_read_lock_sched(); 167 rcu_read_lock_sched_notrace();
166 ptype = mdata->ptype; 168 ptype = mdata->ptype;
167 if (likely(!ptype)) { 169 if (likely(!ptype)) {
168 marker_probe_func *func; 170 marker_probe_func *func;
@@ -195,9 +197,8 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...)
195 multi[i].func(multi[i].probe_private, call_private, 197 multi[i].func(multi[i].probe_private, call_private,
196 mdata->format, &args); 198 mdata->format, &args);
197 } 199 }
198 rcu_read_unlock_sched(); 200 rcu_read_unlock_sched_notrace();
199} 201}
200EXPORT_SYMBOL_GPL(marker_probe_cb_noarg);
201 202
202static void free_old_closure(struct rcu_head *head) 203static void free_old_closure(struct rcu_head *head)
203{ 204{
@@ -416,6 +417,7 @@ static struct marker_entry *add_marker(const char *name, const char *format)
416 e->single.probe_private = NULL; 417 e->single.probe_private = NULL;
417 e->multi = NULL; 418 e->multi = NULL;
418 e->ptype = 0; 419 e->ptype = 0;
420 e->format_allocated = 0;
419 e->refcount = 0; 421 e->refcount = 0;
420 e->rcu_pending = 0; 422 e->rcu_pending = 0;
421 hlist_add_head(&e->hlist, head); 423 hlist_add_head(&e->hlist, head);
@@ -447,6 +449,8 @@ static int remove_marker(const char *name)
447 if (e->single.func != __mark_empty_function) 449 if (e->single.func != __mark_empty_function)
448 return -EBUSY; 450 return -EBUSY;
449 hlist_del(&e->hlist); 451 hlist_del(&e->hlist);
452 if (e->format_allocated)
453 kfree(e->format);
450 /* Make sure the call_rcu has been executed */ 454 /* Make sure the call_rcu has been executed */
451 if (e->rcu_pending) 455 if (e->rcu_pending)
452 rcu_barrier_sched(); 456 rcu_barrier_sched();
@@ -457,57 +461,34 @@ static int remove_marker(const char *name)
457/* 461/*
458 * Set the mark_entry format to the format found in the element. 462 * Set the mark_entry format to the format found in the element.
459 */ 463 */
460static int marker_set_format(struct marker_entry **entry, const char *format) 464static int marker_set_format(struct marker_entry *entry, const char *format)
461{ 465{
462 struct marker_entry *e; 466 entry->format = kstrdup(format, GFP_KERNEL);
463 size_t name_len = strlen((*entry)->name) + 1; 467 if (!entry->format)
464 size_t format_len = strlen(format) + 1;
465
466
467 e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
468 GFP_KERNEL);
469 if (!e)
470 return -ENOMEM; 468 return -ENOMEM;
471 memcpy(&e->name[0], (*entry)->name, name_len); 469 entry->format_allocated = 1;
472 e->format = &e->name[name_len]; 470
473 memcpy(e->format, format, format_len);
474 if (strcmp(e->format, MARK_NOARGS) == 0)
475 e->call = marker_probe_cb_noarg;
476 else
477 e->call = marker_probe_cb;
478 e->single = (*entry)->single;
479 e->multi = (*entry)->multi;
480 e->ptype = (*entry)->ptype;
481 e->refcount = (*entry)->refcount;
482 e->rcu_pending = 0;
483 hlist_add_before(&e->hlist, &(*entry)->hlist);
484 hlist_del(&(*entry)->hlist);
485 /* Make sure the call_rcu has been executed */
486 if ((*entry)->rcu_pending)
487 rcu_barrier_sched();
488 kfree(*entry);
489 *entry = e;
490 trace_mark(core_marker_format, "name %s format %s", 471 trace_mark(core_marker_format, "name %s format %s",
491 e->name, e->format); 472 entry->name, entry->format);
492 return 0; 473 return 0;
493} 474}
494 475
495/* 476/*
496 * Sets the probe callback corresponding to one marker. 477 * Sets the probe callback corresponding to one marker.
497 */ 478 */
498static int set_marker(struct marker_entry **entry, struct marker *elem, 479static int set_marker(struct marker_entry *entry, struct marker *elem,
499 int active) 480 int active)
500{ 481{
501 int ret; 482 int ret = 0;
502 WARN_ON(strcmp((*entry)->name, elem->name) != 0); 483 WARN_ON(strcmp(entry->name, elem->name) != 0);
503 484
504 if ((*entry)->format) { 485 if (entry->format) {
505 if (strcmp((*entry)->format, elem->format) != 0) { 486 if (strcmp(entry->format, elem->format) != 0) {
506 printk(KERN_NOTICE 487 printk(KERN_NOTICE
507 "Format mismatch for probe %s " 488 "Format mismatch for probe %s "
508 "(%s), marker (%s)\n", 489 "(%s), marker (%s)\n",
509 (*entry)->name, 490 entry->name,
510 (*entry)->format, 491 entry->format,
511 elem->format); 492 elem->format);
512 return -EPERM; 493 return -EPERM;
513 } 494 }
@@ -523,37 +504,67 @@ static int set_marker(struct marker_entry **entry, struct marker *elem,
523 * pass from a "safe" callback (with argument) to an "unsafe" 504 * pass from a "safe" callback (with argument) to an "unsafe"
524 * callback (does not set arguments). 505 * callback (does not set arguments).
525 */ 506 */
526 elem->call = (*entry)->call; 507 elem->call = entry->call;
527 /* 508 /*
528 * Sanity check : 509 * Sanity check :
529 * We only update the single probe private data when the ptr is 510 * We only update the single probe private data when the ptr is
530 * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1) 511 * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1)
531 */ 512 */
532 WARN_ON(elem->single.func != __mark_empty_function 513 WARN_ON(elem->single.func != __mark_empty_function
533 && elem->single.probe_private 514 && elem->single.probe_private != entry->single.probe_private
534 != (*entry)->single.probe_private && 515 && !elem->ptype);
535 !elem->ptype); 516 elem->single.probe_private = entry->single.probe_private;
536 elem->single.probe_private = (*entry)->single.probe_private;
537 /* 517 /*
538 * Make sure the private data is valid when we update the 518 * Make sure the private data is valid when we update the
539 * single probe ptr. 519 * single probe ptr.
540 */ 520 */
541 smp_wmb(); 521 smp_wmb();
542 elem->single.func = (*entry)->single.func; 522 elem->single.func = entry->single.func;
543 /* 523 /*
544 * We also make sure that the new probe callbacks array is consistent 524 * We also make sure that the new probe callbacks array is consistent
545 * before setting a pointer to it. 525 * before setting a pointer to it.
546 */ 526 */
547 rcu_assign_pointer(elem->multi, (*entry)->multi); 527 rcu_assign_pointer(elem->multi, entry->multi);
548 /* 528 /*
549 * Update the function or multi probe array pointer before setting the 529 * Update the function or multi probe array pointer before setting the
550 * ptype. 530 * ptype.
551 */ 531 */
552 smp_wmb(); 532 smp_wmb();
553 elem->ptype = (*entry)->ptype; 533 elem->ptype = entry->ptype;
534
535 if (elem->tp_name && (active ^ elem->state)) {
536 WARN_ON(!elem->tp_cb);
537 /*
538 * It is ok to directly call the probe registration because type
539 * checking has been done in the __trace_mark_tp() macro.
540 */
541
542 if (active) {
543 /*
544 * try_module_get should always succeed because we hold
545 * lock_module() to get the tp_cb address.
546 */
547 ret = try_module_get(__module_text_address(
548 (unsigned long)elem->tp_cb));
549 BUG_ON(!ret);
550 ret = tracepoint_probe_register_noupdate(
551 elem->tp_name,
552 elem->tp_cb);
553 } else {
554 ret = tracepoint_probe_unregister_noupdate(
555 elem->tp_name,
556 elem->tp_cb);
557 /*
558 * tracepoint_probe_update_all() must be called
559 * before the module containing tp_cb is unloaded.
560 */
561 module_put(__module_text_address(
562 (unsigned long)elem->tp_cb));
563 }
564 }
554 elem->state = active; 565 elem->state = active;
555 566
556 return 0; 567 return ret;
557} 568}
558 569
559/* 570/*
@@ -564,7 +575,24 @@ static int set_marker(struct marker_entry **entry, struct marker *elem,
564 */ 575 */
565static void disable_marker(struct marker *elem) 576static void disable_marker(struct marker *elem)
566{ 577{
578 int ret;
579
567 /* leave "call" as is. It is known statically. */ 580 /* leave "call" as is. It is known statically. */
581 if (elem->tp_name && elem->state) {
582 WARN_ON(!elem->tp_cb);
583 /*
584 * It is ok to directly call the probe registration because type
585 * checking has been done in the __trace_mark_tp() macro.
586 */
587 ret = tracepoint_probe_unregister_noupdate(elem->tp_name,
588 elem->tp_cb);
589 WARN_ON(ret);
590 /*
591 * tracepoint_probe_update_all() must be called
592 * before the module containing tp_cb is unloaded.
593 */
594 module_put(__module_text_address((unsigned long)elem->tp_cb));
595 }
568 elem->state = 0; 596 elem->state = 0;
569 elem->single.func = __mark_empty_function; 597 elem->single.func = __mark_empty_function;
570 /* Update the function before setting the ptype */ 598 /* Update the function before setting the ptype */
@@ -594,8 +622,7 @@ void marker_update_probe_range(struct marker *begin,
594 for (iter = begin; iter < end; iter++) { 622 for (iter = begin; iter < end; iter++) {
595 mark_entry = get_marker(iter->name); 623 mark_entry = get_marker(iter->name);
596 if (mark_entry) { 624 if (mark_entry) {
597 set_marker(&mark_entry, iter, 625 set_marker(mark_entry, iter, !!mark_entry->refcount);
598 !!mark_entry->refcount);
599 /* 626 /*
600 * ignore error, continue 627 * ignore error, continue
601 */ 628 */
@@ -629,6 +656,7 @@ static void marker_update_probes(void)
629 marker_update_probe_range(__start___markers, __stop___markers); 656 marker_update_probe_range(__start___markers, __stop___markers);
630 /* Markers in modules. */ 657 /* Markers in modules. */
631 module_update_markers(); 658 module_update_markers();
659 tracepoint_probe_update_all();
632} 660}
633 661
634/** 662/**
@@ -657,7 +685,7 @@ int marker_probe_register(const char *name, const char *format,
657 ret = PTR_ERR(entry); 685 ret = PTR_ERR(entry);
658 } else if (format) { 686 } else if (format) {
659 if (!entry->format) 687 if (!entry->format)
660 ret = marker_set_format(&entry, format); 688 ret = marker_set_format(entry, format);
661 else if (strcmp(entry->format, format)) 689 else if (strcmp(entry->format, format))
662 ret = -EPERM; 690 ret = -EPERM;
663 } 691 }
@@ -676,10 +704,11 @@ int marker_probe_register(const char *name, const char *format,
676 goto end; 704 goto end;
677 } 705 }
678 mutex_unlock(&markers_mutex); 706 mutex_unlock(&markers_mutex);
679 marker_update_probes(); /* may update entry */ 707 marker_update_probes();
680 mutex_lock(&markers_mutex); 708 mutex_lock(&markers_mutex);
681 entry = get_marker(name); 709 entry = get_marker(name);
682 WARN_ON(!entry); 710 if (!entry)
711 goto end;
683 if (entry->rcu_pending) 712 if (entry->rcu_pending)
684 rcu_barrier_sched(); 713 rcu_barrier_sched();
685 entry->oldptr = old; 714 entry->oldptr = old;
@@ -720,7 +749,7 @@ int marker_probe_unregister(const char *name,
720 rcu_barrier_sched(); 749 rcu_barrier_sched();
721 old = marker_entry_remove_probe(entry, probe, probe_private); 750 old = marker_entry_remove_probe(entry, probe, probe_private);
722 mutex_unlock(&markers_mutex); 751 mutex_unlock(&markers_mutex);
723 marker_update_probes(); /* may update entry */ 752 marker_update_probes();
724 mutex_lock(&markers_mutex); 753 mutex_lock(&markers_mutex);
725 entry = get_marker(name); 754 entry = get_marker(name);
726 if (!entry) 755 if (!entry)
@@ -801,10 +830,11 @@ int marker_probe_unregister_private_data(marker_probe_func *probe,
801 rcu_barrier_sched(); 830 rcu_barrier_sched();
802 old = marker_entry_remove_probe(entry, NULL, probe_private); 831 old = marker_entry_remove_probe(entry, NULL, probe_private);
803 mutex_unlock(&markers_mutex); 832 mutex_unlock(&markers_mutex);
804 marker_update_probes(); /* may update entry */ 833 marker_update_probes();
805 mutex_lock(&markers_mutex); 834 mutex_lock(&markers_mutex);
806 entry = get_marker_from_private_data(probe, probe_private); 835 entry = get_marker_from_private_data(probe, probe_private);
807 WARN_ON(!entry); 836 if (!entry)
837 goto end;
808 if (entry->rcu_pending) 838 if (entry->rcu_pending)
809 rcu_barrier_sched(); 839 rcu_barrier_sched();
810 entry->oldptr = old; 840 entry->oldptr = old;
@@ -848,8 +878,6 @@ void *marker_get_private_data(const char *name, marker_probe_func *probe,
848 if (!e->ptype) { 878 if (!e->ptype) {
849 if (num == 0 && e->single.func == probe) 879 if (num == 0 && e->single.func == probe)
850 return e->single.probe_private; 880 return e->single.probe_private;
851 else
852 break;
853 } else { 881 } else {
854 struct marker_probe_closure *closure; 882 struct marker_probe_closure *closure;
855 int match = 0; 883 int match = 0;
@@ -861,8 +889,42 @@ void *marker_get_private_data(const char *name, marker_probe_func *probe,
861 return closure[i].probe_private; 889 return closure[i].probe_private;
862 } 890 }
863 } 891 }
892 break;
864 } 893 }
865 } 894 }
866 return ERR_PTR(-ENOENT); 895 return ERR_PTR(-ENOENT);
867} 896}
868EXPORT_SYMBOL_GPL(marker_get_private_data); 897EXPORT_SYMBOL_GPL(marker_get_private_data);
898
899#ifdef CONFIG_MODULES
900
901int marker_module_notify(struct notifier_block *self,
902 unsigned long val, void *data)
903{
904 struct module *mod = data;
905
906 switch (val) {
907 case MODULE_STATE_COMING:
908 marker_update_probe_range(mod->markers,
909 mod->markers + mod->num_markers);
910 break;
911 case MODULE_STATE_GOING:
912 marker_update_probe_range(mod->markers,
913 mod->markers + mod->num_markers);
914 break;
915 }
916 return 0;
917}
918
919struct notifier_block marker_module_nb = {
920 .notifier_call = marker_module_notify,
921 .priority = 0,
922};
923
924static int init_markers(void)
925{
926 return register_module_notifier(&marker_module_nb);
927}
928__initcall(init_markers);
929
930#endif /* CONFIG_MODULES */
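
Two things change in marker.c: marker_set_format() no longer reallocates the whole marker_entry just to attach a format string (it kstrdup()s the format and sets format_allocated so remove_marker() can kfree() it later), and module markers are now refreshed from a module notifier instead of from load_module(). A small userspace sketch of the simplified format handling, with strdup()/free() standing in for kstrdup()/kfree() and a trimmed-down entry struct:

	#include <stdlib.h>
	#include <string.h>
	#include <stdio.h>

	struct marker_entry_sketch {
		char *format;
		unsigned int format_allocated:1;
	};

	static int marker_set_format(struct marker_entry_sketch *entry,
				     const char *format)
	{
		entry->format = strdup(format);		/* kstrdup(..., GFP_KERNEL) */
		if (!entry->format)
			return -1;			/* -ENOMEM in the kernel */
		entry->format_allocated = 1;		/* remembered for removal */
		return 0;
	}

	static void remove_marker_format(struct marker_entry_sketch *entry)
	{
		if (entry->format_allocated)
			free(entry->format);		/* kfree() in remove_marker() */
	}

	int main(void)
	{
		struct marker_entry_sketch e = { NULL, 0 };

		if (!marker_set_format(&e, "count %d"))
			puts(e.format);
		remove_marker_format(&e);
		return 0;
	}
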
diff --git a/kernel/module.c b/kernel/module.c
index 1f4cc00e0c20..dd2a54155b54 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2184,24 +2184,15 @@ static noinline struct module *load_module(void __user *umod,
2184 struct mod_debug *debug; 2184 struct mod_debug *debug;
2185 unsigned int num_debug; 2185 unsigned int num_debug;
2186 2186
2187#ifdef CONFIG_MARKERS
2188 marker_update_probe_range(mod->markers,
2189 mod->markers + mod->num_markers);
2190#endif
2191 debug = section_objs(hdr, sechdrs, secstrings, "__verbose", 2187 debug = section_objs(hdr, sechdrs, secstrings, "__verbose",
2192 sizeof(*debug), &num_debug); 2188 sizeof(*debug), &num_debug);
2193 dynamic_printk_setup(debug, num_debug); 2189 dynamic_printk_setup(debug, num_debug);
2194
2195#ifdef CONFIG_TRACEPOINTS
2196 tracepoint_update_probe_range(mod->tracepoints,
2197 mod->tracepoints + mod->num_tracepoints);
2198#endif
2199 } 2190 }
2200 2191
2201 /* sechdrs[0].sh_size is always zero */ 2192 /* sechdrs[0].sh_size is always zero */
2202 mseg = section_objs(hdr, sechdrs, secstrings, "__mcount_loc", 2193 mseg = section_objs(hdr, sechdrs, secstrings, "__mcount_loc",
2203 sizeof(*mseg), &num_mcount); 2194 sizeof(*mseg), &num_mcount);
2204 ftrace_init_module(mseg, mseg + num_mcount); 2195 ftrace_init_module(mod, mseg, mseg + num_mcount);
2205 2196
2206 err = module_finalize(hdr, sechdrs, mod); 2197 err = module_finalize(hdr, sechdrs, mod);
2207 if (err < 0) 2198 if (err < 0)
@@ -2713,7 +2704,7 @@ int is_module_address(unsigned long addr)
2713 2704
2714 2705
2715/* Is this a valid kernel address? */ 2706/* Is this a valid kernel address? */
2716struct module *__module_text_address(unsigned long addr) 2707__notrace_funcgraph struct module *__module_text_address(unsigned long addr)
2717{ 2708{
2718 struct module *mod; 2709 struct module *mod;
2719 2710
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 12c779dc65d4..4f45d4b658ef 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -59,7 +59,7 @@ EXPORT_SYMBOL(__mutex_init);
59 * We also put the fastpath first in the kernel image, to make sure the 59 * We also put the fastpath first in the kernel image, to make sure the
60 * branch is predicted by the CPU as default-untaken. 60 * branch is predicted by the CPU as default-untaken.
61 */ 61 */
62static void noinline __sched 62static __used noinline void __sched
63__mutex_lock_slowpath(atomic_t *lock_count); 63__mutex_lock_slowpath(atomic_t *lock_count);
64 64
65/*** 65/***
@@ -96,7 +96,7 @@ void inline __sched mutex_lock(struct mutex *lock)
96EXPORT_SYMBOL(mutex_lock); 96EXPORT_SYMBOL(mutex_lock);
97#endif 97#endif
98 98
99static noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); 99static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count);
100 100
101/*** 101/***
102 * mutex_unlock - release the mutex 102 * mutex_unlock - release the mutex
@@ -184,7 +184,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
184 } 184 }
185 185
186done: 186done:
187 lock_acquired(&lock->dep_map); 187 lock_acquired(&lock->dep_map, ip);
188 /* got the lock - rejoice! */ 188 /* got the lock - rejoice! */
189 mutex_remove_waiter(lock, &waiter, task_thread_info(task)); 189 mutex_remove_waiter(lock, &waiter, task_thread_info(task));
190 debug_mutex_set_owner(lock, task_thread_info(task)); 190 debug_mutex_set_owner(lock, task_thread_info(task));
@@ -268,7 +268,7 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
268/* 268/*
269 * Release the lock, slowpath: 269 * Release the lock, slowpath:
270 */ 270 */
271static noinline void 271static __used noinline void
272__mutex_unlock_slowpath(atomic_t *lock_count) 272__mutex_unlock_slowpath(atomic_t *lock_count)
273{ 273{
274 __mutex_unlock_common_slowpath(lock_count, 1); 274 __mutex_unlock_common_slowpath(lock_count, 1);
@@ -313,7 +313,7 @@ int __sched mutex_lock_killable(struct mutex *lock)
313} 313}
314EXPORT_SYMBOL(mutex_lock_killable); 314EXPORT_SYMBOL(mutex_lock_killable);
315 315
316static noinline void __sched 316static __used noinline void __sched
317__mutex_lock_slowpath(atomic_t *lock_count) 317__mutex_lock_slowpath(atomic_t *lock_count)
318{ 318{
319 struct mutex *lock = container_of(lock_count, struct mutex, count); 319 struct mutex *lock = container_of(lock_count, struct mutex, count);
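
Note: the kernel/mutex.c change is purely an annotation — every slowpath declaration gains __used. The slowpaths are reached from the fastpath helpers, which on several architectures are inline assembly, so the compiler sees no C-level caller of these static functions and could otherwise discard them as dead code. Roughly (the real macros live in the linux/compiler*.h headers):

    /* Approximation of the kernel's __used annotation. */
    #define __used __attribute__((__used__))

    /*
     * Referenced only from inline asm in the fastpath, so the compiler
     * sees no caller; __used keeps the symbol from being dropped.
     */
    static __used noinline void example_slowpath(void)
    {
            /* ... contended-path work ... */
    }
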
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 4282c0a40a57..61d5aa5eced3 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -82,6 +82,14 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl,
82 82
83 while (nb && nr_to_call) { 83 while (nb && nr_to_call) {
84 next_nb = rcu_dereference(nb->next); 84 next_nb = rcu_dereference(nb->next);
85
86#ifdef CONFIG_DEBUG_NOTIFIERS
87 if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) {
88 WARN(1, "Invalid notifier called!");
89 nb = next_nb;
90 continue;
91 }
92#endif
85 ret = nb->notifier_call(nb, val, v); 93 ret = nb->notifier_call(nb, val, v);
86 94
87 if (nr_calls) 95 if (nr_calls)
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 4e5288a831de..157de3a47832 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -58,21 +58,21 @@ void thread_group_cputime(
58 struct task_struct *tsk, 58 struct task_struct *tsk,
59 struct task_cputime *times) 59 struct task_cputime *times)
60{ 60{
61 struct signal_struct *sig; 61 struct task_cputime *totals, *tot;
62 int i; 62 int i;
63 struct task_cputime *tot;
64 63
65 sig = tsk->signal; 64 totals = tsk->signal->cputime.totals;
66 if (unlikely(!sig) || !sig->cputime.totals) { 65 if (!totals) {
67 times->utime = tsk->utime; 66 times->utime = tsk->utime;
68 times->stime = tsk->stime; 67 times->stime = tsk->stime;
69 times->sum_exec_runtime = tsk->se.sum_exec_runtime; 68 times->sum_exec_runtime = tsk->se.sum_exec_runtime;
70 return; 69 return;
71 } 70 }
71
72 times->stime = times->utime = cputime_zero; 72 times->stime = times->utime = cputime_zero;
73 times->sum_exec_runtime = 0; 73 times->sum_exec_runtime = 0;
74 for_each_possible_cpu(i) { 74 for_each_possible_cpu(i) {
75 tot = per_cpu_ptr(tsk->signal->cputime.totals, i); 75 tot = per_cpu_ptr(totals, i);
76 times->utime = cputime_add(times->utime, tot->utime); 76 times->utime = cputime_add(times->utime, tot->utime);
77 times->stime = cputime_add(times->stime, tot->stime); 77 times->stime = cputime_add(times->stime, tot->stime);
78 times->sum_exec_runtime += tot->sum_exec_runtime; 78 times->sum_exec_runtime += tot->sum_exec_runtime;
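
Note: thread_group_cputime() now loads tsk->signal->cputime.totals once, bails out early if it is NULL, and otherwise sums the per-CPU copies. The summation itself is the standard per-CPU idiom; a small stand-alone example of the same shape (the hits counter is hypothetical, the iteration helpers are the real API):

    #include <linux/percpu.h>

    struct hits {
            unsigned long count;
    };

    static struct hits *hits_totals;   /* e.g. from alloc_percpu(struct hits) */

    static unsigned long total_hits(void)
    {
            unsigned long sum = 0;
            int cpu;

            if (!hits_totals)          /* nothing allocated yet */
                    return 0;

            /* Add up every CPU's private copy. */
            for_each_possible_cpu(cpu)
                    sum += per_cpu_ptr(hits_totals, cpu)->count;
            return sum;
    }
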
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index c9d74083746f..f77d3819ef57 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -22,7 +22,6 @@
22#include <linux/console.h> 22#include <linux/console.h>
23#include <linux/cpu.h> 23#include <linux/cpu.h>
24#include <linux/freezer.h> 24#include <linux/freezer.h>
25#include <linux/ftrace.h>
26 25
27#include "power.h" 26#include "power.h"
28 27
@@ -257,7 +256,7 @@ static int create_image(int platform_mode)
257 256
258int hibernation_snapshot(int platform_mode) 257int hibernation_snapshot(int platform_mode)
259{ 258{
260 int error, ftrace_save; 259 int error;
261 260
262 /* Free memory before shutting down devices. */ 261 /* Free memory before shutting down devices. */
263 error = swsusp_shrink_memory(); 262 error = swsusp_shrink_memory();
@@ -269,7 +268,6 @@ int hibernation_snapshot(int platform_mode)
269 goto Close; 268 goto Close;
270 269
271 suspend_console(); 270 suspend_console();
272 ftrace_save = __ftrace_enabled_save();
273 error = device_suspend(PMSG_FREEZE); 271 error = device_suspend(PMSG_FREEZE);
274 if (error) 272 if (error)
275 goto Recover_platform; 273 goto Recover_platform;
@@ -299,7 +297,6 @@ int hibernation_snapshot(int platform_mode)
299 Resume_devices: 297 Resume_devices:
300 device_resume(in_suspend ? 298 device_resume(in_suspend ?
301 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 299 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
302 __ftrace_enabled_restore(ftrace_save);
303 resume_console(); 300 resume_console();
304 Close: 301 Close:
305 platform_end(platform_mode); 302 platform_end(platform_mode);
@@ -370,11 +367,10 @@ static int resume_target_kernel(void)
370 367
371int hibernation_restore(int platform_mode) 368int hibernation_restore(int platform_mode)
372{ 369{
373 int error, ftrace_save; 370 int error;
374 371
375 pm_prepare_console(); 372 pm_prepare_console();
376 suspend_console(); 373 suspend_console();
377 ftrace_save = __ftrace_enabled_save();
378 error = device_suspend(PMSG_QUIESCE); 374 error = device_suspend(PMSG_QUIESCE);
379 if (error) 375 if (error)
380 goto Finish; 376 goto Finish;
@@ -389,7 +385,6 @@ int hibernation_restore(int platform_mode)
389 platform_restore_cleanup(platform_mode); 385 platform_restore_cleanup(platform_mode);
390 device_resume(PMSG_RECOVER); 386 device_resume(PMSG_RECOVER);
391 Finish: 387 Finish:
392 __ftrace_enabled_restore(ftrace_save);
393 resume_console(); 388 resume_console();
394 pm_restore_console(); 389 pm_restore_console();
395 return error; 390 return error;
@@ -402,7 +397,7 @@ int hibernation_restore(int platform_mode)
402 397
403int hibernation_platform_enter(void) 398int hibernation_platform_enter(void)
404{ 399{
405 int error, ftrace_save; 400 int error;
406 401
407 if (!hibernation_ops) 402 if (!hibernation_ops)
408 return -ENOSYS; 403 return -ENOSYS;
@@ -417,7 +412,6 @@ int hibernation_platform_enter(void)
417 goto Close; 412 goto Close;
418 413
419 suspend_console(); 414 suspend_console();
420 ftrace_save = __ftrace_enabled_save();
421 error = device_suspend(PMSG_HIBERNATE); 415 error = device_suspend(PMSG_HIBERNATE);
422 if (error) { 416 if (error) {
423 if (hibernation_ops->recover) 417 if (hibernation_ops->recover)
@@ -452,7 +446,6 @@ int hibernation_platform_enter(void)
452 hibernation_ops->finish(); 446 hibernation_ops->finish();
453 Resume_devices: 447 Resume_devices:
454 device_resume(PMSG_RESTORE); 448 device_resume(PMSG_RESTORE);
455 __ftrace_enabled_restore(ftrace_save);
456 resume_console(); 449 resume_console();
457 Close: 450 Close:
458 hibernation_ops->end(); 451 hibernation_ops->end();
diff --git a/kernel/power/main.c b/kernel/power/main.c
index b8f7ce9473e8..613f16941b85 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -22,7 +22,6 @@
22#include <linux/freezer.h> 22#include <linux/freezer.h>
23#include <linux/vmstat.h> 23#include <linux/vmstat.h>
24#include <linux/syscalls.h> 24#include <linux/syscalls.h>
25#include <linux/ftrace.h>
26 25
27#include "power.h" 26#include "power.h"
28 27
@@ -317,7 +316,7 @@ static int suspend_enter(suspend_state_t state)
317 */ 316 */
318int suspend_devices_and_enter(suspend_state_t state) 317int suspend_devices_and_enter(suspend_state_t state)
319{ 318{
320 int error, ftrace_save; 319 int error;
321 320
322 if (!suspend_ops) 321 if (!suspend_ops)
323 return -ENOSYS; 322 return -ENOSYS;
@@ -328,7 +327,6 @@ int suspend_devices_and_enter(suspend_state_t state)
328 goto Close; 327 goto Close;
329 } 328 }
330 suspend_console(); 329 suspend_console();
331 ftrace_save = __ftrace_enabled_save();
332 suspend_test_start(); 330 suspend_test_start();
333 error = device_suspend(PMSG_SUSPEND); 331 error = device_suspend(PMSG_SUSPEND);
334 if (error) { 332 if (error) {
@@ -360,7 +358,6 @@ int suspend_devices_and_enter(suspend_state_t state)
360 suspend_test_start(); 358 suspend_test_start();
361 device_resume(PMSG_RESUME); 359 device_resume(PMSG_RESUME);
362 suspend_test_finish("resume devices"); 360 suspend_test_finish("resume devices");
363 __ftrace_enabled_restore(ftrace_save);
364 resume_console(); 361 resume_console();
365 Close: 362 Close:
366 if (suspend_ops->end) 363 if (suspend_ops->end)
diff --git a/kernel/profile.c b/kernel/profile.c
index 7d620dfdde59..4cb7d68fed82 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -544,7 +544,7 @@ static const struct file_operations proc_profile_operations = {
544}; 544};
545 545
546#ifdef CONFIG_SMP 546#ifdef CONFIG_SMP
547static inline void profile_nop(void *unused) 547static void profile_nop(void *unused)
548{ 548{
549} 549}
550 550
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index 37f72e551542..c03ca3e61919 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -191,7 +191,7 @@ static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
191 191
192 /* OK, time to rat on our buddy... */ 192 /* OK, time to rat on our buddy... */
193 193
194 printk(KERN_ERR "RCU detected CPU stalls:"); 194 printk(KERN_ERR "INFO: RCU detected CPU stalls:");
195 for_each_possible_cpu(cpu) { 195 for_each_possible_cpu(cpu) {
196 if (cpu_isset(cpu, rcp->cpumask)) 196 if (cpu_isset(cpu, rcp->cpumask))
197 printk(" %d", cpu); 197 printk(" %d", cpu);
@@ -204,7 +204,7 @@ static void print_cpu_stall(struct rcu_ctrlblk *rcp)
204{ 204{
205 unsigned long flags; 205 unsigned long flags;
206 206
207 printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n", 207 printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu/%lu jiffies)\n",
208 smp_processor_id(), jiffies, 208 smp_processor_id(), jiffies,
209 jiffies - rcp->gp_start); 209 jiffies - rcp->gp_start);
210 dump_stack(); 210 dump_stack();
@@ -393,7 +393,7 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp)
393 * unnecessarily. 393 * unnecessarily.
394 */ 394 */
395 smp_mb(); 395 smp_mb();
396 cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask); 396 cpumask_andnot(&rcp->cpumask, cpu_online_mask, nohz_cpu_mask);
397 397
398 rcp->signaled = 0; 398 rcp->signaled = 0;
399 } 399 }
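
Note: besides prefixing the stall warnings with "INFO:", the rcuclassic.c hunk converts cpus_andnot() on cpumask_t values to cpumask_andnot() on struct cpumask pointers — the same tree-wide API move visible throughout the kernel/sched.c diff below (cpu_isset() -> cpumask_test_cpu(), cpus_weight() -> cpumask_weight(), first_cpu() -> cpumask_first(), and so on). A minimal user of the new spelling, with a hypothetical helper name:

    #include <linux/cpumask.h>

    /*
     * dst = online CPUs that are not in 'excluded' -- the same operation
     * rcu_start_batch() now performs with cpumask_andnot().
     */
    static void online_except(struct cpumask *dst, const struct cpumask *excluded)
    {
            cpumask_andnot(dst, cpu_online_mask, excluded);
    }
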
diff --git a/kernel/sched.c b/kernel/sched.c
index d2d16d1273b1..b309027bf9e8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -118,6 +118,12 @@
118 */ 118 */
119#define RUNTIME_INF ((u64)~0ULL) 119#define RUNTIME_INF ((u64)~0ULL)
120 120
121DEFINE_TRACE(sched_wait_task);
122DEFINE_TRACE(sched_wakeup);
123DEFINE_TRACE(sched_wakeup_new);
124DEFINE_TRACE(sched_switch);
125DEFINE_TRACE(sched_migrate_task);
126
121#ifdef CONFIG_SMP 127#ifdef CONFIG_SMP
122/* 128/*
123 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) 129 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
@@ -261,6 +267,10 @@ struct task_group {
261 struct cgroup_subsys_state css; 267 struct cgroup_subsys_state css;
262#endif 268#endif
263 269
270#ifdef CONFIG_USER_SCHED
271 uid_t uid;
272#endif
273
264#ifdef CONFIG_FAIR_GROUP_SCHED 274#ifdef CONFIG_FAIR_GROUP_SCHED
265 /* schedulable entities of this group on each cpu */ 275 /* schedulable entities of this group on each cpu */
266 struct sched_entity **se; 276 struct sched_entity **se;
@@ -286,6 +296,12 @@ struct task_group {
286 296
287#ifdef CONFIG_USER_SCHED 297#ifdef CONFIG_USER_SCHED
288 298
299/* Helper function to pass uid information to create_sched_user() */
300void set_tg_uid(struct user_struct *user)
301{
302 user->tg->uid = user->uid;
303}
304
289/* 305/*
290 * Root task group. 306 * Root task group.
291 * Every UID task group (including init_task_group aka UID-0) will 307 * Every UID task group (including init_task_group aka UID-0) will
@@ -481,14 +497,14 @@ struct rt_rq {
481 */ 497 */
482struct root_domain { 498struct root_domain {
483 atomic_t refcount; 499 atomic_t refcount;
484 cpumask_t span; 500 cpumask_var_t span;
485 cpumask_t online; 501 cpumask_var_t online;
486 502
487 /* 503 /*
488 * The "RT overload" flag: it gets set if a CPU has more than 504 * The "RT overload" flag: it gets set if a CPU has more than
489 * one runnable RT task. 505 * one runnable RT task.
490 */ 506 */
491 cpumask_t rto_mask; 507 cpumask_var_t rto_mask;
492 atomic_t rto_count; 508 atomic_t rto_count;
493#ifdef CONFIG_SMP 509#ifdef CONFIG_SMP
494 struct cpupri cpupri; 510 struct cpupri cpupri;
@@ -703,45 +719,18 @@ static __read_mostly char *sched_feat_names[] = {
703 719
704#undef SCHED_FEAT 720#undef SCHED_FEAT
705 721
706static int sched_feat_open(struct inode *inode, struct file *filp) 722static int sched_feat_show(struct seq_file *m, void *v)
707{ 723{
708 filp->private_data = inode->i_private;
709 return 0;
710}
711
712static ssize_t
713sched_feat_read(struct file *filp, char __user *ubuf,
714 size_t cnt, loff_t *ppos)
715{
716 char *buf;
717 int r = 0;
718 int len = 0;
719 int i; 724 int i;
720 725
721 for (i = 0; sched_feat_names[i]; i++) { 726 for (i = 0; sched_feat_names[i]; i++) {
722 len += strlen(sched_feat_names[i]); 727 if (!(sysctl_sched_features & (1UL << i)))
723 len += 4; 728 seq_puts(m, "NO_");
724 } 729 seq_printf(m, "%s ", sched_feat_names[i]);
725
726 buf = kmalloc(len + 2, GFP_KERNEL);
727 if (!buf)
728 return -ENOMEM;
729
730 for (i = 0; sched_feat_names[i]; i++) {
731 if (sysctl_sched_features & (1UL << i))
732 r += sprintf(buf + r, "%s ", sched_feat_names[i]);
733 else
734 r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]);
735 } 730 }
731 seq_puts(m, "\n");
736 732
737 r += sprintf(buf + r, "\n"); 733 return 0;
738 WARN_ON(r >= len + 2);
739
740 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
741
742 kfree(buf);
743
744 return r;
745} 734}
746 735
747static ssize_t 736static ssize_t
@@ -786,10 +775,17 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
786 return cnt; 775 return cnt;
787} 776}
788 777
778static int sched_feat_open(struct inode *inode, struct file *filp)
779{
780 return single_open(filp, sched_feat_show, NULL);
781}
782
789static struct file_operations sched_feat_fops = { 783static struct file_operations sched_feat_fops = {
790 .open = sched_feat_open, 784 .open = sched_feat_open,
791 .read = sched_feat_read, 785 .write = sched_feat_write,
792 .write = sched_feat_write, 786 .read = seq_read,
787 .llseek = seq_lseek,
788 .release = single_release,
793}; 789};
794 790
795static __init int sched_init_debug(void) 791static __init int sched_init_debug(void)
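
Note: the sched_features control file drops its hand-rolled kmalloc()/sprintf() read path in favour of seq_file's single_open() helpers — sched_feat_show() emits the flags, while seq_read()/seq_lseek()/single_release() supply the buffering, partial-read and cleanup logic. For reference, a minimal single_open()-style file looks roughly like this (the example_* names and the procfs location are hypothetical; the seq_file calls are the real API):

    #include <linux/module.h>
    #include <linux/proc_fs.h>
    #include <linux/seq_file.h>

    /* Emits the whole file content in one pass. */
    static int example_show(struct seq_file *m, void *v)
    {
            seq_puts(m, "hello from seq_file\n");
            return 0;
    }

    static int example_open(struct inode *inode, struct file *file)
    {
            return single_open(file, example_show, NULL);
    }

    static const struct file_operations example_fops = {
            .owner   = THIS_MODULE,
            .open    = example_open,
            .read    = seq_read,      /* generic seq_file helpers */
            .llseek  = seq_lseek,
            .release = single_release,
    };

    static int __init example_init(void)
    {
            proc_create("seqfile_example", 0444, NULL, &example_fops);
            return 0;
    }

    static void __exit example_exit(void)
    {
            remove_proc_entry("seqfile_example", NULL);
    }

    module_init(example_init);
    module_exit(example_exit);
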
@@ -1474,27 +1470,13 @@ static void
1474update_group_shares_cpu(struct task_group *tg, int cpu, 1470update_group_shares_cpu(struct task_group *tg, int cpu,
1475 unsigned long sd_shares, unsigned long sd_rq_weight) 1471 unsigned long sd_shares, unsigned long sd_rq_weight)
1476{ 1472{
1477 int boost = 0;
1478 unsigned long shares; 1473 unsigned long shares;
1479 unsigned long rq_weight; 1474 unsigned long rq_weight;
1480 1475
1481 if (!tg->se[cpu]) 1476 if (!tg->se[cpu])
1482 return; 1477 return;
1483 1478
1484 rq_weight = tg->cfs_rq[cpu]->load.weight; 1479 rq_weight = tg->cfs_rq[cpu]->rq_weight;
1485
1486 /*
1487 * If there are currently no tasks on the cpu pretend there is one of
1488 * average load so that when a new task gets to run here it will not
1489 * get delayed by group starvation.
1490 */
1491 if (!rq_weight) {
1492 boost = 1;
1493 rq_weight = NICE_0_LOAD;
1494 }
1495
1496 if (unlikely(rq_weight > sd_rq_weight))
1497 rq_weight = sd_rq_weight;
1498 1480
1499 /* 1481 /*
1500 * \Sum shares * rq_weight 1482 * \Sum shares * rq_weight
@@ -1502,7 +1484,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
1502 * \Sum rq_weight 1484 * \Sum rq_weight
1503 * 1485 *
1504 */ 1486 */
1505 shares = (sd_shares * rq_weight) / (sd_rq_weight + 1); 1487 shares = (sd_shares * rq_weight) / sd_rq_weight;
1506 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); 1488 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1507 1489
1508 if (abs(shares - tg->se[cpu]->load.weight) > 1490 if (abs(shares - tg->se[cpu]->load.weight) >
@@ -1511,11 +1493,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
1511 unsigned long flags; 1493 unsigned long flags;
1512 1494
1513 spin_lock_irqsave(&rq->lock, flags); 1495 spin_lock_irqsave(&rq->lock, flags);
1514 /* 1496 tg->cfs_rq[cpu]->shares = shares;
1515 * record the actual number of shares, not the boosted amount.
1516 */
1517 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1518 tg->cfs_rq[cpu]->rq_weight = rq_weight;
1519 1497
1520 __set_se_shares(tg->se[cpu], shares); 1498 __set_se_shares(tg->se[cpu], shares);
1521 spin_unlock_irqrestore(&rq->lock, flags); 1499 spin_unlock_irqrestore(&rq->lock, flags);
@@ -1529,13 +1507,23 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
1529 */ 1507 */
1530static int tg_shares_up(struct task_group *tg, void *data) 1508static int tg_shares_up(struct task_group *tg, void *data)
1531{ 1509{
1532 unsigned long rq_weight = 0; 1510 unsigned long weight, rq_weight = 0;
1533 unsigned long shares = 0; 1511 unsigned long shares = 0;
1534 struct sched_domain *sd = data; 1512 struct sched_domain *sd = data;
1535 int i; 1513 int i;
1536 1514
1537 for_each_cpu_mask(i, sd->span) { 1515 for_each_cpu(i, sched_domain_span(sd)) {
1538 rq_weight += tg->cfs_rq[i]->load.weight; 1516 /*
1517 * If there are currently no tasks on the cpu pretend there
1518 * is one of average load so that when a new task gets to
1519 * run here it will not get delayed by group starvation.
1520 */
1521 weight = tg->cfs_rq[i]->load.weight;
1522 if (!weight)
1523 weight = NICE_0_LOAD;
1524
1525 tg->cfs_rq[i]->rq_weight = weight;
1526 rq_weight += weight;
1539 shares += tg->cfs_rq[i]->shares; 1527 shares += tg->cfs_rq[i]->shares;
1540 } 1528 }
1541 1529
@@ -1545,10 +1533,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
1545 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) 1533 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1546 shares = tg->shares; 1534 shares = tg->shares;
1547 1535
1548 if (!rq_weight) 1536 for_each_cpu(i, sched_domain_span(sd))
1549 rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
1550
1551 for_each_cpu_mask(i, sd->span)
1552 update_group_shares_cpu(tg, i, shares, rq_weight); 1537 update_group_shares_cpu(tg, i, shares, rq_weight);
1553 1538
1554 return 0; 1539 return 0;
@@ -1612,6 +1597,39 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1612 1597
1613#endif 1598#endif
1614 1599
1600/*
1601 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1602 */
1603static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1604 __releases(this_rq->lock)
1605 __acquires(busiest->lock)
1606 __acquires(this_rq->lock)
1607{
1608 int ret = 0;
1609
1610 if (unlikely(!irqs_disabled())) {
1611 /* printk() doesn't work good under rq->lock */
1612 spin_unlock(&this_rq->lock);
1613 BUG_ON(1);
1614 }
1615 if (unlikely(!spin_trylock(&busiest->lock))) {
1616 if (busiest < this_rq) {
1617 spin_unlock(&this_rq->lock);
1618 spin_lock(&busiest->lock);
1619 spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
1620 ret = 1;
1621 } else
1622 spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
1623 }
1624 return ret;
1625}
1626
1627static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1628 __releases(busiest->lock)
1629{
1630 spin_unlock(&busiest->lock);
1631 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1632}
1615#endif 1633#endif
1616 1634
1617#ifdef CONFIG_FAIR_GROUP_SCHED 1635#ifdef CONFIG_FAIR_GROUP_SCHED
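
Note: double_lock_balance()/double_unlock_balance() are moved up to this spot (double_unlock_balance() picking up an inline on the way); their old location is deleted in a later hunk of this file. The helper encodes a classic deadlock-avoidance rule: with this_rq->lock already held it first trylocks the busiest runqueue, and only if that fails does it drop this_rq->lock and retake both locks in a fixed order (lower address first), returning 1 so the caller knows the lock was released in between. Stripped of the runqueue specifics, the ordering rule looks like this (lock_pair() is a hypothetical helper, not kernel API):

    #include <linux/spinlock.h>

    /*
     * Take two spinlocks in address order so concurrent callers grabbing
     * the same pair can never deadlock (no ABBA ordering).
     */
    static void lock_pair(spinlock_t *a, spinlock_t *b)
    {
            if (a == b) {
                    spin_lock(a);
            } else if (a < b) {
                    spin_lock(a);
                    spin_lock_nested(b, SINGLE_DEPTH_NESTING);
            } else {
                    spin_lock(b);
                    spin_lock_nested(a, SINGLE_DEPTH_NESTING);
            }
    }
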
@@ -2079,15 +2097,17 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
2079 int i; 2097 int i;
2080 2098
2081 /* Skip over this group if it has no CPUs allowed */ 2099 /* Skip over this group if it has no CPUs allowed */
2082 if (!cpus_intersects(group->cpumask, p->cpus_allowed)) 2100 if (!cpumask_intersects(sched_group_cpus(group),
2101 &p->cpus_allowed))
2083 continue; 2102 continue;
2084 2103
2085 local_group = cpu_isset(this_cpu, group->cpumask); 2104 local_group = cpumask_test_cpu(this_cpu,
2105 sched_group_cpus(group));
2086 2106
2087 /* Tally up the load of all CPUs in the group */ 2107 /* Tally up the load of all CPUs in the group */
2088 avg_load = 0; 2108 avg_load = 0;
2089 2109
2090 for_each_cpu_mask_nr(i, group->cpumask) { 2110 for_each_cpu(i, sched_group_cpus(group)) {
2091 /* Bias balancing toward cpus of our domain */ 2111 /* Bias balancing toward cpus of our domain */
2092 if (local_group) 2112 if (local_group)
2093 load = source_load(i, load_idx); 2113 load = source_load(i, load_idx);
@@ -2119,17 +2139,14 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
2119 * find_idlest_cpu - find the idlest cpu among the cpus in group. 2139 * find_idlest_cpu - find the idlest cpu among the cpus in group.
2120 */ 2140 */
2121static int 2141static int
2122find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu, 2142find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
2123 cpumask_t *tmp)
2124{ 2143{
2125 unsigned long load, min_load = ULONG_MAX; 2144 unsigned long load, min_load = ULONG_MAX;
2126 int idlest = -1; 2145 int idlest = -1;
2127 int i; 2146 int i;
2128 2147
2129 /* Traverse only the allowed CPUs */ 2148 /* Traverse only the allowed CPUs */
2130 cpus_and(*tmp, group->cpumask, p->cpus_allowed); 2149 for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
2131
2132 for_each_cpu_mask_nr(i, *tmp) {
2133 load = weighted_cpuload(i); 2150 load = weighted_cpuload(i);
2134 2151
2135 if (load < min_load || (load == min_load && i == this_cpu)) { 2152 if (load < min_load || (load == min_load && i == this_cpu)) {
@@ -2171,7 +2188,6 @@ static int sched_balance_self(int cpu, int flag)
2171 update_shares(sd); 2188 update_shares(sd);
2172 2189
2173 while (sd) { 2190 while (sd) {
2174 cpumask_t span, tmpmask;
2175 struct sched_group *group; 2191 struct sched_group *group;
2176 int new_cpu, weight; 2192 int new_cpu, weight;
2177 2193
@@ -2180,14 +2196,13 @@ static int sched_balance_self(int cpu, int flag)
2180 continue; 2196 continue;
2181 } 2197 }
2182 2198
2183 span = sd->span;
2184 group = find_idlest_group(sd, t, cpu); 2199 group = find_idlest_group(sd, t, cpu);
2185 if (!group) { 2200 if (!group) {
2186 sd = sd->child; 2201 sd = sd->child;
2187 continue; 2202 continue;
2188 } 2203 }
2189 2204
2190 new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask); 2205 new_cpu = find_idlest_cpu(group, t, cpu);
2191 if (new_cpu == -1 || new_cpu == cpu) { 2206 if (new_cpu == -1 || new_cpu == cpu) {
2192 /* Now try balancing at a lower domain level of cpu */ 2207 /* Now try balancing at a lower domain level of cpu */
2193 sd = sd->child; 2208 sd = sd->child;
@@ -2196,10 +2211,10 @@ static int sched_balance_self(int cpu, int flag)
2196 2211
2197 /* Now try balancing at a lower domain level of new_cpu */ 2212 /* Now try balancing at a lower domain level of new_cpu */
2198 cpu = new_cpu; 2213 cpu = new_cpu;
2214 weight = cpumask_weight(sched_domain_span(sd));
2199 sd = NULL; 2215 sd = NULL;
2200 weight = cpus_weight(span);
2201 for_each_domain(cpu, tmp) { 2216 for_each_domain(cpu, tmp) {
2202 if (weight <= cpus_weight(tmp->span)) 2217 if (weight <= cpumask_weight(sched_domain_span(tmp)))
2203 break; 2218 break;
2204 if (tmp->flags & flag) 2219 if (tmp->flags & flag)
2205 sd = tmp; 2220 sd = tmp;
@@ -2244,7 +2259,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2244 cpu = task_cpu(p); 2259 cpu = task_cpu(p);
2245 2260
2246 for_each_domain(this_cpu, sd) { 2261 for_each_domain(this_cpu, sd) {
2247 if (cpu_isset(cpu, sd->span)) { 2262 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2248 update_shares(sd); 2263 update_shares(sd);
2249 break; 2264 break;
2250 } 2265 }
@@ -2292,7 +2307,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2292 else { 2307 else {
2293 struct sched_domain *sd; 2308 struct sched_domain *sd;
2294 for_each_domain(this_cpu, sd) { 2309 for_each_domain(this_cpu, sd) {
2295 if (cpu_isset(cpu, sd->span)) { 2310 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2296 schedstat_inc(sd, ttwu_wake_remote); 2311 schedstat_inc(sd, ttwu_wake_remote);
2297 break; 2312 break;
2298 } 2313 }
@@ -2812,40 +2827,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
2812} 2827}
2813 2828
2814/* 2829/*
2815 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
2816 */
2817static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
2818 __releases(this_rq->lock)
2819 __acquires(busiest->lock)
2820 __acquires(this_rq->lock)
2821{
2822 int ret = 0;
2823
2824 if (unlikely(!irqs_disabled())) {
2825 /* printk() doesn't work good under rq->lock */
2826 spin_unlock(&this_rq->lock);
2827 BUG_ON(1);
2828 }
2829 if (unlikely(!spin_trylock(&busiest->lock))) {
2830 if (busiest < this_rq) {
2831 spin_unlock(&this_rq->lock);
2832 spin_lock(&busiest->lock);
2833 spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
2834 ret = 1;
2835 } else
2836 spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
2837 }
2838 return ret;
2839}
2840
2841static void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
2842 __releases(busiest->lock)
2843{
2844 spin_unlock(&busiest->lock);
2845 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
2846}
2847
2848/*
2849 * If dest_cpu is allowed for this process, migrate the task to it. 2830 * If dest_cpu is allowed for this process, migrate the task to it.
2850 * This is accomplished by forcing the cpu_allowed mask to only 2831 * This is accomplished by forcing the cpu_allowed mask to only
2851 * allow dest_cpu, which will force the cpu onto dest_cpu. Then 2832 * allow dest_cpu, which will force the cpu onto dest_cpu. Then
@@ -2858,7 +2839,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
2858 struct rq *rq; 2839 struct rq *rq;
2859 2840
2860 rq = task_rq_lock(p, &flags); 2841 rq = task_rq_lock(p, &flags);
2861 if (!cpu_isset(dest_cpu, p->cpus_allowed) 2842 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
2862 || unlikely(!cpu_active(dest_cpu))) 2843 || unlikely(!cpu_active(dest_cpu)))
2863 goto out; 2844 goto out;
2864 2845
@@ -2924,7 +2905,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2924 * 2) cannot be migrated to this CPU due to cpus_allowed, or 2905 * 2) cannot be migrated to this CPU due to cpus_allowed, or
2925 * 3) are cache-hot on their current CPU. 2906 * 3) are cache-hot on their current CPU.
2926 */ 2907 */
2927 if (!cpu_isset(this_cpu, p->cpus_allowed)) { 2908 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
2928 schedstat_inc(p, se.nr_failed_migrations_affine); 2909 schedstat_inc(p, se.nr_failed_migrations_affine);
2929 return 0; 2910 return 0;
2930 } 2911 }
@@ -3099,7 +3080,7 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3099static struct sched_group * 3080static struct sched_group *
3100find_busiest_group(struct sched_domain *sd, int this_cpu, 3081find_busiest_group(struct sched_domain *sd, int this_cpu,
3101 unsigned long *imbalance, enum cpu_idle_type idle, 3082 unsigned long *imbalance, enum cpu_idle_type idle,
3102 int *sd_idle, const cpumask_t *cpus, int *balance) 3083 int *sd_idle, const struct cpumask *cpus, int *balance)
3103{ 3084{
3104 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 3085 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
3105 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 3086 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -3135,10 +3116,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3135 unsigned long sum_avg_load_per_task; 3116 unsigned long sum_avg_load_per_task;
3136 unsigned long avg_load_per_task; 3117 unsigned long avg_load_per_task;
3137 3118
3138 local_group = cpu_isset(this_cpu, group->cpumask); 3119 local_group = cpumask_test_cpu(this_cpu,
3120 sched_group_cpus(group));
3139 3121
3140 if (local_group) 3122 if (local_group)
3141 balance_cpu = first_cpu(group->cpumask); 3123 balance_cpu = cpumask_first(sched_group_cpus(group));
3142 3124
3143 /* Tally up the load of all CPUs in the group */ 3125 /* Tally up the load of all CPUs in the group */
3144 sum_weighted_load = sum_nr_running = avg_load = 0; 3126 sum_weighted_load = sum_nr_running = avg_load = 0;
@@ -3147,13 +3129,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3147 max_cpu_load = 0; 3129 max_cpu_load = 0;
3148 min_cpu_load = ~0UL; 3130 min_cpu_load = ~0UL;
3149 3131
3150 for_each_cpu_mask_nr(i, group->cpumask) { 3132 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3151 struct rq *rq; 3133 struct rq *rq = cpu_rq(i);
3152
3153 if (!cpu_isset(i, *cpus))
3154 continue;
3155
3156 rq = cpu_rq(i);
3157 3134
3158 if (*sd_idle && rq->nr_running) 3135 if (*sd_idle && rq->nr_running)
3159 *sd_idle = 0; 3136 *sd_idle = 0;
@@ -3264,8 +3241,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3264 */ 3241 */
3265 if ((sum_nr_running < min_nr_running) || 3242 if ((sum_nr_running < min_nr_running) ||
3266 (sum_nr_running == min_nr_running && 3243 (sum_nr_running == min_nr_running &&
3267 first_cpu(group->cpumask) < 3244 cpumask_first(sched_group_cpus(group)) <
3268 first_cpu(group_min->cpumask))) { 3245 cpumask_first(sched_group_cpus(group_min)))) {
3269 group_min = group; 3246 group_min = group;
3270 min_nr_running = sum_nr_running; 3247 min_nr_running = sum_nr_running;
3271 min_load_per_task = sum_weighted_load / 3248 min_load_per_task = sum_weighted_load /
@@ -3280,8 +3257,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3280 if (sum_nr_running <= group_capacity - 1) { 3257 if (sum_nr_running <= group_capacity - 1) {
3281 if (sum_nr_running > leader_nr_running || 3258 if (sum_nr_running > leader_nr_running ||
3282 (sum_nr_running == leader_nr_running && 3259 (sum_nr_running == leader_nr_running &&
3283 first_cpu(group->cpumask) > 3260 cpumask_first(sched_group_cpus(group)) >
3284 first_cpu(group_leader->cpumask))) { 3261 cpumask_first(sched_group_cpus(group_leader)))) {
3285 group_leader = group; 3262 group_leader = group;
3286 leader_nr_running = sum_nr_running; 3263 leader_nr_running = sum_nr_running;
3287 } 3264 }
@@ -3420,16 +3397,16 @@ ret:
3420 */ 3397 */
3421static struct rq * 3398static struct rq *
3422find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, 3399find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3423 unsigned long imbalance, const cpumask_t *cpus) 3400 unsigned long imbalance, const struct cpumask *cpus)
3424{ 3401{
3425 struct rq *busiest = NULL, *rq; 3402 struct rq *busiest = NULL, *rq;
3426 unsigned long max_load = 0; 3403 unsigned long max_load = 0;
3427 int i; 3404 int i;
3428 3405
3429 for_each_cpu_mask_nr(i, group->cpumask) { 3406 for_each_cpu(i, sched_group_cpus(group)) {
3430 unsigned long wl; 3407 unsigned long wl;
3431 3408
3432 if (!cpu_isset(i, *cpus)) 3409 if (!cpumask_test_cpu(i, cpus))
3433 continue; 3410 continue;
3434 3411
3435 rq = cpu_rq(i); 3412 rq = cpu_rq(i);
@@ -3459,7 +3436,7 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3459 */ 3436 */
3460static int load_balance(int this_cpu, struct rq *this_rq, 3437static int load_balance(int this_cpu, struct rq *this_rq,
3461 struct sched_domain *sd, enum cpu_idle_type idle, 3438 struct sched_domain *sd, enum cpu_idle_type idle,
3462 int *balance, cpumask_t *cpus) 3439 int *balance, struct cpumask *cpus)
3463{ 3440{
3464 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 3441 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
3465 struct sched_group *group; 3442 struct sched_group *group;
@@ -3467,7 +3444,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
3467 struct rq *busiest; 3444 struct rq *busiest;
3468 unsigned long flags; 3445 unsigned long flags;
3469 3446
3470 cpus_setall(*cpus); 3447 cpumask_setall(cpus);
3471 3448
3472 /* 3449 /*
3473 * When power savings policy is enabled for the parent domain, idle 3450 * When power savings policy is enabled for the parent domain, idle
@@ -3527,8 +3504,8 @@ redo:
3527 3504
3528 /* All tasks on this runqueue were pinned by CPU affinity */ 3505 /* All tasks on this runqueue were pinned by CPU affinity */
3529 if (unlikely(all_pinned)) { 3506 if (unlikely(all_pinned)) {
3530 cpu_clear(cpu_of(busiest), *cpus); 3507 cpumask_clear_cpu(cpu_of(busiest), cpus);
3531 if (!cpus_empty(*cpus)) 3508 if (!cpumask_empty(cpus))
3532 goto redo; 3509 goto redo;
3533 goto out_balanced; 3510 goto out_balanced;
3534 } 3511 }
@@ -3545,7 +3522,8 @@ redo:
3545 /* don't kick the migration_thread, if the curr 3522 /* don't kick the migration_thread, if the curr
3546 * task on busiest cpu can't be moved to this_cpu 3523 * task on busiest cpu can't be moved to this_cpu
3547 */ 3524 */
3548 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { 3525 if (!cpumask_test_cpu(this_cpu,
3526 &busiest->curr->cpus_allowed)) {
3549 spin_unlock_irqrestore(&busiest->lock, flags); 3527 spin_unlock_irqrestore(&busiest->lock, flags);
3550 all_pinned = 1; 3528 all_pinned = 1;
3551 goto out_one_pinned; 3529 goto out_one_pinned;
@@ -3620,7 +3598,7 @@ out:
3620 */ 3598 */
3621static int 3599static int
3622load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, 3600load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
3623 cpumask_t *cpus) 3601 struct cpumask *cpus)
3624{ 3602{
3625 struct sched_group *group; 3603 struct sched_group *group;
3626 struct rq *busiest = NULL; 3604 struct rq *busiest = NULL;
@@ -3629,7 +3607,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
3629 int sd_idle = 0; 3607 int sd_idle = 0;
3630 int all_pinned = 0; 3608 int all_pinned = 0;
3631 3609
3632 cpus_setall(*cpus); 3610 cpumask_setall(cpus);
3633 3611
3634 /* 3612 /*
3635 * When power savings policy is enabled for the parent domain, idle 3613 * When power savings policy is enabled for the parent domain, idle
@@ -3673,8 +3651,8 @@ redo:
3673 double_unlock_balance(this_rq, busiest); 3651 double_unlock_balance(this_rq, busiest);
3674 3652
3675 if (unlikely(all_pinned)) { 3653 if (unlikely(all_pinned)) {
3676 cpu_clear(cpu_of(busiest), *cpus); 3654 cpumask_clear_cpu(cpu_of(busiest), cpus);
3677 if (!cpus_empty(*cpus)) 3655 if (!cpumask_empty(cpus))
3678 goto redo; 3656 goto redo;
3679 } 3657 }
3680 } 3658 }
@@ -3707,9 +3685,12 @@ out_balanced:
3707static void idle_balance(int this_cpu, struct rq *this_rq) 3685static void idle_balance(int this_cpu, struct rq *this_rq)
3708{ 3686{
3709 struct sched_domain *sd; 3687 struct sched_domain *sd;
3710 int pulled_task = -1; 3688 int pulled_task = 0;
3711 unsigned long next_balance = jiffies + HZ; 3689 unsigned long next_balance = jiffies + HZ;
3712 cpumask_t tmpmask; 3690 cpumask_var_t tmpmask;
3691
3692 if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC))
3693 return;
3713 3694
3714 for_each_domain(this_cpu, sd) { 3695 for_each_domain(this_cpu, sd) {
3715 unsigned long interval; 3696 unsigned long interval;
@@ -3720,7 +3701,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3720 if (sd->flags & SD_BALANCE_NEWIDLE) 3701 if (sd->flags & SD_BALANCE_NEWIDLE)
3721 /* If we've pulled tasks over stop searching: */ 3702 /* If we've pulled tasks over stop searching: */
3722 pulled_task = load_balance_newidle(this_cpu, this_rq, 3703 pulled_task = load_balance_newidle(this_cpu, this_rq,
3723 sd, &tmpmask); 3704 sd, tmpmask);
3724 3705
3725 interval = msecs_to_jiffies(sd->balance_interval); 3706 interval = msecs_to_jiffies(sd->balance_interval);
3726 if (time_after(next_balance, sd->last_balance + interval)) 3707 if (time_after(next_balance, sd->last_balance + interval))
@@ -3735,6 +3716,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3735 */ 3716 */
3736 this_rq->next_balance = next_balance; 3717 this_rq->next_balance = next_balance;
3737 } 3718 }
3719 free_cpumask_var(tmpmask);
3738} 3720}
3739 3721
3740/* 3722/*
@@ -3772,7 +3754,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3772 /* Search for an sd spanning us and the target CPU. */ 3754 /* Search for an sd spanning us and the target CPU. */
3773 for_each_domain(target_cpu, sd) { 3755 for_each_domain(target_cpu, sd) {
3774 if ((sd->flags & SD_LOAD_BALANCE) && 3756 if ((sd->flags & SD_LOAD_BALANCE) &&
3775 cpu_isset(busiest_cpu, sd->span)) 3757 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
3776 break; 3758 break;
3777 } 3759 }
3778 3760
@@ -3791,10 +3773,9 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3791#ifdef CONFIG_NO_HZ 3773#ifdef CONFIG_NO_HZ
3792static struct { 3774static struct {
3793 atomic_t load_balancer; 3775 atomic_t load_balancer;
3794 cpumask_t cpu_mask; 3776 cpumask_var_t cpu_mask;
3795} nohz ____cacheline_aligned = { 3777} nohz ____cacheline_aligned = {
3796 .load_balancer = ATOMIC_INIT(-1), 3778 .load_balancer = ATOMIC_INIT(-1),
3797 .cpu_mask = CPU_MASK_NONE,
3798}; 3779};
3799 3780
3800/* 3781/*
@@ -3822,7 +3803,7 @@ int select_nohz_load_balancer(int stop_tick)
3822 int cpu = smp_processor_id(); 3803 int cpu = smp_processor_id();
3823 3804
3824 if (stop_tick) { 3805 if (stop_tick) {
3825 cpu_set(cpu, nohz.cpu_mask); 3806 cpumask_set_cpu(cpu, nohz.cpu_mask);
3826 cpu_rq(cpu)->in_nohz_recently = 1; 3807 cpu_rq(cpu)->in_nohz_recently = 1;
3827 3808
3828 /* 3809 /*
@@ -3836,7 +3817,7 @@ int select_nohz_load_balancer(int stop_tick)
3836 } 3817 }
3837 3818
3838 /* time for ilb owner also to sleep */ 3819 /* time for ilb owner also to sleep */
3839 if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) { 3820 if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
3840 if (atomic_read(&nohz.load_balancer) == cpu) 3821 if (atomic_read(&nohz.load_balancer) == cpu)
3841 atomic_set(&nohz.load_balancer, -1); 3822 atomic_set(&nohz.load_balancer, -1);
3842 return 0; 3823 return 0;
@@ -3849,10 +3830,10 @@ int select_nohz_load_balancer(int stop_tick)
3849 } else if (atomic_read(&nohz.load_balancer) == cpu) 3830 } else if (atomic_read(&nohz.load_balancer) == cpu)
3850 return 1; 3831 return 1;
3851 } else { 3832 } else {
3852 if (!cpu_isset(cpu, nohz.cpu_mask)) 3833 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
3853 return 0; 3834 return 0;
3854 3835
3855 cpu_clear(cpu, nohz.cpu_mask); 3836 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3856 3837
3857 if (atomic_read(&nohz.load_balancer) == cpu) 3838 if (atomic_read(&nohz.load_balancer) == cpu)
3858 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 3839 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
@@ -3880,7 +3861,11 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3880 unsigned long next_balance = jiffies + 60*HZ; 3861 unsigned long next_balance = jiffies + 60*HZ;
3881 int update_next_balance = 0; 3862 int update_next_balance = 0;
3882 int need_serialize; 3863 int need_serialize;
3883 cpumask_t tmp; 3864 cpumask_var_t tmp;
3865
3866 /* Fails alloc? Rebalancing probably not a priority right now. */
3867 if (!alloc_cpumask_var(&tmp, GFP_ATOMIC))
3868 return;
3884 3869
3885 for_each_domain(cpu, sd) { 3870 for_each_domain(cpu, sd) {
3886 if (!(sd->flags & SD_LOAD_BALANCE)) 3871 if (!(sd->flags & SD_LOAD_BALANCE))
@@ -3905,7 +3890,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3905 } 3890 }
3906 3891
3907 if (time_after_eq(jiffies, sd->last_balance + interval)) { 3892 if (time_after_eq(jiffies, sd->last_balance + interval)) {
3908 if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) { 3893 if (load_balance(cpu, rq, sd, idle, &balance, tmp)) {
3909 /* 3894 /*
3910 * We've pulled tasks over so either we're no 3895 * We've pulled tasks over so either we're no
3911 * longer idle, or one of our SMT siblings is 3896 * longer idle, or one of our SMT siblings is
@@ -3939,6 +3924,8 @@ out:
3939 */ 3924 */
3940 if (likely(update_next_balance)) 3925 if (likely(update_next_balance))
3941 rq->next_balance = next_balance; 3926 rq->next_balance = next_balance;
3927
3928 free_cpumask_var(tmp);
3942} 3929}
3943 3930
3944/* 3931/*
@@ -3963,12 +3950,13 @@ static void run_rebalance_domains(struct softirq_action *h)
3963 */ 3950 */
3964 if (this_rq->idle_at_tick && 3951 if (this_rq->idle_at_tick &&
3965 atomic_read(&nohz.load_balancer) == this_cpu) { 3952 atomic_read(&nohz.load_balancer) == this_cpu) {
3966 cpumask_t cpus = nohz.cpu_mask;
3967 struct rq *rq; 3953 struct rq *rq;
3968 int balance_cpu; 3954 int balance_cpu;
3969 3955
3970 cpu_clear(this_cpu, cpus); 3956 for_each_cpu(balance_cpu, nohz.cpu_mask) {
3971 for_each_cpu_mask_nr(balance_cpu, cpus) { 3957 if (balance_cpu == this_cpu)
3958 continue;
3959
3972 /* 3960 /*
3973 * If this cpu gets work to do, stop the load balancing 3961 * If this cpu gets work to do, stop the load balancing
3974 * work being done for other cpus. Next load 3962 * work being done for other cpus. Next load
@@ -4006,7 +3994,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
4006 rq->in_nohz_recently = 0; 3994 rq->in_nohz_recently = 0;
4007 3995
4008 if (atomic_read(&nohz.load_balancer) == cpu) { 3996 if (atomic_read(&nohz.load_balancer) == cpu) {
4009 cpu_clear(cpu, nohz.cpu_mask); 3997 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4010 atomic_set(&nohz.load_balancer, -1); 3998 atomic_set(&nohz.load_balancer, -1);
4011 } 3999 }
4012 4000
@@ -4019,7 +4007,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
4019 * TBD: Traverse the sched domains and nominate 4007 * TBD: Traverse the sched domains and nominate
4020 * the nearest cpu in the nohz.cpu_mask. 4008 * the nearest cpu in the nohz.cpu_mask.
4021 */ 4009 */
4022 int ilb = first_cpu(nohz.cpu_mask); 4010 int ilb = cpumask_first(nohz.cpu_mask);
4023 4011
4024 if (ilb < nr_cpu_ids) 4012 if (ilb < nr_cpu_ids)
4025 resched_cpu(ilb); 4013 resched_cpu(ilb);
@@ -4031,7 +4019,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
4031 * cpus with ticks stopped, is it time for that to stop? 4019 * cpus with ticks stopped, is it time for that to stop?
4032 */ 4020 */
4033 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && 4021 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
4034 cpus_weight(nohz.cpu_mask) == num_online_cpus()) { 4022 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
4035 resched_cpu(cpu); 4023 resched_cpu(cpu);
4036 return; 4024 return;
4037 } 4025 }
@@ -4041,7 +4029,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
4041 * someone else, then no need raise the SCHED_SOFTIRQ 4029 * someone else, then no need raise the SCHED_SOFTIRQ
4042 */ 4030 */
4043 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && 4031 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
4044 cpu_isset(cpu, nohz.cpu_mask)) 4032 cpumask_test_cpu(cpu, nohz.cpu_mask))
4045 return; 4033 return;
4046#endif 4034#endif
4047 if (time_after_eq(jiffies, rq->next_balance)) 4035 if (time_after_eq(jiffies, rq->next_balance))
@@ -4203,7 +4191,6 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
4203 4191
4204 if (p == rq->idle) { 4192 if (p == rq->idle) {
4205 p->stime = cputime_add(p->stime, steal); 4193 p->stime = cputime_add(p->stime, steal);
4206 account_group_system_time(p, steal);
4207 if (atomic_read(&rq->nr_iowait) > 0) 4194 if (atomic_read(&rq->nr_iowait) > 0)
4208 cpustat->iowait = cputime64_add(cpustat->iowait, tmp); 4195 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
4209 else 4196 else
@@ -4339,7 +4326,7 @@ void __kprobes sub_preempt_count(int val)
4339 /* 4326 /*
4340 * Underflow? 4327 * Underflow?
4341 */ 4328 */
4342 if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) 4329 if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked())))
4343 return; 4330 return;
4344 /* 4331 /*
4345 * Is the spinlock portion underflowing? 4332 * Is the spinlock portion underflowing?
@@ -5400,10 +5387,9 @@ out_unlock:
5400 return retval; 5387 return retval;
5401} 5388}
5402 5389
5403long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) 5390long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
5404{ 5391{
5405 cpumask_t cpus_allowed; 5392 cpumask_var_t cpus_allowed, new_mask;
5406 cpumask_t new_mask = *in_mask;
5407 struct task_struct *p; 5393 struct task_struct *p;
5408 int retval; 5394 int retval;
5409 5395
@@ -5425,6 +5411,14 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
5425 get_task_struct(p); 5411 get_task_struct(p);
5426 read_unlock(&tasklist_lock); 5412 read_unlock(&tasklist_lock);
5427 5413
5414 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
5415 retval = -ENOMEM;
5416 goto out_put_task;
5417 }
5418 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
5419 retval = -ENOMEM;
5420 goto out_free_cpus_allowed;
5421 }
5428 retval = -EPERM; 5422 retval = -EPERM;
5429 if ((current->euid != p->euid) && (current->euid != p->uid) && 5423 if ((current->euid != p->euid) && (current->euid != p->uid) &&
5430 !capable(CAP_SYS_NICE)) 5424 !capable(CAP_SYS_NICE))
@@ -5434,37 +5428,41 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
5434 if (retval) 5428 if (retval)
5435 goto out_unlock; 5429 goto out_unlock;
5436 5430
5437 cpuset_cpus_allowed(p, &cpus_allowed); 5431 cpuset_cpus_allowed(p, cpus_allowed);
5438 cpus_and(new_mask, new_mask, cpus_allowed); 5432 cpumask_and(new_mask, in_mask, cpus_allowed);
5439 again: 5433 again:
5440 retval = set_cpus_allowed_ptr(p, &new_mask); 5434 retval = set_cpus_allowed_ptr(p, new_mask);
5441 5435
5442 if (!retval) { 5436 if (!retval) {
5443 cpuset_cpus_allowed(p, &cpus_allowed); 5437 cpuset_cpus_allowed(p, cpus_allowed);
5444 if (!cpus_subset(new_mask, cpus_allowed)) { 5438 if (!cpumask_subset(new_mask, cpus_allowed)) {
5445 /* 5439 /*
5446 * We must have raced with a concurrent cpuset 5440 * We must have raced with a concurrent cpuset
5447 * update. Just reset the cpus_allowed to the 5441 * update. Just reset the cpus_allowed to the
5448 * cpuset's cpus_allowed 5442 * cpuset's cpus_allowed
5449 */ 5443 */
5450 new_mask = cpus_allowed; 5444 cpumask_copy(new_mask, cpus_allowed);
5451 goto again; 5445 goto again;
5452 } 5446 }
5453 } 5447 }
5454out_unlock: 5448out_unlock:
5449 free_cpumask_var(new_mask);
5450out_free_cpus_allowed:
5451 free_cpumask_var(cpus_allowed);
5452out_put_task:
5455 put_task_struct(p); 5453 put_task_struct(p);
5456 put_online_cpus(); 5454 put_online_cpus();
5457 return retval; 5455 return retval;
5458} 5456}
5459 5457
5460static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, 5458static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
5461 cpumask_t *new_mask) 5459 struct cpumask *new_mask)
5462{ 5460{
5463 if (len < sizeof(cpumask_t)) { 5461 if (len < cpumask_size())
5464 memset(new_mask, 0, sizeof(cpumask_t)); 5462 cpumask_clear(new_mask);
5465 } else if (len > sizeof(cpumask_t)) { 5463 else if (len > cpumask_size())
5466 len = sizeof(cpumask_t); 5464 len = cpumask_size();
5467 } 5465
5468 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; 5466 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
5469} 5467}
5470 5468
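
Note: sched_setaffinity() and the syscalls around it now keep every mask in a cpumask_var_t instead of a stack cpumask_t. With CONFIG_CPUMASK_OFFSTACK the type is a pointer, so each use pairs alloc_cpumask_var() with free_cpumask_var(), and allocation failure becomes a real error path (the -ENOMEM returns above). The lifetime pattern in isolation (restrict_to_online() is a made-up example; the cpumask calls are the real API):

    #include <linux/cpumask.h>
    #include <linux/errno.h>
    #include <linux/gfp.h>

    static int restrict_to_online(const struct cpumask *wanted)
    {
            cpumask_var_t tmp;
            int ret = 0;

            /* Allocates only when CONFIG_CPUMASK_OFFSTACK=y; may fail. */
            if (!alloc_cpumask_var(&tmp, GFP_KERNEL))
                    return -ENOMEM;

            cpumask_and(tmp, wanted, cpu_online_mask);
            if (cpumask_empty(tmp))
                    ret = -EINVAL;

            /* Every alloc_cpumask_var() is paired with a free. */
            free_cpumask_var(tmp);
            return ret;
    }

Where a failed allocation only means skipping an optimization, the earlier hunks in this file (idle_balance(), rebalance_domains()) instead allocate with GFP_ATOMIC and simply return on failure rather than reporting an error.
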
@@ -5477,17 +5475,20 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
5477asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, 5475asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
5478 unsigned long __user *user_mask_ptr) 5476 unsigned long __user *user_mask_ptr)
5479{ 5477{
5480 cpumask_t new_mask; 5478 cpumask_var_t new_mask;
5481 int retval; 5479 int retval;
5482 5480
5483 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask); 5481 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
5484 if (retval) 5482 return -ENOMEM;
5485 return retval;
5486 5483
5487 return sched_setaffinity(pid, &new_mask); 5484 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
5485 if (retval == 0)
5486 retval = sched_setaffinity(pid, new_mask);
5487 free_cpumask_var(new_mask);
5488 return retval;
5488} 5489}
5489 5490
5490long sched_getaffinity(pid_t pid, cpumask_t *mask) 5491long sched_getaffinity(pid_t pid, struct cpumask *mask)
5491{ 5492{
5492 struct task_struct *p; 5493 struct task_struct *p;
5493 int retval; 5494 int retval;
@@ -5504,7 +5505,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
5504 if (retval) 5505 if (retval)
5505 goto out_unlock; 5506 goto out_unlock;
5506 5507
5507 cpus_and(*mask, p->cpus_allowed, cpu_online_map); 5508 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
5508 5509
5509out_unlock: 5510out_unlock:
5510 read_unlock(&tasklist_lock); 5511 read_unlock(&tasklist_lock);
@@ -5523,19 +5524,24 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
5523 unsigned long __user *user_mask_ptr) 5524 unsigned long __user *user_mask_ptr)
5524{ 5525{
5525 int ret; 5526 int ret;
5526 cpumask_t mask; 5527 cpumask_var_t mask;
5527 5528
5528 if (len < sizeof(cpumask_t)) 5529 if (len < cpumask_size())
5529 return -EINVAL; 5530 return -EINVAL;
5530 5531
5531 ret = sched_getaffinity(pid, &mask); 5532 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
5532 if (ret < 0) 5533 return -ENOMEM;
5533 return ret;
5534 5534
5535 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t))) 5535 ret = sched_getaffinity(pid, mask);
5536 return -EFAULT; 5536 if (ret == 0) {
5537 if (copy_to_user(user_mask_ptr, mask, cpumask_size()))
5538 ret = -EFAULT;
5539 else
5540 ret = cpumask_size();
5541 }
5542 free_cpumask_var(mask);
5537 5543
5538 return sizeof(cpumask_t); 5544 return ret;
5539} 5545}
5540 5546
5541/** 5547/**
@@ -5877,7 +5883,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5877 idle->se.exec_start = sched_clock(); 5883 idle->se.exec_start = sched_clock();
5878 5884
5879 idle->prio = idle->normal_prio = MAX_PRIO; 5885 idle->prio = idle->normal_prio = MAX_PRIO;
5880 idle->cpus_allowed = cpumask_of_cpu(cpu); 5886 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
5881 __set_task_cpu(idle, cpu); 5887 __set_task_cpu(idle, cpu);
5882 5888
5883 rq->curr = rq->idle = idle; 5889 rq->curr = rq->idle = idle;
@@ -5896,6 +5902,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5896 * The idle tasks have their own, simple scheduling class: 5902 * The idle tasks have their own, simple scheduling class:
5897 */ 5903 */
5898 idle->sched_class = &idle_sched_class; 5904 idle->sched_class = &idle_sched_class;
5905 ftrace_graph_init_task(idle);
5899} 5906}
5900 5907
5901/* 5908/*
@@ -5903,9 +5910,9 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5903 * indicates which cpus entered this state. This is used 5910 * indicates which cpus entered this state. This is used
5904 * in the rcu update to wait only for active cpus. For system 5911 * in the rcu update to wait only for active cpus. For system
5905 * which do not switch off the HZ timer nohz_cpu_mask should 5912 * which do not switch off the HZ timer nohz_cpu_mask should
5906 * always be CPU_MASK_NONE. 5913 * always be CPU_BITS_NONE.
5907 */ 5914 */
5908cpumask_t nohz_cpu_mask = CPU_MASK_NONE; 5915cpumask_var_t nohz_cpu_mask;
5909 5916
5910/* 5917/*
5911 * Increase the granularity value when there are more CPUs, 5918 * Increase the granularity value when there are more CPUs,
@@ -5960,7 +5967,7 @@ static inline void sched_init_granularity(void)
5960 * task must not exit() & deallocate itself prematurely. The 5967 * task must not exit() & deallocate itself prematurely. The
5961 * call is not atomic; no spinlocks may be held. 5968 * call is not atomic; no spinlocks may be held.
5962 */ 5969 */
5963int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) 5970int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5964{ 5971{
5965 struct migration_req req; 5972 struct migration_req req;
5966 unsigned long flags; 5973 unsigned long flags;
@@ -5968,13 +5975,13 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
5968 int ret = 0; 5975 int ret = 0;
5969 5976
5970 rq = task_rq_lock(p, &flags); 5977 rq = task_rq_lock(p, &flags);
5971 if (!cpus_intersects(*new_mask, cpu_online_map)) { 5978 if (!cpumask_intersects(new_mask, cpu_online_mask)) {
5972 ret = -EINVAL; 5979 ret = -EINVAL;
5973 goto out; 5980 goto out;
5974 } 5981 }
5975 5982
5976 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && 5983 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
5977 !cpus_equal(p->cpus_allowed, *new_mask))) { 5984 !cpumask_equal(&p->cpus_allowed, new_mask))) {
5978 ret = -EINVAL; 5985 ret = -EINVAL;
5979 goto out; 5986 goto out;
5980 } 5987 }
@@ -5982,15 +5989,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
5982 if (p->sched_class->set_cpus_allowed) 5989 if (p->sched_class->set_cpus_allowed)
5983 p->sched_class->set_cpus_allowed(p, new_mask); 5990 p->sched_class->set_cpus_allowed(p, new_mask);
5984 else { 5991 else {
5985 p->cpus_allowed = *new_mask; 5992 cpumask_copy(&p->cpus_allowed, new_mask);
5986 p->rt.nr_cpus_allowed = cpus_weight(*new_mask); 5993 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
5987 } 5994 }
5988 5995
5989 /* Can the task run on the task's current CPU? If so, we're done */ 5996 /* Can the task run on the task's current CPU? If so, we're done */
5990 if (cpu_isset(task_cpu(p), *new_mask)) 5997 if (cpumask_test_cpu(task_cpu(p), new_mask))
5991 goto out; 5998 goto out;
5992 5999
5993 if (migrate_task(p, any_online_cpu(*new_mask), &req)) { 6000 if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
5994 /* Need help from migration thread: drop lock and wait. */ 6001 /* Need help from migration thread: drop lock and wait. */
5995 task_rq_unlock(rq, &flags); 6002 task_rq_unlock(rq, &flags);
5996 wake_up_process(rq->migration_thread); 6003 wake_up_process(rq->migration_thread);
@@ -6032,7 +6039,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
6032 if (task_cpu(p) != src_cpu) 6039 if (task_cpu(p) != src_cpu)
6033 goto done; 6040 goto done;
6034 /* Affinity changed (again). */ 6041 /* Affinity changed (again). */
6035 if (!cpu_isset(dest_cpu, p->cpus_allowed)) 6042 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
6036 goto fail; 6043 goto fail;
6037 6044
6038 on_rq = p->se.on_rq; 6045 on_rq = p->se.on_rq;
@@ -6126,54 +6133,46 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
6126 6133
6127/* 6134/*
6128 * Figure out where task on dead CPU should go, use force if necessary. 6135 * Figure out where task on dead CPU should go, use force if necessary.
6129 * NOTE: interrupts should be disabled by the caller
6130 */ 6136 */
6131static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 6137static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
6132{ 6138{
6133 unsigned long flags;
6134 cpumask_t mask;
6135 struct rq *rq;
6136 int dest_cpu; 6139 int dest_cpu;
6140 /* FIXME: Use cpumask_of_node here. */
6141 cpumask_t _nodemask = node_to_cpumask(cpu_to_node(dead_cpu));
6142 const struct cpumask *nodemask = &_nodemask;
6143
6144again:
6145 /* Look for allowed, online CPU in same node. */
6146 for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask)
6147 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
6148 goto move;
6149
6150 /* Any allowed, online CPU? */
6151 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask);
6152 if (dest_cpu < nr_cpu_ids)
6153 goto move;
6154
6155 /* No more Mr. Nice Guy. */
6156 if (dest_cpu >= nr_cpu_ids) {
6157 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
6158 dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed);
6137 6159
6138 do { 6160 /*
6139 /* On same node? */ 6161 * Don't tell them about moving exiting tasks or
6140 mask = node_to_cpumask(cpu_to_node(dead_cpu)); 6162 * kernel threads (both mm NULL), since they never
6141 cpus_and(mask, mask, p->cpus_allowed); 6163 * leave kernel.
6142 dest_cpu = any_online_cpu(mask); 6164 */
6143 6165 if (p->mm && printk_ratelimit()) {
6144 /* On any allowed CPU? */ 6166 printk(KERN_INFO "process %d (%s) no "
6145 if (dest_cpu >= nr_cpu_ids) 6167 "longer affine to cpu%d\n",
6146 dest_cpu = any_online_cpu(p->cpus_allowed); 6168 task_pid_nr(p), p->comm, dead_cpu);
6147
6148 /* No more Mr. Nice Guy. */
6149 if (dest_cpu >= nr_cpu_ids) {
6150 cpumask_t cpus_allowed;
6151
6152 cpuset_cpus_allowed_locked(p, &cpus_allowed);
6153 /*
6154 * Try to stay on the same cpuset, where the
6155 * current cpuset may be a subset of all cpus.
6156 * The cpuset_cpus_allowed_locked() variant of
6157 * cpuset_cpus_allowed() will not block. It must be
6158 * called within calls to cpuset_lock/cpuset_unlock.
6159 */
6160 rq = task_rq_lock(p, &flags);
6161 p->cpus_allowed = cpus_allowed;
6162 dest_cpu = any_online_cpu(p->cpus_allowed);
6163 task_rq_unlock(rq, &flags);
6164
6165 /*
6166 * Don't tell them about moving exiting tasks or
6167 * kernel threads (both mm NULL), since they never
6168 * leave kernel.
6169 */
6170 if (p->mm && printk_ratelimit()) {
6171 printk(KERN_INFO "process %d (%s) no "
6172 "longer affine to cpu%d\n",
6173 task_pid_nr(p), p->comm, dead_cpu);
6174 }
6175 } 6169 }
6176 } while (!__migrate_task_irq(p, dead_cpu, dest_cpu)); 6170 }
6171
6172move:
6173 /* It can have affinity changed while we were choosing. */
6174 if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)))
6175 goto again;
6177} 6176}
6178 6177
6179/* 6178/*
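The rewritten move_task_off_dead_cpu() above walks a three-step fallback: an allowed CPU in the dead CPU's node, then any allowed online CPU, then whatever the cpuset permits. A hedged sketch of that selection order using the same helpers; the nodemask is assumed to be supplied by the caller and the cpuset-widening step is left to it:

    #include <linux/cpumask.h>

    static int pick_fallback_cpu(const struct cpumask *allowed,
                                 const struct cpumask *nodemask)
    {
            int cpu;

            /* 1) allowed, online CPU in the same node */
            for_each_cpu_and(cpu, nodemask, cpu_online_mask)
                    if (cpumask_test_cpu(cpu, allowed))
                            return cpu;

            /* 2) any allowed, online CPU */
            cpu = cpumask_any_and(allowed, cpu_online_mask);
            if (cpu < nr_cpu_ids)
                    return cpu;

            /* 3) give up: caller widens 'allowed' (e.g. via cpuset) and retries */
            return nr_cpu_ids;
    }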
@@ -6185,7 +6184,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
6185 */ 6184 */
6186static void migrate_nr_uninterruptible(struct rq *rq_src) 6185static void migrate_nr_uninterruptible(struct rq *rq_src)
6187{ 6186{
6188 struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR)); 6187 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask));
6189 unsigned long flags; 6188 unsigned long flags;
6190 6189
6191 local_irq_save(flags); 6190 local_irq_save(flags);
@@ -6475,7 +6474,7 @@ static void set_rq_online(struct rq *rq)
6475 if (!rq->online) { 6474 if (!rq->online) {
6476 const struct sched_class *class; 6475 const struct sched_class *class;
6477 6476
6478 cpu_set(rq->cpu, rq->rd->online); 6477 cpumask_set_cpu(rq->cpu, rq->rd->online);
6479 rq->online = 1; 6478 rq->online = 1;
6480 6479
6481 for_each_class(class) { 6480 for_each_class(class) {
@@ -6495,7 +6494,7 @@ static void set_rq_offline(struct rq *rq)
6495 class->rq_offline(rq); 6494 class->rq_offline(rq);
6496 } 6495 }
6497 6496
6498 cpu_clear(rq->cpu, rq->rd->online); 6497 cpumask_clear_cpu(rq->cpu, rq->rd->online);
6499 rq->online = 0; 6498 rq->online = 0;
6500 } 6499 }
6501} 6500}
@@ -6536,7 +6535,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6536 rq = cpu_rq(cpu); 6535 rq = cpu_rq(cpu);
6537 spin_lock_irqsave(&rq->lock, flags); 6536 spin_lock_irqsave(&rq->lock, flags);
6538 if (rq->rd) { 6537 if (rq->rd) {
6539 BUG_ON(!cpu_isset(cpu, rq->rd->span)); 6538 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6540 6539
6541 set_rq_online(rq); 6540 set_rq_online(rq);
6542 } 6541 }
@@ -6550,7 +6549,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6550 break; 6549 break;
6551 /* Unbind it from offline cpu so it can run. Fall thru. */ 6550 /* Unbind it from offline cpu so it can run. Fall thru. */
6552 kthread_bind(cpu_rq(cpu)->migration_thread, 6551 kthread_bind(cpu_rq(cpu)->migration_thread,
6553 any_online_cpu(cpu_online_map)); 6552 cpumask_any(cpu_online_mask));
6554 kthread_stop(cpu_rq(cpu)->migration_thread); 6553 kthread_stop(cpu_rq(cpu)->migration_thread);
6555 cpu_rq(cpu)->migration_thread = NULL; 6554 cpu_rq(cpu)->migration_thread = NULL;
6556 break; 6555 break;
@@ -6600,7 +6599,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6600 rq = cpu_rq(cpu); 6599 rq = cpu_rq(cpu);
6601 spin_lock_irqsave(&rq->lock, flags); 6600 spin_lock_irqsave(&rq->lock, flags);
6602 if (rq->rd) { 6601 if (rq->rd) {
6603 BUG_ON(!cpu_isset(cpu, rq->rd->span)); 6602 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6604 set_rq_offline(rq); 6603 set_rq_offline(rq);
6605 } 6604 }
6606 spin_unlock_irqrestore(&rq->lock, flags); 6605 spin_unlock_irqrestore(&rq->lock, flags);
@@ -6638,36 +6637,14 @@ early_initcall(migration_init);
6638 6637
6639#ifdef CONFIG_SCHED_DEBUG 6638#ifdef CONFIG_SCHED_DEBUG
6640 6639
6641static inline const char *sd_level_to_string(enum sched_domain_level lvl)
6642{
6643 switch (lvl) {
6644 case SD_LV_NONE:
6645 return "NONE";
6646 case SD_LV_SIBLING:
6647 return "SIBLING";
6648 case SD_LV_MC:
6649 return "MC";
6650 case SD_LV_CPU:
6651 return "CPU";
6652 case SD_LV_NODE:
6653 return "NODE";
6654 case SD_LV_ALLNODES:
6655 return "ALLNODES";
6656 case SD_LV_MAX:
6657 return "MAX";
6658
6659 }
6660 return "MAX";
6661}
6662
6663static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 6640static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6664 cpumask_t *groupmask) 6641 struct cpumask *groupmask)
6665{ 6642{
6666 struct sched_group *group = sd->groups; 6643 struct sched_group *group = sd->groups;
6667 char str[256]; 6644 char str[256];
6668 6645
6669 cpulist_scnprintf(str, sizeof(str), &sd->span); 6646 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
6670 cpus_clear(*groupmask); 6647 cpumask_clear(groupmask);
6671 6648
6672 printk(KERN_DEBUG "%*s domain %d: ", level, "", level); 6649 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
6673 6650
@@ -6679,14 +6656,13 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6679 return -1; 6656 return -1;
6680 } 6657 }
6681 6658
6682 printk(KERN_CONT "span %s level %s\n", 6659 printk(KERN_CONT "span %s level %s\n", str, sd->name);
6683 str, sd_level_to_string(sd->level));
6684 6660
6685 if (!cpu_isset(cpu, sd->span)) { 6661 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
6686 printk(KERN_ERR "ERROR: domain->span does not contain " 6662 printk(KERN_ERR "ERROR: domain->span does not contain "
6687 "CPU%d\n", cpu); 6663 "CPU%d\n", cpu);
6688 } 6664 }
6689 if (!cpu_isset(cpu, group->cpumask)) { 6665 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
6690 printk(KERN_ERR "ERROR: domain->groups does not contain" 6666 printk(KERN_ERR "ERROR: domain->groups does not contain"
6691 " CPU%d\n", cpu); 6667 " CPU%d\n", cpu);
6692 } 6668 }
@@ -6706,31 +6682,32 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6706 break; 6682 break;
6707 } 6683 }
6708 6684
6709 if (!cpus_weight(group->cpumask)) { 6685 if (!cpumask_weight(sched_group_cpus(group))) {
6710 printk(KERN_CONT "\n"); 6686 printk(KERN_CONT "\n");
6711 printk(KERN_ERR "ERROR: empty group\n"); 6687 printk(KERN_ERR "ERROR: empty group\n");
6712 break; 6688 break;
6713 } 6689 }
6714 6690
6715 if (cpus_intersects(*groupmask, group->cpumask)) { 6691 if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
6716 printk(KERN_CONT "\n"); 6692 printk(KERN_CONT "\n");
6717 printk(KERN_ERR "ERROR: repeated CPUs\n"); 6693 printk(KERN_ERR "ERROR: repeated CPUs\n");
6718 break; 6694 break;
6719 } 6695 }
6720 6696
6721 cpus_or(*groupmask, *groupmask, group->cpumask); 6697 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
6722 6698
6723 cpulist_scnprintf(str, sizeof(str), &group->cpumask); 6699 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
6724 printk(KERN_CONT " %s", str); 6700 printk(KERN_CONT " %s", str);
6725 6701
6726 group = group->next; 6702 group = group->next;
6727 } while (group != sd->groups); 6703 } while (group != sd->groups);
6728 printk(KERN_CONT "\n"); 6704 printk(KERN_CONT "\n");
6729 6705
6730 if (!cpus_equal(sd->span, *groupmask)) 6706 if (!cpumask_equal(sched_domain_span(sd), groupmask))
6731 printk(KERN_ERR "ERROR: groups don't span domain->span\n"); 6707 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
6732 6708
6733 if (sd->parent && !cpus_subset(*groupmask, sd->parent->span)) 6709 if (sd->parent &&
6710 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
6734 printk(KERN_ERR "ERROR: parent span is not a superset " 6711 printk(KERN_ERR "ERROR: parent span is not a superset "
6735 "of domain->span\n"); 6712 "of domain->span\n");
6736 return 0; 6713 return 0;
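The debug walk above now validates the group list purely through struct cpumask pointers (cpumask_weight(), cpumask_intersects(), cpumask_or(), cpumask_equal(), cpumask_subset()) and prints spans with cpulist_scnprintf(). A small check in the same style; the masks and buffer are invented for the sketch and do not correspond to scheduler state:

    #include <linux/kernel.h>
    #include <linux/cpumask.h>

    static void check_cover(const struct cpumask *span, struct cpumask *covered,
                            const struct cpumask *grp)
    {
            char buf[256];

            if (cpumask_intersects(covered, grp))
                    printk(KERN_ERR "repeated CPUs\n");
            cpumask_or(covered, covered, grp);

            cpulist_scnprintf(buf, sizeof(buf), grp);
            printk(KERN_CONT " %s", buf);

            if (cpumask_equal(span, covered))
                    printk(KERN_CONT " (span fully covered)\n");
    }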
@@ -6738,7 +6715,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6738 6715
6739static void sched_domain_debug(struct sched_domain *sd, int cpu) 6716static void sched_domain_debug(struct sched_domain *sd, int cpu)
6740{ 6717{
6741 cpumask_t *groupmask; 6718 cpumask_var_t groupmask;
6742 int level = 0; 6719 int level = 0;
6743 6720
6744 if (!sd) { 6721 if (!sd) {
@@ -6748,8 +6725,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6748 6725
6749 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 6726 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
6750 6727
6751 groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL); 6728 if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
6752 if (!groupmask) {
6753 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); 6729 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
6754 return; 6730 return;
6755 } 6731 }
@@ -6762,7 +6738,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6762 if (!sd) 6738 if (!sd)
6763 break; 6739 break;
6764 } 6740 }
6765 kfree(groupmask); 6741 free_cpumask_var(groupmask);
6766} 6742}
6767#else /* !CONFIG_SCHED_DEBUG */ 6743#else /* !CONFIG_SCHED_DEBUG */
6768# define sched_domain_debug(sd, cpu) do { } while (0) 6744# define sched_domain_debug(sd, cpu) do { } while (0)
@@ -6770,7 +6746,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6770 6746
6771static int sd_degenerate(struct sched_domain *sd) 6747static int sd_degenerate(struct sched_domain *sd)
6772{ 6748{
6773 if (cpus_weight(sd->span) == 1) 6749 if (cpumask_weight(sched_domain_span(sd)) == 1)
6774 return 1; 6750 return 1;
6775 6751
6776 /* Following flags need at least 2 groups */ 6752 /* Following flags need at least 2 groups */
@@ -6801,7 +6777,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6801 if (sd_degenerate(parent)) 6777 if (sd_degenerate(parent))
6802 return 1; 6778 return 1;
6803 6779
6804 if (!cpus_equal(sd->span, parent->span)) 6780 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
6805 return 0; 6781 return 0;
6806 6782
6807 /* Does parent contain flags not in child? */ 6783 /* Does parent contain flags not in child? */
@@ -6816,6 +6792,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6816 SD_BALANCE_EXEC | 6792 SD_BALANCE_EXEC |
6817 SD_SHARE_CPUPOWER | 6793 SD_SHARE_CPUPOWER |
6818 SD_SHARE_PKG_RESOURCES); 6794 SD_SHARE_PKG_RESOURCES);
6795 if (nr_node_ids == 1)
6796 pflags &= ~SD_SERIALIZE;
6819 } 6797 }
6820 if (~cflags & pflags) 6798 if (~cflags & pflags)
6821 return 0; 6799 return 0;
@@ -6823,6 +6801,16 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6823 return 1; 6801 return 1;
6824} 6802}
6825 6803
6804static void free_rootdomain(struct root_domain *rd)
6805{
6806 cpupri_cleanup(&rd->cpupri);
6807
6808 free_cpumask_var(rd->rto_mask);
6809 free_cpumask_var(rd->online);
6810 free_cpumask_var(rd->span);
6811 kfree(rd);
6812}
6813
6826static void rq_attach_root(struct rq *rq, struct root_domain *rd) 6814static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6827{ 6815{
6828 unsigned long flags; 6816 unsigned long flags;
@@ -6832,38 +6820,63 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6832 if (rq->rd) { 6820 if (rq->rd) {
6833 struct root_domain *old_rd = rq->rd; 6821 struct root_domain *old_rd = rq->rd;
6834 6822
6835 if (cpu_isset(rq->cpu, old_rd->online)) 6823 if (cpumask_test_cpu(rq->cpu, old_rd->online))
6836 set_rq_offline(rq); 6824 set_rq_offline(rq);
6837 6825
6838 cpu_clear(rq->cpu, old_rd->span); 6826 cpumask_clear_cpu(rq->cpu, old_rd->span);
6839 6827
6840 if (atomic_dec_and_test(&old_rd->refcount)) 6828 if (atomic_dec_and_test(&old_rd->refcount))
6841 kfree(old_rd); 6829 free_rootdomain(old_rd);
6842 } 6830 }
6843 6831
6844 atomic_inc(&rd->refcount); 6832 atomic_inc(&rd->refcount);
6845 rq->rd = rd; 6833 rq->rd = rd;
6846 6834
6847 cpu_set(rq->cpu, rd->span); 6835 cpumask_set_cpu(rq->cpu, rd->span);
6848 if (cpu_isset(rq->cpu, cpu_online_map)) 6836 if (cpumask_test_cpu(rq->cpu, cpu_online_mask))
6849 set_rq_online(rq); 6837 set_rq_online(rq);
6850 6838
6851 spin_unlock_irqrestore(&rq->lock, flags); 6839 spin_unlock_irqrestore(&rq->lock, flags);
6852} 6840}
6853 6841
6854static void init_rootdomain(struct root_domain *rd) 6842static int init_rootdomain(struct root_domain *rd, bool bootmem)
6855{ 6843{
6856 memset(rd, 0, sizeof(*rd)); 6844 memset(rd, 0, sizeof(*rd));
6857 6845
6858 cpus_clear(rd->span); 6846 if (bootmem) {
6859 cpus_clear(rd->online); 6847 alloc_bootmem_cpumask_var(&def_root_domain.span);
6848 alloc_bootmem_cpumask_var(&def_root_domain.online);
6849 alloc_bootmem_cpumask_var(&def_root_domain.rto_mask);
6850 cpupri_init(&rd->cpupri, true);
6851 return 0;
6852 }
6853
6854 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
6855 goto free_rd;
6856 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
6857 goto free_span;
6858 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
6859 goto free_online;
6860
6861 if (cpupri_init(&rd->cpupri, false) != 0)
6862 goto free_rto_mask;
6863 return 0;
6860 6864
6861 cpupri_init(&rd->cpupri); 6865free_rto_mask:
6866 free_cpumask_var(rd->rto_mask);
6867free_online:
6868 free_cpumask_var(rd->online);
6869free_span:
6870 free_cpumask_var(rd->span);
6871free_rd:
6872 kfree(rd);
6873 return -ENOMEM;
6862} 6874}
6863 6875
6864static void init_defrootdomain(void) 6876static void init_defrootdomain(void)
6865{ 6877{
6866 init_rootdomain(&def_root_domain); 6878 init_rootdomain(&def_root_domain, true);
6879
6867 atomic_set(&def_root_domain.refcount, 1); 6880 atomic_set(&def_root_domain.refcount, 1);
6868} 6881}
6869 6882
@@ -6875,7 +6888,10 @@ static struct root_domain *alloc_rootdomain(void)
6875 if (!rd) 6888 if (!rd)
6876 return NULL; 6889 return NULL;
6877 6890
6878 init_rootdomain(rd); 6891 if (init_rootdomain(rd, false) != 0) {
6892 kfree(rd);
6893 return NULL;
6894 }
6879 6895
6880 return rd; 6896 return rd;
6881} 6897}
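init_rootdomain(), free_rootdomain() and alloc_rootdomain() above establish the usual cpumask_var_t pattern: with CONFIG_CPUMASK_OFFSTACK each mask is heap-allocated, every allocation needs a matching free on the error path, and frees unwind in reverse order. A condensed sketch of that shape; the struct and function names are invented for illustration:

    #include <linux/cpumask.h>
    #include <linux/errno.h>
    #include <linux/slab.h>

    struct masks { cpumask_var_t a, b; };

    static int masks_init(struct masks *m, gfp_t gfp)
    {
            if (!alloc_cpumask_var(&m->a, gfp))
                    return -ENOMEM;
            if (!alloc_cpumask_var(&m->b, gfp))
                    goto free_a;
            return 0;

    free_a:
            free_cpumask_var(m->a);
            return -ENOMEM;
    }

    static void masks_free(struct masks *m)
    {
            free_cpumask_var(m->b);        /* reverse order of allocation */
            free_cpumask_var(m->a);
    }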
@@ -6917,19 +6933,12 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6917} 6933}
6918 6934
6919/* cpus with isolated domains */ 6935/* cpus with isolated domains */
6920static cpumask_t cpu_isolated_map = CPU_MASK_NONE; 6936static cpumask_var_t cpu_isolated_map;
6921 6937
6922/* Setup the mask of cpus configured for isolated domains */ 6938/* Setup the mask of cpus configured for isolated domains */
6923static int __init isolated_cpu_setup(char *str) 6939static int __init isolated_cpu_setup(char *str)
6924{ 6940{
6925 static int __initdata ints[NR_CPUS]; 6941 cpulist_parse(str, cpu_isolated_map);
6926 int i;
6927
6928 str = get_options(str, ARRAY_SIZE(ints), ints);
6929 cpus_clear(cpu_isolated_map);
6930 for (i = 1; i <= ints[0]; i++)
6931 if (ints[i] < NR_CPUS)
6932 cpu_set(ints[i], cpu_isolated_map);
6933 return 1; 6942 return 1;
6934} 6943}
6935 6944
@@ -6938,42 +6947,43 @@ __setup("isolcpus=", isolated_cpu_setup);
6938/* 6947/*
6939 * init_sched_build_groups takes the cpumask we wish to span, and a pointer 6948 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
6940 * to a function which identifies what group(along with sched group) a CPU 6949 * to a function which identifies what group(along with sched group) a CPU
6941 * belongs to. The return value of group_fn must be a >= 0 and < NR_CPUS 6950 * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
6942 * (due to the fact that we keep track of groups covered with a cpumask_t). 6951 * (due to the fact that we keep track of groups covered with a struct cpumask).
6943 * 6952 *
6944 * init_sched_build_groups will build a circular linked list of the groups 6953 * init_sched_build_groups will build a circular linked list of the groups
6945 * covered by the given span, and will set each group's ->cpumask correctly, 6954 * covered by the given span, and will set each group's ->cpumask correctly,
6946 * and ->cpu_power to 0. 6955 * and ->cpu_power to 0.
6947 */ 6956 */
6948static void 6957static void
6949init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, 6958init_sched_build_groups(const struct cpumask *span,
6950 int (*group_fn)(int cpu, const cpumask_t *cpu_map, 6959 const struct cpumask *cpu_map,
6960 int (*group_fn)(int cpu, const struct cpumask *cpu_map,
6951 struct sched_group **sg, 6961 struct sched_group **sg,
6952 cpumask_t *tmpmask), 6962 struct cpumask *tmpmask),
6953 cpumask_t *covered, cpumask_t *tmpmask) 6963 struct cpumask *covered, struct cpumask *tmpmask)
6954{ 6964{
6955 struct sched_group *first = NULL, *last = NULL; 6965 struct sched_group *first = NULL, *last = NULL;
6956 int i; 6966 int i;
6957 6967
6958 cpus_clear(*covered); 6968 cpumask_clear(covered);
6959 6969
6960 for_each_cpu_mask_nr(i, *span) { 6970 for_each_cpu(i, span) {
6961 struct sched_group *sg; 6971 struct sched_group *sg;
6962 int group = group_fn(i, cpu_map, &sg, tmpmask); 6972 int group = group_fn(i, cpu_map, &sg, tmpmask);
6963 int j; 6973 int j;
6964 6974
6965 if (cpu_isset(i, *covered)) 6975 if (cpumask_test_cpu(i, covered))
6966 continue; 6976 continue;
6967 6977
6968 cpus_clear(sg->cpumask); 6978 cpumask_clear(sched_group_cpus(sg));
6969 sg->__cpu_power = 0; 6979 sg->__cpu_power = 0;
6970 6980
6971 for_each_cpu_mask_nr(j, *span) { 6981 for_each_cpu(j, span) {
6972 if (group_fn(j, cpu_map, NULL, tmpmask) != group) 6982 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
6973 continue; 6983 continue;
6974 6984
6975 cpu_set(j, *covered); 6985 cpumask_set_cpu(j, covered);
6976 cpu_set(j, sg->cpumask); 6986 cpumask_set_cpu(j, sched_group_cpus(sg));
6977 } 6987 }
6978 if (!first) 6988 if (!first)
6979 first = sg; 6989 first = sg;
@@ -7037,9 +7047,10 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
7037 * should be one that prevents unnecessary balancing, but also spreads tasks 7047 * should be one that prevents unnecessary balancing, but also spreads tasks
7038 * out optimally. 7048 * out optimally.
7039 */ 7049 */
7040static void sched_domain_node_span(int node, cpumask_t *span) 7050static void sched_domain_node_span(int node, struct cpumask *span)
7041{ 7051{
7042 nodemask_t used_nodes; 7052 nodemask_t used_nodes;
7053 /* FIXME: use cpumask_of_node() */
7043 node_to_cpumask_ptr(nodemask, node); 7054 node_to_cpumask_ptr(nodemask, node);
7044 int i; 7055 int i;
7045 7056
@@ -7061,18 +7072,33 @@ static void sched_domain_node_span(int node, cpumask_t *span)
7061int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 7072int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
7062 7073
7063/* 7074/*
7075 * The cpus mask in sched_group and sched_domain hangs off the end.
7076 * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space
7077 * for nr_cpu_ids < CONFIG_NR_CPUS.
7078 */
7079struct static_sched_group {
7080 struct sched_group sg;
7081 DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
7082};
7083
7084struct static_sched_domain {
7085 struct sched_domain sd;
7086 DECLARE_BITMAP(span, CONFIG_NR_CPUS);
7087};
7088
7089/*
7064 * SMT sched-domains: 7090 * SMT sched-domains:
7065 */ 7091 */
7066#ifdef CONFIG_SCHED_SMT 7092#ifdef CONFIG_SCHED_SMT
7067static DEFINE_PER_CPU(struct sched_domain, cpu_domains); 7093static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
7068static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); 7094static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus);
7069 7095
7070static int 7096static int
7071cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, 7097cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
7072 cpumask_t *unused) 7098 struct sched_group **sg, struct cpumask *unused)
7073{ 7099{
7074 if (sg) 7100 if (sg)
7075 *sg = &per_cpu(sched_group_cpus, cpu); 7101 *sg = &per_cpu(sched_group_cpus, cpu).sg;
7076 return cpu; 7102 return cpu;
7077} 7103}
7078#endif /* CONFIG_SCHED_SMT */ 7104#endif /* CONFIG_SCHED_SMT */
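struct static_sched_group/static_sched_domain above append a full CONFIG_NR_CPUS bitmap behind each structure so the static per-cpu copies keep working while the embedded cpumask_t is phased out; the FIXME notes that this still wastes space when nr_cpu_ids < CONFIG_NR_CPUS. The general trailing-bitmap trick, sketched with made-up names (the real sched_group/sched_domain layout may differ in detail):

    #include <linux/cpumask.h>
    #include <linux/types.h>

    struct thing {
            int id;
            unsigned long cpus[0];  /* bitmap stored right after the struct (GCC extension) */
    };

    /* View the trailing bitmap as a struct cpumask. */
    static inline struct cpumask *thing_cpus(struct thing *t)
    {
            return to_cpumask(t->cpus);
    }

    /* Static instances embed the storage that backs t.cpus. */
    struct thing_static {
            struct thing t;
            DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
    };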
@@ -7081,56 +7107,55 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
7081 * multi-core sched-domains: 7107 * multi-core sched-domains:
7082 */ 7108 */
7083#ifdef CONFIG_SCHED_MC 7109#ifdef CONFIG_SCHED_MC
7084static DEFINE_PER_CPU(struct sched_domain, core_domains); 7110static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
7085static DEFINE_PER_CPU(struct sched_group, sched_group_core); 7111static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
7086#endif /* CONFIG_SCHED_MC */ 7112#endif /* CONFIG_SCHED_MC */
7087 7113
7088#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 7114#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
7089static int 7115static int
7090cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, 7116cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
7091 cpumask_t *mask) 7117 struct sched_group **sg, struct cpumask *mask)
7092{ 7118{
7093 int group; 7119 int group;
7094 7120
7095 *mask = per_cpu(cpu_sibling_map, cpu); 7121 cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
7096 cpus_and(*mask, *mask, *cpu_map); 7122 group = cpumask_first(mask);
7097 group = first_cpu(*mask);
7098 if (sg) 7123 if (sg)
7099 *sg = &per_cpu(sched_group_core, group); 7124 *sg = &per_cpu(sched_group_core, group).sg;
7100 return group; 7125 return group;
7101} 7126}
7102#elif defined(CONFIG_SCHED_MC) 7127#elif defined(CONFIG_SCHED_MC)
7103static int 7128static int
7104cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, 7129cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
7105 cpumask_t *unused) 7130 struct sched_group **sg, struct cpumask *unused)
7106{ 7131{
7107 if (sg) 7132 if (sg)
7108 *sg = &per_cpu(sched_group_core, cpu); 7133 *sg = &per_cpu(sched_group_core, cpu).sg;
7109 return cpu; 7134 return cpu;
7110} 7135}
7111#endif 7136#endif
7112 7137
7113static DEFINE_PER_CPU(struct sched_domain, phys_domains); 7138static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
7114static DEFINE_PER_CPU(struct sched_group, sched_group_phys); 7139static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
7115 7140
7116static int 7141static int
7117cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, 7142cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
7118 cpumask_t *mask) 7143 struct sched_group **sg, struct cpumask *mask)
7119{ 7144{
7120 int group; 7145 int group;
7121#ifdef CONFIG_SCHED_MC 7146#ifdef CONFIG_SCHED_MC
7147 /* FIXME: Use cpu_coregroup_mask. */
7122 *mask = cpu_coregroup_map(cpu); 7148 *mask = cpu_coregroup_map(cpu);
7123 cpus_and(*mask, *mask, *cpu_map); 7149 cpus_and(*mask, *mask, *cpu_map);
7124 group = first_cpu(*mask); 7150 group = cpumask_first(mask);
7125#elif defined(CONFIG_SCHED_SMT) 7151#elif defined(CONFIG_SCHED_SMT)
7126 *mask = per_cpu(cpu_sibling_map, cpu); 7152 cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
7127 cpus_and(*mask, *mask, *cpu_map); 7153 group = cpumask_first(mask);
7128 group = first_cpu(*mask);
7129#else 7154#else
7130 group = cpu; 7155 group = cpu;
7131#endif 7156#endif
7132 if (sg) 7157 if (sg)
7133 *sg = &per_cpu(sched_group_phys, group); 7158 *sg = &per_cpu(sched_group_phys, group).sg;
7134 return group; 7159 return group;
7135} 7160}
7136 7161
@@ -7144,19 +7169,21 @@ static DEFINE_PER_CPU(struct sched_domain, node_domains);
7144static struct sched_group ***sched_group_nodes_bycpu; 7169static struct sched_group ***sched_group_nodes_bycpu;
7145 7170
7146static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); 7171static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
7147static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); 7172static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
7148 7173
7149static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, 7174static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
7150 struct sched_group **sg, cpumask_t *nodemask) 7175 struct sched_group **sg,
7176 struct cpumask *nodemask)
7151{ 7177{
7152 int group; 7178 int group;
7179 /* FIXME: use cpumask_of_node */
7180 node_to_cpumask_ptr(pnodemask, cpu_to_node(cpu));
7153 7181
7154 *nodemask = node_to_cpumask(cpu_to_node(cpu)); 7182 cpumask_and(nodemask, pnodemask, cpu_map);
7155 cpus_and(*nodemask, *nodemask, *cpu_map); 7183 group = cpumask_first(nodemask);
7156 group = first_cpu(*nodemask);
7157 7184
7158 if (sg) 7185 if (sg)
7159 *sg = &per_cpu(sched_group_allnodes, group); 7186 *sg = &per_cpu(sched_group_allnodes, group).sg;
7160 return group; 7187 return group;
7161} 7188}
7162 7189
@@ -7168,11 +7195,11 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
7168 if (!sg) 7195 if (!sg)
7169 return; 7196 return;
7170 do { 7197 do {
7171 for_each_cpu_mask_nr(j, sg->cpumask) { 7198 for_each_cpu(j, sched_group_cpus(sg)) {
7172 struct sched_domain *sd; 7199 struct sched_domain *sd;
7173 7200
7174 sd = &per_cpu(phys_domains, j); 7201 sd = &per_cpu(phys_domains, j).sd;
7175 if (j != first_cpu(sd->groups->cpumask)) { 7202 if (j != cpumask_first(sched_group_cpus(sd->groups))) {
7176 /* 7203 /*
7177 * Only add "power" once for each 7204 * Only add "power" once for each
7178 * physical package. 7205 * physical package.
@@ -7189,11 +7216,12 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
7189 7216
7190#ifdef CONFIG_NUMA 7217#ifdef CONFIG_NUMA
7191/* Free memory allocated for various sched_group structures */ 7218/* Free memory allocated for various sched_group structures */
7192static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) 7219static void free_sched_groups(const struct cpumask *cpu_map,
7220 struct cpumask *nodemask)
7193{ 7221{
7194 int cpu, i; 7222 int cpu, i;
7195 7223
7196 for_each_cpu_mask_nr(cpu, *cpu_map) { 7224 for_each_cpu(cpu, cpu_map) {
7197 struct sched_group **sched_group_nodes 7225 struct sched_group **sched_group_nodes
7198 = sched_group_nodes_bycpu[cpu]; 7226 = sched_group_nodes_bycpu[cpu];
7199 7227
@@ -7202,10 +7230,11 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
7202 7230
7203 for (i = 0; i < nr_node_ids; i++) { 7231 for (i = 0; i < nr_node_ids; i++) {
7204 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 7232 struct sched_group *oldsg, *sg = sched_group_nodes[i];
7233 /* FIXME: Use cpumask_of_node */
7234 node_to_cpumask_ptr(pnodemask, i);
7205 7235
7206 *nodemask = node_to_cpumask(i); 7236 cpus_and(*nodemask, *pnodemask, *cpu_map);
7207 cpus_and(*nodemask, *nodemask, *cpu_map); 7237 if (cpumask_empty(nodemask))
7208 if (cpus_empty(*nodemask))
7209 continue; 7238 continue;
7210 7239
7211 if (sg == NULL) 7240 if (sg == NULL)
@@ -7223,7 +7252,8 @@ next_sg:
7223 } 7252 }
7224} 7253}
7225#else /* !CONFIG_NUMA */ 7254#else /* !CONFIG_NUMA */
7226static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) 7255static void free_sched_groups(const struct cpumask *cpu_map,
7256 struct cpumask *nodemask)
7227{ 7257{
7228} 7258}
7229#endif /* CONFIG_NUMA */ 7259#endif /* CONFIG_NUMA */
@@ -7249,7 +7279,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7249 7279
7250 WARN_ON(!sd || !sd->groups); 7280 WARN_ON(!sd || !sd->groups);
7251 7281
7252 if (cpu != first_cpu(sd->groups->cpumask)) 7282 if (cpu != cpumask_first(sched_group_cpus(sd->groups)))
7253 return; 7283 return;
7254 7284
7255 child = sd->child; 7285 child = sd->child;
@@ -7314,40 +7344,6 @@ SD_INIT_FUNC(CPU)
7314 SD_INIT_FUNC(MC) 7344 SD_INIT_FUNC(MC)
7315#endif 7345#endif
7316 7346
7317/*
7318 * To minimize stack usage kmalloc room for cpumasks and share the
7319 * space as the usage in build_sched_domains() dictates. Used only
7320 * if the amount of space is significant.
7321 */
7322struct allmasks {
7323 cpumask_t tmpmask; /* make this one first */
7324 union {
7325 cpumask_t nodemask;
7326 cpumask_t this_sibling_map;
7327 cpumask_t this_core_map;
7328 };
7329 cpumask_t send_covered;
7330
7331#ifdef CONFIG_NUMA
7332 cpumask_t domainspan;
7333 cpumask_t covered;
7334 cpumask_t notcovered;
7335#endif
7336};
7337
7338#if NR_CPUS > 128
7339#define SCHED_CPUMASK_ALLOC 1
7340#define SCHED_CPUMASK_FREE(v) kfree(v)
7341#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v
7342#else
7343#define SCHED_CPUMASK_ALLOC 0
7344#define SCHED_CPUMASK_FREE(v)
7345#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v
7346#endif
7347
7348#define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \
7349 ((unsigned long)(a) + offsetof(struct allmasks, v))
7350
7351static int default_relax_domain_level = -1; 7347static int default_relax_domain_level = -1;
7352 7348
7353static int __init setup_relax_domain_level(char *str) 7349static int __init setup_relax_domain_level(char *str)
@@ -7387,17 +7383,38 @@ static void set_domain_attribute(struct sched_domain *sd,
7387 * Build sched domains for a given set of cpus and attach the sched domains 7383 * Build sched domains for a given set of cpus and attach the sched domains
7388 * to the individual cpus 7384 * to the individual cpus
7389 */ 7385 */
7390static int __build_sched_domains(const cpumask_t *cpu_map, 7386static int __build_sched_domains(const struct cpumask *cpu_map,
7391 struct sched_domain_attr *attr) 7387 struct sched_domain_attr *attr)
7392{ 7388{
7393 int i; 7389 int i, err = -ENOMEM;
7394 struct root_domain *rd; 7390 struct root_domain *rd;
7395 SCHED_CPUMASK_DECLARE(allmasks); 7391 cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered,
7396 cpumask_t *tmpmask; 7392 tmpmask;
7397#ifdef CONFIG_NUMA 7393#ifdef CONFIG_NUMA
7394 cpumask_var_t domainspan, covered, notcovered;
7398 struct sched_group **sched_group_nodes = NULL; 7395 struct sched_group **sched_group_nodes = NULL;
7399 int sd_allnodes = 0; 7396 int sd_allnodes = 0;
7400 7397
7398 if (!alloc_cpumask_var(&domainspan, GFP_KERNEL))
7399 goto out;
7400 if (!alloc_cpumask_var(&covered, GFP_KERNEL))
7401 goto free_domainspan;
7402 if (!alloc_cpumask_var(&notcovered, GFP_KERNEL))
7403 goto free_covered;
7404#endif
7405
7406 if (!alloc_cpumask_var(&nodemask, GFP_KERNEL))
7407 goto free_notcovered;
7408 if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL))
7409 goto free_nodemask;
7410 if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL))
7411 goto free_this_sibling_map;
7412 if (!alloc_cpumask_var(&send_covered, GFP_KERNEL))
7413 goto free_this_core_map;
7414 if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
7415 goto free_send_covered;
7416
7417#ifdef CONFIG_NUMA
7401 /* 7418 /*
7402 * Allocate the per-node list of sched groups 7419 * Allocate the per-node list of sched groups
7403 */ 7420 */
@@ -7405,55 +7422,37 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7405 GFP_KERNEL); 7422 GFP_KERNEL);
7406 if (!sched_group_nodes) { 7423 if (!sched_group_nodes) {
7407 printk(KERN_WARNING "Can not alloc sched group node list\n"); 7424 printk(KERN_WARNING "Can not alloc sched group node list\n");
7408 return -ENOMEM; 7425 goto free_tmpmask;
7409 } 7426 }
7410#endif 7427#endif
7411 7428
7412 rd = alloc_rootdomain(); 7429 rd = alloc_rootdomain();
7413 if (!rd) { 7430 if (!rd) {
7414 printk(KERN_WARNING "Cannot alloc root domain\n"); 7431 printk(KERN_WARNING "Cannot alloc root domain\n");
7415#ifdef CONFIG_NUMA 7432 goto free_sched_groups;
7416 kfree(sched_group_nodes);
7417#endif
7418 return -ENOMEM;
7419 } 7433 }
7420 7434
7421#if SCHED_CPUMASK_ALLOC
7422 /* get space for all scratch cpumask variables */
7423 allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL);
7424 if (!allmasks) {
7425 printk(KERN_WARNING "Cannot alloc cpumask array\n");
7426 kfree(rd);
7427#ifdef CONFIG_NUMA 7435#ifdef CONFIG_NUMA
7428 kfree(sched_group_nodes); 7436 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes;
7429#endif
7430 return -ENOMEM;
7431 }
7432#endif
7433 tmpmask = (cpumask_t *)allmasks;
7434
7435
7436#ifdef CONFIG_NUMA
7437 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
7438#endif 7437#endif
7439 7438
7440 /* 7439 /*
7441 * Set up domains for cpus specified by the cpu_map. 7440 * Set up domains for cpus specified by the cpu_map.
7442 */ 7441 */
7443 for_each_cpu_mask_nr(i, *cpu_map) { 7442 for_each_cpu(i, cpu_map) {
7444 struct sched_domain *sd = NULL, *p; 7443 struct sched_domain *sd = NULL, *p;
7445 SCHED_CPUMASK_VAR(nodemask, allmasks);
7446 7444
7445 /* FIXME: use cpumask_of_node */
7447 *nodemask = node_to_cpumask(cpu_to_node(i)); 7446 *nodemask = node_to_cpumask(cpu_to_node(i));
7448 cpus_and(*nodemask, *nodemask, *cpu_map); 7447 cpus_and(*nodemask, *nodemask, *cpu_map);
7449 7448
7450#ifdef CONFIG_NUMA 7449#ifdef CONFIG_NUMA
7451 if (cpus_weight(*cpu_map) > 7450 if (cpumask_weight(cpu_map) >
7452 SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) { 7451 SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) {
7453 sd = &per_cpu(allnodes_domains, i); 7452 sd = &per_cpu(allnodes_domains, i);
7454 SD_INIT(sd, ALLNODES); 7453 SD_INIT(sd, ALLNODES);
7455 set_domain_attribute(sd, attr); 7454 set_domain_attribute(sd, attr);
7456 sd->span = *cpu_map; 7455 cpumask_copy(sched_domain_span(sd), cpu_map);
7457 cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); 7456 cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
7458 p = sd; 7457 p = sd;
7459 sd_allnodes = 1; 7458 sd_allnodes = 1;
@@ -7463,18 +7462,19 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7463 sd = &per_cpu(node_domains, i); 7462 sd = &per_cpu(node_domains, i);
7464 SD_INIT(sd, NODE); 7463 SD_INIT(sd, NODE);
7465 set_domain_attribute(sd, attr); 7464 set_domain_attribute(sd, attr);
7466 sched_domain_node_span(cpu_to_node(i), &sd->span); 7465 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
7467 sd->parent = p; 7466 sd->parent = p;
7468 if (p) 7467 if (p)
7469 p->child = sd; 7468 p->child = sd;
7470 cpus_and(sd->span, sd->span, *cpu_map); 7469 cpumask_and(sched_domain_span(sd),
7470 sched_domain_span(sd), cpu_map);
7471#endif 7471#endif
7472 7472
7473 p = sd; 7473 p = sd;
7474 sd = &per_cpu(phys_domains, i); 7474 sd = &per_cpu(phys_domains, i).sd;
7475 SD_INIT(sd, CPU); 7475 SD_INIT(sd, CPU);
7476 set_domain_attribute(sd, attr); 7476 set_domain_attribute(sd, attr);
7477 sd->span = *nodemask; 7477 cpumask_copy(sched_domain_span(sd), nodemask);
7478 sd->parent = p; 7478 sd->parent = p;
7479 if (p) 7479 if (p)
7480 p->child = sd; 7480 p->child = sd;
@@ -7482,11 +7482,12 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7482 7482
7483#ifdef CONFIG_SCHED_MC 7483#ifdef CONFIG_SCHED_MC
7484 p = sd; 7484 p = sd;
7485 sd = &per_cpu(core_domains, i); 7485 sd = &per_cpu(core_domains, i).sd;
7486 SD_INIT(sd, MC); 7486 SD_INIT(sd, MC);
7487 set_domain_attribute(sd, attr); 7487 set_domain_attribute(sd, attr);
7488 sd->span = cpu_coregroup_map(i); 7488 *sched_domain_span(sd) = cpu_coregroup_map(i);
7489 cpus_and(sd->span, sd->span, *cpu_map); 7489 cpumask_and(sched_domain_span(sd),
7490 sched_domain_span(sd), cpu_map);
7490 sd->parent = p; 7491 sd->parent = p;
7491 p->child = sd; 7492 p->child = sd;
7492 cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); 7493 cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
@@ -7494,11 +7495,11 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7494 7495
7495#ifdef CONFIG_SCHED_SMT 7496#ifdef CONFIG_SCHED_SMT
7496 p = sd; 7497 p = sd;
7497 sd = &per_cpu(cpu_domains, i); 7498 sd = &per_cpu(cpu_domains, i).sd;
7498 SD_INIT(sd, SIBLING); 7499 SD_INIT(sd, SIBLING);
7499 set_domain_attribute(sd, attr); 7500 set_domain_attribute(sd, attr);
7500 sd->span = per_cpu(cpu_sibling_map, i); 7501 cpumask_and(sched_domain_span(sd),
7501 cpus_and(sd->span, sd->span, *cpu_map); 7502 &per_cpu(cpu_sibling_map, i), cpu_map);
7502 sd->parent = p; 7503 sd->parent = p;
7503 p->child = sd; 7504 p->child = sd;
7504 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); 7505 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
@@ -7507,13 +7508,10 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7507 7508
7508#ifdef CONFIG_SCHED_SMT 7509#ifdef CONFIG_SCHED_SMT
7509 /* Set up CPU (sibling) groups */ 7510 /* Set up CPU (sibling) groups */
7510 for_each_cpu_mask_nr(i, *cpu_map) { 7511 for_each_cpu(i, cpu_map) {
7511 SCHED_CPUMASK_VAR(this_sibling_map, allmasks); 7512 cpumask_and(this_sibling_map,
7512 SCHED_CPUMASK_VAR(send_covered, allmasks); 7513 &per_cpu(cpu_sibling_map, i), cpu_map);
7513 7514 if (i != cpumask_first(this_sibling_map))
7514 *this_sibling_map = per_cpu(cpu_sibling_map, i);
7515 cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map);
7516 if (i != first_cpu(*this_sibling_map))
7517 continue; 7515 continue;
7518 7516
7519 init_sched_build_groups(this_sibling_map, cpu_map, 7517 init_sched_build_groups(this_sibling_map, cpu_map,
@@ -7524,13 +7522,11 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7524 7522
7525#ifdef CONFIG_SCHED_MC 7523#ifdef CONFIG_SCHED_MC
7526 /* Set up multi-core groups */ 7524 /* Set up multi-core groups */
7527 for_each_cpu_mask_nr(i, *cpu_map) { 7525 for_each_cpu(i, cpu_map) {
7528 SCHED_CPUMASK_VAR(this_core_map, allmasks); 7526 /* FIXME: Use cpu_coregroup_mask */
7529 SCHED_CPUMASK_VAR(send_covered, allmasks);
7530
7531 *this_core_map = cpu_coregroup_map(i); 7527 *this_core_map = cpu_coregroup_map(i);
7532 cpus_and(*this_core_map, *this_core_map, *cpu_map); 7528 cpus_and(*this_core_map, *this_core_map, *cpu_map);
7533 if (i != first_cpu(*this_core_map)) 7529 if (i != cpumask_first(this_core_map))
7534 continue; 7530 continue;
7535 7531
7536 init_sched_build_groups(this_core_map, cpu_map, 7532 init_sched_build_groups(this_core_map, cpu_map,
@@ -7541,12 +7537,10 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7541 7537
7542 /* Set up physical groups */ 7538 /* Set up physical groups */
7543 for (i = 0; i < nr_node_ids; i++) { 7539 for (i = 0; i < nr_node_ids; i++) {
7544 SCHED_CPUMASK_VAR(nodemask, allmasks); 7540 /* FIXME: Use cpumask_of_node */
7545 SCHED_CPUMASK_VAR(send_covered, allmasks);
7546
7547 *nodemask = node_to_cpumask(i); 7541 *nodemask = node_to_cpumask(i);
7548 cpus_and(*nodemask, *nodemask, *cpu_map); 7542 cpus_and(*nodemask, *nodemask, *cpu_map);
7549 if (cpus_empty(*nodemask)) 7543 if (cpumask_empty(nodemask))
7550 continue; 7544 continue;
7551 7545
7552 init_sched_build_groups(nodemask, cpu_map, 7546 init_sched_build_groups(nodemask, cpu_map,
@@ -7557,8 +7551,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7557#ifdef CONFIG_NUMA 7551#ifdef CONFIG_NUMA
7558 /* Set up node groups */ 7552 /* Set up node groups */
7559 if (sd_allnodes) { 7553 if (sd_allnodes) {
7560 SCHED_CPUMASK_VAR(send_covered, allmasks);
7561
7562 init_sched_build_groups(cpu_map, cpu_map, 7554 init_sched_build_groups(cpu_map, cpu_map,
7563 &cpu_to_allnodes_group, 7555 &cpu_to_allnodes_group,
7564 send_covered, tmpmask); 7556 send_covered, tmpmask);
@@ -7567,58 +7559,58 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7567 for (i = 0; i < nr_node_ids; i++) { 7559 for (i = 0; i < nr_node_ids; i++) {
7568 /* Set up node groups */ 7560 /* Set up node groups */
7569 struct sched_group *sg, *prev; 7561 struct sched_group *sg, *prev;
7570 SCHED_CPUMASK_VAR(nodemask, allmasks);
7571 SCHED_CPUMASK_VAR(domainspan, allmasks);
7572 SCHED_CPUMASK_VAR(covered, allmasks);
7573 int j; 7562 int j;
7574 7563
7564 /* FIXME: Use cpumask_of_node */
7575 *nodemask = node_to_cpumask(i); 7565 *nodemask = node_to_cpumask(i);
7576 cpus_clear(*covered); 7566 cpumask_clear(covered);
7577 7567
7578 cpus_and(*nodemask, *nodemask, *cpu_map); 7568 cpus_and(*nodemask, *nodemask, *cpu_map);
7579 if (cpus_empty(*nodemask)) { 7569 if (cpumask_empty(nodemask)) {
7580 sched_group_nodes[i] = NULL; 7570 sched_group_nodes[i] = NULL;
7581 continue; 7571 continue;
7582 } 7572 }
7583 7573
7584 sched_domain_node_span(i, domainspan); 7574 sched_domain_node_span(i, domainspan);
7585 cpus_and(*domainspan, *domainspan, *cpu_map); 7575 cpumask_and(domainspan, domainspan, cpu_map);
7586 7576
7587 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); 7577 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
7578 GFP_KERNEL, i);
7588 if (!sg) { 7579 if (!sg) {
7589 printk(KERN_WARNING "Can not alloc domain group for " 7580 printk(KERN_WARNING "Can not alloc domain group for "
7590 "node %d\n", i); 7581 "node %d\n", i);
7591 goto error; 7582 goto error;
7592 } 7583 }
7593 sched_group_nodes[i] = sg; 7584 sched_group_nodes[i] = sg;
7594 for_each_cpu_mask_nr(j, *nodemask) { 7585 for_each_cpu(j, nodemask) {
7595 struct sched_domain *sd; 7586 struct sched_domain *sd;
7596 7587
7597 sd = &per_cpu(node_domains, j); 7588 sd = &per_cpu(node_domains, j);
7598 sd->groups = sg; 7589 sd->groups = sg;
7599 } 7590 }
7600 sg->__cpu_power = 0; 7591 sg->__cpu_power = 0;
7601 sg->cpumask = *nodemask; 7592 cpumask_copy(sched_group_cpus(sg), nodemask);
7602 sg->next = sg; 7593 sg->next = sg;
7603 cpus_or(*covered, *covered, *nodemask); 7594 cpumask_or(covered, covered, nodemask);
7604 prev = sg; 7595 prev = sg;
7605 7596
7606 for (j = 0; j < nr_node_ids; j++) { 7597 for (j = 0; j < nr_node_ids; j++) {
7607 SCHED_CPUMASK_VAR(notcovered, allmasks);
7608 int n = (i + j) % nr_node_ids; 7598 int n = (i + j) % nr_node_ids;
7599 /* FIXME: Use cpumask_of_node */
7609 node_to_cpumask_ptr(pnodemask, n); 7600 node_to_cpumask_ptr(pnodemask, n);
7610 7601
7611 cpus_complement(*notcovered, *covered); 7602 cpumask_complement(notcovered, covered);
7612 cpus_and(*tmpmask, *notcovered, *cpu_map); 7603 cpumask_and(tmpmask, notcovered, cpu_map);
7613 cpus_and(*tmpmask, *tmpmask, *domainspan); 7604 cpumask_and(tmpmask, tmpmask, domainspan);
7614 if (cpus_empty(*tmpmask)) 7605 if (cpumask_empty(tmpmask))
7615 break; 7606 break;
7616 7607
7617 cpus_and(*tmpmask, *tmpmask, *pnodemask); 7608 cpumask_and(tmpmask, tmpmask, pnodemask);
7618 if (cpus_empty(*tmpmask)) 7609 if (cpumask_empty(tmpmask))
7619 continue; 7610 continue;
7620 7611
7621 sg = kmalloc_node(sizeof(struct sched_group), 7612 sg = kmalloc_node(sizeof(struct sched_group) +
7613 cpumask_size(),
7622 GFP_KERNEL, i); 7614 GFP_KERNEL, i);
7623 if (!sg) { 7615 if (!sg) {
7624 printk(KERN_WARNING 7616 printk(KERN_WARNING
@@ -7626,9 +7618,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7626 goto error; 7618 goto error;
7627 } 7619 }
7628 sg->__cpu_power = 0; 7620 sg->__cpu_power = 0;
7629 sg->cpumask = *tmpmask; 7621 cpumask_copy(sched_group_cpus(sg), tmpmask);
7630 sg->next = prev->next; 7622 sg->next = prev->next;
7631 cpus_or(*covered, *covered, *tmpmask); 7623 cpumask_or(covered, covered, tmpmask);
7632 prev->next = sg; 7624 prev->next = sg;
7633 prev = sg; 7625 prev = sg;
7634 } 7626 }
@@ -7637,22 +7629,22 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7637 7629
7638 /* Calculate CPU power for physical packages and nodes */ 7630 /* Calculate CPU power for physical packages and nodes */
7639#ifdef CONFIG_SCHED_SMT 7631#ifdef CONFIG_SCHED_SMT
7640 for_each_cpu_mask_nr(i, *cpu_map) { 7632 for_each_cpu(i, cpu_map) {
7641 struct sched_domain *sd = &per_cpu(cpu_domains, i); 7633 struct sched_domain *sd = &per_cpu(cpu_domains, i).sd;
7642 7634
7643 init_sched_groups_power(i, sd); 7635 init_sched_groups_power(i, sd);
7644 } 7636 }
7645#endif 7637#endif
7646#ifdef CONFIG_SCHED_MC 7638#ifdef CONFIG_SCHED_MC
7647 for_each_cpu_mask_nr(i, *cpu_map) { 7639 for_each_cpu(i, cpu_map) {
7648 struct sched_domain *sd = &per_cpu(core_domains, i); 7640 struct sched_domain *sd = &per_cpu(core_domains, i).sd;
7649 7641
7650 init_sched_groups_power(i, sd); 7642 init_sched_groups_power(i, sd);
7651 } 7643 }
7652#endif 7644#endif
7653 7645
7654 for_each_cpu_mask_nr(i, *cpu_map) { 7646 for_each_cpu(i, cpu_map) {
7655 struct sched_domain *sd = &per_cpu(phys_domains, i); 7647 struct sched_domain *sd = &per_cpu(phys_domains, i).sd;
7656 7648
7657 init_sched_groups_power(i, sd); 7649 init_sched_groups_power(i, sd);
7658 } 7650 }
@@ -7664,56 +7656,87 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7664 if (sd_allnodes) { 7656 if (sd_allnodes) {
7665 struct sched_group *sg; 7657 struct sched_group *sg;
7666 7658
7667 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg, 7659 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
7668 tmpmask); 7660 tmpmask);
7669 init_numa_sched_groups_power(sg); 7661 init_numa_sched_groups_power(sg);
7670 } 7662 }
7671#endif 7663#endif
7672 7664
7673 /* Attach the domains */ 7665 /* Attach the domains */
7674 for_each_cpu_mask_nr(i, *cpu_map) { 7666 for_each_cpu(i, cpu_map) {
7675 struct sched_domain *sd; 7667 struct sched_domain *sd;
7676#ifdef CONFIG_SCHED_SMT 7668#ifdef CONFIG_SCHED_SMT
7677 sd = &per_cpu(cpu_domains, i); 7669 sd = &per_cpu(cpu_domains, i).sd;
7678#elif defined(CONFIG_SCHED_MC) 7670#elif defined(CONFIG_SCHED_MC)
7679 sd = &per_cpu(core_domains, i); 7671 sd = &per_cpu(core_domains, i).sd;
7680#else 7672#else
7681 sd = &per_cpu(phys_domains, i); 7673 sd = &per_cpu(phys_domains, i).sd;
7682#endif 7674#endif
7683 cpu_attach_domain(sd, rd, i); 7675 cpu_attach_domain(sd, rd, i);
7684 } 7676 }
7685 7677
7686 SCHED_CPUMASK_FREE((void *)allmasks); 7678 err = 0;
7687 return 0; 7679
7680free_tmpmask:
7681 free_cpumask_var(tmpmask);
7682free_send_covered:
7683 free_cpumask_var(send_covered);
7684free_this_core_map:
7685 free_cpumask_var(this_core_map);
7686free_this_sibling_map:
7687 free_cpumask_var(this_sibling_map);
7688free_nodemask:
7689 free_cpumask_var(nodemask);
7690free_notcovered:
7691#ifdef CONFIG_NUMA
7692 free_cpumask_var(notcovered);
7693free_covered:
7694 free_cpumask_var(covered);
7695free_domainspan:
7696 free_cpumask_var(domainspan);
7697out:
7698#endif
7699 return err;
7700
7701free_sched_groups:
7702#ifdef CONFIG_NUMA
7703 kfree(sched_group_nodes);
7704#endif
7705 goto free_tmpmask;
7688 7706
7689#ifdef CONFIG_NUMA 7707#ifdef CONFIG_NUMA
7690error: 7708error:
7691 free_sched_groups(cpu_map, tmpmask); 7709 free_sched_groups(cpu_map, tmpmask);
7692 SCHED_CPUMASK_FREE((void *)allmasks); 7710 free_rootdomain(rd);
7693 kfree(rd); 7711 goto free_tmpmask;
7694 return -ENOMEM;
7695#endif 7712#endif
7696} 7713}
7697 7714
7698static int build_sched_domains(const cpumask_t *cpu_map) 7715static int build_sched_domains(const struct cpumask *cpu_map)
7699{ 7716{
7700 return __build_sched_domains(cpu_map, NULL); 7717 return __build_sched_domains(cpu_map, NULL);
7701} 7718}
7702 7719
7703static cpumask_t *doms_cur; /* current sched domains */ 7720static struct cpumask *doms_cur; /* current sched domains */
7704static int ndoms_cur; /* number of sched domains in 'doms_cur' */ 7721static int ndoms_cur; /* number of sched domains in 'doms_cur' */
7705static struct sched_domain_attr *dattr_cur; 7722static struct sched_domain_attr *dattr_cur;
7706 /* attribues of custom domains in 'doms_cur' */ 7723 /* attribues of custom domains in 'doms_cur' */
7707 7724
7708/* 7725/*
7709 * Special case: If a kmalloc of a doms_cur partition (array of 7726 * Special case: If a kmalloc of a doms_cur partition (array of
7710 * cpumask_t) fails, then fallback to a single sched domain, 7727 * cpumask) fails, then fallback to a single sched domain,
7711 * as determined by the single cpumask_t fallback_doms. 7728 * as determined by the single cpumask fallback_doms.
7712 */ 7729 */
7713static cpumask_t fallback_doms; 7730static cpumask_var_t fallback_doms;
7714 7731
7715void __attribute__((weak)) arch_update_cpu_topology(void) 7732/*
7733 * arch_update_cpu_topology lets virtualized architectures update the
7734 * cpu core maps. It is supposed to return 1 if the topology changed
7735 * or 0 if it stayed the same.
7736 */
7737int __attribute__((weak)) arch_update_cpu_topology(void)
7716{ 7738{
7739 return 0;
7717} 7740}
7718 7741
7719/* 7742/*
@@ -7721,16 +7744,16 @@ void __attribute__((weak)) arch_update_cpu_topology(void)
7721 * For now this just excludes isolated cpus, but could be used to 7744 * For now this just excludes isolated cpus, but could be used to
7722 * exclude other special cases in the future. 7745 * exclude other special cases in the future.
7723 */ 7746 */
7724static int arch_init_sched_domains(const cpumask_t *cpu_map) 7747static int arch_init_sched_domains(const struct cpumask *cpu_map)
7725{ 7748{
7726 int err; 7749 int err;
7727 7750
7728 arch_update_cpu_topology(); 7751 arch_update_cpu_topology();
7729 ndoms_cur = 1; 7752 ndoms_cur = 1;
7730 doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL); 7753 doms_cur = kmalloc(cpumask_size(), GFP_KERNEL);
7731 if (!doms_cur) 7754 if (!doms_cur)
7732 doms_cur = &fallback_doms; 7755 doms_cur = fallback_doms;
7733 cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map); 7756 cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map);
7734 dattr_cur = NULL; 7757 dattr_cur = NULL;
7735 err = build_sched_domains(doms_cur); 7758 err = build_sched_domains(doms_cur);
7736 register_sched_domain_sysctl(); 7759 register_sched_domain_sysctl();
@@ -7738,8 +7761,8 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map)
7738 return err; 7761 return err;
7739} 7762}
7740 7763
7741static void arch_destroy_sched_domains(const cpumask_t *cpu_map, 7764static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
7742 cpumask_t *tmpmask) 7765 struct cpumask *tmpmask)
7743{ 7766{
7744 free_sched_groups(cpu_map, tmpmask); 7767 free_sched_groups(cpu_map, tmpmask);
7745} 7768}
@@ -7748,17 +7771,16 @@ static void arch_destroy_sched_domains(const cpumask_t *cpu_map,
7748 * Detach sched domains from a group of cpus specified in cpu_map 7771 * Detach sched domains from a group of cpus specified in cpu_map
7749 * These cpus will now be attached to the NULL domain 7772 * These cpus will now be attached to the NULL domain
7750 */ 7773 */
7751static void detach_destroy_domains(const cpumask_t *cpu_map) 7774static void detach_destroy_domains(const struct cpumask *cpu_map)
7752{ 7775{
7753 cpumask_t tmpmask; 7776 /* Save because hotplug lock held. */
7777 static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
7754 int i; 7778 int i;
7755 7779
7756 unregister_sched_domain_sysctl(); 7780 for_each_cpu(i, cpu_map)
7757
7758 for_each_cpu_mask_nr(i, *cpu_map)
7759 cpu_attach_domain(NULL, &def_root_domain, i); 7781 cpu_attach_domain(NULL, &def_root_domain, i);
7760 synchronize_sched(); 7782 synchronize_sched();
7761 arch_destroy_sched_domains(cpu_map, &tmpmask); 7783 arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
7762} 7784}
7763 7785
7764/* handle null as "default" */ 7786/* handle null as "default" */
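detach_destroy_domains() above avoids both a large on-stack cpumask_t and an allocation by using a function-local static bitmap, which is safe only because callers hold the hotplug lock, as its comment notes. The same idiom in isolation; the names and the scratch work are placeholders:

    #include <linux/cpumask.h>
    #include <linux/types.h>

    static void walk_cpus(const struct cpumask *cpu_map)
    {
            /* Static, not on-stack: fine only while callers are serialized. */
            static DECLARE_BITMAP(tmp, CONFIG_NR_CPUS);
            struct cpumask *tmpmask = to_cpumask(tmp);

            cpumask_copy(tmpmask, cpu_map);
            /* ... scratch work on tmpmask ... */
    }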
@@ -7783,7 +7805,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7783 * doms_new[] to the current sched domain partitioning, doms_cur[]. 7805 * doms_new[] to the current sched domain partitioning, doms_cur[].
7784 * It destroys each deleted domain and builds each new domain. 7806 * It destroys each deleted domain and builds each new domain.
7785 * 7807 *
7786 * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'. 7808 * 'doms_new' is an array of cpumask's of length 'ndoms_new'.
7787 * The masks don't intersect (don't overlap.) We should setup one 7809 * The masks don't intersect (don't overlap.) We should setup one
7788 * sched domain for each mask. CPUs not in any of the cpumasks will 7810 * sched domain for each mask. CPUs not in any of the cpumasks will
7789 * not be load balanced. If the same cpumask appears both in the 7811 * not be load balanced. If the same cpumask appears both in the
@@ -7797,28 +7819,33 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7797 * the single partition 'fallback_doms', it also forces the domains 7819 * the single partition 'fallback_doms', it also forces the domains
7798 * to be rebuilt. 7820 * to be rebuilt.
7799 * 7821 *
7800 * If doms_new == NULL it will be replaced with cpu_online_map. 7822 * If doms_new == NULL it will be replaced with cpu_online_mask.
7801 * ndoms_new == 0 is a special case for destroying existing domains, 7823 * ndoms_new == 0 is a special case for destroying existing domains,
7802 * and it will not create the default domain. 7824 * and it will not create the default domain.
7803 * 7825 *
7804 * Call with hotplug lock held 7826 * Call with hotplug lock held
7805 */ 7827 */
7806void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, 7828/* FIXME: Change to struct cpumask *doms_new[] */
7829void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
7807 struct sched_domain_attr *dattr_new) 7830 struct sched_domain_attr *dattr_new)
7808{ 7831{
7809 int i, j, n; 7832 int i, j, n;
7833 int new_topology;
7810 7834
7811 mutex_lock(&sched_domains_mutex); 7835 mutex_lock(&sched_domains_mutex);
7812 7836
7813 /* always unregister in case we don't destroy any domains */ 7837 /* always unregister in case we don't destroy any domains */
7814 unregister_sched_domain_sysctl(); 7838 unregister_sched_domain_sysctl();
7815 7839
7840 /* Let architecture update cpu core mappings. */
7841 new_topology = arch_update_cpu_topology();
7842
7816 n = doms_new ? ndoms_new : 0; 7843 n = doms_new ? ndoms_new : 0;
7817 7844
7818 /* Destroy deleted domains */ 7845 /* Destroy deleted domains */
7819 for (i = 0; i < ndoms_cur; i++) { 7846 for (i = 0; i < ndoms_cur; i++) {
7820 for (j = 0; j < n; j++) { 7847 for (j = 0; j < n && !new_topology; j++) {
7821 if (cpus_equal(doms_cur[i], doms_new[j]) 7848 if (cpumask_equal(&doms_cur[i], &doms_new[j])
7822 && dattrs_equal(dattr_cur, i, dattr_new, j)) 7849 && dattrs_equal(dattr_cur, i, dattr_new, j))
7823 goto match1; 7850 goto match1;
7824 } 7851 }
@@ -7830,15 +7857,15 @@ match1:
7830 7857
7831 if (doms_new == NULL) { 7858 if (doms_new == NULL) {
7832 ndoms_cur = 0; 7859 ndoms_cur = 0;
7833 doms_new = &fallback_doms; 7860 doms_new = fallback_doms;
7834 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); 7861 cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map);
7835 dattr_new = NULL; 7862 WARN_ON_ONCE(dattr_new);
7836 } 7863 }
7837 7864
7838 /* Build new domains */ 7865 /* Build new domains */
7839 for (i = 0; i < ndoms_new; i++) { 7866 for (i = 0; i < ndoms_new; i++) {
7840 for (j = 0; j < ndoms_cur; j++) { 7867 for (j = 0; j < ndoms_cur && !new_topology; j++) {
7841 if (cpus_equal(doms_new[i], doms_cur[j]) 7868 if (cpumask_equal(&doms_new[i], &doms_cur[j])
7842 && dattrs_equal(dattr_new, i, dattr_cur, j)) 7869 && dattrs_equal(dattr_new, i, dattr_cur, j))
7843 goto match2; 7870 goto match2;
7844 } 7871 }
@@ -7850,7 +7877,7 @@ match2:
7850 } 7877 }
7851 7878
7852 /* Remember the new sched domains */ 7879 /* Remember the new sched domains */
7853 if (doms_cur != &fallback_doms) 7880 if (doms_cur != fallback_doms)
7854 kfree(doms_cur); 7881 kfree(doms_cur);
7855 kfree(dattr_cur); /* kfree(NULL) is safe */ 7882 kfree(dattr_cur); /* kfree(NULL) is safe */
7856 doms_cur = doms_new; 7883 doms_cur = doms_new;
@@ -7990,7 +8017,9 @@ static int update_runtime(struct notifier_block *nfb,
7990 8017
7991void __init sched_init_smp(void) 8018void __init sched_init_smp(void)
7992{ 8019{
7993 cpumask_t non_isolated_cpus; 8020 cpumask_var_t non_isolated_cpus;
8021
8022 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
7994 8023
7995#if defined(CONFIG_NUMA) 8024#if defined(CONFIG_NUMA)
7996 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), 8025 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
@@ -7999,10 +8028,10 @@ void __init sched_init_smp(void)
7999#endif 8028#endif
8000 get_online_cpus(); 8029 get_online_cpus();
8001 mutex_lock(&sched_domains_mutex); 8030 mutex_lock(&sched_domains_mutex);
8002 arch_init_sched_domains(&cpu_online_map); 8031 arch_init_sched_domains(cpu_online_mask);
8003 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); 8032 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
8004 if (cpus_empty(non_isolated_cpus)) 8033 if (cpumask_empty(non_isolated_cpus))
8005 cpu_set(smp_processor_id(), non_isolated_cpus); 8034 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
8006 mutex_unlock(&sched_domains_mutex); 8035 mutex_unlock(&sched_domains_mutex);
8007 put_online_cpus(); 8036 put_online_cpus();
8008 8037
@@ -8017,9 +8046,13 @@ void __init sched_init_smp(void)
8017 init_hrtick(); 8046 init_hrtick();
8018 8047
8019 /* Move init over to a non-isolated CPU */ 8048 /* Move init over to a non-isolated CPU */
8020 if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0) 8049 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
8021 BUG(); 8050 BUG();
8022 sched_init_granularity(); 8051 sched_init_granularity();
8052 free_cpumask_var(non_isolated_cpus);
8053
8054 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
8055 init_sched_rt_class();
8023} 8056}
8024#else 8057#else
8025void __init sched_init_smp(void) 8058void __init sched_init_smp(void)
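sched_init_smp() above now computes the non-isolated set into a heap-allocated cpumask_var_t and frees it once init has been re-affined. The core computation as a sketch, assuming both masks are already allocated and populated:

    #include <linux/cpumask.h>
    #include <linux/smp.h>

    /* non_isolated = possible & ~isolated; never leave it empty. */
    static void fill_non_isolated(struct cpumask *non_isolated,
                                  const struct cpumask *isolated)
    {
            cpumask_andnot(non_isolated, cpu_possible_mask, isolated);
            if (cpumask_empty(non_isolated))
                    cpumask_set_cpu(smp_processor_id(), non_isolated);
    }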
@@ -8334,6 +8367,15 @@ void __init sched_init(void)
8334 */ 8367 */
8335 current->sched_class = &fair_sched_class; 8368 current->sched_class = &fair_sched_class;
8336 8369
8370 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
8371 alloc_bootmem_cpumask_var(&nohz_cpu_mask);
8372#ifdef CONFIG_SMP
8373#ifdef CONFIG_NO_HZ
8374 alloc_bootmem_cpumask_var(&nohz.cpu_mask);
8375#endif
8376 alloc_bootmem_cpumask_var(&cpu_isolated_map);
8377#endif /* SMP */
8378
8337 scheduler_running = 1; 8379 scheduler_running = 1;
8338} 8380}
8339 8381
@@ -8492,7 +8534,7 @@ static
8492int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) 8534int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8493{ 8535{
8494 struct cfs_rq *cfs_rq; 8536 struct cfs_rq *cfs_rq;
8495 struct sched_entity *se, *parent_se; 8537 struct sched_entity *se;
8496 struct rq *rq; 8538 struct rq *rq;
8497 int i; 8539 int i;
8498 8540
@@ -8508,18 +8550,17 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8508 for_each_possible_cpu(i) { 8550 for_each_possible_cpu(i) {
8509 rq = cpu_rq(i); 8551 rq = cpu_rq(i);
8510 8552
8511 cfs_rq = kmalloc_node(sizeof(struct cfs_rq), 8553 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8512 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); 8554 GFP_KERNEL, cpu_to_node(i));
8513 if (!cfs_rq) 8555 if (!cfs_rq)
8514 goto err; 8556 goto err;
8515 8557
8516 se = kmalloc_node(sizeof(struct sched_entity), 8558 se = kzalloc_node(sizeof(struct sched_entity),
8517 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); 8559 GFP_KERNEL, cpu_to_node(i));
8518 if (!se) 8560 if (!se)
8519 goto err; 8561 goto err;
8520 8562
8521 parent_se = parent ? parent->se[i] : NULL; 8563 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
8522 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se);
8523 } 8564 }
8524 8565
8525 return 1; 8566 return 1;
@@ -8580,7 +8621,7 @@ static
8580int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) 8621int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8581{ 8622{
8582 struct rt_rq *rt_rq; 8623 struct rt_rq *rt_rq;
8583 struct sched_rt_entity *rt_se, *parent_se; 8624 struct sched_rt_entity *rt_se;
8584 struct rq *rq; 8625 struct rq *rq;
8585 int i; 8626 int i;
8586 8627
@@ -8597,18 +8638,17 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8597 for_each_possible_cpu(i) { 8638 for_each_possible_cpu(i) {
8598 rq = cpu_rq(i); 8639 rq = cpu_rq(i);
8599 8640
8600 rt_rq = kmalloc_node(sizeof(struct rt_rq), 8641 rt_rq = kzalloc_node(sizeof(struct rt_rq),
8601 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); 8642 GFP_KERNEL, cpu_to_node(i));
8602 if (!rt_rq) 8643 if (!rt_rq)
8603 goto err; 8644 goto err;
8604 8645
8605 rt_se = kmalloc_node(sizeof(struct sched_rt_entity), 8646 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
8606 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); 8647 GFP_KERNEL, cpu_to_node(i));
8607 if (!rt_se) 8648 if (!rt_se)
8608 goto err; 8649 goto err;
8609 8650
8610 parent_se = parent ? parent->rt_se[i] : NULL; 8651 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
8611 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se);
8612 } 8652 }
8613 8653
8614 return 1; 8654 return 1;
@@ -9251,11 +9291,12 @@ struct cgroup_subsys cpu_cgroup_subsys = {
9251 * (balbir@in.ibm.com). 9291 * (balbir@in.ibm.com).
9252 */ 9292 */
9253 9293
9254/* track cpu usage of a group of tasks */ 9294/* track cpu usage of a group of tasks and its child groups */
9255struct cpuacct { 9295struct cpuacct {
9256 struct cgroup_subsys_state css; 9296 struct cgroup_subsys_state css;
9257 /* cpuusage holds pointer to a u64-type object on every cpu */ 9297 /* cpuusage holds pointer to a u64-type object on every cpu */
9258 u64 *cpuusage; 9298 u64 *cpuusage;
9299 struct cpuacct *parent;
9259}; 9300};
9260 9301
9261struct cgroup_subsys cpuacct_subsys; 9302struct cgroup_subsys cpuacct_subsys;
@@ -9289,6 +9330,9 @@ static struct cgroup_subsys_state *cpuacct_create(
9289 return ERR_PTR(-ENOMEM); 9330 return ERR_PTR(-ENOMEM);
9290 } 9331 }
9291 9332
9333 if (cgrp->parent)
9334 ca->parent = cgroup_ca(cgrp->parent);
9335
9292 return &ca->css; 9336 return &ca->css;
9293} 9337}
9294 9338
@@ -9368,14 +9412,16 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
9368static void cpuacct_charge(struct task_struct *tsk, u64 cputime) 9412static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9369{ 9413{
9370 struct cpuacct *ca; 9414 struct cpuacct *ca;
9415 int cpu;
9371 9416
9372 if (!cpuacct_subsys.active) 9417 if (!cpuacct_subsys.active)
9373 return; 9418 return;
9374 9419
9420 cpu = task_cpu(tsk);
9375 ca = task_ca(tsk); 9421 ca = task_ca(tsk);
9376 if (ca) {
9377 u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk));
9378 9422
9423 for (; ca; ca = ca->parent) {
9424 u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
9379 *cpuusage += cputime; 9425 *cpuusage += cputime;
9380 } 9426 }
9381} 9427}
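
The cpuacct change above adds a parent pointer and makes cpuacct_charge() walk up the group hierarchy, so usage is accounted to the task's group and every ancestor. A toy userspace analogue of that walk (struct and function names are illustrative, not the kernel API):

#include <stdint.h>
#include <stdio.h>

#define NR_CPUS 4

struct group {
	uint64_t cpuusage[NR_CPUS];
	struct group *parent;
};

static void charge(struct group *ca, int cpu, uint64_t cputime)
{
	for (; ca; ca = ca->parent)	/* charge the group and all its ancestors */
		ca->cpuusage[cpu] += cputime;
}

int main(void)
{
	struct group root  = { .parent = NULL };
	struct group child = { .parent = &root };

	charge(&child, 1, 100);

	printf("child=%llu root=%llu\n",	/* both print 100 */
	       (unsigned long long)child.cpuusage[1],
	       (unsigned long long)root.cpuusage[1]);
	return 0;
}
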
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 52154fefab7e..018b7be1db2e 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -67,24 +67,21 @@ static int convert_prio(int prio)
67 * Returns: (int)bool - CPUs were found 67 * Returns: (int)bool - CPUs were found
68 */ 68 */
69int cpupri_find(struct cpupri *cp, struct task_struct *p, 69int cpupri_find(struct cpupri *cp, struct task_struct *p,
70 cpumask_t *lowest_mask) 70 struct cpumask *lowest_mask)
71{ 71{
72 int idx = 0; 72 int idx = 0;
73 int task_pri = convert_prio(p->prio); 73 int task_pri = convert_prio(p->prio);
74 74
75 for_each_cpupri_active(cp->pri_active, idx) { 75 for_each_cpupri_active(cp->pri_active, idx) {
76 struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; 76 struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
77 cpumask_t mask;
78 77
79 if (idx >= task_pri) 78 if (idx >= task_pri)
80 break; 79 break;
81 80
82 cpus_and(mask, p->cpus_allowed, vec->mask); 81 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
83
84 if (cpus_empty(mask))
85 continue; 82 continue;
86 83
87 *lowest_mask = mask; 84 cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
88 return 1; 85 return 1;
89 } 86 }
90 87
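
The cpupri_find() rewrite above drops the on-stack temporary cpumask: it first checks whether the task's allowed CPUs overlap the vector's mask at all, and only then fills the caller-supplied lowest_mask. A simplified model of that ordering using plain 64-bit masks rather than the kernel cpumask API:

#include <stdint.h>
#include <stdio.h>

typedef uint64_t mask_t;	/* stand-in for struct cpumask, one bit per CPU */

/* Returns 1 and fills *lowest_mask only when the two masks overlap. */
static int vec_find(mask_t allowed, mask_t vec_mask, mask_t *lowest_mask)
{
	if ((allowed & vec_mask) == 0)	/* like cpumask_any_and() finding nothing */
		return 0;

	*lowest_mask = allowed & vec_mask;	/* cpumask_and() into the caller's mask */
	return 1;
}

int main(void)
{
	mask_t lowest = 0;

	if (vec_find(0x0f, 0x06, &lowest))
		printf("lowest mask: %#llx\n", (unsigned long long)lowest);	/* 0x6 */
	return 0;
}
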
@@ -126,7 +123,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
126 vec->count--; 123 vec->count--;
127 if (!vec->count) 124 if (!vec->count)
128 clear_bit(oldpri, cp->pri_active); 125 clear_bit(oldpri, cp->pri_active);
129 cpu_clear(cpu, vec->mask); 126 cpumask_clear_cpu(cpu, vec->mask);
130 127
131 spin_unlock_irqrestore(&vec->lock, flags); 128 spin_unlock_irqrestore(&vec->lock, flags);
132 } 129 }
@@ -136,7 +133,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
136 133
137 spin_lock_irqsave(&vec->lock, flags); 134 spin_lock_irqsave(&vec->lock, flags);
138 135
139 cpu_set(cpu, vec->mask); 136 cpumask_set_cpu(cpu, vec->mask);
140 vec->count++; 137 vec->count++;
141 if (vec->count == 1) 138 if (vec->count == 1)
142 set_bit(newpri, cp->pri_active); 139 set_bit(newpri, cp->pri_active);
@@ -150,10 +147,11 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
150/** 147/**
151 * cpupri_init - initialize the cpupri structure 148 * cpupri_init - initialize the cpupri structure
152 * @cp: The cpupri context 149 * @cp: The cpupri context
150 * @bootmem: true if allocations need to use bootmem
153 * 151 *
 154 * Returns: (void) 152 * Returns: -ENOMEM if memory allocation fails, 0 on success.
155 */ 153 */
156void cpupri_init(struct cpupri *cp) 154int cpupri_init(struct cpupri *cp, bool bootmem)
157{ 155{
158 int i; 156 int i;
159 157
@@ -164,11 +162,30 @@ void cpupri_init(struct cpupri *cp)
164 162
165 spin_lock_init(&vec->lock); 163 spin_lock_init(&vec->lock);
166 vec->count = 0; 164 vec->count = 0;
167 cpus_clear(vec->mask); 165 if (bootmem)
166 alloc_bootmem_cpumask_var(&vec->mask);
167 else if (!alloc_cpumask_var(&vec->mask, GFP_KERNEL))
168 goto cleanup;
168 } 169 }
169 170
170 for_each_possible_cpu(i) 171 for_each_possible_cpu(i)
171 cp->cpu_to_pri[i] = CPUPRI_INVALID; 172 cp->cpu_to_pri[i] = CPUPRI_INVALID;
173 return 0;
174
175cleanup:
176 for (i--; i >= 0; i--)
177 free_cpumask_var(cp->pri_to_cpu[i].mask);
178 return -ENOMEM;
172} 179}
173 180
181/**
182 * cpupri_cleanup - clean up the cpupri structure
183 * @cp: The cpupri context
184 */
185void cpupri_cleanup(struct cpupri *cp)
186{
187 int i;
174 188
189 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)
190 free_cpumask_var(cp->pri_to_cpu[i].mask);
191}
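
cpupri_init() now allocates each vector's mask (from bootmem early in boot, otherwise with GFP_KERNEL) and unwinds the partial allocations on failure, with cpupri_cleanup() freeing them on teardown. A userspace sketch of that allocate-all-or-unwind pattern, with calloc()/free() standing in for the cpumask allocators and NR_VECS chosen only for the example:

#include <errno.h>
#include <stdlib.h>

#define NR_VECS 102	/* illustrative count, not CPUPRI_NR_PRIORITIES */

static unsigned long *vec_mask[NR_VECS];

static int vecs_init(void)
{
	int i;

	for (i = 0; i < NR_VECS; i++) {
		vec_mask[i] = calloc(1, sizeof(unsigned long));
		if (!vec_mask[i])
			goto cleanup;
	}
	return 0;

cleanup:
	for (i--; i >= 0; i--)	/* free only what was successfully allocated */
		free(vec_mask[i]);
	return -ENOMEM;
}

static void vecs_cleanup(void)
{
	int i;

	for (i = 0; i < NR_VECS; i++)
		free(vec_mask[i]);
}

int main(void)
{
	if (vecs_init() == 0)
		vecs_cleanup();
	return 0;
}
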
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
index f25811b0f931..642a94ef8a0a 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched_cpupri.h
@@ -14,7 +14,7 @@
14struct cpupri_vec { 14struct cpupri_vec {
15 spinlock_t lock; 15 spinlock_t lock;
16 int count; 16 int count;
17 cpumask_t mask; 17 cpumask_var_t mask;
18}; 18};
19 19
20struct cpupri { 20struct cpupri {
@@ -27,7 +27,8 @@ struct cpupri {
27int cpupri_find(struct cpupri *cp, 27int cpupri_find(struct cpupri *cp,
28 struct task_struct *p, cpumask_t *lowest_mask); 28 struct task_struct *p, cpumask_t *lowest_mask);
29void cpupri_set(struct cpupri *cp, int cpu, int pri); 29void cpupri_set(struct cpupri *cp, int cpu, int pri);
30void cpupri_init(struct cpupri *cp); 30int cpupri_init(struct cpupri *cp, bool bootmem);
31void cpupri_cleanup(struct cpupri *cp);
31#else 32#else
32#define cpupri_set(cp, cpu, pri) do { } while (0) 33#define cpupri_set(cp, cpu, pri) do { } while (0)
33#define cpupri_init() do { } while (0) 34#define cpupri_init() do { } while (0)
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 26ed8e3d1c15..4293cfa9681d 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -53,6 +53,40 @@ static unsigned long nsec_low(unsigned long long nsec)
53 53
54#define SPLIT_NS(x) nsec_high(x), nsec_low(x) 54#define SPLIT_NS(x) nsec_high(x), nsec_low(x)
55 55
56#ifdef CONFIG_FAIR_GROUP_SCHED
57static void print_cfs_group_stats(struct seq_file *m, int cpu,
58 struct task_group *tg)
59{
60 struct sched_entity *se = tg->se[cpu];
61 if (!se)
62 return;
63
64#define P(F) \
65 SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
66#define PN(F) \
67 SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
68
69 PN(se->exec_start);
70 PN(se->vruntime);
71 PN(se->sum_exec_runtime);
72#ifdef CONFIG_SCHEDSTATS
73 PN(se->wait_start);
74 PN(se->sleep_start);
75 PN(se->block_start);
76 PN(se->sleep_max);
77 PN(se->block_max);
78 PN(se->exec_max);
79 PN(se->slice_max);
80 PN(se->wait_max);
81 PN(se->wait_sum);
82 P(se->wait_count);
83#endif
84 P(se->load.weight);
85#undef PN
86#undef P
87}
88#endif
89
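
The new print_cfs_group_stats() leans on the P()/PN() macros, which stringize the field expression so each output line shows the field's name next to its value. A standalone version of the same preprocessor trick (struct and field names here are only for the example):

#include <stdio.h>

#define P(F) printf("  .%-30s: %lld\n", #F, (long long)(F))

struct sched_entity_stats {
	long long exec_start;
	long long vruntime;
};

int main(void)
{
	struct sched_entity_stats se = { .exec_start = 123456, .vruntime = 789 };

	P(se.exec_start);	/* prints "  .se.exec_start" padded, then ": 123456" */
	P(se.vruntime);
	return 0;
}
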
56static void 90static void
57print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) 91print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
58{ 92{
@@ -121,20 +155,19 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
121 155
122#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED) 156#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
123 char path[128] = ""; 157 char path[128] = "";
124 struct cgroup *cgroup = NULL;
125 struct task_group *tg = cfs_rq->tg; 158 struct task_group *tg = cfs_rq->tg;
126 159
127 if (tg) 160 cgroup_path(tg->css.cgroup, path, sizeof(path));
128 cgroup = tg->css.cgroup;
129
130 if (cgroup)
131 cgroup_path(cgroup, path, sizeof(path));
132 161
133 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); 162 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
163#elif defined(CONFIG_USER_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
164 {
165 uid_t uid = cfs_rq->tg->uid;
166 SEQ_printf(m, "\ncfs_rq[%d] for UID: %u\n", cpu, uid);
167 }
134#else 168#else
135 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); 169 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
136#endif 170#endif
137
138 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", 171 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
139 SPLIT_NS(cfs_rq->exec_clock)); 172 SPLIT_NS(cfs_rq->exec_clock));
140 173
@@ -168,6 +201,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
168#ifdef CONFIG_SMP 201#ifdef CONFIG_SMP
169 SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); 202 SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares);
170#endif 203#endif
204 print_cfs_group_stats(m, cpu, cfs_rq->tg);
171#endif 205#endif
172} 206}
173 207
@@ -175,14 +209,9 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
175{ 209{
176#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED) 210#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED)
177 char path[128] = ""; 211 char path[128] = "";
178 struct cgroup *cgroup = NULL;
179 struct task_group *tg = rt_rq->tg; 212 struct task_group *tg = rt_rq->tg;
180 213
181 if (tg) 214 cgroup_path(tg->css.cgroup, path, sizeof(path));
182 cgroup = tg->css.cgroup;
183
184 if (cgroup)
185 cgroup_path(cgroup, path, sizeof(path));
186 215
187 SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path); 216 SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path);
188#else 217#else
@@ -272,7 +301,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
272 u64 now = ktime_to_ns(ktime_get()); 301 u64 now = ktime_to_ns(ktime_get());
273 int cpu; 302 int cpu;
274 303
275 SEQ_printf(m, "Sched Debug Version: v0.07, %s %.*s\n", 304 SEQ_printf(m, "Sched Debug Version: v0.08, %s %.*s\n",
276 init_utsname()->release, 305 init_utsname()->release,
277 (int)strcspn(init_utsname()->version, " "), 306 (int)strcspn(init_utsname()->version, " "),
278 init_utsname()->version); 307 init_utsname()->version);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 98345e45b059..08ffffd4a410 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1017,14 +1017,13 @@ static void yield_task_fair(struct rq *rq)
1017 * search starts with cpus closest then further out as needed, 1017 * search starts with cpus closest then further out as needed,
1018 * so we always favor a closer, idle cpu. 1018 * so we always favor a closer, idle cpu.
1019 * Domains may include CPUs that are not usable for migration, 1019 * Domains may include CPUs that are not usable for migration,
1020 * hence we need to mask them out (cpu_active_map) 1020 * hence we need to mask them out (cpu_active_mask)
1021 * 1021 *
1022 * Returns the CPU we should wake onto. 1022 * Returns the CPU we should wake onto.
1023 */ 1023 */
1024#if defined(ARCH_HAS_SCHED_WAKE_IDLE) 1024#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1025static int wake_idle(int cpu, struct task_struct *p) 1025static int wake_idle(int cpu, struct task_struct *p)
1026{ 1026{
1027 cpumask_t tmp;
1028 struct sched_domain *sd; 1027 struct sched_domain *sd;
1029 int i; 1028 int i;
1030 1029
@@ -1044,10 +1043,9 @@ static int wake_idle(int cpu, struct task_struct *p)
1044 if ((sd->flags & SD_WAKE_IDLE) 1043 if ((sd->flags & SD_WAKE_IDLE)
1045 || ((sd->flags & SD_WAKE_IDLE_FAR) 1044 || ((sd->flags & SD_WAKE_IDLE_FAR)
1046 && !task_hot(p, task_rq(p)->clock, sd))) { 1045 && !task_hot(p, task_rq(p)->clock, sd))) {
1047 cpus_and(tmp, sd->span, p->cpus_allowed); 1046 for_each_cpu_and(i, sched_domain_span(sd),
1048 cpus_and(tmp, tmp, cpu_active_map); 1047 &p->cpus_allowed) {
1049 for_each_cpu_mask_nr(i, tmp) { 1048 if (cpu_active(i) && idle_cpu(i)) {
1050 if (idle_cpu(i)) {
1051 if (i != task_cpu(p)) { 1049 if (i != task_cpu(p)) {
1052 schedstat_inc(p, 1050 schedstat_inc(p,
1053 se.nr_wakeups_idle); 1051 se.nr_wakeups_idle);
@@ -1240,13 +1238,13 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
1240 * this_cpu and prev_cpu are present in: 1238 * this_cpu and prev_cpu are present in:
1241 */ 1239 */
1242 for_each_domain(this_cpu, sd) { 1240 for_each_domain(this_cpu, sd) {
1243 if (cpu_isset(prev_cpu, sd->span)) { 1241 if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) {
1244 this_sd = sd; 1242 this_sd = sd;
1245 break; 1243 break;
1246 } 1244 }
1247 } 1245 }
1248 1246
1249 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) 1247 if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed)))
1250 goto out; 1248 goto out;
1251 1249
1252 /* 1250 /*
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index d9ba9d5f99d6..1bbd99014011 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -15,7 +15,7 @@ static inline void rt_set_overload(struct rq *rq)
15 if (!rq->online) 15 if (!rq->online)
16 return; 16 return;
17 17
18 cpu_set(rq->cpu, rq->rd->rto_mask); 18 cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
19 /* 19 /*
20 * Make sure the mask is visible before we set 20 * Make sure the mask is visible before we set
21 * the overload count. That is checked to determine 21 * the overload count. That is checked to determine
@@ -34,7 +34,7 @@ static inline void rt_clear_overload(struct rq *rq)
34 34
35 /* the order here really doesn't matter */ 35 /* the order here really doesn't matter */
36 atomic_dec(&rq->rd->rto_count); 36 atomic_dec(&rq->rd->rto_count);
37 cpu_clear(rq->cpu, rq->rd->rto_mask); 37 cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
38} 38}
39 39
40static void update_rt_migration(struct rq *rq) 40static void update_rt_migration(struct rq *rq)
@@ -139,14 +139,14 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se)
139} 139}
140 140
141#ifdef CONFIG_SMP 141#ifdef CONFIG_SMP
142static inline cpumask_t sched_rt_period_mask(void) 142static inline const struct cpumask *sched_rt_period_mask(void)
143{ 143{
144 return cpu_rq(smp_processor_id())->rd->span; 144 return cpu_rq(smp_processor_id())->rd->span;
145} 145}
146#else 146#else
147static inline cpumask_t sched_rt_period_mask(void) 147static inline const struct cpumask *sched_rt_period_mask(void)
148{ 148{
149 return cpu_online_map; 149 return cpu_online_mask;
150} 150}
151#endif 151#endif
152 152
@@ -212,9 +212,9 @@ static inline int rt_rq_throttled(struct rt_rq *rt_rq)
212 return rt_rq->rt_throttled; 212 return rt_rq->rt_throttled;
213} 213}
214 214
215static inline cpumask_t sched_rt_period_mask(void) 215static inline const struct cpumask *sched_rt_period_mask(void)
216{ 216{
217 return cpu_online_map; 217 return cpu_online_mask;
218} 218}
219 219
220static inline 220static inline
@@ -241,11 +241,11 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
241 int i, weight, more = 0; 241 int i, weight, more = 0;
242 u64 rt_period; 242 u64 rt_period;
243 243
244 weight = cpus_weight(rd->span); 244 weight = cpumask_weight(rd->span);
245 245
246 spin_lock(&rt_b->rt_runtime_lock); 246 spin_lock(&rt_b->rt_runtime_lock);
247 rt_period = ktime_to_ns(rt_b->rt_period); 247 rt_period = ktime_to_ns(rt_b->rt_period);
248 for_each_cpu_mask_nr(i, rd->span) { 248 for_each_cpu(i, rd->span) {
249 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); 249 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
250 s64 diff; 250 s64 diff;
251 251
@@ -324,7 +324,7 @@ static void __disable_runtime(struct rq *rq)
324 /* 324 /*
325 * Greedy reclaim, take back as much as we can. 325 * Greedy reclaim, take back as much as we can.
326 */ 326 */
327 for_each_cpu_mask(i, rd->span) { 327 for_each_cpu(i, rd->span) {
328 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); 328 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
329 s64 diff; 329 s64 diff;
330 330
@@ -429,13 +429,13 @@ static inline int balance_runtime(struct rt_rq *rt_rq)
429static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) 429static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
430{ 430{
431 int i, idle = 1; 431 int i, idle = 1;
432 cpumask_t span; 432 const struct cpumask *span;
433 433
434 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) 434 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
435 return 1; 435 return 1;
436 436
437 span = sched_rt_period_mask(); 437 span = sched_rt_period_mask();
438 for_each_cpu_mask(i, span) { 438 for_each_cpu(i, span) {
439 int enqueue = 0; 439 int enqueue = 0;
440 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); 440 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
441 struct rq *rq = rq_of_rt_rq(rt_rq); 441 struct rq *rq = rq_of_rt_rq(rt_rq);
@@ -537,13 +537,13 @@ static void update_curr_rt(struct rq *rq)
537 for_each_sched_rt_entity(rt_se) { 537 for_each_sched_rt_entity(rt_se) {
538 rt_rq = rt_rq_of_se(rt_se); 538 rt_rq = rt_rq_of_se(rt_se);
539 539
540 spin_lock(&rt_rq->rt_runtime_lock);
541 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { 540 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
541 spin_lock(&rt_rq->rt_runtime_lock);
542 rt_rq->rt_time += delta_exec; 542 rt_rq->rt_time += delta_exec;
543 if (sched_rt_runtime_exceeded(rt_rq)) 543 if (sched_rt_runtime_exceeded(rt_rq))
544 resched_task(curr); 544 resched_task(curr);
545 spin_unlock(&rt_rq->rt_runtime_lock);
545 } 546 }
546 spin_unlock(&rt_rq->rt_runtime_lock);
547 } 547 }
548} 548}
549 549
@@ -805,17 +805,20 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
805 805
806static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 806static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
807{ 807{
808 cpumask_t mask; 808 cpumask_var_t mask;
809 809
810 if (rq->curr->rt.nr_cpus_allowed == 1) 810 if (rq->curr->rt.nr_cpus_allowed == 1)
811 return; 811 return;
812 812
813 if (p->rt.nr_cpus_allowed != 1 813 if (!alloc_cpumask_var(&mask, GFP_ATOMIC))
814 && cpupri_find(&rq->rd->cpupri, p, &mask))
815 return; 814 return;
816 815
817 if (!cpupri_find(&rq->rd->cpupri, rq->curr, &mask)) 816 if (p->rt.nr_cpus_allowed != 1
818 return; 817 && cpupri_find(&rq->rd->cpupri, p, mask))
818 goto free;
819
820 if (!cpupri_find(&rq->rd->cpupri, rq->curr, mask))
821 goto free;
819 822
820 /* 823 /*
821 * There appears to be other cpus that can accept 824 * There appears to be other cpus that can accept
@@ -824,6 +827,8 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
824 */ 827 */
825 requeue_task_rt(rq, p, 1); 828 requeue_task_rt(rq, p, 1);
826 resched_task(rq->curr); 829 resched_task(rq->curr);
830free:
831 free_cpumask_var(mask);
827} 832}
828 833
829#endif /* CONFIG_SMP */ 834#endif /* CONFIG_SMP */
@@ -909,15 +914,12 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
909/* Only try algorithms three times */ 914/* Only try algorithms three times */
910#define RT_MAX_TRIES 3 915#define RT_MAX_TRIES 3
911 916
912static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
913static void double_unlock_balance(struct rq *this_rq, struct rq *busiest);
914
915static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); 917static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
916 918
917static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) 919static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
918{ 920{
919 if (!task_running(rq, p) && 921 if (!task_running(rq, p) &&
920 (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) && 922 (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) &&
921 (p->rt.nr_cpus_allowed > 1)) 923 (p->rt.nr_cpus_allowed > 1))
922 return 1; 924 return 1;
923 return 0; 925 return 0;
@@ -956,7 +958,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
956 return next; 958 return next;
957} 959}
958 960
959static DEFINE_PER_CPU(cpumask_t, local_cpu_mask); 961static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
960 962
961static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) 963static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
962{ 964{
@@ -976,7 +978,7 @@ static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
976static int find_lowest_rq(struct task_struct *task) 978static int find_lowest_rq(struct task_struct *task)
977{ 979{
978 struct sched_domain *sd; 980 struct sched_domain *sd;
979 cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask); 981 struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
980 int this_cpu = smp_processor_id(); 982 int this_cpu = smp_processor_id();
981 int cpu = task_cpu(task); 983 int cpu = task_cpu(task);
982 984
@@ -991,7 +993,7 @@ static int find_lowest_rq(struct task_struct *task)
991 * I guess we might want to change cpupri_find() to ignore those 993 * I guess we might want to change cpupri_find() to ignore those
992 * in the first place. 994 * in the first place.
993 */ 995 */
994 cpus_and(*lowest_mask, *lowest_mask, cpu_active_map); 996 cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);
995 997
996 /* 998 /*
997 * At this point we have built a mask of cpus representing the 999 * At this point we have built a mask of cpus representing the
@@ -1001,7 +1003,7 @@ static int find_lowest_rq(struct task_struct *task)
1001 * We prioritize the last cpu that the task executed on since 1003 * We prioritize the last cpu that the task executed on since
1002 * it is most likely cache-hot in that location. 1004 * it is most likely cache-hot in that location.
1003 */ 1005 */
1004 if (cpu_isset(cpu, *lowest_mask)) 1006 if (cpumask_test_cpu(cpu, lowest_mask))
1005 return cpu; 1007 return cpu;
1006 1008
1007 /* 1009 /*
@@ -1016,7 +1018,8 @@ static int find_lowest_rq(struct task_struct *task)
1016 cpumask_t domain_mask; 1018 cpumask_t domain_mask;
1017 int best_cpu; 1019 int best_cpu;
1018 1020
1019 cpus_and(domain_mask, sd->span, *lowest_mask); 1021 cpumask_and(&domain_mask, sched_domain_span(sd),
1022 lowest_mask);
1020 1023
1021 best_cpu = pick_optimal_cpu(this_cpu, 1024 best_cpu = pick_optimal_cpu(this_cpu,
1022 &domain_mask); 1025 &domain_mask);
@@ -1057,8 +1060,8 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1057 * Also make sure that it wasn't scheduled on its rq. 1060 * Also make sure that it wasn't scheduled on its rq.
1058 */ 1061 */
1059 if (unlikely(task_rq(task) != rq || 1062 if (unlikely(task_rq(task) != rq ||
1060 !cpu_isset(lowest_rq->cpu, 1063 !cpumask_test_cpu(lowest_rq->cpu,
1061 task->cpus_allowed) || 1064 &task->cpus_allowed) ||
1062 task_running(rq, task) || 1065 task_running(rq, task) ||
1063 !task->se.on_rq)) { 1066 !task->se.on_rq)) {
1064 1067
@@ -1179,7 +1182,7 @@ static int pull_rt_task(struct rq *this_rq)
1179 1182
1180 next = pick_next_task_rt(this_rq); 1183 next = pick_next_task_rt(this_rq);
1181 1184
1182 for_each_cpu_mask_nr(cpu, this_rq->rd->rto_mask) { 1185 for_each_cpu(cpu, this_rq->rd->rto_mask) {
1183 if (this_cpu == cpu) 1186 if (this_cpu == cpu)
1184 continue; 1187 continue;
1185 1188
@@ -1308,9 +1311,9 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
1308} 1311}
1309 1312
1310static void set_cpus_allowed_rt(struct task_struct *p, 1313static void set_cpus_allowed_rt(struct task_struct *p,
1311 const cpumask_t *new_mask) 1314 const struct cpumask *new_mask)
1312{ 1315{
1313 int weight = cpus_weight(*new_mask); 1316 int weight = cpumask_weight(new_mask);
1314 1317
1315 BUG_ON(!rt_task(p)); 1318 BUG_ON(!rt_task(p));
1316 1319
@@ -1331,7 +1334,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
1331 update_rt_migration(rq); 1334 update_rt_migration(rq);
1332 } 1335 }
1333 1336
1334 p->cpus_allowed = *new_mask; 1337 cpumask_copy(&p->cpus_allowed, new_mask);
1335 p->rt.nr_cpus_allowed = weight; 1338 p->rt.nr_cpus_allowed = weight;
1336} 1339}
1337 1340
@@ -1374,6 +1377,14 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p,
1374 if (!rq->rt.rt_nr_running) 1377 if (!rq->rt.rt_nr_running)
1375 pull_rt_task(rq); 1378 pull_rt_task(rq);
1376} 1379}
1380
1381static inline void init_sched_rt_class(void)
1382{
1383 unsigned int i;
1384
1385 for_each_possible_cpu(i)
1386 alloc_cpumask_var(&per_cpu(local_cpu_mask, i), GFP_KERNEL);
1387}
1377#endif /* CONFIG_SMP */ 1388#endif /* CONFIG_SMP */
1378 1389
1379/* 1390/*
@@ -1544,3 +1555,4 @@ static void print_rt_stats(struct seq_file *m, int cpu)
1544 rcu_read_unlock(); 1555 rcu_read_unlock();
1545} 1556}
1546#endif /* CONFIG_SCHED_DEBUG */ 1557#endif /* CONFIG_SCHED_DEBUG */
1558
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 6beff1e4eeae..5fcf0e184586 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -42,7 +42,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
42 for_each_domain(cpu, sd) { 42 for_each_domain(cpu, sd) {
43 enum cpu_idle_type itype; 43 enum cpu_idle_type itype;
44 44
45 cpumask_scnprintf(mask_str, mask_len, &sd->span); 45 cpumask_scnprintf(mask_str, mask_len,
46 sched_domain_span(sd));
46 seq_printf(seq, "domain%d %s", dcount++, mask_str); 47 seq_printf(seq, "domain%d %s", dcount++, mask_str);
47 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; 48 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
48 itype++) { 49 itype++) {
diff --git a/kernel/signal.c b/kernel/signal.c
index 4530fc654455..e9afe63da24b 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -41,6 +41,8 @@
41 41
42static struct kmem_cache *sigqueue_cachep; 42static struct kmem_cache *sigqueue_cachep;
43 43
44DEFINE_TRACE(sched_signal_send);
45
44static void __user *sig_handler(struct task_struct *t, int sig) 46static void __user *sig_handler(struct task_struct *t, int sig)
45{ 47{
46 return t->sighand->action[sig - 1].sa.sa_handler; 48 return t->sighand->action[sig - 1].sa.sa_handler;
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index dc0b3be6b7d5..1ab790c67b17 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -164,7 +164,7 @@ unsigned long __read_mostly sysctl_hung_task_check_count = 1024;
164/* 164/*
165 * Zero means infinite timeout - no checking done: 165 * Zero means infinite timeout - no checking done:
166 */ 166 */
167unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; 167unsigned long __read_mostly sysctl_hung_task_timeout_secs = 480;
168 168
169unsigned long __read_mostly sysctl_hung_task_warnings = 10; 169unsigned long __read_mostly sysctl_hung_task_warnings = 10;
170 170
diff --git a/kernel/sys.c b/kernel/sys.c
index 31deba8f7d16..5fc3a0cfb994 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -858,8 +858,8 @@ void do_sys_times(struct tms *tms)
858 struct task_cputime cputime; 858 struct task_cputime cputime;
859 cputime_t cutime, cstime; 859 cputime_t cutime, cstime;
860 860
861 spin_lock_irq(&current->sighand->siglock);
862 thread_group_cputime(current, &cputime); 861 thread_group_cputime(current, &cputime);
862 spin_lock_irq(&current->sighand->siglock);
863 cutime = current->signal->cutime; 863 cutime = current->signal->cutime;
864 cstime = current->signal->cstime; 864 cstime = current->signal->cstime;
865 spin_unlock_irq(&current->sighand->siglock); 865 spin_unlock_irq(&current->sighand->siglock);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 3d56fe7570da..c83f566e940a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -487,6 +487,16 @@ static struct ctl_table kern_table[] = {
487 .proc_handler = &ftrace_enable_sysctl, 487 .proc_handler = &ftrace_enable_sysctl,
488 }, 488 },
489#endif 489#endif
490#ifdef CONFIG_TRACING
491 {
492 .ctl_name = CTL_UNNUMBERED,
493 .procname = "ftrace_dump_on_oops",
494 .data = &ftrace_dump_on_oops,
495 .maxlen = sizeof(int),
496 .mode = 0644,
497 .proc_handler = &proc_dointvec,
498 },
499#endif
490#ifdef CONFIG_MODULES 500#ifdef CONFIG_MODULES
491 { 501 {
492 .ctl_name = KERN_MODPROBE, 502 .ctl_name = KERN_MODPROBE,
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 342fc9ccab46..70f872c71f4e 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -144,7 +144,7 @@ void tick_nohz_update_jiffies(void)
144 if (!ts->tick_stopped) 144 if (!ts->tick_stopped)
145 return; 145 return;
146 146
147 cpu_clear(cpu, nohz_cpu_mask); 147 cpumask_clear_cpu(cpu, nohz_cpu_mask);
148 now = ktime_get(); 148 now = ktime_get();
149 ts->idle_waketime = now; 149 ts->idle_waketime = now;
150 150
@@ -283,7 +283,7 @@ void tick_nohz_stop_sched_tick(int inidle)
283 if ((long)delta_jiffies >= 1) { 283 if ((long)delta_jiffies >= 1) {
284 284
285 if (delta_jiffies > 1) 285 if (delta_jiffies > 1)
286 cpu_set(cpu, nohz_cpu_mask); 286 cpumask_set_cpu(cpu, nohz_cpu_mask);
287 /* 287 /*
288 * nohz_stop_sched_tick can be called several times before 288 * nohz_stop_sched_tick can be called several times before
289 * the nohz_restart_sched_tick is called. This happens when 289 * the nohz_restart_sched_tick is called. This happens when
@@ -296,7 +296,7 @@ void tick_nohz_stop_sched_tick(int inidle)
296 /* 296 /*
297 * sched tick not stopped! 297 * sched tick not stopped!
298 */ 298 */
299 cpu_clear(cpu, nohz_cpu_mask); 299 cpumask_clear_cpu(cpu, nohz_cpu_mask);
300 goto out; 300 goto out;
301 } 301 }
302 302
@@ -354,7 +354,7 @@ void tick_nohz_stop_sched_tick(int inidle)
354 * softirq. 354 * softirq.
355 */ 355 */
356 tick_do_update_jiffies64(ktime_get()); 356 tick_do_update_jiffies64(ktime_get());
357 cpu_clear(cpu, nohz_cpu_mask); 357 cpumask_clear_cpu(cpu, nohz_cpu_mask);
358 } 358 }
359 raise_softirq_irqoff(TIMER_SOFTIRQ); 359 raise_softirq_irqoff(TIMER_SOFTIRQ);
360out: 360out:
@@ -432,7 +432,7 @@ void tick_nohz_restart_sched_tick(void)
432 select_nohz_load_balancer(0); 432 select_nohz_load_balancer(0);
433 now = ktime_get(); 433 now = ktime_get();
434 tick_do_update_jiffies64(now); 434 tick_do_update_jiffies64(now);
435 cpu_clear(cpu, nohz_cpu_mask); 435 cpumask_clear_cpu(cpu, nohz_cpu_mask);
436 436
437 /* 437 /*
438 * We stopped the tick in idle. Update process times would miss the 438 * We stopped the tick in idle. Update process times would miss the
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 33dbefd471e8..bde6f03512d5 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -3,18 +3,34 @@
3# select HAVE_FUNCTION_TRACER: 3# select HAVE_FUNCTION_TRACER:
4# 4#
5 5
6config USER_STACKTRACE_SUPPORT
7 bool
8
6config NOP_TRACER 9config NOP_TRACER
7 bool 10 bool
8 11
9config HAVE_FUNCTION_TRACER 12config HAVE_FUNCTION_TRACER
10 bool 13 bool
11 14
15config HAVE_FUNCTION_GRAPH_TRACER
16 bool
17
18config HAVE_FUNCTION_TRACE_MCOUNT_TEST
19 bool
20 help
21 This gets selected when the arch tests the function_trace_stop
22 variable at the mcount call site. Otherwise, this variable
23 is tested by the called function.
24
12config HAVE_DYNAMIC_FTRACE 25config HAVE_DYNAMIC_FTRACE
13 bool 26 bool
14 27
15config HAVE_FTRACE_MCOUNT_RECORD 28config HAVE_FTRACE_MCOUNT_RECORD
16 bool 29 bool
17 30
31config HAVE_HW_BRANCH_TRACER
32 bool
33
18config TRACER_MAX_TRACE 34config TRACER_MAX_TRACE
19 bool 35 bool
20 36
@@ -47,6 +63,20 @@ config FUNCTION_TRACER
47 (the bootup default), then the overhead of the instructions is very 63 (the bootup default), then the overhead of the instructions is very
48 small and not measurable even in micro-benchmarks. 64 small and not measurable even in micro-benchmarks.
49 65
66config FUNCTION_GRAPH_TRACER
67 bool "Kernel Function Graph Tracer"
68 depends on HAVE_FUNCTION_GRAPH_TRACER
69 depends on FUNCTION_TRACER
70 default y
71 help
72 Enable the kernel to trace a function at both its return
73 and its entry.
 74 Its main purpose is to trace the duration of functions and
 75 draw a call graph for each thread with some information like
 76 the return value.
 77 This is done by saving the current return address into a stack
 78 of calls kept on the current task structure.
79
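
The help text above describes pairing a function's entry with its return by keeping a per-task stack of return addresses. A conceptual userspace illustration of that entry/exit pairing follows; it is not the ftrace implementation (which hooks mcount sites and patches return addresses), just the shape of the bookkeeping:

#include <stdio.h>
#include <time.h>

struct ret_rec {
	const char *func;
	struct timespec entry;
};

static struct ret_rec ret_stack[64];	/* toy per-thread "return stack" */
static int depth;

static void graph_enter(const char *func)
{
	ret_stack[depth].func = func;
	clock_gettime(CLOCK_MONOTONIC, &ret_stack[depth].entry);
	depth++;
}

static void graph_exit(void)
{
	struct ret_rec *r = &ret_stack[--depth];
	struct timespec now;
	long ns;

	clock_gettime(CLOCK_MONOTONIC, &now);
	ns = (long)(now.tv_sec - r->entry.tv_sec) * 1000000000L +
	     (now.tv_nsec - r->entry.tv_nsec);
	printf("%*s%s() took %ld ns\n", depth * 2, "", r->func, ns);
}

static void work(void)
{
	graph_enter(__func__);
	/* ... body being measured ... */
	graph_exit();
}

int main(void)
{
	graph_enter(__func__);
	work();
	graph_exit();
	return 0;
}
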
50config IRQSOFF_TRACER 80config IRQSOFF_TRACER
51 bool "Interrupts-off Latency Tracer" 81 bool "Interrupts-off Latency Tracer"
52 default n 82 default n
@@ -138,6 +168,70 @@ config BOOT_TRACER
138 selected, because the self-tests are an initcall as well and that 168 selected, because the self-tests are an initcall as well and that
139 would invalidate the boot trace. ) 169 would invalidate the boot trace. )
140 170
171config TRACE_BRANCH_PROFILING
172 bool "Trace likely/unlikely profiler"
173 depends on DEBUG_KERNEL
174 select TRACING
175 help
 176 This tracer profiles all the likely and unlikely macros
177 in the kernel. It will display the results in:
178
179 /debugfs/tracing/profile_annotated_branch
180
 181 Note: this will add significant overhead; only turn this
182 on if you need to profile the system's use of these macros.
183
184 Say N if unsure.
185
186config PROFILE_ALL_BRANCHES
187 bool "Profile all if conditionals"
188 depends on TRACE_BRANCH_PROFILING
189 help
190 This tracer profiles all branch conditions. Every if ()
 191 evaluated in the kernel is recorded, whether it hit or missed.
192 The results will be displayed in:
193
194 /debugfs/tracing/profile_branch
195
196 This configuration, when enabled, will impose a great overhead
197 on the system. This should only be enabled when the system
 198 is to be analyzed.
199
200 Say N if unsure.
201
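
The two options above count, per source location, how often an annotated condition was taken versus not taken. A toy illustration of that hit/miss bookkeeping: the kernel hooks this into likely()/unlikely(), while the structure and helper below are invented for the example.

#include <stdio.h>

struct branch_stat {
	const char *file;
	int line;
	long hit, miss;
};

static int profile_branch(int cond, struct branch_stat *st)
{
	if (cond)
		st->hit++;
	else
		st->miss++;
	return cond;
}

#define LIKELY(cond, st) profile_branch(!!(cond), (st))

int main(void)
{
	static struct branch_stat st = { __FILE__, __LINE__, 0, 0 };
	int taken = 0;

	for (int i = 0; i < 100; i++)
		if (LIKELY(i % 10 == 0, &st))
			taken++;

	/* expect hit=10, miss=90 */
	printf("%s:%d hit=%ld miss=%ld taken=%d\n",
	       st.file, st.line, st.hit, st.miss, taken);
	return 0;
}
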
202config TRACING_BRANCHES
203 bool
204 help
205 Selected by tracers that will trace the likely and unlikely
206 conditions. This prevents the tracers themselves from being
207 profiled. Profiling the tracing infrastructure can only happen
208 when the likelys and unlikelys are not being traced.
209
210config BRANCH_TRACER
211 bool "Trace likely/unlikely instances"
212 depends on TRACE_BRANCH_PROFILING
213 select TRACING_BRANCHES
214 help
215 This traces the events of likely and unlikely condition
216 calls in the kernel. The difference between this and the
217 "Trace likely/unlikely profiler" is that this is not a
218 histogram of the callers, but actually places the calling
219 events into a running trace buffer to see when and where the
220 events happened, as well as their results.
221
222 Say N if unsure.
223
224config POWER_TRACER
225 bool "Trace power consumption behavior"
226 depends on DEBUG_KERNEL
227 depends on X86
228 select TRACING
229 help
 230 This tracer helps developers analyze and optimize the kernel's
231 power management decisions, specifically the C-state and P-state
232 behavior.
233
234
141config STACK_TRACER 235config STACK_TRACER
142 bool "Trace max stack" 236 bool "Trace max stack"
143 depends on HAVE_FUNCTION_TRACER 237 depends on HAVE_FUNCTION_TRACER
@@ -157,6 +251,14 @@ config STACK_TRACER
157 251
158 Say N if unsure. 252 Say N if unsure.
159 253
254config BTS_TRACER
255 depends on HAVE_HW_BRANCH_TRACER
256 bool "Trace branches"
257 select TRACING
258 help
259 This tracer records all branches on the system in a circular
260 buffer giving access to the last N branches for each cpu.
261
160config DYNAMIC_FTRACE 262config DYNAMIC_FTRACE
161 bool "enable/disable ftrace tracepoints dynamically" 263 bool "enable/disable ftrace tracepoints dynamically"
162 depends on FUNCTION_TRACER 264 depends on FUNCTION_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index c8228b1a49e9..62dc561b6676 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -10,6 +10,11 @@ CFLAGS_trace_selftest_dynamic.o = -pg
10obj-y += trace_selftest_dynamic.o 10obj-y += trace_selftest_dynamic.o
11endif 11endif
12 12
13# If unlikely tracing is enabled, do not trace these files
14ifdef CONFIG_TRACING_BRANCHES
15KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
16endif
17
13obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o 18obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
14obj-$(CONFIG_RING_BUFFER) += ring_buffer.o 19obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
15 20
@@ -24,5 +29,9 @@ obj-$(CONFIG_NOP_TRACER) += trace_nop.o
24obj-$(CONFIG_STACK_TRACER) += trace_stack.o 29obj-$(CONFIG_STACK_TRACER) += trace_stack.o
25obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o 30obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
26obj-$(CONFIG_BOOT_TRACER) += trace_boot.o 31obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
32obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
33obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
34obj-$(CONFIG_BTS_TRACER) += trace_bts.o
35obj-$(CONFIG_POWER_TRACER) += trace_power.o
27 36
28libftrace-y := ftrace.o 37libftrace-y := ftrace.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 78db083390f0..a12f80efceaa 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -47,6 +47,13 @@
47int ftrace_enabled __read_mostly; 47int ftrace_enabled __read_mostly;
48static int last_ftrace_enabled; 48static int last_ftrace_enabled;
49 49
50/* set when tracing only a pid */
51struct pid *ftrace_pid_trace;
52static struct pid * const ftrace_swapper_pid = &init_struct_pid;
53
54/* Quick disabling of function tracer. */
55int function_trace_stop;
56
50/* 57/*
51 * ftrace_disabled is set when an anomaly is discovered. 58 * ftrace_disabled is set when an anomaly is discovered.
52 * ftrace_disabled is much stronger than ftrace_enabled. 59 * ftrace_disabled is much stronger than ftrace_enabled.
@@ -55,6 +62,7 @@ static int ftrace_disabled __read_mostly;
55 62
56static DEFINE_SPINLOCK(ftrace_lock); 63static DEFINE_SPINLOCK(ftrace_lock);
57static DEFINE_MUTEX(ftrace_sysctl_lock); 64static DEFINE_MUTEX(ftrace_sysctl_lock);
65static DEFINE_MUTEX(ftrace_start_lock);
58 66
59static struct ftrace_ops ftrace_list_end __read_mostly = 67static struct ftrace_ops ftrace_list_end __read_mostly =
60{ 68{
@@ -63,6 +71,8 @@ static struct ftrace_ops ftrace_list_end __read_mostly =
63 71
64static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; 72static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end;
65ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; 73ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
74ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
75ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
66 76
67static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) 77static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
68{ 78{
@@ -79,6 +89,21 @@ static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
79 }; 89 };
80} 90}
81 91
92static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip)
93{
94 if (!test_tsk_trace_trace(current))
95 return;
96
97 ftrace_pid_function(ip, parent_ip);
98}
99
100static void set_ftrace_pid_function(ftrace_func_t func)
101{
102 /* do not set ftrace_pid_function to itself! */
103 if (func != ftrace_pid_func)
104 ftrace_pid_function = func;
105}
106
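
ftrace_pid_func() above is a filter wrapper: it is installed in place of the real tracer and only forwards the call when the current task is marked for tracing. A small sketch of that function-pointer wrapping pattern; the names are illustrative, not the ftrace API:

#include <stdbool.h>
#include <stdio.h>

typedef void (*trace_fn)(unsigned long ip);

static void real_tracer(unsigned long ip) { printf("traced %#lx\n", ip); }
static void stub(unsigned long ip)        { (void)ip; }

static trace_fn target_fn = stub;	/* the tracer the wrapper forwards to */
static trace_fn active_fn = stub;	/* what the call sites actually invoke */
static bool pid_filtering;
static bool current_task_traced;	/* models test_tsk_trace_trace(current) */

static void pid_filter_fn(unsigned long ip)
{
	if (!current_task_traced)
		return;
	target_fn(ip);
}

static void install_tracer(trace_fn fn)
{
	target_fn = fn;
	active_fn = pid_filtering ? pid_filter_fn : fn;
}

int main(void)
{
	pid_filtering = true;
	install_tracer(real_tracer);

	current_task_traced = false;
	active_fn(0x1000);		/* filtered out, nothing printed */

	current_task_traced = true;
	active_fn(0x2000);		/* forwarded: prints "traced 0x2000" */
	return 0;
}
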
82/** 107/**
83 * clear_ftrace_function - reset the ftrace function 108 * clear_ftrace_function - reset the ftrace function
84 * 109 *
@@ -88,7 +113,23 @@ static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
88void clear_ftrace_function(void) 113void clear_ftrace_function(void)
89{ 114{
90 ftrace_trace_function = ftrace_stub; 115 ftrace_trace_function = ftrace_stub;
116 __ftrace_trace_function = ftrace_stub;
117 ftrace_pid_function = ftrace_stub;
118}
119
120#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
121/*
 122 * For those archs that do not test function_trace_stop in their
123 * mcount call site, we need to do it from C.
124 */
125static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip)
126{
127 if (function_trace_stop)
128 return;
129
130 __ftrace_trace_function(ip, parent_ip);
91} 131}
132#endif
92 133
93static int __register_ftrace_function(struct ftrace_ops *ops) 134static int __register_ftrace_function(struct ftrace_ops *ops)
94{ 135{
@@ -106,14 +147,28 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
106 ftrace_list = ops; 147 ftrace_list = ops;
107 148
108 if (ftrace_enabled) { 149 if (ftrace_enabled) {
150 ftrace_func_t func;
151
152 if (ops->next == &ftrace_list_end)
153 func = ops->func;
154 else
155 func = ftrace_list_func;
156
157 if (ftrace_pid_trace) {
158 set_ftrace_pid_function(func);
159 func = ftrace_pid_func;
160 }
161
109 /* 162 /*
110 * For one func, simply call it directly. 163 * For one func, simply call it directly.
111 * For more than one func, call the chain. 164 * For more than one func, call the chain.
112 */ 165 */
113 if (ops->next == &ftrace_list_end) 166#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
114 ftrace_trace_function = ops->func; 167 ftrace_trace_function = func;
115 else 168#else
116 ftrace_trace_function = ftrace_list_func; 169 __ftrace_trace_function = func;
170 ftrace_trace_function = ftrace_test_stop_func;
171#endif
117 } 172 }
118 173
119 spin_unlock(&ftrace_lock); 174 spin_unlock(&ftrace_lock);
@@ -152,9 +207,19 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
152 207
153 if (ftrace_enabled) { 208 if (ftrace_enabled) {
154 /* If we only have one func left, then call that directly */ 209 /* If we only have one func left, then call that directly */
155 if (ftrace_list == &ftrace_list_end || 210 if (ftrace_list->next == &ftrace_list_end) {
156 ftrace_list->next == &ftrace_list_end) 211 ftrace_func_t func = ftrace_list->func;
157 ftrace_trace_function = ftrace_list->func; 212
213 if (ftrace_pid_trace) {
214 set_ftrace_pid_function(func);
215 func = ftrace_pid_func;
216 }
217#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
218 ftrace_trace_function = func;
219#else
220 __ftrace_trace_function = func;
221#endif
222 }
158 } 223 }
159 224
160 out: 225 out:
@@ -163,6 +228,36 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
163 return ret; 228 return ret;
164} 229}
165 230
231static void ftrace_update_pid_func(void)
232{
233 ftrace_func_t func;
234
235 /* should not be called from interrupt context */
236 spin_lock(&ftrace_lock);
237
238 if (ftrace_trace_function == ftrace_stub)
239 goto out;
240
241 func = ftrace_trace_function;
242
243 if (ftrace_pid_trace) {
244 set_ftrace_pid_function(func);
245 func = ftrace_pid_func;
246 } else {
247 if (func == ftrace_pid_func)
248 func = ftrace_pid_function;
249 }
250
251#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
252 ftrace_trace_function = func;
253#else
254 __ftrace_trace_function = func;
255#endif
256
257 out:
258 spin_unlock(&ftrace_lock);
259}
260
166#ifdef CONFIG_DYNAMIC_FTRACE 261#ifdef CONFIG_DYNAMIC_FTRACE
167#ifndef CONFIG_FTRACE_MCOUNT_RECORD 262#ifndef CONFIG_FTRACE_MCOUNT_RECORD
168# error Dynamic ftrace depends on MCOUNT_RECORD 263# error Dynamic ftrace depends on MCOUNT_RECORD
@@ -182,6 +277,8 @@ enum {
182 FTRACE_UPDATE_TRACE_FUNC = (1 << 2), 277 FTRACE_UPDATE_TRACE_FUNC = (1 << 2),
183 FTRACE_ENABLE_MCOUNT = (1 << 3), 278 FTRACE_ENABLE_MCOUNT = (1 << 3),
184 FTRACE_DISABLE_MCOUNT = (1 << 4), 279 FTRACE_DISABLE_MCOUNT = (1 << 4),
280 FTRACE_START_FUNC_RET = (1 << 5),
281 FTRACE_STOP_FUNC_RET = (1 << 6),
185}; 282};
186 283
187static int ftrace_filtered; 284static int ftrace_filtered;
@@ -308,7 +405,7 @@ ftrace_record_ip(unsigned long ip)
308{ 405{
309 struct dyn_ftrace *rec; 406 struct dyn_ftrace *rec;
310 407
311 if (!ftrace_enabled || ftrace_disabled) 408 if (ftrace_disabled)
312 return NULL; 409 return NULL;
313 410
314 rec = ftrace_alloc_dyn_node(ip); 411 rec = ftrace_alloc_dyn_node(ip);
@@ -322,14 +419,51 @@ ftrace_record_ip(unsigned long ip)
322 return rec; 419 return rec;
323} 420}
324 421
325#define FTRACE_ADDR ((long)(ftrace_caller)) 422static void print_ip_ins(const char *fmt, unsigned char *p)
423{
424 int i;
425
426 printk(KERN_CONT "%s", fmt);
427
428 for (i = 0; i < MCOUNT_INSN_SIZE; i++)
429 printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
430}
431
432static void ftrace_bug(int failed, unsigned long ip)
433{
434 switch (failed) {
435 case -EFAULT:
436 FTRACE_WARN_ON_ONCE(1);
437 pr_info("ftrace faulted on modifying ");
438 print_ip_sym(ip);
439 break;
440 case -EINVAL:
441 FTRACE_WARN_ON_ONCE(1);
442 pr_info("ftrace failed to modify ");
443 print_ip_sym(ip);
444 print_ip_ins(" actual: ", (unsigned char *)ip);
445 printk(KERN_CONT "\n");
446 break;
447 case -EPERM:
448 FTRACE_WARN_ON_ONCE(1);
449 pr_info("ftrace faulted on writing ");
450 print_ip_sym(ip);
451 break;
452 default:
453 FTRACE_WARN_ON_ONCE(1);
454 pr_info("ftrace faulted on unknown error ");
455 print_ip_sym(ip);
456 }
457}
458
326 459
327static int 460static int
328__ftrace_replace_code(struct dyn_ftrace *rec, 461__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
329 unsigned char *nop, int enable)
330{ 462{
331 unsigned long ip, fl; 463 unsigned long ip, fl;
332 unsigned char *call, *old, *new; 464 unsigned long ftrace_addr;
465
466 ftrace_addr = (unsigned long)ftrace_caller;
333 467
334 ip = rec->ip; 468 ip = rec->ip;
335 469
@@ -388,34 +522,28 @@ __ftrace_replace_code(struct dyn_ftrace *rec,
388 } 522 }
389 } 523 }
390 524
391 call = ftrace_call_replace(ip, FTRACE_ADDR); 525 if (rec->flags & FTRACE_FL_ENABLED)
392 526 return ftrace_make_call(rec, ftrace_addr);
393 if (rec->flags & FTRACE_FL_ENABLED) { 527 else
394 old = nop; 528 return ftrace_make_nop(NULL, rec, ftrace_addr);
395 new = call;
396 } else {
397 old = call;
398 new = nop;
399 }
400
401 return ftrace_modify_code(ip, old, new);
402} 529}
403 530
404static void ftrace_replace_code(int enable) 531static void ftrace_replace_code(int enable)
405{ 532{
406 int i, failed; 533 int i, failed;
407 unsigned char *nop = NULL;
408 struct dyn_ftrace *rec; 534 struct dyn_ftrace *rec;
409 struct ftrace_page *pg; 535 struct ftrace_page *pg;
410 536
411 nop = ftrace_nop_replace();
412
413 for (pg = ftrace_pages_start; pg; pg = pg->next) { 537 for (pg = ftrace_pages_start; pg; pg = pg->next) {
414 for (i = 0; i < pg->index; i++) { 538 for (i = 0; i < pg->index; i++) {
415 rec = &pg->records[i]; 539 rec = &pg->records[i];
416 540
417 /* don't modify code that has already faulted */ 541 /*
418 if (rec->flags & FTRACE_FL_FAILED) 542 * Skip over free records and records that have
543 * failed.
544 */
545 if (rec->flags & FTRACE_FL_FREE ||
546 rec->flags & FTRACE_FL_FAILED)
419 continue; 547 continue;
420 548
421 /* ignore updates to this record's mcount site */ 549 /* ignore updates to this record's mcount site */
@@ -426,68 +554,30 @@ static void ftrace_replace_code(int enable)
426 unfreeze_record(rec); 554 unfreeze_record(rec);
427 } 555 }
428 556
429 failed = __ftrace_replace_code(rec, nop, enable); 557 failed = __ftrace_replace_code(rec, enable);
430 if (failed && (rec->flags & FTRACE_FL_CONVERTED)) { 558 if (failed && (rec->flags & FTRACE_FL_CONVERTED)) {
431 rec->flags |= FTRACE_FL_FAILED; 559 rec->flags |= FTRACE_FL_FAILED;
432 if ((system_state == SYSTEM_BOOTING) || 560 if ((system_state == SYSTEM_BOOTING) ||
433 !core_kernel_text(rec->ip)) { 561 !core_kernel_text(rec->ip)) {
434 ftrace_free_rec(rec); 562 ftrace_free_rec(rec);
435 } 563 } else
564 ftrace_bug(failed, rec->ip);
436 } 565 }
437 } 566 }
438 } 567 }
439} 568}
440 569
441static void print_ip_ins(const char *fmt, unsigned char *p)
442{
443 int i;
444
445 printk(KERN_CONT "%s", fmt);
446
447 for (i = 0; i < MCOUNT_INSN_SIZE; i++)
448 printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
449}
450
451static int 570static int
452ftrace_code_disable(struct dyn_ftrace *rec) 571ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
453{ 572{
454 unsigned long ip; 573 unsigned long ip;
455 unsigned char *nop, *call;
456 int ret; 574 int ret;
457 575
458 ip = rec->ip; 576 ip = rec->ip;
459 577
460 nop = ftrace_nop_replace(); 578 ret = ftrace_make_nop(mod, rec, mcount_addr);
461 call = ftrace_call_replace(ip, mcount_addr);
462
463 ret = ftrace_modify_code(ip, call, nop);
464 if (ret) { 579 if (ret) {
465 switch (ret) { 580 ftrace_bug(ret, ip);
466 case -EFAULT:
467 FTRACE_WARN_ON_ONCE(1);
468 pr_info("ftrace faulted on modifying ");
469 print_ip_sym(ip);
470 break;
471 case -EINVAL:
472 FTRACE_WARN_ON_ONCE(1);
473 pr_info("ftrace failed to modify ");
474 print_ip_sym(ip);
475 print_ip_ins(" expected: ", call);
476 print_ip_ins(" actual: ", (unsigned char *)ip);
477 print_ip_ins(" replace: ", nop);
478 printk(KERN_CONT "\n");
479 break;
480 case -EPERM:
481 FTRACE_WARN_ON_ONCE(1);
482 pr_info("ftrace faulted on writing ");
483 print_ip_sym(ip);
484 break;
485 default:
486 FTRACE_WARN_ON_ONCE(1);
487 pr_info("ftrace faulted on unknown error ");
488 print_ip_sym(ip);
489 }
490
491 rec->flags |= FTRACE_FL_FAILED; 581 rec->flags |= FTRACE_FL_FAILED;
492 return 0; 582 return 0;
493 } 583 }
@@ -506,6 +596,11 @@ static int __ftrace_modify_code(void *data)
506 if (*command & FTRACE_UPDATE_TRACE_FUNC) 596 if (*command & FTRACE_UPDATE_TRACE_FUNC)
507 ftrace_update_ftrace_func(ftrace_trace_function); 597 ftrace_update_ftrace_func(ftrace_trace_function);
508 598
599 if (*command & FTRACE_START_FUNC_RET)
600 ftrace_enable_ftrace_graph_caller();
601 else if (*command & FTRACE_STOP_FUNC_RET)
602 ftrace_disable_ftrace_graph_caller();
603
509 return 0; 604 return 0;
510} 605}
511 606
@@ -515,43 +610,43 @@ static void ftrace_run_update_code(int command)
515} 610}
516 611
517static ftrace_func_t saved_ftrace_func; 612static ftrace_func_t saved_ftrace_func;
518static int ftrace_start; 613static int ftrace_start_up;
519static DEFINE_MUTEX(ftrace_start_lock);
520 614
521static void ftrace_startup(void) 615static void ftrace_startup_enable(int command)
522{ 616{
523 int command = 0;
524
525 if (unlikely(ftrace_disabled))
526 return;
527
528 mutex_lock(&ftrace_start_lock);
529 ftrace_start++;
530 command |= FTRACE_ENABLE_CALLS;
531
532 if (saved_ftrace_func != ftrace_trace_function) { 617 if (saved_ftrace_func != ftrace_trace_function) {
533 saved_ftrace_func = ftrace_trace_function; 618 saved_ftrace_func = ftrace_trace_function;
534 command |= FTRACE_UPDATE_TRACE_FUNC; 619 command |= FTRACE_UPDATE_TRACE_FUNC;
535 } 620 }
536 621
537 if (!command || !ftrace_enabled) 622 if (!command || !ftrace_enabled)
538 goto out; 623 return;
539 624
540 ftrace_run_update_code(command); 625 ftrace_run_update_code(command);
541 out:
542 mutex_unlock(&ftrace_start_lock);
543} 626}
544 627
545static void ftrace_shutdown(void) 628static void ftrace_startup(int command)
546{ 629{
547 int command = 0; 630 if (unlikely(ftrace_disabled))
631 return;
548 632
633 mutex_lock(&ftrace_start_lock);
634 ftrace_start_up++;
635 command |= FTRACE_ENABLE_CALLS;
636
637 ftrace_startup_enable(command);
638
639 mutex_unlock(&ftrace_start_lock);
640}
641
642static void ftrace_shutdown(int command)
643{
549 if (unlikely(ftrace_disabled)) 644 if (unlikely(ftrace_disabled))
550 return; 645 return;
551 646
552 mutex_lock(&ftrace_start_lock); 647 mutex_lock(&ftrace_start_lock);
553 ftrace_start--; 648 ftrace_start_up--;
554 if (!ftrace_start) 649 if (!ftrace_start_up)
555 command |= FTRACE_DISABLE_CALLS; 650 command |= FTRACE_DISABLE_CALLS;
556 651
557 if (saved_ftrace_func != ftrace_trace_function) { 652 if (saved_ftrace_func != ftrace_trace_function) {
@@ -577,8 +672,8 @@ static void ftrace_startup_sysctl(void)
577 mutex_lock(&ftrace_start_lock); 672 mutex_lock(&ftrace_start_lock);
578 /* Force update next time */ 673 /* Force update next time */
579 saved_ftrace_func = NULL; 674 saved_ftrace_func = NULL;
580 /* ftrace_start is true if we want ftrace running */ 675 /* ftrace_start_up is true if we want ftrace running */
581 if (ftrace_start) 676 if (ftrace_start_up)
582 command |= FTRACE_ENABLE_CALLS; 677 command |= FTRACE_ENABLE_CALLS;
583 678
584 ftrace_run_update_code(command); 679 ftrace_run_update_code(command);
@@ -593,8 +688,8 @@ static void ftrace_shutdown_sysctl(void)
593 return; 688 return;
594 689
595 mutex_lock(&ftrace_start_lock); 690 mutex_lock(&ftrace_start_lock);
596 /* ftrace_start is true if ftrace is running */ 691 /* ftrace_start_up is true if ftrace is running */
597 if (ftrace_start) 692 if (ftrace_start_up)
598 command |= FTRACE_DISABLE_CALLS; 693 command |= FTRACE_DISABLE_CALLS;
599 694
600 ftrace_run_update_code(command); 695 ftrace_run_update_code(command);
@@ -605,7 +700,7 @@ static cycle_t ftrace_update_time;
605static unsigned long ftrace_update_cnt; 700static unsigned long ftrace_update_cnt;
606unsigned long ftrace_update_tot_cnt; 701unsigned long ftrace_update_tot_cnt;
607 702
608static int ftrace_update_code(void) 703static int ftrace_update_code(struct module *mod)
609{ 704{
610 struct dyn_ftrace *p, *t; 705 struct dyn_ftrace *p, *t;
611 cycle_t start, stop; 706 cycle_t start, stop;
@@ -622,7 +717,7 @@ static int ftrace_update_code(void)
622 list_del_init(&p->list); 717 list_del_init(&p->list);
623 718
624 /* convert record (i.e, patch mcount-call with NOP) */ 719 /* convert record (i.e, patch mcount-call with NOP) */
625 if (ftrace_code_disable(p)) { 720 if (ftrace_code_disable(mod, p)) {
626 p->flags |= FTRACE_FL_CONVERTED; 721 p->flags |= FTRACE_FL_CONVERTED;
627 ftrace_update_cnt++; 722 ftrace_update_cnt++;
628 } else 723 } else
@@ -690,7 +785,6 @@ enum {
690#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ 785#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
691 786
692struct ftrace_iterator { 787struct ftrace_iterator {
693 loff_t pos;
694 struct ftrace_page *pg; 788 struct ftrace_page *pg;
695 unsigned idx; 789 unsigned idx;
696 unsigned flags; 790 unsigned flags;
@@ -715,6 +809,8 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
715 iter->pg = iter->pg->next; 809 iter->pg = iter->pg->next;
716 iter->idx = 0; 810 iter->idx = 0;
717 goto retry; 811 goto retry;
812 } else {
813 iter->idx = -1;
718 } 814 }
719 } else { 815 } else {
720 rec = &iter->pg->records[iter->idx++]; 816 rec = &iter->pg->records[iter->idx++];
@@ -737,8 +833,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
737 } 833 }
738 spin_unlock(&ftrace_lock); 834 spin_unlock(&ftrace_lock);
739 835
740 iter->pos = *pos;
741
742 return rec; 836 return rec;
743} 837}
744 838
@@ -746,13 +840,15 @@ static void *t_start(struct seq_file *m, loff_t *pos)
746{ 840{
747 struct ftrace_iterator *iter = m->private; 841 struct ftrace_iterator *iter = m->private;
748 void *p = NULL; 842 void *p = NULL;
749 loff_t l = -1;
750 843
751 if (*pos > iter->pos) 844 if (*pos > 0) {
752 *pos = iter->pos; 845 if (iter->idx < 0)
846 return p;
847 (*pos)--;
848 iter->idx--;
849 }
753 850
754 l = *pos; 851 p = t_next(m, p, pos);
755 p = t_next(m, p, &l);
756 852
757 return p; 853 return p;
758} 854}
@@ -763,21 +859,15 @@ static void t_stop(struct seq_file *m, void *p)
763 859
764static int t_show(struct seq_file *m, void *v) 860static int t_show(struct seq_file *m, void *v)
765{ 861{
766 struct ftrace_iterator *iter = m->private;
767 struct dyn_ftrace *rec = v; 862 struct dyn_ftrace *rec = v;
768 char str[KSYM_SYMBOL_LEN]; 863 char str[KSYM_SYMBOL_LEN];
769 int ret = 0;
770 864
771 if (!rec) 865 if (!rec)
772 return 0; 866 return 0;
773 867
774 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); 868 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
775 869
776 ret = seq_printf(m, "%s\n", str); 870 seq_printf(m, "%s\n", str);
777 if (ret < 0) {
778 iter->pos--;
779 iter->idx--;
780 }
781 871
782 return 0; 872 return 0;
783} 873}
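The t_start()/t_next()/t_show() changes above drop the private iter->pos bookkeeping and lean directly on the seq_file contract: start() positions the cursor for *pos, next() advances it, show() prints one record, stop() releases whatever start() took. A rough stand-alone model of how such callbacks get driven (the real driver is fs/seq_file.c, which also handles buffering and restarts; names below are illustrative):

#include <stdio.h>

static const char *records[] = { "schedule", "do_fork", "vfs_read" };
enum { NRECORDS = sizeof(records) / sizeof(records[0]) };

/* start(): position the iterator at *pos, NULL means "past the end" */
static const void *it_start(long *pos)
{
	return *pos < NRECORDS ? &records[*pos] : NULL;
}

/* next(): advance *pos and return the following record */
static const void *it_next(const void *v, long *pos)
{
	(void)v;
	++*pos;
	return *pos < NRECORDS ? &records[*pos] : NULL;
}

/* show(): emit one record */
static int it_show(const void *v)
{
	printf("%s\n", *(const char * const *)v);
	return 0;
}

int main(void)
{
	long pos = 0;
	const void *v;

	/* roughly how seq_read() drives the callbacks */
	for (v = it_start(&pos); v; v = it_next(v, &pos))
		it_show(v);
	/* stop() would run here to drop any locks taken in start() */
	return 0;
}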
@@ -803,7 +893,6 @@ ftrace_avail_open(struct inode *inode, struct file *file)
803 return -ENOMEM; 893 return -ENOMEM;
804 894
805 iter->pg = ftrace_pages_start; 895 iter->pg = ftrace_pages_start;
806 iter->pos = 0;
807 896
808 ret = seq_open(file, &show_ftrace_seq_ops); 897 ret = seq_open(file, &show_ftrace_seq_ops);
809 if (!ret) { 898 if (!ret) {
@@ -890,7 +979,6 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
890 979
891 if (file->f_mode & FMODE_READ) { 980 if (file->f_mode & FMODE_READ) {
892 iter->pg = ftrace_pages_start; 981 iter->pg = ftrace_pages_start;
893 iter->pos = 0;
894 iter->flags = enable ? FTRACE_ITER_FILTER : 982 iter->flags = enable ? FTRACE_ITER_FILTER :
895 FTRACE_ITER_NOTRACE; 983 FTRACE_ITER_NOTRACE;
896 984
@@ -1181,7 +1269,7 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
1181 1269
1182 mutex_lock(&ftrace_sysctl_lock); 1270 mutex_lock(&ftrace_sysctl_lock);
1183 mutex_lock(&ftrace_start_lock); 1271 mutex_lock(&ftrace_start_lock);
1184 if (ftrace_start && ftrace_enabled) 1272 if (ftrace_start_up && ftrace_enabled)
1185 ftrace_run_update_code(FTRACE_ENABLE_CALLS); 1273 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
1186 mutex_unlock(&ftrace_start_lock); 1274 mutex_unlock(&ftrace_start_lock);
1187 mutex_unlock(&ftrace_sysctl_lock); 1275 mutex_unlock(&ftrace_sysctl_lock);
@@ -1233,12 +1321,233 @@ static struct file_operations ftrace_notrace_fops = {
1233 .release = ftrace_notrace_release, 1321 .release = ftrace_notrace_release,
1234}; 1322};
1235 1323
1236static __init int ftrace_init_debugfs(void) 1324#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1325
1326static DEFINE_MUTEX(graph_lock);
1327
1328int ftrace_graph_count;
1329unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
1330
1331static void *
1332g_next(struct seq_file *m, void *v, loff_t *pos)
1237{ 1333{
1238 struct dentry *d_tracer; 1334 unsigned long *array = m->private;
1239 struct dentry *entry; 1335 int index = *pos;
1240 1336
1241 d_tracer = tracing_init_dentry(); 1337 (*pos)++;
1338
1339 if (index >= ftrace_graph_count)
1340 return NULL;
1341
1342 return &array[index];
1343}
1344
1345static void *g_start(struct seq_file *m, loff_t *pos)
1346{
1347 void *p = NULL;
1348
1349 mutex_lock(&graph_lock);
1350
1351 p = g_next(m, p, pos);
1352
1353 return p;
1354}
1355
1356static void g_stop(struct seq_file *m, void *p)
1357{
1358 mutex_unlock(&graph_lock);
1359}
1360
1361static int g_show(struct seq_file *m, void *v)
1362{
1363 unsigned long *ptr = v;
1364 char str[KSYM_SYMBOL_LEN];
1365
1366 if (!ptr)
1367 return 0;
1368
1369 kallsyms_lookup(*ptr, NULL, NULL, NULL, str);
1370
1371 seq_printf(m, "%s\n", str);
1372
1373 return 0;
1374}
1375
1376static struct seq_operations ftrace_graph_seq_ops = {
1377 .start = g_start,
1378 .next = g_next,
1379 .stop = g_stop,
1380 .show = g_show,
1381};
1382
1383static int
1384ftrace_graph_open(struct inode *inode, struct file *file)
1385{
1386 int ret = 0;
1387
1388 if (unlikely(ftrace_disabled))
1389 return -ENODEV;
1390
1391 mutex_lock(&graph_lock);
1392 if ((file->f_mode & FMODE_WRITE) &&
1393 !(file->f_flags & O_APPEND)) {
1394 ftrace_graph_count = 0;
1395 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
1396 }
1397
1398 if (file->f_mode & FMODE_READ) {
1399 ret = seq_open(file, &ftrace_graph_seq_ops);
1400 if (!ret) {
1401 struct seq_file *m = file->private_data;
1402 m->private = ftrace_graph_funcs;
1403 }
1404 } else
1405 file->private_data = ftrace_graph_funcs;
1406 mutex_unlock(&graph_lock);
1407
1408 return ret;
1409}
1410
1411static ssize_t
1412ftrace_graph_read(struct file *file, char __user *ubuf,
1413 size_t cnt, loff_t *ppos)
1414{
1415 if (file->f_mode & FMODE_READ)
1416 return seq_read(file, ubuf, cnt, ppos);
1417 else
1418 return -EPERM;
1419}
1420
1421static int
1422ftrace_set_func(unsigned long *array, int idx, char *buffer)
1423{
1424 char str[KSYM_SYMBOL_LEN];
1425 struct dyn_ftrace *rec;
1426 struct ftrace_page *pg;
1427 int found = 0;
1428 int i, j;
1429
1430 if (ftrace_disabled)
1431 return -ENODEV;
1432
1433 /* should not be called from interrupt context */
1434 spin_lock(&ftrace_lock);
1435
1436 for (pg = ftrace_pages_start; pg; pg = pg->next) {
1437 for (i = 0; i < pg->index; i++) {
1438 rec = &pg->records[i];
1439
1440 if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE))
1441 continue;
1442
1443 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
1444 if (strcmp(str, buffer) == 0) {
1445 found = 1;
1446 for (j = 0; j < idx; j++)
1447 if (array[j] == rec->ip) {
1448 found = 0;
1449 break;
1450 }
1451 if (found)
1452 array[idx] = rec->ip;
1453 break;
1454 }
1455 }
1456 }
1457 spin_unlock(&ftrace_lock);
1458
1459 return found ? 0 : -EINVAL;
1460}
1461
1462static ssize_t
1463ftrace_graph_write(struct file *file, const char __user *ubuf,
1464 size_t cnt, loff_t *ppos)
1465{
1466 unsigned char buffer[FTRACE_BUFF_MAX+1];
1467 unsigned long *array;
1468 size_t read = 0;
1469 ssize_t ret;
1470 int index = 0;
1471 char ch;
1472
1473 if (!cnt || cnt < 0)
1474 return 0;
1475
1476 mutex_lock(&graph_lock);
1477
1478 if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) {
1479 ret = -EBUSY;
1480 goto out;
1481 }
1482
1483 if (file->f_mode & FMODE_READ) {
1484 struct seq_file *m = file->private_data;
1485 array = m->private;
1486 } else
1487 array = file->private_data;
1488
1489 ret = get_user(ch, ubuf++);
1490 if (ret)
1491 goto out;
1492 read++;
1493 cnt--;
1494
1495 /* skip white space */
1496 while (cnt && isspace(ch)) {
1497 ret = get_user(ch, ubuf++);
1498 if (ret)
1499 goto out;
1500 read++;
1501 cnt--;
1502 }
1503
1504 if (isspace(ch)) {
1505 *ppos += read;
1506 ret = read;
1507 goto out;
1508 }
1509
1510 while (cnt && !isspace(ch)) {
1511 if (index < FTRACE_BUFF_MAX)
1512 buffer[index++] = ch;
1513 else {
1514 ret = -EINVAL;
1515 goto out;
1516 }
1517 ret = get_user(ch, ubuf++);
1518 if (ret)
1519 goto out;
1520 read++;
1521 cnt--;
1522 }
1523 buffer[index] = 0;
1524
1525 /* we allow only one at a time */
1526 ret = ftrace_set_func(array, ftrace_graph_count, buffer);
1527 if (ret)
1528 goto out;
1529
1530 ftrace_graph_count++;
1531
1532 file->f_pos += read;
1533
1534 ret = read;
1535 out:
1536 mutex_unlock(&graph_lock);
1537
1538 return ret;
1539}
1540
1541static const struct file_operations ftrace_graph_fops = {
1542 .open = ftrace_graph_open,
1543 .read = ftrace_graph_read,
1544 .write = ftrace_graph_write,
1545};
1546#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
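ftrace_graph_open()/ftrace_graph_write() above back the new set_graph_function debugfs file: each write adds one function name to ftrace_graph_funcs[], and opening for write without O_APPEND clears the list first. A sketch of driving it from user space, assuming root privileges and debugfs mounted at /sys/kernel/debug (both are assumptions of this example):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* path assumes debugfs is mounted at /sys/kernel/debug */
	const char *path = "/sys/kernel/debug/tracing/set_graph_function";
	const char *func = "schedule\n";   /* one symbol per write */
	int fd = open(path, O_WRONLY);     /* O_APPEND would keep earlier entries */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, func, strlen(func)) < 0)
		perror("write");
	close(fd);
	return 0;
}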
1547
1548static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
1549{
1550 struct dentry *entry;
1242 1551
1243 entry = debugfs_create_file("available_filter_functions", 0444, 1552 entry = debugfs_create_file("available_filter_functions", 0444,
1244 d_tracer, NULL, &ftrace_avail_fops); 1553 d_tracer, NULL, &ftrace_avail_fops);
@@ -1263,12 +1572,20 @@ static __init int ftrace_init_debugfs(void)
1263 pr_warning("Could not create debugfs " 1572 pr_warning("Could not create debugfs "
1264 "'set_ftrace_notrace' entry\n"); 1573 "'set_ftrace_notrace' entry\n");
1265 1574
1575#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1576 entry = debugfs_create_file("set_graph_function", 0444, d_tracer,
1577 NULL,
1578 &ftrace_graph_fops);
1579 if (!entry)
1580 pr_warning("Could not create debugfs "
1581 "'set_graph_function' entry\n");
1582#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
1583
1266 return 0; 1584 return 0;
1267} 1585}
1268 1586
1269fs_initcall(ftrace_init_debugfs); 1587static int ftrace_convert_nops(struct module *mod,
1270 1588 unsigned long *start,
1271static int ftrace_convert_nops(unsigned long *start,
1272 unsigned long *end) 1589 unsigned long *end)
1273{ 1590{
1274 unsigned long *p; 1591 unsigned long *p;
@@ -1279,23 +1596,32 @@ static int ftrace_convert_nops(unsigned long *start,
1279 p = start; 1596 p = start;
1280 while (p < end) { 1597 while (p < end) {
1281 addr = ftrace_call_adjust(*p++); 1598 addr = ftrace_call_adjust(*p++);
1599 /*
1600 * Some architecture linkers will pad between
1601 * the different mcount_loc sections of different
1602 * object files to satisfy alignments.
1603 * Skip any NULL pointers.
1604 */
1605 if (!addr)
1606 continue;
1282 ftrace_record_ip(addr); 1607 ftrace_record_ip(addr);
1283 } 1608 }
1284 1609
1285 /* disable interrupts to prevent kstop machine */ 1610 /* disable interrupts to prevent kstop machine */
1286 local_irq_save(flags); 1611 local_irq_save(flags);
1287 ftrace_update_code(); 1612 ftrace_update_code(mod);
1288 local_irq_restore(flags); 1613 local_irq_restore(flags);
1289 mutex_unlock(&ftrace_start_lock); 1614 mutex_unlock(&ftrace_start_lock);
1290 1615
1291 return 0; 1616 return 0;
1292} 1617}
1293 1618
1294void ftrace_init_module(unsigned long *start, unsigned long *end) 1619void ftrace_init_module(struct module *mod,
1620 unsigned long *start, unsigned long *end)
1295{ 1621{
1296 if (ftrace_disabled || start == end) 1622 if (ftrace_disabled || start == end)
1297 return; 1623 return;
1298 ftrace_convert_nops(start, end); 1624 ftrace_convert_nops(mod, start, end);
1299} 1625}
1300 1626
1301extern unsigned long __start_mcount_loc[]; 1627extern unsigned long __start_mcount_loc[];
@@ -1325,7 +1651,8 @@ void __init ftrace_init(void)
1325 1651
1326 last_ftrace_enabled = ftrace_enabled = 1; 1652 last_ftrace_enabled = ftrace_enabled = 1;
1327 1653
1328 ret = ftrace_convert_nops(__start_mcount_loc, 1654 ret = ftrace_convert_nops(NULL,
1655 __start_mcount_loc,
1329 __stop_mcount_loc); 1656 __stop_mcount_loc);
1330 1657
1331 return; 1658 return;
@@ -1342,12 +1669,186 @@ static int __init ftrace_nodyn_init(void)
1342} 1669}
1343device_initcall(ftrace_nodyn_init); 1670device_initcall(ftrace_nodyn_init);
1344 1671
1345# define ftrace_startup() do { } while (0) 1672static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
1346# define ftrace_shutdown() do { } while (0) 1673static inline void ftrace_startup_enable(int command) { }
1674/* Keep as macros so we do not need to define the commands */
1675# define ftrace_startup(command) do { } while (0)
1676# define ftrace_shutdown(command) do { } while (0)
1347# define ftrace_startup_sysctl() do { } while (0) 1677# define ftrace_startup_sysctl() do { } while (0)
1348# define ftrace_shutdown_sysctl() do { } while (0) 1678# define ftrace_shutdown_sysctl() do { } while (0)
1349#endif /* CONFIG_DYNAMIC_FTRACE */ 1679#endif /* CONFIG_DYNAMIC_FTRACE */
1350 1680
1681static ssize_t
1682ftrace_pid_read(struct file *file, char __user *ubuf,
1683 size_t cnt, loff_t *ppos)
1684{
1685 char buf[64];
1686 int r;
1687
1688 if (ftrace_pid_trace == ftrace_swapper_pid)
1689 r = sprintf(buf, "swapper tasks\n");
1690 else if (ftrace_pid_trace)
1691 r = sprintf(buf, "%u\n", pid_nr(ftrace_pid_trace));
1692 else
1693 r = sprintf(buf, "no pid\n");
1694
1695 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
1696}
1697
1698static void clear_ftrace_swapper(void)
1699{
1700 struct task_struct *p;
1701 int cpu;
1702
1703 get_online_cpus();
1704 for_each_online_cpu(cpu) {
1705 p = idle_task(cpu);
1706 clear_tsk_trace_trace(p);
1707 }
1708 put_online_cpus();
1709}
1710
1711static void set_ftrace_swapper(void)
1712{
1713 struct task_struct *p;
1714 int cpu;
1715
1716 get_online_cpus();
1717 for_each_online_cpu(cpu) {
1718 p = idle_task(cpu);
1719 set_tsk_trace_trace(p);
1720 }
1721 put_online_cpus();
1722}
1723
1724static void clear_ftrace_pid(struct pid *pid)
1725{
1726 struct task_struct *p;
1727
1728 do_each_pid_task(pid, PIDTYPE_PID, p) {
1729 clear_tsk_trace_trace(p);
1730 } while_each_pid_task(pid, PIDTYPE_PID, p);
1731 put_pid(pid);
1732}
1733
1734static void set_ftrace_pid(struct pid *pid)
1735{
1736 struct task_struct *p;
1737
1738 do_each_pid_task(pid, PIDTYPE_PID, p) {
1739 set_tsk_trace_trace(p);
1740 } while_each_pid_task(pid, PIDTYPE_PID, p);
1741}
1742
1743static void clear_ftrace_pid_task(struct pid **pid)
1744{
1745 if (*pid == ftrace_swapper_pid)
1746 clear_ftrace_swapper();
1747 else
1748 clear_ftrace_pid(*pid);
1749
1750 *pid = NULL;
1751}
1752
1753static void set_ftrace_pid_task(struct pid *pid)
1754{
1755 if (pid == ftrace_swapper_pid)
1756 set_ftrace_swapper();
1757 else
1758 set_ftrace_pid(pid);
1759}
1760
1761static ssize_t
1762ftrace_pid_write(struct file *filp, const char __user *ubuf,
1763 size_t cnt, loff_t *ppos)
1764{
1765 struct pid *pid;
1766 char buf[64];
1767 long val;
1768 int ret;
1769
1770 if (cnt >= sizeof(buf))
1771 return -EINVAL;
1772
1773 if (copy_from_user(&buf, ubuf, cnt))
1774 return -EFAULT;
1775
1776 buf[cnt] = 0;
1777
1778 ret = strict_strtol(buf, 10, &val);
1779 if (ret < 0)
1780 return ret;
1781
1782 mutex_lock(&ftrace_start_lock);
1783 if (val < 0) {
1784 /* disable pid tracing */
1785 if (!ftrace_pid_trace)
1786 goto out;
1787
1788 clear_ftrace_pid_task(&ftrace_pid_trace);
1789
1790 } else {
1791 /* swapper task is special */
1792 if (!val) {
1793 pid = ftrace_swapper_pid;
1794 if (pid == ftrace_pid_trace)
1795 goto out;
1796 } else {
1797 pid = find_get_pid(val);
1798
1799 if (pid == ftrace_pid_trace) {
1800 put_pid(pid);
1801 goto out;
1802 }
1803 }
1804
1805 if (ftrace_pid_trace)
1806 clear_ftrace_pid_task(&ftrace_pid_trace);
1807
1808 if (!pid)
1809 goto out;
1810
1811 ftrace_pid_trace = pid;
1812
1813 set_ftrace_pid_task(ftrace_pid_trace);
1814 }
1815
1816 /* update the function call */
1817 ftrace_update_pid_func();
1818 ftrace_startup_enable(0);
1819
1820 out:
1821 mutex_unlock(&ftrace_start_lock);
1822
1823 return cnt;
1824}
1825
1826static struct file_operations ftrace_pid_fops = {
1827 .read = ftrace_pid_read,
1828 .write = ftrace_pid_write,
1829};
1830
1831static __init int ftrace_init_debugfs(void)
1832{
1833 struct dentry *d_tracer;
1834 struct dentry *entry;
1835
1836 d_tracer = tracing_init_dentry();
1837 if (!d_tracer)
1838 return 0;
1839
1840 ftrace_init_dyn_debugfs(d_tracer);
1841
1842 entry = debugfs_create_file("set_ftrace_pid", 0644, d_tracer,
1843 NULL, &ftrace_pid_fops);
1844 if (!entry)
1845 pr_warning("Could not create debugfs "
1846 "'set_ftrace_pid' entry\n");
1847 return 0;
1848}
1849
1850fs_initcall(ftrace_init_debugfs);
1851
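ftrace_pid_write() above accepts a decimal value: a positive pid limits function tracing to that task, 0 selects the per-cpu idle (swapper) tasks, and a negative value clears the filter again. A user-space sketch, again assuming debugfs is mounted at /sys/kernel/debug:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_str(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, val, strlen(val)) < 0) {
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	/* path assumes debugfs is mounted at /sys/kernel/debug */
	const char *pid_file = "/sys/kernel/debug/tracing/set_ftrace_pid";
	char buf[32];

	/* trace only this process: write its pid */
	snprintf(buf, sizeof(buf), "%d\n", getpid());
	if (write_str(pid_file, buf))
		perror(pid_file);

	/* "0" would select the idle (swapper) tasks; */
	/* a negative value clears the pid filter.    */
	if (write_str(pid_file, "-1\n"))
		perror(pid_file);
	return 0;
}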
1351/** 1852/**
1352 * ftrace_kill - kill ftrace 1853 * ftrace_kill - kill ftrace
1353 * 1854 *
@@ -1381,10 +1882,11 @@ int register_ftrace_function(struct ftrace_ops *ops)
1381 return -1; 1882 return -1;
1382 1883
1383 mutex_lock(&ftrace_sysctl_lock); 1884 mutex_lock(&ftrace_sysctl_lock);
1885
1384 ret = __register_ftrace_function(ops); 1886 ret = __register_ftrace_function(ops);
1385 ftrace_startup(); 1887 ftrace_startup(0);
1386 mutex_unlock(&ftrace_sysctl_lock);
1387 1888
1889 mutex_unlock(&ftrace_sysctl_lock);
1388 return ret; 1890 return ret;
1389} 1891}
1390 1892
@@ -1400,7 +1902,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
1400 1902
1401 mutex_lock(&ftrace_sysctl_lock); 1903 mutex_lock(&ftrace_sysctl_lock);
1402 ret = __unregister_ftrace_function(ops); 1904 ret = __unregister_ftrace_function(ops);
1403 ftrace_shutdown(); 1905 ftrace_shutdown(0);
1404 mutex_unlock(&ftrace_sysctl_lock); 1906 mutex_unlock(&ftrace_sysctl_lock);
1405 1907
1406 return ret; 1908 return ret;
@@ -1449,3 +1951,153 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
1449 return ret; 1951 return ret;
1450} 1952}
1451 1953
1954#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1955
1956static atomic_t ftrace_graph_active;
1957
1958int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
1959{
1960 return 0;
1961}
1962
1963/* The callbacks that hook a function */
1964trace_func_graph_ret_t ftrace_graph_return =
1965 (trace_func_graph_ret_t)ftrace_stub;
1966trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub;
1967
1968/* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */
1969static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
1970{
1971 int i;
1972 int ret = 0;
1973 unsigned long flags;
1974 int start = 0, end = FTRACE_RETSTACK_ALLOC_SIZE;
1975 struct task_struct *g, *t;
1976
1977 for (i = 0; i < FTRACE_RETSTACK_ALLOC_SIZE; i++) {
1978 ret_stack_list[i] = kmalloc(FTRACE_RETFUNC_DEPTH
1979 * sizeof(struct ftrace_ret_stack),
1980 GFP_KERNEL);
1981 if (!ret_stack_list[i]) {
1982 start = 0;
1983 end = i;
1984 ret = -ENOMEM;
1985 goto free;
1986 }
1987 }
1988
1989 read_lock_irqsave(&tasklist_lock, flags);
1990 do_each_thread(g, t) {
1991 if (start == end) {
1992 ret = -EAGAIN;
1993 goto unlock;
1994 }
1995
1996 if (t->ret_stack == NULL) {
1997 t->curr_ret_stack = -1;
1998 /* Make sure IRQs see the -1 first: */
1999 barrier();
2000 t->ret_stack = ret_stack_list[start++];
2001 atomic_set(&t->tracing_graph_pause, 0);
2002 atomic_set(&t->trace_overrun, 0);
2003 }
2004 } while_each_thread(g, t);
2005
2006unlock:
2007 read_unlock_irqrestore(&tasklist_lock, flags);
2008free:
2009 for (i = start; i < end; i++)
2010 kfree(ret_stack_list[i]);
2011 return ret;
2012}
2013
2014/* Allocate a return stack for each task */
2015static int start_graph_tracing(void)
2016{
2017 struct ftrace_ret_stack **ret_stack_list;
2018 int ret;
2019
2020 ret_stack_list = kmalloc(FTRACE_RETSTACK_ALLOC_SIZE *
2021 sizeof(struct ftrace_ret_stack *),
2022 GFP_KERNEL);
2023
2024 if (!ret_stack_list)
2025 return -ENOMEM;
2026
2027 do {
2028 ret = alloc_retstack_tasklist(ret_stack_list);
2029 } while (ret == -EAGAIN);
2030
2031 kfree(ret_stack_list);
2032 return ret;
2033}
2034
2035int register_ftrace_graph(trace_func_graph_ret_t retfunc,
2036 trace_func_graph_ent_t entryfunc)
2037{
2038 int ret = 0;
2039
2040 mutex_lock(&ftrace_sysctl_lock);
2041
2042 atomic_inc(&ftrace_graph_active);
2043 ret = start_graph_tracing();
2044 if (ret) {
2045 atomic_dec(&ftrace_graph_active);
2046 goto out;
2047 }
2048
2049 ftrace_graph_return = retfunc;
2050 ftrace_graph_entry = entryfunc;
2051
2052 ftrace_startup(FTRACE_START_FUNC_RET);
2053
2054out:
2055 mutex_unlock(&ftrace_sysctl_lock);
2056 return ret;
2057}
2058
2059void unregister_ftrace_graph(void)
2060{
2061 mutex_lock(&ftrace_sysctl_lock);
2062
2063 atomic_dec(&ftrace_graph_active);
2064 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
2065 ftrace_graph_entry = ftrace_graph_entry_stub;
2066 ftrace_shutdown(FTRACE_STOP_FUNC_RET);
2067
2068 mutex_unlock(&ftrace_sysctl_lock);
2069}
2070
2071/* Allocate a return stack for newly created task */
2072void ftrace_graph_init_task(struct task_struct *t)
2073{
2074 if (atomic_read(&ftrace_graph_active)) {
2075 t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
2076 * sizeof(struct ftrace_ret_stack),
2077 GFP_KERNEL);
2078 if (!t->ret_stack)
2079 return;
2080 t->curr_ret_stack = -1;
2081 atomic_set(&t->tracing_graph_pause, 0);
2082 atomic_set(&t->trace_overrun, 0);
2083 } else
2084 t->ret_stack = NULL;
2085}
2086
2087void ftrace_graph_exit_task(struct task_struct *t)
2088{
2089 struct ftrace_ret_stack *ret_stack = t->ret_stack;
2090
2091 t->ret_stack = NULL;
2092 /* NULL must become visible to IRQs before we free it: */
2093 barrier();
2094
2095 kfree(ret_stack);
2096}
2097
2098void ftrace_graph_stop(void)
2099{
2100 ftrace_stop();
2101}
2102#endif
2103
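register_ftrace_graph() and ftrace_graph_init_task() above give every task a fixed-depth ret_stack[], a curr_ret_stack index starting at -1, and a trace_overrun counter; calls that nest deeper than the array are only counted, never recorded. A tiny stand-alone model of that bookkeeping (the depth and names here are illustrative, not the kernel's):

#include <stdio.h>

#define RET_DEPTH 4                       /* stands in for FTRACE_RETFUNC_DEPTH */

struct ret_frame { unsigned long ret; unsigned long func; };

static struct ret_frame ret_stack[RET_DEPTH];
static int curr = -1;                     /* -1 means "empty", like curr_ret_stack */
static unsigned long overrun;

static int push_return(unsigned long ret, unsigned long func)
{
	if (curr + 1 >= RET_DEPTH) {
		overrun++;                /* too deep: count it, trace nothing */
		return -1;
	}
	curr++;
	ret_stack[curr].ret = ret;
	ret_stack[curr].func = func;
	return 0;
}

static unsigned long pop_return(void)
{
	unsigned long ret = ret_stack[curr].ret;

	curr--;
	return ret;
}

int main(void)
{
	int i;

	for (i = 0; i < 6; i++)           /* pretend we nested six calls deep */
		push_return(0x1000 + i, 0x2000 + i);
	while (curr >= 0)
		printf("return to %#lx\n", pop_return());
	printf("overruns: %lu\n", overrun);
	return 0;
}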
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 668bbb5ef2bd..7f69cfeaadf7 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -18,8 +18,46 @@
18 18
19#include "trace.h" 19#include "trace.h"
20 20
21/* Global flag to disable all recording to ring buffers */ 21/*
22static int ring_buffers_off __read_mostly; 22 * A fast way to enable or disable all ring buffers is to
23 * call tracing_on or tracing_off. Turning off the ring buffers
24 * prevents all ring buffers from being recorded to.
25 * Turning this switch on makes it OK to write to the
26 * ring buffer, if the ring buffer is enabled itself.
27 *
28 * There are three layers that must be on in order to write
29 * to the ring buffer.
30 *
31 * 1) This global flag must be set.
32 * 2) The ring buffer must be enabled for recording.
33 * 3) The per cpu buffer must be enabled for recording.
34 *
35 * In case of an anomaly, this global flag has a bit set that
36 * will permanently disable all ring buffers.
37 */
38
39/*
40 * Global flag to disable all recording to ring buffers
41 * This has two bits: ON, DISABLED
42 *
43 * ON DISABLED
44 * ---- ----------
45 * 0 0 : ring buffers are off
46 * 1 0 : ring buffers are on
47 * X 1 : ring buffers are permanently disabled
48 */
49
50enum {
51 RB_BUFFERS_ON_BIT = 0,
52 RB_BUFFERS_DISABLED_BIT = 1,
53};
54
55enum {
56 RB_BUFFERS_ON = 1 << RB_BUFFERS_ON_BIT,
57 RB_BUFFERS_DISABLED = 1 << RB_BUFFERS_DISABLED_BIT,
58};
59
60static long ring_buffer_flags __read_mostly = RB_BUFFERS_ON;
23 61
24/** 62/**
25 * tracing_on - enable all tracing buffers 63 * tracing_on - enable all tracing buffers
@@ -29,7 +67,7 @@ static int ring_buffers_off __read_mostly;
29 */ 67 */
30void tracing_on(void) 68void tracing_on(void)
31{ 69{
32 ring_buffers_off = 0; 70 set_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags);
33} 71}
34 72
35/** 73/**
@@ -42,9 +80,22 @@ void tracing_on(void)
42 */ 80 */
43void tracing_off(void) 81void tracing_off(void)
44{ 82{
45 ring_buffers_off = 1; 83 clear_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags);
46} 84}
47 85
86/**
87 * tracing_off_permanent - permanently disable ring buffers
88 *
89 * This function, once called, will disable all ring buffers
90 * permanently.
91 */
92void tracing_off_permanent(void)
93{
94 set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags);
95}
96
97#include "trace.h"
98
48/* Up this if you want to test the TIME_EXTENTS and normalization */ 99/* Up this if you want to test the TIME_EXTENTS and normalization */
49#define DEBUG_SHIFT 0 100#define DEBUG_SHIFT 0
50 101
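tracing_on(), tracing_off() and tracing_off_permanent() above replace the old ring_buffers_off integer with the two-bit ring_buffer_flags word: the ON bit can be toggled freely, while the DISABLED bit, once set, shuts recording off for good, because writers require the word to equal RB_BUFFERS_ON exactly. A small stand-alone sketch of that policy using plain bit operations in place of the kernel's set_bit()/clear_bit():

#include <stdio.h>

enum {
	RB_ON       = 1 << 0,   /* RB_BUFFERS_ON_BIT */
	RB_DISABLED = 1 << 1,   /* RB_BUFFERS_DISABLED_BIT */
};

static unsigned long flags = RB_ON;

static int recording_allowed(void)
{
	/* writes are allowed only while the word is exactly "ON" */
	return flags == RB_ON;
}

int main(void)
{
	printf("start: %d\n", recording_allowed());      /* 1 */

	flags &= ~RB_ON;                                  /* tracing_off() */
	printf("off:   %d\n", recording_allowed());      /* 0 */

	flags |= RB_ON;                                   /* tracing_on() */
	flags |= RB_DISABLED;                             /* tracing_off_permanent() */
	printf("dead:  %d\n", recording_allowed());      /* 0, and stays 0 */
	return 0;
}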
@@ -144,20 +195,24 @@ void *ring_buffer_event_data(struct ring_buffer_event *event)
144#define TS_MASK ((1ULL << TS_SHIFT) - 1) 195#define TS_MASK ((1ULL << TS_SHIFT) - 1)
145#define TS_DELTA_TEST (~TS_MASK) 196#define TS_DELTA_TEST (~TS_MASK)
146 197
147/* 198struct buffer_data_page {
148 * This hack stolen from mm/slob.c.
149 * We can store per page timing information in the page frame of the page.
150 * Thanks to Peter Zijlstra for suggesting this idea.
151 */
152struct buffer_page {
153 u64 time_stamp; /* page time stamp */ 199 u64 time_stamp; /* page time stamp */
154 local_t write; /* index for next write */
155 local_t commit; /* write commited index */ 200 local_t commit; /* write commited index */
201 unsigned char data[]; /* data of buffer page */
202};
203
204struct buffer_page {
205 local_t write; /* index for next write */
156 unsigned read; /* index for next read */ 206 unsigned read; /* index for next read */
157 struct list_head list; /* list of free pages */ 207 struct list_head list; /* list of free pages */
158 void *page; /* Actual data page */ 208 struct buffer_data_page *page; /* Actual data page */
159}; 209};
160 210
211static void rb_init_page(struct buffer_data_page *bpage)
212{
213 local_set(&bpage->commit, 0);
214}
215
161/* 216/*
162 * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing 217 * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing
163 * this issue out. 218 * this issue out.
@@ -179,7 +234,7 @@ static inline int test_time_stamp(u64 delta)
179 return 0; 234 return 0;
180} 235}
181 236
182#define BUF_PAGE_SIZE PAGE_SIZE 237#define BUF_PAGE_SIZE (PAGE_SIZE - sizeof(struct buffer_data_page))
183 238
184/* 239/*
185 * head_page == tail_page && head == tail then buffer is empty. 240 * head_page == tail_page && head == tail then buffer is empty.
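The struct split above moves the time stamp and commit index into struct buffer_data_page, i.e. onto the data page itself, which is why BUF_PAGE_SIZE shrinks from PAGE_SIZE to PAGE_SIZE minus that header. A quick stand-alone layout check (the 4 KiB page and the field sizes below are assumptions; the real sizes depend on the architecture's u64 and local_t):

#include <stdio.h>

#define PAGE_SIZE 4096UL                 /* assumption: 4 KiB pages */

/* user-space stand-ins for the kernel's u64 and local_t, layout only */
struct data_page_header {
	unsigned long long time_stamp;   /* page time stamp */
	long commit;                     /* committed write index */
	unsigned char data[];            /* event data fills the rest */
};

int main(void)
{
	unsigned long header = sizeof(struct data_page_header);

	printf("header: %lu bytes, usable per page: %lu bytes\n",
	       header, PAGE_SIZE - header);
	return 0;
}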
@@ -187,7 +242,8 @@ static inline int test_time_stamp(u64 delta)
187struct ring_buffer_per_cpu { 242struct ring_buffer_per_cpu {
188 int cpu; 243 int cpu;
189 struct ring_buffer *buffer; 244 struct ring_buffer *buffer;
190 spinlock_t lock; 245 spinlock_t reader_lock; /* serialize readers */
246 raw_spinlock_t lock;
191 struct lock_class_key lock_key; 247 struct lock_class_key lock_key;
192 struct list_head pages; 248 struct list_head pages;
193 struct buffer_page *head_page; /* read from head */ 249 struct buffer_page *head_page; /* read from head */
@@ -221,32 +277,16 @@ struct ring_buffer_iter {
221 u64 read_stamp; 277 u64 read_stamp;
222}; 278};
223 279
280/* buffer may be either ring_buffer or ring_buffer_per_cpu */
224#define RB_WARN_ON(buffer, cond) \ 281#define RB_WARN_ON(buffer, cond) \
225 do { \ 282 ({ \
226 if (unlikely(cond)) { \ 283 int _____ret = unlikely(cond); \
227 atomic_inc(&buffer->record_disabled); \ 284 if (_____ret) { \
228 WARN_ON(1); \
229 } \
230 } while (0)
231
232#define RB_WARN_ON_RET(buffer, cond) \
233 do { \
234 if (unlikely(cond)) { \
235 atomic_inc(&buffer->record_disabled); \
236 WARN_ON(1); \
237 return -1; \
238 } \
239 } while (0)
240
241#define RB_WARN_ON_ONCE(buffer, cond) \
242 do { \
243 static int once; \
244 if (unlikely(cond) && !once) { \
245 once++; \
246 atomic_inc(&buffer->record_disabled); \ 285 atomic_inc(&buffer->record_disabled); \
247 WARN_ON(1); \ 286 WARN_ON(1); \
248 } \ 287 } \
249 } while (0) 288 _____ret; \
289 })
250 290
251/** 291/**
252 * check_pages - integrity check of buffer pages 292 * check_pages - integrity check of buffer pages
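The rewritten RB_WARN_ON() above is a GNU C statement expression, so it both fires the warning and evaluates to the tested condition, letting callers write if (RB_WARN_ON(...)) return ...; instead of the old fire-and-forget do/while form. A stand-alone illustration of the idiom (it relies on the GCC/Clang ({ ... }) extension; names are illustrative):

#include <stdio.h>

/* evaluates to the condition, and reports it as a side effect */
#define WARN_RET(cond)						\
	({							\
		int _____ret = !!(cond);			\
		if (_____ret)					\
			fprintf(stderr, "warning: %s\n", #cond);\
		_____ret;					\
	})

static int half(int x)
{
	if (WARN_RET(x < 0))	/* warn and bail out in one step */
		return -1;
	return x / 2;
}

int main(void)
{
	printf("%d\n", half(10));   /* 5 */
	printf("%d\n", half(-4));   /* -1, plus a warning on stderr */
	return 0;
}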
@@ -258,16 +298,20 @@ struct ring_buffer_iter {
258static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) 298static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
259{ 299{
260 struct list_head *head = &cpu_buffer->pages; 300 struct list_head *head = &cpu_buffer->pages;
261 struct buffer_page *page, *tmp; 301 struct buffer_page *bpage, *tmp;
262 302
263 RB_WARN_ON_RET(cpu_buffer, head->next->prev != head); 303 if (RB_WARN_ON(cpu_buffer, head->next->prev != head))
264 RB_WARN_ON_RET(cpu_buffer, head->prev->next != head); 304 return -1;
305 if (RB_WARN_ON(cpu_buffer, head->prev->next != head))
306 return -1;
265 307
266 list_for_each_entry_safe(page, tmp, head, list) { 308 list_for_each_entry_safe(bpage, tmp, head, list) {
267 RB_WARN_ON_RET(cpu_buffer, 309 if (RB_WARN_ON(cpu_buffer,
268 page->list.next->prev != &page->list); 310 bpage->list.next->prev != &bpage->list))
269 RB_WARN_ON_RET(cpu_buffer, 311 return -1;
270 page->list.prev->next != &page->list); 312 if (RB_WARN_ON(cpu_buffer,
313 bpage->list.prev->next != &bpage->list))
314 return -1;
271 } 315 }
272 316
273 return 0; 317 return 0;
@@ -277,22 +321,23 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
277 unsigned nr_pages) 321 unsigned nr_pages)
278{ 322{
279 struct list_head *head = &cpu_buffer->pages; 323 struct list_head *head = &cpu_buffer->pages;
280 struct buffer_page *page, *tmp; 324 struct buffer_page *bpage, *tmp;
281 unsigned long addr; 325 unsigned long addr;
282 LIST_HEAD(pages); 326 LIST_HEAD(pages);
283 unsigned i; 327 unsigned i;
284 328
285 for (i = 0; i < nr_pages; i++) { 329 for (i = 0; i < nr_pages; i++) {
286 page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()), 330 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
287 GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); 331 GFP_KERNEL, cpu_to_node(cpu_buffer->cpu));
288 if (!page) 332 if (!bpage)
289 goto free_pages; 333 goto free_pages;
290 list_add(&page->list, &pages); 334 list_add(&bpage->list, &pages);
291 335
292 addr = __get_free_page(GFP_KERNEL); 336 addr = __get_free_page(GFP_KERNEL);
293 if (!addr) 337 if (!addr)
294 goto free_pages; 338 goto free_pages;
295 page->page = (void *)addr; 339 bpage->page = (void *)addr;
340 rb_init_page(bpage->page);
296 } 341 }
297 342
298 list_splice(&pages, head); 343 list_splice(&pages, head);
@@ -302,9 +347,9 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
302 return 0; 347 return 0;
303 348
304 free_pages: 349 free_pages:
305 list_for_each_entry_safe(page, tmp, &pages, list) { 350 list_for_each_entry_safe(bpage, tmp, &pages, list) {
306 list_del_init(&page->list); 351 list_del_init(&bpage->list);
307 free_buffer_page(page); 352 free_buffer_page(bpage);
308 } 353 }
309 return -ENOMEM; 354 return -ENOMEM;
310} 355}
@@ -313,7 +358,7 @@ static struct ring_buffer_per_cpu *
313rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) 358rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
314{ 359{
315 struct ring_buffer_per_cpu *cpu_buffer; 360 struct ring_buffer_per_cpu *cpu_buffer;
316 struct buffer_page *page; 361 struct buffer_page *bpage;
317 unsigned long addr; 362 unsigned long addr;
318 int ret; 363 int ret;
319 364
@@ -324,19 +369,21 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
324 369
325 cpu_buffer->cpu = cpu; 370 cpu_buffer->cpu = cpu;
326 cpu_buffer->buffer = buffer; 371 cpu_buffer->buffer = buffer;
327 spin_lock_init(&cpu_buffer->lock); 372 spin_lock_init(&cpu_buffer->reader_lock);
373 cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
328 INIT_LIST_HEAD(&cpu_buffer->pages); 374 INIT_LIST_HEAD(&cpu_buffer->pages);
329 375
330 page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()), 376 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
331 GFP_KERNEL, cpu_to_node(cpu)); 377 GFP_KERNEL, cpu_to_node(cpu));
332 if (!page) 378 if (!bpage)
333 goto fail_free_buffer; 379 goto fail_free_buffer;
334 380
335 cpu_buffer->reader_page = page; 381 cpu_buffer->reader_page = bpage;
336 addr = __get_free_page(GFP_KERNEL); 382 addr = __get_free_page(GFP_KERNEL);
337 if (!addr) 383 if (!addr)
338 goto fail_free_reader; 384 goto fail_free_reader;
339 page->page = (void *)addr; 385 bpage->page = (void *)addr;
386 rb_init_page(bpage->page);
340 387
341 INIT_LIST_HEAD(&cpu_buffer->reader_page->list); 388 INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
342 389
@@ -361,14 +408,14 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
361static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) 408static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
362{ 409{
363 struct list_head *head = &cpu_buffer->pages; 410 struct list_head *head = &cpu_buffer->pages;
364 struct buffer_page *page, *tmp; 411 struct buffer_page *bpage, *tmp;
365 412
366 list_del_init(&cpu_buffer->reader_page->list); 413 list_del_init(&cpu_buffer->reader_page->list);
367 free_buffer_page(cpu_buffer->reader_page); 414 free_buffer_page(cpu_buffer->reader_page);
368 415
369 list_for_each_entry_safe(page, tmp, head, list) { 416 list_for_each_entry_safe(bpage, tmp, head, list) {
370 list_del_init(&page->list); 417 list_del_init(&bpage->list);
371 free_buffer_page(page); 418 free_buffer_page(bpage);
372 } 419 }
373 kfree(cpu_buffer); 420 kfree(cpu_buffer);
374} 421}
@@ -465,7 +512,7 @@ static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
465static void 512static void
466rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) 513rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
467{ 514{
468 struct buffer_page *page; 515 struct buffer_page *bpage;
469 struct list_head *p; 516 struct list_head *p;
470 unsigned i; 517 unsigned i;
471 518
@@ -473,13 +520,15 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
473 synchronize_sched(); 520 synchronize_sched();
474 521
475 for (i = 0; i < nr_pages; i++) { 522 for (i = 0; i < nr_pages; i++) {
476 BUG_ON(list_empty(&cpu_buffer->pages)); 523 if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages)))
524 return;
477 p = cpu_buffer->pages.next; 525 p = cpu_buffer->pages.next;
478 page = list_entry(p, struct buffer_page, list); 526 bpage = list_entry(p, struct buffer_page, list);
479 list_del_init(&page->list); 527 list_del_init(&bpage->list);
480 free_buffer_page(page); 528 free_buffer_page(bpage);
481 } 529 }
482 BUG_ON(list_empty(&cpu_buffer->pages)); 530 if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages)))
531 return;
483 532
484 rb_reset_cpu(cpu_buffer); 533 rb_reset_cpu(cpu_buffer);
485 534
@@ -493,7 +542,7 @@ static void
493rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, 542rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
494 struct list_head *pages, unsigned nr_pages) 543 struct list_head *pages, unsigned nr_pages)
495{ 544{
496 struct buffer_page *page; 545 struct buffer_page *bpage;
497 struct list_head *p; 546 struct list_head *p;
498 unsigned i; 547 unsigned i;
499 548
@@ -501,11 +550,12 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
501 synchronize_sched(); 550 synchronize_sched();
502 551
503 for (i = 0; i < nr_pages; i++) { 552 for (i = 0; i < nr_pages; i++) {
504 BUG_ON(list_empty(pages)); 553 if (RB_WARN_ON(cpu_buffer, list_empty(pages)))
554 return;
505 p = pages->next; 555 p = pages->next;
506 page = list_entry(p, struct buffer_page, list); 556 bpage = list_entry(p, struct buffer_page, list);
507 list_del_init(&page->list); 557 list_del_init(&bpage->list);
508 list_add_tail(&page->list, &cpu_buffer->pages); 558 list_add_tail(&bpage->list, &cpu_buffer->pages);
509 } 559 }
510 rb_reset_cpu(cpu_buffer); 560 rb_reset_cpu(cpu_buffer);
511 561
@@ -532,7 +582,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
532{ 582{
533 struct ring_buffer_per_cpu *cpu_buffer; 583 struct ring_buffer_per_cpu *cpu_buffer;
534 unsigned nr_pages, rm_pages, new_pages; 584 unsigned nr_pages, rm_pages, new_pages;
535 struct buffer_page *page, *tmp; 585 struct buffer_page *bpage, *tmp;
536 unsigned long buffer_size; 586 unsigned long buffer_size;
537 unsigned long addr; 587 unsigned long addr;
538 LIST_HEAD(pages); 588 LIST_HEAD(pages);
@@ -562,7 +612,10 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
562 if (size < buffer_size) { 612 if (size < buffer_size) {
563 613
564 /* easy case, just free pages */ 614 /* easy case, just free pages */
565 BUG_ON(nr_pages >= buffer->pages); 615 if (RB_WARN_ON(buffer, nr_pages >= buffer->pages)) {
616 mutex_unlock(&buffer->mutex);
617 return -1;
618 }
566 619
567 rm_pages = buffer->pages - nr_pages; 620 rm_pages = buffer->pages - nr_pages;
568 621
@@ -581,21 +634,26 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
581 * add these pages to the cpu_buffers. Otherwise we just free 634 * add these pages to the cpu_buffers. Otherwise we just free
582 * them all and return -ENOMEM; 635 * them all and return -ENOMEM;
583 */ 636 */
584 BUG_ON(nr_pages <= buffer->pages); 637 if (RB_WARN_ON(buffer, nr_pages <= buffer->pages)) {
638 mutex_unlock(&buffer->mutex);
639 return -1;
640 }
641
585 new_pages = nr_pages - buffer->pages; 642 new_pages = nr_pages - buffer->pages;
586 643
587 for_each_buffer_cpu(buffer, cpu) { 644 for_each_buffer_cpu(buffer, cpu) {
588 for (i = 0; i < new_pages; i++) { 645 for (i = 0; i < new_pages; i++) {
589 page = kzalloc_node(ALIGN(sizeof(*page), 646 bpage = kzalloc_node(ALIGN(sizeof(*bpage),
590 cache_line_size()), 647 cache_line_size()),
591 GFP_KERNEL, cpu_to_node(cpu)); 648 GFP_KERNEL, cpu_to_node(cpu));
592 if (!page) 649 if (!bpage)
593 goto free_pages; 650 goto free_pages;
594 list_add(&page->list, &pages); 651 list_add(&bpage->list, &pages);
595 addr = __get_free_page(GFP_KERNEL); 652 addr = __get_free_page(GFP_KERNEL);
596 if (!addr) 653 if (!addr)
597 goto free_pages; 654 goto free_pages;
598 page->page = (void *)addr; 655 bpage->page = (void *)addr;
656 rb_init_page(bpage->page);
599 } 657 }
600 } 658 }
601 659
@@ -604,7 +662,10 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
604 rb_insert_pages(cpu_buffer, &pages, new_pages); 662 rb_insert_pages(cpu_buffer, &pages, new_pages);
605 } 663 }
606 664
607 BUG_ON(!list_empty(&pages)); 665 if (RB_WARN_ON(buffer, !list_empty(&pages))) {
666 mutex_unlock(&buffer->mutex);
667 return -1;
668 }
608 669
609 out: 670 out:
610 buffer->pages = nr_pages; 671 buffer->pages = nr_pages;
@@ -613,9 +674,9 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
613 return size; 674 return size;
614 675
615 free_pages: 676 free_pages:
616 list_for_each_entry_safe(page, tmp, &pages, list) { 677 list_for_each_entry_safe(bpage, tmp, &pages, list) {
617 list_del_init(&page->list); 678 list_del_init(&bpage->list);
618 free_buffer_page(page); 679 free_buffer_page(bpage);
619 } 680 }
620 mutex_unlock(&buffer->mutex); 681 mutex_unlock(&buffer->mutex);
621 return -ENOMEM; 682 return -ENOMEM;
@@ -626,9 +687,15 @@ static inline int rb_null_event(struct ring_buffer_event *event)
626 return event->type == RINGBUF_TYPE_PADDING; 687 return event->type == RINGBUF_TYPE_PADDING;
627} 688}
628 689
629static inline void *__rb_page_index(struct buffer_page *page, unsigned index) 690static inline void *
691__rb_data_page_index(struct buffer_data_page *bpage, unsigned index)
692{
693 return bpage->data + index;
694}
695
696static inline void *__rb_page_index(struct buffer_page *bpage, unsigned index)
630{ 697{
631 return page->page + index; 698 return bpage->page->data + index;
632} 699}
633 700
634static inline struct ring_buffer_event * 701static inline struct ring_buffer_event *
@@ -658,7 +725,7 @@ static inline unsigned rb_page_write(struct buffer_page *bpage)
658 725
659static inline unsigned rb_page_commit(struct buffer_page *bpage) 726static inline unsigned rb_page_commit(struct buffer_page *bpage)
660{ 727{
661 return local_read(&bpage->commit); 728 return local_read(&bpage->page->commit);
662} 729}
663 730
664/* Size is determined by what has been commited */ 731/* Size is determined by what has been commited */
@@ -693,7 +760,8 @@ static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer)
693 head += rb_event_length(event)) { 760 head += rb_event_length(event)) {
694 761
695 event = __rb_page_index(cpu_buffer->head_page, head); 762 event = __rb_page_index(cpu_buffer->head_page, head);
696 BUG_ON(rb_null_event(event)); 763 if (RB_WARN_ON(cpu_buffer, rb_null_event(event)))
764 return;
697 /* Only count data entries */ 765 /* Only count data entries */
698 if (event->type != RINGBUF_TYPE_DATA) 766 if (event->type != RINGBUF_TYPE_DATA)
699 continue; 767 continue;
@@ -703,14 +771,14 @@ static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer)
703} 771}
704 772
705static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer, 773static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
706 struct buffer_page **page) 774 struct buffer_page **bpage)
707{ 775{
708 struct list_head *p = (*page)->list.next; 776 struct list_head *p = (*bpage)->list.next;
709 777
710 if (p == &cpu_buffer->pages) 778 if (p == &cpu_buffer->pages)
711 p = p->next; 779 p = p->next;
712 780
713 *page = list_entry(p, struct buffer_page, list); 781 *bpage = list_entry(p, struct buffer_page, list);
714} 782}
715 783
716static inline unsigned 784static inline unsigned
@@ -746,16 +814,18 @@ rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer,
746 addr &= PAGE_MASK; 814 addr &= PAGE_MASK;
747 815
748 while (cpu_buffer->commit_page->page != (void *)addr) { 816 while (cpu_buffer->commit_page->page != (void *)addr) {
749 RB_WARN_ON(cpu_buffer, 817 if (RB_WARN_ON(cpu_buffer,
750 cpu_buffer->commit_page == cpu_buffer->tail_page); 818 cpu_buffer->commit_page == cpu_buffer->tail_page))
751 cpu_buffer->commit_page->commit = 819 return;
820 cpu_buffer->commit_page->page->commit =
752 cpu_buffer->commit_page->write; 821 cpu_buffer->commit_page->write;
753 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); 822 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
754 cpu_buffer->write_stamp = cpu_buffer->commit_page->time_stamp; 823 cpu_buffer->write_stamp =
824 cpu_buffer->commit_page->page->time_stamp;
755 } 825 }
756 826
757 /* Now set the commit to the event's index */ 827 /* Now set the commit to the event's index */
758 local_set(&cpu_buffer->commit_page->commit, index); 828 local_set(&cpu_buffer->commit_page->page->commit, index);
759} 829}
760 830
761static inline void 831static inline void
@@ -770,16 +840,17 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
770 * assign the commit to the tail. 840 * assign the commit to the tail.
771 */ 841 */
772 while (cpu_buffer->commit_page != cpu_buffer->tail_page) { 842 while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
773 cpu_buffer->commit_page->commit = 843 cpu_buffer->commit_page->page->commit =
774 cpu_buffer->commit_page->write; 844 cpu_buffer->commit_page->write;
775 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); 845 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
776 cpu_buffer->write_stamp = cpu_buffer->commit_page->time_stamp; 846 cpu_buffer->write_stamp =
847 cpu_buffer->commit_page->page->time_stamp;
777 /* add barrier to keep gcc from optimizing too much */ 848 /* add barrier to keep gcc from optimizing too much */
778 barrier(); 849 barrier();
779 } 850 }
780 while (rb_commit_index(cpu_buffer) != 851 while (rb_commit_index(cpu_buffer) !=
781 rb_page_write(cpu_buffer->commit_page)) { 852 rb_page_write(cpu_buffer->commit_page)) {
782 cpu_buffer->commit_page->commit = 853 cpu_buffer->commit_page->page->commit =
783 cpu_buffer->commit_page->write; 854 cpu_buffer->commit_page->write;
784 barrier(); 855 barrier();
785 } 856 }
@@ -787,7 +858,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
787 858
788static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer) 859static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
789{ 860{
790 cpu_buffer->read_stamp = cpu_buffer->reader_page->time_stamp; 861 cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp;
791 cpu_buffer->reader_page->read = 0; 862 cpu_buffer->reader_page->read = 0;
792} 863}
793 864
@@ -806,7 +877,7 @@ static inline void rb_inc_iter(struct ring_buffer_iter *iter)
806 else 877 else
807 rb_inc_page(cpu_buffer, &iter->head_page); 878 rb_inc_page(cpu_buffer, &iter->head_page);
808 879
809 iter->read_stamp = iter->head_page->time_stamp; 880 iter->read_stamp = iter->head_page->page->time_stamp;
810 iter->head = 0; 881 iter->head = 0;
811} 882}
812 883
@@ -894,7 +965,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
894 if (write > BUF_PAGE_SIZE) { 965 if (write > BUF_PAGE_SIZE) {
895 struct buffer_page *next_page = tail_page; 966 struct buffer_page *next_page = tail_page;
896 967
897 spin_lock_irqsave(&cpu_buffer->lock, flags); 968 local_irq_save(flags);
969 __raw_spin_lock(&cpu_buffer->lock);
898 970
899 rb_inc_page(cpu_buffer, &next_page); 971 rb_inc_page(cpu_buffer, &next_page);
900 972
@@ -902,7 +974,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
902 reader_page = cpu_buffer->reader_page; 974 reader_page = cpu_buffer->reader_page;
903 975
904 /* we grabbed the lock before incrementing */ 976 /* we grabbed the lock before incrementing */
905 RB_WARN_ON(cpu_buffer, next_page == reader_page); 977 if (RB_WARN_ON(cpu_buffer, next_page == reader_page))
978 goto out_unlock;
906 979
907 /* 980 /*
908 * If for some reason, we had an interrupt storm that made 981 * If for some reason, we had an interrupt storm that made
@@ -940,12 +1013,12 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
940 */ 1013 */
941 if (tail_page == cpu_buffer->tail_page) { 1014 if (tail_page == cpu_buffer->tail_page) {
942 local_set(&next_page->write, 0); 1015 local_set(&next_page->write, 0);
943 local_set(&next_page->commit, 0); 1016 local_set(&next_page->page->commit, 0);
944 cpu_buffer->tail_page = next_page; 1017 cpu_buffer->tail_page = next_page;
945 1018
946 /* reread the time stamp */ 1019 /* reread the time stamp */
947 *ts = ring_buffer_time_stamp(cpu_buffer->cpu); 1020 *ts = ring_buffer_time_stamp(cpu_buffer->cpu);
948 cpu_buffer->tail_page->time_stamp = *ts; 1021 cpu_buffer->tail_page->page->time_stamp = *ts;
949 } 1022 }
950 1023
951 /* 1024 /*
@@ -970,7 +1043,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
970 rb_set_commit_to_write(cpu_buffer); 1043 rb_set_commit_to_write(cpu_buffer);
971 } 1044 }
972 1045
973 spin_unlock_irqrestore(&cpu_buffer->lock, flags); 1046 __raw_spin_unlock(&cpu_buffer->lock);
1047 local_irq_restore(flags);
974 1048
975 /* fail and let the caller try again */ 1049 /* fail and let the caller try again */
976 return ERR_PTR(-EAGAIN); 1050 return ERR_PTR(-EAGAIN);
@@ -978,7 +1052,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
978 1052
979 /* We reserved something on the buffer */ 1053 /* We reserved something on the buffer */
980 1054
981 BUG_ON(write > BUF_PAGE_SIZE); 1055 if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE))
1056 return NULL;
982 1057
983 event = __rb_page_index(tail_page, tail); 1058 event = __rb_page_index(tail_page, tail);
984 rb_update_event(event, type, length); 1059 rb_update_event(event, type, length);
@@ -988,12 +1063,13 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
988 * this page's time stamp. 1063 * this page's time stamp.
989 */ 1064 */
990 if (!tail && rb_is_commit(cpu_buffer, event)) 1065 if (!tail && rb_is_commit(cpu_buffer, event))
991 cpu_buffer->commit_page->time_stamp = *ts; 1066 cpu_buffer->commit_page->page->time_stamp = *ts;
992 1067
993 return event; 1068 return event;
994 1069
995 out_unlock: 1070 out_unlock:
996 spin_unlock_irqrestore(&cpu_buffer->lock, flags); 1071 __raw_spin_unlock(&cpu_buffer->lock);
1072 local_irq_restore(flags);
997 return NULL; 1073 return NULL;
998} 1074}
999 1075
@@ -1038,7 +1114,7 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1038 event->time_delta = *delta & TS_MASK; 1114 event->time_delta = *delta & TS_MASK;
1039 event->array[0] = *delta >> TS_SHIFT; 1115 event->array[0] = *delta >> TS_SHIFT;
1040 } else { 1116 } else {
1041 cpu_buffer->commit_page->time_stamp = *ts; 1117 cpu_buffer->commit_page->page->time_stamp = *ts;
1042 event->time_delta = 0; 1118 event->time_delta = 0;
1043 event->array[0] = 0; 1119 event->array[0] = 0;
1044 } 1120 }
@@ -1076,10 +1152,8 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1076 * storm or we have something buggy. 1152 * storm or we have something buggy.
1077 * Bail! 1153 * Bail!
1078 */ 1154 */
1079 if (unlikely(++nr_loops > 1000)) { 1155 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
1080 RB_WARN_ON(cpu_buffer, 1);
1081 return NULL; 1156 return NULL;
1082 }
1083 1157
1084 ts = ring_buffer_time_stamp(cpu_buffer->cpu); 1158 ts = ring_buffer_time_stamp(cpu_buffer->cpu);
1085 1159
@@ -1175,15 +1249,14 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer,
1175 struct ring_buffer_event *event; 1249 struct ring_buffer_event *event;
1176 int cpu, resched; 1250 int cpu, resched;
1177 1251
1178 if (ring_buffers_off) 1252 if (ring_buffer_flags != RB_BUFFERS_ON)
1179 return NULL; 1253 return NULL;
1180 1254
1181 if (atomic_read(&buffer->record_disabled)) 1255 if (atomic_read(&buffer->record_disabled))
1182 return NULL; 1256 return NULL;
1183 1257
1184 /* If we are tracing schedule, we don't want to recurse */ 1258 /* If we are tracing schedule, we don't want to recurse */
1185 resched = need_resched(); 1259 resched = ftrace_preempt_disable();
1186 preempt_disable_notrace();
1187 1260
1188 cpu = raw_smp_processor_id(); 1261 cpu = raw_smp_processor_id();
1189 1262
@@ -1214,10 +1287,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer,
1214 return event; 1287 return event;
1215 1288
1216 out: 1289 out:
1217 if (resched) 1290 ftrace_preempt_enable(resched);
1218 preempt_enable_no_resched_notrace();
1219 else
1220 preempt_enable_notrace();
1221 return NULL; 1291 return NULL;
1222} 1292}
1223 1293
@@ -1259,12 +1329,9 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
1259 /* 1329 /*
1260 * Only the last preempt count needs to restore preemption. 1330 * Only the last preempt count needs to restore preemption.
1261 */ 1331 */
1262 if (preempt_count() == 1) { 1332 if (preempt_count() == 1)
1263 if (per_cpu(rb_need_resched, cpu)) 1333 ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
1264 preempt_enable_no_resched_notrace(); 1334 else
1265 else
1266 preempt_enable_notrace();
1267 } else
1268 preempt_enable_no_resched_notrace(); 1335 preempt_enable_no_resched_notrace();
1269 1336
1270 return 0; 1337 return 0;
@@ -1294,14 +1361,13 @@ int ring_buffer_write(struct ring_buffer *buffer,
1294 int ret = -EBUSY; 1361 int ret = -EBUSY;
1295 int cpu, resched; 1362 int cpu, resched;
1296 1363
1297 if (ring_buffers_off) 1364 if (ring_buffer_flags != RB_BUFFERS_ON)
1298 return -EBUSY; 1365 return -EBUSY;
1299 1366
1300 if (atomic_read(&buffer->record_disabled)) 1367 if (atomic_read(&buffer->record_disabled))
1301 return -EBUSY; 1368 return -EBUSY;
1302 1369
1303 resched = need_resched(); 1370 resched = ftrace_preempt_disable();
1304 preempt_disable_notrace();
1305 1371
1306 cpu = raw_smp_processor_id(); 1372 cpu = raw_smp_processor_id();
1307 1373
@@ -1327,10 +1393,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
1327 1393
1328 ret = 0; 1394 ret = 0;
1329 out: 1395 out:
1330 if (resched) 1396 ftrace_preempt_enable(resched);
1331 preempt_enable_no_resched_notrace();
1332 else
1333 preempt_enable_notrace();
1334 1397
1335 return ret; 1398 return ret;
1336} 1399}
@@ -1489,14 +1552,7 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
1489 return overruns; 1552 return overruns;
1490} 1553}
1491 1554
1492/** 1555static void rb_iter_reset(struct ring_buffer_iter *iter)
1493 * ring_buffer_iter_reset - reset an iterator
1494 * @iter: The iterator to reset
1495 *
1496 * Resets the iterator, so that it will start from the beginning
1497 * again.
1498 */
1499void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
1500{ 1556{
1501 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; 1557 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
1502 1558
@@ -1511,7 +1567,24 @@ void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
1511 if (iter->head) 1567 if (iter->head)
1512 iter->read_stamp = cpu_buffer->read_stamp; 1568 iter->read_stamp = cpu_buffer->read_stamp;
1513 else 1569 else
1514 iter->read_stamp = iter->head_page->time_stamp; 1570 iter->read_stamp = iter->head_page->page->time_stamp;
1571}
1572
1573/**
1574 * ring_buffer_iter_reset - reset an iterator
1575 * @iter: The iterator to reset
1576 *
1577 * Resets the iterator, so that it will start from the beginning
1578 * again.
1579 */
1580void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
1581{
1582 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
1583 unsigned long flags;
1584
1585 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
1586 rb_iter_reset(iter);
1587 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
1515} 1588}
1516 1589
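rb_iter_reset()/ring_buffer_iter_reset() above show the shape this patch gives the whole read side: the work moves into an unlocked internal helper, and the exported function becomes a thin wrapper that takes the new per-cpu reader_lock around it. A generic stand-alone sketch of that wrapper pattern with pthreads (names are illustrative, not the kernel's):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t reader_lock = PTHREAD_MUTEX_INITIALIZER;
static long read_pos;

/* internal helper: assumes the caller already holds reader_lock */
static void __iter_reset(void)
{
	read_pos = 0;
}

/* exported entry point: take the lock, call the helper, drop the lock */
static void iter_reset(void)
{
	pthread_mutex_lock(&reader_lock);
	__iter_reset();
	pthread_mutex_unlock(&reader_lock);
}

int main(void)
{
	read_pos = 42;
	iter_reset();
	printf("read_pos = %ld\n", read_pos);   /* 0 */
	return 0;
}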
1517/** 1590/**
@@ -1597,7 +1670,8 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
1597 unsigned long flags; 1670 unsigned long flags;
1598 int nr_loops = 0; 1671 int nr_loops = 0;
1599 1672
1600 spin_lock_irqsave(&cpu_buffer->lock, flags); 1673 local_irq_save(flags);
1674 __raw_spin_lock(&cpu_buffer->lock);
1601 1675
1602 again: 1676 again:
1603 /* 1677 /*
@@ -1606,8 +1680,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
1606 * a case where we will loop three times. There should be no 1680 * a case where we will loop three times. There should be no
1607 * reason to loop four times (that I know of). 1681 * reason to loop four times (that I know of).
1608 */ 1682 */
1609 if (unlikely(++nr_loops > 3)) { 1683 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3)) {
1610 RB_WARN_ON(cpu_buffer, 1);
1611 reader = NULL; 1684 reader = NULL;
1612 goto out; 1685 goto out;
1613 } 1686 }
@@ -1619,8 +1692,9 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
1619 goto out; 1692 goto out;
1620 1693
1621 /* Never should we have an index greater than the size */ 1694 /* Never should we have an index greater than the size */
1622 RB_WARN_ON(cpu_buffer, 1695 if (RB_WARN_ON(cpu_buffer,
1623 cpu_buffer->reader_page->read > rb_page_size(reader)); 1696 cpu_buffer->reader_page->read > rb_page_size(reader)))
1697 goto out;
1624 1698
1625 /* check if we caught up to the tail */ 1699 /* check if we caught up to the tail */
1626 reader = NULL; 1700 reader = NULL;
@@ -1637,7 +1711,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
1637 cpu_buffer->reader_page->list.prev = reader->list.prev; 1711 cpu_buffer->reader_page->list.prev = reader->list.prev;
1638 1712
1639 local_set(&cpu_buffer->reader_page->write, 0); 1713 local_set(&cpu_buffer->reader_page->write, 0);
1640 local_set(&cpu_buffer->reader_page->commit, 0); 1714 local_set(&cpu_buffer->reader_page->page->commit, 0);
1641 1715
1642 /* Make the reader page now replace the head */ 1716 /* Make the reader page now replace the head */
1643 reader->list.prev->next = &cpu_buffer->reader_page->list; 1717 reader->list.prev->next = &cpu_buffer->reader_page->list;
@@ -1659,7 +1733,8 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
1659 goto again; 1733 goto again;
1660 1734
1661 out: 1735 out:
1662 spin_unlock_irqrestore(&cpu_buffer->lock, flags); 1736 __raw_spin_unlock(&cpu_buffer->lock);
1737 local_irq_restore(flags);
1663 1738
1664 return reader; 1739 return reader;
1665} 1740}
@@ -1673,7 +1748,8 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
1673 reader = rb_get_reader_page(cpu_buffer); 1748 reader = rb_get_reader_page(cpu_buffer);
1674 1749
1675 /* This function should not be called when buffer is empty */ 1750 /* This function should not be called when buffer is empty */
1676 BUG_ON(!reader); 1751 if (RB_WARN_ON(cpu_buffer, !reader))
1752 return;
1677 1753
1678 event = rb_reader_event(cpu_buffer); 1754 event = rb_reader_event(cpu_buffer);
1679 1755
@@ -1700,7 +1776,9 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
1700 * Check if we are at the end of the buffer. 1776 * Check if we are at the end of the buffer.
1701 */ 1777 */
1702 if (iter->head >= rb_page_size(iter->head_page)) { 1778 if (iter->head >= rb_page_size(iter->head_page)) {
1703 BUG_ON(iter->head_page == cpu_buffer->commit_page); 1779 if (RB_WARN_ON(buffer,
1780 iter->head_page == cpu_buffer->commit_page))
1781 return;
1704 rb_inc_iter(iter); 1782 rb_inc_iter(iter);
1705 return; 1783 return;
1706 } 1784 }
@@ -1713,8 +1791,10 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
1713 * This should not be called to advance the header if we are 1791 * This should not be called to advance the header if we are
1714 * at the tail of the buffer. 1792 * at the tail of the buffer.
1715 */ 1793 */
1716 BUG_ON((iter->head_page == cpu_buffer->commit_page) && 1794 if (RB_WARN_ON(cpu_buffer,
1717 (iter->head + length > rb_commit_index(cpu_buffer))); 1795 (iter->head_page == cpu_buffer->commit_page) &&
1796 (iter->head + length > rb_commit_index(cpu_buffer))))
1797 return;
1718 1798
1719 rb_update_iter_read_stamp(iter, event); 1799 rb_update_iter_read_stamp(iter, event);
1720 1800
@@ -1726,17 +1806,8 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
1726 rb_advance_iter(iter); 1806 rb_advance_iter(iter);
1727} 1807}
1728 1808
1729/** 1809static struct ring_buffer_event *
1730 * ring_buffer_peek - peek at the next event to be read 1810rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1731 * @buffer: The ring buffer to read
1732 * @cpu: The cpu to peak at
1733 * @ts: The timestamp counter of this event.
1734 *
1735 * This will return the event that will be read next, but does
1736 * not consume the data.
1737 */
1738struct ring_buffer_event *
1739ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1740{ 1811{
1741 struct ring_buffer_per_cpu *cpu_buffer; 1812 struct ring_buffer_per_cpu *cpu_buffer;
1742 struct ring_buffer_event *event; 1813 struct ring_buffer_event *event;
@@ -1757,10 +1828,8 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1757 * can have. Nesting 10 deep of interrupts is clearly 1828 * can have. Nesting 10 deep of interrupts is clearly
1758 * an anomaly. 1829 * an anomaly.
1759 */ 1830 */
1760 if (unlikely(++nr_loops > 10)) { 1831 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10))
1761 RB_WARN_ON(cpu_buffer, 1);
1762 return NULL; 1832 return NULL;
1763 }
1764 1833
1765 reader = rb_get_reader_page(cpu_buffer); 1834 reader = rb_get_reader_page(cpu_buffer);
1766 if (!reader) 1835 if (!reader)
@@ -1798,16 +1867,8 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1798 return NULL; 1867 return NULL;
1799} 1868}
1800 1869
1801/** 1870static struct ring_buffer_event *
1802 * ring_buffer_iter_peek - peek at the next event to be read 1871rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
1803 * @iter: The ring buffer iterator
1804 * @ts: The timestamp counter of this event.
1805 *
1806 * This will return the event that will be read next, but does
1807 * not increment the iterator.
1808 */
1809struct ring_buffer_event *
1810ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
1811{ 1872{
1812 struct ring_buffer *buffer; 1873 struct ring_buffer *buffer;
1813 struct ring_buffer_per_cpu *cpu_buffer; 1874 struct ring_buffer_per_cpu *cpu_buffer;
@@ -1829,10 +1890,8 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
1829 * can have. Nesting 10 deep of interrupts is clearly 1890 * can have. Nesting 10 deep of interrupts is clearly
1830 * an anomaly. 1891 * an anomaly.
1831 */ 1892 */
1832 if (unlikely(++nr_loops > 10)) { 1893 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10))
1833 RB_WARN_ON(cpu_buffer, 1);
1834 return NULL; 1894 return NULL;
1835 }
1836 1895
1837 if (rb_per_cpu_empty(cpu_buffer)) 1896 if (rb_per_cpu_empty(cpu_buffer))
1838 return NULL; 1897 return NULL;
@@ -1869,6 +1928,51 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
1869} 1928}
1870 1929
1871/** 1930/**
1931 * ring_buffer_peek - peek at the next event to be read
1932 * @buffer: The ring buffer to read
 1933 * @cpu: The cpu to peek at
1934 * @ts: The timestamp counter of this event.
1935 *
1936 * This will return the event that will be read next, but does
1937 * not consume the data.
1938 */
1939struct ring_buffer_event *
1940ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1941{
1942 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
1943 struct ring_buffer_event *event;
1944 unsigned long flags;
1945
1946 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
1947 event = rb_buffer_peek(buffer, cpu, ts);
1948 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
1949
1950 return event;
1951}
1952
1953/**
1954 * ring_buffer_iter_peek - peek at the next event to be read
1955 * @iter: The ring buffer iterator
1956 * @ts: The timestamp counter of this event.
1957 *
1958 * This will return the event that will be read next, but does
1959 * not increment the iterator.
1960 */
1961struct ring_buffer_event *
1962ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
1963{
1964 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
1965 struct ring_buffer_event *event;
1966 unsigned long flags;
1967
1968 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
1969 event = rb_iter_peek(iter, ts);
1970 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
1971
1972 return event;
1973}
1974
1975/**
1872 * ring_buffer_consume - return an event and consume it 1976 * ring_buffer_consume - return an event and consume it
1873 * @buffer: The ring buffer to get the next event from 1977 * @buffer: The ring buffer to get the next event from
1874 * 1978 *
@@ -1879,19 +1983,24 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
1879struct ring_buffer_event * 1983struct ring_buffer_event *
1880ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) 1984ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
1881{ 1985{
1882 struct ring_buffer_per_cpu *cpu_buffer; 1986 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
1883 struct ring_buffer_event *event; 1987 struct ring_buffer_event *event;
1988 unsigned long flags;
1884 1989
1885 if (!cpu_isset(cpu, buffer->cpumask)) 1990 if (!cpu_isset(cpu, buffer->cpumask))
1886 return NULL; 1991 return NULL;
1887 1992
1888 event = ring_buffer_peek(buffer, cpu, ts); 1993 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
1994
1995 event = rb_buffer_peek(buffer, cpu, ts);
1889 if (!event) 1996 if (!event)
1890 return NULL; 1997 goto out;
1891 1998
1892 cpu_buffer = buffer->buffers[cpu];
1893 rb_advance_reader(cpu_buffer); 1999 rb_advance_reader(cpu_buffer);
1894 2000
2001 out:
2002 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2003
1895 return event; 2004 return event;
1896} 2005}
1897 2006
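The hunk above moves ring_buffer_consume() onto the new per-cpu reader_lock and reuses rb_buffer_peek() before advancing the reader. A caller simply loops on it until it returns NULL. The sketch below is illustrative only: handle_event() is a hypothetical callback, while ring_buffer_consume() and ring_buffer_event_data() are the calls this patch actually uses.

/*
 * Minimal consumer loop for the locking added above (sketch).
 * handle_event() is hypothetical; ring_buffer_consume() and
 * ring_buffer_event_data() are taken from this patch.
 */
static void drain_cpu_events(struct ring_buffer *buffer, int cpu)
{
	struct ring_buffer_event *event;
	u64 ts;

	while ((event = ring_buffer_consume(buffer, cpu, &ts)) != NULL)
		handle_event(ring_buffer_event_data(event), ts);
}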
@@ -1928,9 +2037,11 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
1928 atomic_inc(&cpu_buffer->record_disabled); 2037 atomic_inc(&cpu_buffer->record_disabled);
1929 synchronize_sched(); 2038 synchronize_sched();
1930 2039
1931 spin_lock_irqsave(&cpu_buffer->lock, flags); 2040 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
1932 ring_buffer_iter_reset(iter); 2041 __raw_spin_lock(&cpu_buffer->lock);
1933 spin_unlock_irqrestore(&cpu_buffer->lock, flags); 2042 rb_iter_reset(iter);
2043 __raw_spin_unlock(&cpu_buffer->lock);
2044 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
1934 2045
1935 return iter; 2046 return iter;
1936} 2047}
@@ -1962,12 +2073,17 @@ struct ring_buffer_event *
1962ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) 2073ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
1963{ 2074{
1964 struct ring_buffer_event *event; 2075 struct ring_buffer_event *event;
2076 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
2077 unsigned long flags;
1965 2078
1966 event = ring_buffer_iter_peek(iter, ts); 2079 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2080 event = rb_iter_peek(iter, ts);
1967 if (!event) 2081 if (!event)
1968 return NULL; 2082 goto out;
1969 2083
1970 rb_advance_iter(iter); 2084 rb_advance_iter(iter);
2085 out:
2086 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
1971 2087
1972 return event; 2088 return event;
1973} 2089}
@@ -1987,7 +2103,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
1987 cpu_buffer->head_page 2103 cpu_buffer->head_page
1988 = list_entry(cpu_buffer->pages.next, struct buffer_page, list); 2104 = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
1989 local_set(&cpu_buffer->head_page->write, 0); 2105 local_set(&cpu_buffer->head_page->write, 0);
1990 local_set(&cpu_buffer->head_page->commit, 0); 2106 local_set(&cpu_buffer->head_page->page->commit, 0);
1991 2107
1992 cpu_buffer->head_page->read = 0; 2108 cpu_buffer->head_page->read = 0;
1993 2109
@@ -1996,7 +2112,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
1996 2112
1997 INIT_LIST_HEAD(&cpu_buffer->reader_page->list); 2113 INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
1998 local_set(&cpu_buffer->reader_page->write, 0); 2114 local_set(&cpu_buffer->reader_page->write, 0);
1999 local_set(&cpu_buffer->reader_page->commit, 0); 2115 local_set(&cpu_buffer->reader_page->page->commit, 0);
2000 cpu_buffer->reader_page->read = 0; 2116 cpu_buffer->reader_page->read = 0;
2001 2117
2002 cpu_buffer->overrun = 0; 2118 cpu_buffer->overrun = 0;
@@ -2016,11 +2132,15 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
2016 if (!cpu_isset(cpu, buffer->cpumask)) 2132 if (!cpu_isset(cpu, buffer->cpumask))
2017 return; 2133 return;
2018 2134
2019 spin_lock_irqsave(&cpu_buffer->lock, flags); 2135 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2136
2137 __raw_spin_lock(&cpu_buffer->lock);
2020 2138
2021 rb_reset_cpu(cpu_buffer); 2139 rb_reset_cpu(cpu_buffer);
2022 2140
2023 spin_unlock_irqrestore(&cpu_buffer->lock, flags); 2141 __raw_spin_unlock(&cpu_buffer->lock);
2142
2143 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2024} 2144}
2025 2145
2026/** 2146/**
@@ -2118,16 +2238,178 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
2118 return 0; 2238 return 0;
2119} 2239}
2120 2240
2241static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer,
2242 struct buffer_data_page *bpage)
2243{
2244 struct ring_buffer_event *event;
2245 unsigned long head;
2246
2247 __raw_spin_lock(&cpu_buffer->lock);
2248 for (head = 0; head < local_read(&bpage->commit);
2249 head += rb_event_length(event)) {
2250
2251 event = __rb_data_page_index(bpage, head);
2252 if (RB_WARN_ON(cpu_buffer, rb_null_event(event)))
2253 return;
2254 /* Only count data entries */
2255 if (event->type != RINGBUF_TYPE_DATA)
2256 continue;
2257 cpu_buffer->entries--;
2258 }
2259 __raw_spin_unlock(&cpu_buffer->lock);
2260}
2261
2262/**
2263 * ring_buffer_alloc_read_page - allocate a page to read from buffer
2264 * @buffer: the buffer to allocate for.
2265 *
2266 * This function is used in conjunction with ring_buffer_read_page.
2267 * When reading a full page from the ring buffer, these functions
2268 * can be used to speed up the process. The calling function should
2269 * allocate a few pages first with this function. Then when it
2270 * needs to get pages from the ring buffer, it passes the result
2271 * of this function into ring_buffer_read_page, which will swap
2272 * the page that was allocated, with the read page of the buffer.
2273 *
2274 * Returns:
2275 * The page allocated, or NULL on error.
2276 */
2277void *ring_buffer_alloc_read_page(struct ring_buffer *buffer)
2278{
2279 unsigned long addr;
2280 struct buffer_data_page *bpage;
2281
2282 addr = __get_free_page(GFP_KERNEL);
2283 if (!addr)
2284 return NULL;
2285
2286 bpage = (void *)addr;
2287
2288 return bpage;
2289}
2290
2291/**
2292 * ring_buffer_free_read_page - free an allocated read page
 2293 * @buffer: the buffer the page was allocated for
2294 * @data: the page to free
2295 *
2296 * Free a page allocated from ring_buffer_alloc_read_page.
2297 */
2298void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
2299{
2300 free_page((unsigned long)data);
2301}
2302
2303/**
2304 * ring_buffer_read_page - extract a page from the ring buffer
2305 * @buffer: buffer to extract from
2306 * @data_page: the page to use allocated from ring_buffer_alloc_read_page
2307 * @cpu: the cpu of the buffer to extract
2308 * @full: should the extraction only happen when the page is full.
2309 *
2310 * This function will pull out a page from the ring buffer and consume it.
2311 * @data_page must be the address of the variable that was returned
2312 * from ring_buffer_alloc_read_page. This is because the page might be used
2313 * to swap with a page in the ring buffer.
2314 *
2315 * for example:
 2316 * rpage = ring_buffer_alloc_read_page(buffer);
2317 * if (!rpage)
2318 * return error;
2319 * ret = ring_buffer_read_page(buffer, &rpage, cpu, 0);
2320 * if (ret)
2321 * process_page(rpage);
2322 *
2323 * When @full is set, the function will not return true unless
2324 * the writer is off the reader page.
2325 *
2326 * Note: it is up to the calling functions to handle sleeps and wakeups.
2327 * The ring buffer can be used anywhere in the kernel and can not
2328 * blindly call wake_up. The layer that uses the ring buffer must be
2329 * responsible for that.
2330 *
2331 * Returns:
2332 * 1 if data has been transferred
2333 * 0 if no data has been transferred.
2334 */
2335int ring_buffer_read_page(struct ring_buffer *buffer,
2336 void **data_page, int cpu, int full)
2337{
2338 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
2339 struct ring_buffer_event *event;
2340 struct buffer_data_page *bpage;
2341 unsigned long flags;
2342 int ret = 0;
2343
2344 if (!data_page)
2345 return 0;
2346
2347 bpage = *data_page;
2348 if (!bpage)
2349 return 0;
2350
2351 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2352
2353 /*
2354 * rb_buffer_peek will get the next ring buffer if
2355 * the current reader page is empty.
2356 */
2357 event = rb_buffer_peek(buffer, cpu, NULL);
2358 if (!event)
2359 goto out;
2360
2361 /* check for data */
2362 if (!local_read(&cpu_buffer->reader_page->page->commit))
2363 goto out;
2364 /*
2365 * If the writer is already off of the read page, then simply
2366 * switch the read page with the given page. Otherwise
2367 * we need to copy the data from the reader to the writer.
2368 */
2369 if (cpu_buffer->reader_page == cpu_buffer->commit_page) {
2370 unsigned int read = cpu_buffer->reader_page->read;
2371
2372 if (full)
2373 goto out;
2374 /* The writer is still on the reader page, we must copy */
2375 bpage = cpu_buffer->reader_page->page;
2376 memcpy(bpage->data,
2377 cpu_buffer->reader_page->page->data + read,
2378 local_read(&bpage->commit) - read);
2379
2380 /* consume what was read */
2381 cpu_buffer->reader_page += read;
2382
2383 } else {
2384 /* swap the pages */
2385 rb_init_page(bpage);
2386 bpage = cpu_buffer->reader_page->page;
2387 cpu_buffer->reader_page->page = *data_page;
2388 cpu_buffer->reader_page->read = 0;
2389 *data_page = bpage;
2390 }
2391 ret = 1;
2392
2393 /* update the entry counter */
2394 rb_remove_entries(cpu_buffer, bpage);
2395 out:
2396 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2397
2398 return ret;
2399}
2400
2121static ssize_t 2401static ssize_t
2122rb_simple_read(struct file *filp, char __user *ubuf, 2402rb_simple_read(struct file *filp, char __user *ubuf,
2123 size_t cnt, loff_t *ppos) 2403 size_t cnt, loff_t *ppos)
2124{ 2404{
2125 int *p = filp->private_data; 2405 long *p = filp->private_data;
2126 char buf[64]; 2406 char buf[64];
2127 int r; 2407 int r;
2128 2408
2129 /* !ring_buffers_off == tracing_on */ 2409 if (test_bit(RB_BUFFERS_DISABLED_BIT, p))
2130 r = sprintf(buf, "%d\n", !*p); 2410 r = sprintf(buf, "permanently disabled\n");
2411 else
2412 r = sprintf(buf, "%d\n", test_bit(RB_BUFFERS_ON_BIT, p));
2131 2413
2132 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 2414 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2133} 2415}
@@ -2136,7 +2418,7 @@ static ssize_t
2136rb_simple_write(struct file *filp, const char __user *ubuf, 2418rb_simple_write(struct file *filp, const char __user *ubuf,
2137 size_t cnt, loff_t *ppos) 2419 size_t cnt, loff_t *ppos)
2138{ 2420{
2139 int *p = filp->private_data; 2421 long *p = filp->private_data;
2140 char buf[64]; 2422 char buf[64];
2141 long val; 2423 long val;
2142 int ret; 2424 int ret;
@@ -2153,8 +2435,10 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
2153 if (ret < 0) 2435 if (ret < 0)
2154 return ret; 2436 return ret;
2155 2437
2156 /* !ring_buffers_off == tracing_on */ 2438 if (val)
2157 *p = !val; 2439 set_bit(RB_BUFFERS_ON_BIT, p);
2440 else
2441 clear_bit(RB_BUFFERS_ON_BIT, p);
2158 2442
2159 (*ppos)++; 2443 (*ppos)++;
2160 2444
@@ -2176,7 +2460,7 @@ static __init int rb_init_debugfs(void)
2176 d_tracer = tracing_init_dentry(); 2460 d_tracer = tracing_init_dentry();
2177 2461
2178 entry = debugfs_create_file("tracing_on", 0644, d_tracer, 2462 entry = debugfs_create_file("tracing_on", 0644, d_tracer,
2179 &ring_buffers_off, &rb_simple_fops); 2463 &ring_buffer_flags, &rb_simple_fops);
2180 if (!entry) 2464 if (!entry)
2181 pr_warning("Could not create debugfs 'tracing_on' entry\n"); 2465 pr_warning("Could not create debugfs 'tracing_on' entry\n");
2182 2466
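The kernel-doc for ring_buffer_read_page() above describes an alloc/read/free cycle; the sketch below strings those three calls together. It is a minimal illustration under that description, not part of the patch: process_page() is a hypothetical consumer and error handling is reduced to the bare minimum.

/*
 * Sketch of the read-page API introduced above: allocate a spare
 * page, repeatedly swap/copy out the reader page, then free it.
 * process_page() is a hypothetical consumer of the returned page.
 */
static int drain_cpu_pages(struct ring_buffer *buffer, int cpu)
{
	void *rpage = ring_buffer_alloc_read_page(buffer);

	if (!rpage)
		return -ENOMEM;

	/* full == 0: accept a partially filled reader page as well */
	while (ring_buffer_read_page(buffer, &rpage, cpu, 0))
		process_page(rpage);

	ring_buffer_free_read_page(buffer, rpage);
	return 0;
}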
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d2e75479dc50..6adf660fc816 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -30,6 +30,7 @@
30#include <linux/gfp.h> 30#include <linux/gfp.h>
31#include <linux/fs.h> 31#include <linux/fs.h>
32#include <linux/kprobes.h> 32#include <linux/kprobes.h>
33#include <linux/seq_file.h>
33#include <linux/writeback.h> 34#include <linux/writeback.h>
34 35
35#include <linux/stacktrace.h> 36#include <linux/stacktrace.h>
@@ -43,6 +44,38 @@
43unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX; 44unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX;
44unsigned long __read_mostly tracing_thresh; 45unsigned long __read_mostly tracing_thresh;
45 46
47/*
48 * We need to change this state when a selftest is running.
49 * A selftest will lurk into the ring-buffer to count the
50 * entries inserted during the selftest although some concurrent
51 * insertions into the ring-buffer such as ftrace_printk could occurred
52 * at the same time, giving false positive or negative results.
53 */
54static bool __read_mostly tracing_selftest_running;
55
56/* For tracers that don't implement custom flags */
57static struct tracer_opt dummy_tracer_opt[] = {
58 { }
59};
60
61static struct tracer_flags dummy_tracer_flags = {
62 .val = 0,
63 .opts = dummy_tracer_opt
64};
65
66static int dummy_set_flag(u32 old_flags, u32 bit, int set)
67{
68 return 0;
69}
70
71/*
72 * Kill all tracing for good (never come back).
73 * It is initialized to 1 but will turn to zero if the initialization
74 * of the tracer is successful. But that is the only place that sets
75 * this back to zero.
76 */
77int tracing_disabled = 1;
78
46static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled); 79static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled);
47 80
48static inline void ftrace_disable_cpu(void) 81static inline void ftrace_disable_cpu(void)
@@ -62,7 +95,36 @@ static cpumask_t __read_mostly tracing_buffer_mask;
62#define for_each_tracing_cpu(cpu) \ 95#define for_each_tracing_cpu(cpu) \
63 for_each_cpu_mask(cpu, tracing_buffer_mask) 96 for_each_cpu_mask(cpu, tracing_buffer_mask)
64 97
65static int tracing_disabled = 1; 98/*
99 * ftrace_dump_on_oops - variable to dump ftrace buffer on oops
100 *
101 * If there is an oops (or kernel panic) and the ftrace_dump_on_oops
102 * is set, then ftrace_dump is called. This will output the contents
103 * of the ftrace buffers to the console. This is very useful for
 104 * capturing traces that lead to crashes and outputting them to a
105 * serial console.
106 *
 107 * It is off by default, but you can enable it either by specifying
108 * "ftrace_dump_on_oops" in the kernel command line, or setting
109 * /proc/sys/kernel/ftrace_dump_on_oops to true.
110 */
111int ftrace_dump_on_oops;
112
113static int tracing_set_tracer(char *buf);
114
115static int __init set_ftrace(char *str)
116{
117 tracing_set_tracer(str);
118 return 1;
119}
120__setup("ftrace", set_ftrace);
121
122static int __init set_ftrace_dump_on_oops(char *str)
123{
124 ftrace_dump_on_oops = 1;
125 return 1;
126}
127__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
66 128
67long 129long
68ns2usecs(cycle_t nsec) 130ns2usecs(cycle_t nsec)
@@ -112,6 +174,19 @@ static DEFINE_PER_CPU(struct trace_array_cpu, max_data);
112/* tracer_enabled is used to toggle activation of a tracer */ 174/* tracer_enabled is used to toggle activation of a tracer */
113static int tracer_enabled = 1; 175static int tracer_enabled = 1;
114 176
177/**
178 * tracing_is_enabled - return tracer_enabled status
179 *
180 * This function is used by other tracers to know the status
181 * of the tracer_enabled flag. Tracers may use this function
182 * to know if it should enable their features when starting
183 * up. See irqsoff tracer for an example (start_irqsoff_tracer).
184 */
185int tracing_is_enabled(void)
186{
187 return tracer_enabled;
188}
189
115/* function tracing enabled */ 190/* function tracing enabled */
116int ftrace_function_enabled; 191int ftrace_function_enabled;
117 192
@@ -153,8 +228,9 @@ static DEFINE_MUTEX(trace_types_lock);
153/* trace_wait is a waitqueue for tasks blocked on trace_poll */ 228/* trace_wait is a waitqueue for tasks blocked on trace_poll */
154static DECLARE_WAIT_QUEUE_HEAD(trace_wait); 229static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
155 230
156/* trace_flags holds iter_ctrl options */ 231/* trace_flags holds trace_options default values */
157unsigned long trace_flags = TRACE_ITER_PRINT_PARENT; 232unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
233 TRACE_ITER_ANNOTATE;
158 234
159/** 235/**
160 * trace_wake_up - wake up tasks waiting for trace input 236 * trace_wake_up - wake up tasks waiting for trace input
@@ -193,13 +269,6 @@ unsigned long nsecs_to_usecs(unsigned long nsecs)
193 return nsecs / 1000; 269 return nsecs / 1000;
194} 270}
195 271
196/*
197 * TRACE_ITER_SYM_MASK masks the options in trace_flags that
198 * control the output of kernel symbols.
199 */
200#define TRACE_ITER_SYM_MASK \
201 (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR)
202
 203 /* These must match the bit positions in trace_iterator_flags */ 272
204static const char *trace_options[] = { 273static const char *trace_options[] = {
205 "print-parent", 274 "print-parent",
@@ -213,6 +282,11 @@ static const char *trace_options[] = {
213 "stacktrace", 282 "stacktrace",
214 "sched-tree", 283 "sched-tree",
215 "ftrace_printk", 284 "ftrace_printk",
285 "ftrace_preempt",
286 "branch",
287 "annotate",
288 "userstacktrace",
289 "sym-userobj",
216 NULL 290 NULL
217}; 291};
218 292
@@ -359,6 +433,28 @@ trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len)
359 return trace_seq_putmem(s, hex, j); 433 return trace_seq_putmem(s, hex, j);
360} 434}
361 435
436static int
437trace_seq_path(struct trace_seq *s, struct path *path)
438{
439 unsigned char *p;
440
441 if (s->len >= (PAGE_SIZE - 1))
442 return 0;
443 p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
444 if (!IS_ERR(p)) {
445 p = mangle_path(s->buffer + s->len, p, "\n");
446 if (p) {
447 s->len = p - s->buffer;
448 return 1;
449 }
450 } else {
451 s->buffer[s->len++] = '?';
452 return 1;
453 }
454
455 return 0;
456}
457
362static void 458static void
363trace_seq_reset(struct trace_seq *s) 459trace_seq_reset(struct trace_seq *s)
364{ 460{
@@ -470,7 +566,17 @@ int register_tracer(struct tracer *type)
470 return -1; 566 return -1;
471 } 567 }
472 568
569 /*
570 * When this gets called we hold the BKL which means that
571 * preemption is disabled. Various trace selftests however
572 * need to disable and enable preemption for successful tests.
573 * So we drop the BKL here and grab it after the tests again.
574 */
575 unlock_kernel();
473 mutex_lock(&trace_types_lock); 576 mutex_lock(&trace_types_lock);
577
578 tracing_selftest_running = true;
579
474 for (t = trace_types; t; t = t->next) { 580 for (t = trace_types; t; t = t->next) {
475 if (strcmp(type->name, t->name) == 0) { 581 if (strcmp(type->name, t->name) == 0) {
476 /* already found */ 582 /* already found */
@@ -481,12 +587,20 @@ int register_tracer(struct tracer *type)
481 } 587 }
482 } 588 }
483 589
590 if (!type->set_flag)
591 type->set_flag = &dummy_set_flag;
592 if (!type->flags)
593 type->flags = &dummy_tracer_flags;
594 else
595 if (!type->flags->opts)
596 type->flags->opts = dummy_tracer_opt;
597
484#ifdef CONFIG_FTRACE_STARTUP_TEST 598#ifdef CONFIG_FTRACE_STARTUP_TEST
485 if (type->selftest) { 599 if (type->selftest) {
486 struct tracer *saved_tracer = current_trace; 600 struct tracer *saved_tracer = current_trace;
487 struct trace_array *tr = &global_trace; 601 struct trace_array *tr = &global_trace;
488 int saved_ctrl = tr->ctrl;
489 int i; 602 int i;
603
490 /* 604 /*
491 * Run a selftest on this tracer. 605 * Run a selftest on this tracer.
492 * Here we reset the trace buffer, and set the current 606 * Here we reset the trace buffer, and set the current
@@ -494,25 +608,23 @@ int register_tracer(struct tracer *type)
494 * internal tracing to verify that everything is in order. 608 * internal tracing to verify that everything is in order.
495 * If we fail, we do not register this tracer. 609 * If we fail, we do not register this tracer.
496 */ 610 */
497 for_each_tracing_cpu(i) { 611 for_each_tracing_cpu(i)
498 tracing_reset(tr, i); 612 tracing_reset(tr, i);
499 } 613
500 current_trace = type; 614 current_trace = type;
501 tr->ctrl = 0;
502 /* the test is responsible for initializing and enabling */ 615 /* the test is responsible for initializing and enabling */
503 pr_info("Testing tracer %s: ", type->name); 616 pr_info("Testing tracer %s: ", type->name);
504 ret = type->selftest(type, tr); 617 ret = type->selftest(type, tr);
505 /* the test is responsible for resetting too */ 618 /* the test is responsible for resetting too */
506 current_trace = saved_tracer; 619 current_trace = saved_tracer;
507 tr->ctrl = saved_ctrl;
508 if (ret) { 620 if (ret) {
509 printk(KERN_CONT "FAILED!\n"); 621 printk(KERN_CONT "FAILED!\n");
510 goto out; 622 goto out;
511 } 623 }
512 /* Only reset on passing, to avoid touching corrupted buffers */ 624 /* Only reset on passing, to avoid touching corrupted buffers */
513 for_each_tracing_cpu(i) { 625 for_each_tracing_cpu(i)
514 tracing_reset(tr, i); 626 tracing_reset(tr, i);
515 } 627
516 printk(KERN_CONT "PASSED\n"); 628 printk(KERN_CONT "PASSED\n");
517 } 629 }
518#endif 630#endif
@@ -524,7 +636,9 @@ int register_tracer(struct tracer *type)
524 max_tracer_type_len = len; 636 max_tracer_type_len = len;
525 637
526 out: 638 out:
639 tracing_selftest_running = false;
527 mutex_unlock(&trace_types_lock); 640 mutex_unlock(&trace_types_lock);
641 lock_kernel();
528 642
529 return ret; 643 return ret;
530} 644}
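register_tracer() now falls back to dummy_tracer_flags, dummy_tracer_opt and dummy_set_flag when a tracer declares nothing, which implies the shape sketched below for tracers that do want private options. The tracer itself ("mytrace") is hypothetical; only the field names (.name, .bit, .val, .opts, .flags, .set_flag) are taken from how this patch uses them.

/*
 * Hypothetical tracer declaring its own options; the dummy_* fallbacks
 * above are used only when these fields are left NULL.
 */
#define MYTRACE_OPT_VERBOSE	0x1

static struct tracer_opt mytrace_opts[] = {
	{ .name = "verbose", .bit = MYTRACE_OPT_VERBOSE },
	{ }	/* terminator checked by the trace_opts[i].name loops */
};

static struct tracer_flags mytrace_flags = {
	.val	= 0,
	.opts	= mytrace_opts,
};

static int mytrace_set_flag(u32 old_flags, u32 bit, int set)
{
	return 0;	/* return non-zero to refuse the change */
}

static struct tracer mytrace = {
	.name		= "mytrace",
	.flags		= &mytrace_flags,
	.set_flag	= mytrace_set_flag,
};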
@@ -581,6 +695,91 @@ static void trace_init_cmdlines(void)
581 cmdline_idx = 0; 695 cmdline_idx = 0;
582} 696}
583 697
698static int trace_stop_count;
699static DEFINE_SPINLOCK(tracing_start_lock);
700
701/**
702 * ftrace_off_permanent - disable all ftrace code permanently
703 *
 704 * This should only be called when a serious anomaly has
 705 * been detected. This will turn off function tracing,
 706 * ring buffers, and other tracing utilities. It takes no
707 * locks and can be called from any context.
708 */
709void ftrace_off_permanent(void)
710{
711 tracing_disabled = 1;
712 ftrace_stop();
713 tracing_off_permanent();
714}
715
716/**
717 * tracing_start - quick start of the tracer
718 *
719 * If tracing is enabled but was stopped by tracing_stop,
720 * this will start the tracer back up.
721 */
722void tracing_start(void)
723{
724 struct ring_buffer *buffer;
725 unsigned long flags;
726
727 if (tracing_disabled)
728 return;
729
730 spin_lock_irqsave(&tracing_start_lock, flags);
731 if (--trace_stop_count)
732 goto out;
733
734 if (trace_stop_count < 0) {
735 /* Someone screwed up their debugging */
736 WARN_ON_ONCE(1);
737 trace_stop_count = 0;
738 goto out;
739 }
740
741
742 buffer = global_trace.buffer;
743 if (buffer)
744 ring_buffer_record_enable(buffer);
745
746 buffer = max_tr.buffer;
747 if (buffer)
748 ring_buffer_record_enable(buffer);
749
750 ftrace_start();
751 out:
752 spin_unlock_irqrestore(&tracing_start_lock, flags);
753}
754
755/**
756 * tracing_stop - quick stop of the tracer
757 *
 758 * Lightweight way to stop tracing. Use in conjunction with
759 * tracing_start.
760 */
761void tracing_stop(void)
762{
763 struct ring_buffer *buffer;
764 unsigned long flags;
765
766 ftrace_stop();
767 spin_lock_irqsave(&tracing_start_lock, flags);
768 if (trace_stop_count++)
769 goto out;
770
771 buffer = global_trace.buffer;
772 if (buffer)
773 ring_buffer_record_disable(buffer);
774
775 buffer = max_tr.buffer;
776 if (buffer)
777 ring_buffer_record_disable(buffer);
778
779 out:
780 spin_unlock_irqrestore(&tracing_start_lock, flags);
781}
782
584void trace_stop_cmdline_recording(void); 783void trace_stop_cmdline_recording(void);
585 784
586static void trace_save_cmdline(struct task_struct *tsk) 785static void trace_save_cmdline(struct task_struct *tsk)
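tracing_start() and tracing_stop() above keep a nesting count under tracing_start_lock, so every stop must be balanced by exactly one start. That is how the patch itself uses them later in __tracing_open()/tracing_release(); the helper name below is hypothetical and the body only restates that pairing.

/*
 * Sketch of the stop/start pairing (see __tracing_open() and
 * tracing_release() further down in this patch).
 */
static void inspect_buffers_quiesced(struct trace_iterator *iter)
{
	tracing_stop();		/* disable ring-buffer recording */

	/* ... walk iter->buffer_iter[] while writers are held off ... */

	tracing_start();	/* balanced: re-enable recording */
}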
@@ -618,7 +817,7 @@ static void trace_save_cmdline(struct task_struct *tsk)
618 spin_unlock(&trace_cmdline_lock); 817 spin_unlock(&trace_cmdline_lock);
619} 818}
620 819
621static char *trace_find_cmdline(int pid) 820char *trace_find_cmdline(int pid)
622{ 821{
623 char *cmdline = "<...>"; 822 char *cmdline = "<...>";
624 unsigned map; 823 unsigned map;
@@ -655,6 +854,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
655 854
656 entry->preempt_count = pc & 0xff; 855 entry->preempt_count = pc & 0xff;
657 entry->pid = (tsk) ? tsk->pid : 0; 856 entry->pid = (tsk) ? tsk->pid : 0;
857 entry->tgid = (tsk) ? tsk->tgid : 0;
658 entry->flags = 858 entry->flags =
659#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT 859#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
660 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | 860 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -691,6 +891,56 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data,
691 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 891 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
692} 892}
693 893
894#ifdef CONFIG_FUNCTION_GRAPH_TRACER
895static void __trace_graph_entry(struct trace_array *tr,
896 struct trace_array_cpu *data,
897 struct ftrace_graph_ent *trace,
898 unsigned long flags,
899 int pc)
900{
901 struct ring_buffer_event *event;
902 struct ftrace_graph_ent_entry *entry;
903 unsigned long irq_flags;
904
905 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
906 return;
907
908 event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry),
909 &irq_flags);
910 if (!event)
911 return;
912 entry = ring_buffer_event_data(event);
913 tracing_generic_entry_update(&entry->ent, flags, pc);
914 entry->ent.type = TRACE_GRAPH_ENT;
915 entry->graph_ent = *trace;
916 ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags);
917}
918
919static void __trace_graph_return(struct trace_array *tr,
920 struct trace_array_cpu *data,
921 struct ftrace_graph_ret *trace,
922 unsigned long flags,
923 int pc)
924{
925 struct ring_buffer_event *event;
926 struct ftrace_graph_ret_entry *entry;
927 unsigned long irq_flags;
928
929 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
930 return;
931
932 event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry),
933 &irq_flags);
934 if (!event)
935 return;
936 entry = ring_buffer_event_data(event);
937 tracing_generic_entry_update(&entry->ent, flags, pc);
938 entry->ent.type = TRACE_GRAPH_RET;
939 entry->ret = *trace;
940 ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags);
941}
942#endif
943
694void 944void
695ftrace(struct trace_array *tr, struct trace_array_cpu *data, 945ftrace(struct trace_array *tr, struct trace_array_cpu *data,
696 unsigned long ip, unsigned long parent_ip, unsigned long flags, 946 unsigned long ip, unsigned long parent_ip, unsigned long flags,
@@ -742,6 +992,46 @@ void __trace_stack(struct trace_array *tr,
742 ftrace_trace_stack(tr, data, flags, skip, preempt_count()); 992 ftrace_trace_stack(tr, data, flags, skip, preempt_count());
743} 993}
744 994
995static void ftrace_trace_userstack(struct trace_array *tr,
996 struct trace_array_cpu *data,
997 unsigned long flags, int pc)
998{
999#ifdef CONFIG_STACKTRACE
1000 struct ring_buffer_event *event;
1001 struct userstack_entry *entry;
1002 struct stack_trace trace;
1003 unsigned long irq_flags;
1004
1005 if (!(trace_flags & TRACE_ITER_USERSTACKTRACE))
1006 return;
1007
1008 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
1009 &irq_flags);
1010 if (!event)
1011 return;
1012 entry = ring_buffer_event_data(event);
1013 tracing_generic_entry_update(&entry->ent, flags, pc);
1014 entry->ent.type = TRACE_USER_STACK;
1015
1016 memset(&entry->caller, 0, sizeof(entry->caller));
1017
1018 trace.nr_entries = 0;
1019 trace.max_entries = FTRACE_STACK_ENTRIES;
1020 trace.skip = 0;
1021 trace.entries = entry->caller;
1022
1023 save_stack_trace_user(&trace);
1024 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
1025#endif
1026}
1027
1028void __trace_userstack(struct trace_array *tr,
1029 struct trace_array_cpu *data,
1030 unsigned long flags)
1031{
1032 ftrace_trace_userstack(tr, data, flags, preempt_count());
1033}
1034
745static void 1035static void
746ftrace_trace_special(void *__tr, void *__data, 1036ftrace_trace_special(void *__tr, void *__data,
747 unsigned long arg1, unsigned long arg2, unsigned long arg3, 1037 unsigned long arg1, unsigned long arg2, unsigned long arg3,
@@ -765,6 +1055,7 @@ ftrace_trace_special(void *__tr, void *__data,
765 entry->arg3 = arg3; 1055 entry->arg3 = arg3;
766 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 1056 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
767 ftrace_trace_stack(tr, data, irq_flags, 4, pc); 1057 ftrace_trace_stack(tr, data, irq_flags, 4, pc);
1058 ftrace_trace_userstack(tr, data, irq_flags, pc);
768 1059
769 trace_wake_up(); 1060 trace_wake_up();
770} 1061}
@@ -803,6 +1094,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
803 entry->next_cpu = task_cpu(next); 1094 entry->next_cpu = task_cpu(next);
804 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 1095 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
805 ftrace_trace_stack(tr, data, flags, 5, pc); 1096 ftrace_trace_stack(tr, data, flags, 5, pc);
1097 ftrace_trace_userstack(tr, data, flags, pc);
806} 1098}
807 1099
808void 1100void
@@ -832,6 +1124,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
832 entry->next_cpu = task_cpu(wakee); 1124 entry->next_cpu = task_cpu(wakee);
833 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 1125 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
834 ftrace_trace_stack(tr, data, flags, 6, pc); 1126 ftrace_trace_stack(tr, data, flags, 6, pc);
1127 ftrace_trace_userstack(tr, data, flags, pc);
835 1128
836 trace_wake_up(); 1129 trace_wake_up();
837} 1130}
@@ -841,26 +1134,28 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
841{ 1134{
842 struct trace_array *tr = &global_trace; 1135 struct trace_array *tr = &global_trace;
843 struct trace_array_cpu *data; 1136 struct trace_array_cpu *data;
1137 unsigned long flags;
844 int cpu; 1138 int cpu;
845 int pc; 1139 int pc;
846 1140
847 if (tracing_disabled || !tr->ctrl) 1141 if (tracing_disabled)
848 return; 1142 return;
849 1143
850 pc = preempt_count(); 1144 pc = preempt_count();
851 preempt_disable_notrace(); 1145 local_irq_save(flags);
852 cpu = raw_smp_processor_id(); 1146 cpu = raw_smp_processor_id();
853 data = tr->data[cpu]; 1147 data = tr->data[cpu];
854 1148
855 if (likely(!atomic_read(&data->disabled))) 1149 if (likely(atomic_inc_return(&data->disabled) == 1))
856 ftrace_trace_special(tr, data, arg1, arg2, arg3, pc); 1150 ftrace_trace_special(tr, data, arg1, arg2, arg3, pc);
857 1151
858 preempt_enable_notrace(); 1152 atomic_dec(&data->disabled);
1153 local_irq_restore(flags);
859} 1154}
860 1155
861#ifdef CONFIG_FUNCTION_TRACER 1156#ifdef CONFIG_FUNCTION_TRACER
862static void 1157static void
863function_trace_call(unsigned long ip, unsigned long parent_ip) 1158function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
864{ 1159{
865 struct trace_array *tr = &global_trace; 1160 struct trace_array *tr = &global_trace;
866 struct trace_array_cpu *data; 1161 struct trace_array_cpu *data;
@@ -873,8 +1168,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)
873 return; 1168 return;
874 1169
875 pc = preempt_count(); 1170 pc = preempt_count();
876 resched = need_resched(); 1171 resched = ftrace_preempt_disable();
877 preempt_disable_notrace();
878 local_save_flags(flags); 1172 local_save_flags(flags);
879 cpu = raw_smp_processor_id(); 1173 cpu = raw_smp_processor_id();
880 data = tr->data[cpu]; 1174 data = tr->data[cpu];
@@ -884,11 +1178,96 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)
884 trace_function(tr, data, ip, parent_ip, flags, pc); 1178 trace_function(tr, data, ip, parent_ip, flags, pc);
885 1179
886 atomic_dec(&data->disabled); 1180 atomic_dec(&data->disabled);
887 if (resched) 1181 ftrace_preempt_enable(resched);
888 preempt_enable_no_resched_notrace(); 1182}
889 else 1183
890 preempt_enable_notrace(); 1184static void
1185function_trace_call(unsigned long ip, unsigned long parent_ip)
1186{
1187 struct trace_array *tr = &global_trace;
1188 struct trace_array_cpu *data;
1189 unsigned long flags;
1190 long disabled;
1191 int cpu;
1192 int pc;
1193
1194 if (unlikely(!ftrace_function_enabled))
1195 return;
1196
1197 /*
1198 * Need to use raw, since this must be called before the
1199 * recursive protection is performed.
1200 */
1201 local_irq_save(flags);
1202 cpu = raw_smp_processor_id();
1203 data = tr->data[cpu];
1204 disabled = atomic_inc_return(&data->disabled);
1205
1206 if (likely(disabled == 1)) {
1207 pc = preempt_count();
1208 trace_function(tr, data, ip, parent_ip, flags, pc);
1209 }
1210
1211 atomic_dec(&data->disabled);
1212 local_irq_restore(flags);
1213}
1214
1215#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1216int trace_graph_entry(struct ftrace_graph_ent *trace)
1217{
1218 struct trace_array *tr = &global_trace;
1219 struct trace_array_cpu *data;
1220 unsigned long flags;
1221 long disabled;
1222 int cpu;
1223 int pc;
1224
1225 if (!ftrace_trace_task(current))
1226 return 0;
1227
1228 if (!ftrace_graph_addr(trace->func))
1229 return 0;
1230
1231 local_irq_save(flags);
1232 cpu = raw_smp_processor_id();
1233 data = tr->data[cpu];
1234 disabled = atomic_inc_return(&data->disabled);
1235 if (likely(disabled == 1)) {
1236 pc = preempt_count();
1237 __trace_graph_entry(tr, data, trace, flags, pc);
1238 }
1239 /* Only do the atomic if it is not already set */
1240 if (!test_tsk_trace_graph(current))
1241 set_tsk_trace_graph(current);
1242 atomic_dec(&data->disabled);
1243 local_irq_restore(flags);
1244
1245 return 1;
1246}
1247
1248void trace_graph_return(struct ftrace_graph_ret *trace)
1249{
1250 struct trace_array *tr = &global_trace;
1251 struct trace_array_cpu *data;
1252 unsigned long flags;
1253 long disabled;
1254 int cpu;
1255 int pc;
1256
1257 local_irq_save(flags);
1258 cpu = raw_smp_processor_id();
1259 data = tr->data[cpu];
1260 disabled = atomic_inc_return(&data->disabled);
1261 if (likely(disabled == 1)) {
1262 pc = preempt_count();
1263 __trace_graph_return(tr, data, trace, flags, pc);
1264 }
1265 if (!trace->depth)
1266 clear_tsk_trace_graph(current);
1267 atomic_dec(&data->disabled);
1268 local_irq_restore(flags);
891} 1269}
1270#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
892 1271
893static struct ftrace_ops trace_ops __read_mostly = 1272static struct ftrace_ops trace_ops __read_mostly =
894{ 1273{
@@ -898,9 +1277,14 @@ static struct ftrace_ops trace_ops __read_mostly =
898void tracing_start_function_trace(void) 1277void tracing_start_function_trace(void)
899{ 1278{
900 ftrace_function_enabled = 0; 1279 ftrace_function_enabled = 0;
1280
1281 if (trace_flags & TRACE_ITER_PREEMPTONLY)
1282 trace_ops.func = function_trace_call_preempt_only;
1283 else
1284 trace_ops.func = function_trace_call;
1285
901 register_ftrace_function(&trace_ops); 1286 register_ftrace_function(&trace_ops);
902 if (tracer_enabled) 1287 ftrace_function_enabled = 1;
903 ftrace_function_enabled = 1;
904} 1288}
905 1289
906void tracing_stop_function_trace(void) 1290void tracing_stop_function_trace(void)
@@ -912,6 +1296,7 @@ void tracing_stop_function_trace(void)
912 1296
913enum trace_file_type { 1297enum trace_file_type {
914 TRACE_FILE_LAT_FMT = 1, 1298 TRACE_FILE_LAT_FMT = 1,
1299 TRACE_FILE_ANNOTATE = 2,
915}; 1300};
916 1301
917static void trace_iterator_increment(struct trace_iterator *iter, int cpu) 1302static void trace_iterator_increment(struct trace_iterator *iter, int cpu)
@@ -1047,10 +1432,6 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1047 1432
1048 atomic_inc(&trace_record_cmdline_disabled); 1433 atomic_inc(&trace_record_cmdline_disabled);
1049 1434
1050 /* let the tracer grab locks here if needed */
1051 if (current_trace->start)
1052 current_trace->start(iter);
1053
1054 if (*pos != iter->pos) { 1435 if (*pos != iter->pos) {
1055 iter->ent = NULL; 1436 iter->ent = NULL;
1056 iter->cpu = 0; 1437 iter->cpu = 0;
@@ -1077,14 +1458,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1077 1458
1078static void s_stop(struct seq_file *m, void *p) 1459static void s_stop(struct seq_file *m, void *p)
1079{ 1460{
1080 struct trace_iterator *iter = m->private;
1081
1082 atomic_dec(&trace_record_cmdline_disabled); 1461 atomic_dec(&trace_record_cmdline_disabled);
1083
1084 /* let the tracer release locks here if needed */
1085 if (current_trace && current_trace == iter->trace && iter->trace->stop)
1086 iter->trace->stop(iter);
1087
1088 mutex_unlock(&trace_types_lock); 1462 mutex_unlock(&trace_types_lock);
1089} 1463}
1090 1464
@@ -1143,7 +1517,7 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt,
1143# define IP_FMT "%016lx" 1517# define IP_FMT "%016lx"
1144#endif 1518#endif
1145 1519
1146static int 1520int
1147seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) 1521seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
1148{ 1522{
1149 int ret; 1523 int ret;
@@ -1164,6 +1538,78 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
1164 return ret; 1538 return ret;
1165} 1539}
1166 1540
1541static inline int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
1542 unsigned long ip, unsigned long sym_flags)
1543{
1544 struct file *file = NULL;
1545 unsigned long vmstart = 0;
1546 int ret = 1;
1547
1548 if (mm) {
1549 const struct vm_area_struct *vma;
1550
1551 down_read(&mm->mmap_sem);
1552 vma = find_vma(mm, ip);
1553 if (vma) {
1554 file = vma->vm_file;
1555 vmstart = vma->vm_start;
1556 }
1557 if (file) {
1558 ret = trace_seq_path(s, &file->f_path);
1559 if (ret)
1560 ret = trace_seq_printf(s, "[+0x%lx]", ip - vmstart);
1561 }
1562 up_read(&mm->mmap_sem);
1563 }
1564 if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file))
1565 ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
1566 return ret;
1567}
1568
1569static int
1570seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
1571 unsigned long sym_flags)
1572{
1573 struct mm_struct *mm = NULL;
1574 int ret = 1;
1575 unsigned int i;
1576
1577 if (trace_flags & TRACE_ITER_SYM_USEROBJ) {
1578 struct task_struct *task;
1579 /*
1580 * we do the lookup on the thread group leader,
1581 * since individual threads might have already quit!
1582 */
1583 rcu_read_lock();
1584 task = find_task_by_vpid(entry->ent.tgid);
1585 if (task)
1586 mm = get_task_mm(task);
1587 rcu_read_unlock();
1588 }
1589
1590 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
1591 unsigned long ip = entry->caller[i];
1592
1593 if (ip == ULONG_MAX || !ret)
1594 break;
1595 if (i && ret)
1596 ret = trace_seq_puts(s, " <- ");
1597 if (!ip) {
1598 if (ret)
1599 ret = trace_seq_puts(s, "??");
1600 continue;
1601 }
1602 if (!ret)
1603 break;
1604 if (ret)
1605 ret = seq_print_user_ip(s, mm, ip, sym_flags);
1606 }
1607
1608 if (mm)
1609 mmput(mm);
1610 return ret;
1611}
1612
1167static void print_lat_help_header(struct seq_file *m) 1613static void print_lat_help_header(struct seq_file *m)
1168{ 1614{
1169 seq_puts(m, "# _------=> CPU# \n"); 1615 seq_puts(m, "# _------=> CPU# \n");
@@ -1338,6 +1784,23 @@ void trace_seq_print_cont(struct trace_seq *s, struct trace_iterator *iter)
1338 trace_seq_putc(s, '\n'); 1784 trace_seq_putc(s, '\n');
1339} 1785}
1340 1786
1787static void test_cpu_buff_start(struct trace_iterator *iter)
1788{
1789 struct trace_seq *s = &iter->seq;
1790
1791 if (!(trace_flags & TRACE_ITER_ANNOTATE))
1792 return;
1793
1794 if (!(iter->iter_flags & TRACE_FILE_ANNOTATE))
1795 return;
1796
1797 if (cpu_isset(iter->cpu, iter->started))
1798 return;
1799
1800 cpu_set(iter->cpu, iter->started);
1801 trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu);
1802}
1803
1341static enum print_line_t 1804static enum print_line_t
1342print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) 1805print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
1343{ 1806{
@@ -1357,6 +1820,8 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
1357 if (entry->type == TRACE_CONT) 1820 if (entry->type == TRACE_CONT)
1358 return TRACE_TYPE_HANDLED; 1821 return TRACE_TYPE_HANDLED;
1359 1822
1823 test_cpu_buff_start(iter);
1824
1360 next_entry = find_next_entry(iter, NULL, &next_ts); 1825 next_entry = find_next_entry(iter, NULL, &next_ts);
1361 if (!next_entry) 1826 if (!next_entry)
1362 next_ts = iter->ts; 1827 next_ts = iter->ts;
@@ -1448,6 +1913,27 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
1448 trace_seq_print_cont(s, iter); 1913 trace_seq_print_cont(s, iter);
1449 break; 1914 break;
1450 } 1915 }
1916 case TRACE_BRANCH: {
1917 struct trace_branch *field;
1918
1919 trace_assign_type(field, entry);
1920
1921 trace_seq_printf(s, "[%s] %s:%s:%d\n",
1922 field->correct ? " ok " : " MISS ",
1923 field->func,
1924 field->file,
1925 field->line);
1926 break;
1927 }
1928 case TRACE_USER_STACK: {
1929 struct userstack_entry *field;
1930
1931 trace_assign_type(field, entry);
1932
1933 seq_print_userip_objs(field, s, sym_flags);
1934 trace_seq_putc(s, '\n');
1935 break;
1936 }
1451 default: 1937 default:
1452 trace_seq_printf(s, "Unknown type %d\n", entry->type); 1938 trace_seq_printf(s, "Unknown type %d\n", entry->type);
1453 } 1939 }
@@ -1472,6 +1958,8 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
1472 if (entry->type == TRACE_CONT) 1958 if (entry->type == TRACE_CONT)
1473 return TRACE_TYPE_HANDLED; 1959 return TRACE_TYPE_HANDLED;
1474 1960
1961 test_cpu_buff_start(iter);
1962
1475 comm = trace_find_cmdline(iter->ent->pid); 1963 comm = trace_find_cmdline(iter->ent->pid);
1476 1964
1477 t = ns2usecs(iter->ts); 1965 t = ns2usecs(iter->ts);
@@ -1581,6 +2069,37 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
1581 trace_seq_print_cont(s, iter); 2069 trace_seq_print_cont(s, iter);
1582 break; 2070 break;
1583 } 2071 }
2072 case TRACE_GRAPH_RET: {
2073 return print_graph_function(iter);
2074 }
2075 case TRACE_GRAPH_ENT: {
2076 return print_graph_function(iter);
2077 }
2078 case TRACE_BRANCH: {
2079 struct trace_branch *field;
2080
2081 trace_assign_type(field, entry);
2082
2083 trace_seq_printf(s, "[%s] %s:%s:%d\n",
2084 field->correct ? " ok " : " MISS ",
2085 field->func,
2086 field->file,
2087 field->line);
2088 break;
2089 }
2090 case TRACE_USER_STACK: {
2091 struct userstack_entry *field;
2092
2093 trace_assign_type(field, entry);
2094
2095 ret = seq_print_userip_objs(field, s, sym_flags);
2096 if (!ret)
2097 return TRACE_TYPE_PARTIAL_LINE;
2098 ret = trace_seq_putc(s, '\n');
2099 if (!ret)
2100 return TRACE_TYPE_PARTIAL_LINE;
2101 break;
2102 }
1584 } 2103 }
1585 return TRACE_TYPE_HANDLED; 2104 return TRACE_TYPE_HANDLED;
1586} 2105}
@@ -1640,6 +2159,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
1640 break; 2159 break;
1641 } 2160 }
1642 case TRACE_SPECIAL: 2161 case TRACE_SPECIAL:
2162 case TRACE_USER_STACK:
1643 case TRACE_STACK: { 2163 case TRACE_STACK: {
1644 struct special_entry *field; 2164 struct special_entry *field;
1645 2165
@@ -1728,6 +2248,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
1728 break; 2248 break;
1729 } 2249 }
1730 case TRACE_SPECIAL: 2250 case TRACE_SPECIAL:
2251 case TRACE_USER_STACK:
1731 case TRACE_STACK: { 2252 case TRACE_STACK: {
1732 struct special_entry *field; 2253 struct special_entry *field;
1733 2254
@@ -1782,6 +2303,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
1782 break; 2303 break;
1783 } 2304 }
1784 case TRACE_SPECIAL: 2305 case TRACE_SPECIAL:
2306 case TRACE_USER_STACK:
1785 case TRACE_STACK: { 2307 case TRACE_STACK: {
1786 struct special_entry *field; 2308 struct special_entry *field;
1787 2309
@@ -1847,7 +2369,9 @@ static int s_show(struct seq_file *m, void *v)
1847 seq_printf(m, "# tracer: %s\n", iter->trace->name); 2369 seq_printf(m, "# tracer: %s\n", iter->trace->name);
1848 seq_puts(m, "#\n"); 2370 seq_puts(m, "#\n");
1849 } 2371 }
1850 if (iter->iter_flags & TRACE_FILE_LAT_FMT) { 2372 if (iter->trace && iter->trace->print_header)
2373 iter->trace->print_header(m);
2374 else if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
1851 /* print nothing if the buffers are empty */ 2375 /* print nothing if the buffers are empty */
1852 if (trace_empty(iter)) 2376 if (trace_empty(iter))
1853 return 0; 2377 return 0;
@@ -1899,6 +2423,15 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
1899 iter->trace = current_trace; 2423 iter->trace = current_trace;
1900 iter->pos = -1; 2424 iter->pos = -1;
1901 2425
2426 /* Notify the tracer early; before we stop tracing. */
2427 if (iter->trace && iter->trace->open)
2428 iter->trace->open(iter);
2429
2430 /* Annotate start of buffers if we had overruns */
2431 if (ring_buffer_overruns(iter->tr->buffer))
2432 iter->iter_flags |= TRACE_FILE_ANNOTATE;
2433
2434
1902 for_each_tracing_cpu(cpu) { 2435 for_each_tracing_cpu(cpu) {
1903 2436
1904 iter->buffer_iter[cpu] = 2437 iter->buffer_iter[cpu] =
@@ -1917,13 +2450,7 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
1917 m->private = iter; 2450 m->private = iter;
1918 2451
1919 /* stop the trace while dumping */ 2452 /* stop the trace while dumping */
1920 if (iter->tr->ctrl) { 2453 tracing_stop();
1921 tracer_enabled = 0;
1922 ftrace_function_enabled = 0;
1923 }
1924
1925 if (iter->trace && iter->trace->open)
1926 iter->trace->open(iter);
1927 2454
1928 mutex_unlock(&trace_types_lock); 2455 mutex_unlock(&trace_types_lock);
1929 2456
@@ -1966,14 +2493,7 @@ int tracing_release(struct inode *inode, struct file *file)
1966 iter->trace->close(iter); 2493 iter->trace->close(iter);
1967 2494
1968 /* reenable tracing if it was previously enabled */ 2495 /* reenable tracing if it was previously enabled */
1969 if (iter->tr->ctrl) { 2496 tracing_start();
1970 tracer_enabled = 1;
1971 /*
1972 * It is safe to enable function tracing even if it
1973 * isn't used
1974 */
1975 ftrace_function_enabled = 1;
1976 }
1977 mutex_unlock(&trace_types_lock); 2497 mutex_unlock(&trace_types_lock);
1978 2498
1979 seq_release(inode, file); 2499 seq_release(inode, file);
@@ -2151,7 +2671,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2151 if (err) 2671 if (err)
2152 goto err_unlock; 2672 goto err_unlock;
2153 2673
2154 raw_local_irq_disable(); 2674 local_irq_disable();
2155 __raw_spin_lock(&ftrace_max_lock); 2675 __raw_spin_lock(&ftrace_max_lock);
2156 for_each_tracing_cpu(cpu) { 2676 for_each_tracing_cpu(cpu) {
2157 /* 2677 /*
@@ -2168,7 +2688,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2168 } 2688 }
2169 } 2689 }
2170 __raw_spin_unlock(&ftrace_max_lock); 2690 __raw_spin_unlock(&ftrace_max_lock);
2171 raw_local_irq_enable(); 2691 local_irq_enable();
2172 2692
2173 tracing_cpumask = tracing_cpumask_new; 2693 tracing_cpumask = tracing_cpumask_new;
2174 2694
@@ -2189,13 +2709,16 @@ static struct file_operations tracing_cpumask_fops = {
2189}; 2709};
2190 2710
2191static ssize_t 2711static ssize_t
2192tracing_iter_ctrl_read(struct file *filp, char __user *ubuf, 2712tracing_trace_options_read(struct file *filp, char __user *ubuf,
2193 size_t cnt, loff_t *ppos) 2713 size_t cnt, loff_t *ppos)
2194{ 2714{
2715 int i;
2195 char *buf; 2716 char *buf;
2196 int r = 0; 2717 int r = 0;
2197 int len = 0; 2718 int len = 0;
2198 int i; 2719 u32 tracer_flags = current_trace->flags->val;
2720 struct tracer_opt *trace_opts = current_trace->flags->opts;
2721
2199 2722
 2200 /* calculate max size */ 2723
2201 for (i = 0; trace_options[i]; i++) { 2724 for (i = 0; trace_options[i]; i++) {
@@ -2203,6 +2726,15 @@ tracing_iter_ctrl_read(struct file *filp, char __user *ubuf,
2203 len += 3; /* "no" and space */ 2726 len += 3; /* "no" and space */
2204 } 2727 }
2205 2728
2729 /*
2730 * Increase the size with names of options specific
2731 * of the current tracer.
2732 */
2733 for (i = 0; trace_opts[i].name; i++) {
2734 len += strlen(trace_opts[i].name);
2735 len += 3; /* "no" and space */
2736 }
2737
2206 /* +2 for \n and \0 */ 2738 /* +2 for \n and \0 */
2207 buf = kmalloc(len + 2, GFP_KERNEL); 2739 buf = kmalloc(len + 2, GFP_KERNEL);
2208 if (!buf) 2740 if (!buf)
@@ -2215,6 +2747,15 @@ tracing_iter_ctrl_read(struct file *filp, char __user *ubuf,
2215 r += sprintf(buf + r, "no%s ", trace_options[i]); 2747 r += sprintf(buf + r, "no%s ", trace_options[i]);
2216 } 2748 }
2217 2749
2750 for (i = 0; trace_opts[i].name; i++) {
2751 if (tracer_flags & trace_opts[i].bit)
2752 r += sprintf(buf + r, "%s ",
2753 trace_opts[i].name);
2754 else
2755 r += sprintf(buf + r, "no%s ",
2756 trace_opts[i].name);
2757 }
2758
2218 r += sprintf(buf + r, "\n"); 2759 r += sprintf(buf + r, "\n");
2219 WARN_ON(r >= len + 2); 2760 WARN_ON(r >= len + 2);
2220 2761
@@ -2225,13 +2766,48 @@ tracing_iter_ctrl_read(struct file *filp, char __user *ubuf,
2225 return r; 2766 return r;
2226} 2767}
2227 2768
2769/* Try to assign a tracer specific option */
2770static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
2771{
2772 struct tracer_flags *trace_flags = trace->flags;
2773 struct tracer_opt *opts = NULL;
2774 int ret = 0, i = 0;
2775 int len;
2776
2777 for (i = 0; trace_flags->opts[i].name; i++) {
2778 opts = &trace_flags->opts[i];
2779 len = strlen(opts->name);
2780
2781 if (strncmp(cmp, opts->name, len) == 0) {
2782 ret = trace->set_flag(trace_flags->val,
2783 opts->bit, !neg);
2784 break;
2785 }
2786 }
2787 /* Not found */
2788 if (!trace_flags->opts[i].name)
2789 return -EINVAL;
2790
2791 /* Refused to handle */
2792 if (ret)
2793 return ret;
2794
2795 if (neg)
2796 trace_flags->val &= ~opts->bit;
2797 else
2798 trace_flags->val |= opts->bit;
2799
2800 return 0;
2801}
2802
2228static ssize_t 2803static ssize_t
2229tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf, 2804tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2230 size_t cnt, loff_t *ppos) 2805 size_t cnt, loff_t *ppos)
2231{ 2806{
2232 char buf[64]; 2807 char buf[64];
2233 char *cmp = buf; 2808 char *cmp = buf;
2234 int neg = 0; 2809 int neg = 0;
2810 int ret;
2235 int i; 2811 int i;
2236 2812
2237 if (cnt >= sizeof(buf)) 2813 if (cnt >= sizeof(buf))
@@ -2258,11 +2834,13 @@ tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf,
2258 break; 2834 break;
2259 } 2835 }
2260 } 2836 }
2261 /* 2837
2262 * If no option could be set, return an error: 2838 /* If no option could be set, test the specific tracer options */
2263 */ 2839 if (!trace_options[i]) {
2264 if (!trace_options[i]) 2840 ret = set_tracer_option(current_trace, cmp, neg);
2265 return -EINVAL; 2841 if (ret)
2842 return ret;
2843 }
2266 2844
2267 filp->f_pos += cnt; 2845 filp->f_pos += cnt;
2268 2846
@@ -2271,8 +2849,8 @@ tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf,
2271 2849
2272static struct file_operations tracing_iter_fops = { 2850static struct file_operations tracing_iter_fops = {
2273 .open = tracing_open_generic, 2851 .open = tracing_open_generic,
2274 .read = tracing_iter_ctrl_read, 2852 .read = tracing_trace_options_read,
2275 .write = tracing_iter_ctrl_write, 2853 .write = tracing_trace_options_write,
2276}; 2854};
2277 2855
2278static const char readme_msg[] = 2856static const char readme_msg[] =
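tracing_trace_options_write() and set_tracer_option() above share the convention that a leading "no" clears an option while the bare name sets it. The standalone helper below is not part of the patch; it simply restates that matching logic in isolation.

/*
 * Standalone restatement of the "no" prefix handling used by
 * trace_options (illustrative only, not in the patch).
 */
static int apply_option(const char *cmp, const char *name,
			unsigned long bit, unsigned long *flags)
{
	int neg = 0;

	if (strncmp(cmp, "no", 2) == 0) {
		neg = 1;
		cmp += 2;
	}

	if (strcmp(cmp, name) != 0)
		return -EINVAL;		/* not this option */

	if (neg)
		*flags &= ~bit;
	else
		*flags |= bit;

	return 0;
}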
@@ -2286,9 +2864,9 @@ static const char readme_msg[] =
2286 "# echo sched_switch > /debug/tracing/current_tracer\n" 2864 "# echo sched_switch > /debug/tracing/current_tracer\n"
2287 "# cat /debug/tracing/current_tracer\n" 2865 "# cat /debug/tracing/current_tracer\n"
2288 "sched_switch\n" 2866 "sched_switch\n"
2289 "# cat /debug/tracing/iter_ctrl\n" 2867 "# cat /debug/tracing/trace_options\n"
2290 "noprint-parent nosym-offset nosym-addr noverbose\n" 2868 "noprint-parent nosym-offset nosym-addr noverbose\n"
2291 "# echo print-parent > /debug/tracing/iter_ctrl\n" 2869 "# echo print-parent > /debug/tracing/trace_options\n"
2292 "# echo 1 > /debug/tracing/tracing_enabled\n" 2870 "# echo 1 > /debug/tracing/tracing_enabled\n"
2293 "# cat /debug/tracing/trace > /tmp/trace.txt\n" 2871 "# cat /debug/tracing/trace > /tmp/trace.txt\n"
2294 "echo 0 > /debug/tracing/tracing_enabled\n" 2872 "echo 0 > /debug/tracing/tracing_enabled\n"
@@ -2311,11 +2889,10 @@ static ssize_t
2311tracing_ctrl_read(struct file *filp, char __user *ubuf, 2889tracing_ctrl_read(struct file *filp, char __user *ubuf,
2312 size_t cnt, loff_t *ppos) 2890 size_t cnt, loff_t *ppos)
2313{ 2891{
2314 struct trace_array *tr = filp->private_data;
2315 char buf[64]; 2892 char buf[64];
2316 int r; 2893 int r;
2317 2894
2318 r = sprintf(buf, "%ld\n", tr->ctrl); 2895 r = sprintf(buf, "%u\n", tracer_enabled);
2319 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 2896 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2320} 2897}
2321 2898
@@ -2343,16 +2920,18 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
2343 val = !!val; 2920 val = !!val;
2344 2921
2345 mutex_lock(&trace_types_lock); 2922 mutex_lock(&trace_types_lock);
2346 if (tr->ctrl ^ val) { 2923 if (tracer_enabled ^ val) {
2347 if (val) 2924 if (val) {
2348 tracer_enabled = 1; 2925 tracer_enabled = 1;
2349 else 2926 if (current_trace->start)
2927 current_trace->start(tr);
2928 tracing_start();
2929 } else {
2350 tracer_enabled = 0; 2930 tracer_enabled = 0;
2351 2931 tracing_stop();
2352 tr->ctrl = val; 2932 if (current_trace->stop)
2353 2933 current_trace->stop(tr);
2354 if (current_trace && current_trace->ctrl_update) 2934 }
2355 current_trace->ctrl_update(tr);
2356 } 2935 }
2357 mutex_unlock(&trace_types_lock); 2936 mutex_unlock(&trace_types_lock);
2358 2937
@@ -2378,29 +2957,11 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf,
2378 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 2957 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2379} 2958}
2380 2959
2381static ssize_t 2960static int tracing_set_tracer(char *buf)
2382tracing_set_trace_write(struct file *filp, const char __user *ubuf,
2383 size_t cnt, loff_t *ppos)
2384{ 2961{
2385 struct trace_array *tr = &global_trace; 2962 struct trace_array *tr = &global_trace;
2386 struct tracer *t; 2963 struct tracer *t;
2387 char buf[max_tracer_type_len+1]; 2964 int ret = 0;
2388 int i;
2389 size_t ret;
2390
2391 ret = cnt;
2392
2393 if (cnt > max_tracer_type_len)
2394 cnt = max_tracer_type_len;
2395
2396 if (copy_from_user(&buf, ubuf, cnt))
2397 return -EFAULT;
2398
2399 buf[cnt] = 0;
2400
2401 /* strip ending whitespace. */
2402 for (i = cnt - 1; i > 0 && isspace(buf[i]); i--)
2403 buf[i] = 0;
2404 2965
2405 mutex_lock(&trace_types_lock); 2966 mutex_lock(&trace_types_lock);
2406 for (t = trace_types; t; t = t->next) { 2967 for (t = trace_types; t; t = t->next) {
@@ -2414,18 +2975,52 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
2414 if (t == current_trace) 2975 if (t == current_trace)
2415 goto out; 2976 goto out;
2416 2977
2978 trace_branch_disable();
2417 if (current_trace && current_trace->reset) 2979 if (current_trace && current_trace->reset)
2418 current_trace->reset(tr); 2980 current_trace->reset(tr);
2419 2981
2420 current_trace = t; 2982 current_trace = t;
2421 if (t->init) 2983 if (t->init) {
2422 t->init(tr); 2984 ret = t->init(tr);
2985 if (ret)
2986 goto out;
2987 }
2423 2988
2989 trace_branch_enable(tr);
2424 out: 2990 out:
2425 mutex_unlock(&trace_types_lock); 2991 mutex_unlock(&trace_types_lock);
2426 2992
2427 if (ret > 0) 2993 return ret;
2428 filp->f_pos += ret; 2994}
2995
2996static ssize_t
2997tracing_set_trace_write(struct file *filp, const char __user *ubuf,
2998 size_t cnt, loff_t *ppos)
2999{
3000 char buf[max_tracer_type_len+1];
3001 int i;
3002 size_t ret;
3003 int err;
3004
3005 ret = cnt;
3006
3007 if (cnt > max_tracer_type_len)
3008 cnt = max_tracer_type_len;
3009
3010 if (copy_from_user(&buf, ubuf, cnt))
3011 return -EFAULT;
3012
3013 buf[cnt] = 0;
3014
3015 /* strip ending whitespace. */
3016 for (i = cnt - 1; i > 0 && isspace(buf[i]); i--)
3017 buf[i] = 0;
3018
3019 err = tracing_set_tracer(buf);
3020 if (err)
3021 return err;
3022
3023 filp->f_pos += ret;
2429 3024
2430 return ret; 3025 return ret;
2431} 3026}
@@ -2492,6 +3087,10 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
2492 return -ENOMEM; 3087 return -ENOMEM;
2493 3088
2494 mutex_lock(&trace_types_lock); 3089 mutex_lock(&trace_types_lock);
3090
3091 /* trace pipe does not show start of buffer */
3092 cpus_setall(iter->started);
3093
2495 iter->tr = &global_trace; 3094 iter->tr = &global_trace;
2496 iter->trace = current_trace; 3095 iter->trace = current_trace;
2497 filp->private_data = iter; 3096 filp->private_data = iter;
@@ -2667,7 +3266,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf,
2667 char buf[64]; 3266 char buf[64];
2668 int r; 3267 int r;
2669 3268
2670 r = sprintf(buf, "%lu\n", tr->entries); 3269 r = sprintf(buf, "%lu\n", tr->entries >> 10);
2671 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 3270 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2672} 3271}
2673 3272
@@ -2678,7 +3277,6 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
2678 unsigned long val; 3277 unsigned long val;
2679 char buf[64]; 3278 char buf[64];
2680 int ret, cpu; 3279 int ret, cpu;
2681 struct trace_array *tr = filp->private_data;
2682 3280
2683 if (cnt >= sizeof(buf)) 3281 if (cnt >= sizeof(buf))
2684 return -EINVAL; 3282 return -EINVAL;
@@ -2698,12 +3296,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
2698 3296
2699 mutex_lock(&trace_types_lock); 3297 mutex_lock(&trace_types_lock);
2700 3298
2701 if (tr->ctrl) { 3299 tracing_stop();
2702 cnt = -EBUSY;
2703 pr_info("ftrace: please disable tracing"
2704 " before modifying buffer size\n");
2705 goto out;
2706 }
2707 3300
2708 /* disable all cpu buffers */ 3301 /* disable all cpu buffers */
2709 for_each_tracing_cpu(cpu) { 3302 for_each_tracing_cpu(cpu) {
@@ -2713,6 +3306,9 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
2713 atomic_inc(&max_tr.data[cpu]->disabled); 3306 atomic_inc(&max_tr.data[cpu]->disabled);
2714 } 3307 }
2715 3308
3309 /* value is in KB */
3310 val <<= 10;
3311
2716 if (val != global_trace.entries) { 3312 if (val != global_trace.entries) {
2717 ret = ring_buffer_resize(global_trace.buffer, val); 3313 ret = ring_buffer_resize(global_trace.buffer, val);
2718 if (ret < 0) { 3314 if (ret < 0) {
@@ -2751,6 +3347,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
2751 atomic_dec(&max_tr.data[cpu]->disabled); 3347 atomic_dec(&max_tr.data[cpu]->disabled);
2752 } 3348 }
2753 3349
3350 tracing_start();
2754 max_tr.entries = global_trace.entries; 3351 max_tr.entries = global_trace.entries;
2755 mutex_unlock(&trace_types_lock); 3352 mutex_unlock(&trace_types_lock);
2756 3353
@@ -2762,7 +3359,7 @@ static int mark_printk(const char *fmt, ...)
2762 int ret; 3359 int ret;
2763 va_list args; 3360 va_list args;
2764 va_start(args, fmt); 3361 va_start(args, fmt);
2765 ret = trace_vprintk(0, fmt, args); 3362 ret = trace_vprintk(0, -1, fmt, args);
2766 va_end(args); 3363 va_end(args);
2767 return ret; 3364 return ret;
2768} 3365}
@@ -2773,9 +3370,8 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
2773{ 3370{
2774 char *buf; 3371 char *buf;
2775 char *end; 3372 char *end;
2776 struct trace_array *tr = &global_trace;
2777 3373
2778 if (!tr->ctrl || tracing_disabled) 3374 if (tracing_disabled)
2779 return -EINVAL; 3375 return -EINVAL;
2780 3376
2781 if (cnt > TRACE_BUF_SIZE) 3377 if (cnt > TRACE_BUF_SIZE)
@@ -2841,22 +3437,38 @@ static struct file_operations tracing_mark_fops = {
2841 3437
2842#ifdef CONFIG_DYNAMIC_FTRACE 3438#ifdef CONFIG_DYNAMIC_FTRACE
2843 3439
3440int __weak ftrace_arch_read_dyn_info(char *buf, int size)
3441{
3442 return 0;
3443}
3444
2844static ssize_t 3445static ssize_t
2845tracing_read_long(struct file *filp, char __user *ubuf, 3446tracing_read_dyn_info(struct file *filp, char __user *ubuf,
2846 size_t cnt, loff_t *ppos) 3447 size_t cnt, loff_t *ppos)
2847{ 3448{
3449 static char ftrace_dyn_info_buffer[1024];
3450 static DEFINE_MUTEX(dyn_info_mutex);
2848 unsigned long *p = filp->private_data; 3451 unsigned long *p = filp->private_data;
2849 char buf[64]; 3452 char *buf = ftrace_dyn_info_buffer;
3453 int size = ARRAY_SIZE(ftrace_dyn_info_buffer);
2850 int r; 3454 int r;
2851 3455
2852 r = sprintf(buf, "%ld\n", *p); 3456 mutex_lock(&dyn_info_mutex);
3457 r = sprintf(buf, "%ld ", *p);
2853 3458
2854 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 3459 r += ftrace_arch_read_dyn_info(buf+r, (size-1)-r);
3460 buf[r++] = '\n';
3461
3462 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
3463
3464 mutex_unlock(&dyn_info_mutex);
3465
3466 return r;
2855} 3467}
2856 3468
2857static struct file_operations tracing_read_long_fops = { 3469static struct file_operations tracing_dyn_info_fops = {
2858 .open = tracing_open_generic, 3470 .open = tracing_open_generic,
2859 .read = tracing_read_long, 3471 .read = tracing_read_dyn_info,
2860}; 3472};
2861#endif 3473#endif
2862 3474
@@ -2897,10 +3509,10 @@ static __init int tracer_init_debugfs(void)
2897 if (!entry) 3509 if (!entry)
2898 pr_warning("Could not create debugfs 'tracing_enabled' entry\n"); 3510 pr_warning("Could not create debugfs 'tracing_enabled' entry\n");
2899 3511
2900 entry = debugfs_create_file("iter_ctrl", 0644, d_tracer, 3512 entry = debugfs_create_file("trace_options", 0644, d_tracer,
2901 NULL, &tracing_iter_fops); 3513 NULL, &tracing_iter_fops);
2902 if (!entry) 3514 if (!entry)
2903 pr_warning("Could not create debugfs 'iter_ctrl' entry\n"); 3515 pr_warning("Could not create debugfs 'trace_options' entry\n");
2904 3516
2905 entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer, 3517 entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer,
2906 NULL, &tracing_cpumask_fops); 3518 NULL, &tracing_cpumask_fops);
@@ -2950,11 +3562,11 @@ static __init int tracer_init_debugfs(void)
2950 pr_warning("Could not create debugfs " 3562 pr_warning("Could not create debugfs "
2951 "'trace_pipe' entry\n"); 3563 "'trace_pipe' entry\n");
2952 3564
2953 entry = debugfs_create_file("trace_entries", 0644, d_tracer, 3565 entry = debugfs_create_file("buffer_size_kb", 0644, d_tracer,
2954 &global_trace, &tracing_entries_fops); 3566 &global_trace, &tracing_entries_fops);
2955 if (!entry) 3567 if (!entry)
2956 pr_warning("Could not create debugfs " 3568 pr_warning("Could not create debugfs "
2957 "'trace_entries' entry\n"); 3569 "'buffer_size_kb' entry\n");
2958 3570
2959 entry = debugfs_create_file("trace_marker", 0220, d_tracer, 3571 entry = debugfs_create_file("trace_marker", 0220, d_tracer,
2960 NULL, &tracing_mark_fops); 3572 NULL, &tracing_mark_fops);
@@ -2965,7 +3577,7 @@ static __init int tracer_init_debugfs(void)
2965#ifdef CONFIG_DYNAMIC_FTRACE 3577#ifdef CONFIG_DYNAMIC_FTRACE
2966 entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer, 3578 entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer,
2967 &ftrace_update_tot_cnt, 3579 &ftrace_update_tot_cnt,
2968 &tracing_read_long_fops); 3580 &tracing_dyn_info_fops);
2969 if (!entry) 3581 if (!entry)
2970 pr_warning("Could not create debugfs " 3582 pr_warning("Could not create debugfs "
2971 "'dyn_ftrace_total_info' entry\n"); 3583 "'dyn_ftrace_total_info' entry\n");
@@ -2976,7 +3588,7 @@ static __init int tracer_init_debugfs(void)
2976 return 0; 3588 return 0;
2977} 3589}
2978 3590
2979int trace_vprintk(unsigned long ip, const char *fmt, va_list args) 3591int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
2980{ 3592{
2981 static DEFINE_SPINLOCK(trace_buf_lock); 3593 static DEFINE_SPINLOCK(trace_buf_lock);
2982 static char trace_buf[TRACE_BUF_SIZE]; 3594 static char trace_buf[TRACE_BUF_SIZE];
@@ -2984,11 +3596,11 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
2984 struct ring_buffer_event *event; 3596 struct ring_buffer_event *event;
2985 struct trace_array *tr = &global_trace; 3597 struct trace_array *tr = &global_trace;
2986 struct trace_array_cpu *data; 3598 struct trace_array_cpu *data;
2987 struct print_entry *entry;
2988 unsigned long flags, irq_flags;
2989 int cpu, len = 0, size, pc; 3599 int cpu, len = 0, size, pc;
3600 struct print_entry *entry;
3601 unsigned long irq_flags;
2990 3602
2991 if (!tr->ctrl || tracing_disabled) 3603 if (tracing_disabled || tracing_selftest_running)
2992 return 0; 3604 return 0;
2993 3605
2994 pc = preempt_count(); 3606 pc = preempt_count();
@@ -2999,7 +3611,8 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
2999 if (unlikely(atomic_read(&data->disabled))) 3611 if (unlikely(atomic_read(&data->disabled)))
3000 goto out; 3612 goto out;
3001 3613
3002 spin_lock_irqsave(&trace_buf_lock, flags); 3614 pause_graph_tracing();
3615 spin_lock_irqsave(&trace_buf_lock, irq_flags);
3003 len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); 3616 len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
3004 3617
3005 len = min(len, TRACE_BUF_SIZE-1); 3618 len = min(len, TRACE_BUF_SIZE-1);
@@ -3010,17 +3623,18 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
3010 if (!event) 3623 if (!event)
3011 goto out_unlock; 3624 goto out_unlock;
3012 entry = ring_buffer_event_data(event); 3625 entry = ring_buffer_event_data(event);
3013 tracing_generic_entry_update(&entry->ent, flags, pc); 3626 tracing_generic_entry_update(&entry->ent, irq_flags, pc);
3014 entry->ent.type = TRACE_PRINT; 3627 entry->ent.type = TRACE_PRINT;
3015 entry->ip = ip; 3628 entry->ip = ip;
3629 entry->depth = depth;
3016 3630
3017 memcpy(&entry->buf, trace_buf, len); 3631 memcpy(&entry->buf, trace_buf, len);
3018 entry->buf[len] = 0; 3632 entry->buf[len] = 0;
3019 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 3633 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
3020 3634
3021 out_unlock: 3635 out_unlock:
3022 spin_unlock_irqrestore(&trace_buf_lock, flags); 3636 spin_unlock_irqrestore(&trace_buf_lock, irq_flags);
3023 3637 unpause_graph_tracing();
3024 out: 3638 out:
3025 preempt_enable_notrace(); 3639 preempt_enable_notrace();
3026 3640
@@ -3037,7 +3651,7 @@ int __ftrace_printk(unsigned long ip, const char *fmt, ...)
3037 return 0; 3651 return 0;
3038 3652
3039 va_start(ap, fmt); 3653 va_start(ap, fmt);
3040 ret = trace_vprintk(ip, fmt, ap); 3654 ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap);
3041 va_end(ap); 3655 va_end(ap);
3042 return ret; 3656 return ret;
3043} 3657}
@@ -3046,7 +3660,8 @@ EXPORT_SYMBOL_GPL(__ftrace_printk);
3046static int trace_panic_handler(struct notifier_block *this, 3660static int trace_panic_handler(struct notifier_block *this,
3047 unsigned long event, void *unused) 3661 unsigned long event, void *unused)
3048{ 3662{
3049 ftrace_dump(); 3663 if (ftrace_dump_on_oops)
3664 ftrace_dump();
3050 return NOTIFY_OK; 3665 return NOTIFY_OK;
3051} 3666}
3052 3667
@@ -3062,7 +3677,8 @@ static int trace_die_handler(struct notifier_block *self,
3062{ 3677{
3063 switch (val) { 3678 switch (val) {
3064 case DIE_OOPS: 3679 case DIE_OOPS:
3065 ftrace_dump(); 3680 if (ftrace_dump_on_oops)
3681 ftrace_dump();
3066 break; 3682 break;
3067 default: 3683 default:
3068 break; 3684 break;
@@ -3103,7 +3719,6 @@ trace_printk_seq(struct trace_seq *s)
3103 trace_seq_reset(s); 3719 trace_seq_reset(s);
3104} 3720}
3105 3721
3106
3107void ftrace_dump(void) 3722void ftrace_dump(void)
3108{ 3723{
3109 static DEFINE_SPINLOCK(ftrace_dump_lock); 3724 static DEFINE_SPINLOCK(ftrace_dump_lock);
@@ -3128,6 +3743,9 @@ void ftrace_dump(void)
3128 atomic_inc(&global_trace.data[cpu]->disabled); 3743 atomic_inc(&global_trace.data[cpu]->disabled);
3129 } 3744 }
3130 3745
3746 /* don't look at user memory in panic mode */
3747 trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
3748
3131 printk(KERN_TRACE "Dumping ftrace buffer:\n"); 3749 printk(KERN_TRACE "Dumping ftrace buffer:\n");
3132 3750
3133 iter.tr = &global_trace; 3751 iter.tr = &global_trace;
@@ -3221,7 +3839,6 @@ __init static int tracer_alloc_buffers(void)
3221#endif 3839#endif
3222 3840
3223 /* All seems OK, enable tracing */ 3841 /* All seems OK, enable tracing */
3224 global_trace.ctrl = tracer_enabled;
3225 tracing_disabled = 0; 3842 tracing_disabled = 0;
3226 3843
3227 atomic_notifier_chain_register(&panic_notifier_list, 3844 atomic_notifier_chain_register(&panic_notifier_list,
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 8465ad052707..5ac697065a48 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -8,6 +8,7 @@
8#include <linux/ring_buffer.h> 8#include <linux/ring_buffer.h>
9#include <linux/mmiotrace.h> 9#include <linux/mmiotrace.h>
10#include <linux/ftrace.h> 10#include <linux/ftrace.h>
11#include <trace/boot.h>
11 12
12enum trace_type { 13enum trace_type {
13 __TRACE_FIRST_TYPE = 0, 14 __TRACE_FIRST_TYPE = 0,
@@ -21,7 +22,14 @@ enum trace_type {
21 TRACE_SPECIAL, 22 TRACE_SPECIAL,
22 TRACE_MMIO_RW, 23 TRACE_MMIO_RW,
23 TRACE_MMIO_MAP, 24 TRACE_MMIO_MAP,
24 TRACE_BOOT, 25 TRACE_BRANCH,
26 TRACE_BOOT_CALL,
27 TRACE_BOOT_RET,
28 TRACE_GRAPH_RET,
29 TRACE_GRAPH_ENT,
30 TRACE_USER_STACK,
31 TRACE_BTS,
32 TRACE_POWER,
25 33
26 __TRACE_LAST_TYPE 34 __TRACE_LAST_TYPE
27}; 35};
@@ -38,6 +46,7 @@ struct trace_entry {
38 unsigned char flags; 46 unsigned char flags;
39 unsigned char preempt_count; 47 unsigned char preempt_count;
40 int pid; 48 int pid;
49 int tgid;
41}; 50};
42 51
43/* 52/*
@@ -48,6 +57,18 @@ struct ftrace_entry {
48 unsigned long ip; 57 unsigned long ip;
49 unsigned long parent_ip; 58 unsigned long parent_ip;
50}; 59};
60
61/* Function call entry */
62struct ftrace_graph_ent_entry {
63 struct trace_entry ent;
64 struct ftrace_graph_ent graph_ent;
65};
66
67/* Function return entry */
68struct ftrace_graph_ret_entry {
69 struct trace_entry ent;
70 struct ftrace_graph_ret ret;
71};
51extern struct tracer boot_tracer; 72extern struct tracer boot_tracer;
52 73
53/* 74/*
@@ -85,12 +106,18 @@ struct stack_entry {
85 unsigned long caller[FTRACE_STACK_ENTRIES]; 106 unsigned long caller[FTRACE_STACK_ENTRIES];
86}; 107};
87 108
109struct userstack_entry {
110 struct trace_entry ent;
111 unsigned long caller[FTRACE_STACK_ENTRIES];
112};
113
88/* 114/*
89 * ftrace_printk entry: 115 * ftrace_printk entry:
90 */ 116 */
91struct print_entry { 117struct print_entry {
92 struct trace_entry ent; 118 struct trace_entry ent;
93 unsigned long ip; 119 unsigned long ip;
120 int depth;
94 char buf[]; 121 char buf[];
95}; 122};
96 123
@@ -112,9 +139,35 @@ struct trace_mmiotrace_map {
112 struct mmiotrace_map map; 139 struct mmiotrace_map map;
113}; 140};
114 141
115struct trace_boot { 142struct trace_boot_call {
116 struct trace_entry ent; 143 struct trace_entry ent;
117 struct boot_trace initcall; 144 struct boot_trace_call boot_call;
145};
146
147struct trace_boot_ret {
148 struct trace_entry ent;
149 struct boot_trace_ret boot_ret;
150};
151
152#define TRACE_FUNC_SIZE 30
153#define TRACE_FILE_SIZE 20
154struct trace_branch {
155 struct trace_entry ent;
156 unsigned line;
157 char func[TRACE_FUNC_SIZE+1];
158 char file[TRACE_FILE_SIZE+1];
159 char correct;
160};
161
162struct bts_entry {
163 struct trace_entry ent;
164 unsigned long from;
165 unsigned long to;
166};
167
168struct trace_power {
169 struct trace_entry ent;
170 struct power_trace state_data;
118}; 171};
119 172
120/* 173/*
@@ -172,7 +225,6 @@ struct trace_iterator;
172struct trace_array { 225struct trace_array {
173 struct ring_buffer *buffer; 226 struct ring_buffer *buffer;
174 unsigned long entries; 227 unsigned long entries;
175 long ctrl;
176 int cpu; 228 int cpu;
177 cycle_t time_start; 229 cycle_t time_start;
178 struct task_struct *waiter; 230 struct task_struct *waiter;
@@ -212,13 +264,22 @@ extern void __ftrace_bad_type(void);
212 IF_ASSIGN(var, ent, struct ctx_switch_entry, 0); \ 264 IF_ASSIGN(var, ent, struct ctx_switch_entry, 0); \
213 IF_ASSIGN(var, ent, struct trace_field_cont, TRACE_CONT); \ 265 IF_ASSIGN(var, ent, struct trace_field_cont, TRACE_CONT); \
214 IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \ 266 IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \
267 IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
215 IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ 268 IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \
216 IF_ASSIGN(var, ent, struct special_entry, 0); \ 269 IF_ASSIGN(var, ent, struct special_entry, 0); \
217 IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ 270 IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \
218 TRACE_MMIO_RW); \ 271 TRACE_MMIO_RW); \
219 IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ 272 IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \
220 TRACE_MMIO_MAP); \ 273 TRACE_MMIO_MAP); \
221 IF_ASSIGN(var, ent, struct trace_boot, TRACE_BOOT); \ 274 IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\
275 IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\
276 IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \
277 IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \
278 TRACE_GRAPH_ENT); \
279 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \
280 TRACE_GRAPH_RET); \
281 IF_ASSIGN(var, ent, struct bts_entry, TRACE_BTS);\
282 IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \
222 __ftrace_bad_type(); \ 283 __ftrace_bad_type(); \
223 } while (0) 284 } while (0)
224 285
@@ -229,29 +290,56 @@ enum print_line_t {
229 TRACE_TYPE_UNHANDLED = 2 /* Relay to other output functions */ 290 TRACE_TYPE_UNHANDLED = 2 /* Relay to other output functions */
230}; 291};
231 292
293
294/*
295 * An option specific to a tracer. This is a boolean value.
296 * The bit is the bit index that sets its value on the
297 * flags value in struct tracer_flags.
298 */
299struct tracer_opt {
300 const char *name; /* Will appear on the trace_options file */
301 u32 bit; /* Mask assigned in val field in tracer_flags */
302};
303
304/*
305 * The set of specific options for a tracer. Your tracer
 306 * has to set the initial value of the flags val.
307 */
308struct tracer_flags {
309 u32 val;
310 struct tracer_opt *opts;
311};
312
 313/* Makes it easier to define a tracer opt */
314#define TRACER_OPT(s, b) .name = #s, .bit = b
315
232/* 316/*
233 * A specific tracer, represented by methods that operate on a trace array: 317 * A specific tracer, represented by methods that operate on a trace array:
234 */ 318 */
235struct tracer { 319struct tracer {
236 const char *name; 320 const char *name;
237 void (*init)(struct trace_array *tr); 321 /* Your tracer should raise a warning if init fails */
322 int (*init)(struct trace_array *tr);
238 void (*reset)(struct trace_array *tr); 323 void (*reset)(struct trace_array *tr);
324 void (*start)(struct trace_array *tr);
325 void (*stop)(struct trace_array *tr);
239 void (*open)(struct trace_iterator *iter); 326 void (*open)(struct trace_iterator *iter);
240 void (*pipe_open)(struct trace_iterator *iter); 327 void (*pipe_open)(struct trace_iterator *iter);
241 void (*close)(struct trace_iterator *iter); 328 void (*close)(struct trace_iterator *iter);
242 void (*start)(struct trace_iterator *iter);
243 void (*stop)(struct trace_iterator *iter);
244 ssize_t (*read)(struct trace_iterator *iter, 329 ssize_t (*read)(struct trace_iterator *iter,
245 struct file *filp, char __user *ubuf, 330 struct file *filp, char __user *ubuf,
246 size_t cnt, loff_t *ppos); 331 size_t cnt, loff_t *ppos);
247 void (*ctrl_update)(struct trace_array *tr);
248#ifdef CONFIG_FTRACE_STARTUP_TEST 332#ifdef CONFIG_FTRACE_STARTUP_TEST
249 int (*selftest)(struct tracer *trace, 333 int (*selftest)(struct tracer *trace,
250 struct trace_array *tr); 334 struct trace_array *tr);
251#endif 335#endif
336 void (*print_header)(struct seq_file *m);
252 enum print_line_t (*print_line)(struct trace_iterator *iter); 337 enum print_line_t (*print_line)(struct trace_iterator *iter);
338 /* If you handled the flag setting, return 0 */
339 int (*set_flag)(u32 old_flags, u32 bit, int set);
253 struct tracer *next; 340 struct tracer *next;
254 int print_max; 341 int print_max;
342 struct tracer_flags *flags;
255}; 343};
256 344
257struct trace_seq { 345struct trace_seq {
@@ -279,8 +367,11 @@ struct trace_iterator {
279 unsigned long iter_flags; 367 unsigned long iter_flags;
280 loff_t pos; 368 loff_t pos;
281 long idx; 369 long idx;
370
371 cpumask_t started;
282}; 372};
283 373
374int tracing_is_enabled(void);
284void trace_wake_up(void); 375void trace_wake_up(void);
285void tracing_reset(struct trace_array *tr, int cpu); 376void tracing_reset(struct trace_array *tr, int cpu);
286int tracing_open_generic(struct inode *inode, struct file *filp); 377int tracing_open_generic(struct inode *inode, struct file *filp);
@@ -321,8 +412,17 @@ void trace_function(struct trace_array *tr,
321 unsigned long parent_ip, 412 unsigned long parent_ip,
322 unsigned long flags, int pc); 413 unsigned long flags, int pc);
323 414
415void trace_graph_return(struct ftrace_graph_ret *trace);
416int trace_graph_entry(struct ftrace_graph_ent *trace);
417void trace_bts(struct trace_array *tr,
418 unsigned long from,
419 unsigned long to);
420
324void tracing_start_cmdline_record(void); 421void tracing_start_cmdline_record(void);
325void tracing_stop_cmdline_record(void); 422void tracing_stop_cmdline_record(void);
423void tracing_sched_switch_assign_trace(struct trace_array *tr);
424void tracing_stop_sched_switch_record(void);
425void tracing_start_sched_switch_record(void);
326int register_tracer(struct tracer *type); 426int register_tracer(struct tracer *type);
327void unregister_tracer(struct tracer *type); 427void unregister_tracer(struct tracer *type);
328 428
@@ -358,6 +458,7 @@ struct tracer_switch_ops {
358 struct tracer_switch_ops *next; 458 struct tracer_switch_ops *next;
359}; 459};
360 460
461char *trace_find_cmdline(int pid);
361#endif /* CONFIG_CONTEXT_SWITCH_TRACER */ 462#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
362 463
363#ifdef CONFIG_DYNAMIC_FTRACE 464#ifdef CONFIG_DYNAMIC_FTRACE
@@ -383,19 +484,79 @@ extern int trace_selftest_startup_sched_switch(struct tracer *trace,
383 struct trace_array *tr); 484 struct trace_array *tr);
384extern int trace_selftest_startup_sysprof(struct tracer *trace, 485extern int trace_selftest_startup_sysprof(struct tracer *trace,
385 struct trace_array *tr); 486 struct trace_array *tr);
487extern int trace_selftest_startup_branch(struct tracer *trace,
488 struct trace_array *tr);
386#endif /* CONFIG_FTRACE_STARTUP_TEST */ 489#endif /* CONFIG_FTRACE_STARTUP_TEST */
387 490
388extern void *head_page(struct trace_array_cpu *data); 491extern void *head_page(struct trace_array_cpu *data);
389extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...); 492extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...);
390extern void trace_seq_print_cont(struct trace_seq *s, 493extern void trace_seq_print_cont(struct trace_seq *s,
391 struct trace_iterator *iter); 494 struct trace_iterator *iter);
495
496extern int
497seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
498 unsigned long sym_flags);
392extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, 499extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
393 size_t cnt); 500 size_t cnt);
394extern long ns2usecs(cycle_t nsec); 501extern long ns2usecs(cycle_t nsec);
395extern int trace_vprintk(unsigned long ip, const char *fmt, va_list args); 502extern int
503trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args);
396 504
397extern unsigned long trace_flags; 505extern unsigned long trace_flags;
398 506
507/* Standard output formatting function used for function return traces */
508#ifdef CONFIG_FUNCTION_GRAPH_TRACER
509extern enum print_line_t print_graph_function(struct trace_iterator *iter);
510
511#ifdef CONFIG_DYNAMIC_FTRACE
512/* TODO: make this variable */
513#define FTRACE_GRAPH_MAX_FUNCS 32
514extern int ftrace_graph_count;
515extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS];
516
517static inline int ftrace_graph_addr(unsigned long addr)
518{
519 int i;
520
521 if (!ftrace_graph_count || test_tsk_trace_graph(current))
522 return 1;
523
524 for (i = 0; i < ftrace_graph_count; i++) {
525 if (addr == ftrace_graph_funcs[i])
526 return 1;
527 }
528
529 return 0;
530}
531#else
532static inline int ftrace_trace_addr(unsigned long addr)
533{
534 return 1;
535}
536static inline int ftrace_graph_addr(unsigned long addr)
537{
538 return 1;
539}
540#endif /* CONFIG_DYNAMIC_FTRACE */
541
542#else /* CONFIG_FUNCTION_GRAPH_TRACER */
543static inline enum print_line_t
544print_graph_function(struct trace_iterator *iter)
545{
546 return TRACE_TYPE_UNHANDLED;
547}
548#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
549
550extern struct pid *ftrace_pid_trace;
551
552static inline int ftrace_trace_task(struct task_struct *task)
553{
554 if (!ftrace_pid_trace)
555 return 1;
556
557 return test_tsk_trace_trace(task);
558}
559
399/* 560/*
400 * trace_iterator_flags is an enumeration that defines bit 561 * trace_iterator_flags is an enumeration that defines bit
401 * positions into trace_flags that controls the output. 562 * positions into trace_flags that controls the output.
@@ -415,8 +576,92 @@ enum trace_iterator_flags {
415 TRACE_ITER_STACKTRACE = 0x100, 576 TRACE_ITER_STACKTRACE = 0x100,
416 TRACE_ITER_SCHED_TREE = 0x200, 577 TRACE_ITER_SCHED_TREE = 0x200,
417 TRACE_ITER_PRINTK = 0x400, 578 TRACE_ITER_PRINTK = 0x400,
579 TRACE_ITER_PREEMPTONLY = 0x800,
580 TRACE_ITER_BRANCH = 0x1000,
581 TRACE_ITER_ANNOTATE = 0x2000,
582 TRACE_ITER_USERSTACKTRACE = 0x4000,
583 TRACE_ITER_SYM_USEROBJ = 0x8000
418}; 584};
419 585
586/*
587 * TRACE_ITER_SYM_MASK masks the options in trace_flags that
588 * control the output of kernel symbols.
589 */
590#define TRACE_ITER_SYM_MASK \
591 (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR)
592
420extern struct tracer nop_trace; 593extern struct tracer nop_trace;
421 594
595/**
596 * ftrace_preempt_disable - disable preemption scheduler safe
597 *
598 * When tracing can happen inside the scheduler, there exists
599 * cases that the tracing might happen before the need_resched
600 * flag is checked. If this happens and the tracer calls
601 * preempt_enable (after a disable), a schedule might take place
602 * causing an infinite recursion.
603 *
 604 * To prevent this, we read the need_resched flag before
605 * disabling preemption. When we want to enable preemption we
606 * check the flag, if it is set, then we call preempt_enable_no_resched.
607 * Otherwise, we call preempt_enable.
608 *
 609 * The rationale for doing the above is that if need_resched is set
610 * and we have yet to reschedule, we are either in an atomic location
611 * (where we do not need to check for scheduling) or we are inside
612 * the scheduler and do not want to resched.
613 */
614static inline int ftrace_preempt_disable(void)
615{
616 int resched;
617
618 resched = need_resched();
619 preempt_disable_notrace();
620
621 return resched;
622}
623
624/**
625 * ftrace_preempt_enable - enable preemption scheduler safe
626 * @resched: the return value from ftrace_preempt_disable
627 *
628 * This is a scheduler safe way to enable preemption and not miss
 629 * any preemption checks. The earlier disable saved the state of preemption.
630 * If resched is set, then we were either inside an atomic or
631 * are inside the scheduler (we would have already scheduled
632 * otherwise). In this case, we do not want to call normal
633 * preempt_enable, but preempt_enable_no_resched instead.
634 */
635static inline void ftrace_preempt_enable(int resched)
636{
637 if (resched)
638 preempt_enable_no_resched_notrace();
639 else
640 preempt_enable_notrace();
641}
642
643#ifdef CONFIG_BRANCH_TRACER
644extern int enable_branch_tracing(struct trace_array *tr);
645extern void disable_branch_tracing(void);
646static inline int trace_branch_enable(struct trace_array *tr)
647{
648 if (trace_flags & TRACE_ITER_BRANCH)
649 return enable_branch_tracing(tr);
650 return 0;
651}
652static inline void trace_branch_disable(void)
653{
654 /* due to races, always disable */
655 disable_branch_tracing();
656}
657#else
658static inline int trace_branch_enable(struct trace_array *tr)
659{
660 return 0;
661}
662static inline void trace_branch_disable(void)
663{
664}
665#endif /* CONFIG_BRANCH_TRACER */
666
422#endif /* _LINUX_KERNEL_TRACE_H */ 667#endif /* _LINUX_KERNEL_TRACE_H */
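Editor's note: as a quick illustration of the tracer_opt/tracer_flags/set_flag interface added to trace.h above, here is a hedged sketch of how a tracer might wire up per-tracer options (the function graph tracer further down does essentially this). The "mytracer" name, the myopt-* option names and the MYTRACER_* masks are made up for the example; it assumes the struct tracer, tracer_opt, tracer_flags and TRACER_OPT definitions from the header above.

/* Illustrative option bits; not from the kernel source. */
#define MYTRACER_OPT_VERBOSE	0x1
#define MYTRACER_OPT_RAW	0x2

static struct tracer_opt mytracer_opts[] = {
	/* These names would show up in the trace_options file */
	{ TRACER_OPT(myopt-verbose, MYTRACER_OPT_VERBOSE) },
	{ TRACER_OPT(myopt-raw,     MYTRACER_OPT_RAW) },
	{ }	/* empty entry terminates the array */
};

static struct tracer_flags mytracer_flags = {
	.val  = MYTRACER_OPT_VERBOSE,	/* initial value, as the header comment requires */
	.opts = mytracer_opts,
};

static int mytracer_init(struct trace_array *tr)
{
	return 0;	/* the new init() reports failure via its return value */
}

static void mytracer_reset(struct trace_array *tr)
{
}

/* Called from set_tracer_option(); return 0 once the flag is handled. */
static int mytracer_set_flag(u32 old_flags, u32 bit, int set)
{
	if (bit == MYTRACER_OPT_RAW) {
		/* react to "echo [no]myopt-raw > trace_options" here */
		return 0;
	}
	/* the remaining bits need no special handling in this sketch */
	return 0;
}

static struct tracer mytracer __read_mostly = {
	.name		= "mytracer",
	.init		= mytracer_init,
	.reset		= mytracer_reset,
	.set_flag	= mytracer_set_flag,
	.flags		= &mytracer_flags,
};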
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index d0a5e50eeff2..a4fa2c57e34e 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -13,73 +13,117 @@
13#include "trace.h" 13#include "trace.h"
14 14
15static struct trace_array *boot_trace; 15static struct trace_array *boot_trace;
16static int trace_boot_enabled; 16static bool pre_initcalls_finished;
17 17
18 18/* Tells the boot tracer that the pre_smp_initcalls are finished.
19/* Should be started after do_pre_smp_initcalls() in init/main.c */ 19 * So we are ready.
20 * It doesn't enable sched events tracing however.
21 * You have to call enable_boot_trace to do so.
22 */
20void start_boot_trace(void) 23void start_boot_trace(void)
21{ 24{
22 trace_boot_enabled = 1; 25 pre_initcalls_finished = true;
23} 26}
24 27
25void stop_boot_trace(void) 28void enable_boot_trace(void)
26{ 29{
27 trace_boot_enabled = 0; 30 if (pre_initcalls_finished)
31 tracing_start_sched_switch_record();
28} 32}
29 33
30void reset_boot_trace(struct trace_array *tr) 34void disable_boot_trace(void)
31{ 35{
32 stop_boot_trace(); 36 if (pre_initcalls_finished)
37 tracing_stop_sched_switch_record();
33} 38}
34 39
35static void boot_trace_init(struct trace_array *tr) 40static void reset_boot_trace(struct trace_array *tr)
36{ 41{
37 int cpu; 42 int cpu;
38 boot_trace = tr;
39 43
40 trace_boot_enabled = 0; 44 tr->time_start = ftrace_now(tr->cpu);
45
46 for_each_online_cpu(cpu)
47 tracing_reset(tr, cpu);
48}
49
50static int boot_trace_init(struct trace_array *tr)
51{
52 int cpu;
53 boot_trace = tr;
41 54
42 for_each_cpu_mask(cpu, cpu_possible_map) 55 for_each_cpu_mask(cpu, cpu_possible_map)
43 tracing_reset(tr, cpu); 56 tracing_reset(tr, cpu);
57
58 tracing_sched_switch_assign_trace(tr);
59 return 0;
44} 60}
45 61
46static void boot_trace_ctrl_update(struct trace_array *tr) 62static enum print_line_t
63initcall_call_print_line(struct trace_iterator *iter)
47{ 64{
48 if (tr->ctrl) 65 struct trace_entry *entry = iter->ent;
49 start_boot_trace(); 66 struct trace_seq *s = &iter->seq;
67 struct trace_boot_call *field;
68 struct boot_trace_call *call;
69 u64 ts;
70 unsigned long nsec_rem;
71 int ret;
72
73 trace_assign_type(field, entry);
74 call = &field->boot_call;
75 ts = iter->ts;
76 nsec_rem = do_div(ts, 1000000000);
77
78 ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n",
79 (unsigned long)ts, nsec_rem, call->func, call->caller);
80
81 if (!ret)
82 return TRACE_TYPE_PARTIAL_LINE;
50 else 83 else
51 stop_boot_trace(); 84 return TRACE_TYPE_HANDLED;
52} 85}
53 86
54static enum print_line_t initcall_print_line(struct trace_iterator *iter) 87static enum print_line_t
88initcall_ret_print_line(struct trace_iterator *iter)
55{ 89{
56 int ret;
57 struct trace_entry *entry = iter->ent; 90 struct trace_entry *entry = iter->ent;
58 struct trace_boot *field = (struct trace_boot *)entry;
59 struct boot_trace *it = &field->initcall;
60 struct trace_seq *s = &iter->seq; 91 struct trace_seq *s = &iter->seq;
61 struct timespec calltime = ktime_to_timespec(it->calltime); 92 struct trace_boot_ret *field;
62 struct timespec rettime = ktime_to_timespec(it->rettime); 93 struct boot_trace_ret *init_ret;
63 94 u64 ts;
64 if (entry->type == TRACE_BOOT) { 95 unsigned long nsec_rem;
65 ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n", 96 int ret;
66 calltime.tv_sec, 97
67 calltime.tv_nsec, 98 trace_assign_type(field, entry);
68 it->func, it->caller); 99 init_ret = &field->boot_ret;
69 if (!ret) 100 ts = iter->ts;
70 return TRACE_TYPE_PARTIAL_LINE; 101 nsec_rem = do_div(ts, 1000000000);
71 102
72 ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s " 103 ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s "
73 "returned %d after %lld msecs\n", 104 "returned %d after %llu msecs\n",
74 rettime.tv_sec, 105 (unsigned long) ts,
75 rettime.tv_nsec, 106 nsec_rem,
76 it->func, it->result, it->duration); 107 init_ret->func, init_ret->result, init_ret->duration);
77 108
78 if (!ret) 109 if (!ret)
79 return TRACE_TYPE_PARTIAL_LINE; 110 return TRACE_TYPE_PARTIAL_LINE;
111 else
80 return TRACE_TYPE_HANDLED; 112 return TRACE_TYPE_HANDLED;
113}
114
115static enum print_line_t initcall_print_line(struct trace_iterator *iter)
116{
117 struct trace_entry *entry = iter->ent;
118
119 switch (entry->type) {
120 case TRACE_BOOT_CALL:
121 return initcall_call_print_line(iter);
122 case TRACE_BOOT_RET:
123 return initcall_ret_print_line(iter);
124 default:
125 return TRACE_TYPE_UNHANDLED;
81 } 126 }
82 return TRACE_TYPE_UNHANDLED;
83} 127}
84 128
85struct tracer boot_tracer __read_mostly = 129struct tracer boot_tracer __read_mostly =
@@ -87,27 +131,53 @@ struct tracer boot_tracer __read_mostly =
87 .name = "initcall", 131 .name = "initcall",
88 .init = boot_trace_init, 132 .init = boot_trace_init,
89 .reset = reset_boot_trace, 133 .reset = reset_boot_trace,
90 .ctrl_update = boot_trace_ctrl_update,
91 .print_line = initcall_print_line, 134 .print_line = initcall_print_line,
92}; 135};
93 136
94void trace_boot(struct boot_trace *it, initcall_t fn) 137void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
95{ 138{
96 struct ring_buffer_event *event; 139 struct ring_buffer_event *event;
97 struct trace_boot *entry; 140 struct trace_boot_call *entry;
98 struct trace_array_cpu *data;
99 unsigned long irq_flags; 141 unsigned long irq_flags;
100 struct trace_array *tr = boot_trace; 142 struct trace_array *tr = boot_trace;
101 143
102 if (!trace_boot_enabled) 144 if (!pre_initcalls_finished)
103 return; 145 return;
104 146
105 /* Get its name now since this function could 147 /* Get its name now since this function could
106 * disappear because it is in the .init section. 148 * disappear because it is in the .init section.
107 */ 149 */
108 sprint_symbol(it->func, (unsigned long)fn); 150 sprint_symbol(bt->func, (unsigned long)fn);
151 preempt_disable();
152
153 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
154 &irq_flags);
155 if (!event)
156 goto out;
157 entry = ring_buffer_event_data(event);
158 tracing_generic_entry_update(&entry->ent, 0, 0);
159 entry->ent.type = TRACE_BOOT_CALL;
160 entry->boot_call = *bt;
161 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
162
163 trace_wake_up();
164
165 out:
166 preempt_enable();
167}
168
169void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
170{
171 struct ring_buffer_event *event;
172 struct trace_boot_ret *entry;
173 unsigned long irq_flags;
174 struct trace_array *tr = boot_trace;
175
176 if (!pre_initcalls_finished)
177 return;
178
179 sprint_symbol(bt->func, (unsigned long)fn);
109 preempt_disable(); 180 preempt_disable();
110 data = tr->data[smp_processor_id()];
111 181
112 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), 182 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
113 &irq_flags); 183 &irq_flags);
@@ -115,8 +185,8 @@ void trace_boot(struct boot_trace *it, initcall_t fn)
115 goto out; 185 goto out;
116 entry = ring_buffer_event_data(event); 186 entry = ring_buffer_event_data(event);
117 tracing_generic_entry_update(&entry->ent, 0, 0); 187 tracing_generic_entry_update(&entry->ent, 0, 0);
118 entry->ent.type = TRACE_BOOT; 188 entry->ent.type = TRACE_BOOT_RET;
119 entry->initcall = *it; 189 entry->boot_ret = *bt;
120 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 190 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
121 191
122 trace_wake_up(); 192 trace_wake_up();
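Editor's note: both trace_boot_call() and trace_boot_ret() above follow the same reserve/fill/commit pattern for writing an event into the ring buffer. A hedged, generic sketch of that pattern follows; my_entry, TRACE_MY_TYPE and my_trace_something() are placeholder names, while the helpers (ring_buffer_lock_reserve() and friends) are the ones used in the hunks above.

/* Sketch only: the reserve -> fill -> commit sequence used above. */
struct my_entry {
	struct trace_entry	ent;
	unsigned long		payload;	/* whatever the event records */
};

static void my_trace_something(struct trace_array *tr, unsigned long payload)
{
	struct ring_buffer_event *event;
	struct my_entry *entry;
	unsigned long irq_flags;

	preempt_disable();	/* stay on one CPU while touching the buffer */

	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
					 &irq_flags);
	if (!event)
		goto out;	/* buffer full or disabled: drop the event */

	entry = ring_buffer_event_data(event);
	tracing_generic_entry_update(&entry->ent, 0, 0);
	entry->ent.type = TRACE_MY_TYPE;	/* placeholder type, not defined in the kernel */
	entry->payload  = payload;

	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
	trace_wake_up();	/* poke readers blocked on trace_pipe */
 out:
	preempt_enable();
}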
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
new file mode 100644
index 000000000000..6c00feb3bac7
--- /dev/null
+++ b/kernel/trace/trace_branch.c
@@ -0,0 +1,342 @@
1/*
2 * unlikely profiler
3 *
4 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
5 */
6#include <linux/kallsyms.h>
7#include <linux/seq_file.h>
8#include <linux/spinlock.h>
9#include <linux/irqflags.h>
10#include <linux/debugfs.h>
11#include <linux/uaccess.h>
12#include <linux/module.h>
13#include <linux/ftrace.h>
14#include <linux/hash.h>
15#include <linux/fs.h>
16#include <asm/local.h>
17#include "trace.h"
18
19#ifdef CONFIG_BRANCH_TRACER
20
21static int branch_tracing_enabled __read_mostly;
22static DEFINE_MUTEX(branch_tracing_mutex);
23static struct trace_array *branch_tracer;
24
25static void
26probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
27{
28 struct trace_array *tr = branch_tracer;
29 struct ring_buffer_event *event;
30 struct trace_branch *entry;
31 unsigned long flags, irq_flags;
32 int cpu, pc;
33 const char *p;
34
35 /*
36 * I would love to save just the ftrace_likely_data pointer, but
37 * this code can also be used by modules. Ugly things can happen
38 * if the module is unloaded, and then we go and read the
39 * pointer. This is slower, but much safer.
40 */
41
42 if (unlikely(!tr))
43 return;
44
45 local_irq_save(flags);
46 cpu = raw_smp_processor_id();
47 if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
48 goto out;
49
50 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
51 &irq_flags);
52 if (!event)
53 goto out;
54
55 pc = preempt_count();
56 entry = ring_buffer_event_data(event);
57 tracing_generic_entry_update(&entry->ent, flags, pc);
58 entry->ent.type = TRACE_BRANCH;
59
60 /* Strip off the path, only save the file */
61 p = f->file + strlen(f->file);
62 while (p >= f->file && *p != '/')
63 p--;
64 p++;
65
66 strncpy(entry->func, f->func, TRACE_FUNC_SIZE);
67 strncpy(entry->file, p, TRACE_FILE_SIZE);
68 entry->func[TRACE_FUNC_SIZE] = 0;
69 entry->file[TRACE_FILE_SIZE] = 0;
70 entry->line = f->line;
71 entry->correct = val == expect;
72
73 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
74
75 out:
76 atomic_dec(&tr->data[cpu]->disabled);
77 local_irq_restore(flags);
78}
79
80static inline
81void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
82{
83 if (!branch_tracing_enabled)
84 return;
85
86 probe_likely_condition(f, val, expect);
87}
88
89int enable_branch_tracing(struct trace_array *tr)
90{
91 int ret = 0;
92
93 mutex_lock(&branch_tracing_mutex);
94 branch_tracer = tr;
95 /*
96 * Must be seen before enabling. The reader is a condition
97 * where we do not need a matching rmb()
98 */
99 smp_wmb();
100 branch_tracing_enabled++;
101 mutex_unlock(&branch_tracing_mutex);
102
103 return ret;
104}
105
106void disable_branch_tracing(void)
107{
108 mutex_lock(&branch_tracing_mutex);
109
110 if (!branch_tracing_enabled)
111 goto out_unlock;
112
113 branch_tracing_enabled--;
114
115 out_unlock:
116 mutex_unlock(&branch_tracing_mutex);
117}
118
119static void start_branch_trace(struct trace_array *tr)
120{
121 enable_branch_tracing(tr);
122}
123
124static void stop_branch_trace(struct trace_array *tr)
125{
126 disable_branch_tracing();
127}
128
129static int branch_trace_init(struct trace_array *tr)
130{
131 int cpu;
132
133 for_each_online_cpu(cpu)
134 tracing_reset(tr, cpu);
135
136 start_branch_trace(tr);
137 return 0;
138}
139
140static void branch_trace_reset(struct trace_array *tr)
141{
142 stop_branch_trace(tr);
143}
144
145struct tracer branch_trace __read_mostly =
146{
147 .name = "branch",
148 .init = branch_trace_init,
149 .reset = branch_trace_reset,
150#ifdef CONFIG_FTRACE_SELFTEST
151 .selftest = trace_selftest_startup_branch,
152#endif
153};
154
155__init static int init_branch_trace(void)
156{
157 return register_tracer(&branch_trace);
158}
159
160device_initcall(init_branch_trace);
161#else
162static inline
163void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
164{
165}
166#endif /* CONFIG_BRANCH_TRACER */
167
168void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect)
169{
170 /*
171 * I would love to have a trace point here instead, but the
172 * trace point code is so inundated with unlikely and likely
173 * conditions that the recursive nightmare that exists is too
174 * much to try to get working. At least for now.
175 */
176 trace_likely_condition(f, val, expect);
177
178 /* FIXME: Make this atomic! */
179 if (val == expect)
180 f->correct++;
181 else
182 f->incorrect++;
183}
184EXPORT_SYMBOL(ftrace_likely_update);
185
186struct ftrace_pointer {
187 void *start;
188 void *stop;
189 int hit;
190};
191
192static void *
193t_next(struct seq_file *m, void *v, loff_t *pos)
194{
195 const struct ftrace_pointer *f = m->private;
196 struct ftrace_branch_data *p = v;
197
198 (*pos)++;
199
200 if (v == (void *)1)
201 return f->start;
202
203 ++p;
204
205 if ((void *)p >= (void *)f->stop)
206 return NULL;
207
208 return p;
209}
210
211static void *t_start(struct seq_file *m, loff_t *pos)
212{
213 void *t = (void *)1;
214 loff_t l = 0;
215
216 for (; t && l < *pos; t = t_next(m, t, &l))
217 ;
218
219 return t;
220}
221
222static void t_stop(struct seq_file *m, void *p)
223{
224}
225
226static int t_show(struct seq_file *m, void *v)
227{
228 const struct ftrace_pointer *fp = m->private;
229 struct ftrace_branch_data *p = v;
230 const char *f;
231 long percent;
232
233 if (v == (void *)1) {
234 if (fp->hit)
235 seq_printf(m, " miss hit %% ");
236 else
237 seq_printf(m, " correct incorrect %% ");
238 seq_printf(m, " Function "
239 " File Line\n"
240 " ------- --------- - "
241 " -------- "
242 " ---- ----\n");
243 return 0;
244 }
245
246 /* Only print the file, not the path */
247 f = p->file + strlen(p->file);
248 while (f >= p->file && *f != '/')
249 f--;
250 f++;
251
252 /*
 253	 * The miss is overlaid on correct, and hit on incorrect.
254 */
255 if (p->correct) {
256 percent = p->incorrect * 100;
257 percent /= p->correct + p->incorrect;
258 } else
259 percent = p->incorrect ? 100 : -1;
260
261 seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect);
262 if (percent < 0)
263 seq_printf(m, " X ");
264 else
265 seq_printf(m, "%3ld ", percent);
266 seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line);
267 return 0;
268}
269
270static struct seq_operations tracing_likely_seq_ops = {
271 .start = t_start,
272 .next = t_next,
273 .stop = t_stop,
274 .show = t_show,
275};
276
277static int tracing_branch_open(struct inode *inode, struct file *file)
278{
279 int ret;
280
281 ret = seq_open(file, &tracing_likely_seq_ops);
282 if (!ret) {
283 struct seq_file *m = file->private_data;
284 m->private = (void *)inode->i_private;
285 }
286
287 return ret;
288}
289
290static const struct file_operations tracing_branch_fops = {
291 .open = tracing_branch_open,
292 .read = seq_read,
293 .llseek = seq_lseek,
294};
295
296#ifdef CONFIG_PROFILE_ALL_BRANCHES
297extern unsigned long __start_branch_profile[];
298extern unsigned long __stop_branch_profile[];
299
300static const struct ftrace_pointer ftrace_branch_pos = {
301 .start = __start_branch_profile,
302 .stop = __stop_branch_profile,
303 .hit = 1,
304};
305
306#endif /* CONFIG_PROFILE_ALL_BRANCHES */
307
308extern unsigned long __start_annotated_branch_profile[];
309extern unsigned long __stop_annotated_branch_profile[];
310
311static const struct ftrace_pointer ftrace_annotated_branch_pos = {
312 .start = __start_annotated_branch_profile,
313 .stop = __stop_annotated_branch_profile,
314};
315
316static __init int ftrace_branch_init(void)
317{
318 struct dentry *d_tracer;
319 struct dentry *entry;
320
321 d_tracer = tracing_init_dentry();
322
323 entry = debugfs_create_file("profile_annotated_branch", 0444, d_tracer,
324 (void *)&ftrace_annotated_branch_pos,
325 &tracing_branch_fops);
326 if (!entry)
327 pr_warning("Could not create debugfs "
 328			   "'profile_annotated_branch' entry\n");
329
330#ifdef CONFIG_PROFILE_ALL_BRANCHES
331 entry = debugfs_create_file("profile_branch", 0444, d_tracer,
332 (void *)&ftrace_branch_pos,
333 &tracing_branch_fops);
334 if (!entry)
335 pr_warning("Could not create debugfs"
336 " 'profile_branch' entry\n");
337#endif
338
339 return 0;
340}
341
342device_initcall(ftrace_branch_init);
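Editor's note: the percentage printed by t_show() above scales the miss/incorrect count by 100 and divides by the total, with -1 (rendered as "X") meaning there is nothing to report yet. A standalone, hedged restatement of that rule, with invented sample values:

#include <stdio.h>

/* Same rule as t_show(): percentage of mispredictions, or -1 when
 * no ratio can be computed (shown as "X" in the debugfs file). */
static long branch_percent(unsigned long correct, unsigned long incorrect)
{
	if (correct)
		return (long)(incorrect * 100 / (correct + incorrect));
	return incorrect ? 100 : -1;
}

int main(void)
{
	printf("%ld\n", branch_percent(7, 3));	/* 30: 3 misses out of 10 */
	printf("%ld\n", branch_percent(0, 5));	/* 100: never predicted right */
	printf("%ld\n", branch_percent(0, 0));	/* -1: no samples yet */
	return 0;
}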
diff --git a/kernel/trace/trace_bts.c b/kernel/trace/trace_bts.c
new file mode 100644
index 000000000000..23b76e4690ef
--- /dev/null
+++ b/kernel/trace/trace_bts.c
@@ -0,0 +1,276 @@
1/*
2 * BTS tracer
3 *
4 * Copyright (C) 2008 Markus Metzger <markus.t.metzger@gmail.com>
5 *
6 */
7
8#include <linux/module.h>
9#include <linux/fs.h>
10#include <linux/debugfs.h>
11#include <linux/ftrace.h>
12#include <linux/kallsyms.h>
13
14#include <asm/ds.h>
15
16#include "trace.h"
17
18
19#define SIZEOF_BTS (1 << 13)
20
21static DEFINE_PER_CPU(struct bts_tracer *, tracer);
22static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer);
23
24#define this_tracer per_cpu(tracer, smp_processor_id())
25#define this_buffer per_cpu(buffer, smp_processor_id())
26
27
28/*
29 * Information to interpret a BTS record.
30 * This will go into an in-kernel BTS interface.
31 */
32static unsigned char sizeof_field;
33static unsigned long debugctl_mask;
34
35#define sizeof_bts (3 * sizeof_field)
36
37static void bts_trace_cpuinit(struct cpuinfo_x86 *c)
38{
39 switch (c->x86) {
40 case 0x6:
41 switch (c->x86_model) {
42 case 0x0 ... 0xC:
43 break;
44 case 0xD:
45 case 0xE: /* Pentium M */
46 sizeof_field = sizeof(long);
47 debugctl_mask = (1<<6)|(1<<7);
48 break;
49 default:
50 sizeof_field = 8;
51 debugctl_mask = (1<<6)|(1<<7);
52 break;
53 }
54 break;
55 case 0xF:
56 switch (c->x86_model) {
57 case 0x0:
58 case 0x1:
59 case 0x2: /* Netburst */
60 sizeof_field = sizeof(long);
61 debugctl_mask = (1<<2)|(1<<3);
62 break;
63 default:
64 /* sorry, don't know about them */
65 break;
66 }
67 break;
68 default:
69 /* sorry, don't know about them */
70 break;
71 }
72}
73
74static inline void bts_enable(void)
75{
76 unsigned long debugctl;
77
78 rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
79 wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl | debugctl_mask);
80}
81
82static inline void bts_disable(void)
83{
84 unsigned long debugctl;
85
86 rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
87 wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl & ~debugctl_mask);
88}
89
90static void bts_trace_reset(struct trace_array *tr)
91{
92 int cpu;
93
94 tr->time_start = ftrace_now(tr->cpu);
95
96 for_each_online_cpu(cpu)
97 tracing_reset(tr, cpu);
98}
99
100static void bts_trace_start_cpu(void *arg)
101{
102 this_tracer =
103 ds_request_bts(/* task = */ NULL, this_buffer, SIZEOF_BTS,
104 /* ovfl = */ NULL, /* th = */ (size_t)-1);
105 if (IS_ERR(this_tracer)) {
106 this_tracer = NULL;
107 return;
108 }
109
110 bts_enable();
111}
112
113static void bts_trace_start(struct trace_array *tr)
114{
115 int cpu;
116
117 bts_trace_reset(tr);
118
119 for_each_cpu_mask(cpu, cpu_possible_map)
120 smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1);
121}
122
123static void bts_trace_stop_cpu(void *arg)
124{
125 if (this_tracer) {
126 bts_disable();
127
128 ds_release_bts(this_tracer);
129 this_tracer = NULL;
130 }
131}
132
133static void bts_trace_stop(struct trace_array *tr)
134{
135 int cpu;
136
137 for_each_cpu_mask(cpu, cpu_possible_map)
138 smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1);
139}
140
141static int bts_trace_init(struct trace_array *tr)
142{
143 bts_trace_cpuinit(&boot_cpu_data);
144 bts_trace_reset(tr);
145 bts_trace_start(tr);
146
147 return 0;
148}
149
150static void bts_trace_print_header(struct seq_file *m)
151{
152#ifdef __i386__
153 seq_puts(m, "# CPU# FROM TO FUNCTION\n");
154 seq_puts(m, "# | | | |\n");
155#else
156 seq_puts(m,
157 "# CPU# FROM TO FUNCTION\n");
158 seq_puts(m,
159 "# | | | |\n");
160#endif
161}
162
163static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
164{
165 struct trace_entry *entry = iter->ent;
166 struct trace_seq *seq = &iter->seq;
167 struct bts_entry *it;
168
169 trace_assign_type(it, entry);
170
171 if (entry->type == TRACE_BTS) {
172 int ret;
173#ifdef CONFIG_KALLSYMS
174 char function[KSYM_SYMBOL_LEN];
175 sprint_symbol(function, it->from);
176#else
177 char *function = "<unknown>";
178#endif
179
180 ret = trace_seq_printf(seq, "%4d 0x%lx -> 0x%lx [%s]\n",
181 entry->cpu, it->from, it->to, function);
182 if (!ret)
 183			return TRACE_TYPE_PARTIAL_LINE;
184 return TRACE_TYPE_HANDLED;
185 }
186 return TRACE_TYPE_UNHANDLED;
187}
188
189void trace_bts(struct trace_array *tr, unsigned long from, unsigned long to)
190{
191 struct ring_buffer_event *event;
192 struct bts_entry *entry;
193 unsigned long irq;
194
195 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq);
196 if (!event)
197 return;
198 entry = ring_buffer_event_data(event);
199 tracing_generic_entry_update(&entry->ent, 0, from);
200 entry->ent.type = TRACE_BTS;
201 entry->ent.cpu = smp_processor_id();
202 entry->from = from;
203 entry->to = to;
204 ring_buffer_unlock_commit(tr->buffer, event, irq);
205}
206
207static void trace_bts_at(struct trace_array *tr, size_t index)
208{
209 const void *raw = NULL;
210 unsigned long from, to;
211 int err;
212
213 err = ds_access_bts(this_tracer, index, &raw);
214 if (err < 0)
215 return;
216
217 from = *(const unsigned long *)raw;
218 to = *(const unsigned long *)((const char *)raw + sizeof_field);
219
220 trace_bts(tr, from, to);
221}
222
223static void trace_bts_cpu(void *arg)
224{
225 struct trace_array *tr = (struct trace_array *) arg;
226 size_t index = 0, end = 0, i;
227 int err;
228
229 if (!this_tracer)
230 return;
231
232 bts_disable();
233
234 err = ds_get_bts_index(this_tracer, &index);
235 if (err < 0)
236 goto out;
237
238 err = ds_get_bts_end(this_tracer, &end);
239 if (err < 0)
240 goto out;
241
242 for (i = index; i < end; i++)
243 trace_bts_at(tr, i);
244
245 for (i = 0; i < index; i++)
246 trace_bts_at(tr, i);
247
248out:
249 bts_enable();
250}
251
252static void trace_bts_prepare(struct trace_iterator *iter)
253{
254 int cpu;
255
256 for_each_cpu_mask(cpu, cpu_possible_map)
257 smp_call_function_single(cpu, trace_bts_cpu, iter->tr, 1);
258}
259
260struct tracer bts_tracer __read_mostly =
261{
262 .name = "bts",
263 .init = bts_trace_init,
264 .reset = bts_trace_stop,
265 .print_header = bts_trace_print_header,
266 .print_line = bts_trace_print_line,
267 .start = bts_trace_start,
268 .stop = bts_trace_stop,
269 .open = trace_bts_prepare
270};
271
272__init static int init_bts_trace(void)
273{
274 return register_tracer(&bts_tracer);
275}
276device_initcall(init_bts_trace);
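Editor's note: trace_bts_cpu() above replays the hardware branch buffer in two passes, index..end-1 and then 0..index-1, because the buffer is circular and "index" (as reported by ds_get_bts_index()) is the next slot to be written; the two passes therefore emit records oldest-first. A hedged, self-contained illustration of that ordering, with invented slot contents:

#include <stdio.h>

int main(void)
{
	/* A circular buffer whose next write would land in slot 2,
	 * so slot 2 holds the oldest surviving record. */
	const char *slot[] = { "E", "F", "A", "B", "C", "D" };
	size_t end = sizeof(slot) / sizeof(slot[0]);
	size_t index = 2;	/* next write position */
	size_t i;

	for (i = index; i < end; i++)	/* older part first */
		printf("%s ", slot[i]);
	for (i = 0; i < index; i++)	/* then the most recent part */
		printf("%s ", slot[i]);
	printf("\n");			/* prints: A B C D E F */
	return 0;
}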
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 0f85a64003d3..e74f6d0a3216 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -42,24 +42,20 @@ static void stop_function_trace(struct trace_array *tr)
42 tracing_stop_cmdline_record(); 42 tracing_stop_cmdline_record();
43} 43}
44 44
45static void function_trace_init(struct trace_array *tr) 45static int function_trace_init(struct trace_array *tr)
46{ 46{
47 if (tr->ctrl) 47 start_function_trace(tr);
48 start_function_trace(tr); 48 return 0;
49} 49}
50 50
51static void function_trace_reset(struct trace_array *tr) 51static void function_trace_reset(struct trace_array *tr)
52{ 52{
53 if (tr->ctrl) 53 stop_function_trace(tr);
54 stop_function_trace(tr);
55} 54}
56 55
57static void function_trace_ctrl_update(struct trace_array *tr) 56static void function_trace_start(struct trace_array *tr)
58{ 57{
59 if (tr->ctrl) 58 function_reset(tr);
60 start_function_trace(tr);
61 else
62 stop_function_trace(tr);
63} 59}
64 60
65static struct tracer function_trace __read_mostly = 61static struct tracer function_trace __read_mostly =
@@ -67,7 +63,7 @@ static struct tracer function_trace __read_mostly =
67 .name = "function", 63 .name = "function",
68 .init = function_trace_init, 64 .init = function_trace_init,
69 .reset = function_trace_reset, 65 .reset = function_trace_reset,
70 .ctrl_update = function_trace_ctrl_update, 66 .start = function_trace_start,
71#ifdef CONFIG_FTRACE_SELFTEST 67#ifdef CONFIG_FTRACE_SELFTEST
72 .selftest = trace_selftest_startup_function, 68 .selftest = trace_selftest_startup_function,
73#endif 69#endif
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
new file mode 100644
index 000000000000..af60eef4cbcc
--- /dev/null
+++ b/kernel/trace/trace_functions_graph.c
@@ -0,0 +1,611 @@
1/*
2 *
3 * Function graph tracer.
4 * Copyright (c) 2008 Frederic Weisbecker <fweisbec@gmail.com>
5 * Mostly borrowed from function tracer which
6 * is Copyright (c) Steven Rostedt <srostedt@redhat.com>
7 *
8 */
9#include <linux/debugfs.h>
10#include <linux/uaccess.h>
11#include <linux/ftrace.h>
12#include <linux/fs.h>
13
14#include "trace.h"
15
16#define TRACE_GRAPH_INDENT 2
17
18/* Flag options */
19#define TRACE_GRAPH_PRINT_OVERRUN 0x1
20#define TRACE_GRAPH_PRINT_CPU 0x2
21#define TRACE_GRAPH_PRINT_OVERHEAD 0x4
22#define TRACE_GRAPH_PRINT_PROC 0x8
23
24static struct tracer_opt trace_opts[] = {
25 /* Display overruns ? */
26 { TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) },
27 /* Display CPU ? */
28 { TRACER_OPT(funcgraph-cpu, TRACE_GRAPH_PRINT_CPU) },
29 /* Display Overhead ? */
30 { TRACER_OPT(funcgraph-overhead, TRACE_GRAPH_PRINT_OVERHEAD) },
31 /* Display proc name/pid */
32 { TRACER_OPT(funcgraph-proc, TRACE_GRAPH_PRINT_PROC) },
33 { } /* Empty entry */
34};
35
36static struct tracer_flags tracer_flags = {
37 /* Don't display overruns and proc by default */
38 .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD,
39 .opts = trace_opts
40};
41
42/* pid on the last trace processed */
43static pid_t last_pid[NR_CPUS] = { [0 ... NR_CPUS-1] = -1 };
44
45static int graph_trace_init(struct trace_array *tr)
46{
47 int cpu, ret;
48
49 for_each_online_cpu(cpu)
50 tracing_reset(tr, cpu);
51
52 ret = register_ftrace_graph(&trace_graph_return,
53 &trace_graph_entry);
54 if (ret)
55 return ret;
56 tracing_start_cmdline_record();
57
58 return 0;
59}
60
61static void graph_trace_reset(struct trace_array *tr)
62{
63 tracing_stop_cmdline_record();
64 unregister_ftrace_graph();
65}
66
67static inline int log10_cpu(int nb)
68{
69 if (nb / 100)
70 return 3;
71 if (nb / 10)
72 return 2;
73 return 1;
74}
75
76static enum print_line_t
77print_graph_cpu(struct trace_seq *s, int cpu)
78{
79 int i;
80 int ret;
81 int log10_this = log10_cpu(cpu);
82 int log10_all = log10_cpu(cpus_weight_nr(cpu_online_map));
83
84
85 /*
86 * Start with a space character - to make it stand out
87 * to the right a bit when trace output is pasted into
88 * email:
89 */
90 ret = trace_seq_printf(s, " ");
91
92 /*
93 * Tricky - we space the CPU field according to the max
94 * number of online CPUs. On a 2-cpu system it would take
95 * a maximum of 1 digit - on a 128 cpu system it would
96 * take up to 3 digits:
97 */
98 for (i = 0; i < log10_all - log10_this; i++) {
99 ret = trace_seq_printf(s, " ");
100 if (!ret)
101 return TRACE_TYPE_PARTIAL_LINE;
102 }
103 ret = trace_seq_printf(s, "%d) ", cpu);
104 if (!ret)
105 return TRACE_TYPE_PARTIAL_LINE;
106
107 return TRACE_TYPE_HANDLED;
108}
109
110#define TRACE_GRAPH_PROCINFO_LENGTH 14
111
112static enum print_line_t
113print_graph_proc(struct trace_seq *s, pid_t pid)
114{
115 int i;
116 int ret;
117 int len;
118 char comm[8];
119 int spaces = 0;
120 /* sign + log10(MAX_INT) + '\0' */
121 char pid_str[11];
122
123 strncpy(comm, trace_find_cmdline(pid), 7);
124 comm[7] = '\0';
125 sprintf(pid_str, "%d", pid);
126
127 /* 1 stands for the "-" character */
128 len = strlen(comm) + strlen(pid_str) + 1;
129
130 if (len < TRACE_GRAPH_PROCINFO_LENGTH)
131 spaces = TRACE_GRAPH_PROCINFO_LENGTH - len;
132
133 /* First spaces to align center */
134 for (i = 0; i < spaces / 2; i++) {
135 ret = trace_seq_printf(s, " ");
136 if (!ret)
137 return TRACE_TYPE_PARTIAL_LINE;
138 }
139
140 ret = trace_seq_printf(s, "%s-%s", comm, pid_str);
141 if (!ret)
142 return TRACE_TYPE_PARTIAL_LINE;
143
144 /* Last spaces to align center */
145 for (i = 0; i < spaces - (spaces / 2); i++) {
146 ret = trace_seq_printf(s, " ");
147 if (!ret)
148 return TRACE_TYPE_PARTIAL_LINE;
149 }
150 return TRACE_TYPE_HANDLED;
151}
152
153
154/* If the pid changed since the last trace, output this event */
155static enum print_line_t
156verif_pid(struct trace_seq *s, pid_t pid, int cpu)
157{
158 pid_t prev_pid;
159 int ret;
160
161 if (last_pid[cpu] != -1 && last_pid[cpu] == pid)
162 return TRACE_TYPE_HANDLED;
163
164 prev_pid = last_pid[cpu];
165 last_pid[cpu] = pid;
166
167/*
168 * Context-switch trace line:
169
170 ------------------------------------------
171 | 1) migration/0--1 => sshd-1755
172 ------------------------------------------
173
174 */
175 ret = trace_seq_printf(s,
176 " ------------------------------------------\n");
177 if (!ret)
178		return TRACE_TYPE_PARTIAL_LINE;
179
180 ret = print_graph_cpu(s, cpu);
181 if (ret == TRACE_TYPE_PARTIAL_LINE)
182		return TRACE_TYPE_PARTIAL_LINE;
183
184 ret = print_graph_proc(s, prev_pid);
185 if (ret == TRACE_TYPE_PARTIAL_LINE)
186		return TRACE_TYPE_PARTIAL_LINE;
187
188 ret = trace_seq_printf(s, " => ");
189 if (!ret)
190		return TRACE_TYPE_PARTIAL_LINE;
191
192 ret = print_graph_proc(s, pid);
193 if (ret == TRACE_TYPE_PARTIAL_LINE)
194		return TRACE_TYPE_PARTIAL_LINE;
195
196 ret = trace_seq_printf(s,
197 "\n ------------------------------------------\n\n");
198 if (!ret)
199		return TRACE_TYPE_PARTIAL_LINE;
200
201	return TRACE_TYPE_HANDLED;
202}
203
204static bool
205trace_branch_is_leaf(struct trace_iterator *iter,
206 struct ftrace_graph_ent_entry *curr)
207{
208 struct ring_buffer_iter *ring_iter;
209 struct ring_buffer_event *event;
210 struct ftrace_graph_ret_entry *next;
211
212 ring_iter = iter->buffer_iter[iter->cpu];
213
214 if (!ring_iter)
215 return false;
216
217 event = ring_buffer_iter_peek(ring_iter, NULL);
218
219 if (!event)
220 return false;
221
222 next = ring_buffer_event_data(event);
223
224 if (next->ent.type != TRACE_GRAPH_RET)
225 return false;
226
227 if (curr->ent.pid != next->ent.pid ||
228 curr->graph_ent.func != next->ret.func)
229 return false;
230
231 return true;
232}
233
234
235static enum print_line_t
236print_graph_duration(unsigned long long duration, struct trace_seq *s)
237{
238 unsigned long nsecs_rem = do_div(duration, 1000);
239 /* log10(ULONG_MAX) + '\0' */
240 char msecs_str[21];
241 char nsecs_str[5];
242 int ret, len;
243 int i;
244
245 sprintf(msecs_str, "%lu", (unsigned long) duration);
246
247 /* Print msecs */
248 ret = trace_seq_printf(s, msecs_str);
249 if (!ret)
250 return TRACE_TYPE_PARTIAL_LINE;
251
252 len = strlen(msecs_str);
253
254	/* Print nsecs (we don't want to exceed 7 digits) */
255 if (len < 7) {
256 snprintf(nsecs_str, 8 - len, "%03lu", nsecs_rem);
257 ret = trace_seq_printf(s, ".%s", nsecs_str);
258 if (!ret)
259 return TRACE_TYPE_PARTIAL_LINE;
260 len += strlen(nsecs_str);
261 }
262
263 ret = trace_seq_printf(s, " us ");
264 if (!ret)
265 return TRACE_TYPE_PARTIAL_LINE;
266
267 /* Print remaining spaces to fit the row's width */
268 for (i = len; i < 7; i++) {
269 ret = trace_seq_printf(s, " ");
270 if (!ret)
271 return TRACE_TYPE_PARTIAL_LINE;
272 }
273
274 ret = trace_seq_printf(s, "| ");
275 if (!ret)
276 return TRACE_TYPE_PARTIAL_LINE;
277 return TRACE_TYPE_HANDLED;
278
279}
280
281/* Signal an execution-time overhead in the output */
282static int
283print_graph_overhead(unsigned long long duration, struct trace_seq *s)
284{
285 /* Duration exceeded 100 msecs */
286 if (duration > 100000ULL)
287 return trace_seq_printf(s, "! ");
288
289 /* Duration exceeded 10 msecs */
290 if (duration > 10000ULL)
291 return trace_seq_printf(s, "+ ");
292
293 return trace_seq_printf(s, " ");
294}
295
296/* Case of a leaf function on its call entry */
297static enum print_line_t
298print_graph_entry_leaf(struct trace_iterator *iter,
299 struct ftrace_graph_ent_entry *entry, struct trace_seq *s)
300{
301 struct ftrace_graph_ret_entry *ret_entry;
302 struct ftrace_graph_ret *graph_ret;
303 struct ring_buffer_event *event;
304 struct ftrace_graph_ent *call;
305 unsigned long long duration;
306 int ret;
307 int i;
308
309 event = ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
310 ret_entry = ring_buffer_event_data(event);
311 graph_ret = &ret_entry->ret;
312 call = &entry->graph_ent;
313 duration = graph_ret->rettime - graph_ret->calltime;
314
315 /* Overhead */
316 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
317 ret = print_graph_overhead(duration, s);
318 if (!ret)
319 return TRACE_TYPE_PARTIAL_LINE;
320 }
321
322 /* Duration */
323 ret = print_graph_duration(duration, s);
324 if (ret == TRACE_TYPE_PARTIAL_LINE)
325 return TRACE_TYPE_PARTIAL_LINE;
326
327 /* Function */
328 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
329 ret = trace_seq_printf(s, " ");
330 if (!ret)
331 return TRACE_TYPE_PARTIAL_LINE;
332 }
333
334 ret = seq_print_ip_sym(s, call->func, 0);
335 if (!ret)
336 return TRACE_TYPE_PARTIAL_LINE;
337
338 ret = trace_seq_printf(s, "();\n");
339 if (!ret)
340 return TRACE_TYPE_PARTIAL_LINE;
341
342 return TRACE_TYPE_HANDLED;
343}
344
345static enum print_line_t
346print_graph_entry_nested(struct ftrace_graph_ent_entry *entry,
347 struct trace_seq *s)
348{
349 int i;
350 int ret;
351 struct ftrace_graph_ent *call = &entry->graph_ent;
352
353 /* No overhead */
354 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
355 ret = trace_seq_printf(s, " ");
356 if (!ret)
357 return TRACE_TYPE_PARTIAL_LINE;
358 }
359
360 /* No time */
361 ret = trace_seq_printf(s, " | ");
362
363 /* Function */
364 for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
365 ret = trace_seq_printf(s, " ");
366 if (!ret)
367 return TRACE_TYPE_PARTIAL_LINE;
368 }
369
370 ret = seq_print_ip_sym(s, call->func, 0);
371 if (!ret)
372 return TRACE_TYPE_PARTIAL_LINE;
373
374 ret = trace_seq_printf(s, "() {\n");
375 if (!ret)
376 return TRACE_TYPE_PARTIAL_LINE;
377
378 return TRACE_TYPE_HANDLED;
379}
380
381static enum print_line_t
382print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
383 struct trace_iterator *iter, int cpu)
384{
385 int ret;
386 struct trace_entry *ent = iter->ent;
387
388 /* Pid */
389 if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE)
390 return TRACE_TYPE_PARTIAL_LINE;
391
392 /* Cpu */
393 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
394 ret = print_graph_cpu(s, cpu);
395 if (ret == TRACE_TYPE_PARTIAL_LINE)
396 return TRACE_TYPE_PARTIAL_LINE;
397 }
398
399 /* Proc */
400 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
401 ret = print_graph_proc(s, ent->pid);
402 if (ret == TRACE_TYPE_PARTIAL_LINE)
403 return TRACE_TYPE_PARTIAL_LINE;
404
405 ret = trace_seq_printf(s, " | ");
406 if (!ret)
407 return TRACE_TYPE_PARTIAL_LINE;
408 }
409
410 if (trace_branch_is_leaf(iter, field))
411 return print_graph_entry_leaf(iter, field, s);
412 else
413 return print_graph_entry_nested(field, s);
414
415}
416
417static enum print_line_t
418print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
419 struct trace_entry *ent, int cpu)
420{
421 int i;
422 int ret;
423 unsigned long long duration = trace->rettime - trace->calltime;
424
425 /* Pid */
426 if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE)
427 return TRACE_TYPE_PARTIAL_LINE;
428
429 /* Cpu */
430 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
431 ret = print_graph_cpu(s, cpu);
432 if (ret == TRACE_TYPE_PARTIAL_LINE)
433 return TRACE_TYPE_PARTIAL_LINE;
434 }
435
436 /* Proc */
437 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
438 ret = print_graph_proc(s, ent->pid);
439 if (ret == TRACE_TYPE_PARTIAL_LINE)
440 return TRACE_TYPE_PARTIAL_LINE;
441
442 ret = trace_seq_printf(s, " | ");
443 if (!ret)
444 return TRACE_TYPE_PARTIAL_LINE;
445 }
446
447 /* Overhead */
448 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
449 ret = print_graph_overhead(duration, s);
450 if (!ret)
451 return TRACE_TYPE_PARTIAL_LINE;
452 }
453
454 /* Duration */
455 ret = print_graph_duration(duration, s);
456 if (ret == TRACE_TYPE_PARTIAL_LINE)
457 return TRACE_TYPE_PARTIAL_LINE;
458
459 /* Closing brace */
460 for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) {
461 ret = trace_seq_printf(s, " ");
462 if (!ret)
463 return TRACE_TYPE_PARTIAL_LINE;
464 }
465
466 ret = trace_seq_printf(s, "}\n");
467 if (!ret)
468 return TRACE_TYPE_PARTIAL_LINE;
469
470 /* Overrun */
471 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) {
472 ret = trace_seq_printf(s, " (Overruns: %lu)\n",
473 trace->overrun);
474 if (!ret)
475 return TRACE_TYPE_PARTIAL_LINE;
476 }
477 return TRACE_TYPE_HANDLED;
478}
479
480static enum print_line_t
481print_graph_comment(struct print_entry *trace, struct trace_seq *s,
482 struct trace_entry *ent, struct trace_iterator *iter)
483{
484 int i;
485 int ret;
486
487 /* Pid */
488 if (verif_pid(s, ent->pid, iter->cpu) == TRACE_TYPE_PARTIAL_LINE)
489 return TRACE_TYPE_PARTIAL_LINE;
490
491 /* Cpu */
492 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
493 ret = print_graph_cpu(s, iter->cpu);
494 if (ret == TRACE_TYPE_PARTIAL_LINE)
495 return TRACE_TYPE_PARTIAL_LINE;
496 }
497
498 /* Proc */
499 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
500 ret = print_graph_proc(s, ent->pid);
501 if (ret == TRACE_TYPE_PARTIAL_LINE)
502 return TRACE_TYPE_PARTIAL_LINE;
503
504 ret = trace_seq_printf(s, " | ");
505 if (!ret)
506 return TRACE_TYPE_PARTIAL_LINE;
507 }
508
509 /* No overhead */
510 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
511 ret = trace_seq_printf(s, " ");
512 if (!ret)
513 return TRACE_TYPE_PARTIAL_LINE;
514 }
515
516 /* No time */
517 ret = trace_seq_printf(s, " | ");
518 if (!ret)
519 return TRACE_TYPE_PARTIAL_LINE;
520
521 /* Indentation */
522 if (trace->depth > 0)
523 for (i = 0; i < (trace->depth + 1) * TRACE_GRAPH_INDENT; i++) {
524 ret = trace_seq_printf(s, " ");
525 if (!ret)
526 return TRACE_TYPE_PARTIAL_LINE;
527 }
528
529 /* The comment */
530 ret = trace_seq_printf(s, "/* %s", trace->buf);
531 if (!ret)
532 return TRACE_TYPE_PARTIAL_LINE;
533
534 if (ent->flags & TRACE_FLAG_CONT)
535 trace_seq_print_cont(s, iter);
536
537 ret = trace_seq_printf(s, " */\n");
538 if (!ret)
539 return TRACE_TYPE_PARTIAL_LINE;
540
541 return TRACE_TYPE_HANDLED;
542}
543
544
545enum print_line_t
546print_graph_function(struct trace_iterator *iter)
547{
548 struct trace_seq *s = &iter->seq;
549 struct trace_entry *entry = iter->ent;
550
551 switch (entry->type) {
552 case TRACE_GRAPH_ENT: {
553 struct ftrace_graph_ent_entry *field;
554 trace_assign_type(field, entry);
555 return print_graph_entry(field, s, iter,
556 iter->cpu);
557 }
558 case TRACE_GRAPH_RET: {
559 struct ftrace_graph_ret_entry *field;
560 trace_assign_type(field, entry);
561 return print_graph_return(&field->ret, s, entry, iter->cpu);
562 }
563 case TRACE_PRINT: {
564 struct print_entry *field;
565 trace_assign_type(field, entry);
566 return print_graph_comment(field, s, entry, iter);
567 }
568 default:
569 return TRACE_TYPE_UNHANDLED;
570 }
571}
572
573static void print_graph_headers(struct seq_file *s)
574{
575 /* 1st line */
576 seq_printf(s, "# ");
577 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
578 seq_printf(s, "CPU ");
579 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
580 seq_printf(s, "TASK/PID ");
581 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD)
582 seq_printf(s, "OVERHEAD/");
583 seq_printf(s, "DURATION FUNCTION CALLS\n");
584
585 /* 2nd line */
586 seq_printf(s, "# ");
587 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
588 seq_printf(s, "| ");
589 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
590 seq_printf(s, "| | ");
591 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
592 seq_printf(s, "| ");
593 seq_printf(s, "| | | | |\n");
594 } else
595 seq_printf(s, " | | | | |\n");
596}
597static struct tracer graph_trace __read_mostly = {
598 .name = "function_graph",
599 .init = graph_trace_init,
600 .reset = graph_trace_reset,
601 .print_line = print_graph_function,
602 .print_header = print_graph_headers,
603 .flags = &tracer_flags,
604};
605
606static __init int init_graph_trace(void)
607{
608 return register_tracer(&graph_trace);
609}
610
611device_initcall(init_graph_trace);
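
print_graph_duration() above splits a 64-bit nanosecond count with do_div() (needed because 64-bit division on 32-bit builds requires a helper): the quotient becomes the whole-microsecond column and the remainder the zero-padded fraction. As a quick illustration of the same arithmetic, here is a standalone userspace snippet of my own, not part of the patch:

#include <stdio.h>

int main(void)
{
	unsigned long long duration_ns = 12345678ULL;		/* example input */
	unsigned long long usecs = duration_ns / 1000;		/* do_div() quotient */
	unsigned long nsecs_rem = (unsigned long)(duration_ns % 1000);	/* do_div() return value */

	printf("%llu.%03lu us\n", usecs, nsecs_rem);		/* prints "12345.678 us" */
	return 0;
}
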
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 9c74071c10e0..7c2e326bbc8b 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -353,15 +353,28 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)
353} 353}
354#endif /* CONFIG_PREEMPT_TRACER */ 354#endif /* CONFIG_PREEMPT_TRACER */
355 355
356/*
357 * save_tracer_enabled is used to save the state of the tracer_enabled
358 * variable when we disable it when we open a trace output file.
359 */
360static int save_tracer_enabled;
361
356static void start_irqsoff_tracer(struct trace_array *tr) 362static void start_irqsoff_tracer(struct trace_array *tr)
357{ 363{
358 register_ftrace_function(&trace_ops); 364 register_ftrace_function(&trace_ops);
359 tracer_enabled = 1; 365 if (tracing_is_enabled()) {
366 tracer_enabled = 1;
367 save_tracer_enabled = 1;
368 } else {
369 tracer_enabled = 0;
370 save_tracer_enabled = 0;
371 }
360} 372}
361 373
362static void stop_irqsoff_tracer(struct trace_array *tr) 374static void stop_irqsoff_tracer(struct trace_array *tr)
363{ 375{
364 tracer_enabled = 0; 376 tracer_enabled = 0;
377 save_tracer_enabled = 0;
365 unregister_ftrace_function(&trace_ops); 378 unregister_ftrace_function(&trace_ops);
366} 379}
367 380
@@ -370,53 +383,55 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
370 irqsoff_trace = tr; 383 irqsoff_trace = tr;
371 /* make sure that the tracer is visible */ 384 /* make sure that the tracer is visible */
372 smp_wmb(); 385 smp_wmb();
373 386 start_irqsoff_tracer(tr);
374 if (tr->ctrl)
375 start_irqsoff_tracer(tr);
376} 387}
377 388
378static void irqsoff_tracer_reset(struct trace_array *tr) 389static void irqsoff_tracer_reset(struct trace_array *tr)
379{ 390{
380 if (tr->ctrl) 391 stop_irqsoff_tracer(tr);
381 stop_irqsoff_tracer(tr);
382} 392}
383 393
384static void irqsoff_tracer_ctrl_update(struct trace_array *tr) 394static void irqsoff_tracer_start(struct trace_array *tr)
385{ 395{
386 if (tr->ctrl) 396 tracer_enabled = 1;
387 start_irqsoff_tracer(tr); 397 save_tracer_enabled = 1;
388 else 398}
389 stop_irqsoff_tracer(tr); 399
400static void irqsoff_tracer_stop(struct trace_array *tr)
401{
402 tracer_enabled = 0;
403 save_tracer_enabled = 0;
390} 404}
391 405
392static void irqsoff_tracer_open(struct trace_iterator *iter) 406static void irqsoff_tracer_open(struct trace_iterator *iter)
393{ 407{
394 /* stop the trace while dumping */ 408 /* stop the trace while dumping */
395 if (iter->tr->ctrl) 409 tracer_enabled = 0;
396 stop_irqsoff_tracer(iter->tr);
397} 410}
398 411
399static void irqsoff_tracer_close(struct trace_iterator *iter) 412static void irqsoff_tracer_close(struct trace_iterator *iter)
400{ 413{
401 if (iter->tr->ctrl) 414 /* restart tracing */
402 start_irqsoff_tracer(iter->tr); 415 tracer_enabled = save_tracer_enabled;
403} 416}
404 417
405#ifdef CONFIG_IRQSOFF_TRACER 418#ifdef CONFIG_IRQSOFF_TRACER
406static void irqsoff_tracer_init(struct trace_array *tr) 419static int irqsoff_tracer_init(struct trace_array *tr)
407{ 420{
408 trace_type = TRACER_IRQS_OFF; 421 trace_type = TRACER_IRQS_OFF;
409 422
410 __irqsoff_tracer_init(tr); 423 __irqsoff_tracer_init(tr);
424 return 0;
411} 425}
412static struct tracer irqsoff_tracer __read_mostly = 426static struct tracer irqsoff_tracer __read_mostly =
413{ 427{
414 .name = "irqsoff", 428 .name = "irqsoff",
415 .init = irqsoff_tracer_init, 429 .init = irqsoff_tracer_init,
416 .reset = irqsoff_tracer_reset, 430 .reset = irqsoff_tracer_reset,
431 .start = irqsoff_tracer_start,
432 .stop = irqsoff_tracer_stop,
417 .open = irqsoff_tracer_open, 433 .open = irqsoff_tracer_open,
418 .close = irqsoff_tracer_close, 434 .close = irqsoff_tracer_close,
419 .ctrl_update = irqsoff_tracer_ctrl_update,
420 .print_max = 1, 435 .print_max = 1,
421#ifdef CONFIG_FTRACE_SELFTEST 436#ifdef CONFIG_FTRACE_SELFTEST
422 .selftest = trace_selftest_startup_irqsoff, 437 .selftest = trace_selftest_startup_irqsoff,
@@ -428,11 +443,12 @@ static struct tracer irqsoff_tracer __read_mostly =
428#endif 443#endif
429 444
430#ifdef CONFIG_PREEMPT_TRACER 445#ifdef CONFIG_PREEMPT_TRACER
431static void preemptoff_tracer_init(struct trace_array *tr) 446static int preemptoff_tracer_init(struct trace_array *tr)
432{ 447{
433 trace_type = TRACER_PREEMPT_OFF; 448 trace_type = TRACER_PREEMPT_OFF;
434 449
435 __irqsoff_tracer_init(tr); 450 __irqsoff_tracer_init(tr);
451 return 0;
436} 452}
437 453
438static struct tracer preemptoff_tracer __read_mostly = 454static struct tracer preemptoff_tracer __read_mostly =
@@ -440,9 +456,10 @@ static struct tracer preemptoff_tracer __read_mostly =
440 .name = "preemptoff", 456 .name = "preemptoff",
441 .init = preemptoff_tracer_init, 457 .init = preemptoff_tracer_init,
442 .reset = irqsoff_tracer_reset, 458 .reset = irqsoff_tracer_reset,
459 .start = irqsoff_tracer_start,
460 .stop = irqsoff_tracer_stop,
443 .open = irqsoff_tracer_open, 461 .open = irqsoff_tracer_open,
444 .close = irqsoff_tracer_close, 462 .close = irqsoff_tracer_close,
445 .ctrl_update = irqsoff_tracer_ctrl_update,
446 .print_max = 1, 463 .print_max = 1,
447#ifdef CONFIG_FTRACE_SELFTEST 464#ifdef CONFIG_FTRACE_SELFTEST
448 .selftest = trace_selftest_startup_preemptoff, 465 .selftest = trace_selftest_startup_preemptoff,
@@ -456,11 +473,12 @@ static struct tracer preemptoff_tracer __read_mostly =
456#if defined(CONFIG_IRQSOFF_TRACER) && \ 473#if defined(CONFIG_IRQSOFF_TRACER) && \
457 defined(CONFIG_PREEMPT_TRACER) 474 defined(CONFIG_PREEMPT_TRACER)
458 475
459static void preemptirqsoff_tracer_init(struct trace_array *tr) 476static int preemptirqsoff_tracer_init(struct trace_array *tr)
460{ 477{
461 trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF; 478 trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF;
462 479
463 __irqsoff_tracer_init(tr); 480 __irqsoff_tracer_init(tr);
481 return 0;
464} 482}
465 483
466static struct tracer preemptirqsoff_tracer __read_mostly = 484static struct tracer preemptirqsoff_tracer __read_mostly =
@@ -468,9 +486,10 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
468 .name = "preemptirqsoff", 486 .name = "preemptirqsoff",
469 .init = preemptirqsoff_tracer_init, 487 .init = preemptirqsoff_tracer_init,
470 .reset = irqsoff_tracer_reset, 488 .reset = irqsoff_tracer_reset,
489 .start = irqsoff_tracer_start,
490 .stop = irqsoff_tracer_stop,
471 .open = irqsoff_tracer_open, 491 .open = irqsoff_tracer_open,
472 .close = irqsoff_tracer_close, 492 .close = irqsoff_tracer_close,
473 .ctrl_update = irqsoff_tracer_ctrl_update,
474 .print_max = 1, 493 .print_max = 1,
475#ifdef CONFIG_FTRACE_SELFTEST 494#ifdef CONFIG_FTRACE_SELFTEST
476 .selftest = trace_selftest_startup_preemptirqsoff, 495 .selftest = trace_selftest_startup_preemptirqsoff,
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index e62cbf78eab6..2fb6da6523b3 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -32,34 +32,29 @@ static void mmio_reset_data(struct trace_array *tr)
32 tracing_reset(tr, cpu); 32 tracing_reset(tr, cpu);
33} 33}
34 34
35static void mmio_trace_init(struct trace_array *tr) 35static int mmio_trace_init(struct trace_array *tr)
36{ 36{
37 pr_debug("in %s\n", __func__); 37 pr_debug("in %s\n", __func__);
38 mmio_trace_array = tr; 38 mmio_trace_array = tr;
39 if (tr->ctrl) { 39
40 mmio_reset_data(tr); 40 mmio_reset_data(tr);
41 enable_mmiotrace(); 41 enable_mmiotrace();
42 } 42 return 0;
43} 43}
44 44
45static void mmio_trace_reset(struct trace_array *tr) 45static void mmio_trace_reset(struct trace_array *tr)
46{ 46{
47 pr_debug("in %s\n", __func__); 47 pr_debug("in %s\n", __func__);
48 if (tr->ctrl) 48
49 disable_mmiotrace(); 49 disable_mmiotrace();
50 mmio_reset_data(tr); 50 mmio_reset_data(tr);
51 mmio_trace_array = NULL; 51 mmio_trace_array = NULL;
52} 52}
53 53
54static void mmio_trace_ctrl_update(struct trace_array *tr) 54static void mmio_trace_start(struct trace_array *tr)
55{ 55{
56 pr_debug("in %s\n", __func__); 56 pr_debug("in %s\n", __func__);
57 if (tr->ctrl) { 57 mmio_reset_data(tr);
58 mmio_reset_data(tr);
59 enable_mmiotrace();
60 } else {
61 disable_mmiotrace();
62 }
63} 58}
64 59
65static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) 60static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
@@ -296,10 +291,10 @@ static struct tracer mmio_tracer __read_mostly =
296 .name = "mmiotrace", 291 .name = "mmiotrace",
297 .init = mmio_trace_init, 292 .init = mmio_trace_init,
298 .reset = mmio_trace_reset, 293 .reset = mmio_trace_reset,
294 .start = mmio_trace_start,
299 .pipe_open = mmio_pipe_open, 295 .pipe_open = mmio_pipe_open,
300 .close = mmio_close, 296 .close = mmio_close,
301 .read = mmio_read, 297 .read = mmio_read,
302 .ctrl_update = mmio_trace_ctrl_update,
303 .print_line = mmio_print_line, 298 .print_line = mmio_print_line,
304}; 299};
305 300
@@ -371,5 +366,5 @@ void mmio_trace_mapping(struct mmiotrace_map *map)
371 366
372int mmio_trace_printk(const char *fmt, va_list args) 367int mmio_trace_printk(const char *fmt, va_list args)
373{ 368{
374 return trace_vprintk(0, fmt, args); 369 return trace_vprintk(0, -1, fmt, args);
375} 370}
diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c
index 4592b4862515..b9767acd30ac 100644
--- a/kernel/trace/trace_nop.c
+++ b/kernel/trace/trace_nop.c
@@ -12,6 +12,27 @@
12 12
13#include "trace.h" 13#include "trace.h"
14 14
15/* Our two options */
16enum {
17 TRACE_NOP_OPT_ACCEPT = 0x1,
18 TRACE_NOP_OPT_REFUSE = 0x2
19};
20
21/* Options for the tracer (see trace_options file) */
22static struct tracer_opt nop_opts[] = {
23 /* Option that will be accepted by set_flag callback */
24 { TRACER_OPT(test_nop_accept, TRACE_NOP_OPT_ACCEPT) },
25 /* Option that will be refused by set_flag callback */
26 { TRACER_OPT(test_nop_refuse, TRACE_NOP_OPT_REFUSE) },
27 { } /* Always set a last empty entry */
28};
29
30static struct tracer_flags nop_flags = {
31 /* You can check your flags value here when you want. */
32 .val = 0, /* By default: all flags disabled */
33 .opts = nop_opts
34};
35
15static struct trace_array *ctx_trace; 36static struct trace_array *ctx_trace;
16 37
17static void start_nop_trace(struct trace_array *tr) 38static void start_nop_trace(struct trace_array *tr)
@@ -24,7 +45,7 @@ static void stop_nop_trace(struct trace_array *tr)
24 /* Nothing to do! */ 45 /* Nothing to do! */
25} 46}
26 47
27static void nop_trace_init(struct trace_array *tr) 48static int nop_trace_init(struct trace_array *tr)
28{ 49{
29 int cpu; 50 int cpu;
30 ctx_trace = tr; 51 ctx_trace = tr;
@@ -32,33 +53,53 @@ static void nop_trace_init(struct trace_array *tr)
32 for_each_online_cpu(cpu) 53 for_each_online_cpu(cpu)
33 tracing_reset(tr, cpu); 54 tracing_reset(tr, cpu);
34 55
35 if (tr->ctrl) 56 start_nop_trace(tr);
36 start_nop_trace(tr); 57 return 0;
37} 58}
38 59
39static void nop_trace_reset(struct trace_array *tr) 60static void nop_trace_reset(struct trace_array *tr)
40{ 61{
41 if (tr->ctrl) 62 stop_nop_trace(tr);
42 stop_nop_trace(tr);
43} 63}
44 64
45static void nop_trace_ctrl_update(struct trace_array *tr) 65/* It only serves as a signal handler and a callback to
66 * accept or refuse the setting of a flag.
67 * If you don't implement it, then the flag setting will be
68 * automatically accepted.
69 */
70static int nop_set_flag(u32 old_flags, u32 bit, int set)
46{ 71{
47 /* When starting a new trace, reset the buffers */ 72 /*
48 if (tr->ctrl) 73 * Note that you don't need to update nop_flags.val yourself.
49 start_nop_trace(tr); 74 * The tracing API will do it automatically if you return 0
50 else 75 */
51 stop_nop_trace(tr); 76 if (bit == TRACE_NOP_OPT_ACCEPT) {
77 printk(KERN_DEBUG "nop_test_accept flag set to %d: we accept."
78 " Now cat trace_options to see the result\n",
79 set);
80 return 0;
81 }
82
83 if (bit == TRACE_NOP_OPT_REFUSE) {
84 printk(KERN_DEBUG "nop_test_refuse flag set to %d: we refuse."
85 " Now cat trace_options to see the result\n",
86 set);
87 return -EINVAL;
88 }
89
90 return 0;
52} 91}
53 92
93
54struct tracer nop_trace __read_mostly = 94struct tracer nop_trace __read_mostly =
55{ 95{
56 .name = "nop", 96 .name = "nop",
57 .init = nop_trace_init, 97 .init = nop_trace_init,
58 .reset = nop_trace_reset, 98 .reset = nop_trace_reset,
59 .ctrl_update = nop_trace_ctrl_update,
60#ifdef CONFIG_FTRACE_SELFTEST 99#ifdef CONFIG_FTRACE_SELFTEST
61 .selftest = trace_selftest_startup_nop, 100 .selftest = trace_selftest_startup_nop,
62#endif 101#endif
102 .flags = &nop_flags,
103 .set_flag = nop_set_flag
63}; 104};
64 105
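
The nop tracer now doubles as a reference for the new per-tracer option plumbing: TRACER_OPT() declares a named bit, struct tracer_flags carries the current value, and set_flag() gets a veto (return 0 to accept, a negative errno to refuse; per the comment above, the core updates the flags value itself on acceptance). A sketch of a tracer wiring up one option of its own follows; the "my_verbose" flag is made up for illustration and only the mechanism comes from this diff.

enum { MY_OPT_VERBOSE = 0x1 };

static struct tracer_opt my_opts[] = {
	{ TRACER_OPT(my_verbose, MY_OPT_VERBOSE) },
	{ } /* terminating empty entry */
};

static struct tracer_flags my_flags = {
	.val	= 0,		/* everything off by default */
	.opts	= my_opts,
};

static int my_set_flag(u32 old_flags, u32 bit, int set)
{
	if (bit == MY_OPT_VERBOSE)
		return 0;	/* accept: the core updates my_flags.val */

	return -EINVAL;		/* refuse bits we do not know about */
}

/* hooked up in the struct tracer via .flags = &my_flags and .set_flag = my_set_flag */
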
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
new file mode 100644
index 000000000000..a7172a352f62
--- /dev/null
+++ b/kernel/trace/trace_power.c
@@ -0,0 +1,179 @@
1/*
2 * ring buffer based C-state tracer
3 *
4 * Arjan van de Ven <arjan@linux.intel.com>
5 * Copyright (C) 2008 Intel Corporation
6 *
7 * Much is borrowed from trace_boot.c which is
8 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
9 *
10 */
11
12#include <linux/init.h>
13#include <linux/debugfs.h>
14#include <linux/ftrace.h>
15#include <linux/kallsyms.h>
16#include <linux/module.h>
17
18#include "trace.h"
19
20static struct trace_array *power_trace;
21static int __read_mostly trace_power_enabled;
22
23
24static void start_power_trace(struct trace_array *tr)
25{
26 trace_power_enabled = 1;
27}
28
29static void stop_power_trace(struct trace_array *tr)
30{
31 trace_power_enabled = 0;
32}
33
34
35static int power_trace_init(struct trace_array *tr)
36{
37 int cpu;
38 power_trace = tr;
39
40 trace_power_enabled = 1;
41
42 for_each_cpu_mask(cpu, cpu_possible_map)
43 tracing_reset(tr, cpu);
44 return 0;
45}
46
47static enum print_line_t power_print_line(struct trace_iterator *iter)
48{
49 int ret = 0;
50 struct trace_entry *entry = iter->ent;
51 struct trace_power *field;
52 struct power_trace *it;
53 struct trace_seq *s = &iter->seq;
54 struct timespec stamp;
55 struct timespec duration;
56
57 trace_assign_type(field, entry);
58 it = &field->state_data;
59 stamp = ktime_to_timespec(it->stamp);
60 duration = ktime_to_timespec(ktime_sub(it->end, it->stamp));
61
62 if (entry->type == TRACE_POWER) {
63 if (it->type == POWER_CSTATE)
64 ret = trace_seq_printf(s, "[%5ld.%09ld] CSTATE: Going to C%i on cpu %i for %ld.%09ld\n",
65 stamp.tv_sec,
66 stamp.tv_nsec,
67 it->state, iter->cpu,
68 duration.tv_sec,
69 duration.tv_nsec);
70 if (it->type == POWER_PSTATE)
71 ret = trace_seq_printf(s, "[%5ld.%09ld] PSTATE: Going to P%i on cpu %i\n",
72 stamp.tv_sec,
73 stamp.tv_nsec,
74 it->state, iter->cpu);
75 if (!ret)
76 return TRACE_TYPE_PARTIAL_LINE;
77 return TRACE_TYPE_HANDLED;
78 }
79 return TRACE_TYPE_UNHANDLED;
80}
81
82static struct tracer power_tracer __read_mostly =
83{
84 .name = "power",
85 .init = power_trace_init,
86 .start = start_power_trace,
87 .stop = stop_power_trace,
88 .reset = stop_power_trace,
89 .print_line = power_print_line,
90};
91
92static int init_power_trace(void)
93{
94 return register_tracer(&power_tracer);
95}
96device_initcall(init_power_trace);
97
98void trace_power_start(struct power_trace *it, unsigned int type,
99 unsigned int level)
100{
101 if (!trace_power_enabled)
102 return;
103
104 memset(it, 0, sizeof(struct power_trace));
105 it->state = level;
106 it->type = type;
107 it->stamp = ktime_get();
108}
109EXPORT_SYMBOL_GPL(trace_power_start);
110
111
112void trace_power_end(struct power_trace *it)
113{
114 struct ring_buffer_event *event;
115 struct trace_power *entry;
116 struct trace_array_cpu *data;
117 unsigned long irq_flags;
118 struct trace_array *tr = power_trace;
119
120 if (!trace_power_enabled)
121 return;
122
123 preempt_disable();
124 it->end = ktime_get();
125 data = tr->data[smp_processor_id()];
126
127 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
128 &irq_flags);
129 if (!event)
130 goto out;
131 entry = ring_buffer_event_data(event);
132 tracing_generic_entry_update(&entry->ent, 0, 0);
133 entry->ent.type = TRACE_POWER;
134 entry->state_data = *it;
135 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
136
137 trace_wake_up();
138
139 out:
140 preempt_enable();
141}
142EXPORT_SYMBOL_GPL(trace_power_end);
143
144void trace_power_mark(struct power_trace *it, unsigned int type,
145 unsigned int level)
146{
147 struct ring_buffer_event *event;
148 struct trace_power *entry;
149 struct trace_array_cpu *data;
150 unsigned long irq_flags;
151 struct trace_array *tr = power_trace;
152
153 if (!trace_power_enabled)
154 return;
155
156 memset(it, 0, sizeof(struct power_trace));
157 it->state = level;
158 it->type = type;
159 it->stamp = ktime_get();
160 preempt_disable();
161 it->end = it->stamp;
162 data = tr->data[smp_processor_id()];
163
164 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
165 &irq_flags);
166 if (!event)
167 goto out;
168 entry = ring_buffer_event_data(event);
169 tracing_generic_entry_update(&entry->ent, 0, 0);
170 entry->ent.type = TRACE_POWER;
171 entry->state_data = *it;
172 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
173
174 trace_wake_up();
175
176 out:
177 preempt_enable();
178}
179EXPORT_SYMBOL_GPL(trace_power_mark);
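
The exported entry points above are meant to be called from the platform's power-management code: trace_power_start() stamps the beginning of a transition into a caller-provided struct power_trace, and trace_power_end() completes it and commits the event to the ring buffer. A sketch of a caller bracketing a C-state entry this way; enter_c_state() is a placeholder for the real idle routine and is not defined by this patch.

static void idle_with_power_trace(int target_cstate)
{
	struct power_trace it;		/* lives on the caller's stack */

	trace_power_start(&it, POWER_CSTATE, target_cstate);
	enter_c_state(target_cstate);	/* placeholder for the platform idle call */
	trace_power_end(&it);		/* fills it.end and writes the event */
}

trace_power_mark() above is the zero-duration variant: it stamps start and end with the same time, which matches the P-state line that power_print_line() prints without a duration.
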
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index b8f56beb1a62..863390557b44 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -16,7 +16,8 @@
16 16
17static struct trace_array *ctx_trace; 17static struct trace_array *ctx_trace;
18static int __read_mostly tracer_enabled; 18static int __read_mostly tracer_enabled;
19static atomic_t sched_ref; 19static int sched_ref;
20static DEFINE_MUTEX(sched_register_mutex);
20 21
21static void 22static void
22probe_sched_switch(struct rq *__rq, struct task_struct *prev, 23probe_sched_switch(struct rq *__rq, struct task_struct *prev,
@@ -27,7 +28,7 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,
27 int cpu; 28 int cpu;
28 int pc; 29 int pc;
29 30
30 if (!atomic_read(&sched_ref)) 31 if (!sched_ref)
31 return; 32 return;
32 33
33 tracing_record_cmdline(prev); 34 tracing_record_cmdline(prev);
@@ -123,20 +124,18 @@ static void tracing_sched_unregister(void)
123 124
124static void tracing_start_sched_switch(void) 125static void tracing_start_sched_switch(void)
125{ 126{
126 long ref; 127 mutex_lock(&sched_register_mutex);
127 128 if (!(sched_ref++))
128 ref = atomic_inc_return(&sched_ref);
129 if (ref == 1)
130 tracing_sched_register(); 129 tracing_sched_register();
130 mutex_unlock(&sched_register_mutex);
131} 131}
132 132
133static void tracing_stop_sched_switch(void) 133static void tracing_stop_sched_switch(void)
134{ 134{
135 long ref; 135 mutex_lock(&sched_register_mutex);
136 136 if (!(--sched_ref))
137 ref = atomic_dec_and_test(&sched_ref);
138 if (ref)
139 tracing_sched_unregister(); 137 tracing_sched_unregister();
138 mutex_unlock(&sched_register_mutex);
140} 139}
141 140
142void tracing_start_cmdline_record(void) 141void tracing_start_cmdline_record(void)
@@ -149,40 +148,86 @@ void tracing_stop_cmdline_record(void)
149 tracing_stop_sched_switch(); 148 tracing_stop_sched_switch();
150} 149}
151 150
151/**
152 * tracing_start_sched_switch_record - start tracing context switches
153 *
154 * Turns on context switch tracing for a tracer.
155 */
156void tracing_start_sched_switch_record(void)
157{
158 if (unlikely(!ctx_trace)) {
159 WARN_ON(1);
160 return;
161 }
162
163 tracing_start_sched_switch();
164
165 mutex_lock(&sched_register_mutex);
166 tracer_enabled++;
167 mutex_unlock(&sched_register_mutex);
168}
169
170/**
171 * tracing_stop_sched_switch_record - stop tracing context switches
172 *
173 * Turns off context switch tracing for a tracer.
174 */
175void tracing_stop_sched_switch_record(void)
176{
177 mutex_lock(&sched_register_mutex);
178 tracer_enabled--;
179 WARN_ON(tracer_enabled < 0);
180 mutex_unlock(&sched_register_mutex);
181
182 tracing_stop_sched_switch();
183}
184
185/**
186 * tracing_sched_switch_assign_trace - assign a trace array for ctx switch
187 * @tr: trace array pointer to assign
188 *
189 * Some tracers might want to record the context switches in their
190 * trace. This function lets those tracers assign the trace array
191 * to use.
192 */
193void tracing_sched_switch_assign_trace(struct trace_array *tr)
194{
195 ctx_trace = tr;
196}
197
152static void start_sched_trace(struct trace_array *tr) 198static void start_sched_trace(struct trace_array *tr)
153{ 199{
154 sched_switch_reset(tr); 200 sched_switch_reset(tr);
155 tracing_start_cmdline_record(); 201 tracing_start_sched_switch_record();
156 tracer_enabled = 1;
157} 202}
158 203
159static void stop_sched_trace(struct trace_array *tr) 204static void stop_sched_trace(struct trace_array *tr)
160{ 205{
161 tracer_enabled = 0; 206 tracing_stop_sched_switch_record();
162 tracing_stop_cmdline_record();
163} 207}
164 208
165static void sched_switch_trace_init(struct trace_array *tr) 209static int sched_switch_trace_init(struct trace_array *tr)
166{ 210{
167 ctx_trace = tr; 211 ctx_trace = tr;
168 212 start_sched_trace(tr);
169 if (tr->ctrl) 213 return 0;
170 start_sched_trace(tr);
171} 214}
172 215
173static void sched_switch_trace_reset(struct trace_array *tr) 216static void sched_switch_trace_reset(struct trace_array *tr)
174{ 217{
175 if (tr->ctrl) 218 if (sched_ref)
176 stop_sched_trace(tr); 219 stop_sched_trace(tr);
177} 220}
178 221
179static void sched_switch_trace_ctrl_update(struct trace_array *tr) 222static void sched_switch_trace_start(struct trace_array *tr)
180{ 223{
181 /* When starting a new trace, reset the buffers */ 224 sched_switch_reset(tr);
182 if (tr->ctrl) 225 tracing_start_sched_switch();
183 start_sched_trace(tr); 226}
184 else 227
185 stop_sched_trace(tr); 228static void sched_switch_trace_stop(struct trace_array *tr)
229{
230 tracing_stop_sched_switch();
186} 231}
187 232
188static struct tracer sched_switch_trace __read_mostly = 233static struct tracer sched_switch_trace __read_mostly =
@@ -190,7 +235,8 @@ static struct tracer sched_switch_trace __read_mostly =
190 .name = "sched_switch", 235 .name = "sched_switch",
191 .init = sched_switch_trace_init, 236 .init = sched_switch_trace_init,
192 .reset = sched_switch_trace_reset, 237 .reset = sched_switch_trace_reset,
193 .ctrl_update = sched_switch_trace_ctrl_update, 238 .start = sched_switch_trace_start,
239 .stop = sched_switch_trace_stop,
194#ifdef CONFIG_FTRACE_SELFTEST 240#ifdef CONFIG_FTRACE_SELFTEST
195 .selftest = trace_selftest_startup_sched_switch, 241 .selftest = trace_selftest_startup_sched_switch,
196#endif 242#endif
@@ -198,14 +244,6 @@ static struct tracer sched_switch_trace __read_mostly =
198 244
199__init static int init_sched_switch_trace(void) 245__init static int init_sched_switch_trace(void)
200{ 246{
201 int ret = 0;
202
203 if (atomic_read(&sched_ref))
204 ret = tracing_sched_register();
205 if (ret) {
206 pr_info("error registering scheduler trace\n");
207 return ret;
208 }
209 return register_tracer(&sched_switch_trace); 247 return register_tracer(&sched_switch_trace);
210} 248}
211device_initcall(init_sched_switch_trace); 249device_initcall(init_sched_switch_trace);
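
Besides replacing the atomic sched_ref with a mutex-protected counter, this file grows a small public API (see the kerneldoc above) that lets other tracers pull context-switch events into their own buffers. A sketch of a consumer, following the documented intent; my_trace_init()/my_trace_reset() are hypothetical callbacks of some other tracer.

static int my_trace_init(struct trace_array *tr)
{
	/* record sched switch/wakeup events into this tracer's array */
	tracing_sched_switch_assign_trace(tr);
	tracing_start_sched_switch_record();
	return 0;
}

static void my_trace_reset(struct trace_array *tr)
{
	tracing_stop_sched_switch_record();
}
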
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 3ae93f16b565..0067b49746c1 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -50,8 +50,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
50 return; 50 return;
51 51
52 pc = preempt_count(); 52 pc = preempt_count();
53 resched = need_resched(); 53 resched = ftrace_preempt_disable();
54 preempt_disable_notrace();
55 54
56 cpu = raw_smp_processor_id(); 55 cpu = raw_smp_processor_id();
57 data = tr->data[cpu]; 56 data = tr->data[cpu];
@@ -81,15 +80,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
81 out: 80 out:
82 atomic_dec(&data->disabled); 81 atomic_dec(&data->disabled);
83 82
84 /* 83 ftrace_preempt_enable(resched);
85 * To prevent recursion from the scheduler, if the
86 * resched flag was set before we entered, then
87 * don't reschedule.
88 */
89 if (resched)
90 preempt_enable_no_resched_notrace();
91 else
92 preempt_enable_notrace();
93} 84}
94 85
95static struct ftrace_ops trace_ops __read_mostly = 86static struct ftrace_ops trace_ops __read_mostly =
@@ -271,6 +262,12 @@ out:
271 atomic_dec(&wakeup_trace->data[cpu]->disabled); 262 atomic_dec(&wakeup_trace->data[cpu]->disabled);
272} 263}
273 264
265/*
266 * save_tracer_enabled is used to save the state of the tracer_enabled
267 * variable when we disable it when we open a trace output file.
268 */
269static int save_tracer_enabled;
270
274static void start_wakeup_tracer(struct trace_array *tr) 271static void start_wakeup_tracer(struct trace_array *tr)
275{ 272{
276 int ret; 273 int ret;
@@ -309,7 +306,13 @@ static void start_wakeup_tracer(struct trace_array *tr)
309 306
310 register_ftrace_function(&trace_ops); 307 register_ftrace_function(&trace_ops);
311 308
312 tracer_enabled = 1; 309 if (tracing_is_enabled()) {
310 tracer_enabled = 1;
311 save_tracer_enabled = 1;
312 } else {
313 tracer_enabled = 0;
314 save_tracer_enabled = 0;
315 }
313 316
314 return; 317 return;
315fail_deprobe_wake_new: 318fail_deprobe_wake_new:
@@ -321,49 +324,53 @@ fail_deprobe:
321static void stop_wakeup_tracer(struct trace_array *tr) 324static void stop_wakeup_tracer(struct trace_array *tr)
322{ 325{
323 tracer_enabled = 0; 326 tracer_enabled = 0;
327 save_tracer_enabled = 0;
324 unregister_ftrace_function(&trace_ops); 328 unregister_ftrace_function(&trace_ops);
325 unregister_trace_sched_switch(probe_wakeup_sched_switch); 329 unregister_trace_sched_switch(probe_wakeup_sched_switch);
326 unregister_trace_sched_wakeup_new(probe_wakeup); 330 unregister_trace_sched_wakeup_new(probe_wakeup);
327 unregister_trace_sched_wakeup(probe_wakeup); 331 unregister_trace_sched_wakeup(probe_wakeup);
328} 332}
329 333
330static void wakeup_tracer_init(struct trace_array *tr) 334static int wakeup_tracer_init(struct trace_array *tr)
331{ 335{
332 wakeup_trace = tr; 336 wakeup_trace = tr;
333 337 start_wakeup_tracer(tr);
334 if (tr->ctrl) 338 return 0;
335 start_wakeup_tracer(tr);
336} 339}
337 340
338static void wakeup_tracer_reset(struct trace_array *tr) 341static void wakeup_tracer_reset(struct trace_array *tr)
339{ 342{
340 if (tr->ctrl) { 343 stop_wakeup_tracer(tr);
341 stop_wakeup_tracer(tr); 344 /* make sure we put back any tasks we are tracing */
342 /* make sure we put back any tasks we are tracing */ 345 wakeup_reset(tr);
343 wakeup_reset(tr); 346}
344 } 347
348static void wakeup_tracer_start(struct trace_array *tr)
349{
350 wakeup_reset(tr);
351 tracer_enabled = 1;
352 save_tracer_enabled = 1;
345} 353}
346 354
347static void wakeup_tracer_ctrl_update(struct trace_array *tr) 355static void wakeup_tracer_stop(struct trace_array *tr)
348{ 356{
349 if (tr->ctrl) 357 tracer_enabled = 0;
350 start_wakeup_tracer(tr); 358 save_tracer_enabled = 0;
351 else
352 stop_wakeup_tracer(tr);
353} 359}
354 360
355static void wakeup_tracer_open(struct trace_iterator *iter) 361static void wakeup_tracer_open(struct trace_iterator *iter)
356{ 362{
357 /* stop the trace while dumping */ 363 /* stop the trace while dumping */
358 if (iter->tr->ctrl) 364 tracer_enabled = 0;
359 stop_wakeup_tracer(iter->tr);
360} 365}
361 366
362static void wakeup_tracer_close(struct trace_iterator *iter) 367static void wakeup_tracer_close(struct trace_iterator *iter)
363{ 368{
364 /* forget about any processes we were recording */ 369 /* forget about any processes we were recording */
365 if (iter->tr->ctrl) 370 if (save_tracer_enabled) {
366 start_wakeup_tracer(iter->tr); 371 wakeup_reset(iter->tr);
372 tracer_enabled = 1;
373 }
367} 374}
368 375
369static struct tracer wakeup_tracer __read_mostly = 376static struct tracer wakeup_tracer __read_mostly =
@@ -371,9 +378,10 @@ static struct tracer wakeup_tracer __read_mostly =
371 .name = "wakeup", 378 .name = "wakeup",
372 .init = wakeup_tracer_init, 379 .init = wakeup_tracer_init,
373 .reset = wakeup_tracer_reset, 380 .reset = wakeup_tracer_reset,
381 .start = wakeup_tracer_start,
382 .stop = wakeup_tracer_stop,
374 .open = wakeup_tracer_open, 383 .open = wakeup_tracer_open,
375 .close = wakeup_tracer_close, 384 .close = wakeup_tracer_close,
376 .ctrl_update = wakeup_tracer_ctrl_update,
377 .print_max = 1, 385 .print_max = 1,
378#ifdef CONFIG_FTRACE_SELFTEST 386#ifdef CONFIG_FTRACE_SELFTEST
379 .selftest = trace_selftest_startup_wakeup, 387 .selftest = trace_selftest_startup_wakeup,
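
Both this file and trace_stack.c further down drop the open-coded need_resched()/preempt_disable_notrace() dance in favour of ftrace_preempt_disable()/ftrace_preempt_enable(), which remember whether a reschedule was already pending so the traced path never triggers one itself. Reduced to a sketch, the per-callback pattern looks like this; my_ftrace_callback() is hypothetical and the signature mirrors the callbacks in the diff.

static void my_ftrace_callback(unsigned long ip, unsigned long parent_ip)
{
	int resched;

	resched = ftrace_preempt_disable();	/* saves whether a resched was pending */

	/* ... record whatever this callback traces ... */

	ftrace_preempt_enable(resched);		/* re-enable without forcing a schedule */
}
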
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 90bc752a7580..88c8eb70f54a 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -13,6 +13,7 @@ static inline int trace_valid_entry(struct trace_entry *entry)
13 case TRACE_STACK: 13 case TRACE_STACK:
14 case TRACE_PRINT: 14 case TRACE_PRINT:
15 case TRACE_SPECIAL: 15 case TRACE_SPECIAL:
16 case TRACE_BRANCH:
16 return 1; 17 return 1;
17 } 18 }
18 return 0; 19 return 0;
@@ -51,7 +52,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
51 int cpu, ret = 0; 52 int cpu, ret = 0;
52 53
53 /* Don't allow flipping of max traces now */ 54 /* Don't allow flipping of max traces now */
54 raw_local_irq_save(flags); 55 local_irq_save(flags);
55 __raw_spin_lock(&ftrace_max_lock); 56 __raw_spin_lock(&ftrace_max_lock);
56 57
57 cnt = ring_buffer_entries(tr->buffer); 58 cnt = ring_buffer_entries(tr->buffer);
@@ -62,7 +63,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
62 break; 63 break;
63 } 64 }
64 __raw_spin_unlock(&ftrace_max_lock); 65 __raw_spin_unlock(&ftrace_max_lock);
65 raw_local_irq_restore(flags); 66 local_irq_restore(flags);
66 67
67 if (count) 68 if (count)
68 *count = cnt; 69 *count = cnt;
@@ -70,6 +71,11 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
70 return ret; 71 return ret;
71} 72}
72 73
74static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret)
75{
76 printk(KERN_WARNING "Failed to init %s tracer, init returned %d\n",
77 trace->name, init_ret);
78}
73#ifdef CONFIG_FUNCTION_TRACER 79#ifdef CONFIG_FUNCTION_TRACER
74 80
75#ifdef CONFIG_DYNAMIC_FTRACE 81#ifdef CONFIG_DYNAMIC_FTRACE
@@ -110,8 +116,11 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
110 ftrace_set_filter(func_name, strlen(func_name), 1); 116 ftrace_set_filter(func_name, strlen(func_name), 1);
111 117
112 /* enable tracing */ 118 /* enable tracing */
113 tr->ctrl = 1; 119 ret = trace->init(tr);
114 trace->init(tr); 120 if (ret) {
121 warn_failed_init_tracer(trace, ret);
122 goto out;
123 }
115 124
116 /* Sleep for a 1/10 of a second */ 125 /* Sleep for a 1/10 of a second */
117 msleep(100); 126 msleep(100);
@@ -134,13 +143,13 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
134 msleep(100); 143 msleep(100);
135 144
136 /* stop the tracing. */ 145 /* stop the tracing. */
137 tr->ctrl = 0; 146 tracing_stop();
138 trace->ctrl_update(tr);
139 ftrace_enabled = 0; 147 ftrace_enabled = 0;
140 148
141 /* check the trace buffer */ 149 /* check the trace buffer */
142 ret = trace_test_buffer(tr, &count); 150 ret = trace_test_buffer(tr, &count);
143 trace->reset(tr); 151 trace->reset(tr);
152 tracing_start();
144 153
145 /* we should only have one item */ 154 /* we should only have one item */
146 if (!ret && count != 1) { 155 if (!ret && count != 1) {
@@ -148,6 +157,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
148 ret = -1; 157 ret = -1;
149 goto out; 158 goto out;
150 } 159 }
160
151 out: 161 out:
152 ftrace_enabled = save_ftrace_enabled; 162 ftrace_enabled = save_ftrace_enabled;
153 tracer_enabled = save_tracer_enabled; 163 tracer_enabled = save_tracer_enabled;
@@ -180,18 +190,22 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
180 ftrace_enabled = 1; 190 ftrace_enabled = 1;
181 tracer_enabled = 1; 191 tracer_enabled = 1;
182 192
183 tr->ctrl = 1; 193 ret = trace->init(tr);
184 trace->init(tr); 194 if (ret) {
195 warn_failed_init_tracer(trace, ret);
196 goto out;
197 }
198
185 /* Sleep for a 1/10 of a second */ 199 /* Sleep for a 1/10 of a second */
186 msleep(100); 200 msleep(100);
187 /* stop the tracing. */ 201 /* stop the tracing. */
188 tr->ctrl = 0; 202 tracing_stop();
189 trace->ctrl_update(tr);
190 ftrace_enabled = 0; 203 ftrace_enabled = 0;
191 204
192 /* check the trace buffer */ 205 /* check the trace buffer */
193 ret = trace_test_buffer(tr, &count); 206 ret = trace_test_buffer(tr, &count);
194 trace->reset(tr); 207 trace->reset(tr);
208 tracing_start();
195 209
196 if (!ret && !count) { 210 if (!ret && !count) {
197 printk(KERN_CONT ".. no entries found .."); 211 printk(KERN_CONT ".. no entries found ..");
@@ -223,8 +237,12 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
223 int ret; 237 int ret;
224 238
225 /* start the tracing */ 239 /* start the tracing */
226 tr->ctrl = 1; 240 ret = trace->init(tr);
227 trace->init(tr); 241 if (ret) {
242 warn_failed_init_tracer(trace, ret);
243 return ret;
244 }
245
228 /* reset the max latency */ 246 /* reset the max latency */
229 tracing_max_latency = 0; 247 tracing_max_latency = 0;
230 /* disable interrupts for a bit */ 248 /* disable interrupts for a bit */
@@ -232,13 +250,13 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
232 udelay(100); 250 udelay(100);
233 local_irq_enable(); 251 local_irq_enable();
234 /* stop the tracing. */ 252 /* stop the tracing. */
235 tr->ctrl = 0; 253 tracing_stop();
236 trace->ctrl_update(tr);
237 /* check both trace buffers */ 254 /* check both trace buffers */
238 ret = trace_test_buffer(tr, NULL); 255 ret = trace_test_buffer(tr, NULL);
239 if (!ret) 256 if (!ret)
240 ret = trace_test_buffer(&max_tr, &count); 257 ret = trace_test_buffer(&max_tr, &count);
241 trace->reset(tr); 258 trace->reset(tr);
259 tracing_start();
242 260
243 if (!ret && !count) { 261 if (!ret && !count) {
244 printk(KERN_CONT ".. no entries found .."); 262 printk(KERN_CONT ".. no entries found ..");
@@ -259,9 +277,26 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
259 unsigned long count; 277 unsigned long count;
260 int ret; 278 int ret;
261 279
280 /*
281 * Now that the big kernel lock is no longer preemptable,
282 * and this is called with the BKL held, it will always
283 * fail. If preemption is already disabled, simply
284 * pass the test. When the BKL is removed, or becomes
285 * preemptible again, we will once again test this,
286 * so keep it in.
287 */
288 if (preempt_count()) {
289 printk(KERN_CONT "can not test ... force ");
290 return 0;
291 }
292
262 /* start the tracing */ 293 /* start the tracing */
263 tr->ctrl = 1; 294 ret = trace->init(tr);
264 trace->init(tr); 295 if (ret) {
296 warn_failed_init_tracer(trace, ret);
297 return ret;
298 }
299
265 /* reset the max latency */ 300 /* reset the max latency */
266 tracing_max_latency = 0; 301 tracing_max_latency = 0;
267 /* disable preemption for a bit */ 302 /* disable preemption for a bit */
@@ -269,13 +304,13 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
269 udelay(100); 304 udelay(100);
270 preempt_enable(); 305 preempt_enable();
271 /* stop the tracing. */ 306 /* stop the tracing. */
272 tr->ctrl = 0; 307 tracing_stop();
273 trace->ctrl_update(tr);
274 /* check both trace buffers */ 308 /* check both trace buffers */
275 ret = trace_test_buffer(tr, NULL); 309 ret = trace_test_buffer(tr, NULL);
276 if (!ret) 310 if (!ret)
277 ret = trace_test_buffer(&max_tr, &count); 311 ret = trace_test_buffer(&max_tr, &count);
278 trace->reset(tr); 312 trace->reset(tr);
313 tracing_start();
279 314
280 if (!ret && !count) { 315 if (!ret && !count) {
281 printk(KERN_CONT ".. no entries found .."); 316 printk(KERN_CONT ".. no entries found ..");
@@ -296,9 +331,25 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
296 unsigned long count; 331 unsigned long count;
297 int ret; 332 int ret;
298 333
334 /*
335 * Now that the big kernel lock is no longer preemptable,
336 * and this is called with the BKL held, it will always
337 * fail. If preemption is already disabled, simply
338 * pass the test. When the BKL is removed, or becomes
339 * preemptible again, we will once again test this,
340 * so keep it in.
341 */
342 if (preempt_count()) {
343 printk(KERN_CONT "can not test ... force ");
344 return 0;
345 }
346
299 /* start the tracing */ 347 /* start the tracing */
300 tr->ctrl = 1; 348 ret = trace->init(tr);
301 trace->init(tr); 349 if (ret) {
350 warn_failed_init_tracer(trace, ret);
351 goto out;
352 }
302 353
303 /* reset the max latency */ 354 /* reset the max latency */
304 tracing_max_latency = 0; 355 tracing_max_latency = 0;
@@ -312,27 +363,30 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
312 local_irq_enable(); 363 local_irq_enable();
313 364
314 /* stop the tracing. */ 365 /* stop the tracing. */
315 tr->ctrl = 0; 366 tracing_stop();
316 trace->ctrl_update(tr);
317 /* check both trace buffers */ 367 /* check both trace buffers */
318 ret = trace_test_buffer(tr, NULL); 368 ret = trace_test_buffer(tr, NULL);
319 if (ret) 369 if (ret) {
370 tracing_start();
320 goto out; 371 goto out;
372 }
321 373
322 ret = trace_test_buffer(&max_tr, &count); 374 ret = trace_test_buffer(&max_tr, &count);
323 if (ret) 375 if (ret) {
376 tracing_start();
324 goto out; 377 goto out;
378 }
325 379
326 if (!ret && !count) { 380 if (!ret && !count) {
327 printk(KERN_CONT ".. no entries found .."); 381 printk(KERN_CONT ".. no entries found ..");
328 ret = -1; 382 ret = -1;
383 tracing_start();
329 goto out; 384 goto out;
330 } 385 }
331 386
332 /* do the test by disabling interrupts first this time */ 387 /* do the test by disabling interrupts first this time */
333 tracing_max_latency = 0; 388 tracing_max_latency = 0;
334 tr->ctrl = 1; 389 tracing_start();
335 trace->ctrl_update(tr);
336 preempt_disable(); 390 preempt_disable();
337 local_irq_disable(); 391 local_irq_disable();
338 udelay(100); 392 udelay(100);
@@ -341,8 +395,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
341 local_irq_enable(); 395 local_irq_enable();
342 396
343 /* stop the tracing. */ 397 /* stop the tracing. */
344 tr->ctrl = 0; 398 tracing_stop();
345 trace->ctrl_update(tr);
346 /* check both trace buffers */ 399 /* check both trace buffers */
347 ret = trace_test_buffer(tr, NULL); 400 ret = trace_test_buffer(tr, NULL);
348 if (ret) 401 if (ret)
@@ -358,6 +411,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
358 411
359 out: 412 out:
360 trace->reset(tr); 413 trace->reset(tr);
414 tracing_start();
361 tracing_max_latency = save_max; 415 tracing_max_latency = save_max;
362 416
363 return ret; 417 return ret;
@@ -423,8 +477,12 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
423 wait_for_completion(&isrt); 477 wait_for_completion(&isrt);
424 478
425 /* start the tracing */ 479 /* start the tracing */
426 tr->ctrl = 1; 480 ret = trace->init(tr);
427 trace->init(tr); 481 if (ret) {
482 warn_failed_init_tracer(trace, ret);
483 return ret;
484 }
485
428 /* reset the max latency */ 486 /* reset the max latency */
429 tracing_max_latency = 0; 487 tracing_max_latency = 0;
430 488
@@ -448,8 +506,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
448 msleep(100); 506 msleep(100);
449 507
450 /* stop the tracing. */ 508 /* stop the tracing. */
451 tr->ctrl = 0; 509 tracing_stop();
452 trace->ctrl_update(tr);
453 /* check both trace buffers */ 510 /* check both trace buffers */
454 ret = trace_test_buffer(tr, NULL); 511 ret = trace_test_buffer(tr, NULL);
455 if (!ret) 512 if (!ret)
@@ -457,6 +514,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
457 514
458 515
459 trace->reset(tr); 516 trace->reset(tr);
517 tracing_start();
460 518
461 tracing_max_latency = save_max; 519 tracing_max_latency = save_max;
462 520
@@ -480,16 +538,20 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr
480 int ret; 538 int ret;
481 539
482 /* start the tracing */ 540 /* start the tracing */
483 tr->ctrl = 1; 541 ret = trace->init(tr);
484 trace->init(tr); 542 if (ret) {
543 warn_failed_init_tracer(trace, ret);
544 return ret;
545 }
546
485 /* Sleep for a 1/10 of a second */ 547 /* Sleep for a 1/10 of a second */
486 msleep(100); 548 msleep(100);
487 /* stop the tracing. */ 549 /* stop the tracing. */
488 tr->ctrl = 0; 550 tracing_stop();
489 trace->ctrl_update(tr);
490 /* check the trace buffer */ 551 /* check the trace buffer */
491 ret = trace_test_buffer(tr, &count); 552 ret = trace_test_buffer(tr, &count);
492 trace->reset(tr); 553 trace->reset(tr);
554 tracing_start();
493 555
494 if (!ret && !count) { 556 if (!ret && !count) {
495 printk(KERN_CONT ".. no entries found .."); 557 printk(KERN_CONT ".. no entries found ..");
@@ -508,17 +570,48 @@ trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr)
508 int ret; 570 int ret;
509 571
510 /* start the tracing */ 572 /* start the tracing */
511 tr->ctrl = 1; 573 ret = trace->init(tr);
512 trace->init(tr); 574 if (ret) {
575 warn_failed_init_tracer(trace, ret);
576 return 0;
577 }
578
513 /* Sleep for a 1/10 of a second */ 579 /* Sleep for a 1/10 of a second */
514 msleep(100); 580 msleep(100);
515 /* stop the tracing. */ 581 /* stop the tracing. */
516 tr->ctrl = 0; 582 tracing_stop();
517 trace->ctrl_update(tr);
518 /* check the trace buffer */ 583 /* check the trace buffer */
519 ret = trace_test_buffer(tr, &count); 584 ret = trace_test_buffer(tr, &count);
520 trace->reset(tr); 585 trace->reset(tr);
586 tracing_start();
521 587
522 return ret; 588 return ret;
523} 589}
524#endif /* CONFIG_SYSPROF_TRACER */ 590#endif /* CONFIG_SYSPROF_TRACER */
591
592#ifdef CONFIG_BRANCH_TRACER
593int
594trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
595{
596 unsigned long count;
597 int ret;
598
599 /* start the tracing */
600 ret = trace->init(tr);
601 if (ret) {
602 warn_failed_init_tracer(trace, ret);
603 return ret;
604 }
605
606 /* Sleep for a 1/10 of a second */
607 msleep(100);
608 /* stop the tracing. */
609 tracing_stop();
610 /* check the trace buffer */
611 ret = trace_test_buffer(tr, &count);
612 trace->reset(tr);
613 tracing_start();
614
615 return ret;
616}
617#endif /* CONFIG_BRANCH_TRACER */
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 3bdb44bde4b7..0b863f2cbc8e 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -48,7 +48,7 @@ static inline void check_stack(void)
48 if (!object_is_on_stack(&this_size)) 48 if (!object_is_on_stack(&this_size))
49 return; 49 return;
50 50
51 raw_local_irq_save(flags); 51 local_irq_save(flags);
52 __raw_spin_lock(&max_stack_lock); 52 __raw_spin_lock(&max_stack_lock);
53 53
54 /* a race could have already updated it */ 54 /* a race could have already updated it */
@@ -78,6 +78,7 @@ static inline void check_stack(void)
78 * on a new max, so it is far from a fast path. 78 * on a new max, so it is far from a fast path.
79 */ 79 */
80 while (i < max_stack_trace.nr_entries) { 80 while (i < max_stack_trace.nr_entries) {
81 int found = 0;
81 82
82 stack_dump_index[i] = this_size; 83 stack_dump_index[i] = this_size;
83 p = start; 84 p = start;
@@ -86,17 +87,19 @@ static inline void check_stack(void)
86 if (*p == stack_dump_trace[i]) { 87 if (*p == stack_dump_trace[i]) {
87 this_size = stack_dump_index[i++] = 88 this_size = stack_dump_index[i++] =
88 (top - p) * sizeof(unsigned long); 89 (top - p) * sizeof(unsigned long);
90 found = 1;
89 /* Start the search from here */ 91 /* Start the search from here */
90 start = p + 1; 92 start = p + 1;
91 } 93 }
92 } 94 }
93 95
94 i++; 96 if (!found)
97 i++;
95 } 98 }
96 99
97 out: 100 out:
98 __raw_spin_unlock(&max_stack_lock); 101 __raw_spin_unlock(&max_stack_lock);
99 raw_local_irq_restore(flags); 102 local_irq_restore(flags);
100} 103}
101 104
102static void 105static void
@@ -107,8 +110,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
107 if (unlikely(!ftrace_enabled || stack_trace_disabled)) 110 if (unlikely(!ftrace_enabled || stack_trace_disabled))
108 return; 111 return;
109 112
110 resched = need_resched(); 113 resched = ftrace_preempt_disable();
111 preempt_disable_notrace();
112 114
113 cpu = raw_smp_processor_id(); 115 cpu = raw_smp_processor_id();
114 /* no atomic needed, we only modify this variable by this cpu */ 116 /* no atomic needed, we only modify this variable by this cpu */
@@ -120,10 +122,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
120 out: 122 out:
121 per_cpu(trace_active, cpu)--; 123 per_cpu(trace_active, cpu)--;
122 /* prevent recursion in schedule */ 124 /* prevent recursion in schedule */
123 if (resched) 125 ftrace_preempt_enable(resched);
124 preempt_enable_no_resched_notrace();
125 else
126 preempt_enable_notrace();
127} 126}
128 127
129static struct ftrace_ops trace_ops __read_mostly = 128static struct ftrace_ops trace_ops __read_mostly =
@@ -166,11 +165,11 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
166 if (ret < 0) 165 if (ret < 0)
167 return ret; 166 return ret;
168 167
169 raw_local_irq_save(flags); 168 local_irq_save(flags);
170 __raw_spin_lock(&max_stack_lock); 169 __raw_spin_lock(&max_stack_lock);
171 *ptr = val; 170 *ptr = val;
172 __raw_spin_unlock(&max_stack_lock); 171 __raw_spin_unlock(&max_stack_lock);
173 raw_local_irq_restore(flags); 172 local_irq_restore(flags);
174 173
175 return count; 174 return count;
176} 175}
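
stack_trace_call() now uses the ftrace_preempt_disable()/ftrace_preempt_enable() pair instead of open-coding the need_resched()/preempt_disable_notrace() dance. Judging from the lines removed above, the pair is presumably defined along these lines (a sketch, not copied from the tree):

	/* remember whether a reschedule was already pending ... */
	static inline int ftrace_preempt_disable(void)
	{
		int resched;

		resched = need_resched();
		preempt_disable_notrace();
		return resched;
	}

	/*
	 * ... and re-enable preemption to match, so a tracing callback
	 * never triggers a schedule from inside the scheduler paths it
	 * is tracing.
	 */
	static inline void ftrace_preempt_enable(int resched)
	{
		if (resched)
			preempt_enable_no_resched_notrace();
		else
			preempt_enable_notrace();
	}
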
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index 9587d3bcba55..54960edb96d0 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -261,27 +261,17 @@ static void stop_stack_trace(struct trace_array *tr)
261 mutex_unlock(&sample_timer_lock); 261 mutex_unlock(&sample_timer_lock);
262} 262}
263 263
264static void stack_trace_init(struct trace_array *tr) 264static int stack_trace_init(struct trace_array *tr)
265{ 265{
266 sysprof_trace = tr; 266 sysprof_trace = tr;
267 267
268 if (tr->ctrl) 268 start_stack_trace(tr);
269 start_stack_trace(tr); 269 return 0;
270} 270}
271 271
272static void stack_trace_reset(struct trace_array *tr) 272static void stack_trace_reset(struct trace_array *tr)
273{ 273{
274 if (tr->ctrl) 274 stop_stack_trace(tr);
275 stop_stack_trace(tr);
276}
277
278static void stack_trace_ctrl_update(struct trace_array *tr)
279{
280 /* When starting a new trace, reset the buffers */
281 if (tr->ctrl)
282 start_stack_trace(tr);
283 else
284 stop_stack_trace(tr);
285} 275}
286 276
287static struct tracer stack_trace __read_mostly = 277static struct tracer stack_trace __read_mostly =
@@ -289,7 +279,6 @@ static struct tracer stack_trace __read_mostly =
289 .name = "sysprof", 279 .name = "sysprof",
290 .init = stack_trace_init, 280 .init = stack_trace_init,
291 .reset = stack_trace_reset, 281 .reset = stack_trace_reset,
292 .ctrl_update = stack_trace_ctrl_update,
293#ifdef CONFIG_FTRACE_SELFTEST 282#ifdef CONFIG_FTRACE_SELFTEST
294 .selftest = trace_selftest_startup_sysprof, 283 .selftest = trace_selftest_startup_sysprof,
295#endif 284#endif
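
With ->ctrl_update() gone, a tracer's lifetime is carried entirely by ->init() (start tracing, may fail) and ->reset() (stop and tear down), as the sysprof conversion above shows. The resulting callback shape, sketched with placeholder names (only the struct tracer fields are real):

	static struct trace_array *example_trace;

	static int example_trace_init(struct trace_array *tr)
	{
		example_trace = tr;
		start_example_trace(tr);	/* hypothetical start helper */
		return 0;			/* or a -errno on failure */
	}

	static void example_trace_reset(struct trace_array *tr)
	{
		stop_example_trace(tr);		/* hypothetical stop helper */
	}

	static struct tracer example_tracer __read_mostly =
	{
		.name	= "example",
		.init	= example_trace_init,
		.reset	= example_trace_reset,
	};
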
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index af8c85664882..79602740bbb5 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -43,6 +43,7 @@ static DEFINE_MUTEX(tracepoints_mutex);
43 */ 43 */
44#define TRACEPOINT_HASH_BITS 6 44#define TRACEPOINT_HASH_BITS 6
45#define TRACEPOINT_TABLE_SIZE (1 << TRACEPOINT_HASH_BITS) 45#define TRACEPOINT_TABLE_SIZE (1 << TRACEPOINT_HASH_BITS)
46static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE];
46 47
47/* 48/*
48 * Note about RCU : 49 * Note about RCU :
@@ -54,40 +55,43 @@ struct tracepoint_entry {
54 struct hlist_node hlist; 55 struct hlist_node hlist;
55 void **funcs; 56 void **funcs;
56 int refcount; /* Number of times armed. 0 if disarmed. */ 57 int refcount; /* Number of times armed. 0 if disarmed. */
57 struct rcu_head rcu;
58 void *oldptr;
59 unsigned char rcu_pending:1;
60 char name[0]; 58 char name[0];
61}; 59};
62 60
63static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE]; 61struct tp_probes {
62 union {
63 struct rcu_head rcu;
64 struct list_head list;
65 } u;
66 void *probes[0];
67};
64 68
65static void free_old_closure(struct rcu_head *head) 69static inline void *allocate_probes(int count)
66{ 70{
67 struct tracepoint_entry *entry = container_of(head, 71 struct tp_probes *p = kmalloc(count * sizeof(void *)
68 struct tracepoint_entry, rcu); 72 + sizeof(struct tp_probes), GFP_KERNEL);
69 kfree(entry->oldptr); 73 return p == NULL ? NULL : p->probes;
70 /* Make sure we free the data before setting the pending flag to 0 */
71 smp_wmb();
72 entry->rcu_pending = 0;
73} 74}
74 75
75static void tracepoint_entry_free_old(struct tracepoint_entry *entry, void *old) 76static void rcu_free_old_probes(struct rcu_head *head)
76{ 77{
77 if (!old) 78 kfree(container_of(head, struct tp_probes, u.rcu));
78 return; 79}
79 entry->oldptr = old; 80
80 entry->rcu_pending = 1; 81static inline void release_probes(void *old)
81 /* write rcu_pending before calling the RCU callback */ 82{
82 smp_wmb(); 83 if (old) {
83 call_rcu_sched(&entry->rcu, free_old_closure); 84 struct tp_probes *tp_probes = container_of(old,
85 struct tp_probes, probes[0]);
86 call_rcu_sched(&tp_probes->u.rcu, rcu_free_old_probes);
87 }
84} 88}
85 89
86static void debug_print_probes(struct tracepoint_entry *entry) 90static void debug_print_probes(struct tracepoint_entry *entry)
87{ 91{
88 int i; 92 int i;
89 93
90 if (!tracepoint_debug) 94 if (!tracepoint_debug || !entry->funcs)
91 return; 95 return;
92 96
93 for (i = 0; entry->funcs[i]; i++) 97 for (i = 0; entry->funcs[i]; i++)
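
The per-entry rcu_head/oldptr/rcu_pending bookkeeping is gone: the deferred-free state now lives in the probe array allocation itself. Callers only ever see &tp_probes->probes[0], so when an old array is retired the covering struct tp_probes (and its rcu_head) is recovered with container_of() and freed after an RCU-sched grace period. That ownership rule, restated as a sketch of what release_probes() above does:

	static void retire_probe_array(void *probes)
	{
		struct tp_probes *p;

		if (!probes)
			return;
		/* probes points at p->probes[0]; recover the allocation */
		p = container_of(probes, struct tp_probes, probes[0]);
		/* kfree() it only once all RCU-sched readers are done */
		call_rcu_sched(&p->u.rcu, rcu_free_old_probes);
	}
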
@@ -111,12 +115,13 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
111 return ERR_PTR(-EEXIST); 115 return ERR_PTR(-EEXIST);
112 } 116 }
113 /* + 2 : one for new probe, one for NULL func */ 117 /* + 2 : one for new probe, one for NULL func */
114 new = kzalloc((nr_probes + 2) * sizeof(void *), GFP_KERNEL); 118 new = allocate_probes(nr_probes + 2);
115 if (new == NULL) 119 if (new == NULL)
116 return ERR_PTR(-ENOMEM); 120 return ERR_PTR(-ENOMEM);
117 if (old) 121 if (old)
118 memcpy(new, old, nr_probes * sizeof(void *)); 122 memcpy(new, old, nr_probes * sizeof(void *));
119 new[nr_probes] = probe; 123 new[nr_probes] = probe;
124 new[nr_probes + 1] = NULL;
120 entry->refcount = nr_probes + 1; 125 entry->refcount = nr_probes + 1;
121 entry->funcs = new; 126 entry->funcs = new;
122 debug_print_probes(entry); 127 debug_print_probes(entry);
@@ -132,7 +137,7 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
132 old = entry->funcs; 137 old = entry->funcs;
133 138
134 if (!old) 139 if (!old)
135 return NULL; 140 return ERR_PTR(-ENOENT);
136 141
137 debug_print_probes(entry); 142 debug_print_probes(entry);
138 /* (N -> M), (N > 1, M >= 0) probes */ 143 /* (N -> M), (N > 1, M >= 0) probes */
@@ -151,13 +156,13 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
151 int j = 0; 156 int j = 0;
152 /* N -> M, (N > 1, M > 0) */ 157 /* N -> M, (N > 1, M > 0) */
153 /* + 1 for NULL */ 158 /* + 1 for NULL */
154 new = kzalloc((nr_probes - nr_del + 1) 159 new = allocate_probes(nr_probes - nr_del + 1);
155 * sizeof(void *), GFP_KERNEL);
156 if (new == NULL) 160 if (new == NULL)
157 return ERR_PTR(-ENOMEM); 161 return ERR_PTR(-ENOMEM);
158 for (i = 0; old[i]; i++) 162 for (i = 0; old[i]; i++)
159 if ((probe && old[i] != probe)) 163 if ((probe && old[i] != probe))
160 new[j++] = old[i]; 164 new[j++] = old[i];
165 new[nr_probes - nr_del] = NULL;
161 entry->refcount = nr_probes - nr_del; 166 entry->refcount = nr_probes - nr_del;
162 entry->funcs = new; 167 entry->funcs = new;
163 } 168 }
@@ -215,7 +220,6 @@ static struct tracepoint_entry *add_tracepoint(const char *name)
215 memcpy(&e->name[0], name, name_len); 220 memcpy(&e->name[0], name, name_len);
216 e->funcs = NULL; 221 e->funcs = NULL;
217 e->refcount = 0; 222 e->refcount = 0;
218 e->rcu_pending = 0;
219 hlist_add_head(&e->hlist, head); 223 hlist_add_head(&e->hlist, head);
220 return e; 224 return e;
221} 225}
@@ -224,32 +228,10 @@ static struct tracepoint_entry *add_tracepoint(const char *name)
224 * Remove the tracepoint from the tracepoint hash table. Must be called with 228 * Remove the tracepoint from the tracepoint hash table. Must be called with
225 * mutex_lock held. 229 * mutex_lock held.
226 */ 230 */
227static int remove_tracepoint(const char *name) 231static inline void remove_tracepoint(struct tracepoint_entry *e)
228{ 232{
229 struct hlist_head *head;
230 struct hlist_node *node;
231 struct tracepoint_entry *e;
232 int found = 0;
233 size_t len = strlen(name) + 1;
234 u32 hash = jhash(name, len-1, 0);
235
236 head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)];
237 hlist_for_each_entry(e, node, head, hlist) {
238 if (!strcmp(name, e->name)) {
239 found = 1;
240 break;
241 }
242 }
243 if (!found)
244 return -ENOENT;
245 if (e->refcount)
246 return -EBUSY;
247 hlist_del(&e->hlist); 233 hlist_del(&e->hlist);
248 /* Make sure the call_rcu_sched has been executed */
249 if (e->rcu_pending)
250 rcu_barrier_sched();
251 kfree(e); 234 kfree(e);
252 return 0;
253} 235}
254 236
255/* 237/*
@@ -280,6 +262,7 @@ static void set_tracepoint(struct tracepoint_entry **entry,
280static void disable_tracepoint(struct tracepoint *elem) 262static void disable_tracepoint(struct tracepoint *elem)
281{ 263{
282 elem->state = 0; 264 elem->state = 0;
265 rcu_assign_pointer(elem->funcs, NULL);
283} 266}
284 267
285/** 268/**
@@ -320,6 +303,23 @@ static void tracepoint_update_probes(void)
320 module_update_tracepoints(); 303 module_update_tracepoints();
321} 304}
322 305
306static void *tracepoint_add_probe(const char *name, void *probe)
307{
308 struct tracepoint_entry *entry;
309 void *old;
310
311 entry = get_tracepoint(name);
312 if (!entry) {
313 entry = add_tracepoint(name);
314 if (IS_ERR(entry))
315 return entry;
316 }
317 old = tracepoint_entry_add_probe(entry, probe);
318 if (IS_ERR(old) && !entry->refcount)
319 remove_tracepoint(entry);
320 return old;
321}
322
323/** 323/**
324 * tracepoint_probe_register - Connect a probe to a tracepoint 324 * tracepoint_probe_register - Connect a probe to a tracepoint
325 * @name: tracepoint name 325 * @name: tracepoint name
@@ -330,44 +330,36 @@ static void tracepoint_update_probes(void)
330 */ 330 */
331int tracepoint_probe_register(const char *name, void *probe) 331int tracepoint_probe_register(const char *name, void *probe)
332{ 332{
333 struct tracepoint_entry *entry;
334 int ret = 0;
335 void *old; 333 void *old;
336 334
337 mutex_lock(&tracepoints_mutex); 335 mutex_lock(&tracepoints_mutex);
338 entry = get_tracepoint(name); 336 old = tracepoint_add_probe(name, probe);
339 if (!entry) {
340 entry = add_tracepoint(name);
341 if (IS_ERR(entry)) {
342 ret = PTR_ERR(entry);
343 goto end;
344 }
345 }
346 /*
347 * If we detect that a call_rcu_sched is pending for this tracepoint,
348 * make sure it's executed now.
349 */
350 if (entry->rcu_pending)
351 rcu_barrier_sched();
352 old = tracepoint_entry_add_probe(entry, probe);
353 if (IS_ERR(old)) {
354 ret = PTR_ERR(old);
355 goto end;
356 }
357 mutex_unlock(&tracepoints_mutex); 337 mutex_unlock(&tracepoints_mutex);
338 if (IS_ERR(old))
339 return PTR_ERR(old);
340
358 tracepoint_update_probes(); /* may update entry */ 341 tracepoint_update_probes(); /* may update entry */
359 mutex_lock(&tracepoints_mutex); 342 release_probes(old);
360 entry = get_tracepoint(name); 343 return 0;
361 WARN_ON(!entry);
362 if (entry->rcu_pending)
363 rcu_barrier_sched();
364 tracepoint_entry_free_old(entry, old);
365end:
366 mutex_unlock(&tracepoints_mutex);
367 return ret;
368} 344}
369EXPORT_SYMBOL_GPL(tracepoint_probe_register); 345EXPORT_SYMBOL_GPL(tracepoint_probe_register);
370 346
347static void *tracepoint_remove_probe(const char *name, void *probe)
348{
349 struct tracepoint_entry *entry;
350 void *old;
351
352 entry = get_tracepoint(name);
353 if (!entry)
354 return ERR_PTR(-ENOENT);
355 old = tracepoint_entry_remove_probe(entry, probe);
356 if (IS_ERR(old))
357 return old;
358 if (!entry->refcount)
359 remove_tracepoint(entry);
360 return old;
361}
362
371/** 363/**
372 * tracepoint_probe_unregister - Disconnect a probe from a tracepoint 364 * tracepoint_probe_unregister - Disconnect a probe from a tracepoint
373 * @name: tracepoint name 365 * @name: tracepoint name
@@ -380,38 +372,104 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register);
380 */ 372 */
381int tracepoint_probe_unregister(const char *name, void *probe) 373int tracepoint_probe_unregister(const char *name, void *probe)
382{ 374{
383 struct tracepoint_entry *entry;
384 void *old; 375 void *old;
385 int ret = -ENOENT;
386 376
387 mutex_lock(&tracepoints_mutex); 377 mutex_lock(&tracepoints_mutex);
388 entry = get_tracepoint(name); 378 old = tracepoint_remove_probe(name, probe);
389 if (!entry)
390 goto end;
391 if (entry->rcu_pending)
392 rcu_barrier_sched();
393 old = tracepoint_entry_remove_probe(entry, probe);
394 if (!old) {
395 printk(KERN_WARNING "Warning: Trying to unregister a probe"
396 "that doesn't exist\n");
397 goto end;
398 }
399 mutex_unlock(&tracepoints_mutex); 379 mutex_unlock(&tracepoints_mutex);
380 if (IS_ERR(old))
381 return PTR_ERR(old);
382
400 tracepoint_update_probes(); /* may update entry */ 383 tracepoint_update_probes(); /* may update entry */
384 release_probes(old);
385 return 0;
386}
387EXPORT_SYMBOL_GPL(tracepoint_probe_unregister);
388
389static LIST_HEAD(old_probes);
390static int need_update;
391
392static void tracepoint_add_old_probes(void *old)
393{
394 need_update = 1;
395 if (old) {
396 struct tp_probes *tp_probes = container_of(old,
397 struct tp_probes, probes[0]);
398 list_add(&tp_probes->u.list, &old_probes);
399 }
400}
401
402/**
403 * tracepoint_probe_register_noupdate - register a probe but not connect
404 * @name: tracepoint name
405 * @probe: probe handler
406 *
407 * caller must call tracepoint_probe_update_all()
408 */
409int tracepoint_probe_register_noupdate(const char *name, void *probe)
410{
411 void *old;
412
401 mutex_lock(&tracepoints_mutex); 413 mutex_lock(&tracepoints_mutex);
402 entry = get_tracepoint(name); 414 old = tracepoint_add_probe(name, probe);
403 if (!entry) 415 if (IS_ERR(old)) {
404 goto end; 416 mutex_unlock(&tracepoints_mutex);
405 if (entry->rcu_pending) 417 return PTR_ERR(old);
406 rcu_barrier_sched(); 418 }
407 tracepoint_entry_free_old(entry, old); 419 tracepoint_add_old_probes(old);
408 remove_tracepoint(name); /* Ignore busy error message */
409 ret = 0;
410end:
411 mutex_unlock(&tracepoints_mutex); 420 mutex_unlock(&tracepoints_mutex);
412 return ret; 421 return 0;
413} 422}
414EXPORT_SYMBOL_GPL(tracepoint_probe_unregister); 423EXPORT_SYMBOL_GPL(tracepoint_probe_register_noupdate);
424
425/**
426 * tracepoint_probe_unregister_noupdate - remove a probe but not disconnect
427 * @name: tracepoint name
428 * @probe: probe function pointer
429 *
430 * caller must call tracepoint_probe_update_all()
431 */
432int tracepoint_probe_unregister_noupdate(const char *name, void *probe)
433{
434 void *old;
435
436 mutex_lock(&tracepoints_mutex);
437 old = tracepoint_remove_probe(name, probe);
438 if (IS_ERR(old)) {
439 mutex_unlock(&tracepoints_mutex);
440 return PTR_ERR(old);
441 }
442 tracepoint_add_old_probes(old);
443 mutex_unlock(&tracepoints_mutex);
444 return 0;
445}
446EXPORT_SYMBOL_GPL(tracepoint_probe_unregister_noupdate);
447
448/**
449 * tracepoint_probe_update_all - update tracepoints
450 */
451void tracepoint_probe_update_all(void)
452{
453 LIST_HEAD(release_probes);
454 struct tp_probes *pos, *next;
455
456 mutex_lock(&tracepoints_mutex);
457 if (!need_update) {
458 mutex_unlock(&tracepoints_mutex);
459 return;
460 }
461 if (!list_empty(&old_probes))
462 list_replace_init(&old_probes, &release_probes);
463 need_update = 0;
464 mutex_unlock(&tracepoints_mutex);
465
466 tracepoint_update_probes();
467 list_for_each_entry_safe(pos, next, &release_probes, u.list) {
468 list_del(&pos->u.list);
469 call_rcu_sched(&pos->u.rcu, rcu_free_old_probes);
470 }
471}
472EXPORT_SYMBOL_GPL(tracepoint_probe_update_all);
415 473
416/** 474/**
417 * tracepoint_get_iter_range - Get a next tracepoint iterator given a range. 475 * tracepoint_get_iter_range - Get a next tracepoint iterator given a range.
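
The _noupdate variants let a caller batch many probe changes: each call only queues the displaced array on old_probes, and a single tracepoint_probe_update_all() then repatches the tracepoint sites and releases everything after one RCU-sched grace period. A hedged usage sketch; the probe table and probe functions are hypothetical:

	#include <linux/kernel.h>
	#include <linux/tracepoint.h>

	struct probe_desc {
		const char *name;
		void *probe;
	};

	/* hypothetical probes, signatures matching their tracepoints */
	extern void my_subsys_probe(struct inode *inode, struct file *file);
	extern void my_subsys_b_probe(void);

	static struct probe_desc probe_table[] = {
		{ "subsys_event",  (void *)my_subsys_probe },
		{ "subsys_eventb", (void *)my_subsys_b_probe },
	};

	static int register_all_probes(void)
	{
		int i, ret;

		for (i = 0; i < ARRAY_SIZE(probe_table); i++) {
			ret = tracepoint_probe_register_noupdate(
					probe_table[i].name,
					probe_table[i].probe);
			if (ret)
				return ret;
		}
		/* one site update + one batched release for the whole table */
		tracepoint_probe_update_all();
		return 0;
	}
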
@@ -483,3 +541,36 @@ void tracepoint_iter_reset(struct tracepoint_iter *iter)
483 iter->tracepoint = NULL; 541 iter->tracepoint = NULL;
484} 542}
485EXPORT_SYMBOL_GPL(tracepoint_iter_reset); 543EXPORT_SYMBOL_GPL(tracepoint_iter_reset);
544
545#ifdef CONFIG_MODULES
546
547int tracepoint_module_notify(struct notifier_block *self,
548 unsigned long val, void *data)
549{
550 struct module *mod = data;
551
552 switch (val) {
553 case MODULE_STATE_COMING:
554 tracepoint_update_probe_range(mod->tracepoints,
555 mod->tracepoints + mod->num_tracepoints);
556 break;
557 case MODULE_STATE_GOING:
558 tracepoint_update_probe_range(mod->tracepoints,
559 mod->tracepoints + mod->num_tracepoints);
560 break;
561 }
562 return 0;
563}
564
565struct notifier_block tracepoint_module_nb = {
566 .notifier_call = tracepoint_module_notify,
567 .priority = 0,
568};
569
570static int init_tracepoints(void)
571{
572 return register_module_notifier(&tracepoint_module_nb);
573}
574__initcall(init_tracepoints);
575
576#endif /* CONFIG_MODULES */
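
init_tracepoints() wires tracepoints into the module notifier chain so that a module's tracepoint sites are patched when it comes up (MODULE_STATE_COMING) and unpatched as it goes away (MODULE_STATE_GOING). The same register_module_notifier() pattern works for any subsystem that keeps per-module tables; a generic sketch with illustrative names:

	#include <linux/module.h>
	#include <linux/notifier.h>

	static int example_module_notify(struct notifier_block *self,
					 unsigned long val, void *data)
	{
		struct module *mod = data;

		switch (val) {
		case MODULE_STATE_COMING:
			/* module sections are mapped: scan its tables here */
			pr_debug("module %s coming\n", mod->name);
			break;
		case MODULE_STATE_GOING:
			/* module is about to unload: drop references to it */
			pr_debug("module %s going\n", mod->name);
			break;
		}
		return 0;
	}

	static struct notifier_block example_module_nb = {
		.notifier_call	= example_module_notify,
		.priority	= 0,
	};

	static int __init example_module_watch_init(void)
	{
		return register_module_notifier(&example_module_nb);
	}
	__initcall(example_module_watch_init);
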
diff --git a/kernel/user.c b/kernel/user.c
index 39d6159fae43..cec2224bc9f5 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -101,6 +101,8 @@ static int sched_create_user(struct user_struct *up)
101 if (IS_ERR(up->tg)) 101 if (IS_ERR(up->tg))
102 rc = -ENOMEM; 102 rc = -ENOMEM;
103 103
104 set_tg_uid(up);
105
104 return rc; 106 return rc;
105} 107}
106 108
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index b0f239e443bc..1e3fd3e3436a 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -545,6 +545,16 @@ config DEBUG_SG
545 545
546 If unsure, say N. 546 If unsure, say N.
547 547
548config DEBUG_NOTIFIERS
549 bool "Debug notifier call chains"
550 depends on DEBUG_KERNEL
551 help
552 Enable this to turn on sanity checking for notifier call chains.
553 This is most useful for kernel developers to make sure that
554 modules properly unregister themselves from notifier chains.
555 This is a relatively cheap check but if you care about maximum
556 performance, say N.
557
548config FRAME_POINTER 558config FRAME_POINTER
549 bool "Compile the kernel with frame pointers" 559 bool "Compile the kernel with frame pointers"
550 depends on DEBUG_KERNEL && \ 560 depends on DEBUG_KERNEL && \
diff --git a/mm/bounce.c b/mm/bounce.c
index 06722c403058..bf0cf7c8387b 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -14,6 +14,7 @@
14#include <linux/hash.h> 14#include <linux/hash.h>
15#include <linux/highmem.h> 15#include <linux/highmem.h>
16#include <linux/blktrace_api.h> 16#include <linux/blktrace_api.h>
17#include <trace/block.h>
17#include <asm/tlbflush.h> 18#include <asm/tlbflush.h>
18 19
19#define POOL_SIZE 64 20#define POOL_SIZE 64
@@ -21,6 +22,8 @@
21 22
22static mempool_t *page_pool, *isa_page_pool; 23static mempool_t *page_pool, *isa_page_pool;
23 24
25DEFINE_TRACE(block_bio_bounce);
26
24#ifdef CONFIG_HIGHMEM 27#ifdef CONFIG_HIGHMEM
25static __init int init_emergency_pool(void) 28static __init int init_emergency_pool(void)
26{ 29{
@@ -222,7 +225,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
222 if (!bio) 225 if (!bio)
223 return; 226 return;
224 227
225 blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE); 228 trace_block_bio_bounce(q, *bio_orig);
226 229
227 /* 230 /*
228 * at least one page was bounced, fill in possible non-highmem 231 * at least one page was bounced, fill in possible non-highmem
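
The bounce path switches from the blktrace-specific blk_add_trace_bio() hook to a plain tracepoint: the event is declared once in a header (trace/block.h here), instantiated with DEFINE_TRACE() in the file that fires it, and invoked as trace_block_bio_bounce(q, *bio_orig) at the call site. The same three steps apply to any new event; a sketch with a hypothetical event name:

	/* in a shared header, e.g. trace/mysubsys.h (hypothetical) */
	#include <linux/blkdev.h>
	#include <linux/tracepoint.h>

	DECLARE_TRACE(mysubsys_io_bounce,
		TPPROTO(struct request_queue *q, struct bio *bio),
		TPARGS(q, bio));

	/* in exactly one .c file of the subsystem */
	DEFINE_TRACE(mysubsys_io_bounce);

	/* at the instrumented call site; cheap while no probe is attached */
	trace_mysubsys_io_bounce(q, bio);
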
diff --git a/mm/memory.c b/mm/memory.c
index 164951c47305..fc031d68327e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3049,3 +3049,18 @@ void print_vma_addr(char *prefix, unsigned long ip)
3049 } 3049 }
3050 up_read(&current->mm->mmap_sem); 3050 up_read(&current->mm->mmap_sem);
3051} 3051}
3052
3053#ifdef CONFIG_PROVE_LOCKING
3054void might_fault(void)
3055{
3056 might_sleep();
3057 /*
3058 * it would be nicer only to annotate paths which are not under
3059 * pagefault_disable, however that requires a larger audit and
3060 * providing helpers like get_user_atomic.
3061 */
3062 if (!in_atomic() && current->mm)
3063 might_lock_read(&current->mm->mmap_sem);
3064}
3065EXPORT_SYMBOL(might_fault);
3066#endif
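
might_fault() lets lockdep (under CONFIG_PROVE_LOCKING) see that a user-access helper may end up taking mmap_sem, on top of the existing might_sleep() debugging. It is meant to be called at the top of copy_to_user()/copy_from_user()-style paths; a hedged sketch of such a caller:

	/* sketch only: a user-copy wrapper annotating that it may fault */
	static inline unsigned long
	example_copy_from_user(void *dst, const void __user *src,
			       unsigned long len)
	{
		might_fault();	/* may sleep and may take mmap_sem */
		return __copy_from_user(dst, src, len);
	}
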
diff --git a/samples/tracepoints/tp-samples-trace.h b/samples/tracepoints/tp-samples-trace.h
index 0216b55bd640..01724e04c556 100644
--- a/samples/tracepoints/tp-samples-trace.h
+++ b/samples/tracepoints/tp-samples-trace.h
@@ -4,10 +4,10 @@
4#include <linux/proc_fs.h> /* for struct inode and struct file */ 4#include <linux/proc_fs.h> /* for struct inode and struct file */
5#include <linux/tracepoint.h> 5#include <linux/tracepoint.h>
6 6
7DEFINE_TRACE(subsys_event, 7DECLARE_TRACE(subsys_event,
8 TPPROTO(struct inode *inode, struct file *file), 8 TPPROTO(struct inode *inode, struct file *file),
9 TPARGS(inode, file)); 9 TPARGS(inode, file));
10DEFINE_TRACE(subsys_eventb, 10DECLARE_TRACE(subsys_eventb,
11 TPPROTO(void), 11 TPPROTO(void),
12 TPARGS()); 12 TPARGS());
13#endif 13#endif
diff --git a/samples/tracepoints/tracepoint-probe-sample.c b/samples/tracepoints/tracepoint-probe-sample.c
index 55abfdda4bd4..e3a964889dc7 100644
--- a/samples/tracepoints/tracepoint-probe-sample.c
+++ b/samples/tracepoints/tracepoint-probe-sample.c
@@ -46,6 +46,7 @@ void __exit tp_sample_trace_exit(void)
46{ 46{
47 unregister_trace_subsys_eventb(probe_subsys_eventb); 47 unregister_trace_subsys_eventb(probe_subsys_eventb);
48 unregister_trace_subsys_event(probe_subsys_event); 48 unregister_trace_subsys_event(probe_subsys_event);
49 tracepoint_synchronize_unregister();
49} 50}
50 51
51module_exit(tp_sample_trace_exit); 52module_exit(tp_sample_trace_exit);
diff --git a/samples/tracepoints/tracepoint-probe-sample2.c b/samples/tracepoints/tracepoint-probe-sample2.c
index 5e9fcf4afffe..685a5acb4562 100644
--- a/samples/tracepoints/tracepoint-probe-sample2.c
+++ b/samples/tracepoints/tracepoint-probe-sample2.c
@@ -33,6 +33,7 @@ module_init(tp_sample_trace_init);
33void __exit tp_sample_trace_exit(void) 33void __exit tp_sample_trace_exit(void)
34{ 34{
35 unregister_trace_subsys_event(probe_subsys_event); 35 unregister_trace_subsys_event(probe_subsys_event);
36 tracepoint_synchronize_unregister();
36} 37}
37 38
38module_exit(tp_sample_trace_exit); 39module_exit(tp_sample_trace_exit);
diff --git a/samples/tracepoints/tracepoint-sample.c b/samples/tracepoints/tracepoint-sample.c
index 4ae4b7fcc043..00d169792a3e 100644
--- a/samples/tracepoints/tracepoint-sample.c
+++ b/samples/tracepoints/tracepoint-sample.c
@@ -13,6 +13,9 @@
13#include <linux/proc_fs.h> 13#include <linux/proc_fs.h>
14#include "tp-samples-trace.h" 14#include "tp-samples-trace.h"
15 15
16DEFINE_TRACE(subsys_event);
17DEFINE_TRACE(subsys_eventb);
18
16struct proc_dir_entry *pentry_example; 19struct proc_dir_entry *pentry_example;
17 20
18static int my_open(struct inode *inode, struct file *file) 21static int my_open(struct inode *inode, struct file *file)
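
Taken together, the sample updates show the new split: the shared header only DECLARE_TRACE()s the events, exactly one provider file DEFINE_TRACE()s them, and probe modules must call tracepoint_synchronize_unregister() after unregistering so no probe is still running when their text unloads. A condensed sketch of the probe-module side, reusing the sample's names:

	#include <linux/module.h>
	#include "tp-samples-trace.h"

	static void probe_subsys_event(struct inode *inode, struct file *file)
	{
		/* probe body: keep it cheap, it runs with preemption off */
	}

	static int __init tp_probe_init(void)
	{
		return register_trace_subsys_event(probe_subsys_event);
	}
	module_init(tp_probe_init);

	static void __exit tp_probe_exit(void)
	{
		unregister_trace_subsys_event(probe_subsys_event);
		/* wait for in-flight probe calls before this text goes away */
		tracepoint_synchronize_unregister();
	}
	module_exit(tp_probe_exit);

	MODULE_LICENSE("GPL");
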
diff --git a/scripts/Makefile.build b/scripts/Makefile.build
index 468fbc9016c7..7a176773af85 100644
--- a/scripts/Makefile.build
+++ b/scripts/Makefile.build
@@ -198,16 +198,10 @@ cmd_modversions = \
198 fi; 198 fi;
199endif 199endif
200 200
201ifdef CONFIG_64BIT
202arch_bits = 64
203else
204arch_bits = 32
205endif
206
207ifdef CONFIG_FTRACE_MCOUNT_RECORD 201ifdef CONFIG_FTRACE_MCOUNT_RECORD
208cmd_record_mcount = perl $(srctree)/scripts/recordmcount.pl \ 202cmd_record_mcount = perl $(srctree)/scripts/recordmcount.pl "$(ARCH)" \
209 "$(ARCH)" "$(arch_bits)" "$(OBJDUMP)" "$(OBJCOPY)" "$(CC)" "$(LD)" \ 203 "$(if $(CONFIG_64BIT),64,32)" \
210 "$(NM)" "$(RM)" "$(MV)" "$(@)"; 204 "$(OBJDUMP)" "$(OBJCOPY)" "$(CC)" "$(LD)" "$(NM)" "$(RM)" "$(MV)" "$(@)";
211endif 205endif
212 206
213define rule_cc_o_c 207define rule_cc_o_c
diff --git a/scripts/bootgraph.pl b/scripts/bootgraph.pl
index d2c61efc216f..f0af9aa9b243 100644
--- a/scripts/bootgraph.pl
+++ b/scripts/bootgraph.pl
@@ -78,11 +78,13 @@ while (<>) {
78} 78}
79 79
80if ($count == 0) { 80if ($count == 0) {
81 print "No data found in the dmesg. Make sure that 'printk.time=1' and\n"; 81 print STDERR <<END;
82 print "'initcall_debug' are passed on the kernel command line.\n\n"; 82No data found in the dmesg. Make sure that 'printk.time=1' and
83 print "Usage: \n"; 83'initcall_debug' are passed on the kernel command line.
84 print " dmesg | perl scripts/bootgraph.pl > output.svg\n\n"; 84Usage:
85 exit; 85 dmesg | perl scripts/bootgraph.pl > output.svg
86END
87 exit 1;
86} 88}
87 89
88print "<?xml version=\"1.0\" standalone=\"no\"?> \n"; 90print "<?xml version=\"1.0\" standalone=\"no\"?> \n";
@@ -109,8 +111,8 @@ my $stylecounter = 0;
109my %rows; 111my %rows;
110my $rowscount = 1; 112my $rowscount = 1;
111my @initcalls = sort { $start{$a} <=> $start{$b} } keys(%start); 113my @initcalls = sort { $start{$a} <=> $start{$b} } keys(%start);
112my $key; 114
113foreach $key (@initcalls) { 115foreach my $key (@initcalls) {
114 my $duration = $end{$key} - $start{$key}; 116 my $duration = $end{$key} - $start{$key};
115 117
116 if ($duration >= $threshold) { 118 if ($duration >= $threshold) {
diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl
index 6b9fe3eb8360..0b1dc9f9bb06 100755
--- a/scripts/recordmcount.pl
+++ b/scripts/recordmcount.pl
@@ -112,6 +112,8 @@ my ($arch, $bits, $objdump, $objcopy, $cc,
112# Acceptable sections to record. 112# Acceptable sections to record.
113my %text_sections = ( 113my %text_sections = (
114 ".text" => 1, 114 ".text" => 1,
115 ".sched.text" => 1,
116 ".spinlock.text" => 1,
115); 117);
116 118
117$objdump = "objdump" if ((length $objdump) == 0); 119$objdump = "objdump" if ((length $objdump) == 0);
@@ -130,10 +132,13 @@ my %weak; # List of weak functions
130my %convert; # List of local functions used that needs conversion 132my %convert; # List of local functions used that needs conversion
131 133
132my $type; 134my $type;
135my $nm_regex; # Find the local functions (return function)
133my $section_regex; # Find the start of a section 136my $section_regex; # Find the start of a section
134my $function_regex; # Find the name of a function 137my $function_regex; # Find the name of a function
135 # (return offset and func name) 138 # (return offset and func name)
136my $mcount_regex; # Find the call site to mcount (return offset) 139my $mcount_regex; # Find the call site to mcount (return offset)
140my $alignment; # The .align value to use for $mcount_section
141my $section_type; # Section header plus possible alignment command
137 142
138if ($arch eq "x86") { 143if ($arch eq "x86") {
139 if ($bits == 64) { 144 if ($bits == 64) {
@@ -143,11 +148,21 @@ if ($arch eq "x86") {
143 } 148 }
144} 149}
145 150
151#
152# We base the defaults off of i386, the other archs may
153# feel free to change them in the below if statements.
154#
155$nm_regex = "^[0-9a-fA-F]+\\s+t\\s+(\\S+)";
156$section_regex = "Disassembly of section\\s+(\\S+):";
157$function_regex = "^([0-9a-fA-F]+)\\s+<(.*?)>:";
158$mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\smcount\$";
159$section_type = '@progbits';
160$type = ".long";
161
146if ($arch eq "x86_64") { 162if ($arch eq "x86_64") {
147 $section_regex = "Disassembly of section\\s+(\\S+):";
148 $function_regex = "^([0-9a-fA-F]+)\\s+<(.*?)>:";
149 $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\smcount([+-]0x[0-9a-zA-Z]+)?\$"; 163 $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\smcount([+-]0x[0-9a-zA-Z]+)?\$";
150 $type = ".quad"; 164 $type = ".quad";
165 $alignment = 8;
151 166
152 # force flags for this arch 167 # force flags for this arch
153 $ld .= " -m elf_x86_64"; 168 $ld .= " -m elf_x86_64";
@@ -156,10 +171,7 @@ if ($arch eq "x86_64") {
156 $cc .= " -m64"; 171 $cc .= " -m64";
157 172
158} elsif ($arch eq "i386") { 173} elsif ($arch eq "i386") {
159 $section_regex = "Disassembly of section\\s+(\\S+):"; 174 $alignment = 4;
160 $function_regex = "^([0-9a-fA-F]+)\\s+<(.*?)>:";
161 $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\smcount\$";
162 $type = ".long";
163 175
164 # force flags for this arch 176 # force flags for this arch
165 $ld .= " -m elf_i386"; 177 $ld .= " -m elf_i386";
@@ -167,6 +179,27 @@ if ($arch eq "x86_64") {
167 $objcopy .= " -O elf32-i386"; 179 $objcopy .= " -O elf32-i386";
168 $cc .= " -m32"; 180 $cc .= " -m32";
169 181
182} elsif ($arch eq "sh") {
183 $alignment = 2;
184
185 # force flags for this arch
186 $ld .= " -m shlelf_linux";
187 $objcopy .= " -O elf32-sh-linux";
188 $cc .= " -m32";
189
190} elsif ($arch eq "powerpc") {
191 $nm_regex = "^[0-9a-fA-F]+\\s+t\\s+(\\.?\\S+)";
192 $function_regex = "^([0-9a-fA-F]+)\\s+<(\\.?.*?)>:";
193 $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\s\\.?_mcount\$";
194
195 if ($bits == 64) {
196 $type = ".quad";
197 }
198
199} elsif ($arch eq "arm") {
200 $alignment = 2;
201 $section_type = '%progbits';
202
170} else { 203} else {
171 die "Arch $arch is not supported with CONFIG_FTRACE_MCOUNT_RECORD"; 204 die "Arch $arch is not supported with CONFIG_FTRACE_MCOUNT_RECORD";
172} 205}
@@ -236,7 +269,7 @@ if (!$found_version) {
236# 269#
237open (IN, "$nm $inputfile|") || die "error running $nm"; 270open (IN, "$nm $inputfile|") || die "error running $nm";
238while (<IN>) { 271while (<IN>) {
239 if (/^[0-9a-fA-F]+\s+t\s+(\S+)/) { 272 if (/$nm_regex/) {
240 $locals{$1} = 1; 273 $locals{$1} = 1;
241 } elsif (/^[0-9a-fA-F]+\s+([wW])\s+(\S+)/) { 274 } elsif (/^[0-9a-fA-F]+\s+([wW])\s+(\S+)/) {
242 $weak{$2} = $1; 275 $weak{$2} = $1;
@@ -287,7 +320,8 @@ sub update_funcs
287 if (!$opened) { 320 if (!$opened) {
288 open(FILE, ">$mcount_s") || die "can't create $mcount_s\n"; 321 open(FILE, ">$mcount_s") || die "can't create $mcount_s\n";
289 $opened = 1; 322 $opened = 1;
290 print FILE "\t.section $mcount_section,\"a\",\@progbits\n"; 323 print FILE "\t.section $mcount_section,\"a\",$section_type\n";
324 print FILE "\t.align $alignment\n" if (defined($alignment));
291 } 325 }
292 printf FILE "\t%s %s + %d\n", $type, $ref_func, $offsets[$i] - $offset; 326 printf FILE "\t%s %s + %d\n", $type, $ref_func, $offsets[$i] - $offset;
293 } 327 }
diff --git a/scripts/trace/power.pl b/scripts/trace/power.pl
new file mode 100644
index 000000000000..4f729b3501e0
--- /dev/null
+++ b/scripts/trace/power.pl
@@ -0,0 +1,108 @@
1#!/usr/bin/perl
2
3# Copyright 2008, Intel Corporation
4#
5# This file is part of the Linux kernel
6#
7# This program file is free software; you can redistribute it and/or modify it
8# under the terms of the GNU General Public License as published by the
9# Free Software Foundation; version 2 of the License.
10#
11# This program is distributed in the hope that it will be useful, but WITHOUT
12# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14# for more details.
15#
16# You should have received a copy of the GNU General Public License
17# along with this program in a file named COPYING; if not, write to the
18# Free Software Foundation, Inc.,
19# 51 Franklin Street, Fifth Floor,
20# Boston, MA 02110-1301 USA
21#
22# Authors:
23# Arjan van de Ven <arjan@linux.intel.com>
24
25
26#
27# This script turns a cstate ftrace output into a SVG graphic that shows
28# historic C-state information
29#
30#
31# cat /sys/kernel/debug/tracing/trace | perl power.pl > out.svg
32#
33
34my @styles;
35my $base = 0;
36
37my @pstate_last;
38my @pstate_level;
39
40$styles[0] = "fill:rgb(0,0,255);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)";
41$styles[1] = "fill:rgb(0,255,0);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)";
42$styles[2] = "fill:rgb(255,0,20);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)";
43$styles[3] = "fill:rgb(255,255,20);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)";
44$styles[4] = "fill:rgb(255,0,255);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)";
45$styles[5] = "fill:rgb(0,255,255);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)";
46$styles[6] = "fill:rgb(0,128,255);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)";
47$styles[7] = "fill:rgb(0,255,128);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)";
48$styles[8] = "fill:rgb(0,25,20);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)";
49
50
51print "<?xml version=\"1.0\" standalone=\"no\"?> \n";
52print "<svg width=\"10000\" height=\"100%\" version=\"1.1\" xmlns=\"http://www.w3.org/2000/svg\">\n";
53
54my $scale = 30000.0;
55while (<>) {
56 my $line = $_;
57 if ($line =~ /([0-9\.]+)\] CSTATE: Going to C([0-9]) on cpu ([0-9]+) for ([0-9\.]+)/) {
58 if ($base == 0) {
59 $base = $1;
60 }
61 my $time = $1 - $base;
62 $time = $time * $scale;
63 my $C = $2;
64 my $cpu = $3;
65 my $y = 400 * $cpu;
66 my $duration = $4 * $scale;
67 my $msec = int($4 * 100000)/100.0;
68 my $height = $C * 20;
69 $style = $styles[$C];
70
71 $y = $y + 140 - $height;
72
73 $x2 = $time + 4;
74 $y2 = $y + 4;
75
76
77 print "<rect x=\"$time\" width=\"$duration\" y=\"$y\" height=\"$height\" style=\"$style\"/>\n";
78 print "<text transform=\"translate($x2,$y2) rotate(90)\">C$C $msec</text>\n";
79 }
80 if ($line =~ /([0-9\.]+)\] PSTATE: Going to P([0-9]) on cpu ([0-9]+)/) {
81 my $time = $1 - $base;
82 my $state = $2;
83 my $cpu = $3;
84
85 if (defined($pstate_last[$cpu])) {
86 my $from = $pstate_last[$cpu];
87 my $oldstate = $pstate_state[$cpu];
88 my $duration = ($time-$from) * $scale;
89
90 $from = $from * $scale;
91 my $to = $from + $duration;
92 my $height = 140 - ($oldstate * (140/8));
93
94 my $y = 400 * $cpu + 200 + $height;
95 my $y2 = $y+4;
96 my $style = $styles[8];
97
98 print "<rect x=\"$from\" y=\"$y\" width=\"$duration\" height=\"5\" style=\"$style\"/>\n";
99 print "<text transform=\"translate($from,$y2)\">P$oldstate (cpu $cpu)</text>\n";
100 };
101
102 $pstate_last[$cpu] = $time;
103 $pstate_state[$cpu] = $state;
104 }
105}
106
107
108print "</svg>\n";
diff --git a/scripts/tracing/draw_functrace.py b/scripts/tracing/draw_functrace.py
new file mode 100644
index 000000000000..902f9a992620
--- /dev/null
+++ b/scripts/tracing/draw_functrace.py
@@ -0,0 +1,130 @@
1#!/usr/bin/python
2
3"""
4Copyright 2008 (c) Frederic Weisbecker <fweisbec@gmail.com>
5Licensed under the terms of the GNU GPL License version 2
6
7This script parses a trace provided by the function tracer in
8kernel/trace/trace_functions.c
9The resulting trace is processed into a tree to produce a more readable
10view of the call stack by drawing a textual but hierarchical tree of
11calls. Only the functions' names and the call times are provided.
12
13Usage:
14 Be sure that you have CONFIG_FUNCTION_TRACER
15 # mkdir /debug
16 # mount -t debugfs nodev /debug
17 # echo function > /debug/tracing/current_tracer
18 $ cat /debug/tracing/trace_pipe > ~/raw_trace_func
19 Wait some time, but not too much; the script is a bit slow.
20 Break the pipe (Ctrl + Z)
21 $ scripts/draw_functrace.py < raw_trace_func > draw_functrace
22 Then you have your drawn trace in draw_functrace
23"""
24
25
26import sys, re
27
28class CallTree:
29 """ This class provides a tree representation of the functions
30 call stack. If a function has no parent in the kernel (interrupt,
31 syscall, kernel thread...) then it is attached to a virtual parent
32 called ROOT.
33 """
34 ROOT = None
35
36 def __init__(self, func, time = None, parent = None):
37 self._func = func
38 self._time = time
39 if parent is None:
40 self._parent = CallTree.ROOT
41 else:
42 self._parent = parent
43 self._children = []
44
45 def calls(self, func, calltime):
46 """ If a function calls another one, call this method to insert it
47 into the tree at the appropriate place.
48 @return: A reference to the newly created child node.
49 """
50 child = CallTree(func, calltime, self)
51 self._children.append(child)
52 return child
53
54 def getParent(self, func):
55 """ Retrieve the last parent of the current node that
56 has the name given by func. If this function is not
57 on a parent, then create it as new child of root
58 @return: A reference to the parent.
59 """
60 tree = self
61 while tree != CallTree.ROOT and tree._func != func:
62 tree = tree._parent
63 if tree == CallTree.ROOT:
64 child = CallTree.ROOT.calls(func, None)
65 return child
66 return tree
67
68 def __repr__(self):
69 return self.__toString("", True)
70
71 def __toString(self, branch, lastChild):
72 if self._time is not None:
73 s = "%s----%s (%s)\n" % (branch, self._func, self._time)
74 else:
75 s = "%s----%s\n" % (branch, self._func)
76
77 i = 0
78 if lastChild:
79 branch = branch[:-1] + " "
80 while i < len(self._children):
81 if i != len(self._children) - 1:
82 s += "%s" % self._children[i].__toString(branch +\
83 " |", False)
84 else:
85 s += "%s" % self._children[i].__toString(branch +\
86 " |", True)
87 i += 1
88 return s
89
90class BrokenLineException(Exception):
91 """If the last line is not complete because of the pipe breakage,
92 we want to stop the processing and ignore this line.
93 """
94 pass
95
96class CommentLineException(Exception):
97 """ If the line is a comment (as in the beginning of the trace file),
98 just ignore it.
99 """
100 pass
101
102
103def parseLine(line):
104 line = line.strip()
105 if line.startswith("#"):
106 raise CommentLineException
107 m = re.match("[^]]+?\\] +([0-9.]+): (\\w+) <-(\\w+)", line)
108 if m is None:
109 raise BrokenLineException
110 return (m.group(1), m.group(2), m.group(3))
111
112
113def main():
114 CallTree.ROOT = CallTree("Root (Nowhere)", None, None)
115 tree = CallTree.ROOT
116
117 for line in sys.stdin:
118 try:
119 calltime, callee, caller = parseLine(line)
120 except BrokenLineException:
121 break
122 except CommentLineException:
123 continue
124 tree = tree.getParent(caller)
125 tree = tree.calls(callee, calltime)
126
127 print CallTree.ROOT
128
129if __name__ == "__main__":
130 main()