-rw-r--r--  Documentation/controllers/cpuacct.txt | 32
-rw-r--r--  Documentation/ftrace.txt | 62
-rw-r--r--  Documentation/kernel-parameters.txt | 8
-rw-r--r--  Documentation/lockstat.txt | 51
-rw-r--r--  Documentation/markers.txt | 14
-rw-r--r--  Documentation/scheduler/sched-arch.txt | 4
-rw-r--r--  Documentation/tracepoints.txt | 92
-rw-r--r--  arch/ia64/Kconfig | 2
-rw-r--r--  arch/ia64/include/asm/topology.h | 2
-rw-r--r--  arch/m32r/Kconfig | 2
-rw-r--r--  arch/mips/Kconfig | 2
-rw-r--r--  arch/mips/include/asm/mach-ip27/topology.h | 1
-rw-r--r--  arch/powerpc/Kconfig | 2
-rw-r--r--  arch/powerpc/include/asm/ftrace.h | 14
-rw-r--r--  arch/powerpc/include/asm/module.h | 16
-rw-r--r--  arch/powerpc/include/asm/topology.h | 1
-rw-r--r--  arch/powerpc/kernel/ftrace.c | 473
-rw-r--r--  arch/powerpc/kernel/idle.c | 5
-rw-r--r--  arch/powerpc/kernel/module_32.c | 10
-rw-r--r--  arch/powerpc/kernel/module_64.c | 13
-rw-r--r--  arch/sh/include/asm/topology.h | 1
-rw-r--r--  arch/um/include/asm/system.h | 14
-rw-r--r--  arch/x86/Kconfig | 7
-rw-r--r--  arch/x86/Kconfig.debug | 4
-rw-r--r--  arch/x86/include/asm/ftrace.h | 34
-rw-r--r--  arch/x86/include/asm/thread_info.h | 2
-rw-r--r--  arch/x86/include/asm/uaccess.h | 2
-rw-r--r--  arch/x86/include/asm/uaccess_32.h | 8
-rw-r--r--  arch/x86/include/asm/uaccess_64.h | 6
-rw-r--r--  arch/x86/kernel/Makefile | 6
-rw-r--r--  arch/x86/kernel/entry_32.S | 42
-rw-r--r--  arch/x86/kernel/entry_64.S | 5
-rw-r--r--  arch/x86/kernel/ftrace.c | 312
-rw-r--r--  arch/x86/kernel/stacktrace.c | 64
-rw-r--r--  arch/x86/kernel/vsyscall_64.c | 3
-rw-r--r--  arch/x86/lib/usercopy_32.c | 8
-rw-r--r--  arch/x86/lib/usercopy_64.c | 4
-rw-r--r--  arch/x86/mm/Makefile | 3
-rw-r--r--  arch/x86/mm/fault.c | 2
-rw-r--r--  arch/x86/vdso/vclock_gettime.c | 3
-rw-r--r--  drivers/char/sysrq.c | 18
-rw-r--r--  fs/seq_file.c | 14
-rw-r--r--  include/asm-generic/vmlinux.lds.h | 21
-rw-r--r--  include/asm-m32r/system.h | 2
-rw-r--r--  include/linux/compiler.h | 84
-rw-r--r--  include/linux/debug_locks.h | 2
-rw-r--r--  include/linux/ftrace.h | 146
-rw-r--r--  include/linux/ftrace_irq.h | 13
-rw-r--r--  include/linux/futex.h | 2
-rw-r--r--  include/linux/hardirq.h | 15
-rw-r--r--  include/linux/kernel.h | 11
-rw-r--r--  include/linux/lockdep.h | 31
-rw-r--r--  include/linux/marker.h | 69
-rw-r--r--  include/linux/mutex.h | 2
-rw-r--r--  include/linux/rcuclassic.h | 2
-rw-r--r--  include/linux/rcupdate.h | 2
-rw-r--r--  include/linux/ring_buffer.h | 1
-rw-r--r--  include/linux/sched.h | 48
-rw-r--r--  include/linux/seq_file.h | 1
-rw-r--r--  include/linux/stacktrace.h | 8
-rw-r--r--  include/linux/tracepoint.h | 57
-rw-r--r--  include/linux/uaccess.h | 2
-rw-r--r--  include/trace/boot.h | 56
-rw-r--r--  include/trace/sched.h | 24
-rw-r--r--  init/Kconfig | 1
-rw-r--r--  init/main.c | 35
-rw-r--r--  kernel/Makefile | 7
-rw-r--r--  kernel/exit.c | 7
-rw-r--r--  kernel/extable.c | 16
-rw-r--r--  kernel/fork.c | 5
-rw-r--r--  kernel/futex.c | 290
-rw-r--r--  kernel/kthread.c | 3
-rw-r--r--  kernel/lockdep.c | 33
-rw-r--r--  kernel/lockdep_proc.c | 28
-rw-r--r--  kernel/marker.c | 192
-rw-r--r--  kernel/module.c | 11
-rw-r--r--  kernel/mutex.c | 10
-rw-r--r--  kernel/notifier.c | 8
-rw-r--r--  kernel/posix-cpu-timers.c | 10
-rw-r--r--  kernel/power/disk.c | 13
-rw-r--r--  kernel/power/main.c | 5
-rw-r--r--  kernel/profile.c | 2
-rw-r--r--  kernel/rcuclassic.c | 6
-rw-r--r--  kernel/sched.c | 1053
-rw-r--r--  kernel/sched_cpupri.c | 39
-rw-r--r--  kernel/sched_cpupri.h | 5
-rw-r--r--  kernel/sched_debug.c | 51
-rw-r--r--  kernel/sched_fair.c | 14
-rw-r--r--  kernel/sched_rt.c | 80
-rw-r--r--  kernel/sched_stats.h | 3
-rw-r--r--  kernel/signal.c | 2
-rw-r--r--  kernel/softlockup.c | 2
-rw-r--r--  kernel/sys.c | 2
-rw-r--r--  kernel/sysctl.c | 10
-rw-r--r--  kernel/time/tick-sched.c | 10
-rw-r--r--  kernel/trace/Kconfig | 76
-rw-r--r--  kernel/trace/Makefile | 7
-rw-r--r--  kernel/trace/ftrace.c | 359
-rw-r--r--  kernel/trace/ring_buffer.c | 377
-rw-r--r--  kernel/trace/trace.c | 793
-rw-r--r--  kernel/trace/trace.h | 192
-rw-r--r--  kernel/trace/trace_boot.c | 166
-rw-r--r--  kernel/trace/trace_branch.c | 341
-rw-r--r--  kernel/trace/trace_functions.c | 18
-rw-r--r--  kernel/trace/trace_functions_return.c | 98
-rw-r--r--  kernel/trace/trace_irqsoff.c | 61
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 25
-rw-r--r--  kernel/trace/trace_nop.c | 65
-rw-r--r--  kernel/trace/trace_sched_switch.c | 106
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 70
-rw-r--r--  kernel/trace/trace_selftest.c | 173
-rw-r--r--  kernel/trace/trace_stack.c | 8
-rw-r--r--  kernel/trace/trace_sysprof.c | 19
-rw-r--r--  kernel/tracepoint.c | 295
-rw-r--r--  lib/Kconfig.debug | 10
-rw-r--r--  mm/memory.c | 15
-rw-r--r--  samples/tracepoints/tp-samples-trace.h | 4
-rw-r--r--  samples/tracepoints/tracepoint-probe-sample.c | 1
-rw-r--r--  samples/tracepoints/tracepoint-probe-sample2.c | 1
-rw-r--r--  samples/tracepoints/tracepoint-sample.c | 3
-rw-r--r--  scripts/Makefile.build | 12
-rw-r--r--  scripts/bootgraph.pl | 16
-rwxr-xr-x  scripts/recordmcount.pl | 48
-rw-r--r--  scripts/tracing/draw_functrace.py | 130
124 files changed, 5505 insertions, 1904 deletions
diff --git a/Documentation/controllers/cpuacct.txt b/Documentation/controllers/cpuacct.txt
new file mode 100644
index 000000000000..bb775fbe43d7
--- /dev/null
+++ b/Documentation/controllers/cpuacct.txt
@@ -0,0 +1,32 @@
+CPU Accounting Controller
+-------------------------
+
+The CPU accounting controller is used to group tasks using cgroups and
+account the CPU usage of these groups of tasks.
+
+The CPU accounting controller supports multi-hierarchy groups. An accounting
+group accumulates the CPU usage of all of its child groups and the tasks
+directly present in its group.
+
+Accounting groups can be created by first mounting the cgroup filesystem.
+
+# mkdir /cgroups
+# mount -t cgroup -ocpuacct none /cgroups
+
+With the above step, the initial or the parent accounting group
+becomes visible at /cgroups. At bootup, this group includes all the
+tasks in the system. /cgroups/tasks lists the tasks in this cgroup.
+/cgroups/cpuacct.usage gives the CPU time (in nanoseconds) obtained by
+this group which is essentially the CPU time obtained by all the tasks
+in the system.
+
+New accounting groups can be created under the parent group /cgroups.
+
+# cd /cgroups
+# mkdir g1
+# echo $$ > g1
+
+The above steps create a new group g1 and move the current shell
+process (bash) into it. CPU time consumed by this bash and its children
+can be obtained from g1/cpuacct.usage and the same is accumulated in
+/cgroups/cpuacct.usage also.
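
As a rough sketch of how the per-group usage counter described above might be
read from user space (the /cgroups mount point and the g1 group are only the
examples used in this document, not fixed paths):

	#include <stdio.h>
	#include <stdlib.h>

	/* Read the accumulated CPU time (in nanoseconds) of one accounting group. */
	static unsigned long long read_cpuacct_usage(const char *group_path)
	{
		char path[256];
		unsigned long long usage = 0;
		FILE *f;

		snprintf(path, sizeof(path), "%s/cpuacct.usage", group_path);
		f = fopen(path, "r");
		if (!f)
			return 0;
		if (fscanf(f, "%llu", &usage) != 1)
			usage = 0;
		fclose(f);
		return usage;
	}

	int main(void)
	{
		/* "/cgroups/g1" assumes the mount and mkdir steps shown above. */
		printf("g1 used %llu ns of CPU time\n",
		       read_cpuacct_usage("/cgroups/g1"));
		return 0;
	}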
diff --git a/Documentation/ftrace.txt b/Documentation/ftrace.txt
index 9cc4d685dde5..35a78bc6651d 100644
--- a/Documentation/ftrace.txt
+++ b/Documentation/ftrace.txt
@@ -82,7 +82,7 @@ of ftrace. Here is a list of some of the key files:
 	     tracer is not adding more data, they will display
 	     the same information every time they are read.
 
-  iter_ctrl: This file lets the user control the amount of data
+  trace_options: This file lets the user control the amount of data
 	     that is displayed in one of the above output
 	     files.
 
@@ -94,10 +94,10 @@ of ftrace. Here is a list of some of the key files:
 	     only be recorded if the latency is greater than
 	     the value in this file. (in microseconds)
 
-  trace_entries: This sets or displays the number of bytes each CPU
+  buffer_size_kb: This sets or displays the number of kilobytes each CPU
 	     buffer can hold. The tracer buffers are the same size
 	     for each CPU. The displayed number is the size of the
 	     CPU buffer and not total size of all buffers. The
 	     trace buffers are allocated in pages (blocks of memory
 	     that the kernel uses for allocation, usually 4 KB in size).
 	     If the last page allocated has room for more bytes
@@ -316,23 +316,23 @@ The above is mostly meaningful for kernel developers.
 The rest is the same as the 'trace' file.
 
 
-iter_ctrl
----------
+trace_options
+-------------
 
-The iter_ctrl file is used to control what gets printed in the trace
+The trace_options file is used to control what gets printed in the trace
 output. To see what is available, simply cat the file:
 
-  cat /debug/tracing/iter_ctrl
+  cat /debug/tracing/trace_options
   print-parent nosym-offset nosym-addr noverbose noraw nohex nobin \
-  noblock nostacktrace nosched-tree
+  noblock nostacktrace nosched-tree nouserstacktrace nosym-userobj
 
 To disable one of the options, echo in the option prepended with "no".
 
-  echo noprint-parent > /debug/tracing/iter_ctrl
+  echo noprint-parent > /debug/tracing/trace_options
 
 To enable an option, leave off the "no".
 
-  echo sym-offset > /debug/tracing/iter_ctrl
+  echo sym-offset > /debug/tracing/trace_options
 
 Here are the available options:
 
@@ -378,6 +378,20 @@ Here are the available options:
   When a trace is recorded, so is the stack of functions.
   This allows for back traces of trace sites.
 
+  userstacktrace - This option changes the trace.
+                   It records a stacktrace of the current userspace thread.
+
+  sym-userobj - when user stacktrace are enabled, look up which object the
+                address belongs to, and print a relative address
+                This is especially useful when ASLR is on, otherwise you don't
+                get a chance to resolve the address to object/file/line after the app is no
+                longer running
+
+                The lookup is performed when you read trace,trace_pipe,latency_trace. Example:
+
+                a.out-1623  [000] 40874.465068: /root/a.out[+0x480] <-/root/a.out[+0
+x494] <- /root/a.out[+0x4a8] <- /lib/libc-2.7.so[+0x1e1a6]
+
   sched-tree - TBD (any users??)
 
 
@@ -1299,41 +1313,29 @@ trace entries
 -------------
 
 Having too much or not enough data can be troublesome in diagnosing
-an issue in the kernel. The file trace_entries is used to modify
+an issue in the kernel. The file buffer_size_kb is used to modify
 the size of the internal trace buffers. The number listed
 is the number of entries that can be recorded per CPU. To know
 the full size, multiply the number of possible CPUS with the
 number of entries.
 
-  # cat /debug/tracing/trace_entries
-65620
+  # cat /debug/tracing/buffer_size_kb
+1408 (units kilobytes)
 
 Note, to modify this, you must have tracing completely disabled. To do that,
 echo "nop" into the current_tracer. If the current_tracer is not set
 to "nop", an EINVAL error will be returned.
 
   # echo nop > /debug/tracing/current_tracer
-  # echo 100000 > /debug/tracing/trace_entries
-  # cat /debug/tracing/trace_entries
-100045
-
-
-Notice that we echoed in 100,000 but the size is 100,045. The entries
-are held in individual pages. It allocates the number of pages it takes
-to fulfill the request. If more entries may fit on the last page
-then they will be added.
-
-  # echo 1 > /debug/tracing/trace_entries
-  # cat /debug/tracing/trace_entries
-85
-
-This shows us that 85 entries can fit in a single page.
+  # echo 10000 > /debug/tracing/buffer_size_kb
+  # cat /debug/tracing/buffer_size_kb
+10000 (units kilobytes)
 
 The number of pages which will be allocated is limited to a percentage
 of available memory. Allocating too much will produce an error.
 
-  # echo 1000000000000 > /debug/tracing/trace_entries
+  # echo 1000000000000 > /debug/tracing/buffer_size_kb
 -bash: echo: write error: Cannot allocate memory
-  # cat /debug/tracing/trace_entries
+  # cat /debug/tracing/buffer_size_kb
 85
 
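
A minimal sketch of driving the trace_options and buffer_size_kb files
programmatically rather than from a shell; it assumes debugfs is mounted at
/debug as in the examples above, and the file names match this document:

	#include <stdio.h>

	/* Write one keyword (e.g. "userstacktrace" or "noprint-parent") into a
	 * tracing control file and report whether the kernel accepted it. */
	static int write_tracing_file(const char *file, const char *value)
	{
		char path[256];
		FILE *f;
		int ret = 0;

		snprintf(path, sizeof(path), "/debug/tracing/%s", file);
		f = fopen(path, "w");
		if (!f)
			return -1;
		if (fputs(value, f) < 0)
			ret = -1;
		if (fclose(f) != 0)	/* e.g. ENOMEM if buffer_size_kb is too large */
			ret = -1;
		return ret;
	}

	int main(void)
	{
		write_tracing_file("current_tracer", "nop");	/* required before resizing */
		write_tracing_file("buffer_size_kb", "10000");
		write_tracing_file("trace_options", "userstacktrace");
		write_tracing_file("trace_options", "sym-userobj");
		return 0;
	}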
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index e0f346d201ed..2919a2e91938 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -750,6 +750,14 @@ and is between 256 and 4096 characters. It is defined in the file
 			parameter will force ia64_sal_cache_flush to call
 			ia64_pal_cache_flush instead of SAL_CACHE_FLUSH.
 
+	ftrace=[tracer]
+			[ftrace] will set and start the specified tracer
+			as early as possible in order to facilitate early
+			boot debugging.
+
+	ftrace_dump_on_oops
+			[ftrace] will dump the trace buffers on oops.
+
 	gamecon.map[2|3]=
 			[HW,JOY] Multisystem joystick and NES/SNES/PSX pad
 			support via parallel port (up to 5 devices per port)
diff --git a/Documentation/lockstat.txt b/Documentation/lockstat.txt
index 4ba4664ce5c3..9cb9138f7a79 100644
--- a/Documentation/lockstat.txt
+++ b/Documentation/lockstat.txt
@@ -71,35 +71,50 @@ Look at the current lock statistics:
 
 # less /proc/lock_stat
 
-01 lock_stat version 0.2
+01 lock_stat version 0.3
 02 -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 03                               class name    con-bounces    contentions   waittime-min   waittime-max waittime-total    acq-bounces   acquisitions   holdtime-min   holdtime-max holdtime-total
 04 -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 05
-06          &inode->i_data.tree_lock-W:            15          21657           0.18     1093295.30 11547131054.85             58          10415           0.16          87.51        6387.60
-07          &inode->i_data.tree_lock-R:             0              0           0.00           0.00           0.00          23302         231198           0.25           8.45       98023.38
-08                  --------------------------
-09            &inode->i_data.tree_lock              0          [<ffffffff8027c08f>] add_to_page_cache+0x5f/0x190
-10
-11 ...............................................................................................................................................................................................
-12
-13                        dcache_lock:          1037           1161           0.38          45.32         774.51           6611         243371           0.15         306.48       77387.24
-14                        -----------
-15                        dcache_lock            180          [<ffffffff802c0d7e>] sys_getcwd+0x11e/0x230
-16                        dcache_lock            165          [<ffffffff802c002a>] d_alloc+0x15a/0x210
-17                        dcache_lock             33          [<ffffffff8035818d>] _atomic_dec_and_lock+0x4d/0x70
-18                        dcache_lock              1          [<ffffffff802beef8>] shrink_dcache_parent+0x18/0x130
+06               &mm->mmap_sem-W:           233            538 18446744073708       22924.27      607243.51           1342          45806           1.71        8595.89     1180582.34
+07               &mm->mmap_sem-R:           205            587 18446744073708       28403.36      731975.00           1940         412426           0.58      187825.45     6307502.88
+08               ---------------
+09                 &mm->mmap_sem            487          [<ffffffff8053491f>] do_page_fault+0x466/0x928
+10                 &mm->mmap_sem            179          [<ffffffff802a6200>] sys_mprotect+0xcd/0x21d
+11                 &mm->mmap_sem            279          [<ffffffff80210a57>] sys_mmap+0x75/0xce
+12                 &mm->mmap_sem             76          [<ffffffff802a490b>] sys_munmap+0x32/0x59
+13               ---------------
+14                 &mm->mmap_sem            270          [<ffffffff80210a57>] sys_mmap+0x75/0xce
+15                 &mm->mmap_sem            431          [<ffffffff8053491f>] do_page_fault+0x466/0x928
+16                 &mm->mmap_sem            138          [<ffffffff802a490b>] sys_munmap+0x32/0x59
+17                 &mm->mmap_sem            145          [<ffffffff802a6200>] sys_mprotect+0xcd/0x21d
+18
+19 ...............................................................................................................................................................................................
+20
+21                  dcache_lock:           621            623           0.52         118.26        1053.02           6745          91930           0.29         316.29      118423.41
+22                  -----------
+23                  dcache_lock            179          [<ffffffff80378274>] _atomic_dec_and_lock+0x34/0x54
+24                  dcache_lock            113          [<ffffffff802cc17b>] d_alloc+0x19a/0x1eb
+25                  dcache_lock             99          [<ffffffff802ca0dc>] d_rehash+0x1b/0x44
+26                  dcache_lock            104          [<ffffffff802cbca0>] d_instantiate+0x36/0x8a
+27                  -----------
+28                  dcache_lock            192          [<ffffffff80378274>] _atomic_dec_and_lock+0x34/0x54
+29                  dcache_lock             98          [<ffffffff802ca0dc>] d_rehash+0x1b/0x44
+30                  dcache_lock             72          [<ffffffff802cc17b>] d_alloc+0x19a/0x1eb
+31                  dcache_lock            112          [<ffffffff802cbca0>] d_instantiate+0x36/0x8a
 
 This excerpt shows the first two lock class statistics. Line 01 shows the
 output version - each time the format changes this will be updated. Line 02-04
-show the header with column descriptions. Lines 05-10 and 13-18 show the actual
+show the header with column descriptions. Lines 05-18 and 20-31 show the actual
 statistics. These statistics come in two parts; the actual stats separated by a
-short separator (line 08, 14) from the contention points.
+short separator (line 08, 13) from the contention points.
 
-The first lock (05-10) is a read/write lock, and shows two lines above the
+The first lock (05-18) is a read/write lock, and shows two lines above the
 short separator. The contention points don't match the column descriptors,
-they have two: contentions and [<IP>] symbol.
+they have two: contentions and [<IP>] symbol. The second set of contention
+points are the points we're contending with.
 
+The integer part of the time values is in us.
 
 View the top contending locks:
 
diff --git a/Documentation/markers.txt b/Documentation/markers.txt
index 089f6138fcd9..6d275e4ef385 100644
--- a/Documentation/markers.txt
+++ b/Documentation/markers.txt
@@ -70,6 +70,20 @@ a printk warning which identifies the inconsistency:
 
 "Format mismatch for probe probe_name (format), marker (format)"
 
+Another way to use markers is to simply define the marker without generating any
+function call to actually call into the marker. This is useful in combination
+with tracepoint probes in a scheme like this :
+
+void probe_tracepoint_name(unsigned int arg1, struct task_struct *tsk);
+
+DEFINE_MARKER_TP(marker_eventname, tracepoint_name, probe_tracepoint_name,
+	"arg1 %u pid %d");
+
+notrace void probe_tracepoint_name(unsigned int arg1, struct task_struct *tsk)
+{
+	struct marker *marker = &GET_MARKER(kernel_irq_entry);
+	/* write data to trace buffers ... */
+}
 
 * Probe / marker example
 
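
For comparison with the DEFINE_MARKER_TP scheme added above, a conventional
marker probe module looks roughly like the following sketch (modeled on
samples/markers; the "subsystem_event" marker name and "value %d" format are
purely illustrative):

	#include <linux/module.h>
	#include <linux/marker.h>

	/* Probe called whenever the "subsystem_event" marker fires. */
	static void probe_subsystem_event(void *probe_data, void *call_data,
					  const char *format, va_list *args)
	{
		int value = va_arg(*args, typeof(int));

		/* write value to a trace buffer, bump a counter, etc. */
		(void)value;
	}

	static int __init probe_init(void)
	{
		return marker_probe_register("subsystem_event", "value %d",
					     probe_subsystem_event, NULL);
	}

	static void __exit probe_exit(void)
	{
		marker_probe_unregister("subsystem_event",
					probe_subsystem_event, NULL);
	}

	module_init(probe_init);
	module_exit(probe_exit);
	MODULE_LICENSE("GPL");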
diff --git a/Documentation/scheduler/sched-arch.txt b/Documentation/scheduler/sched-arch.txt
index 941615a9769b..d43dbcbd163b 100644
--- a/Documentation/scheduler/sched-arch.txt
+++ b/Documentation/scheduler/sched-arch.txt
@@ -8,7 +8,7 @@ Context switch
 By default, the switch_to arch function is called with the runqueue
 locked. This is usually not a problem unless switch_to may need to
 take the runqueue lock. This is usually due to a wake up operation in
-the context switch. See include/asm-ia64/system.h for an example.
+the context switch. See arch/ia64/include/asm/system.h for an example.
 
 To request the scheduler call switch_to with the runqueue unlocked,
 you must `#define __ARCH_WANT_UNLOCKED_CTXSW` in a header file
@@ -23,7 +23,7 @@ disabled. Interrupts may be enabled over the call if it is likely to
 introduce a significant interrupt latency by adding the line
 `#define __ARCH_WANT_INTERRUPTS_ON_CTXSW` in the same place as for
 unlocked context switches. This define also implies
-`__ARCH_WANT_UNLOCKED_CTXSW`. See include/asm-arm/system.h for an
+`__ARCH_WANT_UNLOCKED_CTXSW`. See arch/arm/include/asm/system.h for an
 example.
 
 
diff --git a/Documentation/tracepoints.txt b/Documentation/tracepoints.txt
index 5d354e167494..2d42241a25c3 100644
--- a/Documentation/tracepoints.txt
+++ b/Documentation/tracepoints.txt
@@ -3,28 +3,30 @@
 			    Mathieu Desnoyers
 
 
-This document introduces Linux Kernel Tracepoints and their use. It provides
-examples of how to insert tracepoints in the kernel and connect probe functions
-to them and provides some examples of probe functions.
+This document introduces Linux Kernel Tracepoints and their use. It
+provides examples of how to insert tracepoints in the kernel and
+connect probe functions to them and provides some examples of probe
+functions.
 
 
 * Purpose of tracepoints
 
-A tracepoint placed in code provides a hook to call a function (probe) that you
-can provide at runtime. A tracepoint can be "on" (a probe is connected to it) or
-"off" (no probe is attached). When a tracepoint is "off" it has no effect,
-except for adding a tiny time penalty (checking a condition for a branch) and
-space penalty (adding a few bytes for the function call at the end of the
-instrumented function and adds a data structure in a separate section). When a
-tracepoint is "on", the function you provide is called each time the tracepoint
-is executed, in the execution context of the caller. When the function provided
-ends its execution, it returns to the caller (continuing from the tracepoint
-site).
+A tracepoint placed in code provides a hook to call a function (probe)
+that you can provide at runtime. A tracepoint can be "on" (a probe is
+connected to it) or "off" (no probe is attached). When a tracepoint is
+"off" it has no effect, except for adding a tiny time penalty
+(checking a condition for a branch) and space penalty (adding a few
+bytes for the function call at the end of the instrumented function
+and adds a data structure in a separate section). When a tracepoint
+is "on", the function you provide is called each time the tracepoint
+is executed, in the execution context of the caller. When the function
+provided ends its execution, it returns to the caller (continuing from
+the tracepoint site).
 
 You can put tracepoints at important locations in the code. They are
 lightweight hooks that can pass an arbitrary number of parameters,
-which prototypes are described in a tracepoint declaration placed in a header
-file.
+which prototypes are described in a tracepoint declaration placed in a
+header file.
 
 They can be used for tracing and performance accounting.
 
@@ -42,7 +44,7 @@ In include/trace/subsys.h :
 
 #include <linux/tracepoint.h>
 
-DEFINE_TRACE(subsys_eventname,
+DECLARE_TRACE(subsys_eventname,
 	TPPTOTO(int firstarg, struct task_struct *p),
 	TPARGS(firstarg, p));
 
@@ -50,6 +52,8 @@ In subsys/file.c (where the tracing statement must be added) :
 
 #include <trace/subsys.h>
 
+DEFINE_TRACE(subsys_eventname);
+
 void somefct(void)
 {
 	...
@@ -61,31 +65,41 @@ Where :
 - subsys_eventname is an identifier unique to your event
    - subsys is the name of your subsystem.
    - eventname is the name of the event to trace.
-- TPPTOTO(int firstarg, struct task_struct *p) is the prototype of the function
-  called by this tracepoint.
-- TPARGS(firstarg, p) are the parameters names, same as found in the prototype.
 
-Connecting a function (probe) to a tracepoint is done by providing a probe
-(function to call) for the specific tracepoint through
-register_trace_subsys_eventname(). Removing a probe is done through
-unregister_trace_subsys_eventname(); it will remove the probe sure there is no
-caller left using the probe when it returns. Probe removal is preempt-safe
-because preemption is disabled around the probe call. See the "Probe example"
-section below for a sample probe module.
-
-The tracepoint mechanism supports inserting multiple instances of the same
-tracepoint, but a single definition must be made of a given tracepoint name over
-all the kernel to make sure no type conflict will occur. Name mangling of the
-tracepoints is done using the prototypes to make sure typing is correct.
-Verification of probe type correctness is done at the registration site by the
-compiler. Tracepoints can be put in inline functions, inlined static functions,
-and unrolled loops as well as regular functions.
-
-The naming scheme "subsys_event" is suggested here as a convention intended
-to limit collisions. Tracepoint names are global to the kernel: they are
-considered as being the same whether they are in the core kernel image or in
-modules.
-
+- TPPTOTO(int firstarg, struct task_struct *p) is the prototype of the
+  function called by this tracepoint.
+
+- TPARGS(firstarg, p) are the parameters names, same as found in the
+  prototype.
+
+Connecting a function (probe) to a tracepoint is done by providing a
+probe (function to call) for the specific tracepoint through
+register_trace_subsys_eventname(). Removing a probe is done through
+unregister_trace_subsys_eventname(); it will remove the probe.
+
+tracepoint_synchronize_unregister() must be called before the end of
+the module exit function to make sure there is no caller left using
+the probe. This, and the fact that preemption is disabled around the
+probe call, make sure that probe removal and module unload are safe.
+See the "Probe example" section below for a sample probe module.
+
+The tracepoint mechanism supports inserting multiple instances of the
+same tracepoint, but a single definition must be made of a given
+tracepoint name over all the kernel to make sure no type conflict will
+occur. Name mangling of the tracepoints is done using the prototypes
+to make sure typing is correct. Verification of probe type correctness
+is done at the registration site by the compiler. Tracepoints can be
+put in inline functions, inlined static functions, and unrolled loops
+as well as regular functions.
+
+The naming scheme "subsys_event" is suggested here as a convention
+intended to limit collisions. Tracepoint names are global to the
+kernel: they are considered as being the same whether they are in the
+core kernel image or in modules.
+
+If the tracepoint has to be used in kernel modules, an
+EXPORT_TRACEPOINT_SYMBOL_GPL() or EXPORT_TRACEPOINT_SYMBOL() can be
+used to export the defined tracepoints.
 
 * Probe / tracepoint example
 
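
Putting the pieces above together, a probe module for the hypothetical
subsys_eventname tracepoint could be sketched roughly as follows; the
tracepoint, header, and probe names all follow the made-up example used in
this document:

	#include <linux/module.h>
	#include <linux/sched.h>
	#include <trace/subsys.h>	/* declares the tracepoint, as shown above */

	/* Probe: same prototype as declared in DECLARE_TRACE(). */
	static void probe_subsys_eventname(int firstarg, struct task_struct *p)
	{
		/* called in the caller's context each time the tracepoint fires */
		printk(KERN_INFO "subsys_eventname: arg=%d pid=%d\n",
		       firstarg, p->pid);
	}

	static int __init tp_probe_init(void)
	{
		return register_trace_subsys_eventname(probe_subsys_eventname);
	}

	static void __exit tp_probe_exit(void)
	{
		unregister_trace_subsys_eventname(probe_subsys_eventname);
		/* wait until no tracepoint caller can still be using the probe */
		tracepoint_synchronize_unregister();
	}

	module_init(tp_probe_init);
	module_exit(tp_probe_exit);
	MODULE_LICENSE("GPL");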
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 6bd91ed7cd03..7fa8f615ba6e 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -99,7 +99,7 @@ config GENERIC_IOMAP
 	bool
 	default y
 
-config SCHED_NO_NO_OMIT_FRAME_POINTER
+config SCHED_OMIT_FRAME_POINTER
 	bool
 	default y
 
diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h
index 35bcb641c9e5..a3cc9f65f954 100644
--- a/arch/ia64/include/asm/topology.h
+++ b/arch/ia64/include/asm/topology.h
@@ -55,7 +55,6 @@
 void build_cpu_to_node_map(void);
 
 #define SD_CPU_INIT (struct sched_domain) {		\
-	.span			= CPU_MASK_NONE,	\
 	.parent			= NULL,			\
 	.child			= NULL,			\
 	.groups			= NULL,			\
@@ -80,7 +79,6 @@ void build_cpu_to_node_map(void);
 
 /* sched_domains SD_NODE_INIT for IA64 NUMA machines */
 #define SD_NODE_INIT (struct sched_domain) {		\
-	.span			= CPU_MASK_NONE,	\
 	.parent			= NULL,			\
 	.child			= NULL,			\
 	.groups			= NULL,			\
diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig
index dbaed4a63815..29047d5c259a 100644
--- a/arch/m32r/Kconfig
+++ b/arch/m32r/Kconfig
@@ -273,7 +273,7 @@ config GENERIC_CALIBRATE_DELAY
 	bool
 	default y
 
-config SCHED_NO_NO_OMIT_FRAME_POINTER
+config SCHED_OMIT_FRAME_POINTER
 	bool
 	default y
 
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index f4af967a6b30..a5255e7c79e0 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -653,7 +653,7 @@ config GENERIC_CMOS_UPDATE
 	bool
 	default y
 
-config SCHED_NO_NO_OMIT_FRAME_POINTER
+config SCHED_OMIT_FRAME_POINTER
 	bool
 	default y
 
diff --git a/arch/mips/include/asm/mach-ip27/topology.h b/arch/mips/include/asm/mach-ip27/topology.h
index 7785bec732f2..1fb959f98982 100644
--- a/arch/mips/include/asm/mach-ip27/topology.h
+++ b/arch/mips/include/asm/mach-ip27/topology.h
@@ -37,7 +37,6 @@ extern unsigned char __node_distances[MAX_COMPACT_NODES][MAX_COMPACT_NODES];
 
 /* sched_domains SD_NODE_INIT for SGI IP27 machines */
 #define SD_NODE_INIT (struct sched_domain) {		\
-	.span			= CPU_MASK_NONE,	\
 	.parent			= NULL,			\
 	.child			= NULL,			\
 	.groups			= NULL,			\
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 525c13a4de93..adb23ea1c1ef 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -141,7 +141,7 @@ config GENERIC_NVRAM
 	bool
 	default y if PPC32
 
-config SCHED_NO_NO_OMIT_FRAME_POINTER
+config SCHED_OMIT_FRAME_POINTER
 	bool
 	default y
 
diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h
index b298f7a631e6..e5f2ae8362f7 100644
--- a/arch/powerpc/include/asm/ftrace.h
+++ b/arch/powerpc/include/asm/ftrace.h
@@ -7,7 +7,19 @@
 
 #ifndef __ASSEMBLY__
 extern void _mcount(void);
-#endif
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+static inline unsigned long ftrace_call_adjust(unsigned long addr)
+{
+       /* reloction of mcount call site is the same as the address */
+       return addr;
+}
+
+struct dyn_arch_ftrace {
+	struct module *mod;
+};
+#endif /* CONFIG_DYNAMIC_FTRACE */
+#endif /* __ASSEMBLY__ */
 
 #endif
 
diff --git a/arch/powerpc/include/asm/module.h b/arch/powerpc/include/asm/module.h
index e5f14b13ccf0..08454880a2c0 100644
--- a/arch/powerpc/include/asm/module.h
+++ b/arch/powerpc/include/asm/module.h
@@ -34,11 +34,19 @@ struct mod_arch_specific {
34#ifdef __powerpc64__ 34#ifdef __powerpc64__
35 unsigned int stubs_section; /* Index of stubs section in module */ 35 unsigned int stubs_section; /* Index of stubs section in module */
36 unsigned int toc_section; /* What section is the TOC? */ 36 unsigned int toc_section; /* What section is the TOC? */
37#else 37#ifdef CONFIG_DYNAMIC_FTRACE
38 unsigned long toc;
39 unsigned long tramp;
40#endif
41
42#else /* powerpc64 */
38 /* Indices of PLT sections within module. */ 43 /* Indices of PLT sections within module. */
39 unsigned int core_plt_section; 44 unsigned int core_plt_section;
40 unsigned int init_plt_section; 45 unsigned int init_plt_section;
46#ifdef CONFIG_DYNAMIC_FTRACE
47 unsigned long tramp;
41#endif 48#endif
49#endif /* powerpc64 */
42 50
43 /* List of BUG addresses, source line numbers and filenames */ 51 /* List of BUG addresses, source line numbers and filenames */
44 struct list_head bug_list; 52 struct list_head bug_list;
@@ -68,6 +76,12 @@ struct mod_arch_specific {
68# endif /* MODULE */ 76# endif /* MODULE */
69#endif 77#endif
70 78
79#ifdef CONFIG_DYNAMIC_FTRACE
80# ifdef MODULE
81 asm(".section .ftrace.tramp,\"ax\",@nobits; .align 3; .previous");
82# endif /* MODULE */
83#endif
84
71 85
72struct exception_table_entry; 86struct exception_table_entry;
73void sort_ex_table(struct exception_table_entry *start, 87void sort_ex_table(struct exception_table_entry *start,
diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index c32da6f97999..373fca394a54 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -48,7 +48,6 @@ static inline int pcibus_to_node(struct pci_bus *bus)
 
 /* sched_domains SD_NODE_INIT for PPC64 machines */
 #define SD_NODE_INIT (struct sched_domain) {		\
-	.span			= CPU_MASK_NONE,	\
 	.parent			= NULL,			\
 	.child			= NULL,			\
 	.groups			= NULL,			\
diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c
index f4b006ed0ab1..3271cd698e4c 100644
--- a/arch/powerpc/kernel/ftrace.c
+++ b/arch/powerpc/kernel/ftrace.c
@@ -9,22 +9,30 @@
9 9
10#include <linux/spinlock.h> 10#include <linux/spinlock.h>
11#include <linux/hardirq.h> 11#include <linux/hardirq.h>
12#include <linux/uaccess.h>
13#include <linux/module.h>
12#include <linux/ftrace.h> 14#include <linux/ftrace.h>
13#include <linux/percpu.h> 15#include <linux/percpu.h>
14#include <linux/init.h> 16#include <linux/init.h>
15#include <linux/list.h> 17#include <linux/list.h>
16 18
17#include <asm/cacheflush.h> 19#include <asm/cacheflush.h>
20#include <asm/code-patching.h>
18#include <asm/ftrace.h> 21#include <asm/ftrace.h>
19 22
23#if 0
24#define DEBUGP printk
25#else
26#define DEBUGP(fmt , ...) do { } while (0)
27#endif
20 28
21static unsigned int ftrace_nop = 0x60000000; 29static unsigned int ftrace_nop = PPC_NOP_INSTR;
22 30
23#ifdef CONFIG_PPC32 31#ifdef CONFIG_PPC32
24# define GET_ADDR(addr) addr 32# define GET_ADDR(addr) addr
25#else 33#else
26/* PowerPC64's functions are data that points to the functions */ 34/* PowerPC64's functions are data that points to the functions */
27# define GET_ADDR(addr) *(unsigned long *)addr 35# define GET_ADDR(addr) (*(unsigned long *)addr)
28#endif 36#endif
29 37
30 38
@@ -33,12 +41,12 @@ static unsigned int ftrace_calc_offset(long ip, long addr)
33 return (int)(addr - ip); 41 return (int)(addr - ip);
34} 42}
35 43
36unsigned char *ftrace_nop_replace(void) 44static unsigned char *ftrace_nop_replace(void)
37{ 45{
38 return (char *)&ftrace_nop; 46 return (char *)&ftrace_nop;
39} 47}
40 48
41unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) 49static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
42{ 50{
43 static unsigned int op; 51 static unsigned int op;
44 52
@@ -68,49 +76,434 @@ unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
68# define _ASM_PTR " .long " 76# define _ASM_PTR " .long "
69#endif 77#endif
70 78
71int 79static int
72ftrace_modify_code(unsigned long ip, unsigned char *old_code, 80ftrace_modify_code(unsigned long ip, unsigned char *old_code,
73 unsigned char *new_code) 81 unsigned char *new_code)
74{ 82{
75 unsigned replaced; 83 unsigned char replaced[MCOUNT_INSN_SIZE];
76 unsigned old = *(unsigned *)old_code;
77 unsigned new = *(unsigned *)new_code;
78 int faulted = 0;
79 84
80 /* 85 /*
81 * Note: Due to modules and __init, code can 86 * Note: Due to modules and __init, code can
82 * disappear and change, we need to protect against faulting 87 * disappear and change, we need to protect against faulting
83 * as well as code changing. 88 * as well as code changing. We do this by using the
89 * probe_kernel_* functions.
84 * 90 *
85 * No real locking needed, this code is run through 91 * No real locking needed, this code is run through
86 * kstop_machine. 92 * kstop_machine, or before SMP starts.
87 */ 93 */
88 asm volatile ( 94
89 "1: lwz %1, 0(%2)\n" 95 /* read the text we want to modify */
90 " cmpw %1, %5\n" 96 if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
91 " bne 2f\n" 97 return -EFAULT;
92 " stwu %3, 0(%2)\n" 98
93 "2:\n" 99 /* Make sure it is what we expect it to be */
94 ".section .fixup, \"ax\"\n" 100 if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0)
95 "3: li %0, 1\n" 101 return -EINVAL;
96 " b 2b\n" 102
97 ".previous\n" 103 /* replace the text with the new text */
98 ".section __ex_table,\"a\"\n" 104 if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE))
99 _ASM_ALIGN "\n" 105 return -EPERM;
100 _ASM_PTR "1b, 3b\n" 106
101 ".previous" 107 flush_icache_range(ip, ip + 8);
102 : "=r"(faulted), "=r"(replaced) 108
103 : "r"(ip), "r"(new), 109 return 0;
104 "0"(faulted), "r"(old) 110}
105 : "memory"); 111
106 112/*
107 if (replaced != old && replaced != new) 113 * Helper functions that are the same for both PPC64 and PPC32.
108 faulted = 2; 114 */
109 115static int test_24bit_addr(unsigned long ip, unsigned long addr)
110 if (!faulted) 116{
111 flush_icache_range(ip, ip + 8); 117 long diff;
112 118
113 return faulted; 119 /*
120 * Can we get to addr from ip in 24 bits?
121 * (26 really, since we mulitply by 4 for 4 byte alignment)
122 */
123 diff = addr - ip;
124
125 /*
126 * Return true if diff is less than 1 << 25
127 * and greater than -1 << 26.
128 */
129 return (diff < (1 << 25)) && (diff > (-1 << 26));
130}
131
132static int is_bl_op(unsigned int op)
133{
134 return (op & 0xfc000003) == 0x48000001;
135}
136
137static int test_offset(unsigned long offset)
138{
139 return (offset + 0x2000000 > 0x3ffffff) || ((offset & 3) != 0);
140}
141
142static unsigned long find_bl_target(unsigned long ip, unsigned int op)
143{
144 static int offset;
145
146 offset = (op & 0x03fffffc);
147 /* make it signed */
148 if (offset & 0x02000000)
149 offset |= 0xfe000000;
150
151 return ip + (long)offset;
152}
153
154static unsigned int branch_offset(unsigned long offset)
155{
156 /* return "bl ip+offset" */
157 return 0x48000001 | (offset & 0x03fffffc);
158}
159
160#ifdef CONFIG_PPC64
161static int
162__ftrace_make_nop(struct module *mod,
163 struct dyn_ftrace *rec, unsigned long addr)
164{
165 unsigned char replaced[MCOUNT_INSN_SIZE * 2];
166 unsigned int *op = (unsigned *)&replaced;
167 unsigned char jmp[8];
168 unsigned long *ptr = (unsigned long *)&jmp;
169 unsigned long ip = rec->ip;
170 unsigned long tramp;
171 int offset;
172
173 /* read where this goes */
174 if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
175 return -EFAULT;
176
177 /* Make sure that that this is still a 24bit jump */
178 if (!is_bl_op(*op)) {
179 printk(KERN_ERR "Not expected bl: opcode is %x\n", *op);
180 return -EINVAL;
181 }
182
183 /* lets find where the pointer goes */
184 tramp = find_bl_target(ip, *op);
185
186 /*
187 * On PPC64 the trampoline looks like:
188 * 0x3d, 0x82, 0x00, 0x00, addis r12,r2, <high>
189 * 0x39, 0x8c, 0x00, 0x00, addi r12,r12, <low>
190 * Where the bytes 2,3,6 and 7 make up the 32bit offset
191 * to the TOC that holds the pointer.
192 * to jump to.
193 * 0xf8, 0x41, 0x00, 0x28, std r2,40(r1)
194 * 0xe9, 0x6c, 0x00, 0x20, ld r11,32(r12)
195 * The actually address is 32 bytes from the offset
196 * into the TOC.
197 * 0xe8, 0x4c, 0x00, 0x28, ld r2,40(r12)
198 */
199
200 DEBUGP("ip:%lx jumps to %lx r2: %lx", ip, tramp, mod->arch.toc);
201
202 /* Find where the trampoline jumps to */
203 if (probe_kernel_read(jmp, (void *)tramp, 8)) {
204 printk(KERN_ERR "Failed to read %lx\n", tramp);
205 return -EFAULT;
206 }
207
208 DEBUGP(" %08x %08x",
209 (unsigned)(*ptr >> 32),
210 (unsigned)*ptr);
211
212 offset = (unsigned)jmp[2] << 24 |
213 (unsigned)jmp[3] << 16 |
214 (unsigned)jmp[6] << 8 |
215 (unsigned)jmp[7];
216
217 DEBUGP(" %x ", offset);
218
219 /* get the address this jumps too */
220 tramp = mod->arch.toc + offset + 32;
221 DEBUGP("toc: %lx", tramp);
222
223 if (probe_kernel_read(jmp, (void *)tramp, 8)) {
224 printk(KERN_ERR "Failed to read %lx\n", tramp);
225 return -EFAULT;
226 }
227
228 DEBUGP(" %08x %08x\n",
229 (unsigned)(*ptr >> 32),
230 (unsigned)*ptr);
231
232 /* This should match what was called */
233 if (*ptr != GET_ADDR(addr)) {
234 printk(KERN_ERR "addr does not match %lx\n", *ptr);
235 return -EINVAL;
236 }
237
238 /*
239 * We want to nop the line, but the next line is
240 * 0xe8, 0x41, 0x00, 0x28 ld r2,40(r1)
241 * This needs to be turned to a nop too.
242 */
243 if (probe_kernel_read(replaced, (void *)(ip+4), MCOUNT_INSN_SIZE))
244 return -EFAULT;
245
246 if (*op != 0xe8410028) {
247 printk(KERN_ERR "Next line is not ld! (%08x)\n", *op);
248 return -EINVAL;
249 }
250
251 /*
252 * Milton Miller pointed out that we can not blindly do nops.
253 * If a task was preempted when calling a trace function,
254 * the nops will remove the way to restore the TOC in r2
255 * and the r2 TOC will get corrupted.
256 */
257
258 /*
259 * Replace:
260 * bl <tramp> <==== will be replaced with "b 1f"
261 * ld r2,40(r1)
262 * 1:
263 */
264 op[0] = 0x48000008; /* b +8 */
265
266 if (probe_kernel_write((void *)ip, replaced, MCOUNT_INSN_SIZE))
267 return -EPERM;
268
269 return 0;
270}
271
272#else /* !PPC64 */
273static int
274__ftrace_make_nop(struct module *mod,
275 struct dyn_ftrace *rec, unsigned long addr)
276{
277 unsigned char replaced[MCOUNT_INSN_SIZE];
278 unsigned int *op = (unsigned *)&replaced;
279 unsigned char jmp[8];
280 unsigned int *ptr = (unsigned int *)&jmp;
281 unsigned long ip = rec->ip;
282 unsigned long tramp;
283 int offset;
284
285 if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
286 return -EFAULT;
287
288 /* Make sure that that this is still a 24bit jump */
289 if (!is_bl_op(*op)) {
290 printk(KERN_ERR "Not expected bl: opcode is %x\n", *op);
291 return -EINVAL;
292 }
293
294 /* lets find where the pointer goes */
295 tramp = find_bl_target(ip, *op);
296
297 /*
298 * On PPC32 the trampoline looks like:
299 * lis r11,sym@ha
300 * addi r11,r11,sym@l
301 * mtctr r11
302 * bctr
303 */
304
305 DEBUGP("ip:%lx jumps to %lx", ip, tramp);
306
307 /* Find where the trampoline jumps to */
308 if (probe_kernel_read(jmp, (void *)tramp, 8)) {
309 printk(KERN_ERR "Failed to read %lx\n", tramp);
310 return -EFAULT;
311 }
312
313 DEBUGP(" %08x %08x ", ptr[0], ptr[1]);
314
315 tramp = (ptr[1] & 0xffff) |
316 ((ptr[0] & 0xffff) << 16);
317 if (tramp & 0x8000)
318 tramp -= 0x10000;
319
320 DEBUGP(" %x ", tramp);
321
322 if (tramp != addr) {
323 printk(KERN_ERR
324 "Trampoline location %08lx does not match addr\n",
325 tramp);
326 return -EINVAL;
327 }
328
329 op[0] = PPC_NOP_INSTR;
330
331 if (probe_kernel_write((void *)ip, replaced, MCOUNT_INSN_SIZE))
332 return -EPERM;
333
334 return 0;
335}
336#endif /* PPC64 */
337
338int ftrace_make_nop(struct module *mod,
339 struct dyn_ftrace *rec, unsigned long addr)
340{
341 unsigned char *old, *new;
342 unsigned long ip = rec->ip;
343
344 /*
345 * If the calling address is more that 24 bits away,
346 * then we had to use a trampoline to make the call.
347 * Otherwise just update the call site.
348 */
349 if (test_24bit_addr(ip, addr)) {
350 /* within range */
351 old = ftrace_call_replace(ip, addr);
352 new = ftrace_nop_replace();
353 return ftrace_modify_code(ip, old, new);
354 }
355
356 /*
357 * Out of range jumps are called from modules.
358 * We should either already have a pointer to the module
359 * or it has been passed in.
360 */
361 if (!rec->arch.mod) {
362 if (!mod) {
363 printk(KERN_ERR "No module loaded addr=%lx\n",
364 addr);
365 return -EFAULT;
366 }
367 rec->arch.mod = mod;
368 } else if (mod) {
369 if (mod != rec->arch.mod) {
370 printk(KERN_ERR
371 "Record mod %p not equal to passed in mod %p\n",
372 rec->arch.mod, mod);
373 return -EINVAL;
374 }
375 /* nothing to do if mod == rec->arch.mod */
376 } else
377 mod = rec->arch.mod;
378
379 return __ftrace_make_nop(mod, rec, addr);
380
381}
382
383#ifdef CONFIG_PPC64
384static int
385__ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
386{
387 unsigned char replaced[MCOUNT_INSN_SIZE * 2];
388 unsigned int *op = (unsigned *)&replaced;
389 unsigned long ip = rec->ip;
390 unsigned long offset;
391
392 /* read where this goes */
393 if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE * 2))
394 return -EFAULT;
395
396 /*
397 * It should be pointing to two nops or
398 * b +8; ld r2,40(r1)
399 */
400 if (((op[0] != 0x48000008) || (op[1] != 0xe8410028)) &&
401 ((op[0] != PPC_NOP_INSTR) || (op[1] != PPC_NOP_INSTR))) {
402 printk(KERN_ERR "Expected NOPs but have %x %x\n", op[0], op[1]);
403 return -EINVAL;
404 }
405
406 /* If we never set up a trampoline to ftrace_caller, then bail */
407 if (!rec->arch.mod->arch.tramp) {
408 printk(KERN_ERR "No ftrace trampoline\n");
409 return -EINVAL;
410 }
411
412 /* now calculate a jump to the ftrace caller trampoline */
413 offset = rec->arch.mod->arch.tramp - ip;
414
415 if (test_offset(offset)) {
416 printk(KERN_ERR "REL24 %li out of range!\n",
417 (long int)offset);
418 return -EINVAL;
419 }
420
421 /* Set to "bl addr" */
422 op[0] = branch_offset(offset);
423 /* ld r2,40(r1) */
424 op[1] = 0xe8410028;
425
426 DEBUGP("write to %lx\n", rec->ip);
427
428 if (probe_kernel_write((void *)ip, replaced, MCOUNT_INSN_SIZE * 2))
429 return -EPERM;
430
431 return 0;
432}
433#else
434static int
435__ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
436{
437 unsigned char replaced[MCOUNT_INSN_SIZE];
438 unsigned int *op = (unsigned *)&replaced;
439 unsigned long ip = rec->ip;
440 unsigned long offset;
441
442 /* read where this goes */
443 if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
444 return -EFAULT;
445
446 /* It should be pointing to a nop */
447 if (op[0] != PPC_NOP_INSTR) {
448 printk(KERN_ERR "Expected NOP but have %x\n", op[0]);
449 return -EINVAL;
450 }
451
452 /* If we never set up a trampoline to ftrace_caller, then bail */
453 if (!rec->arch.mod->arch.tramp) {
454 printk(KERN_ERR "No ftrace trampoline\n");
455 return -EINVAL;
456 }
457
458 /* now calculate a jump to the ftrace caller trampoline */
459 offset = rec->arch.mod->arch.tramp - ip;
460
461 if (test_offset(offset)) {
462 printk(KERN_ERR "REL24 %li out of range!\n",
463 (long int)offset);
464 return -EINVAL;
465 }
466
467 /* Set to "bl addr" */
468 op[0] = branch_offset(offset);
469
470 DEBUGP("write to %lx\n", rec->ip);
471
472 if (probe_kernel_write((void *)ip, replaced, MCOUNT_INSN_SIZE))
473 return -EPERM;
474
475 return 0;
476}
477#endif /* CONFIG_PPC64 */
478
479int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
480{
481 unsigned char *old, *new;
482 unsigned long ip = rec->ip;
483
484 /*
485 * If the calling address is more that 24 bits away,
486 * then we had to use a trampoline to make the call.
487 * Otherwise just update the call site.
488 */
489 if (test_24bit_addr(ip, addr)) {
490 /* within range */
491 old = ftrace_nop_replace();
492 new = ftrace_call_replace(ip, addr);
493 return ftrace_modify_code(ip, old, new);
494 }
495
496 /*
497 * Out of range jumps are called from modules.
498 * Being that we are converting from nop, it had better
499 * already have a module defined.
500 */
501 if (!rec->arch.mod) {
502 printk(KERN_ERR "No module loaded\n");
503 return -EINVAL;
504 }
505
506 return __ftrace_make_call(rec, addr);
114} 507}
115 508
116int ftrace_update_ftrace_func(ftrace_func_t func) 509int ftrace_update_ftrace_func(ftrace_func_t func)
@@ -128,10 +521,10 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
128 521
129int __init ftrace_dyn_arch_init(void *data) 522int __init ftrace_dyn_arch_init(void *data)
130{ 523{
131 /* This is running in kstop_machine */ 524 /* caller expects data to be zero */
525 unsigned long *p = data;
132 526
133 ftrace_mcount_set(data); 527 *p = 0;
134 528
135 return 0; 529 return 0;
136} 530}
137
diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c
index 31982d05d81a..88d9c1d5e5fb 100644
--- a/arch/powerpc/kernel/idle.c
+++ b/arch/powerpc/kernel/idle.c
@@ -69,10 +69,15 @@ void cpu_idle(void)
 		smp_mb();
 		local_irq_disable();
 
+		/* Don't trace irqs off for idle */
+		stop_critical_timings();
+
 		/* check again after disabling irqs */
 		if (!need_resched() && !cpu_should_die())
 			ppc_md.power_save();
 
+		start_critical_timings();
+
 		local_irq_enable();
 		set_thread_flag(TIF_POLLING_NRFLAG);
 
diff --git a/arch/powerpc/kernel/module_32.c b/arch/powerpc/kernel/module_32.c
index 2df91a03462a..f832773fc28e 100644
--- a/arch/powerpc/kernel/module_32.c
+++ b/arch/powerpc/kernel/module_32.c
@@ -22,6 +22,7 @@
22#include <linux/fs.h> 22#include <linux/fs.h>
23#include <linux/string.h> 23#include <linux/string.h>
24#include <linux/kernel.h> 24#include <linux/kernel.h>
25#include <linux/ftrace.h>
25#include <linux/cache.h> 26#include <linux/cache.h>
26#include <linux/bug.h> 27#include <linux/bug.h>
27#include <linux/sort.h> 28#include <linux/sort.h>
@@ -53,6 +54,9 @@ static unsigned int count_relocs(const Elf32_Rela *rela, unsigned int num)
53 r_addend = rela[i].r_addend; 54 r_addend = rela[i].r_addend;
54 } 55 }
55 56
57#ifdef CONFIG_DYNAMIC_FTRACE
58 _count_relocs++; /* add one for ftrace_caller */
59#endif
56 return _count_relocs; 60 return _count_relocs;
57} 61}
58 62
@@ -306,5 +310,11 @@ int apply_relocate_add(Elf32_Shdr *sechdrs,
306 return -ENOEXEC; 310 return -ENOEXEC;
307 } 311 }
308 } 312 }
313#ifdef CONFIG_DYNAMIC_FTRACE
314 module->arch.tramp =
315 do_plt_call(module->module_core,
316 (unsigned long)ftrace_caller,
317 sechdrs, module);
318#endif
309 return 0; 319 return 0;
310} 320}
diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c
index 1af2377e4992..8992b031a7b6 100644
--- a/arch/powerpc/kernel/module_64.c
+++ b/arch/powerpc/kernel/module_64.c
@@ -20,6 +20,7 @@
20#include <linux/moduleloader.h> 20#include <linux/moduleloader.h>
21#include <linux/err.h> 21#include <linux/err.h>
22#include <linux/vmalloc.h> 22#include <linux/vmalloc.h>
23#include <linux/ftrace.h>
23#include <linux/bug.h> 24#include <linux/bug.h>
24#include <asm/module.h> 25#include <asm/module.h>
25#include <asm/firmware.h> 26#include <asm/firmware.h>
@@ -163,6 +164,11 @@ static unsigned long get_stubs_size(const Elf64_Ehdr *hdr,
163 } 164 }
164 } 165 }
165 166
167#ifdef CONFIG_DYNAMIC_FTRACE
168 /* make the trampoline to the ftrace_caller */
169 relocs++;
170#endif
171
166 DEBUGP("Looks like a total of %lu stubs, max\n", relocs); 172 DEBUGP("Looks like a total of %lu stubs, max\n", relocs);
167 return relocs * sizeof(struct ppc64_stub_entry); 173 return relocs * sizeof(struct ppc64_stub_entry);
168} 174}
@@ -441,5 +447,12 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
441 } 447 }
442 } 448 }
443 449
450#ifdef CONFIG_DYNAMIC_FTRACE
451 me->arch.toc = my_r2(sechdrs, me);
452 me->arch.tramp = stub_for_addr(sechdrs,
453 (unsigned long)ftrace_caller,
454 me);
455#endif
456
444 return 0; 457 return 0;
445} 458}
diff --git a/arch/sh/include/asm/topology.h b/arch/sh/include/asm/topology.h
index 95f0085e098a..279d9cc4a007 100644
--- a/arch/sh/include/asm/topology.h
+++ b/arch/sh/include/asm/topology.h
@@ -5,7 +5,6 @@
 
 /* sched_domains SD_NODE_INIT for sh machines */
 #define SD_NODE_INIT (struct sched_domain) {		\
-	.span			= CPU_MASK_NONE,	\
 	.parent			= NULL,			\
 	.child			= NULL,			\
 	.groups			= NULL,			\
diff --git a/arch/um/include/asm/system.h b/arch/um/include/asm/system.h
index 753346e2cdfd..ae5f94d6317d 100644
--- a/arch/um/include/asm/system.h
+++ b/arch/um/include/asm/system.h
@@ -11,21 +11,21 @@ extern int get_signals(void);
 extern void block_signals(void);
 extern void unblock_signals(void);
 
-#define local_save_flags(flags) do { typecheck(unsigned long, flags); \
+#define raw_local_save_flags(flags) do { typecheck(unsigned long, flags); \
 				     (flags) = get_signals(); } while(0)
-#define local_irq_restore(flags) do { typecheck(unsigned long, flags); \
+#define raw_local_irq_restore(flags) do { typecheck(unsigned long, flags); \
 				      set_signals(flags); } while(0)
 
-#define local_irq_save(flags) do { local_save_flags(flags); \
-				   local_irq_disable(); } while(0)
+#define raw_local_irq_save(flags) do { raw_local_save_flags(flags); \
+				       raw_local_irq_disable(); } while(0)
 
-#define local_irq_enable() unblock_signals()
-#define local_irq_disable() block_signals()
+#define raw_local_irq_enable() unblock_signals()
+#define raw_local_irq_disable() block_signals()
 
 #define irqs_disabled() \
 ({ \
 	unsigned long flags; \
-	local_save_flags(flags); \
+	raw_local_save_flags(flags); \
 	(flags == 0); \
 })
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index ac22bb7719f7..352f63df1d80 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -29,11 +29,14 @@ config X86
 	select HAVE_FTRACE_MCOUNT_RECORD
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_FUNCTION_TRACER
+	select HAVE_FUNCTION_RET_TRACER if X86_32
+	select HAVE_FUNCTION_TRACE_MCOUNT_TEST
 	select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
 	select HAVE_ARCH_KGDB if !X86_VOYAGER
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_GENERIC_DMA_COHERENT if X86_32
 	select HAVE_EFFICIENT_UNALIGNED_ACCESS
+	select USER_STACKTRACE_SUPPORT
 
 config ARCH_DEFCONFIG
 	string
@@ -367,10 +370,10 @@ config X86_RDC321X
 	  as R-8610-(G).
 	  If you don't have one of these chips, you should say N here.
 
-config SCHED_NO_NO_OMIT_FRAME_POINTER
+config SCHED_OMIT_FRAME_POINTER
 	def_bool y
 	prompt "Single-depth WCHAN output"
-	depends on X86_32
+	depends on X86
 	help
 	  Calculate simpler /proc/<PID>/wchan values. If this option
 	  is disabled then wchan values will recurse back to the
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 2a3dfbd5e677..fa013f529b74 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -186,14 +186,10 @@ config IOMMU_LEAK
186 Add a simple leak tracer to the IOMMU code. This is useful when you 186 Add a simple leak tracer to the IOMMU code. This is useful when you
187 are debugging a buggy device driver that leaks IOMMU mappings. 187 are debugging a buggy device driver that leaks IOMMU mappings.
188 188
189config MMIOTRACE_HOOKS
190 bool
191
192config MMIOTRACE 189config MMIOTRACE
193 bool "Memory mapped IO tracing" 190 bool "Memory mapped IO tracing"
194 depends on DEBUG_KERNEL && PCI 191 depends on DEBUG_KERNEL && PCI
195 select TRACING 192 select TRACING
196 select MMIOTRACE_HOOKS
197 help 193 help
198 Mmiotrace traces Memory Mapped I/O access and is meant for 194 Mmiotrace traces Memory Mapped I/O access and is meant for
199 debugging and reverse engineering. It is called from the ioremap 195 debugging and reverse engineering. It is called from the ioremap
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index 9e8bc29b8b17..754a3e082f94 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -17,8 +17,40 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr)
17 */ 17 */
18 return addr - 1; 18 return addr - 1;
19} 19}
20#endif
21 20
21#ifdef CONFIG_DYNAMIC_FTRACE
22
23struct dyn_arch_ftrace {
24 /* No extra data needed for x86 */
25};
26
27#endif /* CONFIG_DYNAMIC_FTRACE */
28#endif /* __ASSEMBLY__ */
22#endif /* CONFIG_FUNCTION_TRACER */ 29#endif /* CONFIG_FUNCTION_TRACER */
23 30
31#ifdef CONFIG_FUNCTION_RET_TRACER
32
33#ifndef __ASSEMBLY__
34
35/*
36 * Stack of return addresses for functions
37 * of a thread.
38 * Used in struct thread_info
39 */
40struct ftrace_ret_stack {
41 unsigned long ret;
42 unsigned long func;
43 unsigned long long calltime;
44};
45
46/*
47 * Primary handler of a function return.
 48 * It relies on ftrace_return_to_handler.
 49 * Defined in entry_32.S
50 */
51extern void return_to_handler(void);
52
53#endif /* __ASSEMBLY__ */
54#endif /* CONFIG_FUNCTION_RET_TRACER */
55
24#endif /* _ASM_X86_FTRACE_H */ 56#endif /* _ASM_X86_FTRACE_H */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index e44d379faad2..0921b4018c11 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -20,6 +20,8 @@
20struct task_struct; 20struct task_struct;
21struct exec_domain; 21struct exec_domain;
22#include <asm/processor.h> 22#include <asm/processor.h>
23#include <asm/ftrace.h>
24#include <asm/atomic.h>
23 25
24struct thread_info { 26struct thread_info {
25 struct task_struct *task; /* main task structure */ 27 struct task_struct *task; /* main task structure */
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 35c54921b2e4..99192bb55a53 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -157,6 +157,7 @@ extern int __get_user_bad(void);
157 int __ret_gu; \ 157 int __ret_gu; \
158 unsigned long __val_gu; \ 158 unsigned long __val_gu; \
159 __chk_user_ptr(ptr); \ 159 __chk_user_ptr(ptr); \
160 might_fault(); \
160 switch (sizeof(*(ptr))) { \ 161 switch (sizeof(*(ptr))) { \
161 case 1: \ 162 case 1: \
162 __get_user_x(1, __ret_gu, __val_gu, ptr); \ 163 __get_user_x(1, __ret_gu, __val_gu, ptr); \
@@ -241,6 +242,7 @@ extern void __put_user_8(void);
241 int __ret_pu; \ 242 int __ret_pu; \
242 __typeof__(*(ptr)) __pu_val; \ 243 __typeof__(*(ptr)) __pu_val; \
243 __chk_user_ptr(ptr); \ 244 __chk_user_ptr(ptr); \
245 might_fault(); \
244 __pu_val = x; \ 246 __pu_val = x; \
245 switch (sizeof(*(ptr))) { \ 247 switch (sizeof(*(ptr))) { \
246 case 1: \ 248 case 1: \
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index d095a3aeea1b..5e06259e90e5 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -82,8 +82,8 @@ __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)
82static __always_inline unsigned long __must_check 82static __always_inline unsigned long __must_check
83__copy_to_user(void __user *to, const void *from, unsigned long n) 83__copy_to_user(void __user *to, const void *from, unsigned long n)
84{ 84{
85 might_sleep(); 85 might_fault();
86 return __copy_to_user_inatomic(to, from, n); 86 return __copy_to_user_inatomic(to, from, n);
87} 87}
88 88
89static __always_inline unsigned long 89static __always_inline unsigned long
@@ -137,7 +137,7 @@ __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
137static __always_inline unsigned long 137static __always_inline unsigned long
138__copy_from_user(void *to, const void __user *from, unsigned long n) 138__copy_from_user(void *to, const void __user *from, unsigned long n)
139{ 139{
140 might_sleep(); 140 might_fault();
141 if (__builtin_constant_p(n)) { 141 if (__builtin_constant_p(n)) {
142 unsigned long ret; 142 unsigned long ret;
143 143
@@ -159,7 +159,7 @@ __copy_from_user(void *to, const void __user *from, unsigned long n)
159static __always_inline unsigned long __copy_from_user_nocache(void *to, 159static __always_inline unsigned long __copy_from_user_nocache(void *to,
160 const void __user *from, unsigned long n) 160 const void __user *from, unsigned long n)
161{ 161{
162 might_sleep(); 162 might_fault();
163 if (__builtin_constant_p(n)) { 163 if (__builtin_constant_p(n)) {
164 unsigned long ret; 164 unsigned long ret;
165 165
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index f8cfd00db450..84210c479fca 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -29,6 +29,8 @@ static __always_inline __must_check
29int __copy_from_user(void *dst, const void __user *src, unsigned size) 29int __copy_from_user(void *dst, const void __user *src, unsigned size)
30{ 30{
31 int ret = 0; 31 int ret = 0;
32
33 might_fault();
32 if (!__builtin_constant_p(size)) 34 if (!__builtin_constant_p(size))
33 return copy_user_generic(dst, (__force void *)src, size); 35 return copy_user_generic(dst, (__force void *)src, size);
34 switch (size) { 36 switch (size) {
@@ -71,6 +73,8 @@ static __always_inline __must_check
71int __copy_to_user(void __user *dst, const void *src, unsigned size) 73int __copy_to_user(void __user *dst, const void *src, unsigned size)
72{ 74{
73 int ret = 0; 75 int ret = 0;
76
77 might_fault();
74 if (!__builtin_constant_p(size)) 78 if (!__builtin_constant_p(size))
75 return copy_user_generic((__force void *)dst, src, size); 79 return copy_user_generic((__force void *)dst, src, size);
76 switch (size) { 80 switch (size) {
@@ -113,6 +117,8 @@ static __always_inline __must_check
113int __copy_in_user(void __user *dst, const void __user *src, unsigned size) 117int __copy_in_user(void __user *dst, const void __user *src, unsigned size)
114{ 118{
115 int ret = 0; 119 int ret = 0;
120
121 might_fault();
116 if (!__builtin_constant_p(size)) 122 if (!__builtin_constant_p(size))
117 return copy_user_generic((__force void *)dst, 123 return copy_user_generic((__force void *)dst,
118 (__force void *)src, size); 124 (__force void *)src, size);
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index b62a7667828e..af2bc36ca1c4 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -14,6 +14,11 @@ CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
14CFLAGS_REMOVE_ftrace.o = -pg 14CFLAGS_REMOVE_ftrace.o = -pg
15endif 15endif
16 16
17ifdef CONFIG_FUNCTION_RET_TRACER
18# Don't trace __switch_to() when the return tracer is built, but leave it traceable by the plain function tracer
19CFLAGS_REMOVE_process_32.o = -pg
20endif
21
17# 22#
18# vsyscalls (which work on the user stack) should have 23# vsyscalls (which work on the user stack) should have
19# no stack-protector checks: 24# no stack-protector checks:
@@ -65,6 +70,7 @@ obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
65obj-$(CONFIG_X86_IO_APIC) += io_apic.o 70obj-$(CONFIG_X86_IO_APIC) += io_apic.o
66obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o 71obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
67obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o 72obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
73obj-$(CONFIG_FUNCTION_RET_TRACER) += ftrace.o
68obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o 74obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
69obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o 75obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
70obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o 76obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 28b597ef9ca1..74defe21ba42 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1157,6 +1157,9 @@ ENTRY(mcount)
1157END(mcount) 1157END(mcount)
1158 1158
1159ENTRY(ftrace_caller) 1159ENTRY(ftrace_caller)
1160 cmpl $0, function_trace_stop
1161 jne ftrace_stub
1162
1160 pushl %eax 1163 pushl %eax
1161 pushl %ecx 1164 pushl %ecx
1162 pushl %edx 1165 pushl %edx
@@ -1180,8 +1183,15 @@ END(ftrace_caller)
1180#else /* ! CONFIG_DYNAMIC_FTRACE */ 1183#else /* ! CONFIG_DYNAMIC_FTRACE */
1181 1184
1182ENTRY(mcount) 1185ENTRY(mcount)
1186 cmpl $0, function_trace_stop
1187 jne ftrace_stub
1188
1183 cmpl $ftrace_stub, ftrace_trace_function 1189 cmpl $ftrace_stub, ftrace_trace_function
1184 jnz trace 1190 jnz trace
1191#ifdef CONFIG_FUNCTION_RET_TRACER
1192 cmpl $ftrace_stub, ftrace_function_return
1193 jnz ftrace_return_caller
1194#endif
1185.globl ftrace_stub 1195.globl ftrace_stub
1186ftrace_stub: 1196ftrace_stub:
1187 ret 1197 ret
@@ -1200,12 +1210,42 @@ trace:
1200 popl %edx 1210 popl %edx
1201 popl %ecx 1211 popl %ecx
1202 popl %eax 1212 popl %eax
1203
1204 jmp ftrace_stub 1213 jmp ftrace_stub
1205END(mcount) 1214END(mcount)
1206#endif /* CONFIG_DYNAMIC_FTRACE */ 1215#endif /* CONFIG_DYNAMIC_FTRACE */
1207#endif /* CONFIG_FUNCTION_TRACER */ 1216#endif /* CONFIG_FUNCTION_TRACER */
1208 1217
1218#ifdef CONFIG_FUNCTION_RET_TRACER
1219ENTRY(ftrace_return_caller)
1220 cmpl $0, function_trace_stop
1221 jne ftrace_stub
1222
1223 pushl %eax
1224 pushl %ecx
1225 pushl %edx
1226 movl 0xc(%esp), %edx
1227 lea 0x4(%ebp), %eax
1228 call prepare_ftrace_return
1229 popl %edx
1230 popl %ecx
1231 popl %eax
1232 ret
1233END(ftrace_return_caller)
1234
1235.globl return_to_handler
1236return_to_handler:
1237 pushl $0
1238 pushl %eax
1239 pushl %ecx
1240 pushl %edx
1241 call ftrace_return_to_handler
1242 movl %eax, 0xc(%esp)
1243 popl %edx
1244 popl %ecx
1245 popl %eax
1246 ret
1247#endif
1248
1209.section .rodata,"a" 1249.section .rodata,"a"
1210#include "syscall_table_32.S" 1250#include "syscall_table_32.S"
1211 1251
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b86f332c96a6..08aa6b10933c 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -68,6 +68,8 @@ ENTRY(mcount)
68END(mcount) 68END(mcount)
69 69
70ENTRY(ftrace_caller) 70ENTRY(ftrace_caller)
71 cmpl $0, function_trace_stop
72 jne ftrace_stub
71 73
72 /* taken from glibc */ 74 /* taken from glibc */
73 subq $0x38, %rsp 75 subq $0x38, %rsp
@@ -103,6 +105,9 @@ END(ftrace_caller)
103 105
104#else /* ! CONFIG_DYNAMIC_FTRACE */ 106#else /* ! CONFIG_DYNAMIC_FTRACE */
105ENTRY(mcount) 107ENTRY(mcount)
108 cmpl $0, function_trace_stop
109 jne ftrace_stub
110
106 cmpq $ftrace_stub, ftrace_trace_function 111 cmpq $ftrace_stub, ftrace_trace_function
107 jnz trace 112 jnz trace
108.globl ftrace_stub 113.globl ftrace_stub
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 50ea0ac8c9bf..bb137f7297ed 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -14,14 +14,17 @@
14#include <linux/uaccess.h> 14#include <linux/uaccess.h>
15#include <linux/ftrace.h> 15#include <linux/ftrace.h>
16#include <linux/percpu.h> 16#include <linux/percpu.h>
17#include <linux/sched.h>
17#include <linux/init.h> 18#include <linux/init.h>
18#include <linux/list.h> 19#include <linux/list.h>
19 20
20#include <asm/ftrace.h> 21#include <asm/ftrace.h>
22#include <linux/ftrace.h>
21#include <asm/nops.h> 23#include <asm/nops.h>
24#include <asm/nmi.h>
22 25
23 26
24static unsigned char ftrace_nop[MCOUNT_INSN_SIZE]; 27#ifdef CONFIG_DYNAMIC_FTRACE
25 28
26union ftrace_code_union { 29union ftrace_code_union {
27 char code[MCOUNT_INSN_SIZE]; 30 char code[MCOUNT_INSN_SIZE];
@@ -31,18 +34,12 @@ union ftrace_code_union {
31 } __attribute__((packed)); 34 } __attribute__((packed));
32}; 35};
33 36
34
35static int ftrace_calc_offset(long ip, long addr) 37static int ftrace_calc_offset(long ip, long addr)
36{ 38{
37 return (int)(addr - ip); 39 return (int)(addr - ip);
38} 40}
39 41
40unsigned char *ftrace_nop_replace(void) 42static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
41{
42 return ftrace_nop;
43}
44
45unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
46{ 43{
47 static union ftrace_code_union calc; 44 static union ftrace_code_union calc;
48 45
@@ -56,7 +53,143 @@ unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
56 return calc.code; 53 return calc.code;
57} 54}
58 55
59int 56/*
57 * Modifying code must take extra care. On an SMP machine, if
58 * the code being modified is also being executed on another CPU
59 * that CPU will have undefined results and possibly take a GPF.
 60 * We use kstop_machine to stop other CPUs from executing code.
61 * But this does not stop NMIs from happening. We still need
62 * to protect against that. We separate out the modification of
63 * the code to take care of this.
64 *
65 * Two buffers are added: An IP buffer and a "code" buffer.
66 *
67 * 1) Put the instruction pointer into the IP buffer
68 * and the new code into the "code" buffer.
69 * 2) Set a flag that says we are modifying code
70 * 3) Wait for any running NMIs to finish.
71 * 4) Write the code
 72 * 5) Clear the flag.
73 * 6) Wait for any running NMIs to finish.
74 *
75 * If an NMI is executed, the first thing it does is to call
76 * "ftrace_nmi_enter". This will check if the flag is set to write
77 * and if it is, it will write what is in the IP and "code" buffers.
78 *
79 * The trick is, it does not matter if everyone is writing the same
80 * content to the code location. Also, if a CPU is executing code
81 * it is OK to write to that code location if the contents being written
82 * are the same as what exists.
83 */
84
85static atomic_t in_nmi = ATOMIC_INIT(0);
86static int mod_code_status; /* holds return value of text write */
87static int mod_code_write; /* set when NMI should do the write */
88static void *mod_code_ip; /* holds the IP to write to */
89static void *mod_code_newcode; /* holds the text to write to the IP */
90
91static unsigned nmi_wait_count;
92static atomic_t nmi_update_count = ATOMIC_INIT(0);
93
94int ftrace_arch_read_dyn_info(char *buf, int size)
95{
96 int r;
97
98 r = snprintf(buf, size, "%u %u",
99 nmi_wait_count,
100 atomic_read(&nmi_update_count));
101 return r;
102}
103
104static void ftrace_mod_code(void)
105{
106 /*
 107 * Yes, more than one CPU can be writing to mod_code_status.
108 * (and the code itself)
109 * But if one were to fail, then they all should, and if one were
110 * to succeed, then they all should.
111 */
112 mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode,
113 MCOUNT_INSN_SIZE);
114
115}
116
117void ftrace_nmi_enter(void)
118{
119 atomic_inc(&in_nmi);
120 /* Must have in_nmi seen before reading write flag */
121 smp_mb();
122 if (mod_code_write) {
123 ftrace_mod_code();
124 atomic_inc(&nmi_update_count);
125 }
126}
127
128void ftrace_nmi_exit(void)
129{
130 /* Finish all executions before clearing in_nmi */
131 smp_wmb();
132 atomic_dec(&in_nmi);
133}
134
135static void wait_for_nmi(void)
136{
137 int waited = 0;
138
139 while (atomic_read(&in_nmi)) {
140 waited = 1;
141 cpu_relax();
142 }
143
144 if (waited)
145 nmi_wait_count++;
146}
147
148static int
149do_ftrace_mod_code(unsigned long ip, void *new_code)
150{
151 mod_code_ip = (void *)ip;
152 mod_code_newcode = new_code;
153
154 /* The buffers need to be visible before we let NMIs write them */
155 smp_wmb();
156
157 mod_code_write = 1;
158
159 /* Make sure write bit is visible before we wait on NMIs */
160 smp_mb();
161
162 wait_for_nmi();
163
164 /* Make sure all running NMIs have finished before we write the code */
165 smp_mb();
166
167 ftrace_mod_code();
168
169 /* Make sure the write happens before clearing the bit */
170 smp_wmb();
171
172 mod_code_write = 0;
173
174 /* make sure NMIs see the cleared bit */
175 smp_mb();
176
177 wait_for_nmi();
178
179 return mod_code_status;
180}
181
182
183
184
185static unsigned char ftrace_nop[MCOUNT_INSN_SIZE];
186
187static unsigned char *ftrace_nop_replace(void)
188{
189 return ftrace_nop;
190}
191
192static int
60ftrace_modify_code(unsigned long ip, unsigned char *old_code, 193ftrace_modify_code(unsigned long ip, unsigned char *old_code,
61 unsigned char *new_code) 194 unsigned char *new_code)
62{ 195{
@@ -81,7 +214,7 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
81 return -EINVAL; 214 return -EINVAL;
82 215
83 /* replace the text with the new text */ 216 /* replace the text with the new text */
84 if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE)) 217 if (do_ftrace_mod_code(ip, new_code))
85 return -EPERM; 218 return -EPERM;
86 219
87 sync_core(); 220 sync_core();
@@ -89,6 +222,29 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
89 return 0; 222 return 0;
90} 223}
91 224
225int ftrace_make_nop(struct module *mod,
226 struct dyn_ftrace *rec, unsigned long addr)
227{
228 unsigned char *new, *old;
229 unsigned long ip = rec->ip;
230
231 old = ftrace_call_replace(ip, addr);
232 new = ftrace_nop_replace();
233
234 return ftrace_modify_code(rec->ip, old, new);
235}
236
237int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
238{
239 unsigned char *new, *old;
240 unsigned long ip = rec->ip;
241
242 old = ftrace_nop_replace();
243 new = ftrace_call_replace(ip, addr);
244
245 return ftrace_modify_code(rec->ip, old, new);
246}
247
92int ftrace_update_ftrace_func(ftrace_func_t func) 248int ftrace_update_ftrace_func(ftrace_func_t func)
93{ 249{
94 unsigned long ip = (unsigned long)(&ftrace_call); 250 unsigned long ip = (unsigned long)(&ftrace_call);
@@ -165,3 +321,139 @@ int __init ftrace_dyn_arch_init(void *data)
165 321
166 return 0; 322 return 0;
167} 323}
324#endif
325
326#ifdef CONFIG_FUNCTION_RET_TRACER
327
328#ifndef CONFIG_DYNAMIC_FTRACE
329
330/*
331 * These functions are picked from those used on
332 * this page for dynamic ftrace. They have been
333 * simplified to ignore all traces in NMI context.
334 */
335static atomic_t in_nmi;
336
337void ftrace_nmi_enter(void)
338{
339 atomic_inc(&in_nmi);
340}
341
342void ftrace_nmi_exit(void)
343{
344 atomic_dec(&in_nmi);
345}
346#endif /* !CONFIG_DYNAMIC_FTRACE */
347
348/* Add a function return address to the trace stack on thread info. */
349static int push_return_trace(unsigned long ret, unsigned long long time,
350 unsigned long func)
351{
352 int index;
353
354 if (!current->ret_stack)
355 return -EBUSY;
356
357 /* The return trace stack is full */
358 if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
359 atomic_inc(&current->trace_overrun);
360 return -EBUSY;
361 }
362
363 index = ++current->curr_ret_stack;
364 barrier();
365 current->ret_stack[index].ret = ret;
366 current->ret_stack[index].func = func;
367 current->ret_stack[index].calltime = time;
368
369 return 0;
370}
371
372/* Retrieve a function return address from the trace stack on thread info. */
373static void pop_return_trace(unsigned long *ret, unsigned long long *time,
374 unsigned long *func, unsigned long *overrun)
375{
376 int index;
377
378 index = current->curr_ret_stack;
379 *ret = current->ret_stack[index].ret;
380 *func = current->ret_stack[index].func;
381 *time = current->ret_stack[index].calltime;
382 *overrun = atomic_read(&current->trace_overrun);
383 current->curr_ret_stack--;
384}
385
386/*
387 * Send the trace to the ring-buffer.
388 * @return the original return address.
389 */
390unsigned long ftrace_return_to_handler(void)
391{
392 struct ftrace_retfunc trace;
393 pop_return_trace(&trace.ret, &trace.calltime, &trace.func,
394 &trace.overrun);
395 trace.rettime = cpu_clock(raw_smp_processor_id());
396 ftrace_function_return(&trace);
397
398 return trace.ret;
399}
400
401/*
402 * Hook the return address and push it onto the stack of return
403 * addresses in the current thread info.
404 */
405void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
406{
407 unsigned long old;
408 unsigned long long calltime;
409 int faulted;
410 unsigned long return_hooker = (unsigned long)
411 &return_to_handler;
412
 413 /* NMIs are currently unsupported */
414 if (atomic_read(&in_nmi))
415 return;
416
417 /*
 418 * Protect against a fault, even though it should not
 419 * happen. This tracer is too intrusive to leave
 420 * such an access unprotected.
421 */
422 asm volatile(
423 "1: movl (%[parent_old]), %[old]\n"
424 "2: movl %[return_hooker], (%[parent_replaced])\n"
425 " movl $0, %[faulted]\n"
426
427 ".section .fixup, \"ax\"\n"
428 "3: movl $1, %[faulted]\n"
429 ".previous\n"
430
431 ".section __ex_table, \"a\"\n"
432 " .long 1b, 3b\n"
433 " .long 2b, 3b\n"
434 ".previous\n"
435
436 : [parent_replaced] "=r" (parent), [old] "=r" (old),
437 [faulted] "=r" (faulted)
438 : [parent_old] "0" (parent), [return_hooker] "r" (return_hooker)
439 : "memory"
440 );
441
442 if (WARN_ON(faulted)) {
443 unregister_ftrace_return();
444 return;
445 }
446
447 if (WARN_ON(!__kernel_text_address(old))) {
448 unregister_ftrace_return();
449 *parent = old;
450 return;
451 }
452
453 calltime = cpu_clock(raw_smp_processor_id());
454
455 if (push_return_trace(old, calltime, self_addr) == -EBUSY)
456 *parent = old;
457}
458
459#endif /* CONFIG_FUNCTION_RET_TRACER */
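
To show how the new ftrace_make_call()/ftrace_make_nop() hooks are meant to be driven, here is a rough sketch of the generic per-record update step. The real logic lives in kernel/trace/ftrace.c and additionally handles filtering and failure accounting, so this is a simplification rather than the in-tree code:

	static int update_record(struct dyn_ftrace *rec, int enable)
	{
		unsigned long ftrace_addr = (unsigned long)ftrace_caller;

		if (enable && !(rec->flags & FTRACE_FL_ENABLED)) {
			rec->flags |= FTRACE_FL_ENABLED;
			/* turn the saved nop back into a call to ftrace_caller */
			return ftrace_make_call(rec, ftrace_addr);
		}
		if (!enable && (rec->flags & FTRACE_FL_ENABLED)) {
			rec->flags &= ~FTRACE_FL_ENABLED;
			/* turn the mcount call site into a nop again */
			return ftrace_make_nop(NULL, rec, ftrace_addr);
		}
		return 0;
	}

Either helper returns 0 or one of the -EFAULT/-EINVAL/-EPERM codes documented in the include/linux/ftrace.h hunk further down.
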
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index a03e7f6d90c3..10786af95545 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -6,6 +6,7 @@
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <linux/stacktrace.h> 7#include <linux/stacktrace.h>
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/uaccess.h>
9#include <asm/stacktrace.h> 10#include <asm/stacktrace.h>
10 11
11static void save_stack_warning(void *data, char *msg) 12static void save_stack_warning(void *data, char *msg)
@@ -83,3 +84,66 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
83 trace->entries[trace->nr_entries++] = ULONG_MAX; 84 trace->entries[trace->nr_entries++] = ULONG_MAX;
84} 85}
85EXPORT_SYMBOL_GPL(save_stack_trace_tsk); 86EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
87
88/* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */
89
90struct stack_frame {
91 const void __user *next_fp;
92 unsigned long ret_addr;
93};
94
95static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
96{
97 int ret;
98
99 if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
100 return 0;
101
102 ret = 1;
103 pagefault_disable();
104 if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
105 ret = 0;
106 pagefault_enable();
107
108 return ret;
109}
110
111static inline void __save_stack_trace_user(struct stack_trace *trace)
112{
113 const struct pt_regs *regs = task_pt_regs(current);
114 const void __user *fp = (const void __user *)regs->bp;
115
116 if (trace->nr_entries < trace->max_entries)
117 trace->entries[trace->nr_entries++] = regs->ip;
118
119 while (trace->nr_entries < trace->max_entries) {
120 struct stack_frame frame;
121
122 frame.next_fp = NULL;
123 frame.ret_addr = 0;
124 if (!copy_stack_frame(fp, &frame))
125 break;
126 if ((unsigned long)fp < regs->sp)
127 break;
128 if (frame.ret_addr) {
129 trace->entries[trace->nr_entries++] =
130 frame.ret_addr;
131 }
132 if (fp == frame.next_fp)
133 break;
134 fp = frame.next_fp;
135 }
136}
137
138void save_stack_trace_user(struct stack_trace *trace)
139{
140 /*
141 * Trace user stack if we are not a kernel thread
142 */
143 if (current->mm) {
144 __save_stack_trace_user(trace);
145 }
146 if (trace->nr_entries < trace->max_entries)
147 trace->entries[trace->nr_entries++] = ULONG_MAX;
148}
149
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 0b8b6690a86d..6f3d3d4cd973 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -17,6 +17,9 @@
17 * want per guest time just set the kernel.vsyscall64 sysctl to 0. 17 * want per guest time just set the kernel.vsyscall64 sysctl to 0.
18 */ 18 */
19 19
20/* Disable profiling for userspace code: */
21#define DISABLE_BRANCH_PROFILING
22
20#include <linux/time.h> 23#include <linux/time.h>
21#include <linux/init.h> 24#include <linux/init.h>
22#include <linux/kernel.h> 25#include <linux/kernel.h>
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index 9e68075544f6..4a20b2f9a381 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -39,7 +39,7 @@ static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned lon
39#define __do_strncpy_from_user(dst, src, count, res) \ 39#define __do_strncpy_from_user(dst, src, count, res) \
40do { \ 40do { \
41 int __d0, __d1, __d2; \ 41 int __d0, __d1, __d2; \
42 might_sleep(); \ 42 might_fault(); \
43 __asm__ __volatile__( \ 43 __asm__ __volatile__( \
44 " testl %1,%1\n" \ 44 " testl %1,%1\n" \
45 " jz 2f\n" \ 45 " jz 2f\n" \
@@ -126,7 +126,7 @@ EXPORT_SYMBOL(strncpy_from_user);
126#define __do_clear_user(addr,size) \ 126#define __do_clear_user(addr,size) \
127do { \ 127do { \
128 int __d0; \ 128 int __d0; \
129 might_sleep(); \ 129 might_fault(); \
130 __asm__ __volatile__( \ 130 __asm__ __volatile__( \
131 "0: rep; stosl\n" \ 131 "0: rep; stosl\n" \
132 " movl %2,%0\n" \ 132 " movl %2,%0\n" \
@@ -155,7 +155,7 @@ do { \
155unsigned long 155unsigned long
156clear_user(void __user *to, unsigned long n) 156clear_user(void __user *to, unsigned long n)
157{ 157{
158 might_sleep(); 158 might_fault();
159 if (access_ok(VERIFY_WRITE, to, n)) 159 if (access_ok(VERIFY_WRITE, to, n))
160 __do_clear_user(to, n); 160 __do_clear_user(to, n);
161 return n; 161 return n;
@@ -197,7 +197,7 @@ long strnlen_user(const char __user *s, long n)
197 unsigned long mask = -__addr_ok(s); 197 unsigned long mask = -__addr_ok(s);
198 unsigned long res, tmp; 198 unsigned long res, tmp;
199 199
200 might_sleep(); 200 might_fault();
201 201
202 __asm__ __volatile__( 202 __asm__ __volatile__(
203 " testl %0, %0\n" 203 " testl %0, %0\n"
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index f4df6e7c718b..64d6c84e6353 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -15,7 +15,7 @@
15#define __do_strncpy_from_user(dst,src,count,res) \ 15#define __do_strncpy_from_user(dst,src,count,res) \
16do { \ 16do { \
17 long __d0, __d1, __d2; \ 17 long __d0, __d1, __d2; \
18 might_sleep(); \ 18 might_fault(); \
19 __asm__ __volatile__( \ 19 __asm__ __volatile__( \
20 " testq %1,%1\n" \ 20 " testq %1,%1\n" \
21 " jz 2f\n" \ 21 " jz 2f\n" \
@@ -64,7 +64,7 @@ EXPORT_SYMBOL(strncpy_from_user);
64unsigned long __clear_user(void __user *addr, unsigned long size) 64unsigned long __clear_user(void __user *addr, unsigned long size)
65{ 65{
66 long __d0; 66 long __d0;
67 might_sleep(); 67 might_fault();
68 /* no memory constraint because it doesn't change any memory gcc knows 68 /* no memory constraint because it doesn't change any memory gcc knows
69 about */ 69 about */
70 asm volatile( 70 asm volatile(
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index fea4565ff576..d8cc96a2738f 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -8,9 +8,8 @@ obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o
8 8
9obj-$(CONFIG_HIGHMEM) += highmem_32.o 9obj-$(CONFIG_HIGHMEM) += highmem_32.o
10 10
11obj-$(CONFIG_MMIOTRACE_HOOKS) += kmmio.o
12obj-$(CONFIG_MMIOTRACE) += mmiotrace.o 11obj-$(CONFIG_MMIOTRACE) += mmiotrace.o
13mmiotrace-y := pf_in.o mmio-mod.o 12mmiotrace-y := kmmio.o pf_in.o mmio-mod.o
14obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o 13obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
15 14
16obj-$(CONFIG_NUMA) += numa_$(BITS).o 15obj-$(CONFIG_NUMA) += numa_$(BITS).o
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 31e8730fa246..4152d3c3b138 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -53,7 +53,7 @@
53 53
54static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) 54static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
55{ 55{
56#ifdef CONFIG_MMIOTRACE_HOOKS 56#ifdef CONFIG_MMIOTRACE
57 if (unlikely(is_kmmio_active())) 57 if (unlikely(is_kmmio_active()))
58 if (kmmio_handler(regs, addr) == 1) 58 if (kmmio_handler(regs, addr) == 1)
59 return -1; 59 return -1;
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 1ef0f90813d6..d9d35824c56f 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -9,6 +9,9 @@
9 * Also alternative() doesn't work. 9 * Also alternative() doesn't work.
10 */ 10 */
11 11
12/* Disable profiling for userspace code: */
13#define DISABLE_BRANCH_PROFILING
14
12#include <linux/kernel.h> 15#include <linux/kernel.h>
13#include <linux/posix-timers.h> 16#include <linux/posix-timers.h>
14#include <linux/time.h> 17#include <linux/time.h>
diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c
index ce0d9da52a8a..94966edfb44d 100644
--- a/drivers/char/sysrq.c
+++ b/drivers/char/sysrq.c
@@ -274,6 +274,22 @@ static struct sysrq_key_op sysrq_showstate_blocked_op = {
274 .enable_mask = SYSRQ_ENABLE_DUMP, 274 .enable_mask = SYSRQ_ENABLE_DUMP,
275}; 275};
276 276
277#ifdef CONFIG_TRACING
278#include <linux/ftrace.h>
279
280static void sysrq_ftrace_dump(int key, struct tty_struct *tty)
281{
282 ftrace_dump();
283}
284static struct sysrq_key_op sysrq_ftrace_dump_op = {
285 .handler = sysrq_ftrace_dump,
286 .help_msg = "dumpZ-ftrace-buffer",
287 .action_msg = "Dump ftrace buffer",
288 .enable_mask = SYSRQ_ENABLE_DUMP,
289};
290#else
291#define sysrq_ftrace_dump_op (*(struct sysrq_key_op *)0)
292#endif
277 293
278static void sysrq_handle_showmem(int key, struct tty_struct *tty) 294static void sysrq_handle_showmem(int key, struct tty_struct *tty)
279{ 295{
@@ -406,7 +422,7 @@ static struct sysrq_key_op *sysrq_key_table[36] = {
406 NULL, /* x */ 422 NULL, /* x */
407 /* y: May be registered on sparc64 for global register dump */ 423 /* y: May be registered on sparc64 for global register dump */
408 NULL, /* y */ 424 NULL, /* y */
409 NULL /* z */ 425 &sysrq_ftrace_dump_op, /* z */
410}; 426};
411 427
412/* key2index calculation, -1 on invalid index */ 428/* key2index calculation, -1 on invalid index */
diff --git a/fs/seq_file.c b/fs/seq_file.c
index eba2eabcd2b8..f03220d7891b 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -357,7 +357,18 @@ int seq_printf(struct seq_file *m, const char *f, ...)
357} 357}
358EXPORT_SYMBOL(seq_printf); 358EXPORT_SYMBOL(seq_printf);
359 359
360static char *mangle_path(char *s, char *p, char *esc) 360/**
361 * mangle_path - mangle and copy path to buffer beginning
362 * @s: buffer start
363 * @p: beginning of path in above buffer
364 * @esc: set of characters that need escaping
365 *
 366 * Copy the path from @p to @s, replacing each occurrence of a character from
 367 * @esc with the usual octal escape.
 368 * Returns a pointer past the last written character in @s, or NULL in case of
369 * failure.
370 */
371char *mangle_path(char *s, char *p, char *esc)
361{ 372{
362 while (s <= p) { 373 while (s <= p) {
363 char c = *p++; 374 char c = *p++;
@@ -376,6 +387,7 @@ static char *mangle_path(char *s, char *p, char *esc)
376 } 387 }
377 return NULL; 388 return NULL;
378} 389}
390EXPORT_SYMBOL_GPL(mangle_path);
379 391
380/* 392/*
381 * return the absolute path of 'dentry' residing in mount 'mnt'. 393 * return the absolute path of 'dentry' residing in mount 'mnt'.
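
For a usage illustration of the newly exported mangle_path(): callers normally let d_path() build the path near the end of a buffer and then mangle it toward the front, roughly the way seq_path() does. A sketch only, with error handling trimmed and the helper name invented here:

	static void emit_escaped_path(struct seq_file *m, struct path *path)
	{
		char *s = m->buf + m->count;		/* destination */
		char *p = d_path(path, s, m->size - m->count);

		if (!IS_ERR(p)) {
			/* escape newlines so one record stays on one line */
			s = mangle_path(s, p, "\n");
			if (s)
				m->count = s - m->buf;
		}
	}
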
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 80744606bad1..eba835a2c2cd 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -45,6 +45,22 @@
45#define MCOUNT_REC() 45#define MCOUNT_REC()
46#endif 46#endif
47 47
48#ifdef CONFIG_TRACE_BRANCH_PROFILING
49#define LIKELY_PROFILE() VMLINUX_SYMBOL(__start_annotated_branch_profile) = .; \
50 *(_ftrace_annotated_branch) \
51 VMLINUX_SYMBOL(__stop_annotated_branch_profile) = .;
52#else
53#define LIKELY_PROFILE()
54#endif
55
56#ifdef CONFIG_PROFILE_ALL_BRANCHES
57#define BRANCH_PROFILE() VMLINUX_SYMBOL(__start_branch_profile) = .; \
58 *(_ftrace_branch) \
59 VMLINUX_SYMBOL(__stop_branch_profile) = .;
60#else
61#define BRANCH_PROFILE()
62#endif
63
48/* .data section */ 64/* .data section */
49#define DATA_DATA \ 65#define DATA_DATA \
50 *(.data) \ 66 *(.data) \
@@ -60,9 +76,12 @@
60 VMLINUX_SYMBOL(__start___markers) = .; \ 76 VMLINUX_SYMBOL(__start___markers) = .; \
61 *(__markers) \ 77 *(__markers) \
62 VMLINUX_SYMBOL(__stop___markers) = .; \ 78 VMLINUX_SYMBOL(__stop___markers) = .; \
79 . = ALIGN(32); \
63 VMLINUX_SYMBOL(__start___tracepoints) = .; \ 80 VMLINUX_SYMBOL(__start___tracepoints) = .; \
64 *(__tracepoints) \ 81 *(__tracepoints) \
65 VMLINUX_SYMBOL(__stop___tracepoints) = .; 82 VMLINUX_SYMBOL(__stop___tracepoints) = .; \
83 LIKELY_PROFILE() \
84 BRANCH_PROFILE()
66 85
67#define RO_DATA(align) \ 86#define RO_DATA(align) \
68 . = ALIGN((align)); \ 87 . = ALIGN((align)); \
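
The __start/__stop symbols emitted by LIKELY_PROFILE() and BRANCH_PROFILE() exist to be walked at runtime by the branch profiler. A minimal sketch of such a consumer, assuming the struct ftrace_branch_data layout from the include/linux/compiler.h hunk further down (the in-tree reader lives under kernel/trace/ and exposes the counters through debugfs rather than printk):

	extern struct ftrace_branch_data __start_annotated_branch_profile[];
	extern struct ftrace_branch_data __stop_annotated_branch_profile[];

	static void dump_annotated_branches(void)
	{
		struct ftrace_branch_data *p;

		/* every annotated likely()/unlikely() site has one entry */
		for (p = __start_annotated_branch_profile;
		     p < __stop_annotated_branch_profile; p++)
			printk(KERN_INFO "%s:%u %s correct=%lu incorrect=%lu\n",
			       p->file, p->line, p->func,
			       p->correct, p->incorrect);
	}
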
diff --git a/include/asm-m32r/system.h b/include/asm-m32r/system.h
index 70a57c8c002b..c980f5ba8de7 100644
--- a/include/asm-m32r/system.h
+++ b/include/asm-m32r/system.h
@@ -23,7 +23,7 @@
23 */ 23 */
24 24
25#if defined(CONFIG_FRAME_POINTER) || \ 25#if defined(CONFIG_FRAME_POINTER) || \
26 !defined(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER) 26 !defined(CONFIG_SCHED_OMIT_FRAME_POINTER)
27#define M32R_PUSH_FP " push fp\n" 27#define M32R_PUSH_FP " push fp\n"
28#define M32R_POP_FP " pop fp\n" 28#define M32R_POP_FP " pop fp\n"
29#else 29#else
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 98115d9d04da..ea7c6be354b7 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -59,8 +59,88 @@ extern void __chk_io_ptr(const volatile void __iomem *);
59 * specific implementations come from the above header files 59 * specific implementations come from the above header files
60 */ 60 */
61 61
62#define likely(x) __builtin_expect(!!(x), 1) 62struct ftrace_branch_data {
63#define unlikely(x) __builtin_expect(!!(x), 0) 63 const char *func;
64 const char *file;
65 unsigned line;
66 union {
67 struct {
68 unsigned long correct;
69 unsigned long incorrect;
70 };
71 struct {
72 unsigned long miss;
73 unsigned long hit;
74 };
75 };
76};
77
78/*
79 * Note: DISABLE_BRANCH_PROFILING can be used by special lowlevel code
80 * to disable branch tracing on a per file basis.
81 */
82#if defined(CONFIG_TRACE_BRANCH_PROFILING) && !defined(DISABLE_BRANCH_PROFILING)
83void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect);
84
85#define likely_notrace(x) __builtin_expect(!!(x), 1)
86#define unlikely_notrace(x) __builtin_expect(!!(x), 0)
87
88#define __branch_check__(x, expect) ({ \
89 int ______r; \
90 static struct ftrace_branch_data \
91 __attribute__((__aligned__(4))) \
92 __attribute__((section("_ftrace_annotated_branch"))) \
93 ______f = { \
94 .func = __func__, \
95 .file = __FILE__, \
96 .line = __LINE__, \
97 }; \
98 ______r = likely_notrace(x); \
99 ftrace_likely_update(&______f, ______r, expect); \
100 ______r; \
101 })
102
103/*
104 * Using __builtin_constant_p(x) to ignore cases where the return
105 * value is always the same. This idea is taken from a similar patch
106 * written by Daniel Walker.
107 */
108# ifndef likely
109# define likely(x) (__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 1))
110# endif
111# ifndef unlikely
112# define unlikely(x) (__builtin_constant_p(x) ? !!(x) : __branch_check__(x, 0))
113# endif
114
115#ifdef CONFIG_PROFILE_ALL_BRANCHES
116/*
117 * "Define 'is'", Bill Clinton
118 * "Define 'if'", Steven Rostedt
119 */
120#define if(cond) if (__builtin_constant_p((cond)) ? !!(cond) : \
121 ({ \
122 int ______r; \
123 static struct ftrace_branch_data \
124 __attribute__((__aligned__(4))) \
125 __attribute__((section("_ftrace_branch"))) \
126 ______f = { \
127 .func = __func__, \
128 .file = __FILE__, \
129 .line = __LINE__, \
130 }; \
131 ______r = !!(cond); \
132 if (______r) \
133 ______f.hit++; \
134 else \
135 ______f.miss++; \
136 ______r; \
137 }))
138#endif /* CONFIG_PROFILE_ALL_BRANCHES */
139
140#else
141# define likely(x) __builtin_expect(!!(x), 1)
142# define unlikely(x) __builtin_expect(!!(x), 0)
143#endif
64 144
65/* Optimization barrier */ 145/* Optimization barrier */
66#ifndef barrier 146#ifndef barrier
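
The instrumented likely()/unlikely() above funnel every evaluation into ftrace_likely_update(), whose essential job is to count whether the annotation matched reality. A plausible minimal form (the in-tree version in the tracing code also feeds a branch tracer and can be switched off at runtime):

	void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect)
	{
		/* val is the branch outcome, expect is what the annotation claimed */
		if (val == expect)
			f->correct++;
		else
			f->incorrect++;
	}
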
diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h
index 4aaa4afb1cb9..096476f1fb35 100644
--- a/include/linux/debug_locks.h
+++ b/include/linux/debug_locks.h
@@ -17,7 +17,7 @@ extern int debug_locks_off(void);
17({ \ 17({ \
18 int __ret = 0; \ 18 int __ret = 0; \
19 \ 19 \
20 if (unlikely(c)) { \ 20 if (!oops_in_progress && unlikely(c)) { \
21 if (debug_locks_off() && !debug_locks_silent) \ 21 if (debug_locks_off() && !debug_locks_silent) \
22 WARN_ON(1); \ 22 WARN_ON(1); \
23 __ret = 1; \ 23 __ret = 1; \
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 703eb53cfa2b..7854d87b97b2 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -23,6 +23,45 @@ struct ftrace_ops {
23 struct ftrace_ops *next; 23 struct ftrace_ops *next;
24}; 24};
25 25
26extern int function_trace_stop;
27
28/*
29 * Type of the current tracing.
30 */
31enum ftrace_tracing_type_t {
32 FTRACE_TYPE_ENTER = 0, /* Hook the call of the function */
33 FTRACE_TYPE_RETURN, /* Hook the return of the function */
34};
35
36/* Current tracing type, default is FTRACE_TYPE_ENTER */
37extern enum ftrace_tracing_type_t ftrace_tracing_type;
38
39/**
40 * ftrace_stop - stop function tracer.
41 *
 42 * A quick way to stop the function tracer. Note this is an on/off switch,
43 * it is not something that is recursive like preempt_disable.
44 * This does not disable the calling of mcount, it only stops the
45 * calling of functions from mcount.
46 */
47static inline void ftrace_stop(void)
48{
49 function_trace_stop = 1;
50}
51
52/**
53 * ftrace_start - start the function tracer.
54 *
 55 * This function is the inverse of ftrace_stop. It does not enable
 56 * function tracing if the function tracer is disabled. It only
57 * sets the function tracer flag to continue calling the functions
58 * from mcount.
59 */
60static inline void ftrace_start(void)
61{
62 function_trace_stop = 0;
63}
64
26/* 65/*
27 * The ftrace_ops must be a static and should also 66 * The ftrace_ops must be a static and should also
28 * be read_mostly. These functions do modify read_mostly variables 67 * be read_mostly. These functions do modify read_mostly variables
@@ -41,9 +80,13 @@ extern void ftrace_stub(unsigned long a0, unsigned long a1);
41# define unregister_ftrace_function(ops) do { } while (0) 80# define unregister_ftrace_function(ops) do { } while (0)
42# define clear_ftrace_function(ops) do { } while (0) 81# define clear_ftrace_function(ops) do { } while (0)
43static inline void ftrace_kill(void) { } 82static inline void ftrace_kill(void) { }
83static inline void ftrace_stop(void) { }
84static inline void ftrace_start(void) { }
44#endif /* CONFIG_FUNCTION_TRACER */ 85#endif /* CONFIG_FUNCTION_TRACER */
45 86
46#ifdef CONFIG_DYNAMIC_FTRACE 87#ifdef CONFIG_DYNAMIC_FTRACE
88/* asm/ftrace.h must be defined for archs supporting dynamic ftrace */
89#include <asm/ftrace.h>
47 90
48enum { 91enum {
49 FTRACE_FL_FREE = (1 << 0), 92 FTRACE_FL_FREE = (1 << 0),
@@ -59,6 +102,7 @@ struct dyn_ftrace {
59 struct list_head list; 102 struct list_head list;
60 unsigned long ip; /* address of mcount call-site */ 103 unsigned long ip; /* address of mcount call-site */
61 unsigned long flags; 104 unsigned long flags;
105 struct dyn_arch_ftrace arch;
62}; 106};
63 107
64int ftrace_force_update(void); 108int ftrace_force_update(void);
@@ -66,19 +110,43 @@ void ftrace_set_filter(unsigned char *buf, int len, int reset);
66 110
67/* defined in arch */ 111/* defined in arch */
68extern int ftrace_ip_converted(unsigned long ip); 112extern int ftrace_ip_converted(unsigned long ip);
69extern unsigned char *ftrace_nop_replace(void);
70extern unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr);
71extern int ftrace_dyn_arch_init(void *data); 113extern int ftrace_dyn_arch_init(void *data);
72extern int ftrace_update_ftrace_func(ftrace_func_t func); 114extern int ftrace_update_ftrace_func(ftrace_func_t func);
73extern void ftrace_caller(void); 115extern void ftrace_caller(void);
74extern void ftrace_call(void); 116extern void ftrace_call(void);
75extern void mcount_call(void); 117extern void mcount_call(void);
118#ifdef CONFIG_FUNCTION_RET_TRACER
119extern void ftrace_return_caller(void);
120#endif
121
122/**
 123 * ftrace_make_nop - convert code into nop
124 * @mod: module structure if called by module load initialization
125 * @rec: the mcount call site record
126 * @addr: the address that the call site should be calling
127 *
128 * This is a very sensitive operation and great care needs
129 * to be taken by the arch. The operation should carefully
130 * read the location, check to see if what is read is indeed
131 * what we expect it to be, and then on success of the compare,
132 * it should write to the location.
133 *
134 * The code segment at @rec->ip should be a caller to @addr
135 *
136 * Return must be:
137 * 0 on success
138 * -EFAULT on error reading the location
139 * -EINVAL on a failed compare of the contents
140 * -EPERM on error writing to the location
141 * Any other value will be considered a failure.
142 */
143extern int ftrace_make_nop(struct module *mod,
144 struct dyn_ftrace *rec, unsigned long addr);
76 145
77/** 146/**
78 * ftrace_modify_code - modify code segment 147 * ftrace_make_call - convert a nop call site into a call to addr
79 * @ip: the address of the code segment 148 * @rec: the mcount call site record
80 * @old_code: the contents of what is expected to be there 149 * @addr: the address that the call site should call
81 * @new_code: the code to patch in
82 * 150 *
83 * This is a very sensitive operation and great care needs 151 * This is a very sensitive operation and great care needs
84 * to be taken by the arch. The operation should carefully 152 * to be taken by the arch. The operation should carefully
@@ -86,6 +154,8 @@ extern void mcount_call(void);
86 * what we expect it to be, and then on success of the compare, 154 * what we expect it to be, and then on success of the compare,
87 * it should write to the location. 155 * it should write to the location.
88 * 156 *
157 * The code segment at @rec->ip should be a nop
158 *
89 * Return must be: 159 * Return must be:
90 * 0 on success 160 * 0 on success
91 * -EFAULT on error reading the location 161 * -EFAULT on error reading the location
@@ -93,8 +163,11 @@ extern void mcount_call(void);
93 * -EPERM on error writing to the location 163 * -EPERM on error writing to the location
94 * Any other value will be considered a failure. 164 * Any other value will be considered a failure.
95 */ 165 */
96extern int ftrace_modify_code(unsigned long ip, unsigned char *old_code, 166extern int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr);
97 unsigned char *new_code); 167
168
169/* May be defined in arch */
170extern int ftrace_arch_read_dyn_info(char *buf, int size);
98 171
99extern int skip_trace(unsigned long ip); 172extern int skip_trace(unsigned long ip);
100 173
@@ -102,7 +175,6 @@ extern void ftrace_release(void *start, unsigned long size);
102 175
103extern void ftrace_disable_daemon(void); 176extern void ftrace_disable_daemon(void);
104extern void ftrace_enable_daemon(void); 177extern void ftrace_enable_daemon(void);
105
106#else 178#else
107# define skip_trace(ip) ({ 0; }) 179# define skip_trace(ip) ({ 0; })
108# define ftrace_force_update() ({ 0; }) 180# define ftrace_force_update() ({ 0; })
@@ -181,6 +253,12 @@ static inline void __ftrace_enabled_restore(int enabled)
181#endif 253#endif
182 254
183#ifdef CONFIG_TRACING 255#ifdef CONFIG_TRACING
256extern int ftrace_dump_on_oops;
257
258extern void tracing_start(void);
259extern void tracing_stop(void);
260extern void ftrace_off_permanent(void);
261
184extern void 262extern void
185ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3); 263ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3);
186 264
@@ -211,6 +289,9 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { }
211static inline int 289static inline int
212ftrace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 0))); 290ftrace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 0)));
213 291
292static inline void tracing_start(void) { }
293static inline void tracing_stop(void) { }
294static inline void ftrace_off_permanent(void) { }
214static inline int 295static inline int
215ftrace_printk(const char *fmt, ...) 296ftrace_printk(const char *fmt, ...)
216{ 297{
@@ -221,33 +302,44 @@ static inline void ftrace_dump(void) { }
221 302
222#ifdef CONFIG_FTRACE_MCOUNT_RECORD 303#ifdef CONFIG_FTRACE_MCOUNT_RECORD
223extern void ftrace_init(void); 304extern void ftrace_init(void);
224extern void ftrace_init_module(unsigned long *start, unsigned long *end); 305extern void ftrace_init_module(struct module *mod,
306 unsigned long *start, unsigned long *end);
225#else 307#else
226static inline void ftrace_init(void) { } 308static inline void ftrace_init(void) { }
227static inline void 309static inline void
228ftrace_init_module(unsigned long *start, unsigned long *end) { } 310ftrace_init_module(struct module *mod,
311 unsigned long *start, unsigned long *end) { }
229#endif 312#endif
230 313
231 314
232struct boot_trace { 315/*
233 pid_t caller; 316 * Structure that defines a return function trace.
234 char func[KSYM_NAME_LEN]; 317 */
235 int result; 318struct ftrace_retfunc {
236 unsigned long long duration; /* usecs */ 319 unsigned long ret; /* Return address */
237 ktime_t calltime; 320 unsigned long func; /* Current function */
238 ktime_t rettime; 321 unsigned long long calltime;
322 unsigned long long rettime;
323 /* Number of functions that overran the depth limit for current task */
324 unsigned long overrun;
239}; 325};
240 326
241#ifdef CONFIG_BOOT_TRACER 327#ifdef CONFIG_FUNCTION_RET_TRACER
242extern void trace_boot(struct boot_trace *it, initcall_t fn); 328#define FTRACE_RETFUNC_DEPTH 50
243extern void start_boot_trace(void); 329#define FTRACE_RETSTACK_ALLOC_SIZE 32
244extern void stop_boot_trace(void); 330/* Type of a callback handler of tracing return function */
245#else 331typedef void (*trace_function_return_t)(struct ftrace_retfunc *);
246static inline void trace_boot(struct boot_trace *it, initcall_t fn) { }
247static inline void start_boot_trace(void) { }
248static inline void stop_boot_trace(void) { }
249#endif
250 332
333extern int register_ftrace_return(trace_function_return_t func);
334/* The current handler in use */
335extern trace_function_return_t ftrace_function_return;
336extern void unregister_ftrace_return(void);
251 337
338extern void ftrace_retfunc_init_task(struct task_struct *t);
339extern void ftrace_retfunc_exit_task(struct task_struct *t);
340#else
341static inline void ftrace_retfunc_init_task(struct task_struct *t) { }
342static inline void ftrace_retfunc_exit_task(struct task_struct *t) { }
343#endif
252 344
253#endif /* _LINUX_FTRACE_H */ 345#endif /* _LINUX_FTRACE_H */
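
To illustrate the return-tracer interface declared above: a client supplies a trace_function_return_t callback and registers it. The handler and init function names below are invented for the example; only register_ftrace_return(), unregister_ftrace_return(), ftrace_printk() and struct ftrace_retfunc come from the header:

	#include <linux/ftrace.h>

	static void my_ret_handler(struct ftrace_retfunc *trace)
	{
		/* rettime/calltime are cpu_clock() timestamps */
		unsigned long long delta = trace->rettime - trace->calltime;

		ftrace_printk("ret=%lx func=%lx duration=%llu overrun=%lu\n",
			      trace->ret, trace->func, delta, trace->overrun);
	}

	static int __init my_ret_tracer_init(void)
	{
		return register_ftrace_return(my_ret_handler);
	}

Tearing down is the reverse: call unregister_ftrace_return(), which restores the internal stub handler.
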
diff --git a/include/linux/ftrace_irq.h b/include/linux/ftrace_irq.h
new file mode 100644
index 000000000000..0b4df55d7a74
--- /dev/null
+++ b/include/linux/ftrace_irq.h
@@ -0,0 +1,13 @@
1#ifndef _LINUX_FTRACE_IRQ_H
2#define _LINUX_FTRACE_IRQ_H
3
4
5#if defined(CONFIG_DYNAMIC_FTRACE) || defined(CONFIG_FUNCTION_RET_TRACER)
6extern void ftrace_nmi_enter(void);
7extern void ftrace_nmi_exit(void);
8#else
9static inline void ftrace_nmi_enter(void) { }
10static inline void ftrace_nmi_exit(void) { }
11#endif
12
13#endif /* _LINUX_FTRACE_IRQ_H */
diff --git a/include/linux/futex.h b/include/linux/futex.h
index 586ab56a3ec3..8f627b9ae2b1 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -164,6 +164,8 @@ union futex_key {
164 } both; 164 } both;
165}; 165};
166 166
167#define FUTEX_KEY_INIT (union futex_key) { .both = { .ptr = NULL } }
168
167#ifdef CONFIG_FUTEX 169#ifdef CONFIG_FUTEX
168extern void exit_robust_list(struct task_struct *curr); 170extern void exit_robust_list(struct task_struct *curr);
169extern void exit_pi_state_list(struct task_struct *curr); 171extern void exit_pi_state_list(struct task_struct *curr);
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 181006cc94a0..89a56d79e4c6 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -4,6 +4,7 @@
4#include <linux/preempt.h> 4#include <linux/preempt.h>
5#include <linux/smp_lock.h> 5#include <linux/smp_lock.h>
6#include <linux/lockdep.h> 6#include <linux/lockdep.h>
7#include <linux/ftrace_irq.h>
7#include <asm/hardirq.h> 8#include <asm/hardirq.h>
8#include <asm/system.h> 9#include <asm/system.h>
9 10
@@ -161,7 +162,17 @@ extern void irq_enter(void);
161 */ 162 */
162extern void irq_exit(void); 163extern void irq_exit(void);
163 164
164#define nmi_enter() do { lockdep_off(); __irq_enter(); } while (0) 165#define nmi_enter() \
165#define nmi_exit() do { __irq_exit(); lockdep_on(); } while (0) 166 do { \
167 ftrace_nmi_enter(); \
168 lockdep_off(); \
169 __irq_enter(); \
170 } while (0)
171#define nmi_exit() \
172 do { \
173 __irq_exit(); \
174 lockdep_on(); \
175 ftrace_nmi_exit(); \
176 } while (0)
166 177
167#endif /* LINUX_HARDIRQ_H */ 178#endif /* LINUX_HARDIRQ_H */
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index dc7e0d0a6474..269df5a17b30 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -141,6 +141,15 @@ extern int _cond_resched(void);
141 (__x < 0) ? -__x : __x; \ 141 (__x < 0) ? -__x : __x; \
142 }) 142 })
143 143
144#ifdef CONFIG_PROVE_LOCKING
145void might_fault(void);
146#else
147static inline void might_fault(void)
148{
149 might_sleep();
150}
151#endif
152
144extern struct atomic_notifier_head panic_notifier_list; 153extern struct atomic_notifier_head panic_notifier_list;
145extern long (*panic_blink)(long time); 154extern long (*panic_blink)(long time);
146NORET_TYPE void panic(const char * fmt, ...) 155NORET_TYPE void panic(const char * fmt, ...)
@@ -188,6 +197,8 @@ extern unsigned long long memparse(const char *ptr, char **retptr);
188extern int core_kernel_text(unsigned long addr); 197extern int core_kernel_text(unsigned long addr);
189extern int __kernel_text_address(unsigned long addr); 198extern int __kernel_text_address(unsigned long addr);
190extern int kernel_text_address(unsigned long addr); 199extern int kernel_text_address(unsigned long addr);
200extern int func_ptr_is_kernel_text(void *ptr);
201
191struct pid; 202struct pid;
192extern struct pid *session_of_pgrp(struct pid *pgrp); 203extern struct pid *session_of_pgrp(struct pid *pgrp);
193 204
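
The out-of-line might_fault() declared above for CONFIG_PROVE_LOCKING is expected to teach lockdep that a user copy may sleep and take mmap_sem. A sketch of what such a definition can look like, using the might_lock_read() annotation from the lockdep.h hunk below; the exact in-tree version may differ:

	void might_fault(void)
	{
		might_sleep();
		/*
		 * A fault on a user address ends up taking mmap_sem for
		 * reading; record that dependency even on calls where no
		 * fault actually happens.
		 */
		if (current->mm && !in_atomic())
			might_lock_read(&current->mm->mmap_sem);
	}
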
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 29aec6e10020..8956daf64abd 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -73,6 +73,8 @@ struct lock_class_key {
73 struct lockdep_subclass_key subkeys[MAX_LOCKDEP_SUBCLASSES]; 73 struct lockdep_subclass_key subkeys[MAX_LOCKDEP_SUBCLASSES];
74}; 74};
75 75
76#define LOCKSTAT_POINTS 4
77
76/* 78/*
77 * The lock-class itself: 79 * The lock-class itself:
78 */ 80 */
@@ -119,7 +121,8 @@ struct lock_class {
119 int name_version; 121 int name_version;
120 122
121#ifdef CONFIG_LOCK_STAT 123#ifdef CONFIG_LOCK_STAT
122 unsigned long contention_point[4]; 124 unsigned long contention_point[LOCKSTAT_POINTS];
125 unsigned long contending_point[LOCKSTAT_POINTS];
123#endif 126#endif
124}; 127};
125 128
@@ -144,6 +147,7 @@ enum bounce_type {
144 147
145struct lock_class_stats { 148struct lock_class_stats {
146 unsigned long contention_point[4]; 149 unsigned long contention_point[4];
150 unsigned long contending_point[4];
147 struct lock_time read_waittime; 151 struct lock_time read_waittime;
148 struct lock_time write_waittime; 152 struct lock_time write_waittime;
149 struct lock_time read_holdtime; 153 struct lock_time read_holdtime;
@@ -165,6 +169,7 @@ struct lockdep_map {
165 const char *name; 169 const char *name;
166#ifdef CONFIG_LOCK_STAT 170#ifdef CONFIG_LOCK_STAT
167 int cpu; 171 int cpu;
172 unsigned long ip;
168#endif 173#endif
169}; 174};
170 175
@@ -356,7 +361,7 @@ struct lock_class_key { };
356#ifdef CONFIG_LOCK_STAT 361#ifdef CONFIG_LOCK_STAT
357 362
358extern void lock_contended(struct lockdep_map *lock, unsigned long ip); 363extern void lock_contended(struct lockdep_map *lock, unsigned long ip);
359extern void lock_acquired(struct lockdep_map *lock); 364extern void lock_acquired(struct lockdep_map *lock, unsigned long ip);
360 365
361#define LOCK_CONTENDED(_lock, try, lock) \ 366#define LOCK_CONTENDED(_lock, try, lock) \
362do { \ 367do { \
@@ -364,13 +369,13 @@ do { \
364 lock_contended(&(_lock)->dep_map, _RET_IP_); \ 369 lock_contended(&(_lock)->dep_map, _RET_IP_); \
365 lock(_lock); \ 370 lock(_lock); \
366 } \ 371 } \
367 lock_acquired(&(_lock)->dep_map); \ 372 lock_acquired(&(_lock)->dep_map, _RET_IP_); \
368} while (0) 373} while (0)
369 374
370#else /* CONFIG_LOCK_STAT */ 375#else /* CONFIG_LOCK_STAT */
371 376
372#define lock_contended(lockdep_map, ip) do {} while (0) 377#define lock_contended(lockdep_map, ip) do {} while (0)
373#define lock_acquired(lockdep_map) do {} while (0) 378#define lock_acquired(lockdep_map, ip) do {} while (0)
374 379
375#define LOCK_CONTENDED(_lock, try, lock) \ 380#define LOCK_CONTENDED(_lock, try, lock) \
376 lock(_lock) 381 lock(_lock)
@@ -481,4 +486,22 @@ static inline void print_irqtrace_events(struct task_struct *curr)
481# define lock_map_release(l) do { } while (0) 486# define lock_map_release(l) do { } while (0)
482#endif 487#endif
483 488
489#ifdef CONFIG_PROVE_LOCKING
490# define might_lock(lock) \
491do { \
492 typecheck(struct lockdep_map *, &(lock)->dep_map); \
493 lock_acquire(&(lock)->dep_map, 0, 0, 0, 2, NULL, _THIS_IP_); \
494 lock_release(&(lock)->dep_map, 0, _THIS_IP_); \
495} while (0)
496# define might_lock_read(lock) \
497do { \
498 typecheck(struct lockdep_map *, &(lock)->dep_map); \
499 lock_acquire(&(lock)->dep_map, 0, 0, 1, 2, NULL, _THIS_IP_); \
500 lock_release(&(lock)->dep_map, 0, _THIS_IP_); \
501} while (0)
502#else
503# define might_lock(lock) do { } while (0)
504# define might_lock_read(lock) do { } while (0)
505#endif
506
484#endif /* __LINUX_LOCKDEP_H */ 507#endif /* __LINUX_LOCKDEP_H */
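
A usage sketch for might_lock()/might_lock_read(): a function that only takes a lock on a slow path can still declare the dependency unconditionally, so lockdep sees it on every call. All names below (struct my_dev, update_stats) are invented for the example:

	#include <linux/mutex.h>

	struct my_dev {
		struct mutex	lock;
		unsigned long	flush_count;
	};

	static void update_stats(struct my_dev *dev, int flush)
	{
		/* we may take dev->lock below; tell lockdep even if we don't */
		might_lock(&dev->lock);

		if (flush) {
			mutex_lock(&dev->lock);
			dev->flush_count++;
			mutex_unlock(&dev->lock);
		}
	}
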
diff --git a/include/linux/marker.h b/include/linux/marker.h
index 889196c7fbb1..34c14bc957f5 100644
--- a/include/linux/marker.h
+++ b/include/linux/marker.h
@@ -12,6 +12,7 @@
12 * See the file COPYING for more details. 12 * See the file COPYING for more details.
13 */ 13 */
14 14
15#include <stdarg.h>
15#include <linux/types.h> 16#include <linux/types.h>
16 17
17struct module; 18struct module;
@@ -48,10 +49,28 @@ struct marker {
48 void (*call)(const struct marker *mdata, void *call_private, ...); 49 void (*call)(const struct marker *mdata, void *call_private, ...);
49 struct marker_probe_closure single; 50 struct marker_probe_closure single;
50 struct marker_probe_closure *multi; 51 struct marker_probe_closure *multi;
52 const char *tp_name; /* Optional tracepoint name */
53 void *tp_cb; /* Optional tracepoint callback */
51} __attribute__((aligned(8))); 54} __attribute__((aligned(8)));
52 55
53#ifdef CONFIG_MARKERS 56#ifdef CONFIG_MARKERS
54 57
58#define _DEFINE_MARKER(name, tp_name_str, tp_cb, format) \
59 static const char __mstrtab_##name[] \
60 __attribute__((section("__markers_strings"))) \
61 = #name "\0" format; \
62 static struct marker __mark_##name \
63 __attribute__((section("__markers"), aligned(8))) = \
64 { __mstrtab_##name, &__mstrtab_##name[sizeof(#name)], \
65 0, 0, marker_probe_cb, { __mark_empty_function, NULL},\
66 NULL, tp_name_str, tp_cb }
67
68#define DEFINE_MARKER(name, format) \
69 _DEFINE_MARKER(name, NULL, NULL, format)
70
71#define DEFINE_MARKER_TP(name, tp_name, tp_cb, format) \
72 _DEFINE_MARKER(name, #tp_name, tp_cb, format)
73
55/* 74/*
56 * Note : the empty asm volatile with read constraint is used here instead of a 75 * Note : the empty asm volatile with read constraint is used here instead of a
57 * "used" attribute to fix a gcc 4.1.x bug. 76 * "used" attribute to fix a gcc 4.1.x bug.
@@ -65,14 +84,7 @@ struct marker {
65 */ 84 */
66#define __trace_mark(generic, name, call_private, format, args...) \ 85#define __trace_mark(generic, name, call_private, format, args...) \
67 do { \ 86 do { \
68 static const char __mstrtab_##name[] \ 87 DEFINE_MARKER(name, format); \
69 __attribute__((section("__markers_strings"))) \
70 = #name "\0" format; \
71 static struct marker __mark_##name \
72 __attribute__((section("__markers"), aligned(8))) = \
73 { __mstrtab_##name, &__mstrtab_##name[sizeof(#name)], \
74 0, 0, marker_probe_cb, \
75 { __mark_empty_function, NULL}, NULL }; \
76 __mark_check_format(format, ## args); \ 88 __mark_check_format(format, ## args); \
77 if (unlikely(__mark_##name.state)) { \ 89 if (unlikely(__mark_##name.state)) { \
78 (*__mark_##name.call) \ 90 (*__mark_##name.call) \
@@ -80,14 +92,39 @@ struct marker {
80 } \ 92 } \
81 } while (0) 93 } while (0)
82 94
95#define __trace_mark_tp(name, call_private, tp_name, tp_cb, format, args...) \
96 do { \
97 void __check_tp_type(void) \
98 { \
99 register_trace_##tp_name(tp_cb); \
100 } \
101 DEFINE_MARKER_TP(name, tp_name, tp_cb, format); \
102 __mark_check_format(format, ## args); \
103 (*__mark_##name.call)(&__mark_##name, call_private, \
104 ## args); \
105 } while (0)
106
83extern void marker_update_probe_range(struct marker *begin, 107extern void marker_update_probe_range(struct marker *begin,
84 struct marker *end); 108 struct marker *end);
109
110#define GET_MARKER(name) (__mark_##name)
111
85#else /* !CONFIG_MARKERS */ 112#else /* !CONFIG_MARKERS */
113#define DEFINE_MARKER(name, format)
86#define __trace_mark(generic, name, call_private, format, args...) \ 114#define __trace_mark(generic, name, call_private, format, args...) \
87 __mark_check_format(format, ## args) 115 __mark_check_format(format, ## args)
116#define __trace_mark_tp(name, call_private, tp_name, tp_cb, format, args...) \
117 do { \
118 void __check_tp_type(void) \
119 { \
120 register_trace_##tp_name(tp_cb); \
121 } \
122 __mark_check_format(format, ## args); \
123 } while (0)
88static inline void marker_update_probe_range(struct marker *begin, 124static inline void marker_update_probe_range(struct marker *begin,
89 struct marker *end) 125 struct marker *end)
90{ } 126{ }
127#define GET_MARKER(name)
91#endif /* CONFIG_MARKERS */ 128#endif /* CONFIG_MARKERS */
92 129
93/** 130/**
@@ -117,6 +154,20 @@ static inline void marker_update_probe_range(struct marker *begin,
117 __trace_mark(1, name, NULL, format, ## args) 154 __trace_mark(1, name, NULL, format, ## args)
118 155
119/** 156/**
157 * trace_mark_tp - Marker in a tracepoint callback
158 * @name: marker name, not quoted.
159 * @tp_name: tracepoint name, not quoted.
160 * @tp_cb: tracepoint callback. Should have an associated global symbol so it
161 * is not optimized away by the compiler (should not be static).
162 * @format: format string
163 * @args...: variable argument list
164 *
165 * Places a marker in a tracepoint callback.
166 */
167#define trace_mark_tp(name, tp_name, tp_cb, format, args...) \
168 __trace_mark_tp(name, NULL, tp_name, tp_cb, format, ## args)
169
170/**
120 * MARK_NOARGS - Format string for a marker with no argument. 171 * MARK_NOARGS - Format string for a marker with no argument.
121 */ 172 */
122#define MARK_NOARGS " " 173#define MARK_NOARGS " "
@@ -136,8 +187,6 @@ extern marker_probe_func __mark_empty_function;
136 187
137extern void marker_probe_cb(const struct marker *mdata, 188extern void marker_probe_cb(const struct marker *mdata,
138 void *call_private, ...); 189 void *call_private, ...);
139extern void marker_probe_cb_noarg(const struct marker *mdata,
140 void *call_private, ...);
141 190
142/* 191/*
143 * Connect a probe to a marker. 192 * Connect a probe to a marker.
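
A hedged sketch of how the new trace_mark_tp() is meant to be used, based on the kernel-doc above: the marker is placed inside the tracepoint callback itself, and that callback must be a global symbol. The marker name kernel_sched_wakeup is made up for illustration; sched_wakeup is the tracepoint declared in include/trace/sched.h later in this patch:

/* not static: the symbol must survive so the marker core can attach to it */
void probe_sched_wakeup(struct rq *rq, struct task_struct *p)
{
	trace_mark_tp(kernel_sched_wakeup, sched_wakeup, probe_sched_wakeup,
		      "pid %d state %ld", p->pid, (long)p->state);
}
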
diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index bc6da10ceee0..7a0e5c4f8072 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -144,6 +144,8 @@ extern int __must_check mutex_lock_killable(struct mutex *lock);
144/* 144/*
145 * NOTE: mutex_trylock() follows the spin_trylock() convention, 145 * NOTE: mutex_trylock() follows the spin_trylock() convention,
146 * not the down_trylock() convention! 146 * not the down_trylock() convention!
147 *
148 * Returns 1 if the mutex has been acquired successfully, and 0 on contention.
147 */ 149 */
148extern int mutex_trylock(struct mutex *lock); 150extern int mutex_trylock(struct mutex *lock);
149extern void mutex_unlock(struct mutex *lock); 151extern void mutex_unlock(struct mutex *lock);
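
The new comment spells out the return convention; a tiny sketch of the spin_trylock()-style usage it describes (my_dev and do_quick_work() are hypothetical):

static int try_quick_path(struct my_dev *dev)
{
	if (mutex_trylock(&dev->lock)) {
		/* non-zero return: we now own the mutex */
		do_quick_work(dev);
		mutex_unlock(&dev->lock);
		return 1;
	}
	/* 0: contended, somebody else holds it */
	return 0;
}
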
diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h
index 5f89b62e6983..301dda829e37 100644
--- a/include/linux/rcuclassic.h
+++ b/include/linux/rcuclassic.h
@@ -41,7 +41,7 @@
41#include <linux/seqlock.h> 41#include <linux/seqlock.h>
42 42
43#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 43#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
44#define RCU_SECONDS_TILL_STALL_CHECK ( 3 * HZ) /* for rcp->jiffies_stall */ 44#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ) /* for rcp->jiffies_stall */
45#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rcp->jiffies_stall */ 45#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rcp->jiffies_stall */
46#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 46#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
47 47
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 86f1f5e43e33..895dc9c1088c 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -142,6 +142,7 @@ struct rcu_head {
142 * on the write-side to ensure proper synchronization. 142 * on the write-side to ensure proper synchronization.
143 */ 143 */
144#define rcu_read_lock_sched() preempt_disable() 144#define rcu_read_lock_sched() preempt_disable()
145#define rcu_read_lock_sched_notrace() preempt_disable_notrace()
145 146
146/* 147/*
147 * rcu_read_unlock_sched - marks the end of a RCU-classic critical section 148 * rcu_read_unlock_sched - marks the end of a RCU-classic critical section
@@ -149,6 +150,7 @@ struct rcu_head {
149 * See rcu_read_lock_sched for more information. 150 * See rcu_read_lock_sched for more information.
150 */ 151 */
151#define rcu_read_unlock_sched() preempt_enable() 152#define rcu_read_unlock_sched() preempt_enable()
153#define rcu_read_unlock_sched_notrace() preempt_enable_notrace()
152 154
153 155
154 156
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index e097c2e6b6dc..3bb87a753fa3 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -122,6 +122,7 @@ void ring_buffer_normalize_time_stamp(int cpu, u64 *ts);
122 122
123void tracing_on(void); 123void tracing_on(void);
124void tracing_off(void); 124void tracing_off(void);
125void tracing_off_permanent(void);
125 126
126enum ring_buffer_flags { 127enum ring_buffer_flags {
127 RB_FL_OVERWRITE = 1 << 0, 128 RB_FL_OVERWRITE = 1 << 0,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 55e30d114477..bf953932e676 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -249,7 +249,7 @@ extern void init_idle_bootup_task(struct task_struct *idle);
249extern int runqueue_is_locked(void); 249extern int runqueue_is_locked(void);
250extern void task_rq_unlock_wait(struct task_struct *p); 250extern void task_rq_unlock_wait(struct task_struct *p);
251 251
252extern cpumask_t nohz_cpu_mask; 252extern cpumask_var_t nohz_cpu_mask;
253#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) 253#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
254extern int select_nohz_load_balancer(int cpu); 254extern int select_nohz_load_balancer(int cpu);
255#else 255#else
@@ -259,8 +259,6 @@ static inline int select_nohz_load_balancer(int cpu)
259} 259}
260#endif 260#endif
261 261
262extern unsigned long rt_needs_cpu(int cpu);
263
264/* 262/*
265 * Only dump TASK_* tasks. (0 for all tasks) 263 * Only dump TASK_* tasks. (0 for all tasks)
266 */ 264 */
@@ -777,7 +775,6 @@ enum cpu_idle_type {
777 775
778struct sched_group { 776struct sched_group {
779 struct sched_group *next; /* Must be a circular list */ 777 struct sched_group *next; /* Must be a circular list */
780 cpumask_t cpumask;
781 778
782 /* 779 /*
783 * CPU power of this group, SCHED_LOAD_SCALE being max power for a 780 * CPU power of this group, SCHED_LOAD_SCALE being max power for a
@@ -790,8 +787,15 @@ struct sched_group {
790 * (see include/linux/reciprocal_div.h) 787 * (see include/linux/reciprocal_div.h)
791 */ 788 */
792 u32 reciprocal_cpu_power; 789 u32 reciprocal_cpu_power;
790
791 unsigned long cpumask[];
793}; 792};
794 793
794static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
795{
796 return to_cpumask(sg->cpumask);
797}
798
795enum sched_domain_level { 799enum sched_domain_level {
796 SD_LV_NONE = 0, 800 SD_LV_NONE = 0,
797 SD_LV_SIBLING, 801 SD_LV_SIBLING,
@@ -815,7 +819,6 @@ struct sched_domain {
815 struct sched_domain *parent; /* top domain must be null terminated */ 819 struct sched_domain *parent; /* top domain must be null terminated */
816 struct sched_domain *child; /* bottom domain must be null terminated */ 820 struct sched_domain *child; /* bottom domain must be null terminated */
817 struct sched_group *groups; /* the balancing groups of the domain */ 821 struct sched_group *groups; /* the balancing groups of the domain */
818 cpumask_t span; /* span of all CPUs in this domain */
819 unsigned long min_interval; /* Minimum balance interval ms */ 822 unsigned long min_interval; /* Minimum balance interval ms */
820 unsigned long max_interval; /* Maximum balance interval ms */ 823 unsigned long max_interval; /* Maximum balance interval ms */
821 unsigned int busy_factor; /* less balancing by factor if busy */ 824 unsigned int busy_factor; /* less balancing by factor if busy */
@@ -870,9 +873,17 @@ struct sched_domain {
870#ifdef CONFIG_SCHED_DEBUG 873#ifdef CONFIG_SCHED_DEBUG
871 char *name; 874 char *name;
872#endif 875#endif
876
877 /* span of all CPUs in this domain */
878 unsigned long span[];
873}; 879};
874 880
875extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, 881static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
882{
883 return to_cpumask(sd->span);
884}
885
886extern void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
876 struct sched_domain_attr *dattr_new); 887 struct sched_domain_attr *dattr_new);
877extern int arch_reinit_sched_domains(void); 888extern int arch_reinit_sched_domains(void);
878 889
@@ -881,7 +892,7 @@ extern int arch_reinit_sched_domains(void);
881struct sched_domain_attr; 892struct sched_domain_attr;
882 893
883static inline void 894static inline void
884partition_sched_domains(int ndoms_new, cpumask_t *doms_new, 895partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
885 struct sched_domain_attr *dattr_new) 896 struct sched_domain_attr *dattr_new)
886{ 897{
887} 898}
@@ -963,7 +974,7 @@ struct sched_class {
963 void (*task_wake_up) (struct rq *this_rq, struct task_struct *task); 974 void (*task_wake_up) (struct rq *this_rq, struct task_struct *task);
964 975
965 void (*set_cpus_allowed)(struct task_struct *p, 976 void (*set_cpus_allowed)(struct task_struct *p,
966 const cpumask_t *newmask); 977 const struct cpumask *newmask);
967 978
968 void (*rq_online)(struct rq *rq); 979 void (*rq_online)(struct rq *rq);
969 void (*rq_offline)(struct rq *rq); 980 void (*rq_offline)(struct rq *rq);
@@ -1356,6 +1367,17 @@ struct task_struct {
1356 unsigned long default_timer_slack_ns; 1367 unsigned long default_timer_slack_ns;
1357 1368
1358 struct list_head *scm_work_list; 1369 struct list_head *scm_work_list;
1370#ifdef CONFIG_FUNCTION_RET_TRACER
1371 /* Index of current stored address in ret_stack */
1372 int curr_ret_stack;
1373 /* Stack of return addresses for return function tracing */
1374 struct ftrace_ret_stack *ret_stack;
1375 /*
1376 * Number of functions that haven't been traced
1377 * because of depth overrun.
1378 */
1379 atomic_t trace_overrun;
1380#endif
1359}; 1381};
1360 1382
1361/* 1383/*
@@ -1594,12 +1616,12 @@ extern cputime_t task_gtime(struct task_struct *p);
1594 1616
1595#ifdef CONFIG_SMP 1617#ifdef CONFIG_SMP
1596extern int set_cpus_allowed_ptr(struct task_struct *p, 1618extern int set_cpus_allowed_ptr(struct task_struct *p,
1597 const cpumask_t *new_mask); 1619 const struct cpumask *new_mask);
1598#else 1620#else
1599static inline int set_cpus_allowed_ptr(struct task_struct *p, 1621static inline int set_cpus_allowed_ptr(struct task_struct *p,
1600 const cpumask_t *new_mask) 1622 const struct cpumask *new_mask)
1601{ 1623{
1602 if (!cpu_isset(0, *new_mask)) 1624 if (!cpumask_test_cpu(0, new_mask))
1603 return -EINVAL; 1625 return -EINVAL;
1604 return 0; 1626 return 0;
1605} 1627}
@@ -2212,8 +2234,8 @@ __trace_special(void *__tr, void *__data,
2212} 2234}
2213#endif 2235#endif
2214 2236
2215extern long sched_setaffinity(pid_t pid, const cpumask_t *new_mask); 2237extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
2216extern long sched_getaffinity(pid_t pid, cpumask_t *mask); 2238extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
2217 2239
2218extern int sched_mc_power_savings, sched_smt_power_savings; 2240extern int sched_mc_power_savings, sched_smt_power_savings;
2219 2241
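
With the cpumask_t span/cpumask members replaced by trailing unsigned long arrays, callers are expected to go through the new sched_domain_span()/sched_group_cpus() accessors. A minimal sketch using the struct cpumask iterators (count_busy_cpus() is a hypothetical helper):

static int count_busy_cpus(struct sched_domain *sd)
{
	int cpu, busy = 0;

	for_each_cpu(cpu, sched_domain_span(sd))
		if (!idle_cpu(cpu))
			busy++;
	return busy;
}
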
diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h
index dc50bcc282a8..b3dfa72f13b9 100644
--- a/include/linux/seq_file.h
+++ b/include/linux/seq_file.h
@@ -34,6 +34,7 @@ struct seq_operations {
34 34
35#define SEQ_SKIP 1 35#define SEQ_SKIP 1
36 36
37char *mangle_path(char *s, char *p, char *esc);
37int seq_open(struct file *, const struct seq_operations *); 38int seq_open(struct file *, const struct seq_operations *);
38ssize_t seq_read(struct file *, char __user *, size_t, loff_t *); 39ssize_t seq_read(struct file *, char __user *, size_t, loff_t *);
39loff_t seq_lseek(struct file *, loff_t, int); 40loff_t seq_lseek(struct file *, loff_t, int);
diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h
index b106fd8e0d5c..1a8cecc4f38c 100644
--- a/include/linux/stacktrace.h
+++ b/include/linux/stacktrace.h
@@ -15,9 +15,17 @@ extern void save_stack_trace_tsk(struct task_struct *tsk,
15 struct stack_trace *trace); 15 struct stack_trace *trace);
16 16
17extern void print_stack_trace(struct stack_trace *trace, int spaces); 17extern void print_stack_trace(struct stack_trace *trace, int spaces);
18
19#ifdef CONFIG_USER_STACKTRACE_SUPPORT
20extern void save_stack_trace_user(struct stack_trace *trace);
21#else
22# define save_stack_trace_user(trace) do { } while (0)
23#endif
24
18#else 25#else
19# define save_stack_trace(trace) do { } while (0) 26# define save_stack_trace(trace) do { } while (0)
20# define save_stack_trace_tsk(tsk, trace) do { } while (0) 27# define save_stack_trace_tsk(tsk, trace) do { } while (0)
28# define save_stack_trace_user(trace) do { } while (0)
21# define print_stack_trace(trace, spaces) do { } while (0) 29# define print_stack_trace(trace, spaces) do { } while (0)
22#endif 30#endif
23 31
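
A hedged sketch of the new save_stack_trace_user() hook; the struct stack_trace field names are assumed from the rest of include/linux/stacktrace.h rather than shown in this hunk, and dump_user_backtrace() is hypothetical. When CONFIG_USER_STACKTRACE_SUPPORT is off, the call compiles away to the no-op stub added above:

static void dump_user_backtrace(void)
{
	unsigned long entries[16];
	struct stack_trace trace = {
		.entries	= entries,
		.max_entries	= ARRAY_SIZE(entries),
		.nr_entries	= 0,
	};

	save_stack_trace_user(&trace);
	print_stack_trace(&trace, 0);
}
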
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index c5bb39c7a770..757005458366 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -24,8 +24,12 @@ struct tracepoint {
24 const char *name; /* Tracepoint name */ 24 const char *name; /* Tracepoint name */
25 int state; /* State. */ 25 int state; /* State. */
26 void **funcs; 26 void **funcs;
27} __attribute__((aligned(8))); 27} __attribute__((aligned(32))); /*
28 28 * Aligned on 32 bytes because it is
29 * globally visible and gcc happily
30 * aligns these on the structure size.
31 * Keep in sync with vmlinux.lds.h.
32 */
29 33
30#define TPPROTO(args...) args 34#define TPPROTO(args...) args
31#define TPARGS(args...) args 35#define TPARGS(args...) args
@@ -40,14 +44,14 @@ struct tracepoint {
40 do { \ 44 do { \
41 void **it_func; \ 45 void **it_func; \
42 \ 46 \
43 rcu_read_lock_sched(); \ 47 rcu_read_lock_sched_notrace(); \
44 it_func = rcu_dereference((tp)->funcs); \ 48 it_func = rcu_dereference((tp)->funcs); \
45 if (it_func) { \ 49 if (it_func) { \
46 do { \ 50 do { \
47 ((void(*)(proto))(*it_func))(args); \ 51 ((void(*)(proto))(*it_func))(args); \
48 } while (*(++it_func)); \ 52 } while (*(++it_func)); \
49 } \ 53 } \
50 rcu_read_unlock_sched(); \ 54 rcu_read_unlock_sched_notrace(); \
51 } while (0) 55 } while (0)
52 56
53/* 57/*
@@ -55,35 +59,40 @@ struct tracepoint {
55 * not add unwanted padding between the beginning of the section and the 59 * not add unwanted padding between the beginning of the section and the
56 * structure. Force alignment to the same alignment as the section start. 60 * structure. Force alignment to the same alignment as the section start.
57 */ 61 */
58#define DEFINE_TRACE(name, proto, args) \ 62#define DECLARE_TRACE(name, proto, args) \
63 extern struct tracepoint __tracepoint_##name; \
59 static inline void trace_##name(proto) \ 64 static inline void trace_##name(proto) \
60 { \ 65 { \
61 static const char __tpstrtab_##name[] \
62 __attribute__((section("__tracepoints_strings"))) \
63 = #name ":" #proto; \
64 static struct tracepoint __tracepoint_##name \
65 __attribute__((section("__tracepoints"), aligned(8))) = \
66 { __tpstrtab_##name, 0, NULL }; \
67 if (unlikely(__tracepoint_##name.state)) \ 66 if (unlikely(__tracepoint_##name.state)) \
68 __DO_TRACE(&__tracepoint_##name, \ 67 __DO_TRACE(&__tracepoint_##name, \
69 TPPROTO(proto), TPARGS(args)); \ 68 TPPROTO(proto), TPARGS(args)); \
70 } \ 69 } \
71 static inline int register_trace_##name(void (*probe)(proto)) \ 70 static inline int register_trace_##name(void (*probe)(proto)) \
72 { \ 71 { \
73 return tracepoint_probe_register(#name ":" #proto, \ 72 return tracepoint_probe_register(#name, (void *)probe); \
74 (void *)probe); \
75 } \ 73 } \
76 static inline void unregister_trace_##name(void (*probe)(proto))\ 74 static inline int unregister_trace_##name(void (*probe)(proto)) \
77 { \ 75 { \
78 tracepoint_probe_unregister(#name ":" #proto, \ 76 return tracepoint_probe_unregister(#name, (void *)probe);\
79 (void *)probe); \
80 } 77 }
81 78
79#define DEFINE_TRACE(name) \
80 static const char __tpstrtab_##name[] \
81 __attribute__((section("__tracepoints_strings"))) = #name; \
82 struct tracepoint __tracepoint_##name \
83 __attribute__((section("__tracepoints"), aligned(32))) = \
84 { __tpstrtab_##name, 0, NULL }
85
86#define EXPORT_TRACEPOINT_SYMBOL_GPL(name) \
87 EXPORT_SYMBOL_GPL(__tracepoint_##name)
88#define EXPORT_TRACEPOINT_SYMBOL(name) \
89 EXPORT_SYMBOL(__tracepoint_##name)
90
82extern void tracepoint_update_probe_range(struct tracepoint *begin, 91extern void tracepoint_update_probe_range(struct tracepoint *begin,
83 struct tracepoint *end); 92 struct tracepoint *end);
84 93
85#else /* !CONFIG_TRACEPOINTS */ 94#else /* !CONFIG_TRACEPOINTS */
86#define DEFINE_TRACE(name, proto, args) \ 95#define DECLARE_TRACE(name, proto, args) \
87 static inline void _do_trace_##name(struct tracepoint *tp, proto) \ 96 static inline void _do_trace_##name(struct tracepoint *tp, proto) \
88 { } \ 97 { } \
89 static inline void trace_##name(proto) \ 98 static inline void trace_##name(proto) \
@@ -92,8 +101,14 @@ extern void tracepoint_update_probe_range(struct tracepoint *begin,
92 { \ 101 { \
93 return -ENOSYS; \ 102 return -ENOSYS; \
94 } \ 103 } \
95 static inline void unregister_trace_##name(void (*probe)(proto))\ 104 static inline int unregister_trace_##name(void (*probe)(proto)) \
96 { } 105 { \
106 return -ENOSYS; \
107 }
108
109#define DEFINE_TRACE(name)
110#define EXPORT_TRACEPOINT_SYMBOL_GPL(name)
111#define EXPORT_TRACEPOINT_SYMBOL(name)
97 112
98static inline void tracepoint_update_probe_range(struct tracepoint *begin, 113static inline void tracepoint_update_probe_range(struct tracepoint *begin,
99 struct tracepoint *end) 114 struct tracepoint *end)
@@ -112,6 +127,10 @@ extern int tracepoint_probe_register(const char *name, void *probe);
112 */ 127 */
113extern int tracepoint_probe_unregister(const char *name, void *probe); 128extern int tracepoint_probe_unregister(const char *name, void *probe);
114 129
130extern int tracepoint_probe_register_noupdate(const char *name, void *probe);
131extern int tracepoint_probe_unregister_noupdate(const char *name, void *probe);
132extern void tracepoint_probe_update_all(void);
133
115struct tracepoint_iter { 134struct tracepoint_iter {
116 struct module *module; 135 struct module *module;
117 struct tracepoint *tracepoint; 136 struct tracepoint *tracepoint;
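
The DECLARE_TRACE/DEFINE_TRACE split changes the life cycle of a tracepoint: the header only declares it, exactly one .c file defines it (as kernel/exit.c and kernel/fork.c do below), and probes attach through the generated register_trace_<name>() helper. A hedged sketch; probe_fork() and my_tracer_init() are made-up names:

/* in a header, e.g. include/trace/sched.h (see below) */
DECLARE_TRACE(sched_process_fork,
	TPPROTO(struct task_struct *parent, struct task_struct *child),
	TPARGS(parent, child));

/* exactly once, in the code that emits the event (kernel/fork.c below) */
DEFINE_TRACE(sched_process_fork);
/* add EXPORT_TRACEPOINT_SYMBOL_GPL(sched_process_fork) if modular probes need it */

/* in a tracer */
static void probe_fork(struct task_struct *parent, struct task_struct *child)
{
	/* record parent->pid and child->pid somewhere */
}

static int __init my_tracer_init(void)
{
	return register_trace_sched_process_fork(probe_fork);
}
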
diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
index fec6decfb983..6b58367d145e 100644
--- a/include/linux/uaccess.h
+++ b/include/linux/uaccess.h
@@ -78,7 +78,7 @@ static inline unsigned long __copy_from_user_nocache(void *to,
78 \ 78 \
79 set_fs(KERNEL_DS); \ 79 set_fs(KERNEL_DS); \
80 pagefault_disable(); \ 80 pagefault_disable(); \
81 ret = __get_user(retval, (__force typeof(retval) __user *)(addr)); \ 81 ret = __copy_from_user_inatomic(&(retval), (__force typeof(retval) __user *)(addr), sizeof(retval)); \
82 pagefault_enable(); \ 82 pagefault_enable(); \
83 set_fs(old_fs); \ 83 set_fs(old_fs); \
84 ret; \ 84 ret; \
diff --git a/include/trace/boot.h b/include/trace/boot.h
new file mode 100644
index 000000000000..6b54537eab02
--- /dev/null
+++ b/include/trace/boot.h
@@ -0,0 +1,56 @@
1#ifndef _LINUX_TRACE_BOOT_H
2#define _LINUX_TRACE_BOOT_H
3
4/*
5 * Structure which defines the trace of an initcall
6 * while it is called.
7 * You don't have to fill the func field since it is
8 * only used internally by the tracer.
9 */
10struct boot_trace_call {
11 pid_t caller;
12 char func[KSYM_NAME_LEN];
13};
14
15/*
16 * Structure which defines the trace of an initcall
17 * while it returns.
18 */
19struct boot_trace_ret {
20 char func[KSYM_NAME_LEN];
21 int result;
22 unsigned long long duration; /* nsecs */
23};
24
25#ifdef CONFIG_BOOT_TRACER
26/* Append the traces to the ring-buffer */
27extern void trace_boot_call(struct boot_trace_call *bt, initcall_t fn);
28extern void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn);
29
30/* Tells the tracer that smp_pre_initcall is finished,
31 * so we can start the tracing.
32 */
33extern void start_boot_trace(void);
34
35/* Resume the tracing of other necessary events
36 * such as sched switches
37 */
38extern void enable_boot_trace(void);
39
40/* Suspend this tracing. Actually, only sched_switches tracing has
41 * to be suspended. Initcalls don't need it.
42 */
43extern void disable_boot_trace(void);
44#else
45static inline
46void trace_boot_call(struct boot_trace_call *bt, initcall_t fn) { }
47
48static inline
49void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn) { }
50
51static inline void start_boot_trace(void) { }
52static inline void enable_boot_trace(void) { }
53static inline void disable_boot_trace(void) { }
54#endif /* CONFIG_BOOT_TRACER */
55
56#endif /* __LINUX_TRACE_BOOT_H */
diff --git a/include/trace/sched.h b/include/trace/sched.h
index ad47369d01b5..9b2854abf7e2 100644
--- a/include/trace/sched.h
+++ b/include/trace/sched.h
@@ -4,52 +4,52 @@
4#include <linux/sched.h> 4#include <linux/sched.h>
5#include <linux/tracepoint.h> 5#include <linux/tracepoint.h>
6 6
7DEFINE_TRACE(sched_kthread_stop, 7DECLARE_TRACE(sched_kthread_stop,
8 TPPROTO(struct task_struct *t), 8 TPPROTO(struct task_struct *t),
9 TPARGS(t)); 9 TPARGS(t));
10 10
11DEFINE_TRACE(sched_kthread_stop_ret, 11DECLARE_TRACE(sched_kthread_stop_ret,
12 TPPROTO(int ret), 12 TPPROTO(int ret),
13 TPARGS(ret)); 13 TPARGS(ret));
14 14
15DEFINE_TRACE(sched_wait_task, 15DECLARE_TRACE(sched_wait_task,
16 TPPROTO(struct rq *rq, struct task_struct *p), 16 TPPROTO(struct rq *rq, struct task_struct *p),
17 TPARGS(rq, p)); 17 TPARGS(rq, p));
18 18
19DEFINE_TRACE(sched_wakeup, 19DECLARE_TRACE(sched_wakeup,
20 TPPROTO(struct rq *rq, struct task_struct *p), 20 TPPROTO(struct rq *rq, struct task_struct *p),
21 TPARGS(rq, p)); 21 TPARGS(rq, p));
22 22
23DEFINE_TRACE(sched_wakeup_new, 23DECLARE_TRACE(sched_wakeup_new,
24 TPPROTO(struct rq *rq, struct task_struct *p), 24 TPPROTO(struct rq *rq, struct task_struct *p),
25 TPARGS(rq, p)); 25 TPARGS(rq, p));
26 26
27DEFINE_TRACE(sched_switch, 27DECLARE_TRACE(sched_switch,
28 TPPROTO(struct rq *rq, struct task_struct *prev, 28 TPPROTO(struct rq *rq, struct task_struct *prev,
29 struct task_struct *next), 29 struct task_struct *next),
30 TPARGS(rq, prev, next)); 30 TPARGS(rq, prev, next));
31 31
32DEFINE_TRACE(sched_migrate_task, 32DECLARE_TRACE(sched_migrate_task,
33 TPPROTO(struct rq *rq, struct task_struct *p, int dest_cpu), 33 TPPROTO(struct rq *rq, struct task_struct *p, int dest_cpu),
34 TPARGS(rq, p, dest_cpu)); 34 TPARGS(rq, p, dest_cpu));
35 35
36DEFINE_TRACE(sched_process_free, 36DECLARE_TRACE(sched_process_free,
37 TPPROTO(struct task_struct *p), 37 TPPROTO(struct task_struct *p),
38 TPARGS(p)); 38 TPARGS(p));
39 39
40DEFINE_TRACE(sched_process_exit, 40DECLARE_TRACE(sched_process_exit,
41 TPPROTO(struct task_struct *p), 41 TPPROTO(struct task_struct *p),
42 TPARGS(p)); 42 TPARGS(p));
43 43
44DEFINE_TRACE(sched_process_wait, 44DECLARE_TRACE(sched_process_wait,
45 TPPROTO(struct pid *pid), 45 TPPROTO(struct pid *pid),
46 TPARGS(pid)); 46 TPARGS(pid));
47 47
48DEFINE_TRACE(sched_process_fork, 48DECLARE_TRACE(sched_process_fork,
49 TPPROTO(struct task_struct *parent, struct task_struct *child), 49 TPPROTO(struct task_struct *parent, struct task_struct *child),
50 TPARGS(parent, child)); 50 TPARGS(parent, child));
51 51
52DEFINE_TRACE(sched_signal_send, 52DECLARE_TRACE(sched_signal_send,
53 TPPROTO(int sig, struct task_struct *p), 53 TPPROTO(int sig, struct task_struct *p),
54 TPARGS(sig, p)); 54 TPARGS(sig, p));
55 55
diff --git a/init/Kconfig b/init/Kconfig
index f763762d544a..f291f086caa1 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -808,6 +808,7 @@ config TRACEPOINTS
808 808
809config MARKERS 809config MARKERS
810 bool "Activate markers" 810 bool "Activate markers"
811 depends on TRACEPOINTS
811 help 812 help
812 Place an empty function call at each marker site. Can be 813 Place an empty function call at each marker site. Can be
813 dynamically changed for a probe function. 814 dynamically changed for a probe function.
diff --git a/init/main.c b/init/main.c
index 7e117a231af1..79213c0785d2 100644
--- a/init/main.c
+++ b/init/main.c
@@ -63,6 +63,7 @@
63#include <linux/signal.h> 63#include <linux/signal.h>
64#include <linux/idr.h> 64#include <linux/idr.h>
65#include <linux/ftrace.h> 65#include <linux/ftrace.h>
66#include <trace/boot.h>
66 67
67#include <asm/io.h> 68#include <asm/io.h>
68#include <asm/bugs.h> 69#include <asm/bugs.h>
@@ -703,31 +704,35 @@ core_param(initcall_debug, initcall_debug, bool, 0644);
703int do_one_initcall(initcall_t fn) 704int do_one_initcall(initcall_t fn)
704{ 705{
705 int count = preempt_count(); 706 int count = preempt_count();
706 ktime_t delta; 707 ktime_t calltime, delta, rettime;
707 char msgbuf[64]; 708 char msgbuf[64];
708 struct boot_trace it; 709 struct boot_trace_call call;
710 struct boot_trace_ret ret;
709 711
710 if (initcall_debug) { 712 if (initcall_debug) {
711 it.caller = task_pid_nr(current); 713 call.caller = task_pid_nr(current);
712 printk("calling %pF @ %i\n", fn, it.caller); 714 printk("calling %pF @ %i\n", fn, call.caller);
713 it.calltime = ktime_get(); 715 calltime = ktime_get();
716 trace_boot_call(&call, fn);
717 enable_boot_trace();
714 } 718 }
715 719
716 it.result = fn(); 720 ret.result = fn();
717 721
718 if (initcall_debug) { 722 if (initcall_debug) {
719 it.rettime = ktime_get(); 723 disable_boot_trace();
720 delta = ktime_sub(it.rettime, it.calltime); 724 rettime = ktime_get();
721 it.duration = (unsigned long long) delta.tv64 >> 10; 725 delta = ktime_sub(rettime, calltime);
726 ret.duration = (unsigned long long) ktime_to_ns(delta) >> 10;
727 trace_boot_ret(&ret, fn);
722 printk("initcall %pF returned %d after %Ld usecs\n", fn, 728 printk("initcall %pF returned %d after %Ld usecs\n", fn,
723 it.result, it.duration); 729 ret.result, ret.duration);
724 trace_boot(&it, fn);
725 } 730 }
726 731
727 msgbuf[0] = 0; 732 msgbuf[0] = 0;
728 733
729 if (it.result && it.result != -ENODEV && initcall_debug) 734 if (ret.result && ret.result != -ENODEV && initcall_debug)
730 sprintf(msgbuf, "error code %d ", it.result); 735 sprintf(msgbuf, "error code %d ", ret.result);
731 736
732 if (preempt_count() != count) { 737 if (preempt_count() != count) {
733 strlcat(msgbuf, "preemption imbalance ", sizeof(msgbuf)); 738 strlcat(msgbuf, "preemption imbalance ", sizeof(msgbuf));
@@ -741,7 +746,7 @@ int do_one_initcall(initcall_t fn)
741 printk("initcall %pF returned with %s\n", fn, msgbuf); 746 printk("initcall %pF returned with %s\n", fn, msgbuf);
742 } 747 }
743 748
744 return it.result; 749 return ret.result;
745} 750}
746 751
747 752
@@ -882,7 +887,7 @@ static int __init kernel_init(void * unused)
882 * we're essentially up and running. Get rid of the 887 * we're essentially up and running. Get rid of the
883 * initmem segments and start the user-mode stuff.. 888 * initmem segments and start the user-mode stuff..
884 */ 889 */
885 stop_boot_trace(); 890
886 init_post(); 891 init_post();
887 return 0; 892 return 0;
888} 893}
diff --git a/kernel/Makefile b/kernel/Makefile
index 19fad003b19d..010ccb311166 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -19,7 +19,10 @@ CFLAGS_REMOVE_mutex-debug.o = -pg
19CFLAGS_REMOVE_rtmutex-debug.o = -pg 19CFLAGS_REMOVE_rtmutex-debug.o = -pg
20CFLAGS_REMOVE_cgroup-debug.o = -pg 20CFLAGS_REMOVE_cgroup-debug.o = -pg
21CFLAGS_REMOVE_sched_clock.o = -pg 21CFLAGS_REMOVE_sched_clock.o = -pg
22CFLAGS_REMOVE_sched.o = -pg 22endif
23ifdef CONFIG_FUNCTION_RET_TRACER
24CFLAGS_REMOVE_extable.o = -pg # For __kernel_text_address()
25CFLAGS_REMOVE_module.o = -pg # For __module_text_address()
23endif 26endif
24 27
25obj-$(CONFIG_FREEZER) += freezer.o 28obj-$(CONFIG_FREEZER) += freezer.o
@@ -90,7 +93,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/
90obj-$(CONFIG_TRACING) += trace/ 93obj-$(CONFIG_TRACING) += trace/
91obj-$(CONFIG_SMP) += sched_cpupri.o 94obj-$(CONFIG_SMP) += sched_cpupri.o
92 95
93ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) 96ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
94# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 97# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
95# needed for x86 only. Why this used to be enabled for all architectures is beyond 98# needed for x86 only. Why this used to be enabled for all architectures is beyond
96# me. I suspect most platforms don't need this, but until we know that for sure 99# me. I suspect most platforms don't need this, but until we know that for sure
diff --git a/kernel/exit.c b/kernel/exit.c
index 2d8be7ebb0f7..61ba5b4b10cf 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -53,6 +53,10 @@
53#include <asm/pgtable.h> 53#include <asm/pgtable.h>
54#include <asm/mmu_context.h> 54#include <asm/mmu_context.h>
55 55
56DEFINE_TRACE(sched_process_free);
57DEFINE_TRACE(sched_process_exit);
58DEFINE_TRACE(sched_process_wait);
59
56static void exit_mm(struct task_struct * tsk); 60static void exit_mm(struct task_struct * tsk);
57 61
58static inline int task_detached(struct task_struct *p) 62static inline int task_detached(struct task_struct *p)
@@ -1123,7 +1127,6 @@ NORET_TYPE void do_exit(long code)
1123 preempt_disable(); 1127 preempt_disable();
1124 /* causes final put_task_struct in finish_task_switch(). */ 1128 /* causes final put_task_struct in finish_task_switch(). */
1125 tsk->state = TASK_DEAD; 1129 tsk->state = TASK_DEAD;
1126
1127 schedule(); 1130 schedule();
1128 BUG(); 1131 BUG();
1129 /* Avoid "noreturn function does return". */ 1132 /* Avoid "noreturn function does return". */
@@ -1321,10 +1324,10 @@ static int wait_task_zombie(struct task_struct *p, int options,
1321 * group, which consolidates times for all threads in the 1324 * group, which consolidates times for all threads in the
1322 * group including the group leader. 1325 * group including the group leader.
1323 */ 1326 */
1327 thread_group_cputime(p, &cputime);
1324 spin_lock_irq(&p->parent->sighand->siglock); 1328 spin_lock_irq(&p->parent->sighand->siglock);
1325 psig = p->parent->signal; 1329 psig = p->parent->signal;
1326 sig = p->signal; 1330 sig = p->signal;
1327 thread_group_cputime(p, &cputime);
1328 psig->cutime = 1331 psig->cutime =
1329 cputime_add(psig->cutime, 1332 cputime_add(psig->cutime,
1330 cputime_add(cputime.utime, 1333 cputime_add(cputime.utime,
diff --git a/kernel/extable.c b/kernel/extable.c
index a26cb2e17023..adf0cc9c02d6 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -66,3 +66,19 @@ int kernel_text_address(unsigned long addr)
66 return 1; 66 return 1;
67 return module_text_address(addr) != NULL; 67 return module_text_address(addr) != NULL;
68} 68}
69
70/*
71 * On some architectures (PPC64, IA64) function pointers
72 * are actually only tokens to some data that then holds the
73 * real function address. As a result, to find if a function
74 * pointer is part of the kernel text, we need to do some
75 * special dereferencing first.
76 */
77int func_ptr_is_kernel_text(void *ptr)
78{
79 unsigned long addr;
80 addr = (unsigned long) dereference_function_descriptor(ptr);
81 if (core_kernel_text(addr))
82 return 1;
83 return module_text_address(addr) != NULL;
84}
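
func_ptr_is_kernel_text() exists so callers can sanity-check a function pointer even on architectures where the pointer is really a descriptor. A minimal sketch (my_register_hook() and its hook type are hypothetical):

int my_register_hook(void (*hook)(void *data))
{
	if (!func_ptr_is_kernel_text((void *)hook))
		return -EINVAL;
	/* ... remember the hook ... */
	return 0;
}
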
diff --git a/kernel/fork.c b/kernel/fork.c
index 2a372a0e206f..d6e1a3205f62 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -47,6 +47,7 @@
47#include <linux/mount.h> 47#include <linux/mount.h>
48#include <linux/audit.h> 48#include <linux/audit.h>
49#include <linux/memcontrol.h> 49#include <linux/memcontrol.h>
50#include <linux/ftrace.h>
50#include <linux/profile.h> 51#include <linux/profile.h>
51#include <linux/rmap.h> 52#include <linux/rmap.h>
52#include <linux/acct.h> 53#include <linux/acct.h>
@@ -80,6 +81,8 @@ DEFINE_PER_CPU(unsigned long, process_counts) = 0;
80 81
81__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ 82__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
82 83
84DEFINE_TRACE(sched_process_fork);
85
83int nr_processes(void) 86int nr_processes(void)
84{ 87{
85 int cpu; 88 int cpu;
@@ -137,6 +140,7 @@ void free_task(struct task_struct *tsk)
137 prop_local_destroy_single(&tsk->dirties); 140 prop_local_destroy_single(&tsk->dirties);
138 free_thread_info(tsk->stack); 141 free_thread_info(tsk->stack);
139 rt_mutex_debug_task_free(tsk); 142 rt_mutex_debug_task_free(tsk);
143 ftrace_retfunc_exit_task(tsk);
140 free_task_struct(tsk); 144 free_task_struct(tsk);
141} 145}
142EXPORT_SYMBOL(free_task); 146EXPORT_SYMBOL(free_task);
@@ -1267,6 +1271,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1267 total_forks++; 1271 total_forks++;
1268 spin_unlock(&current->sighand->siglock); 1272 spin_unlock(&current->sighand->siglock);
1269 write_unlock_irq(&tasklist_lock); 1273 write_unlock_irq(&tasklist_lock);
1274 ftrace_retfunc_init_task(p);
1270 proc_fork_connector(p); 1275 proc_fork_connector(p);
1271 cgroup_post_fork(p); 1276 cgroup_post_fork(p);
1272 return p; 1277 return p;
diff --git a/kernel/futex.c b/kernel/futex.c
index 8af10027514b..e10c5c8786a6 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -123,24 +123,6 @@ struct futex_hash_bucket {
123static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS]; 123static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
124 124
125/* 125/*
126 * Take mm->mmap_sem, when futex is shared
127 */
128static inline void futex_lock_mm(struct rw_semaphore *fshared)
129{
130 if (fshared)
131 down_read(fshared);
132}
133
134/*
135 * Release mm->mmap_sem, when the futex is shared
136 */
137static inline void futex_unlock_mm(struct rw_semaphore *fshared)
138{
139 if (fshared)
140 up_read(fshared);
141}
142
143/*
144 * We hash on the keys returned from get_futex_key (see below). 126 * We hash on the keys returned from get_futex_key (see below).
145 */ 127 */
146static struct futex_hash_bucket *hash_futex(union futex_key *key) 128static struct futex_hash_bucket *hash_futex(union futex_key *key)
@@ -161,6 +143,45 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
161 && key1->both.offset == key2->both.offset); 143 && key1->both.offset == key2->both.offset);
162} 144}
163 145
146/*
147 * Take a reference to the resource addressed by a key.
148 * Can be called while holding spinlocks.
149 *
150 */
151static void get_futex_key_refs(union futex_key *key)
152{
153 if (!key->both.ptr)
154 return;
155
156 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
157 case FUT_OFF_INODE:
158 atomic_inc(&key->shared.inode->i_count);
159 break;
160 case FUT_OFF_MMSHARED:
161 atomic_inc(&key->private.mm->mm_count);
162 break;
163 }
164}
165
166/*
167 * Drop a reference to the resource addressed by a key.
168 * The hash bucket spinlock must not be held.
169 */
170static void drop_futex_key_refs(union futex_key *key)
171{
172 if (!key->both.ptr)
173 return;
174
175 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
176 case FUT_OFF_INODE:
177 iput(key->shared.inode);
178 break;
179 case FUT_OFF_MMSHARED:
180 mmdrop(key->private.mm);
181 break;
182 }
183}
184
164/** 185/**
165 * get_futex_key - Get parameters which are the keys for a futex. 186 * get_futex_key - Get parameters which are the keys for a futex.
166 * @uaddr: virtual address of the futex 187 * @uaddr: virtual address of the futex
@@ -179,12 +200,10 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
179 * For other futexes, it points to &current->mm->mmap_sem and 200 * For other futexes, it points to &current->mm->mmap_sem and
180 * caller must have taken the reader lock, but NOT any spinlocks. 201 * caller must have taken the reader lock, but NOT any spinlocks.
181 */ 202 */
182static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared, 203static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
183 union futex_key *key)
184{ 204{
185 unsigned long address = (unsigned long)uaddr; 205 unsigned long address = (unsigned long)uaddr;
186 struct mm_struct *mm = current->mm; 206 struct mm_struct *mm = current->mm;
187 struct vm_area_struct *vma;
188 struct page *page; 207 struct page *page;
189 int err; 208 int err;
190 209
@@ -208,100 +227,50 @@ static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared,
208 return -EFAULT; 227 return -EFAULT;
209 key->private.mm = mm; 228 key->private.mm = mm;
210 key->private.address = address; 229 key->private.address = address;
230 get_futex_key_refs(key);
211 return 0; 231 return 0;
212 } 232 }
213 /*
214 * The futex is hashed differently depending on whether
215 * it's in a shared or private mapping. So check vma first.
216 */
217 vma = find_extend_vma(mm, address);
218 if (unlikely(!vma))
219 return -EFAULT;
220 233
221 /* 234again:
222 * Permissions. 235 err = get_user_pages_fast(address, 1, 0, &page);
223 */ 236 if (err < 0)
224 if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ)) 237 return err;
225 return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES; 238
239 lock_page(page);
240 if (!page->mapping) {
241 unlock_page(page);
242 put_page(page);
243 goto again;
244 }
226 245
227 /* 246 /*
228 * Private mappings are handled in a simple way. 247 * Private mappings are handled in a simple way.
229 * 248 *
230 * NOTE: When userspace waits on a MAP_SHARED mapping, even if 249 * NOTE: When userspace waits on a MAP_SHARED mapping, even if
231 * it's a read-only handle, it's expected that futexes attach to 250 * it's a read-only handle, it's expected that futexes attach to
232 * the object not the particular process. Therefore we use 251 * the object not the particular process.
233 * VM_MAYSHARE here, not VM_SHARED which is restricted to shared
234 * mappings of _writable_ handles.
235 */ 252 */
236 if (likely(!(vma->vm_flags & VM_MAYSHARE))) { 253 if (PageAnon(page)) {
237 key->both.offset |= FUT_OFF_MMSHARED; /* reference taken on mm */ 254 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
238 key->private.mm = mm; 255 key->private.mm = mm;
239 key->private.address = address; 256 key->private.address = address;
240 return 0; 257 } else {
258 key->both.offset |= FUT_OFF_INODE; /* inode-based key */
259 key->shared.inode = page->mapping->host;
260 key->shared.pgoff = page->index;
241 } 261 }
242 262
243 /* 263 get_futex_key_refs(key);
244 * Linear file mappings are also simple.
245 */
246 key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
247 key->both.offset |= FUT_OFF_INODE; /* inode-based key. */
248 if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
249 key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
250 + vma->vm_pgoff);
251 return 0;
252 }
253 264
254 /* 265 unlock_page(page);
255 * We could walk the page table to read the non-linear 266 put_page(page);
256 * pte, and get the page index without fetching the page 267 return 0;
257 * from swap. But that's a lot of code to duplicate here
258 * for a rare case, so we simply fetch the page.
259 */
260 err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL);
261 if (err >= 0) {
262 key->shared.pgoff =
263 page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
264 put_page(page);
265 return 0;
266 }
267 return err;
268}
269
270/*
271 * Take a reference to the resource addressed by a key.
272 * Can be called while holding spinlocks.
273 *
274 */
275static void get_futex_key_refs(union futex_key *key)
276{
277 if (key->both.ptr == NULL)
278 return;
279 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
280 case FUT_OFF_INODE:
281 atomic_inc(&key->shared.inode->i_count);
282 break;
283 case FUT_OFF_MMSHARED:
284 atomic_inc(&key->private.mm->mm_count);
285 break;
286 }
287} 268}
288 269
289/* 270static inline
290 * Drop a reference to the resource addressed by a key. 271void put_futex_key(int fshared, union futex_key *key)
291 * The hash bucket spinlock must not be held.
292 */
293static void drop_futex_key_refs(union futex_key *key)
294{ 272{
295 if (!key->both.ptr) 273 drop_futex_key_refs(key);
296 return;
297 switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
298 case FUT_OFF_INODE:
299 iput(key->shared.inode);
300 break;
301 case FUT_OFF_MMSHARED:
302 mmdrop(key->private.mm);
303 break;
304 }
305} 274}
306 275
307static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) 276static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
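
After this rewrite a futex key carries its own reference (mm, inode, or nothing for private keys) instead of relying on mmap_sem being held. The calling convention, sketched here for orientation inside a hypothetical wrapper, is the same in every path below: initialize, resolve, use, drop:

static int example_futex_op(u32 __user *uaddr, int fshared)
{
	union futex_key key = FUTEX_KEY_INIT;
	int ret;

	ret = get_futex_key(uaddr, fshared, &key);
	if (ret)
		return ret;
	/* ... hash the key, lock the bucket, wake or queue waiters ... */
	put_futex_key(fshared, &key);
	return 0;
}
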
@@ -328,10 +297,8 @@ static int get_futex_value_locked(u32 *dest, u32 __user *from)
328 297
329/* 298/*
330 * Fault handling. 299 * Fault handling.
331 * if fshared is non NULL, current->mm->mmap_sem is already held
332 */ 300 */
333static int futex_handle_fault(unsigned long address, 301static int futex_handle_fault(unsigned long address, int attempt)
334 struct rw_semaphore *fshared, int attempt)
335{ 302{
336 struct vm_area_struct * vma; 303 struct vm_area_struct * vma;
337 struct mm_struct *mm = current->mm; 304 struct mm_struct *mm = current->mm;
@@ -340,8 +307,7 @@ static int futex_handle_fault(unsigned long address,
340 if (attempt > 2) 307 if (attempt > 2)
341 return ret; 308 return ret;
342 309
343 if (!fshared) 310 down_read(&mm->mmap_sem);
344 down_read(&mm->mmap_sem);
345 vma = find_vma(mm, address); 311 vma = find_vma(mm, address);
346 if (vma && address >= vma->vm_start && 312 if (vma && address >= vma->vm_start &&
347 (vma->vm_flags & VM_WRITE)) { 313 (vma->vm_flags & VM_WRITE)) {
@@ -361,8 +327,7 @@ static int futex_handle_fault(unsigned long address,
361 current->min_flt++; 327 current->min_flt++;
362 } 328 }
363 } 329 }
364 if (!fshared) 330 up_read(&mm->mmap_sem);
365 up_read(&mm->mmap_sem);
366 return ret; 331 return ret;
367} 332}
368 333
@@ -385,6 +350,7 @@ static int refill_pi_state_cache(void)
385 /* pi_mutex gets initialized later */ 350 /* pi_mutex gets initialized later */
386 pi_state->owner = NULL; 351 pi_state->owner = NULL;
387 atomic_set(&pi_state->refcount, 1); 352 atomic_set(&pi_state->refcount, 1);
353 pi_state->key = FUTEX_KEY_INIT;
388 354
389 current->pi_state_cache = pi_state; 355 current->pi_state_cache = pi_state;
390 356
@@ -462,7 +428,7 @@ void exit_pi_state_list(struct task_struct *curr)
462 struct list_head *next, *head = &curr->pi_state_list; 428 struct list_head *next, *head = &curr->pi_state_list;
463 struct futex_pi_state *pi_state; 429 struct futex_pi_state *pi_state;
464 struct futex_hash_bucket *hb; 430 struct futex_hash_bucket *hb;
465 union futex_key key; 431 union futex_key key = FUTEX_KEY_INIT;
466 432
467 if (!futex_cmpxchg_enabled) 433 if (!futex_cmpxchg_enabled)
468 return; 434 return;
@@ -719,20 +685,17 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
719 * Wake up all waiters hashed on the physical page that is mapped 685 * Wake up all waiters hashed on the physical page that is mapped
720 * to this virtual address: 686 * to this virtual address:
721 */ 687 */
722static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, 688static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
723 int nr_wake, u32 bitset)
724{ 689{
725 struct futex_hash_bucket *hb; 690 struct futex_hash_bucket *hb;
726 struct futex_q *this, *next; 691 struct futex_q *this, *next;
727 struct plist_head *head; 692 struct plist_head *head;
728 union futex_key key; 693 union futex_key key = FUTEX_KEY_INIT;
729 int ret; 694 int ret;
730 695
731 if (!bitset) 696 if (!bitset)
732 return -EINVAL; 697 return -EINVAL;
733 698
734 futex_lock_mm(fshared);
735
736 ret = get_futex_key(uaddr, fshared, &key); 699 ret = get_futex_key(uaddr, fshared, &key);
737 if (unlikely(ret != 0)) 700 if (unlikely(ret != 0))
738 goto out; 701 goto out;
@@ -760,7 +723,7 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
760 723
761 spin_unlock(&hb->lock); 724 spin_unlock(&hb->lock);
762out: 725out:
763 futex_unlock_mm(fshared); 726 put_futex_key(fshared, &key);
764 return ret; 727 return ret;
765} 728}
766 729
@@ -769,19 +732,16 @@ out:
769 * to this virtual address: 732 * to this virtual address:
770 */ 733 */
771static int 734static int
772futex_wake_op(u32 __user *uaddr1, struct rw_semaphore *fshared, 735futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
773 u32 __user *uaddr2,
774 int nr_wake, int nr_wake2, int op) 736 int nr_wake, int nr_wake2, int op)
775{ 737{
776 union futex_key key1, key2; 738 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
777 struct futex_hash_bucket *hb1, *hb2; 739 struct futex_hash_bucket *hb1, *hb2;
778 struct plist_head *head; 740 struct plist_head *head;
779 struct futex_q *this, *next; 741 struct futex_q *this, *next;
780 int ret, op_ret, attempt = 0; 742 int ret, op_ret, attempt = 0;
781 743
782retryfull: 744retryfull:
783 futex_lock_mm(fshared);
784
785 ret = get_futex_key(uaddr1, fshared, &key1); 745 ret = get_futex_key(uaddr1, fshared, &key1);
786 if (unlikely(ret != 0)) 746 if (unlikely(ret != 0))
787 goto out; 747 goto out;
@@ -826,18 +786,12 @@ retry:
826 */ 786 */
827 if (attempt++) { 787 if (attempt++) {
828 ret = futex_handle_fault((unsigned long)uaddr2, 788 ret = futex_handle_fault((unsigned long)uaddr2,
829 fshared, attempt); 789 attempt);
830 if (ret) 790 if (ret)
831 goto out; 791 goto out;
832 goto retry; 792 goto retry;
833 } 793 }
834 794
835 /*
836 * If we would have faulted, release mmap_sem,
837 * fault it in and start all over again.
838 */
839 futex_unlock_mm(fshared);
840
841 ret = get_user(dummy, uaddr2); 795 ret = get_user(dummy, uaddr2);
842 if (ret) 796 if (ret)
843 return ret; 797 return ret;
@@ -873,7 +827,8 @@ retry:
873 if (hb1 != hb2) 827 if (hb1 != hb2)
874 spin_unlock(&hb2->lock); 828 spin_unlock(&hb2->lock);
875out: 829out:
876 futex_unlock_mm(fshared); 830 put_futex_key(fshared, &key2);
831 put_futex_key(fshared, &key1);
877 832
878 return ret; 833 return ret;
879} 834}
@@ -882,19 +837,16 @@ out:
882 * Requeue all waiters hashed on one physical page to another 837 * Requeue all waiters hashed on one physical page to another
883 * physical page. 838 * physical page.
884 */ 839 */
885static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared, 840static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
886 u32 __user *uaddr2,
887 int nr_wake, int nr_requeue, u32 *cmpval) 841 int nr_wake, int nr_requeue, u32 *cmpval)
888{ 842{
889 union futex_key key1, key2; 843 union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
890 struct futex_hash_bucket *hb1, *hb2; 844 struct futex_hash_bucket *hb1, *hb2;
891 struct plist_head *head1; 845 struct plist_head *head1;
892 struct futex_q *this, *next; 846 struct futex_q *this, *next;
893 int ret, drop_count = 0; 847 int ret, drop_count = 0;
894 848
895 retry: 849 retry:
896 futex_lock_mm(fshared);
897
898 ret = get_futex_key(uaddr1, fshared, &key1); 850 ret = get_futex_key(uaddr1, fshared, &key1);
899 if (unlikely(ret != 0)) 851 if (unlikely(ret != 0))
900 goto out; 852 goto out;
@@ -917,12 +869,6 @@ static int futex_requeue(u32 __user *uaddr1, struct rw_semaphore *fshared,
917 if (hb1 != hb2) 869 if (hb1 != hb2)
918 spin_unlock(&hb2->lock); 870 spin_unlock(&hb2->lock);
919 871
920 /*
921 * If we would have faulted, release mmap_sem, fault
922 * it in and start all over again.
923 */
924 futex_unlock_mm(fshared);
925
926 ret = get_user(curval, uaddr1); 872 ret = get_user(curval, uaddr1);
927 873
928 if (!ret) 874 if (!ret)
@@ -974,7 +920,8 @@ out_unlock:
974 drop_futex_key_refs(&key1); 920 drop_futex_key_refs(&key1);
975 921
976out: 922out:
977 futex_unlock_mm(fshared); 923 put_futex_key(fshared, &key2);
924 put_futex_key(fshared, &key1);
978 return ret; 925 return ret;
979} 926}
980 927
@@ -1096,8 +1043,7 @@ static void unqueue_me_pi(struct futex_q *q)
1096 * private futexes. 1043 * private futexes.
1097 */ 1044 */
1098static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, 1045static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1099 struct task_struct *newowner, 1046 struct task_struct *newowner, int fshared)
1100 struct rw_semaphore *fshared)
1101{ 1047{
1102 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; 1048 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
1103 struct futex_pi_state *pi_state = q->pi_state; 1049 struct futex_pi_state *pi_state = q->pi_state;
@@ -1176,7 +1122,7 @@ retry:
1176handle_fault: 1122handle_fault:
1177 spin_unlock(q->lock_ptr); 1123 spin_unlock(q->lock_ptr);
1178 1124
1179 ret = futex_handle_fault((unsigned long)uaddr, fshared, attempt++); 1125 ret = futex_handle_fault((unsigned long)uaddr, attempt++);
1180 1126
1181 spin_lock(q->lock_ptr); 1127 spin_lock(q->lock_ptr);
1182 1128
@@ -1200,7 +1146,7 @@ handle_fault:
1200 1146
1201static long futex_wait_restart(struct restart_block *restart); 1147static long futex_wait_restart(struct restart_block *restart);
1202 1148
1203static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, 1149static int futex_wait(u32 __user *uaddr, int fshared,
1204 u32 val, ktime_t *abs_time, u32 bitset) 1150 u32 val, ktime_t *abs_time, u32 bitset)
1205{ 1151{
1206 struct task_struct *curr = current; 1152 struct task_struct *curr = current;
@@ -1218,8 +1164,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1218 q.pi_state = NULL; 1164 q.pi_state = NULL;
1219 q.bitset = bitset; 1165 q.bitset = bitset;
1220 retry: 1166 retry:
1221 futex_lock_mm(fshared); 1167 q.key = FUTEX_KEY_INIT;
1222
1223 ret = get_futex_key(uaddr, fshared, &q.key); 1168 ret = get_futex_key(uaddr, fshared, &q.key);
1224 if (unlikely(ret != 0)) 1169 if (unlikely(ret != 0))
1225 goto out_release_sem; 1170 goto out_release_sem;
@@ -1251,12 +1196,6 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1251 if (unlikely(ret)) { 1196 if (unlikely(ret)) {
1252 queue_unlock(&q, hb); 1197 queue_unlock(&q, hb);
1253 1198
1254 /*
1255 * If we would have faulted, release mmap_sem, fault it in and
1256 * start all over again.
1257 */
1258 futex_unlock_mm(fshared);
1259
1260 ret = get_user(uval, uaddr); 1199 ret = get_user(uval, uaddr);
1261 1200
1262 if (!ret) 1201 if (!ret)
@@ -1271,12 +1210,6 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1271 queue_me(&q, hb); 1210 queue_me(&q, hb);
1272 1211
1273 /* 1212 /*
1274 * Now the futex is queued and we have checked the data, we
1275 * don't want to hold mmap_sem while we sleep.
1276 */
1277 futex_unlock_mm(fshared);
1278
1279 /*
1280 * There might have been scheduling since the queue_me(), as we 1213 * There might have been scheduling since the queue_me(), as we
1281 * cannot hold a spinlock across the get_user() in case it 1214 * cannot hold a spinlock across the get_user() in case it
1282 * faults, and we cannot just set TASK_INTERRUPTIBLE state when 1215 * faults, and we cannot just set TASK_INTERRUPTIBLE state when
@@ -1363,7 +1296,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1363 queue_unlock(&q, hb); 1296 queue_unlock(&q, hb);
1364 1297
1365 out_release_sem: 1298 out_release_sem:
1366 futex_unlock_mm(fshared); 1299 put_futex_key(fshared, &q.key);
1367 return ret; 1300 return ret;
1368} 1301}
1369 1302
@@ -1371,13 +1304,13 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1371static long futex_wait_restart(struct restart_block *restart) 1304static long futex_wait_restart(struct restart_block *restart)
1372{ 1305{
1373 u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; 1306 u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
1374 struct rw_semaphore *fshared = NULL; 1307 int fshared = 0;
1375 ktime_t t; 1308 ktime_t t;
1376 1309
1377 t.tv64 = restart->futex.time; 1310 t.tv64 = restart->futex.time;
1378 restart->fn = do_no_restart_syscall; 1311 restart->fn = do_no_restart_syscall;
1379 if (restart->futex.flags & FLAGS_SHARED) 1312 if (restart->futex.flags & FLAGS_SHARED)
1380 fshared = &current->mm->mmap_sem; 1313 fshared = 1;
1381 return (long)futex_wait(uaddr, fshared, restart->futex.val, &t, 1314 return (long)futex_wait(uaddr, fshared, restart->futex.val, &t,
1382 restart->futex.bitset); 1315 restart->futex.bitset);
1383} 1316}
@@ -1389,7 +1322,7 @@ static long futex_wait_restart(struct restart_block *restart)
1389 * if there are waiters then it will block, it does PI, etc. (Due to 1322 * if there are waiters then it will block, it does PI, etc. (Due to
1390 * races the kernel might see a 0 value of the futex too.) 1323 * races the kernel might see a 0 value of the futex too.)
1391 */ 1324 */
1392static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, 1325static int futex_lock_pi(u32 __user *uaddr, int fshared,
1393 int detect, ktime_t *time, int trylock) 1326 int detect, ktime_t *time, int trylock)
1394{ 1327{
1395 struct hrtimer_sleeper timeout, *to = NULL; 1328 struct hrtimer_sleeper timeout, *to = NULL;
@@ -1412,8 +1345,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1412 1345
1413 q.pi_state = NULL; 1346 q.pi_state = NULL;
1414 retry: 1347 retry:
1415 futex_lock_mm(fshared); 1348 q.key = FUTEX_KEY_INIT;
1416
1417 ret = get_futex_key(uaddr, fshared, &q.key); 1349 ret = get_futex_key(uaddr, fshared, &q.key);
1418 if (unlikely(ret != 0)) 1350 if (unlikely(ret != 0))
1419 goto out_release_sem; 1351 goto out_release_sem;
@@ -1502,7 +1434,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1502 * exit to complete. 1434 * exit to complete.
1503 */ 1435 */
1504 queue_unlock(&q, hb); 1436 queue_unlock(&q, hb);
1505 futex_unlock_mm(fshared);
1506 cond_resched(); 1437 cond_resched();
1507 goto retry; 1438 goto retry;
1508 1439
@@ -1534,12 +1465,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1534 */ 1465 */
1535 queue_me(&q, hb); 1466 queue_me(&q, hb);
1536 1467
1537 /*
1538 * Now the futex is queued and we have checked the data, we
1539 * don't want to hold mmap_sem while we sleep.
1540 */
1541 futex_unlock_mm(fshared);
1542
1543 WARN_ON(!q.pi_state); 1468 WARN_ON(!q.pi_state);
1544 /* 1469 /*
1545 * Block on the PI mutex: 1470 * Block on the PI mutex:
@@ -1552,7 +1477,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1552 ret = ret ? 0 : -EWOULDBLOCK; 1477 ret = ret ? 0 : -EWOULDBLOCK;
1553 } 1478 }
1554 1479
1555 futex_lock_mm(fshared);
1556 spin_lock(q.lock_ptr); 1480 spin_lock(q.lock_ptr);
1557 1481
1558 if (!ret) { 1482 if (!ret) {
@@ -1618,7 +1542,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1618 1542
1619 /* Unqueue and drop the lock */ 1543 /* Unqueue and drop the lock */
1620 unqueue_me_pi(&q); 1544 unqueue_me_pi(&q);
1621 futex_unlock_mm(fshared);
1622 1545
1623 if (to) 1546 if (to)
1624 destroy_hrtimer_on_stack(&to->timer); 1547 destroy_hrtimer_on_stack(&to->timer);
@@ -1628,7 +1551,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1628 queue_unlock(&q, hb); 1551 queue_unlock(&q, hb);
1629 1552
1630 out_release_sem: 1553 out_release_sem:
1631 futex_unlock_mm(fshared); 1554 put_futex_key(fshared, &q.key);
1632 if (to) 1555 if (to)
1633 destroy_hrtimer_on_stack(&to->timer); 1556 destroy_hrtimer_on_stack(&to->timer);
1634 return ret; 1557 return ret;
@@ -1645,15 +1568,12 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1645 queue_unlock(&q, hb); 1568 queue_unlock(&q, hb);
1646 1569
1647 if (attempt++) { 1570 if (attempt++) {
1648 ret = futex_handle_fault((unsigned long)uaddr, fshared, 1571 ret = futex_handle_fault((unsigned long)uaddr, attempt);
1649 attempt);
1650 if (ret) 1572 if (ret)
1651 goto out_release_sem; 1573 goto out_release_sem;
1652 goto retry_unlocked; 1574 goto retry_unlocked;
1653 } 1575 }
1654 1576
1655 futex_unlock_mm(fshared);
1656
1657 ret = get_user(uval, uaddr); 1577 ret = get_user(uval, uaddr);
1658 if (!ret && (uval != -EFAULT)) 1578 if (!ret && (uval != -EFAULT))
1659 goto retry; 1579 goto retry;
@@ -1668,13 +1588,13 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1668 * This is the in-kernel slowpath: we look up the PI state (if any), 1588 * This is the in-kernel slowpath: we look up the PI state (if any),
1669 * and do the rt-mutex unlock. 1589 * and do the rt-mutex unlock.
1670 */ 1590 */
1671static int futex_unlock_pi(u32 __user *uaddr, struct rw_semaphore *fshared) 1591static int futex_unlock_pi(u32 __user *uaddr, int fshared)
1672{ 1592{
1673 struct futex_hash_bucket *hb; 1593 struct futex_hash_bucket *hb;
1674 struct futex_q *this, *next; 1594 struct futex_q *this, *next;
1675 u32 uval; 1595 u32 uval;
1676 struct plist_head *head; 1596 struct plist_head *head;
1677 union futex_key key; 1597 union futex_key key = FUTEX_KEY_INIT;
1678 int ret, attempt = 0; 1598 int ret, attempt = 0;
1679 1599
1680retry: 1600retry:
@@ -1685,10 +1605,6 @@ retry:
1685 */ 1605 */
1686 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) 1606 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
1687 return -EPERM; 1607 return -EPERM;
1688 /*
1689 * First take all the futex related locks:
1690 */
1691 futex_lock_mm(fshared);
1692 1608
1693 ret = get_futex_key(uaddr, fshared, &key); 1609 ret = get_futex_key(uaddr, fshared, &key);
1694 if (unlikely(ret != 0)) 1610 if (unlikely(ret != 0))
@@ -1747,7 +1663,7 @@ retry_unlocked:
1747out_unlock: 1663out_unlock:
1748 spin_unlock(&hb->lock); 1664 spin_unlock(&hb->lock);
1749out: 1665out:
1750 futex_unlock_mm(fshared); 1666 put_futex_key(fshared, &key);
1751 1667
1752 return ret; 1668 return ret;
1753 1669
@@ -1763,16 +1679,13 @@ pi_faulted:
1763 spin_unlock(&hb->lock); 1679 spin_unlock(&hb->lock);
1764 1680
1765 if (attempt++) { 1681 if (attempt++) {
1766 ret = futex_handle_fault((unsigned long)uaddr, fshared, 1682 ret = futex_handle_fault((unsigned long)uaddr, attempt);
1767 attempt);
1768 if (ret) 1683 if (ret)
1769 goto out; 1684 goto out;
1770 uval = 0; 1685 uval = 0;
1771 goto retry_unlocked; 1686 goto retry_unlocked;
1772 } 1687 }
1773 1688
1774 futex_unlock_mm(fshared);
1775
1776 ret = get_user(uval, uaddr); 1689 ret = get_user(uval, uaddr);
1777 if (!ret && (uval != -EFAULT)) 1690 if (!ret && (uval != -EFAULT))
1778 goto retry; 1691 goto retry;
@@ -1898,8 +1811,7 @@ retry:
1898 * PI futexes happens in exit_pi_state(): 1811 * PI futexes happens in exit_pi_state():
1899 */ 1812 */
1900 if (!pi && (uval & FUTEX_WAITERS)) 1813 if (!pi && (uval & FUTEX_WAITERS))
1901 futex_wake(uaddr, &curr->mm->mmap_sem, 1, 1814 futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
1902 FUTEX_BITSET_MATCH_ANY);
1903 } 1815 }
1904 return 0; 1816 return 0;
1905} 1817}
@@ -1995,10 +1907,10 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
1995{ 1907{
1996 int ret = -ENOSYS; 1908 int ret = -ENOSYS;
1997 int cmd = op & FUTEX_CMD_MASK; 1909 int cmd = op & FUTEX_CMD_MASK;
1998 struct rw_semaphore *fshared = NULL; 1910 int fshared = 0;
1999 1911
2000 if (!(op & FUTEX_PRIVATE_FLAG)) 1912 if (!(op & FUTEX_PRIVATE_FLAG))
2001 fshared = &current->mm->mmap_sem; 1913 fshared = 1;
2002 1914
2003 switch (cmd) { 1915 switch (cmd) {
2004 case FUTEX_WAIT: 1916 case FUTEX_WAIT:
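
The futex rework visible above replaces the mmap_sem choreography (futex_lock_mm/futex_unlock_mm around every operation) with reference-counted keys: fshared shrinks to an int flag, keys start life as FUTEX_KEY_INIT, get_futex_key() takes its own references, and every exit path balances it with put_futex_key(). A minimal sketch of the resulting call shape, using only helpers that appear in the hunks above (illustrative, not a verbatim copy of futex_wait()):

static int example_futex_op(u32 __user *uaddr, int fshared)
{
	union futex_key key = FUTEX_KEY_INIT;
	int ret;

	ret = get_futex_key(uaddr, fshared, &key);	/* pins mm or inode as needed */
	if (unlikely(ret != 0))
		return ret;

	/* ... hash the key, queue, read the user word, possibly sleep ... */

	put_futex_key(fshared, &key);		/* drop the references on every path out */
	return ret;
}
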
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 8e7a7ce3ed0a..4fbc456f393d 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -21,6 +21,9 @@ static DEFINE_SPINLOCK(kthread_create_lock);
21static LIST_HEAD(kthread_create_list); 21static LIST_HEAD(kthread_create_list);
22struct task_struct *kthreadd_task; 22struct task_struct *kthreadd_task;
23 23
24DEFINE_TRACE(sched_kthread_stop);
25DEFINE_TRACE(sched_kthread_stop_ret);
26
24struct kthread_create_info 27struct kthread_create_info
25{ 28{
26 /* Information passed to kthread() from kthreadd. */ 29 /* Information passed to kthread() from kthreadd. */
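
The two DEFINE_TRACE() lines are the flip side of the new tracepoint model: DECLARE_TRACE() in a header only declares the trace_*() call site, and exactly one compilation unit per tracepoint must instantiate it with DEFINE_TRACE(). A hedged sketch of the pattern; the proto/args wrappers follow the include/trace/sched.h convention of this series, so treat the exact macro spelling as an assumption:

/* header: declares trace_sched_kthread_stop() for callers */
DECLARE_TRACE(sched_kthread_stop,
	TPPROTO(struct task_struct *t),
	TPARGS(t));

/* exactly one .c file owns the tracepoint instance */
DEFINE_TRACE(sched_kthread_stop);

/* call site, as in kthread_stop() */
trace_sched_kthread_stop(k);
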
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 46a404173db2..e4bdda8dcf04 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -136,16 +136,16 @@ static inline struct lock_class *hlock_class(struct held_lock *hlock)
136#ifdef CONFIG_LOCK_STAT 136#ifdef CONFIG_LOCK_STAT
137static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats); 137static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats);
138 138
139static int lock_contention_point(struct lock_class *class, unsigned long ip) 139static int lock_point(unsigned long points[], unsigned long ip)
140{ 140{
141 int i; 141 int i;
142 142
143 for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) { 143 for (i = 0; i < LOCKSTAT_POINTS; i++) {
144 if (class->contention_point[i] == 0) { 144 if (points[i] == 0) {
145 class->contention_point[i] = ip; 145 points[i] = ip;
146 break; 146 break;
147 } 147 }
148 if (class->contention_point[i] == ip) 148 if (points[i] == ip)
149 break; 149 break;
150 } 150 }
151 151
@@ -185,6 +185,9 @@ struct lock_class_stats lock_stats(struct lock_class *class)
185 for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++) 185 for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++)
186 stats.contention_point[i] += pcs->contention_point[i]; 186 stats.contention_point[i] += pcs->contention_point[i];
187 187
188 for (i = 0; i < ARRAY_SIZE(stats.contending_point); i++)
189 stats.contending_point[i] += pcs->contending_point[i];
190
188 lock_time_add(&pcs->read_waittime, &stats.read_waittime); 191 lock_time_add(&pcs->read_waittime, &stats.read_waittime);
189 lock_time_add(&pcs->write_waittime, &stats.write_waittime); 192 lock_time_add(&pcs->write_waittime, &stats.write_waittime);
190 193
@@ -209,6 +212,7 @@ void clear_lock_stats(struct lock_class *class)
209 memset(cpu_stats, 0, sizeof(struct lock_class_stats)); 212 memset(cpu_stats, 0, sizeof(struct lock_class_stats));
210 } 213 }
211 memset(class->contention_point, 0, sizeof(class->contention_point)); 214 memset(class->contention_point, 0, sizeof(class->contention_point));
215 memset(class->contending_point, 0, sizeof(class->contending_point));
212} 216}
213 217
214static struct lock_class_stats *get_lock_stats(struct lock_class *class) 218static struct lock_class_stats *get_lock_stats(struct lock_class *class)
@@ -2999,7 +3003,7 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
2999 struct held_lock *hlock, *prev_hlock; 3003 struct held_lock *hlock, *prev_hlock;
3000 struct lock_class_stats *stats; 3004 struct lock_class_stats *stats;
3001 unsigned int depth; 3005 unsigned int depth;
3002 int i, point; 3006 int i, contention_point, contending_point;
3003 3007
3004 depth = curr->lockdep_depth; 3008 depth = curr->lockdep_depth;
3005 if (DEBUG_LOCKS_WARN_ON(!depth)) 3009 if (DEBUG_LOCKS_WARN_ON(!depth))
@@ -3023,18 +3027,22 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
3023found_it: 3027found_it:
3024 hlock->waittime_stamp = sched_clock(); 3028 hlock->waittime_stamp = sched_clock();
3025 3029
3026 point = lock_contention_point(hlock_class(hlock), ip); 3030 contention_point = lock_point(hlock_class(hlock)->contention_point, ip);
3031 contending_point = lock_point(hlock_class(hlock)->contending_point,
3032 lock->ip);
3027 3033
3028 stats = get_lock_stats(hlock_class(hlock)); 3034 stats = get_lock_stats(hlock_class(hlock));
3029 if (point < ARRAY_SIZE(stats->contention_point)) 3035 if (contention_point < LOCKSTAT_POINTS)
3030 stats->contention_point[point]++; 3036 stats->contention_point[contention_point]++;
3037 if (contending_point < LOCKSTAT_POINTS)
3038 stats->contending_point[contending_point]++;
3031 if (lock->cpu != smp_processor_id()) 3039 if (lock->cpu != smp_processor_id())
3032 stats->bounces[bounce_contended + !!hlock->read]++; 3040 stats->bounces[bounce_contended + !!hlock->read]++;
3033 put_lock_stats(stats); 3041 put_lock_stats(stats);
3034} 3042}
3035 3043
3036static void 3044static void
3037__lock_acquired(struct lockdep_map *lock) 3045__lock_acquired(struct lockdep_map *lock, unsigned long ip)
3038{ 3046{
3039 struct task_struct *curr = current; 3047 struct task_struct *curr = current;
3040 struct held_lock *hlock, *prev_hlock; 3048 struct held_lock *hlock, *prev_hlock;
@@ -3083,6 +3091,7 @@ found_it:
3083 put_lock_stats(stats); 3091 put_lock_stats(stats);
3084 3092
3085 lock->cpu = cpu; 3093 lock->cpu = cpu;
3094 lock->ip = ip;
3086} 3095}
3087 3096
3088void lock_contended(struct lockdep_map *lock, unsigned long ip) 3097void lock_contended(struct lockdep_map *lock, unsigned long ip)
@@ -3104,7 +3113,7 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
3104} 3113}
3105EXPORT_SYMBOL_GPL(lock_contended); 3114EXPORT_SYMBOL_GPL(lock_contended);
3106 3115
3107void lock_acquired(struct lockdep_map *lock) 3116void lock_acquired(struct lockdep_map *lock, unsigned long ip)
3108{ 3117{
3109 unsigned long flags; 3118 unsigned long flags;
3110 3119
@@ -3117,7 +3126,7 @@ void lock_acquired(struct lockdep_map *lock)
3117 raw_local_irq_save(flags); 3126 raw_local_irq_save(flags);
3118 check_flags(flags); 3127 check_flags(flags);
3119 current->lockdep_recursion = 1; 3128 current->lockdep_recursion = 1;
3120 __lock_acquired(lock); 3129 __lock_acquired(lock, ip);
3121 current->lockdep_recursion = 0; 3130 current->lockdep_recursion = 0;
3122 raw_local_irq_restore(flags); 3131 raw_local_irq_restore(flags);
3123} 3132}
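
With __lock_acquired() now receiving the caller ip and stashing it in lock->ip, lockstat can attribute not only where a waiter blocked (contention_point[]) but also where the current holder had taken the lock (the new contending_point[], looked up via lock->ip in __lock_contended()). lock_contention_point() therefore becomes the generic lock_point(): claim the first free slot for a new ip, or reuse the slot that already holds it, and let the caller bump the matching counter. A standalone userspace model of that slot logic (the LOCKSTAT_POINTS value here is illustrative):

#include <stdio.h>

#define LOCKSTAT_POINTS 4

/* returns the slot index for ip, claiming the first empty slot;
 * returns LOCKSTAT_POINTS when the table is full and ip is not present */
static int lock_point(unsigned long points[], unsigned long ip)
{
	int i;

	for (i = 0; i < LOCKSTAT_POINTS; i++) {
		if (points[i] == 0) {
			points[i] = ip;
			break;
		}
		if (points[i] == ip)
			break;
	}
	return i;
}

int main(void)
{
	unsigned long points[LOCKSTAT_POINTS] = { 0 };
	unsigned long hits[LOCKSTAT_POINTS] = { 0 };
	unsigned long ips[] = { 0xc0100010UL, 0xc0100020UL, 0xc0100010UL };
	int i, slot;

	for (i = 0; i < 3; i++) {
		slot = lock_point(points, ips[i]);
		if (slot < LOCKSTAT_POINTS)
			hits[slot]++;
	}
	for (i = 0; i < LOCKSTAT_POINTS && points[i]; i++)
		printf("point %#lx: %lu hits\n", points[i], hits[i]);
	return 0;
}
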
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 20dbcbf9c7dd..13716b813896 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -470,11 +470,12 @@ static void seq_line(struct seq_file *m, char c, int offset, int length)
470 470
471static void snprint_time(char *buf, size_t bufsiz, s64 nr) 471static void snprint_time(char *buf, size_t bufsiz, s64 nr)
472{ 472{
473 unsigned long rem; 473 s64 div;
474 s32 rem;
474 475
475 nr += 5; /* for display rounding */ 476 nr += 5; /* for display rounding */
476 rem = do_div(nr, 1000); /* XXX: do_div_signed */ 477 div = div_s64_rem(nr, 1000, &rem);
477 snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, (int)rem/10); 478 snprintf(buf, bufsiz, "%lld.%02d", (long long)div, (int)rem/10);
478} 479}
479 480
480static void seq_time(struct seq_file *m, s64 time) 481static void seq_time(struct seq_file *m, s64 time)
@@ -556,7 +557,7 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
556 if (stats->read_holdtime.nr) 557 if (stats->read_holdtime.nr)
557 namelen += 2; 558 namelen += 2;
558 559
559 for (i = 0; i < ARRAY_SIZE(class->contention_point); i++) { 560 for (i = 0; i < LOCKSTAT_POINTS; i++) {
560 char sym[KSYM_SYMBOL_LEN]; 561 char sym[KSYM_SYMBOL_LEN];
561 char ip[32]; 562 char ip[32];
562 563
@@ -573,6 +574,23 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
573 stats->contention_point[i], 574 stats->contention_point[i],
574 ip, sym); 575 ip, sym);
575 } 576 }
577 for (i = 0; i < LOCKSTAT_POINTS; i++) {
578 char sym[KSYM_SYMBOL_LEN];
579 char ip[32];
580
581 if (class->contending_point[i] == 0)
582 break;
583
584 if (!i)
585 seq_line(m, '-', 40-namelen, namelen);
586
587 sprint_symbol(sym, class->contending_point[i]);
588 snprintf(ip, sizeof(ip), "[<%p>]",
589 (void *)class->contending_point[i]);
590 seq_printf(m, "%40s %14lu %29s %s\n", name,
591 stats->contending_point[i],
592 ip, sym);
593 }
576 if (i) { 594 if (i) {
577 seq_puts(m, "\n"); 595 seq_puts(m, "\n");
578 seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1)); 596 seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1));
@@ -582,7 +600,7 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
582 600
583static void seq_header(struct seq_file *m) 601static void seq_header(struct seq_file *m)
584{ 602{
585 seq_printf(m, "lock_stat version 0.2\n"); 603 seq_printf(m, "lock_stat version 0.3\n");
586 seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); 604 seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1));
587 seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s " 605 seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s "
588 "%14s %14s\n", 606 "%14s %14s\n",
diff --git a/kernel/marker.c b/kernel/marker.c
index e9c6b2bc9400..ea54f2647868 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -43,6 +43,7 @@ static DEFINE_MUTEX(markers_mutex);
43 */ 43 */
44#define MARKER_HASH_BITS 6 44#define MARKER_HASH_BITS 6
45#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS) 45#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS)
46static struct hlist_head marker_table[MARKER_TABLE_SIZE];
46 47
47/* 48/*
48 * Note about RCU : 49 * Note about RCU :
@@ -64,11 +65,10 @@ struct marker_entry {
64 void *oldptr; 65 void *oldptr;
65 int rcu_pending; 66 int rcu_pending;
66 unsigned char ptype:1; 67 unsigned char ptype:1;
68 unsigned char format_allocated:1;
67 char name[0]; /* Contains name'\0'format'\0' */ 69 char name[0]; /* Contains name'\0'format'\0' */
68}; 70};
69 71
70static struct hlist_head marker_table[MARKER_TABLE_SIZE];
71
72/** 72/**
73 * __mark_empty_function - Empty probe callback 73 * __mark_empty_function - Empty probe callback
74 * @probe_private: probe private data 74 * @probe_private: probe private data
@@ -81,7 +81,7 @@ static struct hlist_head marker_table[MARKER_TABLE_SIZE];
81 * though the function pointer change and the marker enabling are two distinct 81 * though the function pointer change and the marker enabling are two distinct
82 * operations that modifies the execution flow of preemptible code. 82 * operations that modifies the execution flow of preemptible code.
83 */ 83 */
84void __mark_empty_function(void *probe_private, void *call_private, 84notrace void __mark_empty_function(void *probe_private, void *call_private,
85 const char *fmt, va_list *args) 85 const char *fmt, va_list *args)
86{ 86{
87} 87}
@@ -97,7 +97,8 @@ EXPORT_SYMBOL_GPL(__mark_empty_function);
97 * need to put a full smp_rmb() in this branch. This is why we do not use 97 * need to put a full smp_rmb() in this branch. This is why we do not use
98 * rcu_dereference() for the pointer read. 98 * rcu_dereference() for the pointer read.
99 */ 99 */
100void marker_probe_cb(const struct marker *mdata, void *call_private, ...) 100notrace void marker_probe_cb(const struct marker *mdata,
101 void *call_private, ...)
101{ 102{
102 va_list args; 103 va_list args;
103 char ptype; 104 char ptype;
@@ -107,7 +108,7 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...)
107 * sure the teardown of the callbacks can be done correctly when they 108 * sure the teardown of the callbacks can be done correctly when they
108 * are in modules and they insure RCU read coherency. 109 * are in modules and they insure RCU read coherency.
109 */ 110 */
110 rcu_read_lock_sched(); 111 rcu_read_lock_sched_notrace();
111 ptype = mdata->ptype; 112 ptype = mdata->ptype;
112 if (likely(!ptype)) { 113 if (likely(!ptype)) {
113 marker_probe_func *func; 114 marker_probe_func *func;
@@ -145,7 +146,7 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...)
145 va_end(args); 146 va_end(args);
146 } 147 }
147 } 148 }
148 rcu_read_unlock_sched(); 149 rcu_read_unlock_sched_notrace();
149} 150}
150EXPORT_SYMBOL_GPL(marker_probe_cb); 151EXPORT_SYMBOL_GPL(marker_probe_cb);
151 152
@@ -157,12 +158,13 @@ EXPORT_SYMBOL_GPL(marker_probe_cb);
157 * 158 *
158 * Should be connected to markers "MARK_NOARGS". 159 * Should be connected to markers "MARK_NOARGS".
159 */ 160 */
160void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...) 161static notrace void marker_probe_cb_noarg(const struct marker *mdata,
162 void *call_private, ...)
161{ 163{
162 va_list args; /* not initialized */ 164 va_list args; /* not initialized */
163 char ptype; 165 char ptype;
164 166
165 rcu_read_lock_sched(); 167 rcu_read_lock_sched_notrace();
166 ptype = mdata->ptype; 168 ptype = mdata->ptype;
167 if (likely(!ptype)) { 169 if (likely(!ptype)) {
168 marker_probe_func *func; 170 marker_probe_func *func;
@@ -195,9 +197,8 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...)
195 multi[i].func(multi[i].probe_private, call_private, 197 multi[i].func(multi[i].probe_private, call_private,
196 mdata->format, &args); 198 mdata->format, &args);
197 } 199 }
198 rcu_read_unlock_sched(); 200 rcu_read_unlock_sched_notrace();
199} 201}
200EXPORT_SYMBOL_GPL(marker_probe_cb_noarg);
201 202
202static void free_old_closure(struct rcu_head *head) 203static void free_old_closure(struct rcu_head *head)
203{ 204{
@@ -416,6 +417,7 @@ static struct marker_entry *add_marker(const char *name, const char *format)
416 e->single.probe_private = NULL; 417 e->single.probe_private = NULL;
417 e->multi = NULL; 418 e->multi = NULL;
418 e->ptype = 0; 419 e->ptype = 0;
420 e->format_allocated = 0;
419 e->refcount = 0; 421 e->refcount = 0;
420 e->rcu_pending = 0; 422 e->rcu_pending = 0;
421 hlist_add_head(&e->hlist, head); 423 hlist_add_head(&e->hlist, head);
@@ -447,6 +449,8 @@ static int remove_marker(const char *name)
447 if (e->single.func != __mark_empty_function) 449 if (e->single.func != __mark_empty_function)
448 return -EBUSY; 450 return -EBUSY;
449 hlist_del(&e->hlist); 451 hlist_del(&e->hlist);
452 if (e->format_allocated)
453 kfree(e->format);
450 /* Make sure the call_rcu has been executed */ 454 /* Make sure the call_rcu has been executed */
451 if (e->rcu_pending) 455 if (e->rcu_pending)
452 rcu_barrier_sched(); 456 rcu_barrier_sched();
@@ -457,57 +461,34 @@ static int remove_marker(const char *name)
457/* 461/*
458 * Set the mark_entry format to the format found in the element. 462 * Set the mark_entry format to the format found in the element.
459 */ 463 */
460static int marker_set_format(struct marker_entry **entry, const char *format) 464static int marker_set_format(struct marker_entry *entry, const char *format)
461{ 465{
462 struct marker_entry *e; 466 entry->format = kstrdup(format, GFP_KERNEL);
463 size_t name_len = strlen((*entry)->name) + 1; 467 if (!entry->format)
464 size_t format_len = strlen(format) + 1;
465
466
467 e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
468 GFP_KERNEL);
469 if (!e)
470 return -ENOMEM; 468 return -ENOMEM;
471 memcpy(&e->name[0], (*entry)->name, name_len); 469 entry->format_allocated = 1;
472 e->format = &e->name[name_len]; 470
473 memcpy(e->format, format, format_len);
474 if (strcmp(e->format, MARK_NOARGS) == 0)
475 e->call = marker_probe_cb_noarg;
476 else
477 e->call = marker_probe_cb;
478 e->single = (*entry)->single;
479 e->multi = (*entry)->multi;
480 e->ptype = (*entry)->ptype;
481 e->refcount = (*entry)->refcount;
482 e->rcu_pending = 0;
483 hlist_add_before(&e->hlist, &(*entry)->hlist);
484 hlist_del(&(*entry)->hlist);
485 /* Make sure the call_rcu has been executed */
486 if ((*entry)->rcu_pending)
487 rcu_barrier_sched();
488 kfree(*entry);
489 *entry = e;
490 trace_mark(core_marker_format, "name %s format %s", 471 trace_mark(core_marker_format, "name %s format %s",
491 e->name, e->format); 472 entry->name, entry->format);
492 return 0; 473 return 0;
493} 474}
494 475
495/* 476/*
496 * Sets the probe callback corresponding to one marker. 477 * Sets the probe callback corresponding to one marker.
497 */ 478 */
498static int set_marker(struct marker_entry **entry, struct marker *elem, 479static int set_marker(struct marker_entry *entry, struct marker *elem,
499 int active) 480 int active)
500{ 481{
501 int ret; 482 int ret = 0;
502 WARN_ON(strcmp((*entry)->name, elem->name) != 0); 483 WARN_ON(strcmp(entry->name, elem->name) != 0);
503 484
504 if ((*entry)->format) { 485 if (entry->format) {
505 if (strcmp((*entry)->format, elem->format) != 0) { 486 if (strcmp(entry->format, elem->format) != 0) {
506 printk(KERN_NOTICE 487 printk(KERN_NOTICE
507 "Format mismatch for probe %s " 488 "Format mismatch for probe %s "
508 "(%s), marker (%s)\n", 489 "(%s), marker (%s)\n",
509 (*entry)->name, 490 entry->name,
510 (*entry)->format, 491 entry->format,
511 elem->format); 492 elem->format);
512 return -EPERM; 493 return -EPERM;
513 } 494 }
@@ -523,37 +504,67 @@ static int set_marker(struct marker_entry **entry, struct marker *elem,
523 * pass from a "safe" callback (with argument) to an "unsafe" 504 * pass from a "safe" callback (with argument) to an "unsafe"
524 * callback (does not set arguments). 505 * callback (does not set arguments).
525 */ 506 */
526 elem->call = (*entry)->call; 507 elem->call = entry->call;
527 /* 508 /*
528 * Sanity check : 509 * Sanity check :
529 * We only update the single probe private data when the ptr is 510 * We only update the single probe private data when the ptr is
530 * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1) 511 * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1)
531 */ 512 */
532 WARN_ON(elem->single.func != __mark_empty_function 513 WARN_ON(elem->single.func != __mark_empty_function
533 && elem->single.probe_private 514 && elem->single.probe_private != entry->single.probe_private
534 != (*entry)->single.probe_private && 515 && !elem->ptype);
535 !elem->ptype); 516 elem->single.probe_private = entry->single.probe_private;
536 elem->single.probe_private = (*entry)->single.probe_private;
537 /* 517 /*
538 * Make sure the private data is valid when we update the 518 * Make sure the private data is valid when we update the
539 * single probe ptr. 519 * single probe ptr.
540 */ 520 */
541 smp_wmb(); 521 smp_wmb();
542 elem->single.func = (*entry)->single.func; 522 elem->single.func = entry->single.func;
543 /* 523 /*
544 * We also make sure that the new probe callbacks array is consistent 524 * We also make sure that the new probe callbacks array is consistent
545 * before setting a pointer to it. 525 * before setting a pointer to it.
546 */ 526 */
547 rcu_assign_pointer(elem->multi, (*entry)->multi); 527 rcu_assign_pointer(elem->multi, entry->multi);
548 /* 528 /*
549 * Update the function or multi probe array pointer before setting the 529 * Update the function or multi probe array pointer before setting the
550 * ptype. 530 * ptype.
551 */ 531 */
552 smp_wmb(); 532 smp_wmb();
553 elem->ptype = (*entry)->ptype; 533 elem->ptype = entry->ptype;
534
535 if (elem->tp_name && (active ^ elem->state)) {
536 WARN_ON(!elem->tp_cb);
537 /*
538 * It is ok to directly call the probe registration because type
539 * checking has been done in the __trace_mark_tp() macro.
540 */
541
542 if (active) {
543 /*
544 * try_module_get should always succeed because we hold
545 * lock_module() to get the tp_cb address.
546 */
547 ret = try_module_get(__module_text_address(
548 (unsigned long)elem->tp_cb));
549 BUG_ON(!ret);
550 ret = tracepoint_probe_register_noupdate(
551 elem->tp_name,
552 elem->tp_cb);
553 } else {
554 ret = tracepoint_probe_unregister_noupdate(
555 elem->tp_name,
556 elem->tp_cb);
557 /*
558 * tracepoint_probe_update_all() must be called
559 * before the module containing tp_cb is unloaded.
560 */
561 module_put(__module_text_address(
562 (unsigned long)elem->tp_cb));
563 }
564 }
554 elem->state = active; 565 elem->state = active;
555 566
556 return 0; 567 return ret;
557} 568}
558 569
559/* 570/*
@@ -564,7 +575,24 @@ static int set_marker(struct marker_entry **entry, struct marker *elem,
564 */ 575 */
565static void disable_marker(struct marker *elem) 576static void disable_marker(struct marker *elem)
566{ 577{
578 int ret;
579
567 /* leave "call" as is. It is known statically. */ 580 /* leave "call" as is. It is known statically. */
581 if (elem->tp_name && elem->state) {
582 WARN_ON(!elem->tp_cb);
583 /*
584 * It is ok to directly call the probe registration because type
585 * checking has been done in the __trace_mark_tp() macro.
586 */
587 ret = tracepoint_probe_unregister_noupdate(elem->tp_name,
588 elem->tp_cb);
589 WARN_ON(ret);
590 /*
591 * tracepoint_probe_update_all() must be called
592 * before the module containing tp_cb is unloaded.
593 */
594 module_put(__module_text_address((unsigned long)elem->tp_cb));
595 }
568 elem->state = 0; 596 elem->state = 0;
569 elem->single.func = __mark_empty_function; 597 elem->single.func = __mark_empty_function;
570 /* Update the function before setting the ptype */ 598 /* Update the function before setting the ptype */
@@ -594,8 +622,7 @@ void marker_update_probe_range(struct marker *begin,
594 for (iter = begin; iter < end; iter++) { 622 for (iter = begin; iter < end; iter++) {
595 mark_entry = get_marker(iter->name); 623 mark_entry = get_marker(iter->name);
596 if (mark_entry) { 624 if (mark_entry) {
597 set_marker(&mark_entry, iter, 625 set_marker(mark_entry, iter, !!mark_entry->refcount);
598 !!mark_entry->refcount);
599 /* 626 /*
600 * ignore error, continue 627 * ignore error, continue
601 */ 628 */
@@ -629,6 +656,7 @@ static void marker_update_probes(void)
629 marker_update_probe_range(__start___markers, __stop___markers); 656 marker_update_probe_range(__start___markers, __stop___markers);
630 /* Markers in modules. */ 657 /* Markers in modules. */
631 module_update_markers(); 658 module_update_markers();
659 tracepoint_probe_update_all();
632} 660}
633 661
634/** 662/**
@@ -657,7 +685,7 @@ int marker_probe_register(const char *name, const char *format,
657 ret = PTR_ERR(entry); 685 ret = PTR_ERR(entry);
658 } else if (format) { 686 } else if (format) {
659 if (!entry->format) 687 if (!entry->format)
660 ret = marker_set_format(&entry, format); 688 ret = marker_set_format(entry, format);
661 else if (strcmp(entry->format, format)) 689 else if (strcmp(entry->format, format))
662 ret = -EPERM; 690 ret = -EPERM;
663 } 691 }
@@ -676,10 +704,11 @@ int marker_probe_register(const char *name, const char *format,
676 goto end; 704 goto end;
677 } 705 }
678 mutex_unlock(&markers_mutex); 706 mutex_unlock(&markers_mutex);
679 marker_update_probes(); /* may update entry */ 707 marker_update_probes();
680 mutex_lock(&markers_mutex); 708 mutex_lock(&markers_mutex);
681 entry = get_marker(name); 709 entry = get_marker(name);
682 WARN_ON(!entry); 710 if (!entry)
711 goto end;
683 if (entry->rcu_pending) 712 if (entry->rcu_pending)
684 rcu_barrier_sched(); 713 rcu_barrier_sched();
685 entry->oldptr = old; 714 entry->oldptr = old;
@@ -720,7 +749,7 @@ int marker_probe_unregister(const char *name,
720 rcu_barrier_sched(); 749 rcu_barrier_sched();
721 old = marker_entry_remove_probe(entry, probe, probe_private); 750 old = marker_entry_remove_probe(entry, probe, probe_private);
722 mutex_unlock(&markers_mutex); 751 mutex_unlock(&markers_mutex);
723 marker_update_probes(); /* may update entry */ 752 marker_update_probes();
724 mutex_lock(&markers_mutex); 753 mutex_lock(&markers_mutex);
725 entry = get_marker(name); 754 entry = get_marker(name);
726 if (!entry) 755 if (!entry)
@@ -801,10 +830,11 @@ int marker_probe_unregister_private_data(marker_probe_func *probe,
801 rcu_barrier_sched(); 830 rcu_barrier_sched();
802 old = marker_entry_remove_probe(entry, NULL, probe_private); 831 old = marker_entry_remove_probe(entry, NULL, probe_private);
803 mutex_unlock(&markers_mutex); 832 mutex_unlock(&markers_mutex);
804 marker_update_probes(); /* may update entry */ 833 marker_update_probes();
805 mutex_lock(&markers_mutex); 834 mutex_lock(&markers_mutex);
806 entry = get_marker_from_private_data(probe, probe_private); 835 entry = get_marker_from_private_data(probe, probe_private);
807 WARN_ON(!entry); 836 if (!entry)
837 goto end;
808 if (entry->rcu_pending) 838 if (entry->rcu_pending)
809 rcu_barrier_sched(); 839 rcu_barrier_sched();
810 entry->oldptr = old; 840 entry->oldptr = old;
@@ -848,8 +878,6 @@ void *marker_get_private_data(const char *name, marker_probe_func *probe,
848 if (!e->ptype) { 878 if (!e->ptype) {
849 if (num == 0 && e->single.func == probe) 879 if (num == 0 && e->single.func == probe)
850 return e->single.probe_private; 880 return e->single.probe_private;
851 else
852 break;
853 } else { 881 } else {
854 struct marker_probe_closure *closure; 882 struct marker_probe_closure *closure;
855 int match = 0; 883 int match = 0;
@@ -861,8 +889,42 @@ void *marker_get_private_data(const char *name, marker_probe_func *probe,
861 return closure[i].probe_private; 889 return closure[i].probe_private;
862 } 890 }
863 } 891 }
892 break;
864 } 893 }
865 } 894 }
866 return ERR_PTR(-ENOENT); 895 return ERR_PTR(-ENOENT);
867} 896}
868EXPORT_SYMBOL_GPL(marker_get_private_data); 897EXPORT_SYMBOL_GPL(marker_get_private_data);
898
899#ifdef CONFIG_MODULES
900
901int marker_module_notify(struct notifier_block *self,
902 unsigned long val, void *data)
903{
904 struct module *mod = data;
905
906 switch (val) {
907 case MODULE_STATE_COMING:
908 marker_update_probe_range(mod->markers,
909 mod->markers + mod->num_markers);
910 break;
911 case MODULE_STATE_GOING:
912 marker_update_probe_range(mod->markers,
913 mod->markers + mod->num_markers);
914 break;
915 }
916 return 0;
917}
918
919struct notifier_block marker_module_nb = {
920 .notifier_call = marker_module_notify,
921 .priority = 0,
922};
923
924static int init_markers(void)
925{
926 return register_module_notifier(&marker_module_nb);
927}
928__initcall(init_markers);
929
930#endif /* CONFIG_MODULES */
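
Besides the notrace annotations and the kstrdup()-based marker_set_format(), module markers are now refreshed from the module notifier above rather than from load_module(), and set_marker()/disable_marker() learn to forward a marker to a tracepoint through tp_name/tp_cb, batching those changes behind tracepoint_probe_update_all() in marker_update_probes(). For orientation, a minimal sketch of the marker API these paths serve; the probe signature matches __mark_empty_function() above, but the event name and format used here are illustrative:

#include <linux/module.h>
#include <linux/marker.h>

/* probe: receives the private data plus the marker's format and arguments */
static void my_probe(void *probe_private, void *call_private,
		     const char *fmt, va_list *args)
{
	/* decode the arguments described by fmt */
}

static void instrumented_path(void)
{
	/* dispatches through the marker's ->call, e.g. marker_probe_cb() above */
	trace_mark(subsys_event, "value %d", 42);
}

static int __init my_probe_init(void)
{
	/* the format string is now kstrdup()ed into the marker_entry
	 * (format_allocated) instead of reallocating the whole entry */
	return marker_probe_register("subsys_event", "value %d", my_probe, NULL);
}

static void __exit my_probe_exit(void)
{
	marker_probe_unregister("subsys_event", my_probe, NULL);
}
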
diff --git a/kernel/module.c b/kernel/module.c
index 1f4cc00e0c20..89bcf7c1327d 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2184,24 +2184,15 @@ static noinline struct module *load_module(void __user *umod,
2184 struct mod_debug *debug; 2184 struct mod_debug *debug;
2185 unsigned int num_debug; 2185 unsigned int num_debug;
2186 2186
2187#ifdef CONFIG_MARKERS
2188 marker_update_probe_range(mod->markers,
2189 mod->markers + mod->num_markers);
2190#endif
2191 debug = section_objs(hdr, sechdrs, secstrings, "__verbose", 2187 debug = section_objs(hdr, sechdrs, secstrings, "__verbose",
2192 sizeof(*debug), &num_debug); 2188 sizeof(*debug), &num_debug);
2193 dynamic_printk_setup(debug, num_debug); 2189 dynamic_printk_setup(debug, num_debug);
2194
2195#ifdef CONFIG_TRACEPOINTS
2196 tracepoint_update_probe_range(mod->tracepoints,
2197 mod->tracepoints + mod->num_tracepoints);
2198#endif
2199 } 2190 }
2200 2191
2201 /* sechdrs[0].sh_size is always zero */ 2192 /* sechdrs[0].sh_size is always zero */
2202 mseg = section_objs(hdr, sechdrs, secstrings, "__mcount_loc", 2193 mseg = section_objs(hdr, sechdrs, secstrings, "__mcount_loc",
2203 sizeof(*mseg), &num_mcount); 2194 sizeof(*mseg), &num_mcount);
2204 ftrace_init_module(mseg, mseg + num_mcount); 2195 ftrace_init_module(mod, mseg, mseg + num_mcount);
2205 2196
2206 err = module_finalize(hdr, sechdrs, mod); 2197 err = module_finalize(hdr, sechdrs, mod);
2207 if (err < 0) 2198 if (err < 0)
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 12c779dc65d4..4f45d4b658ef 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -59,7 +59,7 @@ EXPORT_SYMBOL(__mutex_init);
59 * We also put the fastpath first in the kernel image, to make sure the 59 * We also put the fastpath first in the kernel image, to make sure the
60 * branch is predicted by the CPU as default-untaken. 60 * branch is predicted by the CPU as default-untaken.
61 */ 61 */
62static void noinline __sched 62static __used noinline void __sched
63__mutex_lock_slowpath(atomic_t *lock_count); 63__mutex_lock_slowpath(atomic_t *lock_count);
64 64
65/*** 65/***
@@ -96,7 +96,7 @@ void inline __sched mutex_lock(struct mutex *lock)
96EXPORT_SYMBOL(mutex_lock); 96EXPORT_SYMBOL(mutex_lock);
97#endif 97#endif
98 98
99static noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); 99static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count);
100 100
101/*** 101/***
102 * mutex_unlock - release the mutex 102 * mutex_unlock - release the mutex
@@ -184,7 +184,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
184 } 184 }
185 185
186done: 186done:
187 lock_acquired(&lock->dep_map); 187 lock_acquired(&lock->dep_map, ip);
188 /* got the lock - rejoice! */ 188 /* got the lock - rejoice! */
189 mutex_remove_waiter(lock, &waiter, task_thread_info(task)); 189 mutex_remove_waiter(lock, &waiter, task_thread_info(task));
190 debug_mutex_set_owner(lock, task_thread_info(task)); 190 debug_mutex_set_owner(lock, task_thread_info(task));
@@ -268,7 +268,7 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
268/* 268/*
269 * Release the lock, slowpath: 269 * Release the lock, slowpath:
270 */ 270 */
271static noinline void 271static __used noinline void
272__mutex_unlock_slowpath(atomic_t *lock_count) 272__mutex_unlock_slowpath(atomic_t *lock_count)
273{ 273{
274 __mutex_unlock_common_slowpath(lock_count, 1); 274 __mutex_unlock_common_slowpath(lock_count, 1);
@@ -313,7 +313,7 @@ int __sched mutex_lock_killable(struct mutex *lock)
313} 313}
314EXPORT_SYMBOL(mutex_lock_killable); 314EXPORT_SYMBOL(mutex_lock_killable);
315 315
316static noinline void __sched 316static __used noinline void __sched
317__mutex_lock_slowpath(atomic_t *lock_count) 317__mutex_lock_slowpath(atomic_t *lock_count)
318{ 318{
319 struct mutex *lock = container_of(lock_count, struct mutex, count); 319 struct mutex *lock = container_of(lock_count, struct mutex, count);
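
Two independent things happen in mutex.c: the slowpath functions gain __used because they are referenced only from the architectures' atomic fastpath assembly, so the compiler would otherwise treat them as unreferenced; and the contention hook now carries the acquisition ip, lock_acquired(&lock->dep_map, ip), which feeds the contending_point[] statistics added in lockdep above. A hedged sketch of the caller-side pairing; try_acquire() and do_acquire() are hypothetical placeholders for a lock's fast and slow paths:

if (!try_acquire(lock)) {
	lock_contended(&lock->dep_map, _RET_IP_);	/* about to block: record the waiter's ip */
	do_acquire(lock);
}
lock_acquired(&lock->dep_map, _RET_IP_);		/* got it: record where it was taken */
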
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 4282c0a40a57..61d5aa5eced3 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -82,6 +82,14 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl,
82 82
83 while (nb && nr_to_call) { 83 while (nb && nr_to_call) {
84 next_nb = rcu_dereference(nb->next); 84 next_nb = rcu_dereference(nb->next);
85
86#ifdef CONFIG_DEBUG_NOTIFIERS
87 if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) {
88 WARN(1, "Invalid notifier called!");
89 nb = next_nb;
90 continue;
91 }
92#endif
85 ret = nb->notifier_call(nb, val, v); 93 ret = nb->notifier_call(nb, val, v);
86 94
87 if (nr_calls) 95 if (nr_calls)
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 895337b16a24..3f4377e0aa04 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -58,21 +58,21 @@ void thread_group_cputime(
58 struct task_struct *tsk, 58 struct task_struct *tsk,
59 struct task_cputime *times) 59 struct task_cputime *times)
60{ 60{
61 struct signal_struct *sig; 61 struct task_cputime *totals, *tot;
62 int i; 62 int i;
63 struct task_cputime *tot;
64 63
65 sig = tsk->signal; 64 totals = tsk->signal->cputime.totals;
66 if (unlikely(!sig) || !sig->cputime.totals) { 65 if (!totals) {
67 times->utime = tsk->utime; 66 times->utime = tsk->utime;
68 times->stime = tsk->stime; 67 times->stime = tsk->stime;
69 times->sum_exec_runtime = tsk->se.sum_exec_runtime; 68 times->sum_exec_runtime = tsk->se.sum_exec_runtime;
70 return; 69 return;
71 } 70 }
71
72 times->stime = times->utime = cputime_zero; 72 times->stime = times->utime = cputime_zero;
73 times->sum_exec_runtime = 0; 73 times->sum_exec_runtime = 0;
74 for_each_possible_cpu(i) { 74 for_each_possible_cpu(i) {
75 tot = per_cpu_ptr(tsk->signal->cputime.totals, i); 75 tot = per_cpu_ptr(totals, i);
76 times->utime = cputime_add(times->utime, tot->utime); 76 times->utime = cputime_add(times->utime, tot->utime);
77 times->stime = cputime_add(times->stime, tot->stime); 77 times->stime = cputime_add(times->stime, tot->stime);
78 times->sum_exec_runtime += tot->sum_exec_runtime; 78 times->sum_exec_runtime += tot->sum_exec_runtime;
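
The thread_group_cputime() change is a straight cleanup: the unlikely(!sig) test cannot trigger for a live task, and the per-cpu totals pointer is read once into a local instead of being re-dereferenced on every loop iteration. The shape of the computation is unchanged; a standalone model of it (NR_CPUS and the field types here are illustrative):

#include <stdio.h>

#define NR_CPUS 4

struct task_cputime {
	unsigned long long utime, stime, sum_exec_runtime;
};

/* sum the per-CPU totals when they exist, otherwise fall back to the
 * single task's own counters */
static void group_cputime(const struct task_cputime *totals,
			  const struct task_cputime *task,
			  struct task_cputime *times)
{
	int i;

	if (!totals) {
		*times = *task;
		return;
	}

	times->utime = times->stime = times->sum_exec_runtime = 0;
	for (i = 0; i < NR_CPUS; i++) {
		times->utime += totals[i].utime;
		times->stime += totals[i].stime;
		times->sum_exec_runtime += totals[i].sum_exec_runtime;
	}
}

int main(void)
{
	struct task_cputime percpu[NR_CPUS] = {
		{ 10, 1, 100 }, { 20, 2, 200 }, { 0, 0, 0 }, { 5, 5, 50 },
	};
	struct task_cputime self = { 3, 1, 30 }, out;

	group_cputime(percpu, &self, &out);
	printf("utime=%llu stime=%llu runtime=%llu\n",
	       out.utime, out.stime, out.sum_exec_runtime);
	group_cputime(NULL, &self, &out);
	printf("fallback utime=%llu\n", out.utime);
	return 0;
}
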
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index c9d74083746f..f77d3819ef57 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -22,7 +22,6 @@
22#include <linux/console.h> 22#include <linux/console.h>
23#include <linux/cpu.h> 23#include <linux/cpu.h>
24#include <linux/freezer.h> 24#include <linux/freezer.h>
25#include <linux/ftrace.h>
26 25
27#include "power.h" 26#include "power.h"
28 27
@@ -257,7 +256,7 @@ static int create_image(int platform_mode)
257 256
258int hibernation_snapshot(int platform_mode) 257int hibernation_snapshot(int platform_mode)
259{ 258{
260 int error, ftrace_save; 259 int error;
261 260
262 /* Free memory before shutting down devices. */ 261 /* Free memory before shutting down devices. */
263 error = swsusp_shrink_memory(); 262 error = swsusp_shrink_memory();
@@ -269,7 +268,6 @@ int hibernation_snapshot(int platform_mode)
269 goto Close; 268 goto Close;
270 269
271 suspend_console(); 270 suspend_console();
272 ftrace_save = __ftrace_enabled_save();
273 error = device_suspend(PMSG_FREEZE); 271 error = device_suspend(PMSG_FREEZE);
274 if (error) 272 if (error)
275 goto Recover_platform; 273 goto Recover_platform;
@@ -299,7 +297,6 @@ int hibernation_snapshot(int platform_mode)
299 Resume_devices: 297 Resume_devices:
300 device_resume(in_suspend ? 298 device_resume(in_suspend ?
301 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 299 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
302 __ftrace_enabled_restore(ftrace_save);
303 resume_console(); 300 resume_console();
304 Close: 301 Close:
305 platform_end(platform_mode); 302 platform_end(platform_mode);
@@ -370,11 +367,10 @@ static int resume_target_kernel(void)
370 367
371int hibernation_restore(int platform_mode) 368int hibernation_restore(int platform_mode)
372{ 369{
373 int error, ftrace_save; 370 int error;
374 371
375 pm_prepare_console(); 372 pm_prepare_console();
376 suspend_console(); 373 suspend_console();
377 ftrace_save = __ftrace_enabled_save();
378 error = device_suspend(PMSG_QUIESCE); 374 error = device_suspend(PMSG_QUIESCE);
379 if (error) 375 if (error)
380 goto Finish; 376 goto Finish;
@@ -389,7 +385,6 @@ int hibernation_restore(int platform_mode)
389 platform_restore_cleanup(platform_mode); 385 platform_restore_cleanup(platform_mode);
390 device_resume(PMSG_RECOVER); 386 device_resume(PMSG_RECOVER);
391 Finish: 387 Finish:
392 __ftrace_enabled_restore(ftrace_save);
393 resume_console(); 388 resume_console();
394 pm_restore_console(); 389 pm_restore_console();
395 return error; 390 return error;
@@ -402,7 +397,7 @@ int hibernation_restore(int platform_mode)
402 397
403int hibernation_platform_enter(void) 398int hibernation_platform_enter(void)
404{ 399{
405 int error, ftrace_save; 400 int error;
406 401
407 if (!hibernation_ops) 402 if (!hibernation_ops)
408 return -ENOSYS; 403 return -ENOSYS;
@@ -417,7 +412,6 @@ int hibernation_platform_enter(void)
417 goto Close; 412 goto Close;
418 413
419 suspend_console(); 414 suspend_console();
420 ftrace_save = __ftrace_enabled_save();
421 error = device_suspend(PMSG_HIBERNATE); 415 error = device_suspend(PMSG_HIBERNATE);
422 if (error) { 416 if (error) {
423 if (hibernation_ops->recover) 417 if (hibernation_ops->recover)
@@ -452,7 +446,6 @@ int hibernation_platform_enter(void)
452 hibernation_ops->finish(); 446 hibernation_ops->finish();
453 Resume_devices: 447 Resume_devices:
454 device_resume(PMSG_RESTORE); 448 device_resume(PMSG_RESTORE);
455 __ftrace_enabled_restore(ftrace_save);
456 resume_console(); 449 resume_console();
457 Close: 450 Close:
458 hibernation_ops->end(); 451 hibernation_ops->end();
diff --git a/kernel/power/main.c b/kernel/power/main.c
index b8f7ce9473e8..613f16941b85 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -22,7 +22,6 @@
22#include <linux/freezer.h> 22#include <linux/freezer.h>
23#include <linux/vmstat.h> 23#include <linux/vmstat.h>
24#include <linux/syscalls.h> 24#include <linux/syscalls.h>
25#include <linux/ftrace.h>
26 25
27#include "power.h" 26#include "power.h"
28 27
@@ -317,7 +316,7 @@ static int suspend_enter(suspend_state_t state)
317 */ 316 */
318int suspend_devices_and_enter(suspend_state_t state) 317int suspend_devices_and_enter(suspend_state_t state)
319{ 318{
320 int error, ftrace_save; 319 int error;
321 320
322 if (!suspend_ops) 321 if (!suspend_ops)
323 return -ENOSYS; 322 return -ENOSYS;
@@ -328,7 +327,6 @@ int suspend_devices_and_enter(suspend_state_t state)
328 goto Close; 327 goto Close;
329 } 328 }
330 suspend_console(); 329 suspend_console();
331 ftrace_save = __ftrace_enabled_save();
332 suspend_test_start(); 330 suspend_test_start();
333 error = device_suspend(PMSG_SUSPEND); 331 error = device_suspend(PMSG_SUSPEND);
334 if (error) { 332 if (error) {
@@ -360,7 +358,6 @@ int suspend_devices_and_enter(suspend_state_t state)
360 suspend_test_start(); 358 suspend_test_start();
361 device_resume(PMSG_RESUME); 359 device_resume(PMSG_RESUME);
362 suspend_test_finish("resume devices"); 360 suspend_test_finish("resume devices");
363 __ftrace_enabled_restore(ftrace_save);
364 resume_console(); 361 resume_console();
365 Close: 362 Close:
366 if (suspend_ops->end) 363 if (suspend_ops->end)
diff --git a/kernel/profile.c b/kernel/profile.c
index dc41827fbfee..60adefb59b5e 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -544,7 +544,7 @@ static const struct file_operations proc_profile_operations = {
544}; 544};
545 545
546#ifdef CONFIG_SMP 546#ifdef CONFIG_SMP
547static inline void profile_nop(void *unused) 547static void profile_nop(void *unused)
548{ 548{
549} 549}
550 550
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index 37f72e551542..c03ca3e61919 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -191,7 +191,7 @@ static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
191 191
192 /* OK, time to rat on our buddy... */ 192 /* OK, time to rat on our buddy... */
193 193
194 printk(KERN_ERR "RCU detected CPU stalls:"); 194 printk(KERN_ERR "INFO: RCU detected CPU stalls:");
195 for_each_possible_cpu(cpu) { 195 for_each_possible_cpu(cpu) {
196 if (cpu_isset(cpu, rcp->cpumask)) 196 if (cpu_isset(cpu, rcp->cpumask))
197 printk(" %d", cpu); 197 printk(" %d", cpu);
@@ -204,7 +204,7 @@ static void print_cpu_stall(struct rcu_ctrlblk *rcp)
204{ 204{
205 unsigned long flags; 205 unsigned long flags;
206 206
207 printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n", 207 printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu/%lu jiffies)\n",
208 smp_processor_id(), jiffies, 208 smp_processor_id(), jiffies,
209 jiffies - rcp->gp_start); 209 jiffies - rcp->gp_start);
210 dump_stack(); 210 dump_stack();
@@ -393,7 +393,7 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp)
393 * unnecessarily. 393 * unnecessarily.
394 */ 394 */
395 smp_mb(); 395 smp_mb();
396 cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask); 396 cpumask_andnot(&rcp->cpumask, cpu_online_mask, nohz_cpu_mask);
397 397
398 rcp->signaled = 0; 398 rcp->signaled = 0;
399 } 399 }
diff --git a/kernel/sched.c b/kernel/sched.c
index b7480fb5c3dc..8050a61a7adb 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -118,6 +118,12 @@
118 */ 118 */
119#define RUNTIME_INF ((u64)~0ULL) 119#define RUNTIME_INF ((u64)~0ULL)
120 120
121DEFINE_TRACE(sched_wait_task);
122DEFINE_TRACE(sched_wakeup);
123DEFINE_TRACE(sched_wakeup_new);
124DEFINE_TRACE(sched_switch);
125DEFINE_TRACE(sched_migrate_task);
126
121#ifdef CONFIG_SMP 127#ifdef CONFIG_SMP
122/* 128/*
123 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) 129 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
@@ -481,14 +487,14 @@ struct rt_rq {
481 */ 487 */
482struct root_domain { 488struct root_domain {
483 atomic_t refcount; 489 atomic_t refcount;
484 cpumask_t span; 490 cpumask_var_t span;
485 cpumask_t online; 491 cpumask_var_t online;
486 492
487 /* 493 /*
488 * The "RT overload" flag: it gets set if a CPU has more than 494 * The "RT overload" flag: it gets set if a CPU has more than
489 * one runnable RT task. 495 * one runnable RT task.
490 */ 496 */
491 cpumask_t rto_mask; 497 cpumask_var_t rto_mask;
492 atomic_t rto_count; 498 atomic_t rto_count;
493#ifdef CONFIG_SMP 499#ifdef CONFIG_SMP
494 struct cpupri cpupri; 500 struct cpupri cpupri;
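
Turning the root_domain masks into cpumask_var_t means that with CONFIG_CPUMASK_OFFSTACK the bitmaps are allocated separately instead of being embedded (and copied by value) in the structure; with that option off, cpumask_var_t falls back to a plain cpumask_t and the alloc/free helpers become no-ops. The matching init/teardown is outside this excerpt, so the sketch below shows the pattern rather than the actual init_rootdomain() change:

static int example_alloc_rd_masks(struct root_domain *rd)
{
	if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
		return -ENOMEM;
	if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
		goto free_span;
	if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
		goto free_online;
	cpumask_clear(rd->rto_mask);
	return 0;

free_online:
	free_cpumask_var(rd->online);
free_span:
	free_cpumask_var(rd->span);
	return -ENOMEM;
}
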
@@ -703,45 +709,18 @@ static __read_mostly char *sched_feat_names[] = {
703 709
704#undef SCHED_FEAT 710#undef SCHED_FEAT
705 711
706static int sched_feat_open(struct inode *inode, struct file *filp) 712static int sched_feat_show(struct seq_file *m, void *v)
707{ 713{
708 filp->private_data = inode->i_private;
709 return 0;
710}
711
712static ssize_t
713sched_feat_read(struct file *filp, char __user *ubuf,
714 size_t cnt, loff_t *ppos)
715{
716 char *buf;
717 int r = 0;
718 int len = 0;
719 int i; 714 int i;
720 715
721 for (i = 0; sched_feat_names[i]; i++) { 716 for (i = 0; sched_feat_names[i]; i++) {
722 len += strlen(sched_feat_names[i]); 717 if (!(sysctl_sched_features & (1UL << i)))
723 len += 4; 718 seq_puts(m, "NO_");
719 seq_printf(m, "%s ", sched_feat_names[i]);
724 } 720 }
721 seq_puts(m, "\n");
725 722
726 buf = kmalloc(len + 2, GFP_KERNEL); 723 return 0;
727 if (!buf)
728 return -ENOMEM;
729
730 for (i = 0; sched_feat_names[i]; i++) {
731 if (sysctl_sched_features & (1UL << i))
732 r += sprintf(buf + r, "%s ", sched_feat_names[i]);
733 else
734 r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]);
735 }
736
737 r += sprintf(buf + r, "\n");
738 WARN_ON(r >= len + 2);
739
740 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
741
742 kfree(buf);
743
744 return r;
745} 724}
746 725
747static ssize_t 726static ssize_t
@@ -786,10 +765,17 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
786 return cnt; 765 return cnt;
787} 766}
788 767
768static int sched_feat_open(struct inode *inode, struct file *filp)
769{
770 return single_open(filp, sched_feat_show, NULL);
771}
772
789static struct file_operations sched_feat_fops = { 773static struct file_operations sched_feat_fops = {
790 .open = sched_feat_open, 774 .open = sched_feat_open,
791 .read = sched_feat_read, 775 .write = sched_feat_write,
792 .write = sched_feat_write, 776 .read = seq_read,
777 .llseek = seq_lseek,
778 .release = single_release,
793}; 779};
794 780
795static __init int sched_init_debug(void) 781static __init int sched_init_debug(void)
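
The hand-rolled sched_feat_read() (size the buffer, kmalloc, sprintf, simple_read_from_buffer, kfree) collapses into the seq_file single_open() idiom: the show callback only emits text, while seq_read/seq_lseek/single_release handle buffering and positioning. The minimal shape of that idiom, with an illustrative show body:

static int example_show(struct seq_file *m, void *v)
{
	seq_printf(m, "%s %s\n", "FEATURE_A", "NO_FEATURE_B");	/* just emit lines */
	return 0;
}

static int example_open(struct inode *inode, struct file *filp)
{
	return single_open(filp, example_show, NULL);
}

static struct file_operations example_fops = {
	.open		= example_open,
	.read		= seq_read,		/* generic seq_file plumbing */
	.llseek		= seq_lseek,
	.release	= single_release,
	/* a custom .write, like sched_feat_write above, can still be plugged in */
};
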
@@ -1474,27 +1460,13 @@ static void
1474update_group_shares_cpu(struct task_group *tg, int cpu, 1460update_group_shares_cpu(struct task_group *tg, int cpu,
1475 unsigned long sd_shares, unsigned long sd_rq_weight) 1461 unsigned long sd_shares, unsigned long sd_rq_weight)
1476{ 1462{
1477 int boost = 0;
1478 unsigned long shares; 1463 unsigned long shares;
1479 unsigned long rq_weight; 1464 unsigned long rq_weight;
1480 1465
1481 if (!tg->se[cpu]) 1466 if (!tg->se[cpu])
1482 return; 1467 return;
1483 1468
1484 rq_weight = tg->cfs_rq[cpu]->load.weight; 1469 rq_weight = tg->cfs_rq[cpu]->rq_weight;
1485
1486 /*
1487 * If there are currently no tasks on the cpu pretend there is one of
1488 * average load so that when a new task gets to run here it will not
1489 * get delayed by group starvation.
1490 */
1491 if (!rq_weight) {
1492 boost = 1;
1493 rq_weight = NICE_0_LOAD;
1494 }
1495
1496 if (unlikely(rq_weight > sd_rq_weight))
1497 rq_weight = sd_rq_weight;
1498 1470
1499 /* 1471 /*
1500 * \Sum shares * rq_weight 1472 * \Sum shares * rq_weight
@@ -1502,7 +1474,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
1502 * \Sum rq_weight 1474 * \Sum rq_weight
1503 * 1475 *
1504 */ 1476 */
1505 shares = (sd_shares * rq_weight) / (sd_rq_weight + 1); 1477 shares = (sd_shares * rq_weight) / sd_rq_weight;
1506 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); 1478 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1507 1479
1508 if (abs(shares - tg->se[cpu]->load.weight) > 1480 if (abs(shares - tg->se[cpu]->load.weight) >
@@ -1511,11 +1483,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
1511 unsigned long flags; 1483 unsigned long flags;
1512 1484
1513 spin_lock_irqsave(&rq->lock, flags); 1485 spin_lock_irqsave(&rq->lock, flags);
1514 /* 1486 tg->cfs_rq[cpu]->shares = shares;
1515 * record the actual number of shares, not the boosted amount.
1516 */
1517 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1518 tg->cfs_rq[cpu]->rq_weight = rq_weight;
1519 1487
1520 __set_se_shares(tg->se[cpu], shares); 1488 __set_se_shares(tg->se[cpu], shares);
1521 spin_unlock_irqrestore(&rq->lock, flags); 1489 spin_unlock_irqrestore(&rq->lock, flags);
@@ -1529,13 +1497,23 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
1529 */ 1497 */
1530static int tg_shares_up(struct task_group *tg, void *data) 1498static int tg_shares_up(struct task_group *tg, void *data)
1531{ 1499{
1532 unsigned long rq_weight = 0; 1500 unsigned long weight, rq_weight = 0;
1533 unsigned long shares = 0; 1501 unsigned long shares = 0;
1534 struct sched_domain *sd = data; 1502 struct sched_domain *sd = data;
1535 int i; 1503 int i;
1536 1504
1537 for_each_cpu_mask(i, sd->span) { 1505 for_each_cpu(i, sched_domain_span(sd)) {
1538 rq_weight += tg->cfs_rq[i]->load.weight; 1506 /*
1507 * If there are currently no tasks on the cpu pretend there
1508 * is one of average load so that when a new task gets to
1509 * run here it will not get delayed by group starvation.
1510 */
1511 weight = tg->cfs_rq[i]->load.weight;
1512 if (!weight)
1513 weight = NICE_0_LOAD;
1514
1515 tg->cfs_rq[i]->rq_weight = weight;
1516 rq_weight += weight;
1539 shares += tg->cfs_rq[i]->shares; 1517 shares += tg->cfs_rq[i]->shares;
1540 } 1518 }
1541 1519
@@ -1545,10 +1523,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
1545 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) 1523 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1546 shares = tg->shares; 1524 shares = tg->shares;
1547 1525
1548 if (!rq_weight) 1526 for_each_cpu(i, sched_domain_span(sd))
1549 rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
1550
1551 for_each_cpu_mask(i, sd->span)
1552 update_group_shares_cpu(tg, i, shares, rq_weight); 1527 update_group_shares_cpu(tg, i, shares, rq_weight);
1553 1528
1554 return 0; 1529 return 0;
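
The group-shares math loses its per-cpu boost special case: tg_shares_up() now substitutes NICE_0_LOAD for CPUs with no runnable tasks, caches the result in cfs_rq->rq_weight, and update_group_shares_cpu() computes the plain proportional split shares = sd_shares * rq_weight / sd_rq_weight, clamped between MIN_SHARES and MAX_SHARES. A standalone model of that arithmetic; the constants are illustrative stand-ins for the kernel's values:

#include <stdio.h>

#define NICE_0_LOAD	1024UL
#define MIN_SHARES	2UL
#define MAX_SHARES	(1UL << 18)

static unsigned long clampul(unsigned long v, unsigned long lo, unsigned long hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
	/* per-cpu cfs_rq load in this sched domain; 0 == no runnable tasks */
	unsigned long load[] = { 3072, 0, 1024, 0 };
	unsigned long rq_weight[4], sd_rq_weight = 0, sd_shares = 2048;
	int i;

	/* tg_shares_up(): pretend idle CPUs carry one task of average load */
	for (i = 0; i < 4; i++) {
		rq_weight[i] = load[i] ? load[i] : NICE_0_LOAD;
		sd_rq_weight += rq_weight[i];
	}

	/* update_group_shares_cpu(): proportional split, then clamp */
	for (i = 0; i < 4; i++) {
		unsigned long shares = sd_shares * rq_weight[i] / sd_rq_weight;

		shares = clampul(shares, MIN_SHARES, MAX_SHARES);
		printf("cpu%d: rq_weight=%lu shares=%lu\n", i, rq_weight[i], shares);
	}
	return 0;
}
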
@@ -2079,15 +2054,17 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
2079 int i; 2054 int i;
2080 2055
2081 /* Skip over this group if it has no CPUs allowed */ 2056 /* Skip over this group if it has no CPUs allowed */
2082 if (!cpus_intersects(group->cpumask, p->cpus_allowed)) 2057 if (!cpumask_intersects(sched_group_cpus(group),
2058 &p->cpus_allowed))
2083 continue; 2059 continue;
2084 2060
2085 local_group = cpu_isset(this_cpu, group->cpumask); 2061 local_group = cpumask_test_cpu(this_cpu,
2062 sched_group_cpus(group));
2086 2063
2087 /* Tally up the load of all CPUs in the group */ 2064 /* Tally up the load of all CPUs in the group */
2088 avg_load = 0; 2065 avg_load = 0;
2089 2066
2090 for_each_cpu_mask_nr(i, group->cpumask) { 2067 for_each_cpu(i, sched_group_cpus(group)) {
2091 /* Bias balancing toward cpus of our domain */ 2068 /* Bias balancing toward cpus of our domain */
2092 if (local_group) 2069 if (local_group)
2093 load = source_load(i, load_idx); 2070 load = source_load(i, load_idx);
@@ -2119,17 +2096,14 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
2119 * find_idlest_cpu - find the idlest cpu among the cpus in group. 2096 * find_idlest_cpu - find the idlest cpu among the cpus in group.
2120 */ 2097 */
2121static int 2098static int
2122find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu, 2099find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
2123 cpumask_t *tmp)
2124{ 2100{
2125 unsigned long load, min_load = ULONG_MAX; 2101 unsigned long load, min_load = ULONG_MAX;
2126 int idlest = -1; 2102 int idlest = -1;
2127 int i; 2103 int i;
2128 2104
2129 /* Traverse only the allowed CPUs */ 2105 /* Traverse only the allowed CPUs */
2130 cpus_and(*tmp, group->cpumask, p->cpus_allowed); 2106 for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
2131
2132 for_each_cpu_mask_nr(i, *tmp) {
2133 load = weighted_cpuload(i); 2107 load = weighted_cpuload(i);
2134 2108
2135 if (load < min_load || (load == min_load && i == this_cpu)) { 2109 if (load < min_load || (load == min_load && i == this_cpu)) {
@@ -2171,7 +2145,6 @@ static int sched_balance_self(int cpu, int flag)
2171 update_shares(sd); 2145 update_shares(sd);
2172 2146
2173 while (sd) { 2147 while (sd) {
2174 cpumask_t span, tmpmask;
2175 struct sched_group *group; 2148 struct sched_group *group;
2176 int new_cpu, weight; 2149 int new_cpu, weight;
2177 2150
@@ -2180,14 +2153,13 @@ static int sched_balance_self(int cpu, int flag)
2180 continue; 2153 continue;
2181 } 2154 }
2182 2155
2183 span = sd->span;
2184 group = find_idlest_group(sd, t, cpu); 2156 group = find_idlest_group(sd, t, cpu);
2185 if (!group) { 2157 if (!group) {
2186 sd = sd->child; 2158 sd = sd->child;
2187 continue; 2159 continue;
2188 } 2160 }
2189 2161
2190 new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask); 2162 new_cpu = find_idlest_cpu(group, t, cpu);
2191 if (new_cpu == -1 || new_cpu == cpu) { 2163 if (new_cpu == -1 || new_cpu == cpu) {
2192 /* Now try balancing at a lower domain level of cpu */ 2164 /* Now try balancing at a lower domain level of cpu */
2193 sd = sd->child; 2165 sd = sd->child;
@@ -2196,10 +2168,10 @@ static int sched_balance_self(int cpu, int flag)
2196 2168
2197 /* Now try balancing at a lower domain level of new_cpu */ 2169 /* Now try balancing at a lower domain level of new_cpu */
2198 cpu = new_cpu; 2170 cpu = new_cpu;
2171 weight = cpumask_weight(sched_domain_span(sd));
2199 sd = NULL; 2172 sd = NULL;
2200 weight = cpus_weight(span);
2201 for_each_domain(cpu, tmp) { 2173 for_each_domain(cpu, tmp) {
2202 if (weight <= cpus_weight(tmp->span)) 2174 if (weight <= cpumask_weight(sched_domain_span(tmp)))
2203 break; 2175 break;
2204 if (tmp->flags & flag) 2176 if (tmp->flags & flag)
2205 sd = tmp; 2177 sd = tmp;
@@ -2244,7 +2216,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2244 cpu = task_cpu(p); 2216 cpu = task_cpu(p);
2245 2217
2246 for_each_domain(this_cpu, sd) { 2218 for_each_domain(this_cpu, sd) {
2247 if (cpu_isset(cpu, sd->span)) { 2219 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2248 update_shares(sd); 2220 update_shares(sd);
2249 break; 2221 break;
2250 } 2222 }
@@ -2292,7 +2264,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2292 else { 2264 else {
2293 struct sched_domain *sd; 2265 struct sched_domain *sd;
2294 for_each_domain(this_cpu, sd) { 2266 for_each_domain(this_cpu, sd) {
2295 if (cpu_isset(cpu, sd->span)) { 2267 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2296 schedstat_inc(sd, ttwu_wake_remote); 2268 schedstat_inc(sd, ttwu_wake_remote);
2297 break; 2269 break;
2298 } 2270 }
@@ -2838,7 +2810,7 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
2838 return ret; 2810 return ret;
2839} 2811}
2840 2812
2841static void double_unlock_balance(struct rq *this_rq, struct rq *busiest) 2813static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
2842 __releases(busiest->lock) 2814 __releases(busiest->lock)
2843{ 2815{
2844 spin_unlock(&busiest->lock); 2816 spin_unlock(&busiest->lock);
@@ -2858,7 +2830,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
2858 struct rq *rq; 2830 struct rq *rq;
2859 2831
2860 rq = task_rq_lock(p, &flags); 2832 rq = task_rq_lock(p, &flags);
2861 if (!cpu_isset(dest_cpu, p->cpus_allowed) 2833 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
2862 || unlikely(!cpu_active(dest_cpu))) 2834 || unlikely(!cpu_active(dest_cpu)))
2863 goto out; 2835 goto out;
2864 2836
@@ -2924,7 +2896,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2924 * 2) cannot be migrated to this CPU due to cpus_allowed, or 2896 * 2) cannot be migrated to this CPU due to cpus_allowed, or
2925 * 3) are cache-hot on their current CPU. 2897 * 3) are cache-hot on their current CPU.
2926 */ 2898 */
2927 if (!cpu_isset(this_cpu, p->cpus_allowed)) { 2899 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
2928 schedstat_inc(p, se.nr_failed_migrations_affine); 2900 schedstat_inc(p, se.nr_failed_migrations_affine);
2929 return 0; 2901 return 0;
2930 } 2902 }
@@ -3099,7 +3071,7 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3099static struct sched_group * 3071static struct sched_group *
3100find_busiest_group(struct sched_domain *sd, int this_cpu, 3072find_busiest_group(struct sched_domain *sd, int this_cpu,
3101 unsigned long *imbalance, enum cpu_idle_type idle, 3073 unsigned long *imbalance, enum cpu_idle_type idle,
3102 int *sd_idle, const cpumask_t *cpus, int *balance) 3074 int *sd_idle, const struct cpumask *cpus, int *balance)
3103{ 3075{
3104 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 3076 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
3105 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 3077 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -3135,10 +3107,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3135 unsigned long sum_avg_load_per_task; 3107 unsigned long sum_avg_load_per_task;
3136 unsigned long avg_load_per_task; 3108 unsigned long avg_load_per_task;
3137 3109
3138 local_group = cpu_isset(this_cpu, group->cpumask); 3110 local_group = cpumask_test_cpu(this_cpu,
3111 sched_group_cpus(group));
3139 3112
3140 if (local_group) 3113 if (local_group)
3141 balance_cpu = first_cpu(group->cpumask); 3114 balance_cpu = cpumask_first(sched_group_cpus(group));
3142 3115
3143 /* Tally up the load of all CPUs in the group */ 3116 /* Tally up the load of all CPUs in the group */
3144 sum_weighted_load = sum_nr_running = avg_load = 0; 3117 sum_weighted_load = sum_nr_running = avg_load = 0;
@@ -3147,13 +3120,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3147 max_cpu_load = 0; 3120 max_cpu_load = 0;
3148 min_cpu_load = ~0UL; 3121 min_cpu_load = ~0UL;
3149 3122
3150 for_each_cpu_mask_nr(i, group->cpumask) { 3123 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3151 struct rq *rq; 3124 struct rq *rq = cpu_rq(i);
3152
3153 if (!cpu_isset(i, *cpus))
3154 continue;
3155
3156 rq = cpu_rq(i);
3157 3125
3158 if (*sd_idle && rq->nr_running) 3126 if (*sd_idle && rq->nr_running)
3159 *sd_idle = 0; 3127 *sd_idle = 0;
@@ -3264,8 +3232,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3264 */ 3232 */
3265 if ((sum_nr_running < min_nr_running) || 3233 if ((sum_nr_running < min_nr_running) ||
3266 (sum_nr_running == min_nr_running && 3234 (sum_nr_running == min_nr_running &&
3267 first_cpu(group->cpumask) < 3235 cpumask_first(sched_group_cpus(group)) <
3268 first_cpu(group_min->cpumask))) { 3236 cpumask_first(sched_group_cpus(group_min)))) {
3269 group_min = group; 3237 group_min = group;
3270 min_nr_running = sum_nr_running; 3238 min_nr_running = sum_nr_running;
3271 min_load_per_task = sum_weighted_load / 3239 min_load_per_task = sum_weighted_load /
@@ -3280,8 +3248,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3280 if (sum_nr_running <= group_capacity - 1) { 3248 if (sum_nr_running <= group_capacity - 1) {
3281 if (sum_nr_running > leader_nr_running || 3249 if (sum_nr_running > leader_nr_running ||
3282 (sum_nr_running == leader_nr_running && 3250 (sum_nr_running == leader_nr_running &&
3283 first_cpu(group->cpumask) > 3251 cpumask_first(sched_group_cpus(group)) >
3284 first_cpu(group_leader->cpumask))) { 3252 cpumask_first(sched_group_cpus(group_leader)))) {
3285 group_leader = group; 3253 group_leader = group;
3286 leader_nr_running = sum_nr_running; 3254 leader_nr_running = sum_nr_running;
3287 } 3255 }
@@ -3420,16 +3388,16 @@ ret:
3420 */ 3388 */
3421static struct rq * 3389static struct rq *
3422find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, 3390find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3423 unsigned long imbalance, const cpumask_t *cpus) 3391 unsigned long imbalance, const struct cpumask *cpus)
3424{ 3392{
3425 struct rq *busiest = NULL, *rq; 3393 struct rq *busiest = NULL, *rq;
3426 unsigned long max_load = 0; 3394 unsigned long max_load = 0;
3427 int i; 3395 int i;
3428 3396
3429 for_each_cpu_mask_nr(i, group->cpumask) { 3397 for_each_cpu(i, sched_group_cpus(group)) {
3430 unsigned long wl; 3398 unsigned long wl;
3431 3399
3432 if (!cpu_isset(i, *cpus)) 3400 if (!cpumask_test_cpu(i, cpus))
3433 continue; 3401 continue;
3434 3402
3435 rq = cpu_rq(i); 3403 rq = cpu_rq(i);
@@ -3459,7 +3427,7 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3459 */ 3427 */
3460static int load_balance(int this_cpu, struct rq *this_rq, 3428static int load_balance(int this_cpu, struct rq *this_rq,
3461 struct sched_domain *sd, enum cpu_idle_type idle, 3429 struct sched_domain *sd, enum cpu_idle_type idle,
3462 int *balance, cpumask_t *cpus) 3430 int *balance, struct cpumask *cpus)
3463{ 3431{
3464 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 3432 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
3465 struct sched_group *group; 3433 struct sched_group *group;
@@ -3467,7 +3435,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
3467 struct rq *busiest; 3435 struct rq *busiest;
3468 unsigned long flags; 3436 unsigned long flags;
3469 3437
3470 cpus_setall(*cpus); 3438 cpumask_setall(cpus);
3471 3439
3472 /* 3440 /*
3473 * When power savings policy is enabled for the parent domain, idle 3441 * When power savings policy is enabled for the parent domain, idle
@@ -3527,8 +3495,8 @@ redo:
3527 3495
3528 /* All tasks on this runqueue were pinned by CPU affinity */ 3496 /* All tasks on this runqueue were pinned by CPU affinity */
3529 if (unlikely(all_pinned)) { 3497 if (unlikely(all_pinned)) {
3530 cpu_clear(cpu_of(busiest), *cpus); 3498 cpumask_clear_cpu(cpu_of(busiest), cpus);
3531 if (!cpus_empty(*cpus)) 3499 if (!cpumask_empty(cpus))
3532 goto redo; 3500 goto redo;
3533 goto out_balanced; 3501 goto out_balanced;
3534 } 3502 }
@@ -3545,7 +3513,8 @@ redo:
3545 /* don't kick the migration_thread, if the curr 3513 /* don't kick the migration_thread, if the curr
3546 * task on busiest cpu can't be moved to this_cpu 3514 * task on busiest cpu can't be moved to this_cpu
3547 */ 3515 */
3548 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { 3516 if (!cpumask_test_cpu(this_cpu,
3517 &busiest->curr->cpus_allowed)) {
3549 spin_unlock_irqrestore(&busiest->lock, flags); 3518 spin_unlock_irqrestore(&busiest->lock, flags);
3550 all_pinned = 1; 3519 all_pinned = 1;
3551 goto out_one_pinned; 3520 goto out_one_pinned;
@@ -3620,7 +3589,7 @@ out:
3620 */ 3589 */
3621static int 3590static int
3622load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, 3591load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
3623 cpumask_t *cpus) 3592 struct cpumask *cpus)
3624{ 3593{
3625 struct sched_group *group; 3594 struct sched_group *group;
3626 struct rq *busiest = NULL; 3595 struct rq *busiest = NULL;
@@ -3629,7 +3598,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
3629 int sd_idle = 0; 3598 int sd_idle = 0;
3630 int all_pinned = 0; 3599 int all_pinned = 0;
3631 3600
3632 cpus_setall(*cpus); 3601 cpumask_setall(cpus);
3633 3602
3634 /* 3603 /*
3635 * When power savings policy is enabled for the parent domain, idle 3604 * When power savings policy is enabled for the parent domain, idle
@@ -3673,8 +3642,8 @@ redo:
3673 double_unlock_balance(this_rq, busiest); 3642 double_unlock_balance(this_rq, busiest);
3674 3643
3675 if (unlikely(all_pinned)) { 3644 if (unlikely(all_pinned)) {
3676 cpu_clear(cpu_of(busiest), *cpus); 3645 cpumask_clear_cpu(cpu_of(busiest), cpus);
3677 if (!cpus_empty(*cpus)) 3646 if (!cpumask_empty(cpus))
3678 goto redo; 3647 goto redo;
3679 } 3648 }
3680 } 3649 }
@@ -3709,7 +3678,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3709 struct sched_domain *sd; 3678 struct sched_domain *sd;
3710 int pulled_task = -1; 3679 int pulled_task = -1;
3711 unsigned long next_balance = jiffies + HZ; 3680 unsigned long next_balance = jiffies + HZ;
3712 cpumask_t tmpmask; 3681 cpumask_var_t tmpmask;
3682
3683 if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC))
3684 return;
3713 3685
3714 for_each_domain(this_cpu, sd) { 3686 for_each_domain(this_cpu, sd) {
3715 unsigned long interval; 3687 unsigned long interval;
@@ -3720,7 +3692,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3720 if (sd->flags & SD_BALANCE_NEWIDLE) 3692 if (sd->flags & SD_BALANCE_NEWIDLE)
3721 /* If we've pulled tasks over stop searching: */ 3693 /* If we've pulled tasks over stop searching: */
3722 pulled_task = load_balance_newidle(this_cpu, this_rq, 3694 pulled_task = load_balance_newidle(this_cpu, this_rq,
3723 sd, &tmpmask); 3695 sd, tmpmask);
3724 3696
3725 interval = msecs_to_jiffies(sd->balance_interval); 3697 interval = msecs_to_jiffies(sd->balance_interval);
3726 if (time_after(next_balance, sd->last_balance + interval)) 3698 if (time_after(next_balance, sd->last_balance + interval))
@@ -3735,6 +3707,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3735 */ 3707 */
3736 this_rq->next_balance = next_balance; 3708 this_rq->next_balance = next_balance;
3737 } 3709 }
3710 free_cpumask_var(tmpmask);
3738} 3711}
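The hunk above converts idle_balance()'s on-stack cpumask_t scratch mask into a heap-allocated cpumask_var_t. A minimal sketch of the allocate/use/free discipline the new code relies on (kernel-context C, illustrative names, not part of this patch):

#include <linux/cpumask.h>
#include <linux/gfp.h>

/* Same shape as the converted idle_balance() scratch-mask handling. */
static void example_scratch_mask_user(void)
{
    cpumask_var_t tmp;

    /*
     * GFP_ATOMIC because this runs in scheduler context; on failure the
     * work is simply skipped, mirroring the early return in the hunk.
     */
    if (!alloc_cpumask_var(&tmp, GFP_ATOMIC))
        return;

    cpumask_copy(tmp, cpu_online_mask);  /* use as scratch space */
    /* ... the actual balancing pass would go here ... */

    free_cpumask_var(tmp);               /* always paired with the alloc */
}

When CONFIG_CPUMASK_OFFSTACK is not set, cpumask_var_t degenerates to an on-stack array and the alloc/free calls become no-ops, so small configurations pay nothing for this pattern.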
3739 3712
3740/* 3713/*
@@ -3772,7 +3745,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3772 /* Search for an sd spanning us and the target CPU. */ 3745 /* Search for an sd spanning us and the target CPU. */
3773 for_each_domain(target_cpu, sd) { 3746 for_each_domain(target_cpu, sd) {
3774 if ((sd->flags & SD_LOAD_BALANCE) && 3747 if ((sd->flags & SD_LOAD_BALANCE) &&
3775 cpu_isset(busiest_cpu, sd->span)) 3748 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
3776 break; 3749 break;
3777 } 3750 }
3778 3751
@@ -3791,10 +3764,9 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3791#ifdef CONFIG_NO_HZ 3764#ifdef CONFIG_NO_HZ
3792static struct { 3765static struct {
3793 atomic_t load_balancer; 3766 atomic_t load_balancer;
3794 cpumask_t cpu_mask; 3767 cpumask_var_t cpu_mask;
3795} nohz ____cacheline_aligned = { 3768} nohz ____cacheline_aligned = {
3796 .load_balancer = ATOMIC_INIT(-1), 3769 .load_balancer = ATOMIC_INIT(-1),
3797 .cpu_mask = CPU_MASK_NONE,
3798}; 3770};
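With cpu_mask now a cpumask_var_t, the static initializer .cpu_mask = CPU_MASK_NONE has to go and the mask needs an explicit allocation before first use (done elsewhere in the patch during scheduler init, not visible in this hunk). A sketch of the resulting shape, with made-up names:

#include <linux/cpumask.h>
#include <linux/init.h>

static struct {
    int load_balancer;       /* stand-in for the real atomic_t */
    cpumask_var_t cpu_mask;  /* storage must be allocated at init time */
} example_nohz;

/* Would run once during early scheduler setup (illustrative). */
static void __init example_nohz_init(void)
{
    /* Boot-time variant: no GFP flags, cannot fail. */
    alloc_bootmem_cpumask_var(&example_nohz.cpu_mask);
}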
3799 3771
3800/* 3772/*
@@ -3822,7 +3794,7 @@ int select_nohz_load_balancer(int stop_tick)
3822 int cpu = smp_processor_id(); 3794 int cpu = smp_processor_id();
3823 3795
3824 if (stop_tick) { 3796 if (stop_tick) {
3825 cpu_set(cpu, nohz.cpu_mask); 3797 cpumask_set_cpu(cpu, nohz.cpu_mask);
3826 cpu_rq(cpu)->in_nohz_recently = 1; 3798 cpu_rq(cpu)->in_nohz_recently = 1;
3827 3799
3828 /* 3800 /*
@@ -3836,7 +3808,7 @@ int select_nohz_load_balancer(int stop_tick)
3836 } 3808 }
3837 3809
3838 /* time for ilb owner also to sleep */ 3810 /* time for ilb owner also to sleep */
3839 if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) { 3811 if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
3840 if (atomic_read(&nohz.load_balancer) == cpu) 3812 if (atomic_read(&nohz.load_balancer) == cpu)
3841 atomic_set(&nohz.load_balancer, -1); 3813 atomic_set(&nohz.load_balancer, -1);
3842 return 0; 3814 return 0;
@@ -3849,10 +3821,10 @@ int select_nohz_load_balancer(int stop_tick)
3849 } else if (atomic_read(&nohz.load_balancer) == cpu) 3821 } else if (atomic_read(&nohz.load_balancer) == cpu)
3850 return 1; 3822 return 1;
3851 } else { 3823 } else {
3852 if (!cpu_isset(cpu, nohz.cpu_mask)) 3824 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
3853 return 0; 3825 return 0;
3854 3826
3855 cpu_clear(cpu, nohz.cpu_mask); 3827 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3856 3828
3857 if (atomic_read(&nohz.load_balancer) == cpu) 3829 if (atomic_read(&nohz.load_balancer) == cpu)
3858 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 3830 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
@@ -3880,7 +3852,11 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3880 unsigned long next_balance = jiffies + 60*HZ; 3852 unsigned long next_balance = jiffies + 60*HZ;
3881 int update_next_balance = 0; 3853 int update_next_balance = 0;
3882 int need_serialize; 3854 int need_serialize;
3883 cpumask_t tmp; 3855 cpumask_var_t tmp;
3856
3857 /* Fails alloc? Rebalancing probably not a priority right now. */
3858 if (!alloc_cpumask_var(&tmp, GFP_ATOMIC))
3859 return;
3884 3860
3885 for_each_domain(cpu, sd) { 3861 for_each_domain(cpu, sd) {
3886 if (!(sd->flags & SD_LOAD_BALANCE)) 3862 if (!(sd->flags & SD_LOAD_BALANCE))
@@ -3905,7 +3881,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3905 } 3881 }
3906 3882
3907 if (time_after_eq(jiffies, sd->last_balance + interval)) { 3883 if (time_after_eq(jiffies, sd->last_balance + interval)) {
3908 if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) { 3884 if (load_balance(cpu, rq, sd, idle, &balance, tmp)) {
3909 /* 3885 /*
3910 * We've pulled tasks over so either we're no 3886 * We've pulled tasks over so either we're no
3911 * longer idle, or one of our SMT siblings is 3887 * longer idle, or one of our SMT siblings is
@@ -3939,6 +3915,8 @@ out:
3939 */ 3915 */
3940 if (likely(update_next_balance)) 3916 if (likely(update_next_balance))
3941 rq->next_balance = next_balance; 3917 rq->next_balance = next_balance;
3918
3919 free_cpumask_var(tmp);
3942} 3920}
3943 3921
3944/* 3922/*
@@ -3963,12 +3941,13 @@ static void run_rebalance_domains(struct softirq_action *h)
3963 */ 3941 */
3964 if (this_rq->idle_at_tick && 3942 if (this_rq->idle_at_tick &&
3965 atomic_read(&nohz.load_balancer) == this_cpu) { 3943 atomic_read(&nohz.load_balancer) == this_cpu) {
3966 cpumask_t cpus = nohz.cpu_mask;
3967 struct rq *rq; 3944 struct rq *rq;
3968 int balance_cpu; 3945 int balance_cpu;
3969 3946
3970 cpu_clear(this_cpu, cpus); 3947 for_each_cpu(balance_cpu, nohz.cpu_mask) {
3971 for_each_cpu_mask_nr(balance_cpu, cpus) { 3948 if (balance_cpu == this_cpu)
3949 continue;
3950
3972 /* 3951 /*
3973 * If this cpu gets work to do, stop the load balancing 3952 * If this cpu gets work to do, stop the load balancing
3974 * work being done for other cpus. Next load 3953 * work being done for other cpus. Next load
@@ -4006,7 +3985,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
4006 rq->in_nohz_recently = 0; 3985 rq->in_nohz_recently = 0;
4007 3986
4008 if (atomic_read(&nohz.load_balancer) == cpu) { 3987 if (atomic_read(&nohz.load_balancer) == cpu) {
4009 cpu_clear(cpu, nohz.cpu_mask); 3988 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4010 atomic_set(&nohz.load_balancer, -1); 3989 atomic_set(&nohz.load_balancer, -1);
4011 } 3990 }
4012 3991
@@ -4019,7 +3998,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
4019 * TBD: Traverse the sched domains and nominate 3998 * TBD: Traverse the sched domains and nominate
4020 * the nearest cpu in the nohz.cpu_mask. 3999 * the nearest cpu in the nohz.cpu_mask.
4021 */ 4000 */
4022 int ilb = first_cpu(nohz.cpu_mask); 4001 int ilb = cpumask_first(nohz.cpu_mask);
4023 4002
4024 if (ilb < nr_cpu_ids) 4003 if (ilb < nr_cpu_ids)
4025 resched_cpu(ilb); 4004 resched_cpu(ilb);
@@ -4031,7 +4010,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
4031 * cpus with ticks stopped, is it time for that to stop? 4010 * cpus with ticks stopped, is it time for that to stop?
4032 */ 4011 */
4033 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && 4012 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
4034 cpus_weight(nohz.cpu_mask) == num_online_cpus()) { 4013 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
4035 resched_cpu(cpu); 4014 resched_cpu(cpu);
4036 return; 4015 return;
4037 } 4016 }
@@ -4041,7 +4020,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
4041 * someone else, then no need raise the SCHED_SOFTIRQ 4020 * someone else, then no need raise the SCHED_SOFTIRQ
4042 */ 4021 */
4043 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && 4022 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
4044 cpu_isset(cpu, nohz.cpu_mask)) 4023 cpumask_test_cpu(cpu, nohz.cpu_mask))
4045 return; 4024 return;
4046#endif 4025#endif
4047 if (time_after_eq(jiffies, rq->next_balance)) 4026 if (time_after_eq(jiffies, rq->next_balance))
@@ -4203,7 +4182,6 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
4203 4182
4204 if (p == rq->idle) { 4183 if (p == rq->idle) {
4205 p->stime = cputime_add(p->stime, steal); 4184 p->stime = cputime_add(p->stime, steal);
4206 account_group_system_time(p, steal);
4207 if (atomic_read(&rq->nr_iowait) > 0) 4185 if (atomic_read(&rq->nr_iowait) > 0)
4208 cpustat->iowait = cputime64_add(cpustat->iowait, tmp); 4186 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
4209 else 4187 else
@@ -4339,7 +4317,7 @@ void __kprobes sub_preempt_count(int val)
4339 /* 4317 /*
4340 * Underflow? 4318 * Underflow?
4341 */ 4319 */
4342 if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) 4320 if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked())))
4343 return; 4321 return;
4344 /* 4322 /*
4345 * Is the spinlock portion underflowing? 4323 * Is the spinlock portion underflowing?
@@ -5400,10 +5378,9 @@ out_unlock:
5400 return retval; 5378 return retval;
5401} 5379}
5402 5380
5403long sched_setaffinity(pid_t pid, const cpumask_t *in_mask) 5381long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
5404{ 5382{
5405 cpumask_t cpus_allowed; 5383 cpumask_var_t cpus_allowed, new_mask;
5406 cpumask_t new_mask = *in_mask;
5407 struct task_struct *p; 5384 struct task_struct *p;
5408 int retval; 5385 int retval;
5409 5386
@@ -5425,6 +5402,14 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
5425 get_task_struct(p); 5402 get_task_struct(p);
5426 read_unlock(&tasklist_lock); 5403 read_unlock(&tasklist_lock);
5427 5404
5405 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
5406 retval = -ENOMEM;
5407 goto out_put_task;
5408 }
5409 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
5410 retval = -ENOMEM;
5411 goto out_free_cpus_allowed;
5412 }
5428 retval = -EPERM; 5413 retval = -EPERM;
5429 if ((current->euid != p->euid) && (current->euid != p->uid) && 5414 if ((current->euid != p->euid) && (current->euid != p->uid) &&
5430 !capable(CAP_SYS_NICE)) 5415 !capable(CAP_SYS_NICE))
@@ -5434,37 +5419,41 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
5434 if (retval) 5419 if (retval)
5435 goto out_unlock; 5420 goto out_unlock;
5436 5421
5437 cpuset_cpus_allowed(p, &cpus_allowed); 5422 cpuset_cpus_allowed(p, cpus_allowed);
5438 cpus_and(new_mask, new_mask, cpus_allowed); 5423 cpumask_and(new_mask, in_mask, cpus_allowed);
5439 again: 5424 again:
5440 retval = set_cpus_allowed_ptr(p, &new_mask); 5425 retval = set_cpus_allowed_ptr(p, new_mask);
5441 5426
5442 if (!retval) { 5427 if (!retval) {
5443 cpuset_cpus_allowed(p, &cpus_allowed); 5428 cpuset_cpus_allowed(p, cpus_allowed);
5444 if (!cpus_subset(new_mask, cpus_allowed)) { 5429 if (!cpumask_subset(new_mask, cpus_allowed)) {
5445 /* 5430 /*
5446 * We must have raced with a concurrent cpuset 5431 * We must have raced with a concurrent cpuset
5447 * update. Just reset the cpus_allowed to the 5432 * update. Just reset the cpus_allowed to the
5448 * cpuset's cpus_allowed 5433 * cpuset's cpus_allowed
5449 */ 5434 */
5450 new_mask = cpus_allowed; 5435 cpumask_copy(new_mask, cpus_allowed);
5451 goto again; 5436 goto again;
5452 } 5437 }
5453 } 5438 }
5454out_unlock: 5439out_unlock:
5440 free_cpumask_var(new_mask);
5441out_free_cpus_allowed:
5442 free_cpumask_var(cpus_allowed);
5443out_put_task:
5455 put_task_struct(p); 5444 put_task_struct(p);
5456 put_online_cpus(); 5445 put_online_cpus();
5457 return retval; 5446 return retval;
5458} 5447}
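Because the two working masks are now allocated rather than stack-resident, sched_setaffinity() grows the usual goto-based unwind so every exit path frees exactly what was allocated. The error-handling shape, condensed (illustrative, not the patch's exact code):

#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/gfp.h>

static int example_two_mask_setup(void)
{
    cpumask_var_t cpus_allowed, new_mask;
    int retval;

    if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL))
        return -ENOMEM;
    if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
        retval = -ENOMEM;
        goto out_free_cpus_allowed;
    }

    /* ... permission checks and set_cpus_allowed_ptr() go here ... */
    retval = 0;

    free_cpumask_var(new_mask);
out_free_cpus_allowed:
    free_cpumask_var(cpus_allowed);
    return retval;
}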
5459 5448
5460static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, 5449static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
5461 cpumask_t *new_mask) 5450 struct cpumask *new_mask)
5462{ 5451{
5463 if (len < sizeof(cpumask_t)) { 5452 if (len < cpumask_size())
5464 memset(new_mask, 0, sizeof(cpumask_t)); 5453 cpumask_clear(new_mask);
5465 } else if (len > sizeof(cpumask_t)) { 5454 else if (len > cpumask_size())
5466 len = sizeof(cpumask_t); 5455 len = cpumask_size();
5467 } 5456
5468 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; 5457 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
5469} 5458}
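The helper now clamps the user-supplied length to cpumask_size(), the number of bytes actually backing a struct cpumask on this kernel, instead of sizeof(cpumask_t), and zeroes the mask first when the user buffer is shorter. The same clamping logic in isolation (illustrative):

#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/uaccess.h>

static int example_get_user_cpu_mask(unsigned long __user *user_mask_ptr,
                                     unsigned int len, struct cpumask *new_mask)
{
    if (len < cpumask_size())
        cpumask_clear(new_mask);   /* short buffer: zero the tail first */
    else if (len > cpumask_size())
        len = cpumask_size();      /* never write past our storage */

    return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
}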
5470 5459
@@ -5477,17 +5466,20 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
5477asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, 5466asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
5478 unsigned long __user *user_mask_ptr) 5467 unsigned long __user *user_mask_ptr)
5479{ 5468{
5480 cpumask_t new_mask; 5469 cpumask_var_t new_mask;
5481 int retval; 5470 int retval;
5482 5471
5483 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask); 5472 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
5484 if (retval) 5473 return -ENOMEM;
5485 return retval;
5486 5474
5487 return sched_setaffinity(pid, &new_mask); 5475 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
5476 if (retval == 0)
5477 retval = sched_setaffinity(pid, new_mask);
5478 free_cpumask_var(new_mask);
5479 return retval;
5488} 5480}
5489 5481
5490long sched_getaffinity(pid_t pid, cpumask_t *mask) 5482long sched_getaffinity(pid_t pid, struct cpumask *mask)
5491{ 5483{
5492 struct task_struct *p; 5484 struct task_struct *p;
5493 int retval; 5485 int retval;
@@ -5504,7 +5496,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
5504 if (retval) 5496 if (retval)
5505 goto out_unlock; 5497 goto out_unlock;
5506 5498
5507 cpus_and(*mask, p->cpus_allowed, cpu_online_map); 5499 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
5508 5500
5509out_unlock: 5501out_unlock:
5510 read_unlock(&tasklist_lock); 5502 read_unlock(&tasklist_lock);
@@ -5523,19 +5515,24 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
5523 unsigned long __user *user_mask_ptr) 5515 unsigned long __user *user_mask_ptr)
5524{ 5516{
5525 int ret; 5517 int ret;
5526 cpumask_t mask; 5518 cpumask_var_t mask;
5527 5519
5528 if (len < sizeof(cpumask_t)) 5520 if (len < cpumask_size())
5529 return -EINVAL; 5521 return -EINVAL;
5530 5522
5531 ret = sched_getaffinity(pid, &mask); 5523 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
5532 if (ret < 0) 5524 return -ENOMEM;
5533 return ret;
5534 5525
5535 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t))) 5526 ret = sched_getaffinity(pid, mask);
5536 return -EFAULT; 5527 if (ret == 0) {
5528 if (copy_to_user(user_mask_ptr, mask, cpumask_size()))
5529 ret = -EFAULT;
5530 else
5531 ret = cpumask_size();
5532 }
5533 free_cpumask_var(mask);
5537 5534
5538 return sizeof(cpumask_t); 5535 return ret;
5539} 5536}
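The syscall now copies cpumask_size() bytes out and returns that size on success instead of sizeof(cpumask_t), so userspace learns how large a mask this kernel actually uses. The copy-out path, reduced to its core (illustrative names):

#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/uaccess.h>

static long example_copy_mask_to_user(unsigned long __user *user_mask_ptr,
                                      const struct cpumask *mask)
{
    if (copy_to_user(user_mask_ptr, mask, cpumask_size()))
        return -EFAULT;
    return cpumask_size();   /* report the real mask size to userspace */
}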
5540 5537
5541/** 5538/**
@@ -5877,7 +5874,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5877 idle->se.exec_start = sched_clock(); 5874 idle->se.exec_start = sched_clock();
5878 5875
5879 idle->prio = idle->normal_prio = MAX_PRIO; 5876 idle->prio = idle->normal_prio = MAX_PRIO;
5880 idle->cpus_allowed = cpumask_of_cpu(cpu); 5877 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
5881 __set_task_cpu(idle, cpu); 5878 __set_task_cpu(idle, cpu);
5882 5879
5883 rq->curr = rq->idle = idle; 5880 rq->curr = rq->idle = idle;
@@ -5896,6 +5893,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5896 * The idle tasks have their own, simple scheduling class: 5893 * The idle tasks have their own, simple scheduling class:
5897 */ 5894 */
5898 idle->sched_class = &idle_sched_class; 5895 idle->sched_class = &idle_sched_class;
5896 ftrace_retfunc_init_task(idle);
5899} 5897}
5900 5898
5901/* 5899/*
@@ -5903,9 +5901,9 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5903 * indicates which cpus entered this state. This is used 5901 * indicates which cpus entered this state. This is used
5904 * in the rcu update to wait only for active cpus. For system 5902 * in the rcu update to wait only for active cpus. For system
5905 * which do not switch off the HZ timer nohz_cpu_mask should 5903 * which do not switch off the HZ timer nohz_cpu_mask should
5906 * always be CPU_MASK_NONE. 5904 * always be CPU_BITS_NONE.
5907 */ 5905 */
5908cpumask_t nohz_cpu_mask = CPU_MASK_NONE; 5906cpumask_var_t nohz_cpu_mask;
5909 5907
5910/* 5908/*
5911 * Increase the granularity value when there are more CPUs, 5909 * Increase the granularity value when there are more CPUs,
@@ -5960,7 +5958,7 @@ static inline void sched_init_granularity(void)
5960 * task must not exit() & deallocate itself prematurely. The 5958 * task must not exit() & deallocate itself prematurely. The
5961 * call is not atomic; no spinlocks may be held. 5959 * call is not atomic; no spinlocks may be held.
5962 */ 5960 */
5963int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask) 5961int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5964{ 5962{
5965 struct migration_req req; 5963 struct migration_req req;
5966 unsigned long flags; 5964 unsigned long flags;
@@ -5968,13 +5966,13 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
5968 int ret = 0; 5966 int ret = 0;
5969 5967
5970 rq = task_rq_lock(p, &flags); 5968 rq = task_rq_lock(p, &flags);
5971 if (!cpus_intersects(*new_mask, cpu_online_map)) { 5969 if (!cpumask_intersects(new_mask, cpu_online_mask)) {
5972 ret = -EINVAL; 5970 ret = -EINVAL;
5973 goto out; 5971 goto out;
5974 } 5972 }
5975 5973
5976 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && 5974 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
5977 !cpus_equal(p->cpus_allowed, *new_mask))) { 5975 !cpumask_equal(&p->cpus_allowed, new_mask))) {
5978 ret = -EINVAL; 5976 ret = -EINVAL;
5979 goto out; 5977 goto out;
5980 } 5978 }
@@ -5982,15 +5980,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
5982 if (p->sched_class->set_cpus_allowed) 5980 if (p->sched_class->set_cpus_allowed)
5983 p->sched_class->set_cpus_allowed(p, new_mask); 5981 p->sched_class->set_cpus_allowed(p, new_mask);
5984 else { 5982 else {
5985 p->cpus_allowed = *new_mask; 5983 cpumask_copy(&p->cpus_allowed, new_mask);
5986 p->rt.nr_cpus_allowed = cpus_weight(*new_mask); 5984 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
5987 } 5985 }
5988 5986
5989 /* Can the task run on the task's current CPU? If so, we're done */ 5987 /* Can the task run on the task's current CPU? If so, we're done */
5990 if (cpu_isset(task_cpu(p), *new_mask)) 5988 if (cpumask_test_cpu(task_cpu(p), new_mask))
5991 goto out; 5989 goto out;
5992 5990
5993 if (migrate_task(p, any_online_cpu(*new_mask), &req)) { 5991 if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
5994 /* Need help from migration thread: drop lock and wait. */ 5992 /* Need help from migration thread: drop lock and wait. */
5995 task_rq_unlock(rq, &flags); 5993 task_rq_unlock(rq, &flags);
5996 wake_up_process(rq->migration_thread); 5994 wake_up_process(rq->migration_thread);
@@ -6032,7 +6030,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
6032 if (task_cpu(p) != src_cpu) 6030 if (task_cpu(p) != src_cpu)
6033 goto done; 6031 goto done;
6034 /* Affinity changed (again). */ 6032 /* Affinity changed (again). */
6035 if (!cpu_isset(dest_cpu, p->cpus_allowed)) 6033 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
6036 goto fail; 6034 goto fail;
6037 6035
6038 on_rq = p->se.on_rq; 6036 on_rq = p->se.on_rq;
@@ -6126,54 +6124,46 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
6126 6124
6127/* 6125/*
6128 * Figure out where task on dead CPU should go, use force if necessary. 6126 * Figure out where task on dead CPU should go, use force if necessary.
6129 * NOTE: interrupts should be disabled by the caller
6130 */ 6127 */
6131static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 6128static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
6132{ 6129{
6133 unsigned long flags;
6134 cpumask_t mask;
6135 struct rq *rq;
6136 int dest_cpu; 6130 int dest_cpu;
6131 /* FIXME: Use cpumask_of_node here. */
6132 cpumask_t _nodemask = node_to_cpumask(cpu_to_node(dead_cpu));
6133 const struct cpumask *nodemask = &_nodemask;
6134
6135again:
6136 /* Look for allowed, online CPU in same node. */
6137 for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask)
6138 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
6139 goto move;
6140
6141 /* Any allowed, online CPU? */
6142 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask);
6143 if (dest_cpu < nr_cpu_ids)
6144 goto move;
6145
6146 /* No more Mr. Nice Guy. */
6147 if (dest_cpu >= nr_cpu_ids) {
6148 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
6149 dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed);
6137 6150
6138 do { 6151 /*
6139 /* On same node? */ 6152 * Don't tell them about moving exiting tasks or
6140 mask = node_to_cpumask(cpu_to_node(dead_cpu)); 6153 * kernel threads (both mm NULL), since they never
6141 cpus_and(mask, mask, p->cpus_allowed); 6154 * leave kernel.
6142 dest_cpu = any_online_cpu(mask); 6155 */
6143 6156 if (p->mm && printk_ratelimit()) {
6144 /* On any allowed CPU? */ 6157 printk(KERN_INFO "process %d (%s) no "
6145 if (dest_cpu >= nr_cpu_ids) 6158 "longer affine to cpu%d\n",
6146 dest_cpu = any_online_cpu(p->cpus_allowed); 6159 task_pid_nr(p), p->comm, dead_cpu);
6147
6148 /* No more Mr. Nice Guy. */
6149 if (dest_cpu >= nr_cpu_ids) {
6150 cpumask_t cpus_allowed;
6151
6152 cpuset_cpus_allowed_locked(p, &cpus_allowed);
6153 /*
6154 * Try to stay on the same cpuset, where the
6155 * current cpuset may be a subset of all cpus.
6156 * The cpuset_cpus_allowed_locked() variant of
6157 * cpuset_cpus_allowed() will not block. It must be
6158 * called within calls to cpuset_lock/cpuset_unlock.
6159 */
6160 rq = task_rq_lock(p, &flags);
6161 p->cpus_allowed = cpus_allowed;
6162 dest_cpu = any_online_cpu(p->cpus_allowed);
6163 task_rq_unlock(rq, &flags);
6164
6165 /*
6166 * Don't tell them about moving exiting tasks or
6167 * kernel threads (both mm NULL), since they never
6168 * leave kernel.
6169 */
6170 if (p->mm && printk_ratelimit()) {
6171 printk(KERN_INFO "process %d (%s) no "
6172 "longer affine to cpu%d\n",
6173 task_pid_nr(p), p->comm, dead_cpu);
6174 }
6175 } 6160 }
6176 } while (!__migrate_task_irq(p, dead_cpu, dest_cpu)); 6161 }
6162
6163move:
6164 /* It can have affinity changed while we were choosing. */
6165 if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)))
6166 goto again;
6177} 6167}
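The rewritten function picks a destination in three steps: an allowed online CPU on the dead CPU's node, then any allowed online CPU, then whatever remains after the cpuset fallback widens cpus_allowed, retrying if affinity changed meanwhile. A compact sketch of that selection order using the new iterators (illustrative; cpumask_of_node() is the helper the FIXME in the hunk says should eventually replace node_to_cpumask()):

#include <linux/cpumask.h>
#include <linux/topology.h>

/* Pick a destination CPU for a task stranded on dead_cpu (illustrative). */
static int example_pick_dest_cpu(const struct cpumask *allowed, int dead_cpu)
{
    const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu));
    int cpu;

    /* 1) Prefer an allowed, online CPU on the same node. */
    for_each_cpu_and(cpu, nodemask, cpu_online_mask)
        if (cpumask_test_cpu(cpu, allowed))
            return cpu;

    /* 2) Otherwise any allowed, online CPU. */
    cpu = cpumask_any_and(allowed, cpu_online_mask);
    if (cpu < nr_cpu_ids)
        return cpu;

    /* 3) Caller widens 'allowed' via the cpuset and retries. */
    return -1;
}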
6178 6168
6179/* 6169/*
@@ -6185,7 +6175,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
6185 */ 6175 */
6186static void migrate_nr_uninterruptible(struct rq *rq_src) 6176static void migrate_nr_uninterruptible(struct rq *rq_src)
6187{ 6177{
6188 struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR)); 6178 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask));
6189 unsigned long flags; 6179 unsigned long flags;
6190 6180
6191 local_irq_save(flags); 6181 local_irq_save(flags);
@@ -6475,7 +6465,7 @@ static void set_rq_online(struct rq *rq)
6475 if (!rq->online) { 6465 if (!rq->online) {
6476 const struct sched_class *class; 6466 const struct sched_class *class;
6477 6467
6478 cpu_set(rq->cpu, rq->rd->online); 6468 cpumask_set_cpu(rq->cpu, rq->rd->online);
6479 rq->online = 1; 6469 rq->online = 1;
6480 6470
6481 for_each_class(class) { 6471 for_each_class(class) {
@@ -6495,7 +6485,7 @@ static void set_rq_offline(struct rq *rq)
6495 class->rq_offline(rq); 6485 class->rq_offline(rq);
6496 } 6486 }
6497 6487
6498 cpu_clear(rq->cpu, rq->rd->online); 6488 cpumask_clear_cpu(rq->cpu, rq->rd->online);
6499 rq->online = 0; 6489 rq->online = 0;
6500 } 6490 }
6501} 6491}
@@ -6536,7 +6526,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6536 rq = cpu_rq(cpu); 6526 rq = cpu_rq(cpu);
6537 spin_lock_irqsave(&rq->lock, flags); 6527 spin_lock_irqsave(&rq->lock, flags);
6538 if (rq->rd) { 6528 if (rq->rd) {
6539 BUG_ON(!cpu_isset(cpu, rq->rd->span)); 6529 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6540 6530
6541 set_rq_online(rq); 6531 set_rq_online(rq);
6542 } 6532 }
@@ -6550,7 +6540,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6550 break; 6540 break;
6551 /* Unbind it from offline cpu so it can run. Fall thru. */ 6541 /* Unbind it from offline cpu so it can run. Fall thru. */
6552 kthread_bind(cpu_rq(cpu)->migration_thread, 6542 kthread_bind(cpu_rq(cpu)->migration_thread,
6553 any_online_cpu(cpu_online_map)); 6543 cpumask_any(cpu_online_mask));
6554 kthread_stop(cpu_rq(cpu)->migration_thread); 6544 kthread_stop(cpu_rq(cpu)->migration_thread);
6555 cpu_rq(cpu)->migration_thread = NULL; 6545 cpu_rq(cpu)->migration_thread = NULL;
6556 break; 6546 break;
@@ -6598,7 +6588,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6598 rq = cpu_rq(cpu); 6588 rq = cpu_rq(cpu);
6599 spin_lock_irqsave(&rq->lock, flags); 6589 spin_lock_irqsave(&rq->lock, flags);
6600 if (rq->rd) { 6590 if (rq->rd) {
6601 BUG_ON(!cpu_isset(cpu, rq->rd->span)); 6591 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6602 set_rq_offline(rq); 6592 set_rq_offline(rq);
6603 } 6593 }
6604 spin_unlock_irqrestore(&rq->lock, flags); 6594 spin_unlock_irqrestore(&rq->lock, flags);
@@ -6636,36 +6626,14 @@ early_initcall(migration_init);
6636 6626
6637#ifdef CONFIG_SCHED_DEBUG 6627#ifdef CONFIG_SCHED_DEBUG
6638 6628
6639static inline const char *sd_level_to_string(enum sched_domain_level lvl)
6640{
6641 switch (lvl) {
6642 case SD_LV_NONE:
6643 return "NONE";
6644 case SD_LV_SIBLING:
6645 return "SIBLING";
6646 case SD_LV_MC:
6647 return "MC";
6648 case SD_LV_CPU:
6649 return "CPU";
6650 case SD_LV_NODE:
6651 return "NODE";
6652 case SD_LV_ALLNODES:
6653 return "ALLNODES";
6654 case SD_LV_MAX:
6655 return "MAX";
6656
6657 }
6658 return "MAX";
6659}
6660
6661static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 6629static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6662 cpumask_t *groupmask) 6630 struct cpumask *groupmask)
6663{ 6631{
6664 struct sched_group *group = sd->groups; 6632 struct sched_group *group = sd->groups;
6665 char str[256]; 6633 char str[256];
6666 6634
6667 cpulist_scnprintf(str, sizeof(str), sd->span); 6635 cpulist_scnprintf(str, sizeof(str), *sched_domain_span(sd));
6668 cpus_clear(*groupmask); 6636 cpumask_clear(groupmask);
6669 6637
6670 printk(KERN_DEBUG "%*s domain %d: ", level, "", level); 6638 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
6671 6639
@@ -6677,14 +6645,13 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6677 return -1; 6645 return -1;
6678 } 6646 }
6679 6647
6680 printk(KERN_CONT "span %s level %s\n", 6648 printk(KERN_CONT "span %s level %s\n", str, sd->name);
6681 str, sd_level_to_string(sd->level));
6682 6649
6683 if (!cpu_isset(cpu, sd->span)) { 6650 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
6684 printk(KERN_ERR "ERROR: domain->span does not contain " 6651 printk(KERN_ERR "ERROR: domain->span does not contain "
6685 "CPU%d\n", cpu); 6652 "CPU%d\n", cpu);
6686 } 6653 }
6687 if (!cpu_isset(cpu, group->cpumask)) { 6654 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
6688 printk(KERN_ERR "ERROR: domain->groups does not contain" 6655 printk(KERN_ERR "ERROR: domain->groups does not contain"
6689 " CPU%d\n", cpu); 6656 " CPU%d\n", cpu);
6690 } 6657 }
@@ -6704,31 +6671,32 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6704 break; 6671 break;
6705 } 6672 }
6706 6673
6707 if (!cpus_weight(group->cpumask)) { 6674 if (!cpumask_weight(sched_group_cpus(group))) {
6708 printk(KERN_CONT "\n"); 6675 printk(KERN_CONT "\n");
6709 printk(KERN_ERR "ERROR: empty group\n"); 6676 printk(KERN_ERR "ERROR: empty group\n");
6710 break; 6677 break;
6711 } 6678 }
6712 6679
6713 if (cpus_intersects(*groupmask, group->cpumask)) { 6680 if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
6714 printk(KERN_CONT "\n"); 6681 printk(KERN_CONT "\n");
6715 printk(KERN_ERR "ERROR: repeated CPUs\n"); 6682 printk(KERN_ERR "ERROR: repeated CPUs\n");
6716 break; 6683 break;
6717 } 6684 }
6718 6685
6719 cpus_or(*groupmask, *groupmask, group->cpumask); 6686 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
6720 6687
6721 cpulist_scnprintf(str, sizeof(str), group->cpumask); 6688 cpulist_scnprintf(str, sizeof(str), *sched_group_cpus(group));
6722 printk(KERN_CONT " %s", str); 6689 printk(KERN_CONT " %s", str);
6723 6690
6724 group = group->next; 6691 group = group->next;
6725 } while (group != sd->groups); 6692 } while (group != sd->groups);
6726 printk(KERN_CONT "\n"); 6693 printk(KERN_CONT "\n");
6727 6694
6728 if (!cpus_equal(sd->span, *groupmask)) 6695 if (!cpumask_equal(sched_domain_span(sd), groupmask))
6729 printk(KERN_ERR "ERROR: groups don't span domain->span\n"); 6696 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
6730 6697
6731 if (sd->parent && !cpus_subset(*groupmask, sd->parent->span)) 6698 if (sd->parent &&
6699 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
6732 printk(KERN_ERR "ERROR: parent span is not a superset " 6700 printk(KERN_ERR "ERROR: parent span is not a superset "
6733 "of domain->span\n"); 6701 "of domain->span\n");
6734 return 0; 6702 return 0;
@@ -6736,7 +6704,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6736 6704
6737static void sched_domain_debug(struct sched_domain *sd, int cpu) 6705static void sched_domain_debug(struct sched_domain *sd, int cpu)
6738{ 6706{
6739 cpumask_t *groupmask; 6707 cpumask_var_t groupmask;
6740 int level = 0; 6708 int level = 0;
6741 6709
6742 if (!sd) { 6710 if (!sd) {
@@ -6746,8 +6714,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6746 6714
6747 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 6715 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
6748 6716
6749 groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL); 6717 if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
6750 if (!groupmask) {
6751 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); 6718 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
6752 return; 6719 return;
6753 } 6720 }
@@ -6760,7 +6727,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6760 if (!sd) 6727 if (!sd)
6761 break; 6728 break;
6762 } 6729 }
6763 kfree(groupmask); 6730 free_cpumask_var(groupmask);
6764} 6731}
6765#else /* !CONFIG_SCHED_DEBUG */ 6732#else /* !CONFIG_SCHED_DEBUG */
6766# define sched_domain_debug(sd, cpu) do { } while (0) 6733# define sched_domain_debug(sd, cpu) do { } while (0)
@@ -6768,7 +6735,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6768 6735
6769static int sd_degenerate(struct sched_domain *sd) 6736static int sd_degenerate(struct sched_domain *sd)
6770{ 6737{
6771 if (cpus_weight(sd->span) == 1) 6738 if (cpumask_weight(sched_domain_span(sd)) == 1)
6772 return 1; 6739 return 1;
6773 6740
6774 /* Following flags need at least 2 groups */ 6741 /* Following flags need at least 2 groups */
@@ -6799,7 +6766,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6799 if (sd_degenerate(parent)) 6766 if (sd_degenerate(parent))
6800 return 1; 6767 return 1;
6801 6768
6802 if (!cpus_equal(sd->span, parent->span)) 6769 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
6803 return 0; 6770 return 0;
6804 6771
6805 /* Does parent contain flags not in child? */ 6772 /* Does parent contain flags not in child? */
@@ -6821,6 +6788,16 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6821 return 1; 6788 return 1;
6822} 6789}
6823 6790
6791static void free_rootdomain(struct root_domain *rd)
6792{
6793 cpupri_cleanup(&rd->cpupri);
6794
6795 free_cpumask_var(rd->rto_mask);
6796 free_cpumask_var(rd->online);
6797 free_cpumask_var(rd->span);
6798 kfree(rd);
6799}
6800
6824static void rq_attach_root(struct rq *rq, struct root_domain *rd) 6801static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6825{ 6802{
6826 unsigned long flags; 6803 unsigned long flags;
@@ -6830,38 +6807,63 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6830 if (rq->rd) { 6807 if (rq->rd) {
6831 struct root_domain *old_rd = rq->rd; 6808 struct root_domain *old_rd = rq->rd;
6832 6809
6833 if (cpu_isset(rq->cpu, old_rd->online)) 6810 if (cpumask_test_cpu(rq->cpu, old_rd->online))
6834 set_rq_offline(rq); 6811 set_rq_offline(rq);
6835 6812
6836 cpu_clear(rq->cpu, old_rd->span); 6813 cpumask_clear_cpu(rq->cpu, old_rd->span);
6837 6814
6838 if (atomic_dec_and_test(&old_rd->refcount)) 6815 if (atomic_dec_and_test(&old_rd->refcount))
6839 kfree(old_rd); 6816 free_rootdomain(old_rd);
6840 } 6817 }
6841 6818
6842 atomic_inc(&rd->refcount); 6819 atomic_inc(&rd->refcount);
6843 rq->rd = rd; 6820 rq->rd = rd;
6844 6821
6845 cpu_set(rq->cpu, rd->span); 6822 cpumask_set_cpu(rq->cpu, rd->span);
6846 if (cpu_isset(rq->cpu, cpu_online_map)) 6823 if (cpumask_test_cpu(rq->cpu, cpu_online_mask))
6847 set_rq_online(rq); 6824 set_rq_online(rq);
6848 6825
6849 spin_unlock_irqrestore(&rq->lock, flags); 6826 spin_unlock_irqrestore(&rq->lock, flags);
6850} 6827}
6851 6828
6852static void init_rootdomain(struct root_domain *rd) 6829static int init_rootdomain(struct root_domain *rd, bool bootmem)
6853{ 6830{
6854 memset(rd, 0, sizeof(*rd)); 6831 memset(rd, 0, sizeof(*rd));
6855 6832
6856 cpus_clear(rd->span); 6833 if (bootmem) {
6857 cpus_clear(rd->online); 6834 alloc_bootmem_cpumask_var(&def_root_domain.span);
6835 alloc_bootmem_cpumask_var(&def_root_domain.online);
6836 alloc_bootmem_cpumask_var(&def_root_domain.rto_mask);
6837 cpupri_init(&rd->cpupri, true);
6838 return 0;
6839 }
6840
6841 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
6842 goto free_rd;
6843 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
6844 goto free_span;
6845 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
6846 goto free_online;
6847
6848 if (cpupri_init(&rd->cpupri, false) != 0)
6849 goto free_rto_mask;
6850 return 0;
6858 6851
6859 cpupri_init(&rd->cpupri); 6852free_rto_mask:
6853 free_cpumask_var(rd->rto_mask);
6854free_online:
6855 free_cpumask_var(rd->online);
6856free_span:
6857 free_cpumask_var(rd->span);
6858free_rd:
6859 kfree(rd);
6860 return -ENOMEM;
6860} 6861}
6861 6862
6862static void init_defrootdomain(void) 6863static void init_defrootdomain(void)
6863{ 6864{
6864 init_rootdomain(&def_root_domain); 6865 init_rootdomain(&def_root_domain, true);
6866
6865 atomic_set(&def_root_domain.refcount, 1); 6867 atomic_set(&def_root_domain.refcount, 1);
6866} 6868}
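init_rootdomain() now takes a bootmem flag: the default root domain is built before the slab allocator is up, so its masks come from alloc_bootmem_cpumask_var() (which cannot fail), while root domains created later use alloc_cpumask_var() with a full unwind ladder. The two-path shape, boiled down (illustrative struct and names):

#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/types.h>

struct example_rd {
    cpumask_var_t span;
    cpumask_var_t online;
};

static int example_init_rd(struct example_rd *rd, bool bootmem)
{
    if (bootmem) {
        /* Early boot: bootmem allocations cannot fail. */
        alloc_bootmem_cpumask_var(&rd->span);
        alloc_bootmem_cpumask_var(&rd->online);
        return 0;
    }

    if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
        return -ENOMEM;
    if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) {
        free_cpumask_var(rd->span);   /* unwind in reverse order */
        return -ENOMEM;
    }
    return 0;
}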
6867 6869
@@ -6873,7 +6875,10 @@ static struct root_domain *alloc_rootdomain(void)
6873 if (!rd) 6875 if (!rd)
6874 return NULL; 6876 return NULL;
6875 6877
6876 init_rootdomain(rd); 6878 if (init_rootdomain(rd, false) != 0) {
6879 kfree(rd);
6880 return NULL;
6881 }
6877 6882
6878 return rd; 6883 return rd;
6879} 6884}
@@ -6915,19 +6920,12 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6915} 6920}
6916 6921
6917/* cpus with isolated domains */ 6922/* cpus with isolated domains */
6918static cpumask_t cpu_isolated_map = CPU_MASK_NONE; 6923static cpumask_var_t cpu_isolated_map;
6919 6924
6920/* Setup the mask of cpus configured for isolated domains */ 6925/* Setup the mask of cpus configured for isolated domains */
6921static int __init isolated_cpu_setup(char *str) 6926static int __init isolated_cpu_setup(char *str)
6922{ 6927{
6923 static int __initdata ints[NR_CPUS]; 6928 cpulist_parse(str, *cpu_isolated_map);
6924 int i;
6925
6926 str = get_options(str, ARRAY_SIZE(ints), ints);
6927 cpus_clear(cpu_isolated_map);
6928 for (i = 1; i <= ints[0]; i++)
6929 if (ints[i] < NR_CPUS)
6930 cpu_set(ints[i], cpu_isolated_map);
6931 return 1; 6929 return 1;
6932} 6930}
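The hand-rolled get_options() loop gives way to cpulist_parse(), which accepts the same "isolcpus=" list syntax (for example "1-3,5") straight into the mask. A sketch of a boot-parameter handler in that style; note the cpulist_parse() calling convention (mask by value vs. by pointer) has varied across kernel versions, so treat the call itself as schematic:

#include <linux/cpumask.h>
#include <linux/init.h>

static cpumask_var_t example_isolated_map;  /* allocated during early init */

/* Handles e.g. example_isolcpus=1-3,5 on the kernel command line. */
static int __init example_isolated_setup(char *str)
{
    cpulist_parse(str, example_isolated_map);
    return 1;
}
__setup("example_isolcpus=", example_isolated_setup);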
6933 6931
@@ -6936,42 +6934,43 @@ __setup("isolcpus=", isolated_cpu_setup);
6936/* 6934/*
6937 * init_sched_build_groups takes the cpumask we wish to span, and a pointer 6935 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
6938 * to a function which identifies what group(along with sched group) a CPU 6936 * to a function which identifies what group(along with sched group) a CPU
6939 * belongs to. The return value of group_fn must be a >= 0 and < NR_CPUS 6937 * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
6940 * (due to the fact that we keep track of groups covered with a cpumask_t). 6938 * (due to the fact that we keep track of groups covered with a struct cpumask).
6941 * 6939 *
6942 * init_sched_build_groups will build a circular linked list of the groups 6940 * init_sched_build_groups will build a circular linked list of the groups
6943 * covered by the given span, and will set each group's ->cpumask correctly, 6941 * covered by the given span, and will set each group's ->cpumask correctly,
6944 * and ->cpu_power to 0. 6942 * and ->cpu_power to 0.
6945 */ 6943 */
6946static void 6944static void
6947init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map, 6945init_sched_build_groups(const struct cpumask *span,
6948 int (*group_fn)(int cpu, const cpumask_t *cpu_map, 6946 const struct cpumask *cpu_map,
6947 int (*group_fn)(int cpu, const struct cpumask *cpu_map,
6949 struct sched_group **sg, 6948 struct sched_group **sg,
6950 cpumask_t *tmpmask), 6949 struct cpumask *tmpmask),
6951 cpumask_t *covered, cpumask_t *tmpmask) 6950 struct cpumask *covered, struct cpumask *tmpmask)
6952{ 6951{
6953 struct sched_group *first = NULL, *last = NULL; 6952 struct sched_group *first = NULL, *last = NULL;
6954 int i; 6953 int i;
6955 6954
6956 cpus_clear(*covered); 6955 cpumask_clear(covered);
6957 6956
6958 for_each_cpu_mask_nr(i, *span) { 6957 for_each_cpu(i, span) {
6959 struct sched_group *sg; 6958 struct sched_group *sg;
6960 int group = group_fn(i, cpu_map, &sg, tmpmask); 6959 int group = group_fn(i, cpu_map, &sg, tmpmask);
6961 int j; 6960 int j;
6962 6961
6963 if (cpu_isset(i, *covered)) 6962 if (cpumask_test_cpu(i, covered))
6964 continue; 6963 continue;
6965 6964
6966 cpus_clear(sg->cpumask); 6965 cpumask_clear(sched_group_cpus(sg));
6967 sg->__cpu_power = 0; 6966 sg->__cpu_power = 0;
6968 6967
6969 for_each_cpu_mask_nr(j, *span) { 6968 for_each_cpu(j, span) {
6970 if (group_fn(j, cpu_map, NULL, tmpmask) != group) 6969 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
6971 continue; 6970 continue;
6972 6971
6973 cpu_set(j, *covered); 6972 cpumask_set_cpu(j, covered);
6974 cpu_set(j, sg->cpumask); 6973 cpumask_set_cpu(j, sched_group_cpus(sg));
6975 } 6974 }
6976 if (!first) 6975 if (!first)
6977 first = sg; 6976 first = sg;
@@ -7035,9 +7034,10 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
7035 * should be one that prevents unnecessary balancing, but also spreads tasks 7034 * should be one that prevents unnecessary balancing, but also spreads tasks
7036 * out optimally. 7035 * out optimally.
7037 */ 7036 */
7038static void sched_domain_node_span(int node, cpumask_t *span) 7037static void sched_domain_node_span(int node, struct cpumask *span)
7039{ 7038{
7040 nodemask_t used_nodes; 7039 nodemask_t used_nodes;
7040 /* FIXME: use cpumask_of_node() */
7041 node_to_cpumask_ptr(nodemask, node); 7041 node_to_cpumask_ptr(nodemask, node);
7042 int i; 7042 int i;
7043 7043
@@ -7059,18 +7059,33 @@ static void sched_domain_node_span(int node, cpumask_t *span)
7059int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 7059int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
7060 7060
7061/* 7061/*
7062 * The cpus mask in sched_group and sched_domain hangs off the end.
7063 * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space
7064 * for nr_cpu_ids < CONFIG_NR_CPUS.
7065 */
7066struct static_sched_group {
7067 struct sched_group sg;
7068 DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
7069};
7070
7071struct static_sched_domain {
7072 struct sched_domain sd;
7073 DECLARE_BITMAP(span, CONFIG_NR_CPUS);
7074};
7075
7076/*
7062 * SMT sched-domains: 7077 * SMT sched-domains:
7063 */ 7078 */
7064#ifdef CONFIG_SCHED_SMT 7079#ifdef CONFIG_SCHED_SMT
7065static DEFINE_PER_CPU(struct sched_domain, cpu_domains); 7080static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
7066static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); 7081static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus);
7067 7082
7068static int 7083static int
7069cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, 7084cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
7070 cpumask_t *unused) 7085 struct sched_group **sg, struct cpumask *unused)
7071{ 7086{
7072 if (sg) 7087 if (sg)
7073 *sg = &per_cpu(sched_group_cpus, cpu); 7088 *sg = &per_cpu(sched_group_cpus, cpu).sg;
7074 return cpu; 7089 return cpu;
7075} 7090}
7076#endif /* CONFIG_SCHED_SMT */ 7091#endif /* CONFIG_SCHED_SMT */
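sched_group and sched_domain now end in a variable-length cpumask, so statically defined per-cpu instances need backing storage placed directly behind the struct; that is exactly what the DECLARE_BITMAP members above provide, sized to CONFIG_NR_CPUS (the in-tree FIXME notes this wastes space when nr_cpu_ids is smaller). The general pattern, shrunk to a toy example (illustrative struct, not scheduler code):

#include <linux/cpumask.h>
#include <linux/types.h>

/* A struct whose cpumask "hangs off the end", as the comment above puts it. */
struct example_span {
    int id;
    unsigned long span[0];   /* storage supplied by whoever embeds this */
};

/* Static instance: reserve CONFIG_NR_CPUS bits right after the struct. */
struct example_static_span {
    struct example_span s;
    DECLARE_BITMAP(bits, CONFIG_NR_CPUS);
};

static inline struct cpumask *example_span_mask(struct example_span *s)
{
    return to_cpumask(s->span);   /* view the bitmap as a struct cpumask */
}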
@@ -7079,56 +7094,55 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
7079 * multi-core sched-domains: 7094 * multi-core sched-domains:
7080 */ 7095 */
7081#ifdef CONFIG_SCHED_MC 7096#ifdef CONFIG_SCHED_MC
7082static DEFINE_PER_CPU(struct sched_domain, core_domains); 7097static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
7083static DEFINE_PER_CPU(struct sched_group, sched_group_core); 7098static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
7084#endif /* CONFIG_SCHED_MC */ 7099#endif /* CONFIG_SCHED_MC */
7085 7100
7086#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 7101#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
7087static int 7102static int
7088cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, 7103cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
7089 cpumask_t *mask) 7104 struct sched_group **sg, struct cpumask *mask)
7090{ 7105{
7091 int group; 7106 int group;
7092 7107
7093 *mask = per_cpu(cpu_sibling_map, cpu); 7108 cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
7094 cpus_and(*mask, *mask, *cpu_map); 7109 group = cpumask_first(mask);
7095 group = first_cpu(*mask);
7096 if (sg) 7110 if (sg)
7097 *sg = &per_cpu(sched_group_core, group); 7111 *sg = &per_cpu(sched_group_core, group).sg;
7098 return group; 7112 return group;
7099} 7113}
7100#elif defined(CONFIG_SCHED_MC) 7114#elif defined(CONFIG_SCHED_MC)
7101static int 7115static int
7102cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, 7116cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
7103 cpumask_t *unused) 7117 struct sched_group **sg, struct cpumask *unused)
7104{ 7118{
7105 if (sg) 7119 if (sg)
7106 *sg = &per_cpu(sched_group_core, cpu); 7120 *sg = &per_cpu(sched_group_core, cpu).sg;
7107 return cpu; 7121 return cpu;
7108} 7122}
7109#endif 7123#endif
7110 7124
7111static DEFINE_PER_CPU(struct sched_domain, phys_domains); 7125static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
7112static DEFINE_PER_CPU(struct sched_group, sched_group_phys); 7126static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
7113 7127
7114static int 7128static int
7115cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg, 7129cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
7116 cpumask_t *mask) 7130 struct sched_group **sg, struct cpumask *mask)
7117{ 7131{
7118 int group; 7132 int group;
7119#ifdef CONFIG_SCHED_MC 7133#ifdef CONFIG_SCHED_MC
7134 /* FIXME: Use cpu_coregroup_mask. */
7120 *mask = cpu_coregroup_map(cpu); 7135 *mask = cpu_coregroup_map(cpu);
7121 cpus_and(*mask, *mask, *cpu_map); 7136 cpus_and(*mask, *mask, *cpu_map);
7122 group = first_cpu(*mask); 7137 group = cpumask_first(mask);
7123#elif defined(CONFIG_SCHED_SMT) 7138#elif defined(CONFIG_SCHED_SMT)
7124 *mask = per_cpu(cpu_sibling_map, cpu); 7139 cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
7125 cpus_and(*mask, *mask, *cpu_map); 7140 group = cpumask_first(mask);
7126 group = first_cpu(*mask);
7127#else 7141#else
7128 group = cpu; 7142 group = cpu;
7129#endif 7143#endif
7130 if (sg) 7144 if (sg)
7131 *sg = &per_cpu(sched_group_phys, group); 7145 *sg = &per_cpu(sched_group_phys, group).sg;
7132 return group; 7146 return group;
7133} 7147}
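Each of these group_fn helpers now builds its scratch mask with cpumask_and() on pointers and picks the group representative with cpumask_first(), replacing the copy-then-cpus_and()-then-first_cpu() sequence on cpumask_t values. The core idiom in isolation (illustrative):

#include <linux/cpumask.h>

/*
 * Representative CPU: the first CPU that is both in 'siblings' and in
 * 'cpu_map'. 'scratch' is caller-provided scratch space, as in the
 * helpers above.
 */
static int example_group_representative(const struct cpumask *siblings,
                                        const struct cpumask *cpu_map,
                                        struct cpumask *scratch)
{
    cpumask_and(scratch, siblings, cpu_map);   /* no struct copy on the stack */
    return cpumask_first(scratch);
}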
7134 7148
@@ -7142,19 +7156,21 @@ static DEFINE_PER_CPU(struct sched_domain, node_domains);
7142static struct sched_group ***sched_group_nodes_bycpu; 7156static struct sched_group ***sched_group_nodes_bycpu;
7143 7157
7144static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); 7158static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
7145static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); 7159static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
7146 7160
7147static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, 7161static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
7148 struct sched_group **sg, cpumask_t *nodemask) 7162 struct sched_group **sg,
7163 struct cpumask *nodemask)
7149{ 7164{
7150 int group; 7165 int group;
7166 /* FIXME: use cpumask_of_node */
7167 node_to_cpumask_ptr(pnodemask, cpu_to_node(cpu));
7151 7168
7152 *nodemask = node_to_cpumask(cpu_to_node(cpu)); 7169 cpumask_and(nodemask, pnodemask, cpu_map);
7153 cpus_and(*nodemask, *nodemask, *cpu_map); 7170 group = cpumask_first(nodemask);
7154 group = first_cpu(*nodemask);
7155 7171
7156 if (sg) 7172 if (sg)
7157 *sg = &per_cpu(sched_group_allnodes, group); 7173 *sg = &per_cpu(sched_group_allnodes, group).sg;
7158 return group; 7174 return group;
7159} 7175}
7160 7176
@@ -7166,11 +7182,11 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
7166 if (!sg) 7182 if (!sg)
7167 return; 7183 return;
7168 do { 7184 do {
7169 for_each_cpu_mask_nr(j, sg->cpumask) { 7185 for_each_cpu(j, sched_group_cpus(sg)) {
7170 struct sched_domain *sd; 7186 struct sched_domain *sd;
7171 7187
7172 sd = &per_cpu(phys_domains, j); 7188 sd = &per_cpu(phys_domains, j).sd;
7173 if (j != first_cpu(sd->groups->cpumask)) { 7189 if (j != cpumask_first(sched_group_cpus(sd->groups))) {
7174 /* 7190 /*
7175 * Only add "power" once for each 7191 * Only add "power" once for each
7176 * physical package. 7192 * physical package.
@@ -7187,11 +7203,12 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
7187 7203
7188#ifdef CONFIG_NUMA 7204#ifdef CONFIG_NUMA
7189/* Free memory allocated for various sched_group structures */ 7205/* Free memory allocated for various sched_group structures */
7190static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) 7206static void free_sched_groups(const struct cpumask *cpu_map,
7207 struct cpumask *nodemask)
7191{ 7208{
7192 int cpu, i; 7209 int cpu, i;
7193 7210
7194 for_each_cpu_mask_nr(cpu, *cpu_map) { 7211 for_each_cpu(cpu, cpu_map) {
7195 struct sched_group **sched_group_nodes 7212 struct sched_group **sched_group_nodes
7196 = sched_group_nodes_bycpu[cpu]; 7213 = sched_group_nodes_bycpu[cpu];
7197 7214
@@ -7200,10 +7217,11 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
7200 7217
7201 for (i = 0; i < nr_node_ids; i++) { 7218 for (i = 0; i < nr_node_ids; i++) {
7202 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 7219 struct sched_group *oldsg, *sg = sched_group_nodes[i];
7220 /* FIXME: Use cpumask_of_node */
7221 node_to_cpumask_ptr(pnodemask, i);
7203 7222
7204 *nodemask = node_to_cpumask(i); 7223 cpus_and(*nodemask, *pnodemask, *cpu_map);
7205 cpus_and(*nodemask, *nodemask, *cpu_map); 7224 if (cpumask_empty(nodemask))
7206 if (cpus_empty(*nodemask))
7207 continue; 7225 continue;
7208 7226
7209 if (sg == NULL) 7227 if (sg == NULL)
@@ -7221,7 +7239,8 @@ next_sg:
7221 } 7239 }
7222} 7240}
7223#else /* !CONFIG_NUMA */ 7241#else /* !CONFIG_NUMA */
7224static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) 7242static void free_sched_groups(const struct cpumask *cpu_map,
7243 struct cpumask *nodemask)
7225{ 7244{
7226} 7245}
7227#endif /* CONFIG_NUMA */ 7246#endif /* CONFIG_NUMA */
@@ -7247,7 +7266,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7247 7266
7248 WARN_ON(!sd || !sd->groups); 7267 WARN_ON(!sd || !sd->groups);
7249 7268
7250 if (cpu != first_cpu(sd->groups->cpumask)) 7269 if (cpu != cpumask_first(sched_group_cpus(sd->groups)))
7251 return; 7270 return;
7252 7271
7253 child = sd->child; 7272 child = sd->child;
@@ -7312,40 +7331,6 @@ SD_INIT_FUNC(CPU)
7312 SD_INIT_FUNC(MC) 7331 SD_INIT_FUNC(MC)
7313#endif 7332#endif
7314 7333
7315/*
7316 * To minimize stack usage kmalloc room for cpumasks and share the
7317 * space as the usage in build_sched_domains() dictates. Used only
7318 * if the amount of space is significant.
7319 */
7320struct allmasks {
7321 cpumask_t tmpmask; /* make this one first */
7322 union {
7323 cpumask_t nodemask;
7324 cpumask_t this_sibling_map;
7325 cpumask_t this_core_map;
7326 };
7327 cpumask_t send_covered;
7328
7329#ifdef CONFIG_NUMA
7330 cpumask_t domainspan;
7331 cpumask_t covered;
7332 cpumask_t notcovered;
7333#endif
7334};
7335
7336#if NR_CPUS > 128
7337#define SCHED_CPUMASK_ALLOC 1
7338#define SCHED_CPUMASK_FREE(v) kfree(v)
7339#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v
7340#else
7341#define SCHED_CPUMASK_ALLOC 0
7342#define SCHED_CPUMASK_FREE(v)
7343#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v
7344#endif
7345
7346#define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \
7347 ((unsigned long)(a) + offsetof(struct allmasks, v))
7348
7349static int default_relax_domain_level = -1; 7334static int default_relax_domain_level = -1;
7350 7335
7351static int __init setup_relax_domain_level(char *str) 7336static int __init setup_relax_domain_level(char *str)
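
The block removed above kept all scratch masks in one kmalloc'd struct allmasks (only when NR_CPUS > 128) and carved them out with offsetof(). The replacement relies on cpumask_var_t, which is a stack array when masks are small and a heap pointer when CONFIG_CPUMASK_OFFSTACK is set. A minimal sketch of that dual-mode type, assuming a MASK_OFFSTACK switch standing in for the real config option:

    #include <stdbool.h>
    #include <stdlib.h>

    struct cpumask_sketch { unsigned long bits[4]; };   /* size would track NR_CPUS */

    #ifdef MASK_OFFSTACK
    /* Large systems: the variable is only a pointer; it must be allocated. */
    typedef struct cpumask_sketch *mask_var_t;
    static bool alloc_mask_var(mask_var_t *m) { return (*m = calloc(1, sizeof(**m))) != NULL; }
    static void free_mask_var(struct cpumask_sketch *m) { free(m); }
    #else
    /* Small systems: a one-element array sits on the stack and "allocation" is a
     * no-op, so callers are written once and work in either configuration. */
    typedef struct cpumask_sketch mask_var_t[1];
    static bool alloc_mask_var(mask_var_t *m) { (void)m; return true; }
    static void free_mask_var(struct cpumask_sketch *m) { (void)m; }
    #endif

    int main(void)
    {
        mask_var_t nodemask;

        if (!alloc_mask_var(&nodemask))
            return 1;
        /* ... nodemask is used through pointer-style helpers either way ... */
        free_mask_var(nodemask);
        return 0;
    }
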
@@ -7385,17 +7370,38 @@ static void set_domain_attribute(struct sched_domain *sd,
7385 * Build sched domains for a given set of cpus and attach the sched domains 7370 * Build sched domains for a given set of cpus and attach the sched domains
7386 * to the individual cpus 7371 * to the individual cpus
7387 */ 7372 */
7388static int __build_sched_domains(const cpumask_t *cpu_map, 7373static int __build_sched_domains(const struct cpumask *cpu_map,
7389 struct sched_domain_attr *attr) 7374 struct sched_domain_attr *attr)
7390{ 7375{
7391 int i; 7376 int i, err = -ENOMEM;
7392 struct root_domain *rd; 7377 struct root_domain *rd;
7393 SCHED_CPUMASK_DECLARE(allmasks); 7378 cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered,
7394 cpumask_t *tmpmask; 7379 tmpmask;
7395#ifdef CONFIG_NUMA 7380#ifdef CONFIG_NUMA
7381 cpumask_var_t domainspan, covered, notcovered;
7396 struct sched_group **sched_group_nodes = NULL; 7382 struct sched_group **sched_group_nodes = NULL;
7397 int sd_allnodes = 0; 7383 int sd_allnodes = 0;
7398 7384
7385 if (!alloc_cpumask_var(&domainspan, GFP_KERNEL))
7386 goto out;
7387 if (!alloc_cpumask_var(&covered, GFP_KERNEL))
7388 goto free_domainspan;
7389 if (!alloc_cpumask_var(&notcovered, GFP_KERNEL))
7390 goto free_covered;
7391#endif
7392
7393 if (!alloc_cpumask_var(&nodemask, GFP_KERNEL))
7394 goto free_notcovered;
7395 if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL))
7396 goto free_nodemask;
7397 if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL))
7398 goto free_this_sibling_map;
7399 if (!alloc_cpumask_var(&send_covered, GFP_KERNEL))
7400 goto free_this_core_map;
7401 if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
7402 goto free_send_covered;
7403
7404#ifdef CONFIG_NUMA
7399 /* 7405 /*
7400 * Allocate the per-node list of sched groups 7406 * Allocate the per-node list of sched groups
7401 */ 7407 */
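
The added allocation chain above is the classic goto-unwind idiom: each alloc_cpumask_var() failure jumps to a label that releases only what already succeeded, and the same labels run in order on the success path to drop the scratch masks. A compact standalone illustration of that shape, using plain malloc() and made-up names:

    #include <stdlib.h>

    int build_domains_sketch(void)
    {
        int err = -1;
        void *a, *b, *c;

        a = malloc(64);
        if (!a)
            goto out;
        b = malloc(64);
        if (!b)
            goto free_a;
        c = malloc(64);
        if (!c)
            goto free_b;

        /* ... the real work would happen here ... */
        err = 0;        /* success also falls through the cleanup below */

        free(c);
    free_b:
        free(b);
    free_a:
        free(a);
    out:
        return err;
    }

    int main(void) { return build_domains_sketch(); }
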
@@ -7403,55 +7409,37 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7403 GFP_KERNEL); 7409 GFP_KERNEL);
7404 if (!sched_group_nodes) { 7410 if (!sched_group_nodes) {
7405 printk(KERN_WARNING "Can not alloc sched group node list\n"); 7411 printk(KERN_WARNING "Can not alloc sched group node list\n");
7406 return -ENOMEM; 7412 goto free_tmpmask;
7407 } 7413 }
7408#endif 7414#endif
7409 7415
7410 rd = alloc_rootdomain(); 7416 rd = alloc_rootdomain();
7411 if (!rd) { 7417 if (!rd) {
7412 printk(KERN_WARNING "Cannot alloc root domain\n"); 7418 printk(KERN_WARNING "Cannot alloc root domain\n");
7413#ifdef CONFIG_NUMA 7419 goto free_sched_groups;
7414 kfree(sched_group_nodes);
7415#endif
7416 return -ENOMEM;
7417 }
7418
7419#if SCHED_CPUMASK_ALLOC
7420 /* get space for all scratch cpumask variables */
7421 allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL);
7422 if (!allmasks) {
7423 printk(KERN_WARNING "Cannot alloc cpumask array\n");
7424 kfree(rd);
7425#ifdef CONFIG_NUMA
7426 kfree(sched_group_nodes);
7427#endif
7428 return -ENOMEM;
7429 } 7420 }
7430#endif
7431 tmpmask = (cpumask_t *)allmasks;
7432
7433 7421
7434#ifdef CONFIG_NUMA 7422#ifdef CONFIG_NUMA
7435 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; 7423 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes;
7436#endif 7424#endif
7437 7425
7438 /* 7426 /*
7439 * Set up domains for cpus specified by the cpu_map. 7427 * Set up domains for cpus specified by the cpu_map.
7440 */ 7428 */
7441 for_each_cpu_mask_nr(i, *cpu_map) { 7429 for_each_cpu(i, cpu_map) {
7442 struct sched_domain *sd = NULL, *p; 7430 struct sched_domain *sd = NULL, *p;
7443 SCHED_CPUMASK_VAR(nodemask, allmasks);
7444 7431
7432 /* FIXME: use cpumask_of_node */
7445 *nodemask = node_to_cpumask(cpu_to_node(i)); 7433 *nodemask = node_to_cpumask(cpu_to_node(i));
7446 cpus_and(*nodemask, *nodemask, *cpu_map); 7434 cpus_and(*nodemask, *nodemask, *cpu_map);
7447 7435
7448#ifdef CONFIG_NUMA 7436#ifdef CONFIG_NUMA
7449 if (cpus_weight(*cpu_map) > 7437 if (cpumask_weight(cpu_map) >
7450 SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) { 7438 SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) {
7451 sd = &per_cpu(allnodes_domains, i); 7439 sd = &per_cpu(allnodes_domains, i);
7452 SD_INIT(sd, ALLNODES); 7440 SD_INIT(sd, ALLNODES);
7453 set_domain_attribute(sd, attr); 7441 set_domain_attribute(sd, attr);
7454 sd->span = *cpu_map; 7442 cpumask_copy(sched_domain_span(sd), cpu_map);
7455 cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); 7443 cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
7456 p = sd; 7444 p = sd;
7457 sd_allnodes = 1; 7445 sd_allnodes = 1;
@@ -7461,18 +7449,19 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7461 sd = &per_cpu(node_domains, i); 7449 sd = &per_cpu(node_domains, i);
7462 SD_INIT(sd, NODE); 7450 SD_INIT(sd, NODE);
7463 set_domain_attribute(sd, attr); 7451 set_domain_attribute(sd, attr);
7464 sched_domain_node_span(cpu_to_node(i), &sd->span); 7452 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
7465 sd->parent = p; 7453 sd->parent = p;
7466 if (p) 7454 if (p)
7467 p->child = sd; 7455 p->child = sd;
7468 cpus_and(sd->span, sd->span, *cpu_map); 7456 cpumask_and(sched_domain_span(sd),
7457 sched_domain_span(sd), cpu_map);
7469#endif 7458#endif
7470 7459
7471 p = sd; 7460 p = sd;
7472 sd = &per_cpu(phys_domains, i); 7461 sd = &per_cpu(phys_domains, i).sd;
7473 SD_INIT(sd, CPU); 7462 SD_INIT(sd, CPU);
7474 set_domain_attribute(sd, attr); 7463 set_domain_attribute(sd, attr);
7475 sd->span = *nodemask; 7464 cpumask_copy(sched_domain_span(sd), nodemask);
7476 sd->parent = p; 7465 sd->parent = p;
7477 if (p) 7466 if (p)
7478 p->child = sd; 7467 p->child = sd;
@@ -7480,11 +7469,12 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7480 7469
7481#ifdef CONFIG_SCHED_MC 7470#ifdef CONFIG_SCHED_MC
7482 p = sd; 7471 p = sd;
7483 sd = &per_cpu(core_domains, i); 7472 sd = &per_cpu(core_domains, i).sd;
7484 SD_INIT(sd, MC); 7473 SD_INIT(sd, MC);
7485 set_domain_attribute(sd, attr); 7474 set_domain_attribute(sd, attr);
7486 sd->span = cpu_coregroup_map(i); 7475 *sched_domain_span(sd) = cpu_coregroup_map(i);
7487 cpus_and(sd->span, sd->span, *cpu_map); 7476 cpumask_and(sched_domain_span(sd),
7477 sched_domain_span(sd), cpu_map);
7488 sd->parent = p; 7478 sd->parent = p;
7489 p->child = sd; 7479 p->child = sd;
7490 cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); 7480 cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
@@ -7492,11 +7482,11 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7492 7482
7493#ifdef CONFIG_SCHED_SMT 7483#ifdef CONFIG_SCHED_SMT
7494 p = sd; 7484 p = sd;
7495 sd = &per_cpu(cpu_domains, i); 7485 sd = &per_cpu(cpu_domains, i).sd;
7496 SD_INIT(sd, SIBLING); 7486 SD_INIT(sd, SIBLING);
7497 set_domain_attribute(sd, attr); 7487 set_domain_attribute(sd, attr);
7498 sd->span = per_cpu(cpu_sibling_map, i); 7488 cpumask_and(sched_domain_span(sd),
7499 cpus_and(sd->span, sd->span, *cpu_map); 7489 &per_cpu(cpu_sibling_map, i), cpu_map);
7500 sd->parent = p; 7490 sd->parent = p;
7501 p->child = sd; 7491 p->child = sd;
7502 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); 7492 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
@@ -7505,13 +7495,10 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7505 7495
7506#ifdef CONFIG_SCHED_SMT 7496#ifdef CONFIG_SCHED_SMT
7507 /* Set up CPU (sibling) groups */ 7497 /* Set up CPU (sibling) groups */
7508 for_each_cpu_mask_nr(i, *cpu_map) { 7498 for_each_cpu(i, cpu_map) {
7509 SCHED_CPUMASK_VAR(this_sibling_map, allmasks); 7499 cpumask_and(this_sibling_map,
7510 SCHED_CPUMASK_VAR(send_covered, allmasks); 7500 &per_cpu(cpu_sibling_map, i), cpu_map);
7511 7501 if (i != cpumask_first(this_sibling_map))
7512 *this_sibling_map = per_cpu(cpu_sibling_map, i);
7513 cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map);
7514 if (i != first_cpu(*this_sibling_map))
7515 continue; 7502 continue;
7516 7503
7517 init_sched_build_groups(this_sibling_map, cpu_map, 7504 init_sched_build_groups(this_sibling_map, cpu_map,
@@ -7522,13 +7509,11 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7522 7509
7523#ifdef CONFIG_SCHED_MC 7510#ifdef CONFIG_SCHED_MC
7524 /* Set up multi-core groups */ 7511 /* Set up multi-core groups */
7525 for_each_cpu_mask_nr(i, *cpu_map) { 7512 for_each_cpu(i, cpu_map) {
7526 SCHED_CPUMASK_VAR(this_core_map, allmasks); 7513 /* FIXME: Use cpu_coregroup_mask */
7527 SCHED_CPUMASK_VAR(send_covered, allmasks);
7528
7529 *this_core_map = cpu_coregroup_map(i); 7514 *this_core_map = cpu_coregroup_map(i);
7530 cpus_and(*this_core_map, *this_core_map, *cpu_map); 7515 cpus_and(*this_core_map, *this_core_map, *cpu_map);
7531 if (i != first_cpu(*this_core_map)) 7516 if (i != cpumask_first(this_core_map))
7532 continue; 7517 continue;
7533 7518
7534 init_sched_build_groups(this_core_map, cpu_map, 7519 init_sched_build_groups(this_core_map, cpu_map,
@@ -7539,12 +7524,10 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7539 7524
7540 /* Set up physical groups */ 7525 /* Set up physical groups */
7541 for (i = 0; i < nr_node_ids; i++) { 7526 for (i = 0; i < nr_node_ids; i++) {
7542 SCHED_CPUMASK_VAR(nodemask, allmasks); 7527 /* FIXME: Use cpumask_of_node */
7543 SCHED_CPUMASK_VAR(send_covered, allmasks);
7544
7545 *nodemask = node_to_cpumask(i); 7528 *nodemask = node_to_cpumask(i);
7546 cpus_and(*nodemask, *nodemask, *cpu_map); 7529 cpus_and(*nodemask, *nodemask, *cpu_map);
7547 if (cpus_empty(*nodemask)) 7530 if (cpumask_empty(nodemask))
7548 continue; 7531 continue;
7549 7532
7550 init_sched_build_groups(nodemask, cpu_map, 7533 init_sched_build_groups(nodemask, cpu_map,
@@ -7555,8 +7538,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7555#ifdef CONFIG_NUMA 7538#ifdef CONFIG_NUMA
7556 /* Set up node groups */ 7539 /* Set up node groups */
7557 if (sd_allnodes) { 7540 if (sd_allnodes) {
7558 SCHED_CPUMASK_VAR(send_covered, allmasks);
7559
7560 init_sched_build_groups(cpu_map, cpu_map, 7541 init_sched_build_groups(cpu_map, cpu_map,
7561 &cpu_to_allnodes_group, 7542 &cpu_to_allnodes_group,
7562 send_covered, tmpmask); 7543 send_covered, tmpmask);
@@ -7565,58 +7546,58 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7565 for (i = 0; i < nr_node_ids; i++) { 7546 for (i = 0; i < nr_node_ids; i++) {
7566 /* Set up node groups */ 7547 /* Set up node groups */
7567 struct sched_group *sg, *prev; 7548 struct sched_group *sg, *prev;
7568 SCHED_CPUMASK_VAR(nodemask, allmasks);
7569 SCHED_CPUMASK_VAR(domainspan, allmasks);
7570 SCHED_CPUMASK_VAR(covered, allmasks);
7571 int j; 7549 int j;
7572 7550
7551 /* FIXME: Use cpumask_of_node */
7573 *nodemask = node_to_cpumask(i); 7552 *nodemask = node_to_cpumask(i);
7574 cpus_clear(*covered); 7553 cpumask_clear(covered);
7575 7554
7576 cpus_and(*nodemask, *nodemask, *cpu_map); 7555 cpus_and(*nodemask, *nodemask, *cpu_map);
7577 if (cpus_empty(*nodemask)) { 7556 if (cpumask_empty(nodemask)) {
7578 sched_group_nodes[i] = NULL; 7557 sched_group_nodes[i] = NULL;
7579 continue; 7558 continue;
7580 } 7559 }
7581 7560
7582 sched_domain_node_span(i, domainspan); 7561 sched_domain_node_span(i, domainspan);
7583 cpus_and(*domainspan, *domainspan, *cpu_map); 7562 cpumask_and(domainspan, domainspan, cpu_map);
7584 7563
7585 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); 7564 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
7565 GFP_KERNEL, i);
7586 if (!sg) { 7566 if (!sg) {
7587 printk(KERN_WARNING "Can not alloc domain group for " 7567 printk(KERN_WARNING "Can not alloc domain group for "
7588 "node %d\n", i); 7568 "node %d\n", i);
7589 goto error; 7569 goto error;
7590 } 7570 }
7591 sched_group_nodes[i] = sg; 7571 sched_group_nodes[i] = sg;
7592 for_each_cpu_mask_nr(j, *nodemask) { 7572 for_each_cpu(j, nodemask) {
7593 struct sched_domain *sd; 7573 struct sched_domain *sd;
7594 7574
7595 sd = &per_cpu(node_domains, j); 7575 sd = &per_cpu(node_domains, j);
7596 sd->groups = sg; 7576 sd->groups = sg;
7597 } 7577 }
7598 sg->__cpu_power = 0; 7578 sg->__cpu_power = 0;
7599 sg->cpumask = *nodemask; 7579 cpumask_copy(sched_group_cpus(sg), nodemask);
7600 sg->next = sg; 7580 sg->next = sg;
7601 cpus_or(*covered, *covered, *nodemask); 7581 cpumask_or(covered, covered, nodemask);
7602 prev = sg; 7582 prev = sg;
7603 7583
7604 for (j = 0; j < nr_node_ids; j++) { 7584 for (j = 0; j < nr_node_ids; j++) {
7605 SCHED_CPUMASK_VAR(notcovered, allmasks);
7606 int n = (i + j) % nr_node_ids; 7585 int n = (i + j) % nr_node_ids;
7586 /* FIXME: Use cpumask_of_node */
7607 node_to_cpumask_ptr(pnodemask, n); 7587 node_to_cpumask_ptr(pnodemask, n);
7608 7588
7609 cpus_complement(*notcovered, *covered); 7589 cpumask_complement(notcovered, covered);
7610 cpus_and(*tmpmask, *notcovered, *cpu_map); 7590 cpumask_and(tmpmask, notcovered, cpu_map);
7611 cpus_and(*tmpmask, *tmpmask, *domainspan); 7591 cpumask_and(tmpmask, tmpmask, domainspan);
7612 if (cpus_empty(*tmpmask)) 7592 if (cpumask_empty(tmpmask))
7613 break; 7593 break;
7614 7594
7615 cpus_and(*tmpmask, *tmpmask, *pnodemask); 7595 cpumask_and(tmpmask, tmpmask, pnodemask);
7616 if (cpus_empty(*tmpmask)) 7596 if (cpumask_empty(tmpmask))
7617 continue; 7597 continue;
7618 7598
7619 sg = kmalloc_node(sizeof(struct sched_group), 7599 sg = kmalloc_node(sizeof(struct sched_group) +
7600 cpumask_size(),
7620 GFP_KERNEL, i); 7601 GFP_KERNEL, i);
7621 if (!sg) { 7602 if (!sg) {
7622 printk(KERN_WARNING 7603 printk(KERN_WARNING
@@ -7624,9 +7605,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7624 goto error; 7605 goto error;
7625 } 7606 }
7626 sg->__cpu_power = 0; 7607 sg->__cpu_power = 0;
7627 sg->cpumask = *tmpmask; 7608 cpumask_copy(sched_group_cpus(sg), tmpmask);
7628 sg->next = prev->next; 7609 sg->next = prev->next;
7629 cpus_or(*covered, *covered, *tmpmask); 7610 cpumask_or(covered, covered, tmpmask);
7630 prev->next = sg; 7611 prev->next = sg;
7631 prev = sg; 7612 prev = sg;
7632 } 7613 }
@@ -7635,22 +7616,22 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7635 7616
7636 /* Calculate CPU power for physical packages and nodes */ 7617 /* Calculate CPU power for physical packages and nodes */
7637#ifdef CONFIG_SCHED_SMT 7618#ifdef CONFIG_SCHED_SMT
7638 for_each_cpu_mask_nr(i, *cpu_map) { 7619 for_each_cpu(i, cpu_map) {
7639 struct sched_domain *sd = &per_cpu(cpu_domains, i); 7620 struct sched_domain *sd = &per_cpu(cpu_domains, i).sd;
7640 7621
7641 init_sched_groups_power(i, sd); 7622 init_sched_groups_power(i, sd);
7642 } 7623 }
7643#endif 7624#endif
7644#ifdef CONFIG_SCHED_MC 7625#ifdef CONFIG_SCHED_MC
7645 for_each_cpu_mask_nr(i, *cpu_map) { 7626 for_each_cpu(i, cpu_map) {
7646 struct sched_domain *sd = &per_cpu(core_domains, i); 7627 struct sched_domain *sd = &per_cpu(core_domains, i).sd;
7647 7628
7648 init_sched_groups_power(i, sd); 7629 init_sched_groups_power(i, sd);
7649 } 7630 }
7650#endif 7631#endif
7651 7632
7652 for_each_cpu_mask_nr(i, *cpu_map) { 7633 for_each_cpu(i, cpu_map) {
7653 struct sched_domain *sd = &per_cpu(phys_domains, i); 7634 struct sched_domain *sd = &per_cpu(phys_domains, i).sd;
7654 7635
7655 init_sched_groups_power(i, sd); 7636 init_sched_groups_power(i, sd);
7656 } 7637 }
@@ -7662,53 +7643,78 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7662 if (sd_allnodes) { 7643 if (sd_allnodes) {
7663 struct sched_group *sg; 7644 struct sched_group *sg;
7664 7645
7665 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg, 7646 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
7666 tmpmask); 7647 tmpmask);
7667 init_numa_sched_groups_power(sg); 7648 init_numa_sched_groups_power(sg);
7668 } 7649 }
7669#endif 7650#endif
7670 7651
7671 /* Attach the domains */ 7652 /* Attach the domains */
7672 for_each_cpu_mask_nr(i, *cpu_map) { 7653 for_each_cpu(i, cpu_map) {
7673 struct sched_domain *sd; 7654 struct sched_domain *sd;
7674#ifdef CONFIG_SCHED_SMT 7655#ifdef CONFIG_SCHED_SMT
7675 sd = &per_cpu(cpu_domains, i); 7656 sd = &per_cpu(cpu_domains, i).sd;
7676#elif defined(CONFIG_SCHED_MC) 7657#elif defined(CONFIG_SCHED_MC)
7677 sd = &per_cpu(core_domains, i); 7658 sd = &per_cpu(core_domains, i).sd;
7678#else 7659#else
7679 sd = &per_cpu(phys_domains, i); 7660 sd = &per_cpu(phys_domains, i).sd;
7680#endif 7661#endif
7681 cpu_attach_domain(sd, rd, i); 7662 cpu_attach_domain(sd, rd, i);
7682 } 7663 }
7683 7664
7684 SCHED_CPUMASK_FREE((void *)allmasks); 7665 err = 0;
7685 return 0; 7666
7667free_tmpmask:
7668 free_cpumask_var(tmpmask);
7669free_send_covered:
7670 free_cpumask_var(send_covered);
7671free_this_core_map:
7672 free_cpumask_var(this_core_map);
7673free_this_sibling_map:
7674 free_cpumask_var(this_sibling_map);
7675free_nodemask:
7676 free_cpumask_var(nodemask);
7677free_notcovered:
7678#ifdef CONFIG_NUMA
7679 free_cpumask_var(notcovered);
7680free_covered:
7681 free_cpumask_var(covered);
7682free_domainspan:
7683 free_cpumask_var(domainspan);
7684out:
7685#endif
7686 return err;
7687
7688free_sched_groups:
7689#ifdef CONFIG_NUMA
7690 kfree(sched_group_nodes);
7691#endif
7692 goto free_tmpmask;
7686 7693
7687#ifdef CONFIG_NUMA 7694#ifdef CONFIG_NUMA
7688error: 7695error:
7689 free_sched_groups(cpu_map, tmpmask); 7696 free_sched_groups(cpu_map, tmpmask);
7690 SCHED_CPUMASK_FREE((void *)allmasks); 7697 free_rootdomain(rd);
7691 kfree(rd); 7698 goto free_tmpmask;
7692 return -ENOMEM;
7693#endif 7699#endif
7694} 7700}
7695 7701
7696static int build_sched_domains(const cpumask_t *cpu_map) 7702static int build_sched_domains(const struct cpumask *cpu_map)
7697{ 7703{
7698 return __build_sched_domains(cpu_map, NULL); 7704 return __build_sched_domains(cpu_map, NULL);
7699} 7705}
7700 7706
7701static cpumask_t *doms_cur; /* current sched domains */ 7707static struct cpumask *doms_cur; /* current sched domains */
7702static int ndoms_cur; /* number of sched domains in 'doms_cur' */ 7708static int ndoms_cur; /* number of sched domains in 'doms_cur' */
7703static struct sched_domain_attr *dattr_cur; 7709static struct sched_domain_attr *dattr_cur;
7704 /* attributes of custom domains in 'doms_cur' */ 7710 /* attributes of custom domains in 'doms_cur' */
7705 7711
7706/* 7712/*
7707 * Special case: If a kmalloc of a doms_cur partition (array of 7713 * Special case: If a kmalloc of a doms_cur partition (array of
7708 * cpumask_t) fails, then fallback to a single sched domain, 7714 * cpumask) fails, then fallback to a single sched domain,
7709 * as determined by the single cpumask_t fallback_doms. 7715 * as determined by the single cpumask fallback_doms.
7710 */ 7716 */
7711static cpumask_t fallback_doms; 7717static cpumask_var_t fallback_doms;
7712 7718
7713void __attribute__((weak)) arch_update_cpu_topology(void) 7719void __attribute__((weak)) arch_update_cpu_topology(void)
7714{ 7720{
@@ -7719,16 +7725,16 @@ void __attribute__((weak)) arch_update_cpu_topology(void)
7719 * For now this just excludes isolated cpus, but could be used to 7725 * For now this just excludes isolated cpus, but could be used to
7720 * exclude other special cases in the future. 7726 * exclude other special cases in the future.
7721 */ 7727 */
7722static int arch_init_sched_domains(const cpumask_t *cpu_map) 7728static int arch_init_sched_domains(const struct cpumask *cpu_map)
7723{ 7729{
7724 int err; 7730 int err;
7725 7731
7726 arch_update_cpu_topology(); 7732 arch_update_cpu_topology();
7727 ndoms_cur = 1; 7733 ndoms_cur = 1;
7728 doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL); 7734 doms_cur = kmalloc(cpumask_size(), GFP_KERNEL);
7729 if (!doms_cur) 7735 if (!doms_cur)
7730 doms_cur = &fallback_doms; 7736 doms_cur = fallback_doms;
7731 cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map); 7737 cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map);
7732 dattr_cur = NULL; 7738 dattr_cur = NULL;
7733 err = build_sched_domains(doms_cur); 7739 err = build_sched_domains(doms_cur);
7734 register_sched_domain_sysctl(); 7740 register_sched_domain_sysctl();
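
arch_init_sched_domains() now falls back to the static fallback_doms mask when the kmalloc() fails, so domain setup can proceed without a heap allocation. A hedged sketch of that fallback pattern; buffer size and names are invented for the example:

    #include <stdlib.h>
    #include <string.h>

    #define MASK_BYTES 128

    static char fallback_mask[MASK_BYTES];      /* always available */

    static char *get_domain_mask(void)
    {
        char *mask = malloc(MASK_BYTES);

        if (!mask)
            mask = fallback_mask;               /* degrade instead of failing */
        memset(mask, 0, MASK_BYTES);
        return mask;
    }

    static void put_domain_mask(char *mask)
    {
        if (mask != fallback_mask)              /* mirrors the doms_cur != fallback_doms test */
            free(mask);
    }

    int main(void)
    {
        char *m = get_domain_mask();
        put_domain_mask(m);
        return 0;
    }
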
@@ -7736,8 +7742,8 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map)
7736 return err; 7742 return err;
7737} 7743}
7738 7744
7739static void arch_destroy_sched_domains(const cpumask_t *cpu_map, 7745static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
7740 cpumask_t *tmpmask) 7746 struct cpumask *tmpmask)
7741{ 7747{
7742 free_sched_groups(cpu_map, tmpmask); 7748 free_sched_groups(cpu_map, tmpmask);
7743} 7749}
@@ -7746,17 +7752,16 @@ static void arch_destroy_sched_domains(const cpumask_t *cpu_map,
7746 * Detach sched domains from a group of cpus specified in cpu_map 7752 * Detach sched domains from a group of cpus specified in cpu_map
7747 * These cpus will now be attached to the NULL domain 7753 * These cpus will now be attached to the NULL domain
7748 */ 7754 */
7749static void detach_destroy_domains(const cpumask_t *cpu_map) 7755static void detach_destroy_domains(const struct cpumask *cpu_map)
7750{ 7756{
7751 cpumask_t tmpmask; 7757 /* Save because hotplug lock held. */
7758 static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
7752 int i; 7759 int i;
7753 7760
7754 unregister_sched_domain_sysctl(); 7761 for_each_cpu(i, cpu_map)
7755
7756 for_each_cpu_mask_nr(i, *cpu_map)
7757 cpu_attach_domain(NULL, &def_root_domain, i); 7762 cpu_attach_domain(NULL, &def_root_domain, i);
7758 synchronize_sched(); 7763 synchronize_sched();
7759 arch_destroy_sched_domains(cpu_map, &tmpmask); 7764 arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
7760} 7765}
7761 7766
7762/* handle null as "default" */ 7767/* handle null as "default" */
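
detach_destroy_domains() now uses a single static bitmap as scratch space instead of a large on-stack cpumask_t; that is only safe because every caller already holds the hotplug lock, so the shared buffer cannot be raced. A userspace sketch of the idea, with a pthread mutex standing in for the hotplug lock:

    #include <pthread.h>
    #include <string.h>

    #define NR_BITS 4096

    static pthread_mutex_t hotplug_lock = PTHREAD_MUTEX_INITIALIZER;
    static unsigned long scratch[NR_BITS / (8 * sizeof(unsigned long))];

    static void detach_domains(void)
    {
        /* Only valid because callers serialize on hotplug_lock. */
        memset(scratch, 0, sizeof(scratch));
        /* ... tear-down work that needs a temporary mask ... */
    }

    int main(void)
    {
        pthread_mutex_lock(&hotplug_lock);
        detach_domains();
        pthread_mutex_unlock(&hotplug_lock);
        return 0;
    }
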
@@ -7781,7 +7786,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7781 * doms_new[] to the current sched domain partitioning, doms_cur[]. 7786 * doms_new[] to the current sched domain partitioning, doms_cur[].
7782 * It destroys each deleted domain and builds each new domain. 7787 * It destroys each deleted domain and builds each new domain.
7783 * 7788 *
7784 * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'. 7789 * 'doms_new' is an array of cpumask's of length 'ndoms_new'.
7785 * The masks don't intersect (don't overlap.) We should setup one 7790 * The masks don't intersect (don't overlap.) We should setup one
7786 * sched domain for each mask. CPUs not in any of the cpumasks will 7791 * sched domain for each mask. CPUs not in any of the cpumasks will
7787 * not be load balanced. If the same cpumask appears both in the 7792 * not be load balanced. If the same cpumask appears both in the
@@ -7795,13 +7800,14 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7795 * the single partition 'fallback_doms', it also forces the domains 7800 * the single partition 'fallback_doms', it also forces the domains
7796 * to be rebuilt. 7801 * to be rebuilt.
7797 * 7802 *
7798 * If doms_new == NULL it will be replaced with cpu_online_map. 7803 * If doms_new == NULL it will be replaced with cpu_online_mask.
7799 * ndoms_new == 0 is a special case for destroying existing domains, 7804 * ndoms_new == 0 is a special case for destroying existing domains,
7800 * and it will not create the default domain. 7805 * and it will not create the default domain.
7801 * 7806 *
7802 * Call with hotplug lock held 7807 * Call with hotplug lock held
7803 */ 7808 */
7804void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, 7809/* FIXME: Change to struct cpumask *doms_new[] */
7810void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
7805 struct sched_domain_attr *dattr_new) 7811 struct sched_domain_attr *dattr_new)
7806{ 7812{
7807 int i, j, n; 7813 int i, j, n;
@@ -7816,7 +7822,7 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
7816 /* Destroy deleted domains */ 7822 /* Destroy deleted domains */
7817 for (i = 0; i < ndoms_cur; i++) { 7823 for (i = 0; i < ndoms_cur; i++) {
7818 for (j = 0; j < n; j++) { 7824 for (j = 0; j < n; j++) {
7819 if (cpus_equal(doms_cur[i], doms_new[j]) 7825 if (cpumask_equal(&doms_cur[i], &doms_new[j])
7820 && dattrs_equal(dattr_cur, i, dattr_new, j)) 7826 && dattrs_equal(dattr_cur, i, dattr_new, j))
7821 goto match1; 7827 goto match1;
7822 } 7828 }
@@ -7828,15 +7834,15 @@ match1:
7828 7834
7829 if (doms_new == NULL) { 7835 if (doms_new == NULL) {
7830 ndoms_cur = 0; 7836 ndoms_cur = 0;
7831 doms_new = &fallback_doms; 7837 doms_new = fallback_doms;
7832 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); 7838 cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map);
7833 dattr_new = NULL; 7839 WARN_ON_ONCE(dattr_new);
7834 } 7840 }
7835 7841
7836 /* Build new domains */ 7842 /* Build new domains */
7837 for (i = 0; i < ndoms_new; i++) { 7843 for (i = 0; i < ndoms_new; i++) {
7838 for (j = 0; j < ndoms_cur; j++) { 7844 for (j = 0; j < ndoms_cur; j++) {
7839 if (cpus_equal(doms_new[i], doms_cur[j]) 7845 if (cpumask_equal(&doms_new[i], &doms_cur[j])
7840 && dattrs_equal(dattr_new, i, dattr_cur, j)) 7846 && dattrs_equal(dattr_new, i, dattr_cur, j))
7841 goto match2; 7847 goto match2;
7842 } 7848 }
@@ -7848,7 +7854,7 @@ match2:
7848 } 7854 }
7849 7855
7850 /* Remember the new sched domains */ 7856 /* Remember the new sched domains */
7851 if (doms_cur != &fallback_doms) 7857 if (doms_cur != fallback_doms)
7852 kfree(doms_cur); 7858 kfree(doms_cur);
7853 kfree(dattr_cur); /* kfree(NULL) is safe */ 7859 kfree(dattr_cur); /* kfree(NULL) is safe */
7854 doms_cur = doms_new; 7860 doms_cur = doms_new;
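
The match1/match2 logic above reconciles the old and new domain partitions: anything in doms_cur without an equal mask in doms_new is destroyed, anything in doms_new without an equal mask in doms_cur is built, and identical entries are left untouched. A small standalone model of that reconciliation, with masks reduced to plain integers:

    #include <stdio.h>

    static void reconcile(const int *cur, int ncur, const int *new, int nnew)
    {
        int i, j;

        for (i = 0; i < ncur; i++) {            /* destroy deleted domains */
            for (j = 0; j < nnew; j++)
                if (cur[i] == new[j])
                    goto match1;
            printf("destroy domain %#x\n", cur[i]);
    match1:
            ;
        }

        for (i = 0; i < nnew; i++) {            /* build genuinely new domains */
            for (j = 0; j < ncur; j++)
                if (new[i] == cur[j])
                    goto match2;
            printf("build domain %#x\n", new[i]);
    match2:
            ;
        }
    }

    int main(void)
    {
        int cur[] = { 0x0f, 0xf0 }, new[] = { 0xf0, 0xff };

        reconcile(cur, 2, new, 2);              /* destroys 0x0f, builds 0xff */
        return 0;
    }
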
@@ -7988,7 +7994,9 @@ static int update_runtime(struct notifier_block *nfb,
7988 7994
7989void __init sched_init_smp(void) 7995void __init sched_init_smp(void)
7990{ 7996{
7991 cpumask_t non_isolated_cpus; 7997 cpumask_var_t non_isolated_cpus;
7998
7999 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
7992 8000
7993#if defined(CONFIG_NUMA) 8001#if defined(CONFIG_NUMA)
7994 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), 8002 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
@@ -7997,10 +8005,10 @@ void __init sched_init_smp(void)
7997#endif 8005#endif
7998 get_online_cpus(); 8006 get_online_cpus();
7999 mutex_lock(&sched_domains_mutex); 8007 mutex_lock(&sched_domains_mutex);
8000 arch_init_sched_domains(&cpu_online_map); 8008 arch_init_sched_domains(cpu_online_mask);
8001 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); 8009 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
8002 if (cpus_empty(non_isolated_cpus)) 8010 if (cpumask_empty(non_isolated_cpus))
8003 cpu_set(smp_processor_id(), non_isolated_cpus); 8011 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
8004 mutex_unlock(&sched_domains_mutex); 8012 mutex_unlock(&sched_domains_mutex);
8005 put_online_cpus(); 8013 put_online_cpus();
8006 8014
@@ -8015,9 +8023,13 @@ void __init sched_init_smp(void)
8015 init_hrtick(); 8023 init_hrtick();
8016 8024
8017 /* Move init over to a non-isolated CPU */ 8025 /* Move init over to a non-isolated CPU */
8018 if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0) 8026 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
8019 BUG(); 8027 BUG();
8020 sched_init_granularity(); 8028 sched_init_granularity();
8029 free_cpumask_var(non_isolated_cpus);
8030
8031 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
8032 init_sched_rt_class();
8021} 8033}
8022#else 8034#else
8023void __init sched_init_smp(void) 8035void __init sched_init_smp(void)
@@ -8332,6 +8344,15 @@ void __init sched_init(void)
8332 */ 8344 */
8333 current->sched_class = &fair_sched_class; 8345 current->sched_class = &fair_sched_class;
8334 8346
8347 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
8348 alloc_bootmem_cpumask_var(&nohz_cpu_mask);
8349#ifdef CONFIG_SMP
8350#ifdef CONFIG_NO_HZ
8351 alloc_bootmem_cpumask_var(&nohz.cpu_mask);
8352#endif
8353 alloc_bootmem_cpumask_var(&cpu_isolated_map);
8354#endif /* SMP */
8355
8335 scheduler_running = 1; 8356 scheduler_running = 1;
8336} 8357}
8337 8358
@@ -8490,7 +8511,7 @@ static
8490int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) 8511int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8491{ 8512{
8492 struct cfs_rq *cfs_rq; 8513 struct cfs_rq *cfs_rq;
8493 struct sched_entity *se, *parent_se; 8514 struct sched_entity *se;
8494 struct rq *rq; 8515 struct rq *rq;
8495 int i; 8516 int i;
8496 8517
@@ -8506,18 +8527,17 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8506 for_each_possible_cpu(i) { 8527 for_each_possible_cpu(i) {
8507 rq = cpu_rq(i); 8528 rq = cpu_rq(i);
8508 8529
8509 cfs_rq = kmalloc_node(sizeof(struct cfs_rq), 8530 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8510 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); 8531 GFP_KERNEL, cpu_to_node(i));
8511 if (!cfs_rq) 8532 if (!cfs_rq)
8512 goto err; 8533 goto err;
8513 8534
8514 se = kmalloc_node(sizeof(struct sched_entity), 8535 se = kzalloc_node(sizeof(struct sched_entity),
8515 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); 8536 GFP_KERNEL, cpu_to_node(i));
8516 if (!se) 8537 if (!se)
8517 goto err; 8538 goto err;
8518 8539
8519 parent_se = parent ? parent->se[i] : NULL; 8540 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
8520 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se);
8521 } 8541 }
8522 8542
8523 return 1; 8543 return 1;
@@ -8578,7 +8598,7 @@ static
8578int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) 8598int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8579{ 8599{
8580 struct rt_rq *rt_rq; 8600 struct rt_rq *rt_rq;
8581 struct sched_rt_entity *rt_se, *parent_se; 8601 struct sched_rt_entity *rt_se;
8582 struct rq *rq; 8602 struct rq *rq;
8583 int i; 8603 int i;
8584 8604
@@ -8595,18 +8615,17 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8595 for_each_possible_cpu(i) { 8615 for_each_possible_cpu(i) {
8596 rq = cpu_rq(i); 8616 rq = cpu_rq(i);
8597 8617
8598 rt_rq = kmalloc_node(sizeof(struct rt_rq), 8618 rt_rq = kzalloc_node(sizeof(struct rt_rq),
8599 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); 8619 GFP_KERNEL, cpu_to_node(i));
8600 if (!rt_rq) 8620 if (!rt_rq)
8601 goto err; 8621 goto err;
8602 8622
8603 rt_se = kmalloc_node(sizeof(struct sched_rt_entity), 8623 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
8604 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); 8624 GFP_KERNEL, cpu_to_node(i));
8605 if (!rt_se) 8625 if (!rt_se)
8606 goto err; 8626 goto err;
8607 8627
8608 parent_se = parent ? parent->rt_se[i] : NULL; 8628 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
8609 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se);
8610 } 8629 }
8611 8630
8612 return 1; 8631 return 1;
@@ -9249,11 +9268,12 @@ struct cgroup_subsys cpu_cgroup_subsys = {
9249 * (balbir@in.ibm.com). 9268 * (balbir@in.ibm.com).
9250 */ 9269 */
9251 9270
9252/* track cpu usage of a group of tasks */ 9271/* track cpu usage of a group of tasks and its child groups */
9253struct cpuacct { 9272struct cpuacct {
9254 struct cgroup_subsys_state css; 9273 struct cgroup_subsys_state css;
9255 /* cpuusage holds pointer to a u64-type object on every cpu */ 9274 /* cpuusage holds pointer to a u64-type object on every cpu */
9256 u64 *cpuusage; 9275 u64 *cpuusage;
9276 struct cpuacct *parent;
9257}; 9277};
9258 9278
9259struct cgroup_subsys cpuacct_subsys; 9279struct cgroup_subsys cpuacct_subsys;
@@ -9287,6 +9307,9 @@ static struct cgroup_subsys_state *cpuacct_create(
9287 return ERR_PTR(-ENOMEM); 9307 return ERR_PTR(-ENOMEM);
9288 } 9308 }
9289 9309
9310 if (cgrp->parent)
9311 ca->parent = cgroup_ca(cgrp->parent);
9312
9290 return &ca->css; 9313 return &ca->css;
9291} 9314}
9292 9315
@@ -9366,14 +9389,16 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
9366static void cpuacct_charge(struct task_struct *tsk, u64 cputime) 9389static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9367{ 9390{
9368 struct cpuacct *ca; 9391 struct cpuacct *ca;
9392 int cpu;
9369 9393
9370 if (!cpuacct_subsys.active) 9394 if (!cpuacct_subsys.active)
9371 return; 9395 return;
9372 9396
9397 cpu = task_cpu(tsk);
9373 ca = task_ca(tsk); 9398 ca = task_ca(tsk);
9374 if (ca) {
9375 u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk));
9376 9399
9400 for (; ca; ca = ca->parent) {
9401 u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
9377 *cpuusage += cputime; 9402 *cpuusage += cputime;
9378 } 9403 }
9379} 9404}
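
The cpuacct changes above make accounting hierarchical: each group records its parent, and cpuacct_charge() walks the whole ancestor chain so a child's CPU time also shows up in every enclosing group. A minimal sketch of that walk; the struct layout here is illustrative, not the kernel's:

    #include <stdio.h>

    struct acct_group {
        const char *name;
        unsigned long long usage;
        struct acct_group *parent;      /* NULL for the root group */
    };

    static void charge(struct acct_group *ca, unsigned long long cputime)
    {
        for (; ca; ca = ca->parent)     /* same loop shape as cpuacct_charge() */
            ca->usage += cputime;
    }

    int main(void)
    {
        struct acct_group root  = { "root",  0, NULL };
        struct acct_group child = { "child", 0, &root };

        charge(&child, 1000);
        printf("%s=%llu %s=%llu\n", child.name, child.usage,
               root.name, root.usage);  /* both report 1000 */
        return 0;
    }
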
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 52154fefab7e..018b7be1db2e 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -67,24 +67,21 @@ static int convert_prio(int prio)
67 * Returns: (int)bool - CPUs were found 67 * Returns: (int)bool - CPUs were found
68 */ 68 */
69int cpupri_find(struct cpupri *cp, struct task_struct *p, 69int cpupri_find(struct cpupri *cp, struct task_struct *p,
70 cpumask_t *lowest_mask) 70 struct cpumask *lowest_mask)
71{ 71{
72 int idx = 0; 72 int idx = 0;
73 int task_pri = convert_prio(p->prio); 73 int task_pri = convert_prio(p->prio);
74 74
75 for_each_cpupri_active(cp->pri_active, idx) { 75 for_each_cpupri_active(cp->pri_active, idx) {
76 struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; 76 struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
77 cpumask_t mask;
78 77
79 if (idx >= task_pri) 78 if (idx >= task_pri)
80 break; 79 break;
81 80
82 cpus_and(mask, p->cpus_allowed, vec->mask); 81 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
83
84 if (cpus_empty(mask))
85 continue; 82 continue;
86 83
87 *lowest_mask = mask; 84 cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
88 return 1; 85 return 1;
89 } 86 }
90 87
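
cpupri_find() no longer builds a temporary mask just to test it: cpumask_any_and() returning a value >= nr_cpu_ids means the two masks share no CPU, and only when they do overlap is the intersection written into the caller-supplied lowest_mask. A toy version of that probe-then-compute idiom, with 64-bit masks and made-up names:

    #include <stdio.h>

    #define NR_CPU_IDS 64

    static int any_and(unsigned long long a, unsigned long long b)
    {
        unsigned long long both = a & b;

        for (int cpu = 0; cpu < NR_CPU_IDS; cpu++)
            if (both & (1ULL << cpu))
                return cpu;             /* some CPU present in both masks */
        return NR_CPU_IDS;              /* empty intersection */
    }

    int main(void)
    {
        unsigned long long allowed = 0x0f, vec = 0x30;

        if (any_and(allowed, vec) >= NR_CPU_IDS)
            printf("skip: no overlap\n");       /* this priority vector can't help */
        else
            printf("lowest_mask = %#llx\n", allowed & vec);
        return 0;
    }
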
@@ -126,7 +123,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
126 vec->count--; 123 vec->count--;
127 if (!vec->count) 124 if (!vec->count)
128 clear_bit(oldpri, cp->pri_active); 125 clear_bit(oldpri, cp->pri_active);
129 cpu_clear(cpu, vec->mask); 126 cpumask_clear_cpu(cpu, vec->mask);
130 127
131 spin_unlock_irqrestore(&vec->lock, flags); 128 spin_unlock_irqrestore(&vec->lock, flags);
132 } 129 }
@@ -136,7 +133,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
136 133
137 spin_lock_irqsave(&vec->lock, flags); 134 spin_lock_irqsave(&vec->lock, flags);
138 135
139 cpu_set(cpu, vec->mask); 136 cpumask_set_cpu(cpu, vec->mask);
140 vec->count++; 137 vec->count++;
141 if (vec->count == 1) 138 if (vec->count == 1)
142 set_bit(newpri, cp->pri_active); 139 set_bit(newpri, cp->pri_active);
@@ -150,10 +147,11 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
150/** 147/**
151 * cpupri_init - initialize the cpupri structure 148 * cpupri_init - initialize the cpupri structure
152 * @cp: The cpupri context 149 * @cp: The cpupri context
150 * @bootmem: true if allocations need to use bootmem
153 * 151 *
154 * Returns: (void) 152 * Returns: -ENOMEM if memory fails.
155 */ 153 */
156void cpupri_init(struct cpupri *cp) 154int cpupri_init(struct cpupri *cp, bool bootmem)
157{ 155{
158 int i; 156 int i;
159 157
@@ -164,11 +162,30 @@ void cpupri_init(struct cpupri *cp)
164 162
165 spin_lock_init(&vec->lock); 163 spin_lock_init(&vec->lock);
166 vec->count = 0; 164 vec->count = 0;
167 cpus_clear(vec->mask); 165 if (bootmem)
166 alloc_bootmem_cpumask_var(&vec->mask);
167 else if (!alloc_cpumask_var(&vec->mask, GFP_KERNEL))
168 goto cleanup;
168 } 169 }
169 170
170 for_each_possible_cpu(i) 171 for_each_possible_cpu(i)
171 cp->cpu_to_pri[i] = CPUPRI_INVALID; 172 cp->cpu_to_pri[i] = CPUPRI_INVALID;
173 return 0;
174
175cleanup:
176 for (i--; i >= 0; i--)
177 free_cpumask_var(cp->pri_to_cpu[i].mask);
178 return -ENOMEM;
172} 179}
173 180
181/**
182 * cpupri_cleanup - clean up the cpupri structure
183 * @cp: The cpupri context
184 */
185void cpupri_cleanup(struct cpupri *cp)
186{
187 int i;
174 188
189 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)
190 free_cpumask_var(cp->pri_to_cpu[i].mask);
191}
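
cpupri_init() can now fail partway through its per-priority allocations, and the cleanup loop frees exactly the entries that were already set up by counting back down from the failing index. The same unwind in a standalone form; the table size here is arbitrary:

    #include <stdlib.h>

    #define NR_ENTRIES 16

    static int init_table(void *vec[NR_ENTRIES])
    {
        int i;

        for (i = 0; i < NR_ENTRIES; i++) {
            vec[i] = malloc(128);
            if (!vec[i])
                goto cleanup;
        }
        return 0;

    cleanup:
        for (i--; i >= 0; i--)          /* free only the successful ones */
            free(vec[i]);
        return -1;                      /* the kernel version returns -ENOMEM */
    }

    int main(void)
    {
        void *vec[NR_ENTRIES];

        if (init_table(vec))
            return 1;
        for (int i = 0; i < NR_ENTRIES; i++)    /* mirrors cpupri_cleanup() */
            free(vec[i]);
        return 0;
    }
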
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
index f25811b0f931..642a94ef8a0a 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched_cpupri.h
@@ -14,7 +14,7 @@
14struct cpupri_vec { 14struct cpupri_vec {
15 spinlock_t lock; 15 spinlock_t lock;
16 int count; 16 int count;
17 cpumask_t mask; 17 cpumask_var_t mask;
18}; 18};
19 19
20struct cpupri { 20struct cpupri {
@@ -27,7 +27,8 @@ struct cpupri {
27int cpupri_find(struct cpupri *cp, 27int cpupri_find(struct cpupri *cp,
28 struct task_struct *p, cpumask_t *lowest_mask); 28 struct task_struct *p, cpumask_t *lowest_mask);
29void cpupri_set(struct cpupri *cp, int cpu, int pri); 29void cpupri_set(struct cpupri *cp, int cpu, int pri);
30void cpupri_init(struct cpupri *cp); 30int cpupri_init(struct cpupri *cp, bool bootmem);
31void cpupri_cleanup(struct cpupri *cp);
31#else 32#else
32#define cpupri_set(cp, cpu, pri) do { } while (0) 33#define cpupri_set(cp, cpu, pri) do { } while (0)
33#define cpupri_init() do { } while (0) 34#define cpupri_init() do { } while (0)
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 26ed8e3d1c15..baf2f17af462 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -53,6 +53,40 @@ static unsigned long nsec_low(unsigned long long nsec)
53 53
54#define SPLIT_NS(x) nsec_high(x), nsec_low(x) 54#define SPLIT_NS(x) nsec_high(x), nsec_low(x)
55 55
56#ifdef CONFIG_FAIR_GROUP_SCHED
57static void print_cfs_group_stats(struct seq_file *m, int cpu,
58 struct task_group *tg)
59{
60 struct sched_entity *se = tg->se[cpu];
61 if (!se)
62 return;
63
64#define P(F) \
65 SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
66#define PN(F) \
67 SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
68
69 PN(se->exec_start);
70 PN(se->vruntime);
71 PN(se->sum_exec_runtime);
72#ifdef CONFIG_SCHEDSTATS
73 PN(se->wait_start);
74 PN(se->sleep_start);
75 PN(se->block_start);
76 PN(se->sleep_max);
77 PN(se->block_max);
78 PN(se->exec_max);
79 PN(se->slice_max);
80 PN(se->wait_max);
81 PN(se->wait_sum);
82 P(se->wait_count);
83#endif
84 P(se->load.weight);
85#undef PN
86#undef P
87}
88#endif
89
56static void 90static void
57print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) 91print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
58{ 92{
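
The new print_cfs_group_stats() leans on the preprocessor's stringizing operator: P(F) and PN(F) print the expression's own text (#F) as the label next to its value, so each field costs one line. A self-contained example of the trick with an invented struct:

    #include <stdio.h>

    struct se_sketch { long long vruntime, exec_start; };

    #define P(F)  printf("  .%-30s: %lld\n", #F, (long long)(F))

    int main(void)
    {
        struct se_sketch se = { .vruntime = 12345, .exec_start = 678 };

        P(se.vruntime);         /* label "se.vruntime" comes from #F */
        P(se.exec_start);
        return 0;
    }
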
@@ -121,14 +155,9 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
121 155
122#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED) 156#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
123 char path[128] = ""; 157 char path[128] = "";
124 struct cgroup *cgroup = NULL;
125 struct task_group *tg = cfs_rq->tg; 158 struct task_group *tg = cfs_rq->tg;
126 159
127 if (tg) 160 cgroup_path(tg->css.cgroup, path, sizeof(path));
128 cgroup = tg->css.cgroup;
129
130 if (cgroup)
131 cgroup_path(cgroup, path, sizeof(path));
132 161
133 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); 162 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
134#else 163#else
@@ -168,6 +197,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
168#ifdef CONFIG_SMP 197#ifdef CONFIG_SMP
169 SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); 198 SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares);
170#endif 199#endif
200 print_cfs_group_stats(m, cpu, cfs_rq->tg);
171#endif 201#endif
172} 202}
173 203
@@ -175,14 +205,9 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
175{ 205{
176#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED) 206#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED)
177 char path[128] = ""; 207 char path[128] = "";
178 struct cgroup *cgroup = NULL;
179 struct task_group *tg = rt_rq->tg; 208 struct task_group *tg = rt_rq->tg;
180 209
181 if (tg) 210 cgroup_path(tg->css.cgroup, path, sizeof(path));
182 cgroup = tg->css.cgroup;
183
184 if (cgroup)
185 cgroup_path(cgroup, path, sizeof(path));
186 211
187 SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path); 212 SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path);
188#else 213#else
@@ -272,7 +297,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
272 u64 now = ktime_to_ns(ktime_get()); 297 u64 now = ktime_to_ns(ktime_get());
273 int cpu; 298 int cpu;
274 299
275 SEQ_printf(m, "Sched Debug Version: v0.07, %s %.*s\n", 300 SEQ_printf(m, "Sched Debug Version: v0.08, %s %.*s\n",
276 init_utsname()->release, 301 init_utsname()->release,
277 (int)strcspn(init_utsname()->version, " "), 302 (int)strcspn(init_utsname()->version, " "),
278 init_utsname()->version); 303 init_utsname()->version);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 98345e45b059..08ffffd4a410 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1017,14 +1017,13 @@ static void yield_task_fair(struct rq *rq)
1017 * search starts with cpus closest then further out as needed, 1017 * search starts with cpus closest then further out as needed,
1018 * so we always favor a closer, idle cpu. 1018 * so we always favor a closer, idle cpu.
1019 * Domains may include CPUs that are not usable for migration, 1019 * Domains may include CPUs that are not usable for migration,
1020 * hence we need to mask them out (cpu_active_map) 1020 * hence we need to mask them out (cpu_active_mask)
1021 * 1021 *
1022 * Returns the CPU we should wake onto. 1022 * Returns the CPU we should wake onto.
1023 */ 1023 */
1024#if defined(ARCH_HAS_SCHED_WAKE_IDLE) 1024#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1025static int wake_idle(int cpu, struct task_struct *p) 1025static int wake_idle(int cpu, struct task_struct *p)
1026{ 1026{
1027 cpumask_t tmp;
1028 struct sched_domain *sd; 1027 struct sched_domain *sd;
1029 int i; 1028 int i;
1030 1029
@@ -1044,10 +1043,9 @@ static int wake_idle(int cpu, struct task_struct *p)
1044 if ((sd->flags & SD_WAKE_IDLE) 1043 if ((sd->flags & SD_WAKE_IDLE)
1045 || ((sd->flags & SD_WAKE_IDLE_FAR) 1044 || ((sd->flags & SD_WAKE_IDLE_FAR)
1046 && !task_hot(p, task_rq(p)->clock, sd))) { 1045 && !task_hot(p, task_rq(p)->clock, sd))) {
1047 cpus_and(tmp, sd->span, p->cpus_allowed); 1046 for_each_cpu_and(i, sched_domain_span(sd),
1048 cpus_and(tmp, tmp, cpu_active_map); 1047 &p->cpus_allowed) {
1049 for_each_cpu_mask_nr(i, tmp) { 1048 if (cpu_active(i) && idle_cpu(i)) {
1050 if (idle_cpu(i)) {
1051 if (i != task_cpu(p)) { 1049 if (i != task_cpu(p)) {
1052 schedstat_inc(p, 1050 schedstat_inc(p,
1053 se.nr_wakeups_idle); 1051 se.nr_wakeups_idle);
@@ -1240,13 +1238,13 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
1240 * this_cpu and prev_cpu are present in: 1238 * this_cpu and prev_cpu are present in:
1241 */ 1239 */
1242 for_each_domain(this_cpu, sd) { 1240 for_each_domain(this_cpu, sd) {
1243 if (cpu_isset(prev_cpu, sd->span)) { 1241 if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) {
1244 this_sd = sd; 1242 this_sd = sd;
1245 break; 1243 break;
1246 } 1244 }
1247 } 1245 }
1248 1246
1249 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) 1247 if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed)))
1250 goto out; 1248 goto out;
1251 1249
1252 /* 1250 /*
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index d9ba9d5f99d6..94aab72f6a02 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -15,7 +15,7 @@ static inline void rt_set_overload(struct rq *rq)
15 if (!rq->online) 15 if (!rq->online)
16 return; 16 return;
17 17
18 cpu_set(rq->cpu, rq->rd->rto_mask); 18 cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
19 /* 19 /*
20 * Make sure the mask is visible before we set 20 * Make sure the mask is visible before we set
21 * the overload count. That is checked to determine 21 * the overload count. That is checked to determine
@@ -34,7 +34,7 @@ static inline void rt_clear_overload(struct rq *rq)
34 34
35 /* the order here really doesn't matter */ 35 /* the order here really doesn't matter */
36 atomic_dec(&rq->rd->rto_count); 36 atomic_dec(&rq->rd->rto_count);
37 cpu_clear(rq->cpu, rq->rd->rto_mask); 37 cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
38} 38}
39 39
40static void update_rt_migration(struct rq *rq) 40static void update_rt_migration(struct rq *rq)
@@ -139,14 +139,14 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se)
139} 139}
140 140
141#ifdef CONFIG_SMP 141#ifdef CONFIG_SMP
142static inline cpumask_t sched_rt_period_mask(void) 142static inline const struct cpumask *sched_rt_period_mask(void)
143{ 143{
144 return cpu_rq(smp_processor_id())->rd->span; 144 return cpu_rq(smp_processor_id())->rd->span;
145} 145}
146#else 146#else
147static inline cpumask_t sched_rt_period_mask(void) 147static inline const struct cpumask *sched_rt_period_mask(void)
148{ 148{
149 return cpu_online_map; 149 return cpu_online_mask;
150} 150}
151#endif 151#endif
152 152
@@ -212,9 +212,9 @@ static inline int rt_rq_throttled(struct rt_rq *rt_rq)
212 return rt_rq->rt_throttled; 212 return rt_rq->rt_throttled;
213} 213}
214 214
215static inline cpumask_t sched_rt_period_mask(void) 215static inline const struct cpumask *sched_rt_period_mask(void)
216{ 216{
217 return cpu_online_map; 217 return cpu_online_mask;
218} 218}
219 219
220static inline 220static inline
@@ -241,11 +241,11 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
241 int i, weight, more = 0; 241 int i, weight, more = 0;
242 u64 rt_period; 242 u64 rt_period;
243 243
244 weight = cpus_weight(rd->span); 244 weight = cpumask_weight(rd->span);
245 245
246 spin_lock(&rt_b->rt_runtime_lock); 246 spin_lock(&rt_b->rt_runtime_lock);
247 rt_period = ktime_to_ns(rt_b->rt_period); 247 rt_period = ktime_to_ns(rt_b->rt_period);
248 for_each_cpu_mask_nr(i, rd->span) { 248 for_each_cpu(i, rd->span) {
249 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); 249 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
250 s64 diff; 250 s64 diff;
251 251
@@ -324,7 +324,7 @@ static void __disable_runtime(struct rq *rq)
324 /* 324 /*
325 * Greedy reclaim, take back as much as we can. 325 * Greedy reclaim, take back as much as we can.
326 */ 326 */
327 for_each_cpu_mask(i, rd->span) { 327 for_each_cpu(i, rd->span) {
328 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); 328 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
329 s64 diff; 329 s64 diff;
330 330
@@ -429,13 +429,13 @@ static inline int balance_runtime(struct rt_rq *rt_rq)
429static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) 429static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
430{ 430{
431 int i, idle = 1; 431 int i, idle = 1;
432 cpumask_t span; 432 const struct cpumask *span;
433 433
434 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) 434 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
435 return 1; 435 return 1;
436 436
437 span = sched_rt_period_mask(); 437 span = sched_rt_period_mask();
438 for_each_cpu_mask(i, span) { 438 for_each_cpu(i, span) {
439 int enqueue = 0; 439 int enqueue = 0;
440 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); 440 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
441 struct rq *rq = rq_of_rt_rq(rt_rq); 441 struct rq *rq = rq_of_rt_rq(rt_rq);
@@ -537,13 +537,13 @@ static void update_curr_rt(struct rq *rq)
537 for_each_sched_rt_entity(rt_se) { 537 for_each_sched_rt_entity(rt_se) {
538 rt_rq = rt_rq_of_se(rt_se); 538 rt_rq = rt_rq_of_se(rt_se);
539 539
540 spin_lock(&rt_rq->rt_runtime_lock);
541 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { 540 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
541 spin_lock(&rt_rq->rt_runtime_lock);
542 rt_rq->rt_time += delta_exec; 542 rt_rq->rt_time += delta_exec;
543 if (sched_rt_runtime_exceeded(rt_rq)) 543 if (sched_rt_runtime_exceeded(rt_rq))
544 resched_task(curr); 544 resched_task(curr);
545 spin_unlock(&rt_rq->rt_runtime_lock);
545 } 546 }
546 spin_unlock(&rt_rq->rt_runtime_lock);
547 } 547 }
548} 548}
549 549
@@ -805,17 +805,20 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
805 805
806static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 806static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
807{ 807{
808 cpumask_t mask; 808 cpumask_var_t mask;
809 809
810 if (rq->curr->rt.nr_cpus_allowed == 1) 810 if (rq->curr->rt.nr_cpus_allowed == 1)
811 return; 811 return;
812 812
813 if (p->rt.nr_cpus_allowed != 1 813 if (!alloc_cpumask_var(&mask, GFP_ATOMIC))
814 && cpupri_find(&rq->rd->cpupri, p, &mask))
815 return; 814 return;
816 815
817 if (!cpupri_find(&rq->rd->cpupri, rq->curr, &mask)) 816 if (p->rt.nr_cpus_allowed != 1
818 return; 817 && cpupri_find(&rq->rd->cpupri, p, mask))
818 goto free;
819
820 if (!cpupri_find(&rq->rd->cpupri, rq->curr, mask))
821 goto free;
819 822
820 /* 823 /*
821 * There appears to be other cpus that can accept 824 * There appears to be other cpus that can accept
@@ -824,6 +827,8 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
824 */ 827 */
825 requeue_task_rt(rq, p, 1); 828 requeue_task_rt(rq, p, 1);
826 resched_task(rq->curr); 829 resched_task(rq->curr);
830free:
831 free_cpumask_var(mask);
827} 832}
828 833
829#endif /* CONFIG_SMP */ 834#endif /* CONFIG_SMP */
@@ -910,14 +915,15 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
910#define RT_MAX_TRIES 3 915#define RT_MAX_TRIES 3
911 916
912static int double_lock_balance(struct rq *this_rq, struct rq *busiest); 917static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
913static void double_unlock_balance(struct rq *this_rq, struct rq *busiest); 918static inline void double_unlock_balance(struct rq *this_rq,
919 struct rq *busiest);
914 920
915static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); 921static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
916 922
917static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) 923static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
918{ 924{
919 if (!task_running(rq, p) && 925 if (!task_running(rq, p) &&
920 (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) && 926 (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) &&
921 (p->rt.nr_cpus_allowed > 1)) 927 (p->rt.nr_cpus_allowed > 1))
922 return 1; 928 return 1;
923 return 0; 929 return 0;
@@ -956,7 +962,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
956 return next; 962 return next;
957} 963}
958 964
959static DEFINE_PER_CPU(cpumask_t, local_cpu_mask); 965static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
960 966
961static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) 967static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
962{ 968{
@@ -976,7 +982,7 @@ static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
976static int find_lowest_rq(struct task_struct *task) 982static int find_lowest_rq(struct task_struct *task)
977{ 983{
978 struct sched_domain *sd; 984 struct sched_domain *sd;
979 cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask); 985 struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
980 int this_cpu = smp_processor_id(); 986 int this_cpu = smp_processor_id();
981 int cpu = task_cpu(task); 987 int cpu = task_cpu(task);
982 988
@@ -991,7 +997,7 @@ static int find_lowest_rq(struct task_struct *task)
991 * I guess we might want to change cpupri_find() to ignore those 997 * I guess we might want to change cpupri_find() to ignore those
992 * in the first place. 998 * in the first place.
993 */ 999 */
994 cpus_and(*lowest_mask, *lowest_mask, cpu_active_map); 1000 cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);
995 1001
996 /* 1002 /*
997 * At this point we have built a mask of cpus representing the 1003 * At this point we have built a mask of cpus representing the
@@ -1001,7 +1007,7 @@ static int find_lowest_rq(struct task_struct *task)
1001 * We prioritize the last cpu that the task executed on since 1007 * We prioritize the last cpu that the task executed on since
1002 * it is most likely cache-hot in that location. 1008 * it is most likely cache-hot in that location.
1003 */ 1009 */
1004 if (cpu_isset(cpu, *lowest_mask)) 1010 if (cpumask_test_cpu(cpu, lowest_mask))
1005 return cpu; 1011 return cpu;
1006 1012
1007 /* 1013 /*
@@ -1016,7 +1022,8 @@ static int find_lowest_rq(struct task_struct *task)
1016 cpumask_t domain_mask; 1022 cpumask_t domain_mask;
1017 int best_cpu; 1023 int best_cpu;
1018 1024
1019 cpus_and(domain_mask, sd->span, *lowest_mask); 1025 cpumask_and(&domain_mask, sched_domain_span(sd),
1026 lowest_mask);
1020 1027
1021 best_cpu = pick_optimal_cpu(this_cpu, 1028 best_cpu = pick_optimal_cpu(this_cpu,
1022 &domain_mask); 1029 &domain_mask);
@@ -1057,8 +1064,8 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1057 * Also make sure that it wasn't scheduled on its rq. 1064 * Also make sure that it wasn't scheduled on its rq.
1058 */ 1065 */
1059 if (unlikely(task_rq(task) != rq || 1066 if (unlikely(task_rq(task) != rq ||
1060 !cpu_isset(lowest_rq->cpu, 1067 !cpumask_test_cpu(lowest_rq->cpu,
1061 task->cpus_allowed) || 1068 &task->cpus_allowed) ||
1062 task_running(rq, task) || 1069 task_running(rq, task) ||
1063 !task->se.on_rq)) { 1070 !task->se.on_rq)) {
1064 1071
@@ -1179,7 +1186,7 @@ static int pull_rt_task(struct rq *this_rq)
1179 1186
1180 next = pick_next_task_rt(this_rq); 1187 next = pick_next_task_rt(this_rq);
1181 1188
1182 for_each_cpu_mask_nr(cpu, this_rq->rd->rto_mask) { 1189 for_each_cpu(cpu, this_rq->rd->rto_mask) {
1183 if (this_cpu == cpu) 1190 if (this_cpu == cpu)
1184 continue; 1191 continue;
1185 1192
@@ -1308,9 +1315,9 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
1308} 1315}
1309 1316
1310static void set_cpus_allowed_rt(struct task_struct *p, 1317static void set_cpus_allowed_rt(struct task_struct *p,
1311 const cpumask_t *new_mask) 1318 const struct cpumask *new_mask)
1312{ 1319{
1313 int weight = cpus_weight(*new_mask); 1320 int weight = cpumask_weight(new_mask);
1314 1321
1315 BUG_ON(!rt_task(p)); 1322 BUG_ON(!rt_task(p));
1316 1323
@@ -1331,7 +1338,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
1331 update_rt_migration(rq); 1338 update_rt_migration(rq);
1332 } 1339 }
1333 1340
1334 p->cpus_allowed = *new_mask; 1341 cpumask_copy(&p->cpus_allowed, new_mask);
1335 p->rt.nr_cpus_allowed = weight; 1342 p->rt.nr_cpus_allowed = weight;
1336} 1343}
1337 1344
@@ -1374,6 +1381,14 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p,
1374 if (!rq->rt.rt_nr_running) 1381 if (!rq->rt.rt_nr_running)
1375 pull_rt_task(rq); 1382 pull_rt_task(rq);
1376} 1383}
1384
1385static inline void init_sched_rt_class(void)
1386{
1387 unsigned int i;
1388
1389 for_each_possible_cpu(i)
1390 alloc_cpumask_var(&per_cpu(local_cpu_mask, i), GFP_KERNEL);
1391}
1377#endif /* CONFIG_SMP */ 1392#endif /* CONFIG_SMP */
1378 1393
1379/* 1394/*
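
Because local_cpu_mask is now a per-CPU cpumask_var_t, each CPU's slot may hold only a pointer, and the new init_sched_rt_class() loop has to allocate every copy once at startup before find_lowest_rq() can use it. A loose userspace analogue of that per-CPU init step, with an array of pointers standing in for the per-CPU storage:

    #include <stdlib.h>

    #define NR_CPUS 8
    #define MASK_BYTES 128

    static void *local_mask[NR_CPUS];   /* stand-in for the per-CPU variable */

    static int init_masks(void)
    {
        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
            local_mask[cpu] = calloc(1, MASK_BYTES);
            if (!local_mask[cpu])
                return -1;
        }
        return 0;
    }

    int main(void)
    {
        if (init_masks())
            return 1;
        /* ... hot paths would then use local_mask[current_cpu] as scratch ... */
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
            free(local_mask[cpu]);
        return 0;
    }
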
@@ -1544,3 +1559,4 @@ static void print_rt_stats(struct seq_file *m, int cpu)
1544 rcu_read_unlock(); 1559 rcu_read_unlock();
1545} 1560}
1546#endif /* CONFIG_SCHED_DEBUG */ 1561#endif /* CONFIG_SCHED_DEBUG */
1562
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 7dbf72a2b02c..ce340835d055 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -42,7 +42,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
42 for_each_domain(cpu, sd) { 42 for_each_domain(cpu, sd) {
43 enum cpu_idle_type itype; 43 enum cpu_idle_type itype;
44 44
45 cpumask_scnprintf(mask_str, mask_len, sd->span); 45 cpumask_scnprintf(mask_str, mask_len,
46 *sched_domain_span(sd));
46 seq_printf(seq, "domain%d %s", dcount++, mask_str); 47 seq_printf(seq, "domain%d %s", dcount++, mask_str);
47 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; 48 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
48 itype++) { 49 itype++) {
diff --git a/kernel/signal.c b/kernel/signal.c
index 4530fc654455..e9afe63da24b 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -41,6 +41,8 @@
41 41
42static struct kmem_cache *sigqueue_cachep; 42static struct kmem_cache *sigqueue_cachep;
43 43
44DEFINE_TRACE(sched_signal_send);
45
44static void __user *sig_handler(struct task_struct *t, int sig) 46static void __user *sig_handler(struct task_struct *t, int sig)
45{ 47{
46 return t->sighand->action[sig - 1].sa.sa_handler; 48 return t->sighand->action[sig - 1].sa.sa_handler;
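
The DEFINE_TRACE(sched_signal_send) line above emits the definition for a tracepoint declared elsewhere (include/trace/sched.h in this series). A hedged sketch of the general pattern, using a made-up tracepoint name and the TPPROTO/TPARGS helpers as described in Documentation/tracepoints.txt; check that file and <linux/tracepoint.h> for the authoritative form:

	#include <linux/tracepoint.h>
	#include <linux/sched.h>

	/* Normally in a shared header: declare the tracepoint ("my_event" is hypothetical). */
	DECLARE_TRACE(my_event,
		TPPROTO(struct task_struct *p, int sig),
		TPARGS(p, sig));

	/* In exactly one .c file (as signal.c does above): emit its definition. */
	DEFINE_TRACE(my_event);

	/* Instrumented code fires the tracepoint; it is a no-op until a probe attaches. */
	static void send_it(struct task_struct *p, int sig)
	{
		trace_my_event(p, sig);
	}

	/* A tracer attaches and detaches a probe at runtime. */
	static void probe_my_event(struct task_struct *p, int sig)
	{
		/* record p->pid and sig into a trace buffer */
	}

	static int __init my_probe_init(void)
	{
		return register_trace_my_event(probe_my_event);
	}

	static void __exit my_probe_exit(void)
	{
		unregister_trace_my_event(probe_my_event);
		tracepoint_synchronize_unregister();
	}
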
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 3953e4aed733..884e6cd2769c 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -164,7 +164,7 @@ unsigned long __read_mostly sysctl_hung_task_check_count = 1024;
164/* 164/*
165 * Zero means infinite timeout - no checking done: 165 * Zero means infinite timeout - no checking done:
166 */ 166 */
167unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; 167unsigned long __read_mostly sysctl_hung_task_timeout_secs = 480;
168 168
169unsigned long __read_mostly sysctl_hung_task_warnings = 10; 169unsigned long __read_mostly sysctl_hung_task_warnings = 10;
170 170
diff --git a/kernel/sys.c b/kernel/sys.c
index 31deba8f7d16..5fc3a0cfb994 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -858,8 +858,8 @@ void do_sys_times(struct tms *tms)
858 struct task_cputime cputime; 858 struct task_cputime cputime;
859 cputime_t cutime, cstime; 859 cputime_t cutime, cstime;
860 860
861 spin_lock_irq(&current->sighand->siglock);
862 thread_group_cputime(current, &cputime); 861 thread_group_cputime(current, &cputime);
862 spin_lock_irq(&current->sighand->siglock);
863 cutime = current->signal->cutime; 863 cutime = current->signal->cutime;
864 cstime = current->signal->cstime; 864 cstime = current->signal->cstime;
865 spin_unlock_irq(&current->sighand->siglock); 865 spin_unlock_irq(&current->sighand->siglock);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 3d56fe7570da..c83f566e940a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -487,6 +487,16 @@ static struct ctl_table kern_table[] = {
487 .proc_handler = &ftrace_enable_sysctl, 487 .proc_handler = &ftrace_enable_sysctl,
488 }, 488 },
489#endif 489#endif
490#ifdef CONFIG_TRACING
491 {
492 .ctl_name = CTL_UNNUMBERED,
493 .procname = "ftrace_dump_on_oops",
494 .data = &ftrace_dump_on_oops,
495 .maxlen = sizeof(int),
496 .mode = 0644,
497 .proc_handler = &proc_dointvec,
498 },
499#endif
490#ifdef CONFIG_MODULES 500#ifdef CONFIG_MODULES
491 { 501 {
492 .ctl_name = KERN_MODPROBE, 502 .ctl_name = KERN_MODPROBE,
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 342fc9ccab46..70f872c71f4e 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -144,7 +144,7 @@ void tick_nohz_update_jiffies(void)
144 if (!ts->tick_stopped) 144 if (!ts->tick_stopped)
145 return; 145 return;
146 146
147 cpu_clear(cpu, nohz_cpu_mask); 147 cpumask_clear_cpu(cpu, nohz_cpu_mask);
148 now = ktime_get(); 148 now = ktime_get();
149 ts->idle_waketime = now; 149 ts->idle_waketime = now;
150 150
@@ -283,7 +283,7 @@ void tick_nohz_stop_sched_tick(int inidle)
283 if ((long)delta_jiffies >= 1) { 283 if ((long)delta_jiffies >= 1) {
284 284
285 if (delta_jiffies > 1) 285 if (delta_jiffies > 1)
286 cpu_set(cpu, nohz_cpu_mask); 286 cpumask_set_cpu(cpu, nohz_cpu_mask);
287 /* 287 /*
288 * nohz_stop_sched_tick can be called several times before 288 * nohz_stop_sched_tick can be called several times before
289 * the nohz_restart_sched_tick is called. This happens when 289 * the nohz_restart_sched_tick is called. This happens when
@@ -296,7 +296,7 @@ void tick_nohz_stop_sched_tick(int inidle)
296 /* 296 /*
297 * sched tick not stopped! 297 * sched tick not stopped!
298 */ 298 */
299 cpu_clear(cpu, nohz_cpu_mask); 299 cpumask_clear_cpu(cpu, nohz_cpu_mask);
300 goto out; 300 goto out;
301 } 301 }
302 302
@@ -354,7 +354,7 @@ void tick_nohz_stop_sched_tick(int inidle)
354 * softirq. 354 * softirq.
355 */ 355 */
356 tick_do_update_jiffies64(ktime_get()); 356 tick_do_update_jiffies64(ktime_get());
357 cpu_clear(cpu, nohz_cpu_mask); 357 cpumask_clear_cpu(cpu, nohz_cpu_mask);
358 } 358 }
359 raise_softirq_irqoff(TIMER_SOFTIRQ); 359 raise_softirq_irqoff(TIMER_SOFTIRQ);
360out: 360out:
@@ -432,7 +432,7 @@ void tick_nohz_restart_sched_tick(void)
432 select_nohz_load_balancer(0); 432 select_nohz_load_balancer(0);
433 now = ktime_get(); 433 now = ktime_get();
434 tick_do_update_jiffies64(now); 434 tick_do_update_jiffies64(now);
435 cpu_clear(cpu, nohz_cpu_mask); 435 cpumask_clear_cpu(cpu, nohz_cpu_mask);
436 436
437 /* 437 /*
438 * We stopped the tick in idle. Update process times would miss the 438 * We stopped the tick in idle. Update process times would miss the
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 33dbefd471e8..9cbf7761f498 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -3,12 +3,25 @@
3# select HAVE_FUNCTION_TRACER: 3# select HAVE_FUNCTION_TRACER:
4# 4#
5 5
6config USER_STACKTRACE_SUPPORT
7 bool
8
6config NOP_TRACER 9config NOP_TRACER
7 bool 10 bool
8 11
9config HAVE_FUNCTION_TRACER 12config HAVE_FUNCTION_TRACER
10 bool 13 bool
11 14
15config HAVE_FUNCTION_RET_TRACER
16 bool
17
18config HAVE_FUNCTION_TRACE_MCOUNT_TEST
19 bool
20 help
21 This gets selected when the arch tests the function_trace_stop
22 variable at the mcount call site. Otherwise, this variable
23 is tested by the called function.
24
12config HAVE_DYNAMIC_FTRACE 25config HAVE_DYNAMIC_FTRACE
13 bool 26 bool
14 27
@@ -47,6 +60,16 @@ config FUNCTION_TRACER
47 (the bootup default), then the overhead of the instructions is very 60 (the bootup default), then the overhead of the instructions is very
48 small and not measurable even in micro-benchmarks. 61 small and not measurable even in micro-benchmarks.
49 62
63config FUNCTION_RET_TRACER
64 bool "Kernel Function return Tracer"
65 depends on HAVE_FUNCTION_RET_TRACER
66 depends on FUNCTION_TRACER
67 help
68 Enable the kernel to trace a function at its return.
 69	 Its first purpose is to trace the duration of functions.
70 This is done by setting the current return address on the thread
71 info structure of the current task.
72
50config IRQSOFF_TRACER 73config IRQSOFF_TRACER
51 bool "Interrupts-off Latency Tracer" 74 bool "Interrupts-off Latency Tracer"
52 default n 75 default n
@@ -138,6 +161,59 @@ config BOOT_TRACER
138 selected, because the self-tests are an initcall as well and that 161 selected, because the self-tests are an initcall as well and that
139 would invalidate the boot trace. ) 162 would invalidate the boot trace. )
140 163
164config TRACE_BRANCH_PROFILING
165 bool "Trace likely/unlikely profiler"
166 depends on DEBUG_KERNEL
167 select TRACING
168 help
169	 This tracer profiles all the likely and unlikely macros
170 in the kernel. It will display the results in:
171
172 /debugfs/tracing/profile_annotated_branch
173
174	 Note: this will add a significant overhead; only turn this
175 on if you need to profile the system's use of these macros.
176
177 Say N if unsure.
178
179config PROFILE_ALL_BRANCHES
180 bool "Profile all if conditionals"
181 depends on TRACE_BRANCH_PROFILING
182 help
183 This tracer profiles all branch conditions. Every if ()
184	 taken in the kernel is recorded, whether it hit or missed.
185 The results will be displayed in:
186
187 /debugfs/tracing/profile_branch
188
189 This configuration, when enabled, will impose a great overhead
190 on the system. This should only be enabled when the system
191	 is to be analyzed.
192
193 Say N if unsure.
194
195config TRACING_BRANCHES
196 bool
197 help
198 Selected by tracers that will trace the likely and unlikely
199 conditions. This prevents the tracers themselves from being
200 profiled. Profiling the tracing infrastructure can only happen
201 when the likelys and unlikelys are not being traced.
202
203config BRANCH_TRACER
204 bool "Trace likely/unlikely instances"
205 depends on TRACE_BRANCH_PROFILING
206 select TRACING_BRANCHES
207 help
208 This traces the events of likely and unlikely condition
209 calls in the kernel. The difference between this and the
210 "Trace likely/unlikely profiler" is that this is not a
211 histogram of the callers, but actually places the calling
212 events into a running trace buffer to see when and where the
213 events happened, as well as their results.
214
215 Say N if unsure.
216
141config STACK_TRACER 217config STACK_TRACER
142 bool "Trace max stack" 218 bool "Trace max stack"
143 depends on HAVE_FUNCTION_TRACER 219 depends on HAVE_FUNCTION_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index c8228b1a49e9..1a8c9259dc69 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -10,6 +10,11 @@ CFLAGS_trace_selftest_dynamic.o = -pg
10obj-y += trace_selftest_dynamic.o 10obj-y += trace_selftest_dynamic.o
11endif 11endif
12 12
13# If unlikely tracing is enabled, do not trace these files
14ifdef CONFIG_TRACING_BRANCHES
15KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
16endif
17
13obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o 18obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
14obj-$(CONFIG_RING_BUFFER) += ring_buffer.o 19obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
15 20
@@ -24,5 +29,7 @@ obj-$(CONFIG_NOP_TRACER) += trace_nop.o
24obj-$(CONFIG_STACK_TRACER) += trace_stack.o 29obj-$(CONFIG_STACK_TRACER) += trace_stack.o
25obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o 30obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
26obj-$(CONFIG_BOOT_TRACER) += trace_boot.o 31obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
32obj-$(CONFIG_FUNCTION_RET_TRACER) += trace_functions_return.o
33obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
27 34
28libftrace-y := ftrace.o 35libftrace-y := ftrace.o
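
The TRACE_BRANCH_PROFILING option and the -DDISABLE_BRANCH_PROFILING flag above cooperate through the likely()/unlikely() macros: when profiling is built in, each call site records hit/miss counts, while files compiled with DISABLE_BRANCH_PROFILING (the tracer's own objects) fall back to plain __builtin_expect. A simplified sketch of that pattern; branch_stat and branch_profile_update() are stand-in names, not the actual compiler.h implementation:

	#if defined(CONFIG_TRACE_BRANCH_PROFILING) && !defined(DISABLE_BRANCH_PROFILING)

	struct branch_stat {
		const char	*func;
		const char	*file;
		unsigned int	line;
		unsigned long	correct;
		unsigned long	incorrect;
	};

	void branch_profile_update(struct branch_stat *s, int cond, int expect);

	#define likely(x) ({						\
		static struct branch_stat ___s = {			\
			.func = __func__,				\
			.file = __FILE__,				\
			.line = __LINE__,				\
		};							\
		int ___r = !!(x);					\
		branch_profile_update(&___s, ___r, 1);	/* expected true */ \
		__builtin_expect(___r, 1);				\
	})

	#else	/* objects built with -DDISABLE_BRANCH_PROFILING */
	#define likely(x)	__builtin_expect(!!(x), 1)
	#endif
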
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 78db083390f0..53042f118f23 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -47,6 +47,12 @@
47int ftrace_enabled __read_mostly; 47int ftrace_enabled __read_mostly;
48static int last_ftrace_enabled; 48static int last_ftrace_enabled;
49 49
50/* Quick disabling of function tracer. */
51int function_trace_stop;
52
53/* By default, current tracing type is normal tracing. */
54enum ftrace_tracing_type_t ftrace_tracing_type = FTRACE_TYPE_ENTER;
55
50/* 56/*
51 * ftrace_disabled is set when an anomaly is discovered. 57 * ftrace_disabled is set when an anomaly is discovered.
52 * ftrace_disabled is much stronger than ftrace_enabled. 58 * ftrace_disabled is much stronger than ftrace_enabled.
@@ -63,6 +69,7 @@ static struct ftrace_ops ftrace_list_end __read_mostly =
63 69
64static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; 70static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end;
65ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; 71ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
72ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
66 73
67static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) 74static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
68{ 75{
@@ -88,8 +95,23 @@ static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
88void clear_ftrace_function(void) 95void clear_ftrace_function(void)
89{ 96{
90 ftrace_trace_function = ftrace_stub; 97 ftrace_trace_function = ftrace_stub;
98 __ftrace_trace_function = ftrace_stub;
91} 99}
92 100
101#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
102/*
103 * For those archs that do not test function_trace_stop in their
104 * mcount call site, we need to do it from C.
105 */
106static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip)
107{
108 if (function_trace_stop)
109 return;
110
111 __ftrace_trace_function(ip, parent_ip);
112}
113#endif
114
93static int __register_ftrace_function(struct ftrace_ops *ops) 115static int __register_ftrace_function(struct ftrace_ops *ops)
94{ 116{
95 /* should not be called from interrupt context */ 117 /* should not be called from interrupt context */
@@ -110,10 +132,18 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
110 * For one func, simply call it directly. 132 * For one func, simply call it directly.
111 * For more than one func, call the chain. 133 * For more than one func, call the chain.
112 */ 134 */
135#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
113 if (ops->next == &ftrace_list_end) 136 if (ops->next == &ftrace_list_end)
114 ftrace_trace_function = ops->func; 137 ftrace_trace_function = ops->func;
115 else 138 else
116 ftrace_trace_function = ftrace_list_func; 139 ftrace_trace_function = ftrace_list_func;
140#else
141 if (ops->next == &ftrace_list_end)
142 __ftrace_trace_function = ops->func;
143 else
144 __ftrace_trace_function = ftrace_list_func;
145 ftrace_trace_function = ftrace_test_stop_func;
146#endif
117 } 147 }
118 148
119 spin_unlock(&ftrace_lock); 149 spin_unlock(&ftrace_lock);
@@ -152,8 +182,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
152 182
153 if (ftrace_enabled) { 183 if (ftrace_enabled) {
154 /* If we only have one func left, then call that directly */ 184 /* If we only have one func left, then call that directly */
155 if (ftrace_list == &ftrace_list_end || 185 if (ftrace_list->next == &ftrace_list_end)
156 ftrace_list->next == &ftrace_list_end)
157 ftrace_trace_function = ftrace_list->func; 186 ftrace_trace_function = ftrace_list->func;
158 } 187 }
159 188
@@ -308,7 +337,7 @@ ftrace_record_ip(unsigned long ip)
308{ 337{
309 struct dyn_ftrace *rec; 338 struct dyn_ftrace *rec;
310 339
311 if (!ftrace_enabled || ftrace_disabled) 340 if (ftrace_disabled)
312 return NULL; 341 return NULL;
313 342
314 rec = ftrace_alloc_dyn_node(ip); 343 rec = ftrace_alloc_dyn_node(ip);
@@ -322,14 +351,58 @@ ftrace_record_ip(unsigned long ip)
322 return rec; 351 return rec;
323} 352}
324 353
325#define FTRACE_ADDR ((long)(ftrace_caller)) 354static void print_ip_ins(const char *fmt, unsigned char *p)
355{
356 int i;
357
358 printk(KERN_CONT "%s", fmt);
359
360 for (i = 0; i < MCOUNT_INSN_SIZE; i++)
361 printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
362}
363
364static void ftrace_bug(int failed, unsigned long ip)
365{
366 switch (failed) {
367 case -EFAULT:
368 FTRACE_WARN_ON_ONCE(1);
369 pr_info("ftrace faulted on modifying ");
370 print_ip_sym(ip);
371 break;
372 case -EINVAL:
373 FTRACE_WARN_ON_ONCE(1);
374 pr_info("ftrace failed to modify ");
375 print_ip_sym(ip);
376 print_ip_ins(" actual: ", (unsigned char *)ip);
377 printk(KERN_CONT "\n");
378 break;
379 case -EPERM:
380 FTRACE_WARN_ON_ONCE(1);
381 pr_info("ftrace faulted on writing ");
382 print_ip_sym(ip);
383 break;
384 default:
385 FTRACE_WARN_ON_ONCE(1);
386 pr_info("ftrace faulted on unknown error ");
387 print_ip_sym(ip);
388 }
389}
390
326 391
327static int 392static int
328__ftrace_replace_code(struct dyn_ftrace *rec, 393__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
329 unsigned char *nop, int enable)
330{ 394{
331 unsigned long ip, fl; 395 unsigned long ip, fl;
332 unsigned char *call, *old, *new; 396 unsigned long ftrace_addr;
397
398#ifdef CONFIG_FUNCTION_RET_TRACER
399 if (ftrace_tracing_type == FTRACE_TYPE_ENTER)
400 ftrace_addr = (unsigned long)ftrace_caller;
401 else
402 ftrace_addr = (unsigned long)ftrace_return_caller;
403#else
404 ftrace_addr = (unsigned long)ftrace_caller;
405#endif
333 406
334 ip = rec->ip; 407 ip = rec->ip;
335 408
@@ -388,34 +461,28 @@ __ftrace_replace_code(struct dyn_ftrace *rec,
388 } 461 }
389 } 462 }
390 463
391 call = ftrace_call_replace(ip, FTRACE_ADDR); 464 if (rec->flags & FTRACE_FL_ENABLED)
392 465 return ftrace_make_call(rec, ftrace_addr);
393 if (rec->flags & FTRACE_FL_ENABLED) { 466 else
394 old = nop; 467 return ftrace_make_nop(NULL, rec, ftrace_addr);
395 new = call;
396 } else {
397 old = call;
398 new = nop;
399 }
400
401 return ftrace_modify_code(ip, old, new);
402} 468}
403 469
404static void ftrace_replace_code(int enable) 470static void ftrace_replace_code(int enable)
405{ 471{
406 int i, failed; 472 int i, failed;
407 unsigned char *nop = NULL;
408 struct dyn_ftrace *rec; 473 struct dyn_ftrace *rec;
409 struct ftrace_page *pg; 474 struct ftrace_page *pg;
410 475
411 nop = ftrace_nop_replace();
412
413 for (pg = ftrace_pages_start; pg; pg = pg->next) { 476 for (pg = ftrace_pages_start; pg; pg = pg->next) {
414 for (i = 0; i < pg->index; i++) { 477 for (i = 0; i < pg->index; i++) {
415 rec = &pg->records[i]; 478 rec = &pg->records[i];
416 479
417 /* don't modify code that has already faulted */ 480 /*
418 if (rec->flags & FTRACE_FL_FAILED) 481 * Skip over free records and records that have
482 * failed.
483 */
484 if (rec->flags & FTRACE_FL_FREE ||
485 rec->flags & FTRACE_FL_FAILED)
419 continue; 486 continue;
420 487
421 /* ignore updates to this record's mcount site */ 488 /* ignore updates to this record's mcount site */
@@ -426,68 +493,30 @@ static void ftrace_replace_code(int enable)
426 unfreeze_record(rec); 493 unfreeze_record(rec);
427 } 494 }
428 495
429 failed = __ftrace_replace_code(rec, nop, enable); 496 failed = __ftrace_replace_code(rec, enable);
430 if (failed && (rec->flags & FTRACE_FL_CONVERTED)) { 497 if (failed && (rec->flags & FTRACE_FL_CONVERTED)) {
431 rec->flags |= FTRACE_FL_FAILED; 498 rec->flags |= FTRACE_FL_FAILED;
432 if ((system_state == SYSTEM_BOOTING) || 499 if ((system_state == SYSTEM_BOOTING) ||
433 !core_kernel_text(rec->ip)) { 500 !core_kernel_text(rec->ip)) {
434 ftrace_free_rec(rec); 501 ftrace_free_rec(rec);
435 } 502 } else
503 ftrace_bug(failed, rec->ip);
436 } 504 }
437 } 505 }
438 } 506 }
439} 507}
440 508
441static void print_ip_ins(const char *fmt, unsigned char *p)
442{
443 int i;
444
445 printk(KERN_CONT "%s", fmt);
446
447 for (i = 0; i < MCOUNT_INSN_SIZE; i++)
448 printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
449}
450
451static int 509static int
452ftrace_code_disable(struct dyn_ftrace *rec) 510ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
453{ 511{
454 unsigned long ip; 512 unsigned long ip;
455 unsigned char *nop, *call;
456 int ret; 513 int ret;
457 514
458 ip = rec->ip; 515 ip = rec->ip;
459 516
460 nop = ftrace_nop_replace(); 517 ret = ftrace_make_nop(mod, rec, mcount_addr);
461 call = ftrace_call_replace(ip, mcount_addr);
462
463 ret = ftrace_modify_code(ip, call, nop);
464 if (ret) { 518 if (ret) {
465 switch (ret) { 519 ftrace_bug(ret, ip);
466 case -EFAULT:
467 FTRACE_WARN_ON_ONCE(1);
468 pr_info("ftrace faulted on modifying ");
469 print_ip_sym(ip);
470 break;
471 case -EINVAL:
472 FTRACE_WARN_ON_ONCE(1);
473 pr_info("ftrace failed to modify ");
474 print_ip_sym(ip);
475 print_ip_ins(" expected: ", call);
476 print_ip_ins(" actual: ", (unsigned char *)ip);
477 print_ip_ins(" replace: ", nop);
478 printk(KERN_CONT "\n");
479 break;
480 case -EPERM:
481 FTRACE_WARN_ON_ONCE(1);
482 pr_info("ftrace faulted on writing ");
483 print_ip_sym(ip);
484 break;
485 default:
486 FTRACE_WARN_ON_ONCE(1);
487 pr_info("ftrace faulted on unknown error ");
488 print_ip_sym(ip);
489 }
490
491 rec->flags |= FTRACE_FL_FAILED; 520 rec->flags |= FTRACE_FL_FAILED;
492 return 0; 521 return 0;
493 } 522 }
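
Both __ftrace_replace_code() and ftrace_code_disable() now delegate the actual text patching to per-architecture primitives instead of assembling old/new instruction bytes in generic code. A sketch of the contract this assumes (see include/linux/ftrace.h in this series for the authoritative declarations):

	/* Replace the recorded mcount call at rec->ip with a NOP.  "mod" is
	 * non-NULL while a module is being loaded, so the arch can patch
	 * text that is not yet in its final mapping. */
	int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec,
			    unsigned long addr);

	/* Replace the NOP at rec->ip with a call to "addr" (ftrace_caller,
	 * or ftrace_return_caller when return tracing is active). */
	int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr);

	/* Both return 0 on success or a -E code (-EFAULT, -EINVAL, -EPERM)
	 * that ftrace_bug() above knows how to report. */
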
@@ -515,7 +544,7 @@ static void ftrace_run_update_code(int command)
515} 544}
516 545
517static ftrace_func_t saved_ftrace_func; 546static ftrace_func_t saved_ftrace_func;
518static int ftrace_start; 547static int ftrace_start_up;
519static DEFINE_MUTEX(ftrace_start_lock); 548static DEFINE_MUTEX(ftrace_start_lock);
520 549
521static void ftrace_startup(void) 550static void ftrace_startup(void)
@@ -526,7 +555,7 @@ static void ftrace_startup(void)
526 return; 555 return;
527 556
528 mutex_lock(&ftrace_start_lock); 557 mutex_lock(&ftrace_start_lock);
529 ftrace_start++; 558 ftrace_start_up++;
530 command |= FTRACE_ENABLE_CALLS; 559 command |= FTRACE_ENABLE_CALLS;
531 560
532 if (saved_ftrace_func != ftrace_trace_function) { 561 if (saved_ftrace_func != ftrace_trace_function) {
@@ -550,8 +579,8 @@ static void ftrace_shutdown(void)
550 return; 579 return;
551 580
552 mutex_lock(&ftrace_start_lock); 581 mutex_lock(&ftrace_start_lock);
553 ftrace_start--; 582 ftrace_start_up--;
554 if (!ftrace_start) 583 if (!ftrace_start_up)
555 command |= FTRACE_DISABLE_CALLS; 584 command |= FTRACE_DISABLE_CALLS;
556 585
557 if (saved_ftrace_func != ftrace_trace_function) { 586 if (saved_ftrace_func != ftrace_trace_function) {
@@ -577,8 +606,8 @@ static void ftrace_startup_sysctl(void)
577 mutex_lock(&ftrace_start_lock); 606 mutex_lock(&ftrace_start_lock);
578 /* Force update next time */ 607 /* Force update next time */
579 saved_ftrace_func = NULL; 608 saved_ftrace_func = NULL;
580 /* ftrace_start is true if we want ftrace running */ 609 /* ftrace_start_up is true if we want ftrace running */
581 if (ftrace_start) 610 if (ftrace_start_up)
582 command |= FTRACE_ENABLE_CALLS; 611 command |= FTRACE_ENABLE_CALLS;
583 612
584 ftrace_run_update_code(command); 613 ftrace_run_update_code(command);
@@ -593,8 +622,8 @@ static void ftrace_shutdown_sysctl(void)
593 return; 622 return;
594 623
595 mutex_lock(&ftrace_start_lock); 624 mutex_lock(&ftrace_start_lock);
596 /* ftrace_start is true if ftrace is running */ 625 /* ftrace_start_up is true if ftrace is running */
597 if (ftrace_start) 626 if (ftrace_start_up)
598 command |= FTRACE_DISABLE_CALLS; 627 command |= FTRACE_DISABLE_CALLS;
599 628
600 ftrace_run_update_code(command); 629 ftrace_run_update_code(command);
@@ -605,7 +634,7 @@ static cycle_t ftrace_update_time;
605static unsigned long ftrace_update_cnt; 634static unsigned long ftrace_update_cnt;
606unsigned long ftrace_update_tot_cnt; 635unsigned long ftrace_update_tot_cnt;
607 636
608static int ftrace_update_code(void) 637static int ftrace_update_code(struct module *mod)
609{ 638{
610 struct dyn_ftrace *p, *t; 639 struct dyn_ftrace *p, *t;
611 cycle_t start, stop; 640 cycle_t start, stop;
@@ -622,7 +651,7 @@ static int ftrace_update_code(void)
622 list_del_init(&p->list); 651 list_del_init(&p->list);
623 652
624 /* convert record (i.e, patch mcount-call with NOP) */ 653 /* convert record (i.e, patch mcount-call with NOP) */
625 if (ftrace_code_disable(p)) { 654 if (ftrace_code_disable(mod, p)) {
626 p->flags |= FTRACE_FL_CONVERTED; 655 p->flags |= FTRACE_FL_CONVERTED;
627 ftrace_update_cnt++; 656 ftrace_update_cnt++;
628 } else 657 } else
@@ -1181,7 +1210,7 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
1181 1210
1182 mutex_lock(&ftrace_sysctl_lock); 1211 mutex_lock(&ftrace_sysctl_lock);
1183 mutex_lock(&ftrace_start_lock); 1212 mutex_lock(&ftrace_start_lock);
1184 if (ftrace_start && ftrace_enabled) 1213 if (ftrace_start_up && ftrace_enabled)
1185 ftrace_run_update_code(FTRACE_ENABLE_CALLS); 1214 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
1186 mutex_unlock(&ftrace_start_lock); 1215 mutex_unlock(&ftrace_start_lock);
1187 mutex_unlock(&ftrace_sysctl_lock); 1216 mutex_unlock(&ftrace_sysctl_lock);
@@ -1268,7 +1297,8 @@ static __init int ftrace_init_debugfs(void)
1268 1297
1269fs_initcall(ftrace_init_debugfs); 1298fs_initcall(ftrace_init_debugfs);
1270 1299
1271static int ftrace_convert_nops(unsigned long *start, 1300static int ftrace_convert_nops(struct module *mod,
1301 unsigned long *start,
1272 unsigned long *end) 1302 unsigned long *end)
1273{ 1303{
1274 unsigned long *p; 1304 unsigned long *p;
@@ -1279,23 +1309,32 @@ static int ftrace_convert_nops(unsigned long *start,
1279 p = start; 1309 p = start;
1280 while (p < end) { 1310 while (p < end) {
1281 addr = ftrace_call_adjust(*p++); 1311 addr = ftrace_call_adjust(*p++);
1312 /*
1313 * Some architecture linkers will pad between
1314 * the different mcount_loc sections of different
1315 * object files to satisfy alignments.
1316 * Skip any NULL pointers.
1317 */
1318 if (!addr)
1319 continue;
1282 ftrace_record_ip(addr); 1320 ftrace_record_ip(addr);
1283 } 1321 }
1284 1322
1285 /* disable interrupts to prevent kstop machine */ 1323 /* disable interrupts to prevent kstop machine */
1286 local_irq_save(flags); 1324 local_irq_save(flags);
1287 ftrace_update_code(); 1325 ftrace_update_code(mod);
1288 local_irq_restore(flags); 1326 local_irq_restore(flags);
1289 mutex_unlock(&ftrace_start_lock); 1327 mutex_unlock(&ftrace_start_lock);
1290 1328
1291 return 0; 1329 return 0;
1292} 1330}
1293 1331
1294void ftrace_init_module(unsigned long *start, unsigned long *end) 1332void ftrace_init_module(struct module *mod,
1333 unsigned long *start, unsigned long *end)
1295{ 1334{
1296 if (ftrace_disabled || start == end) 1335 if (ftrace_disabled || start == end)
1297 return; 1336 return;
1298 ftrace_convert_nops(start, end); 1337 ftrace_convert_nops(mod, start, end);
1299} 1338}
1300 1339
1301extern unsigned long __start_mcount_loc[]; 1340extern unsigned long __start_mcount_loc[];
@@ -1325,7 +1364,8 @@ void __init ftrace_init(void)
1325 1364
1326 last_ftrace_enabled = ftrace_enabled = 1; 1365 last_ftrace_enabled = ftrace_enabled = 1;
1327 1366
1328 ret = ftrace_convert_nops(__start_mcount_loc, 1367 ret = ftrace_convert_nops(NULL,
1368 __start_mcount_loc,
1329 __stop_mcount_loc); 1369 __stop_mcount_loc);
1330 1370
1331 return; 1371 return;
@@ -1381,10 +1421,17 @@ int register_ftrace_function(struct ftrace_ops *ops)
1381 return -1; 1421 return -1;
1382 1422
1383 mutex_lock(&ftrace_sysctl_lock); 1423 mutex_lock(&ftrace_sysctl_lock);
1424
1425 if (ftrace_tracing_type == FTRACE_TYPE_RETURN) {
1426 ret = -EBUSY;
1427 goto out;
1428 }
1429
1384 ret = __register_ftrace_function(ops); 1430 ret = __register_ftrace_function(ops);
1385 ftrace_startup(); 1431 ftrace_startup();
1386 mutex_unlock(&ftrace_sysctl_lock);
1387 1432
1433out:
1434 mutex_unlock(&ftrace_sysctl_lock);
1388 return ret; 1435 return ret;
1389} 1436}
1390 1437
@@ -1449,3 +1496,147 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
1449 return ret; 1496 return ret;
1450} 1497}
1451 1498
1499#ifdef CONFIG_FUNCTION_RET_TRACER
1500
1501static atomic_t ftrace_retfunc_active;
1502
1503/* The callback that hooks the return of a function */
1504trace_function_return_t ftrace_function_return =
1505 (trace_function_return_t)ftrace_stub;
1506
1507
1508/* Try to assign a return stack array to FTRACE_RETSTACK_ALLOC_SIZE tasks. */
1509static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
1510{
1511 int i;
1512 int ret = 0;
1513 unsigned long flags;
1514 int start = 0, end = FTRACE_RETSTACK_ALLOC_SIZE;
1515 struct task_struct *g, *t;
1516
1517 for (i = 0; i < FTRACE_RETSTACK_ALLOC_SIZE; i++) {
1518 ret_stack_list[i] = kmalloc(FTRACE_RETFUNC_DEPTH
1519 * sizeof(struct ftrace_ret_stack),
1520 GFP_KERNEL);
1521 if (!ret_stack_list[i]) {
1522 start = 0;
1523 end = i;
1524 ret = -ENOMEM;
1525 goto free;
1526 }
1527 }
1528
1529 read_lock_irqsave(&tasklist_lock, flags);
1530 do_each_thread(g, t) {
1531 if (start == end) {
1532 ret = -EAGAIN;
1533 goto unlock;
1534 }
1535
1536 if (t->ret_stack == NULL) {
1537 t->ret_stack = ret_stack_list[start++];
1538 t->curr_ret_stack = -1;
1539 atomic_set(&t->trace_overrun, 0);
1540 }
1541 } while_each_thread(g, t);
1542
1543unlock:
1544 read_unlock_irqrestore(&tasklist_lock, flags);
1545free:
1546 for (i = start; i < end; i++)
1547 kfree(ret_stack_list[i]);
1548 return ret;
1549}
1550
1551/* Allocate a return stack for each task */
1552static int start_return_tracing(void)
1553{
1554 struct ftrace_ret_stack **ret_stack_list;
1555 int ret;
1556
1557 ret_stack_list = kmalloc(FTRACE_RETSTACK_ALLOC_SIZE *
1558 sizeof(struct ftrace_ret_stack *),
1559 GFP_KERNEL);
1560
1561 if (!ret_stack_list)
1562 return -ENOMEM;
1563
1564 do {
1565 ret = alloc_retstack_tasklist(ret_stack_list);
1566 } while (ret == -EAGAIN);
1567
1568 kfree(ret_stack_list);
1569 return ret;
1570}
1571
1572int register_ftrace_return(trace_function_return_t func)
1573{
1574 int ret = 0;
1575
1576 mutex_lock(&ftrace_sysctl_lock);
1577
1578 /*
1579 * Don't launch return tracing if normal function
1580 * tracing is already running.
1581 */
1582 if (ftrace_trace_function != ftrace_stub) {
1583 ret = -EBUSY;
1584 goto out;
1585 }
1586 atomic_inc(&ftrace_retfunc_active);
1587 ret = start_return_tracing();
1588 if (ret) {
1589 atomic_dec(&ftrace_retfunc_active);
1590 goto out;
1591 }
1592 ftrace_tracing_type = FTRACE_TYPE_RETURN;
1593 ftrace_function_return = func;
1594 ftrace_startup();
1595
1596out:
1597 mutex_unlock(&ftrace_sysctl_lock);
1598 return ret;
1599}
1600
1601void unregister_ftrace_return(void)
1602{
1603 mutex_lock(&ftrace_sysctl_lock);
1604
1605 atomic_dec(&ftrace_retfunc_active);
1606 ftrace_function_return = (trace_function_return_t)ftrace_stub;
1607 ftrace_shutdown();
1608 /* Restore normal tracing type */
1609 ftrace_tracing_type = FTRACE_TYPE_ENTER;
1610
1611 mutex_unlock(&ftrace_sysctl_lock);
1612}
1613
1614/* Allocate a return stack for a newly created task */
1615void ftrace_retfunc_init_task(struct task_struct *t)
1616{
1617 if (atomic_read(&ftrace_retfunc_active)) {
1618 t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
1619 * sizeof(struct ftrace_ret_stack),
1620 GFP_KERNEL);
1621 if (!t->ret_stack)
1622 return;
1623 t->curr_ret_stack = -1;
1624 atomic_set(&t->trace_overrun, 0);
1625 } else
1626 t->ret_stack = NULL;
1627}
1628
1629void ftrace_retfunc_exit_task(struct task_struct *t)
1630{
1631 struct ftrace_ret_stack *ret_stack = t->ret_stack;
1632
1633 t->ret_stack = NULL;
1634 /* NULL must become visible to IRQs before we free it: */
1635 barrier();
1636
1637 kfree(ret_stack);
1638}
1639#endif
1640
1641
1642
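
For orientation, a hedged sketch of how a tracer would consume the return hook registered above. The callback type (trace_function_return_t) and its ftrace_retfunc payload are declared in include/linux/ftrace.h in this series; the field names below are quoted from memory and should be checked against that header:

	#include <linux/ftrace.h>
	#include <linux/init.h>

	/* Probe invoked on every traced function return (sketch). */
	static void probe_return(struct ftrace_retfunc *trace)
	{
		/* Assumed fields: func (traced function), ret (original
		 * return address), calltime/rettime (entry/exit stamps). */
		unsigned long long duration = trace->rettime - trace->calltime;

		(void)duration;	/* e.g. feed a histogram or the ring buffer */
	}

	static int __init return_probe_init(void)
	{
		/* Returns -EBUSY if plain function tracing already owns mcount. */
		return register_ftrace_return(probe_return);
	}

	static void __exit return_probe_exit(void)
	{
		unregister_ftrace_return();
	}
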
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 668bbb5ef2bd..e206951603c1 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -18,8 +18,46 @@
18 18
19#include "trace.h" 19#include "trace.h"
20 20
21/* Global flag to disable all recording to ring buffers */ 21/*
22static int ring_buffers_off __read_mostly; 22 * A fast way to enable or disable all ring buffers is to
23 * call tracing_on or tracing_off. Turning off the ring buffers
24 * prevents all ring buffers from being recorded to.
25 * Turning this switch on makes it OK to write to the
26 * ring buffer, if the ring buffer is enabled itself.
27 *
28 * There are three layers that must be on in order to write
29 * to the ring buffer.
30 *
31 * 1) This global flag must be set.
32 * 2) The ring buffer must be enabled for recording.
33 * 3) The per cpu buffer must be enabled for recording.
34 *
35 * In case of an anomaly, this global flag has a bit set that
36 * will permanently disable all ring buffers.
37 */
38
39/*
40 * Global flag to disable all recording to ring buffers
41 * This has two bits: ON, DISABLED
42 *
43 * ON DISABLED
44 * ---- ----------
45 * 0 0 : ring buffers are off
46 * 1 0 : ring buffers are on
47 * X 1 : ring buffers are permanently disabled
48 */
49
50enum {
51 RB_BUFFERS_ON_BIT = 0,
52 RB_BUFFERS_DISABLED_BIT = 1,
53};
54
55enum {
56 RB_BUFFERS_ON = 1 << RB_BUFFERS_ON_BIT,
57 RB_BUFFERS_DISABLED = 1 << RB_BUFFERS_DISABLED_BIT,
58};
59
60static long ring_buffer_flags __read_mostly = RB_BUFFERS_ON;
23 61
24/** 62/**
25 * tracing_on - enable all tracing buffers 63 * tracing_on - enable all tracing buffers
@@ -29,7 +67,7 @@ static int ring_buffers_off __read_mostly;
29 */ 67 */
30void tracing_on(void) 68void tracing_on(void)
31{ 69{
32 ring_buffers_off = 0; 70 set_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags);
33} 71}
34 72
35/** 73/**
@@ -42,9 +80,22 @@ void tracing_on(void)
42 */ 80 */
43void tracing_off(void) 81void tracing_off(void)
44{ 82{
45 ring_buffers_off = 1; 83 clear_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags);
84}
85
86/**
87 * tracing_off_permanent - permanently disable ring buffers
88 *
89 * This function, once called, will disable all ring buffers
90 * permanently.
91 */
92void tracing_off_permanent(void)
93{
94 set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags);
46} 95}
47 96
97#include "trace.h"
98
48/* Up this if you want to test the TIME_EXTENTS and normalization */ 99/* Up this if you want to test the TIME_EXTENTS and normalization */
49#define DEBUG_SHIFT 0 100#define DEBUG_SHIFT 0
50 101
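
The comment block and the tracing_on()/tracing_off()/tracing_off_permanent() helpers above describe three gates that must all be open before an event is recorded. A condensed sketch of the checks a writer performs, using the field names that appear elsewhere in this file; the real checks are inlined in ring_buffer_lock_reserve() and ring_buffer_write():

	static int can_record(struct ring_buffer *buffer,
			      struct ring_buffer_per_cpu *cpu_buffer)
	{
		/* 1) global switch: ON bit set, DISABLED bit clear */
		if (ring_buffer_flags != RB_BUFFERS_ON)
			return 0;

		/* 2) this ring buffer enabled for recording */
		if (atomic_read(&buffer->record_disabled))
			return 0;

		/* 3) this per-cpu buffer enabled for recording */
		if (atomic_read(&cpu_buffer->record_disabled))
			return 0;

		return 1;
	}
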
@@ -187,7 +238,8 @@ static inline int test_time_stamp(u64 delta)
187struct ring_buffer_per_cpu { 238struct ring_buffer_per_cpu {
188 int cpu; 239 int cpu;
189 struct ring_buffer *buffer; 240 struct ring_buffer *buffer;
190 spinlock_t lock; 241 spinlock_t reader_lock; /* serialize readers */
242 raw_spinlock_t lock;
191 struct lock_class_key lock_key; 243 struct lock_class_key lock_key;
192 struct list_head pages; 244 struct list_head pages;
193 struct buffer_page *head_page; /* read from head */ 245 struct buffer_page *head_page; /* read from head */
@@ -221,32 +273,16 @@ struct ring_buffer_iter {
221 u64 read_stamp; 273 u64 read_stamp;
222}; 274};
223 275
276/* buffer may be either ring_buffer or ring_buffer_per_cpu */
224#define RB_WARN_ON(buffer, cond) \ 277#define RB_WARN_ON(buffer, cond) \
225 do { \ 278 ({ \
226 if (unlikely(cond)) { \ 279 int _____ret = unlikely(cond); \
227 atomic_inc(&buffer->record_disabled); \ 280 if (_____ret) { \
228 WARN_ON(1); \
229 } \
230 } while (0)
231
232#define RB_WARN_ON_RET(buffer, cond) \
233 do { \
234 if (unlikely(cond)) { \
235 atomic_inc(&buffer->record_disabled); \
236 WARN_ON(1); \
237 return -1; \
238 } \
239 } while (0)
240
241#define RB_WARN_ON_ONCE(buffer, cond) \
242 do { \
243 static int once; \
244 if (unlikely(cond) && !once) { \
245 once++; \
246 atomic_inc(&buffer->record_disabled); \ 281 atomic_inc(&buffer->record_disabled); \
247 WARN_ON(1); \ 282 WARN_ON(1); \
248 } \ 283 } \
249 } while (0) 284 _____ret; \
285 })
250 286
251/** 287/**
252 * check_pages - integrity check of buffer pages 288 * check_pages - integrity check of buffer pages
@@ -260,14 +296,18 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
260 struct list_head *head = &cpu_buffer->pages; 296 struct list_head *head = &cpu_buffer->pages;
261 struct buffer_page *page, *tmp; 297 struct buffer_page *page, *tmp;
262 298
263 RB_WARN_ON_RET(cpu_buffer, head->next->prev != head); 299 if (RB_WARN_ON(cpu_buffer, head->next->prev != head))
264 RB_WARN_ON_RET(cpu_buffer, head->prev->next != head); 300 return -1;
301 if (RB_WARN_ON(cpu_buffer, head->prev->next != head))
302 return -1;
265 303
266 list_for_each_entry_safe(page, tmp, head, list) { 304 list_for_each_entry_safe(page, tmp, head, list) {
267 RB_WARN_ON_RET(cpu_buffer, 305 if (RB_WARN_ON(cpu_buffer,
268 page->list.next->prev != &page->list); 306 page->list.next->prev != &page->list))
269 RB_WARN_ON_RET(cpu_buffer, 307 return -1;
270 page->list.prev->next != &page->list); 308 if (RB_WARN_ON(cpu_buffer,
309 page->list.prev->next != &page->list))
310 return -1;
271 } 311 }
272 312
273 return 0; 313 return 0;
@@ -324,7 +364,8 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
324 364
325 cpu_buffer->cpu = cpu; 365 cpu_buffer->cpu = cpu;
326 cpu_buffer->buffer = buffer; 366 cpu_buffer->buffer = buffer;
327 spin_lock_init(&cpu_buffer->lock); 367 spin_lock_init(&cpu_buffer->reader_lock);
368 cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
328 INIT_LIST_HEAD(&cpu_buffer->pages); 369 INIT_LIST_HEAD(&cpu_buffer->pages);
329 370
330 page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()), 371 page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()),
@@ -473,13 +514,15 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
473 synchronize_sched(); 514 synchronize_sched();
474 515
475 for (i = 0; i < nr_pages; i++) { 516 for (i = 0; i < nr_pages; i++) {
476 BUG_ON(list_empty(&cpu_buffer->pages)); 517 if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages)))
518 return;
477 p = cpu_buffer->pages.next; 519 p = cpu_buffer->pages.next;
478 page = list_entry(p, struct buffer_page, list); 520 page = list_entry(p, struct buffer_page, list);
479 list_del_init(&page->list); 521 list_del_init(&page->list);
480 free_buffer_page(page); 522 free_buffer_page(page);
481 } 523 }
482 BUG_ON(list_empty(&cpu_buffer->pages)); 524 if (RB_WARN_ON(cpu_buffer, list_empty(&cpu_buffer->pages)))
525 return;
483 526
484 rb_reset_cpu(cpu_buffer); 527 rb_reset_cpu(cpu_buffer);
485 528
@@ -501,7 +544,8 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
501 synchronize_sched(); 544 synchronize_sched();
502 545
503 for (i = 0; i < nr_pages; i++) { 546 for (i = 0; i < nr_pages; i++) {
504 BUG_ON(list_empty(pages)); 547 if (RB_WARN_ON(cpu_buffer, list_empty(pages)))
548 return;
505 p = pages->next; 549 p = pages->next;
506 page = list_entry(p, struct buffer_page, list); 550 page = list_entry(p, struct buffer_page, list);
507 list_del_init(&page->list); 551 list_del_init(&page->list);
@@ -562,7 +606,10 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
562 if (size < buffer_size) { 606 if (size < buffer_size) {
563 607
564 /* easy case, just free pages */ 608 /* easy case, just free pages */
565 BUG_ON(nr_pages >= buffer->pages); 609 if (RB_WARN_ON(buffer, nr_pages >= buffer->pages)) {
610 mutex_unlock(&buffer->mutex);
611 return -1;
612 }
566 613
567 rm_pages = buffer->pages - nr_pages; 614 rm_pages = buffer->pages - nr_pages;
568 615
@@ -581,7 +628,11 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
581 * add these pages to the cpu_buffers. Otherwise we just free 628 * add these pages to the cpu_buffers. Otherwise we just free
582 * them all and return -ENOMEM; 629 * them all and return -ENOMEM;
583 */ 630 */
584 BUG_ON(nr_pages <= buffer->pages); 631 if (RB_WARN_ON(buffer, nr_pages <= buffer->pages)) {
632 mutex_unlock(&buffer->mutex);
633 return -1;
634 }
635
585 new_pages = nr_pages - buffer->pages; 636 new_pages = nr_pages - buffer->pages;
586 637
587 for_each_buffer_cpu(buffer, cpu) { 638 for_each_buffer_cpu(buffer, cpu) {
@@ -604,7 +655,10 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
604 rb_insert_pages(cpu_buffer, &pages, new_pages); 655 rb_insert_pages(cpu_buffer, &pages, new_pages);
605 } 656 }
606 657
607 BUG_ON(!list_empty(&pages)); 658 if (RB_WARN_ON(buffer, !list_empty(&pages))) {
659 mutex_unlock(&buffer->mutex);
660 return -1;
661 }
608 662
609 out: 663 out:
610 buffer->pages = nr_pages; 664 buffer->pages = nr_pages;
@@ -693,7 +747,8 @@ static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer)
693 head += rb_event_length(event)) { 747 head += rb_event_length(event)) {
694 748
695 event = __rb_page_index(cpu_buffer->head_page, head); 749 event = __rb_page_index(cpu_buffer->head_page, head);
696 BUG_ON(rb_null_event(event)); 750 if (RB_WARN_ON(cpu_buffer, rb_null_event(event)))
751 return;
697 /* Only count data entries */ 752 /* Only count data entries */
698 if (event->type != RINGBUF_TYPE_DATA) 753 if (event->type != RINGBUF_TYPE_DATA)
699 continue; 754 continue;
@@ -746,8 +801,9 @@ rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer,
746 addr &= PAGE_MASK; 801 addr &= PAGE_MASK;
747 802
748 while (cpu_buffer->commit_page->page != (void *)addr) { 803 while (cpu_buffer->commit_page->page != (void *)addr) {
749 RB_WARN_ON(cpu_buffer, 804 if (RB_WARN_ON(cpu_buffer,
750 cpu_buffer->commit_page == cpu_buffer->tail_page); 805 cpu_buffer->commit_page == cpu_buffer->tail_page))
806 return;
751 cpu_buffer->commit_page->commit = 807 cpu_buffer->commit_page->commit =
752 cpu_buffer->commit_page->write; 808 cpu_buffer->commit_page->write;
753 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); 809 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
@@ -894,7 +950,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
894 if (write > BUF_PAGE_SIZE) { 950 if (write > BUF_PAGE_SIZE) {
895 struct buffer_page *next_page = tail_page; 951 struct buffer_page *next_page = tail_page;
896 952
897 spin_lock_irqsave(&cpu_buffer->lock, flags); 953 local_irq_save(flags);
954 __raw_spin_lock(&cpu_buffer->lock);
898 955
899 rb_inc_page(cpu_buffer, &next_page); 956 rb_inc_page(cpu_buffer, &next_page);
900 957
@@ -902,7 +959,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
902 reader_page = cpu_buffer->reader_page; 959 reader_page = cpu_buffer->reader_page;
903 960
904 /* we grabbed the lock before incrementing */ 961 /* we grabbed the lock before incrementing */
905 RB_WARN_ON(cpu_buffer, next_page == reader_page); 962 if (RB_WARN_ON(cpu_buffer, next_page == reader_page))
963 goto out_unlock;
906 964
907 /* 965 /*
908 * If for some reason, we had an interrupt storm that made 966 * If for some reason, we had an interrupt storm that made
@@ -970,7 +1028,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
970 rb_set_commit_to_write(cpu_buffer); 1028 rb_set_commit_to_write(cpu_buffer);
971 } 1029 }
972 1030
973 spin_unlock_irqrestore(&cpu_buffer->lock, flags); 1031 __raw_spin_unlock(&cpu_buffer->lock);
1032 local_irq_restore(flags);
974 1033
975 /* fail and let the caller try again */ 1034 /* fail and let the caller try again */
976 return ERR_PTR(-EAGAIN); 1035 return ERR_PTR(-EAGAIN);
@@ -978,7 +1037,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
978 1037
979 /* We reserved something on the buffer */ 1038 /* We reserved something on the buffer */
980 1039
981 BUG_ON(write > BUF_PAGE_SIZE); 1040 if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE))
1041 return NULL;
982 1042
983 event = __rb_page_index(tail_page, tail); 1043 event = __rb_page_index(tail_page, tail);
984 rb_update_event(event, type, length); 1044 rb_update_event(event, type, length);
@@ -993,7 +1053,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
993 return event; 1053 return event;
994 1054
995 out_unlock: 1055 out_unlock:
996 spin_unlock_irqrestore(&cpu_buffer->lock, flags); 1056 __raw_spin_unlock(&cpu_buffer->lock);
1057 local_irq_restore(flags);
997 return NULL; 1058 return NULL;
998} 1059}
999 1060
@@ -1076,10 +1137,8 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1076 * storm or we have something buggy. 1137 * storm or we have something buggy.
1077 * Bail! 1138 * Bail!
1078 */ 1139 */
1079 if (unlikely(++nr_loops > 1000)) { 1140 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
1080 RB_WARN_ON(cpu_buffer, 1);
1081 return NULL; 1141 return NULL;
1082 }
1083 1142
1084 ts = ring_buffer_time_stamp(cpu_buffer->cpu); 1143 ts = ring_buffer_time_stamp(cpu_buffer->cpu);
1085 1144
@@ -1175,15 +1234,14 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer,
1175 struct ring_buffer_event *event; 1234 struct ring_buffer_event *event;
1176 int cpu, resched; 1235 int cpu, resched;
1177 1236
1178 if (ring_buffers_off) 1237 if (ring_buffer_flags != RB_BUFFERS_ON)
1179 return NULL; 1238 return NULL;
1180 1239
1181 if (atomic_read(&buffer->record_disabled)) 1240 if (atomic_read(&buffer->record_disabled))
1182 return NULL; 1241 return NULL;
1183 1242
1184 /* If we are tracing schedule, we don't want to recurse */ 1243 /* If we are tracing schedule, we don't want to recurse */
1185 resched = need_resched(); 1244 resched = ftrace_preempt_disable();
1186 preempt_disable_notrace();
1187 1245
1188 cpu = raw_smp_processor_id(); 1246 cpu = raw_smp_processor_id();
1189 1247
@@ -1214,10 +1272,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer,
1214 return event; 1272 return event;
1215 1273
1216 out: 1274 out:
1217 if (resched) 1275 ftrace_preempt_enable(resched);
1218 preempt_enable_no_resched_notrace();
1219 else
1220 preempt_enable_notrace();
1221 return NULL; 1276 return NULL;
1222} 1277}
1223 1278
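
ring_buffer_lock_reserve() and ring_buffer_write() now use the ftrace_preempt_disable()/ftrace_preempt_enable() pair instead of open-coding the need_resched() dance. The helpers live in kernel/trace/trace.h in this series; a sketch reconstructed from the sequence they replace above:

	static inline int ftrace_preempt_disable(void)
	{
		int resched = need_resched();

		preempt_disable_notrace();
		return resched;
	}

	static inline void ftrace_preempt_enable(int resched)
	{
		if (resched)
			preempt_enable_no_resched_notrace();
		else
			preempt_enable_notrace();
	}
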
@@ -1259,12 +1314,9 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
1259 /* 1314 /*
1260 * Only the last preempt count needs to restore preemption. 1315 * Only the last preempt count needs to restore preemption.
1261 */ 1316 */
1262 if (preempt_count() == 1) { 1317 if (preempt_count() == 1)
1263 if (per_cpu(rb_need_resched, cpu)) 1318 ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
1264 preempt_enable_no_resched_notrace(); 1319 else
1265 else
1266 preempt_enable_notrace();
1267 } else
1268 preempt_enable_no_resched_notrace(); 1320 preempt_enable_no_resched_notrace();
1269 1321
1270 return 0; 1322 return 0;
@@ -1294,14 +1346,13 @@ int ring_buffer_write(struct ring_buffer *buffer,
1294 int ret = -EBUSY; 1346 int ret = -EBUSY;
1295 int cpu, resched; 1347 int cpu, resched;
1296 1348
1297 if (ring_buffers_off) 1349 if (ring_buffer_flags != RB_BUFFERS_ON)
1298 return -EBUSY; 1350 return -EBUSY;
1299 1351
1300 if (atomic_read(&buffer->record_disabled)) 1352 if (atomic_read(&buffer->record_disabled))
1301 return -EBUSY; 1353 return -EBUSY;
1302 1354
1303 resched = need_resched(); 1355 resched = ftrace_preempt_disable();
1304 preempt_disable_notrace();
1305 1356
1306 cpu = raw_smp_processor_id(); 1357 cpu = raw_smp_processor_id();
1307 1358
@@ -1327,10 +1378,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
1327 1378
1328 ret = 0; 1379 ret = 0;
1329 out: 1380 out:
1330 if (resched) 1381 ftrace_preempt_enable(resched);
1331 preempt_enable_no_resched_notrace();
1332 else
1333 preempt_enable_notrace();
1334 1382
1335 return ret; 1383 return ret;
1336} 1384}
@@ -1489,14 +1537,7 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
1489 return overruns; 1537 return overruns;
1490} 1538}
1491 1539
1492/** 1540static void rb_iter_reset(struct ring_buffer_iter *iter)
1493 * ring_buffer_iter_reset - reset an iterator
1494 * @iter: The iterator to reset
1495 *
1496 * Resets the iterator, so that it will start from the beginning
1497 * again.
1498 */
1499void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
1500{ 1541{
1501 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; 1542 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
1502 1543
@@ -1515,6 +1556,23 @@ void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
1515} 1556}
1516 1557
1517/** 1558/**
1559 * ring_buffer_iter_reset - reset an iterator
1560 * @iter: The iterator to reset
1561 *
1562 * Resets the iterator, so that it will start from the beginning
1563 * again.
1564 */
1565void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
1566{
1567 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
1568 unsigned long flags;
1569
1570 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
1571 rb_iter_reset(iter);
1572 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
1573}
1574
1575/**
1518 * ring_buffer_iter_empty - check if an iterator has no more to read 1576 * ring_buffer_iter_empty - check if an iterator has no more to read
1519 * @iter: The iterator to check 1577 * @iter: The iterator to check
1520 */ 1578 */
@@ -1597,7 +1655,8 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
1597 unsigned long flags; 1655 unsigned long flags;
1598 int nr_loops = 0; 1656 int nr_loops = 0;
1599 1657
1600 spin_lock_irqsave(&cpu_buffer->lock, flags); 1658 local_irq_save(flags);
1659 __raw_spin_lock(&cpu_buffer->lock);
1601 1660
1602 again: 1661 again:
1603 /* 1662 /*
@@ -1606,8 +1665,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
1606 * a case where we will loop three times. There should be no 1665 * a case where we will loop three times. There should be no
1607 * reason to loop four times (that I know of). 1666 * reason to loop four times (that I know of).
1608 */ 1667 */
1609 if (unlikely(++nr_loops > 3)) { 1668 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3)) {
1610 RB_WARN_ON(cpu_buffer, 1);
1611 reader = NULL; 1669 reader = NULL;
1612 goto out; 1670 goto out;
1613 } 1671 }
@@ -1619,8 +1677,9 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
1619 goto out; 1677 goto out;
1620 1678
1621 /* Never should we have an index greater than the size */ 1679 /* Never should we have an index greater than the size */
1622 RB_WARN_ON(cpu_buffer, 1680 if (RB_WARN_ON(cpu_buffer,
1623 cpu_buffer->reader_page->read > rb_page_size(reader)); 1681 cpu_buffer->reader_page->read > rb_page_size(reader)))
1682 goto out;
1624 1683
1625 /* check if we caught up to the tail */ 1684 /* check if we caught up to the tail */
1626 reader = NULL; 1685 reader = NULL;
@@ -1659,7 +1718,8 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
1659 goto again; 1718 goto again;
1660 1719
1661 out: 1720 out:
1662 spin_unlock_irqrestore(&cpu_buffer->lock, flags); 1721 __raw_spin_unlock(&cpu_buffer->lock);
1722 local_irq_restore(flags);
1663 1723
1664 return reader; 1724 return reader;
1665} 1725}
@@ -1673,7 +1733,8 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
1673 reader = rb_get_reader_page(cpu_buffer); 1733 reader = rb_get_reader_page(cpu_buffer);
1674 1734
1675 /* This function should not be called when buffer is empty */ 1735 /* This function should not be called when buffer is empty */
1676 BUG_ON(!reader); 1736 if (RB_WARN_ON(cpu_buffer, !reader))
1737 return;
1677 1738
1678 event = rb_reader_event(cpu_buffer); 1739 event = rb_reader_event(cpu_buffer);
1679 1740
@@ -1700,7 +1761,9 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
1700 * Check if we are at the end of the buffer. 1761 * Check if we are at the end of the buffer.
1701 */ 1762 */
1702 if (iter->head >= rb_page_size(iter->head_page)) { 1763 if (iter->head >= rb_page_size(iter->head_page)) {
1703 BUG_ON(iter->head_page == cpu_buffer->commit_page); 1764 if (RB_WARN_ON(buffer,
1765 iter->head_page == cpu_buffer->commit_page))
1766 return;
1704 rb_inc_iter(iter); 1767 rb_inc_iter(iter);
1705 return; 1768 return;
1706 } 1769 }
@@ -1713,8 +1776,10 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
1713 * This should not be called to advance the header if we are 1776 * This should not be called to advance the header if we are
1714 * at the tail of the buffer. 1777 * at the tail of the buffer.
1715 */ 1778 */
1716 BUG_ON((iter->head_page == cpu_buffer->commit_page) && 1779 if (RB_WARN_ON(cpu_buffer,
1717 (iter->head + length > rb_commit_index(cpu_buffer))); 1780 (iter->head_page == cpu_buffer->commit_page) &&
1781 (iter->head + length > rb_commit_index(cpu_buffer))))
1782 return;
1718 1783
1719 rb_update_iter_read_stamp(iter, event); 1784 rb_update_iter_read_stamp(iter, event);
1720 1785
@@ -1726,17 +1791,8 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
1726 rb_advance_iter(iter); 1791 rb_advance_iter(iter);
1727} 1792}
1728 1793
1729/** 1794static struct ring_buffer_event *
1730 * ring_buffer_peek - peek at the next event to be read 1795rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1731 * @buffer: The ring buffer to read
1732 * @cpu: The cpu to peak at
1733 * @ts: The timestamp counter of this event.
1734 *
1735 * This will return the event that will be read next, but does
1736 * not consume the data.
1737 */
1738struct ring_buffer_event *
1739ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1740{ 1796{
1741 struct ring_buffer_per_cpu *cpu_buffer; 1797 struct ring_buffer_per_cpu *cpu_buffer;
1742 struct ring_buffer_event *event; 1798 struct ring_buffer_event *event;
@@ -1757,10 +1813,8 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1757 * can have. Nesting 10 deep of interrupts is clearly 1813 * can have. Nesting 10 deep of interrupts is clearly
1758 * an anomaly. 1814 * an anomaly.
1759 */ 1815 */
1760 if (unlikely(++nr_loops > 10)) { 1816 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10))
1761 RB_WARN_ON(cpu_buffer, 1);
1762 return NULL; 1817 return NULL;
1763 }
1764 1818
1765 reader = rb_get_reader_page(cpu_buffer); 1819 reader = rb_get_reader_page(cpu_buffer);
1766 if (!reader) 1820 if (!reader)
@@ -1798,16 +1852,8 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1798 return NULL; 1852 return NULL;
1799} 1853}
1800 1854
1801/** 1855static struct ring_buffer_event *
1802 * ring_buffer_iter_peek - peek at the next event to be read 1856rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
1803 * @iter: The ring buffer iterator
1804 * @ts: The timestamp counter of this event.
1805 *
1806 * This will return the event that will be read next, but does
1807 * not increment the iterator.
1808 */
1809struct ring_buffer_event *
1810ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
1811{ 1857{
1812 struct ring_buffer *buffer; 1858 struct ring_buffer *buffer;
1813 struct ring_buffer_per_cpu *cpu_buffer; 1859 struct ring_buffer_per_cpu *cpu_buffer;
@@ -1829,10 +1875,8 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
1829 * can have. Nesting 10 deep of interrupts is clearly 1875 * can have. Nesting 10 deep of interrupts is clearly
1830 * an anomaly. 1876 * an anomaly.
1831 */ 1877 */
1832 if (unlikely(++nr_loops > 10)) { 1878 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10))
1833 RB_WARN_ON(cpu_buffer, 1);
1834 return NULL; 1879 return NULL;
1835 }
1836 1880
1837 if (rb_per_cpu_empty(cpu_buffer)) 1881 if (rb_per_cpu_empty(cpu_buffer))
1838 return NULL; 1882 return NULL;
@@ -1869,6 +1913,51 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
1869} 1913}
1870 1914
1871/** 1915/**
1916 * ring_buffer_peek - peek at the next event to be read
1917 * @buffer: The ring buffer to read
1918 * @cpu: The cpu to peek at
1919 * @ts: The timestamp counter of this event.
1920 *
1921 * This will return the event that will be read next, but does
1922 * not consume the data.
1923 */
1924struct ring_buffer_event *
1925ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1926{
1927 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
1928 struct ring_buffer_event *event;
1929 unsigned long flags;
1930
1931 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
1932 event = rb_buffer_peek(buffer, cpu, ts);
1933 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
1934
1935 return event;
1936}
1937
1938/**
1939 * ring_buffer_iter_peek - peek at the next event to be read
1940 * @iter: The ring buffer iterator
1941 * @ts: The timestamp counter of this event.
1942 *
1943 * This will return the event that will be read next, but does
1944 * not increment the iterator.
1945 */
1946struct ring_buffer_event *
1947ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
1948{
1949 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
1950 struct ring_buffer_event *event;
1951 unsigned long flags;
1952
1953 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
1954 event = rb_iter_peek(iter, ts);
1955 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
1956
1957 return event;
1958}
1959
1960/**
1872 * ring_buffer_consume - return an event and consume it 1961 * ring_buffer_consume - return an event and consume it
1873 * @buffer: The ring buffer to get the next event from 1962 * @buffer: The ring buffer to get the next event from
1874 * 1963 *
@@ -1879,19 +1968,24 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
1879struct ring_buffer_event * 1968struct ring_buffer_event *
1880ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) 1969ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
1881{ 1970{
1882 struct ring_buffer_per_cpu *cpu_buffer; 1971 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
1883 struct ring_buffer_event *event; 1972 struct ring_buffer_event *event;
1973 unsigned long flags;
1884 1974
1885 if (!cpu_isset(cpu, buffer->cpumask)) 1975 if (!cpu_isset(cpu, buffer->cpumask))
1886 return NULL; 1976 return NULL;
1887 1977
1888 event = ring_buffer_peek(buffer, cpu, ts); 1978 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
1979
1980 event = rb_buffer_peek(buffer, cpu, ts);
1889 if (!event) 1981 if (!event)
1890 return NULL; 1982 goto out;
1891 1983
1892 cpu_buffer = buffer->buffers[cpu];
1893 rb_advance_reader(cpu_buffer); 1984 rb_advance_reader(cpu_buffer);
1894 1985
1986 out:
1987 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
1988
1895 return event; 1989 return event;
1896} 1990}
1897 1991
@@ -1928,9 +2022,11 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
1928 atomic_inc(&cpu_buffer->record_disabled); 2022 atomic_inc(&cpu_buffer->record_disabled);
1929 synchronize_sched(); 2023 synchronize_sched();
1930 2024
1931 spin_lock_irqsave(&cpu_buffer->lock, flags); 2025 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
1932 ring_buffer_iter_reset(iter); 2026 __raw_spin_lock(&cpu_buffer->lock);
1933 spin_unlock_irqrestore(&cpu_buffer->lock, flags); 2027 rb_iter_reset(iter);
2028 __raw_spin_unlock(&cpu_buffer->lock);
2029 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
1934 2030
1935 return iter; 2031 return iter;
1936} 2032}
@@ -1962,12 +2058,17 @@ struct ring_buffer_event *
1962ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) 2058ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
1963{ 2059{
1964 struct ring_buffer_event *event; 2060 struct ring_buffer_event *event;
2061 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
2062 unsigned long flags;
1965 2063
1966 event = ring_buffer_iter_peek(iter, ts); 2064 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2065 event = rb_iter_peek(iter, ts);
1967 if (!event) 2066 if (!event)
1968 return NULL; 2067 goto out;
1969 2068
1970 rb_advance_iter(iter); 2069 rb_advance_iter(iter);
2070 out:
2071 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
1971 2072
1972 return event; 2073 return event;
1973} 2074}
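
A companion sketch for the non-consuming iterator path touched above; it reuses the my_handle_event() stub from the previous sketch and assumes the matching ring_buffer_read_finish() teardown declared in linux/ring_buffer.h:

/* Walk one CPU's buffer without consuming it (iterator path). */
static void my_walk_cpu(struct ring_buffer *buffer, int cpu)
{
        struct ring_buffer_iter *iter;
        struct ring_buffer_event *event;
        u64 ts;

        /* Disables recording on that cpu_buffer and resets the iterator. */
        iter = ring_buffer_read_start(buffer, cpu);
        if (!iter)
                return;

        /* ring_buffer_read() returns the current event and advances. */
        while ((event = ring_buffer_read(iter, &ts)))
                my_handle_event(ring_buffer_event_data(event), ts);

        /* Assumed counterpart to read_start(): frees the iterator and re-enables recording. */
        ring_buffer_read_finish(iter);
}
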
@@ -2016,11 +2117,15 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
2016 if (!cpu_isset(cpu, buffer->cpumask)) 2117 if (!cpu_isset(cpu, buffer->cpumask))
2017 return; 2118 return;
2018 2119
2019 spin_lock_irqsave(&cpu_buffer->lock, flags); 2120 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2121
2122 __raw_spin_lock(&cpu_buffer->lock);
2020 2123
2021 rb_reset_cpu(cpu_buffer); 2124 rb_reset_cpu(cpu_buffer);
2022 2125
2023 spin_unlock_irqrestore(&cpu_buffer->lock, flags); 2126 __raw_spin_unlock(&cpu_buffer->lock);
2127
2128 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2024} 2129}
2025 2130
2026/** 2131/**
@@ -2122,12 +2227,14 @@ static ssize_t
2122rb_simple_read(struct file *filp, char __user *ubuf, 2227rb_simple_read(struct file *filp, char __user *ubuf,
2123 size_t cnt, loff_t *ppos) 2228 size_t cnt, loff_t *ppos)
2124{ 2229{
2125 int *p = filp->private_data; 2230 long *p = filp->private_data;
2126 char buf[64]; 2231 char buf[64];
2127 int r; 2232 int r;
2128 2233
2129 /* !ring_buffers_off == tracing_on */ 2234 if (test_bit(RB_BUFFERS_DISABLED_BIT, p))
2130 r = sprintf(buf, "%d\n", !*p); 2235 r = sprintf(buf, "permanently disabled\n");
2236 else
2237 r = sprintf(buf, "%d\n", test_bit(RB_BUFFERS_ON_BIT, p));
2131 2238
2132 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 2239 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2133} 2240}
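
The tracing_on control read above is now backed by a long used as a bit field: RB_BUFFERS_ON_BIT is the normal on/off toggle and RB_BUFFERS_DISABLED_BIT marks the permanent shutdown taken by tracing_off_permanent(). An illustrative sketch of that scheme; the authoritative bit definitions live earlier in ring_buffer.c, so the positions below are assumptions:

#include <linux/bitops.h>

/* Assumed bit positions, mirroring RB_BUFFERS_ON_BIT / RB_BUFFERS_DISABLED_BIT. */
#define MY_RB_BUFFERS_ON_BIT            0
#define MY_RB_BUFFERS_DISABLED_BIT      1

static unsigned long my_ring_buffer_flags = 1UL << MY_RB_BUFFERS_ON_BIT;

static int my_buffers_enabled(void)
{
        /* A permanent disable overrides the normal on/off toggle. */
        if (test_bit(MY_RB_BUFFERS_DISABLED_BIT, &my_ring_buffer_flags))
                return 0;
        return test_bit(MY_RB_BUFFERS_ON_BIT, &my_ring_buffer_flags);
}
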
@@ -2136,7 +2243,7 @@ static ssize_t
2136rb_simple_write(struct file *filp, const char __user *ubuf, 2243rb_simple_write(struct file *filp, const char __user *ubuf,
2137 size_t cnt, loff_t *ppos) 2244 size_t cnt, loff_t *ppos)
2138{ 2245{
2139 int *p = filp->private_data; 2246 long *p = filp->private_data;
2140 char buf[64]; 2247 char buf[64];
2141 long val; 2248 long val;
2142 int ret; 2249 int ret;
@@ -2153,8 +2260,10 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
2153 if (ret < 0) 2260 if (ret < 0)
2154 return ret; 2261 return ret;
2155 2262
2156 /* !ring_buffers_off == tracing_on */ 2263 if (val)
2157 *p = !val; 2264 set_bit(RB_BUFFERS_ON_BIT, p);
2265 else
2266 clear_bit(RB_BUFFERS_ON_BIT, p);
2158 2267
2159 (*ppos)++; 2268 (*ppos)++;
2160 2269
@@ -2176,7 +2285,7 @@ static __init int rb_init_debugfs(void)
2176 d_tracer = tracing_init_dentry(); 2285 d_tracer = tracing_init_dentry();
2177 2286
2178 entry = debugfs_create_file("tracing_on", 0644, d_tracer, 2287 entry = debugfs_create_file("tracing_on", 0644, d_tracer,
2179 &ring_buffers_off, &rb_simple_fops); 2288 &ring_buffer_flags, &rb_simple_fops);
2180 if (!entry) 2289 if (!entry)
2181 pr_warning("Could not create debugfs 'tracing_on' entry\n"); 2290 pr_warning("Could not create debugfs 'tracing_on' entry\n");
2182 2291
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d86e3252f300..a45b59e53fbc 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -30,6 +30,7 @@
30#include <linux/gfp.h> 30#include <linux/gfp.h>
31#include <linux/fs.h> 31#include <linux/fs.h>
32#include <linux/kprobes.h> 32#include <linux/kprobes.h>
33#include <linux/seq_file.h>
33#include <linux/writeback.h> 34#include <linux/writeback.h>
34 35
35#include <linux/stacktrace.h> 36#include <linux/stacktrace.h>
@@ -43,6 +44,29 @@
43unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX; 44unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX;
44unsigned long __read_mostly tracing_thresh; 45unsigned long __read_mostly tracing_thresh;
45 46
47/* For tracers that don't implement custom flags */
48static struct tracer_opt dummy_tracer_opt[] = {
49 { }
50};
51
52static struct tracer_flags dummy_tracer_flags = {
53 .val = 0,
54 .opts = dummy_tracer_opt
55};
56
57static int dummy_set_flag(u32 old_flags, u32 bit, int set)
58{
59 return 0;
60}
61
62/*
63 * Kill all tracing for good (never come back).
 64 * It is initialized to 1 and is set back to zero when the
 65 * initialization of the tracer succeeds. That is the only place
 66 * that ever clears it.
67 */
68int tracing_disabled = 1;
69
46static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled); 70static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled);
47 71
48static inline void ftrace_disable_cpu(void) 72static inline void ftrace_disable_cpu(void)
@@ -62,7 +86,36 @@ static cpumask_t __read_mostly tracing_buffer_mask;
62#define for_each_tracing_cpu(cpu) \ 86#define for_each_tracing_cpu(cpu) \
63 for_each_cpu_mask(cpu, tracing_buffer_mask) 87 for_each_cpu_mask(cpu, tracing_buffer_mask)
64 88
65static int tracing_disabled = 1; 89/*
90 * ftrace_dump_on_oops - variable to dump ftrace buffer on oops
91 *
92 * If there is an oops (or kernel panic) and the ftrace_dump_on_oops
93 * is set, then ftrace_dump is called. This will output the contents
94 * of the ftrace buffers to the console. This is very useful for
 95 * capturing traces that lead to crashes and outputting them to a
96 * serial console.
97 *
 98 * It is off by default, but you can enable it either by specifying
 99 * "ftrace_dump_on_oops" on the kernel command line, or by setting
100 * /proc/sys/kernel/ftrace_dump_on_oops to true.
101 */
102int ftrace_dump_on_oops;
103
104static int tracing_set_tracer(char *buf);
105
106static int __init set_ftrace(char *str)
107{
108 tracing_set_tracer(str);
109 return 1;
110}
111__setup("ftrace", set_ftrace);
112
113static int __init set_ftrace_dump_on_oops(char *str)
114{
115 ftrace_dump_on_oops = 1;
116 return 1;
117}
118__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
66 119
67long 120long
68ns2usecs(cycle_t nsec) 121ns2usecs(cycle_t nsec)
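
The ftrace_dump_on_oops flag documented above only gates whether ftrace_dump() runs from the crash notifiers (the real panic and die handlers appear further down in this patch). A minimal sketch of that gating pattern, assuming the extern declarations from linux/ftrace.h; my_panic_notifier is an invented name:

#include <linux/notifier.h>
#include <linux/ftrace.h>

/* Dump the ftrace buffers to the console only when the user asked for it. */
static int my_panic_notifier(struct notifier_block *this,
                             unsigned long event, void *unused)
{
        if (ftrace_dump_on_oops)
                ftrace_dump();
        return NOTIFY_OK;
}
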
@@ -112,6 +165,19 @@ static DEFINE_PER_CPU(struct trace_array_cpu, max_data);
112/* tracer_enabled is used to toggle activation of a tracer */ 165/* tracer_enabled is used to toggle activation of a tracer */
113static int tracer_enabled = 1; 166static int tracer_enabled = 1;
114 167
168/**
169 * tracing_is_enabled - return tracer_enabled status
170 *
171 * This function is used by other tracers to know the status
172 * of the tracer_enabled flag. Tracers may use this function
 173 * to decide whether they should enable their features when
 174 * starting up. See the irqsoff tracer for an example (start_irqsoff_tracer).
175 */
176int tracing_is_enabled(void)
177{
178 return tracer_enabled;
179}
180
115/* function tracing enabled */ 181/* function tracing enabled */
116int ftrace_function_enabled; 182int ftrace_function_enabled;
117 183
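
A sketch of the pattern tracing_is_enabled() above is meant for: a tracer only arms its own machinery when the global tracing_enabled toggle is set, as the irqsoff tracer mentioned in the comment does. The helpers below are invented placeholders, and init returns int per the updated struct tracer later in this patch:

static void my_tracer_reset(struct trace_array *tr)
{
        /* illustrative: clear per-cpu state, reset buffers, etc. */
}

static void my_tracer_arm(struct trace_array *tr)
{
        /* illustrative: register callbacks, start timing, etc. */
}

static int my_tracer_init(struct trace_array *tr)
{
        my_tracer_reset(tr);

        /* Only start tracing if the global tracing_enabled toggle is on. */
        if (tracing_is_enabled())
                my_tracer_arm(tr);

        return 0;
}
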
@@ -153,8 +219,9 @@ static DEFINE_MUTEX(trace_types_lock);
153/* trace_wait is a waitqueue for tasks blocked on trace_poll */ 219/* trace_wait is a waitqueue for tasks blocked on trace_poll */
154static DECLARE_WAIT_QUEUE_HEAD(trace_wait); 220static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
155 221
156/* trace_flags holds iter_ctrl options */ 222/* trace_flags holds trace_options default values */
157unsigned long trace_flags = TRACE_ITER_PRINT_PARENT; 223unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
224 TRACE_ITER_ANNOTATE;
158 225
159/** 226/**
160 * trace_wake_up - wake up tasks waiting for trace input 227 * trace_wake_up - wake up tasks waiting for trace input
@@ -193,13 +260,6 @@ unsigned long nsecs_to_usecs(unsigned long nsecs)
193 return nsecs / 1000; 260 return nsecs / 1000;
194} 261}
195 262
196/*
197 * TRACE_ITER_SYM_MASK masks the options in trace_flags that
198 * control the output of kernel symbols.
199 */
200#define TRACE_ITER_SYM_MASK \
201 (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR)
202
 203/* These must match the bit positions in trace_iterator_flags */ 263/* These must match the bit positions in trace_iterator_flags */
204static const char *trace_options[] = { 264static const char *trace_options[] = {
205 "print-parent", 265 "print-parent",
@@ -213,6 +273,11 @@ static const char *trace_options[] = {
213 "stacktrace", 273 "stacktrace",
214 "sched-tree", 274 "sched-tree",
215 "ftrace_printk", 275 "ftrace_printk",
276 "ftrace_preempt",
277 "branch",
278 "annotate",
279 "userstacktrace",
280 "sym-userobj",
216 NULL 281 NULL
217}; 282};
218 283
@@ -359,6 +424,28 @@ trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len)
359 return trace_seq_putmem(s, hex, j); 424 return trace_seq_putmem(s, hex, j);
360} 425}
361 426
427static int
428trace_seq_path(struct trace_seq *s, struct path *path)
429{
430 unsigned char *p;
431
432 if (s->len >= (PAGE_SIZE - 1))
433 return 0;
434 p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
435 if (!IS_ERR(p)) {
436 p = mangle_path(s->buffer + s->len, p, "\n");
437 if (p) {
438 s->len = p - s->buffer;
439 return 1;
440 }
441 } else {
442 s->buffer[s->len++] = '?';
443 return 1;
444 }
445
446 return 0;
447}
448
362static void 449static void
363trace_seq_reset(struct trace_seq *s) 450trace_seq_reset(struct trace_seq *s)
364{ 451{
@@ -470,7 +557,15 @@ int register_tracer(struct tracer *type)
470 return -1; 557 return -1;
471 } 558 }
472 559
560 /*
561 * When this gets called we hold the BKL which means that
 562 * preemption is disabled. Various trace selftests, however,
 563 * need to disable and enable preemption to run successfully.
 564 * So we drop the BKL here and grab it again after the tests.
565 */
566 unlock_kernel();
473 mutex_lock(&trace_types_lock); 567 mutex_lock(&trace_types_lock);
568
474 for (t = trace_types; t; t = t->next) { 569 for (t = trace_types; t; t = t->next) {
475 if (strcmp(type->name, t->name) == 0) { 570 if (strcmp(type->name, t->name) == 0) {
476 /* already found */ 571 /* already found */
@@ -481,11 +576,18 @@ int register_tracer(struct tracer *type)
481 } 576 }
482 } 577 }
483 578
579 if (!type->set_flag)
580 type->set_flag = &dummy_set_flag;
581 if (!type->flags)
582 type->flags = &dummy_tracer_flags;
583 else
584 if (!type->flags->opts)
585 type->flags->opts = dummy_tracer_opt;
586
484#ifdef CONFIG_FTRACE_STARTUP_TEST 587#ifdef CONFIG_FTRACE_STARTUP_TEST
485 if (type->selftest) { 588 if (type->selftest) {
486 struct tracer *saved_tracer = current_trace; 589 struct tracer *saved_tracer = current_trace;
487 struct trace_array *tr = &global_trace; 590 struct trace_array *tr = &global_trace;
488 int saved_ctrl = tr->ctrl;
489 int i; 591 int i;
490 /* 592 /*
491 * Run a selftest on this tracer. 593 * Run a selftest on this tracer.
@@ -494,25 +596,23 @@ int register_tracer(struct tracer *type)
494 * internal tracing to verify that everything is in order. 596 * internal tracing to verify that everything is in order.
495 * If we fail, we do not register this tracer. 597 * If we fail, we do not register this tracer.
496 */ 598 */
497 for_each_tracing_cpu(i) { 599 for_each_tracing_cpu(i)
498 tracing_reset(tr, i); 600 tracing_reset(tr, i);
499 } 601
500 current_trace = type; 602 current_trace = type;
501 tr->ctrl = 0;
502 /* the test is responsible for initializing and enabling */ 603 /* the test is responsible for initializing and enabling */
503 pr_info("Testing tracer %s: ", type->name); 604 pr_info("Testing tracer %s: ", type->name);
504 ret = type->selftest(type, tr); 605 ret = type->selftest(type, tr);
505 /* the test is responsible for resetting too */ 606 /* the test is responsible for resetting too */
506 current_trace = saved_tracer; 607 current_trace = saved_tracer;
507 tr->ctrl = saved_ctrl;
508 if (ret) { 608 if (ret) {
509 printk(KERN_CONT "FAILED!\n"); 609 printk(KERN_CONT "FAILED!\n");
510 goto out; 610 goto out;
511 } 611 }
512 /* Only reset on passing, to avoid touching corrupted buffers */ 612 /* Only reset on passing, to avoid touching corrupted buffers */
513 for_each_tracing_cpu(i) { 613 for_each_tracing_cpu(i)
514 tracing_reset(tr, i); 614 tracing_reset(tr, i);
515 } 615
516 printk(KERN_CONT "PASSED\n"); 616 printk(KERN_CONT "PASSED\n");
517 } 617 }
518#endif 618#endif
@@ -525,6 +625,7 @@ int register_tracer(struct tracer *type)
525 625
526 out: 626 out:
527 mutex_unlock(&trace_types_lock); 627 mutex_unlock(&trace_types_lock);
628 lock_kernel();
528 629
529 return ret; 630 return ret;
530} 631}
@@ -581,6 +682,91 @@ static void trace_init_cmdlines(void)
581 cmdline_idx = 0; 682 cmdline_idx = 0;
582} 683}
583 684
685static int trace_stop_count;
686static DEFINE_SPINLOCK(tracing_start_lock);
687
688/**
689 * ftrace_off_permanent - disable all ftrace code permanently
690 *
 691 * This should only be called when a serious anomaly has
 692 * been detected. This will turn off function tracing,
 693 * ring buffers, and other tracing utilities. It takes no
694 * locks and can be called from any context.
695 */
696void ftrace_off_permanent(void)
697{
698 tracing_disabled = 1;
699 ftrace_stop();
700 tracing_off_permanent();
701}
702
703/**
704 * tracing_start - quick start of the tracer
705 *
706 * If tracing is enabled but was stopped by tracing_stop,
707 * this will start the tracer back up.
708 */
709void tracing_start(void)
710{
711 struct ring_buffer *buffer;
712 unsigned long flags;
713
714 if (tracing_disabled)
715 return;
716
717 spin_lock_irqsave(&tracing_start_lock, flags);
718 if (--trace_stop_count)
719 goto out;
720
721 if (trace_stop_count < 0) {
722 /* Someone screwed up their debugging */
723 WARN_ON_ONCE(1);
724 trace_stop_count = 0;
725 goto out;
726 }
727
728
729 buffer = global_trace.buffer;
730 if (buffer)
731 ring_buffer_record_enable(buffer);
732
733 buffer = max_tr.buffer;
734 if (buffer)
735 ring_buffer_record_enable(buffer);
736
737 ftrace_start();
738 out:
739 spin_unlock_irqrestore(&tracing_start_lock, flags);
740}
741
742/**
743 * tracing_stop - quick stop of the tracer
744 *
 745 * Lightweight way to stop tracing. Use in conjunction with
746 * tracing_start.
747 */
748void tracing_stop(void)
749{
750 struct ring_buffer *buffer;
751 unsigned long flags;
752
753 ftrace_stop();
754 spin_lock_irqsave(&tracing_start_lock, flags);
755 if (trace_stop_count++)
756 goto out;
757
758 buffer = global_trace.buffer;
759 if (buffer)
760 ring_buffer_record_disable(buffer);
761
762 buffer = max_tr.buffer;
763 if (buffer)
764 ring_buffer_record_disable(buffer);
765
766 out:
767 spin_unlock_irqrestore(&tracing_start_lock, flags);
768}
769
584void trace_stop_cmdline_recording(void); 770void trace_stop_cmdline_recording(void);
585 771
586static void trace_save_cmdline(struct task_struct *tsk) 772static void trace_save_cmdline(struct task_struct *tsk)
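
tracing_stop()/tracing_start() above nest by keeping trace_stop_count under tracing_start_lock, so only the outermost start re-enables the ring buffers. A minimal sketch of the intended calling pattern around a section that must not be recorded; my_reconfigure_buffers() is an invented placeholder (tracing_entries_write() later in this patch uses the same bracket around a resize):

static void my_reconfigure_buffers(void)
{
        /* illustrative: e.g. resize or swap the trace buffers */
}

static void my_quiesced_reconfigure(void)
{
        /* Bumps trace_stop_count and disables buffer recording. */
        tracing_stop();

        my_reconfigure_buffers();

        /* Re-enables recording only when the count drops back to zero. */
        tracing_start();
}
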
@@ -655,6 +841,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
655 841
656 entry->preempt_count = pc & 0xff; 842 entry->preempt_count = pc & 0xff;
657 entry->pid = (tsk) ? tsk->pid : 0; 843 entry->pid = (tsk) ? tsk->pid : 0;
844 entry->tgid = (tsk) ? tsk->tgid : 0;
658 entry->flags = 845 entry->flags =
659#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT 846#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
660 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | 847 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -691,6 +878,36 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data,
691 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 878 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
692} 879}
693 880
881#ifdef CONFIG_FUNCTION_RET_TRACER
882static void __trace_function_return(struct trace_array *tr,
883 struct trace_array_cpu *data,
884 struct ftrace_retfunc *trace,
885 unsigned long flags,
886 int pc)
887{
888 struct ring_buffer_event *event;
889 struct ftrace_ret_entry *entry;
890 unsigned long irq_flags;
891
892 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
893 return;
894
895 event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry),
896 &irq_flags);
897 if (!event)
898 return;
899 entry = ring_buffer_event_data(event);
900 tracing_generic_entry_update(&entry->ent, flags, pc);
901 entry->ent.type = TRACE_FN_RET;
902 entry->ip = trace->func;
903 entry->parent_ip = trace->ret;
904 entry->rettime = trace->rettime;
905 entry->calltime = trace->calltime;
906 entry->overrun = trace->overrun;
907 ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags);
908}
909#endif
910
694void 911void
695ftrace(struct trace_array *tr, struct trace_array_cpu *data, 912ftrace(struct trace_array *tr, struct trace_array_cpu *data,
696 unsigned long ip, unsigned long parent_ip, unsigned long flags, 913 unsigned long ip, unsigned long parent_ip, unsigned long flags,
@@ -742,6 +959,44 @@ void __trace_stack(struct trace_array *tr,
742 ftrace_trace_stack(tr, data, flags, skip, preempt_count()); 959 ftrace_trace_stack(tr, data, flags, skip, preempt_count());
743} 960}
744 961
962static void ftrace_trace_userstack(struct trace_array *tr,
963 struct trace_array_cpu *data,
964 unsigned long flags, int pc)
965{
966 struct ring_buffer_event *event;
967 struct userstack_entry *entry;
968 struct stack_trace trace;
969 unsigned long irq_flags;
970
971 if (!(trace_flags & TRACE_ITER_USERSTACKTRACE))
972 return;
973
974 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
975 &irq_flags);
976 if (!event)
977 return;
978 entry = ring_buffer_event_data(event);
979 tracing_generic_entry_update(&entry->ent, flags, pc);
980 entry->ent.type = TRACE_USER_STACK;
981
982 memset(&entry->caller, 0, sizeof(entry->caller));
983
984 trace.nr_entries = 0;
985 trace.max_entries = FTRACE_STACK_ENTRIES;
986 trace.skip = 0;
987 trace.entries = entry->caller;
988
989 save_stack_trace_user(&trace);
990 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
991}
992
993void __trace_userstack(struct trace_array *tr,
994 struct trace_array_cpu *data,
995 unsigned long flags)
996{
997 ftrace_trace_userstack(tr, data, flags, preempt_count());
998}
999
745static void 1000static void
746ftrace_trace_special(void *__tr, void *__data, 1001ftrace_trace_special(void *__tr, void *__data,
747 unsigned long arg1, unsigned long arg2, unsigned long arg3, 1002 unsigned long arg1, unsigned long arg2, unsigned long arg3,
@@ -765,6 +1020,7 @@ ftrace_trace_special(void *__tr, void *__data,
765 entry->arg3 = arg3; 1020 entry->arg3 = arg3;
766 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 1021 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
767 ftrace_trace_stack(tr, data, irq_flags, 4, pc); 1022 ftrace_trace_stack(tr, data, irq_flags, 4, pc);
1023 ftrace_trace_userstack(tr, data, irq_flags, pc);
768 1024
769 trace_wake_up(); 1025 trace_wake_up();
770} 1026}
@@ -803,6 +1059,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
803 entry->next_cpu = task_cpu(next); 1059 entry->next_cpu = task_cpu(next);
804 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 1060 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
805 ftrace_trace_stack(tr, data, flags, 5, pc); 1061 ftrace_trace_stack(tr, data, flags, 5, pc);
1062 ftrace_trace_userstack(tr, data, flags, pc);
806} 1063}
807 1064
808void 1065void
@@ -832,6 +1089,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
832 entry->next_cpu = task_cpu(wakee); 1089 entry->next_cpu = task_cpu(wakee);
833 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 1090 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
834 ftrace_trace_stack(tr, data, flags, 6, pc); 1091 ftrace_trace_stack(tr, data, flags, 6, pc);
1092 ftrace_trace_userstack(tr, data, flags, pc);
835 1093
836 trace_wake_up(); 1094 trace_wake_up();
837} 1095}
@@ -841,26 +1099,28 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
841{ 1099{
842 struct trace_array *tr = &global_trace; 1100 struct trace_array *tr = &global_trace;
843 struct trace_array_cpu *data; 1101 struct trace_array_cpu *data;
1102 unsigned long flags;
844 int cpu; 1103 int cpu;
845 int pc; 1104 int pc;
846 1105
847 if (tracing_disabled || !tr->ctrl) 1106 if (tracing_disabled)
848 return; 1107 return;
849 1108
850 pc = preempt_count(); 1109 pc = preempt_count();
851 preempt_disable_notrace(); 1110 local_irq_save(flags);
852 cpu = raw_smp_processor_id(); 1111 cpu = raw_smp_processor_id();
853 data = tr->data[cpu]; 1112 data = tr->data[cpu];
854 1113
855 if (likely(!atomic_read(&data->disabled))) 1114 if (likely(atomic_inc_return(&data->disabled) == 1))
856 ftrace_trace_special(tr, data, arg1, arg2, arg3, pc); 1115 ftrace_trace_special(tr, data, arg1, arg2, arg3, pc);
857 1116
858 preempt_enable_notrace(); 1117 atomic_dec(&data->disabled);
1118 local_irq_restore(flags);
859} 1119}
860 1120
861#ifdef CONFIG_FUNCTION_TRACER 1121#ifdef CONFIG_FUNCTION_TRACER
862static void 1122static void
863function_trace_call(unsigned long ip, unsigned long parent_ip) 1123function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
864{ 1124{
865 struct trace_array *tr = &global_trace; 1125 struct trace_array *tr = &global_trace;
866 struct trace_array_cpu *data; 1126 struct trace_array_cpu *data;
@@ -873,8 +1133,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)
873 return; 1133 return;
874 1134
875 pc = preempt_count(); 1135 pc = preempt_count();
876 resched = need_resched(); 1136 resched = ftrace_preempt_disable();
877 preempt_disable_notrace();
878 local_save_flags(flags); 1137 local_save_flags(flags);
879 cpu = raw_smp_processor_id(); 1138 cpu = raw_smp_processor_id();
880 data = tr->data[cpu]; 1139 data = tr->data[cpu];
@@ -884,11 +1143,62 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)
884 trace_function(tr, data, ip, parent_ip, flags, pc); 1143 trace_function(tr, data, ip, parent_ip, flags, pc);
885 1144
886 atomic_dec(&data->disabled); 1145 atomic_dec(&data->disabled);
887 if (resched) 1146 ftrace_preempt_enable(resched);
888 preempt_enable_no_resched_notrace(); 1147}
889 else 1148
890 preempt_enable_notrace(); 1149static void
1150function_trace_call(unsigned long ip, unsigned long parent_ip)
1151{
1152 struct trace_array *tr = &global_trace;
1153 struct trace_array_cpu *data;
1154 unsigned long flags;
1155 long disabled;
1156 int cpu;
1157 int pc;
1158
1159 if (unlikely(!ftrace_function_enabled))
1160 return;
1161
1162 /*
1163 * Need to use raw, since this must be called before the
1164 * recursive protection is performed.
1165 */
1166 local_irq_save(flags);
1167 cpu = raw_smp_processor_id();
1168 data = tr->data[cpu];
1169 disabled = atomic_inc_return(&data->disabled);
1170
1171 if (likely(disabled == 1)) {
1172 pc = preempt_count();
1173 trace_function(tr, data, ip, parent_ip, flags, pc);
1174 }
1175
1176 atomic_dec(&data->disabled);
1177 local_irq_restore(flags);
1178}
1179
1180#ifdef CONFIG_FUNCTION_RET_TRACER
1181void trace_function_return(struct ftrace_retfunc *trace)
1182{
1183 struct trace_array *tr = &global_trace;
1184 struct trace_array_cpu *data;
1185 unsigned long flags;
1186 long disabled;
1187 int cpu;
1188 int pc;
1189
1190 raw_local_irq_save(flags);
1191 cpu = raw_smp_processor_id();
1192 data = tr->data[cpu];
1193 disabled = atomic_inc_return(&data->disabled);
1194 if (likely(disabled == 1)) {
1195 pc = preempt_count();
1196 __trace_function_return(tr, data, trace, flags, pc);
1197 }
1198 atomic_dec(&data->disabled);
1199 raw_local_irq_restore(flags);
891} 1200}
1201#endif /* CONFIG_FUNCTION_RET_TRACER */
892 1202
893static struct ftrace_ops trace_ops __read_mostly = 1203static struct ftrace_ops trace_ops __read_mostly =
894{ 1204{
@@ -898,9 +1208,14 @@ static struct ftrace_ops trace_ops __read_mostly =
898void tracing_start_function_trace(void) 1208void tracing_start_function_trace(void)
899{ 1209{
900 ftrace_function_enabled = 0; 1210 ftrace_function_enabled = 0;
1211
1212 if (trace_flags & TRACE_ITER_PREEMPTONLY)
1213 trace_ops.func = function_trace_call_preempt_only;
1214 else
1215 trace_ops.func = function_trace_call;
1216
901 register_ftrace_function(&trace_ops); 1217 register_ftrace_function(&trace_ops);
902 if (tracer_enabled) 1218 ftrace_function_enabled = 1;
903 ftrace_function_enabled = 1;
904} 1219}
905 1220
906void tracing_stop_function_trace(void) 1221void tracing_stop_function_trace(void)
@@ -912,6 +1227,7 @@ void tracing_stop_function_trace(void)
912 1227
913enum trace_file_type { 1228enum trace_file_type {
914 TRACE_FILE_LAT_FMT = 1, 1229 TRACE_FILE_LAT_FMT = 1,
1230 TRACE_FILE_ANNOTATE = 2,
915}; 1231};
916 1232
917static void trace_iterator_increment(struct trace_iterator *iter, int cpu) 1233static void trace_iterator_increment(struct trace_iterator *iter, int cpu)
@@ -1047,10 +1363,6 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1047 1363
1048 atomic_inc(&trace_record_cmdline_disabled); 1364 atomic_inc(&trace_record_cmdline_disabled);
1049 1365
1050 /* let the tracer grab locks here if needed */
1051 if (current_trace->start)
1052 current_trace->start(iter);
1053
1054 if (*pos != iter->pos) { 1366 if (*pos != iter->pos) {
1055 iter->ent = NULL; 1367 iter->ent = NULL;
1056 iter->cpu = 0; 1368 iter->cpu = 0;
@@ -1077,14 +1389,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1077 1389
1078static void s_stop(struct seq_file *m, void *p) 1390static void s_stop(struct seq_file *m, void *p)
1079{ 1391{
1080 struct trace_iterator *iter = m->private;
1081
1082 atomic_dec(&trace_record_cmdline_disabled); 1392 atomic_dec(&trace_record_cmdline_disabled);
1083
1084 /* let the tracer release locks here if needed */
1085 if (current_trace && current_trace == iter->trace && iter->trace->stop)
1086 iter->trace->stop(iter);
1087
1088 mutex_unlock(&trace_types_lock); 1393 mutex_unlock(&trace_types_lock);
1089} 1394}
1090 1395
@@ -1143,7 +1448,7 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt,
1143# define IP_FMT "%016lx" 1448# define IP_FMT "%016lx"
1144#endif 1449#endif
1145 1450
1146static int 1451int
1147seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) 1452seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
1148{ 1453{
1149 int ret; 1454 int ret;
@@ -1164,6 +1469,78 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
1164 return ret; 1469 return ret;
1165} 1470}
1166 1471
1472static inline int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
1473 unsigned long ip, unsigned long sym_flags)
1474{
1475 struct file *file = NULL;
1476 unsigned long vmstart = 0;
1477 int ret = 1;
1478
1479 if (mm) {
1480 const struct vm_area_struct *vma;
1481
1482 down_read(&mm->mmap_sem);
1483 vma = find_vma(mm, ip);
1484 if (vma) {
1485 file = vma->vm_file;
1486 vmstart = vma->vm_start;
1487 }
1488 if (file) {
1489 ret = trace_seq_path(s, &file->f_path);
1490 if (ret)
1491 ret = trace_seq_printf(s, "[+0x%lx]", ip - vmstart);
1492 }
1493 up_read(&mm->mmap_sem);
1494 }
1495 if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file))
1496 ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
1497 return ret;
1498}
1499
1500static int
1501seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
1502 unsigned long sym_flags)
1503{
1504 struct mm_struct *mm = NULL;
1505 int ret = 1;
1506 unsigned int i;
1507
1508 if (trace_flags & TRACE_ITER_SYM_USEROBJ) {
1509 struct task_struct *task;
1510 /*
1511 * we do the lookup on the thread group leader,
1512 * since individual threads might have already quit!
1513 */
1514 rcu_read_lock();
1515 task = find_task_by_vpid(entry->ent.tgid);
1516 if (task)
1517 mm = get_task_mm(task);
1518 rcu_read_unlock();
1519 }
1520
1521 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
1522 unsigned long ip = entry->caller[i];
1523
1524 if (ip == ULONG_MAX || !ret)
1525 break;
1526 if (i && ret)
1527 ret = trace_seq_puts(s, " <- ");
1528 if (!ip) {
1529 if (ret)
1530 ret = trace_seq_puts(s, "??");
1531 continue;
1532 }
1533 if (!ret)
1534 break;
1535 if (ret)
1536 ret = seq_print_user_ip(s, mm, ip, sym_flags);
1537 }
1538
1539 if (mm)
1540 mmput(mm);
1541 return ret;
1542}
1543
1167static void print_lat_help_header(struct seq_file *m) 1544static void print_lat_help_header(struct seq_file *m)
1168{ 1545{
1169 seq_puts(m, "# _------=> CPU# \n"); 1546 seq_puts(m, "# _------=> CPU# \n");
@@ -1338,6 +1715,23 @@ void trace_seq_print_cont(struct trace_seq *s, struct trace_iterator *iter)
1338 trace_seq_putc(s, '\n'); 1715 trace_seq_putc(s, '\n');
1339} 1716}
1340 1717
1718static void test_cpu_buff_start(struct trace_iterator *iter)
1719{
1720 struct trace_seq *s = &iter->seq;
1721
1722 if (!(trace_flags & TRACE_ITER_ANNOTATE))
1723 return;
1724
1725 if (!(iter->iter_flags & TRACE_FILE_ANNOTATE))
1726 return;
1727
1728 if (cpu_isset(iter->cpu, iter->started))
1729 return;
1730
1731 cpu_set(iter->cpu, iter->started);
1732 trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu);
1733}
1734
1341static enum print_line_t 1735static enum print_line_t
1342print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) 1736print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
1343{ 1737{
@@ -1357,6 +1751,8 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
1357 if (entry->type == TRACE_CONT) 1751 if (entry->type == TRACE_CONT)
1358 return TRACE_TYPE_HANDLED; 1752 return TRACE_TYPE_HANDLED;
1359 1753
1754 test_cpu_buff_start(iter);
1755
1360 next_entry = find_next_entry(iter, NULL, &next_ts); 1756 next_entry = find_next_entry(iter, NULL, &next_ts);
1361 if (!next_entry) 1757 if (!next_entry)
1362 next_ts = iter->ts; 1758 next_ts = iter->ts;
@@ -1448,6 +1844,27 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
1448 trace_seq_print_cont(s, iter); 1844 trace_seq_print_cont(s, iter);
1449 break; 1845 break;
1450 } 1846 }
1847 case TRACE_BRANCH: {
1848 struct trace_branch *field;
1849
1850 trace_assign_type(field, entry);
1851
1852 trace_seq_printf(s, "[%s] %s:%s:%d\n",
1853 field->correct ? " ok " : " MISS ",
1854 field->func,
1855 field->file,
1856 field->line);
1857 break;
1858 }
1859 case TRACE_USER_STACK: {
1860 struct userstack_entry *field;
1861
1862 trace_assign_type(field, entry);
1863
1864 seq_print_userip_objs(field, s, sym_flags);
1865 trace_seq_putc(s, '\n');
1866 break;
1867 }
1451 default: 1868 default:
1452 trace_seq_printf(s, "Unknown type %d\n", entry->type); 1869 trace_seq_printf(s, "Unknown type %d\n", entry->type);
1453 } 1870 }
@@ -1472,6 +1889,8 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
1472 if (entry->type == TRACE_CONT) 1889 if (entry->type == TRACE_CONT)
1473 return TRACE_TYPE_HANDLED; 1890 return TRACE_TYPE_HANDLED;
1474 1891
1892 test_cpu_buff_start(iter);
1893
1475 comm = trace_find_cmdline(iter->ent->pid); 1894 comm = trace_find_cmdline(iter->ent->pid);
1476 1895
1477 t = ns2usecs(iter->ts); 1896 t = ns2usecs(iter->ts);
@@ -1581,6 +2000,35 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
1581 trace_seq_print_cont(s, iter); 2000 trace_seq_print_cont(s, iter);
1582 break; 2001 break;
1583 } 2002 }
2003 case TRACE_FN_RET: {
2004 return print_return_function(iter);
2005 break;
2006 }
2007 case TRACE_BRANCH: {
2008 struct trace_branch *field;
2009
2010 trace_assign_type(field, entry);
2011
2012 trace_seq_printf(s, "[%s] %s:%s:%d\n",
2013 field->correct ? " ok " : " MISS ",
2014 field->func,
2015 field->file,
2016 field->line);
2017 break;
2018 }
2019 case TRACE_USER_STACK: {
2020 struct userstack_entry *field;
2021
2022 trace_assign_type(field, entry);
2023
2024 ret = seq_print_userip_objs(field, s, sym_flags);
2025 if (!ret)
2026 return TRACE_TYPE_PARTIAL_LINE;
2027 ret = trace_seq_putc(s, '\n');
2028 if (!ret)
2029 return TRACE_TYPE_PARTIAL_LINE;
2030 break;
2031 }
1584 } 2032 }
1585 return TRACE_TYPE_HANDLED; 2033 return TRACE_TYPE_HANDLED;
1586} 2034}
@@ -1640,6 +2088,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
1640 break; 2088 break;
1641 } 2089 }
1642 case TRACE_SPECIAL: 2090 case TRACE_SPECIAL:
2091 case TRACE_USER_STACK:
1643 case TRACE_STACK: { 2092 case TRACE_STACK: {
1644 struct special_entry *field; 2093 struct special_entry *field;
1645 2094
@@ -1728,6 +2177,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
1728 break; 2177 break;
1729 } 2178 }
1730 case TRACE_SPECIAL: 2179 case TRACE_SPECIAL:
2180 case TRACE_USER_STACK:
1731 case TRACE_STACK: { 2181 case TRACE_STACK: {
1732 struct special_entry *field; 2182 struct special_entry *field;
1733 2183
@@ -1782,6 +2232,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
1782 break; 2232 break;
1783 } 2233 }
1784 case TRACE_SPECIAL: 2234 case TRACE_SPECIAL:
2235 case TRACE_USER_STACK:
1785 case TRACE_STACK: { 2236 case TRACE_STACK: {
1786 struct special_entry *field; 2237 struct special_entry *field;
1787 2238
@@ -1899,6 +2350,11 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
1899 iter->trace = current_trace; 2350 iter->trace = current_trace;
1900 iter->pos = -1; 2351 iter->pos = -1;
1901 2352
2353 /* Annotate start of buffers if we had overruns */
2354 if (ring_buffer_overruns(iter->tr->buffer))
2355 iter->iter_flags |= TRACE_FILE_ANNOTATE;
2356
2357
1902 for_each_tracing_cpu(cpu) { 2358 for_each_tracing_cpu(cpu) {
1903 2359
1904 iter->buffer_iter[cpu] = 2360 iter->buffer_iter[cpu] =
@@ -1917,10 +2373,7 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
1917 m->private = iter; 2373 m->private = iter;
1918 2374
1919 /* stop the trace while dumping */ 2375 /* stop the trace while dumping */
1920 if (iter->tr->ctrl) { 2376 tracing_stop();
1921 tracer_enabled = 0;
1922 ftrace_function_enabled = 0;
1923 }
1924 2377
1925 if (iter->trace && iter->trace->open) 2378 if (iter->trace && iter->trace->open)
1926 iter->trace->open(iter); 2379 iter->trace->open(iter);
@@ -1966,14 +2419,7 @@ int tracing_release(struct inode *inode, struct file *file)
1966 iter->trace->close(iter); 2419 iter->trace->close(iter);
1967 2420
1968 /* reenable tracing if it was previously enabled */ 2421 /* reenable tracing if it was previously enabled */
1969 if (iter->tr->ctrl) { 2422 tracing_start();
1970 tracer_enabled = 1;
1971 /*
1972 * It is safe to enable function tracing even if it
1973 * isn't used
1974 */
1975 ftrace_function_enabled = 1;
1976 }
1977 mutex_unlock(&trace_types_lock); 2423 mutex_unlock(&trace_types_lock);
1978 2424
1979 seq_release(inode, file); 2425 seq_release(inode, file);
@@ -2189,13 +2635,16 @@ static struct file_operations tracing_cpumask_fops = {
2189}; 2635};
2190 2636
2191static ssize_t 2637static ssize_t
2192tracing_iter_ctrl_read(struct file *filp, char __user *ubuf, 2638tracing_trace_options_read(struct file *filp, char __user *ubuf,
2193 size_t cnt, loff_t *ppos) 2639 size_t cnt, loff_t *ppos)
2194{ 2640{
2641 int i;
2195 char *buf; 2642 char *buf;
2196 int r = 0; 2643 int r = 0;
2197 int len = 0; 2644 int len = 0;
2198 int i; 2645 u32 tracer_flags = current_trace->flags->val;
2646 struct tracer_opt *trace_opts = current_trace->flags->opts;
2647
2199 2648
 2200 /* calculate max size */ 2649 /* calculate max size */
2201 for (i = 0; trace_options[i]; i++) { 2650 for (i = 0; trace_options[i]; i++) {
@@ -2203,6 +2652,15 @@ tracing_iter_ctrl_read(struct file *filp, char __user *ubuf,
2203 len += 3; /* "no" and space */ 2652 len += 3; /* "no" and space */
2204 } 2653 }
2205 2654
2655 /*
2656 * Increase the size with names of options specific
2657 * of the current tracer.
2658 */
2659 for (i = 0; trace_opts[i].name; i++) {
2660 len += strlen(trace_opts[i].name);
2661 len += 3; /* "no" and space */
2662 }
2663
2206 /* +2 for \n and \0 */ 2664 /* +2 for \n and \0 */
2207 buf = kmalloc(len + 2, GFP_KERNEL); 2665 buf = kmalloc(len + 2, GFP_KERNEL);
2208 if (!buf) 2666 if (!buf)
@@ -2215,6 +2673,15 @@ tracing_iter_ctrl_read(struct file *filp, char __user *ubuf,
2215 r += sprintf(buf + r, "no%s ", trace_options[i]); 2673 r += sprintf(buf + r, "no%s ", trace_options[i]);
2216 } 2674 }
2217 2675
2676 for (i = 0; trace_opts[i].name; i++) {
2677 if (tracer_flags & trace_opts[i].bit)
2678 r += sprintf(buf + r, "%s ",
2679 trace_opts[i].name);
2680 else
2681 r += sprintf(buf + r, "no%s ",
2682 trace_opts[i].name);
2683 }
2684
2218 r += sprintf(buf + r, "\n"); 2685 r += sprintf(buf + r, "\n");
2219 WARN_ON(r >= len + 2); 2686 WARN_ON(r >= len + 2);
2220 2687
@@ -2225,13 +2692,48 @@ tracing_iter_ctrl_read(struct file *filp, char __user *ubuf,
2225 return r; 2692 return r;
2226} 2693}
2227 2694
 2695/* Try to assign a tracer-specific option */
2696static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
2697{
2698 struct tracer_flags *trace_flags = trace->flags;
2699 struct tracer_opt *opts = NULL;
2700 int ret = 0, i = 0;
2701 int len;
2702
2703 for (i = 0; trace_flags->opts[i].name; i++) {
2704 opts = &trace_flags->opts[i];
2705 len = strlen(opts->name);
2706
2707 if (strncmp(cmp, opts->name, len) == 0) {
2708 ret = trace->set_flag(trace_flags->val,
2709 opts->bit, !neg);
2710 break;
2711 }
2712 }
2713 /* Not found */
2714 if (!trace_flags->opts[i].name)
2715 return -EINVAL;
2716
2717 /* Refused to handle */
2718 if (ret)
2719 return ret;
2720
2721 if (neg)
2722 trace_flags->val &= ~opts->bit;
2723 else
2724 trace_flags->val |= opts->bit;
2725
2726 return 0;
2727}
2728
2228static ssize_t 2729static ssize_t
2229tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf, 2730tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2230 size_t cnt, loff_t *ppos) 2731 size_t cnt, loff_t *ppos)
2231{ 2732{
2232 char buf[64]; 2733 char buf[64];
2233 char *cmp = buf; 2734 char *cmp = buf;
2234 int neg = 0; 2735 int neg = 0;
2736 int ret;
2235 int i; 2737 int i;
2236 2738
2237 if (cnt >= sizeof(buf)) 2739 if (cnt >= sizeof(buf))
@@ -2258,11 +2760,13 @@ tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf,
2258 break; 2760 break;
2259 } 2761 }
2260 } 2762 }
2261 /* 2763
2262 * If no option could be set, return an error: 2764 /* If no option could be set, test the specific tracer options */
2263 */ 2765 if (!trace_options[i]) {
2264 if (!trace_options[i]) 2766 ret = set_tracer_option(current_trace, cmp, neg);
2265 return -EINVAL; 2767 if (ret)
2768 return ret;
2769 }
2266 2770
2267 filp->f_pos += cnt; 2771 filp->f_pos += cnt;
2268 2772
@@ -2271,8 +2775,8 @@ tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf,
2271 2775
2272static struct file_operations tracing_iter_fops = { 2776static struct file_operations tracing_iter_fops = {
2273 .open = tracing_open_generic, 2777 .open = tracing_open_generic,
2274 .read = tracing_iter_ctrl_read, 2778 .read = tracing_trace_options_read,
2275 .write = tracing_iter_ctrl_write, 2779 .write = tracing_trace_options_write,
2276}; 2780};
2277 2781
2278static const char readme_msg[] = 2782static const char readme_msg[] =
@@ -2286,9 +2790,9 @@ static const char readme_msg[] =
2286 "# echo sched_switch > /debug/tracing/current_tracer\n" 2790 "# echo sched_switch > /debug/tracing/current_tracer\n"
2287 "# cat /debug/tracing/current_tracer\n" 2791 "# cat /debug/tracing/current_tracer\n"
2288 "sched_switch\n" 2792 "sched_switch\n"
2289 "# cat /debug/tracing/iter_ctrl\n" 2793 "# cat /debug/tracing/trace_options\n"
2290 "noprint-parent nosym-offset nosym-addr noverbose\n" 2794 "noprint-parent nosym-offset nosym-addr noverbose\n"
2291 "# echo print-parent > /debug/tracing/iter_ctrl\n" 2795 "# echo print-parent > /debug/tracing/trace_options\n"
2292 "# echo 1 > /debug/tracing/tracing_enabled\n" 2796 "# echo 1 > /debug/tracing/tracing_enabled\n"
2293 "# cat /debug/tracing/trace > /tmp/trace.txt\n" 2797 "# cat /debug/tracing/trace > /tmp/trace.txt\n"
2294 "echo 0 > /debug/tracing/tracing_enabled\n" 2798 "echo 0 > /debug/tracing/tracing_enabled\n"
@@ -2311,11 +2815,10 @@ static ssize_t
2311tracing_ctrl_read(struct file *filp, char __user *ubuf, 2815tracing_ctrl_read(struct file *filp, char __user *ubuf,
2312 size_t cnt, loff_t *ppos) 2816 size_t cnt, loff_t *ppos)
2313{ 2817{
2314 struct trace_array *tr = filp->private_data;
2315 char buf[64]; 2818 char buf[64];
2316 int r; 2819 int r;
2317 2820
2318 r = sprintf(buf, "%ld\n", tr->ctrl); 2821 r = sprintf(buf, "%u\n", tracer_enabled);
2319 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 2822 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2320} 2823}
2321 2824
@@ -2343,16 +2846,18 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
2343 val = !!val; 2846 val = !!val;
2344 2847
2345 mutex_lock(&trace_types_lock); 2848 mutex_lock(&trace_types_lock);
2346 if (tr->ctrl ^ val) { 2849 if (tracer_enabled ^ val) {
2347 if (val) 2850 if (val) {
2348 tracer_enabled = 1; 2851 tracer_enabled = 1;
2349 else 2852 if (current_trace->start)
2853 current_trace->start(tr);
2854 tracing_start();
2855 } else {
2350 tracer_enabled = 0; 2856 tracer_enabled = 0;
2351 2857 tracing_stop();
2352 tr->ctrl = val; 2858 if (current_trace->stop)
2353 2859 current_trace->stop(tr);
2354 if (current_trace && current_trace->ctrl_update) 2860 }
2355 current_trace->ctrl_update(tr);
2356 } 2861 }
2357 mutex_unlock(&trace_types_lock); 2862 mutex_unlock(&trace_types_lock);
2358 2863
@@ -2378,29 +2883,11 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf,
2378 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 2883 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2379} 2884}
2380 2885
2381static ssize_t 2886static int tracing_set_tracer(char *buf)
2382tracing_set_trace_write(struct file *filp, const char __user *ubuf,
2383 size_t cnt, loff_t *ppos)
2384{ 2887{
2385 struct trace_array *tr = &global_trace; 2888 struct trace_array *tr = &global_trace;
2386 struct tracer *t; 2889 struct tracer *t;
2387 char buf[max_tracer_type_len+1]; 2890 int ret = 0;
2388 int i;
2389 size_t ret;
2390
2391 ret = cnt;
2392
2393 if (cnt > max_tracer_type_len)
2394 cnt = max_tracer_type_len;
2395
2396 if (copy_from_user(&buf, ubuf, cnt))
2397 return -EFAULT;
2398
2399 buf[cnt] = 0;
2400
2401 /* strip ending whitespace. */
2402 for (i = cnt - 1; i > 0 && isspace(buf[i]); i--)
2403 buf[i] = 0;
2404 2891
2405 mutex_lock(&trace_types_lock); 2892 mutex_lock(&trace_types_lock);
2406 for (t = trace_types; t; t = t->next) { 2893 for (t = trace_types; t; t = t->next) {
@@ -2414,18 +2901,52 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
2414 if (t == current_trace) 2901 if (t == current_trace)
2415 goto out; 2902 goto out;
2416 2903
2904 trace_branch_disable();
2417 if (current_trace && current_trace->reset) 2905 if (current_trace && current_trace->reset)
2418 current_trace->reset(tr); 2906 current_trace->reset(tr);
2419 2907
2420 current_trace = t; 2908 current_trace = t;
2421 if (t->init) 2909 if (t->init) {
2422 t->init(tr); 2910 ret = t->init(tr);
2911 if (ret)
2912 goto out;
2913 }
2423 2914
2915 trace_branch_enable(tr);
2424 out: 2916 out:
2425 mutex_unlock(&trace_types_lock); 2917 mutex_unlock(&trace_types_lock);
2426 2918
2427 if (ret > 0) 2919 return ret;
2428 filp->f_pos += ret; 2920}
2921
2922static ssize_t
2923tracing_set_trace_write(struct file *filp, const char __user *ubuf,
2924 size_t cnt, loff_t *ppos)
2925{
2926 char buf[max_tracer_type_len+1];
2927 int i;
2928 size_t ret;
2929 int err;
2930
2931 ret = cnt;
2932
2933 if (cnt > max_tracer_type_len)
2934 cnt = max_tracer_type_len;
2935
2936 if (copy_from_user(&buf, ubuf, cnt))
2937 return -EFAULT;
2938
2939 buf[cnt] = 0;
2940
2941 /* strip ending whitespace. */
2942 for (i = cnt - 1; i > 0 && isspace(buf[i]); i--)
2943 buf[i] = 0;
2944
2945 err = tracing_set_tracer(buf);
2946 if (err)
2947 return err;
2948
2949 filp->f_pos += ret;
2429 2950
2430 return ret; 2951 return ret;
2431} 2952}
@@ -2492,6 +3013,10 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
2492 return -ENOMEM; 3013 return -ENOMEM;
2493 3014
2494 mutex_lock(&trace_types_lock); 3015 mutex_lock(&trace_types_lock);
3016
3017 /* trace pipe does not show start of buffer */
3018 cpus_setall(iter->started);
3019
2495 iter->tr = &global_trace; 3020 iter->tr = &global_trace;
2496 iter->trace = current_trace; 3021 iter->trace = current_trace;
2497 filp->private_data = iter; 3022 filp->private_data = iter;
@@ -2667,7 +3192,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf,
2667 char buf[64]; 3192 char buf[64];
2668 int r; 3193 int r;
2669 3194
2670 r = sprintf(buf, "%lu\n", tr->entries); 3195 r = sprintf(buf, "%lu\n", tr->entries >> 10);
2671 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 3196 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2672} 3197}
2673 3198
@@ -2678,7 +3203,6 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
2678 unsigned long val; 3203 unsigned long val;
2679 char buf[64]; 3204 char buf[64];
2680 int ret, cpu; 3205 int ret, cpu;
2681 struct trace_array *tr = filp->private_data;
2682 3206
2683 if (cnt >= sizeof(buf)) 3207 if (cnt >= sizeof(buf))
2684 return -EINVAL; 3208 return -EINVAL;
@@ -2698,12 +3222,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
2698 3222
2699 mutex_lock(&trace_types_lock); 3223 mutex_lock(&trace_types_lock);
2700 3224
2701 if (tr->ctrl) { 3225 tracing_stop();
2702 cnt = -EBUSY;
2703 pr_info("ftrace: please disable tracing"
2704 " before modifying buffer size\n");
2705 goto out;
2706 }
2707 3226
2708 /* disable all cpu buffers */ 3227 /* disable all cpu buffers */
2709 for_each_tracing_cpu(cpu) { 3228 for_each_tracing_cpu(cpu) {
@@ -2713,6 +3232,9 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
2713 atomic_inc(&max_tr.data[cpu]->disabled); 3232 atomic_inc(&max_tr.data[cpu]->disabled);
2714 } 3233 }
2715 3234
3235 /* value is in KB */
3236 val <<= 10;
3237
2716 if (val != global_trace.entries) { 3238 if (val != global_trace.entries) {
2717 ret = ring_buffer_resize(global_trace.buffer, val); 3239 ret = ring_buffer_resize(global_trace.buffer, val);
2718 if (ret < 0) { 3240 if (ret < 0) {
@@ -2751,6 +3273,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
2751 atomic_dec(&max_tr.data[cpu]->disabled); 3273 atomic_dec(&max_tr.data[cpu]->disabled);
2752 } 3274 }
2753 3275
3276 tracing_start();
2754 max_tr.entries = global_trace.entries; 3277 max_tr.entries = global_trace.entries;
2755 mutex_unlock(&trace_types_lock); 3278 mutex_unlock(&trace_types_lock);
2756 3279
@@ -2773,9 +3296,8 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
2773{ 3296{
2774 char *buf; 3297 char *buf;
2775 char *end; 3298 char *end;
2776 struct trace_array *tr = &global_trace;
2777 3299
2778 if (!tr->ctrl || tracing_disabled) 3300 if (tracing_disabled)
2779 return -EINVAL; 3301 return -EINVAL;
2780 3302
2781 if (cnt > TRACE_BUF_SIZE) 3303 if (cnt > TRACE_BUF_SIZE)
@@ -2841,22 +3363,38 @@ static struct file_operations tracing_mark_fops = {
2841 3363
2842#ifdef CONFIG_DYNAMIC_FTRACE 3364#ifdef CONFIG_DYNAMIC_FTRACE
2843 3365
3366int __weak ftrace_arch_read_dyn_info(char *buf, int size)
3367{
3368 return 0;
3369}
3370
2844static ssize_t 3371static ssize_t
2845tracing_read_long(struct file *filp, char __user *ubuf, 3372tracing_read_dyn_info(struct file *filp, char __user *ubuf,
2846 size_t cnt, loff_t *ppos) 3373 size_t cnt, loff_t *ppos)
2847{ 3374{
3375 static char ftrace_dyn_info_buffer[1024];
3376 static DEFINE_MUTEX(dyn_info_mutex);
2848 unsigned long *p = filp->private_data; 3377 unsigned long *p = filp->private_data;
2849 char buf[64]; 3378 char *buf = ftrace_dyn_info_buffer;
3379 int size = ARRAY_SIZE(ftrace_dyn_info_buffer);
2850 int r; 3380 int r;
2851 3381
2852 r = sprintf(buf, "%ld\n", *p); 3382 mutex_lock(&dyn_info_mutex);
3383 r = sprintf(buf, "%ld ", *p);
2853 3384
2854 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 3385 r += ftrace_arch_read_dyn_info(buf+r, (size-1)-r);
3386 buf[r++] = '\n';
3387
3388 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
3389
3390 mutex_unlock(&dyn_info_mutex);
3391
3392 return r;
2855} 3393}
2856 3394
2857static struct file_operations tracing_read_long_fops = { 3395static struct file_operations tracing_dyn_info_fops = {
2858 .open = tracing_open_generic, 3396 .open = tracing_open_generic,
2859 .read = tracing_read_long, 3397 .read = tracing_read_dyn_info,
2860}; 3398};
2861#endif 3399#endif
2862 3400
@@ -2897,10 +3435,10 @@ static __init int tracer_init_debugfs(void)
2897 if (!entry) 3435 if (!entry)
2898 pr_warning("Could not create debugfs 'tracing_enabled' entry\n"); 3436 pr_warning("Could not create debugfs 'tracing_enabled' entry\n");
2899 3437
2900 entry = debugfs_create_file("iter_ctrl", 0644, d_tracer, 3438 entry = debugfs_create_file("trace_options", 0644, d_tracer,
2901 NULL, &tracing_iter_fops); 3439 NULL, &tracing_iter_fops);
2902 if (!entry) 3440 if (!entry)
2903 pr_warning("Could not create debugfs 'iter_ctrl' entry\n"); 3441 pr_warning("Could not create debugfs 'trace_options' entry\n");
2904 3442
2905 entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer, 3443 entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer,
2906 NULL, &tracing_cpumask_fops); 3444 NULL, &tracing_cpumask_fops);
@@ -2950,11 +3488,11 @@ static __init int tracer_init_debugfs(void)
2950 pr_warning("Could not create debugfs " 3488 pr_warning("Could not create debugfs "
2951 "'trace_pipe' entry\n"); 3489 "'trace_pipe' entry\n");
2952 3490
2953 entry = debugfs_create_file("trace_entries", 0644, d_tracer, 3491 entry = debugfs_create_file("buffer_size_kb", 0644, d_tracer,
2954 &global_trace, &tracing_entries_fops); 3492 &global_trace, &tracing_entries_fops);
2955 if (!entry) 3493 if (!entry)
2956 pr_warning("Could not create debugfs " 3494 pr_warning("Could not create debugfs "
2957 "'trace_entries' entry\n"); 3495 "'buffer_size_kb' entry\n");
2958 3496
2959 entry = debugfs_create_file("trace_marker", 0220, d_tracer, 3497 entry = debugfs_create_file("trace_marker", 0220, d_tracer,
2960 NULL, &tracing_mark_fops); 3498 NULL, &tracing_mark_fops);
@@ -2965,7 +3503,7 @@ static __init int tracer_init_debugfs(void)
2965#ifdef CONFIG_DYNAMIC_FTRACE 3503#ifdef CONFIG_DYNAMIC_FTRACE
2966 entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer, 3504 entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer,
2967 &ftrace_update_tot_cnt, 3505 &ftrace_update_tot_cnt,
2968 &tracing_read_long_fops); 3506 &tracing_dyn_info_fops);
2969 if (!entry) 3507 if (!entry)
2970 pr_warning("Could not create debugfs " 3508 pr_warning("Could not create debugfs "
2971 "'dyn_ftrace_total_info' entry\n"); 3509 "'dyn_ftrace_total_info' entry\n");
@@ -2988,7 +3526,7 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
2988 unsigned long flags, irq_flags; 3526 unsigned long flags, irq_flags;
2989 int cpu, len = 0, size, pc; 3527 int cpu, len = 0, size, pc;
2990 3528
2991 if (!tr->ctrl || tracing_disabled) 3529 if (tracing_disabled)
2992 return 0; 3530 return 0;
2993 3531
2994 pc = preempt_count(); 3532 pc = preempt_count();
@@ -3046,7 +3584,8 @@ EXPORT_SYMBOL_GPL(__ftrace_printk);
3046static int trace_panic_handler(struct notifier_block *this, 3584static int trace_panic_handler(struct notifier_block *this,
3047 unsigned long event, void *unused) 3585 unsigned long event, void *unused)
3048{ 3586{
3049 ftrace_dump(); 3587 if (ftrace_dump_on_oops)
3588 ftrace_dump();
3050 return NOTIFY_OK; 3589 return NOTIFY_OK;
3051} 3590}
3052 3591
@@ -3062,7 +3601,8 @@ static int trace_die_handler(struct notifier_block *self,
3062{ 3601{
3063 switch (val) { 3602 switch (val) {
3064 case DIE_OOPS: 3603 case DIE_OOPS:
3065 ftrace_dump(); 3604 if (ftrace_dump_on_oops)
3605 ftrace_dump();
3066 break; 3606 break;
3067 default: 3607 default:
3068 break; 3608 break;
@@ -3103,7 +3643,6 @@ trace_printk_seq(struct trace_seq *s)
3103 trace_seq_reset(s); 3643 trace_seq_reset(s);
3104} 3644}
3105 3645
3106
3107void ftrace_dump(void) 3646void ftrace_dump(void)
3108{ 3647{
3109 static DEFINE_SPINLOCK(ftrace_dump_lock); 3648 static DEFINE_SPINLOCK(ftrace_dump_lock);
@@ -3128,6 +3667,9 @@ void ftrace_dump(void)
3128 atomic_inc(&global_trace.data[cpu]->disabled); 3667 atomic_inc(&global_trace.data[cpu]->disabled);
3129 } 3668 }
3130 3669
3670 /* don't look at user memory in panic mode */
3671 trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
3672
3131 printk(KERN_TRACE "Dumping ftrace buffer:\n"); 3673 printk(KERN_TRACE "Dumping ftrace buffer:\n");
3132 3674
3133 iter.tr = &global_trace; 3675 iter.tr = &global_trace;
@@ -3221,7 +3763,6 @@ __init static int tracer_alloc_buffers(void)
3221#endif 3763#endif
3222 3764
3223 /* All seems OK, enable tracing */ 3765 /* All seems OK, enable tracing */
3224 global_trace.ctrl = tracer_enabled;
3225 tracing_disabled = 0; 3766 tracing_disabled = 0;
3226 3767
3227 atomic_notifier_chain_register(&panic_notifier_list, 3768 atomic_notifier_chain_register(&panic_notifier_list,
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 8465ad052707..28c15c2ebc22 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -8,6 +8,7 @@
8#include <linux/ring_buffer.h> 8#include <linux/ring_buffer.h>
9#include <linux/mmiotrace.h> 9#include <linux/mmiotrace.h>
10#include <linux/ftrace.h> 10#include <linux/ftrace.h>
11#include <trace/boot.h>
11 12
12enum trace_type { 13enum trace_type {
13 __TRACE_FIRST_TYPE = 0, 14 __TRACE_FIRST_TYPE = 0,
@@ -21,7 +22,11 @@ enum trace_type {
21 TRACE_SPECIAL, 22 TRACE_SPECIAL,
22 TRACE_MMIO_RW, 23 TRACE_MMIO_RW,
23 TRACE_MMIO_MAP, 24 TRACE_MMIO_MAP,
24 TRACE_BOOT, 25 TRACE_BRANCH,
26 TRACE_BOOT_CALL,
27 TRACE_BOOT_RET,
28 TRACE_FN_RET,
29 TRACE_USER_STACK,
25 30
26 __TRACE_LAST_TYPE 31 __TRACE_LAST_TYPE
27}; 32};
@@ -38,6 +43,7 @@ struct trace_entry {
38 unsigned char flags; 43 unsigned char flags;
39 unsigned char preempt_count; 44 unsigned char preempt_count;
40 int pid; 45 int pid;
46 int tgid;
41}; 47};
42 48
43/* 49/*
@@ -48,6 +54,16 @@ struct ftrace_entry {
48 unsigned long ip; 54 unsigned long ip;
49 unsigned long parent_ip; 55 unsigned long parent_ip;
50}; 56};
57
58/* Function return entry */
59struct ftrace_ret_entry {
60 struct trace_entry ent;
61 unsigned long ip;
62 unsigned long parent_ip;
63 unsigned long long calltime;
64 unsigned long long rettime;
65 unsigned long overrun;
66};
51extern struct tracer boot_tracer; 67extern struct tracer boot_tracer;
52 68
53/* 69/*
@@ -85,6 +101,11 @@ struct stack_entry {
85 unsigned long caller[FTRACE_STACK_ENTRIES]; 101 unsigned long caller[FTRACE_STACK_ENTRIES];
86}; 102};
87 103
104struct userstack_entry {
105 struct trace_entry ent;
106 unsigned long caller[FTRACE_STACK_ENTRIES];
107};
108
88/* 109/*
89 * ftrace_printk entry: 110 * ftrace_printk entry:
90 */ 111 */
@@ -112,9 +133,24 @@ struct trace_mmiotrace_map {
112 struct mmiotrace_map map; 133 struct mmiotrace_map map;
113}; 134};
114 135
115struct trace_boot { 136struct trace_boot_call {
116 struct trace_entry ent; 137 struct trace_entry ent;
117 struct boot_trace initcall; 138 struct boot_trace_call boot_call;
139};
140
141struct trace_boot_ret {
142 struct trace_entry ent;
143 struct boot_trace_ret boot_ret;
144};
145
146#define TRACE_FUNC_SIZE 30
147#define TRACE_FILE_SIZE 20
148struct trace_branch {
149 struct trace_entry ent;
150 unsigned line;
151 char func[TRACE_FUNC_SIZE+1];
152 char file[TRACE_FILE_SIZE+1];
153 char correct;
118}; 154};
119 155
120/* 156/*
@@ -172,7 +208,6 @@ struct trace_iterator;
172struct trace_array { 208struct trace_array {
173 struct ring_buffer *buffer; 209 struct ring_buffer *buffer;
174 unsigned long entries; 210 unsigned long entries;
175 long ctrl;
176 int cpu; 211 int cpu;
177 cycle_t time_start; 212 cycle_t time_start;
178 struct task_struct *waiter; 213 struct task_struct *waiter;
@@ -212,13 +247,17 @@ extern void __ftrace_bad_type(void);
212 IF_ASSIGN(var, ent, struct ctx_switch_entry, 0); \ 247 IF_ASSIGN(var, ent, struct ctx_switch_entry, 0); \
213 IF_ASSIGN(var, ent, struct trace_field_cont, TRACE_CONT); \ 248 IF_ASSIGN(var, ent, struct trace_field_cont, TRACE_CONT); \
214 IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \ 249 IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \
250 IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
215 IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ 251 IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \
216 IF_ASSIGN(var, ent, struct special_entry, 0); \ 252 IF_ASSIGN(var, ent, struct special_entry, 0); \
217 IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ 253 IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \
218 TRACE_MMIO_RW); \ 254 TRACE_MMIO_RW); \
219 IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ 255 IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \
220 TRACE_MMIO_MAP); \ 256 TRACE_MMIO_MAP); \
221 IF_ASSIGN(var, ent, struct trace_boot, TRACE_BOOT); \ 257 IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\
258 IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\
259 IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \
260 IF_ASSIGN(var, ent, struct ftrace_ret_entry, TRACE_FN_RET);\
222 __ftrace_bad_type(); \ 261 __ftrace_bad_type(); \
223 } while (0) 262 } while (0)
224 263
@@ -229,29 +268,55 @@ enum print_line_t {
229 TRACE_TYPE_UNHANDLED = 2 /* Relay to other output functions */ 268 TRACE_TYPE_UNHANDLED = 2 /* Relay to other output functions */
230}; 269};
231 270
271
272/*
273 * An option specific to a tracer. This is a boolean value.
 274 * The bit is the bit index that sets its value in the
 275 * flags value of struct tracer_flags.
276 */
277struct tracer_opt {
 278	const char 		*name; /* Will appear in the trace_options file */
279 u32 bit; /* Mask assigned in val field in tracer_flags */
280};
281
282/*
283 * The set of specific options for a tracer. Your tracer
 284 * has to set the initial value of the flags val.
285 */
286struct tracer_flags {
287 u32 val;
288 struct tracer_opt *opts;
289};
290
 291/* Makes it easier to define a tracer opt */
292#define TRACER_OPT(s, b) .name = #s, .bit = b
293
232/* 294/*
233 * A specific tracer, represented by methods that operate on a trace array: 295 * A specific tracer, represented by methods that operate on a trace array:
234 */ 296 */
235struct tracer { 297struct tracer {
236 const char *name; 298 const char *name;
237 void (*init)(struct trace_array *tr); 299 /* Your tracer should raise a warning if init fails */
300 int (*init)(struct trace_array *tr);
238 void (*reset)(struct trace_array *tr); 301 void (*reset)(struct trace_array *tr);
302 void (*start)(struct trace_array *tr);
303 void (*stop)(struct trace_array *tr);
239 void (*open)(struct trace_iterator *iter); 304 void (*open)(struct trace_iterator *iter);
240 void (*pipe_open)(struct trace_iterator *iter); 305 void (*pipe_open)(struct trace_iterator *iter);
241 void (*close)(struct trace_iterator *iter); 306 void (*close)(struct trace_iterator *iter);
242 void (*start)(struct trace_iterator *iter);
243 void (*stop)(struct trace_iterator *iter);
244 ssize_t (*read)(struct trace_iterator *iter, 307 ssize_t (*read)(struct trace_iterator *iter,
245 struct file *filp, char __user *ubuf, 308 struct file *filp, char __user *ubuf,
246 size_t cnt, loff_t *ppos); 309 size_t cnt, loff_t *ppos);
247 void (*ctrl_update)(struct trace_array *tr);
248#ifdef CONFIG_FTRACE_STARTUP_TEST 310#ifdef CONFIG_FTRACE_STARTUP_TEST
249 int (*selftest)(struct tracer *trace, 311 int (*selftest)(struct tracer *trace,
250 struct trace_array *tr); 312 struct trace_array *tr);
251#endif 313#endif
252 enum print_line_t (*print_line)(struct trace_iterator *iter); 314 enum print_line_t (*print_line)(struct trace_iterator *iter);
315 /* If you handled the flag setting, return 0 */
316 int (*set_flag)(u32 old_flags, u32 bit, int set);
253 struct tracer *next; 317 struct tracer *next;
254 int print_max; 318 int print_max;
319 struct tracer_flags *flags;
255}; 320};
256 321
257struct trace_seq { 322struct trace_seq {
@@ -279,8 +344,11 @@ struct trace_iterator {
279 unsigned long iter_flags; 344 unsigned long iter_flags;
280 loff_t pos; 345 loff_t pos;
281 long idx; 346 long idx;
347
348 cpumask_t started;
282}; 349};
283 350
351int tracing_is_enabled(void);
284void trace_wake_up(void); 352void trace_wake_up(void);
285void tracing_reset(struct trace_array *tr, int cpu); 353void tracing_reset(struct trace_array *tr, int cpu);
286int tracing_open_generic(struct inode *inode, struct file *filp); 354int tracing_open_generic(struct inode *inode, struct file *filp);
@@ -320,9 +388,14 @@ void trace_function(struct trace_array *tr,
320 unsigned long ip, 388 unsigned long ip,
321 unsigned long parent_ip, 389 unsigned long parent_ip,
322 unsigned long flags, int pc); 390 unsigned long flags, int pc);
391void
392trace_function_return(struct ftrace_retfunc *trace);
323 393
324void tracing_start_cmdline_record(void); 394void tracing_start_cmdline_record(void);
325void tracing_stop_cmdline_record(void); 395void tracing_stop_cmdline_record(void);
396void tracing_sched_switch_assign_trace(struct trace_array *tr);
397void tracing_stop_sched_switch_record(void);
398void tracing_start_sched_switch_record(void);
326int register_tracer(struct tracer *type); 399int register_tracer(struct tracer *type);
327void unregister_tracer(struct tracer *type); 400void unregister_tracer(struct tracer *type);
328 401
@@ -383,12 +456,18 @@ extern int trace_selftest_startup_sched_switch(struct tracer *trace,
383 struct trace_array *tr); 456 struct trace_array *tr);
384extern int trace_selftest_startup_sysprof(struct tracer *trace, 457extern int trace_selftest_startup_sysprof(struct tracer *trace,
385 struct trace_array *tr); 458 struct trace_array *tr);
459extern int trace_selftest_startup_branch(struct tracer *trace,
460 struct trace_array *tr);
386#endif /* CONFIG_FTRACE_STARTUP_TEST */ 461#endif /* CONFIG_FTRACE_STARTUP_TEST */
387 462
388extern void *head_page(struct trace_array_cpu *data); 463extern void *head_page(struct trace_array_cpu *data);
389extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...); 464extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...);
390extern void trace_seq_print_cont(struct trace_seq *s, 465extern void trace_seq_print_cont(struct trace_seq *s,
391 struct trace_iterator *iter); 466 struct trace_iterator *iter);
467
468extern int
469seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
470 unsigned long sym_flags);
392extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, 471extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
393 size_t cnt); 472 size_t cnt);
394extern long ns2usecs(cycle_t nsec); 473extern long ns2usecs(cycle_t nsec);
@@ -396,6 +475,17 @@ extern int trace_vprintk(unsigned long ip, const char *fmt, va_list args);
396 475
397extern unsigned long trace_flags; 476extern unsigned long trace_flags;
398 477
478/* Standard output formatting function used for function return traces */
479#ifdef CONFIG_FUNCTION_RET_TRACER
480extern enum print_line_t print_return_function(struct trace_iterator *iter);
481#else
482static inline enum print_line_t
483print_return_function(struct trace_iterator *iter)
484{
485 return TRACE_TYPE_UNHANDLED;
486}
487#endif
488
399/* 489/*
400 * trace_iterator_flags is an enumeration that defines bit 490 * trace_iterator_flags is an enumeration that defines bit
401 * positions into trace_flags that controls the output. 491 * positions into trace_flags that controls the output.
@@ -415,8 +505,92 @@ enum trace_iterator_flags {
415 TRACE_ITER_STACKTRACE = 0x100, 505 TRACE_ITER_STACKTRACE = 0x100,
416 TRACE_ITER_SCHED_TREE = 0x200, 506 TRACE_ITER_SCHED_TREE = 0x200,
417 TRACE_ITER_PRINTK = 0x400, 507 TRACE_ITER_PRINTK = 0x400,
508 TRACE_ITER_PREEMPTONLY = 0x800,
509 TRACE_ITER_BRANCH = 0x1000,
510 TRACE_ITER_ANNOTATE = 0x2000,
511 TRACE_ITER_USERSTACKTRACE = 0x4000,
512 TRACE_ITER_SYM_USEROBJ = 0x8000
418}; 513};
419 514
515/*
516 * TRACE_ITER_SYM_MASK masks the options in trace_flags that
517 * control the output of kernel symbols.
518 */
519#define TRACE_ITER_SYM_MASK \
520 (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR)
521
420extern struct tracer nop_trace; 522extern struct tracer nop_trace;
421 523
524/**
525 * ftrace_preempt_disable - disable preemption scheduler safe
526 *
 527 * When tracing can happen inside the scheduler, there exist
 528 * cases where the tracing might happen before the need_resched
529 * flag is checked. If this happens and the tracer calls
530 * preempt_enable (after a disable), a schedule might take place
531 * causing an infinite recursion.
532 *
 533 * To prevent this, we read the need_resched flag before
534 * disabling preemption. When we want to enable preemption we
 535 * check the flag; if it is set, then we call preempt_enable_no_resched.
536 * Otherwise, we call preempt_enable.
537 *
 538 * The rationale for doing the above is that if need_resched is set
539 * and we have yet to reschedule, we are either in an atomic location
540 * (where we do not need to check for scheduling) or we are inside
541 * the scheduler and do not want to resched.
542 */
543static inline int ftrace_preempt_disable(void)
544{
545 int resched;
546
547 resched = need_resched();
548 preempt_disable_notrace();
549
550 return resched;
551}
552
553/**
554 * ftrace_preempt_enable - enable preemption scheduler safe
555 * @resched: the return value from ftrace_preempt_disable
556 *
557 * This is a scheduler safe way to enable preemption and not miss
 558 * any preemption checks. The earlier disable saved the need_resched state.
559 * If resched is set, then we were either inside an atomic or
560 * are inside the scheduler (we would have already scheduled
561 * otherwise). In this case, we do not want to call normal
562 * preempt_enable, but preempt_enable_no_resched instead.
563 */
564static inline void ftrace_preempt_enable(int resched)
565{
566 if (resched)
567 preempt_enable_no_resched_notrace();
568 else
569 preempt_enable_notrace();
570}
571
572#ifdef CONFIG_BRANCH_TRACER
573extern int enable_branch_tracing(struct trace_array *tr);
574extern void disable_branch_tracing(void);
575static inline int trace_branch_enable(struct trace_array *tr)
576{
577 if (trace_flags & TRACE_ITER_BRANCH)
578 return enable_branch_tracing(tr);
579 return 0;
580}
581static inline void trace_branch_disable(void)
582{
583 /* due to races, always disable */
584 disable_branch_tracing();
585}
586#else
587static inline int trace_branch_enable(struct trace_array *tr)
588{
589 return 0;
590}
591static inline void trace_branch_disable(void)
592{
593}
594#endif /* CONFIG_BRANCH_TRACER */
595
422#endif /* _LINUX_KERNEL_TRACE_H */ 596#endif /* _LINUX_KERNEL_TRACE_H */
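For orientation, here is a minimal sketch of a tracer written against the interface declared above, where init() now returns an int, start()/stop() replace the removed ctrl_update() hook, and per-tracer options hang off tracer_flags and set_flag(). It is not part of the patch; every my_* name is hypothetical.

#include <linux/init.h>
#include <linux/module.h>
#include "trace.h"

/* hypothetical bit for a single boolean option of this tracer */
#define MY_OPT_VERBOSE		0x1

static struct tracer_opt my_opts[] = {
	{ TRACER_OPT(my_verbose, MY_OPT_VERBOSE) },
	{ }	/* terminating empty entry */
};

static struct tracer_flags my_tracer_flags = {
	.val	= 0,		/* all options off by default */
	.opts	= my_opts,
};

static int my_tracer_init(struct trace_array *tr)
{
	int cpu;

	for_each_online_cpu(cpu)
		tracing_reset(tr, cpu);
	return 0;		/* non-zero reports a failed init */
}

static void my_tracer_reset(struct trace_array *tr)
{
}

static int my_tracer_set_flag(u32 old_flags, u32 bit, int set)
{
	return 0;		/* accept every option change */
}

static struct tracer my_tracer __read_mostly = {
	.name		= "my_tracer",
	.init		= my_tracer_init,
	.reset		= my_tracer_reset,
	.set_flag	= my_tracer_set_flag,
	.flags		= &my_tracer_flags,
};

static __init int my_tracer_register(void)
{
	return register_tracer(&my_tracer);
}
device_initcall(my_tracer_register);

The nop tracer patch further down exercises exactly this flags/set_flag pairing, including a deliberately refused option.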
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index d0a5e50eeff2..a4fa2c57e34e 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -13,73 +13,117 @@
13#include "trace.h" 13#include "trace.h"
14 14
15static struct trace_array *boot_trace; 15static struct trace_array *boot_trace;
16static int trace_boot_enabled; 16static bool pre_initcalls_finished;
17 17
18 18/* Tells the boot tracer that the pre_smp_initcalls are finished.
19/* Should be started after do_pre_smp_initcalls() in init/main.c */ 19 * So we are ready to record initcalls.
 20 * It doesn't enable sched-events tracing, however.
21 * You have to call enable_boot_trace to do so.
22 */
20void start_boot_trace(void) 23void start_boot_trace(void)
21{ 24{
22 trace_boot_enabled = 1; 25 pre_initcalls_finished = true;
23} 26}
24 27
25void stop_boot_trace(void) 28void enable_boot_trace(void)
26{ 29{
27 trace_boot_enabled = 0; 30 if (pre_initcalls_finished)
31 tracing_start_sched_switch_record();
28} 32}
29 33
30void reset_boot_trace(struct trace_array *tr) 34void disable_boot_trace(void)
31{ 35{
32 stop_boot_trace(); 36 if (pre_initcalls_finished)
37 tracing_stop_sched_switch_record();
33} 38}
34 39
35static void boot_trace_init(struct trace_array *tr) 40static void reset_boot_trace(struct trace_array *tr)
36{ 41{
37 int cpu; 42 int cpu;
38 boot_trace = tr;
39 43
40 trace_boot_enabled = 0; 44 tr->time_start = ftrace_now(tr->cpu);
45
46 for_each_online_cpu(cpu)
47 tracing_reset(tr, cpu);
48}
49
50static int boot_trace_init(struct trace_array *tr)
51{
52 int cpu;
53 boot_trace = tr;
41 54
42 for_each_cpu_mask(cpu, cpu_possible_map) 55 for_each_cpu_mask(cpu, cpu_possible_map)
43 tracing_reset(tr, cpu); 56 tracing_reset(tr, cpu);
57
58 tracing_sched_switch_assign_trace(tr);
59 return 0;
44} 60}
45 61
46static void boot_trace_ctrl_update(struct trace_array *tr) 62static enum print_line_t
63initcall_call_print_line(struct trace_iterator *iter)
47{ 64{
48 if (tr->ctrl) 65 struct trace_entry *entry = iter->ent;
49 start_boot_trace(); 66 struct trace_seq *s = &iter->seq;
67 struct trace_boot_call *field;
68 struct boot_trace_call *call;
69 u64 ts;
70 unsigned long nsec_rem;
71 int ret;
72
73 trace_assign_type(field, entry);
74 call = &field->boot_call;
75 ts = iter->ts;
76 nsec_rem = do_div(ts, 1000000000);
77
78 ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n",
79 (unsigned long)ts, nsec_rem, call->func, call->caller);
80
81 if (!ret)
82 return TRACE_TYPE_PARTIAL_LINE;
50 else 83 else
51 stop_boot_trace(); 84 return TRACE_TYPE_HANDLED;
52} 85}
53 86
54static enum print_line_t initcall_print_line(struct trace_iterator *iter) 87static enum print_line_t
88initcall_ret_print_line(struct trace_iterator *iter)
55{ 89{
56 int ret;
57 struct trace_entry *entry = iter->ent; 90 struct trace_entry *entry = iter->ent;
58 struct trace_boot *field = (struct trace_boot *)entry;
59 struct boot_trace *it = &field->initcall;
60 struct trace_seq *s = &iter->seq; 91 struct trace_seq *s = &iter->seq;
61 struct timespec calltime = ktime_to_timespec(it->calltime); 92 struct trace_boot_ret *field;
62 struct timespec rettime = ktime_to_timespec(it->rettime); 93 struct boot_trace_ret *init_ret;
63 94 u64 ts;
64 if (entry->type == TRACE_BOOT) { 95 unsigned long nsec_rem;
65 ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n", 96 int ret;
66 calltime.tv_sec, 97
67 calltime.tv_nsec, 98 trace_assign_type(field, entry);
68 it->func, it->caller); 99 init_ret = &field->boot_ret;
69 if (!ret) 100 ts = iter->ts;
70 return TRACE_TYPE_PARTIAL_LINE; 101 nsec_rem = do_div(ts, 1000000000);
71 102
72 ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s " 103 ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s "
73 "returned %d after %lld msecs\n", 104 "returned %d after %llu msecs\n",
74 rettime.tv_sec, 105 (unsigned long) ts,
75 rettime.tv_nsec, 106 nsec_rem,
76 it->func, it->result, it->duration); 107 init_ret->func, init_ret->result, init_ret->duration);
77 108
78 if (!ret) 109 if (!ret)
79 return TRACE_TYPE_PARTIAL_LINE; 110 return TRACE_TYPE_PARTIAL_LINE;
111 else
80 return TRACE_TYPE_HANDLED; 112 return TRACE_TYPE_HANDLED;
113}
114
115static enum print_line_t initcall_print_line(struct trace_iterator *iter)
116{
117 struct trace_entry *entry = iter->ent;
118
119 switch (entry->type) {
120 case TRACE_BOOT_CALL:
121 return initcall_call_print_line(iter);
122 case TRACE_BOOT_RET:
123 return initcall_ret_print_line(iter);
124 default:
125 return TRACE_TYPE_UNHANDLED;
81 } 126 }
82 return TRACE_TYPE_UNHANDLED;
83} 127}
84 128
85struct tracer boot_tracer __read_mostly = 129struct tracer boot_tracer __read_mostly =
@@ -87,27 +131,53 @@ struct tracer boot_tracer __read_mostly =
87 .name = "initcall", 131 .name = "initcall",
88 .init = boot_trace_init, 132 .init = boot_trace_init,
89 .reset = reset_boot_trace, 133 .reset = reset_boot_trace,
90 .ctrl_update = boot_trace_ctrl_update,
91 .print_line = initcall_print_line, 134 .print_line = initcall_print_line,
92}; 135};
93 136
94void trace_boot(struct boot_trace *it, initcall_t fn) 137void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
95{ 138{
96 struct ring_buffer_event *event; 139 struct ring_buffer_event *event;
97 struct trace_boot *entry; 140 struct trace_boot_call *entry;
98 struct trace_array_cpu *data;
99 unsigned long irq_flags; 141 unsigned long irq_flags;
100 struct trace_array *tr = boot_trace; 142 struct trace_array *tr = boot_trace;
101 143
102 if (!trace_boot_enabled) 144 if (!pre_initcalls_finished)
103 return; 145 return;
104 146
105 /* Get its name now since this function could 147 /* Get its name now since this function could
106 * disappear because it is in the .init section. 148 * disappear because it is in the .init section.
107 */ 149 */
108 sprint_symbol(it->func, (unsigned long)fn); 150 sprint_symbol(bt->func, (unsigned long)fn);
151 preempt_disable();
152
153 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
154 &irq_flags);
155 if (!event)
156 goto out;
157 entry = ring_buffer_event_data(event);
158 tracing_generic_entry_update(&entry->ent, 0, 0);
159 entry->ent.type = TRACE_BOOT_CALL;
160 entry->boot_call = *bt;
161 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
162
163 trace_wake_up();
164
165 out:
166 preempt_enable();
167}
168
169void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
170{
171 struct ring_buffer_event *event;
172 struct trace_boot_ret *entry;
173 unsigned long irq_flags;
174 struct trace_array *tr = boot_trace;
175
176 if (!pre_initcalls_finished)
177 return;
178
179 sprint_symbol(bt->func, (unsigned long)fn);
109 preempt_disable(); 180 preempt_disable();
110 data = tr->data[smp_processor_id()];
111 181
112 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), 182 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
113 &irq_flags); 183 &irq_flags);
@@ -115,8 +185,8 @@ void trace_boot(struct boot_trace *it, initcall_t fn)
115 goto out; 185 goto out;
116 entry = ring_buffer_event_data(event); 186 entry = ring_buffer_event_data(event);
117 tracing_generic_entry_update(&entry->ent, 0, 0); 187 tracing_generic_entry_update(&entry->ent, 0, 0);
118 entry->ent.type = TRACE_BOOT; 188 entry->ent.type = TRACE_BOOT_RET;
119 entry->initcall = *it; 189 entry->boot_ret = *bt;
120 ring_buffer_unlock_commit(tr->buffer, event, irq_flags); 190 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
121 191
122 trace_wake_up(); 192 trace_wake_up();
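A side note on the two print routines above: they split the raw nanosecond timestamp with do_div(), which divides the u64 in place and hands back the remainder. A small sketch of that split, with an invented timestamp:

#include <linux/types.h>
#include <asm/div64.h>

static void example_boot_timestamp_split(void)
{
	u64 ts = 1234567890123ULL;	/* hypothetical iter->ts, in nanoseconds */
	unsigned long nsec_rem;

	nsec_rem = do_div(ts, 1000000000);
	/* ts is now 1234 (seconds) and nsec_rem is 567890123, so the
	 * prefix prints as "[ 1234.567890123]". */
}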
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
new file mode 100644
index 000000000000..877ee88e6a74
--- /dev/null
+++ b/kernel/trace/trace_branch.c
@@ -0,0 +1,341 @@
1/*
2 * unlikely profiler
3 *
4 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
5 */
6#include <linux/kallsyms.h>
7#include <linux/seq_file.h>
8#include <linux/spinlock.h>
9#include <linux/debugfs.h>
10#include <linux/uaccess.h>
11#include <linux/module.h>
12#include <linux/ftrace.h>
13#include <linux/hash.h>
14#include <linux/fs.h>
15#include <asm/local.h>
16#include "trace.h"
17
18#ifdef CONFIG_BRANCH_TRACER
19
20static int branch_tracing_enabled __read_mostly;
21static DEFINE_MUTEX(branch_tracing_mutex);
22static struct trace_array *branch_tracer;
23
24static void
25probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
26{
27 struct trace_array *tr = branch_tracer;
28 struct ring_buffer_event *event;
29 struct trace_branch *entry;
30 unsigned long flags, irq_flags;
31 int cpu, pc;
32 const char *p;
33
34 /*
 35 * I would love to save just the ftrace_branch_data pointer, but
36 * this code can also be used by modules. Ugly things can happen
37 * if the module is unloaded, and then we go and read the
38 * pointer. This is slower, but much safer.
39 */
40
41 if (unlikely(!tr))
42 return;
43
44 raw_local_irq_save(flags);
45 cpu = raw_smp_processor_id();
46 if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
47 goto out;
48
49 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
50 &irq_flags);
51 if (!event)
52 goto out;
53
54 pc = preempt_count();
55 entry = ring_buffer_event_data(event);
56 tracing_generic_entry_update(&entry->ent, flags, pc);
57 entry->ent.type = TRACE_BRANCH;
58
59 /* Strip off the path, only save the file */
60 p = f->file + strlen(f->file);
61 while (p >= f->file && *p != '/')
62 p--;
63 p++;
64
65 strncpy(entry->func, f->func, TRACE_FUNC_SIZE);
66 strncpy(entry->file, p, TRACE_FILE_SIZE);
67 entry->func[TRACE_FUNC_SIZE] = 0;
68 entry->file[TRACE_FILE_SIZE] = 0;
69 entry->line = f->line;
70 entry->correct = val == expect;
71
72 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
73
74 out:
75 atomic_dec(&tr->data[cpu]->disabled);
76 raw_local_irq_restore(flags);
77}
78
79static inline
80void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
81{
82 if (!branch_tracing_enabled)
83 return;
84
85 probe_likely_condition(f, val, expect);
86}
87
88int enable_branch_tracing(struct trace_array *tr)
89{
90 int ret = 0;
91
92 mutex_lock(&branch_tracing_mutex);
93 branch_tracer = tr;
94 /*
 95 * Must be seen before enabling. The reader is a conditional
 96 * check, so we do not need a matching rmb().
97 */
98 smp_wmb();
99 branch_tracing_enabled++;
100 mutex_unlock(&branch_tracing_mutex);
101
102 return ret;
103}
104
105void disable_branch_tracing(void)
106{
107 mutex_lock(&branch_tracing_mutex);
108
109 if (!branch_tracing_enabled)
110 goto out_unlock;
111
112 branch_tracing_enabled--;
113
114 out_unlock:
115 mutex_unlock(&branch_tracing_mutex);
116}
117
118static void start_branch_trace(struct trace_array *tr)
119{
120 enable_branch_tracing(tr);
121}
122
123static void stop_branch_trace(struct trace_array *tr)
124{
125 disable_branch_tracing();
126}
127
128static int branch_trace_init(struct trace_array *tr)
129{
130 int cpu;
131
132 for_each_online_cpu(cpu)
133 tracing_reset(tr, cpu);
134
135 start_branch_trace(tr);
136 return 0;
137}
138
139static void branch_trace_reset(struct trace_array *tr)
140{
141 stop_branch_trace(tr);
142}
143
144struct tracer branch_trace __read_mostly =
145{
146 .name = "branch",
147 .init = branch_trace_init,
148 .reset = branch_trace_reset,
149#ifdef CONFIG_FTRACE_SELFTEST
150 .selftest = trace_selftest_startup_branch,
151#endif
152};
153
154__init static int init_branch_trace(void)
155{
156 return register_tracer(&branch_trace);
157}
158
159device_initcall(init_branch_trace);
160#else
161static inline
162void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
163{
164}
165#endif /* CONFIG_BRANCH_TRACER */
166
167void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect)
168{
169 /*
170 * I would love to have a trace point here instead, but the
171 * trace point code is so inundated with unlikely and likely
172 * conditions that the recursive nightmare that exists is too
173 * much to try to get working. At least for now.
174 */
175 trace_likely_condition(f, val, expect);
176
177 /* FIXME: Make this atomic! */
178 if (val == expect)
179 f->correct++;
180 else
181 f->incorrect++;
182}
183EXPORT_SYMBOL(ftrace_likely_update);
184
185struct ftrace_pointer {
186 void *start;
187 void *stop;
188 int hit;
189};
190
191static void *
192t_next(struct seq_file *m, void *v, loff_t *pos)
193{
194 const struct ftrace_pointer *f = m->private;
195 struct ftrace_branch_data *p = v;
196
197 (*pos)++;
198
199 if (v == (void *)1)
200 return f->start;
201
202 ++p;
203
204 if ((void *)p >= (void *)f->stop)
205 return NULL;
206
207 return p;
208}
209
210static void *t_start(struct seq_file *m, loff_t *pos)
211{
212 void *t = (void *)1;
213 loff_t l = 0;
214
215 for (; t && l < *pos; t = t_next(m, t, &l))
216 ;
217
218 return t;
219}
220
221static void t_stop(struct seq_file *m, void *p)
222{
223}
224
225static int t_show(struct seq_file *m, void *v)
226{
227 const struct ftrace_pointer *fp = m->private;
228 struct ftrace_branch_data *p = v;
229 const char *f;
230 long percent;
231
232 if (v == (void *)1) {
233 if (fp->hit)
234 seq_printf(m, " miss hit %% ");
235 else
236 seq_printf(m, " correct incorrect %% ");
237 seq_printf(m, " Function "
238 " File Line\n"
239 " ------- --------- - "
240 " -------- "
241 " ---- ----\n");
242 return 0;
243 }
244
245 /* Only print the file, not the path */
246 f = p->file + strlen(p->file);
247 while (f >= p->file && *f != '/')
248 f--;
249 f++;
250
251 /*
 252 * The miss is overlaid on correct, and hit on incorrect.
253 */
254 if (p->correct) {
255 percent = p->incorrect * 100;
256 percent /= p->correct + p->incorrect;
257 } else
258 percent = p->incorrect ? 100 : -1;
259
260 seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect);
261 if (percent < 0)
262 seq_printf(m, " X ");
263 else
264 seq_printf(m, "%3ld ", percent);
265 seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line);
266 return 0;
267}
268
269static struct seq_operations tracing_likely_seq_ops = {
270 .start = t_start,
271 .next = t_next,
272 .stop = t_stop,
273 .show = t_show,
274};
275
276static int tracing_branch_open(struct inode *inode, struct file *file)
277{
278 int ret;
279
280 ret = seq_open(file, &tracing_likely_seq_ops);
281 if (!ret) {
282 struct seq_file *m = file->private_data;
283 m->private = (void *)inode->i_private;
284 }
285
286 return ret;
287}
288
289static const struct file_operations tracing_branch_fops = {
290 .open = tracing_branch_open,
291 .read = seq_read,
292 .llseek = seq_lseek,
293};
294
295#ifdef CONFIG_PROFILE_ALL_BRANCHES
296extern unsigned long __start_branch_profile[];
297extern unsigned long __stop_branch_profile[];
298
299static const struct ftrace_pointer ftrace_branch_pos = {
300 .start = __start_branch_profile,
301 .stop = __stop_branch_profile,
302 .hit = 1,
303};
304
305#endif /* CONFIG_PROFILE_ALL_BRANCHES */
306
307extern unsigned long __start_annotated_branch_profile[];
308extern unsigned long __stop_annotated_branch_profile[];
309
310static const struct ftrace_pointer ftrace_annotated_branch_pos = {
311 .start = __start_annotated_branch_profile,
312 .stop = __stop_annotated_branch_profile,
313};
314
315static __init int ftrace_branch_init(void)
316{
317 struct dentry *d_tracer;
318 struct dentry *entry;
319
320 d_tracer = tracing_init_dentry();
321
322 entry = debugfs_create_file("profile_annotated_branch", 0444, d_tracer,
323 (void *)&ftrace_annotated_branch_pos,
324 &tracing_branch_fops);
325 if (!entry)
326 pr_warning("Could not create debugfs "
327 "'profile_annotatet_branch' entry\n");
328
329#ifdef CONFIG_PROFILE_ALL_BRANCHES
330 entry = debugfs_create_file("profile_branch", 0444, d_tracer,
331 (void *)&ftrace_branch_pos,
332 &tracing_branch_fops);
333 if (!entry)
334 pr_warning("Could not create debugfs"
335 " 'profile_branch' entry\n");
336#endif
337
338 return 0;
339}
340
341device_initcall(ftrace_branch_init);
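For reference, t_show() above renders one line per profiled branch in the debugfs files this patch creates ('profile_annotated_branch', plus 'profile_branch' under CONFIG_PROFILE_ALL_BRANCHES). With invented counters, say correct = 3 and incorrect = 1, the percentage is 1 * 100 / (3 + 1) = 25 and the line comes out roughly as:

 correct incorrect  %        Function                  File              Line
 ------- ---------  -        --------                  ----              ----
       3        1  25 my_driver_probe                my_driver.c          42

A branch whose counters are both zero prints an 'X' in the percentage column.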
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 0f85a64003d3..e74f6d0a3216 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -42,24 +42,20 @@ static void stop_function_trace(struct trace_array *tr)
42 tracing_stop_cmdline_record(); 42 tracing_stop_cmdline_record();
43} 43}
44 44
45static void function_trace_init(struct trace_array *tr) 45static int function_trace_init(struct trace_array *tr)
46{ 46{
47 if (tr->ctrl) 47 start_function_trace(tr);
48 start_function_trace(tr); 48 return 0;
49} 49}
50 50
51static void function_trace_reset(struct trace_array *tr) 51static void function_trace_reset(struct trace_array *tr)
52{ 52{
53 if (tr->ctrl) 53 stop_function_trace(tr);
54 stop_function_trace(tr);
55} 54}
56 55
57static void function_trace_ctrl_update(struct trace_array *tr) 56static void function_trace_start(struct trace_array *tr)
58{ 57{
59 if (tr->ctrl) 58 function_reset(tr);
60 start_function_trace(tr);
61 else
62 stop_function_trace(tr);
63} 59}
64 60
65static struct tracer function_trace __read_mostly = 61static struct tracer function_trace __read_mostly =
@@ -67,7 +63,7 @@ static struct tracer function_trace __read_mostly =
67 .name = "function", 63 .name = "function",
68 .init = function_trace_init, 64 .init = function_trace_init,
69 .reset = function_trace_reset, 65 .reset = function_trace_reset,
70 .ctrl_update = function_trace_ctrl_update, 66 .start = function_trace_start,
71#ifdef CONFIG_FTRACE_SELFTEST 67#ifdef CONFIG_FTRACE_SELFTEST
72 .selftest = trace_selftest_startup_function, 68 .selftest = trace_selftest_startup_function,
73#endif 69#endif
diff --git a/kernel/trace/trace_functions_return.c b/kernel/trace/trace_functions_return.c
new file mode 100644
index 000000000000..e00d64509c9c
--- /dev/null
+++ b/kernel/trace/trace_functions_return.c
@@ -0,0 +1,98 @@
1/*
2 *
3 * Function return tracer.
4 * Copyright (c) 2008 Frederic Weisbecker <fweisbec@gmail.com>
5 * Mostly borrowed from function tracer which
6 * is Copyright (c) Steven Rostedt <srostedt@redhat.com>
7 *
8 */
9#include <linux/debugfs.h>
10#include <linux/uaccess.h>
11#include <linux/ftrace.h>
12#include <linux/fs.h>
13
14#include "trace.h"
15
16
17#define TRACE_RETURN_PRINT_OVERRUN 0x1
18static struct tracer_opt trace_opts[] = {
19 /* Display overruns or not */
20 { TRACER_OPT(overrun, TRACE_RETURN_PRINT_OVERRUN) },
21 { } /* Empty entry */
22};
23
24static struct tracer_flags tracer_flags = {
25 .val = 0, /* Don't display overruns by default */
26 .opts = trace_opts
27};
28
29
30static int return_trace_init(struct trace_array *tr)
31{
32 int cpu;
33 for_each_online_cpu(cpu)
34 tracing_reset(tr, cpu);
35
36 return register_ftrace_return(&trace_function_return);
37}
38
39static void return_trace_reset(struct trace_array *tr)
40{
41 unregister_ftrace_return();
42}
43
44
45enum print_line_t
46print_return_function(struct trace_iterator *iter)
47{
48 struct trace_seq *s = &iter->seq;
49 struct trace_entry *entry = iter->ent;
50 struct ftrace_ret_entry *field;
51 int ret;
52
53 if (entry->type == TRACE_FN_RET) {
54 trace_assign_type(field, entry);
55 ret = trace_seq_printf(s, "%pF -> ", (void *)field->parent_ip);
56 if (!ret)
57 return TRACE_TYPE_PARTIAL_LINE;
58
59 ret = seq_print_ip_sym(s, field->ip,
60 trace_flags & TRACE_ITER_SYM_MASK);
61 if (!ret)
62 return TRACE_TYPE_PARTIAL_LINE;
63
64 ret = trace_seq_printf(s, " (%llu ns)",
65 field->rettime - field->calltime);
66 if (!ret)
67 return TRACE_TYPE_PARTIAL_LINE;
68
69 if (tracer_flags.val & TRACE_RETURN_PRINT_OVERRUN) {
70 ret = trace_seq_printf(s, " (Overruns: %lu)",
71 field->overrun);
72 if (!ret)
73 return TRACE_TYPE_PARTIAL_LINE;
74 }
75
76 ret = trace_seq_printf(s, "\n");
77 if (!ret)
78 return TRACE_TYPE_PARTIAL_LINE;
79
80 return TRACE_TYPE_HANDLED;
81 }
82 return TRACE_TYPE_UNHANDLED;
83}
84
85static struct tracer return_trace __read_mostly = {
86 .name = "return",
87 .init = return_trace_init,
88 .reset = return_trace_reset,
89 .print_line = print_return_function,
90 .flags = &tracer_flags,
91};
92
93static __init int init_return_trace(void)
94{
95 return register_tracer(&return_trace);
96}
97
98device_initcall(init_return_trace);
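For reference, print_return_function() above emits one line per function return in the form 'caller -> function (duration)', where the duration is rettime - calltime in nanoseconds, and '(Overruns: n)' is appended only when the tracer's 'overrun' option is set. A hypothetical line with that option enabled, symbols and timing invented, would look roughly like:

my_caller+0x1f/0x80 -> my_traced_func (1429 ns) (Overruns: 0)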
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 9c74071c10e0..7c2e326bbc8b 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -353,15 +353,28 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)
353} 353}
354#endif /* CONFIG_PREEMPT_TRACER */ 354#endif /* CONFIG_PREEMPT_TRACER */
355 355
356/*
357 * save_tracer_enabled is used to save the state of the tracer_enabled
358 * variable when we disable it when we open a trace output file.
359 */
360static int save_tracer_enabled;
361
356static void start_irqsoff_tracer(struct trace_array *tr) 362static void start_irqsoff_tracer(struct trace_array *tr)
357{ 363{
358 register_ftrace_function(&trace_ops); 364 register_ftrace_function(&trace_ops);
359 tracer_enabled = 1; 365 if (tracing_is_enabled()) {
366 tracer_enabled = 1;
367 save_tracer_enabled = 1;
368 } else {
369 tracer_enabled = 0;
370 save_tracer_enabled = 0;
371 }
360} 372}
361 373
362static void stop_irqsoff_tracer(struct trace_array *tr) 374static void stop_irqsoff_tracer(struct trace_array *tr)
363{ 375{
364 tracer_enabled = 0; 376 tracer_enabled = 0;
377 save_tracer_enabled = 0;
365 unregister_ftrace_function(&trace_ops); 378 unregister_ftrace_function(&trace_ops);
366} 379}
367 380
@@ -370,53 +383,55 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
370 irqsoff_trace = tr; 383 irqsoff_trace = tr;
371 /* make sure that the tracer is visible */ 384 /* make sure that the tracer is visible */
372 smp_wmb(); 385 smp_wmb();
373 386 start_irqsoff_tracer(tr);
374 if (tr->ctrl)
375 start_irqsoff_tracer(tr);
376} 387}
377 388
378static void irqsoff_tracer_reset(struct trace_array *tr) 389static void irqsoff_tracer_reset(struct trace_array *tr)
379{ 390{
380 if (tr->ctrl) 391 stop_irqsoff_tracer(tr);
381 stop_irqsoff_tracer(tr);
382} 392}
383 393
384static void irqsoff_tracer_ctrl_update(struct trace_array *tr) 394static void irqsoff_tracer_start(struct trace_array *tr)
385{ 395{
386 if (tr->ctrl) 396 tracer_enabled = 1;
387 start_irqsoff_tracer(tr); 397 save_tracer_enabled = 1;
388 else 398}
389 stop_irqsoff_tracer(tr); 399
400static void irqsoff_tracer_stop(struct trace_array *tr)
401{
402 tracer_enabled = 0;
403 save_tracer_enabled = 0;
390} 404}
391 405
392static void irqsoff_tracer_open(struct trace_iterator *iter) 406static void irqsoff_tracer_open(struct trace_iterator *iter)
393{ 407{
394 /* stop the trace while dumping */ 408 /* stop the trace while dumping */
395 if (iter->tr->ctrl) 409 tracer_enabled = 0;
396 stop_irqsoff_tracer(iter->tr);
397} 410}
398 411
399static void irqsoff_tracer_close(struct trace_iterator *iter) 412static void irqsoff_tracer_close(struct trace_iterator *iter)
400{ 413{
401 if (iter->tr->ctrl) 414 /* restart tracing */
402 start_irqsoff_tracer(iter->tr); 415 tracer_enabled = save_tracer_enabled;
403} 416}
404 417
405#ifdef CONFIG_IRQSOFF_TRACER 418#ifdef CONFIG_IRQSOFF_TRACER
406static void irqsoff_tracer_init(struct trace_array *tr) 419static int irqsoff_tracer_init(struct trace_array *tr)
407{ 420{
408 trace_type = TRACER_IRQS_OFF; 421 trace_type = TRACER_IRQS_OFF;
409 422
410 __irqsoff_tracer_init(tr); 423 __irqsoff_tracer_init(tr);
424 return 0;
411} 425}
412static struct tracer irqsoff_tracer __read_mostly = 426static struct tracer irqsoff_tracer __read_mostly =
413{ 427{
414 .name = "irqsoff", 428 .name = "irqsoff",
415 .init = irqsoff_tracer_init, 429 .init = irqsoff_tracer_init,
416 .reset = irqsoff_tracer_reset, 430 .reset = irqsoff_tracer_reset,
431 .start = irqsoff_tracer_start,
432 .stop = irqsoff_tracer_stop,
417 .open = irqsoff_tracer_open, 433 .open = irqsoff_tracer_open,
418 .close = irqsoff_tracer_close, 434 .close = irqsoff_tracer_close,
419 .ctrl_update = irqsoff_tracer_ctrl_update,
420 .print_max = 1, 435 .print_max = 1,
421#ifdef CONFIG_FTRACE_SELFTEST 436#ifdef CONFIG_FTRACE_SELFTEST
422 .selftest = trace_selftest_startup_irqsoff, 437 .selftest = trace_selftest_startup_irqsoff,
@@ -428,11 +443,12 @@ static struct tracer irqsoff_tracer __read_mostly =
428#endif 443#endif
429 444
430#ifdef CONFIG_PREEMPT_TRACER 445#ifdef CONFIG_PREEMPT_TRACER
431static void preemptoff_tracer_init(struct trace_array *tr) 446static int preemptoff_tracer_init(struct trace_array *tr)
432{ 447{
433 trace_type = TRACER_PREEMPT_OFF; 448 trace_type = TRACER_PREEMPT_OFF;
434 449
435 __irqsoff_tracer_init(tr); 450 __irqsoff_tracer_init(tr);
451 return 0;
436} 452}
437 453
438static struct tracer preemptoff_tracer __read_mostly = 454static struct tracer preemptoff_tracer __read_mostly =
@@ -440,9 +456,10 @@ static struct tracer preemptoff_tracer __read_mostly =
440 .name = "preemptoff", 456 .name = "preemptoff",
441 .init = preemptoff_tracer_init, 457 .init = preemptoff_tracer_init,
442 .reset = irqsoff_tracer_reset, 458 .reset = irqsoff_tracer_reset,
459 .start = irqsoff_tracer_start,
460 .stop = irqsoff_tracer_stop,
443 .open = irqsoff_tracer_open, 461 .open = irqsoff_tracer_open,
444 .close = irqsoff_tracer_close, 462 .close = irqsoff_tracer_close,
445 .ctrl_update = irqsoff_tracer_ctrl_update,
446 .print_max = 1, 463 .print_max = 1,
447#ifdef CONFIG_FTRACE_SELFTEST 464#ifdef CONFIG_FTRACE_SELFTEST
448 .selftest = trace_selftest_startup_preemptoff, 465 .selftest = trace_selftest_startup_preemptoff,
@@ -456,11 +473,12 @@ static struct tracer preemptoff_tracer __read_mostly =
456#if defined(CONFIG_IRQSOFF_TRACER) && \ 473#if defined(CONFIG_IRQSOFF_TRACER) && \
457 defined(CONFIG_PREEMPT_TRACER) 474 defined(CONFIG_PREEMPT_TRACER)
458 475
459static void preemptirqsoff_tracer_init(struct trace_array *tr) 476static int preemptirqsoff_tracer_init(struct trace_array *tr)
460{ 477{
461 trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF; 478 trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF;
462 479
463 __irqsoff_tracer_init(tr); 480 __irqsoff_tracer_init(tr);
481 return 0;
464} 482}
465 483
466static struct tracer preemptirqsoff_tracer __read_mostly = 484static struct tracer preemptirqsoff_tracer __read_mostly =
@@ -468,9 +486,10 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
468 .name = "preemptirqsoff", 486 .name = "preemptirqsoff",
469 .init = preemptirqsoff_tracer_init, 487 .init = preemptirqsoff_tracer_init,
470 .reset = irqsoff_tracer_reset, 488 .reset = irqsoff_tracer_reset,
489 .start = irqsoff_tracer_start,
490 .stop = irqsoff_tracer_stop,
471 .open = irqsoff_tracer_open, 491 .open = irqsoff_tracer_open,
472 .close = irqsoff_tracer_close, 492 .close = irqsoff_tracer_close,
473 .ctrl_update = irqsoff_tracer_ctrl_update,
474 .print_max = 1, 493 .print_max = 1,
475#ifdef CONFIG_FTRACE_SELFTEST 494#ifdef CONFIG_FTRACE_SELFTEST
476 .selftest = trace_selftest_startup_preemptirqsoff, 495 .selftest = trace_selftest_startup_preemptirqsoff,
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index e62cbf78eab6..2a98a206acc2 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -32,34 +32,29 @@ static void mmio_reset_data(struct trace_array *tr)
32 tracing_reset(tr, cpu); 32 tracing_reset(tr, cpu);
33} 33}
34 34
35static void mmio_trace_init(struct trace_array *tr) 35static int mmio_trace_init(struct trace_array *tr)
36{ 36{
37 pr_debug("in %s\n", __func__); 37 pr_debug("in %s\n", __func__);
38 mmio_trace_array = tr; 38 mmio_trace_array = tr;
39 if (tr->ctrl) { 39
40 mmio_reset_data(tr); 40 mmio_reset_data(tr);
41 enable_mmiotrace(); 41 enable_mmiotrace();
42 } 42 return 0;
43} 43}
44 44
45static void mmio_trace_reset(struct trace_array *tr) 45static void mmio_trace_reset(struct trace_array *tr)
46{ 46{
47 pr_debug("in %s\n", __func__); 47 pr_debug("in %s\n", __func__);
48 if (tr->ctrl) 48
49 disable_mmiotrace(); 49 disable_mmiotrace();
50 mmio_reset_data(tr); 50 mmio_reset_data(tr);
51 mmio_trace_array = NULL; 51 mmio_trace_array = NULL;
52} 52}
53 53
54static void mmio_trace_ctrl_update(struct trace_array *tr) 54static void mmio_trace_start(struct trace_array *tr)
55{ 55{
56 pr_debug("in %s\n", __func__); 56 pr_debug("in %s\n", __func__);
57 if (tr->ctrl) { 57 mmio_reset_data(tr);
58 mmio_reset_data(tr);
59 enable_mmiotrace();
60 } else {
61 disable_mmiotrace();
62 }
63} 58}
64 59
65static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) 60static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
@@ -296,10 +291,10 @@ static struct tracer mmio_tracer __read_mostly =
296 .name = "mmiotrace", 291 .name = "mmiotrace",
297 .init = mmio_trace_init, 292 .init = mmio_trace_init,
298 .reset = mmio_trace_reset, 293 .reset = mmio_trace_reset,
294 .start = mmio_trace_start,
299 .pipe_open = mmio_pipe_open, 295 .pipe_open = mmio_pipe_open,
300 .close = mmio_close, 296 .close = mmio_close,
301 .read = mmio_read, 297 .read = mmio_read,
302 .ctrl_update = mmio_trace_ctrl_update,
303 .print_line = mmio_print_line, 298 .print_line = mmio_print_line,
304}; 299};
305 300
diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c
index 4592b4862515..b9767acd30ac 100644
--- a/kernel/trace/trace_nop.c
+++ b/kernel/trace/trace_nop.c
@@ -12,6 +12,27 @@
12 12
13#include "trace.h" 13#include "trace.h"
14 14
15/* Our two options */
16enum {
17 TRACE_NOP_OPT_ACCEPT = 0x1,
18 TRACE_NOP_OPT_REFUSE = 0x2
19};
20
21/* Options for the tracer (see trace_options file) */
22static struct tracer_opt nop_opts[] = {
23 /* Option that will be accepted by set_flag callback */
24 { TRACER_OPT(test_nop_accept, TRACE_NOP_OPT_ACCEPT) },
25 /* Option that will be refused by set_flag callback */
26 { TRACER_OPT(test_nop_refuse, TRACE_NOP_OPT_REFUSE) },
27 { } /* Always set a last empty entry */
28};
29
30static struct tracer_flags nop_flags = {
31 /* You can check your flags value here when you want. */
32 .val = 0, /* By default: all flags disabled */
33 .opts = nop_opts
34};
35
15static struct trace_array *ctx_trace; 36static struct trace_array *ctx_trace;
16 37
17static void start_nop_trace(struct trace_array *tr) 38static void start_nop_trace(struct trace_array *tr)
@@ -24,7 +45,7 @@ static void stop_nop_trace(struct trace_array *tr)
24 /* Nothing to do! */ 45 /* Nothing to do! */
25} 46}
26 47
27static void nop_trace_init(struct trace_array *tr) 48static int nop_trace_init(struct trace_array *tr)
28{ 49{
29 int cpu; 50 int cpu;
30 ctx_trace = tr; 51 ctx_trace = tr;
@@ -32,33 +53,53 @@ static void nop_trace_init(struct trace_array *tr)
32 for_each_online_cpu(cpu) 53 for_each_online_cpu(cpu)
33 tracing_reset(tr, cpu); 54 tracing_reset(tr, cpu);
34 55
35 if (tr->ctrl) 56 start_nop_trace(tr);
36 start_nop_trace(tr); 57 return 0;
37} 58}
38 59
39static void nop_trace_reset(struct trace_array *tr) 60static void nop_trace_reset(struct trace_array *tr)
40{ 61{
41 if (tr->ctrl) 62 stop_nop_trace(tr);
42 stop_nop_trace(tr);
43} 63}
44 64
45static void nop_trace_ctrl_update(struct trace_array *tr) 65/* It only serves as a signal handler and a callback to
 66 * accept or refuse the setting of a flag.
67 * If you don't implement it, then the flag setting will be
68 * automatically accepted.
69 */
70static int nop_set_flag(u32 old_flags, u32 bit, int set)
46{ 71{
47 /* When starting a new trace, reset the buffers */ 72 /*
48 if (tr->ctrl) 73 * Note that you don't need to update nop_flags.val yourself.
49 start_nop_trace(tr); 74 * The tracing Api will do it automatically if you return 0
50 else 75 */
51 stop_nop_trace(tr); 76 if (bit == TRACE_NOP_OPT_ACCEPT) {
77 printk(KERN_DEBUG "nop_test_accept flag set to %d: we accept."
78 " Now cat trace_options to see the result\n",
79 set);
80 return 0;
81 }
82
83 if (bit == TRACE_NOP_OPT_REFUSE) {
84 printk(KERN_DEBUG "nop_test_refuse flag set to %d: we refuse."
85 "Now cat trace_options to see the result\n",
86 set);
87 return -EINVAL;
88 }
89
90 return 0;
52} 91}
53 92
93
54struct tracer nop_trace __read_mostly = 94struct tracer nop_trace __read_mostly =
55{ 95{
56 .name = "nop", 96 .name = "nop",
57 .init = nop_trace_init, 97 .init = nop_trace_init,
58 .reset = nop_trace_reset, 98 .reset = nop_trace_reset,
59 .ctrl_update = nop_trace_ctrl_update,
60#ifdef CONFIG_FTRACE_SELFTEST 99#ifdef CONFIG_FTRACE_SELFTEST
61 .selftest = trace_selftest_startup_nop, 100 .selftest = trace_selftest_startup_nop,
62#endif 101#endif
102 .flags = &nop_flags,
103 .set_flag = nop_set_flag
63}; 104};
64 105
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index b8f56beb1a62..863390557b44 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -16,7 +16,8 @@
16 16
17static struct trace_array *ctx_trace; 17static struct trace_array *ctx_trace;
18static int __read_mostly tracer_enabled; 18static int __read_mostly tracer_enabled;
19static atomic_t sched_ref; 19static int sched_ref;
20static DEFINE_MUTEX(sched_register_mutex);
20 21
21static void 22static void
22probe_sched_switch(struct rq *__rq, struct task_struct *prev, 23probe_sched_switch(struct rq *__rq, struct task_struct *prev,
@@ -27,7 +28,7 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,
27 int cpu; 28 int cpu;
28 int pc; 29 int pc;
29 30
30 if (!atomic_read(&sched_ref)) 31 if (!sched_ref)
31 return; 32 return;
32 33
33 tracing_record_cmdline(prev); 34 tracing_record_cmdline(prev);
@@ -123,20 +124,18 @@ static void tracing_sched_unregister(void)
123 124
124static void tracing_start_sched_switch(void) 125static void tracing_start_sched_switch(void)
125{ 126{
126 long ref; 127 mutex_lock(&sched_register_mutex);
127 128 if (!(sched_ref++))
128 ref = atomic_inc_return(&sched_ref);
129 if (ref == 1)
130 tracing_sched_register(); 129 tracing_sched_register();
130 mutex_unlock(&sched_register_mutex);
131} 131}
132 132
133static void tracing_stop_sched_switch(void) 133static void tracing_stop_sched_switch(void)
134{ 134{
135 long ref; 135 mutex_lock(&sched_register_mutex);
136 136 if (!(--sched_ref))
137 ref = atomic_dec_and_test(&sched_ref);
138 if (ref)
139 tracing_sched_unregister(); 137 tracing_sched_unregister();
138 mutex_unlock(&sched_register_mutex);
140} 139}
141 140
142void tracing_start_cmdline_record(void) 141void tracing_start_cmdline_record(void)
@@ -149,40 +148,86 @@ void tracing_stop_cmdline_record(void)
149 tracing_stop_sched_switch(); 148 tracing_stop_sched_switch();
150} 149}
151 150
151/**
152 * tracing_start_sched_switch_record - start tracing context switches
153 *
154 * Turns on context switch tracing for a tracer.
155 */
156void tracing_start_sched_switch_record(void)
157{
158 if (unlikely(!ctx_trace)) {
159 WARN_ON(1);
160 return;
161 }
162
163 tracing_start_sched_switch();
164
165 mutex_lock(&sched_register_mutex);
166 tracer_enabled++;
167 mutex_unlock(&sched_register_mutex);
168}
169
170/**
 171 * tracing_stop_sched_switch_record - stop tracing context switches
172 *
173 * Turns off context switch tracing for a tracer.
174 */
175void tracing_stop_sched_switch_record(void)
176{
177 mutex_lock(&sched_register_mutex);
178 tracer_enabled--;
179 WARN_ON(tracer_enabled < 0);
180 mutex_unlock(&sched_register_mutex);
181
182 tracing_stop_sched_switch();
183}
184
185/**
186 * tracing_sched_switch_assign_trace - assign a trace array for ctx switch
187 * @tr: trace array pointer to assign
188 *
189 * Some tracers might want to record the context switches in their
190 * trace. This function lets those tracers assign the trace array
191 * to use.
192 */
193void tracing_sched_switch_assign_trace(struct trace_array *tr)
194{
195 ctx_trace = tr;
196}
197
152static void start_sched_trace(struct trace_array *tr) 198static void start_sched_trace(struct trace_array *tr)
153{ 199{
154 sched_switch_reset(tr); 200 sched_switch_reset(tr);
155 tracing_start_cmdline_record(); 201 tracing_start_sched_switch_record();
156 tracer_enabled = 1;
157} 202}
158 203
159static void stop_sched_trace(struct trace_array *tr) 204static void stop_sched_trace(struct trace_array *tr)
160{ 205{
161 tracer_enabled = 0; 206 tracing_stop_sched_switch_record();
162 tracing_stop_cmdline_record();
163} 207}
164 208
165static void sched_switch_trace_init(struct trace_array *tr) 209static int sched_switch_trace_init(struct trace_array *tr)
166{ 210{
167 ctx_trace = tr; 211 ctx_trace = tr;
168 212 start_sched_trace(tr);
169 if (tr->ctrl) 213 return 0;
170 start_sched_trace(tr);
171} 214}
172 215
173static void sched_switch_trace_reset(struct trace_array *tr) 216static void sched_switch_trace_reset(struct trace_array *tr)
174{ 217{
175 if (tr->ctrl) 218 if (sched_ref)
176 stop_sched_trace(tr); 219 stop_sched_trace(tr);
177} 220}
178 221
179static void sched_switch_trace_ctrl_update(struct trace_array *tr) 222static void sched_switch_trace_start(struct trace_array *tr)
180{ 223{
181 /* When starting a new trace, reset the buffers */ 224 sched_switch_reset(tr);
182 if (tr->ctrl) 225 tracing_start_sched_switch();
183 start_sched_trace(tr); 226}
184 else 227
185 stop_sched_trace(tr); 228static void sched_switch_trace_stop(struct trace_array *tr)
229{
230 tracing_stop_sched_switch();
186} 231}
187 232
188static struct tracer sched_switch_trace __read_mostly = 233static struct tracer sched_switch_trace __read_mostly =
@@ -190,7 +235,8 @@ static struct tracer sched_switch_trace __read_mostly =
190 .name = "sched_switch", 235 .name = "sched_switch",
191 .init = sched_switch_trace_init, 236 .init = sched_switch_trace_init,
192 .reset = sched_switch_trace_reset, 237 .reset = sched_switch_trace_reset,
193 .ctrl_update = sched_switch_trace_ctrl_update, 238 .start = sched_switch_trace_start,
239 .stop = sched_switch_trace_stop,
194#ifdef CONFIG_FTRACE_SELFTEST 240#ifdef CONFIG_FTRACE_SELFTEST
195 .selftest = trace_selftest_startup_sched_switch, 241 .selftest = trace_selftest_startup_sched_switch,
196#endif 242#endif
@@ -198,14 +244,6 @@ static struct tracer sched_switch_trace __read_mostly =
198 244
199__init static int init_sched_switch_trace(void) 245__init static int init_sched_switch_trace(void)
200{ 246{
201 int ret = 0;
202
203 if (atomic_read(&sched_ref))
204 ret = tracing_sched_register();
205 if (ret) {
206 pr_info("error registering scheduler trace\n");
207 return ret;
208 }
209 return register_tracer(&sched_switch_trace); 247 return register_tracer(&sched_switch_trace);
210} 248}
211device_initcall(init_sched_switch_trace); 249device_initcall(init_sched_switch_trace);
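The three helpers exported above are how a tracer now piggy-backs on context-switch recording; the boot tracer earlier in this series uses them this way. A minimal sketch, with the my_ctx_* names invented (the prototypes live in trace.h):

#include "trace.h"

static int my_ctx_tracer_init(struct trace_array *tr)
{
	/* route sched_switch events into this tracer's array */
	tracing_sched_switch_assign_trace(tr);
	return 0;
}

static void my_ctx_tracer_start(struct trace_array *tr)
{
	tracing_start_sched_switch_record();
}

static void my_ctx_tracer_stop(struct trace_array *tr)
{
	tracing_stop_sched_switch_record();
}

Note that tracing_start_sched_switch_record() warns and bails out if no trace array has been assigned first.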
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 3ae93f16b565..0067b49746c1 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -50,8 +50,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
50 return; 50 return;
51 51
52 pc = preempt_count(); 52 pc = preempt_count();
53 resched = need_resched(); 53 resched = ftrace_preempt_disable();
54 preempt_disable_notrace();
55 54
56 cpu = raw_smp_processor_id(); 55 cpu = raw_smp_processor_id();
57 data = tr->data[cpu]; 56 data = tr->data[cpu];
@@ -81,15 +80,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
81 out: 80 out:
82 atomic_dec(&data->disabled); 81 atomic_dec(&data->disabled);
83 82
84 /* 83 ftrace_preempt_enable(resched);
85 * To prevent recursion from the scheduler, if the
86 * resched flag was set before we entered, then
87 * don't reschedule.
88 */
89 if (resched)
90 preempt_enable_no_resched_notrace();
91 else
92 preempt_enable_notrace();
93} 84}
94 85
95static struct ftrace_ops trace_ops __read_mostly = 86static struct ftrace_ops trace_ops __read_mostly =
@@ -271,6 +262,12 @@ out:
271 atomic_dec(&wakeup_trace->data[cpu]->disabled); 262 atomic_dec(&wakeup_trace->data[cpu]->disabled);
272} 263}
273 264
265/*
266 * save_tracer_enabled is used to save the state of the tracer_enabled
267 * variable when we disable it when we open a trace output file.
268 */
269static int save_tracer_enabled;
270
274static void start_wakeup_tracer(struct trace_array *tr) 271static void start_wakeup_tracer(struct trace_array *tr)
275{ 272{
276 int ret; 273 int ret;
@@ -309,7 +306,13 @@ static void start_wakeup_tracer(struct trace_array *tr)
309 306
310 register_ftrace_function(&trace_ops); 307 register_ftrace_function(&trace_ops);
311 308
312 tracer_enabled = 1; 309 if (tracing_is_enabled()) {
310 tracer_enabled = 1;
311 save_tracer_enabled = 1;
312 } else {
313 tracer_enabled = 0;
314 save_tracer_enabled = 0;
315 }
313 316
314 return; 317 return;
315fail_deprobe_wake_new: 318fail_deprobe_wake_new:
@@ -321,49 +324,53 @@ fail_deprobe:
321static void stop_wakeup_tracer(struct trace_array *tr) 324static void stop_wakeup_tracer(struct trace_array *tr)
322{ 325{
323 tracer_enabled = 0; 326 tracer_enabled = 0;
327 save_tracer_enabled = 0;
324 unregister_ftrace_function(&trace_ops); 328 unregister_ftrace_function(&trace_ops);
325 unregister_trace_sched_switch(probe_wakeup_sched_switch); 329 unregister_trace_sched_switch(probe_wakeup_sched_switch);
326 unregister_trace_sched_wakeup_new(probe_wakeup); 330 unregister_trace_sched_wakeup_new(probe_wakeup);
327 unregister_trace_sched_wakeup(probe_wakeup); 331 unregister_trace_sched_wakeup(probe_wakeup);
328} 332}
329 333
330static void wakeup_tracer_init(struct trace_array *tr) 334static int wakeup_tracer_init(struct trace_array *tr)
331{ 335{
332 wakeup_trace = tr; 336 wakeup_trace = tr;
333 337 start_wakeup_tracer(tr);
334 if (tr->ctrl) 338 return 0;
335 start_wakeup_tracer(tr);
336} 339}
337 340
338static void wakeup_tracer_reset(struct trace_array *tr) 341static void wakeup_tracer_reset(struct trace_array *tr)
339{ 342{
340 if (tr->ctrl) { 343 stop_wakeup_tracer(tr);
341 stop_wakeup_tracer(tr); 344 /* make sure we put back any tasks we are tracing */
342 /* make sure we put back any tasks we are tracing */ 345 wakeup_reset(tr);
343 wakeup_reset(tr); 346}
344 } 347
348static void wakeup_tracer_start(struct trace_array *tr)
349{
350 wakeup_reset(tr);
351 tracer_enabled = 1;
352 save_tracer_enabled = 1;
345} 353}
346 354
347static void wakeup_tracer_ctrl_update(struct trace_array *tr) 355static void wakeup_tracer_stop(struct trace_array *tr)
348{ 356{
349 if (tr->ctrl) 357 tracer_enabled = 0;
350 start_wakeup_tracer(tr); 358 save_tracer_enabled = 0;
351 else
352 stop_wakeup_tracer(tr);
353} 359}
354 360
355static void wakeup_tracer_open(struct trace_iterator *iter) 361static void wakeup_tracer_open(struct trace_iterator *iter)
356{ 362{
357 /* stop the trace while dumping */ 363 /* stop the trace while dumping */
358 if (iter->tr->ctrl) 364 tracer_enabled = 0;
359 stop_wakeup_tracer(iter->tr);
360} 365}
361 366
362static void wakeup_tracer_close(struct trace_iterator *iter) 367static void wakeup_tracer_close(struct trace_iterator *iter)
363{ 368{
364 /* forget about any processes we were recording */ 369 /* forget about any processes we were recording */
365 if (iter->tr->ctrl) 370 if (save_tracer_enabled) {
366 start_wakeup_tracer(iter->tr); 371 wakeup_reset(iter->tr);
372 tracer_enabled = 1;
373 }
367} 374}
368 375
369static struct tracer wakeup_tracer __read_mostly = 376static struct tracer wakeup_tracer __read_mostly =
@@ -371,9 +378,10 @@ static struct tracer wakeup_tracer __read_mostly =
371 .name = "wakeup", 378 .name = "wakeup",
372 .init = wakeup_tracer_init, 379 .init = wakeup_tracer_init,
373 .reset = wakeup_tracer_reset, 380 .reset = wakeup_tracer_reset,
381 .start = wakeup_tracer_start,
382 .stop = wakeup_tracer_stop,
374 .open = wakeup_tracer_open, 383 .open = wakeup_tracer_open,
375 .close = wakeup_tracer_close, 384 .close = wakeup_tracer_close,
376 .ctrl_update = wakeup_tracer_ctrl_update,
377 .print_max = 1, 385 .print_max = 1,
378#ifdef CONFIG_FTRACE_SELFTEST 386#ifdef CONFIG_FTRACE_SELFTEST
379 .selftest = trace_selftest_startup_wakeup, 387 .selftest = trace_selftest_startup_wakeup,
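The wakeup_tracer_call() conversion above is the intended usage pattern for the ftrace_preempt_disable()/ftrace_preempt_enable() pair added to trace.h earlier in this series. A sketch of a generic function-trace callback built the same way; the my_* names and the elided recording step are illustrative only:

#include "trace.h"

static struct trace_array *my_trace;	/* hypothetical, assigned at init time */

static void my_trace_call(unsigned long ip, unsigned long parent_ip)
{
	struct trace_array_cpu *data;
	int resched;
	int cpu;

	/* save the pending-reschedule state, then disable preemption */
	resched = ftrace_preempt_disable();

	cpu = raw_smp_processor_id();
	data = my_trace->data[cpu];

	if (likely(atomic_inc_return(&data->disabled) == 1)) {
		/* ... record the event into the ring buffer here ... */
	}

	atomic_dec(&data->disabled);

	/* re-enable preemption, honouring the saved state */
	ftrace_preempt_enable(resched);
}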
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 90bc752a7580..88c8eb70f54a 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -13,6 +13,7 @@ static inline int trace_valid_entry(struct trace_entry *entry)
13 case TRACE_STACK: 13 case TRACE_STACK:
14 case TRACE_PRINT: 14 case TRACE_PRINT:
15 case TRACE_SPECIAL: 15 case TRACE_SPECIAL:
16 case TRACE_BRANCH:
16 return 1; 17 return 1;
17 } 18 }
18 return 0; 19 return 0;
@@ -51,7 +52,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
51 int cpu, ret = 0; 52 int cpu, ret = 0;
52 53
53 /* Don't allow flipping of max traces now */ 54 /* Don't allow flipping of max traces now */
54 raw_local_irq_save(flags); 55 local_irq_save(flags);
55 __raw_spin_lock(&ftrace_max_lock); 56 __raw_spin_lock(&ftrace_max_lock);
56 57
57 cnt = ring_buffer_entries(tr->buffer); 58 cnt = ring_buffer_entries(tr->buffer);
@@ -62,7 +63,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
62 break; 63 break;
63 } 64 }
64 __raw_spin_unlock(&ftrace_max_lock); 65 __raw_spin_unlock(&ftrace_max_lock);
65 raw_local_irq_restore(flags); 66 local_irq_restore(flags);
66 67
67 if (count) 68 if (count)
68 *count = cnt; 69 *count = cnt;
@@ -70,6 +71,11 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
70 return ret; 71 return ret;
71} 72}
72 73
74static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret)
75{
76 printk(KERN_WARNING "Failed to init %s tracer, init returned %d\n",
77 trace->name, init_ret);
78}
73#ifdef CONFIG_FUNCTION_TRACER 79#ifdef CONFIG_FUNCTION_TRACER
74 80
75#ifdef CONFIG_DYNAMIC_FTRACE 81#ifdef CONFIG_DYNAMIC_FTRACE
@@ -110,8 +116,11 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
110 ftrace_set_filter(func_name, strlen(func_name), 1); 116 ftrace_set_filter(func_name, strlen(func_name), 1);
111 117
112 /* enable tracing */ 118 /* enable tracing */
113 tr->ctrl = 1; 119 ret = trace->init(tr);
114 trace->init(tr); 120 if (ret) {
121 warn_failed_init_tracer(trace, ret);
122 goto out;
123 }
115 124
116 /* Sleep for a 1/10 of a second */ 125 /* Sleep for a 1/10 of a second */
117 msleep(100); 126 msleep(100);
@@ -134,13 +143,13 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
134 msleep(100); 143 msleep(100);
135 144
136 /* stop the tracing. */ 145 /* stop the tracing. */
137 tr->ctrl = 0; 146 tracing_stop();
138 trace->ctrl_update(tr);
139 ftrace_enabled = 0; 147 ftrace_enabled = 0;
140 148
141 /* check the trace buffer */ 149 /* check the trace buffer */
142 ret = trace_test_buffer(tr, &count); 150 ret = trace_test_buffer(tr, &count);
143 trace->reset(tr); 151 trace->reset(tr);
152 tracing_start();
144 153
145 /* we should only have one item */ 154 /* we should only have one item */
146 if (!ret && count != 1) { 155 if (!ret && count != 1) {
@@ -148,6 +157,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
148 ret = -1; 157 ret = -1;
149 goto out; 158 goto out;
150 } 159 }
160
151 out: 161 out:
152 ftrace_enabled = save_ftrace_enabled; 162 ftrace_enabled = save_ftrace_enabled;
153 tracer_enabled = save_tracer_enabled; 163 tracer_enabled = save_tracer_enabled;
@@ -180,18 +190,22 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
180 ftrace_enabled = 1; 190 ftrace_enabled = 1;
181 tracer_enabled = 1; 191 tracer_enabled = 1;
182 192
183 tr->ctrl = 1; 193 ret = trace->init(tr);
184 trace->init(tr); 194 if (ret) {
195 warn_failed_init_tracer(trace, ret);
196 goto out;
197 }
198
185 /* Sleep for a 1/10 of a second */ 199 /* Sleep for a 1/10 of a second */
186 msleep(100); 200 msleep(100);
187 /* stop the tracing. */ 201 /* stop the tracing. */
188 tr->ctrl = 0; 202 tracing_stop();
189 trace->ctrl_update(tr);
190 ftrace_enabled = 0; 203 ftrace_enabled = 0;
191 204
192 /* check the trace buffer */ 205 /* check the trace buffer */
193 ret = trace_test_buffer(tr, &count); 206 ret = trace_test_buffer(tr, &count);
194 trace->reset(tr); 207 trace->reset(tr);
208 tracing_start();
195 209
196 if (!ret && !count) { 210 if (!ret && !count) {
197 printk(KERN_CONT ".. no entries found .."); 211 printk(KERN_CONT ".. no entries found ..");
@@ -223,8 +237,12 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
223 int ret; 237 int ret;
224 238
225 /* start the tracing */ 239 /* start the tracing */
226 tr->ctrl = 1; 240 ret = trace->init(tr);
227 trace->init(tr); 241 if (ret) {
242 warn_failed_init_tracer(trace, ret);
243 return ret;
244 }
245
228 /* reset the max latency */ 246 /* reset the max latency */
229 tracing_max_latency = 0; 247 tracing_max_latency = 0;
230 /* disable interrupts for a bit */ 248 /* disable interrupts for a bit */
@@ -232,13 +250,13 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
232 udelay(100); 250 udelay(100);
233 local_irq_enable(); 251 local_irq_enable();
234 /* stop the tracing. */ 252 /* stop the tracing. */
235 tr->ctrl = 0; 253 tracing_stop();
236 trace->ctrl_update(tr);
237 /* check both trace buffers */ 254 /* check both trace buffers */
238 ret = trace_test_buffer(tr, NULL); 255 ret = trace_test_buffer(tr, NULL);
239 if (!ret) 256 if (!ret)
240 ret = trace_test_buffer(&max_tr, &count); 257 ret = trace_test_buffer(&max_tr, &count);
241 trace->reset(tr); 258 trace->reset(tr);
259 tracing_start();
242 260
243 if (!ret && !count) { 261 if (!ret && !count) {
244 printk(KERN_CONT ".. no entries found .."); 262 printk(KERN_CONT ".. no entries found ..");
@@ -259,9 +277,26 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
259 unsigned long count; 277 unsigned long count;
260 int ret; 278 int ret;
261 279
280 /*
281 * Now that the big kernel lock is no longer preemptable,
282 * and this is called with the BKL held, it will always
283 * fail. If preemption is already disabled, simply
284 * pass the test. When the BKL is removed, or becomes
285 * preemptible again, we will once again test this,
286 * so keep it in.
287 */
288 if (preempt_count()) {
289 printk(KERN_CONT "can not test ... force ");
290 return 0;
291 }
292
262 /* start the tracing */ 293 /* start the tracing */
263 tr->ctrl = 1; 294 ret = trace->init(tr);
264 trace->init(tr); 295 if (ret) {
296 warn_failed_init_tracer(trace, ret);
297 return ret;
298 }
299
265 /* reset the max latency */ 300 /* reset the max latency */
266 tracing_max_latency = 0; 301 tracing_max_latency = 0;
267 /* disable preemption for a bit */ 302 /* disable preemption for a bit */
@@ -269,13 +304,13 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
269 udelay(100); 304 udelay(100);
270 preempt_enable(); 305 preempt_enable();
271 /* stop the tracing. */ 306 /* stop the tracing. */
272 tr->ctrl = 0; 307 tracing_stop();
273 trace->ctrl_update(tr);
274 /* check both trace buffers */ 308 /* check both trace buffers */
275 ret = trace_test_buffer(tr, NULL); 309 ret = trace_test_buffer(tr, NULL);
276 if (!ret) 310 if (!ret)
277 ret = trace_test_buffer(&max_tr, &count); 311 ret = trace_test_buffer(&max_tr, &count);
278 trace->reset(tr); 312 trace->reset(tr);
313 tracing_start();
279 314
280 if (!ret && !count) { 315 if (!ret && !count) {
281 printk(KERN_CONT ".. no entries found .."); 316 printk(KERN_CONT ".. no entries found ..");
@@ -296,9 +331,25 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
296 unsigned long count; 331 unsigned long count;
297 int ret; 332 int ret;
298 333
334 /*
335 * Now that the big kernel lock is no longer preemptable,
336 * and this is called with the BKL held, it will always
337 * fail. If preemption is already disabled, simply
338 * pass the test. When the BKL is removed, or becomes
339 * preemptible again, we will once again test this,
340 * so keep it in.
341 */
342 if (preempt_count()) {
343 printk(KERN_CONT "can not test ... force ");
344 return 0;
345 }
346
299 /* start the tracing */ 347 /* start the tracing */
300 tr->ctrl = 1; 348 ret = trace->init(tr);
301 trace->init(tr); 349 if (ret) {
350 warn_failed_init_tracer(trace, ret);
351 goto out;
352 }
302 353
303 /* reset the max latency */ 354 /* reset the max latency */
304 tracing_max_latency = 0; 355 tracing_max_latency = 0;
@@ -312,27 +363,30 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
312 local_irq_enable(); 363 local_irq_enable();
313 364
314 /* stop the tracing. */ 365 /* stop the tracing. */
315 tr->ctrl = 0; 366 tracing_stop();
316 trace->ctrl_update(tr);
317 /* check both trace buffers */ 367 /* check both trace buffers */
318 ret = trace_test_buffer(tr, NULL); 368 ret = trace_test_buffer(tr, NULL);
319 if (ret) 369 if (ret) {
370 tracing_start();
320 goto out; 371 goto out;
372 }
321 373
322 ret = trace_test_buffer(&max_tr, &count); 374 ret = trace_test_buffer(&max_tr, &count);
323 if (ret) 375 if (ret) {
376 tracing_start();
324 goto out; 377 goto out;
378 }
325 379
326 if (!ret && !count) { 380 if (!ret && !count) {
327 printk(KERN_CONT ".. no entries found .."); 381 printk(KERN_CONT ".. no entries found ..");
328 ret = -1; 382 ret = -1;
383 tracing_start();
329 goto out; 384 goto out;
330 } 385 }
331 386
332 /* do the test by disabling interrupts first this time */ 387 /* do the test by disabling interrupts first this time */
333 tracing_max_latency = 0; 388 tracing_max_latency = 0;
334 tr->ctrl = 1; 389 tracing_start();
335 trace->ctrl_update(tr);
336 preempt_disable(); 390 preempt_disable();
337 local_irq_disable(); 391 local_irq_disable();
338 udelay(100); 392 udelay(100);
@@ -341,8 +395,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
341 local_irq_enable(); 395 local_irq_enable();
342 396
343 /* stop the tracing. */ 397 /* stop the tracing. */
344 tr->ctrl = 0; 398 tracing_stop();
345 trace->ctrl_update(tr);
346 /* check both trace buffers */ 399 /* check both trace buffers */
347 ret = trace_test_buffer(tr, NULL); 400 ret = trace_test_buffer(tr, NULL);
348 if (ret) 401 if (ret)
@@ -358,6 +411,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
358 411
359 out: 412 out:
360 trace->reset(tr); 413 trace->reset(tr);
414 tracing_start();
361 tracing_max_latency = save_max; 415 tracing_max_latency = save_max;
362 416
363 return ret; 417 return ret;
@@ -423,8 +477,12 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
423 wait_for_completion(&isrt); 477 wait_for_completion(&isrt);
424 478
425 /* start the tracing */ 479 /* start the tracing */
426 tr->ctrl = 1; 480 ret = trace->init(tr);
427 trace->init(tr); 481 if (ret) {
482 warn_failed_init_tracer(trace, ret);
483 return ret;
484 }
485
428 /* reset the max latency */ 486 /* reset the max latency */
429 tracing_max_latency = 0; 487 tracing_max_latency = 0;
430 488
@@ -448,8 +506,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
448 msleep(100); 506 msleep(100);
449 507
450 /* stop the tracing. */ 508 /* stop the tracing. */
451 tr->ctrl = 0; 509 tracing_stop();
452 trace->ctrl_update(tr);
453 /* check both trace buffers */ 510 /* check both trace buffers */
454 ret = trace_test_buffer(tr, NULL); 511 ret = trace_test_buffer(tr, NULL);
455 if (!ret) 512 if (!ret)
@@ -457,6 +514,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
457 514
458 515
459 trace->reset(tr); 516 trace->reset(tr);
517 tracing_start();
460 518
461 tracing_max_latency = save_max; 519 tracing_max_latency = save_max;
462 520
@@ -480,16 +538,20 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr
480 int ret; 538 int ret;
481 539
482 /* start the tracing */ 540 /* start the tracing */
483 tr->ctrl = 1; 541 ret = trace->init(tr);
484 trace->init(tr); 542 if (ret) {
543 warn_failed_init_tracer(trace, ret);
544 return ret;
545 }
546
485 /* Sleep for a 1/10 of a second */ 547 /* Sleep for a 1/10 of a second */
486 msleep(100); 548 msleep(100);
487 /* stop the tracing. */ 549 /* stop the tracing. */
488 tr->ctrl = 0; 550 tracing_stop();
489 trace->ctrl_update(tr);
490 /* check the trace buffer */ 551 /* check the trace buffer */
491 ret = trace_test_buffer(tr, &count); 552 ret = trace_test_buffer(tr, &count);
492 trace->reset(tr); 553 trace->reset(tr);
554 tracing_start();
493 555
494 if (!ret && !count) { 556 if (!ret && !count) {
495 printk(KERN_CONT ".. no entries found .."); 557 printk(KERN_CONT ".. no entries found ..");
@@ -508,17 +570,48 @@ trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr)
508 int ret; 570 int ret;
509 571
510 /* start the tracing */ 572 /* start the tracing */
511 tr->ctrl = 1; 573 ret = trace->init(tr);
512 trace->init(tr); 574 if (ret) {
575 warn_failed_init_tracer(trace, ret);
576 return 0;
577 }
578
513 /* Sleep for a 1/10 of a second */ 579 /* Sleep for a 1/10 of a second */
514 msleep(100); 580 msleep(100);
515 /* stop the tracing. */ 581 /* stop the tracing. */
516 tr->ctrl = 0; 582 tracing_stop();
517 trace->ctrl_update(tr);
518 /* check the trace buffer */ 583 /* check the trace buffer */
519 ret = trace_test_buffer(tr, &count); 584 ret = trace_test_buffer(tr, &count);
520 trace->reset(tr); 585 trace->reset(tr);
586 tracing_start();
521 587
522 return ret; 588 return ret;
523} 589}
524#endif /* CONFIG_SYSPROF_TRACER */ 590#endif /* CONFIG_SYSPROF_TRACER */
591
592#ifdef CONFIG_BRANCH_TRACER
593int
594trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
595{
596 unsigned long count;
597 int ret;
598
599 /* start the tracing */
600 ret = trace->init(tr);
601 if (ret) {
602 warn_failed_init_tracer(trace, ret);
603 return ret;
604 }
605
606 /* Sleep for a 1/10 of a second */
607 msleep(100);
608 /* stop the tracing. */
609 tracing_stop();
610 /* check the trace buffer */
611 ret = trace_test_buffer(tr, &count);
612 trace->reset(tr);
613 tracing_start();
614
615 return ret;
616}
617#endif /* CONFIG_BRANCH_TRACER */
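
The selftests above all follow one new pattern: trace->init() is fallible, failures are reported through warn_failed_init_tracer(), and the buffer check is bracketed by tracing_stop()/tracing_start() instead of toggling tr->ctrl and calling ctrl_update(). A condensed sketch of that pattern, pulled from the hunks above rather than any new API:

    ret = trace->init(tr);
    if (ret) {
            warn_failed_init_tracer(trace, ret);
            return ret;
    }

    /* let the tracer collect some events */
    msleep(100);

    tracing_stop();                          /* freeze the ring buffer */
    ret = trace_test_buffer(tr, &count);     /* count entries, detect clobbering */
    trace->reset(tr);
    tracing_start();                         /* always restart, even on failure */

Note that the preempt/irqsoff variants also have to call tracing_start() on every early-exit path, which is why those hunks add it before each goto out.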
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 3bdb44bde4b7..fde3be15c642 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -107,8 +107,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
107 if (unlikely(!ftrace_enabled || stack_trace_disabled)) 107 if (unlikely(!ftrace_enabled || stack_trace_disabled))
108 return; 108 return;
109 109
110 resched = need_resched(); 110 resched = ftrace_preempt_disable();
111 preempt_disable_notrace();
112 111
113 cpu = raw_smp_processor_id(); 112 cpu = raw_smp_processor_id();
114 /* no atomic needed, we only modify this variable by this cpu */ 113 /* no atomic needed, we only modify this variable by this cpu */
@@ -120,10 +119,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
120 out: 119 out:
121 per_cpu(trace_active, cpu)--; 120 per_cpu(trace_active, cpu)--;
122 /* prevent recursion in schedule */ 121 /* prevent recursion in schedule */
123 if (resched) 122 ftrace_preempt_enable(resched);
124 preempt_enable_no_resched_notrace();
125 else
126 preempt_enable_notrace();
127} 123}
128 124
129static struct ftrace_ops trace_ops __read_mostly = 125static struct ftrace_ops trace_ops __read_mostly =
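
stack_trace_call() now uses the ftrace_preempt_disable()/ftrace_preempt_enable() pair instead of open-coding the need_resched() dance. Judging from the code removed here, the helpers presumably amount to the following sketch (their actual definitions live elsewhere in this series):

    static inline int ftrace_preempt_disable(void)
    {
            int resched = need_resched();

            preempt_disable_notrace();
            return resched;
    }

    static inline void ftrace_preempt_enable(int resched)
    {
            if (resched)
                    preempt_enable_no_resched_notrace();
            else
                    preempt_enable_notrace();
    }

The point is to keep preempt_enable() from re-entering schedule() (and therefore the tracer itself) when NEED_RESCHED was already set before the callback disabled preemption.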
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index 9587d3bcba55..54960edb96d0 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -261,27 +261,17 @@ static void stop_stack_trace(struct trace_array *tr)
261 mutex_unlock(&sample_timer_lock); 261 mutex_unlock(&sample_timer_lock);
262} 262}
263 263
264static void stack_trace_init(struct trace_array *tr) 264static int stack_trace_init(struct trace_array *tr)
265{ 265{
266 sysprof_trace = tr; 266 sysprof_trace = tr;
267 267
268 if (tr->ctrl) 268 start_stack_trace(tr);
269 start_stack_trace(tr); 269 return 0;
270} 270}
271 271
272static void stack_trace_reset(struct trace_array *tr) 272static void stack_trace_reset(struct trace_array *tr)
273{ 273{
274 if (tr->ctrl) 274 stop_stack_trace(tr);
275 stop_stack_trace(tr);
276}
277
278static void stack_trace_ctrl_update(struct trace_array *tr)
279{
280 /* When starting a new trace, reset the buffers */
281 if (tr->ctrl)
282 start_stack_trace(tr);
283 else
284 stop_stack_trace(tr);
285} 275}
286 276
287static struct tracer stack_trace __read_mostly = 277static struct tracer stack_trace __read_mostly =
@@ -289,7 +279,6 @@ static struct tracer stack_trace __read_mostly =
289 .name = "sysprof", 279 .name = "sysprof",
290 .init = stack_trace_init, 280 .init = stack_trace_init,
291 .reset = stack_trace_reset, 281 .reset = stack_trace_reset,
292 .ctrl_update = stack_trace_ctrl_update,
293#ifdef CONFIG_FTRACE_SELFTEST 282#ifdef CONFIG_FTRACE_SELFTEST
294 .selftest = trace_selftest_startup_sysprof, 283 .selftest = trace_selftest_startup_sysprof,
295#endif 284#endif
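
With ctrl_update() gone, a tracer such as sysprof only implements init() and reset(); starting and stopping the trace buffer is handled by the core via tracing_start()/tracing_stop(). A minimal sketch of the reduced interface (my_tracer and its callbacks are illustrative, not part of the patch; struct tracer and struct trace_array come from kernel/trace/trace.h):

    static int my_trace_init(struct trace_array *tr)
    {
            /* arm timers/probes here; a non-zero return is now reported
             * by the caller instead of being silently ignored */
            return 0;
    }

    static void my_trace_reset(struct trace_array *tr)
    {
            /* undo everything init() set up */
    }

    static struct tracer my_tracer __read_mostly = {
            .name   = "my_tracer",
            .init   = my_trace_init,
            .reset  = my_trace_reset,
    };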
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index af8c85664882..79602740bbb5 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -43,6 +43,7 @@ static DEFINE_MUTEX(tracepoints_mutex);
43 */ 43 */
44#define TRACEPOINT_HASH_BITS 6 44#define TRACEPOINT_HASH_BITS 6
45#define TRACEPOINT_TABLE_SIZE (1 << TRACEPOINT_HASH_BITS) 45#define TRACEPOINT_TABLE_SIZE (1 << TRACEPOINT_HASH_BITS)
46static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE];
46 47
47/* 48/*
48 * Note about RCU : 49 * Note about RCU :
@@ -54,40 +55,43 @@ struct tracepoint_entry {
54 struct hlist_node hlist; 55 struct hlist_node hlist;
55 void **funcs; 56 void **funcs;
56 int refcount; /* Number of times armed. 0 if disarmed. */ 57 int refcount; /* Number of times armed. 0 if disarmed. */
57 struct rcu_head rcu;
58 void *oldptr;
59 unsigned char rcu_pending:1;
60 char name[0]; 58 char name[0];
61}; 59};
62 60
63static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE]; 61struct tp_probes {
62 union {
63 struct rcu_head rcu;
64 struct list_head list;
65 } u;
66 void *probes[0];
67};
64 68
65static void free_old_closure(struct rcu_head *head) 69static inline void *allocate_probes(int count)
66{ 70{
67 struct tracepoint_entry *entry = container_of(head, 71 struct tp_probes *p = kmalloc(count * sizeof(void *)
68 struct tracepoint_entry, rcu); 72 + sizeof(struct tp_probes), GFP_KERNEL);
69 kfree(entry->oldptr); 73 return p == NULL ? NULL : p->probes;
70 /* Make sure we free the data before setting the pending flag to 0 */
71 smp_wmb();
72 entry->rcu_pending = 0;
73} 74}
74 75
75static void tracepoint_entry_free_old(struct tracepoint_entry *entry, void *old) 76static void rcu_free_old_probes(struct rcu_head *head)
76{ 77{
77 if (!old) 78 kfree(container_of(head, struct tp_probes, u.rcu));
78 return; 79}
79 entry->oldptr = old; 80
80 entry->rcu_pending = 1; 81static inline void release_probes(void *old)
81 /* write rcu_pending before calling the RCU callback */ 82{
82 smp_wmb(); 83 if (old) {
83 call_rcu_sched(&entry->rcu, free_old_closure); 84 struct tp_probes *tp_probes = container_of(old,
85 struct tp_probes, probes[0]);
86 call_rcu_sched(&tp_probes->u.rcu, rcu_free_old_probes);
87 }
84} 88}
85 89
86static void debug_print_probes(struct tracepoint_entry *entry) 90static void debug_print_probes(struct tracepoint_entry *entry)
87{ 91{
88 int i; 92 int i;
89 93
90 if (!tracepoint_debug) 94 if (!tracepoint_debug || !entry->funcs)
91 return; 95 return;
92 96
93 for (i = 0; entry->funcs[i]; i++) 97 for (i = 0; entry->funcs[i]; i++)
@@ -111,12 +115,13 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
111 return ERR_PTR(-EEXIST); 115 return ERR_PTR(-EEXIST);
112 } 116 }
113 /* + 2 : one for new probe, one for NULL func */ 117 /* + 2 : one for new probe, one for NULL func */
114 new = kzalloc((nr_probes + 2) * sizeof(void *), GFP_KERNEL); 118 new = allocate_probes(nr_probes + 2);
115 if (new == NULL) 119 if (new == NULL)
116 return ERR_PTR(-ENOMEM); 120 return ERR_PTR(-ENOMEM);
117 if (old) 121 if (old)
118 memcpy(new, old, nr_probes * sizeof(void *)); 122 memcpy(new, old, nr_probes * sizeof(void *));
119 new[nr_probes] = probe; 123 new[nr_probes] = probe;
124 new[nr_probes + 1] = NULL;
120 entry->refcount = nr_probes + 1; 125 entry->refcount = nr_probes + 1;
121 entry->funcs = new; 126 entry->funcs = new;
122 debug_print_probes(entry); 127 debug_print_probes(entry);
@@ -132,7 +137,7 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
132 old = entry->funcs; 137 old = entry->funcs;
133 138
134 if (!old) 139 if (!old)
135 return NULL; 140 return ERR_PTR(-ENOENT);
136 141
137 debug_print_probes(entry); 142 debug_print_probes(entry);
138 /* (N -> M), (N > 1, M >= 0) probes */ 143 /* (N -> M), (N > 1, M >= 0) probes */
@@ -151,13 +156,13 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
151 int j = 0; 156 int j = 0;
152 /* N -> M, (N > 1, M > 0) */ 157 /* N -> M, (N > 1, M > 0) */
153 /* + 1 for NULL */ 158 /* + 1 for NULL */
154 new = kzalloc((nr_probes - nr_del + 1) 159 new = allocate_probes(nr_probes - nr_del + 1);
155 * sizeof(void *), GFP_KERNEL);
156 if (new == NULL) 160 if (new == NULL)
157 return ERR_PTR(-ENOMEM); 161 return ERR_PTR(-ENOMEM);
158 for (i = 0; old[i]; i++) 162 for (i = 0; old[i]; i++)
159 if ((probe && old[i] != probe)) 163 if ((probe && old[i] != probe))
160 new[j++] = old[i]; 164 new[j++] = old[i];
165 new[nr_probes - nr_del] = NULL;
161 entry->refcount = nr_probes - nr_del; 166 entry->refcount = nr_probes - nr_del;
162 entry->funcs = new; 167 entry->funcs = new;
163 } 168 }
@@ -215,7 +220,6 @@ static struct tracepoint_entry *add_tracepoint(const char *name)
215 memcpy(&e->name[0], name, name_len); 220 memcpy(&e->name[0], name, name_len);
216 e->funcs = NULL; 221 e->funcs = NULL;
217 e->refcount = 0; 222 e->refcount = 0;
218 e->rcu_pending = 0;
219 hlist_add_head(&e->hlist, head); 223 hlist_add_head(&e->hlist, head);
220 return e; 224 return e;
221} 225}
@@ -224,32 +228,10 @@ static struct tracepoint_entry *add_tracepoint(const char *name)
224 * Remove the tracepoint from the tracepoint hash table. Must be called with 228 * Remove the tracepoint from the tracepoint hash table. Must be called with
225 * mutex_lock held. 229 * mutex_lock held.
226 */ 230 */
227static int remove_tracepoint(const char *name) 231static inline void remove_tracepoint(struct tracepoint_entry *e)
228{ 232{
229 struct hlist_head *head;
230 struct hlist_node *node;
231 struct tracepoint_entry *e;
232 int found = 0;
233 size_t len = strlen(name) + 1;
234 u32 hash = jhash(name, len-1, 0);
235
236 head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)];
237 hlist_for_each_entry(e, node, head, hlist) {
238 if (!strcmp(name, e->name)) {
239 found = 1;
240 break;
241 }
242 }
243 if (!found)
244 return -ENOENT;
245 if (e->refcount)
246 return -EBUSY;
247 hlist_del(&e->hlist); 233 hlist_del(&e->hlist);
248 /* Make sure the call_rcu_sched has been executed */
249 if (e->rcu_pending)
250 rcu_barrier_sched();
251 kfree(e); 234 kfree(e);
252 return 0;
253} 235}
254 236
255/* 237/*
@@ -280,6 +262,7 @@ static void set_tracepoint(struct tracepoint_entry **entry,
280static void disable_tracepoint(struct tracepoint *elem) 262static void disable_tracepoint(struct tracepoint *elem)
281{ 263{
282 elem->state = 0; 264 elem->state = 0;
265 rcu_assign_pointer(elem->funcs, NULL);
283} 266}
284 267
285/** 268/**
@@ -320,6 +303,23 @@ static void tracepoint_update_probes(void)
320 module_update_tracepoints(); 303 module_update_tracepoints();
321} 304}
322 305
306static void *tracepoint_add_probe(const char *name, void *probe)
307{
308 struct tracepoint_entry *entry;
309 void *old;
310
311 entry = get_tracepoint(name);
312 if (!entry) {
313 entry = add_tracepoint(name);
314 if (IS_ERR(entry))
315 return entry;
316 }
317 old = tracepoint_entry_add_probe(entry, probe);
318 if (IS_ERR(old) && !entry->refcount)
319 remove_tracepoint(entry);
320 return old;
321}
322
323/** 323/**
324 * tracepoint_probe_register - Connect a probe to a tracepoint 324 * tracepoint_probe_register - Connect a probe to a tracepoint
325 * @name: tracepoint name 325 * @name: tracepoint name
@@ -330,44 +330,36 @@ static void tracepoint_update_probes(void)
330 */ 330 */
331int tracepoint_probe_register(const char *name, void *probe) 331int tracepoint_probe_register(const char *name, void *probe)
332{ 332{
333 struct tracepoint_entry *entry;
334 int ret = 0;
335 void *old; 333 void *old;
336 334
337 mutex_lock(&tracepoints_mutex); 335 mutex_lock(&tracepoints_mutex);
338 entry = get_tracepoint(name); 336 old = tracepoint_add_probe(name, probe);
339 if (!entry) {
340 entry = add_tracepoint(name);
341 if (IS_ERR(entry)) {
342 ret = PTR_ERR(entry);
343 goto end;
344 }
345 }
346 /*
347 * If we detect that a call_rcu_sched is pending for this tracepoint,
348 * make sure it's executed now.
349 */
350 if (entry->rcu_pending)
351 rcu_barrier_sched();
352 old = tracepoint_entry_add_probe(entry, probe);
353 if (IS_ERR(old)) {
354 ret = PTR_ERR(old);
355 goto end;
356 }
357 mutex_unlock(&tracepoints_mutex); 337 mutex_unlock(&tracepoints_mutex);
338 if (IS_ERR(old))
339 return PTR_ERR(old);
340
358 tracepoint_update_probes(); /* may update entry */ 341 tracepoint_update_probes(); /* may update entry */
359 mutex_lock(&tracepoints_mutex); 342 release_probes(old);
360 entry = get_tracepoint(name); 343 return 0;
361 WARN_ON(!entry);
362 if (entry->rcu_pending)
363 rcu_barrier_sched();
364 tracepoint_entry_free_old(entry, old);
365end:
366 mutex_unlock(&tracepoints_mutex);
367 return ret;
368} 344}
369EXPORT_SYMBOL_GPL(tracepoint_probe_register); 345EXPORT_SYMBOL_GPL(tracepoint_probe_register);
370 346
347static void *tracepoint_remove_probe(const char *name, void *probe)
348{
349 struct tracepoint_entry *entry;
350 void *old;
351
352 entry = get_tracepoint(name);
353 if (!entry)
354 return ERR_PTR(-ENOENT);
355 old = tracepoint_entry_remove_probe(entry, probe);
356 if (IS_ERR(old))
357 return old;
358 if (!entry->refcount)
359 remove_tracepoint(entry);
360 return old;
361}
362
371/** 363/**
372 * tracepoint_probe_unregister - Disconnect a probe from a tracepoint 364 * tracepoint_probe_unregister - Disconnect a probe from a tracepoint
373 * @name: tracepoint name 365 * @name: tracepoint name
@@ -380,38 +372,104 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register);
380 */ 372 */
381int tracepoint_probe_unregister(const char *name, void *probe) 373int tracepoint_probe_unregister(const char *name, void *probe)
382{ 374{
383 struct tracepoint_entry *entry;
384 void *old; 375 void *old;
385 int ret = -ENOENT;
386 376
387 mutex_lock(&tracepoints_mutex); 377 mutex_lock(&tracepoints_mutex);
388 entry = get_tracepoint(name); 378 old = tracepoint_remove_probe(name, probe);
389 if (!entry)
390 goto end;
391 if (entry->rcu_pending)
392 rcu_barrier_sched();
393 old = tracepoint_entry_remove_probe(entry, probe);
394 if (!old) {
395 printk(KERN_WARNING "Warning: Trying to unregister a probe"
396 "that doesn't exist\n");
397 goto end;
398 }
399 mutex_unlock(&tracepoints_mutex); 379 mutex_unlock(&tracepoints_mutex);
380 if (IS_ERR(old))
381 return PTR_ERR(old);
382
400 tracepoint_update_probes(); /* may update entry */ 383 tracepoint_update_probes(); /* may update entry */
384 release_probes(old);
385 return 0;
386}
387EXPORT_SYMBOL_GPL(tracepoint_probe_unregister);
388
389static LIST_HEAD(old_probes);
390static int need_update;
391
392static void tracepoint_add_old_probes(void *old)
393{
394 need_update = 1;
395 if (old) {
396 struct tp_probes *tp_probes = container_of(old,
397 struct tp_probes, probes[0]);
398 list_add(&tp_probes->u.list, &old_probes);
399 }
400}
401
402/**
 403 * tracepoint_probe_register_noupdate - register a probe without connecting it
404 * @name: tracepoint name
405 * @probe: probe handler
406 *
407 * caller must call tracepoint_probe_update_all()
408 */
409int tracepoint_probe_register_noupdate(const char *name, void *probe)
410{
411 void *old;
412
401 mutex_lock(&tracepoints_mutex); 413 mutex_lock(&tracepoints_mutex);
402 entry = get_tracepoint(name); 414 old = tracepoint_add_probe(name, probe);
403 if (!entry) 415 if (IS_ERR(old)) {
404 goto end; 416 mutex_unlock(&tracepoints_mutex);
405 if (entry->rcu_pending) 417 return PTR_ERR(old);
406 rcu_barrier_sched(); 418 }
407 tracepoint_entry_free_old(entry, old); 419 tracepoint_add_old_probes(old);
408 remove_tracepoint(name); /* Ignore busy error message */
409 ret = 0;
410end:
411 mutex_unlock(&tracepoints_mutex); 420 mutex_unlock(&tracepoints_mutex);
412 return ret; 421 return 0;
413} 422}
414EXPORT_SYMBOL_GPL(tracepoint_probe_unregister); 423EXPORT_SYMBOL_GPL(tracepoint_probe_register_noupdate);
424
425/**
 426 * tracepoint_probe_unregister_noupdate - remove a probe without disconnecting it
427 * @name: tracepoint name
428 * @probe: probe function pointer
429 *
430 * caller must call tracepoint_probe_update_all()
431 */
432int tracepoint_probe_unregister_noupdate(const char *name, void *probe)
433{
434 void *old;
435
436 mutex_lock(&tracepoints_mutex);
437 old = tracepoint_remove_probe(name, probe);
438 if (IS_ERR(old)) {
439 mutex_unlock(&tracepoints_mutex);
440 return PTR_ERR(old);
441 }
442 tracepoint_add_old_probes(old);
443 mutex_unlock(&tracepoints_mutex);
444 return 0;
445}
446EXPORT_SYMBOL_GPL(tracepoint_probe_unregister_noupdate);
447
448/**
449 * tracepoint_probe_update_all - update tracepoints
450 */
451void tracepoint_probe_update_all(void)
452{
453 LIST_HEAD(release_probes);
454 struct tp_probes *pos, *next;
455
456 mutex_lock(&tracepoints_mutex);
457 if (!need_update) {
458 mutex_unlock(&tracepoints_mutex);
459 return;
460 }
461 if (!list_empty(&old_probes))
462 list_replace_init(&old_probes, &release_probes);
463 need_update = 0;
464 mutex_unlock(&tracepoints_mutex);
465
466 tracepoint_update_probes();
467 list_for_each_entry_safe(pos, next, &release_probes, u.list) {
468 list_del(&pos->u.list);
469 call_rcu_sched(&pos->u.rcu, rcu_free_old_probes);
470 }
471}
472EXPORT_SYMBOL_GPL(tracepoint_probe_update_all);
415 473
416/** 474/**
417 * tracepoint_get_iter_range - Get a next tracepoint iterator given a range. 475 * tracepoint_get_iter_range - Get a next tracepoint iterator given a range.
@@ -483,3 +541,36 @@ void tracepoint_iter_reset(struct tracepoint_iter *iter)
483 iter->tracepoint = NULL; 541 iter->tracepoint = NULL;
484} 542}
485EXPORT_SYMBOL_GPL(tracepoint_iter_reset); 543EXPORT_SYMBOL_GPL(tracepoint_iter_reset);
544
545#ifdef CONFIG_MODULES
546
547int tracepoint_module_notify(struct notifier_block *self,
548 unsigned long val, void *data)
549{
550 struct module *mod = data;
551
552 switch (val) {
553 case MODULE_STATE_COMING:
554 tracepoint_update_probe_range(mod->tracepoints,
555 mod->tracepoints + mod->num_tracepoints);
556 break;
557 case MODULE_STATE_GOING:
558 tracepoint_update_probe_range(mod->tracepoints,
559 mod->tracepoints + mod->num_tracepoints);
560 break;
561 }
562 return 0;
563}
564
565struct notifier_block tracepoint_module_nb = {
566 .notifier_call = tracepoint_module_notify,
567 .priority = 0,
568};
569
570static int init_tracepoints(void)
571{
572 return register_module_notifier(&tracepoint_module_nb);
573}
574__initcall(init_tracepoints);
575
576#endif /* CONFIG_MODULES */
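
Two details of the tracepoint rework are worth spelling out. First, a probe array is now the flexible tail of struct tp_probes, so release_probes() can recover the surrounding structure with container_of(old, struct tp_probes, probes[0]) and defer the kfree() through call_rcu_sched(). Second, the *_noupdate variants let a caller batch many registrations and pay for the tracepoint-site update only once. A hedged usage sketch of the batched API (the probe functions are stand-ins modelled on the samples further below):

    #include <linux/fs.h>
    #include <linux/tracepoint.h>

    /* stand-in probes matching the sample tracepoints */
    static void probe_a(struct inode *inode, struct file *file)
    { }

    static void probe_b(void)
    { }

    static void register_my_probes(void)
    {
            /* error handling omitted for brevity */
            tracepoint_probe_register_noupdate("subsys_event", (void *)probe_a);
            tracepoint_probe_register_noupdate("subsys_eventb", (void *)probe_b);

            /* one pass over all tracepoint sites; the probe arrays that were
             * replaced above sit on old_probes and are freed here through
             * call_rcu_sched() */
            tracepoint_probe_update_all();
    }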
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index b0f239e443bc..1e3fd3e3436a 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -545,6 +545,16 @@ config DEBUG_SG
545 545
546 If unsure, say N. 546 If unsure, say N.
547 547
548config DEBUG_NOTIFIERS
549 bool "Debug notifier call chains"
550 depends on DEBUG_KERNEL
551 help
552 Enable this to turn on sanity checking for notifier call chains.
553 This is most useful for kernel developers to make sure that
554 modules properly unregister themselves from notifier chains.
555 This is a relatively cheap check but if you care about maximum
556 performance, say N.
557
548config FRAME_POINTER 558config FRAME_POINTER
549 bool "Compile the kernel with frame pointers" 559 bool "Compile the kernel with frame pointers"
550 depends on DEBUG_KERNEL && \ 560 depends on DEBUG_KERNEL && \
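
The notifier core is expected to use DEBUG_NOTIFIERS to sanity-check a callback before invoking it, which catches modules that unloaded without unregistering their notifier blocks. Conceptually the check amounts to the sketch below; the real code may wrap kernel_text_address() in a helper that also copes with function descriptors on ia64/powerpc64:

    #include <linux/kernel.h>
    #include <linux/notifier.h>

    /* sketch: does this notifier callback still point at kernel text? */
    static bool notifier_callback_looks_sane(struct notifier_block *nb)
    {
            return kernel_text_address((unsigned long)nb->notifier_call);
    }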
diff --git a/mm/memory.c b/mm/memory.c
index 164951c47305..fc031d68327e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3049,3 +3049,18 @@ void print_vma_addr(char *prefix, unsigned long ip)
3049 } 3049 }
3050 up_read(&current->mm->mmap_sem); 3050 up_read(&current->mm->mmap_sem);
3051} 3051}
3052
3053#ifdef CONFIG_PROVE_LOCKING
3054void might_fault(void)
3055{
3056 might_sleep();
3057 /*
3058 * It would be nicer to annotate only the paths that are not under
3059 * pagefault_disable(); however, that requires a larger audit and
3060 * helpers like get_user_atomic.
3061 */
3062 if (!in_atomic() && current->mm)
3063 might_lock_read(&current->mm->mmap_sem);
3064}
3065EXPORT_SYMBOL(might_fault);
3066#endif
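
might_fault() combines a might_sleep() check with a lockdep annotation that mmap_sem may be taken for read, so any path that can fault on user memory gets checked even when the fault never happens. Its natural callers are the copy_{to,from}_user() implementations; a sketch of the intended call pattern (my_copy_from_user is illustrative only):

    #include <linux/kernel.h>
    #include <linux/uaccess.h>

    static inline unsigned long
    my_copy_from_user(void *dst, const void __user *src, unsigned long n)
    {
            /* with CONFIG_PROVE_LOCKING this flags callers that could
             * deadlock against mmap_sem or that sleep where they must not */
            might_fault();
            return copy_from_user(dst, src, n);
    }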
diff --git a/samples/tracepoints/tp-samples-trace.h b/samples/tracepoints/tp-samples-trace.h
index 0216b55bd640..01724e04c556 100644
--- a/samples/tracepoints/tp-samples-trace.h
+++ b/samples/tracepoints/tp-samples-trace.h
@@ -4,10 +4,10 @@
4#include <linux/proc_fs.h> /* for struct inode and struct file */ 4#include <linux/proc_fs.h> /* for struct inode and struct file */
5#include <linux/tracepoint.h> 5#include <linux/tracepoint.h>
6 6
7DEFINE_TRACE(subsys_event, 7DECLARE_TRACE(subsys_event,
8 TPPROTO(struct inode *inode, struct file *file), 8 TPPROTO(struct inode *inode, struct file *file),
9 TPARGS(inode, file)); 9 TPARGS(inode, file));
10DEFINE_TRACE(subsys_eventb, 10DECLARE_TRACE(subsys_eventb,
11 TPPROTO(void), 11 TPPROTO(void),
12 TPARGS()); 12 TPARGS());
13#endif 13#endif
diff --git a/samples/tracepoints/tracepoint-probe-sample.c b/samples/tracepoints/tracepoint-probe-sample.c
index 55abfdda4bd4..e3a964889dc7 100644
--- a/samples/tracepoints/tracepoint-probe-sample.c
+++ b/samples/tracepoints/tracepoint-probe-sample.c
@@ -46,6 +46,7 @@ void __exit tp_sample_trace_exit(void)
46{ 46{
47 unregister_trace_subsys_eventb(probe_subsys_eventb); 47 unregister_trace_subsys_eventb(probe_subsys_eventb);
48 unregister_trace_subsys_event(probe_subsys_event); 48 unregister_trace_subsys_event(probe_subsys_event);
49 tracepoint_synchronize_unregister();
49} 50}
50 51
51module_exit(tp_sample_trace_exit); 52module_exit(tp_sample_trace_exit);
diff --git a/samples/tracepoints/tracepoint-probe-sample2.c b/samples/tracepoints/tracepoint-probe-sample2.c
index 5e9fcf4afffe..685a5acb4562 100644
--- a/samples/tracepoints/tracepoint-probe-sample2.c
+++ b/samples/tracepoints/tracepoint-probe-sample2.c
@@ -33,6 +33,7 @@ module_init(tp_sample_trace_init);
33void __exit tp_sample_trace_exit(void) 33void __exit tp_sample_trace_exit(void)
34{ 34{
35 unregister_trace_subsys_event(probe_subsys_event); 35 unregister_trace_subsys_event(probe_subsys_event);
36 tracepoint_synchronize_unregister();
36} 37}
37 38
38module_exit(tp_sample_trace_exit); 39module_exit(tp_sample_trace_exit);
diff --git a/samples/tracepoints/tracepoint-sample.c b/samples/tracepoints/tracepoint-sample.c
index 4ae4b7fcc043..00d169792a3e 100644
--- a/samples/tracepoints/tracepoint-sample.c
+++ b/samples/tracepoints/tracepoint-sample.c
@@ -13,6 +13,9 @@
13#include <linux/proc_fs.h> 13#include <linux/proc_fs.h>
14#include "tp-samples-trace.h" 14#include "tp-samples-trace.h"
15 15
16DEFINE_TRACE(subsys_event);
17DEFINE_TRACE(subsys_eventb);
18
16struct proc_dir_entry *pentry_example; 19struct proc_dir_entry *pentry_example;
17 20
18static int my_open(struct inode *inode, struct file *file) 21static int my_open(struct inode *inode, struct file *file)
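
The samples now illustrate the declaration/definition split: DECLARE_TRACE() lives in a header shared by the instrumented subsystem and its probe modules, and exactly one .c file provides the matching DEFINE_TRACE(). A condensed sketch using the sample's own names:

    /* tp-samples-trace.h: shared by subsystem and probe modules */
    DECLARE_TRACE(subsys_event,
            TPPROTO(struct inode *inode, struct file *file),
            TPARGS(inode, file));

    /* tracepoint-sample.c: the single definition, then the call site */
    DEFINE_TRACE(subsys_event);

    static int my_open(struct inode *inode, struct file *file)
    {
            trace_subsys_event(inode, file);        /* fires only while a probe is armed */
            return 0;                               /* return value is illustrative */
    }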
diff --git a/scripts/Makefile.build b/scripts/Makefile.build
index 468fbc9016c7..7a176773af85 100644
--- a/scripts/Makefile.build
+++ b/scripts/Makefile.build
@@ -198,16 +198,10 @@ cmd_modversions = \
198 fi; 198 fi;
199endif 199endif
200 200
201ifdef CONFIG_64BIT
202arch_bits = 64
203else
204arch_bits = 32
205endif
206
207ifdef CONFIG_FTRACE_MCOUNT_RECORD 201ifdef CONFIG_FTRACE_MCOUNT_RECORD
208cmd_record_mcount = perl $(srctree)/scripts/recordmcount.pl \ 202cmd_record_mcount = perl $(srctree)/scripts/recordmcount.pl "$(ARCH)" \
209 "$(ARCH)" "$(arch_bits)" "$(OBJDUMP)" "$(OBJCOPY)" "$(CC)" "$(LD)" \ 203 "$(if $(CONFIG_64BIT),64,32)" \
210 "$(NM)" "$(RM)" "$(MV)" "$(@)"; 204 "$(OBJDUMP)" "$(OBJCOPY)" "$(CC)" "$(LD)" "$(NM)" "$(RM)" "$(MV)" "$(@)";
211endif 205endif
212 206
213define rule_cc_o_c 207define rule_cc_o_c
diff --git a/scripts/bootgraph.pl b/scripts/bootgraph.pl
index d2c61efc216f..f0af9aa9b243 100644
--- a/scripts/bootgraph.pl
+++ b/scripts/bootgraph.pl
@@ -78,11 +78,13 @@ while (<>) {
78} 78}
79 79
80if ($count == 0) { 80if ($count == 0) {
81 print "No data found in the dmesg. Make sure that 'printk.time=1' and\n"; 81 print STDERR <<END;
82 print "'initcall_debug' are passed on the kernel command line.\n\n"; 82No data found in the dmesg. Make sure that 'printk.time=1' and
83 print "Usage: \n"; 83'initcall_debug' are passed on the kernel command line.
84 print " dmesg | perl scripts/bootgraph.pl > output.svg\n\n"; 84Usage:
85 exit; 85 dmesg | perl scripts/bootgraph.pl > output.svg
86END
87 exit 1;
86} 88}
87 89
88print "<?xml version=\"1.0\" standalone=\"no\"?> \n"; 90print "<?xml version=\"1.0\" standalone=\"no\"?> \n";
@@ -109,8 +111,8 @@ my $stylecounter = 0;
109my %rows; 111my %rows;
110my $rowscount = 1; 112my $rowscount = 1;
111my @initcalls = sort { $start{$a} <=> $start{$b} } keys(%start); 113my @initcalls = sort { $start{$a} <=> $start{$b} } keys(%start);
112my $key; 114
113foreach $key (@initcalls) { 115foreach my $key (@initcalls) {
114 my $duration = $end{$key} - $start{$key}; 116 my $duration = $end{$key} - $start{$key};
115 117
116 if ($duration >= $threshold) { 118 if ($duration >= $threshold) {
diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl
index 6b9fe3eb8360..0197e2f6b544 100755
--- a/scripts/recordmcount.pl
+++ b/scripts/recordmcount.pl
@@ -130,10 +130,13 @@ my %weak; # List of weak functions
130my %convert; # List of local functions used that needs conversion 130my %convert; # List of local functions used that needs conversion
131 131
132my $type; 132my $type;
133my $nm_regex; # Find the local functions (return function)
133my $section_regex; # Find the start of a section 134my $section_regex; # Find the start of a section
134my $function_regex; # Find the name of a function 135my $function_regex; # Find the name of a function
135 # (return offset and func name) 136 # (return offset and func name)
136my $mcount_regex; # Find the call site to mcount (return offset) 137my $mcount_regex; # Find the call site to mcount (return offset)
138my $alignment; # The .align value to use for $mcount_section
139my $section_type; # Section header plus possible alignment command
137 140
138if ($arch eq "x86") { 141if ($arch eq "x86") {
139 if ($bits == 64) { 142 if ($bits == 64) {
@@ -143,11 +146,21 @@ if ($arch eq "x86") {
143 } 146 }
144} 147}
145 148
149#
150# We base the defaults off of i386, the other archs may
151# feel free to change them in the below if statements.
152#
153$nm_regex = "^[0-9a-fA-F]+\\s+t\\s+(\\S+)";
154$section_regex = "Disassembly of section\\s+(\\S+):";
155$function_regex = "^([0-9a-fA-F]+)\\s+<(.*?)>:";
156$mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\smcount\$";
157$section_type = '@progbits';
158$type = ".long";
159
146if ($arch eq "x86_64") { 160if ($arch eq "x86_64") {
147 $section_regex = "Disassembly of section\\s+(\\S+):";
148 $function_regex = "^([0-9a-fA-F]+)\\s+<(.*?)>:";
149 $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\smcount([+-]0x[0-9a-zA-Z]+)?\$"; 161 $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\smcount([+-]0x[0-9a-zA-Z]+)?\$";
150 $type = ".quad"; 162 $type = ".quad";
163 $alignment = 8;
151 164
152 # force flags for this arch 165 # force flags for this arch
153 $ld .= " -m elf_x86_64"; 166 $ld .= " -m elf_x86_64";
@@ -156,10 +169,7 @@ if ($arch eq "x86_64") {
156 $cc .= " -m64"; 169 $cc .= " -m64";
157 170
158} elsif ($arch eq "i386") { 171} elsif ($arch eq "i386") {
159 $section_regex = "Disassembly of section\\s+(\\S+):"; 172 $alignment = 4;
160 $function_regex = "^([0-9a-fA-F]+)\\s+<(.*?)>:";
161 $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\smcount\$";
162 $type = ".long";
163 173
164 # force flags for this arch 174 # force flags for this arch
165 $ld .= " -m elf_i386"; 175 $ld .= " -m elf_i386";
@@ -167,6 +177,27 @@ if ($arch eq "x86_64") {
167 $objcopy .= " -O elf32-i386"; 177 $objcopy .= " -O elf32-i386";
168 $cc .= " -m32"; 178 $cc .= " -m32";
169 179
180} elsif ($arch eq "sh") {
181 $alignment = 2;
182
183 # force flags for this arch
184 $ld .= " -m shlelf_linux";
185 $objcopy .= " -O elf32-sh-linux";
186 $cc .= " -m32";
187
188} elsif ($arch eq "powerpc") {
189 $nm_regex = "^[0-9a-fA-F]+\\s+t\\s+(\\.?\\S+)";
190 $function_regex = "^([0-9a-fA-F]+)\\s+<(\\.?.*?)>:";
191 $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\s\\.?_mcount\$";
192
193 if ($bits == 64) {
194 $type = ".quad";
195 }
196
197} elsif ($arch eq "arm") {
198 $alignment = 2;
199 $section_type = '%progbits';
200
170} else { 201} else {
171 die "Arch $arch is not supported with CONFIG_FTRACE_MCOUNT_RECORD"; 202 die "Arch $arch is not supported with CONFIG_FTRACE_MCOUNT_RECORD";
172} 203}
@@ -236,7 +267,7 @@ if (!$found_version) {
236# 267#
237open (IN, "$nm $inputfile|") || die "error running $nm"; 268open (IN, "$nm $inputfile|") || die "error running $nm";
238while (<IN>) { 269while (<IN>) {
239 if (/^[0-9a-fA-F]+\s+t\s+(\S+)/) { 270 if (/$nm_regex/) {
240 $locals{$1} = 1; 271 $locals{$1} = 1;
241 } elsif (/^[0-9a-fA-F]+\s+([wW])\s+(\S+)/) { 272 } elsif (/^[0-9a-fA-F]+\s+([wW])\s+(\S+)/) {
242 $weak{$2} = $1; 273 $weak{$2} = $1;
@@ -287,7 +318,8 @@ sub update_funcs
287 if (!$opened) { 318 if (!$opened) {
288 open(FILE, ">$mcount_s") || die "can't create $mcount_s\n"; 319 open(FILE, ">$mcount_s") || die "can't create $mcount_s\n";
289 $opened = 1; 320 $opened = 1;
290 print FILE "\t.section $mcount_section,\"a\",\@progbits\n"; 321 print FILE "\t.section $mcount_section,\"a\",$section_type\n";
322 print FILE "\t.align $alignment\n" if (defined($alignment));
291 } 323 }
292 printf FILE "\t%s %s + %d\n", $type, $ref_func, $offsets[$i] - $offset; 324 printf FILE "\t%s %s + %d\n", $type, $ref_func, $offsets[$i] - $offset;
293 } 325 }
diff --git a/scripts/tracing/draw_functrace.py b/scripts/tracing/draw_functrace.py
new file mode 100644
index 000000000000..902f9a992620
--- /dev/null
+++ b/scripts/tracing/draw_functrace.py
@@ -0,0 +1,130 @@
1#!/usr/bin/python
2
3"""
4Copyright 2008 (c) Frederic Weisbecker <fweisbec@gmail.com>
5Licensed under the terms of the GNU GPL License version 2
6
7This script parses a trace provided by the function tracer in
8kernel/trace/trace_functions.c
9The resulting trace is processed into a tree to produce a more
10human-readable view of the call stack by drawing a textual but hierarchical
11tree of calls. Only the functions' names and the call times are provided.
12
13Usage:
14	Be sure that you have CONFIG_FUNCTION_TRACER enabled
15	# mkdir /debug
16	# mount -t debugfs debugfs /debug
17 # echo function > /debug/tracing/current_tracer
18 $ cat /debug/tracing/trace_pipe > ~/raw_trace_func
19	Wait for a while (but not too long); the script is a bit slow.
20 Break the pipe (Ctrl + Z)
21 $ scripts/draw_functrace.py < raw_trace_func > draw_functrace
22 Then you have your drawn trace in draw_functrace
23"""
24
25
26import sys, re
27
28class CallTree:
29 """ This class provides a tree representation of the functions
30 call stack. If a function has no parent in the kernel (interrupt,
31 syscall, kernel thread...) then it is attached to a virtual parent
32 called ROOT.
33 """
34 ROOT = None
35
36 def __init__(self, func, time = None, parent = None):
37 self._func = func
38 self._time = time
39 if parent is None:
40 self._parent = CallTree.ROOT
41 else:
42 self._parent = parent
43 self._children = []
44
45 def calls(self, func, calltime):
46 """ If a function calls another one, call this method to insert it
47 into the tree at the appropriate place.
48 @return: A reference to the newly created child node.
49 """
50 child = CallTree(func, calltime, self)
51 self._children.append(child)
52 return child
53
54 def getParent(self, func):
55	""" Retrieve the closest ancestor of the current node whose
56	name matches func. If no such ancestor exists, create the
57	function as a new child of root.
58 @return: A reference to the parent.
59 """
60 tree = self
61 while tree != CallTree.ROOT and tree._func != func:
62 tree = tree._parent
63 if tree == CallTree.ROOT:
64 child = CallTree.ROOT.calls(func, None)
65 return child
66 return tree
67
68 def __repr__(self):
69 return self.__toString("", True)
70
71 def __toString(self, branch, lastChild):
72 if self._time is not None:
73 s = "%s----%s (%s)\n" % (branch, self._func, self._time)
74 else:
75 s = "%s----%s\n" % (branch, self._func)
76
77 i = 0
78 if lastChild:
79 branch = branch[:-1] + " "
80 while i < len(self._children):
81 if i != len(self._children) - 1:
82 s += "%s" % self._children[i].__toString(branch +\
83 " |", False)
84 else:
85 s += "%s" % self._children[i].__toString(branch +\
86 " |", True)
87 i += 1
88 return s
89
90class BrokenLineException(Exception):
91 """If the last line is not complete because of the pipe breakage,
92 we want to stop the processing and ignore this line.
93 """
94 pass
95
96class CommentLineException(Exception):
97 """ If the line is a comment (as in the beginning of the trace file),
98 just ignore it.
99 """
100 pass
101
102
103def parseLine(line):
104 line = line.strip()
105 if line.startswith("#"):
106 raise CommentLineException
107 m = re.match("[^]]+?\\] +([0-9.]+): (\\w+) <-(\\w+)", line)
108 if m is None:
109 raise BrokenLineException
110 return (m.group(1), m.group(2), m.group(3))
111
112
113def main():
114 CallTree.ROOT = CallTree("Root (Nowhere)", None, None)
115 tree = CallTree.ROOT
116
117 for line in sys.stdin:
118 try:
119 calltime, callee, caller = parseLine(line)
120 except BrokenLineException:
121 break
122 except CommentLineException:
123 continue
124 tree = tree.getParent(caller)
125 tree = tree.calls(callee, calltime)
126
127 print CallTree.ROOT
128
129if __name__ == "__main__":
130 main()